import os
import uuid

import gradio
import numpy
import torch

from hubert.hubert_manager import HuBERTManager
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer
from encodec import EncodecModel
from encodec.utils import convert_audio


# Load the models once at startup: HuBERT extracts semantic features, the
# quantizer maps them to Bark semantic tokens, and EnCodec produces the
# acoustic codebooks used for the coarse and fine prompts.
hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed())
tokenizer_model = CustomTokenizer.load_from_checkpoint(
    HuBERTManager.make_sure_tokenizer_installed(model='quantifier_V1_hubert_base_ls960_23.pth'),
    map_location=torch.device('cpu')
)
encodec_model = EncodecModel.encodec_model_24khz()



# Gradio passes audio as a (sample_rate, numpy_array) tuple; any extra
# inputs (the Markdown panel below) are swallowed by *args.
def clone(audio, *args):
    sr, wav = audio

    wav = torch.tensor(wav)

    # Gradio delivers int16 PCM; scale it to floats in [-1, 1].
    if wav.dtype == torch.int16:
        wav = wav.float() / 32767.0

    # Downmix stereo to mono, whichever axis holds the channels.
    if wav.dim() == 2:
        if wav.shape[0] == 2:  # (channels, samples)
            wav = wav.mean(0)
        elif wav.shape[1] == 2:  # (samples, channels), gradio style
            wav = wav.mean(1)

    wav = wav.reshape(1, -1)  # HuBERT expects shape (1, N)
    wav = wav[:, -int(sr * 20):]  # Keep only the last 20 seconds

    # HuBERT features, then quantized into Bark's semantic token space
    semantic_vectors = hubert_model(wav, input_sample_hz=sr)
    semantic_tokens = tokenizer_model.get_token(semantic_vectors)

    # At 6 kbps the 24 kHz EnCodec model uses 8 codebooks; Bark's fine prompt
    # wants all of them and the coarse prompt only the first two.
    encodec_model.set_target_bandwidth(6.0)
    wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
    wav = wav.unsqueeze(0)  # Add a batch dimension: (1, 1, N)

    with torch.no_grad():
        encoded_frames = encodec_model.encode(wav)

    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # (n_q, T)

    os.makedirs('data/speakers', exist_ok=True)

    file_path = f'data/speakers/{uuid.uuid4().hex}.npz'

    # Save in Bark's voice-prompt (.npz) format: semantic tokens plus the
    # fine (all codebooks) and coarse (first two codebooks) acoustic codes.
    numpy.savez(
        file_path,
        semantic_prompt=semantic_tokens,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :]
    )
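
    # Quick sanity check of the saved prompt (a hypothetical snippet, not
    # part of the app):
    #   data = numpy.load(file_path)
    #   assert data['fine_prompt'].shape[0] == 8    # 8 codebooks at 6 kbps
    #   assert data['coarse_prompt'].shape[0] == 2  # first two codebooks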

    return file_path



iface = gradio.Interface(fn=clone, inputs=[
    'audio',
    gradio.Markdown(
        '''
        # Bark text-to-speech voice cloning
        [Model](https://huggingface.co./GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)

        For faster creation of voice clones, [duplicate this space](https://huggingface.co./spaces/GitMylo/bark-voice-cloning?duplicate=true)

        Uploaded audio is trimmed to its last 20 seconds to keep things fast for everyone. (Bark only uses the last 14 seconds anyway.)

        ## Tips for better cloning
        ### Make sure these things are **NOT** in your voice input (in no particular order):
        * Noise (you can run a noise remover first)
        * Music (there are music-removal tools too, unless you want music in the background)
        * A cut-off at the end (this will make the generation try to continue the audio)
        * Less than 1 second of audio (I personally suggest around 10 seconds for good results, but I've had great results with 5 seconds as well)

        ### What makes for good prompt audio? (in no particular order)
        * Clearly spoken speech
        * No odd background noises
        * Only one speaker
        * Audio that ends after a sentence ends
        * A regular/common voice (these usually have more success; it can still clone unusual voices, just not as well)
        * Around 10 seconds of data
        ''')
], outputs='file')
iface.launch()
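
# A minimal sketch of consuming the saved prompt with Bark (assumes the
# suno-ai/bark package is installed; '<uuid>.npz' is a placeholder for the
# file returned by clone()):
#
#     from bark import generate_audio
#     audio_array = generate_audio(
#         'Hello, this is my cloned voice.',
#         history_prompt='data/speakers/<uuid>.npz',
#     )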