diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..fff7e3758d79002d2df6dce6edbd2ff97384c9fa 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,36 @@ *.7z filter=lfs diff=lfs merge=lfs -text *.arrow filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text *.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text *.ftz filter=lfs diff=lfs merge=lfs -text *.gz filter=lfs diff=lfs merge=lfs -text *.h5 filter=lfs diff=lfs merge=lfs -text *.joblib filter=lfs diff=lfs merge=lfs -text *.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text *.model filter=lfs diff=lfs merge=lfs -text *.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text *.ot filter=lfs diff=lfs merge=lfs -text *.parquet filter=lfs diff=lfs merge=lfs -text *.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text *.pt filter=lfs diff=lfs merge=lfs -text *.pth filter=lfs diff=lfs merge=lfs -text *.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text *.tflite filter=lfs diff=lfs merge=lfs -text *.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text *.xz filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -textoldVersion/V200/text/cmudict_cache.pickle filter=lfs diff=lfs merge=lfs -text +oldVersion/V210/text/cmudict_cache.pickle filter=lfs diff=lfs merge=lfs -text +text/cmudict_cache.pickle filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 4404cd572b16a56fa8487e3159e5f4befc7f41a9..f875d659e6f5107febe668decfbcee5d98727ad9 100644 --- a/.gitignore +++ b/.gitignore @@ -159,27 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
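# Sketch of what the LFS rules above mean in practice (a reader for the standard
# git-lfs spec-v1 pointer layout; the path handling is illustrative only): files
# matching these patterns are checked in as small text pointers (version / oid /
# size) while the actual binaries live in LFS storage.
def read_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields  # e.g. {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:<hash>", "size": "<bytes>"}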
#.idea/ -.DS_Store -/models -/logs - -filelists/* -!/filelists/esd.list -data/* -/*.yml -!/default_config.yml -/Web/ -/emotional/*/*.bin -/bert/*/*.bin -/bert/*/*.h5 -/bert/*/*.model -/bert/*/*.safetensors -/bert/*/*.msgpack -asr_transcript.py -extract_list.py -dataset -/Data -Model -raw/ -logs/ -Data/* +.DS_Store \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b26b4375df0c83ab02f3d58507c65baba3620033..ac150ac5ac0b1a1920bfaab8469c31132bb201d6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,13 +7,13 @@ repos: - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.7 + rev: v0.1.8 hooks: - id: ruff args: [ --fix ] - repo: https://github.com/psf/black - rev: 23.11.0 + rev: 23.12.0 hooks: - id: black diff --git a/Data/Azusa/config.json b/Data/Azusa/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba6b296b6acbb28e366e94c69306cb7e2851647 --- /dev/null +++ b/Data/Azusa/config.json @@ -0,0 +1,108 @@ +{ + "train": { + "log_interval": 100, + "eval_interval": 100, + "seed": 42, + "epochs": 1000, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "batch_size": 12, + "bf16_run": false, + "lr_decay": 0.99995, + "segment_size": 16384, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "c_commit": 100, + "skip_optimizer": true, + "freeze_ZH_bert": false, + "freeze_JP_bert": false, + "freeze_EN_bert": false, + "freeze_emo": false + }, + "data": { + "training_files": "Data/Azusa/filelists/train.list", + "validation_files": "Data/Azusa/filelists/val.list", + "max_wav_value": 32768.0, + "sampling_rate": 44100, + "filter_length": 2048, + "hop_length": 512, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null, + "add_blank": true, + "n_speakers": 1, + "cleaned_text": true, + "spk2id": { + "Azusa": 0 + } + }, + "model": { + "use_spk_conditioned_encoder": true, + "use_noise_scaled_mas": true, + "use_mel_posterior_encoder": false, + "use_duration_discriminator": true, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 8, + 8, + 2, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 8, + 2, + 2 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 512, + "slm": { + "model": "./slm/wavlm-base-plus", + "sr": 16000, + "hidden": 768, + "nlayers": 13, + "initial_channel": 64 + } + }, + "version": "2.3" +} \ No newline at end of file diff --git a/Data/Azusa/models/G_11300.pth b/Data/Azusa/models/G_11300.pth new file mode 100644 index 0000000000000000000000000000000000000000..96f8ff1f9fb0cf4edb646db0b3f22b35b8f7413f --- /dev/null +++ b/Data/Azusa/models/G_11300.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0708043b54ab21eb8ec1b600982ea7b105bcded370a9207281e043c64e195dc3 +size 728379830 diff --git a/app.py b/app.py index a2511b75c8b4521e65a05524dd567f9e83be120a..757a4054da967feb4777363c4262a8c3eddc8337 100644 --- a/app.py +++ b/app.py @@ -16,6 +16,10 @@ logging.basicConfig( logger = logging.getLogger(__name__) import torch +import ssl +ssl._create_default_https_context = ssl._create_unverified_context +import nltk 
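# The ssl override above and the cmudict download just below fetch nltk's cmudict
# at startup even where certificate verification fails (the English text frontend
# relies on this corpus). A slightly more defensive equivalent, assuming only
# standard nltk APIs, skips the download when the corpus is already cached:
import nltk

try:
    nltk.data.find("corpora/cmudict")
except LookupError:
    nltk.download("cmudict")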
+nltk.download('cmudict') import utils from infer import infer, latest_version, get_net_g, infer_multilang import gradio as gr @@ -42,6 +46,8 @@ def generate_audio( language, reference_audio, emotion, + style_text, + style_weight, skip_start=False, skip_end=False, ): @@ -49,8 +55,8 @@ def generate_audio( # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) with torch.no_grad(): for idx, piece in enumerate(slices): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(slices) - 1) and skip_end + skip_start = idx != 0 + skip_end = idx != len(slices) - 1 audio = infer( piece, reference_audio=reference_audio, @@ -66,10 +72,11 @@ def generate_audio( device=device, skip_start=skip_start, skip_end=skip_end, + style_text=style_text, + style_weight=style_weight, ) audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) audio_list.append(audio16bit) - # audio_list.append(silence) # 将静音添加到列表中 return audio_list @@ -90,8 +97,8 @@ def generate_audio_multilang( # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) with torch.no_grad(): for idx, piece in enumerate(slices): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(slices) - 1) and skip_end + skip_start = idx != 0 + skip_end = idx != len(slices) - 1 audio = infer_multilang( piece, reference_audio=reference_audio, @@ -110,7 +117,6 @@ def generate_audio_multilang( ) audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) audio_list.append(audio16bit) - # audio_list.append(silence) # 将静音添加到列表中 return audio_list @@ -127,63 +133,50 @@ def tts_split( interval_between_sent, reference_audio, emotion, + style_text, + style_weight, ): - if language == "mix": - return ("invalid", None) while text.find("\n\n") != -1: text = text.replace("\n\n", "\n") + text = text.replace("|", "") para_list = re_matching.cut_para(text) + para_list = [p for p in para_list if p != ""] audio_list = [] - if not cut_by_sent: - for idx, p in enumerate(para_list): - skip_start = idx != 0 - skip_end = idx != len(para_list) - 1 - audio = infer( + for p in para_list: + if not cut_by_sent: + audio_list += process_text( p, - reference_audio=reference_audio, - emotion=emotion, - sdp_ratio=sdp_ratio, - noise_scale=noise_scale, - noise_scale_w=noise_scale_w, - length_scale=length_scale, - sid=speaker, - language=language, - hps=hps, - net_g=net_g, - device=device, - skip_start=skip_start, - skip_end=skip_end, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + style_text, + style_weight, ) - audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) - audio_list.append(audio16bit) silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16) audio_list.append(silence) - else: - for idx, p in enumerate(para_list): - skip_start = idx != 0 - skip_end = idx != len(para_list) - 1 + else: audio_list_sent = [] sent_list = re_matching.cut_sent(p) - for idx, s in enumerate(sent_list): - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(sent_list) - 1) and skip_end - audio = infer( + sent_list = [s for s in sent_list if s != ""] + for s in sent_list: + audio_list_sent += process_text( s, - reference_audio=reference_audio, - emotion=emotion, - sdp_ratio=sdp_ratio, - noise_scale=noise_scale, - noise_scale_w=noise_scale_w, - length_scale=length_scale, - sid=speaker, - language=language, - hps=hps, - net_g=net_g, - device=device, - skip_start=skip_start, - skip_end=skip_end, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + 
language, + reference_audio, + emotion, + style_text, + style_weight, ) - audio_list_sent.append(audio) silence = np.zeros((int)(44100 * interval_between_sent)) audio_list_sent.append(silence) if (interval_between_para - interval_between_sent) > 0: @@ -196,10 +189,47 @@ def tts_split( ) # 对完整句子做音量归一 audio_list.append(audio16bit) audio_concat = np.concatenate(audio_list) - return ("Success", (44100, audio_concat)) + return ("Success", (hps.data.sampling_rate, audio_concat)) -def tts_fn( +def process_mix(slice): + _speaker = slice.pop() + _text, _lang = [], [] + for lang, content in slice: + content = content.split("|") + content = [part for part in content if part != ""] + if len(content) == 0: + continue + if len(_text) == 0: + _text = [[part] for part in content] + _lang = [[lang] for part in content] + else: + _text[-1].append(content[0]) + _lang[-1].append(lang) + if len(content) > 1: + _text += [[part] for part in content[1:]] + _lang += [[lang] for part in content[1:]] + return _text, _lang, _speaker + + +def process_auto(text): + _text, _lang = [], [] + for slice in text.split("|"): + if slice == "": + continue + temp_text, temp_lang = [], [] + sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"]) + for sentence, lang in sentences_list: + if sentence == "": + continue + temp_text.append(sentence) + temp_lang.append(lang.upper()) + _text.append(temp_text) + _lang.append(temp_lang) + return _text, _lang + + +def process_text( text: str, speaker, sdp_ratio, @@ -209,15 +239,9 @@ def tts_fn( language, reference_audio, emotion, - prompt_mode, + style_text=None, + style_weight=0, ): - if prompt_mode == "Audio prompt": - if reference_audio == None: - return ("Invalid audio prompt", None) - else: - reference_audio = load_audio(reference_audio)[1] - else: - reference_audio = None audio_list = [] if language == "mix": bool_valid, str_valid = re_matching.validate_text(text) @@ -226,120 +250,40 @@ def tts_fn( hps.data.sampling_rate, np.concatenate([np.zeros(hps.data.sampling_rate // 2)]), ) - result = [] for slice in re_matching.text_matching(text): - _speaker = slice.pop() - temp_contant = [] - temp_lang = [] - for lang, content in slice: - if "|" in content: - temp = [] - temp_ = [] - for i in content.split("|"): - if i != "": - temp.append([i]) - temp_.append([lang]) - else: - temp.append([]) - temp_.append([]) - temp_contant += temp - temp_lang += temp_ - else: - if len(temp_contant) == 0: - temp_contant.append([]) - temp_lang.append([]) - temp_contant[-1].append(content) - temp_lang[-1].append(lang) - for i, j in zip(temp_lang, temp_contant): - result.append([*zip(i, j), _speaker]) - for i, one in enumerate(result): - skip_start = i != 0 - skip_end = i != len(result) - 1 - _speaker = one.pop() - idx = 0 - while idx < len(one): - text_to_generate = [] - lang_to_generate = [] - while True: - lang, content = one[idx] - temp_text = [content] - if len(text_to_generate) > 0: - text_to_generate[-1] += [temp_text.pop(0)] - lang_to_generate[-1] += [lang] - if len(temp_text) > 0: - text_to_generate += [[i] for i in temp_text] - lang_to_generate += [[lang]] * len(temp_text) - if idx + 1 < len(one): - idx += 1 - else: - break - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(one) - 1) and skip_end - print(text_to_generate, lang_to_generate) - audio_list.extend( - generate_audio_multilang( - text_to_generate, - sdp_ratio, - noise_scale, - noise_scale_w, - length_scale, - _speaker, - lang_to_generate, - reference_audio, - emotion, - skip_start, - skip_end, - ) + 
_text, _lang, _speaker = process_mix(slice) + if _speaker is None: + continue + print(f"Text: {_text}\nLang: {_lang}") + audio_list.extend( + generate_audio_multilang( + _text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + _speaker, + _lang, + reference_audio, + emotion, ) - idx += 1 + ) elif language.lower() == "auto": - for idx, slice in enumerate(text.split("|")): - if slice == "": - continue - skip_start = idx != 0 - skip_end = idx != len(text.split("|")) - 1 - sentences_list = split_by_language( - slice, target_languages=["zh", "ja", "en"] + _text, _lang = process_auto(text) + print(f"Text: {_text}\nLang: {_lang}") + audio_list.extend( + generate_audio_multilang( + _text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + _lang, + reference_audio, + emotion, ) - idx = 0 - while idx < len(sentences_list): - text_to_generate = [] - lang_to_generate = [] - while True: - content, lang = sentences_list[idx] - temp_text = [content] - lang = lang.upper() - if lang == "JA": - lang = "JP" - if len(text_to_generate) > 0: - text_to_generate[-1] += [temp_text.pop(0)] - lang_to_generate[-1] += [lang] - if len(temp_text) > 0: - text_to_generate += [[i] for i in temp_text] - lang_to_generate += [[lang]] * len(temp_text) - if idx + 1 < len(sentences_list): - idx += 1 - else: - break - skip_start = (idx != 0) and skip_start - skip_end = (idx != len(sentences_list) - 1) and skip_end - print(text_to_generate, lang_to_generate) - audio_list.extend( - generate_audio_multilang( - text_to_generate, - sdp_ratio, - noise_scale, - noise_scale_w, - length_scale, - speaker, - lang_to_generate, - reference_audio, - emotion, - skip_start, - skip_end, - ) - ) - idx += 1 + ) else: audio_list.extend( generate_audio( @@ -352,13 +296,65 @@ def tts_fn( language, reference_audio, emotion, + style_text, + style_weight, ) ) + return audio_list + + +def tts_fn( + text: str, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + prompt_mode, + style_text=None, + style_weight=0, +): + if style_text == "": + style_text = None + if prompt_mode == "Audio prompt": + if reference_audio == None: + return ("Invalid audio prompt", None) + else: + reference_audio = load_audio(reference_audio)[1] + else: + reference_audio = None + + audio_list = process_text( + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + style_text, + style_weight, + ) audio_concat = np.concatenate(audio_list) return "Success", (hps.data.sampling_rate, audio_concat) +def format_utils(text, speaker): + _text, _lang = process_auto(text) + res = f"[{speaker}]" + for lang_s, content_s in zip(_lang, _text): + for lang, content in zip(lang_s, content_s): + res += f"<{lang.lower()}>{content}" + res += "|" + return "mix", res[:-1] + + def load_audio(path): audio, sr = librosa.load(path, 48000) # audio = librosa.resample(audio, 44100, 48000) @@ -394,10 +390,10 @@ if __name__ == "__main__": with gr.Blocks() as app: with gr.Row(): with gr.Column(): - gr.Markdown(value=""" - 【AI星瞳2.2】在线语音合成(Bert-Vits2 2.2中日英)\n + gr.Markdown(value=""" + 【AI阿梓】在线语音合成(Bert-Vits2 2.3中日英)\n 作者:Xz乔希 https://space.bilibili.com/5859321\n - 声音归属:星瞳_Official https://space.bilibili.com/401315430\n + 声音归属:阿梓从小就很可爱 https://space.bilibili.com/7706705\n 【AI合集】https://www.modelscope.cn/studios/xzjosh/Bert-VITS2\n Bert-VITS2项目:https://github.com/Stardust-minus/Bert-VITS2\n 使用本模型请严格遵守法律法规!\n @@ -414,27 +410,31 @@ if __name__ == 
"__main__": 另外,所有的语言选项都可以用'|'分割长段实现分句生成。 """, ) + formatter = gr.Button("检测语言,并整理为 MIX 格式", variant="primary") speaker = gr.Dropdown( choices=speakers, value=speakers[0], label="Speaker" ) _ = gr.Markdown( - value="提示模式(Prompt mode):使用首字母大写英文单词或上传音频提示,用于生成指定风格的声音。\n" + value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n", + visible=False, ) prompt_mode = gr.Radio( ["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt", + visible=False, ) text_prompt = gr.Textbox( label="Text prompt", - placeholder="单词描述生成风格。如:Happy", - visible=True, + placeholder="用文字描述生成风格。如:Happy", + value="Happy", + visible=False, ) audio_prompt = gr.Audio( label="Audio prompt", type="filepath", visible=False ) sdp_ratio = gr.Slider( - minimum=0, maximum=1, value=0.2, step=0.01, label="SDP Ratio" + minimum=0, maximum=1, value=0.5, step=0.01, label="SDP Ratio" ) noise_scale = gr.Slider( minimum=0.1, maximum=2, value=0.5, step=0.01, label="Noise" @@ -450,6 +450,21 @@ if __name__ == "__main__": ) btn = gr.Button("点击生成", variant="primary") with gr.Column(): + with gr.Accordion("融合文本语义(实验功能)", open=False): + gr.Markdown( + value="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n" + "**注意**:请使用**带有强烈情感的文本**(如:我好快乐!)\n\n" + "效果较不明确,留空即为不使用该功能" + ) + style_text = gr.Textbox(label="辅助文本") + style_weight = gr.Slider( + minimum=0, + maximum=1, + value=0.7, + step=0.1, + label="Weight", + info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本", + ) with gr.Row(): with gr.Column(): interval_between_sent = gr.Slider( @@ -492,6 +507,8 @@ if __name__ == "__main__": audio_prompt, text_prompt, prompt_mode, + style_text, + style_weight, ], outputs=[text_output, audio_output], ) @@ -510,6 +527,8 @@ if __name__ == "__main__": interval_between_sent, audio_prompt, text_prompt, + style_text, + style_weight, ], outputs=[text_output, audio_output], ) diff --git a/bert_gen.py b/bert_gen.py index 588c768c86f32d9e421172379abc712196f1a66c..81175967cb6e5b66777409353834dd1270a2a6a2 100644 --- a/bert_gen.py +++ b/bert_gen.py @@ -1,17 +1,16 @@ -import argparse -from multiprocessing import Pool, cpu_count - import torch -import torch.multiprocessing as mp -from tqdm import tqdm - +from multiprocessing import Pool import commons import utils +from tqdm import tqdm +from text import check_bert_models, cleaned_text_to_sequence, get_bert +import argparse +import torch.multiprocessing as mp from config import config -from text import cleaned_text_to_sequence, get_bert -def process_line(line): +def process_line(x): + line, add_blank = x device = config.bert_gen_config.device if config.bert_gen_config.use_multi_device: rank = mp.current_process()._identity @@ -28,12 +27,13 @@ def process_line(line): word2ph = [i for i in word2ph] phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) - phone = commons.intersperse(phone, 0) - tone = commons.intersperse(tone, 0) - language = commons.intersperse(language, 0) - for i in range(len(word2ph)): - word2ph[i] = word2ph[i] * 2 - word2ph[0] += 1 + if add_blank: + phone = commons.intersperse(phone, 0) + tone = commons.intersperse(tone, 0) + language = commons.intersperse(language, 0) + for i in range(len(word2ph)): + word2ph[i] = word2ph[i] * 2 + word2ph[0] += 1 bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt") @@ -59,16 +59,23 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() config_path = args.config hps = utils.get_hparams_from_file(config_path) + check_bert_models() lines = [] with open(hps.data.training_files, encoding="utf-8") as f: lines.extend(f.readlines()) with 
open(hps.data.validation_files, encoding="utf-8") as f: lines.extend(f.readlines()) + add_blank = [hps.data.add_blank] * len(lines) + if len(lines) != 0: - num_processes = min(args.num_processes, cpu_count()) + num_processes = args.num_processes with Pool(processes=num_processes) as pool: - for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)): - pass + for _ in tqdm( + pool.imap_unordered(process_line, zip(lines, add_blank)), + total=len(lines), + ): + # 这里是缩进的代码块,表示循环体 + pass # 使用pass语句作为占位符 print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!") diff --git a/clap_gen.py b/clap_gen.py index 20380abe6eb6657962f49d30fdb0a5f85c9c1a87..3054759d608fe60298d24ac38a5a03604eb8a3d2 100644 --- a/clap_gen.py +++ b/clap_gen.py @@ -27,7 +27,7 @@ def process_line(line): device = torch.device("cpu") wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|") - clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy") + clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.pt") if os.path.isfile(clap_path): return diff --git a/configs/config.json b/configs/config.json index d1828ffb2142818b62fc218fc291ab44b4947123..6f3c5cc3f429f032db158f6e9e44cd0c1fa40526 100644 --- a/configs/config.json +++ b/configs/config.json @@ -10,18 +10,20 @@ 0.99 ], "eps": 1e-09, - "batch_size": 12, - "fp16_run": false, + "batch_size": 16, + "bf16_run": false, "lr_decay": 0.99995, "segment_size": 16384, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, + "c_commit": 100, "skip_optimizer": true, "freeze_ZH_bert": false, "freeze_JP_bert": false, - "freeze_EN_bert": false + "freeze_EN_bert": false, + "freeze_emo": false }, "data": { "training_files": "filelists/train.list", @@ -35,7 +37,7 @@ "mel_fmin": 0.0, "mel_fmax": null, "add_blank": true, - "n_speakers": 896, + "n_speakers": 850, "cleaned_text": true, "spk2id": { "派蒙_ZH": 0, @@ -119,203 +121,203 @@ "伊迪娅_ZH": 78, "留云借风真君_ZH": 79, "绮良良_ZH": 80, - "七七_ZH": 81, - "式大将_ZH": 82, - "瑶瑶_ZH": 83, - "奥兹_ZH": 84, - "菲米尼_ZH": 85, - "米卡_ZH": 86, - "哲平_ZH": 87, - "大肉丸_ZH": 88, - "托克_ZH": 89, - "蒂玛乌斯_ZH": 90, - "昆钧_ZH": 91, - "欧菲妮_ZH": 92, - "塞琉斯_ZH": 93, - "仆人_ZH": 94, - "迈勒斯_ZH": 95, - "希格雯_ZH": 96, - "阿守_ZH": 97, - "拉赫曼_ZH": 98, - "杜拉夫_ZH": 99, - "伊利亚斯_ZH": 100, - "阿晃_ZH": 101, - "旁白_ZH": 102, - "爱德琳_ZH": 103, - "埃洛伊_ZH": 104, - "德沃沙克_ZH": 105, - "玛乔丽_ZH": 106, - "塞塔蕾_ZH": 107, - "柊千里_ZH": 108, - "海芭夏_ZH": 109, - "九条镰治_ZH": 110, - "阿娜耶_ZH": 111, - "笼钓瓶一心_ZH": 112, - "回声海螺_ZH": 113, - "劳维克_ZH": 114, - "元太_ZH": 115, - "阿扎尔_ZH": 116, - "查尔斯_ZH": 117, - "阿洛瓦_ZH": 118, - "埃勒曼_ZH": 119, - "纳比尔_ZH": 120, - "莎拉_ZH": 121, - "康纳_ZH": 122, - "博来_ZH": 123, - "玛塞勒_ZH": 124, - "阿祇_ZH": 125, - "博士_ZH": 126, - "玛格丽特_ZH": 127, - "迪尔菲_ZH": 128, - "宛烟_ZH": 129, - "羽生田千鹤_ZH": 130, - "海妮耶_ZH": 131, - "旅行者_ZH": 132, - "霍夫曼_ZH": 133, - "佐西摩斯_ZH": 134, - "鹿野奈奈_ZH": 135, - "舒伯特_ZH": 136, - "天叔_ZH": 137, - "艾莉丝_ZH": 138, - "龙二_ZH": 139, - "莺儿_ZH": 140, - "嘉良_ZH": 141, - "一心传名刀_ZH": 142, - "费迪南德_ZH": 143, - "珊瑚_ZH": 144, - "言笑_ZH": 145, - "久利须_ZH": 146, - "嘉玛_ZH": 147, - "艾文_ZH": 148, - "克洛琳德_ZH": 149, - "丹吉尔_ZH": 150, - "女士_ZH": 151, - "白老先生_ZH": 152, - "天目十五_ZH": 153, - "老孟_ZH": 154, - "巴达维_ZH": 155, - "长生_ZH": 156, - "吴船长_ZH": 157, - "拉齐_ZH": 158, - "艾伯特_ZH": 159, - "松浦_ZH": 160, - "埃泽_ZH": 161, - "阿圆_ZH": 162, - "莫塞伊思_ZH": 163, - "阿拉夫_ZH": 164, - "杜吉耶_ZH": 165, - "石头_ZH": 166, - "百闻_ZH": 167, - "波洛_ZH": 168, - "斯坦利_ZH": 169, - "博易_ZH": 170, - "迈蒙_ZH": 171, - "掇星攫辰天君_ZH": 172, - "毗伽尔_ZH": 173, - "芙卡洛斯_ZH": 174, - "恶龙_ZH": 175, - "恕筠_ZH": 
176, - "知易_ZH": 177, - "克列门特_ZH": 178, - "大慈树王_ZH": 179, - "西拉杰_ZH": 180, - "上杉_ZH": 181, - "阿尔卡米_ZH": 182, - "纯水精灵_ZH": 183, - "常九爷_ZH": 184, - "沙扎曼_ZH": 185, - "田铁嘴_ZH": 186, - "克罗索_ZH": 187, - "阿巴图伊_ZH": 188, - "悦_ZH": 189, + "陌生人_ZH": 81, + "七七_ZH": 82, + "式大将_ZH": 83, + "瑶瑶_ZH": 84, + "奥兹_ZH": 85, + "菲米尼_ZH": 86, + "米卡_ZH": 87, + "哲平_ZH": 88, + "浮游水蕈兽·元素生命_ZH": 89, + "大肉丸_ZH": 90, + "托克_ZH": 91, + "蒂玛乌斯_ZH": 92, + "昆钧_ZH": 93, + "欧菲妮_ZH": 94, + "塞琉斯_ZH": 95, + "仆人_ZH": 96, + "迈勒斯_ZH": 97, + "希格雯_ZH": 98, + "阿守_ZH": 99, + "拉赫曼_ZH": 100, + "杜拉夫_ZH": 101, + "伊利亚斯_ZH": 102, + "阿晃_ZH": 103, + "旁白_ZH": 104, + "爱德琳_ZH": 105, + "埃洛伊_ZH": 106, + "德沃沙克_ZH": 107, + "玛乔丽_ZH": 108, + "塞塔蕾_ZH": 109, + "柊千里_ZH": 110, + "海芭夏_ZH": 111, + "九条镰治_ZH": 112, + "阿娜耶_ZH": 113, + "笼钓瓶一心_ZH": 114, + "回声海螺_ZH": 115, + "劳维克_ZH": 116, + "元太_ZH": 117, + "阿扎尔_ZH": 118, + "查尔斯_ZH": 119, + "阿洛瓦_ZH": 120, + "埃勒曼_ZH": 121, + "纳比尔_ZH": 122, + "莎拉_ZH": 123, + "康纳_ZH": 124, + "博来_ZH": 125, + "玛塞勒_ZH": 126, + "阿祇_ZH": 127, + "博士_ZH": 128, + "玛格丽特_ZH": 129, + "迪尔菲_ZH": 130, + "宛烟_ZH": 131, + "羽生田千鹤_ZH": 132, + "海妮耶_ZH": 133, + "旅行者_ZH": 134, + "霍夫曼_ZH": 135, + "佐西摩斯_ZH": 136, + "鹿野奈奈_ZH": 137, + "舒伯特_ZH": 138, + "天叔_ZH": 139, + "艾莉丝_ZH": 140, + "龙二_ZH": 141, + "莺儿_ZH": 142, + "嘉良_ZH": 143, + "一心传名刀_ZH": 144, + "珊瑚_ZH": 145, + "言笑_ZH": 146, + "久利须_ZH": 147, + "嘉玛_ZH": 148, + "艾文_ZH": 149, + "克洛琳德_ZH": 150, + "丹吉尔_ZH": 151, + "女士_ZH": 152, + "白老先生_ZH": 153, + "天目十五_ZH": 154, + "老孟_ZH": 155, + "巴达维_ZH": 156, + "长生_ZH": 157, + "吴船长_ZH": 158, + "拉齐_ZH": 159, + "艾伯特_ZH": 160, + "松浦_ZH": 161, + "埃泽_ZH": 162, + "阿圆_ZH": 163, + "莫塞伊思_ZH": 164, + "阿拉夫_ZH": 165, + "杜吉耶_ZH": 166, + "石头_ZH": 167, + "百闻_ZH": 168, + "波洛_ZH": 169, + "斯坦利_ZH": 170, + "博易_ZH": 171, + "迈蒙_ZH": 172, + "掇星攫辰天君_ZH": 173, + "毗伽尔_ZH": 174, + "芙卡洛斯_ZH": 175, + "恶龙_ZH": 176, + "恕筠_ZH": 177, + "知易_ZH": 178, + "克列门特_ZH": 179, + "大慈树王_ZH": 180, + "西拉杰_ZH": 181, + "上杉_ZH": 182, + "阿尔卡米_ZH": 183, + "纯水精灵_ZH": 184, + "常九爷_ZH": 185, + "沙扎曼_ZH": 186, + "田铁嘴_ZH": 187, + "克罗索_ZH": 188, + "阿巴图伊_ZH": 189, "阿佩普_ZH": 190, "埃尔欣根_ZH": 191, "萨赫哈蒂_ZH": 192, "塔杰·拉德卡尼_ZH": 193, "安西_ZH": 194, - "埃舍尔_ZH": 195, - "萨齐因_ZH": 196, - "派蒙_JP": 197, - "纳西妲_JP": 198, - "凯亚_JP": 199, - "阿贝多_JP": 200, - "温迪_JP": 201, - "枫原万叶_JP": 202, - "钟离_JP": 203, - "荒泷一斗_JP": 204, - "八重神子_JP": 205, - "艾尔海森_JP": 206, - "提纳里_JP": 207, - "迪希雅_JP": 208, - "卡维_JP": 209, - "宵宫_JP": 210, - "那维莱特_JP": 211, - "莱依拉_JP": 212, - "赛诺_JP": 213, - "莫娜_JP": 214, - "诺艾尔_JP": 215, - "托马_JP": 216, - "凝光_JP": 217, - "林尼_JP": 218, - "北斗_JP": 219, - "柯莱_JP": 220, - "神里绫华_JP": 221, - "可莉_JP": 222, - "芭芭拉_JP": 223, - "雷电将军_JP": 224, - "娜维娅_JP": 225, - "芙宁娜_JP": 226, - "珊瑚宫心海_JP": 227, - "鹿野院平藏_JP": 228, - "迪奥娜_JP": 229, - "琴_JP": 230, - "五郎_JP": 231, - "班尼特_JP": 232, - "达达利亚_JP": 233, - "安柏_JP": 234, - "莱欧斯利_JP": 235, - "夜兰_JP": 236, - "妮露_JP": 237, - "辛焱_JP": 238, - "丽莎_JP": 239, - "珐露珊_JP": 240, - "魈_JP": 241, - "香菱_JP": 242, - "迪卢克_JP": 243, - "砂糖_JP": 244, - "烟绯_JP": 245, - "早柚_JP": 246, - "云堇_JP": 247, - "刻晴_JP": 248, - "重云_JP": 249, - "优菈_JP": 250, - "胡桃_JP": 251, - "流浪者_JP": 252, - "久岐忍_JP": 253, - "神里绫人_JP": 254, - "甘雨_JP": 255, - "戴因斯雷布_JP": 256, - "菲谢尔_JP": 257, - "白术_JP": 258, - "行秋_JP": 259, - "九条裟罗_JP": 260, - "夏洛蒂_JP": 261, - "雷泽_JP": 262, - "申鹤_JP": 263, - "空_JP": 264, - "荧_JP": 265, - "迪娜泽黛_JP": 266, - "凯瑟琳_JP": 267, - "多莉_JP": 268, - "坎蒂丝_JP": 269, - "琳妮特_JP": 270, - "萍姥姥_JP": 271, - "罗莎莉亚_JP": 272, - "埃德_JP": 273, - "爱贝尔_JP": 274, - "伊迪娅_JP": 275, - "留云借风真君_JP": 276, - "绮良良_JP": 277, + "陆行岩本真蕈·元素生命_ZH": 195, + 
"派蒙_JP": 196, + "纳西妲_JP": 197, + "凯亚_JP": 198, + "阿贝多_JP": 199, + "温迪_JP": 200, + "枫原万叶_JP": 201, + "钟离_JP": 202, + "荒泷一斗_JP": 203, + "八重神子_JP": 204, + "艾尔海森_JP": 205, + "提纳里_JP": 206, + "迪希雅_JP": 207, + "卡维_JP": 208, + "宵宫_JP": 209, + "那维莱特_JP": 210, + "莱依拉_JP": 211, + "赛诺_JP": 212, + "莫娜_JP": 213, + "诺艾尔_JP": 214, + "托马_JP": 215, + "凝光_JP": 216, + "林尼_JP": 217, + "北斗_JP": 218, + "柯莱_JP": 219, + "神里绫华_JP": 220, + "可莉_JP": 221, + "芭芭拉_JP": 222, + "雷电将军_JP": 223, + "娜维娅_JP": 224, + "芙宁娜_JP": 225, + "珊瑚宫心海_JP": 226, + "鹿野院平藏_JP": 227, + "迪奥娜_JP": 228, + "琴_JP": 229, + "五郎_JP": 230, + "班尼特_JP": 231, + "达达利亚_JP": 232, + "安柏_JP": 233, + "莱欧斯利_JP": 234, + "夜兰_JP": 235, + "妮露_JP": 236, + "辛焱_JP": 237, + "丽莎_JP": 238, + "珐露珊_JP": 239, + "魈_JP": 240, + "香菱_JP": 241, + "迪卢克_JP": 242, + "砂糖_JP": 243, + "烟绯_JP": 244, + "早柚_JP": 245, + "云堇_JP": 246, + "刻晴_JP": 247, + "重云_JP": 248, + "优菈_JP": 249, + "胡桃_JP": 250, + "流浪者_JP": 251, + "久岐忍_JP": 252, + "神里绫人_JP": 253, + "甘雨_JP": 254, + "戴因斯雷布_JP": 255, + "菲谢尔_JP": 256, + "白术_JP": 257, + "行秋_JP": 258, + "九条裟罗_JP": 259, + "夏洛蒂_JP": 260, + "雷泽_JP": 261, + "申鹤_JP": 262, + "空_JP": 263, + "荧_JP": 264, + "迪娜泽黛_JP": 265, + "凯瑟琳_JP": 266, + "多莉_JP": 267, + "坎蒂丝_JP": 268, + "琳妮特_JP": 269, + "萍姥姥_JP": 270, + "罗莎莉亚_JP": 271, + "埃德_JP": 272, + "爱贝尔_JP": 273, + "伊迪娅_JP": 274, + "留云借风真君_JP": 275, + "绮良良_JP": 276, + "陌生人_JP": 277, "七七_JP": 278, "式大将_JP": 279, "瑶瑶_JP": 280, @@ -323,576 +325,571 @@ "菲米尼_JP": 282, "米卡_JP": 283, "哲平_JP": 284, - "大肉丸_JP": 285, - "托克_JP": 286, - "蒂玛乌斯_JP": 287, - "昆钧_JP": 288, - "欧菲妮_JP": 289, - "塞琉斯_JP": 290, - "仆人_JP": 291, - "迈勒斯_JP": 292, - "希格雯_JP": 293, - "阿守_JP": 294, - "拉赫曼_JP": 295, - "杜拉夫_JP": 296, - "伊利亚斯_JP": 297, - "阿晃_JP": 298, - "旁白_JP": 299, - "爱德琳_JP": 300, - "埃洛伊_JP": 301, - "德沃沙克_JP": 302, - "玛乔丽_JP": 303, - "塞塔蕾_JP": 304, - "柊千里_JP": 305, - "海芭夏_JP": 306, - "九条镰治_JP": 307, - "阿娜耶_JP": 308, - "笼钓瓶一心_JP": 309, - "回声海螺_JP": 310, - "劳维克_JP": 311, - "元太_JP": 312, - "阿扎尔_JP": 313, - "查尔斯_JP": 314, - "阿洛瓦_JP": 315, - "埃勒曼_JP": 316, - "纳比尔_JP": 317, - "莎拉_JP": 318, - "康纳_JP": 319, - "博来_JP": 320, - "玛塞勒_JP": 321, - "阿祇_JP": 322, - "博士_JP": 323, - "迪尔菲_JP": 324, - "玛格丽特_JP": 325, - "宛烟_JP": 326, - "羽生田千鹤_JP": 327, - "海妮耶_JP": 328, - "霍夫曼_JP": 329, - "旅行者_JP": 330, - "佐西摩斯_JP": 331, - "舒伯特_JP": 332, - "鹿野奈奈_JP": 333, - "天叔_JP": 334, - "龙二_JP": 335, - "艾莉丝_JP": 336, - "莺儿_JP": 337, - "嘉良_JP": 338, - "珊瑚_JP": 339, - "言笑_JP": 340, - "一心传名刀_JP": 341, - "费迪南德_JP": 342, - "久利须_JP": 343, - "嘉玛_JP": 344, - "艾文_JP": 345, - "克洛琳德_JP": 346, - "丹吉尔_JP": 347, - "天目十五_JP": 348, - "女士_JP": 349, - "老孟_JP": 350, - "白老先生_JP": 351, - "舍利夫_JP": 352, - "巴达维_JP": 353, - "拉齐_JP": 354, - "长生_JP": 355, - "吴船长_JP": 356, - "艾伯特_JP": 357, - "松浦_JP": 358, - "埃泽_JP": 359, - "阿圆_JP": 360, - "阿拉夫_JP": 361, - "莫塞伊思_JP": 362, - "石头_JP": 363, - "百闻_JP": 364, - "杜吉耶_JP": 365, - "波洛_JP": 366, - "掇星攫辰天君_JP": 367, - "迈蒙_JP": 368, - "博易_JP": 369, - "诗筠_JP": 370, - "斯坦利_JP": 371, - "毗伽尔_JP": 372, - "芙卡洛斯_JP": 373, - "恶龙_JP": 374, - "小仓澪_JP": 375, - "恕筠_JP": 376, - "知易_JP": 377, - "克列门特_JP": 378, - "大慈树王_JP": 379, - "望雅_JP": 380, - "黑田_JP": 381, - "卡莉娜_JP": 382, - "马姆杜_JP": 383, - "科林斯_JP": 384, - "上杉_JP": 385, - "西拉杰_JP": 386, - "菲尔戈黛特_JP": 387, - "一平_JP": 388, - "纯水精灵_JP": 389, - "阿尔卡米_JP": 390, - "老戴_JP": 391, - "谢赫祖拜尔_JP": 392, - "沙扎曼_JP": 393, - "田铁嘴_JP": 394, - "小野寺_JP": 395, - "百识_JP": 396, - "克罗索_JP": 397, - "莱斯格_JP": 398, - "芷巧_JP": 399, - "加藤洋平_JP": 400, - "阿巴图伊_JP": 401, - "埃尔欣根_JP": 402, - "斯嘉莉_JP": 403, - "阿佩普_JP": 404, - "巫女_JP": 405, - "卡布斯_JP": 406, - "洛伦佐_JP": 
407, - "萨赫哈蒂_JP": 408, - "娜德瓦_JP": 409, - "塞德娜_JP": 410, - "塔杰·拉德卡尼_JP": 411, - "绘星_JP": 412, - "泽田_JP": 413, - "安西_JP": 414, - "拉伊德_JP": 415, - "亚卡巴_JP": 416, - "有乐斋_JP": 417, - "莱昂_JP": 418, - "尤苏波夫_JP": 419, - "夏妮_JP": 420, - "埃舍尔_JP": 421, - "萨齐因_JP": 422, - "古山_JP": 423, - "自称渊上之物_JP": 424, - "丹羽_JP": 425, - "塞萨尔的日记_JP": 426, - "派蒙_EN": 427, - "纳西妲_EN": 428, - "凯亚_EN": 429, - "阿贝多_EN": 430, - "温迪_EN": 431, - "枫原万叶_EN": 432, - "钟离_EN": 433, - "荒泷一斗_EN": 434, - "八重神子_EN": 435, - "艾尔海森_EN": 436, - "提纳里_EN": 437, - "迪希雅_EN": 438, - "卡维_EN": 439, - "宵宫_EN": 440, - "莱依拉_EN": 441, - "那维莱特_EN": 442, - "赛诺_EN": 443, - "莫娜_EN": 444, - "诺艾尔_EN": 445, - "托马_EN": 446, - "凝光_EN": 447, - "林尼_EN": 448, - "北斗_EN": 449, - "柯莱_EN": 450, - "神里绫华_EN": 451, - "可莉_EN": 452, - "芭芭拉_EN": 453, - "雷电将军_EN": 454, - "娜维娅_EN": 455, - "芙宁娜_EN": 456, - "珊瑚宫心海_EN": 457, - "鹿野院平藏_EN": 458, - "迪奥娜_EN": 459, - "五郎_EN": 460, - "琴_EN": 461, - "班尼特_EN": 462, - "达达利亚_EN": 463, - "安柏_EN": 464, - "莱欧斯利_EN": 465, - "夜兰_EN": 466, - "妮露_EN": 467, - "辛焱_EN": 468, - "珐露珊_EN": 469, - "丽莎_EN": 470, - "魈_EN": 471, - "香菱_EN": 472, - "迪卢克_EN": 473, - "砂糖_EN": 474, - "烟绯_EN": 475, - "早柚_EN": 476, - "云堇_EN": 477, - "刻晴_EN": 478, - "重云_EN": 479, - "优菈_EN": 480, - "胡桃_EN": 481, - "流浪者_EN": 482, - "久岐忍_EN": 483, - "神里绫人_EN": 484, - "甘雨_EN": 485, - "戴因斯雷布_EN": 486, - "菲谢尔_EN": 487, - "白术_EN": 488, - "行秋_EN": 489, - "九条裟罗_EN": 490, - "夏洛蒂_EN": 491, - "雷泽_EN": 492, - "申鹤_EN": 493, - "荧_EN": 494, - "空_EN": 495, - "迪娜泽黛_EN": 496, - "凯瑟琳_EN": 497, - "多莉_EN": 498, - "坎蒂丝_EN": 499, - "琳妮特_EN": 500, - "萍姥姥_EN": 501, - "罗莎莉亚_EN": 502, - "埃德_EN": 503, - "爱贝尔_EN": 504, - "伊迪娅_EN": 505, - "留云借风真君_EN": 506, - "绮良良_EN": 507, - "七七_EN": 508, - "式大将_EN": 509, - "瑶瑶_EN": 510, - "奥兹_EN": 511, - "菲米尼_EN": 512, - "米卡_EN": 513, - "哲平_EN": 514, - "大肉丸_EN": 515, - "托克_EN": 516, - "蒂玛乌斯_EN": 517, - "昆钧_EN": 518, - "欧菲妮_EN": 519, - "塞琉斯_EN": 520, - "仆人_EN": 521, - "迈勒斯_EN": 522, - "希格雯_EN": 523, - "阿守_EN": 524, - "拉赫曼_EN": 525, - "杜拉夫_EN": 526, - "伊利亚斯_EN": 527, - "阿晃_EN": 528, - "旁白_EN": 529, - "爱德琳_EN": 530, - "埃洛伊_EN": 531, - "德沃沙克_EN": 532, - "玛乔丽_EN": 533, - "塞塔蕾_EN": 534, - "柊千里_EN": 535, - "海芭夏_EN": 536, - "九条镰治_EN": 537, - "阿娜耶_EN": 538, - "笼钓瓶一心_EN": 539, - "回声海螺_EN": 540, - "劳维克_EN": 541, - "元太_EN": 542, - "阿扎尔_EN": 543, - "查尔斯_EN": 544, - "阿洛瓦_EN": 545, - "埃勒曼_EN": 546, - "纳比尔_EN": 547, - "莎拉_EN": 548, - "康纳_EN": 549, - "博来_EN": 550, - "玛塞勒_EN": 551, - "阿祇_EN": 552, - "博士_EN": 553, - "迪尔菲_EN": 554, - "宛烟_EN": 555, - "玛格丽特_EN": 556, - "羽生田千鹤_EN": 557, - "海妮耶_EN": 558, - "霍夫曼_EN": 559, - "旅行者_EN": 560, - "佐西摩斯_EN": 561, - "鹿野奈奈_EN": 562, - "舒伯特_EN": 563, - "天叔_EN": 564, - "艾莉丝_EN": 565, - "龙二_EN": 566, - "莺儿_EN": 567, - "嘉良_EN": 568, - "珊瑚_EN": 569, - "费迪南德_EN": 570, - "言笑_EN": 571, - "一心传名刀_EN": 572, - "久利须_EN": 573, - "嘉玛_EN": 574, - "艾文_EN": 575, - "克洛琳德_EN": 576, - "丹吉尔_EN": 577, - "女士_EN": 578, - "天目十五_EN": 579, - "老孟_EN": 580, - "白老先生_EN": 581, - "舍利夫_EN": 582, - "巴达维_EN": 583, - "拉齐_EN": 584, - "长生_EN": 585, - "吴船长_EN": 586, - "艾伯特_EN": 587, - "松浦_EN": 588, - "埃泽_EN": 589, - "阿圆_EN": 590, - "阿拉夫_EN": 591, - "莫塞伊思_EN": 592, - "石头_EN": 593, - "百闻_EN": 594, - "杜吉耶_EN": 595, - "波洛_EN": 596, - "斯坦利_EN": 597, - "掇星攫辰天君_EN": 598, - "迈蒙_EN": 599, - "博易_EN": 600, - "诗筠_EN": 601, - "毗伽尔_EN": 602, - "慧心_EN": 603, - "芙卡洛斯_EN": 604, - "恶龙_EN": 605, - "小仓澪_EN": 606, - "恕筠_EN": 607, - "知易_EN": 608, - "克列门特_EN": 609, - "大慈树王_EN": 610, - "维多利亚_EN": 611, - "黑田_EN": 612, - "马姆杜_EN": 613, - "科林斯_EN": 614, - "上杉_EN": 615, - "西拉杰_EN": 616, - "宁禄_EN": 617, - "纯水精灵_EN": 
618, - "常九爷_EN": 619, - "阿尔卡米_EN": 620, - "沙扎曼_EN": 621, - "田铁嘴_EN": 622, - "加萨尼_EN": 623, - "克罗索_EN": 624, - "星稀_EN": 625, - "莱斯格_EN": 626, - "阿巴图伊_EN": 627, - "悦_EN": 628, - "德田_EN": 629, - "埃尔欣根_EN": 630, - "阿佩普_EN": 631, - "萨赫哈蒂_EN": 632, - "洛伦佐_EN": 633, - "塔杰·拉德卡尼_EN": 634, - "泽田_EN": 635, - "安西_EN": 636, - "理水叠山真君_EN": 637, + "浮游水蕈兽·元素生命_JP": 285, + "大肉丸_JP": 286, + "托克_JP": 287, + "蒂玛乌斯_JP": 288, + "昆钧_JP": 289, + "欧菲妮_JP": 290, + "塞琉斯_JP": 291, + "仆人_JP": 292, + "迈勒斯_JP": 293, + "希格雯_JP": 294, + "阿守_JP": 295, + "拉赫曼_JP": 296, + "杜拉夫_JP": 297, + "伊利亚斯_JP": 298, + "阿晃_JP": 299, + "旁白_JP": 300, + "爱德琳_JP": 301, + "埃洛伊_JP": 302, + "德沃沙克_JP": 303, + "玛乔丽_JP": 304, + "塞塔蕾_JP": 305, + "柊千里_JP": 306, + "海芭夏_JP": 307, + "九条镰治_JP": 308, + "阿娜耶_JP": 309, + "笼钓瓶一心_JP": 310, + "回声海螺_JP": 311, + "劳维克_JP": 312, + "元太_JP": 313, + "阿扎尔_JP": 314, + "查尔斯_JP": 315, + "阿洛瓦_JP": 316, + "埃勒曼_JP": 317, + "纳比尔_JP": 318, + "莎拉_JP": 319, + "康纳_JP": 320, + "博来_JP": 321, + "玛塞勒_JP": 322, + "阿祇_JP": 323, + "博士_JP": 324, + "迪尔菲_JP": 325, + "玛格丽特_JP": 326, + "宛烟_JP": 327, + "羽生田千鹤_JP": 328, + "海妮耶_JP": 329, + "霍夫曼_JP": 330, + "旅行者_JP": 331, + "佐西摩斯_JP": 332, + "舒伯特_JP": 333, + "鹿野奈奈_JP": 334, + "天叔_JP": 335, + "龙二_JP": 336, + "艾莉丝_JP": 337, + "莺儿_JP": 338, + "嘉良_JP": 339, + "珊瑚_JP": 340, + "言笑_JP": 341, + "一心传名刀_JP": 342, + "费迪南德_JP": 343, + "久利须_JP": 344, + "嘉玛_JP": 345, + "艾文_JP": 346, + "克洛琳德_JP": 347, + "丹吉尔_JP": 348, + "天目十五_JP": 349, + "女士_JP": 350, + "老孟_JP": 351, + "白老先生_JP": 352, + "舍利夫_JP": 353, + "巴达维_JP": 354, + "拉齐_JP": 355, + "长生_JP": 356, + "吴船长_JP": 357, + "艾伯特_JP": 358, + "松浦_JP": 359, + "埃泽_JP": 360, + "阿圆_JP": 361, + "阿拉夫_JP": 362, + "莫塞伊思_JP": 363, + "石头_JP": 364, + "百闻_JP": 365, + "杜吉耶_JP": 366, + "波洛_JP": 367, + "掇星攫辰天君_JP": 368, + "迈蒙_JP": 369, + "博易_JP": 370, + "诗筠_JP": 371, + "斯坦利_JP": 372, + "毗伽尔_JP": 373, + "芙卡洛斯_JP": 374, + "恶龙_JP": 375, + "小仓澪_JP": 376, + "恕筠_JP": 377, + "知易_JP": 378, + "克列门特_JP": 379, + "大慈树王_JP": 380, + "望雅_JP": 381, + "黑田_JP": 382, + "卡莉娜_JP": 383, + "马姆杜_JP": 384, + "科林斯_JP": 385, + "上杉_JP": 386, + "西拉杰_JP": 387, + "菲尔戈黛特_JP": 388, + "一平_JP": 389, + "纯水精灵_JP": 390, + "阿尔卡米_JP": 391, + "老戴_JP": 392, + "谢赫祖拜尔_JP": 393, + "沙扎曼_JP": 394, + "田铁嘴_JP": 395, + "小野寺_JP": 396, + "百识_JP": 397, + "克罗索_JP": 398, + "莱斯格_JP": 399, + "芷巧_JP": 400, + "加藤洋平_JP": 401, + "阿巴图伊_JP": 402, + "埃尔欣根_JP": 403, + "斯嘉莉_JP": 404, + "阿佩普_JP": 405, + "巫女_JP": 406, + "卡布斯_JP": 407, + "洛伦佐_JP": 408, + "萨赫哈蒂_JP": 409, + "娜德瓦_JP": 410, + "塞德娜_JP": 411, + "塔杰·拉德卡尼_JP": 412, + "绘星_JP": 413, + "泽田_JP": 414, + "安西_JP": 415, + "拉伊德_JP": 416, + "亚卡巴_JP": 417, + "有乐斋_JP": 418, + "莱昂_JP": 419, + "尤苏波夫_JP": 420, + "夏妮_JP": 421, + "埃舍尔_JP": 422, + "萨齐因_JP": 423, + "古山_JP": 424, + "自称渊上之物_JP": 425, + "丹羽_JP": 426, + "塞萨尔的日记_JP": 427, + "派蒙_EN": 428, + "纳西妲_EN": 429, + "凯亚_EN": 430, + "阿贝多_EN": 431, + "温迪_EN": 432, + "枫原万叶_EN": 433, + "钟离_EN": 434, + "荒泷一斗_EN": 435, + "八重神子_EN": 436, + "艾尔海森_EN": 437, + "提纳里_EN": 438, + "迪希雅_EN": 439, + "卡维_EN": 440, + "宵宫_EN": 441, + "莱依拉_EN": 442, + "那维莱特_EN": 443, + "赛诺_EN": 444, + "莫娜_EN": 445, + "诺艾尔_EN": 446, + "托马_EN": 447, + "凝光_EN": 448, + "林尼_EN": 449, + "北斗_EN": 450, + "柯莱_EN": 451, + "神里绫华_EN": 452, + "可莉_EN": 453, + "芭芭拉_EN": 454, + "雷电将军_EN": 455, + "娜维娅_EN": 456, + "芙宁娜_EN": 457, + "珊瑚宫心海_EN": 458, + "鹿野院平藏_EN": 459, + "迪奥娜_EN": 460, + "五郎_EN": 461, + "琴_EN": 462, + "班尼特_EN": 463, + "达达利亚_EN": 464, + "安柏_EN": 465, + "莱欧斯利_EN": 466, + "夜兰_EN": 467, + "妮露_EN": 468, + "辛焱_EN": 469, + "珐露珊_EN": 470, + "丽莎_EN": 471, + "魈_EN": 472, + "香菱_EN": 473, + "迪卢克_EN": 474, + 
"砂糖_EN": 475, + "烟绯_EN": 476, + "早柚_EN": 477, + "云堇_EN": 478, + "刻晴_EN": 479, + "重云_EN": 480, + "优菈_EN": 481, + "胡桃_EN": 482, + "流浪者_EN": 483, + "久岐忍_EN": 484, + "神里绫人_EN": 485, + "甘雨_EN": 486, + "戴因斯雷布_EN": 487, + "菲谢尔_EN": 488, + "白术_EN": 489, + "行秋_EN": 490, + "九条裟罗_EN": 491, + "夏洛蒂_EN": 492, + "雷泽_EN": 493, + "申鹤_EN": 494, + "荧_EN": 495, + "空_EN": 496, + "迪娜泽黛_EN": 497, + "凯瑟琳_EN": 498, + "多莉_EN": 499, + "坎蒂丝_EN": 500, + "琳妮特_EN": 501, + "萍姥姥_EN": 502, + "罗莎莉亚_EN": 503, + "埃德_EN": 504, + "爱贝尔_EN": 505, + "伊迪娅_EN": 506, + "留云借风真君_EN": 507, + "绮良良_EN": 508, + "陌生人_EN": 509, + "七七_EN": 510, + "式大将_EN": 511, + "瑶瑶_EN": 512, + "奥兹_EN": 513, + "菲米尼_EN": 514, + "米卡_EN": 515, + "哲平_EN": 516, + "浮游水蕈兽·元素生命_EN": 517, + "大肉丸_EN": 518, + "托克_EN": 519, + "蒂玛乌斯_EN": 520, + "昆钧_EN": 521, + "欧菲妮_EN": 522, + "塞琉斯_EN": 523, + "仆人_EN": 524, + "迈勒斯_EN": 525, + "希格雯_EN": 526, + "阿守_EN": 527, + "拉赫曼_EN": 528, + "杜拉夫_EN": 529, + "伊利亚斯_EN": 530, + "阿晃_EN": 531, + "旁白_EN": 532, + "爱德琳_EN": 533, + "埃洛伊_EN": 534, + "德沃沙克_EN": 535, + "玛乔丽_EN": 536, + "塞塔蕾_EN": 537, + "柊千里_EN": 538, + "海芭夏_EN": 539, + "九条镰治_EN": 540, + "阿娜耶_EN": 541, + "笼钓瓶一心_EN": 542, + "回声海螺_EN": 543, + "劳维克_EN": 544, + "元太_EN": 545, + "阿扎尔_EN": 546, + "查尔斯_EN": 547, + "阿洛瓦_EN": 548, + "埃勒曼_EN": 549, + "纳比尔_EN": 550, + "莎拉_EN": 551, + "康纳_EN": 552, + "博来_EN": 553, + "玛塞勒_EN": 554, + "阿祇_EN": 555, + "博士_EN": 556, + "迪尔菲_EN": 557, + "宛烟_EN": 558, + "玛格丽特_EN": 559, + "羽生田千鹤_EN": 560, + "海妮耶_EN": 561, + "霍夫曼_EN": 562, + "旅行者_EN": 563, + "佐西摩斯_EN": 564, + "鹿野奈奈_EN": 565, + "舒伯特_EN": 566, + "天叔_EN": 567, + "艾莉丝_EN": 568, + "龙二_EN": 569, + "莺儿_EN": 570, + "嘉良_EN": 571, + "珊瑚_EN": 572, + "费迪南德_EN": 573, + "言笑_EN": 574, + "一心传名刀_EN": 575, + "久利须_EN": 576, + "嘉玛_EN": 577, + "艾文_EN": 578, + "克洛琳德_EN": 579, + "丹吉尔_EN": 580, + "女士_EN": 581, + "天目十五_EN": 582, + "老孟_EN": 583, + "白老先生_EN": 584, + "舍利夫_EN": 585, + "巴达维_EN": 586, + "拉齐_EN": 587, + "长生_EN": 588, + "吴船长_EN": 589, + "艾伯特_EN": 590, + "松浦_EN": 591, + "埃泽_EN": 592, + "阿圆_EN": 593, + "阿拉夫_EN": 594, + "莫塞伊思_EN": 595, + "石头_EN": 596, + "百闻_EN": 597, + "杜吉耶_EN": 598, + "波洛_EN": 599, + "斯坦利_EN": 600, + "掇星攫辰天君_EN": 601, + "迈蒙_EN": 602, + "博易_EN": 603, + "诗筠_EN": 604, + "毗伽尔_EN": 605, + "慧心_EN": 606, + "芙卡洛斯_EN": 607, + "恶龙_EN": 608, + "小仓澪_EN": 609, + "恕筠_EN": 610, + "知易_EN": 611, + "克列门特_EN": 612, + "大慈树王_EN": 613, + "维多利亚_EN": 614, + "黑田_EN": 615, + "马姆杜_EN": 616, + "科林斯_EN": 617, + "上杉_EN": 618, + "西拉杰_EN": 619, + "宁禄_EN": 620, + "纯水精灵_EN": 621, + "常九爷_EN": 622, + "阿尔卡米_EN": 623, + "沙扎曼_EN": 624, + "田铁嘴_EN": 625, + "加萨尼_EN": 626, + "克罗索_EN": 627, + "星稀_EN": 628, + "莱斯格_EN": 629, + "阿巴图伊_EN": 630, + "埃尔欣根_EN": 631, + "阿佩普_EN": 632, + "萨赫哈蒂_EN": 633, + "洛伦佐_EN": 634, + "塔杰·拉德卡尼_EN": 635, + "泽田_EN": 636, + "安西_EN": 637, "埃舍尔_EN": 638, - "萨齐因_EN": 639, - "古田_EN": 640, - "三月七_ZH": 641, - "丹恒_ZH": 642, - "希儿_ZH": 643, - "娜塔莎_ZH": 644, - "希露瓦_ZH": 645, - "瓦尔特_ZH": 646, - "佩拉_ZH": 647, - "布洛妮娅_ZH": 648, - "虎克_ZH": 649, - "素裳_ZH": 650, - "克拉拉_ZH": 651, - "符玄_ZH": 652, - "白露_ZH": 653, - "杰帕德_ZH": 654, - "景元_ZH": 655, - "藿藿_ZH": 656, - "姬子_ZH": 657, - "穹_ZH": 658, - "星_ZH": 659, - "卡芙卡_ZH": 660, - "桂乃芬_ZH": 661, - "艾丝妲_ZH": 662, - "玲可_ZH": 663, - "彦卿_ZH": 664, - "托帕_ZH": 665, - "驭空_ZH": 666, - "浮烟_ZH": 667, - "停云_ZH": 668, - "镜流_ZH": 669, - "罗刹_ZH": 670, - "卢卡_ZH": 671, - "史瓦罗_ZH": 672, - "黑塔_ZH": 673, - "桑博_ZH": 674, - "伦纳德_ZH": 675, - "明曦_ZH": 676, - "银狼_ZH": 677, - "帕姆_ZH": 678, - "青雀_ZH": 679, - "乔瓦尼_ZH": 680, - "公输师傅_ZH": 681, - "晴霓_ZH": 682, - "螺丝咕姆_ZH": 683, - "阿兰_ZH": 684, - "奥列格_ZH": 685, - "丹枢_ZH": 686, - 
"尾巴_ZH": 687, - "寒鸦_ZH": 688, - "雪衣_ZH": 689, - "可可利亚_ZH": 690, - "青镞_ZH": 691, - "半夏_ZH": 692, - "银枝_ZH": 693, - "大毫_ZH": 694, - "霄翰_ZH": 695, - "信使_ZH": 696, - "费斯曼_ZH": 697, - "绿芙蓉_ZH": 698, - "dev_成男_ZH": 699, - "金人会长_ZH": 700, - "维利特_ZH": 701, - "维尔德_ZH": 702, - "斯科特_ZH": 703, - "卡波特_ZH": 704, - "刃_ZH": 705, - "岩明_ZH": 706, - "浣溪_ZH": 707, - "三月七_JP": 708, - "丹恒_JP": 709, - "希儿_JP": 710, - "娜塔莎_JP": 711, - "希露瓦_JP": 712, - "瓦尔特_JP": 713, - "佩拉_JP": 714, - "布洛妮娅_JP": 715, - "虎克_JP": 716, - "素裳_JP": 717, - "克拉拉_JP": 718, - "符玄_JP": 719, - "白露_JP": 720, - "杰帕德_JP": 721, - "景元_JP": 722, - "藿藿_JP": 723, - "姬子_JP": 724, - "卡芙卡_JP": 725, - "穹_JP": 726, - "星_JP": 727, - "桂乃芬_JP": 728, - "艾丝妲_JP": 729, - "彦卿_JP": 730, - "玲可_JP": 731, - "托帕_JP": 732, - "驭空_JP": 733, - "浮烟_JP": 734, - "停云_JP": 735, - "镜流_JP": 736, - "罗刹_JP": 737, - "卢卡_JP": 738, - "史瓦罗_JP": 739, - "黑塔_JP": 740, - "桑博_JP": 741, - "伦纳德_JP": 742, - "明曦_JP": 743, - "银狼_JP": 744, - "帕姆_JP": 745, - "青雀_JP": 746, - "乔瓦尼_JP": 747, - "公输师傅_JP": 748, - "晴霓_JP": 749, - "螺丝咕姆_JP": 750, - "阿兰_JP": 751, - "奥列格_JP": 752, - "丹枢_JP": 753, - "尾巴_JP": 754, - "寒鸦_JP": 755, - "雪衣_JP": 756, - "可可利亚_JP": 757, - "青镞_JP": 758, - "半夏_JP": 759, - "银枝_JP": 760, - "大毫_JP": 761, - "霄翰_JP": 762, - "信使_JP": 763, - "费斯曼_JP": 764, - "绿芙蓉_JP": 765, - "dev_成男_JP": 766, - "金人会长_JP": 767, - "维利特_JP": 768, - "维尔德_JP": 769, - "斯科特_JP": 770, - "刃_JP": 771, - "卡波特_JP": 772, - "岩明_JP": 773, - "浣溪_JP": 774, - "净砚_JP": 775, - "紫月季_JP": 776, - "歌蒂_JP": 777, - "奇怪的云骑_JP": 778, - "幻胧_JP": 779, - "斯薇塔_JP": 780, - "隐书_JP": 781, - "三月七_EN": 782, - "丹恒_EN": 783, - "希儿_EN": 784, - "娜塔莎_EN": 785, - "希露瓦_EN": 786, - "瓦尔特_EN": 787, - "佩拉_EN": 788, - "布洛妮娅_EN": 789, - "虎克_EN": 790, - "素裳_EN": 791, - "克拉拉_EN": 792, - "符玄_EN": 793, - "白露_EN": 794, - "杰帕德_EN": 795, - "景元_EN": 796, - "藿藿_EN": 797, - "姬子_EN": 798, - "卡芙卡_EN": 799, - "穹_EN": 800, - "星_EN": 801, - "桂乃芬_EN": 802, - "艾丝妲_EN": 803, - "彦卿_EN": 804, - "玲可_EN": 805, - "托帕_EN": 806, - "驭空_EN": 807, - "浮烟_EN": 808, - "停云_EN": 809, - "镜流_EN": 810, - "罗刹_EN": 811, - "卢卡_EN": 812, - "史瓦罗_EN": 813, - "黑塔_EN": 814, - "桑博_EN": 815, - "伦纳德_EN": 816, - "明曦_EN": 817, - "银狼_EN": 818, - "帕姆_EN": 819, - "青雀_EN": 820, - "乔瓦尼_EN": 821, - "公输师傅_EN": 822, - "晴霓_EN": 823, - "螺丝咕姆_EN": 824, - "阿兰_EN": 825, - "奥列格_EN": 826, - "丹枢_EN": 827, - "尾巴_EN": 828, - "寒鸦_EN": 829, - "雪衣_EN": 830, - "可可利亚_EN": 831, - "青镞_EN": 832, - "半夏_EN": 833, - "银枝_EN": 834, - "大毫_EN": 835, - "霄翰_EN": 836, - "信使_EN": 837, - "费斯曼_EN": 838, - "绿芙蓉_EN": 839, - "dev_成男_EN": 840, - "金人会长_EN": 841, - "维利特_EN": 842, - "维尔德_EN": 843, - "刃_EN": 844, - "卡波特_EN": 845, - "岩明_EN": 846, - "浣溪_EN": 847, - "紫月季_EN": 848, - "幻胧_EN": 849, - "女声_EN": 850, - "陆景和": 851, - "莫弈": 852, - "左然": 853, - "夏彦": 854 + "三月七_ZH": 639, + "丹恒_ZH": 640, + "希儿_ZH": 641, + "娜塔莎_ZH": 642, + "希露瓦_ZH": 643, + "瓦尔特_ZH": 644, + "佩拉_ZH": 645, + "布洛妮娅_ZH": 646, + "虎克_ZH": 647, + "素裳_ZH": 648, + "克拉拉_ZH": 649, + "符玄_ZH": 650, + "白露_ZH": 651, + "杰帕德_ZH": 652, + "景元_ZH": 653, + "藿藿_ZH": 654, + "姬子_ZH": 655, + "穹_ZH": 656, + "星_ZH": 657, + "卡芙卡_ZH": 658, + "桂乃芬_ZH": 659, + "艾丝妲_ZH": 660, + "玲可_ZH": 661, + "彦卿_ZH": 662, + "托帕_ZH": 663, + "驭空_ZH": 664, + "浮烟_ZH": 665, + "停云_ZH": 666, + "镜流_ZH": 667, + "罗刹_ZH": 668, + "卢卡_ZH": 669, + "史瓦罗_ZH": 670, + "黑塔_ZH": 671, + "桑博_ZH": 672, + "伦纳德_ZH": 673, + "明曦_ZH": 674, + "银狼_ZH": 675, + "帕姆_ZH": 676, + "青雀_ZH": 677, + "乔瓦尼_ZH": 678, + "公输师傅_ZH": 679, + "晴霓_ZH": 680, + "螺丝咕姆_ZH": 681, + "阿兰_ZH": 682, + "奥列格_ZH": 683, + "丹枢_ZH": 684, + "尾巴_ZH": 685, + "寒鸦_ZH": 686, + "雪衣_ZH": 687, + 
"可可利亚_ZH": 688, + "青镞_ZH": 689, + "半夏_ZH": 690, + "银枝_ZH": 691, + "大毫_ZH": 692, + "霄翰_ZH": 693, + "信使_ZH": 694, + "费斯曼_ZH": 695, + "绿芙蓉_ZH": 696, + "金人会长_ZH": 697, + "维利特_ZH": 698, + "维尔德_ZH": 699, + "斯科特_ZH": 700, + "卡波特_ZH": 701, + "刃_ZH": 702, + "岩明_ZH": 703, + "浣溪_ZH": 704, + "三月七_JP": 705, + "丹恒_JP": 706, + "希儿_JP": 707, + "娜塔莎_JP": 708, + "希露瓦_JP": 709, + "瓦尔特_JP": 710, + "佩拉_JP": 711, + "布洛妮娅_JP": 712, + "虎克_JP": 713, + "素裳_JP": 714, + "克拉拉_JP": 715, + "符玄_JP": 716, + "白露_JP": 717, + "杰帕德_JP": 718, + "景元_JP": 719, + "藿藿_JP": 720, + "姬子_JP": 721, + "卡芙卡_JP": 722, + "穹_JP": 723, + "星_JP": 724, + "桂乃芬_JP": 725, + "艾丝妲_JP": 726, + "彦卿_JP": 727, + "玲可_JP": 728, + "托帕_JP": 729, + "驭空_JP": 730, + "浮烟_JP": 731, + "停云_JP": 732, + "镜流_JP": 733, + "罗刹_JP": 734, + "卢卡_JP": 735, + "史瓦罗_JP": 736, + "黑塔_JP": 737, + "桑博_JP": 738, + "伦纳德_JP": 739, + "明曦_JP": 740, + "银狼_JP": 741, + "帕姆_JP": 742, + "青雀_JP": 743, + "乔瓦尼_JP": 744, + "公输师傅_JP": 745, + "晴霓_JP": 746, + "螺丝咕姆_JP": 747, + "阿兰_JP": 748, + "奥列格_JP": 749, + "丹枢_JP": 750, + "尾巴_JP": 751, + "寒鸦_JP": 752, + "雪衣_JP": 753, + "可可利亚_JP": 754, + "青镞_JP": 755, + "半夏_JP": 756, + "银枝_JP": 757, + "大毫_JP": 758, + "霄翰_JP": 759, + "信使_JP": 760, + "费斯曼_JP": 761, + "绿芙蓉_JP": 762, + "金人会长_JP": 763, + "维利特_JP": 764, + "维尔德_JP": 765, + "斯科特_JP": 766, + "刃_JP": 767, + "卡波特_JP": 768, + "岩明_JP": 769, + "浣溪_JP": 770, + "净砚_JP": 771, + "紫月季_JP": 772, + "歌蒂_JP": 773, + "奇怪的云骑_JP": 774, + "幻胧_JP": 775, + "斯薇塔_JP": 776, + "隐书_JP": 777, + "三月七_EN": 778, + "丹恒_EN": 779, + "希儿_EN": 780, + "娜塔莎_EN": 781, + "希露瓦_EN": 782, + "瓦尔特_EN": 783, + "佩拉_EN": 784, + "布洛妮娅_EN": 785, + "虎克_EN": 786, + "素裳_EN": 787, + "克拉拉_EN": 788, + "符玄_EN": 789, + "白露_EN": 790, + "杰帕德_EN": 791, + "景元_EN": 792, + "藿藿_EN": 793, + "姬子_EN": 794, + "卡芙卡_EN": 795, + "穹_EN": 796, + "星_EN": 797, + "桂乃芬_EN": 798, + "艾丝妲_EN": 799, + "彦卿_EN": 800, + "玲可_EN": 801, + "托帕_EN": 802, + "驭空_EN": 803, + "浮烟_EN": 804, + "停云_EN": 805, + "镜流_EN": 806, + "罗刹_EN": 807, + "卢卡_EN": 808, + "史瓦罗_EN": 809, + "黑塔_EN": 810, + "桑博_EN": 811, + "伦纳德_EN": 812, + "明曦_EN": 813, + "银狼_EN": 814, + "帕姆_EN": 815, + "青雀_EN": 816, + "乔瓦尼_EN": 817, + "公输师傅_EN": 818, + "晴霓_EN": 819, + "螺丝咕姆_EN": 820, + "阿兰_EN": 821, + "奥列格_EN": 822, + "丹枢_EN": 823, + "尾巴_EN": 824, + "寒鸦_EN": 825, + "雪衣_EN": 826, + "可可利亚_EN": 827, + "青镞_EN": 828, + "半夏_EN": 829, + "银枝_EN": 830, + "大毫_EN": 831, + "霄翰_EN": 832, + "信使_EN": 833, + "费斯曼_EN": 834, + "绿芙蓉_EN": 835, + "金人会长_EN": 836, + "维利特_EN": 837, + "维尔德_EN": 838, + "刃_EN": 839, + "卡波特_EN": 840, + "岩明_EN": 841, + "浣溪_EN": 842, + "紫月季_EN": 843, + "幻胧_EN": 844, + "女声_EN": 845, + "陆景和": 846, + "莫弈": 847, + "左然": 848, + "夏彦": 849 } }, "model": { @@ -947,7 +944,14 @@ ], "n_layers_q": 3, "use_spectral_norm": false, - "gin_channels": 256 + "gin_channels": 512, + "slm": { + "model": "./slm/wavlm-base-plus", + "sr": 16000, + "hidden": 768, + "nlayers": 13, + "initial_channel": 64 + } }, - "version": "2.2" + "version": "2.3" } diff --git a/data_utils.py b/data_utils.py index ef89656ff78bd9aefbfe456fe76f17186e50b384..ab9e6f25e562fe8605b21c9237f9b85da6be88b1 100644 --- a/data_utils.py +++ b/data_utils.py @@ -44,10 +44,6 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): self.min_text_len = getattr(hparams, "min_text_len", 1) self.max_text_len = getattr(hparams, "max_text_len", 384) - self.empty_emo = torch.squeeze( - torch.load("empty_emo.npy", map_location="cpu"), dim=1 - ) - random.seed(1234) random.shuffle(self.audiopaths_sid_text) self._filter() @@ -98,14 +94,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): 
spec, wav = self.get_audio(audiopath) sid = torch.LongTensor([int(self.spk_map[sid])]) - if np.random.rand() > 0.1: - emo = torch.squeeze( - torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"), - dim=1, - ) - else: - emo = self.empty_emo - return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo) + return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert) def get_audio(self, filename): audio, sampling_rate = load_wav_to_torch(filename) @@ -168,15 +157,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset): if language_str == "ZH": bert = bert_ori - ja_bert = torch.rand(1024, len(phone)) - en_bert = torch.rand(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "JP": - bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) ja_bert = bert_ori - en_bert = torch.rand(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "EN": - bert = torch.rand(1024, len(phone)) - ja_bert = torch.rand(1024, len(phone)) + bert = torch.randn(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) en_bert = bert_ori phone = torch.LongTensor(phone) tone = torch.LongTensor(tone) @@ -226,7 +215,6 @@ class TextAudioSpeakerCollate: bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len) - emo = torch.FloatTensor(len(batch), 512) spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) @@ -238,7 +226,6 @@ class TextAudioSpeakerCollate: bert_padded.zero_() ja_bert_padded.zero_() en_bert_padded.zero_() - emo.zero_() for i in range(len(ids_sorted_decreasing)): row = batch[ids_sorted_decreasing[i]] @@ -272,8 +259,6 @@ class TextAudioSpeakerCollate: en_bert = row[8] en_bert_padded[i, :, : en_bert.size(1)] = en_bert - emo[i, :] = row[9] - return ( text_padded, text_lengths, @@ -287,7 +272,6 @@ class TextAudioSpeakerCollate: bert_padded, ja_bert_padded, en_bert_padded, - emo, ) diff --git a/default_config.yml b/default_config.yml index 1158816ad0324f21e87eea40fa186e130ecc7b5a..b8803a8da245dd01db5b679cfbf3e8900fe9d036 100644 --- a/default_config.yml +++ b/default_config.yml @@ -83,11 +83,11 @@ train_ms: base: use_base_model: false repo_id: "Stardust_minus/Bert-VITS2" - model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名 + model_image: "Bert-VITS2_2.3底模" # openi网页的模型名 # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下 model: "models" # 配置文件路径 - config_path: "configs/config.json" + config_path: "config.json" # 训练使用的worker,不建议超过CPU核心数 num_workers: 16 # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。 @@ -104,7 +104,7 @@ webui: # 模型路径 model: "models/G_8000.pth" # 配置文件路径 - config_path: "configs/config.json" + config_path: "config.json" # 端口号 port: 7860 # 是否公开部署,对外网开放 diff --git a/export_onnx.py b/export_onnx.py index 0e6be5240746724b535c7fe3ce7bbb103cc88db4..6cbbf6f4f63d65ff6f522898294c5fc000aab18e 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -2,11 +2,13 @@ from onnx_modules import export_onnx import os if __name__ == "__main__": - export_path = "MyModel" - model_path = "S:\\VSGIT\\bert-vits2\\G_178000.pth" - config_path = "S:\\VSGIT\\bert-vits2\\config.json" + export_path = "BertVits2.2PT" + model_path = "model\\G_0.pth" + config_path = "model\\config.json" + novq = False + dev = False if not 
os.path.exists("onnx"): os.makedirs("onnx") if not os.path.exists(f"onnx/{export_path}"): os.makedirs(f"onnx/{export_path}") - export_onnx(export_path, model_path, config_path) + export_onnx(export_path, model_path, config_path, novq, dev) diff --git a/for_deploy/infer.py b/for_deploy/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..5168109694c2aa8e1fd41c6bba317f65ceb15e8b --- /dev/null +++ b/for_deploy/infer.py @@ -0,0 +1,386 @@ +""" +版本管理、兼容推理及模型加载实现。 +版本说明: + 1. 版本号与github的release版本号对应,使用哪个release版本训练的模型即对应其版本号 + 2. 请在模型的config.json中显示声明版本号,添加一个字段"version" : "你的版本号" +特殊版本说明: + 1.1.1-fix: 1.1.1版本训练的模型,但是在推理时使用dev的日语修复 + 2.2:当前版本 +""" +import torch +import commons +from text import cleaned_text_to_sequence, get_bert +from clap_wrapper import get_clap_audio_feature, get_clap_text_feature +from text.cleaner import clean_text +import utils +import numpy as np + +from models import SynthesizerTrn +from text.symbols import symbols + +from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn +from oldVersion.V210.text import symbols as V210symbols +from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn +from oldVersion.V200.text import symbols as V200symbols +from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn +from oldVersion.V111.text import symbols as V111symbols +from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn +from oldVersion.V110.text import symbols as V110symbols +from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn +from oldVersion.V101.text import symbols as V101symbols + +from oldVersion import V111, V110, V101, V200, V210 + +# 当前版本信息 +latest_version = "2.2" + +# 版本兼容 +SynthesizerTrnMap = { + "2.1": V210SynthesizerTrn, + "2.0.2-fix": V200SynthesizerTrn, + "2.0.1": V200SynthesizerTrn, + "2.0": V200SynthesizerTrn, + "1.1.1-fix": V111SynthesizerTrn, + "1.1.1": V111SynthesizerTrn, + "1.1": V110SynthesizerTrn, + "1.1.0": V110SynthesizerTrn, + "1.0.1": V101SynthesizerTrn, + "1.0": V101SynthesizerTrn, + "1.0.0": V101SynthesizerTrn, +} + +symbolsMap = { + "2.1": V210symbols, + "2.0.2-fix": V200symbols, + "2.0.1": V200symbols, + "2.0": V200symbols, + "1.1.1-fix": V111symbols, + "1.1.1": V111symbols, + "1.1": V110symbols, + "1.1.0": V110symbols, + "1.0.1": V101symbols, + "1.0": V101symbols, + "1.0.0": V101symbols, +} + + +# def get_emo_(reference_audio, emotion, sid): +# emo = ( +# torch.from_numpy(get_emo(reference_audio)) +# if reference_audio and emotion == -1 +# else torch.FloatTensor( +# np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy") +# ) +# ) +# return emo + + +def get_net_g(model_path: str, version: str, device: str, hps): + if version != latest_version: + net_g = SynthesizerTrnMap[version]( + len(symbolsMap[version]), + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + else: + # 当前版本模型 net_g + net_g = SynthesizerTrn( + len(symbols), + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to(device) + _ = net_g.eval() + _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True) + return net_g + + +def get_text(text, language_str, bert, hps, device): + # 在此处实现当前版本的get_text + norm_text, phone, tone, word2ph = clean_text(text, language_str) + phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) + + if hps.data.add_blank: + phone = 
commons.intersperse(phone, 0) + tone = commons.intersperse(tone, 0) + language = commons.intersperse(language, 0) + for i in range(len(word2ph)): + word2ph[i] = word2ph[i] * 2 + word2ph[0] += 1 + # bert_ori = get_bert(norm_text, word2ph, language_str, device) + bert_ori = bert[language_str].get_bert_feature(norm_text, word2ph, device) + del word2ph + assert bert_ori.shape[-1] == len(phone), phone + + if language_str == "ZH": + bert = bert_ori + ja_bert = torch.randn(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) + elif language_str == "JP": + bert = torch.randn(1024, len(phone)) + ja_bert = bert_ori + en_bert = torch.randn(1024, len(phone)) + elif language_str == "EN": + bert = torch.randn(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) + en_bert = bert_ori + else: + raise ValueError("language_str should be ZH, JP or EN") + + assert bert.shape[-1] == len( + phone + ), f"Bert seq len {bert.shape[-1]} != {len(phone)}" + + phone = torch.LongTensor(phone) + tone = torch.LongTensor(tone) + language = torch.LongTensor(language) + return bert, ja_bert, en_bert, phone, tone, language + + +def infer( + text, + emotion, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + language, + hps, + net_g, + device, + bert=None, + clap=None, + reference_audio=None, + skip_start=False, + skip_end=False, +): + # 2.2版本参数位置变了 + # 2.1 参数新增 emotion reference_audio skip_start skip_end + inferMap_V3 = { + "2.1": V210.infer, + } + # 支持中日英三语版本 + inferMap_V2 = { + "2.0.2-fix": V200.infer, + "2.0.1": V200.infer, + "2.0": V200.infer, + "1.1.1-fix": V111.infer_fix, + "1.1.1": V111.infer, + "1.1": V110.infer, + "1.1.0": V110.infer, + } + # 仅支持中文版本 + # 在测试中,并未发现两个版本的模型不能互相通用 + inferMap_V1 = { + "1.0.1": V101.infer, + "1.0": V101.infer, + "1.0.0": V101.infer, + } + version = hps.version if hasattr(hps, "version") else latest_version + # 非当前版本,根据版本号选择合适的infer + if version != latest_version: + if version in inferMap_V3.keys(): + return inferMap_V3[version]( + text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + language, + hps, + net_g, + device, + reference_audio, + emotion, + skip_start, + skip_end, + ) + if version in inferMap_V2.keys(): + return inferMap_V2[version]( + text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + language, + hps, + net_g, + device, + ) + if version in inferMap_V1.keys(): + return inferMap_V1[version]( + text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + hps, + net_g, + device, + ) + # 在此处实现当前版本的推理 + # emo = get_emo_(reference_audio, emotion, sid) + if isinstance(reference_audio, np.ndarray): + emo = clap.get_clap_audio_feature(reference_audio, device) + else: + emo = clap.get_clap_text_feature(emotion, device) + emo = torch.squeeze(emo, dim=1) + + bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( + text, language, bert, hps, device + ) + if skip_start: + phones = phones[3:] + tones = tones[3:] + lang_ids = lang_ids[3:] + bert = bert[:, 3:] + ja_bert = ja_bert[:, 3:] + en_bert = en_bert[:, 3:] + if skip_end: + phones = phones[:-2] + tones = tones[:-2] + lang_ids = lang_ids[:-2] + bert = bert[:, :-2] + ja_bert = ja_bert[:, :-2] + en_bert = en_bert[:, :-2] + with torch.no_grad(): + x_tst = phones.to(device).unsqueeze(0) + tones = tones.to(device).unsqueeze(0) + lang_ids = lang_ids.to(device).unsqueeze(0) + bert = bert.to(device).unsqueeze(0) + ja_bert = ja_bert.to(device).unsqueeze(0) + en_bert = en_bert.to(device).unsqueeze(0) + x_tst_lengths = 
torch.LongTensor([phones.size(0)]).to(device) + emo = emo.to(device).unsqueeze(0) + del phones + speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) + audio = ( + net_g.infer( + x_tst, + x_tst_lengths, + speakers, + tones, + lang_ids, + bert, + ja_bert, + en_bert, + emo, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + )[0][0, 0] + .data.cpu() + .float() + .numpy() + ) + del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio + + +def infer_multilang( + text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + language, + hps, + net_g, + device, + bert=None, + clap=None, + reference_audio=None, + emotion=None, + skip_start=False, + skip_end=False, +): + bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], [] + # emo = get_emo_(reference_audio, emotion, sid) + if isinstance(reference_audio, np.ndarray): + emo = clap.get_clap_audio_feature(reference_audio, device) + else: + emo = clap.get_clap_text_feature(emotion, device) + emo = torch.squeeze(emo, dim=1) + for idx, (txt, lang) in enumerate(zip(text, language)): + skip_start = (idx != 0) or (skip_start and idx == 0) + skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1) + ( + temp_bert, + temp_ja_bert, + temp_en_bert, + temp_phones, + temp_tones, + temp_lang_ids, + ) = get_text(txt, lang, bert, hps, device) + if skip_start: + temp_bert = temp_bert[:, 3:] + temp_ja_bert = temp_ja_bert[:, 3:] + temp_en_bert = temp_en_bert[:, 3:] + temp_phones = temp_phones[3:] + temp_tones = temp_tones[3:] + temp_lang_ids = temp_lang_ids[3:] + if skip_end: + temp_bert = temp_bert[:, :-2] + temp_ja_bert = temp_ja_bert[:, :-2] + temp_en_bert = temp_en_bert[:, :-2] + temp_phones = temp_phones[:-2] + temp_tones = temp_tones[:-2] + temp_lang_ids = temp_lang_ids[:-2] + bert.append(temp_bert) + ja_bert.append(temp_ja_bert) + en_bert.append(temp_en_bert) + phones.append(temp_phones) + tones.append(temp_tones) + lang_ids.append(temp_lang_ids) + bert = torch.concatenate(bert, dim=1) + ja_bert = torch.concatenate(ja_bert, dim=1) + en_bert = torch.concatenate(en_bert, dim=1) + phones = torch.concatenate(phones, dim=0) + tones = torch.concatenate(tones, dim=0) + lang_ids = torch.concatenate(lang_ids, dim=0) + with torch.no_grad(): + x_tst = phones.to(device).unsqueeze(0) + tones = tones.to(device).unsqueeze(0) + lang_ids = lang_ids.to(device).unsqueeze(0) + bert = bert.to(device).unsqueeze(0) + ja_bert = ja_bert.to(device).unsqueeze(0) + en_bert = en_bert.to(device).unsqueeze(0) + emo = emo.to(device).unsqueeze(0) + x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) + del phones + speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) + audio = ( + net_g.infer( + x_tst, + x_tst_lengths, + speakers, + tones, + lang_ids, + bert, + ja_bert, + en_bert, + emo, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + )[0][0, 0] + .data.cpu() + .float() + .numpy() + ) + del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio diff --git a/for_deploy/infer_utils.py b/for_deploy/infer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dd00e4c9ca2227022855bd5fb8d6afb1ae76b229 --- /dev/null +++ b/for_deploy/infer_utils.py @@ -0,0 +1,111 @@ +import sys + +import 
torch +from transformers import ( + AutoModelForMaskedLM, + AutoTokenizer, + DebertaV2Model, + DebertaV2Tokenizer, + ClapModel, + ClapProcessor, +) + +from config import config +from text.japanese import text2sep_kata + + +class BertFeature: + def __init__(self, model_path, language="ZH"): + self.model_path = model_path + self.language = language + self.tokenizer = None + self.model = None + self.device = None + + self._prepare() + + def _get_device(self, device=config.bert_gen_config.device): + if ( + sys.platform == "darwin" + and torch.backends.mps.is_available() + and device == "cpu" + ): + device = "mps" + if not device: + device = "cuda" + return device + + def _prepare(self): + self.device = self._get_device() + + if self.language == "EN": + self.tokenizer = DebertaV2Tokenizer.from_pretrained(self.model_path) + self.model = DebertaV2Model.from_pretrained(self.model_path).to(self.device) + else: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.model = AutoModelForMaskedLM.from_pretrained(self.model_path).to( + self.device + ) + self.model.eval() + + def get_bert_feature(self, text, word2ph): + if self.language == "JP": + text = "".join(text2sep_kata(text)[0]) + with torch.no_grad(): + inputs = self.tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(self.device) + res = self.model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + + word2phone = word2ph + phone_level_feature = [] + for i in range(len(word2phone)): + repeat_feature = res[i].repeat(word2phone[i], 1) + phone_level_feature.append(repeat_feature) + + phone_level_feature = torch.cat(phone_level_feature, dim=0) + + return phone_level_feature.T + + +class ClapFeature: + def __init__(self, model_path): + self.model_path = model_path + self.processor = None + self.model = None + self.device = None + + self._prepare() + + def _get_device(self, device=config.bert_gen_config.device): + if ( + sys.platform == "darwin" + and torch.backends.mps.is_available() + and device == "cpu" + ): + device = "mps" + if not device: + device = "cuda" + return device + + def _prepare(self): + self.device = self._get_device() + + self.processor = ClapProcessor.from_pretrained(self.model_path) + self.model = ClapModel.from_pretrained(self.model_path).to(self.device) + self.model.eval() + + def get_clap_audio_feature(self, audio_data): + with torch.no_grad(): + inputs = self.processor( + audios=audio_data, return_tensors="pt", sampling_rate=48000 + ).to(self.device) + emb = self.model.get_audio_features(**inputs) + return emb.T + + def get_clap_text_feature(self, text): + with torch.no_grad(): + inputs = self.processor(text=text, return_tensors="pt").to(self.device) + emb = self.model.get_text_features(**inputs) + return emb.T diff --git a/for_deploy/webui.py b/for_deploy/webui.py new file mode 100644 index 0000000000000000000000000000000000000000..f813e513e07f29397103f3559da3019acfe4d11c --- /dev/null +++ b/for_deploy/webui.py @@ -0,0 +1,556 @@ +# flake8: noqa: E402 +import os +import logging +import re_matching +from tools.sentence import split_by_language + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("markdown_it").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) +logging.getLogger("matplotlib").setLevel(logging.WARNING) + +logging.basicConfig( + level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s" +) + +logger = logging.getLogger(__name__) + +import torch +import 
utils +from infer import infer, latest_version, get_net_g, infer_multilang +import gradio as gr +import webbrowser +import numpy as np +from config import config +from tools.translate import translate +import librosa +from infer_utils import BertFeature, ClapFeature + + +net_g = None + +device = config.webui_config.device +if device == "mps": + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" + +bert_feature_map = { + "ZH": BertFeature( + "./bert/chinese-roberta-wwm-ext-large", + language="ZH", + ), + "JP": BertFeature( + "./bert/deberta-v2-large-japanese-char-wwm", + language="JP", + ), + "EN": BertFeature( + "./bert/deberta-v3-large", + language="EN", + ), +} + +clap_feature = ClapFeature("./emotional/clap-htsat-fused") + + +def generate_audio( + slices, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + language, + reference_audio, + emotion, + skip_start=False, + skip_end=False, +): + audio_list = [] + # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) + with torch.no_grad(): + for idx, piece in enumerate(slices): + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(slices) - 1) and skip_end + audio = infer( + piece, + reference_audio=reference_audio, + emotion=emotion, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + sid=speaker, + language=language, + hps=hps, + net_g=net_g, + device=device, + skip_start=skip_start, + skip_end=skip_end, + bert=bert_feature_map, + clap=clap_feature, + ) + audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) + audio_list.append(audio16bit) + # audio_list.append(silence) # 将静音添加到列表中 + return audio_list + + +def generate_audio_multilang( + slices, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + language, + reference_audio, + emotion, + skip_start=False, + skip_end=False, +): + audio_list = [] + # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16) + with torch.no_grad(): + for idx, piece in enumerate(slices): + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(slices) - 1) and skip_end + audio = infer_multilang( + piece, + reference_audio=reference_audio, + emotion=emotion, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + sid=speaker, + language=language[idx], + hps=hps, + net_g=net_g, + device=device, + skip_start=skip_start, + skip_end=skip_end, + ) + audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio) + audio_list.append(audio16bit) + # audio_list.append(silence) # 将静音添加到列表中 + return audio_list + + +def tts_split( + text: str, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + cut_by_sent, + interval_between_para, + interval_between_sent, + reference_audio, + emotion, +): + if language == "mix": + return ("invalid", None) + while text.find("\n\n") != -1: + text = text.replace("\n\n", "\n") + para_list = re_matching.cut_para(text) + audio_list = [] + if not cut_by_sent: + for idx, p in enumerate(para_list): + skip_start = idx != 0 + skip_end = idx != len(para_list) - 1 + audio = infer( + p, + reference_audio=reference_audio, + emotion=emotion, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + sid=speaker, + language=language, + hps=hps, + net_g=net_g, + device=device, + skip_start=skip_start, + skip_end=skip_end, + ) + audio16bit = 
gr.processing_utils.convert_to_16_bit_wav(audio) + audio_list.append(audio16bit) + silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16) + audio_list.append(silence) + else: + for idx, p in enumerate(para_list): + skip_start = idx != 0 + skip_end = idx != len(para_list) - 1 + audio_list_sent = [] + sent_list = re_matching.cut_sent(p) + for idx, s in enumerate(sent_list): + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(sent_list) - 1) and skip_end + audio = infer( + s, + reference_audio=reference_audio, + emotion=emotion, + sdp_ratio=sdp_ratio, + noise_scale=noise_scale, + noise_scale_w=noise_scale_w, + length_scale=length_scale, + sid=speaker, + language=language, + hps=hps, + net_g=net_g, + device=device, + skip_start=skip_start, + skip_end=skip_end, + ) + audio_list_sent.append(audio) + silence = np.zeros((int)(44100 * interval_between_sent)) + audio_list_sent.append(silence) + if (interval_between_para - interval_between_sent) > 0: + silence = np.zeros( + (int)(44100 * (interval_between_para - interval_between_sent)) + ) + audio_list_sent.append(silence) + audio16bit = gr.processing_utils.convert_to_16_bit_wav( + np.concatenate(audio_list_sent) + ) # 对完整句子做音量归一 + audio_list.append(audio16bit) + audio_concat = np.concatenate(audio_list) + return ("Success", (44100, audio_concat)) + + +def tts_fn( + text: str, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + reference_audio, + emotion, + prompt_mode, +): + if prompt_mode == "Audio prompt": + if reference_audio == None: + return ("Invalid audio prompt", None) + else: + reference_audio = load_audio(reference_audio)[1] + else: + reference_audio = None + audio_list = [] + if language == "mix": + bool_valid, str_valid = re_matching.validate_text(text) + if not bool_valid: + return str_valid, ( + hps.data.sampling_rate, + np.concatenate([np.zeros(hps.data.sampling_rate // 2)]), + ) + result = [] + for slice in re_matching.text_matching(text): + _speaker = slice.pop() + temp_contant = [] + temp_lang = [] + for lang, content in slice: + if "|" in content: + temp = [] + temp_ = [] + for i in content.split("|"): + if i != "": + temp.append([i]) + temp_.append([lang]) + else: + temp.append([]) + temp_.append([]) + temp_contant += temp + temp_lang += temp_ + else: + if len(temp_contant) == 0: + temp_contant.append([]) + temp_lang.append([]) + temp_contant[-1].append(content) + temp_lang[-1].append(lang) + for i, j in zip(temp_lang, temp_contant): + result.append([*zip(i, j), _speaker]) + for i, one in enumerate(result): + skip_start = i != 0 + skip_end = i != len(result) - 1 + _speaker = one.pop() + idx = 0 + while idx < len(one): + text_to_generate = [] + lang_to_generate = [] + while True: + lang, content = one[idx] + temp_text = [content] + if len(text_to_generate) > 0: + text_to_generate[-1] += [temp_text.pop(0)] + lang_to_generate[-1] += [lang] + if len(temp_text) > 0: + text_to_generate += [[i] for i in temp_text] + lang_to_generate += [[lang]] * len(temp_text) + if idx + 1 < len(one): + idx += 1 + else: + break + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(one) - 1) and skip_end + print(text_to_generate, lang_to_generate) + audio_list.extend( + generate_audio_multilang( + text_to_generate, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + _speaker, + lang_to_generate, + reference_audio, + emotion, + skip_start, + skip_end, + ) + ) + idx += 1 + elif language.lower() == "auto": + for idx, slice in enumerate(text.split("|")): + if slice == 
"": + continue + skip_start = idx != 0 + skip_end = idx != len(text.split("|")) - 1 + sentences_list = split_by_language( + slice, target_languages=["zh", "ja", "en"] + ) + idx = 0 + while idx < len(sentences_list): + text_to_generate = [] + lang_to_generate = [] + while True: + content, lang = sentences_list[idx] + temp_text = [content] + lang = lang.upper() + if lang == "JA": + lang = "JP" + if len(text_to_generate) > 0: + text_to_generate[-1] += [temp_text.pop(0)] + lang_to_generate[-1] += [lang] + if len(temp_text) > 0: + text_to_generate += [[i] for i in temp_text] + lang_to_generate += [[lang]] * len(temp_text) + if idx + 1 < len(sentences_list): + idx += 1 + else: + break + skip_start = (idx != 0) and skip_start + skip_end = (idx != len(sentences_list) - 1) and skip_end + print(text_to_generate, lang_to_generate) + audio_list.extend( + generate_audio_multilang( + text_to_generate, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + lang_to_generate, + reference_audio, + emotion, + skip_start, + skip_end, + ) + ) + idx += 1 + else: + audio_list.extend( + generate_audio( + text.split("|"), + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + speaker, + language, + reference_audio, + emotion, + ) + ) + + audio_concat = np.concatenate(audio_list) + return "Success", (hps.data.sampling_rate, audio_concat) + + +def load_audio(path): + audio, sr = librosa.load(path, 48000) + # audio = librosa.resample(audio, 44100, 48000) + return sr, audio + + +def gr_util(item): + if item == "Text prompt": + return {"visible": True, "__type__": "update"}, { + "visible": False, + "__type__": "update", + } + else: + return {"visible": False, "__type__": "update"}, { + "visible": True, + "__type__": "update", + } + + +if __name__ == "__main__": + if config.webui_config.debug: + logger.info("Enable DEBUG-LEVEL log") + logging.basicConfig(level=logging.DEBUG) + hps = utils.get_hparams_from_file(config.webui_config.config_path) + # 若config.json中未指定版本则默认为最新版本 + version = hps.version if hasattr(hps, "version") else latest_version + net_g = get_net_g( + model_path=config.webui_config.model, version=version, device=device, hps=hps + ) + speaker_ids = hps.data.spk2id + speakers = list(speaker_ids.keys()) + languages = ["ZH", "JP", "EN", "mix", "auto"] + with gr.Blocks() as app: + with gr.Row(): + with gr.Column(): + text = gr.TextArea( + label="输入文本内容", + placeholder=""" + 如果你选择语言为\'mix\',必须按照格式输入,否则报错: + 格式举例(zh是中文,jp是日语,不区分大小写;说话人举例:gongzi): + [说话人1]你好,こんにちは! こんにちは,世界。 + [说话人2]你好吗?元気ですか? + [说话人3]谢谢。どういたしまして。 + ... 
+ 另外,所有的语言选项都可以用'|'分割长段实现分句生成。 + """, + ) + trans = gr.Button("中翻日", variant="primary") + slicer = gr.Button("快速切分", variant="primary") + speaker = gr.Dropdown( + choices=speakers, value=speakers[0], label="Speaker" + ) + _ = gr.Markdown( + value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n" + ) + prompt_mode = gr.Radio( + ["Text prompt", "Audio prompt"], + label="Prompt Mode", + value="Text prompt", + ) + text_prompt = gr.Textbox( + label="Text prompt", + placeholder="用文字描述生成风格。如:Happy", + value="Happy", + visible=True, + ) + audio_prompt = gr.Audio( + label="Audio prompt", type="filepath", visible=False + ) + sdp_ratio = gr.Slider( + minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" + ) + noise_scale = gr.Slider( + minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" + ) + noise_scale_w = gr.Slider( + minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" + ) + length_scale = gr.Slider( + minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" + ) + language = gr.Dropdown( + choices=languages, value=languages[0], label="Language" + ) + btn = gr.Button("生成音频!", variant="primary") + with gr.Column(): + with gr.Row(): + with gr.Column(): + interval_between_sent = gr.Slider( + minimum=0, + maximum=5, + value=0.2, + step=0.1, + label="句间停顿(秒),勾选按句切分才生效", + ) + interval_between_para = gr.Slider( + minimum=0, + maximum=10, + value=1, + step=0.1, + label="段间停顿(秒),需要大于句间停顿才有效", + ) + opt_cut_by_sent = gr.Checkbox( + label="按句切分 在按段落切分的基础上再按句子切分文本" + ) + slicer = gr.Button("切分生成", variant="primary") + text_output = gr.Textbox(label="状态信息") + audio_output = gr.Audio(label="输出音频") + # explain_image = gr.Image( + # label="参数解释信息", + # show_label=True, + # show_share_button=False, + # show_download_button=False, + # value=os.path.abspath("./img/参数说明.png"), + # ) + btn.click( + tts_fn, + inputs=[ + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + audio_prompt, + text_prompt, + prompt_mode, + ], + outputs=[text_output, audio_output], + ) + + trans.click( + translate, + inputs=[text], + outputs=[text], + ) + slicer.click( + tts_split, + inputs=[ + text, + speaker, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + language, + opt_cut_by_sent, + interval_between_para, + interval_between_sent, + audio_prompt, + text_prompt, + ], + outputs=[text_output, audio_output], + ) + + prompt_mode.change( + lambda x: gr_util(x), + inputs=[prompt_mode], + outputs=[text_prompt, audio_prompt], + ) + + audio_prompt.upload( + lambda x: load_audio(x), + inputs=[audio_prompt], + outputs=[audio_prompt], + ) + + print("推理页面已开启!") + webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}") + app.launch(share=config.webui_config.share, server_port=config.webui_config.port) diff --git a/infer.py b/infer.py index 7b3f7157e16625be7cfe10047c4044080ed0cb6b..62bfa4efba12f4242484e4c68ff0229e93d71ce1 100644 --- a/infer.py +++ b/infer.py @@ -10,7 +10,8 @@ import torch import commons from text import cleaned_text_to_sequence, get_bert -from clap_wrapper import get_clap_audio_feature, get_clap_text_feature + +# from clap_wrapper import get_clap_audio_feature, get_clap_text_feature from text.cleaner import clean_text import utils import numpy as np @@ -20,47 +21,47 @@ from text.symbols import symbols # from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn # from oldVersion.V210.text import symbols as V210symbols -from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn -from oldVersion.V200.text import symbols as 
V200symbols -from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn -from oldVersion.V111.text import symbols as V111symbols -from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn -from oldVersion.V110.text import symbols as V110symbols -from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn -from oldVersion.V101.text import symbols as V101symbols +# from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn +# from oldVersion.V200.text import symbols as V200symbols +# from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn +# from oldVersion.V111.text import symbols as V111symbols +# from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn +# from oldVersion.V110.text import symbols as V110symbols +# from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn +# from oldVersion.V101.text import symbols as V101symbols -from oldVersion import V111, V110, V101, V200 # , V210 +# from oldVersion import V111, V110, V101, V200, V210 # 当前版本信息 -latest_version = "2.2" +latest_version = "2.3" # 版本兼容 SynthesizerTrnMap = { # "2.1": V210SynthesizerTrn, - "2.0.2-fix": V200SynthesizerTrn, - "2.0.1": V200SynthesizerTrn, - "2.0": V200SynthesizerTrn, - "1.1.1-fix": V111SynthesizerTrn, - "1.1.1": V111SynthesizerTrn, - "1.1": V110SynthesizerTrn, - "1.1.0": V110SynthesizerTrn, - "1.0.1": V101SynthesizerTrn, - "1.0": V101SynthesizerTrn, - "1.0.0": V101SynthesizerTrn, + # "2.0.2-fix": V200SynthesizerTrn, + # "2.0.1": V200SynthesizerTrn, + # "2.0": V200SynthesizerTrn, + # "1.1.1-fix": V111SynthesizerTrn, + # "1.1.1": V111SynthesizerTrn, + # "1.1": V110SynthesizerTrn, + # "1.1.0": V110SynthesizerTrn, + # "1.0.1": V101SynthesizerTrn, + # "1.0": V101SynthesizerTrn, + # "1.0.0": V101SynthesizerTrn, } symbolsMap = { # "2.1": V210symbols, - "2.0.2-fix": V200symbols, - "2.0.1": V200symbols, - "2.0": V200symbols, - "1.1.1-fix": V111symbols, - "1.1.1": V111symbols, - "1.1": V110symbols, - "1.1.0": V110symbols, - "1.0.1": V101symbols, - "1.0": V101symbols, - "1.0.0": V101symbols, + # "2.0.2-fix": V200symbols, + # "2.0.1": V200symbols, + # "2.0": V200symbols, + # "1.1.1-fix": V111symbols, + # "1.1.1": V111symbols, + # "1.1": V110symbols, + # "1.1.0": V110symbols, + # "1.0.1": V101symbols, + # "1.0": V101symbols, + # "1.0.0": V101symbols, } @@ -98,7 +99,8 @@ def get_net_g(model_path: str, version: str, device: str, hps): return net_g -def get_text(text, language_str, hps, device): +def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7): + style_text = None if style_text == "" else style_text # 在此处实现当前版本的get_text norm_text, phone, tone, word2ph = clean_text(text, language_str) phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) @@ -110,21 +112,23 @@ def get_text(text, language_str, hps, device): for i in range(len(word2ph)): word2ph[i] = word2ph[i] * 2 word2ph[0] += 1 - bert_ori = get_bert(norm_text, word2ph, language_str, device) + bert_ori = get_bert( + norm_text, word2ph, language_str, device, style_text, style_weight + ) del word2ph assert bert_ori.shape[-1] == len(phone), phone if language_str == "ZH": bert = bert_ori - ja_bert = torch.zeros(1024, len(phone)) - en_bert = torch.zeros(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) + en_bert = torch.randn(1024, len(phone)) elif language_str == "JP": - bert = torch.zeros(1024, len(phone)) + bert = torch.randn(1024, len(phone)) ja_bert = bert_ori - en_bert = torch.zeros(1024, len(phone)) + en_bert = 
torch.randn(1024, len(phone)) elif language_str == "EN": - bert = torch.zeros(1024, len(phone)) - ja_bert = torch.zeros(1024, len(phone)) + bert = torch.randn(1024, len(phone)) + ja_bert = torch.randn(1024, len(phone)) en_bert = bert_ori else: raise ValueError("language_str should be ZH, JP or EN") @@ -154,49 +158,54 @@ def infer( reference_audio=None, skip_start=False, skip_end=False, + style_text=None, + style_weight=0.7, ): # 2.2版本参数位置变了 # 2.1 参数新增 emotion reference_audio skip_start skip_end # inferMap_V3 = { # "2.1": V210.infer, - # } + } # 支持中日英三语版本 inferMap_V2 = { - "2.0.2-fix": V200.infer, - "2.0.1": V200.infer, - "2.0": V200.infer, - "1.1.1-fix": V111.infer_fix, - "1.1.1": V111.infer, - "1.1": V110.infer, - "1.1.0": V110.infer, + # "2.0.2-fix": V200.infer, + # "2.0.1": V200.infer, + # "2.0": V200.infer, + # "1.1.1-fix": V111.infer_fix, + # "1.1.1": V111.infer, + # "1.1": V110.infer, + # "1.1.0": V110.infer, } # 仅支持中文版本 # 在测试中,并未发现两个版本的模型不能互相通用 inferMap_V1 = { - "1.0.1": V101.infer, - "1.0": V101.infer, - "1.0.0": V101.infer, + # "1.0.1": V101.infer, + # "1.0": V101.infer, + # "1.0.0": V101.infer, } version = hps.version if hasattr(hps, "version") else latest_version # 非当前版本,根据版本号选择合适的infer if version != latest_version: - # if version in inferMap_V3.keys(): - # return inferMap_V3[version]( - # text, - # sdp_ratio, - # noise_scale, - # noise_scale_w, - # length_scale, - # sid, - # language, - # hps, - # net_g, - # device, - # reference_audio, - # emotion, - # skip_start, - # skip_end, - # ) + if version in inferMap_V3.keys(): + emotion = 0 + return inferMap_V3[version]( + text, + sdp_ratio, + noise_scale, + noise_scale_w, + length_scale, + sid, + language, + hps, + net_g, + device, + reference_audio, + emotion, + skip_start, + skip_end, + style_text, + style_weight, + ) if version in inferMap_V2.keys(): return inferMap_V2[version]( text, @@ -224,14 +233,19 @@ def infer( ) # 在此处实现当前版本的推理 # emo = get_emo_(reference_audio, emotion, sid) - if isinstance(reference_audio, np.ndarray): - emo = get_clap_audio_feature(reference_audio, device) - else: - emo = get_clap_text_feature(emotion, device) - emo = torch.squeeze(emo, dim=1) + # if isinstance(reference_audio, np.ndarray): + # emo = get_clap_audio_feature(reference_audio, device) + # else: + # emo = get_clap_text_feature(emotion, device) + # emo = torch.squeeze(emo, dim=1) bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( - text, language, hps, device + text, + language, + hps, + device, + style_text=style_text, + style_weight=style_weight, ) if skip_start: phones = phones[3:] @@ -255,7 +269,7 @@ def infer( ja_bert = ja_bert.to(device).unsqueeze(0) en_bert = en_bert.to(device).unsqueeze(0) x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) - emo = emo.to(device).unsqueeze(0) + # emo = emo.to(device).unsqueeze(0) del phones speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) audio = ( @@ -268,7 +282,6 @@ def infer( bert, ja_bert, en_bert, - emo, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, @@ -278,7 +291,16 @@ def infer( .float() .numpy() ) - del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + del ( + x_tst, + tones, + lang_ids, + bert, + x_tst_lengths, + speakers, + ja_bert, + en_bert, + ) # , emo if torch.cuda.is_available(): torch.cuda.empty_cache() return audio @@ -302,14 +324,14 @@ def infer_multilang( ): bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], [] # emo = get_emo_(reference_audio, emotion, sid) - if 
isinstance(reference_audio, np.ndarray): - emo = get_clap_audio_feature(reference_audio, device) - else: - emo = get_clap_text_feature(emotion, device) - emo = torch.squeeze(emo, dim=1) + # if isinstance(reference_audio, np.ndarray): + # emo = get_clap_audio_feature(reference_audio, device) + # else: + # emo = get_clap_text_feature(emotion, device) + # emo = torch.squeeze(emo, dim=1) for idx, (txt, lang) in enumerate(zip(text, language)): - skip_start = (idx != 0) or (skip_start and idx == 0) - skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1) + _skip_start = (idx != 0) or (skip_start and idx == 0) + _skip_end = (idx != len(language) - 1) or skip_end ( temp_bert, temp_ja_bert, @@ -318,14 +340,14 @@ def infer_multilang( temp_tones, temp_lang_ids, ) = get_text(txt, lang, hps, device) - if skip_start: + if _skip_start: temp_bert = temp_bert[:, 3:] temp_ja_bert = temp_ja_bert[:, 3:] temp_en_bert = temp_en_bert[:, 3:] temp_phones = temp_phones[3:] temp_tones = temp_tones[3:] temp_lang_ids = temp_lang_ids[3:] - if skip_end: + if _skip_end: temp_bert = temp_bert[:, :-2] temp_ja_bert = temp_ja_bert[:, :-2] temp_en_bert = temp_en_bert[:, :-2] @@ -351,7 +373,7 @@ def infer_multilang( bert = bert.to(device).unsqueeze(0) ja_bert = ja_bert.to(device).unsqueeze(0) en_bert = en_bert.to(device).unsqueeze(0) - emo = emo.to(device).unsqueeze(0) + # emo = emo.to(device).unsqueeze(0) x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device) del phones speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device) @@ -365,7 +387,6 @@ def infer_multilang( bert, ja_bert, en_bert, - emo, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, @@ -375,7 +396,16 @@ def infer_multilang( .float() .numpy() ) - del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo + del ( + x_tst, + tones, + lang_ids, + bert, + x_tst_lengths, + speakers, + ja_bert, + en_bert, + ) # , emo if torch.cuda.is_available(): torch.cuda.empty_cache() return audio diff --git a/losses.py b/losses.py index b1b263e4c205e78ffe970f622ab6ff68f36d3b17..62982fcca79e476953e4c1661bac635585991a6e 100644 --- a/losses.py +++ b/losses.py @@ -1,4 +1,6 @@ import torch +import torchaudio +from transformers import AutoModel def feature_loss(fmap_r, fmap_g): @@ -56,3 +58,96 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): kl = torch.sum(kl * z_mask) l = kl / torch.sum(z_mask) return l + + +class WavLMLoss(torch.nn.Module): + def __init__(self, model, wd, model_sr, slm_sr=16000): + super(WavLMLoss, self).__init__() + self.wavlm = AutoModel.from_pretrained(model) + self.wd = wd + self.resample = torchaudio.transforms.Resample(model_sr, slm_sr) + self.wavlm.eval() + for param in self.wavlm.parameters(): + param.requires_grad = False + + def forward(self, wav, y_rec): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16.squeeze(), output_hidden_states=True + ).hidden_states + + floss = 0 + for er, eg in zip(wav_embeddings, y_rec_embeddings): + floss += torch.mean(torch.abs(er - eg)) + + return floss.mean() + + def generator(self, y_rec): + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16, output_hidden_states=True + ).hidden_states + y_rec_embeddings = ( + torch.stack(y_rec_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + y_df_hat_g = 
self.wd(y_rec_embeddings) + loss_gen = torch.mean((1 - y_df_hat_g) ** 2) + + return loss_gen + + def discriminator(self, wav, y_rec): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_rec_16 = self.resample(y_rec) + y_rec_embeddings = self.wavlm( + input_values=y_rec_16, output_hidden_states=True + ).hidden_states + + y_embeddings = ( + torch.stack(wav_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + y_rec_embeddings = ( + torch.stack(y_rec_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + + y_d_rs = self.wd(y_embeddings) + y_d_gs = self.wd(y_rec_embeddings) + + y_df_hat_r, y_df_hat_g = y_d_rs, y_d_gs + + r_loss = torch.mean((1 - y_df_hat_r) ** 2) + g_loss = torch.mean((y_df_hat_g) ** 2) + + loss_disc_f = r_loss + g_loss + + return loss_disc_f.mean() + + def discriminator_forward(self, wav): + with torch.no_grad(): + wav_16 = self.resample(wav) + wav_embeddings = self.wavlm( + input_values=wav_16, output_hidden_states=True + ).hidden_states + y_embeddings = ( + torch.stack(wav_embeddings, dim=1) + .transpose(-1, -2) + .flatten(start_dim=1, end_dim=2) + ) + + y_d_rs = self.wd(y_embeddings) + + return y_d_rs diff --git a/models.py b/models.py index 6257d8e216daa8a619738afe890df9ffc70dea7a..97dcdce3263e69574239e9ed53cc5a8c3feae6c1 100644 --- a/models.py +++ b/models.py @@ -40,33 +40,22 @@ class DurationDiscriminator(nn.Module): # vits2 self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) - self.pre_out_conv_1 = nn.Conv1d( - 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + self.LSTM = nn.LSTM( + 2 * filter_channels, filter_channels, batch_first=True, bidirectional=True ) - self.pre_out_norm_1 = modules.LayerNorm(filter_channels) - self.pre_out_conv_2 = nn.Conv1d( - filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 - ) - self.pre_out_norm_2 = modules.LayerNorm(filter_channels) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1) - self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + self.output_layer = nn.Sequential( + nn.Linear(2 * filter_channels, 1), nn.Sigmoid() + ) - def forward_probability(self, x, x_mask, dur, g=None): + def forward_probability(self, x, dur): dur = self.dur_proj(dur) x = torch.cat([x, dur], dim=1) - x = self.pre_out_conv_1(x * x_mask) - x = torch.relu(x) - x = self.pre_out_norm_1(x) - x = self.drop(x) - x = self.pre_out_conv_2(x * x_mask) - x = torch.relu(x) - x = self.pre_out_norm_2(x) - x = self.drop(x) - x = x * x_mask x = x.transpose(1, 2) + x, _ = self.LSTM(x) output_prob = self.output_layer(x) return output_prob @@ -86,7 +75,7 @@ class DurationDiscriminator(nn.Module): # vits2 output_probs = [] for dur in [dur_r, dur_hat]: - output_prob = self.forward_probability(x, x_mask, dur, g) + output_prob = self.forward_probability(x, dur) output_probs.append(output_prob) return output_probs @@ -354,7 +343,6 @@ class TextEncoder(nn.Module): n_layers, kernel_size, p_dropout, - n_speakers, gin_channels=0, ): super().__init__() @@ -376,31 +364,6 @@ class TextEncoder(nn.Module): self.bert_proj = nn.Conv1d(1024, hidden_channels, 1) self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1) self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1) - # self.emo_proj = nn.Linear(512, hidden_channels) - self.in_feature_net = nn.Sequential( - # input is assumed to an already 
normalized embedding - nn.Linear(512, 1028, bias=False), - nn.GELU(), - nn.LayerNorm(1028), - *[Block(1028, 512) for _ in range(1)], - nn.Linear(1028, 512, bias=False), - # normalize before passing to VQ? - # nn.GELU(), - # nn.LayerNorm(512), - ) - self.emo_vq = VectorQuantize( - dim=512, - codebook_size=64, - codebook_dim=32, - commitment_weight=0.1, - decay=0.85, - heads=32, - kmeans_iters=20, - separate_codebook_per_head=True, - stochastic_sample_codes=True, - threshold_ema_dead_code=2, - ) - self.out_feature_net = nn.Linear(512, hidden_channels) self.encoder = attentions.Encoder( hidden_channels, @@ -413,18 +376,10 @@ class TextEncoder(nn.Module): ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward( - self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=None - ): - sid = sid.cpu() + def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None): bert_emb = self.bert_proj(bert).transpose(1, 2) ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2) en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2) - emo_emb = self.in_feature_net(emo) - emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1)) - loss_commit = loss_commit.mean() - emo_emb = self.out_feature_net(emo_emb) - # emo_emb = self.emo_proj(emo.unsqueeze(1)) x = ( self.emb(x) + self.tone_emb(tone) @@ -432,7 +387,6 @@ class TextEncoder(nn.Module): + bert_emb + ja_bert_emb + en_bert_emb - + emo_emb ) * math.sqrt( self.hidden_channels ) # [b, t, h] @@ -445,7 +399,7 @@ class TextEncoder(nn.Module): stats = self.proj(x) * x_mask m, logs = torch.split(stats, self.out_channels, dim=1) - return x, m, logs, x_mask, loss_commit + return x, m, logs, x_mask class ResidualCouplingBlock(nn.Module): @@ -748,6 +702,55 @@ class MultiPeriodDiscriminator(torch.nn.Module): return y_d_rs, y_d_gs, fmap_rs, fmap_gs +class WavLMDiscriminator(nn.Module): + """docstring for Discriminator.""" + + def __init__( + self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False + ): + super(WavLMDiscriminator, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.pre = norm_f( + Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0) + ) + + self.convs = nn.ModuleList( + [ + norm_f( + nn.Conv1d( + initial_channel, initial_channel * 2, kernel_size=5, padding=2 + ) + ), + norm_f( + nn.Conv1d( + initial_channel * 2, + initial_channel * 4, + kernel_size=5, + padding=2, + ) + ), + norm_f( + nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2) + ), + ] + ) + + self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1)) + + def forward(self, x): + x = self.pre(x) + + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x + + class ReferenceEncoder(nn.Module): """ inputs --- [N, Ty/r, n_mels*r] mels @@ -878,7 +881,6 @@ class SynthesizerTrn(nn.Module): n_layers, kernel_size, p_dropout, - self.n_speakers, gin_channels=self.enc_gin_channels, ) self.dec = Generator( @@ -946,14 +948,13 @@ class SynthesizerTrn(nn.Module): bert, ja_bert, en_bert, - emo=None, ): if self.n_speakers > 0: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] else: g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) - x, m_p, logs_p, x_mask, loss_commit = self.enc_p( - x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, 
g=g ) z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) @@ -996,9 +997,11 @@ class SynthesizerTrn(nn.Module): logw_ = torch.log(w + 1e-6) * x_mask logw = self.dp(x, x_mask, g=g) + logw_sdp = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=1.0) l_length_dp = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum( x_mask ) # for averaging + l_length_sdp += torch.sum((logw_sdp - logw_) ** 2, [1, 2]) / torch.sum(x_mask) l_length = l_length_dp + l_length_sdp @@ -1018,9 +1021,8 @@ class SynthesizerTrn(nn.Module): x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), - (x, logw, logw_), + (x, logw, logw_, logw_sdp), g, - loss_commit, ) def infer( @@ -1033,7 +1035,6 @@ class SynthesizerTrn(nn.Module): bert, ja_bert, en_bert, - emo=None, noise_scale=0.667, length_scale=1, noise_scale_w=0.8, @@ -1047,8 +1048,8 @@ class SynthesizerTrn(nn.Module): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] else: g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) - x, m_p, logs_p, x_mask, _ = self.enc_p( - x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g ) logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * ( sdp_ratio diff --git a/oldVersion/V210/__init__.py b/oldVersion/V210/__init__.py index e49dcf3c4f709d14a96e28d8ff722f9c21545c40..89cfab3e284749acc4755373f2d733d165fed9b2 100644 --- a/oldVersion/V210/__init__.py +++ b/oldVersion/V210/__init__.py @@ -5,10 +5,9 @@ import torch import commons from .text import cleaned_text_to_sequence, get_bert from .text.cleaner import clean_text -from .emo_gen import get_emo -def get_text(text, language_str, hps, device): +def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7): # 在此处实现当前版本的get_text norm_text, phone, tone, word2ph = clean_text(text, language_str) phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str) @@ -20,7 +19,9 @@ def get_text(text, language_str, hps, device): for i in range(len(word2ph)): word2ph[i] = word2ph[i] * 2 word2ph[0] += 1 - bert_ori = get_bert(norm_text, word2ph, language_str, device) + bert_ori = get_bert( + norm_text, word2ph, language_str, device, style_text, style_weight + ) del word2ph assert bert_ori.shape[-1] == len(phone), phone @@ -50,6 +51,8 @@ def get_text(text, language_str, hps, device): def get_emo_(reference_audio, emotion): + from .emo_gen import get_emo + emo = ( torch.from_numpy(get_emo(reference_audio)) if reference_audio @@ -73,9 +76,11 @@ def infer( emotion=None, skip_start=False, skip_end=False, + style_text=None, + style_weight=0.7, ): bert, ja_bert, en_bert, phones, tones, lang_ids = get_text( - text, language, hps, device + text, language, hps, device, style_text, style_weight ) emo = get_emo_(reference_audio, emotion) if skip_start: diff --git a/oldVersion/V210/models.py b/oldVersion/V210/models.py index f5a0d5ce0177b754bd4eeb4f585c7def8a273c6b..60dffdcf58997932e464372a5c62100eb10676a1 100644 --- a/oldVersion/V210/models.py +++ b/oldVersion/V210/models.py @@ -13,7 +13,7 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from vector_quantize_pytorch import VectorQuantize from commons import init_weights, get_padding -from text import symbols, num_tones, num_languages +from .text import symbols, num_tones, num_languages class DurationDiscriminator(nn.Module): # vits2 diff --git a/oldVersion/V210/text/__init__.py b/oldVersion/V210/text/__init__.py index 
e7a61585f05e9e79894f5d55bdd4a761972aa639..e29856f98bfb0357186571f4b5e1dd7fcff46726 100644 --- a/oldVersion/V210/text/__init__.py +++ b/oldVersion/V210/text/__init__.py @@ -18,13 +18,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): return phones, tones, lang_ids -def get_bert(norm_text, word2ph, language, device): +def get_bert(norm_text, word2ph, language, device, style_text, style_weight): from .chinese_bert import get_bert_feature as zh_bert from .english_bert_mock import get_bert_feature as en_bert from .japanese_bert import get_bert_feature as jp_bert lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} - bert = lang_bert_func_map[language](norm_text, word2ph, device) + bert = lang_bert_func_map[language]( + norm_text, word2ph, device, style_text, style_weight + ) return bert diff --git a/oldVersion/V210/text/chinese_bert.py b/oldVersion/V210/text/chinese_bert.py index 36f1e2a09350584dfe5fca42b27402aa571aba3f..1b60bb4fc9839f908098074eaa54ceef1dff1752 100644 --- a/oldVersion/V210/text/chinese_bert.py +++ b/oldVersion/V210/text/chinese_bert.py @@ -12,7 +12,13 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -29,12 +35,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/oldVersion/V210/text/english_bert_mock.py b/oldVersion/V210/text/english_bert_mock.py index 85b241c405219c83c617c28bfa0cd274cbc2a557..2f3c9af3d2e9ea6035a6756817948d966bb75a42 100644 --- a/oldVersion/V210/text/english_bert_mock.py +++ b/oldVersion/V210/text/english_bert_mock.py @@ -13,7 +13,13 @@ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -30,11 +36,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = 
style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/oldVersion/V210/text/japanese_bert.py b/oldVersion/V210/text/japanese_bert.py index 7dbe28423fd5da5f4b736b74b29449a8376487dc..ae4bfb8d2160b43d22e8a930d837f76ce52cf12d 100644 --- a/oldVersion/V210/text/japanese_bert.py +++ b/oldVersion/V210/text/japanese_bert.py @@ -13,8 +13,16 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): text = "".join(text2sep_kata(text)[0]) + if style_text: + style_text = "".join(text2sep_kata(style_text)[0]) if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -31,12 +39,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/onnx_infer.py b/onnx_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..b740a381f278bd0ca3944e7a5fd013f05ed22748 --- /dev/null +++ b/onnx_infer.py @@ -0,0 +1,68 @@ +from onnx_modules.V220_OnnxInference import OnnxInferenceSession +import numpy as np +Session = OnnxInferenceSession( + { + "enc" : "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx", + "emb_g" : "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx", + "dp" : "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx", + "sdp" : "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx", + "flow" : "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx", + "dec" : "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx" + }, + Providers = ["CPUExecutionProvider"] + ) + +#这里的输入和原版是一样的,只需要在原版预处理结果出来之后加上.numpy()即可 +x = np.array( + [ + 0, + 97, + 0, + 8, + 0, + 78, + 0, + 8, + 0, + 76, + 0, + 37, + 0, + 40, + 0, + 97, + 0, + 8, + 0, + 23, + 0, + 8, + 0, + 74, + 0, + 26, + 0, + 104, + 0, + ] + ) +tone = np.zeros_like(x) +language = np.zeros_like(x) +sid = np.array([0]) +bert = 
np.random.randn(x.shape[0], 1024) +ja_bert = np.random.randn(x.shape[0], 1024) +en_bert = np.random.randn(x.shape[0], 1024) +emo = np.random.randn(512, 1) + +audio = Session( + x, + tone, + language, + bert, + ja_bert, + en_bert, + emo, + sid +) + +print(audio) diff --git a/onnx_modules/V200/__init__.py b/onnx_modules/V200/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e90089ffbadbc73e96e38a8f2eae9542e184c2d5 100644 --- a/onnx_modules/V200/__init__.py +++ b/onnx_modules/V200/__init__.py @@ -0,0 +1,4 @@ +from .text.symbols import symbols +from .models_onnx import SynthesizerTrn + +__all__ = ["symbols", "SynthesizerTrn"] diff --git a/onnx_modules/V200_OnnxInference/__init__.py b/onnx_modules/V200_OnnxInference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7170e74be0869a146d865e8d4af0823be875ef35 --- /dev/null +++ b/onnx_modules/V200_OnnxInference/__init__.py @@ -0,0 +1,126 @@ +import numpy as np +import onnxruntime as ort + + +def convert_pad_shape(pad_shape): + layer = pad_shape[::-1] + pad_shape = [item for sublist in layer for item in sublist] + return pad_shape + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = np.arange(max_length, dtype=length.dtype) + return np.expand_dims(x, 0) < np.expand_dims(length, 1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + + b, _, t_y, t_x = mask.shape + cum_duration = np.cumsum(duration, -1) + + cum_duration_flat = cum_duration.reshape(b * t_x) + path = sequence_mask(cum_duration_flat, t_y) + path = path.reshape(b, t_x, t_y) + path = path ^ np.pad(path, ((0, 0), (1, 0), (0, 0)))[:, :-1] + path = np.expand_dims(path, 1).transpose(0, 1, 3, 2) + return path + + +class OnnxInferenceSession: + def __init__(self, path, Providers=["CPUExecutionProvider"]): + self.enc = ort.InferenceSession(path["enc"], providers=Providers) + self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers) + self.dp = ort.InferenceSession(path["dp"], providers=Providers) + self.sdp = ort.InferenceSession(path["sdp"], providers=Providers) + self.flow = ort.InferenceSession(path["flow"], providers=Providers) + self.dec = ort.InferenceSession(path["dec"], providers=Providers) + + def __call__( + self, + seq, + tone, + language, + bert_zh, + bert_jp, + bert_en, + sid, + seed=114514, + seq_noise_scale=0.8, + sdp_noise_scale=0.6, + length_scale=1.0, + sdp_ratio=0.0, + ): + if seq.ndim == 1: + seq = np.expand_dims(seq, 0) + if tone.ndim == 1: + tone = np.expand_dims(tone, 0) + if language.ndim == 1: + language = np.expand_dims(language, 0) + assert(seq.ndim == 2,tone.ndim == 2,language.ndim == 2) + g = self.emb_g.run( + None, + { + "sid": sid.astype(np.int64), + }, + )[0] + g = np.expand_dims(g, -1) + enc_rtn = self.enc.run( + None, + { + "x": seq.astype(np.int64), + "t": tone.astype(np.int64), + "language": language.astype(np.int64), + "bert_0": bert_zh.astype(np.float32), + "bert_1": bert_jp.astype(np.float32), + "bert_2": bert_en.astype(np.float32), + "g": g.astype(np.float32), + }, + ) + x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3] + np.random.seed(seed) + zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale + logw = self.sdp.run( + None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g} + )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[ + 0 + ] * ( + 1 - sdp_ratio + ) + w = np.exp(logw) * x_mask * length_scale + w_ceil = 
np.ceil(w) + y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype( + np.int64 + ) + y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1) + attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1) + attn = generate_path(w_ceil, attn_mask) + m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = ( + m_p + + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2]) + * np.exp(logs_p) + * seq_noise_scale + ) + + z = self.flow.run( + None, + { + "z_p": z_p.astype(np.float32), + "y_mask": y_mask.astype(np.float32), + "g": g, + }, + )[0] + + return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0] diff --git a/onnx_modules/V210/__init__.py b/onnx_modules/V210/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e90089ffbadbc73e96e38a8f2eae9542e184c2d5 100644 --- a/onnx_modules/V210/__init__.py +++ b/onnx_modules/V210/__init__.py @@ -0,0 +1,4 @@ +from .text.symbols import symbols +from .models_onnx import SynthesizerTrn + +__all__ = ["symbols", "SynthesizerTrn"] diff --git a/onnx_modules/V210/models_onnx.py b/onnx_modules/V210/models_onnx.py index eb5be89056f0697e3c30008fe56ce6420c9ca362..6be05f1a0892c877a6b0b150de0db061fa023183 100644 --- a/onnx_modules/V210/models_onnx.py +++ b/onnx_modules/V210/models_onnx.py @@ -942,7 +942,7 @@ class SynthesizerTrn(nn.Module): torch.onnx.export( self.enc_p, - (x, x_lengths, tone, language, bert, ja_bert, en_bert, g, sid + 1, sid + 2), + (x, x_lengths, tone, language, bert, ja_bert, en_bert, g, sid, sid), f"onnx/{path}/{path}_enc_p.onnx", input_names=[ "x", diff --git a/onnx_modules/V210_OnnxInference/__init__.py b/onnx_modules/V210_OnnxInference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2518ef97fc4cf3c8f2e3aed11097d23f2d48e58e --- /dev/null +++ b/onnx_modules/V210_OnnxInference/__init__.py @@ -0,0 +1,129 @@ +import numpy as np +import onnxruntime as ort + + +def convert_pad_shape(pad_shape): + layer = pad_shape[::-1] + pad_shape = [item for sublist in layer for item in sublist] + return pad_shape + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = np.arange(max_length, dtype=length.dtype) + return np.expand_dims(x, 0) < np.expand_dims(length, 1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + + b, _, t_y, t_x = mask.shape + cum_duration = np.cumsum(duration, -1) + + cum_duration_flat = cum_duration.reshape(b * t_x) + path = sequence_mask(cum_duration_flat, t_y) + path = path.reshape(b, t_x, t_y) + path = path ^ np.pad(path, ((0, 0), (1, 0), (0, 0)))[:, :-1] + path = np.expand_dims(path, 1).transpose(0, 1, 3, 2) + return path + + +class OnnxInferenceSession: + def __init__(self, path, Providers=["CPUExecutionProvider"]): + self.enc = ort.InferenceSession(path["enc"], providers=Providers) + self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers) + self.dp = ort.InferenceSession(path["dp"], providers=Providers) + self.sdp = ort.InferenceSession(path["sdp"], providers=Providers) + self.flow = ort.InferenceSession(path["flow"], providers=Providers) + self.dec = ort.InferenceSession(path["dec"], providers=Providers) + + def __call__( + self, + seq, + tone, + language, + bert_zh, + bert_jp, + bert_en, + vqidx, + sid, + seed=114514, + 
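+        # seed fixes the numpy RNG so the injected noise below is reproducible; the
+        # remaining scales roughly correspond to noise_scale, noise_scale_w,
+        # length_scale and sdp_ratio in the PyTorch infer() path.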
seq_noise_scale=0.8, + sdp_noise_scale=0.6, + length_scale=1.0, + sdp_ratio=0.0, + ): + if seq.ndim == 1: + seq = np.expand_dims(seq, 0) + if tone.ndim == 1: + tone = np.expand_dims(tone, 0) + if language.ndim == 1: + language = np.expand_dims(language, 0) + assert(seq.ndim == 2,tone.ndim == 2,language.ndim == 2) + g = self.emb_g.run( + None, + { + "sid": sid.astype(np.int64), + }, + )[0] + g = np.expand_dims(g, -1) + enc_rtn = self.enc.run( + None, + { + "x": seq.astype(np.int64), + "t": tone.astype(np.int64), + "language": language.astype(np.int64), + "bert_0": bert_zh.astype(np.float32), + "bert_1": bert_jp.astype(np.float32), + "bert_2": bert_en.astype(np.float32), + "g": g.astype(np.float32), + "vqidx": vqidx.astype(np.int64), + "sid": sid.astype(np.int64) + }, + ) + x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3] + np.random.seed(seed) + zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale + logw = self.sdp.run( + None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g} + )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[ + 0 + ] * ( + 1 - sdp_ratio + ) + w = np.exp(logw) * x_mask * length_scale + w_ceil = np.ceil(w) + y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype( + np.int64 + ) + y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1) + attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1) + attn = generate_path(w_ceil, attn_mask) + m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = ( + m_p + + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2]) + * np.exp(logs_p) + * seq_noise_scale + ) + + z = self.flow.run( + None, + { + "z_p": z_p.astype(np.float32), + "y_mask": y_mask.astype(np.float32), + "g": g, + }, + )[0] + + return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0] diff --git a/onnx_modules/V220/__init__.py b/onnx_modules/V220/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e90089ffbadbc73e96e38a8f2eae9542e184c2d5 --- /dev/null +++ b/onnx_modules/V220/__init__.py @@ -0,0 +1,4 @@ +from .text.symbols import symbols +from .models_onnx import SynthesizerTrn + +__all__ = ["symbols", "SynthesizerTrn"] diff --git a/onnx_modules/V220/attentions_onnx.py b/onnx_modules/V220/attentions_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6243cd22de6d9e70f069f991c835283cb6c501 --- /dev/null +++ b/onnx_modules/V220/attentions_onnx.py @@ -0,0 +1,378 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import logging + +logger = logging.getLogger(__name__) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + 
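+    # WaveNet-style gated activation: the summed inputs are split into a tanh half and a
+    # sigmoid half along the channel axis, and their elementwise product is the output.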
return acts + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + # if isflow: + # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1) + # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + # self.cond_layer = weight_norm(cond_layer, name='weight') + # self.gin_channels = 256 + self.cond_layer_idx = self.n_layers + if "gin_channels" in kwargs: + self.gin_channels = kwargs["gin_channels"] + if self.gin_channels != 0: + self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) + # vits2 says 3rd block, so idx is 2 by default + self.cond_layer_idx = ( + kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 + ) + logging.debug(self.gin_channels, self.cond_layer_idx) + assert ( + self.cond_layer_idx < self.n_layers + ), "cond_layer_idx should be less than n_layers" + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + if i == self.cond_layer_idx and g is not None: + g = self.spk_emb_linear(g.transpose(1, 2)) + g = g.transpose(1, 2) + x = x + g + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, 
self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
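+ # Worked example (comment only; assumes the Encoder default window_size=4, so the
+ # relative table holds 2*4+1 = 9 positions): for length=3, pad_length = max(3-5, 0) = 0,
+ # slice_start = max(5-3, 0) = 2 and slice_end = 2 + 2*3 - 1 = 7, keeping 5 entries;
+ # for length=10, pad_length = 5 pads the table to 19 rows and the slice [0:19]
+ # keeps all 2*10 - 1 = 19 relative positions.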
+ pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/onnx_modules/V220/models_onnx.py b/onnx_modules/V220/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..d70814d95c358def14c2eb58a01e621c98f78dff --- /dev/null +++ b/onnx_modules/V220/models_onnx.py @@ -0,0 +1,1076 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules +from . 
import attentions_onnx +from vector_quantize_pytorch import VectorQuantize + +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from commons import init_weights, get_padding +from .text import symbols, num_tones, num_languages + + +class DurationDiscriminator(nn.Module): # vits2 + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d( + 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_1(x) + x = self.drop(x) + x = self.pre_out_conv_2(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_2(x) + x = self.drop(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append(output_prob) + + return output_probs + + +class TransformerCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + share_parameter=False, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + + self.wn = ( + attentions_onnx.FFT( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + isflow=True, + gin_channels=self.gin_channels, + ) + if share_parameter + else None + ) + + for i in range(n_flows): + self.flows.append( + modules.TransformerCouplingLayer( + channels, + hidden_channels, + kernel_size, + n_layers, + n_heads, + p_dropout, + filter_channels, + mean_only=True, + wn_sharing_parameter=self.wn, + gin_channels=self.gin_channels, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + 
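+ # Inference (reverse=True, the default used for ONNX export) runs the coupling
+ # flows in reverse order, mapping the prior sample z_p back to the latent z fed
+ # to the decoder; the non-reverse branch is the training direction, where each
+ # flow also returns a log-determinant term that is discarded here.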
if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class StochasticDurationPredictor(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, z, g=None): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + for flow in flows: + z = flow(z, x_mask, g=x, reverse=True) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class Bottleneck(nn.Sequential): + def __init__(self, in_dim, hidden_dim): + c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + super().__init__(*[c_fc1, c_fc2]) + + +class Block(nn.Module): + def 
__init__(self, in_dim, hidden_dim) -> None: + super().__init__() + self.norm = nn.LayerNorm(in_dim) + self.mlp = MLP(in_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.mlp(self.norm(x)) + return x + + +class MLP(nn.Module): + def __init__(self, in_dim, hidden_dim): + super().__init__() + self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False) + + def forward(self, x: torch.Tensor): + x = F.silu(self.c_fc1(x)) * self.c_fc2(x) + x = self.c_proj(x) + return x + + +class TextEncoder(nn.Module): + def __init__( + self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_speakers, + gin_channels=0, + ): + super().__init__() + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + self.emb = nn.Embedding(len(symbols), hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) + self.tone_emb = nn.Embedding(num_tones, hidden_channels) + nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5) + self.language_emb = nn.Embedding(num_languages, hidden_channels) + nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5) + self.bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + # self.emo_proj = nn.Linear(1024, 1024) + # self.emo_quantizer = nn.ModuleList() + # for i in range(0, n_speakers): + # self.emo_quantizer.append( + # VectorQuantize( + # dim=1024, + # codebook_size=10, + # decay=0.8, + # commitment_weight=1.0, + # learnable_codebook=True, + # ema_update=False, + # ) + # ) + # self.emo_q_proj = nn.Linear(1024, hidden_channels) + self.n_speakers = n_speakers + self.in_feature_net = nn.Sequential( + # input is assumed to an already normalized embedding + nn.Linear(512, 1028, bias=False), + nn.GELU(), + nn.LayerNorm(1028), + *[Block(1028, 512) for _ in range(1)], + nn.Linear(1028, 512, bias=False), + # normalize before passing to VQ? 
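+ # Note: the emotion branch maps a 512-d embedding through 512 -> 1028 -> 512 here,
+ # quantizes it with the 32-head, 64-entry codebook below, and out_feature_net then
+ # projects the result to hidden_channels.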
+ # nn.GELU(), + # nn.LayerNorm(512), + ) + self.emo_vq = VectorQuantize( + dim=512, + codebook_size=64, + codebook_dim=32, + commitment_weight=0.1, + decay=0.85, + heads=32, + kmeans_iters=20, + separate_codebook_per_head=True, + stochastic_sample_codes=True, + threshold_ema_dead_code=2, + ) + self.out_feature_net = nn.Linear(512, hidden_channels) + + self.encoder = attentions_onnx.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + gin_channels=self.gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g=None + ): + x_mask = torch.ones_like(x).unsqueeze(0) + bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2) + ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose( + 1, 2 + ) + en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose( + 1, 2 + ) + emo_emb = self.in_feature_net(emo.transpose(0, 1)) + emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1)) + + emo_emb = self.out_feature_net(emo_emb) + + x = ( + self.emb(x) + + self.tone_emb(tone) + + self.language_emb(language) + + bert_emb + + ja_bert_emb + + en_bert_emb + + emo_emb + ) * math.sqrt( + self.hidden_channels + ) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = x_mask.to(x.dtype) + + x = self.encoder(x * x_mask, x_mask, g=g) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return x, m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + 
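+ # Reparameterization: sample z = m + exp(logs) * eps with eps ~ N(0, I), then
+ # re-apply x_mask so padded frames stay zero.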
return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for layer in self.ups: + remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() 
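+ # Scale discriminator: a stack of grouped 1-D convolutions over the raw waveform
+ # (channels 1 -> 16 -> 64 -> 256 -> 1024), complementing the period discriminators
+ # above, which reshape the signal into 2-D by period.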
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class ReferenceEncoder(nn.Module): + """ + inputs --- [N, Ty/r, n_mels*r] mels + outputs --- [N, ref_enc_gru_size] + """ + + def __init__(self, spec_channels, gin_channels=0): + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501 + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, gin_channels) + + def forward(self, inputs, mask=None): + N = inputs.size(0) + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + for conv in self.convs: + out = conv(out) + # out = wn(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L, kernel_size, stride, pad, n_convs): + for i in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + n_vocab, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=256, + gin_channels=256, + use_sdp=True, + n_flow_layer=4, + n_layers_trans_flow=4, + 
flow_share_parameter=False, + use_transformer_flow=True, + **kwargs, + ): + super().__init__() + self.n_vocab = n_vocab + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.n_layers_trans_flow = n_layers_trans_flow + self.use_spk_conditioned_encoder = kwargs.get( + "use_spk_conditioned_encoder", True + ) + self.use_sdp = use_sdp + self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False) + self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01) + self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6) + self.current_mas_noise_scale = self.mas_noise_scale_initial + if self.use_spk_conditioned_encoder and gin_channels > 0: + self.enc_gin_channels = gin_channels + self.enc_p = TextEncoder( + n_vocab, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + self.n_speakers, + gin_channels=self.enc_gin_channels, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + if use_transformer_flow: + self.flow = TransformerCouplingBlock( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers_trans_flow, + 5, + p_dropout, + n_flow_layer, + gin_channels=gin_channels, + share_parameter=flow_share_parameter, + ) + else: + self.flow = ResidualCouplingBlock( + inter_channels, + hidden_channels, + 5, + 1, + n_flow_layer, + gin_channels=gin_channels, + ) + self.sdp = StochasticDurationPredictor( + hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels + ) + self.dp = DurationPredictor( + hidden_channels, 256, 3, 0.5, gin_channels=gin_channels + ) + + if n_speakers >= 1: + self.emb_g = nn.Embedding(n_speakers, gin_channels) + else: + self.ref_enc = ReferenceEncoder(spec_channels, gin_channels) + + def export_onnx( + self, + path, + max_len=None, + sdp_ratio=0, + y=None, + ): + noise_scale = 0.667 + length_scale = 1 + noise_scale_w = 0.8 + x = ( + torch.LongTensor( + [ + 0, + 97, + 0, + 8, + 0, + 78, + 0, + 8, + 0, + 76, + 0, + 37, + 0, + 40, + 0, + 97, + 0, + 8, + 0, + 23, + 0, + 8, + 0, + 74, + 0, + 26, + 0, + 104, + 0, + ] + ) + .unsqueeze(0) + .cpu() + ) + tone = torch.zeros_like(x).cpu() + language = torch.zeros_like(x).cpu() + x_lengths = torch.LongTensor([x.shape[1]]).cpu() + sid = torch.LongTensor([0]).cpu() + bert = torch.randn(size=(x.shape[1], 1024)).cpu() + ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + en_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + + if self.n_speakers > 0: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + torch.onnx.export( + self.emb_g, + (sid), + f"onnx/{path}/{path}_emb.onnx", + input_names=["sid"], + output_names=["g"], + verbose=True, + ) + else: + g = 
self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) + + emo = torch.randn(512, 1) + + torch.onnx.export( + self.enc_p, + (x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g), + f"onnx/{path}/{path}_enc_p.onnx", + input_names=[ + "x", + "x_lengths", + "t", + "language", + "bert_0", + "bert_1", + "bert_2", + "emo", + "g", + ], + output_names=["xout", "m_p", "logs_p", "x_mask"], + dynamic_axes={ + "x": [0, 1], + "t": [0, 1], + "language": [0, 1], + "bert_0": [0], + "bert_1": [0], + "bert_2": [0], + "xout": [0, 2], + "m_p": [0, 2], + "logs_p": [0, 2], + "x_mask": [0, 2], + }, + verbose=True, + opset_version=16, + ) + + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g + ) + + zinput = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale_w + ) + torch.onnx.export( + self.sdp, + (x, x_mask, zinput, g), + f"onnx/{path}/{path}_sdp.onnx", + input_names=["x", "x_mask", "zin", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + torch.onnx.export( + self.dp, + (x, x_mask, g), + f"onnx/{path}/{path}_dp.onnx", + input_names=["x", "x_mask", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp( + x, x_mask, g=g + ) * (1 - sdp_ratio) + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to( + x_mask.dtype + ) + attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + attn = commons.generate_path(w_ceil, attn_mask) + + m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + torch.onnx.export( + self.flow, + (z_p, y_mask, g), + f"onnx/{path}/{path}_flow.onnx", + input_names=["z_p", "y_mask", "g"], + output_names=["z"], + dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]}, + verbose=True, + ) + + z = self.flow(z_p, y_mask, g=g, reverse=True) + z_in = (z * y_mask)[:, :, :max_len] + + torch.onnx.export( + self.dec, + (z_in, g), + f"onnx/{path}/{path}_dec.onnx", + input_names=["z_in", "g"], + output_names=["o"], + dynamic_axes={"z_in": [0, 2], "o": [0, 2]}, + verbose=True, + ) + o = self.dec((z * y_mask)[:, :, :max_len], g=g) diff --git a/onnx_modules/V220/text/__init__.py b/onnx_modules/V220/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e670aedef137ce1ddcebd447524fe3834211abc --- /dev/null +++ b/onnx_modules/V220/text/__init__.py @@ -0,0 +1 @@ +from .symbols import * diff --git a/onnx_modules/V220/text/symbols.py b/onnx_modules/V220/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..846de64584e9ba4b8d96aab36d4efbcefb1a11e7 --- /dev/null +++ b/onnx_modules/V220/text/symbols.py @@ -0,0 +1,187 @@ +punctuation = ["!", "?", "…", ",", ".", "'", "-"] +pu_symbols = punctuation + ["SP", "UNK"] +pad = "_" + +# chinese +zh_symbols = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "b", + "c", + "ch", + "d", + "e", + "ei", + "en", + "eng", + "er", + "f", + "g", + "h", + "i", + "i0", + "ia", + "ian", + "iang", + 
"iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "j", + "k", + "l", + "m", + "n", + "o", + "ong", + "ou", + "p", + "q", + "r", + "s", + "sh", + "t", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", + "w", + "x", + "y", + "z", + "zh", + "AA", + "EE", + "OO", +] +num_zh_tones = 6 + +# japanese +ja_symbols = [ + "N", + "a", + "a:", + "b", + "by", + "ch", + "d", + "dy", + "e", + "e:", + "f", + "g", + "gy", + "h", + "hy", + "i", + "i:", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "o:", + "p", + "py", + "q", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "u:", + "w", + "y", + "z", + "zy", +] +num_ja_tones = 2 + +# English +en_symbols = [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "b", + "ch", + "d", + "dh", + "eh", + "er", + "ey", + "f", + "g", + "hh", + "ih", + "iy", + "jh", + "k", + "l", + "m", + "n", + "ng", + "ow", + "oy", + "p", + "r", + "s", + "sh", + "t", + "th", + "uh", + "uw", + "V", + "w", + "y", + "z", + "zh", +] +num_en_tones = 4 + +# combine all symbols +normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) +symbols = [pad] + normal_symbols + pu_symbols +sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] + +# combine all tones +num_tones = num_zh_tones + num_ja_tones + num_en_tones + +# language maps +language_id_map = {"ZH": 0, "JP": 1, "EN": 2} +num_languages = len(language_id_map.keys()) + +language_tone_start_map = { + "ZH": 0, + "JP": num_zh_tones, + "EN": num_zh_tones + num_ja_tones, +} + +if __name__ == "__main__": + a = set(zh_symbols) + b = set(en_symbols) + print(sorted(a & b)) diff --git a/onnx_modules/V220_OnnxInference/__init__.py b/onnx_modules/V220_OnnxInference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf94cb1c7f587edfc26b8f49eb2293a91b7fcbba --- /dev/null +++ b/onnx_modules/V220_OnnxInference/__init__.py @@ -0,0 +1,128 @@ +import numpy as np +import onnxruntime as ort + + +def convert_pad_shape(pad_shape): + layer = pad_shape[::-1] + pad_shape = [item for sublist in layer for item in sublist] + return pad_shape + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = np.arange(max_length, dtype=length.dtype) + return np.expand_dims(x, 0) < np.expand_dims(length, 1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + + b, _, t_y, t_x = mask.shape + cum_duration = np.cumsum(duration, -1) + + cum_duration_flat = cum_duration.reshape(b * t_x) + path = sequence_mask(cum_duration_flat, t_y) + path = path.reshape(b, t_x, t_y) + path = path ^ np.pad(path, ((0, 0), (1, 0), (0, 0)))[:, :-1] + path = np.expand_dims(path, 1).transpose(0, 1, 3, 2) + return path + + +class OnnxInferenceSession: + def __init__(self, path, Providers=["CPUExecutionProvider"]): + self.enc = ort.InferenceSession(path["enc"], providers=Providers) + self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers) + self.dp = ort.InferenceSession(path["dp"], providers=Providers) + self.sdp = ort.InferenceSession(path["sdp"], providers=Providers) + self.flow = ort.InferenceSession(path["flow"], providers=Providers) + self.dec = ort.InferenceSession(path["dec"], providers=Providers) + + def __call__( + self, + seq, + tone, + language, + bert_zh, + bert_jp, + bert_en, + emo, + sid, + seed=114514, + seq_noise_scale=0.8, + sdp_noise_scale=0.6, + length_scale=1.0, + sdp_ratio=0.0, + ): + if seq.ndim == 1: + seq = np.expand_dims(seq, 0) + if 
tone.ndim == 1: + tone = np.expand_dims(tone, 0) + if language.ndim == 1: + language = np.expand_dims(language, 0) + assert(seq.ndim == 2,tone.ndim == 2,language.ndim == 2) + g = self.emb_g.run( + None, + { + "sid": sid.astype(np.int64), + }, + )[0] + g = np.expand_dims(g, -1) + enc_rtn = self.enc.run( + None, + { + "x": seq.astype(np.int64), + "t": tone.astype(np.int64), + "language": language.astype(np.int64), + "bert_0": bert_zh.astype(np.float32), + "bert_1": bert_jp.astype(np.float32), + "bert_2": bert_en.astype(np.float32), + "emo": emo.astype(np.float32), + "g": g.astype(np.float32), + }, + ) + x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3] + np.random.seed(seed) + zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale + logw = self.sdp.run( + None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g} + )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[ + 0 + ] * ( + 1 - sdp_ratio + ) + w = np.exp(logw) * x_mask * length_scale + w_ceil = np.ceil(w) + y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype( + np.int64 + ) + y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1) + attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1) + attn = generate_path(w_ceil, attn_mask) + m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = ( + m_p + + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2]) + * np.exp(logs_p) + * seq_noise_scale + ) + + z = self.flow.run( + None, + { + "z_p": z_p.astype(np.float32), + "y_mask": y_mask.astype(np.float32), + "g": g, + }, + )[0] + + return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0] diff --git a/onnx_modules/V220_novq_dev/__init__.py b/onnx_modules/V220_novq_dev/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e90089ffbadbc73e96e38a8f2eae9542e184c2d5 --- /dev/null +++ b/onnx_modules/V220_novq_dev/__init__.py @@ -0,0 +1,4 @@ +from .text.symbols import symbols +from .models_onnx import SynthesizerTrn + +__all__ = ["symbols", "SynthesizerTrn"] diff --git a/onnx_modules/V220_novq_dev/attentions_onnx.py b/onnx_modules/V220_novq_dev/attentions_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6243cd22de6d9e70f069f991c835283cb6c501 --- /dev/null +++ b/onnx_modules/V220_novq_dev/attentions_onnx.py @@ -0,0 +1,378 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import logging + +logger = logging.getLogger(__name__) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + 
n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + # if isflow: + # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1) + # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + # self.cond_layer = weight_norm(cond_layer, name='weight') + # self.gin_channels = 256 + self.cond_layer_idx = self.n_layers + if "gin_channels" in kwargs: + self.gin_channels = kwargs["gin_channels"] + if self.gin_channels != 0: + self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) + # vits2 says 3rd block, so idx is 2 by default + self.cond_layer_idx = ( + kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 + ) + logging.debug(self.gin_channels, self.cond_layer_idx) + assert ( + self.cond_layer_idx < self.n_layers + ), "cond_layer_idx should be less than n_layers" + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + if i == self.cond_layer_idx and g is not None: + g = self.spk_emb_linear(g.transpose(1, 2)) + g = g.transpose(1, 2) + x = x + g + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + 
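+ # Query/key/value projections are Xavier-initialized; when proximal_init is set,
+ # the key projection below is additionally copied from the query projection.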
nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. 
+ pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/onnx_modules/V220_novq_dev/models_onnx.py b/onnx_modules/V220_novq_dev/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..0be615899edad1ba3f49da0ea0946be7443b2d25 --- /dev/null +++ b/onnx_modules/V220_novq_dev/models_onnx.py @@ -0,0 +1,1048 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules +from . 
import attentions_onnx + +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from commons import init_weights, get_padding +from .text import symbols, num_tones, num_languages + + +class DurationDiscriminator(nn.Module): # vits2 + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d( + 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_1(x) + x = self.drop(x) + x = self.pre_out_conv_2(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_2(x) + x = self.drop(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append(output_prob) + + return output_probs + + +class TransformerCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + share_parameter=False, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + + self.wn = ( + attentions_onnx.FFT( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + isflow=True, + gin_channels=self.gin_channels, + ) + if share_parameter + else None + ) + + for i in range(n_flows): + self.flows.append( + modules.TransformerCouplingLayer( + channels, + hidden_channels, + kernel_size, + n_layers, + n_heads, + p_dropout, + filter_channels, + mean_only=True, + wn_sharing_parameter=self.wn, + gin_channels=self.gin_channels, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + if not reverse: + for flow in self.flows: + x, _ = 
flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class StochasticDurationPredictor(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, z, g=None): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + for flow in flows: + z = flow(z, x_mask, g=x, reverse=True) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class Bottleneck(nn.Sequential): + def __init__(self, in_dim, hidden_dim): + c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + super().__init__(*[c_fc1, c_fc2]) + + +class Block(nn.Module): + def __init__(self, in_dim, hidden_dim) -> None: + 
super().__init__() + self.norm = nn.LayerNorm(in_dim) + self.mlp = MLP(in_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.mlp(self.norm(x)) + return x + + +class MLP(nn.Module): + def __init__(self, in_dim, hidden_dim): + super().__init__() + self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False) + + def forward(self, x: torch.Tensor): + x = F.silu(self.c_fc1(x)) * self.c_fc2(x) + x = self.c_proj(x) + return x + + +class TextEncoder(nn.Module): + def __init__( + self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_speakers, + gin_channels=0, + ): + super().__init__() + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + self.emb = nn.Embedding(len(symbols), hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) + self.tone_emb = nn.Embedding(num_tones, hidden_channels) + nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5) + self.language_emb = nn.Embedding(num_languages, hidden_channels) + nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5) + self.bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + # self.emo_proj = nn.Linear(1024, 1024) + # self.emo_quantizer = nn.ModuleList() + # for i in range(0, n_speakers): + # self.emo_quantizer.append( + # VectorQuantize( + # dim=1024, + # codebook_size=10, + # decay=0.8, + # commitment_weight=1.0, + # learnable_codebook=True, + # ema_update=False, + # ) + # ) + # self.emo_q_proj = nn.Linear(1024, hidden_channels) + self.n_speakers = n_speakers + self.emo_proj = nn.Linear(512, hidden_channels) + + self.encoder = attentions_onnx.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + gin_channels=self.gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g=None + ): + x_mask = torch.ones_like(x).unsqueeze(0) + bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2) + ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose( + 1, 2 + ) + en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose( + 1, 2 + ) + + x = ( + self.emb(x) + + self.tone_emb(tone) + + self.language_emb(language) + + bert_emb + + ja_bert_emb + + en_bert_emb + + self.emo_proj(emo) + ) * math.sqrt( + self.hidden_channels + ) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = x_mask.to(x.dtype) + + x = self.encoder(x * x_mask, x_mask, g=g) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return x, m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + 
self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for layer in self.ups: + remove_weight_norm(layer) + for layer in self.resblocks: + 
layer.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class ReferenceEncoder(nn.Module): + """ + inputs --- [N, Ty/r, n_mels*r] mels + outputs --- [N, ref_enc_gru_size] + """ + + def __init__(self, spec_channels, gin_channels=0): + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), 
+ padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501 + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, gin_channels) + + def forward(self, inputs, mask=None): + N = inputs.size(0) + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + for conv in self.convs: + out = conv(out) + # out = wn(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L, kernel_size, stride, pad, n_convs): + for i in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + n_vocab, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=256, + gin_channels=256, + use_sdp=True, + n_flow_layer=4, + n_layers_trans_flow=4, + flow_share_parameter=False, + use_transformer_flow=True, + **kwargs, + ): + super().__init__() + self.n_vocab = n_vocab + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.n_layers_trans_flow = n_layers_trans_flow + self.use_spk_conditioned_encoder = kwargs.get( + "use_spk_conditioned_encoder", True + ) + self.use_sdp = use_sdp + self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False) + self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01) + self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6) + self.current_mas_noise_scale = self.mas_noise_scale_initial + if self.use_spk_conditioned_encoder and gin_channels > 0: + self.enc_gin_channels = gin_channels + self.enc_p = TextEncoder( + n_vocab, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + self.n_speakers, + gin_channels=self.enc_gin_channels, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + if use_transformer_flow: + self.flow = 
TransformerCouplingBlock( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers_trans_flow, + 5, + p_dropout, + n_flow_layer, + gin_channels=gin_channels, + share_parameter=flow_share_parameter, + ) + else: + self.flow = ResidualCouplingBlock( + inter_channels, + hidden_channels, + 5, + 1, + n_flow_layer, + gin_channels=gin_channels, + ) + self.sdp = StochasticDurationPredictor( + hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels + ) + self.dp = DurationPredictor( + hidden_channels, 256, 3, 0.5, gin_channels=gin_channels + ) + + if n_speakers >= 1: + self.emb_g = nn.Embedding(n_speakers, gin_channels) + else: + self.ref_enc = ReferenceEncoder(spec_channels, gin_channels) + + def export_onnx( + self, + path, + max_len=None, + sdp_ratio=0, + y=None, + ): + noise_scale = 0.667 + length_scale = 1 + noise_scale_w = 0.8 + x = ( + torch.LongTensor( + [ + 0, + 97, + 0, + 8, + 0, + 78, + 0, + 8, + 0, + 76, + 0, + 37, + 0, + 40, + 0, + 97, + 0, + 8, + 0, + 23, + 0, + 8, + 0, + 74, + 0, + 26, + 0, + 104, + 0, + ] + ) + .unsqueeze(0) + .cpu() + ) + tone = torch.zeros_like(x).cpu() + language = torch.zeros_like(x).cpu() + x_lengths = torch.LongTensor([x.shape[1]]).cpu() + sid = torch.LongTensor([0]).cpu() + bert = torch.randn(size=(x.shape[1], 1024)).cpu() + ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + en_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + + if self.n_speakers > 0: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + torch.onnx.export( + self.emb_g, + (sid), + f"onnx/{path}/{path}_emb.onnx", + input_names=["sid"], + output_names=["g"], + verbose=True, + ) + else: + g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) + + emo = torch.randn(512, 1) + + torch.onnx.export( + self.enc_p, + (x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g), + f"onnx/{path}/{path}_enc_p.onnx", + input_names=[ + "x", + "x_lengths", + "t", + "language", + "bert_0", + "bert_1", + "bert_2", + "emo", + "g", + ], + output_names=["xout", "m_p", "logs_p", "x_mask"], + dynamic_axes={ + "x": [0, 1], + "t": [0, 1], + "language": [0, 1], + "bert_0": [0], + "bert_1": [0], + "bert_2": [0], + "xout": [0, 2], + "m_p": [0, 2], + "logs_p": [0, 2], + "x_mask": [0, 2], + }, + verbose=True, + opset_version=16, + ) + + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g + ) + + zinput = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale_w + ) + torch.onnx.export( + self.sdp, + (x, x_mask, zinput, g), + f"onnx/{path}/{path}_sdp.onnx", + input_names=["x", "x_mask", "zin", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + torch.onnx.export( + self.dp, + (x, x_mask, g), + f"onnx/{path}/{path}_dp.onnx", + input_names=["x", "x_mask", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp( + x, x_mask, g=g + ) * (1 - sdp_ratio) + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to( + x_mask.dtype + ) + attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + attn = commons.generate_path(w_ceil, attn_mask) + + m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + 
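+ # logs_p is expanded from phoneme level [b, d, t] to frame level [b, d, t'] below,
+ # exactly as m_p was above, by multiplying with the predicted alignment path `attn`.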
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + torch.onnx.export( + self.flow, + (z_p, y_mask, g), + f"onnx/{path}/{path}_flow.onnx", + input_names=["z_p", "y_mask", "g"], + output_names=["z"], + dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]}, + verbose=True, + ) + + z = self.flow(z_p, y_mask, g=g, reverse=True) + z_in = (z * y_mask)[:, :, :max_len] + + torch.onnx.export( + self.dec, + (z_in, g), + f"onnx/{path}/{path}_dec.onnx", + input_names=["z_in", "g"], + output_names=["o"], + dynamic_axes={"z_in": [0, 2], "o": [0, 2]}, + verbose=True, + ) + o = self.dec((z * y_mask)[:, :, :max_len], g=g) diff --git a/onnx_modules/V220_novq_dev/text/__init__.py b/onnx_modules/V220_novq_dev/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e670aedef137ce1ddcebd447524fe3834211abc --- /dev/null +++ b/onnx_modules/V220_novq_dev/text/__init__.py @@ -0,0 +1 @@ +from .symbols import * diff --git a/onnx_modules/V220_novq_dev/text/symbols.py b/onnx_modules/V220_novq_dev/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..846de64584e9ba4b8d96aab36d4efbcefb1a11e7 --- /dev/null +++ b/onnx_modules/V220_novq_dev/text/symbols.py @@ -0,0 +1,187 @@ +punctuation = ["!", "?", "…", ",", ".", "'", "-"] +pu_symbols = punctuation + ["SP", "UNK"] +pad = "_" + +# chinese +zh_symbols = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "b", + "c", + "ch", + "d", + "e", + "ei", + "en", + "eng", + "er", + "f", + "g", + "h", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "j", + "k", + "l", + "m", + "n", + "o", + "ong", + "ou", + "p", + "q", + "r", + "s", + "sh", + "t", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", + "w", + "x", + "y", + "z", + "zh", + "AA", + "EE", + "OO", +] +num_zh_tones = 6 + +# japanese +ja_symbols = [ + "N", + "a", + "a:", + "b", + "by", + "ch", + "d", + "dy", + "e", + "e:", + "f", + "g", + "gy", + "h", + "hy", + "i", + "i:", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "o:", + "p", + "py", + "q", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "u:", + "w", + "y", + "z", + "zy", +] +num_ja_tones = 2 + +# English +en_symbols = [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "b", + "ch", + "d", + "dh", + "eh", + "er", + "ey", + "f", + "g", + "hh", + "ih", + "iy", + "jh", + "k", + "l", + "m", + "n", + "ng", + "ow", + "oy", + "p", + "r", + "s", + "sh", + "t", + "th", + "uh", + "uw", + "V", + "w", + "y", + "z", + "zh", +] +num_en_tones = 4 + +# combine all symbols +normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) +symbols = [pad] + normal_symbols + pu_symbols +sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] + +# combine all tones +num_tones = num_zh_tones + num_ja_tones + num_en_tones + +# language maps +language_id_map = {"ZH": 0, "JP": 1, "EN": 2} +num_languages = len(language_id_map.keys()) + +language_tone_start_map = { + "ZH": 0, + "JP": num_zh_tones, + "EN": num_zh_tones + num_ja_tones, +} + +if __name__ == "__main__": + a = set(zh_symbols) + b = set(en_symbols) + print(sorted(a & b)) diff --git a/onnx_modules/V230/__init__.py b/onnx_modules/V230/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e90089ffbadbc73e96e38a8f2eae9542e184c2d5 --- /dev/null +++ 
b/onnx_modules/V230/__init__.py @@ -0,0 +1,4 @@ +from .text.symbols import symbols +from .models_onnx import SynthesizerTrn + +__all__ = ["symbols", "SynthesizerTrn"] diff --git a/onnx_modules/V230/attentions_onnx.py b/onnx_modules/V230/attentions_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6243cd22de6d9e70f069f991c835283cb6c501 --- /dev/null +++ b/onnx_modules/V230/attentions_onnx.py @@ -0,0 +1,378 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import logging + +logger = logging.getLogger(__name__) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + # if isflow: + # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1) + # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + # self.cond_layer = weight_norm(cond_layer, name='weight') + # self.gin_channels = 256 + self.cond_layer_idx = self.n_layers + if "gin_channels" in kwargs: + self.gin_channels = kwargs["gin_channels"] + if self.gin_channels != 0: + self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) + # vits2 says 3rd block, so idx is 2 by default + self.cond_layer_idx = ( + kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 + ) + logging.debug(self.gin_channels, self.cond_layer_idx) + assert ( + self.cond_layer_idx < self.n_layers + ), "cond_layer_idx should be less than n_layers" + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + if i == self.cond_layer_idx and g is not None: + g = self.spk_emb_linear(g.transpose(1, 2)) + g = g.transpose(1, 2) + x = x + g + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, 
x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." 
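+ # Block-local attention: build a banded mask keeping only positions within
+ # `block_length` of the diagonal, then mask all other score entries out.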
+ block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/onnx_modules/V230/models_onnx.py b/onnx_modules/V230/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..be0f5a7ec5fd65d44edd4abbce8eef4a47ed6d5e --- /dev/null +++ b/onnx_modules/V230/models_onnx.py @@ -0,0 +1,1061 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules +from . 
import attentions_onnx + + +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from commons import init_weights, get_padding +from .text import symbols, num_tones, num_languages + + + + +class DurationDiscriminator(nn.Module): # vits2 + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.LSTM = nn.LSTM( + 2 * filter_channels, filter_channels, batch_first=True, bidirectional=True + ) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential( + nn.Linear(2 * filter_channels, 1), nn.Sigmoid() + ) + + def forward_probability(self, x, dur): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = x.transpose(1, 2) + x, _ = self.LSTM(x) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, dur) + output_probs.append(output_prob) + + return output_probs + + +class TransformerCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + share_parameter=False, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + + self.wn = ( + attentions_onnx.FFT( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + isflow=True, + gin_channels=self.gin_channels, + ) + if share_parameter + else None + ) + + for i in range(n_flows): + self.flows.append( + modules.TransformerCouplingLayer( + channels, + hidden_channels, + kernel_size, + n_layers, + n_heads, + p_dropout, + filter_channels, + mean_only=True, + wn_sharing_parameter=self.wn, + gin_channels=self.gin_channels, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class StochasticDurationPredictor(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. 
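+ # (i.e. the filter_channels argument is currently ignored: every layer below is
+ # built with filter_channels == in_channels, per the upstream note above)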
+ self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, z, g=None): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + for flow in flows: + z = flow(z, x_mask, g=x, reverse=True) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class Bottleneck(nn.Sequential): + def __init__(self, in_dim, hidden_dim): + c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + super().__init__(*[c_fc1, c_fc2]) + + +class Block(nn.Module): + def __init__(self, in_dim, hidden_dim) -> None: + super().__init__() + self.norm = nn.LayerNorm(in_dim) + self.mlp = MLP(in_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.mlp(self.norm(x)) + return x + + +class MLP(nn.Module): + def __init__(self, in_dim, hidden_dim): + super().__init__() + self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False) + self.c_proj = 
nn.Linear(hidden_dim, in_dim, bias=False) + + def forward(self, x: torch.Tensor): + x = F.silu(self.c_fc1(x)) * self.c_fc2(x) + x = self.c_proj(x) + return x + + +class TextEncoder(nn.Module): + def __init__( + self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + gin_channels=0, + ): + super().__init__() + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + self.emb = nn.Embedding(len(symbols), hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) + self.tone_emb = nn.Embedding(num_tones, hidden_channels) + nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5) + self.language_emb = nn.Embedding(num_languages, hidden_channels) + nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5) + self.bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1) + + self.encoder = attentions_onnx.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + gin_channels=self.gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None): + x_mask = torch.ones_like(x).unsqueeze(0) + bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2) + ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2) + en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2) + x = ( + self.emb(x) + + self.tone_emb(tone) + + self.language_emb(language) + + bert_emb + + ja_bert_emb + + en_bert_emb + ) * math.sqrt( + self.hidden_channels + ) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = x_mask.to(x.dtype) + + x = self.encoder(x * x_mask, x_mask, g=g) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return x, m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=True): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size 
+ self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for layer in self.ups: + remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + 
padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class WavLMDiscriminator(nn.Module): + """docstring for Discriminator.""" + + def __init__( + self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False + ): + super(WavLMDiscriminator, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.pre = norm_f( + Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0) + ) + + self.convs = nn.ModuleList( + [ + norm_f( + nn.Conv1d( + initial_channel, initial_channel * 2, kernel_size=5, padding=2 + ) + ), + norm_f( + nn.Conv1d( + initial_channel * 2, + initial_channel * 4, + kernel_size=5, + padding=2, + ) + ), + norm_f( + nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2) + ), + ] + ) + + self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1)) + + def forward(self, x): + x = self.pre(x) + + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x + + +class ReferenceEncoder(nn.Module): + """ + inputs --- [N, Ty/r, n_mels*r] mels + outputs --- [N, ref_enc_gru_size] + """ + + def __init__(self, spec_channels, gin_channels=0): + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + 
weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501 + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, gin_channels) + + def forward(self, inputs, mask=None): + N = inputs.size(0) + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + for conv in self.convs: + out = conv(out) + # out = wn(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L, kernel_size, stride, pad, n_convs): + for i in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + n_vocab, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=256, + gin_channels=256, + use_sdp=True, + n_flow_layer=4, + n_layers_trans_flow=4, + flow_share_parameter=False, + use_transformer_flow=True, + **kwargs + ): + super().__init__() + self.n_vocab = n_vocab + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.n_layers_trans_flow = n_layers_trans_flow + self.use_spk_conditioned_encoder = kwargs.get( + "use_spk_conditioned_encoder", True + ) + self.use_sdp = use_sdp + self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False) + self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01) + self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6) + self.current_mas_noise_scale = self.mas_noise_scale_initial + if self.use_spk_conditioned_encoder and gin_channels > 0: + self.enc_gin_channels = gin_channels + self.enc_p = TextEncoder( + n_vocab, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + gin_channels=self.enc_gin_channels, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 
16, + gin_channels=gin_channels, + ) + if use_transformer_flow: + self.flow = TransformerCouplingBlock( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers_trans_flow, + 5, + p_dropout, + n_flow_layer, + gin_channels=gin_channels, + share_parameter=flow_share_parameter, + ) + else: + self.flow = ResidualCouplingBlock( + inter_channels, + hidden_channels, + 5, + 1, + n_flow_layer, + gin_channels=gin_channels, + ) + self.sdp = StochasticDurationPredictor( + hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels + ) + self.dp = DurationPredictor( + hidden_channels, 256, 3, 0.5, gin_channels=gin_channels + ) + + if n_speakers >= 1: + self.emb_g = nn.Embedding(n_speakers, gin_channels) + else: + self.ref_enc = ReferenceEncoder(spec_channels, gin_channels) + + def export_onnx( + self, + path, + max_len=None, + sdp_ratio=0, + y=None, + ): + noise_scale = 0.667 + length_scale = 1 + noise_scale_w = 0.8 + x = ( + torch.LongTensor( + [ + 0, + 97, + 0, + 8, + 0, + 78, + 0, + 8, + 0, + 76, + 0, + 37, + 0, + 40, + 0, + 97, + 0, + 8, + 0, + 23, + 0, + 8, + 0, + 74, + 0, + 26, + 0, + 104, + 0, + ] + ) + .unsqueeze(0) + .cpu() + ) + tone = torch.zeros_like(x).cpu() + language = torch.zeros_like(x).cpu() + x_lengths = torch.LongTensor([x.shape[1]]).cpu() + sid = torch.LongTensor([0]).cpu() + bert = torch.randn(size=(x.shape[1], 1024)).cpu() + ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + en_bert = torch.randn(size=(x.shape[1], 1024)).cpu() + + if self.n_speakers > 0: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + torch.onnx.export( + self.emb_g, + (sid), + f"onnx/{path}/{path}_emb.onnx", + input_names=["sid"], + output_names=["g"], + verbose=True, + ) + else: + g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1) + + torch.onnx.export( + self.enc_p, + (x, x_lengths, tone, language, bert, ja_bert, en_bert, g), + f"onnx/{path}/{path}_enc_p.onnx", + input_names=[ + "x", + "x_lengths", + "t", + "language", + "bert_0", + "bert_1", + "bert_2", + "g", + ], + output_names=["xout", "m_p", "logs_p", "x_mask"], + dynamic_axes={ + "x": [0, 1], + "t": [0, 1], + "language": [0, 1], + "bert_0": [0], + "bert_1": [0], + "bert_2": [0], + "xout": [0, 2], + "m_p": [0, 2], + "logs_p": [0, 2], + "x_mask": [0, 2], + }, + verbose=True, + opset_version=16, + ) + + x, m_p, logs_p, x_mask = self.enc_p( + x, x_lengths, tone, language, bert, ja_bert, en_bert, g + ) + + zinput = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale_w + ) + torch.onnx.export( + self.sdp, + (x, x_mask, zinput, g), + f"onnx/{path}/{path}_sdp.onnx", + input_names=["x", "x_mask", "zin", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + torch.onnx.export( + self.dp, + (x, x_mask, g), + f"onnx/{path}/{path}_dp.onnx", + input_names=["x", "x_mask", "g"], + output_names=["logw"], + dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]}, + verbose=True, + ) + logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp( + x, x_mask, g=g + ) * (1 - sdp_ratio) + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to( + x_mask.dtype + ) + attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + attn = commons.generate_path(w_ceil, attn_mask) + + m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], 
[b, t, d] -> [b, d, t'] + logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + torch.onnx.export( + self.flow, + (z_p, y_mask, g), + f"onnx/{path}/{path}_flow.onnx", + input_names=["z_p", "y_mask", "g"], + output_names=["z"], + dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]}, + verbose=True, + ) + + z = self.flow(z_p, y_mask, g=g, reverse=True) + z_in = (z * y_mask)[:, :, :max_len] + + torch.onnx.export( + self.dec, + (z_in, g), + f"onnx/{path}/{path}_dec.onnx", + input_names=["z_in", "g"], + output_names=["o"], + dynamic_axes={"z_in": [0, 2], "o": [0, 2]}, + verbose=True, + ) + o = self.dec((z * y_mask)[:, :, :max_len], g=g) diff --git a/onnx_modules/V230/text/__init__.py b/onnx_modules/V230/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e670aedef137ce1ddcebd447524fe3834211abc --- /dev/null +++ b/onnx_modules/V230/text/__init__.py @@ -0,0 +1 @@ +from .symbols import * diff --git a/onnx_modules/V230/text/symbols.py b/onnx_modules/V230/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..e489dd0daf8de5ca6529a69734c8e1ca35087ed7 --- /dev/null +++ b/onnx_modules/V230/text/symbols.py @@ -0,0 +1,187 @@ +punctuation = ["!", "?", "…", ",", ".", "'", "-"] +pu_symbols = punctuation + ["SP", "UNK"] +pad = "_" + +# chinese +zh_symbols = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "b", + "c", + "ch", + "d", + "e", + "ei", + "en", + "eng", + "er", + "f", + "g", + "h", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "j", + "k", + "l", + "m", + "n", + "o", + "ong", + "ou", + "p", + "q", + "r", + "s", + "sh", + "t", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", + "w", + "x", + "y", + "z", + "zh", + "AA", + "EE", + "OO", +] +num_zh_tones = 6 + +# japanese +ja_symbols = [ + "N", + "a", + "a:", + "b", + "by", + "ch", + "d", + "dy", + "e", + "e:", + "f", + "g", + "gy", + "h", + "hy", + "i", + "i:", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "o:", + "p", + "py", + "q", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "ty", + "u", + "u:", + "w", + "y", + "z", + "zy", +] +num_ja_tones = 2 + +# English +en_symbols = [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "b", + "ch", + "d", + "dh", + "eh", + "er", + "ey", + "f", + "g", + "hh", + "ih", + "iy", + "jh", + "k", + "l", + "m", + "n", + "ng", + "ow", + "oy", + "p", + "r", + "s", + "sh", + "t", + "th", + "uh", + "uw", + "V", + "w", + "y", + "z", + "zh", +] +num_en_tones = 4 + +# combine all symbols +normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols)) +symbols = [pad] + normal_symbols + pu_symbols +sil_phonemes_ids = [symbols.index(i) for i in pu_symbols] + +# combine all tones +num_tones = num_zh_tones + num_ja_tones + num_en_tones + +# language maps +language_id_map = {"ZH": 0, "JP": 1, "EN": 2} +num_languages = len(language_id_map.keys()) + +language_tone_start_map = { + "ZH": 0, + "JP": num_zh_tones, + "EN": num_zh_tones + num_ja_tones, +} + +if __name__ == "__main__": + a = set(zh_symbols) + b = set(en_symbols) + print(sorted(a & b)) diff --git a/onnx_modules/V230_OnnxInference/__init__.py b/onnx_modules/V230_OnnxInference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7170e74be0869a146d865e8d4af0823be875ef35 --- /dev/null +++ 
b/onnx_modules/V230_OnnxInference/__init__.py @@ -0,0 +1,126 @@ +import numpy as np +import onnxruntime as ort + + +def convert_pad_shape(pad_shape): + layer = pad_shape[::-1] + pad_shape = [item for sublist in layer for item in sublist] + return pad_shape + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = np.arange(max_length, dtype=length.dtype) + return np.expand_dims(x, 0) < np.expand_dims(length, 1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + + b, _, t_y, t_x = mask.shape + cum_duration = np.cumsum(duration, -1) + + cum_duration_flat = cum_duration.reshape(b * t_x) + path = sequence_mask(cum_duration_flat, t_y) + path = path.reshape(b, t_x, t_y) + path = path ^ np.pad(path, ((0, 0), (1, 0), (0, 0)))[:, :-1] + path = np.expand_dims(path, 1).transpose(0, 1, 3, 2) + return path + + +class OnnxInferenceSession: + def __init__(self, path, Providers=["CPUExecutionProvider"]): + self.enc = ort.InferenceSession(path["enc"], providers=Providers) + self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers) + self.dp = ort.InferenceSession(path["dp"], providers=Providers) + self.sdp = ort.InferenceSession(path["sdp"], providers=Providers) + self.flow = ort.InferenceSession(path["flow"], providers=Providers) + self.dec = ort.InferenceSession(path["dec"], providers=Providers) + + def __call__( + self, + seq, + tone, + language, + bert_zh, + bert_jp, + bert_en, + sid, + seed=114514, + seq_noise_scale=0.8, + sdp_noise_scale=0.6, + length_scale=1.0, + sdp_ratio=0.0, + ): + if seq.ndim == 1: + seq = np.expand_dims(seq, 0) + if tone.ndim == 1: + tone = np.expand_dims(tone, 0) + if language.ndim == 1: + language = np.expand_dims(language, 0) + assert seq.ndim == 2 and tone.ndim == 2 and language.ndim == 2 + g = self.emb_g.run( + None, + { + "sid": sid.astype(np.int64), + }, + )[0] + g = np.expand_dims(g, -1) + enc_rtn = self.enc.run( + None, + { + "x": seq.astype(np.int64), + "t": tone.astype(np.int64), + "language": language.astype(np.int64), + "bert_0": bert_zh.astype(np.float32), + "bert_1": bert_jp.astype(np.float32), + "bert_2": bert_en.astype(np.float32), + "g": g.astype(np.float32), + }, + ) + x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3] + np.random.seed(seed) + zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale + logw = self.sdp.run( + None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g} + )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[ + 0 + ] * ( + 1 - sdp_ratio + ) + w = np.exp(logw) * x_mask * length_scale + w_ceil = np.ceil(w) + y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype( + np.int64 + ) + y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1) + attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1) + attn = generate_path(w_ceil, attn_mask) + m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose( + 0, 2, 1 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = ( + m_p + + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2]) + * np.exp(logs_p) + * seq_noise_scale + ) + + z = self.flow.run( + None, + { + "z_p": z_p.astype(np.float32), + "y_mask": y_mask.astype(np.float32), + "g": g, + }, + )[0] + + return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0] diff --git
a/onnx_modules/__init__.py b/onnx_modules/__init__.py index 72abb3af31c78af9d3f770b70d97e5c16f147964..1a1a165a3a5e7b4246a08a32eda9a473334677f1 100644 --- a/onnx_modules/__init__.py +++ b/onnx_modules/__init__.py @@ -1,14 +1,21 @@ -from utils import get_hparams_from_file, load_checkpoint +from utils import get_hparams_from_file, load_checkpoint import json -def export_onnx(export_path, model_path, config_path): +def export_onnx(export_path, model_path, config_path, novq, dev): hps = get_hparams_from_file(config_path) version = hps.version[0:3] - if version == "2.0": + if version == "2.0" or (version == "2.1" and novq): from .V200 import SynthesizerTrn, symbols - elif version == "2.1": + elif version == "2.1" and (not novq): from .V210 import SynthesizerTrn, symbols + elif version == "2.2": + if novq and dev: + from .V220_novq_dev import SynthesizerTrn, symbols + else: + from .V220 import SynthesizerTrn, symbols + elif version == "2.3": + from .V230 import SynthesizerTrn, symbols net_g = SynthesizerTrn( len(symbols), hps.data.filter_length // 2 + 1, @@ -41,6 +48,7 @@ def export_onnx(export_path, model_path, config_path): "deberta-v2-large-japanese", "bert-base-japanese-v3", ], + "Clap": "clap-htsat-fused", } with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile: diff --git a/re_matching.py b/re_matching.py index 3c5340c99f9389c00f626cb2eb8526be0144ee13..dd464a5cca8fd00d2da35dfddf9a03295dd5b627 100644 --- a/re_matching.py +++ b/re_matching.py @@ -44,7 +44,6 @@ def text_matching(text: str) -> list: result = [] for speaker, dialogue in matches: result.append(extract_language_and_text_updated(speaker, dialogue)) - print(result) return result diff --git a/requirements.txt b/requirements.txt index c1a1306a658e07b9f53e649159ba20fe86c18908..73712096e907e84e3e4d560d9a782311e5840fe5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ jieba transformers pypinyin cn2an -gradio==3.38.0 +gradio==3.50.2 av mecab-python3 loguru @@ -21,8 +21,7 @@ fugashi num2words PyYAML requests -pyopenjtalk; sys_platform == 'linux' -openjtalk; sys_platform != 'linux' +pyopenjtalk-prebuilt jaconv psutil GPUtil diff --git a/resample.py b/resample.py index a0d617558755f1e875be79c4ea5d5696d6bcf106..678c352518a269aa2b4e1fd8acada6e2a00e1de1 100644 --- a/resample.py +++ b/resample.py @@ -10,11 +10,11 @@ from config import config def process(item): - wav_name, args = item - wav_path = os.path.join(args.in_dir, wav_name) + spkdir, wav_name, args = item + wav_path = os.path.join(args.in_dir, spkdir, wav_name) if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): wav, sr = librosa.load(wav_path, sr=args.sr) - soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr) + soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr) if __name__ == "__main__": @@ -54,11 +54,15 @@ if __name__ == "__main__": tasks = [] for dirpath, _, filenames in os.walk(args.in_dir): - if not os.path.isdir(args.out_dir): - os.makedirs(args.out_dir, exist_ok=True) + # 子级目录 + spk_dir = os.path.relpath(dirpath, args.in_dir) + spk_dir_out = os.path.join(args.out_dir, spk_dir) + if not os.path.isdir(spk_dir_out): + os.makedirs(spk_dir_out, exist_ok=True) for filename in filenames: if filename.lower().endswith(".wav"): - tasks.append((filename, args)) + twople = (spk_dir, filename, args) + tasks.append(twople) for _ in tqdm( pool.imap_unordered(process, tasks), diff --git a/resample_legacy.py b/resample_legacy.py new file mode 100644 index 
0000000000000000000000000000000000000000..a0d617558755f1e875be79c4ea5d5696d6bcf106 --- /dev/null +++ b/resample_legacy.py @@ -0,0 +1,71 @@ +import os +import argparse +import librosa +from multiprocessing import Pool, cpu_count + +import soundfile +from tqdm import tqdm + +from config import config + + +def process(item): + wav_name, args = item + wav_path = os.path.join(args.in_dir, wav_name) + if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"): + wav, sr = librosa.load(wav_path, sr=args.sr) + soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--sr", + type=int, + default=config.resample_config.sampling_rate, + help="sampling rate", + ) + parser.add_argument( + "--in_dir", + type=str, + default=config.resample_config.in_dir, + help="path to source dir", + ) + parser.add_argument( + "--out_dir", + type=str, + default=config.resample_config.out_dir, + help="path to target dir", + ) + parser.add_argument( + "--processes", + type=int, + default=0, + help="cpu_processes", + ) + args, _ = parser.parse_known_args() + # autodl 无卡模式会识别出46个cpu + if args.processes == 0: + processes = cpu_count() - 2 if cpu_count() > 4 else 1 + else: + processes = args.processes + pool = Pool(processes=processes) + + tasks = [] + + for dirpath, _, filenames in os.walk(args.in_dir): + if not os.path.isdir(args.out_dir): + os.makedirs(args.out_dir, exist_ok=True) + for filename in filenames: + if filename.lower().endswith(".wav"): + tasks.append((filename, args)) + + for _ in tqdm( + pool.imap_unordered(process, tasks), + ): + pass + + pool.close() + pool.join() + + print("音频重采样完毕!") diff --git a/server_fastapi.py b/server_fastapi.py index 84e8765524b5eff77d4d14fcbd0f84367b91a96d..e0f0d8b6153961de83a2c8830f74913b44aeae54 100644 --- a/server_fastapi.py +++ b/server_fastapi.py @@ -5,6 +5,7 @@ import logging import gc import random +import librosa import gradio import numpy as np import utils @@ -201,30 +202,50 @@ if __name__ == "__main__": language: str, auto_translate: bool, auto_split: bool, - emotion: Optional[int] = None, + emotion: Optional[Union[int, str]] = None, reference_audio=None, + style_text: Optional[str] = None, + style_weight: float = 0.7, ) -> Union[Response, Dict[str, any]]: """TTS实现函数""" # 检查模型是否存在 if model_id not in loaded_models.models.keys(): + logger.error(f"/voice 请求错误:模型model_id={model_id}未加载") return {"status": 10, "detail": f"模型model_id={model_id}未加载"} # 检查是否提供speaker if speaker_name is None and speaker_id is None: + logger.error("/voice 请求错误:推理请求未提供speaker_name或speaker_id") return {"status": 11, "detail": "请提供speaker_name或speaker_id"} elif speaker_name is None: # 检查speaker_id是否存在 if speaker_id not in loaded_models.models[model_id].id2spk.keys(): + logger.error(f"/voice 请求错误:角色speaker_id={speaker_id}不存在") return {"status": 12, "detail": f"角色speaker_id={speaker_id}不存在"} speaker_name = loaded_models.models[model_id].id2spk[speaker_id] # 检查speaker_name是否存在 if speaker_name not in loaded_models.models[model_id].spk2id.keys(): + logger.error(f"/voice 请求错误:角色speaker_name={speaker_name}不存在") return {"status": 13, "detail": f"角色speaker_name={speaker_name}不存在"} + # 未传入则使用默认语言 if language is None: language = loaded_models.models[model_id].language + # 翻译会破坏mix结构,auto也会变得无意义。不要在这两个模式下使用 if auto_translate: + if language == "auto" or language == "mix": + logger.error( + f"/voice 请求错误:请勿同时使用language = {language}与auto_translate模式" + ) + return { + "status": 20, + "detail": f"请勿同时使用language 
= {language}与auto_translate模式", + } text = trans.translate(Sentence=text, to_Language=language.lower()) if reference_audio is not None: ref_audio = BytesIO(await reference_audio.read()) + # 2.2 适配 + if loaded_models.models[model_id].version == "2.2": + ref_audio, _ = librosa.load(ref_audio, 48000) + else: ref_audio = reference_audio if not auto_split: @@ -242,6 +263,8 @@ if __name__ == "__main__": device=loaded_models.models[model_id].device, emotion=emotion, reference_audio=ref_audio, + style_text=style_text, + style_weight=style_weight, ) audio = gradio.processing_utils.convert_to_16_bit_wav(audio) else: @@ -263,6 +286,8 @@ if __name__ == "__main__": device=loaded_models.models[model_id].device, emotion=emotion, reference_audio=ref_audio, + style_text=style_text, + style_weight=style_weight, ) ) audios.append(np.zeros(int(44100 * 0.2))) @@ -291,8 +316,10 @@ if __name__ == "__main__": language: str = Query(None, description="语言"), # 若不指定使用语言则使用默认值 auto_translate: bool = Query(False, description="自动翻译"), auto_split: bool = Query(False, description="自动切分"), - emotion: Optional[int] = Query(None, description="emo"), + emotion: Optional[Union[int, str]] = Query(None, description="emo"), reference_audio: UploadFile = File(None), + style_text: Optional[str] = Form(None, description="风格文本"), + style_weight: float = Query(0.7, description="风格权重"), ): """语音接口,若需要上传参考音频请仅使用post请求""" logger.info( @@ -312,6 +339,8 @@ if __name__ == "__main__": auto_split=auto_split, emotion=emotion, reference_audio=reference_audio, + style_text=style_text, + style_weight=style_weight, ) @app.get("/voice") @@ -330,7 +359,9 @@ if __name__ == "__main__": language: str = Query(None, description="语言"), # 若不指定使用语言则使用默认值 auto_translate: bool = Query(False, description="自动翻译"), auto_split: bool = Query(False, description="自动切分"), - emotion: Optional[int] = Query(None, description="emo"), + emotion: Optional[Union[int, str]] = Query(None, description="emo"), + style_text: Optional[str] = Query(None, description="风格文本"), + style_weight: float = Query(0.7, description="风格权重"), ): """语音接口""" logger.info( @@ -349,6 +380,8 @@ if __name__ == "__main__": auto_translate=auto_translate, auto_split=auto_split, emotion=emotion, + style_text=style_text, + style_weight=style_weight, ) @app.get("/models/info") @@ -370,7 +403,9 @@ if __name__ == "__main__": ) result = loaded_models.del_model(model_id) if result is None: + logger.error(f"/models/delete 模型删除错误:模型{model_id}不存在,删除失败") return {"status": 14, "detail": f"模型{model_id}不存在,删除失败"} + return {"status": 0, "detail": "删除成功"} @app.get("/models/add") @@ -394,6 +429,7 @@ if __name__ == "__main__": elif os.path.isfile(os.path.join(model_dir, "../config.json")): config_path = os.path.join(model_dir, "../config.json") else: + logger.error("/models/add 模型添加失败:未在模型所在目录以及上级目录找到config.json文件") return { "status": 15, "detail": "查询未传入配置文件路径,同时默认路径./与../中不存在配置文件config.json。", @@ -628,8 +664,10 @@ if __name__ == "__main__": f"{request.client.host}:{request.client.port}/tools/get_audio { unquote(str(request.query_params) )}" ) if not os.path.isfile(path): + logger.error(f"/tools/get_audio 获取音频错误:指定音频{path}不存在") return {"status": 18, "detail": "指定音频不存在"} - if not path.endswith(".wav"): + if not path.lower().endswith(".wav"): + logger.error(f"/tools/get_audio 获取音频错误:音频{path}非wav文件") return {"status": 19, "detail": "非wav格式文件"} return FileResponse(path=path) diff --git a/slm/wavlm-base-plus/.gitattributes b/slm/wavlm-base-plus/.gitattributes new file mode 100644 index 
0000000000000000000000000000000000000000..6d34772f5ca361021038b404fb913ec8dc0b1a5a --- /dev/null +++ b/slm/wavlm-base-plus/.gitattributes @@ -0,0 +1,27 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/slm/wavlm-base-plus/README.md b/slm/wavlm-base-plus/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1192671605727eff94113d3b599161892688b9d5 --- /dev/null +++ b/slm/wavlm-base-plus/README.md @@ -0,0 +1,65 @@ +--- +language: +- en +datasets: +tags: +- speech +inference: false +--- + +# WavLM-Base-Plus + +[Microsoft's WavLM](https://github.com/microsoft/unilm/tree/master/wavlm) + +The base model pretrained on 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz. + +**Note**: This model does not have a tokenizer as it was pretrained on audio alone. In order to use this model **speech recognition**, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out [this blog](https://huggingface.co./blog/fine-tune-wav2vec2-english) for more in-detail explanation of how to fine-tune the model. + +The model was pre-trained on: + +- 60,000 hours of [Libri-Light](https://arxiv.org/abs/1912.07875) +- 10,000 hours of [GigaSpeech](https://arxiv.org/abs/2106.06909) +- 24,000 hours of [VoxPopuli](https://arxiv.org/abs/2101.00390) + +[Paper: WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) + +Authors: Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei + +**Abstract** +*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity preservation. 
We first equip the Transformer structure with gated relative position bias to improve its capability on recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.* + +The original model can be found under https://github.com/microsoft/unilm/tree/master/wavlm. + +# Usage + +This is an English pre-trained speech model that has to be fine-tuned on a downstream task like speech recognition or audio classification before it can be +used in inference. The model was pre-trained in English and should therefore perform well only in English. The model has been shown to work well on the [SUPERB benchmark](https://superbbenchmark.org/). + +**Note**: The model was pre-trained on phonemes rather than characters. This means that one should make sure that the input text is converted to a sequence +of phonemes before fine-tuning. + +## Speech Recognition + +To fine-tune the model for speech recognition, see [the official speech recognition example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition). + +## Speech Classification + +To fine-tune the model for speech classification, see [the official audio classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/audio-classification). + +## Speaker Verification + +TODO + +## Speaker Diarization + +TODO + +# Contribution + +The model was contributed by [cywang](https://huggingface.co./cywang) and [patrickvonplaten](https://huggingface.co./patrickvonplaten). 
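As an illustrative addition (not part of the original model card), the Usage notes above roughly correspond to loading this local checkpoint with `transformers` for frame-level feature extraction; the path and the silent dummy waveform below are assumptions:

```python
# Hedged sketch: extract WavLM features from the local checkpoint.
# "./slm/wavlm-base-plus" matches where this repo stores the model; the
# zero waveform is only a placeholder input, not real speech.
import torch
from transformers import Wav2Vec2FeatureExtractor, WavLMModel

extractor = Wav2Vec2FeatureExtractor.from_pretrained("./slm/wavlm-base-plus")
model = WavLMModel.from_pretrained("./slm/wavlm-base-plus").eval()

waveform = torch.zeros(16000)  # one second of 16 kHz "audio" (placeholder)
inputs = extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
print(out.last_hidden_state.shape)  # [1, frames, 768]
```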
+ +# License + +The official license can be found [here](https://github.com/microsoft/UniSpeech/blob/main/LICENSE) + +![design](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/wavlm.png) \ No newline at end of file diff --git a/slm/wavlm-base-plus/config.json b/slm/wavlm-base-plus/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b7b4e5f6c410f71283a59e26250e40855ca99310 --- /dev/null +++ b/slm/wavlm-base-plus/config.json @@ -0,0 +1,99 @@ +{ + "_name_or_path": "wavlm-base-plus", + "activation_dropout": 0.0, + "adapter_kernel_size": 3, + "adapter_stride": 2, + "add_adapter": false, + "apply_spec_augment": true, + "architectures": [ + "WavLMModel" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "codevector_dim": 256, + "contrastive_logits_temperature": 0.1, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "diversity_loss_weight": 0.1, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.1, + "feat_quantizer_dropout": 0.0, + "final_dropout": 0.0, + "freeze_feat_extract_train": true, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.05, + "mask_channel_length": 10, + "mask_channel_min_space": 1, + "mask_channel_other": 0.0, + "mask_channel_prob": 0.0, + "mask_channel_selection": "static", + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_min_space": 1, + "mask_time_other": 0.0, + "mask_time_prob": 0.05, + "mask_time_selection": "static", + "model_type": "wavlm", + "no_mask_channel_overlap": false, + "no_mask_time_overlap": false, + "num_adapter_layers": 3, + "num_attention_heads": 12, + "num_buckets": 320, + "num_codevector_groups": 2, + "num_codevectors_per_group": 320, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_ctc_classes": 80, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "num_negatives": 100, + "output_hidden_size": 768, + "pad_token_id": 0, + "proj_codevector_dim": 256, + "replace_prob": 0.5, + "torch_dtype": "float32", + "transformers_version": "4.13.0.dev0", + "use_weighted_layer_sum": false, + "vocab_size": 32, + "tokenizer_class": "Wav2Vec2CTCTokenizer" +} diff --git a/slm/wavlm-base-plus/preprocessor_config.json b/slm/wavlm-base-plus/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..10f6def8c83d70a2b087a567dcf523b75152a80b --- /dev/null +++ b/slm/wavlm-base-plus/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/slm/wavlm-base-plus/pytorch_model.bin b/slm/wavlm-base-plus/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a8bf334ae0e26f3e95a5eb7e34cf45fe1369e753 --- /dev/null +++ b/slm/wavlm-base-plus/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3bb273a6ace99408b50cfc81afdbb7ef2de02da2eab0234e18db608ce692fe51 +size 377617425 diff --git a/text/__init__.py b/text/__init__.py index 816ad01d95eee3d5547e85d9dc9e46fbaabf34c7..98276d2a843d8f54252894c5e0a8c729b72b947b 100644 --- a/text/__init__.py +++ b/text/__init__.py @@ -18,13 +18,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language): return phones, tones, lang_ids -def get_bert(norm_text, word2ph, language, device): +def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7): from .chinese_bert import get_bert_feature as zh_bert from .english_bert_mock import get_bert_feature as en_bert from .japanese_bert import get_bert_feature as jp_bert lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert} - bert = lang_bert_func_map[language](norm_text, word2ph, device) + bert = lang_bert_func_map[language]( + norm_text, word2ph, device, style_text, style_weight + ) return bert diff --git a/text/chinese_bert.py b/text/chinese_bert.py index 36f1e2a09350584dfe5fca42b27402aa571aba3f..cfa7f6032a3157c283a37881c91c7b9d0f6dba48 100644 --- a/text/chinese_bert.py +++ b/text/chinese_bert.py @@ -12,7 +12,13 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -29,12 +35,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() - + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/english.py b/text/english.py index 0443829d66742101ddba0400a73a8c04be7a4e67..4a2af9523f2f96b7b34a0fff7589a82e1122ecae 100644 --- a/text/english.py +++ b/text/english.py @@ -5,6 +5,7 @@ from g2p_en import G2p from transformers import DebertaV2Tokenizer from text import symbols +from text.symbols import punctuation current_file_path = os.path.dirname(__file__) CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") @@ -217,6 +218,8 @@ def refine_ph(phn): if re.search(r"\d$", phn): tone = int(phn[-1]) + 1 phn = phn[:-1] + else: + tone = 3 return phn.lower(), tone @@ -389,45 +392,84 @@ def sep_text(text): return words +def text_to_words(text): + tokens = tokenizer.tokenize(text) + words = [] + for idx, t in enumerate(tokens): + if t.startswith("▁"): + words.append([t[1:]]) + else: + if t in punctuation: + if idx == len(tokens) - 1: + words.append([f"{t}"]) + else: + if ( + not tokens[idx + 1].startswith("▁") + and tokens[idx + 1] not in punctuation + 
): + if idx == 0: + words.append([]) + words[-1].append(f"{t}") + else: + words.append([f"{t}"]) + else: + if idx == 0: + words.append([]) + words[-1].append(f"{t}") + return words + + def g2p(text): phones = [] tones = [] - # word2ph = [] - words = sep_text(text) - tokens = [tokenizer.tokenize(i) for i in words] + phone_len = [] + # words = sep_text(text) + # tokens = [tokenizer.tokenize(i) for i in words] + words = text_to_words(text) + for word in words: - if word.upper() in eng_dict: - phns, tns = refine_syllables(eng_dict[word.upper()]) - phones.append([post_replace_ph(i) for i in phns]) - tones.append(tns) - # word2ph.append(len(phns)) - else: - phone_list = list(filter(lambda p: p != " ", _g2p(word))) - phns = [] - tns = [] - for ph in phone_list: - if ph in arpa: - ph, tn = refine_ph(ph) - phns.append(ph) - tns.append(tn) - else: - phns.append(ph) - tns.append(0) - phones.append([post_replace_ph(i) for i in phns]) - tones.append(tns) - # word2ph.append(len(phns)) - # phones = [post_replace_ph(i) for i in phones] + temp_phones, temp_tones = [], [] + if len(word) > 1: + if "'" in word: + word = ["".join(word)] + for w in word: + if w in punctuation: + temp_phones.append(w) + temp_tones.append(0) + continue + if w.upper() in eng_dict: + phns, tns = refine_syllables(eng_dict[w.upper()]) + temp_phones += [post_replace_ph(i) for i in phns] + temp_tones += tns + # w2ph.append(len(phns)) + else: + phone_list = list(filter(lambda p: p != " ", _g2p(w))) + phns = [] + tns = [] + for ph in phone_list: + if ph in arpa: + ph, tn = refine_ph(ph) + phns.append(ph) + tns.append(tn) + else: + phns.append(ph) + tns.append(0) + temp_phones += [post_replace_ph(i) for i in phns] + temp_tones += tns + phones += temp_phones + tones += temp_tones + phone_len.append(len(temp_phones)) + # phones = [post_replace_ph(i) for i in phones] word2ph = [] - for token, phoneme in zip(tokens, phones): - phone_len = len(phoneme) + for token, pl in zip(words, phone_len): word_len = len(token) - aaa = distribute_phone(phone_len, word_len) + aaa = distribute_phone(pl, word_len) word2ph += aaa - phones = ["_"] + [j for i in phones for j in i] + ["_"] - tones = [0] + [j for i in tones for j in i] + [0] + phones = ["_"] + phones + ["_"] + tones = [0] + tones + [0] word2ph = [1] + word2ph + [1] assert len(phones) == len(tones), text assert len(phones) == sum(word2ph), text diff --git a/text/english_bert_mock.py b/text/english_bert_mock.py index 85b241c405219c83c617c28bfa0cd274cbc2a557..2f3c9af3d2e9ea6035a6756817948d966bb75a42 100644 --- a/text/english_bert_mock.py +++ b/text/english_bert_mock.py @@ -13,7 +13,13 @@ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -30,11 +36,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert 
len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph)) word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/japanese_bert.py b/text/japanese_bert.py index d47be80f04bfd862cc818f0646a83481b4b15dda..c69f41923f69e1c47534b32fcf76022763854889 100644 --- a/text/japanese_bert.py +++ b/text/japanese_bert.py @@ -13,8 +13,16 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH) models = dict() -def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): +def get_bert_feature( + text, + word2ph, + device=config.bert_gen_config.device, + style_text=None, + style_weight=0.7, +): text = "".join(text2sep_kata(text)[0]) + if style_text: + style_text = "".join(text2sep_kata(style_text)[0]) if ( sys.platform == "darwin" and torch.backends.mps.is_available() @@ -31,12 +39,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device): inputs[i] = inputs[i].to(device) res = models[device](**inputs, output_hidden_states=True) res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu() + if style_text: + style_inputs = tokenizer(style_text, return_tensors="pt") + for i in style_inputs: + style_inputs[i] = style_inputs[i].to(device) + style_res = models[device](**style_inputs, output_hidden_states=True) + style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu() + style_res_mean = style_res.mean(0) assert len(word2ph) == len(text) + 2 word2phone = word2ph phone_level_feature = [] for i in range(len(word2phone)): - repeat_feature = res[i].repeat(word2phone[i], 1) + if style_text: + repeat_feature = ( + res[i].repeat(word2phone[i], 1) * (1 - style_weight) + + style_res_mean.repeat(word2phone[i], 1) * style_weight + ) + else: + repeat_feature = res[i].repeat(word2phone[i], 1) phone_level_feature.append(repeat_feature) phone_level_feature = torch.cat(phone_level_feature, dim=0) diff --git a/text/tone_sandhi.py b/text/tone_sandhi.py index 6a6e4c3e64f1a9e8b9da73fc6fbebf8a33e5602d..372308604d52cc32f80d0146efa95cfdf7e40b05 100644 --- a/text/tone_sandhi.py +++ b/text/tone_sandhi.py @@ -634,9 +634,11 @@ class ToneSandhi: # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] # output seg: [['听一听', 'v']] def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] + new_seg = [] * len(seg) # function 1 - for i, (word, pos) in enumerate(seg): + i = 0 + while i < len(seg): + word, pos = seg[i] if ( i - 1 >= 0 and word == "一" @@ -645,6 +647,7 @@ class ToneSandhi: and seg[i - 1][1] == "v" ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + i += 2 else: if ( i - 2 >= 0 @@ -655,7 +658,8 @@ class ToneSandhi: continue else: new_seg.append([word, pos]) - seg = new_seg + i += 1 + seg = [i for i in new_seg if len(i) > 0] new_seg = [] # function 2 for i, (word, pos) in enumerate(seg): diff --git a/train_ms.py b/train_ms.py index 8c626d07c8ddbf608ca36e42e813ed54577a9698..493822988ea00616910d76bc379653147c8b6395 100644 --- a/train_ms.py +++ b/train_ms.py @@ -27,8 +27,15 @@ from models import ( SynthesizerTrn, MultiPeriodDiscriminator, DurationDiscriminator, + WavLMDiscriminator, +) +from losses import ( + generator_loss, + 
discriminator_loss, + feature_loss, + kl_loss, + WavLMLoss, ) -from losses import generator_loss, discriminator_loss, feature_loss, kl_loss from mel_processing import mel_spectrogram_torch, spec_to_mel_torch from text.symbols import symbols @@ -42,7 +49,6 @@ torch.backends.cuda.enable_flash_sdp(True) torch.backends.cuda.enable_mem_efficient_sdp( True ) # Not available if torch version is lower than 2.0 -torch.backends.cuda.enable_math_sdp(True) global_step = 0 @@ -173,6 +179,8 @@ def run(): 0.1, gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, ).cuda(local_rank) + else: + net_dur_disc = None if ( "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder is True @@ -210,6 +218,9 @@ def run(): param.requires_grad = False net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank) + net_wd = WavLMDiscriminator( + hps.model.slm.hidden, hps.model.slm.nlayers, hps.model.slm.initial_channel + ).cuda(local_rank) optim_g = torch.optim.AdamW( filter(lambda p: p.requires_grad, net_g.parameters()), hps.train.learning_rate, @@ -222,6 +233,12 @@ def run(): betas=hps.train.betas, eps=hps.train.eps, ) + optim_wd = torch.optim.AdamW( + net_wd.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) if net_dur_disc is not None: optim_dur_disc = torch.optim.AdamW( net_dur_disc.parameters(), @@ -233,12 +250,11 @@ def run(): optim_dur_disc = None net_g = DDP(net_g, device_ids=[local_rank], bucket_cap_mb=512) net_d = DDP(net_d, device_ids=[local_rank], bucket_cap_mb=512) - dur_resume_lr = None + net_wd = DDP(net_wd, device_ids=[local_rank], bucket_cap_mb=512) if net_dur_disc is not None: net_dur_disc = DDP( net_dur_disc, device_ids=[local_rank], - find_unused_parameters=True, bucket_cap_mb=512, ) @@ -250,9 +266,10 @@ def run(): token=config.openi_token, mirror=config.mirror, ) - - try: - if net_dur_disc is not None: + dur_resume_lr = hps.train.learning_rate + wd_resume_lr = hps.train.learning_rate + if net_dur_disc is not None: + try: _, _, dur_resume_lr, epoch_str = utils.load_checkpoint( utils.latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), net_dur_disc, @@ -261,28 +278,32 @@ def run(): if "skip_optimizer" in hps.train else True, ) - _, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), - net_g, - optim_g, - skip_optimizer=hps.train.skip_optimizer - if "skip_optimizer" in hps.train - else True, - ) - _, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), - net_d, - optim_d, - skip_optimizer=hps.train.skip_optimizer - if "skip_optimizer" in hps.train - else True, - ) - if not optim_g.param_groups[0].get("initial_lr"): - optim_g.param_groups[0]["initial_lr"] = g_resume_lr - if not optim_d.param_groups[0].get("initial_lr"): - optim_d.param_groups[0]["initial_lr"] = d_resume_lr if not optim_dur_disc.param_groups[0].get("initial_lr"): optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr + except: + print("Initialize dur_disc") + + try: + _, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), + net_g, + optim_g, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in hps.train + else True, + ) + _, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), + net_d, + optim_d, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in 
hps.train + else True, + ) + if not optim_g.param_groups[0].get("initial_lr"): + optim_g.param_groups[0]["initial_lr"] = g_resume_lr + if not optim_d.param_groups[0].get("initial_lr"): + optim_d.param_groups[0]["initial_lr"] = d_resume_lr epoch_str = max(epoch_str, 1) # global_step = (epoch_str - 1) * len(train_loader) @@ -297,21 +318,43 @@ def run(): epoch_str = 1 global_step = 0 + try: + _, optim_wd, wd_resume_lr, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "WD_*.pth"), + net_wd, + optim_wd, + skip_optimizer=hps.train.skip_optimizer + if "skip_optimizer" in hps.train + else True, + ) + if not optim_wd.param_groups[0].get("initial_lr"): + optim_wd.param_groups[0]["initial_lr"] = wd_resume_lr + except Exception as e: + print(e) + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) scheduler_d = torch.optim.lr_scheduler.ExponentialLR( optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) + scheduler_wd = torch.optim.lr_scheduler.ExponentialLR( + optim_wd, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) if net_dur_disc is not None: - if not optim_dur_disc.param_groups[0].get("initial_lr"): - optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR( optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 ) else: scheduler_dur_disc = None - scaler = GradScaler(enabled=hps.train.fp16_run) + scaler = GradScaler(enabled=hps.train.bf16_run) + + wl = WavLMLoss( + hps.model.slm.model, + net_wd, + hps.data.sampling_rate, + hps.model.slm.sr, + ).to(local_rank) for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: @@ -320,9 +363,9 @@ def run(): local_rank, epoch, hps, - [net_g, net_d, net_dur_disc], - [optim_g, optim_d, optim_dur_disc], - [scheduler_g, scheduler_d, scheduler_dur_disc], + [net_g, net_d, net_dur_disc, net_wd, wl], + [optim_g, optim_d, optim_dur_disc, optim_wd], + [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd], scaler, [train_loader, eval_loader], logger, @@ -334,9 +377,9 @@ def run(): local_rank, epoch, hps, - [net_g, net_d, net_dur_disc], - [optim_g, optim_d, optim_dur_disc], - [scheduler_g, scheduler_d, scheduler_dur_disc], + [net_g, net_d, net_dur_disc, net_wd, wl], + [optim_g, optim_d, optim_dur_disc, optim_wd], + [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd], scaler, [train_loader, None], None, @@ -344,6 +387,7 @@ def run(): ) scheduler_g.step() scheduler_d.step() + scheduler_wd.step() if net_dur_disc is not None: scheduler_dur_disc.step() @@ -361,9 +405,9 @@ def train_and_evaluate( logger, writers, ): - net_g, net_d, net_dur_disc = nets - optim_g, optim_d, optim_dur_disc = optims - scheduler_g, scheduler_d, scheduler_dur_disc = schedulers + net_g, net_d, net_dur_disc, net_wd, wl = nets + optim_g, optim_d, optim_dur_disc, optim_wd = optims + scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd = schedulers train_loader, eval_loader = loaders if writers is not None: writer, writer_eval = writers @@ -373,6 +417,7 @@ def train_and_evaluate( net_g.train() net_d.train() + net_wd.train() if net_dur_disc is not None: net_dur_disc.train() for batch_idx, ( @@ -388,7 +433,6 @@ def train_and_evaluate( bert, ja_bert, en_bert, - emo, ) in enumerate(tqdm(train_loader)): if net_g.module.use_noise_scaled_mas: current_mas_noise_scale = ( @@ -411,9 +455,8 @@ def train_and_evaluate( bert = bert.cuda(local_rank, non_blocking=True) ja_bert = ja_bert.cuda(local_rank, 
non_blocking=True) en_bert = en_bert.cuda(local_rank, non_blocking=True) - emo = emo.cuda(local_rank, non_blocking=True) - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): ( y_hat, l_length, @@ -422,9 +465,8 @@ def train_and_evaluate( x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), - (hidden_x, logw, logw_), + (hidden_x, logw, logw_, logw_sdp), g, - loss_commit, ) = net_g( x, x_lengths, @@ -436,7 +478,6 @@ def train_and_evaluate( bert, ja_bert, en_bert, - emo, ) mel = spec_to_mel_torch( spec, @@ -450,7 +491,7 @@ def train_and_evaluate( mel, ids_slice, hps.train.segment_size // hps.data.hop_length ) y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), + y_hat.squeeze(1).float(), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, @@ -466,7 +507,7 @@ def train_and_evaluate( # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( y_d_hat_r, y_d_hat_g ) @@ -475,11 +516,20 @@ def train_and_evaluate( y_dur_hat_r, y_dur_hat_g = net_dur_disc( hidden_x.detach(), x_mask.detach(), + logw_.detach(), logw.detach(), + g.detach(), + ) + y_dur_hat_r_sdp, y_dur_hat_g_sdp = net_dur_disc( + hidden_x.detach(), + x_mask.detach(), logw_.detach(), + logw_sdp.detach(), g.detach(), ) - with autocast(enabled=False): + y_dur_hat_r = y_dur_hat_r + y_dur_hat_r_sdp + y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): # TODO: I think need to mean using the mask, but for now, just mean all ( loss_dur_disc, @@ -490,31 +540,60 @@ def train_and_evaluate( optim_dur_disc.zero_grad() scaler.scale(loss_dur_disc_all).backward() scaler.unscale_(optim_dur_disc) - commons.clip_grad_value_(net_dur_disc.parameters(), None) + # torch.nn.utils.clip_grad_norm_( + # parameters=net_dur_disc.parameters(), max_norm=100 + # ) + grad_norm_dur = commons.clip_grad_value_( + net_dur_disc.parameters(), None + ) scaler.step(optim_dur_disc) optim_d.zero_grad() scaler.scale(loss_disc_all).backward() scaler.unscale_(optim_d) + if getattr(hps.train, "bf16_run", False): + torch.nn.utils.clip_grad_norm_(parameters=net_d.parameters(), max_norm=200) grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): + loss_slm = wl.discriminator( + y.detach().squeeze(), y_hat.detach().squeeze() + ).mean() + + optim_wd.zero_grad() + scaler.scale(loss_slm).backward() + scaler.unscale_(optim_wd) + # torch.nn.utils.clip_grad_norm_(parameters=net_wd.parameters(), max_norm=200) + grad_norm_wd = commons.clip_grad_value_(net_wd.parameters(), None) + scaler.step(optim_wd) + + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): # Generator y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc( - hidden_x, x_mask, logw, logw_, g - ) - with autocast(enabled=False): + _, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw, g) + _, y_dur_hat_g_sdp = net_dur_disc(hidden_x, x_mask, logw_, logw_sdp, g) + y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp + with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16): loss_dur = torch.sum(l_length.float()) loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel loss_kl = kl_loss(z_p, logs_q, 
m_p, logs_p, z_mask) * hps.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) + + loss_lm = wl(y.detach().squeeze(), y_hat.squeeze()).mean() + loss_lm_gen = wl.generator(y_hat.squeeze()) + loss_gen_all = ( - loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_commit + loss_gen + + loss_fm + + loss_mel + + loss_dur + + loss_kl + + loss_lm + + loss_lm_gen ) if net_dur_disc is not None: loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) @@ -522,6 +601,8 @@ def train_and_evaluate( optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) + if getattr(hps.train, "bf16_run", False): + torch.nn.utils.clip_grad_norm_(parameters=net_g.parameters(), max_norm=500) grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) scaler.step(optim_g) scaler.update() @@ -540,9 +621,12 @@ def train_and_evaluate( scalar_dict = { "loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, + "loss/wd/total": loss_slm, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g, + "grad_norm_dur": grad_norm_dur, + "grad_norm_wd": grad_norm_wd, } scalar_dict.update( { @@ -550,6 +634,8 @@ def train_and_evaluate( "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, + "loss/g/lm": loss_lm, + "loss/g/lm_gen": loss_lm_gen, } ) scalar_dict.update( @@ -562,6 +648,30 @@ def train_and_evaluate( {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} ) + if net_dur_disc is not None: + scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all}) + + scalar_dict.update( + { + "loss/dur_disc_g/{}".format(i): v + for i, v in enumerate(losses_dur_disc_g) + } + ) + scalar_dict.update( + { + "loss/dur_disc_r/{}".format(i): v + for i, v in enumerate(losses_dur_disc_r) + } + ) + + scalar_dict.update({"loss/g/dur_gen": loss_dur_gen}) + scalar_dict.update( + { + "loss/g/dur_gen_{}".format(i): v + for i, v in enumerate(losses_dur_gen) + } + ) + image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy( y_mel[0].data.cpu().numpy() @@ -599,6 +709,13 @@ def train_and_evaluate( epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), ) + utils.save_checkpoint( + net_wd, + optim_wd, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "WD_{}.pth".format(global_step)), + ) if net_dur_disc is not None: utils.save_checkpoint( net_dur_disc, @@ -642,7 +759,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): bert, ja_bert, en_bert, - emo, ) in enumerate(eval_loader): x, x_lengths = x.cuda(), x_lengths.cuda() spec, spec_lengths = spec.cuda(), spec_lengths.cuda() @@ -653,7 +769,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): en_bert = en_bert.cuda() tone = tone.cuda() language = language.cuda() - emo = emo.cuda() for use_sdp in [True, False]: y_hat, attn, mask, *_ = generator.module.infer( x, @@ -664,7 +779,6 @@ def evaluate(hps, generator, eval_loader, writer_eval): bert, ja_bert, en_bert, - emo, y=spec, max_len=1000, sdp_ratio=0.0 if not use_sdp else 1.0, diff --git a/utils.py b/utils.py index 7c1440593d62d61a368b3ec63e35aedc841e0346..68fd148fe1725ebb92bccccad15ff4630a0d98ba 100644 --- a/utils.py +++ b/utils.py @@ -301,7 +301,11 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_tim to_del = [ os.path.join(path_to_models, fn) - for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + for fn in ( + x_sorted("G")[:-n_ckpts_to_keep] + + x_sorted("D")[:-n_ckpts_to_keep] + + x_sorted("WD")[:-n_ckpts_to_keep] + ) 
] def del_info(fn): diff --git a/webui_preprocess.py b/webui_preprocess.py index af2ddeeb87d8aeaa895e7225f4482352e3fd1367..11673017da820f84ac259712ee05e134d746b636 100644 --- a/webui_preprocess.py +++ b/webui_preprocess.py @@ -19,9 +19,9 @@ def generate_config(data_dir, batch_size): assert data_dir != "", "数据集名称不能为空" start_path, _, train_path, val_path, config_path = get_path(data_dir) if os.path.isfile(config_path): - config = json.load(open(config_path)) + config = json.load(open(config_path, "r", encoding="utf-8")) else: - config = json.load(open("configs/config.json")) + config = json.load(open("configs/config.json", "r", encoding="utf-8")) config["data"]["training_files"] = train_path config["data"]["validation_files"] = val_path config["train"]["batch_size"] = batch_size @@ -44,7 +44,7 @@ def resample(data_dir): in_dir = os.path.join(start_path, "raw") out_dir = os.path.join(start_path, "wavs") subprocess.run( - f"python resample.py " + f"python resample_legacy.py " f"--sr 44100 " f"--in_dir {in_dir} " f"--out_dir {out_dir} ", @@ -60,7 +60,9 @@ def preprocess_text(data_dir): with open(lbl_path, "w", encoding="utf-8") as f: for line in lines: path, spk, language, text = line.strip().split("|") - path = os.path.join(start_path, "wavs", os.path.basename(path)) + path = os.path.join(start_path, "wavs", os.path.basename(path)).replace( + "\\", "/" + ) f.writelines(f"{path}|{spk}|{language}|{text}\n") subprocess.run( f"python preprocess_text.py " @@ -83,16 +85,6 @@ def bert_gen(data_dir): return "BERT 特征文件生成完成" -def clap_gen(data_dir): - assert data_dir != "", "数据集名称不能为空" - _, _, _, _, config_path = get_path(data_dir) - subprocess.run( - f"python clap_gen.py " f"--config {config_path}", - shell=True, - ) - return "CLAP 特征文件生成完成" - - if __name__ == "__main__": with gr.Blocks() as app: with gr.Row(): @@ -100,13 +92,13 @@ if __name__ == "__main__": _ = gr.Markdown( value="# Bert-VITS2 数据预处理\n" "## 预先准备:\n" - "下载 BERT 和 CLAP 模型:\n" + "下载 BERT 和 WavLM 模型:\n" "- [中文 RoBERTa](https://huggingface.co./hfl/chinese-roberta-wwm-ext-large)\n" "- [日文 DeBERTa](https://huggingface.co./ku-nlp/deberta-v2-large-japanese-char-wwm)\n" "- [英文 DeBERTa](https://huggingface.co./microsoft/deberta-v3-large)\n" - "- [CLAP](https://huggingface.co./laion/clap-htsat-fused)\n" + "- [WavLM](https://huggingface.co./microsoft/wavlm-base-plus)\n" "\n" - "将 BERT 模型放置到 `bert` 文件夹下,CLAP 模型放置到 `emotional` 文件夹下,覆盖同名文件夹。\n" + "将 BERT 模型放置到 `bert` 文件夹下,WavLM 模型放置到 `slm` 文件夹下,覆盖同名文件夹。\n" "\n" "数据准备:\n" "将数据放置在 data 文件夹下,按照如下结构组织:\n" @@ -156,12 +148,10 @@ if __name__ == "__main__": preprocess_text_btn = gr.Button(value="执行", variant="primary") _ = gr.Markdown(value="## 第四步:生成 BERT 特征文件") bert_gen_btn = gr.Button(value="执行", variant="primary") - _ = gr.Markdown(value="## 第五步:生成 CLAP 特征文件") - clap_gen_btn = gr.Button(value="执行", variant="primary") _ = gr.Markdown( value="## 训练模型及部署:\n" "修改根目录下的 `config.yml` 中 `dataset_path` 一项为 `data/{你的数据集名称}`\n" - "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n" + "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth`、`WD_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n" "- 部署:修改根目录下的 `config.yml` 中 `webui` 下 `model` 一项为 `models/{权重文件名}.pth` (如 G_10000.pth),然后执行 
`python webui.py`" ) @@ -171,7 +161,6 @@ if __name__ == "__main__": resample_btn.click(resample, inputs=[data_dir], outputs=[info]) preprocess_text_btn.click(preprocess_text, inputs=[data_dir], outputs=[info]) bert_gen_btn.click(bert_gen, inputs=[data_dir], outputs=[info]) - clap_gen_btn.click(clap_gen, inputs=[data_dir], outputs=[info]) webbrowser.open("http://127.0.0.1:7860") app.launch(share=False, server_port=7860)
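For reference, the numpy `sequence_mask` / `generate_path` helpers added in `onnx_modules/V230_OnnxInference/__init__.py` expand predicted per-phone durations into a hard monotonic alignment between phones and output frames. A tiny worked example; the durations are made up, and importing the module assumes it is run from the repository root with `onnxruntime` and the repo's requirements installed (the package imports them at load time):

```python
import numpy as np

from onnx_modules.V230_OnnxInference import generate_path

duration = np.array([[[2.0, 1.0, 3.0]]])   # [b=1, 1, t_x=3] frames per phone
attn_mask = np.ones((1, 1, 6, 3))          # [b, 1, t_y=sum(durations), t_x]
path = generate_path(duration, attn_mask)  # [1, 1, 6, 3]: one phone per frame
print(path[0, 0].astype(int))
# rows are output frames, columns are phones:
# frames 0-1 -> phone 0, frame 2 -> phone 1, frames 3-5 -> phone 2
```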
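The updated `export_onnx` in `onnx_modules/__init__.py` now also takes `novq` and `dev` flags and dispatches to the matching `SynthesizerTrn` / `symbols` pair by config version (2.0 through 2.3). A hedged usage sketch; the dataset name and file paths below are placeholders:

```python
from onnx_modules import export_onnx

# Placeholders: point these at a real checkpoint/config pair. For a "2.3"
# config the novq/dev flags are not used by the version dispatch above.
export_onnx(
    export_path="MySpeaker",                        # parts land under onnx/MySpeaker/
    model_path="Data/MySpeaker/models/G_8000.pth",
    config_path="Data/MySpeaker/config.json",
    novq=False,
    dev=False,
)
```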
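The `server_fastapi.py` changes expose the new `style_text` / `style_weight` parameters on the `/voice` endpoints. A rough client-side sketch of a GET request; the base URL, port, model_id, speaker_name and text are assumptions that depend on the running `config.yml` and loaded models:

```python
import requests

# Hypothetical values: adjust host/port/model_id/speaker_name to your deployment.
params = {
    "model_id": 0,
    "speaker_name": "MySpeaker",
    "text": "你好",
    "language": "ZH",
    "style_text": "今天真开心!",  # prompt whose BERT features get blended in
    "style_weight": 0.7,           # 0.0 = ignore the prompt, 1.0 = use only its mean feature
}
resp = requests.get("http://127.0.0.1:5000/voice", params=params)
with open("out.wav", "wb") as f:
    f.write(resp.content)
```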
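Finally, the `style_text` / `style_weight` additions in `text/chinese_bert.py`, `text/english_bert_mock.py` and `text/japanese_bert.py` all apply the same per-phone interpolation between the text's own BERT features and the mean feature of the style prompt. A minimal standalone sketch of that blend; the random tensors and `word2ph` values are stand-ins for the real BERT outputs:

```python
import torch

hidden = 1024
res = torch.randn(5, hidden)        # per-token features of the text to speak
style_res = torch.randn(8, hidden)  # per-token features of the style prompt
word2ph = [1, 2, 3, 2, 1]           # phones attributed to each token
style_weight = 0.7

style_res_mean = style_res.mean(0)  # collapse the prompt to one style vector
phone_level_feature = []
for i, n_phones in enumerate(word2ph):
    blended = (
        res[i].repeat(n_phones, 1) * (1 - style_weight)
        + style_res_mean.repeat(n_phones, 1) * style_weight
    )
    phone_level_feature.append(blended)

phone_level_feature = torch.cat(phone_level_feature, dim=0)
print(phone_level_feature.shape)  # torch.Size([9, 1024]) == [sum(word2ph), hidden]
```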