# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from typing import List, Tuple

from models.tts.maskgct.g2p.g2p import PhonemeBpeTokenizer
from models.tts.maskgct.g2p.utils.g2p import phonemizer_g2p


def ph_g2p(text, language):
    # Convert raw text to phonemes with the phonemizer-based G2P backend.
    return phonemizer_g2p(text=text, language=language)


def g2p(text, sentence, language):
    # Tokenize a text segment into phonemes and BPE token ids, using the full
    # sentence as context.
    return text_tokenizer.tokenize(text=text, sentence=sentence, language=language)


def is_chinese(char):
    # CJK Unified Ideographs range used for Chinese characters.
    return "\u4e00" <= char <= "\u9fa5"


def is_alphabet(char):
    # ASCII letters A-Z and a-z.
    return "\u0041" <= char <= "\u005a" or "\u0061" <= char <= "\u007a"


def is_other(char):
    # Anything that is neither Chinese nor an ASCII letter.
    return not (is_chinese(char) or is_alphabet(char))


def get_segment(text: str) -> List[Tuple[str, str]]:
    # Split a sentence into language-homogeneous segments:
    # sentence --> [(ch_part, "zh"), (en_part, "en"), (ch_part, "zh"), ...]
    segments = []
    types = []
    flag = 0
    temp_seg = ""
    temp_lang = ""

    # Classify each character as Chinese ("zh"), alphabetic ("en"), or "other"
    # (spaces, digits, punctuation, etc.).
    for ch in text:
        if is_chinese(ch):
            types.append("zh")
        elif is_alphabet(ch):
            types.append("en")
        else:
            types.append("other")

    assert len(types) == len(text)

    for i in range(len(types)):
        # Start a new segment at the first character.
        if flag == 0:
            temp_seg += text[i]
            temp_lang = types[i]
            flag = 1
        else:
            if temp_lang == "other":
                # A segment that so far contains only "other" characters adopts
                # the language of the first zh/en character that follows.
                if types[i] == temp_lang:
                    temp_seg += text[i]
                else:
                    temp_seg += text[i]
                    temp_lang = types[i]
            else:
                if types[i] == temp_lang:
                    temp_seg += text[i]
                elif types[i] == "other":
                    # Punctuation and spaces stay attached to the current segment.
                    temp_seg += text[i]
                else:
                    # Language switch: close the current segment and start a new one.
                    segments.append((temp_seg, temp_lang))
                    temp_seg = text[i]
                    temp_lang = types[i]

    segments.append((temp_seg, temp_lang))
    return segments


def chn_eng_g2p(text: str):
    # Currently only English and Chinese are supported.
    segments = get_segment(text)
    all_phoneme = ""
    all_tokens = []

    for index, seg in enumerate(segments):
        phoneme, token = g2p(seg[0], text, seg[1])
        all_phoneme += phoneme + "|"
        all_tokens += token

        # Drop a trailing "_" separator (and its token) left by the English G2P
        # on the final segment.
        if seg[1] == "en" and index == len(segments) - 1 and all_phoneme[-2] == "_":
            all_phoneme = all_phoneme[:-2]
            all_tokens = all_tokens[:-1]
    return all_phoneme, all_tokens


text_tokenizer = PhonemeBpeTokenizer()
with open("./models/tts/maskgct/g2p/g2p/vocab.json", "r") as f:
    data = json.load(f)
vocab = data["vocab"]
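

# A minimal usage sketch (assumptions: the script is run from the repository
# root so the relative vocab.json path and the models.* imports resolve, and
# the exact phoneme strings / token ids depend on the loaded BPE vocab, so the
# values shown in comments are illustrative, not authoritative).
if __name__ == "__main__":
    mixed_text = "我喜欢 machine learning 和语音合成"  # hypothetical mixed zh/en input

    # Segmentation: trailing spaces/punctuation stay attached to the segment
    # that precedes them, roughly:
    # [("我喜欢 ", "zh"), ("machine learning ", "en"), ("和语音合成", "zh")]
    print(get_segment(mixed_text))

    phonemes, tokens = chn_eng_g2p(mixed_text)
    print(phonemes)     # "|"-separated phoneme strings, one chunk per segment
    print(len(tokens))  # token ids produced by the phoneme BPE tokenizer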