Commit: infer
Files changed:
- diffrhythm/g2p/g2p/__init__.py +87 -0
- diffrhythm/g2p/g2p/chinese_model_g2p.py +213 -0
- diffrhythm/g2p/g2p/cleaners.py +31 -0
- diffrhythm/g2p/g2p/english.py +202 -0
- diffrhythm/g2p/g2p/french.py +149 -0
- diffrhythm/g2p/g2p/german.py +94 -0
- diffrhythm/g2p/g2p/japanese.py +816 -0
- diffrhythm/g2p/g2p/korean.py +81 -0
- diffrhythm/g2p/g2p/mandarin.py +595 -0
- diffrhythm/g2p/g2p/text_tokenizers.py +84 -0
- diffrhythm/g2p/g2p/vocab.json +372 -0
- diffrhythm/g2p/utils/front_utils.py +20 -0
- diffrhythm/g2p/utils/g2p.py +139 -0
- diffrhythm/g2p/utils/log.py +52 -0
- diffrhythm/g2p/utils/mls_en.json +335 -0
- diffrhythm/infer/infer.py +147 -0
- diffrhythm/infer/infer_utils.py +197 -0
diffrhythm/g2p/g2p/__init__.py
ADDED
@@ -0,0 +1,87 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from diffrhythm.g2p.g2p import cleaners
from tokenizers import Tokenizer
from diffrhythm.g2p.g2p.text_tokenizers import TextTokenizer
import LangSegment
import json
import re


class PhonemeBpeTokenizer:

    def __init__(self, vacab_path="./diffrhythm/g2p/g2p/vocab.json"):
        self.lang2backend = {
            "zh": "cmn",
            "ja": "ja",
            "en": "en-us",
            "fr": "fr-fr",
            "ko": "ko",
            "de": "de",
        }
        self.text_tokenizers = {}
        self.int_text_tokenizers()

        with open(vacab_path, "r") as f:
            json_data = f.read()
        data = json.loads(json_data)
        self.vocab = data["vocab"]
        LangSegment.setfilters(["en", "zh", "ja", "ko", "fr", "de"])

    def int_text_tokenizers(self):
        for key, value in self.lang2backend.items():
            self.text_tokenizers[key] = TextTokenizer(language=value)

    def tokenize(self, text, sentence, language):

        # 1. convert text to phoneme
        phonemes = []
        if language == "auto":
            seglist = LangSegment.getTexts(text)
            tmp_ph = []
            for seg in seglist:
                tmp_ph.append(
                    self._clean_text(
                        seg["text"], sentence, seg["lang"], ["cjekfd_cleaners"]
                    )
                )
            phonemes = "|_|".join(tmp_ph)
        else:
            phonemes = self._clean_text(text, sentence, language, ["cjekfd_cleaners"])
        # print('clean text: ', phonemes)

        # 2. tokenize phonemes
        phoneme_tokens = self.phoneme2token(phonemes)
        # print('encode: ', phoneme_tokens)

        # # 3. decode tokens [optional]
        # decoded_text = self.tokenizer.decode(phoneme_tokens)
        # print('decoded: ', decoded_text)

        return phonemes, phoneme_tokens

    def _clean_text(self, text, sentence, language, cleaner_names):
        for name in cleaner_names:
            cleaner = getattr(cleaners, name)
            if not cleaner:
                raise Exception("Unknown cleaner: %s" % name)
            text = cleaner(text, sentence, language, self.text_tokenizers)
        return text

    def phoneme2token(self, phonemes):
        tokens = []
        if isinstance(phonemes, list):
            for phone in phonemes:
                phone = phone.split("\t")[0]
                phonemes_split = phone.split("|")
                tokens.append(
                    [self.vocab[p] for p in phonemes_split if p in self.vocab]
                )
        else:
            phonemes = phonemes.split("\t")[0]
            phonemes_split = phonemes.split("|")
            tokens = [self.vocab[p] for p in phonemes_split if p in self.vocab]
        return tokens
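For orientation, a minimal usage sketch of the tokenizer above. This is editorial illustration, not part of the commit: it assumes the repo's vocab.json is in place and that the espeak backend used by TextTokenizer is installed; the comments describe expected shapes, not captured output.

    from diffrhythm.g2p.g2p import PhonemeBpeTokenizer

    tokenizer = PhonemeBpeTokenizer()  # loads ./diffrhythm/g2p/g2p/vocab.json
    # `sentence` is forwarded to the cleaners; language="auto" would first run
    # LangSegment language detection and join segments with "|_|"
    phonemes, tokens = tokenizer.tokenize("Hello world", sentence=None, language="en")
    # phonemes: a "|"-joined IPA string; tokens: vocab indices for symbols found in vocab.json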
diffrhythm/g2p/g2p/chinese_model_g2p.py
ADDED
@@ -0,0 +1,213 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import json
from transformers import BertTokenizer
from torch.utils.data import Dataset
from transformers.models.bert.modeling_bert import *
import torch.nn.functional as F
from onnxruntime import InferenceSession, GraphOptimizationLevel, SessionOptions


class PolyDataset(Dataset):
    def __init__(self, words, labels, word_pad_idx=0, label_pad_idx=-1):
        self.dataset = self.preprocess(words, labels)
        self.word_pad_idx = word_pad_idx
        self.label_pad_idx = label_pad_idx

    def preprocess(self, origin_sentences, origin_labels):
        """
        Maps tokens and tags to their indices and stores them in the dict data.
        examples:
            word:['[CLS]', '浙', '商', '银', '行', '企', '业', '信', '贷', '部']
            sentence:([101, 3851, 1555, 7213, 6121, 821, 689, 928, 6587, 6956],
                      array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]))
            label:[3, 13, 13, 13, 0, 0, 0, 0, 0]
        """
        data = []
        labels = []
        sentences = []
        # tokenize
        for line in origin_sentences:
            # replace each token by its index
            # we can not use encode_plus because our sentences are aligned to labels in list type
            words = []
            word_lens = []
            for token in line:
                words.append(token)
                word_lens.append(1)
            token_start_idxs = 1 + np.cumsum([0] + word_lens[:-1])
            sentences.append(((words, token_start_idxs), 0))
        ###
        for tag in origin_labels:
            labels.append(tag)

        for sentence, label in zip(sentences, labels):
            data.append((sentence, label))
        return data

    def __getitem__(self, idx):
        """sample data to get batch"""
        word = self.dataset[idx][0]
        label = self.dataset[idx][1]
        return [word, label]

    def __len__(self):
        """get dataset size"""
        return len(self.dataset)

    def collate_fn(self, batch):

        sentences = [x[0][0] for x in batch]
        ori_sents = [x[0][1] for x in batch]
        labels = [x[1] for x in batch]
        batch_len = len(sentences)

        # compute length of longest sentence in batch
        max_len = max([len(s[0]) for s in sentences])
        max_label_len = 0
        batch_data = np.ones((batch_len, max_len))
        batch_label_starts = []

        # padding and aligning
        for j in range(batch_len):
            cur_len = len(sentences[j][0])
            batch_data[j][:cur_len] = sentences[j][0]
            label_start_idx = sentences[j][-1]
            label_starts = np.zeros(max_len)
            label_starts[[idx for idx in label_start_idx if idx < max_len]] = 1
            batch_label_starts.append(label_starts)
            max_label_len = max(int(sum(label_starts)), max_label_len)

        # padding label
        batch_labels = self.label_pad_idx * np.ones((batch_len, max_label_len))
        batch_pmasks = self.label_pad_idx * np.ones((batch_len, max_label_len))
        for j in range(batch_len):
            cur_tags_len = len(labels[j])
            batch_labels[j][:cur_tags_len] = labels[j]
            batch_pmasks[j][:cur_tags_len] = [
                1 if item > 0 else 0 for item in labels[j]
            ]

        # convert data to torch LongTensors
        batch_data = torch.tensor(batch_data, dtype=torch.long)
        batch_label_starts = torch.tensor(batch_label_starts, dtype=torch.long)
        batch_labels = torch.tensor(batch_labels, dtype=torch.long)
        batch_pmasks = torch.tensor(batch_pmasks, dtype=torch.long)
        return [batch_data, batch_label_starts, batch_labels, batch_pmasks, ori_sents]


class BertPolyPredict:
    def __init__(self, bert_model, jsonr_file, json_file):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
        with open(jsonr_file, "r", encoding="utf8") as fp:
            self.pron_dict = json.load(fp)
        with open(json_file, "r", encoding="utf8") as fp:
            self.pron_dict_id_2_pinyin = json.load(fp)
        self.num_polyphone = len(self.pron_dict)
        self.device = "cpu"
        self.polydataset = PolyDataset
        options = SessionOptions()  # initialize session options
        options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        print(os.path.join(bert_model, "poly_bert_model.onnx"))
        self.session = InferenceSession(
            os.path.join(bert_model, "poly_bert_model.onnx"),
            sess_options=options,
            providers=[
                "CUDAExecutionProvider",
                "CPUExecutionProvider",
            ],  # CPUExecutionProvider #CUDAExecutionProvider
        )
        # self.session.set_providers(['CUDAExecutionProvider', "CPUExecutionProvider"], [ {'device_id': 0}])

        # disable session.run() fallback mechanism, it prevents for a reset of the execution provider
        self.session.disable_fallback()

    def predict_process(self, txt_list):
        word_test, label_test, texts_test = self.get_examples_po(txt_list)
        data = self.polydataset(word_test, label_test)
        predict_loader = DataLoader(
            data, batch_size=1, shuffle=False, collate_fn=data.collate_fn
        )
        pred_tags = self.predict_onnx(predict_loader)
        return pred_tags

    def predict_onnx(self, dev_loader):
        pred_tags = []
        with torch.no_grad():
            for idx, batch_samples in enumerate(dev_loader):
                # [batch_data, batch_label_starts, batch_labels, batch_pmasks, ori_sents]
                batch_data, batch_label_starts, batch_labels, batch_pmasks, _ = (
                    batch_samples
                )
                # shift tensors to GPU if available
                batch_data = batch_data.to(self.device)
                batch_label_starts = batch_label_starts.to(self.device)
                batch_labels = batch_labels.to(self.device)
                batch_pmasks = batch_pmasks.to(self.device)
                batch_data = np.asarray(batch_data, dtype=np.int32)
                batch_pmasks = np.asarray(batch_pmasks, dtype=np.int32)
                # batch_output = self.session.run(output_names=['outputs'], input_feed={"input_ids":batch_data, "input_pmasks": batch_pmasks})[0][0]
                batch_output = self.session.run(
                    output_names=["outputs"], input_feed={"input_ids": batch_data}
                )[0]
                label_masks = batch_pmasks == 1
                batch_labels = batch_labels.to("cpu").numpy()
                for i, indices in enumerate(np.argmax(batch_output, axis=2)):
                    for j, idx in enumerate(indices):
                        if label_masks[i][j]:
                            # pred_tag.append(idx)
                            pred_tags.append(self.pron_dict_id_2_pinyin[str(idx + 1)])
        return pred_tags

    def get_examples_po(self, text_list):

        word_list = []
        label_list = []
        sentence_list = []
        id = 0
        for line in [text_list]:
            sentence = line[0]
            words = []
            tokens = line[0]
            index = line[-1]
            front = index
            back = len(tokens) - index - 1
            labels = [0] * front + [1] + [0] * back
            words = ["[CLS]"] + [item for item in sentence]
            words = self.tokenizer.convert_tokens_to_ids(words)
            word_list.append(words)
            label_list.append(labels)
            sentence_list.append(sentence)

            id += 1
            # mask_list.append(masks)
            assert len(labels) + 1 == len(words), print(
                (
                    sentence,
                    words,
                    labels,
                    len(sentence),
                    len(words),
                    len(labels),
                )
            )
            assert len(labels) + 1 == len(
                words
            ), "Number of labels does not match number of words"
            assert len(labels) == len(
                sentence
            ), "Number of labels does not match number of sentences"
            assert len(word_list) == len(
                label_list
            ), "Number of label sentences does not match number of word sentences"
        return word_list, label_list, text_list
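How the polyphone predictor above is driven end to end, as a sketch. Only poly_bert_model.onnx is a file name taken from the code; the directory and dictionary paths below are placeholders for whatever assets the repo ships, and the example output is assumed, not verified.

    # Sketch only: paths are placeholders, not files named in this commit.
    predictor = BertPolyPredict(
        bert_model="./polyphone_bert",       # dir holding BERT vocab + poly_bert_model.onnx
        jsonr_file="./polyphone_dict.json",  # assumed: pinyin -> id mapping
        json_file="./id_to_pinyin.json",     # assumed: id -> pinyin mapping
    )
    # get_examples_po expects [sentence, polyphonic_char_index]
    pred = predictor.predict_process(["中国银行", 3])  # disambiguate "行" at index 3
    # pred: list of pinyin strings looked up through pron_dict_id_2_pinyin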
diffrhythm/g2p/g2p/cleaners.py
ADDED
@@ -0,0 +1,31 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from diffrhythm.g2p.g2p.japanese import japanese_to_ipa
from diffrhythm.g2p.g2p.mandarin import chinese_to_ipa
from diffrhythm.g2p.g2p.english import english_to_ipa
from diffrhythm.g2p.g2p.french import french_to_ipa
from diffrhythm.g2p.g2p.korean import korean_to_ipa
from diffrhythm.g2p.g2p.german import german_to_ipa


def cjekfd_cleaners(text, sentence, language, text_tokenizers):

    if language == "zh":
        return chinese_to_ipa(text, sentence, text_tokenizers["zh"])
    elif language == "ja":
        return japanese_to_ipa(text, text_tokenizers["ja"])
    elif language == "en":
        return english_to_ipa(text, text_tokenizers["en"])
    elif language == "fr":
        return french_to_ipa(text, text_tokenizers["fr"])
    elif language == "ko":
        return korean_to_ipa(text, text_tokenizers["ko"])
    elif language == "de":
        return german_to_ipa(text, text_tokenizers["de"])
    else:
        raise Exception("Unknown language: %s" % language)
        return None
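The cleaner is resolved by name at runtime; the sketch below mirrors the getattr lookup that PhonemeBpeTokenizer._clean_text performs. It assumes the espeak backend behind TextTokenizer is installed; normally the per-language dict is built by int_text_tokenizers() rather than by hand.

    from diffrhythm.g2p.g2p import cleaners
    from diffrhythm.g2p.g2p.text_tokenizers import TextTokenizer

    text_tokenizers = {"fr": TextTokenizer(language="fr-fr")}  # normally all six languages
    cleaner = getattr(cleaners, "cjekfd_cleaners")  # same lookup as _clean_text
    ipa = cleaner("bonjour", None, "fr", text_tokenizers)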
diffrhythm/g2p/g2p/english.py
ADDED
@@ -0,0 +1,202 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from unidecode import unidecode
import inflect

"""
    Text clean time
"""
_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_percent_number_re = re.compile(r"([0-9\.\,]*[0-9]+%)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_fraction_re = re.compile(r"([0-9]+)/([0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
    (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
    for x in [
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
        ("etc", "et cetera"),
        ("btw", "by the way"),
    ]
]

_special_map = [
    ("t|ɹ", "tɹ"),
    ("d|ɹ", "dɹ"),
    ("t|s", "ts"),
    ("d|z", "dz"),
    ("ɪ|ɹ", "ɪɹ"),
    ("ɐ", "ɚ"),
    ("ᵻ", "ɪ"),
    ("əl", "l"),
    ("x", "k"),
    ("ɬ", "l"),
    ("ʔ", "t"),
    ("n̩", "n"),
    ("oː|ɹ", "oːɹ"),
]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_percent(m):
    return m.group(1).replace("%", " percent ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return " " + match + " dollars "  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return " %s %s, %s %s " % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return " %s %s " % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return " %s %s " % (cents, cent_unit)
    else:
        return " zero dollars "


def fraction_to_words(numerator, denominator):
    if numerator == 1 and denominator == 2:
        return " one half "
    if numerator == 1 and denominator == 4:
        return " one quarter "
    if denominator == 2:
        return " " + _inflect.number_to_words(numerator) + " halves "
    if denominator == 4:
        return " " + _inflect.number_to_words(numerator) + " quarters "
    return (
        " "
        + _inflect.number_to_words(numerator)
        + " "
        + _inflect.ordinal(_inflect.number_to_words(denominator))
        + " "
    )


def _expand_fraction(m):
    numerator = int(m.group(1))
    denominator = int(m.group(2))
    return fraction_to_words(numerator, denominator)


def _expand_ordinal(m):
    return " " + _inflect.number_to_words(m.group(0)) + " "


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return " two thousand "
        elif num > 2000 and num < 2010:
            return " two thousand " + _inflect.number_to_words(num % 100) + " "
        elif num % 100 == 0:
            return " " + _inflect.number_to_words(num // 100) + " hundred "
        else:
            return (
                " "
                + _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(
                    ", ", " "
                )
                + " "
            )
    else:
        return " " + _inflect.number_to_words(num, andword="") + " "


# Normalize numbers pronunciation
def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_fraction_re, _expand_fraction, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_percent_number_re, _expand_percent, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text


def _english_to_ipa(text):
    # text = unidecode(text).lower()
    text = expand_abbreviations(text)
    text = normalize_numbers(text)
    return text


# special map
def special_map(text):
    for regex, replacement in _special_map:
        regex = regex.replace("|", "\|")
        while re.search(r"(^|[_|]){}([_|]|$)".format(regex), text):
            text = re.sub(
                r"(^|[_|]){}([_|]|$)".format(regex), r"\1{}\2".format(replacement), text
            )
    # text = re.sub(r'([,.!?])', r'|\1', text)
    return text


# Add some special operation
def english_to_ipa(text, text_tokenizer):
    if type(text) == str:
        text = _english_to_ipa(text)
    else:
        text = [_english_to_ipa(t) for t in text]
    phonemes = text_tokenizer(text)
    if phonemes[-1] in "p⁼ʰmftnlkxʃs`ɹaoəɛɪeɑʊŋiuɥwæjː":
        phonemes += "|_"
    if type(text) == str:
        return special_map(phonemes)
    else:
        result_ph = []
        for phone in phonemes:
            result_ph.append(special_map(phone))
        return result_ph
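The number rules above apply in a fixed order (commas, currency, fractions, decimals, percents, ordinals, bare numbers). A few hand-traced examples of normalize_numbers; the outputs are written from reading the rules, not captured program output, and the extra spacing they leave is collapsed downstream.

    normalize_numbers("$2.50")  # ≈ " two dollars, fifty cents "
    normalize_numbers("1999")   # ≈ " nineteen ninety-nine "  (year-style 1000<n<3000 branch)
    normalize_numbers("3/4")    # ≈ " three quarters "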
diffrhythm/g2p/g2p/french.py
ADDED
@@ -0,0 +1,149 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re

"""
    Text clean time
"""
# List of (regular expression, replacement) pairs for abbreviations in french:
_abbreviations = [
    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
    for x in [
        ("M", "monsieur"),
        ("Mlle", "mademoiselle"),
        ("Mlles", "mesdemoiselles"),
        ("Mme", "Madame"),
        ("Mmes", "Mesdames"),
        ("N.B", "nota bene"),
        ("M", "monsieur"),
        ("p.c.q", "parce que"),
        ("Pr", "professeur"),
        ("qqch", "quelque chose"),
        ("rdv", "rendez-vous"),
        ("max", "maximum"),
        ("min", "minimum"),
        ("no", "numéro"),
        ("adr", "adresse"),
        ("dr", "docteur"),
        ("st", "saint"),
        ("co", "companie"),
        ("jr", "junior"),
        ("sgt", "sergent"),
        ("capt", "capitain"),
        ("col", "colonel"),
        ("av", "avenue"),
        ("av. J.-C", "avant Jésus-Christ"),
        ("apr. J.-C", "après Jésus-Christ"),
        ("art", "article"),
        ("boul", "boulevard"),
        ("c.-à-d", "c’est-à-dire"),
        ("etc", "et cetera"),
        ("ex", "exemple"),
        ("excl", "exclusivement"),
        ("boul", "boulevard"),
    ]
] + [
    (re.compile("\\b%s" % x[0]), x[1])
    for x in [
        ("Mlle", "mademoiselle"),
        ("Mlles", "mesdemoiselles"),
        ("Mme", "Madame"),
        ("Mmes", "Mesdames"),
    ]
]

rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "~": "-",
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
}


def collapse_whitespace(text):
    # Regular expression matching whitespace:
    _whitespace_re = re.compile(r"\s+")
    return re.sub(_whitespace_re, " ", text).strip()


def remove_punctuation_at_begin(text):
    return re.sub(r"^[,.!?]+", "", text)


def remove_aux_symbols(text):
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    return text


def replace_symbols(text):
    text = text.replace(";", ",")
    text = text.replace("-", " ")
    text = text.replace(":", ",")
    text = text.replace("&", " et ")
    return text


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def replace_punctuation(text):
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    return replaced_text


def text_normalize(text):
    text = expand_abbreviations(text)
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    text = re.sub(r"([^\.,!\?\-…])$", r"\1", text)
    return text


def french_to_ipa(text, text_tokenizer):
    if type(text) == str:
        text = text_normalize(text)
        phonemes = text_tokenizer(text)
        return phonemes
    else:
        for i, t in enumerate(text):
            text[i] = text_normalize(t)
        return text_tokenizer(text)
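A hand-traced example of text_normalize above; the intermediate states are written from reading the rules, not captured output.

    text_normalize("M. Dupont & Mme Martin…")
    # expand_abbreviations: "monsieur Dupont & Madame Martin…"
    # replace_punctuation:  "…" -> "."
    # replace_symbols:      "&" -> " et "
    # collapse_whitespace:  -> "monsieur Dupont et Madame Martin."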
diffrhythm/g2p/g2p/german.py
ADDED
@@ -0,0 +1,94 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re

"""
    Text clean time
"""
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "~": "-",
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
}


def collapse_whitespace(text):
    # Regular expression matching whitespace:
    _whitespace_re = re.compile(r"\s+")
    return re.sub(_whitespace_re, " ", text).strip()


def remove_punctuation_at_begin(text):
    return re.sub(r"^[,.!?]+", "", text)


def remove_aux_symbols(text):
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    return text


def replace_symbols(text):
    text = text.replace(";", ",")
    text = text.replace("-", " ")
    text = text.replace(":", ",")
    return text


def replace_punctuation(text):
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    return replaced_text


def text_normalize(text):
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    text = re.sub(r"([^\.,!\?\-…])$", r"\1", text)
    return text


def german_to_ipa(text, text_tokenizer):
    if type(text) == str:
        text = text_normalize(text)
        phonemes = text_tokenizer(text)
        return phonemes
    else:
        for i, t in enumerate(text):
            text[i] = text_normalize(t)
        return text_tokenizer(text)
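Same pipeline as french.py minus abbreviation expansion; a minimal hand-traced example (output inferred from the rules, not captured):

    text_normalize("Hallo — Welt!")
    # replace_punctuation drops "—"; collapse_whitespace tidies -> "Hallo Welt!"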
diffrhythm/g2p/g2p/japanese.py
ADDED
@@ -0,0 +1,816 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import io, re, os, sys, time, argparse, pdb, json
from io import StringIO
from typing import Optional
import numpy as np
import traceback
import pyopenjtalk
from pykakasi import kakasi

punctuation = [",", ".", "!", "?", ":", ";", "'", "…"]

jp_xphone2ipa = [
    " a a",
    " i i",
    " u ɯ",
    " e e",
    " o o",
    " a: aː",
    " i: iː",
    " u: ɯː",
    " e: eː",
    " o: oː",
    " k k",
    " s s",
    " t t",
    " n n",
    " h ç",
    " f ɸ",
    " m m",
    " y j",
    " r ɾ",
    " w ɰᵝ",
    " N ɴ",
    " g g",
    " j d ʑ",
    " z z",
    " d d",
    " b b",
    " p p",
    " q q",
    " v v",
    " : :",
    " by b j",
    " ch t ɕ",
    " dy d e j",
    " ty t e j",
    " gy g j",
    " gw g ɯ",
    " hy ç j",
    " ky k j",
    " kw k ɯ",
    " my m j",
    " ny n j",
    " py p j",
    " ry ɾ j",
    " sh ɕ",
    " ts t s ɯ",
]

_mora_list_minimum: list[tuple[str, Optional[str], str]] = [
    ("ヴォ", "v", "o"),
    ("ヴェ", "v", "e"),
    ("ヴィ", "v", "i"),
    ("ヴァ", "v", "a"),
    ("ヴ", "v", "u"),
    ("ン", None, "N"),
    ("ワ", "w", "a"),
    ("ロ", "r", "o"),
    ("レ", "r", "e"),
    ("ル", "r", "u"),
    ("リョ", "ry", "o"),
    ("リュ", "ry", "u"),
    ("リャ", "ry", "a"),
    ("リェ", "ry", "e"),
    ("リ", "r", "i"),
    ("ラ", "r", "a"),
    ("ヨ", "y", "o"),
    ("ユ", "y", "u"),
    ("ヤ", "y", "a"),
    ("モ", "m", "o"),
    ("メ", "m", "e"),
    ("ム", "m", "u"),
    ("ミョ", "my", "o"),
    ("ミュ", "my", "u"),
    ("ミャ", "my", "a"),
    ("ミェ", "my", "e"),
    ("ミ", "m", "i"),
    ("マ", "m", "a"),
    ("ポ", "p", "o"),
    ("ボ", "b", "o"),
    ("ホ", "h", "o"),
    ("ペ", "p", "e"),
    ("ベ", "b", "e"),
    ("ヘ", "h", "e"),
    ("プ", "p", "u"),
    ("ブ", "b", "u"),
    ("フォ", "f", "o"),
    ("フェ", "f", "e"),
    ("フィ", "f", "i"),
    ("ファ", "f", "a"),
    ("フ", "f", "u"),
    ("ピョ", "py", "o"),
    ("ピュ", "py", "u"),
    ("ピャ", "py", "a"),
    ("ピェ", "py", "e"),
    ("ピ", "p", "i"),
    ("ビョ", "by", "o"),
    ("ビュ", "by", "u"),
    ("ビャ", "by", "a"),
    ("ビェ", "by", "e"),
    ("ビ", "b", "i"),
    ("ヒョ", "hy", "o"),
    ("ヒュ", "hy", "u"),
    ("ヒャ", "hy", "a"),
    ("ヒェ", "hy", "e"),
    ("ヒ", "h", "i"),
    ("パ", "p", "a"),
    ("バ", "b", "a"),
    ("ハ", "h", "a"),
    ("ノ", "n", "o"),
    ("ネ", "n", "e"),
    ("ヌ", "n", "u"),
    ("ニョ", "ny", "o"),
    ("ニュ", "ny", "u"),
    ("ニャ", "ny", "a"),
    ("ニェ", "ny", "e"),
    ("ニ", "n", "i"),
    ("ナ", "n", "a"),
    ("ドゥ", "d", "u"),
    ("ド", "d", "o"),
    ("トゥ", "t", "u"),
    ("ト", "t", "o"),
    ("デョ", "dy", "o"),
    ("デュ", "dy", "u"),
    ("デャ", "dy", "a"),
    # ("デェ", "dy", "e"),
    ("ディ", "d", "i"),
    ("デ", "d", "e"),
    ("テョ", "ty", "o"),
    ("テュ", "ty", "u"),
    ("テャ", "ty", "a"),
    ("ティ", "t", "i"),
    ("テ", "t", "e"),
    ("ツォ", "ts", "o"),
    ("ツェ", "ts", "e"),
    ("ツィ", "ts", "i"),
    ("ツァ", "ts", "a"),
    ("ツ", "ts", "u"),
    ("ッ", None, "q"),  # changed from "cl" to "q"
    ("チョ", "ch", "o"),
    ("チュ", "ch", "u"),
    ("チャ", "ch", "a"),
    ("チェ", "ch", "e"),
    ("チ", "ch", "i"),
    ("ダ", "d", "a"),
    ("タ", "t", "a"),
    ("ゾ", "z", "o"),
    ("ソ", "s", "o"),
    ("ゼ", "z", "e"),
    ("セ", "s", "e"),
    ("ズィ", "z", "i"),
    ("ズ", "z", "u"),
    ("スィ", "s", "i"),
    ("ス", "s", "u"),
    ("ジョ", "j", "o"),
    ("ジュ", "j", "u"),
    ("ジャ", "j", "a"),
    ("ジェ", "j", "e"),
    ("ジ", "j", "i"),
    ("ショ", "sh", "o"),
    ("シュ", "sh", "u"),
    ("シャ", "sh", "a"),
    ("シェ", "sh", "e"),
    ("シ", "sh", "i"),
    ("ザ", "z", "a"),
    ("サ", "s", "a"),
    ("ゴ", "g", "o"),
    ("コ", "k", "o"),
    ("ゲ", "g", "e"),
    ("ケ", "k", "e"),
    ("グヮ", "gw", "a"),
    ("グ", "g", "u"),
    ("クヮ", "kw", "a"),
    ("ク", "k", "u"),
    ("ギョ", "gy", "o"),
    ("ギュ", "gy", "u"),
    ("ギャ", "gy", "a"),
    ("ギェ", "gy", "e"),
    ("ギ", "g", "i"),
    ("キョ", "ky", "o"),
    ("キュ", "ky", "u"),
    ("キャ", "ky", "a"),
    ("キェ", "ky", "e"),
    ("キ", "k", "i"),
    ("ガ", "g", "a"),
    ("カ", "k", "a"),
    ("オ", None, "o"),
    ("エ", None, "e"),
    ("ウォ", "w", "o"),
    ("ウェ", "w", "e"),
    ("ウィ", "w", "i"),
    ("ウ", None, "u"),
    ("イェ", "y", "e"),
    ("イ", None, "i"),
    ("ア", None, "a"),
]

_mora_list_additional: list[tuple[str, Optional[str], str]] = [
    ("ヴョ", "by", "o"),
    ("ヴュ", "by", "u"),
    ("ヴャ", "by", "a"),
    ("ヲ", None, "o"),
    ("ヱ", None, "e"),
    ("ヰ", None, "i"),
    ("ヮ", "w", "a"),
    ("ョ", "y", "o"),
    ("ュ", "y", "u"),
    ("ヅ", "z", "u"),
    ("ヂ", "j", "i"),
    ("ヶ", "k", "e"),
    ("ャ", "y", "a"),
    ("ォ", None, "o"),
    ("ェ", None, "e"),
    ("ゥ", None, "u"),
    ("ィ", None, "i"),
    ("ァ", None, "a"),
]

# e.g. "vo" -> "ヴォ", "a" -> "ア"
mora_phonemes_to_mora_kata: dict[str, str] = {
    (consonant or "") + vowel: kana for [kana, consonant, vowel] in _mora_list_minimum
}

# e.g. "ヴォ" -> ("v", "o"), "ア" -> (None, "a")
mora_kata_to_mora_phonemes: dict[str, tuple[Optional[str], str]] = {
    kana: (consonant, vowel)
    for [kana, consonant, vowel] in _mora_list_minimum + _mora_list_additional
}


# Dictionary of symbol replacements used during normalization
rep_map = {
    ":": ":",
    ";": ";",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "⋯": "…",
    "···": "…",
    "・・・": "…",
    "·": ",",
    "・": ",",
    "•": ",",
    "、": ",",
    "$": ".",
    # "“": "'",
    # "”": "'",
    # '"': "'",
    "‘": "'",
    "’": "'",
    # "(": "'",
    # ")": "'",
    # "(": "'",
    # ")": "'",
    # "《": "'",
    # "》": "'",
    # "【": "'",
    # "】": "'",
    # "[": "'",
    # "]": "'",
    # "——": "-",
    # "−": "-",
    # "-": "-",
    # "『": "'",
    # "』": "'",
    # "〈": "'",
    # "〉": "'",
    # "«": "'",
    # "»": "'",
    # # "~": "-",  # now treated as the long-vowel mark "ー"
    # # "~": "-",  # now treated as the long-vowel mark "ー"
    # "「": "'",
    # "」": "'",
}


def _numeric_feature_by_regex(regex, s):
    match = re.search(regex, s)
    if match is None:
        return -50
    return int(match.group(1))


def replace_punctuation(text: str) -> str:
    """Normalize punctuation to ".", ",", "!", "?", "'", "-" and keep only the
    characters OpenJTalk can read: kanji, hiragana, katakana, alphabet, Greek letters.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    # print("before: ", text)
    # replace punctuation using the dictionary
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        # hiragana, katakana, kanji
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        # half-width alphabet (upper and lower case)
        + r"\u0041-\u005A\u0061-\u007A"
        # full-width alphabet (upper and lower case)
        + r"\uFF21-\uFF3A\uFF41-\uFF5A"
        # Greek letters
        + r"\u0370-\u03FF\u1F00-\u1FFF"
        # "!", "?", "…", ",", ".", "'", "-"; "…" has already been converted
        + "".join(punctuation) + r"]+",
        # delete everything else
        "",
        replaced_text,
    )
    # print("after: ", replaced_text)
    return replaced_text


def fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
    """
    Clamp the tone (accent) values in `phone_tone_list` to the range 0-1.
    Example: [(a, 0), (i, -1), (u, -1)] -> [(a, 1), (i, 0), (u, 0)]
    """
    tone_values = set(tone for _, tone in phone_tone_list)
    if len(tone_values) == 1:
        assert tone_values == {0}, tone_values
        return phone_tone_list
    elif len(tone_values) == 2:
        if tone_values == {0, 1}:
            return phone_tone_list
        elif tone_values == {-1, 0}:
            return [
                (letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list
            ]
        else:
            raise ValueError(f"Unexpected tone values: {tone_values}")
    else:
        raise ValueError(f"Unexpected tone values: {tone_values}")


def fix_phone_tone_wplen(phone_tone_list, word_phone_length_list):
    phones = []
    tones = []
    w_p_len = []
    p_len = len(phone_tone_list)
    idx = 0
    w_idx = 0
    while idx < p_len:
        offset = 0
        if phone_tone_list[idx] == "▁":
            w_p_len.append(w_idx + 1)

        curr_w_p_len = word_phone_length_list[w_idx]
        for i in range(curr_w_p_len):
            p, t = phone_tone_list[idx]
            if p == ":" and len(phones) > 0:
                if phones[-1][-1] != ":":
                    phones[-1] += ":"
                    offset -= 1
            else:
                phones.append(p)
                tones.append(str(t))
            idx += 1
            if idx >= p_len:
                break
        w_p_len.append(curr_w_p_len + offset)
        w_idx += 1
        # print(w_p_len)
    return phones, tones, w_p_len


def g2phone_tone_wo_punct(prosodies) -> list[tuple[str, int]]:
    """
    Return a list of (phoneme, accent 0/1) pairs for the text.
    Non-phoneme symbols such as "!", ".", "?" (punctuation) are all removed
    (pause markers are not kept either); punctuation is re-inserted later by
    `align_tones()`. Note "っ" becomes "q" rather than "cl" ("ん" stays "N").
    Example: "こんにちは、世界ー。。元気?!" ->
    [('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]
    """
    result: list[tuple[str, int]] = []
    current_phrase: list[tuple[str, int]] = []
    current_tone = 0
    last_accent = ""
    for i, letter in enumerate(prosodies):
        # handle special symbols

        # sentence-start symbol: ignore
        if letter == "^":
            assert i == 0, "Unexpected ^"
        # symbols that close an accent phrase
        elif letter in ("$", "?", "_", "#"):
            # flush the held phrase, with accents clamped to 0-1
            result.extend(fix_phone_tone(current_phrase))
            # end-of-sentence symbols: ignore (mid-sentence questions become "_")
            if letter in ("$", "?"):
                assert i == len(prosodies) - 1, f"Unexpected {letter}"
            # the remaining "_" (pause) and "#" (accent-phrase boundary)
            # are dropped; reset state for the next accent phrase

            current_phrase = []
            # rise/fall relative to 0 (negatives are fixed by `fix_phone_tone` above)
            current_tone = 0
            last_accent = ""
        # accent-rise symbol
        elif letter == "[":
            if last_accent != letter:
                current_tone = current_tone + 1
            last_accent = letter
        # accent-fall symbol
        elif letter == "]":
            if last_accent != letter:
                current_tone = current_tone - 1
            last_accent = letter
        # everything else is a regular phoneme
        else:
            if letter == "cl":  # handle "っ"
                letter = "q"
            current_phrase.append((letter, current_tone))
    return result


def handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
    for i in range(len(sep_phonemes)):
        if sep_phonemes[i][0] == "ー":
            # sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
            sep_phonemes[i][0] = ":"
        if "ー" in sep_phonemes[i]:
            for j in range(len(sep_phonemes[i])):
                if sep_phonemes[i][j] == "ー":
                    # sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
                    sep_phonemes[i][j] = ":"
    return sep_phonemes


def handle_long_word(sep_phonemes: list[list[str]]) -> list[list[str]]:
    res = []
    for i in range(len(sep_phonemes)):
        if sep_phonemes[i][0] == "ー":
            sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
            # sep_phonemes[i][0] = ':'
        if "ー" in sep_phonemes[i]:
            for j in range(len(sep_phonemes[i])):
                if sep_phonemes[i][j] == "ー":
                    sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
                    # sep_phonemes[i][j] = ':'
        res.append(sep_phonemes[i])
        res.append("▁")
    return res


def align_tones(
    phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
) -> list[tuple[str, int]]:
    """
    Example:
    …私は、、そう思う。
    phones_with_punct:
    [".", ".", ".", "w", "a", "t", "a", "sh", "i", "w", "a", ",", ",", "s", "o", "o", "o", "m", "o", "u", "."]
    phone_tone_list:
    [("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0))]
    Return:
    [(".", 0), (".", 0), (".", 0), ("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), (",", 0), (",", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0), (".", 0)]
    """
    result: list[tuple[str, int]] = []
    tone_index = 0
    for phone in phones_with_punct:
        if tone_index >= len(phone_tone_list):
            # leftover punctuation -> append (punctuation, 0)
            result.append((phone, 0))
        elif phone == phone_tone_list[tone_index][0]:
            # matches the current phoneme in phone_tone_list -> take its tone
            result.append((phone, phone_tone_list[tone_index][1]))
            # advance the search index
            tone_index += 1
        elif phone in punctuation or phone == "▁":
            # punctuation -> append (phone, 0)
            result.append((phone, 0))
        else:
            print(f"phones: {phones_with_punct}")
            print(f"phone_tone_list: {phone_tone_list}")
            print(f"result: {result}")
            print(f"tone_index: {tone_index}")
            print(f"phone: {phone}")
            raise ValueError(f"Unexpected phone: {phone}")
    return result


def kata2phoneme_list(text: str) -> list[str]:
    """
    Take `text`, which is in principle katakana, and convert it as-is into a
    list of phoneme symbols.
    Notes:
    - If punctuation arrives (it may be a single character), return it as a
      one-element list without further processing.
    - A leading "ー" stays "ー" (it is handled later by `handle_long()`).
    - A mid-word "ー" is converted to the last phoneme of the preceding mora.
    Examples:
    `ーーソーナノカーー` -> ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
    `?` -> ["?"]
    """
    if text in punctuation:
        return [text]
    # check that `text` consists only of katakana (including "ー")
    if re.fullmatch(r"[\u30A0-\u30FF]+", text) is None:
        raise ValueError(f"Input must be katakana only: {text}")
    sorted_keys = sorted(mora_kata_to_mora_phonemes.keys(), key=len, reverse=True)
    pattern = "|".join(map(re.escape, sorted_keys))

    def mora2phonemes(mora: str) -> str:
        cosonant, vowel = mora_kata_to_mora_phonemes[mora]
        if cosonant is None:
            return f" {vowel}"
        return f" {cosonant} {vowel}"

    spaced_phonemes = re.sub(pattern, lambda m: mora2phonemes(m.group()), text)

    # handle the long-vowel mark "ー"
    long_pattern = r"(\w)(ー*)"
    long_replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
    spaced_phonemes = re.sub(long_pattern, long_replacement, spaced_phonemes)
    # spaced_phonemes += ' ▁'
    return spaced_phonemes.strip().split(" ")


def frontend2phoneme(labels, drop_unvoiced_vowels=False):
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]
        # print(lab_curr)
        # current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)

        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            # assert n == 0 or n == N - 1
            # if n == 0:
            #     phones.append("^")
            # elif n == N - 1:
            #     # check question form or not
            #     e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
            #     if e3 == 0:
            #         phones.append("$")
            #     elif e3 == 1:
            #         phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        # print(p3, a1, a2, a3, f1, a2_next, lab_curr)
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    # phones = ' '.join(phones)
    return phones


class JapanesePhoneConverter(object):
    def __init__(self, lexicon_path=None, ipa_dict_path=None):
        # lexicon_lines = open(lexicon_path, 'r', encoding='utf-8').readlines()
        # self.lexicon = {}
        # self.single_dict = {}
        # self.double_dict = {}
        # for curr_line in lexicon_lines:
        #     k,v = curr_line.strip().split('+',1)
        #     self.lexicon[k] = v
        #     if len(k) == 2:
        #         self.double_dict[k] = v
        #     elif len(k) == 1:
        #         self.single_dict[k] = v
        self.ipa_dict = {}
        for curr_line in jp_xphone2ipa:
            k, v = curr_line.strip().split(" ", 1)
            self.ipa_dict[k] = re.sub("\s", "", v)
        # kakasi1 = kakasi()
        # kakasi1.setMode("H","K")
        # kakasi1.setMode("J","K")
        # kakasi1.setMode("r","Hepburn")
        self.japan_JH2K = kakasi()
        self.table = {ord(f): ord(t) for f, t in zip("67", "_¯")}

    def text2sep_kata(self, parsed) -> tuple[list[str], list[str]]:
        """
        Take `norm_text` already normalized by `text_normalize`, split it into
        words, and return a tuple of the word list and the list of their
        readings (katakana, or a single symbol).
        The word split is used by `word2ph` in `g2p()` to decide how many
        phoneme symbols to assign to each character.
        Example:
        `私はそう思う!って感じ?` ->
        ["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]
        """
        # parsed: OpenJTalk's analysis result
        sep_text: list[str] = []
        sep_kata: list[str] = []
        fix_parsed = []
        i = 0
        while i <= len(parsed) - 1:
            # word: the word's actual string
            # yomi: its reading, with the devoicing mark "’" removed
            # print(parsed)
            yomi = parsed[i]["pron"]
            tmp_parsed = parsed[i]
            if i != len(parsed) - 1 and parsed[i + 1]["string"] in [
                "々",
                "ゝ",
                "ヽ",
                "ゞ",
                "ヾ",
                "゛",
            ]:
                word = parsed[i]["string"] + parsed[i + 1]["string"]
                i += 1
            else:
                word = parsed[i]["string"]
            word, yomi = replace_punctuation(word), yomi.replace("’", "")
            """
            Here the possible values of `yomi` should be:
            - `word` is a regular word -> its regular reading in katakana,
              possibly with the long-vowel mark, e.g. `アー`
            - `word` starts with `ー` -> `ーラー`, `ーーー`, and so on
            - `word` is punctuation, whitespace, etc. -> `、`
            - `word` is `?` -> `?` (full-width)
            Unreadable Cyrillic or Arabic characters in `word` would also
            produce `、`, but after normalization that should not occur. The
            original code also handled an empty `yomi`, which likewise should
            not occur. Only the `yomi == 、` case needs handling.
            """
            assert yomi != "", f"Empty yomi: {word}"
            if yomi == "、":
                # word is normalized, so it is one of `.`, `,`, `!`, `'`, `-`
                if word not in (
                    ".",
                    ",",
                    "!",
                    "'",
                    "-",
                    "?",
                    ":",
                    ";",
                    "…",
                    "",
                ):
                    # this happens when pyopenjtalk cannot read the character
                    # print(
                    #     "{}Cannot read:{}, yomi:{}, new_word:{};".format(
                    #         parsed, word, yomi, self.japan_JH2K.convert(word)[0]["kana"]
                    #     )
                    # )
                    # raise ValueError(word)
                    word = self.japan_JH2K.convert(word)[0]["kana"]
                    # print(word, self.japan_JH2K.convert(word)[0]['kana'], kata2phoneme_list(self.japan_JH2K.convert(word)[0]['kana']))
                    tmp_parsed["pron"] = word
                # yomi = "-"
                # word = ','
                # set yomi back to the original symbol
                # else:
                #     parsed[i]['pron'] = parsed[i]["string"]
                yomi = word
            elif yomi == "?":
                assert word == "?", f"yomi `?` comes from: {word}"
                yomi = "?"
            if word == "":
                i += 1
                continue
            sep_text.append(word)
            sep_kata.append(yomi)
            # print(word, yomi, parts)
            fix_parsed.append(tmp_parsed)
            i += 1
        # print(sep_text, sep_kata)
        return sep_text, sep_kata, fix_parsed

    def getSentencePhone(self, sentence, blank_mode=True, phoneme_mode=False):
        # print("origin:", sentence)
        words = []
        words_phone_len = []
        short_char_flag = False
        output_duration_flag = []
        output_before_sil_flag = []
        normed_text = []
        sentence = sentence.strip().strip("'")
        sentence = re.sub(r"\s+", "", sentence)
        output_res = []
        failed_words = []
        last_long_pause = 4
        last_word = None
        frontend_text = pyopenjtalk.run_frontend(sentence)
        # print("frontend_text: ", frontend_text)
        try:
            frontend_text = pyopenjtalk.estimate_accent(frontend_text)
        except:
            pass
        # print("estimate_accent: ", frontend_text)
        # sep_text: list of words, one entry per word
        # sep_kata: list of katakana readings, one entry per word
        sep_text, sep_kata, frontend_text = self.text2sep_kata(frontend_text)
        # print("sep_text: ", sep_text)
        # print("sep_kata: ", sep_kata)
        # print("frontend_text: ", frontend_text)
        # sep_phonemes: list of phoneme lists, one per word
        sep_phonemes = handle_long_word([kata2phoneme_list(i) for i in sep_kata])
        # print("sep_phonemes: ", sep_phonemes)

        pron_text = [x["pron"].strip().replace("’", "") for x in frontend_text]
        # pdb.set_trace()
        prosodys = pyopenjtalk.make_label(frontend_text)
        prosodys = frontend2phoneme(prosodys, drop_unvoiced_vowels=True)
        # print("prosodys: ", ' '.join(prosodys))
        # print("pron_text: ", pron_text)
        normed_text = [x["string"].strip() for x in frontend_text]
        # list of (phoneme, accent) tuples with all punctuation removed
        phone_tone_list_wo_punct = g2phone_tone_wo_punct(prosodys)
        # print("phone_tone_list_wo_punct: ", phone_tone_list_wo_punct)

        # phone_w_punct: sep_phonemes concatenated, punctuation kept as-is
        phone_w_punct: list[str] = []
        w_p_len = []
        for i in sep_phonemes:
            phone_w_punct += i
            w_p_len.append(len(i))
        phone_w_punct = phone_w_punct[:-1]
        # use the punctuation-free accent info to build accent info that includes punctuation
        # print("phone_w_punct: ", phone_w_punct)
        # print("phone_tone_list_wo_punct: ", phone_tone_list_wo_punct)
        phone_tone_list = align_tones(phone_w_punct, phone_tone_list_wo_punct)

        jp_item = {}
        jp_p = ""
        jp_t = ""
        # mye rye pye bye nye
        # je she
        # print(phone_tone_list)
        for p, t in phone_tone_list:
            if p in self.ipa_dict:
                curr_p = self.ipa_dict[p]
                jp_p += curr_p
                jp_t += str(t + 6) * len(curr_p)
            elif p in punctuation:
                jp_p += p
                jp_t += "0"
            elif p == "▁":
                jp_p += p
                jp_t += " "
            else:
                print(p, t)
                jp_p += "|"
                jp_t += "0"
        # return phones, tones, w_p_len
        jp_p = jp_p.replace("▁", " ")
        jp_t = jp_t.translate(self.table)
        jp_l = ""
        for t in jp_t:
            if t == " ":
                jp_l += " "
            else:
                jp_l += "2"
        # print(jp_p)
        # print(jp_t)
        # print(jp_l)
        # print(len(jp_p_len), sum(w_p_len), len(jp_p), sum(jp_p_len))
        assert len(jp_p) == len(jp_t) and len(jp_p) == len(jp_l)
assert len(jp_p) == len(jp_t) and len(jp_p) == len(jp_l)
|
793 |
+
|
794 |
+
jp_item["jp_p"] = jp_p.replace("| |", "|").rstrip("|")
|
795 |
+
jp_item["jp_t"] = jp_t
|
796 |
+
jp_item["jp_l"] = jp_l
|
797 |
+
jp_item["jp_normed_text"] = " ".join(normed_text)
|
798 |
+
jp_item["jp_pron_text"] = " ".join(pron_text)
|
799 |
+
# jp_item['jp_ruoma'] = sep_phonemes
|
800 |
+
# print(len(normed_text), len(sep_phonemes))
|
801 |
+
# print(normed_text)
|
802 |
+
return jp_item
|
803 |
+
|
804 |
+
|
805 |
+
jpc = JapanesePhoneConverter()
|
806 |
+
|
807 |
+
|
808 |
+
def japanese_to_ipa(text, text_tokenizer):
|
809 |
+
# phonemes = text_tokenizer(text)
|
810 |
+
if type(text) == str:
|
811 |
+
return jpc.getSentencePhone(text)["jp_p"]
|
812 |
+
else:
|
813 |
+
result_ph = []
|
814 |
+
for t in text:
|
815 |
+
result_ph.append(jpc.getSentencePhone(t)["jp_p"])
|
816 |
+
return result_ph
|
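For orientation, a minimal usage sketch of the entry point above (the sample sentence is illustrative; the second argument is accepted for interface symmetry but unused on the Japanese path, so None is fine here):

# Requires pyopenjtalk, pykakasi, and the module's lexicon resources.
ipa = japanese_to_ipa("私はそう思う", None)                    # single string -> IPA string ("jp_p")
ipa_list = japanese_to_ipa(["おはよう", "ありがとう"], None)   # list in -> list of IPA strings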
diffrhythm/g2p/g2p/korean.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re

"""
Text clean time
"""
english_dictionary = {
    "KOREA": "코리아", "IDOL": "아이돌", "IT": "아이티", "IQ": "아이큐",
    "UP": "업", "DOWN": "다운", "PC": "피씨", "CCTV": "씨씨티비",
    "SNS": "에스엔에스", "AI": "에이아이", "CEO": "씨이오",
    "A": "에이", "B": "비", "C": "씨", "D": "디", "E": "이", "F": "에프",
    "G": "지", "H": "에이치", "I": "아이", "J": "제이", "K": "케이", "L": "엘",
    "M": "엠", "N": "엔", "O": "오", "P": "피", "Q": "큐", "R": "알",
    "S": "에스", "T": "티", "U": "유", "V": "브이", "W": "더블유",
    "X": "엑스", "Y": "와이", "Z": "제트",
}


def normalize(text):
    text = text.strip()
    text = re.sub(
        "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text
    )
    text = normalize_english(text)
    text = text.lower()
    return text


def normalize_english(text):
    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text


def korean_to_ipa(text, text_tokenizer):
    if type(text) == str:
        text = normalize(text)
        phonemes = text_tokenizer(text)
        return phonemes
    else:
        for i, t in enumerate(text):
            text[i] = normalize(t)
        return text_tokenizer(text)
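A minimal sketch of the Korean path, assuming a `TextTokenizer` built from the tokenizer module later in this commit (configuring it with the espeak `ko` backend is an assumption consistent with the package's language table):

from diffrhythm.g2p.g2p.text_tokenizers import TextTokenizer

ko_tokenizer = TextTokenizer(language="ko")
print(normalize("AI 시대"))                      # "에이아이 시대": Latin tokens mapped via english_dictionary
phonemes = korean_to_ipa("AI 시대", ko_tokenizer)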
diffrhythm/g2p/g2p/mandarin.py
ADDED
@@ -0,0 +1,595 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
import jieba
import cn2an
from pypinyin import lazy_pinyin, BOPOMOFO
from typing import List
from diffrhythm.g2p.g2p.chinese_model_g2p import BertPolyPredict
from diffrhythm.g2p.utils.front_utils import *
import os

# from g2pw import G2PWConverter


# set blank level, {0:"none",1:"char", 2:"word"}
BLANK_LEVEL = 0

# conv = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)
resource_path = r"./diffrhythm/g2p"
poly_all_class_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polychar.txt"
)
if not os.path.exists(poly_all_class_path):
    print(
        "Incorrect path for polyphonic character class dictionary: {}, please check...".format(
            poly_all_class_path
        )
    )
    exit()
poly_dict = generate_poly_lexicon(poly_all_class_path)

# Set up G2PW model parameters
g2pw_poly_model_path = os.path.join(resource_path, "sources", "g2p_chinese_model")
if not os.path.exists(g2pw_poly_model_path):
    print(
        "Incorrect path for g2pw polyphonic character model: {}, please check...".format(
            g2pw_poly_model_path
        )
    )
    exit()

json_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict.json"
)
if not os.path.exists(json_file_path):
    print(
        "Incorrect path for g2pw id to pinyin dictionary: {}, please check...".format(
            json_file_path
        )
    )
    exit()

jsonr_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict_r.json"
)
if not os.path.exists(jsonr_file_path):
    print(
        "Incorrect path for g2pw pinyin to id dictionary: {}, please check...".format(
            jsonr_file_path
        )
    )
    exit()

g2pw_poly_predict = BertPolyPredict(
    g2pw_poly_model_path, jsonr_file_path, json_file_path
)


"""
Text clean time
"""
# List of (Latin alphabet, bopomofo) pairs:
_latin_to_bopomofo = [
    (re.compile("%s" % x[0], re.IGNORECASE), x[1])
    for x in [
        ("a", "ㄟˉ"), ("b", "ㄅㄧˋ"), ("c", "ㄙㄧˉ"), ("d", "ㄉㄧˋ"),
        ("e", "ㄧˋ"), ("f", "ㄝˊㄈㄨˋ"), ("g", "ㄐㄧˋ"), ("h", "ㄝˇㄑㄩˋ"),
        ("i", "ㄞˋ"), ("j", "ㄐㄟˋ"), ("k", "ㄎㄟˋ"), ("l", "ㄝˊㄛˋ"),
        ("m", "ㄝˊㄇㄨˋ"), ("n", "ㄣˉ"), ("o", "ㄡˉ"), ("p", "ㄆㄧˉ"),
        ("q", "ㄎㄧㄡˉ"), ("r", "ㄚˋ"), ("s", "ㄝˊㄙˋ"), ("t", "ㄊㄧˋ"),
        ("u", "ㄧㄡˉ"), ("v", "ㄨㄧˉ"), ("w", "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ"),
        ("x", "ㄝˉㄎㄨˋㄙˋ"), ("y", "ㄨㄞˋ"), ("z", "ㄗㄟˋ"),
    ]
]

# List of (bopomofo, ipa) pairs:
_bopomofo_to_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("ㄅㄛ", "p⁼wo"), ("ㄆㄛ", "pʰwo"), ("ㄇㄛ", "mwo"), ("ㄈㄛ", "fwo"),
        ("ㄧㄢ", "|jɛn"), ("ㄩㄢ", "|ɥæn"), ("ㄧㄣ", "|in"), ("ㄩㄣ", "|ɥn"),
        ("ㄧㄥ", "|iŋ"), ("ㄨㄥ", "|ʊŋ"), ("ㄩㄥ", "|jʊŋ"),
        # Add
        ("ㄧㄚ", "|ia"), ("ㄧㄝ", "|iɛ"), ("ㄧㄠ", "|iɑʊ"), ("ㄧㄡ", "|ioʊ"),
        ("ㄧㄤ", "|iɑŋ"), ("ㄨㄚ", "|ua"), ("ㄨㄛ", "|uo"), ("ㄨㄞ", "|uaɪ"),
        ("ㄨㄟ", "|ueɪ"), ("ㄨㄢ", "|uan"), ("ㄨㄣ", "|uən"), ("ㄨㄤ", "|uɑŋ"),
        ("ㄩㄝ", "|ɥɛ"),
        # End
        ("ㄅ", "p⁼"), ("ㄆ", "pʰ"), ("ㄇ", "m"), ("ㄈ", "f"),
        ("ㄉ", "t⁼"), ("ㄊ", "tʰ"), ("ㄋ", "n"), ("ㄌ", "l"),
        ("ㄍ", "k⁼"), ("ㄎ", "kʰ"), ("ㄏ", "x"), ("ㄐ", "tʃ⁼"),
        ("ㄑ", "tʃʰ"), ("ㄒ", "ʃ"), ("ㄓ", "ts`⁼"), ("ㄔ", "ts`ʰ"),
        ("ㄕ", "s`"), ("ㄖ", "ɹ`"), ("ㄗ", "ts⁼"), ("ㄘ", "tsʰ"),
        ("ㄙ", "|s"), ("ㄚ", "|a"), ("ㄛ", "|o"), ("ㄜ", "|ə"),
        ("ㄝ", "|ɛ"), ("ㄞ", "|aɪ"), ("ㄟ", "|eɪ"), ("ㄠ", "|ɑʊ"),
        ("ㄡ", "|oʊ"), ("ㄢ", "|an"), ("ㄣ", "|ən"), ("ㄤ", "|ɑŋ"),
        ("ㄥ", "|əŋ"), ("ㄦ", "əɹ"), ("ㄧ", "|i"), ("ㄨ", "|u"),
        ("ㄩ", "|ɥ"),
        ("ˉ", "→|"), ("ˊ", "↑|"), ("ˇ", "↓↑|"), ("ˋ", "↓|"), ("˙", "|"),
    ]
]
must_not_er_words = {"女儿", "老儿", "男儿", "少儿", "小儿"}

word_pinyin_dict = {}
with open(
    r"./diffrhythm/g2p/sources/chinese_lexicon.txt", "r", encoding="utf-8"
) as fread:
    txt_list = fread.readlines()
    for txt in txt_list:
        word, pinyin = txt.strip().split("\t")
        word_pinyin_dict[word] = pinyin
fread.close()

pinyin_2_bopomofo_dict = {}
with open(
    r"./diffrhythm/g2p/sources/pinyin_2_bpmf.txt", "r", encoding="utf-8"
) as fread:
    txt_list = fread.readlines()
    for txt in txt_list:
        pinyin, bopomofo = txt.strip().split("\t")
        pinyin_2_bopomofo_dict[pinyin] = bopomofo
fread.close()

tone_dict = {
    "0": "˙",
    "5": "˙",
    "1": "",
    "2": "ˊ",
    "3": "ˇ",
    "4": "ˋ",
}

bopomofos2pinyin_dict = {}
with open(
    r"./diffrhythm/g2p/sources/bpmf_2_pinyin.txt", "r", encoding="utf-8"
) as fread:
    txt_list = fread.readlines()
    for txt in txt_list:
        v, k = txt.strip().split("\t")
        bopomofos2pinyin_dict[k] = v
fread.close()


def bpmf_to_pinyin(text):
    bopomofo_list = text.split("|")
    pinyin_list = []
    for info in bopomofo_list:
        pinyin = ""
        for c in info:
            if c in bopomofos2pinyin_dict:
                pinyin += bopomofos2pinyin_dict[c]
        if len(pinyin) == 0:
            continue
        if pinyin[-1] not in "01234":
            pinyin += "1"
        if pinyin[:-1] == "ve":
            pinyin = "y" + pinyin
        if pinyin[:-1] == "sh":
            pinyin = pinyin[:-1] + "i" + pinyin[-1]
        if pinyin == "sh":
            pinyin = pinyin[:-1] + "i"
        if pinyin[:-1] == "s":
            pinyin = "si" + pinyin[-1]
        if pinyin[:-1] == "c":
            pinyin = "ci" + pinyin[-1]
        if pinyin[:-1] == "i":
            pinyin = "yi" + pinyin[-1]
        if pinyin[:-1] == "iou":
            pinyin = "you" + pinyin[-1]
        if pinyin[:-1] == "ien":
            pinyin = "yin" + pinyin[-1]
        if "iou" in pinyin and pinyin[-4:-1] == "iou":
            pinyin = pinyin[:-4] + "iu" + pinyin[-1]
        if "uei" in pinyin:
            if pinyin[:-1] == "uei":
                pinyin = "wei" + pinyin[-1]
            elif pinyin[-4:-1] == "uei":
                pinyin = pinyin[:-4] + "ui" + pinyin[-1]
        if "uen" in pinyin and pinyin[-4:-1] == "uen":
            if pinyin[:-1] == "uen":
                pinyin = "wen" + pinyin[-1]
            elif pinyin[-4:-1] == "uei":
                pinyin = pinyin[:-4] + "un" + pinyin[-1]
        if "van" in pinyin and pinyin[-4:-1] == "van":
            if pinyin[:-1] == "van":
                pinyin = "yuan" + pinyin[-1]
            elif pinyin[-4:-1] == "van":
                pinyin = pinyin[:-4] + "uan" + pinyin[-1]
        if "ueng" in pinyin and pinyin[-5:-1] == "ueng":
            pinyin = pinyin[:-5] + "ong" + pinyin[-1]
        if pinyin[:-1] == "veng":
            pinyin = "yong" + pinyin[-1]
        if "veng" in pinyin and pinyin[-5:-1] == "veng":
            pinyin = pinyin[:-5] + "iong" + pinyin[-1]
        if pinyin[:-1] == "ieng":
            pinyin = "ying" + pinyin[-1]
        if pinyin[:-1] == "u":
            pinyin = "wu" + pinyin[-1]
        if pinyin[:-1] == "v":
            pinyin = "yv" + pinyin[-1]
        if pinyin[:-1] == "ing":
            pinyin = "ying" + pinyin[-1]
        if pinyin[:-1] == "z":
            pinyin = "zi" + pinyin[-1]
        if pinyin[:-1] == "zh":
            pinyin = "zhi" + pinyin[-1]
        if pinyin[0] == "u":
            pinyin = "w" + pinyin[1:]
        if pinyin[0] == "i":
            pinyin = "y" + pinyin[1:]
        pinyin = pinyin.replace("ien", "in")

        pinyin_list.append(pinyin)
    return " ".join(pinyin_list)


# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    # numbers = re.findall(r'\d+(?:\.?\d+)?', text)
    # for number in numbers:
    #     text = text.replace(number, cn2an.an2cn(number), 1)
    text = cn2an.transform(text, "an2cn")
    return text


def normalization(text):
    text = text.replace(",", ",")
    text = text.replace("。", ".")
    text = text.replace("!", "!")
    text = text.replace("?", "?")
    text = text.replace(";", ";")
    text = text.replace(":", ":")
    text = text.replace("、", ",")
    text = text.replace("‘", "'")
    text = text.replace("’", "'")
    text = text.replace("⋯", "…")
    text = text.replace("···", "…")
    text = text.replace("・・・", "…")
    text = text.replace("...", "…")
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'…]", "", text)
    text = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", text)
    return text


def change_tone(bopomofo: str, tone: str) -> str:
    if bopomofo[-1] not in "˙ˊˇˋ":
        bopomofo = bopomofo + tone
    else:
        bopomofo = bopomofo[:-1] + tone
    return bopomofo


def er_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    if len(word) > 1 and word[-1] == "儿" and word not in must_not_er_words:
        bopomofos[-1] = change_tone(bopomofos[-1], "˙")
    return bopomofos


def bu_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    valid_char = set(word)
    if len(valid_char) == 1 and "不" in valid_char:
        pass
    elif word in ["不字"]:
        pass
    elif len(word) == 3 and word[1] == "不" and bopomofos[1][:-1] == "ㄅㄨ":
        bopomofos[1] = bopomofos[1][:-1] + "˙"
    else:
        for i, char in enumerate(word):
            if (
                i + 1 < len(bopomofos)
                and char == "不"
                and i + 1 < len(word)
                and 0 < len(bopomofos[i + 1])
                and bopomofos[i + 1][-1] == "ˋ"
            ):
                bopomofos[i] = bopomofos[i][:-1] + "ˊ"
    return bopomofos


def yi_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    punc = ":,;。?!“”‘’':,;.?!()(){}【】[]-~`、 "
    if word.find("一") != -1 and any(
        [item.isnumeric() for item in word if item != "一"]
    ):
        for i in range(len(word)):
            if (
                i == 0
                and word[0] == "一"
                and len(word) > 1
                and word[1]
                not in ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
            ):
                if len(bopomofos[0]) > 0 and bopomofos[1][-1] in ["ˋ", "˙"]:
                    bopomofos[0] = change_tone(bopomofos[0], "ˊ")
                else:
                    bopomofos[0] = change_tone(bopomofos[0], "ˋ")
            elif word[i] == "一":
                bopomofos[i] = change_tone(bopomofos[i], "")
        return bopomofos
    elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
        bopomofos[1] = change_tone(bopomofos[1], "˙")
    elif word.startswith("第一"):
        bopomofos[1] = change_tone(bopomofos[1], "")
    elif word.startswith("一月") or word.startswith("一日") or word.startswith("一号"):
        bopomofos[0] = change_tone(bopomofos[0], "")
    else:
        for i, char in enumerate(word):
            if char == "一" and i + 1 < len(word):
                if (
                    len(bopomofos) > i + 1
                    and len(bopomofos[i + 1]) > 0
                    and bopomofos[i + 1][-1] in {"ˋ"}
                ):
                    bopomofos[i] = change_tone(bopomofos[i], "ˊ")
                else:
                    if word[i + 1] not in punc:
                        bopomofos[i] = change_tone(bopomofos[i], "ˋ")
                    else:
                        pass
    return bopomofos


def merge_bu(seg: List) -> List:
    new_seg = []
    last_word = ""
    for word in seg:
        if word != "不":
            if last_word == "不":
                word = last_word + word
            new_seg.append(word)
        last_word = word
    return new_seg


def merge_er(seg: List) -> List:
    new_seg = []
    for i, word in enumerate(seg):
        if i - 1 >= 0 and word == "儿":
            new_seg[-1] = new_seg[-1] + seg[i]
        else:
            new_seg.append(word)
    return new_seg


def merge_yi(seg: List) -> List:
    new_seg = []
    # function 1
    for i, word in enumerate(seg):
        if (
            i - 1 >= 0
            and word == "一"
            and i + 1 < len(seg)
            and seg[i - 1] == seg[i + 1]
        ):
            if i - 1 < len(new_seg):
                new_seg[i - 1] = new_seg[i - 1] + "一" + new_seg[i - 1]
            else:
                new_seg.append(word)
                new_seg.append(seg[i + 1])
        else:
            if i - 2 >= 0 and seg[i - 1] == "一" and seg[i - 2] == word:
                continue
            else:
                new_seg.append(word)
    seg = new_seg
    new_seg = []
    isnumeric_flag = False
    for i, word in enumerate(seg):
        if all([item.isnumeric() for item in word]) and not isnumeric_flag:
            isnumeric_flag = True
            new_seg.append(word)
        else:
            new_seg.append(word)
    seg = new_seg
    new_seg = []
    # function 2
    for i, word in enumerate(seg):
        if new_seg and new_seg[-1] == "一":
            new_seg[-1] = new_seg[-1] + word
        else:
            new_seg.append(word)
    return new_seg


# Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
def chinese_to_bopomofo(text_short, sentence):
    # bopomofos = conv(text_short)
    words = jieba.lcut(text_short, cut_all=False)
    words = merge_yi(words)
    words = merge_bu(words)
    words = merge_er(words)
    text = ""

    char_index = 0
    for word in words:
        bopomofos = []
        if word in word_pinyin_dict and word not in poly_dict:
            pinyin = word_pinyin_dict[word]
            for py in pinyin.split(" "):
                if py[:-1] in pinyin_2_bopomofo_dict and py[-1] in tone_dict:
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                else:
                    bopomofos_lazy = lazy_pinyin(word, BOPOMOFO)
                    bopomofos += bopomofos_lazy
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
        else:
            for i in range(len(word)):
                c = word[i]
                if c in poly_dict:
                    poly_pinyin = g2pw_poly_predict.predict_process(
                        [text_short, char_index + i]
                    )[0]
                    py = poly_pinyin[2:-1]
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                elif c in word_pinyin_dict:
                    py = word_pinyin_dict[c]
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                else:
                    bopomofos.append(c)
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
        if BLANK_LEVEL == 2:
            bopomofos.append("_")
        char_index += len(word)

        if (
            len(word) == 3
            and bopomofos[0][-1] == "ˇ"
            and bopomofos[1][-1] == "ˇ"
            and bopomofos[-1][-1] == "ˇ"
        ):
            bopomofos[0] = bopomofos[0] + "ˊ"
            bopomofos[1] = bopomofos[1] + "ˊ"
        if len(word) == 2 and bopomofos[0][-1] == "ˇ" and bopomofos[-1][-1] == "ˇ":
            bopomofos[0] = bopomofos[0][:-1] + "ˊ"
        bopomofos = bu_sandhi(word, bopomofos)
        bopomofos = yi_sandhi(word, bopomofos)
        bopomofos = er_sandhi(word, bopomofos)
        if not re.search("[\u4e00-\u9fff]", word):
            text += "|" + word
            continue
        for i in range(len(bopomofos)):
            bopomofos[i] = re.sub(r"([\u3105-\u3129])$", r"\1ˉ", bopomofos[i])
        if text != "":
            text += "|"
        text += "|".join(bopomofos)
    return text


# Convert latin pronunciation to pinyin (bopomofo)
def latin_to_bopomofo(text):
    for regex, replacement in _latin_to_bopomofo:
        text = re.sub(regex, replacement, text)
    return text


# Convert pinyin (bopomofo) to IPA
def bopomofo_to_ipa(text):
    for regex, replacement in _bopomofo_to_ipa:
        text = re.sub(regex, replacement, text)
    return text


def _chinese_to_ipa(text, sentence):
    text = number_to_chinese(text.strip())
    text = normalization(text)
    text = chinese_to_bopomofo(text, sentence)
    # pinyin = bpmf_to_pinyin(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    text = re.sub("([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    text = re.sub("([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
    text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
    text = re.sub(r"\|+", "|", text)
    text = text.rstrip("|")
    return text


# Convert Chinese to IPA
def chinese_to_ipa(text, sentence, text_tokenizer):
    # phonemes = text_tokenizer(text.strip())
    if type(text) == str:
        return _chinese_to_ipa(text, sentence)
    else:
        result_ph = []
        for t in text:
            result_ph.append(_chinese_to_ipa(t, sentence))
        return result_ph
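To make the `_chinese_to_ipa` pipeline concrete, a sketch tracing one sentence through its stages (the sentence and annotations are illustrative, not captured from a run):

text = "我有100个儿子"
text = number_to_chinese(text)            # cn2an: "100" -> "一百"
text = normalization(text)                # unify punctuation, drop non-CJK symbols
text = chinese_to_bopomofo(text, text)    # jieba segmentation, tone sandhi, "|"-separated bopomofo
text = latin_to_bopomofo(text)            # spell out any leftover Latin letters
ipa = bopomofo_to_ipa(text)               # bopomofo -> IPA with tone arrows (→ ↑ ↓↑ ↓)

Note that the module does heavy work at import time (it loads the BERT polyphone model and three lexicons), so the sketch assumes those resources exist under ./diffrhythm/g2p/sources.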
diffrhythm/g2p/g2p/text_tokenizers.py
ADDED
@@ -0,0 +1,84 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
import os
from typing import List, Pattern, Union
from phonemizer.utils import list2str, str2list
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator


class TextTokenizer:
    """Phonemize Text."""

    def __init__(
        self,
        language="en-us",
        backend="espeak",
        separator=Separator(word="|_|", syllable="-", phone="|"),
        preserve_punctuation=True,
        with_stress: bool = False,
        tie: Union[bool, str] = False,
        language_switch: LanguageSwitch = "remove-flags",
        words_mismatch: WordMismatch = "ignore",
    ) -> None:
        self.preserve_punctuation_marks = ",.?!;:'…"
        self.backend = EspeakBackend(
            language,
            punctuation_marks=self.preserve_punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            with_stress=with_stress,
            tie=tie,
            language_switch=language_switch,
            words_mismatch=words_mismatch,
        )

        self.separator = separator

    # convert chinese punctuation to english punctuation
    def convert_chinese_punctuation(self, text: str) -> str:
        text = text.replace(",", ",")
        text = text.replace("。", ".")
        text = text.replace("!", "!")
        text = text.replace("?", "?")
        text = text.replace(";", ";")
        text = text.replace(":", ":")
        text = text.replace("、", ",")
        text = text.replace("‘", "'")
        text = text.replace("’", "'")
        text = text.replace("⋯", "…")
        text = text.replace("···", "…")
        text = text.replace("・・・", "…")
        text = text.replace("...", "…")
        return text

    def __call__(self, text, strip=True) -> List[str]:

        text_type = type(text)
        normalized_text = []
        for line in str2list(text):
            line = self.convert_chinese_punctuation(line.strip())
            line = re.sub(r"[^\w\s_,\.\?!;:\'…]", "", line)
            line = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", line)
            line = re.sub(r"\s+", " ", line)
            normalized_text.append(line)
        # print("Normalized text: ", normalized_text[0])
        phonemized = self.backend.phonemize(
            normalized_text, separator=self.separator, strip=strip, njobs=1
        )
        if text_type == str:
            phonemized = re.sub(r"([,\.\?!;:\'…])", r"|\1|", list2str(phonemized))
            phonemized = re.sub(r"\|+", "|", phonemized)
            phonemized = phonemized.rstrip("|")
        else:
            for i in range(len(phonemized)):
                phonemized[i] = re.sub(r"([,\.\?!;:\'…])", r"|\1|", phonemized[i])
                phonemized[i] = re.sub(r"\|+", "|", phonemized[i])
                phonemized[i] = phonemized[i].rstrip("|")
        return phonemized
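A minimal usage sketch for `TextTokenizer` (requires the `phonemizer` package with an espeak-ng installation; the annotated shapes are illustrative, the exact phoneme string depends on the espeak version):

tokenizer = TextTokenizer(language="en-us")
print(tokenizer("Hello, world."))   # one string in -> one string out; phones "|"-separated, words "|_|"-separated
print(tokenizer(["One.", "Two."]))  # a list in -> a list of phoneme strings out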
diffrhythm/g2p/g2p/vocab.json
ADDED
@@ -0,0 +1,372 @@
{
    "vocab": {
        ",": 0, ".": 1, "?": 2, "!": 3, "_": 4,
        "iː": 5, "ɪ": 6, "ɜː": 7, "ɚ": 8, "oːɹ": 9, "ɔː": 10, "ɔːɹ": 11, "ɑː": 12, "uː": 13, "ʊ": 14,
        "ɑːɹ": 15, "ʌ": 16, "ɛ": 17, "æ": 18, "eɪ": 19, "aɪ": 20, "ɔɪ": 21, "aʊ": 22, "oʊ": 23, "ɪɹ": 24,
        "ɛɹ": 25, "ʊɹ": 26, "p": 27, "b": 28, "t": 29, "d": 30, "k": 31, "ɡ": 32, "f": 33, "v": 34,
        "θ": 35, "ð": 36, "s": 37, "z": 38, "ʃ": 39, "ʒ": 40, "h": 41, "tʃ": 42, "dʒ": 43, "m": 44,
        "n": 45, "ŋ": 46, "j": 47, "w": 48, "ɹ": 49, "l": 50, "tɹ": 51, "dɹ": 52, "ts": 53, "dz": 54,
        "i": 55, "ɔ": 56, "ə": 57, "ɾ": 58, "iə": 59, "r": 60, "u": 61, "oː": 62, "ɛː": 63, "ɪː": 64,
        "aɪə": 65, "aɪɚ": 66, "ɑ̃": 67, "ç": 68, "ɔ̃": 69, "ææ": 70, "ɐɐ": 71, "ɡʲ": 72, "nʲ": 73, "iːː": 74,

        "p⁼": 75, "pʰ": 76, "t⁼": 77, "tʰ": 78, "k⁼": 79, "kʰ": 80, "x": 81, "tʃ⁼": 82, "tʃʰ": 83,
        "ts`⁼": 84, "ts`ʰ": 85, "s`": 86, "ɹ`": 87, "ts⁼": 88, "tsʰ": 89,
        "p⁼wo": 90, "p⁼wo→": 91, "p⁼wo↑": 92, "p⁼wo↓↑": 93, "p⁼wo↓": 94,
        "pʰwo": 95, "pʰwo→": 96, "pʰwo↑": 97, "pʰwo↓↑": 98, "pʰwo↓": 99,
        "mwo": 100, "mwo→": 101, "mwo↑": 102, "mwo↓↑": 103, "mwo↓": 104,
        "fwo": 105, "fwo→": 106, "fwo↑": 107, "fwo↓↑": 108, "fwo↓": 109,
        "jɛn": 110, "jɛn→": 111, "jɛn↑": 112, "jɛn↓↑": 113, "jɛn↓": 114,
        "ɥæn": 115, "ɥæn→": 116, "ɥæn↑": 117, "ɥæn↓↑": 118, "ɥæn↓": 119,
        "in": 120, "in→": 121, "in↑": 122, "in↓↑": 123, "in↓": 124,
        "ɥn": 125, "ɥn→": 126, "ɥn↑": 127, "ɥn↓↑": 128, "ɥn↓": 129,
        "iŋ": 130, "iŋ→": 131, "iŋ↑": 132, "iŋ↓↑": 133, "iŋ↓": 134,
        "ʊŋ": 135, "ʊŋ→": 136, "ʊŋ↑": 137, "ʊŋ↓↑": 138, "ʊŋ↓": 139,
        "jʊŋ": 140, "jʊŋ→": 141, "jʊŋ↑": 142, "jʊŋ↓↑": 143, "jʊŋ↓": 144,
        "ia": 145, "ia→": 146, "ia↑": 147, "ia↓↑": 148, "ia↓": 149,
        "iɛ": 150, "iɛ→": 151, "iɛ↑": 152, "iɛ↓↑": 153, "iɛ↓": 154,
        "iɑʊ": 155, "iɑʊ→": 156, "iɑʊ↑": 157, "iɑʊ↓↑": 158, "iɑʊ↓": 159,
        "ioʊ": 160, "ioʊ→": 161, "ioʊ↑": 162, "ioʊ↓↑": 163, "ioʊ↓": 164,
        "iɑŋ": 165, "iɑŋ→": 166, "iɑŋ↑": 167, "iɑŋ↓↑": 168, "iɑŋ↓": 169,
        "ua": 170, "ua→": 171, "ua↑": 172, "ua↓↑": 173, "ua↓": 174,
        "uo": 175, "uo→": 176, "uo↑": 177, "uo↓↑": 178, "uo↓": 179,
        "uaɪ": 180, "uaɪ→": 181, "uaɪ↑": 182, "uaɪ↓↑": 183, "uaɪ↓": 184,
        "ueɪ": 185, "ueɪ→": 186, "ueɪ↑": 187, "ueɪ↓↑": 188, "ueɪ↓": 189,
        "uan": 190, "uan→": 191, "uan↑": 192, "uan↓↑": 193, "uan↓": 194,
        "uən": 195, "uən→": 196, "uən↑": 197, "uən↓↑": 198, "uən↓": 199,
        "uɑŋ": 200, "uɑŋ→": 201, "uɑŋ↑": 202, "uɑŋ↓↑": 203, "uɑŋ↓": 204,
        "ɥɛ": 205, "ɥɛ→": 206, "ɥɛ↑": 207, "ɥɛ↓↑": 208, "ɥɛ↓": 209,
        "a": 210, "a→": 211, "a↑": 212, "a↓↑": 213, "a↓": 214,
        "o": 215, "o→": 216, "o↑": 217, "o↓↑": 218, "o↓": 219,
        "ə→": 220, "ə↑": 221, "ə↓↑": 222, "ə↓": 223,
        "ɛ→": 224, "ɛ↑": 225, "ɛ↓↑": 226, "ɛ↓": 227,
        "aɪ→": 228, "aɪ↑": 229, "aɪ↓↑": 230, "aɪ↓": 231,
        "eɪ→": 232, "eɪ↑": 233, "eɪ↓↑": 234, "eɪ↓": 235,
        "ɑʊ": 236, "ɑʊ→": 237, "ɑʊ↑": 238, "ɑʊ↓↑": 239, "ɑʊ↓": 240,
        "oʊ→": 241, "oʊ↑": 242, "oʊ↓↑": 243, "oʊ↓": 244,
        "an": 245, "an→": 246, "an↑": 247, "an↓↑": 248, "an↓": 249,
        "ən": 250, "ən→": 251, "ən↑": 252, "ən↓↑": 253, "ən↓": 254,
        "ɑŋ": 255, "ɑŋ→": 256, "ɑŋ↑": 257, "ɑŋ↓↑": 258, "ɑŋ↓": 259,
        "əŋ": 260, "əŋ→": 261, "əŋ↑": 262, "əŋ↓↑": 263, "əŋ↓": 264,
        "əɹ": 265, "əɹ→": 266, "əɹ↑": 267, "əɹ↓↑": 268, "əɹ↓": 269,
        "i→": 270, "i↑": 271, "i↓↑": 272, "i↓": 273,
        "u→": 274, "u↑": 275, "u↓↑": 276, "u↓": 277,
        "ɥ": 278, "ɥ→": 279, "ɥ↑": 280, "ɥ↓↑": 281, "ɥ↓": 282,
        "ts`⁼ɹ": 283, "ts`⁼ɹ→": 284, "ts`⁼ɹ↑": 285, "ts`⁼ɹ↓↑": 286, "ts`⁼ɹ↓": 287,
        "ts`ʰɹ": 288, "ts`ʰɹ→": 289, "ts`ʰɹ↑": 290, "ts`ʰɹ↓↑": 291, "ts`ʰɹ↓": 292,
        "s`ɹ": 293, "s`ɹ→": 294, "s`ɹ↑": 295, "s`ɹ↓↑": 296, "s`ɹ↓": 297,
        "ɹ`ɹ": 298, "ɹ`ɹ→": 299, "ɹ`ɹ↑": 300, "ɹ`ɹ↓↑": 301, "ɹ`ɹ↓": 302,
        "ts⁼ɹ": 303, "ts⁼ɹ→": 304, "ts⁼ɹ↑": 305, "ts⁼ɹ↓↑": 306, "ts⁼ɹ↓": 307,
        "tsʰɹ": 308, "tsʰɹ→": 309, "tsʰɹ↑": 310, "tsʰɹ↓↑": 311, "tsʰɹ↓": 312,
        "sɹ": 313, "sɹ→": 314, "sɹ↑": 315, "sɹ↓↑": 316, "sɹ↓": 317,

        "ɯ": 318, "e": 319, "aː": 320, "ɯː": 321, "eː": 322, "ç": 323, "ɸ": 324, "ɰᵝ": 325, "ɴ": 326,
        "g": 327, "dʑ": 328, "q": 329, "ː": 330, "bj": 331, "tɕ": 332, "dej": 333, "tej": 334, "gj": 335,
        "gɯ": 336, "çj": 337, "kj": 338, "kɯ": 339, "mj": 340, "nj": 341, "pj": 342, "ɾj": 343, "ɕ": 344,
        "tsɯ": 345,

        "ɐ": 346, "ɑ": 347, "ɒ": 348, "ɜ": 349, "ɫ": 350, "ʑ": 351, "ʲ": 352,

        "y": 353, "ø": 354, "œ": 355, "ʁ": 356, "̃": 357, "ɲ": 358,

        ":": 359, ";": 360, "'": 361, "…": 362
    }
}
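The table above maps every phoneme unit, including the bopomofo-derived tone-arrow composites, to an integer id. A sketch of the lookup, assuming phonemes arrive as a "|"-separated string like the tokenizers in this commit produce (the input string here is illustrative):

import json

with open("./diffrhythm/g2p/g2p/vocab.json") as f:
    vocab = json.load(f)["vocab"]

phoneme_str = "h|ə|l|oʊ"                                          # illustrative input
ids = [vocab[p] for p in phoneme_str.split("|") if p in vocab]    # unknown units are skipped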
diffrhythm/g2p/utils/front_utils.py
ADDED
@@ -0,0 +1,20 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os


def generate_poly_lexicon(file_path: str):
    """Generate poly char lexicon for Mandarin Chinese."""
    poly_dict = {}

    with open(file_path, "r", encoding="utf-8") as readf:
        txt_list = readf.readlines()
        for txt in txt_list:
            word = txt.strip("\n")
            if word not in poly_dict:
                poly_dict[word] = 1
        readf.close()
    return poly_dict
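`generate_poly_lexicon` simply builds a membership dict from a newline-delimited character list; for example, with the path that mandarin.py above configures (the character checked is illustrative):

poly = generate_poly_lexicon("./diffrhythm/g2p/sources/g2p_chinese_model/polychar.txt")
print("行" in poly)  # True if "行" is listed as polyphonic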
diffrhythm/g2p/utils/g2p.py
ADDED
@@ -0,0 +1,139 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from phonemizer.utils import list2str, str2list
from typing import List, Union
import os
import json
import sys

# separator=Separator(phone=' ', word=' _ ', syllable='|'),
separator = Separator(word=" _ ", syllable="|", phone=" ")

phonemizer_zh = EspeakBackend(
    "cmn", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
)
# phonemizer_zh.separator = separator

phonemizer_en = EspeakBackend(
    "en-us",
    preserve_punctuation=False,
    with_stress=False,
    language_switch="remove-flags",
)
# phonemizer_en.separator = separator

phonemizer_ja = EspeakBackend(
    "ja", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
)
# phonemizer_ja.separator = separator

phonemizer_ko = EspeakBackend(
    "ko", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
)
# phonemizer_ko.separator = separator

phonemizer_fr = EspeakBackend(
    "fr-fr",
    preserve_punctuation=False,
    with_stress=False,
    language_switch="remove-flags",
)
# phonemizer_fr.separator = separator

phonemizer_de = EspeakBackend(
    "de", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
)
# phonemizer_de.separator = separator


lang2backend = {
    "zh": phonemizer_zh,
    "ja": phonemizer_ja,
    "en": phonemizer_en,
    "fr": phonemizer_fr,
    "ko": phonemizer_ko,
    "de": phonemizer_de,
}

with open("./diffrhythm/g2p/utils/mls_en.json", "r") as f:
    json_data = f.read()
token = json.loads(json_data)


def phonemizer_g2p(text, language):
    langbackend = lang2backend[language]
    phonemes = _phonemize(
        langbackend,
        text,
        separator,
        strip=True,
        njobs=1,
        prepend_text=False,
        preserve_empty_lines=False,
    )
    token_id = []
    if isinstance(phonemes, list):
        for phone in phonemes:
            phonemes_split = phone.split(" ")
            token_id.append([token[p] for p in phonemes_split if p in token])
    else:
        phonemes_split = phonemes.split(" ")
        token_id = [token[p] for p in phonemes_split if p in token]
    return phonemes, token_id


def _phonemize(  # pylint: disable=too-many-arguments
    backend,
    text: Union[str, List[str]],
    separator: Separator,
    strip: bool,
    njobs: int,
    prepend_text: bool,
    preserve_empty_lines: bool,
):
    """Auxiliary function to phonemize()

    Does the phonemization and returns the phonemized text. Raises a
    RuntimeError on error.

    """
    # remember the text type for output (either list or string)
    text_type = type(text)

    # force the text as a list
    text = [line.strip(os.linesep) for line in str2list(text)]

    # if preserving empty lines, note the index of each empty line
    if preserve_empty_lines:
        empty_lines = [n for n, line in enumerate(text) if not line.strip()]

    # ignore empty lines
    text = [line for line in text if line.strip()]

    if text:
        # phonemize the text
        phonemized = backend.phonemize(
            text, separator=separator, strip=strip, njobs=njobs
        )
    else:
        phonemized = []

    # if preserving empty lines, reinsert them into text and phonemized lists
    if preserve_empty_lines:
        for i in empty_lines:  # noqa
            if prepend_text:
                text.insert(i, "")
            phonemized.insert(i, "")

    # at that point, the phonemized text is a list of str. Format it as
    # expected by the parameters
    if prepend_text:
        return list(zip(text, phonemized))
    if text_type == str:
        return list2str(phonemized)
    return phonemized
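A minimal usage sketch for `phonemizer_g2p` (espeak-ng must be installed; the exact phoneme string depends on the espeak version):

phones, ids = phonemizer_g2p("hello world", "en")
# phones: space-separated phonemes with " _ " word boundaries
# ids: lookups into mls_en.json; phonemes missing from the table are silently dropped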
diffrhythm/g2p/utils/log.py
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import functools
import logging

__all__ = [
    "logger",
]


class Logger(object):
    def __init__(self, name: str = None):
        name = "PaddleSpeech" if not name else name
        self.logger = logging.getLogger(name)

        log_config = {
            "DEBUG": 10,
            "INFO": 20,
            "TRAIN": 21,
            "EVAL": 22,
            "WARNING": 30,
            "ERROR": 40,
            "CRITICAL": 50,
            "EXCEPTION": 100,
        }
        for key, level in log_config.items():
            logging.addLevelName(level, key)
            if key == "EXCEPTION":
                self.__dict__[key.lower()] = self.logger.exception
            else:
                self.__dict__[key.lower()] = functools.partial(self.__call__, level)

        self.format = logging.Formatter(
            fmt="[%(asctime)-15s] [%(levelname)8s] - %(message)s"
        )

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False

    def __call__(self, log_level: str, msg: str):
        self.logger.log(log_level, msg)


logger = Logger()
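Usage follows the stdlib pattern, with the extra levels registered above exposed as methods (the messages here are illustrative):

logger.info("loading g2p resources")
logger.train("epoch 1 started")       # custom TRAIN level (21)
logger.error("missing lexicon file")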
diffrhythm/g2p/utils/mls_en.json
ADDED
@@ -0,0 +1,335 @@
1 |
+
{
|
2 |
+
"[UNK]": 0,
|
3 |
+
"_": 1,
|
4 |
+
"b": 2,
|
5 |
+
"d": 3,
|
6 |
+
"f": 4,
|
7 |
+
"h": 5,
|
8 |
+
"i": 6,
|
9 |
+
"j": 7,
|
10 |
+
"k": 8,
|
11 |
+
"l": 9,
|
12 |
+
"m": 10,
|
13 |
+
"n": 11,
|
14 |
+
"p": 12,
|
15 |
+
"r": 13,
|
16 |
+
"s": 14,
|
17 |
+
"t": 15,
|
18 |
+
"v": 16,
|
19 |
+
"w": 17,
|
20 |
+
"x": 18,
|
21 |
+
"z": 19,
|
22 |
+
"æ": 20,
|
23 |
+
"ç": 21,
|
24 |
+
"ð": 22,
|
25 |
+
"ŋ": 23,
|
26 |
+
"ɐ": 24,
|
27 |
+
"ɔ": 25,
|
28 |
+
"ə": 26,
|
29 |
+
"ɚ": 27,
|
30 |
+
"ɛ": 28,
|
31 |
+
"ɡ": 29,
|
32 |
+
"ɪ": 30,
|
33 |
+
"ɬ": 31,
|
34 |
+
"ɹ": 32,
|
35 |
+
"ɾ": 33,
|
36 |
+
"ʃ": 34,
|
37 |
+
"ʊ": 35,
|
38 |
+
"ʌ": 36,
|
39 |
+
"ʒ": 37,
|
40 |
+
"ʔ": 38,
|
41 |
+
"θ": 39,
|
42 |
+
"ᵻ": 40,
|
43 |
+
"aɪ": 41,
|
44 |
+
"aʊ": 42,
|
45 |
+
"dʒ": 43,
|
46 |
+
"eɪ": 44,
|
47 |
+
"iə": 45,
|
48 |
+
"iː": 46,
|
49 |
+
"n̩": 47,
|
50 |
+
"oʊ": 48,
|
51 |
+
"oː": 49,
|
52 |
+
"tʃ": 50,
|
53 |
+
"uː": 51,
|
54 |
+
"ææ": 52,
|
55 |
+
"ɐɐ": 53,
|
56 |
+
"ɑː": 54,
|
57 |
+
"ɑ̃": 55,
|
58 |
+
"ɔɪ": 56,
|
59 |
+
"ɔː": 57,
|
60 |
+
"ɔ̃": 58,
|
61 |
+
"əl": 59,
|
62 |
+
"ɛɹ": 60,
|
63 |
+
"ɜː": 61,
|
64 |
+
"ɡʲ": 62,
|
65 |
+
"ɪɹ": 63,
|
66 |
+
"ʊɹ": 64,
|
67 |
+
"aɪə": 65,
|
68 |
+
"aɪɚ": 66,
|
69 |
+
"iːː": 67,
|
70 |
+
"oːɹ": 68,
|
71 |
+
"ɑːɹ": 69,
|
72 |
+
"ɔːɹ": 70,
|
73 |
+
|
74 |
+
"1": 71,
|
75 |
+
"a": 72,
|
76 |
+
"e": 73,
|
77 |
+
"o": 74,
|
78 |
+
"q": 75,
|
79 |
+
"u": 76,
|
80 |
+
"y": 77,
|
81 |
+
"ɑ": 78,
|
82 |
+
"ɒ": 79,
|
83 |
+
"ɕ": 80,
|
84 |
+
"ɣ": 81,
|
85 |
+
"ɫ": 82,
|
86 |
+
"ɯ": 83,
|
87 |
+
"ʐ": 84,
|
88 |
+
"ʲ": 85,
|
89 |
+
"a1": 86,
|
90 |
+
"a2": 87,
|
91 |
+
"a5": 88,
|
92 |
+
"ai": 89,
|
93 |
+
"aɜ": 90,
|
94 |
+
"aː": 91,
|
95 |
+
"ei": 92,
|
96 |
+
"eə": 93,
|
97 |
+
"i.": 94,
|
98 |
+
"i1": 95,
|
99 |
+
"i2": 96,
|
100 |
+
"i5": 97,
|
101 |
+
"io": 98,
|
102 |
+
"iɑ": 99,
|
103 |
+
"iɛ": 100,
|
104 |
+
"iɜ": 101,
|
105 |
+
"i̪": 102,
|
106 |
+
"kh": 103,
|
107 |
+
"nʲ": 104,
|
108 |
+
"o1": 105,
|
109 |
+
"o2": 106,
|
110 |
+
"o5": 107,
|
111 |
+
"ou": 108,
|
112 |
+
"oɜ": 109,
|
113 |
+
"ph": 110,
|
114 |
+
"s.": 111,
|
115 |
+
"th": 112,
|
116 |
+
"ts": 113,
|
117 |
+
"tɕ": 114,
|
118 |
+
"u1": 115,
|
119 |
+
"u2": 116,
|
120 |
+
"u5": 117,
|
121 |
+
"ua": 118,
|
122 |
+
"uo": 119,
|
123 |
+
"uə": 120,
|
124 |
+
"uɜ": 121,
|
125 |
+
"y1": 122,
|
126 |
+
"y2": 123,
|
127 |
+
"y5": 124,
|
128 |
+
"yu": 125,
|
129 |
+
"yæ": 126,
|
130 |
+
"yə": 127,
|
131 |
+
"yɛ": 128,
|
+    "yɜ": 129,
+    "ŋɜ": 130,
+    "ŋʲ": 131,
+    "ɑ1": 132,
+    "ɑ2": 133,
+    "ɑ5": 134,
+    "ɑu": 135,
+    "ɑɜ": 136,
+    "ɑʲ": 137,
+    "ə1": 138,
+    "ə2": 139,
+    "ə5": 140,
+    "ər": 141,
+    "əɜ": 142,
+    "əʊ": 143,
+    "ʊə": 144,
+    "ai1": 145,
+    "ai2": 146,
+    "ai5": 147,
+    "aiɜ": 148,
+    "ei1": 149,
+    "ei2": 150,
+    "ei5": 151,
+    "eiɜ": 152,
+    "i.1": 153,
+    "i.2": 154,
+    "i.5": 155,
+    "i.ɜ": 156,
+    "io5": 157,
+    "iou": 158,
+    "iɑ1": 159,
+    "iɑ2": 160,
+    "iɑ5": 161,
+    "iɑɜ": 162,
+    "iɛ1": 163,
+    "iɛ2": 164,
+    "iɛ5": 165,
+    "iɛɜ": 166,
+    "i̪1": 167,
+    "i̪2": 168,
+    "i̪5": 169,
+    "i̪ɜ": 170,
+    "onɡ": 171,
+    "ou1": 172,
+    "ou2": 173,
+    "ou5": 174,
+    "ouɜ": 175,
+    "ts.": 176,
+    "tsh": 177,
+    "tɕh": 178,
+    "u5ʲ": 179,
+    "ua1": 180,
+    "ua2": 181,
+    "ua5": 182,
+    "uai": 183,
+    "uaɜ": 184,
+    "uei": 185,
+    "uo1": 186,
+    "uo2": 187,
+    "uo5": 188,
+    "uoɜ": 189,
+    "uə1": 190,
+    "uə2": 191,
+    "uə5": 192,
+    "uəɜ": 193,
+    "yiɜ": 194,
+    "yu2": 195,
+    "yu5": 196,
+    "yæ2": 197,
+    "yæ5": 198,
+    "yæɜ": 199,
+    "yə2": 200,
+    "yə5": 201,
+    "yəɜ": 202,
+    "yɛ1": 203,
+    "yɛ2": 204,
+    "yɛ5": 205,
+    "yɛɜ": 206,
+    "ɑu1": 207,
+    "ɑu2": 208,
+    "ɑu5": 209,
+    "ɑuɜ": 210,
+    "ər1": 211,
+    "ər2": 212,
+    "ər5": 213,
+    "ərɜ": 214,
+    "əː1": 215,
+    "iou1": 216,
+    "iou2": 217,
+    "iou5": 218,
+    "iouɜ": 219,
+    "onɡ1": 220,
+    "onɡ2": 221,
+    "onɡ5": 222,
+    "onɡɜ": 223,
+    "ts.h": 224,
+    "uai2": 225,
+    "uai5": 226,
+    "uaiɜ": 227,
+    "uei1": 228,
+    "uei2": 229,
+    "uei5": 230,
+    "ueiɜ": 231,
+    "uoɜʲ": 232,
+    "yɛ5ʲ": 233,
+    "ɑu2ʲ": 234,
+
+    "2": 235,
+    "5": 236,
+    "ɜ": 237,
+    "ʂ": 238,
+    "dʑ": 239,
+    "iɪ": 240,
+    "uɪ": 241,
+    "xʲ": 242,
+    "ɑt": 243,
+    "ɛɜ": 244,
+    "ɛː": 245,
+    "ɪː": 246,
+    "phʲ": 247,
+    "ɑ5ʲ": 248,
+    "ɑuʲ": 249,
+    "ərə": 250,
+    "uozʰ": 251,
+    "ər1ʲ": 252,
+    "tɕhtɕh": 253,
+
+    "c": 254,
+    "ʋ": 255,
+    "ʍ": 256,
+    "ʑ": 257,
+    "ː": 258,
+    "aə": 259,
+    "eː": 260,
+    "hʲ": 261,
+    "iʊ": 262,
+    "kʲ": 263,
+    "lʲ": 264,
+    "oə": 265,
+    "oɪ": 266,
+    "oʲ": 267,
+    "pʲ": 268,
+    "sʲ": 269,
+    "u4": 270,
+    "uʲ": 271,
+    "yi": 272,
+    "yʲ": 273,
+    "ŋ2": 274,
+    "ŋ5": 275,
+    "ŋ̩": 276,
+    "ɑɪ": 277,
+    "ɑʊ": 278,
+    "ɕʲ": 279,
+    "ət": 280,
+    "əə": 281,
+    "əɪ": 282,
+    "əʲ": 283,
+    "ɛ1": 284,
+    "ɛ5": 285,
+    "aiə": 286,
+    "aiɪ": 287,
+    "azʰ": 288,
+    "eiə": 289,
+    "eiɪ": 290,
+    "eiʊ": 291,
+    "i.ə": 292,
+    "i.ɪ": 293,
+    "i.ʊ": 294,
+    "ioɜ": 295,
+    "izʰ": 296,
+    "iɑə": 297,
+    "iɑʊ": 298,
+    "iɑʲ": 299,
+    "iɛə": 300,
+    "iɛɪ": 301,
+    "iɛʊ": 302,
+    "i̪ə": 303,
+    "i̪ʊ": 304,
+    "khʲ": 305,
+    "ouʲ": 306,
+    "tsʲ": 307,
+    "u2ʲ": 308,
+    "uoɪ": 309,
+    "uzʰ": 310,
+    "uɜʲ": 311,
+    "yæɪ": 312,
+    "yəʊ": 313,
+    "ərt": 314,
+    "ərɪ": 315,
+    "ərʲ": 316,
+    "əːt": 317,
+    "iouə": 318,
+    "iouʊ": 319,
+    "iouʲ": 320,
+    "iɛzʰ": 321,
+    "onɡə": 322,
+    "onɡɪ": 323,
+    "onɡʊ": 324,
+    "ouzʰ": 325,
+    "uai1": 326,
+    "ueiɪ": 327,
+    "ɑuzʰ": 328,
+    "iouzʰ": 329
+  }
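Note: the table above is the phoneme-to-id map that `CNENTokenizer` in infer_utils.py below reads from the "vocab" key of this file (the tokenizer then shifts ids by +1 so 0 can serve as padding). A minimal lookup sketch, assuming the relative path used below:

```python
import json

# Load the phoneme table nested under the "vocab" key.
with open("./diffrhythm/g2p/g2p/vocab.json") as f:
    phone2id = json.load(f)["vocab"]
id2phone = {v: k for k, v in phone2id.items()}

print(phone2id["yɜ"])  # 129, per the entries above
print(id2phone[329])   # "iouzʰ", the last entry above
```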
diffrhythm/infer/infer.py
ADDED
@@ -0,0 +1,147 @@
+import torch
+import torchaudio
+from einops import rearrange
+import argparse
+import json
+import os
+from tqdm import tqdm
+import random
+import numpy as np
+import time
+
+from diffrhythm.infer.infer_utils import (
+    get_reference_latent,
+    get_lrc_token,
+    get_style_prompt,
+    prepare_model,
+    get_negative_style_prompt
+)
+
+def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
+    downsampling_ratio = 2048
+    io_channels = 2
+    if not chunked:
+        # default behavior. Decode the entire latent in parallel
+        return vae_model.decode_export(latents)
+    else:
+        # chunked decoding
+        hop_size = chunk_size - overlap
+        total_size = latents.shape[2]
+        batch_size = latents.shape[0]
+        chunks = []
+        i = 0
+        for i in range(0, total_size - chunk_size + 1, hop_size):
+            chunk = latents[:, :, i:i + chunk_size]
+            chunks.append(chunk)
+        if i + chunk_size != total_size:
+            # Final chunk
+            chunk = latents[:, :, -chunk_size:]
+            chunks.append(chunk)
+        chunks = torch.stack(chunks)
+        num_chunks = chunks.shape[0]
+        # samples_per_latent is just the downsampling ratio
+        samples_per_latent = downsampling_ratio
+        # Create an empty waveform; we will populate it with chunks as we decode them
+        y_size = total_size * samples_per_latent
+        y_final = torch.zeros((batch_size, io_channels, y_size)).to(latents.device)
+        for i in range(num_chunks):
+            x_chunk = chunks[i, :]
+            # decode the chunk
+            y_chunk = vae_model.decode_export(x_chunk)
+            # figure out where to put the audio along the time domain
+            if i == num_chunks - 1:
+                # final chunk always goes at the end
+                t_end = y_size
+                t_start = t_end - y_chunk.shape[2]
+            else:
+                t_start = i * hop_size * samples_per_latent
+                t_end = t_start + chunk_size * samples_per_latent
+            # remove the edges of the overlaps
+            ol = (overlap // 2) * samples_per_latent
+            chunk_start = 0
+            chunk_end = y_chunk.shape[2]
+            if i > 0:
+                # no overlap for the start of the first chunk
+                t_start += ol
+                chunk_start += ol
+            if i < num_chunks - 1:
+                # no overlap for the end of the last chunk
+                t_end -= ol
+                chunk_end -= ol
+            # paste the chunked audio into our y_final output audio
+            y_final[:, :, t_start:t_end] = y_chunk[:, :, chunk_start:chunk_end]
+        return y_final
+
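As a quick sanity check on the chunked path of `decode_audio`, here is the tiling it produces for a hypothetical latent length (the sizes below are made up for illustration; the function defaults are `chunk_size=128`, `overlap=32`, with the 2048x downsampling ratio hard-coded above):

```python
# Hypothetical latent length to trace decode_audio's chunked indexing:
chunk_size, overlap, total_size = 128, 32, 300
hop_size = chunk_size - overlap                                 # 96
starts = list(range(0, total_size - chunk_size + 1, hop_size))  # [0, 96]
# 96 + 128 != 300, so a final chunk is taken from the tail: [172, 300)
# at each interior seam, overlap // 2 = 16 latent frames are trimmed
# from each side, i.e. 16 * 2048 = 32768 samples per side
```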
+def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, start_time):
+    # import pdb; pdb.set_trace()
+    with torch.inference_mode():
+        generated, _ = cfm_model.sample(
+            cond=cond,
+            text=text,
+            duration=duration,
+            style_prompt=style_prompt,
+            negative_style_prompt=negative_style_prompt,
+            steps=32,
+            cfg_strength=4.0,
+            start_time=start_time
+        )
+
+        generated = generated.to(torch.float32)
+        latent = generated.transpose(1, 2)  # [b d t]
+
+        output = decode_audio(latent, vae_model)
+
+        # Rearrange audio batch to a single sequence
+        output = rearrange(output, "b d n -> d (b n)")
+        # Peak normalize, clip, convert to int16, and save to file
+        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+
+        return output
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--lrc-path', type=str, default="/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/infer/example/eg.lrc")  # lyrics of target song
+    parser.add_argument('--ref-audio-path', type=str, default="/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/infer/example/eg.mp3")  # reference audio as style prompt for target song
+    parser.add_argument('--audio-length', type=int, default=95)  # length of target song
+    parser.add_argument('--output-dir', type=str, default="/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/infer/example/output")
+    args = parser.parse_args()
+
+    device = 'cuda'
+
+    audio_length = args.audio_length
+    if audio_length == 95:
+        max_frames = 2048
+    elif audio_length == 285:
+        max_frames = 6144
+    else:
+        # guard: max_frames would otherwise be undefined below
+        raise ValueError("audio-length must be 95 or 285")
+
+    cfm, tokenizer, muq, vae = prepare_model(device)
+
+    with open(args.lrc_path, 'r') as f:
+        lrc = f.read()
+    lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
+
+    style_prompt = get_style_prompt(muq, args.ref_audio_path)
+
+    negative_style_prompt = get_negative_style_prompt(device)
+
+    latent_prompt = get_reference_latent(device, max_frames)
+
+    s_t = time.time()
+    generated_song = inference(cfm_model=cfm,
+                               vae_model=vae,
+                               cond=latent_prompt,
+                               text=lrc_prompt,
+                               duration=max_frames,
+                               style_prompt=style_prompt,
+                               negative_style_prompt=negative_style_prompt,
+                               start_time=start_time
+                               )
+    e_t = time.time() - s_t
+    print(f"inference cost {e_t} seconds")
+
+    output_dir = args.output_dir
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_path = os.path.join(output_dir, "output.wav")
+    torchaudio.save(output_path, generated_song, sample_rate=44100)
+
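The two `max_frames` presets follow from the 44.1 kHz output rate and the 2048x latent downsampling (both visible in this script and in `get_lrc_token` below): one latent frame spans 2048 / 44100 ≈ 46.4 ms.

```python
# Frames-to-seconds arithmetic behind the two presets:
sr, ratio = 44100, 2048
print(2048 * ratio / sr)  # ~95.1 s  -> the 95-second setting
print(6144 * ratio / sr)  # ~285.3 s -> the 285-second setting
```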
diffrhythm/infer/infer_utils.py
ADDED
@@ -0,0 +1,197 @@
+import torch
+import librosa
+import random
+import json
+from muq import MuQMuLan
+from mutagen.mp3 import MP3
+import os
+import numpy as np
+
+from diffrhythm.model import DiT, CFM
+
+
+def prepare_model(device):
+    # prepare cfm model
+    dit_ckpt_path = "/home/node59_tmpdata3/hkchen/music_opensource/dit_model_dpo_normal.pt"
+    dit_config_path = "/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/config/diffrhythm-1b.json"
+    with open(dit_config_path) as f:
+        model_config = json.load(f)
+    dit_model_cls = DiT
+    cfm = CFM(
+        transformer=dit_model_cls(**model_config["model"], use_style_prompt=True),
+        num_channels=model_config["model"]['mel_dim'],
+        use_style_prompt=True
+    )
+    cfm = cfm.to(device)
+    cfm = load_checkpoint(cfm, dit_ckpt_path, device=device, use_ema=False)
+
+    # prepare tokenizer
+    tokenizer = CNENTokenizer()
+
+    # prepare muq
+    muq = MuQMuLan.from_pretrained("OpenMuQ/MuQ-MuLan-large")
+    muq = muq.to(device).eval()
+
+    # prepare vae
+    vae = torch.jit.load("/home/node59_tmpdata3/hkchen/F5-TTS-V0/infer/vae_infer.pt").to(device)
+
+    return cfm, tokenizer, muq, vae
+
+
+# for song edit, will be added in the future
+def get_reference_latent(device, max_frames):
+    return torch.zeros(1, max_frames, 64).to(device)
+
+def get_negative_style_prompt(device):
+    file_path = "/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/infer/example/vocal.npy"
+    vocal_style = np.load(file_path)
+
+    vocal_style = torch.from_numpy(vocal_style).to(device)  # [1, 512]
+    vocal_style = vocal_style.half()
+
+    return vocal_style
+
+def get_style_prompt(model, wav_path):
+    mulan = model
+
+    ext = os.path.splitext(wav_path)[-1].lower()
+    if ext == '.mp3':
+        meta = MP3(wav_path)
+        audio_len = meta.info.length
+        src_sr = meta.info.sample_rate
+    elif ext == '.wav':
+        audio, sr = librosa.load(wav_path, sr=None)
+        audio_len = librosa.get_duration(y=audio, sr=sr)
+        src_sr = sr
+    else:
+        raise ValueError("Unsupported file format: {}".format(ext))
+
+    assert audio_len >= 10
+
+    mid_time = audio_len // 2
+    start_time = mid_time - 5
+    wav, sr = librosa.load(wav_path, sr=None, offset=start_time, duration=10)
+
+    resampled_wav = librosa.resample(wav, orig_sr=src_sr, target_sr=24000)
+    resampled_wav = torch.tensor(resampled_wav).unsqueeze(0).to(model.device)
+
+    with torch.no_grad():
+        audio_emb = mulan(wavs=resampled_wav)  # [1, 512]
+
+    audio_emb = audio_emb.half()
+
+    return audio_emb
+
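`get_style_prompt` embeds a 10-second window centered in the reference track (hence the `audio_len >= 10` assertion), resampled to 24 kHz for MuQ-MuLan, and returns an fp16 tensor of shape [1, 512]. The window arithmetic, for a hypothetical track length:

```python
audio_len = 200.0                # hypothetical 200-second reference track
start_time = audio_len // 2 - 5  # 95.0: the crop covers [95, 105) seconds
```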
+def parse_lyrics(lyrics: str):
+    lyrics_with_time = []
+    lyrics = lyrics.strip()
+    for line in lyrics.split('\n'):
+        try:
+            time, lyric = line[1:9], line[10:]
+            lyric = lyric.strip()
+            mins, secs = time.split(':')
+            secs = int(mins) * 60 + float(secs)
+            lyrics_with_time.append((secs, lyric))
+        except Exception:
+            continue
+    return lyrics_with_time
+
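`parse_lyrics` relies on fixed-width LRC timestamps (`[mm:ss.xx]`, characters 1-8 of each line) and silently drops anything that does not parse. For example:

```python
lrc = "[00:10.00]First line\n[01:15.50]Second line\nno timestamp here"
print(parse_lyrics(lrc))
# [(10.0, 'First line'), (75.5, 'Second line')] -- the bare line is skipped
```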
+class CNENTokenizer():
+    def __init__(self):
+        with open('./diffrhythm/g2p/g2p/vocab.json', 'r') as file:
+            self.phone2id: dict = json.load(file)['vocab']
+        self.id2phone = {v: k for (k, v) in self.phone2id.items()}
+        # from f5_tts.g2p.g2p_generation import chn_eng_g2p
+        from diffrhythm.g2p.g2p_generation import chn_eng_g2p
+        self.tokenizer = chn_eng_g2p
+    def encode(self, text):
+        phone, token = self.tokenizer(text)
+        token = [x + 1 for x in token]
+        return token
+    def decode(self, token):
+        return "|".join([self.id2phone[x - 1] for x in token])
+
+def get_lrc_token(text, tokenizer, device):
+
+    max_frames = 2048
+    lyrics_shift = 0
+    sampling_rate = 44100
+    downsample_rate = 2048
+    max_secs = max_frames / (sampling_rate / downsample_rate)
+
+    pad_token_id = 0
+    comma_token_id = 1
+    period_token_id = 2
+
+    lrc_with_time = parse_lyrics(text)
+
+    modified_lrc_with_time = []
+    for i in range(len(lrc_with_time)):
+        time, line = lrc_with_time[i]
+        line_token = tokenizer.encode(line)
+        modified_lrc_with_time.append((time, line_token))
+    lrc_with_time = modified_lrc_with_time
+
+    lrc_with_time = [(time_start, line) for (time_start, line) in lrc_with_time if time_start < max_secs]
+    lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
+
+    normalized_start_time = 0.
+
+    lrc = torch.zeros((max_frames,), dtype=torch.long)
+
+    tokens_count = 0
+    last_end_pos = 0
+    for time_start, line in lrc_with_time:
+        tokens = [token if token != period_token_id else comma_token_id for token in line] + [period_token_id]
+        tokens = torch.tensor(tokens, dtype=torch.long)
+        num_tokens = tokens.shape[0]
+
+        gt_frame_start = int(time_start * sampling_rate / downsample_rate)
+
+        # lyrics_shift is 0 above, so this randint is degenerate and frame_shift is always 0
+        frame_shift = random.randint(int(lyrics_shift), int(lyrics_shift))
+
+        frame_start = max(gt_frame_start - frame_shift, last_end_pos)
+        frame_len = min(num_tokens, max_frames - frame_start)
+
+        # print(gt_frame_start, frame_shift, frame_start, frame_len, tokens_count, last_end_pos, full_pos_emb.shape)
+
+        lrc[frame_start:frame_start + frame_len] = tokens[:frame_len]
+
+        tokens_count += num_tokens
+        last_end_pos = frame_start + frame_len
+
+    lrc_emb = lrc.unsqueeze(0).to(device)
+
+    normalized_start_time = torch.tensor(normalized_start_time).unsqueeze(0).to(device)
+    normalized_start_time = normalized_start_time.half()
+
+    return lrc_emb, normalized_start_time
+
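`get_lrc_token` writes each tokenized lyric line into a flat buffer of `max_frames` positions at the frame index matching its LRC start time, at 44100 / 2048 ≈ 21.5 frames per second, clamped so consecutive lines never overlap:

```python
# Frame placement for a line starting at 10.0 s (value from parse_lyrics):
sampling_rate, downsample_rate = 44100, 2048
time_start = 10.0
gt_frame_start = int(time_start * sampling_rate / downsample_rate)  # 215
# its tokens fill lrc[215 : 215 + num_tokens], truncated at max_frames
```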
+def load_checkpoint(model, ckpt_path, device, use_ema=True):
+    if device == "cuda":
+        model = model.half()
+
+    ckpt_type = ckpt_path.split(".")[-1]
+    if ckpt_type == "safetensors":
+        from safetensors.torch import load_file
+
+        checkpoint = load_file(ckpt_path)
+    else:
+        checkpoint = torch.load(ckpt_path, weights_only=True)
+
+    if use_ema:
+        if ckpt_type == "safetensors":
+            checkpoint = {"ema_model_state_dict": checkpoint}
+        checkpoint["model_state_dict"] = {
+            k.replace("ema_model.", ""): v
+            for k, v in checkpoint["ema_model_state_dict"].items()
+            if k not in ["initted", "step"]
+        }
+        model.load_state_dict(checkpoint["model_state_dict"], strict=False)
+    else:
+        if ckpt_type == "safetensors":
+            checkpoint = {"model_state_dict": checkpoint}
+        model.load_state_dict(checkpoint["model_state_dict"], strict=False)
+
+    return model.to(device)