# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from typing import List, Tuple

from models.tts.maskgct.g2p.g2p import PhonemeBpeTokenizer
from models.tts.maskgct.g2p.utils.g2p import phonemizer_g2p


def ph_g2p(text, language):
    """Run grapheme-to-phoneme conversion through the phonemizer backend."""
    return phonemizer_g2p(text=text, language=language)


def g2p(text, sentence, language):
    """Tokenize a text segment into a phoneme string and BPE token ids.

    Uses the module-level `text_tokenizer` defined at the bottom of this file.
    """
    return text_tokenizer.tokenize(text=text, sentence=sentence, language=language)


def is_chinese(char):
    """Return True if `char` is in the common CJK ideograph range U+4E00..U+9FA5."""
    return "\u4e00" <= char <= "\u9fa5"


def is_alphabet(char):
    """Return True if `char` is an ASCII letter."""
    return "A" <= char <= "Z" or "a" <= char <= "z"


def is_other(char):
    """Return True if `char` is neither Chinese nor an ASCII letter."""
    return not (is_chinese(char) or is_alphabet(char))
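
# Quick illustrative checks (hypothetical inputs, consistent with the ranges
# above; not part of the original file):
#   is_chinese("中") -> True, is_alphabet("q") -> True, is_other("，") -> True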


def get_segment(text: str) -> List[Tuple[str, str]]:
    """Split a sentence into language runs: [(segment, "zh"/"en"/"other"), ...]."""
    if not text:
        return []

    segments = []
    types = []
    flag = 0
    temp_seg = ""
    temp_lang = ""

    # Classify each character: Chinese ("zh"), ASCII letter ("en"), or "other"
    # (digits, punctuation, whitespace, and everything else).
    for ch in text:
        if is_chinese(ch):
            types.append("zh")
        elif is_alphabet(ch):
            types.append("en")
        else:
            types.append("other")

    assert len(types) == len(text)

    for i in range(len(types)):
        # The first character opens the first segment.
        if flag == 0:
            temp_seg += text[i]
            temp_lang = types[i]
            flag = 1
        elif temp_lang == "other":
            # A leading run of "other" characters adopts the language of the
            # first zh/en character that follows it.
            temp_seg += text[i]
            if types[i] != temp_lang:
                temp_lang = types[i]
        elif types[i] == temp_lang or types[i] == "other":
            # Same language, or punctuation attached to the current segment.
            temp_seg += text[i]
        else:
            # Language switch: close the current segment and start a new one.
            segments.append((temp_seg, temp_lang))
            temp_seg = text[i]
            temp_lang = types[i]

    segments.append((temp_seg, temp_lang))
    return segments
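
# Example (hypothetical input, traced against the logic above):
#   get_segment("你好world!") -> [("你好", "zh"), ("world!", "en")]
# Trailing punctuation is absorbed into the preceding segment.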


def chn_eng_g2p(text: str):
    """G2P for mixed Chinese/English text (only "zh" and "en" are supported).

    Returns a "|"-separated phoneme string (one field per segment) and the
    concatenated list of BPE token ids.
    """
    segments = get_segment(text)
    all_phoneme = ""
    all_tokens = []

    for index, (seg_text, seg_lang) in enumerate(segments):
        phoneme, token = g2p(seg_text, text, seg_lang)
        all_phoneme += phoneme + "|"
        all_tokens += token

        # If the final (English) segment ends with a "_" word separator, drop
        # it together with the "|" just appended, and the trailing token.
        if (
            seg_lang == "en"
            and index == len(segments) - 1
            and len(all_phoneme) >= 2
            and all_phoneme[-2] == "_"
        ):
            all_phoneme = all_phoneme[:-2]
            all_tokens = all_tokens[:-1]
    return all_phoneme, all_tokens


# Module-level tokenizer and vocabulary shared by g2p() above. The relative
# path assumes the process runs from the repository root.
text_tokenizer = PhonemeBpeTokenizer()
with open("./models/tts/maskgct/g2p/g2p/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)["vocab"]
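

# Minimal usage sketch (an assumption, not part of the original module): it
# presumes the phonemizer backends for "zh"/"en" are installed and that the
# vocab path above resolves, i.e. the process runs from the repository root.
if __name__ == "__main__":
    phonemes, tokens = chn_eng_g2p("你好，world")
    print(phonemes)  # "|"-separated phoneme string, one field per segment
    print(tokens)    # flat list of BPE token ids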