ing0 commited on
Commit
e48780a
·
1 Parent(s): ccebb03
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.txt filter=lfs diff=lfs merge=lfs -text
diffrhythm/g2p/g2p_generation.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
import json
import os
import os  # NOTE(review): duplicate of the line above, present in the original
import re
import sys
from typing import List, Tuple

import tqdm

from diffrhythm.g2p.g2p import PhonemeBpeTokenizer
from diffrhythm.g2p.utils.g2p import phonemizer_g2p
17
+
18
def ph_g2p(text, language):
    """Phonemize *text* with the phonemizer backend for *language*.

    Thin pass-through to ``phonemizer_g2p``; kept as a named entry point
    so callers do not depend on the backend module directly.
    """
    result = phonemizer_g2p(text=text, language=language)
    return result
21
+
22
+
23
def g2p(text, sentence, language):
    """Tokenize *text* into (phoneme string, token ids) via the module tokenizer.

    *sentence* is the full-sentence context forwarded to the tokenizer;
    relies on the module-level ``text_tokenizer`` instance.
    """
    tokenized = text_tokenizer.tokenize(
        text=text, sentence=sentence, language=language
    )
    return tokenized
26
+
27
+
28
def is_chinese(char):
    """Return True if *char* lies in the CJK Unified Ideographs range U+4E00..U+9FA5."""
    # Chained comparison replaces the verbose if/else-return-True/False form.
    return "\u4e00" <= char <= "\u9fa5"
33
+
34
+
35
def is_alphabet(char):
    """Return True if *char* is an ASCII letter (A-Z or a-z)."""
    # U+0041..U+005A and U+0061..U+007A spelled as the letters they are;
    # chained comparisons replace the if/else-return-True/False form.
    return "A" <= char <= "Z" or "a" <= char <= "z"
42
+
43
+
44
def is_other(char):
    """Return True if *char* is neither a CJK ideograph nor an ASCII letter."""
    # Direct boolean expression replaces the if/else-return-True/False form.
    return not (is_chinese(char) or is_alphabet(char))
49
+
50
+
51
def get_segment(text: str) -> List[Tuple[str, str]]:
    """Split *text* into runs of (substring, language).

    Every character is classified as ``"zh"`` (CJK ideograph, via
    ``is_chinese``), ``"en"`` (ASCII letter, via ``is_alphabet``) or
    ``"other"``.  Consecutive characters of the same class are merged into
    one segment.  "other" characters (punctuation, spaces, digits, other
    scripts) never close a segment: they are glued onto the current one,
    and a leading run of "other" characters adopts the language of the
    first zh/en character that follows it.

    Args:
        text: input sentence, possibly mixing Chinese and English.

    Returns:
        List of ``(segment, language)`` tuples covering *text* in order.
        Empty input yields ``[]``.  (The original returned a bogus
        ``[("", "")]`` for empty input; annotation also corrected from
        ``List[str]`` to match the actual tuple elements.)
    """
    if not text:
        return []

    # Classify each character once, up front.
    types = [
        "zh" if is_chinese(ch) else "en" if is_alphabet(ch) else "other"
        for ch in text
    ]

    segments: List[Tuple[str, str]] = []
    temp_seg = text[0]
    temp_lang = types[0]

    for ch, lang in zip(text[1:], types[1:]):
        if temp_lang == "other":
            # Absorb the char; the segment takes over its language
            # (which may still be "other").
            temp_seg += ch
            temp_lang = lang
        elif lang == temp_lang or lang == "other":
            # Same language, or punctuation glued onto the current segment.
            temp_seg += ch
        else:
            # zh <-> en switch: close the current segment, start a new one.
            segments.append((temp_seg, temp_lang))
            temp_seg = ch
            temp_lang = lang

    segments.append((temp_seg, temp_lang))
    return segments
96
+
97
+
98
def chn_eng_g2p(text: str):
    """Convert mixed Chinese/English *text* to a phoneme string and token list.

    The sentence is split with ``get_segment`` and each segment is
    phonemized via ``g2p``; phoneme strings are concatenated with a "|"
    after each segment.  If the final segment is English and its phoneme
    string ends with "_", that trailing "_|" (and the last token) is
    stripped.

    Returns:
        Tuple of (phoneme string, token id list).
    """
    segments = get_segment(text)
    phoneme_out = ""
    token_out = []
    last_idx = len(segments) - 1

    for idx, (seg_text, seg_lang) in enumerate(segments):
        phoneme, tokens = g2p(seg_text, text, seg_lang)
        phoneme_out = phoneme_out + phoneme + "|"
        token_out.extend(tokens)

        # Trim the trailing "_" marker (and its token) on a final English segment.
        if idx == last_idx and seg_lang == "en" and phoneme_out[-2] == "_":
            phoneme_out = phoneme_out[:-2]
            token_out = token_out[:-1]

    return phoneme_out, token_out
114
+
115
+
116
# Module-level tokenizer and vocabulary, constructed at import time.
# NOTE(review): the relative path only resolves when the process CWD is the
# project root -- importing this module elsewhere will raise FileNotFoundError.
text_tokenizer = PhonemeBpeTokenizer()
with open("./diffrhythm/g2p/g2p/vocab.json", "r") as f:
    json_data = f.read()
data = json.loads(json_data)
# presumably a phoneme/token -> id mapping used by consumers of this module;
# verify against the vocab.json schema.
vocab = data["vocab"]
121
+
122
# Manual smoke test: exercises mixed zh/en g2p and the raw tokenizer directly.
if __name__ == '__main__':
    # First result is immediately overwritten by the longer mixed-script call.
    phone, token = chn_eng_g2p("你好,hello world")
    phone, token = chn_eng_g2p("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑")
    print(phone)
    print(token)

    #phone, token = text_tokenizer.tokenize("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑", "", "auto")
    phone, token = text_tokenizer.tokenize("緑", "", "auto")
    #phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "auto")
    #phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "other")
    print(phone)
    print(token)
diffrhythm/g2p/sources/bpmf_2_pinyin.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf24a3306ffbef6b1fbfc1780e25933361bc4d6587b8eb331b13241b8d892ba2
3
+ size 256
diffrhythm/g2p/sources/chinese_lexicon.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a7685d1c3e68eb2fa304bfc63e90c90c3c1a1948839a5b1b507b2131b3e2fb
3
+ size 14779443
diffrhythm/g2p/sources/pinyin_2_bpmf.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc8b048a20cf61d04b5a36bc6939db74095719b8099269a1269023ee3e6535b4
3
+ size 5262