Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -26,6 +26,7 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
26 |
MODEL_DIR = os.path.join(SCRIPT_DIR, "model")
|
27 |
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
|
28 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
29 |
|
30 |
|
31 |
def clear_gpu_cache():
|
@@ -73,6 +74,7 @@ def load_model(checkpoint_dir="model/", repo_id="capleaf/viXTTS", use_deepspeed=
|
|
73 |
print("Model Loaded!")
|
74 |
|
75 |
return XTTS_MODEL
|
|
|
76 |
def generate_hash(data):
|
77 |
hash_object = hashlib.md5()
|
78 |
hash_object.update(data)
|
@@ -95,6 +97,7 @@ def get_file_name(text, max_char=50):
|
|
95 |
|
96 |
|
97 |
def normalize_vietnamese_text(text):
|
|
|
98 |
text = (
|
99 |
text
|
100 |
.replace("..", ".")
|
@@ -107,6 +110,8 @@ def normalize_vietnamese_text(text):
|
|
107 |
.replace("AI", "Ây Ai")
|
108 |
.replace("A.I", "Ây Ai")
|
109 |
)
|
|
|
|
|
110 |
return text
|
111 |
|
112 |
|
@@ -126,24 +131,30 @@ def calculate_keep_len(text, lang):
|
|
126 |
|
127 |
|
128 |
def run_tts(lang, tts_text, speaker_audio_file, normalize_text):
|
129 |
-
global XTTS_MODEL
|
130 |
|
131 |
if XTTS_MODEL is None:
|
132 |
-
return "
|
133 |
|
134 |
if not speaker_audio_file:
|
135 |
-
return "
|
136 |
|
137 |
print("Computing conditioning latents...")
|
138 |
-
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
|
139 |
-
audio_path=speaker_audio_file,
|
140 |
-
gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
|
141 |
-
max_ref_length=XTTS_MODEL.config.max_ref_len,
|
142 |
-
sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
|
143 |
-
)
|
144 |
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
# Split text by sentence
|
149 |
if lang in ["ja", "zh-cn"]:
|
@@ -233,7 +244,7 @@ if __name__ == "__main__":
|
|
233 |
REFERENCE_AUDIO = os.path.join(SCRIPT_DIR, "audio.wav")
|
234 |
t1 = threading.Thread(target=MyThread1, args=[])
|
235 |
t1.start()
|
236 |
-
|
237 |
with gr.Blocks() as demo:
|
238 |
intro = """
|
239 |
# Fake giọng Demo
|
@@ -279,7 +290,7 @@ if __name__ == "__main__":
|
|
279 |
|
280 |
tts_text = gr.Textbox(
|
281 |
label="Input Text.",
|
282 |
-
value="
|
283 |
)
|
284 |
tts_btn = gr.Button(value="Inference", variant="primary")
|
285 |
|
|
|
26 |
MODEL_DIR = os.path.join(SCRIPT_DIR, "model")
|
27 |
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
|
28 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
29 |
+
REF_AUDIO_CACHE = {}
|
30 |
|
31 |
|
32 |
def clear_gpu_cache():
|
|
|
74 |
print("Model Loaded!")
|
75 |
|
76 |
return XTTS_MODEL
|
77 |
+
|
78 |
def generate_hash(data):
|
79 |
hash_object = hashlib.md5()
|
80 |
hash_object.update(data)
|
|
|
97 |
|
98 |
|
99 |
def normalize_vietnamese_text(text):
|
100 |
+
digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
|
101 |
text = (
|
102 |
text
|
103 |
.replace("..", ".")
|
|
|
110 |
.replace("AI", "Ây Ai")
|
111 |
.replace("A.I", "Ây Ai")
|
112 |
)
|
113 |
+
for i in range(10):
|
114 |
+
text = text.replace(i.__str__(), digits[i]+ " ")
|
115 |
return text
|
116 |
|
117 |
|
|
|
131 |
|
132 |
|
133 |
def run_tts(lang, tts_text, speaker_audio_file, normalize_text):
|
134 |
+
global XTTS_MODEL, REF_AUDIO_CACHE
|
135 |
|
136 |
if XTTS_MODEL is None:
|
137 |
+
return "Model đang được load. Vui lòng đợi !!", None, None
|
138 |
|
139 |
if not speaker_audio_file:
|
140 |
+
return "Cần giọng đọc mẫu !!!", None, None
|
141 |
|
142 |
print("Computing conditioning latents...")
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
+
cache_key_ref_audio = speaker_audio_file
|
145 |
+
if cache_key_ref_audio in REF_AUDIO_CACHE:
|
146 |
+
print("Using conditioning latents cache...")
|
147 |
+
gpt_cond_latent, speaker_embedding = REF_AUDIO_CACHE[cache_key_ref_audio]
|
148 |
+
else:
|
149 |
+
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
|
150 |
+
audio_path=speaker_audio_file,
|
151 |
+
gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
|
152 |
+
max_ref_length=XTTS_MODEL.config.max_ref_len,
|
153 |
+
sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
|
154 |
+
)
|
155 |
+
REF_AUDIO_CACHE[cache_key_ref_audio] = (gpt_cond_latent, speaker_embedding)
|
156 |
+
|
157 |
+
tts_text = normalize_vietnamese_text(tts_text)
|
158 |
|
159 |
# Split text by sentence
|
160 |
if lang in ["ja", "zh-cn"]:
|
|
|
244 |
REFERENCE_AUDIO = os.path.join(SCRIPT_DIR, "audio.wav")
|
245 |
t1 = threading.Thread(target=MyThread1, args=[])
|
246 |
t1.start()
|
247 |
+
|
248 |
with gr.Blocks() as demo:
|
249 |
intro = """
|
250 |
# Fake giọng Demo
|
|
|
290 |
|
291 |
tts_text = gr.Textbox(
|
292 |
label="Input Text.",
|
293 |
+
value="Chào bạn, đây là giọng đọc được sinh ra từ AI",
|
294 |
)
|
295 |
tts_btn = gr.Button(value="Inference", variant="primary")
|
296 |
|