Spaces:
Running
on
Zero
Running
on
Zero
mrfakename
committed on
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- inference-cli.py +49 -20
- samples/country.flac +0 -0
- samples/main.flac +0 -0
- samples/story.toml +19 -0
- samples/story.txt +1 -0
- samples/town.flac +0 -0
inference-cli.py
CHANGED
@@ -282,29 +282,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence, cr
|
|
282 |
|
283 |
final_wave = new_wave
|
284 |
|
285 |
-
with open(wave_path, "wb") as f:
|
286 |
-
sf.write(f.name, final_wave, target_sample_rate)
|
287 |
-
# Remove silence
|
288 |
-
if remove_silence:
|
289 |
-
aseg = AudioSegment.from_file(f.name)
|
290 |
-
non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
|
291 |
-
non_silent_wave = AudioSegment.silent(duration=0)
|
292 |
-
for non_silent_seg in non_silent_segs:
|
293 |
-
non_silent_wave += non_silent_seg
|
294 |
-
aseg = non_silent_wave
|
295 |
-
aseg.export(f.name, format="wav")
|
296 |
-
print(f.name)
|
297 |
-
|
298 |
# Create a combined spectrogram
|
299 |
combined_spectrogram = np.concatenate(spectrograms, axis=1)
|
300 |
-
save_spectrogram(combined_spectrogram, spectrogram_path)
|
301 |
-
print(spectrogram_path)
|
302 |
-
|
303 |
|
304 |
-
|
305 |
-
|
306 |
-
print(gen_text)
|
307 |
|
|
|
308 |
print("Converting audio...")
|
309 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
310 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
@@ -340,7 +323,10 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
|
|
340 |
print("Finished transcription")
|
341 |
else:
|
342 |
print("Using custom reference text...")
|
|
|
343 |
|
|
|
|
|
344 |
# Add the functionality to ensure it ends with ". "
|
345 |
if not ref_text.endswith(". ") and not ref_text.endswith("。"):
|
346 |
if ref_text.endswith("."):
|
@@ -360,4 +346,47 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
|
|
360 |
return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence, cross_fade_duration)
|
361 |
|
362 |
|
363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
|
283 |
final_wave = new_wave
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
# Create a combined spectrogram
|
286 |
combined_spectrogram = np.concatenate(spectrograms, axis=1)
|
|
|
|
|
|
|
287 |
|
288 |
+
return final_wave, combined_spectrogram
|
|
|
|
|
289 |
|
290 |
+
def process_voice(ref_audio_orig, ref_text):
|
291 |
print("Converting audio...")
|
292 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
293 |
aseg = AudioSegment.from_file(ref_audio_orig)
|
|
|
323 |
print("Finished transcription")
|
324 |
else:
|
325 |
print("Using custom reference text...")
|
326 |
+
return ref_audio, ref_text
|
327 |
|
328 |
+
def infer(ref_audio, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15):
|
329 |
+
print(gen_text)
|
330 |
# Add the functionality to ensure it ends with ". "
|
331 |
if not ref_text.endswith(". ") and not ref_text.endswith("。"):
|
332 |
if ref_text.endswith("."):
|
|
|
346 |
return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence, cross_fade_duration)
|
347 |
|
348 |
|
349 |
+
def process(ref_audio, ref_text, text_gen, model, remove_silence):
    """Synthesize ``text_gen`` with per-section voices and write it to ``wave_path``.

    ``text_gen`` may contain ``[name]`` tags. The text following a tag is
    generated with the voice registered under ``name``; text before the first
    tag, and text under a tag that has no registered voice, uses the "main"
    voice built from ``ref_audio`` / ``ref_text``.

    Args:
        ref_audio: Reference audio for the "main" voice (handed to
            ``process_voice``).
        ref_text: Reference text for the "main" voice.
        text_gen: Text to generate, optionally containing ``[voice]`` tags.
        model: Forwarded unchanged to ``infer``.
        remove_silence: Forwarded to ``infer``; when truthy, the written file
            is additionally re-exported with long silences removed.

    Side effects:
        Reads the module-level ``config``, ``wave_path`` and
        ``target_sample_rate``. When ``config`` has a "voices" table it is
        updated in place (a "main" entry is added and every entry's
        ``ref_audio`` / ``ref_text`` is replaced by ``process_voice`` output).
        Writes the result to ``wave_path`` and prints progress to stdout.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" in config:
        voices = config["voices"]
        voices["main"] = main_voice
    else:
        voices = {"main": main_voice}

    # Preprocess every reference voice once, up front.
    for name in voices:
        entry = voices[name]
        entry["ref_audio"], entry["ref_text"] = process_voice(
            entry["ref_audio"], entry["ref_text"]
        )

    tag_pattern = re.compile(r"\[(\w+)\]")
    # Zero-width lookahead split: each chunk starts with at most one tag.
    chunks = re.split(r"(?=\[\w+\])", text_gen)

    generated_audio_segments = []
    for chunk in chunks:
        match = tag_pattern.match(chunk)
        # Validate the tag that was just matched (not a stale name from an
        # earlier iteration); unknown or missing tags fall back to "main".
        voice = match[1] if match and match[1] in voices else "main"

        gen_text = tag_pattern.sub("", chunk).strip()
        if not gen_text:
            # e.g. the empty chunk before a leading tag: nothing to synthesize.
            continue

        print(f"Voice: {voice}")
        audio, _ = infer(
            voices[voice]["ref_audio"],
            voices[voice]["ref_text"],
            gen_text,
            model,
            remove_silence,
        )
        generated_audio_segments.append(audio)

    if not generated_audio_segments:
        return

    final_wave = np.concatenate(generated_audio_segments)
    # Write by path; no separate open handle is held on the same file.
    sf.write(wave_path, final_wave, target_sample_rate)

    # Remove silence
    if remove_silence:
        aseg = AudioSegment.from_file(wave_path)
        non_silent_segs = silence.split_on_silence(
            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
        )
        non_silent_wave = AudioSegment.silent(duration=0)
        for non_silent_seg in non_silent_segs:
            non_silent_wave += non_silent_seg
        non_silent_wave.export(wave_path, format="wav")
    print(wave_path)
|
391 |
+
|
392 |
+
process(ref_audio, ref_text, gen_text, model, remove_silence)
|
samples/country.flac
ADDED
Binary file (180 kB). View file
|
|
samples/main.flac
ADDED
Binary file (279 kB). View file
|
|
samples/story.toml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# F5-TTS | E2-TTS
|
2 |
+
model = "F5-TTS"
|
3 |
+
ref_audio = "samples/main.flac"
|
4 |
+
# If left empty (""), the reference audio is transcribed automatically.
|
5 |
+
ref_text = ""
|
6 |
+
gen_text = ""
|
7 |
+
# File with text to generate. Ignores the text above.
|
8 |
+
gen_file = "samples/story.txt"
|
9 |
+
remove_silence = true
|
10 |
+
output_dir = "samples"
|
11 |
+
|
12 |
+
[voices.town]
|
13 |
+
ref_audio = "samples/town.flac"
|
14 |
+
ref_text = ""
|
15 |
+
|
16 |
+
[voices.country]
|
17 |
+
ref_audio = "samples/country.flac"
|
18 |
+
ref_text = ""
|
19 |
+
|
samples/story.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”
|
samples/town.flac
ADDED
Binary file (229 kB). View file
|
|