mrfakename committed on
Commit
706f9a3
·
verified ·
1 Parent(s): e41601f

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there

inference-cli.py CHANGED
@@ -282,29 +282,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model, remove_silence, cr
282
 
283
  final_wave = new_wave
284
 
285
- with open(wave_path, "wb") as f:
286
- sf.write(f.name, final_wave, target_sample_rate)
287
- # Remove silence
288
- if remove_silence:
289
- aseg = AudioSegment.from_file(f.name)
290
- non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
291
- non_silent_wave = AudioSegment.silent(duration=0)
292
- for non_silent_seg in non_silent_segs:
293
- non_silent_wave += non_silent_seg
294
- aseg = non_silent_wave
295
- aseg.export(f.name, format="wav")
296
- print(f.name)
297
-
298
  # Create a combined spectrogram
299
  combined_spectrogram = np.concatenate(spectrograms, axis=1)
300
- save_spectrogram(combined_spectrogram, spectrogram_path)
301
- print(spectrogram_path)
302
-
303
 
304
- def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15):
305
-
306
- print(gen_text)
307
 
 
308
  print("Converting audio...")
309
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
310
  aseg = AudioSegment.from_file(ref_audio_orig)
@@ -340,7 +323,10 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
340
  print("Finished transcription")
341
  else:
342
  print("Using custom reference text...")
 
343
 
 
 
344
  # Add the functionality to ensure it ends with ". "
345
  if not ref_text.endswith(". ") and not ref_text.endswith("。"):
346
  if ref_text.endswith("."):
@@ -360,4 +346,47 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
360
  return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence, cross_fade_duration)
361
 
362
 
363
- infer(ref_audio, ref_text, gen_text, model, remove_silence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
  final_wave = new_wave
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # Create a combined spectrogram
286
  combined_spectrogram = np.concatenate(spectrograms, axis=1)
 
 
 
287
 
288
+ return final_wave, combined_spectrogram
 
 
289
 
290
+ def process_voice(ref_audio_orig, ref_text):
291
  print("Converting audio...")
292
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
293
  aseg = AudioSegment.from_file(ref_audio_orig)
 
323
  print("Finished transcription")
324
  else:
325
  print("Using custom reference text...")
326
+ return ref_audio, ref_text
327
 
328
+ def infer(ref_audio, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15):
329
+ print(gen_text)
330
  # Add the functionality to ensure it ends with ". "
331
  if not ref_text.endswith(". ") and not ref_text.endswith("。"):
332
  if ref_text.endswith("."):
 
346
  return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence, cross_fade_duration)
347
 
348
 
349
def process(ref_audio, ref_text, text_gen, model, remove_silence):
    """Synthesize ``text_gen`` with per-segment voices and write it to ``wave_path``.

    ``text_gen`` may contain ``[name]`` tags. A tag selects the voice used for
    the text that follows it, up to the next tag. Text before the first tag,
    and text behind a tag that names no known voice, is generated with the
    ``"main"`` voice built from ``ref_audio`` / ``ref_text``.

    Args:
        ref_audio: Reference audio of the main voice; handed to
            ``process_voice`` as-is.
        ref_text: Reference text of the main voice; handed to
            ``process_voice`` as-is.
        text_gen: Text to synthesize, optionally containing ``[name]`` tags.
        model: Passed through to ``infer`` unchanged.
        remove_silence: Passed through to ``infer``; when truthy the written
            file is additionally re-exported with long silences cut out.

    Returns:
        None. When at least one segment was generated, the audio is written
        to the module-level ``wave_path`` and that path is printed.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        # NOTE: this is the dict stored in ``config`` itself, so the "main"
        # entry and the processed references below are written back into it.
        voices = config["voices"]
        voices["main"] = main_voice
    for entry in voices.values():
        entry["ref_audio"], entry["ref_text"] = process_voice(
            entry["ref_audio"], entry["ref_text"]
        )

    # Split *before* every "[name]" tag so each chunk starts with its own tag.
    split_before_tag = re.compile(r'(?=\[\w+\])')
    voice_tag = re.compile(r'\[(\w+)\]')

    generated_audio_segments = []
    for chunk in split_before_tag.split(text_gen):
        tag = voice_tag.match(chunk)
        voice = tag[1] if tag else "main"
        # Validate the voice named by THIS chunk's tag. The previous code
        # tested the value left over from the preceding iteration, so an
        # unknown tag ended in a KeyError instead of falling back to "main".
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"

        gen_text = voice_tag.sub("", chunk).strip()
        if not gen_text:
            # Text that starts with a tag (or has back-to-back tags) yields
            # empty chunks; there is nothing to synthesize for them.
            continue

        print(f"Voice: {voice}")
        audio, _ = infer(
            voices[voice]["ref_audio"],
            voices[voice]["ref_text"],
            gen_text,
            model,
            remove_silence,
        )
        generated_audio_segments.append(audio)

    if not generated_audio_segments:
        return

    final_wave = np.concatenate(generated_audio_segments)
    # Write by path directly: both soundfile and pydub open the file
    # themselves, so holding an extra "wb" handle on it served no purpose.
    sf.write(wave_path, final_wave, target_sample_rate)

    # Remove silence
    if remove_silence:
        aseg = AudioSegment.from_file(wave_path)
        non_silent_segs = silence.split_on_silence(
            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
        )
        non_silent_wave = AudioSegment.silent(duration=0)
        for non_silent_seg in non_silent_segs:
            non_silent_wave += non_silent_seg
        non_silent_wave.export(wave_path, format="wav")
    print(wave_path)
391
+
392
+ process(ref_audio, ref_text, gen_text, model, remove_silence)
samples/country.flac ADDED
Binary file (180 kB). View file
 
samples/main.flac ADDED
Binary file (279 kB). View file
 
samples/story.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model to use: "F5-TTS" or "E2-TTS"
2
+ model = "F5-TTS"
3
+ ref_audio = "samples/main.flac"
4
+ # If left empty (""), the reference audio is transcribed automatically.
5
+ ref_text = ""
6
+ gen_text = ""
7
+ # File containing the text to generate. If set, gen_text above is ignored.
8
+ gen_file = "samples/story.txt"
9
+ remove_silence = true
10
+ output_dir = "samples"
11
+
12
+ [voices.town]
13
+ ref_audio = "samples/town.flac"
14
+ ref_text = ""
15
+
16
+ [voices.country]
17
+ ref_audio = "samples/country.flac"
18
+ ref_text = ""
19
+
samples/story.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”
samples/town.flac ADDED
Binary file (229 kB). View file