RedSparkie committed
Commit 3408722 (verified)
Parent: 0dcc709

Update app.py

Files changed (1): app.py (+28, -3)
app.py CHANGED
@@ -22,6 +22,20 @@ model_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="model.pt
 config_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="config.json")
 vocab_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="vocab.json")
 
+# Function to resample the audio to 24000 Hz and convert it to 16-bit
+def preprocess_audio(audio_path, target_sr=24000):
+    waveform, original_sr = torchaudio.load(audio_path)
+
+    # Resample if the sample rate is different
+    if original_sr != target_sr:
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
+        waveform = resampler(waveform)
+
+    # Convert to 16-bit
+    waveform = waveform * (2**15)  # Scale to the 16-bit range
+    waveform = waveform.to(torch.int16)  # Cast to 16-bit format
+    return waveform, target_sr
+
 # Load the XTTS model
 XTTS_MODEL = None
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
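A note on the 16-bit conversion added above: torchaudio.load returns float samples in [-1.0, 1.0], so scaling by 2**15 maps a full-scale sample (1.0) to 32768, one past the int16 maximum, and the cast can wrap. The hunk also assumes torchaudio (and, for the later hunk, tempfile) is imported near the top of app.py, outside the diff context. A minimal sketch of the same conversion with explicit clamping; the helper name is illustrative, not part of app.py:

import torch

def float_to_int16(waveform: torch.Tensor) -> torch.Tensor:
    # Clamp before casting so full-scale samples cannot overflow the int16 range.
    scaled = (waveform * 32767.0).round()
    return scaled.clamp(min=-32768, max=32767).to(torch.int16)

Recent torchaudio releases can also write 16-bit PCM directly from the float tensor, e.g. torchaudio.save(path, waveform, sr, encoding="PCM_S", bits_per_sample=16), which would make the manual scaling unnecessary.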
@@ -33,7 +47,7 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     XTTS_MODEL = Xtts.init_from_config(config)
     print("Loading XTTS model!")
 
-    # Load the model checkpoint (the 'weights_only' argument was removed)
+    # Load the model checkpoint
     XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
     print("Model Loaded!")
 
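For context, this hunk sits inside load_model. A minimal sketch of the usual Coqui XTTS loading pattern it appears to follow; everything outside the context lines shown above (imports, the CUDA move, the return value) is an assumption, not the actual app.py:

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch

def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    # Build the config from config.json, then restore the fine-tuned weights.
    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)
    print("Loading XTTS model!")
    model.load_checkpoint(config, checkpoint_path=xtts_checkpoint,
                          vocab_path=xtts_vocab, use_deepspeed=False)
    if torch.cuda.is_available():
        model.cuda()  # assumption: the Space may also run CPU-only
    print("Model Loaded!")
    return model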
 
@@ -42,14 +56,25 @@ def run_tts(lang, tts_text, speaker_audio_file):
     if XTTS_MODEL is None or not speaker_audio_file:
         return "You need to run the previous step to load the model !!", None, None
 
+    # Preprocess the audio (resample to 24000 Hz and convert to 16-bit)
+    waveform, sr = preprocess_audio(speaker_audio_file)
+
+    # Save the processed audio to a temporary file for the model to use
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        torchaudio.save(fp.name, waveform, sr)
+        processed_audio_path = fp.name
+
     # Use inference_mode to improve performance
     with torch.inference_mode():
         gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
-            audio_path=speaker_audio_file,
+            audio_path=processed_audio_path,
             gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
             max_ref_length=XTTS_MODEL.config.max_ref_len,
             sound_norm_refs=XTTS_MODEL.config.sound_norm_refs
         )
+
+    if gpt_cond_latent is None or speaker_embedding is None:
+        return "Failed to process the audio file.", None, None
 
     out = XTTS_MODEL.inference(
         text=tts_text,
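One side effect of the new temporary-file step: NamedTemporaryFile(delete=False) leaves a .wav on disk for every request. A sketch of the same step with cleanup once the latents are computed; the helper name is illustrative, and it reuses the preprocess_audio and XTTS_MODEL objects from this diff:

import os
import tempfile

import torch
import torchaudio

def conditioning_from_reference(speaker_audio_file):
    # Resample/convert the reference clip, write it to a temporary wav,
    # compute the conditioning latents, then remove the temporary file.
    waveform, sr = preprocess_audio(speaker_audio_file)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        torchaudio.save(fp.name, waveform, sr)
        processed_audio_path = fp.name
    try:
        with torch.inference_mode():
            return XTTS_MODEL.get_conditioning_latents(
                audio_path=processed_audio_path,
                gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
                max_ref_length=XTTS_MODEL.config.max_ref_len,
                sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
            )
    finally:
        os.remove(processed_audio_path)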
@@ -85,5 +110,5 @@ demo = gr.Interface(
     outputs=gr.Audio(type='filepath')
 )
 
-# Launch the interface with a public link (if you want to share the Space publicly)
+# Launch the interface with a public link
 demo.launch(share=True)
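The last hunk only trims the launch comment. For reference, a sketch of the gr.Interface wiring it implies; the three inputs are assumptions inferred from run_tts(lang, tts_text, speaker_audio_file), since only the output and demo.launch(share=True) appear in the diff:

import gradio as gr

demo = gr.Interface(
    fn=run_tts,
    inputs=[
        gr.Dropdown(label="Language", choices=["es", "en"]),  # assumed
        gr.Textbox(label="Text to synthesize"),               # assumed
        gr.Audio(label="Reference audio", type="filepath"),   # assumed
    ],
    outputs=gr.Audio(type="filepath"),
)

demo.launch(share=True)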
 