RedSparkie committed
Commit 3408722 (verified)
Parent: 0dcc709

Update app.py

Files changed (1): app.py (+28, -3)
app.py CHANGED
@@ -22,6 +22,20 @@ model_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="model.pt
 config_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="config.json")
 vocab_path = hf_hub_download(repo_id="RedSparkie/danielmula", filename="vocab.json")
 
+# Function to resample the audio to 24000 Hz and convert it to 16-bit
+def preprocess_audio(audio_path, target_sr=24000):
+    waveform, original_sr = torchaudio.load(audio_path)
+
+    # Resample if the sample rate is different
+    if original_sr != target_sr:
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
+        waveform = resampler(waveform)
+
+    # Convert to 16-bit
+    waveform = waveform * (2**15)  # Scale to the 16-bit range
+    waveform = waveform.to(torch.int16)  # Cast to 16-bit format
+    return waveform, target_sr
+
 # Load the XTTS model
 XTTS_MODEL = None
 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
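A note on the 16-bit conversion added above: torchaudio.load returns float samples in [-1.0, 1.0], so scaling by 2**15 maps a full-scale sample (1.0) to 32768, one past the int16 maximum, and the cast can wrap. The hunk also assumes torchaudio (and, for the later hunk, tempfile) is imported near the top of app.py, outside the diff context. A minimal sketch of the same conversion with explicit clamping; the helper name is illustrative, not part of app.py:

import torch

def float_to_int16(waveform: torch.Tensor) -> torch.Tensor:
    # Clamp before casting so full-scale samples cannot overflow the int16 range.
    scaled = (waveform * 32767.0).round()
    return scaled.clamp(min=-32768, max=32767).to(torch.int16)

Recent torchaudio releases can also write 16-bit PCM directly from the float tensor, e.g. torchaudio.save(path, waveform, sr, encoding="PCM_S", bits_per_sample=16), which would make the manual scaling unnecessary.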
@@ -33,7 +47,7 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     XTTS_MODEL = Xtts.init_from_config(config)
     print("Loading XTTS model!")
 
-    # Load the model checkpoint (the 'weights_only' argument was removed)
+    # Load the model checkpoint
     XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
     print("Model Loaded!")
 
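For context, this hunk sits inside load_model. A minimal sketch of the usual Coqui XTTS loading pattern it appears to follow; everything outside the context lines shown above (imports, the CUDA move, the return value) is an assumption, not the actual app.py:

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch

def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    # Build the config from config.json, then restore the fine-tuned weights.
    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)
    print("Loading XTTS model!")
    model.load_checkpoint(config, checkpoint_path=xtts_checkpoint,
                          vocab_path=xtts_vocab, use_deepspeed=False)
    if torch.cuda.is_available():
        model.cuda()  # assumption: the Space may also run CPU-only
    print("Model Loaded!")
    return model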
 
@@ -42,14 +56,25 @@ def run_tts(lang, tts_text, speaker_audio_file):
     if XTTS_MODEL is None or not speaker_audio_file:
         return "You need to run the previous step to load the model !!", None, None
 
+    # Preprocess the audio (resample to 24000 Hz and convert to 16-bit)
+    waveform, sr = preprocess_audio(speaker_audio_file)
+
+    # Save the processed audio to a temporary file for the model to use
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        torchaudio.save(fp.name, waveform, sr)
+        processed_audio_path = fp.name
+
     # Use inference_mode to improve performance
     with torch.inference_mode():
         gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
-            audio_path=speaker_audio_file,
+            audio_path=processed_audio_path,
             gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
             max_ref_length=XTTS_MODEL.config.max_ref_len,
             sound_norm_refs=XTTS_MODEL.config.sound_norm_refs
         )
+
+    if gpt_cond_latent is None or speaker_embedding is None:
+        return "Failed to process the audio file.", None, None
 
     out = XTTS_MODEL.inference(
         text=tts_text,
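One side effect of the new temporary-file step: NamedTemporaryFile(delete=False) leaves a .wav on disk for every request. A sketch of the same step with cleanup once the latents are computed; the helper name is illustrative, and it reuses the preprocess_audio and XTTS_MODEL objects from this diff:

import os
import tempfile

import torch
import torchaudio

def conditioning_from_reference(speaker_audio_file):
    # Resample/convert the reference clip, write it to a temporary wav,
    # compute the conditioning latents, then remove the temporary file.
    waveform, sr = preprocess_audio(speaker_audio_file)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        torchaudio.save(fp.name, waveform, sr)
        processed_audio_path = fp.name
    try:
        with torch.inference_mode():
            return XTTS_MODEL.get_conditioning_latents(
                audio_path=processed_audio_path,
                gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
                max_ref_length=XTTS_MODEL.config.max_ref_len,
                sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
            )
    finally:
        os.remove(processed_audio_path)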
@@ -85,5 +110,5 @@ demo = gr.Interface(
     outputs=gr.Audio(type='filepath')
 )
 
-# Launch the interface with a public link (if you want to share the Space publicly)
+# Launch the interface with a public link
 demo.launch(share=True)
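The last hunk only trims the launch comment. For reference, a sketch of the gr.Interface wiring it implies; the three inputs are assumptions inferred from run_tts(lang, tts_text, speaker_audio_file), since only the output and demo.launch(share=True) appear in the diff:

import gradio as gr

demo = gr.Interface(
    fn=run_tts,
    inputs=[
        gr.Dropdown(label="Language", choices=["es", "en"]),  # assumed
        gr.Textbox(label="Text to synthesize"),               # assumed
        gr.Audio(label="Reference audio", type="filepath"),   # assumed
    ],
    outputs=gr.Audio(type="filepath"),
)

demo.launch(share=True)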
 