Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

LPhilp1943 committed on Mar 17

Commit

ff5cf26

•

1 Parent(s): 7ca94c8

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -24,27 +24,23 @@ def speech_to_text(input_audio):
     transcription = asr_processor.batch_decode(predicted_ids)[0]
     return transcription.strip()
-def text_to_speech(text, sample_rate=22050):
     text = text.lower().translate(str.maketrans('', '', string.punctuation))
     inputs = tts_tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = tts_model(**inputs).waveform
     waveform = output.numpy().squeeze()
     output_path = f"output_audio/{text[:10].replace(' ', '_')}_to_speech.wav"
-    sf.write(output_path, waveform, sample_rate)
     return output_path
-def speech_to_speech(input_audio, target_text, sample_rate=22050):
     transcription = speech_to_text(input_audio)
-    return text_to_speech(transcription, sample_rate)
 iface = gr.Interface(
     fn=speech_to_speech,
-    inputs=[
-        gr.Audio(type="filepath", label="Input Audio"),
-        gr.Textbox(label="Target Text"),
-        gr.Slider(minimum=16000, maximum=48000, step=1000, value=22050, label="Sample Rate")
-    ],
     outputs=gr.Audio(label="Synthesized Speech"),
     title="Speech Processing Application",
     description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."

     transcription = asr_processor.batch_decode(predicted_ids)[0]
     return transcription.strip()
+def text_to_speech(text):
     text = text.lower().translate(str.maketrans('', '', string.punctuation))
     inputs = tts_tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = tts_model(**inputs).waveform
     waveform = output.numpy().squeeze()
     output_path = f"output_audio/{text[:10].replace(' ', '_')}_to_speech.wav"
+    sf.write(output_path, waveform, 22050)  # Use a fixed sample rate for TTS output
     return output_path
+def speech_to_speech(input_audio):
     transcription = speech_to_text(input_audio)
+    return text_to_speech(transcription)
 iface = gr.Interface(
     fn=speech_to_speech,
+    inputs=gr.Audio(type="filepath", label="Input Audio"),
     outputs=gr.Audio(label="Synthesized Speech"),
     title="Speech Processing Application",
     description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."