Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

App Files Files Community

LPhilp1943 committed on Mar 17

Commit

5e6eee9

•

1 Parent(s): 8dc6b2e

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -38

app.py CHANGED Viewed

@@ -4,66 +4,44 @@ import torch
 import soundfile as sf
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
-# Ensure the output directory exists
 os.makedirs("output_audio", exist_ok=True)
-# Load the models and processors
 asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
 asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
 tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 def speech_to_text(input_audio):
-    # Load and preprocess the audio
     waveform, sr = sf.read(input_audio)
     input_values = asr_processor(waveform, sampling_rate=sr, return_tensors="pt").input_values
-    # Perform speech recognition
     with torch.no_grad():
         logits = asr_model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
-    # Decode the predicted IDs to text
     transcription = asr_processor.batch_decode(predicted_ids)[0]
-    return transcription
-def text_to_speech(text):
-    # Tokenize text and generate waveform
     inputs = tts_tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = tts_model(**inputs).waveform
-    waveform = output.numpy()
-    # Define output path and save waveform as audio file
-    output_path = "output_audio/text_to_speech.wav"
-    sf.write(output_path, waveform.squeeze(), 22050)
     return output_path
-def speech_to_speech(input_audio, target_text):
-    # Synthesize speech directly from target text without transcribing the input audio
-    return text_to_speech(target_text)
 iface = gr.Interface(
-    fn={
-        "Speech to Text": speech_to_text,
-        "Text to Speech": text_to_speech,
-        "Speech to Speech": speech_to_speech
-    },
     inputs=[
-        gr.Audio(label="Speech to Text"),
-        gr.Textbox(label="Text to Speech"),
-        [gr.Audio(label="Speech to Speech Input"), gr.Textbox(label="Target Text for Speech to Speech")]  # Corrected: Use a list for multiple inputs
-    ],
-    outputs=[
-        gr.Textbox(label="Transcription"),
-        gr.Audio(label="Synthesized Speech"),
-        gr.Audio(label="Speech to Speech Output")
     ],
     title="Speech Processing Application",
-    description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech.",
-    layout="vertical"
-)
-if __name__ == "__main__":
-    iface.launch()

 import soundfile as sf
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
 # Create the folder that synthesized WAV files are written into (no-op if it already exists).
 os.makedirs("output_audio", exist_ok=True)
 # Load the pretrained checkpoints once at import time; the functions below reuse these globals.
 # Speech recognition: Wav2Vec2 CTC model and its matching processor.
 asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
 asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
 # Speech synthesis: VITS model and its matching tokenizer.
 tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 def speech_to_text(input_audio):
     """Transcribe an audio file with the module-level Wav2Vec2 CTC model.
     Args:
         input_audio: Anything ``soundfile.read`` accepts (a path or an open
             file-like object).
     Returns:
         The greedy-decoded transcription with surrounding whitespace
         stripped, or an empty string when the file holds no samples.
     """
     import numpy as np  # function-scope import: the file header is not fully visible here
     waveform, sr = sf.read(input_audio, dtype="float32")
     # soundfile yields (frames, channels) for multi-channel files, but the
     # processor expects one 1-D signal, so average the channels to mono.
     if waveform.ndim > 1:
         waveform = waveform.mean(axis=1)
     if waveform.size == 0:
         return ""
     # The feature extractor refuses audio whose declared rate differs from
     # the rate it was configured with, so bring the signal to that rate first.
     target_sr = asr_processor.feature_extractor.sampling_rate
     if sr != target_sr:
         n_in = waveform.shape[0]
         n_out = max(1, int(round(n_in * target_sr / sr)))
         # Plain linear interpolation (no anti-alias filter): simple and
         # dependency-free, adequate for speech recognition input.
         positions = np.linspace(0.0, n_in - 1, num=n_out)
         waveform = np.interp(positions, np.arange(n_in), waveform).astype(np.float32)
         sr = target_sr
     input_values = asr_processor(waveform, sampling_rate=sr, return_tensors="pt").input_values
     with torch.no_grad():
         logits = asr_model(input_values).logits
     # Greedy CTC decoding: most likely token per frame, collapsed by batch_decode.
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = asr_processor.batch_decode(predicted_ids)[0]
     return transcription.strip()
+def text_to_speech(text, sample_rate=22050):
+    text = text.lower().translate(str.maketrans('', '', string.punctuation))
     inputs = tts_tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = tts_model(**inputs).waveform
+    waveform = output.numpy().squeeze()
+    output_path = f"output_audio/{text[:10].replace(' ', '_')}_to_speech.wav"
+    sf.write(output_path, waveform, sample_rate)
     return output_path
+def speech_to_speech(input_audio, target_text, sample_rate=22050):
+    transcription = speech_to_text(input_audio)
+    return text_to_speech(target_text, sample_rate)
 # Gradio UI: audio + target text + sample rate in, synthesized audio out.
 iface = gr.Interface(
     fn=speech_to_speech,
     inputs=[
         # "filepath" hands the callback a path string that soundfile can open.
         # The audio source is left at the library default because the keyword
         # for it differs between Gradio major versions.
         gr.Audio(type="filepath", label="Input Audio"),
         gr.Textbox(label="Target Text"),
         # `value` is the initial-position keyword in current Gradio (formerly `default`).
         # NOTE(review): 22050 is not on the 16000 + k*1000 step grid - confirm the intended step.
         gr.Slider(minimum=16000, maximum=48000, step=1000, value=22050, label="Sample Rate"),
     ],
     outputs=gr.Audio(label="Synthesized Speech"),
     title="Speech Processing Application",
     description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech.",
 )
 # Keep `iface` bound to the Interface itself and start the server only when run as a script.
 if __name__ == "__main__":
     iface.launch()