LPhilp1943 committed on
Commit
e0a55da
1 Parent(s): 6275fb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -28
app.py CHANGED
@@ -21,7 +21,6 @@ def resample_audio(input_audio_path, target_sr):
21
 
22
  def speech_to_text(input_audio_or_text):
23
  if isinstance(input_audio_or_text, str):
24
- # If input is audio file path, convert speech to text
25
  waveform = resample_audio(input_audio_or_text, 16000)
26
  input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
27
  with torch.no_grad():
@@ -29,41 +28,28 @@ def speech_to_text(input_audio_or_text):
29
  predicted_ids = torch.argmax(logits, dim=-1)
30
  transcription = asr_processor.batch_decode(predicted_ids)[0]
31
  else:
32
- # If input is text, directly return it
33
  transcription = input_audio_or_text
34
  return transcription.strip()
35
 
36
  def text_to_speech(text):
37
- if isinstance(text, str):
38
- # If input is text, synthesize speech
39
- text = text.lower().translate(str.maketrans('', '', string.punctuation))
40
- inputs = tts_tokenizer(text, return_tensors="pt")
41
- with torch.no_grad():
42
- output = tts_model(**inputs).waveform
43
- waveform = output.numpy().squeeze()
44
- output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech.wav")
45
- sf.write(output_path, waveform, 22050) # Use a fixed sample rate for TTS output
46
- # Resample the TTS output to 16000 Hz for consistency with the ASR model's requirements
47
- resampled_waveform = librosa.resample(waveform, orig_sr=22050, target_sr=16000)
48
- resampled_output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech_16khz.wav")
49
- sf.write(resampled_output_path, resampled_waveform, 16000)
50
- return resampled_output_path
51
- else:
52
- # If input is already a path to synthesized speech, return it
53
- return text
54
 
55
  def speech_to_speech(input_audio, text_input=None):
56
- if text_input is None:
57
- # If no text input is provided, convert the input audio to text
58
- transcription = speech_to_text(input_audio)
59
- else:
60
- # If text input is provided, use it directly
61
- transcription = text_input
62
- # Synthesize text to speech and resample to 16kHz
63
  synthesized_speech_path = text_to_speech(transcription)
64
  return synthesized_speech_path
65
 
66
-
67
  iface = gr.Interface(
68
  fn=speech_to_speech,
69
  inputs=[gr.Audio(type="filepath", label="Input Audio"),
@@ -74,4 +60,3 @@ iface = gr.Interface(
74
  )
75
 
76
  iface.launch()
77
-
 
21
 
22
  def speech_to_text(input_audio_or_text):
23
  if isinstance(input_audio_or_text, str):
 
24
  waveform = resample_audio(input_audio_or_text, 16000)
25
  input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
26
  with torch.no_grad():
 
28
  predicted_ids = torch.argmax(logits, dim=-1)
29
  transcription = asr_processor.batch_decode(predicted_ids)[0]
30
  else:
 
31
  transcription = input_audio_or_text
32
  return transcription.strip()
33
 
34
def text_to_speech(text):
    """Synthesize speech from *text* and return the path to a 16 kHz WAV file.

    The text is lowercased and stripped of punctuation before tokenization.
    The raw TTS output is written at 22050 Hz, then resampled to 16000 Hz so
    the file matches the sample rate the ASR model expects.

    Args:
        text: The input text to synthesize.

    Returns:
        Path to the resampled 16 kHz WAV file under ``output_audio/``.
    """
    # Normalize: lowercase and drop punctuation in one C-level pass.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    inputs = tts_tokenizer(text, return_tensors="pt")
    # Some tokenizer versions emit int32 ids; the model requires int64.
    inputs.input_ids = inputs.input_ids.long()
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    waveform = output.numpy().squeeze()
    # Bug fix: sf.write fails if the target directory does not exist.
    os.makedirs("output_audio", exist_ok=True)
    # Build the filename stem once instead of repeating the expression.
    stem = text[:10].replace(' ', '_')
    output_path = os.path.join("output_audio", f"{stem}_to_speech.wav")
    sf.write(output_path, waveform, 22050)
    # Resample to 16 kHz for consistency with the ASR model's requirements.
    resampled_waveform = librosa.resample(waveform, orig_sr=22050, target_sr=16000)
    resampled_output_path = os.path.join("output_audio", f"{stem}_to_speech_16khz.wav")
    sf.write(resampled_output_path, resampled_waveform, 16000)
    return resampled_output_path
 
 
 
 
 
47
 
48
def speech_to_speech(input_audio, text_input=None):
    """Turn input audio (or provided text) into synthesized speech.

    If *text_input* is given it is used verbatim; otherwise the audio is
    transcribed first. Returns the path of the synthesized 16 kHz WAV file.
    """
    if text_input is None:
        transcription = speech_to_text(input_audio)
    else:
        transcription = text_input
    return text_to_speech(transcription)
52
 
 
53
  iface = gr.Interface(
54
  fn=speech_to_speech,
55
  inputs=[gr.Audio(type="filepath", label="Input Audio"),
 
60
  )
61
 
62
  iface.launch()