Spaces:

LPhilp1943
/

speech_2_speech_voice_cloning

Build error

App Files Files Community

LPhilp1943 committed on Mar 17

Commit

7fa5660

•

1 Parent(s): 592ca27

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -31

app.py CHANGED Viewed

@@ -2,16 +2,29 @@ import gradio as gr
 import os
 import torch
 import soundfile as sf
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
 import librosa
 import string
 os.makedirs("output_audio", exist_ok=True)
 asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
 asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
-tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
-tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 def resample_audio(input_audio_path, target_sr):
     waveform, sr = sf.read(input_audio_path)
@@ -19,48 +32,42 @@ def resample_audio(input_audio_path, target_sr):
         waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
     return waveform
-def speech_to_text(input_audio_or_text):
-    if isinstance(input_audio_or_text, str):
-        waveform = resample_audio(input_audio_or_text, 16000)
-        input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
-        with torch.no_grad():
-            logits = asr_model(input_values).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = asr_processor.batch_decode(predicted_ids)[0]
-    else:
-        transcription = input_audio_or_text
     return transcription.strip()
-def text_to_speech(text):
-    # Ensure the text input is not empty to avoid padding errors in the transformer model
     if not text.strip():
         return "The text input is empty, please provide a valid string."
-    text = text.lower().translate(str.maketrans('', '', string.punctuation))
-    inputs = tts_tokenizer(text, return_tensors="pt")
-    inputs['input_ids'] = inputs['input_ids'].long()  # Ensure input_ids are of type Long
-    with torch.no_grad():
-        output = tts_model(**inputs).waveform
-    waveform = output.numpy().squeeze()
-    output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech.wav")
     sf.write(output_path, waveform, 22050)
-    resampled_waveform = librosa.resample(waveform, orig_sr=22050, target_sr=16000)
-    resampled_output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech_16khz.wav")
-    sf.write(resampled_output_path, resampled_waveform, 16000)
-    return resampled_output_path
 def speech_to_speech(input_audio, text_input=None):
-    transcription = speech_to_text(input_audio) if text_input is None else text_input
-    synthesized_speech_path = text_to_speech(transcription)
     return synthesized_speech_path
 iface = gr.Interface(
     fn=speech_to_speech,
-    inputs=[gr.Audio(type="filepath", label="Input Audio"),
-            gr.Textbox(label="Text Input", placeholder="Enter text to synthesize speech (optional)")],
     outputs=gr.Audio(label="Synthesized Speech"),
     title="Speech-to-Speech Application",
-    description="This app converts speech to text and then back to speech, ensuring the output audio is resampled to 16kHz."
 )
 iface.launch(share=True)

 import os
 import torch
 import soundfile as sf
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
 import librosa
 import string
 os.makedirs("output_audio", exist_ok=True)
+# Initialize ASR model
 asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
 asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+# The ASR model is only used for inference, so keep it in eval mode.
+asr_model.eval()
+# Initialize TTS model
+# NOTE(review): the three paths below are placeholders. They must point at a
+# real XTTS config, checkpoint directory and reference speaker clip before
+# the app can start; loading will fail otherwise.
+tts_config_path = "/path/to/xtts/config.json"
+tts_checkpoint_dir = "/path/to/xtts/"
+speaker_wav_path = "/path/to/target/speaker.wav"  # Update with actual speaker wav path for cloning voice
+tts_config = XttsConfig()
+tts_config.load_json(tts_config_path)
+tts_model = Xtts.init_from_config(tts_config)
+tts_model.load_checkpoint(tts_config, checkpoint_dir=tts_checkpoint_dir, eval=True)
+# Move the model to the GPU only when one is present: an unconditional
+# .cuda() raises at import time on CPU-only hosts.
+if torch.cuda.is_available():
+    tts_model.cuda()
 def resample_audio(input_audio_path, target_sr):
     waveform, sr = sf.read(input_audio_path)
         waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
     return waveform
+def speech_to_text(input_audio_path):
+    """Transcribe the audio file at *input_audio_path* with the wav2vec2 ASR model.
+
+    The audio is loaded and resampled to 16 kHz via ``resample_audio``, run
+    through ``asr_model`` without gradient tracking, and decoded by taking
+    the arg-max token at each frame.
+
+    Returns the first decoded transcription with surrounding whitespace
+    stripped.
+    """
+    waveform = resample_audio(input_audio_path, 16000)
+    input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
+    with torch.no_grad():
+        logits = asr_model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = asr_processor.batch_decode(predicted_ids)[0]
     return transcription.strip()
+def text_to_speech(text, output_path="output_audio/output.wav"):
+    """Synthesize *text* in the cloned voice and write it to *output_path*.
+
+    The voice is conditioned on the reference clip at ``speaker_wav_path``
+    and the text is spoken in English.
+
+    Returns *output_path* on success. For blank text, returns an explanatory
+    message string instead of a path.
+    NOTE(review): callers that forward the result to an audio component
+    should check for blank text first, since the message is not a file path.
+    """
     if not text.strip():
         return "The text input is empty, please provide a valid string."
+    outputs = tts_model.synthesize(
+        text,
+        tts_config,
+        speaker_wav=speaker_wav_path,
+        gpt_cond_len=3,
+        language="en"
+    )
+    # XTTS returns the audio under the "wav" key (there is no "waveform"
+    # entry). It is normally already a NumPy array; tolerate a tensor too.
+    waveform = outputs["wav"]
+    if torch.is_tensor(waveform):
+        waveform = waveform.cpu().numpy()
+    waveform = waveform.squeeze()
+    # XTTS decodes at 24 kHz; writing with a 22050 Hz header would play the
+    # clip back slow and low-pitched.
+    sf.write(output_path, waveform, 24000)
+    return output_path
 def speech_to_speech(input_audio, text_input=None):
+    """Return the path of speech synthesized from typed text or from audio.
+
+    When *text_input* is missing or blank, *input_audio* (a file path) is
+    transcribed with ``speech_to_text`` and the transcription is spoken
+    instead.
+
+    Raises ValueError when neither text nor audio is supplied.
+    """
+    # An empty Gradio textbox arrives as "" rather than None, so test for
+    # blank text instead of `is None`; otherwise the audio is never used.
+    if not text_input or not text_input.strip():
+        if input_audio is None:
+            raise ValueError("Provide input audio or text to synthesize.")
+        text_input = speech_to_text(input_audio)
+    synthesized_speech_path = text_to_speech(text_input)
     return synthesized_speech_path
 iface = gr.Interface(
     fn=speech_to_speech,
+    # gr.Textbox has no `optional` argument in current Gradio releases and
+    # passing it raises TypeError; the placeholder conveys that the field
+    # may be left empty.
+    inputs=[gr.Audio(type="filepath", label="Input Audio"),
+            gr.Textbox(label="Text Input", placeholder="Enter text to synthesize speech (optional)")],
     outputs=gr.Audio(label="Synthesized Speech"),
     title="Speech-to-Speech Application",
+    description="Converts speech to text and then back to speech, ensuring the output audio is of high quality."
 )
 iface.launch(share=True)