LPhilp1943 committed on
Commit
ff5cf26
1 Parent(s): 7ca94c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -9
app.py CHANGED
@@ -24,27 +24,23 @@ def speech_to_text(input_audio):
24
  transcription = asr_processor.batch_decode(predicted_ids)[0]
25
  return transcription.strip()
26
 
27
- def text_to_speech(text, sample_rate=22050):
28
  text = text.lower().translate(str.maketrans('', '', string.punctuation))
29
  inputs = tts_tokenizer(text, return_tensors="pt")
30
  with torch.no_grad():
31
  output = tts_model(**inputs).waveform
32
  waveform = output.numpy().squeeze()
33
  output_path = f"output_audio/{text[:10].replace(' ', '_')}_to_speech.wav"
34
- sf.write(output_path, waveform, sample_rate)
35
  return output_path
36
 
37
- def speech_to_speech(input_audio, target_text, sample_rate=22050):
38
  transcription = speech_to_text(input_audio)
39
- return text_to_speech(transcription, sample_rate)
40
 
41
  iface = gr.Interface(
42
  fn=speech_to_speech,
43
- inputs=[
44
- gr.Audio(type="filepath", label="Input Audio"),
45
- gr.Textbox(label="Target Text"),
46
- gr.Slider(minimum=16000, maximum=48000, step=1000, value=22050, label="Sample Rate")
47
- ],
48
  outputs=gr.Audio(label="Synthesized Speech"),
49
  title="Speech Processing Application",
50
  description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."
 
24
  transcription = asr_processor.batch_decode(predicted_ids)[0]
25
  return transcription.strip()
26
 
27
+ def text_to_speech(text):
28
  text = text.lower().translate(str.maketrans('', '', string.punctuation))
29
  inputs = tts_tokenizer(text, return_tensors="pt")
30
  with torch.no_grad():
31
  output = tts_model(**inputs).waveform
32
  waveform = output.numpy().squeeze()
33
  output_path = f"output_audio/{text[:10].replace(' ', '_')}_to_speech.wav"
34
+ sf.write(output_path, waveform, 22050) # Use a fixed sample rate for TTS output
35
  return output_path
36
 
37
+ def speech_to_speech(input_audio):
38
  transcription = speech_to_text(input_audio)
39
+ return text_to_speech(transcription)
40
 
41
  iface = gr.Interface(
42
  fn=speech_to_speech,
43
+ inputs=gr.Audio(type="filepath", label="Input Audio"),
 
 
 
 
44
  outputs=gr.Audio(label="Synthesized Speech"),
45
  title="Speech Processing Application",
46
  description="This app uses Facebook's Wav2Vec 2.0 for speech-to-text and VITS for text-to-speech."