import gradio as gr
import os
import torch
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer
import librosa
import string

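# All synthesized audio files are written under output_audio/.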
os.makedirs("output_audio", exist_ok=True)

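# Load the models once at startup; weights are downloaded from the
# Hugging Face Hub on first run and cached locally afterwards.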
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

def resample_audio(input_audio_path, target_sr):
    # Read the file and collapse multi-channel audio to mono, since the
    # ASR model expects a single channel at the target sampling rate.
    waveform, sr = sf.read(input_audio_path)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform

def speech_to_text(input_audio_path):
    # Transcribe an audio file with wav2vec2; the model expects 16 kHz mono input.
    # (Text pass-through is handled in speech_to_speech, not here.)
    waveform = resample_audio(input_audio_path, 16000)
    input_values = asr_processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    # Greedy CTC decoding: take the most likely token at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()

def text_to_speech(text):
    # MMS TTS is trained on lowercase, punctuation-free text.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    inputs = tts_tokenizer(text, return_tensors="pt")
    # Index into the BatchEncoding so the cast actually takes effect;
    # attribute assignment would not update the underlying dict that
    # **inputs unpacks below.
    inputs["input_ids"] = inputs["input_ids"].long()
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    waveform = output.numpy().squeeze()
    # Write at the model's native sampling rate (16 kHz for facebook/mms-tts-eng)
    # instead of a hard-coded 22,050 Hz, which would shift pitch and speed.
    native_sr = tts_model.config.sampling_rate
    output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech.wav")
    sf.write(output_path, waveform, native_sr)
    if native_sr != 16000:
        waveform = librosa.resample(waveform, orig_sr=native_sr, target_sr=16000)
    resampled_output_path = os.path.join("output_audio", f"{text[:10].replace(' ', '_')}_to_speech_16khz.wav")
    sf.write(resampled_output_path, waveform, 16000)
    return resampled_output_path

def speech_to_speech(input_audio, text_input=None):
    # Gradio passes an empty string, not None, when the textbox is left
    # blank, so test for truthiness rather than identity with None.
    transcription = text_input if text_input else speech_to_text(input_audio)
    return text_to_speech(transcription)

iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[gr.Audio(type="filepath", label="Input Audio"),
            gr.Textbox(label="Text Input", placeholder="Enter text to synthesize speech (optional)")],
    outputs=gr.Audio(label="Synthesized Speech"),
    title="Speech-to-Speech Application",
    description="This app converts speech to text and then back to speech, resampling the output audio to 16 kHz."
)

iface.launch()
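
# Quick smoke test without the UI (hypothetical file paths; adjust to local files):
#   print(speech_to_speech("samples/hello.wav"))              # speech -> text -> speech
#   print(speech_to_speech(None, text_input="hello world"))   # text -> speech only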