File size: 1,990 Bytes
41ad754
aa3f3ad
ff017ef
41ad754
7fa5660
7ca94c8
aa3f3ad
 
0431ea7
aa3f3ad
 
41ad754
 
7fa5660
aad7d40
 
7fa5660
 
aa3f3ad
 
 
 
7fa5660
aa3f3ad
68ecc2a
 
 
 
 
7fa5660
aa3f3ad
 
7fa5660
 
 
 
5e6eee9
41ad754
aa3f3ad
592ca27
aa3f3ad
 
7fa5660
68ecc2a
6275fb1
aa3f3ad
7fa5660
 
aa3f3ad
4bc4442
aa3f3ad
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import gradio as gr
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
from TTS.api import TTS
from TTS.utils.manage import ModelManager

# Agreeing to Coqui TTS terms of service and setting up environment variables
# (COQUI_TOS_AGREED=1 suppresses the interactive license prompt on first download).
os.environ["COQUI_TOS_AGREED"] = "1"
# Directory where synthesized audio is written; exist_ok avoids a race on re-runs.
os.makedirs("output_audio", exist_ok=True)

# Initialize ASR model
# NOTE: both calls download weights on first run; eval() disables dropout for inference.
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_model.eval()

# Dynamically list and select TTS model
# NOTE(review): tts_manager is created but never used below — presumably left over
# from an earlier model-listing step; confirm before removing.
tts_manager = ModelManager()
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name, gpu=False)

def resample_audio(input_audio_path, target_sr=16000):
    """Load an audio file and return a mono float32 waveform at ``target_sr`` Hz.

    Parameters
    ----------
    input_audio_path : str
        Path to any audio file readable by ``soundfile``.
    target_sr : int, optional
        Target sample rate in Hz (default 16000 — what wav2vec2 expects).

    Returns
    -------
    numpy.ndarray
        1-D float32 waveform resampled to ``target_sr``.
    """
    # Read as float32 directly; the downstream processor works on float input.
    waveform, sr = sf.read(input_audio_path, dtype="float32")
    # sf.read returns shape (frames, channels) for multi-channel files.
    # Collapse to mono: both librosa.resample's default axis handling and the
    # wav2vec2 processor expect a 1-D signal.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
    return waveform

def speech_to_text(input_audio_path):
    """Transcribe an audio file to text with the wav2vec2 CTC model.

    Parameters
    ----------
    input_audio_path : str
        Path to the audio file to transcribe.

    Returns
    -------
    str
        The decoded transcription, stripped of surrounding whitespace.
    """
    waveform = resample_audio(input_audio_path)
    # Pass sampling_rate explicitly: transformers warns (and in strict mode
    # raises) when it cannot confirm the input matches the model's 16 kHz.
    input_values = asr_processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    # Greedy CTC decoding: argmax over the vocabulary at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription.strip()

def text_to_speech(text, speaker_wav_path, output_path="output_audio/output.wav", language="en"):
    """Synthesize ``text`` in the voice of ``speaker_wav_path`` using XTTS v2.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker_wav_path : str
        Path to a reference recording used for voice cloning.
    output_path : str, optional
        Where to write the generated WAV file.
    language : str, optional
        Language code for XTTS v2 (default ``"en"``).

    Returns
    -------
    str or None
        ``output_path`` on success, or ``None`` for empty input (so a
        ``gr.Audio`` output shows nothing instead of treating an error
        string as a file path).
    """
    if not text or not text.strip():
        # Returning a message string here would be handed to gr.Audio as a
        # (nonexistent) file path; None is Gradio's "no audio" value.
        return None
    # XTTS v2 is multilingual and requires an explicit language argument.
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav_path,
        language=language,
    )
    return output_path

def speech_to_speech(input_audio, text_input=None):
    """Clone the speaker from ``input_audio`` and speak ``text_input``.

    If no override text is given, the input audio is first transcribed and
    the transcription is re-synthesized in the same voice.

    Parameters
    ----------
    input_audio : str
        Path to the recorded/uploaded audio (also the voice reference).
    text_input : str or None, optional
        Optional text to speak instead of the transcription.

    Returns
    -------
    str or None
        Path to the generated audio file, or None for empty text.
    """
    speaker_wav_path = input_audio
    # Gradio passes "" (not None) when the textbox is left blank, so test
    # for falsiness rather than identity with None.
    if not text_input:
        text_input = speech_to_text(input_audio)
    return text_to_speech(text_input, speaker_wav_path)

# Build the Gradio UI: audio in (file/mic), optional text override, audio out.
iface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath", label="Input speech"),
        # `optional=` was removed from Gradio components (3.x+) and raises
        # TypeError; textbox inputs are optional by default — an empty
        # string simply means "no override".
        gr.Textbox(label="Optional text override"),
    ],
    outputs=gr.Audio(label="Synthesized speech"),
)

# Guard launch so importing this module (e.g. for tests) doesn't start a server.
if __name__ == "__main__":
    iface.launch()