import gradio as gr
import os
import torch
import torchaudio
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, VitsModel, AutoTokenizer

# Ensure the output directory exists
os.makedirs("output_audio", exist_ok=True)

# Load the models and processors
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
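
# Note: inference below runs on CPU as written. Moving the models and the
# per-call input tensors to a GPU via .to("cuda") would speed things up when
# one is available; that is left out here to keep the script minimal.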

def speech_to_text(input_audio):
    # Load the audio and collapse stereo to mono if needed
    waveform, sr = sf.read(input_audio)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Wav2Vec 2.0 was trained on 16 kHz audio; resample anything else,
    # otherwise the processor rejects the input
    if sr != 16000:
        waveform = torchaudio.functional.resample(
            torch.from_numpy(waveform).float(), orig_freq=sr, new_freq=16000
        ).numpy()
        sr = 16000
    input_values = asr_processor(waveform, sampling_rate=sr, return_tensors="pt").input_values

    # Perform speech recognition
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the predicted IDs to text
    transcription = asr_processor.batch_decode(predicted_ids)[0]
    return transcription

def text_to_speech(text):
    # Tokenize the text and generate the waveform
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    waveform = output.numpy()

    # Save at the model's native sampling rate (16 kHz for MMS-TTS), read from
    # the config rather than hard-coded
    output_path = "output_audio/text_to_speech.wav"
    sf.write(output_path, waveform.squeeze(), tts_model.config.sampling_rate)

    return output_path

def speech_to_speech(input_audio, target_text):
    # Synthesize the target text if one is given; otherwise transcribe the
    # input audio and re-synthesize the transcription
    text = (target_text or "").strip()
    if not text:
        text = speech_to_text(input_audio)
    return text_to_speech(text)
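
# Quick sanity check outside the UI ("sample.wav" is a hypothetical local file):
#     print(speech_to_text("sample.wav"))
#     print(text_to_speech("Hello world"))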

# gr.Interface accepts a single function, so each task gets its own interface;
# the three are combined into one app with gr.TabbedInterface
stt_interface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Input Speech"),
    outputs=gr.Textbox(label="Transcription"),
    description="Transcribe speech with Facebook's Wav2Vec 2.0.",
)

tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Audio(label="Synthesized Speech"),
    description="Synthesize speech with Facebook's MMS VITS model.",
)

sts_interface = gr.Interface(
    fn=speech_to_speech,
    inputs=[
        gr.Audio(type="filepath", label="Input Speech"),
        gr.Textbox(label="Target Text (optional; the transcription is used if empty)"),
    ],
    outputs=gr.Audio(label="Speech to Speech Output"),
    description="Re-synthesize the target text, or the transcribed input, as speech.",
)

iface = gr.TabbedInterface(
    [stt_interface, tts_interface, sts_interface],
    tab_names=["Speech to Text", "Text to Speech", "Speech to Speech"],
    title="Speech Processing Application",
)

if __name__ == "__main__":
    iface.launch()
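
# Usage note: launch() serves the app locally (Gradio's default address is
# http://127.0.0.1:7860); passing share=True additionally generates a
# temporary public link when remote access is needed.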