File size: 1,381 Bytes
8c1ac08
 
 
 
 
 
7e676bf
8c1ac08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import gradio as gr
from datasets import load_dataset
import torch
from transformers import SpeechT5ForSpeechToText, SpeechT5Processor

# Load the English subset of the VoxPopuli dataset
dataset = load_dataset("facebook/voxpopuli", "en")

# Example function to load audio and transcriptions
def get_sample(dataset):
    # Get a random sample from the training set
    sample = dataset['train'][0]  # You can modify to pick a random sample or any sample index
    audio_file = sample["audio"]["path"]
    transcription = sample["sentence"]
    return audio_file, transcription

# Initialize the SpeechT5 model and processor
processor = SpeechT5Processor.from_pretrained("facebook/speech_t5_base")
model = SpeechT5ForSpeechToText.from_pretrained("facebook/speech_t5_base")

# Example Gradio interface function
def transcribe(audio):
    # Process the audio and get transcription
    inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
    with torch.no_grad():
        logits = model(**inputs).logits
    transcription = processor.decode(logits[0], skip_special_tokens=True)
    return transcription

# Load a sample to check if everything is set up
audio_file, transcription = get_sample(dataset)

# Set up Gradio interface
iface = gr.Interface(fn=transcribe, inputs=gr.Audio(source="upload", type="filepath"), outputs="text")

# Launch the interface
iface.launch()