"""Gradio demo: Hindi speech recognition with a fine-tuned Wav2Vec2 CTC model."""
import gradio as gr
import soundfile as sf
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the pretrained Hindi Wav2Vec2 model and its processor from the Hugging Face Hub.
processor = Wav2Vec2Processor.from_pretrained("h4d35/Wav2Vec2-hi")
model = Wav2Vec2ForCTC.from_pretrained("h4d35/Wav2Vec2-hi")
model.eval()  # inference only; disables dropout


def map_to_array(file):
    # Read the audio file into a float array; the model expects 16 kHz mono audio.
    speech, _ = sf.read(file)
    return speech
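

# Optional helper, a minimal sketch not used by the app below: sf.read returns audio at
# the file's native sample rate, but Wav2Vec2 models are trained on 16 kHz input. If
# your files may differ, resampling along these lines is assumed to be needed
# (torchaudio is an assumed extra dependency, and mono audio is assumed).
def map_to_array_16k(file):
    import torchaudio  # only needed for this optional resampling path

    speech, sr = sf.read(file)
    if sr != 16000:
        speech = torchaudio.functional.resample(
            torch.from_numpy(speech).float(), orig_freq=sr, new_freq=16000
        ).numpy()
    return speech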


def inference(audio):
    # audio is a filepath string (see the Audio input below); turn it into model inputs.
    input_values = processor(
        map_to_array(audio), sampling_rate=16000, return_tensors="pt", padding="longest"
    ).input_values

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the most likely token per frame, then collapse
    # repeats and blanks during batch_decode.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
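
# Quick smoke test without the UI (hypothetical filename; point it at a real Hindi WAV):
# print(inference("sample_hi.wav"))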

# Gradio UI: upload or record audio, get the transcription back as text.
inputs = gr.Audio(label="Input Audio", type="filepath")
outputs = gr.Textbox(label="Output Text")

title = "HindiASR"
description = "HindiASR using Wav2Vec2.0"

gr.Interface(inference, inputs, outputs, title=title, description=description).launch()
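# launch(share=True) would additionally expose a temporary public URL when run locally.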