Spaces:
Running
on
Zero
Running
on
Zero
import spaces | |
import gradio as gr | |
import numpy as np | |
import torch | |
from peft import PeftModel, PeftConfig | |
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline | |
# Hub identifier of the PEFT adapter checkpoint used by this demo.
peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"

# The adapter configuration names the base checkpoint it sits on top of.
peft_config = PeftConfig.from_pretrained(peft_model_id)
base_model_id = peft_config.base_model_name_or_path

# Load the base Whisper model on the first CUDA device, attach the adapter,
# then merge it so that `model` is a single stand-alone model again.
model = WhisperForConditionalGeneration.from_pretrained(
    base_model_id,
    load_in_8bit=False,
    device_map="cuda:0",
)
model = PeftModel.from_pretrained(model, peft_model_id).merge_and_unload()

tokenizer = WhisperTokenizer.from_pretrained(base_model_id, language=language, task=task)
processor = WhisperProcessor.from_pretrained(base_model_id, language=language, task=task)
feature_extractor = processor.feature_extractor

# NOTE(review): the decoder prompt is built for "english" rather than
# `language` ("guarani") -- presumably deliberate (matches how the adapter was
# trained?); confirm before changing.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

# Speech-recognition pipeline shared by every request handled by the demo.
pipeline = AutomaticSpeechRecognitionPipeline(
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)
# NOTE(review): `spaces` is imported at the top of the file but no
# `@spaces.GPU` decorator is visible on this function -- confirm whether the
# hosting environment needs one.
def transcribe(audio):
    """Transcribe one audio clip with the module-level ASR ``pipeline``.

    Args:
        audio: ``None`` when no audio is available yet, otherwise a
            ``(sampling_rate, samples)`` tuple where ``samples`` is a NumPy
            array (assumed 1-D mono or ``(samples, channels)`` as delivered
            by ``gr.Audio`` -- TODO confirm against the Gradio version in use).

    Returns:
        The recognised text, or a "please retry" message when ``audio`` is
        ``None``.
    """
    if audio is None:
        return "Espera a que la grabación termine de subirse al servidor !! Intentelo de nuevo en unos segundos"

    sr, y = audio
    y = np.asarray(y, dtype=np.float32)

    # The pipeline is fed a single channel: average the channels of
    # multi-channel recordings instead of passing a 2-D array through.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise to [-1, 1]. Silent (all-zero) or empty input is left
    # unscaled so no division by zero fills the signal with NaNs.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak

    with torch.autocast("cuda"):
        result = pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )
    return result["text"]
# Sample clips listed in the demo's examples section.
examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3",
]

# Markdown heading and introduction shown at the top of the page.
title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"
description = """Esta es una demostración del reconocimiento de voz en Guaraní utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf)
Autores:
- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""
# Audio input widget: pre-filled with the first example clip and accepting
# either an uploaded file or a microphone recording.
audio_input = gr.Audio(
    value="./examples/audio_1.mp3",
    sources=["upload", "microphone"],
    label="🎤 Audio a transcribir",
    interactive=True,
)

# Read-only text box that displays the transcription result.
transcription = gr.Textbox(
    label="📝 Transcripción",
    interactive=False,
)
# Page layout: heading, audio input, transcription output, submit button and
# example clips, each in its own row.
with gr.Blocks() as demo:
    with gr.Row():
        # Model title and description
        gr.Markdown(title)
        gr.Markdown(description)
    with gr.Row():
        # Audio input
        audio_input.render()
    with gr.Row():
        # Text output
        transcription.render()
    with gr.Row():
        # Submit button
        submit = gr.Button("📝 Transcribir el Audio")
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
            outputs=[transcription],
            fn=transcribe,
            label="Ejemplos",
        )
    # Run the transcription when the button is pressed.
    submit.click(
        transcribe,
        inputs=[audio_input],
        outputs=[transcription],
    )
# Enable request queuing, then start the app with a public share link requested.
demo.queue().launch(share=True)