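# Gradio Space: Guarani speech-to-text built on a LoRA fine-tune of Whisper Large V3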
import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline
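
# LoRA adapter published on the Hugging Face Hub; the base Whisper
# checkpoint is resolved from the adapter's PEFT config below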
peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"
peft_config = PeftConfig.from_pretrained(peft_model_id)
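
# Load the full-precision base model directly onto the GPU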
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
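# Apply the LoRA adapter, then merge its weights into the base model so
# inference runs on a plain Whisper model without the PEFT wrapper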
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
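
# Note: decoding is prompted with English rather than the `language` variable
# above, presumably because Whisper's tokenizer has no Guarani token; the
# fine-tuned weights still produce Guarani transcriptions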
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)
pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
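
# ZeroGPU decorator: the Space is granted a GPU only while this function runs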
@spaces.GPU
def transcribe(audio):
    if audio is None:
        return "Wait for the recording to finish uploading to the server! Try again in a few seconds."

    sr, y = audio
    # Convert to float32 and peak-normalize to [-1, 1]; guard against silent clips
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    with torch.autocast("cuda"):
        return pipeline({"sampling_rate": sr, "raw": y},
                        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
                        max_new_tokens=255)["text"]
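
# Sample clips bundled with the Space, surfaced in the Examples widget below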
examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3"
]
title = "# 🇵🇾 Guarani Speech Recognition"
description = """This is a demo of speech recognition in Guarani using the [Whisper](https://arxiv.org/pdf/2212.04356.pdf) speech-to-text model.

Authors:

- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""
audio_input = gr.Audio(value="./examples/audio_1.mp3",
                       sources=["upload", "microphone"],
                       label="🎤 Audio to transcribe",
                       interactive=True)
transcription = gr.Textbox(label="📝 Transcription",
                           interactive=False)
with gr.Blocks() as demo:
    with gr.Row():
        # Model Title and Description
        gr.Markdown(title)
        gr.Markdown(description)

    with gr.Row():
        # Audio Input
        audio_input.render()

    with gr.Row():
        # Text Output
        transcription.render()

    with gr.Row():
        # Submit Button
        submit = gr.Button("📝 Transcribe the Audio")

    with gr.Row():
        gr.Examples(examples=examples,
                    inputs=[audio_input],
                    outputs=[transcription],
                    fn=transcribe,
                    label="Examples")

    submit.click(transcribe,
                 inputs=[audio_input],
                 outputs=[transcription])

demo.queue()
demo.launch(share=True)