Spaces:
Runtime error
Runtime error
File size: 4,457 Bytes
dbf2fc2 f4b3d1b 2edea7d f09c038 dbf2fc2 f09c038 f4b3d1b f09c038 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 bb12448 f4b3d1b 8ee61a8 dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f09c038 dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b 511c6af f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f09c038 dbf2fc2 f4b3d1b dbf2fc2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
from io import BytesIO
from typing import Tuple
import wave
import gradio as gr
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model
import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# Spoken-language identification model, created once at import time.
# `source` names the pretrained SpeechBrain checkpoint and `savedir` is the
# local directory handed to SpeechBrain for its files.
lang_classifier = EncoderClassifier.from_hparams(
    savedir="pretrained_models/lang-id-commonlanguage_ecapa",
    source="speechbrain/lang-id-commonlanguage_ecapa",
)
# Registry of speech-to-text models, keyed by language name (in Spanish).
# Every value is a ``(location, local_name)`` pair. Locations that start with
# "http" are model files to be downloaded and stored as ``local_name``; the
# remaining locations are model identifiers passed to ``from_pretrained``.
model_info = {
    "mixteco": (
        "https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite",
        "mixtec.tflite",
    ),
    "chatino": (
        "https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite",
        "chatino.tflite",
    ),
    "totonaco": (
        "https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite",
        "totonac.tflite",
    ),
    "español": (
        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
        "spanish_xlsr",
    ),
    "inglés": (
        "facebook/wav2vec2-large-robust-ft-swbd-300h",
        "english_xlsr",
    ),
}
def client(audio_data: np.ndarray, sample_rate: int, default_lang: str):
    """Identify the language spoken in a clip and transcribe it.

    The clip is first converted by ``_convert_audio`` and passed to the
    language-ID classifier. When the classifier answers ``'Spanish'`` the
    Spanish ``(processor, model)`` pair from ``STT_MODELS`` is used;
    otherwise the model stored under ``default_lang`` transcribes the clip.

    Args:
        audio_data: Raw samples of the recording.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        default_lang: Key into ``STT_MODELS`` used when the clip is not
            classified as Spanish.

    Returns:
        A string of the form ``"<detected language>: <transcript>"``.

    Raises:
        KeyError: If the required entry is missing from ``STT_MODELS``.
    """
    output_audio = _convert_audio(audio_data, sample_rate)
    waveform, wav_rate = torchaudio.load(output_audio)
    _, _, _, text_lab = lang_classifier.classify_batch(waveform)

    # classify_batch yields one label per batch item, i.e. a list. Comparing
    # the list itself with a string is never true, so unwrap the first label.
    if isinstance(text_lab, (list, tuple)):
        language = text_lab[0] if text_lab else ""
    else:
        language = text_lab

    if language == 'Spanish':
        processor, model = STT_MODELS['español']
        # The processor needs the sampling rate and must return tensors so
        # that its output can be fed straight into the model.
        inputs = processor(
            waveform.squeeze(0).numpy(),
            sampling_rate=wav_rate,
            return_tensors="pt",
        )
        with torch.no_grad():  # inference only, no gradients required
            logits = model(
                inputs.input_values,
                # Some feature extractors emit no attention mask.
                attention_mask=inputs.get("attention_mask"),
            ).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        # predicted_ids is a batch of one sequence; decode and unwrap it.
        result = processor.batch_decode(predicted_ids)[0]
    else:
        # torchaudio consumed the buffer; rewind before reading the PCM data.
        output_audio.seek(0)
        with wave.open(output_audio, 'rb') as fin:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        ds = STT_MODELS[default_lang]
        result = ds.stt(audio)
    return f"{language}: {result}"
def load_models(language):
    """Load the speech-to-text model registered for ``language``.

    Entries of ``model_info`` whose location starts with ``http`` are
    downloaded (unless the target file already exists) and opened as a
    ``stt.Model``. All other entries are loaded through ``from_pretrained``.

    Args:
        language: Key of ``model_info`` selecting the model to load.

    Returns:
        A ``Model`` instance for downloaded files, otherwise a
        ``(processor, model)`` tuple.

    Raises:
        ValueError: If ``language`` has no entry in ``model_info``.
        requests.HTTPError: If the model download answers with an error status.
    """
    # Look up the *argument*; the literal string "language" is not a key and
    # would silently fall back to an empty model path.
    if language not in model_info:
        raise ValueError(f"No STT model registered for language {language!r}")
    model_path, file_name = model_info[language]

    if model_path.startswith('http'):
        if exists(file_name):
            print(f"Found {file_name}. Skipping download...")
        else:
            print(f"Downloading {model_path}")
            r = requests.get(model_path, allow_redirects=True)
            # Do not persist an HTTP error page as if it were a model file.
            r.raise_for_status()
            with open(file_name, 'wb') as file:
                file.write(r.content)
        return Model(file_name)

    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = AutoModelForCTC.from_pretrained(model_path)
    return processor, model
def stt(default_lang: str, audio: Tuple[int, np.ndarray]):
    """Gradio callback: transcribe one recording.

    Args:
        default_lang: Language selected in the UI; forwarded to ``client``.
        audio: ``(sample_rate, samples)`` pair.

    Returns:
        Whatever ``client`` returns for the recording.
    """
    sample_rate, samples = audio
    return client(samples, sample_rate, default_lang)
def _convert_audio(audio_data: np.ndarray, sample_rate: int):
    """Convert raw samples into an in-memory 16 kHz mono WAV file.

    Args:
        audio_data: Samples of the recording. A 2-D array is read as
            ``(frames, channels)``; a 1-D array as a single channel.
        sample_rate: Sampling rate of ``audio_data`` in Hz.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the exported WAV data.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        # Raw float bytes cannot be read as integer PCM.
        # NOTE(review): assumes float input is normalised to [-1.0, 1.0] —
        # confirm against the caller.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # Describe the raw buffer as it really is instead of assuming mono
    # 16-bit, so stereo or wider integer input is not misinterpreted.
    channels = samples.shape[1] if samples.ndim == 2 else 1
    source_audio = BytesIO(samples.tobytes())
    output_audio = BytesIO()
    wav_file = AudioSegment.from_raw(
        source_audio,
        channels=channels,
        sample_width=samples.dtype.itemsize,
        frame_rate=sample_rate,
    )
    wav_file.set_frame_rate(16000).set_channels(1).export(output_audio, "wav", codec="pcm_s16le")
    output_audio.seek(0)
    return output_audio
# Web UI: a language selector plus an audio input feeding `stt`, with the
# result shown in a text box. The description is rendered as Markdown; the
# links to the Ukrainian demo previously had a stray "." after the host name.
iface = gr.Interface(
    fn=stt,
    inputs=[
        gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
        gr.inputs.Audio(type="numpy", label="Audio", optional=False),
    ],
    outputs=gr.outputs.Textbox(label="Output"),
    title="Coqui STT Yoloxochitl Mixtec",
    theme="huggingface",
    description="Prueba de dictado a texto para el mixteco de Yoloxochitl,"
    " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
    " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
    " Esta prueba es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt)."
    " \n\n"
    "Speech-to-text demo for Yoloxochitl Mixtec,"
    " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
    " on [the corpus compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
    " This demo is based on the [Ukrainian STT demo](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
)
# Load every registered model before serving. The UI offers chatino, mixteco
# and totonaco, and `client` looks the selected language up in STT_MODELS, so
# loading only the Spanish and English models makes each request a KeyError.
STT_MODELS = {lang: load_models(lang) for lang in model_info}
iface.launch()
|