Spaces:
Runtime error
Runtime error
File size: 6,232 Bytes
9439387 dbf2fc2 d0bbc40 dbf2fc2 f0380ff dbf2fc2 f4b3d1b f200d27 028ff01 f09c038 dbf2fc2 f09c038 f4b3d1b f09c038 2915c9d ac40f21 f09c038 f4b3d1b 4639cf2 2915c9d dbf2fc2 f4b3d1b dbf2fc2 bb12448 ff08b05 f4b3d1b 8ee61a8 dbf2fc2 028ff01 dbf2fc2 5915225 dbf2fc2 f4b3d1b ff08b05 028ff01 ac40f21 028ff01 dbf2fc2 f4b3d1b ff08b05 f4b3d1b 028ff01 dbf2fc2 f09c038 dbf2fc2 2915c9d f4b3d1b d27ee9b dbf2fc2 d47cc89 f4b3d1b 2915c9d f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 f4b3d1b dbf2fc2 ff08b05 dbf2fc2 652611b ccf8b98 652611b e05a9ec ff08b05 f09c038 ff08b05 f09c038 ff08b05 4639cf2 ff08b05 dbf2fc2 f4b3d1b dbf2fc2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
from io import BytesIO
from typing import Tuple
import wave
import gradio as gr
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model
import torch
from transformers import pipeline
import librosa
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# Spoken-language identifier, built once at import time and shared by every
# request.  It is created from the SpeechBrain hparams source named below and
# is used by client() to decide whether a clip is Spanish.
lang_classifier = EncoderClassifier.from_hparams(
    savedir="pretrained_models/lang-id-commonlanguage_ecapa",
    source="speechbrain/lang-id-commonlanguage_ecapa",
)
def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
    """Build a Hugging Face speech-recognition pipeline for ``model_path``.

    Args:
        model_path: Model identifier handed to ``transformers.pipeline`` as
            its ``model`` argument.

    Returns:
        The object returned by ``pipeline("automatic-speech-recognition", ...)``.
    """
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_path)
    return asr_pipeline
# Registry of speech-to-text models, keyed by the language name shown in the UI.
# Each value is a (location, local name) pair: for the Coqui entries the
# location is a download URL and the local name is the .tflite file that
# load_coqui_models() reads/writes; for the wav2vec2 entries the location is a
# Hugging Face model id (the second element is not read by the code here).
model_info = {
    "mixteco": (
        "https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite",
        "mixtec.tflite",
    ),
    "chatino": (
        "https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite",
        "chatino.tflite",
    ),
    "totonaco": (
        "https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite",
        "totonac.tflite",
    ),
    "español": ("jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "spanish_xlsr"),
    "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
}

# Loaded models keyed by language.  Only the Spanish pipeline is created here;
# the "inglés" entry above is registered but never loaded at this point.
STT_MODELS = {"español": load_hf_model(model_info["español"][0])}
def client(audio_data: np.ndarray, sample_rate: int, default_lang: str):
    """Detect whether a clip is Spanish and transcribe it with the matching model.

    The clip is first re-encoded by ``_convert_audio`` (16 kHz mono 16-bit
    WAV), then labelled by ``lang_classifier``.  A ``'Spanish'`` label routes
    the audio to the Spanish Hugging Face pipeline; any other label is treated
    as ``default_lang`` and decoded with that language's Coqui model.

    Args:
        audio_data: Raw samples as received from the Gradio audio input.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        default_lang: Key into ``STT_MODELS`` used whenever the clip is not
            classified as Spanish.

    Returns:
        A string of the form ``"<language>: <transcript>"``.

    Raises:
        KeyError: If ``default_lang`` has no entry in ``STT_MODELS``.
    """
    # Rate of the WAV produced by _convert_audio.  The wav2vec2 pipeline takes
    # a bare array at face value, so the array must really be at this rate.
    target_rate = 16000

    output_audio = _convert_audio(audio_data, sample_rate)

    waveform, _ = torchaudio.load(output_audio)
    # classify_batch yields (probabilities, score, index, labels); only the
    # label of the single clip in the batch is needed.
    _, _, _, labels = lang_classifier.classify_batch(waveform)
    detected = labels[0]
    print(default_lang, detected)

    # torchaudio consumed the buffer; rewind before decoding it again.
    output_audio.seek(0)
    if detected == 'Spanish':
        text_lab = 'español'
        # Without an explicit ``sr`` librosa resamples to 22050 Hz, which the
        # pipeline would then misread as 16 kHz audio.
        hf_audio, _ = librosa.load(output_audio, sr=target_rate)
        asr_pipeline = STT_MODELS['español']
        result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
    else:
        text_lab = default_lang
        # The context manager releases the wave reader even if decoding fails;
        # it does not close the caller-supplied BytesIO.
        with wave.open(output_audio, 'rb') as fin:
            coqui_audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        result = STT_MODELS[default_lang].stt(coqui_audio)
    return f"{text_lab}: {result}"
def load_coqui_models(language):
    """Return the Coqui STT model for ``language``, downloading it if needed.

    The model file named in ``model_info`` is reused when it already exists in
    the working directory; otherwise it is fetched from the registered URL.

    Args:
        language: Key into ``model_info``.

    Returns:
        A ``Model`` constructed from the local model file.

    Raises:
        ValueError: If ``language`` has no entry in ``model_info``.
        requests.RequestException: If the download fails or the server
            answers with an error status.
    """
    import os  # local import: the module only imports ``exists`` from os.path

    if language not in model_info:
        # Fail with a clear message instead of attempting to download "".
        raise ValueError(f"No model registered for language {language!r}")
    model_path, file_name = model_info[language]

    if exists(file_name):
        print(f"Found {file_name}. Skipping download...")
    else:
        print(f"Downloading {model_path}")
        r = requests.get(model_path, allow_redirects=True, timeout=60)
        # Without this check an HTTP error page would be saved as the model
        # and then reused forever by the "found" branch above.
        r.raise_for_status()
        # Write to a sibling file and rename, so an interrupted write never
        # leaves a truncated file under the cached name.
        partial_name = f"{file_name}.part"
        with open(partial_name, 'wb') as file:
            file.write(r.content)
        os.replace(partial_name, file_name)
    return Model(file_name)
# Register one Coqui model per indigenous language next to the Spanish
# pipeline that is already in STT_MODELS.
for lang in ('mixteco', 'chatino', 'totonaco'):
    STT_MODELS.update({lang: load_coqui_models(lang)})
def stt(default_lang: str, audio: Tuple[int, np.ndarray]):
    """Gradio callback: transcribe one recording.

    Args:
        default_lang: Language selected in the UI; used by ``client`` when the
            clip is not classified as Spanish.
        audio: ``(sample_rate, samples)`` pair as delivered by the numpy-typed
            audio input.

    Returns:
        The ``"<language>: <transcript>"`` string produced by ``client``.
    """
    # Unpack into fresh names rather than rebinding the ``audio`` parameter.
    sample_rate, samples = audio
    return client(samples, sample_rate, default_lang)
def _convert_audio(audio_data: np.ndarray, sample_rate: int):
    """Re-encode raw samples as an in-memory 16 kHz mono 16-bit PCM WAV.

    Args:
        audio_data: Samples shaped ``(frames,)`` for mono or
            ``(frames, channels)`` for multi-channel input.  Signed-integer
            PCM is passed to pydub at its own sample width; floating-point
            input is assumed to be normalised to [-1.0, 1.0] and is converted
            to 16-bit first.
        sample_rate: Sampling rate of ``audio_data`` in Hz.

    Returns:
        A ``BytesIO`` rewound to offset 0 that holds the WAV file.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        # Float PCM cannot be handed to pydub as raw integers; scale it to
        # the 16-bit range (assumes the usual [-1, 1] normalisation).
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # A 2-D array serialises as interleaved frames.  Declaring the real
    # channel count lets pydub downmix it; declaring it as mono would yield a
    # clip twice as long at half the pitch.
    channels = samples.shape[1] if samples.ndim > 1 else 1

    # tobytes() always produces a contiguous C-order buffer, unlike writing
    # the array object itself, which fails for non-contiguous views.
    source_audio = BytesIO(samples.tobytes())
    segment = AudioSegment.from_raw(
        source_audio,
        channels=channels,
        sample_width=samples.dtype.itemsize,
        frame_rate=sample_rate,
    )

    output_audio = BytesIO()
    segment.set_frame_rate(16000).set_channels(1).export(output_audio, "wav", codec="pcm_s16le")
    output_audio.seek(0)
    return output_audio
# Gradio front end: a language selector plus an audio input wired to stt(),
# with a single text box for the result.  The interface is launched as soon as
# this module is executed.
# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio component
# namespaces - confirm the pinned gradio version still provides them.
iface = gr.Interface(
fn=stt,
# Input order must match stt(default_lang, audio).
inputs=[
gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
gr.inputs.Audio(type="numpy", label="Audio", optional=False),
],
outputs=gr.outputs.Textbox(label="Output"),
title="Coqui STT de Chatino, Mixteco, y Totonaco",
theme="huggingface",
# NOTE(review): user-facing text below contains typos ("prover", "sys
# colaboradores", "Ukraniano") and "huggingface.co./" links with a stray dot;
# left untouched here because they are runtime strings.
description="Prueba de identificar frases del español en grabaciones de una lengua indígena, y prover el texto de cada una",
# Each example row is [default language, path to a sample recording].
examples=[["mixteco", "ejemplos/espanol1.wav"],
["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav"],
["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"],
["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav"]],
# The article is one Markdown string assembled by implicit concatenation of
# the adjacent literals: credits for the language-ID model, then one
# paragraph per language, then the source of the examples.
article="La identificación de lenguas usa el modelo"
" [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co./speechbrain/lang-id-commonlanguage_ecapa)"
" y aquí se supone que si la lengua no es español, debe ser la lengua principal del contexto."
"\n\n"
"Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
" usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
" con [los datos recopilados por Hilaria Cruz y sys colaboradores](https://gorilla.linguistlist.org/code/ctp/)"
"\n\n"
"Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
" usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
" con [los datos recopilados por Rey Castillo, Jonathan Amith y sus colaboradores](https://www.openslr.org/89)."
# NOTE(review): this credit sentence is repeated verbatim at the end of the
# article; one of the two is probably redundant.
" Esta prueba es basada en la de [Ukraniano](https://huggingface.co./spaces/robinhad/ukrainian-stt)."
" \n\n"
"Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
" usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
" con [los datos recopilados por Osbel López Francisco y Jonathan Amith](https://www.openslr.org/107)."
" \n\n"
"Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/). "
" Esta prueba es basada en la de [Ukraniano](https://huggingface.co./spaces/robinhad/ukrainian-stt)."
)
# Start the web app; runs at module execution, after all models are loaded.
iface.launch()
|