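"""Gradio speech-to-text demo for Yoloxochitl Mixtec, Chatino, and Totonac.

A SpeechBrain classifier first identifies the spoken language; Spanish and
English are transcribed with wav2vec2 checkpoints, and everything else is
sent to the Coqui STT model selected in the interface.
"""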
from io import BytesIO
from typing import Tuple
import wave
import gradio as gr
import numpy as np
from pydub import AudioSegment
import requests
from os.path import exists
from stt import Model

import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor

import torchaudio
from speechbrain.pretrained import EncoderClassifier

# initialize language ID model
lang_classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa", 
    savedir="pretrained_models/lang-id-commonlanguage_ecapa"
)


# STT model sources: download URLs for the Coqui .tflite models and
# Hugging Face checkpoint ids for the wav2vec2 models
model_info = {
    "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
    "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
    "totonaco": ("https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite", "totonac.tflite"),
    "español": ("jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "spanish_xlsr"),
    "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
}


def client(audio_data: np.ndarray, sample_rate: int, default_lang: str):
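    """Identify the spoken language, then transcribe with the matching model.

    Spanish and English are routed to wav2vec2; any other detected language
    falls back to the Coqui STT model selected by `default_lang`.
    """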
    output_audio = _convert_audio(audio_data, sample_rate)
    waveform, _ = torchaudio.load(output_audio)
    # classify_batch returns (out_prob, score, index, text_lab);
    # text_lab is a list of predicted language labels
    out_prob, score, index, text_lab = lang_classifier.classify_batch(waveform)
    language = text_lab[0]

    output_audio.seek(0)
    fin = wave.open(output_audio, 'rb')
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    if language in ('Spanish', 'English'):
        # wav2vec2 path: the models expect float32 speech in [-1, 1] at 16 kHz
        processor, model = STT_MODELS['español' if language == 'Spanish' else 'inglés']
        speech = audio.astype(np.float32) / 32768.0
        inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        result = processor.batch_decode(predicted_ids)[0]
    else:
        # Coqui STT path for the Indigenous-language models
        ds = STT_MODELS[default_lang]
        result = ds.stt(audio)

    return f"{language}: {result}"


def load_models(language):
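    """Load the STT model for `language`.

    Returns a Coqui STT `Model` for the .tflite entries, or a
    (processor, model) pair for the Hugging Face checkpoints.
    """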

    model_path, file_name = model_info.get(language, ("", ""))

    if model_path.startswith('http'):
        # Coqui STT: download the .tflite file once and cache it locally
        if not exists(file_name):
            print(f"Downloading {model_path}")
            r = requests.get(model_path, allow_redirects=True)
            with open(file_name, 'wb') as file:
                file.write(r.content)
        else:
            print(f"Found {file_name}. Skipping download...")
        return Model(file_name)

    # otherwise model_path is a Hugging Face checkpoint id
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = AutoModelForCTC.from_pretrained(model_path)
    return processor, model


def stt(default_lang: str, audio: Tuple[int, np.ndarray]):
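    """Gradio callback: unpack the (sample_rate, samples) tuple and transcribe."""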
    sample_rate, audio_data = audio
    return client(audio_data, sample_rate, default_lang)


def _convert_audio(audio_data: np.ndarray, sample_rate: int):
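    """Resample raw int16 mono audio to a 16 kHz mono WAV held in a BytesIO."""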
    source_audio = BytesIO()
    source_audio.write(audio_data.tobytes())
    source_audio.seek(0)
    output_audio = BytesIO()
    wav_file = AudioSegment.from_raw(
        source_audio,
        channels=1,
        sample_width=2,
        frame_rate=sample_rate
    )
    wav_file.set_frame_rate(16000).set_channels(1).export(output_audio, "wav", codec="pcm_s16le")
    output_audio.seek(0)
    return output_audio


iface = gr.Interface(
    fn=stt,
    inputs=[
        gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label="Lengua principal"),
        gr.inputs.Audio(type="numpy", label="Audio", optional=False),
    ],
    outputs=gr.outputs.Textbox(label="Output"),
    title="Coqui STT Yoloxochitl Mixtec",
    theme="huggingface",
    description="Prueba de dictado a texto para el mixteco de Yoloxochitl,"
                " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
                " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
                " Esta prueba es basada en la de [Ukraniano](https://huggingface.co./spaces/robinhad/ukrainian-stt)."
                " \n\n"
                "Speech-to-text demo for Yoloxochitl Mixtec,"
                " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
                " on [the corpus compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
                " This demo is based on the [Ukrainian STT demo](https://huggingface.co./spaces/robinhad/ukrainian-stt).",
)

# preload every model (Coqui .tflite and wav2vec2) before serving requests
STT_MODELS = {lang: load_models(lang) for lang in model_info}

iface.launch()