import gradio as gr
import torch
import zipfile
from pyctcdecode import build_ctcdecoder
from speechbrain.pretrained import EncoderASR
from transformers.file_utils import cached_path, hf_bucket_url
def download_lm(cache_dir="./cache/"):
    """Fetch the 4-gram KenLM archive from the HF Hub and extract it into cache_dir.

    The archive is expected to contain lm.binary and vocab-260000.txt.
    """
    lm_file = hf_bucket_url("dragonSwing/wav2vec2-base-vn-270h", filename="4gram.zip")
    lm_file = cached_path(lm_file, cache_dir=cache_dir)
    with zipfile.ZipFile(lm_file, "r") as zip_ref:
        zip_ref.extractall(cache_dir)
    lm_file = cache_dir + "lm.binary"
    vocab_file = cache_dir + "vocab-260000.txt"
    return lm_file, vocab_file
# Load the pretrained Vietnamese wav2vec2 ASR model from the HF Hub
model = EncoderASR.from_hparams(
    source="dragonSwing/wav2vec2-base-vn-270h", savedir="./pretrained/wav2vec-vi-asr"
)
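# Note: EncoderASR also exposes transcribe_file directly (e.g.
# model.transcribe_file("example1.wav")); the greedy branch of transcribe_file
# below wraps the equivalent transcribe_batch call.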
def get_decoder_ngram_model(tokenizer, ngram_lm_path, vocab_path=None):
    """Build a pyctcdecode beam-search decoder backed by a KenLM n-gram model."""
    unigrams = None
    if vocab_path is not None:
        unigrams = []
        with open(vocab_path, encoding="utf-8") as f:
            for line in f:
                unigrams.append(line.strip())
    # Sort the tokenizer vocabulary by token id so each position matches its logit column
    vocab_dict = tokenizer.get_vocab()
    sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
    vocab_list = [x[1] for x in sort_vocab]
    # pyctcdecode treats the empty-string label as the CTC blank
    vocab_list[tokenizer.pad_token_id] = ""
    # Replace the word-delimiter token with a plain space
    vocab_list[tokenizer.word_delimiter_token_id] = " "
    decoder = build_ctcdecoder(vocab_list, ngram_lm_path, unigrams=unigrams)
    return decoder
# To enable LM-rescored decoding, download the LM and build the decoder first:
# lm_file, vocab_file = download_lm()
# ngram_lm_model = get_decoder_ngram_model(model.tokenizer, lm_file, vocab_file)
def transcribe_file(path, max_seconds=20, lm_model=None):
    waveform = model.load_audio(path)
    if max_seconds > 0:
        # Audio is expected at 16 kHz, so this keeps the first max_seconds of speech
        waveform = waveform[: max_seconds * 16000]
    batch = waveform.unsqueeze(0)
    rel_length = torch.tensor([1.0])
    if lm_model:
        # Beam-search decoding of the CTC logits, rescored with the n-gram LM
        with torch.no_grad():
            logits = model(batch, rel_length)
        text_batch = [
            lm_model.decode(logit.detach().cpu().numpy(), beam_width=500)
            for logit in logits
        ]
        return text_batch[0]
    else:
        # Greedy CTC decoding without a language model
        text_batch, _ = model.transcribe_batch(batch, rel_length)
        return text_batch[0]
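# Example (sketch): once ngram_lm_model has been built as in the commented lines
# above, LM-rescored transcription is just:
#   transcribe_file("example1.wav", lm_model=ngram_lm_model)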
def speech_recognize(file_upload, file_mic):
    # Prefer the uploaded file; fall back to the microphone recording
    if file_upload is not None:
        file = file_upload
    elif file_mic is not None:
        file = file_mic
    else:
        return ""
    # text = model.transcribe_file(file)
    text = transcribe_file(file)
    return text
inputs = [
    gr.Audio(source="upload", type="filepath", optional=True),
    gr.Audio(source="microphone", type="filepath", optional=True),
]
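# Note: the source= and optional= arguments match the Gradio 2.x/3.x Audio API
# this Space was written against; newer Gradio releases renamed these parameters.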
outputs = gr.Textbox(label="Output Text")
title = "wav2vec2-base-vietnamese-270h"
description = "Gradio demo for wav2vec2-base Vietnamese speech recognition. To use it, upload an audio file, click one of the examples to load it, or record from your microphone. Read more at the link below. Currently supports only 16 kHz audio files."
article = "<p style='text-align: center'><a href='https://huggingface.co./dragonSwing/wav2vec2-base-vn-270h' target='_blank'>Pretrained model</a></p>"
examples = [
    ["example1.wav", "example1.wav"],
    ["example2.mp3", "example2.mp3"],
    ["example3.mp3", "example3.mp3"],
    ["example4.wav", "example4.wav"],
]
gr.Interface(
    speech_recognize,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()