Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
b04ca6e
0
Parent(s):
initial commit
Browse files- .gitattributes +35 -0
- README.md +12 -0
- app.py +240 -0
- requirements.txt +4 -0
- samples/audio1.mp3 +0 -0
- samples/audio2.wav +0 -0
- samples/audio3.wav +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Dhivehi Mms Zeroshot
|
3 |
+
emoji: 📈
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.42.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spaces
|
2 |
+
import gradio as gr
|
3 |
+
import librosa
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
from torchaudio.models.decoder import ctc_decoder
|
9 |
+
# https://github.com/facebookresearch/fairseq/tree/main/examples/mms/zero_shot
|
10 |
+
|
11 |
+
# Sampling rate (Hz) every input is converted to before it is handed to the
# processor.
ASR_SAMPLING_RATE = 16_000

# Beam-search defaults used by transcribe() when the caller asks for default
# scores. The word score depends on whether a language model is in use.
WORD_SCORE_DEFAULT_IF_LM = -0.18
WORD_SCORE_DEFAULT_IF_NOLM = -3.5
LM_SCORE_DEFAULT = 1.48

# Hub repository that provides the processor, the CTC model and tokens.txt.
MODEL_ID = "mms-meta/mms-zeroshot-300m"

# Loaded once at import time and shared by every transcribe() call.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Decoder resources; hf_hub_download returns the local path of each file.
token_file = hf_hub_download(repo_id=MODEL_ID, filename="tokens.txt")

lm5gram = hf_hub_download(
    filename="language_model/5gram.bin",
    repo_id="alakxender/w2v-bert-2.0-dhivehi-syn",
)

lexicon_file = hf_hub_download(
    repo_id="alakxender/dv-domain-lexicons",
    repo_type="dataset",
    filename="dv.domain.news.small.v1.lexicon",
)
|
37 |
+
|
38 |
+
def _load_audio_samples(audio_data):
    """Return 1-D audio samples at ``ASR_SAMPLING_RATE`` for a Gradio audio value.

    Args:
        audio_data: Either a ``(sample_rate, samples)`` tuple or a path to an
            audio file.

    Returns:
        A one-dimensional array of samples at ``ASR_SAMPLING_RATE``.

    Raises:
        TypeError: If ``audio_data`` is neither a tuple nor a string path.
    """
    if isinstance(audio_data, tuple):
        sr, audio_samples = audio_data
        # Same normalisation as before: samples are treated as 16-bit PCM.
        audio_samples = (audio_samples / 32768.0).astype(float)

        # Multi-channel input arrives as (samples, channels). Downmix first,
        # otherwise the resampler would run along the channel axis and the
        # processor would receive a 2-D array.
        if audio_samples.ndim > 1:
            audio_samples = audio_samples.mean(axis=1)

        if sr != ASR_SAMPLING_RATE:
            audio_samples = librosa.resample(
                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
            )
        return audio_samples

    if isinstance(audio_data, str):
        # File path: librosa decodes, downmixes and resamples in one step.
        return librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]

    raise TypeError(f"Unsupported audio input type: {type(audio_data).__name__}")


# NOTE(review): presumably requests GPU hardware for the call on a ZeroGPU
# Space -- confirm against the `spaces` package documentation.
@spaces.GPU
def transcribe(
    audio_data,
    wscore=None,
    lmscore=None,
    wscore_usedefault=True,
    lmscore_usedefault=True,
    uselm=True,
    reference=None,
):
    """Transcribe audio with the CTC model and a lexicon-constrained beam search.

    This is a generator: it yields exactly one string, either the
    transcription or a message starting with ``"ERROR:"``.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple or a path to an audio file.
        wscore: Word insertion score for the decoder. Ignored when
            ``wscore_usedefault`` is true.
        lmscore: Language-model weight for the decoder. Ignored when
            ``lmscore_usedefault`` is true.
        wscore_usedefault: Use the module default word score.
        lmscore_usedefault: Use the module default language-model weight.
        uselm: Decode with the downloaded 5-gram language model.
        reference: Reference transcript supplied by the UI; not used here.
    """
    if not audio_data:
        yield "ERROR: Empty audio data"
        return

    try:
        audio_samples = _load_audio_samples(audio_data)
    except TypeError as err:
        # Report bad input the same way as the empty-audio case instead of
        # relying on an assert, which is stripped under ``python -O``.
        yield f"ERROR: {err}"
        return

    if len(audio_samples) == 0:
        yield "ERROR: Empty audio data"
        return

    inputs = processor(
        audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
    )

    # Run on the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs).logits

    # Language model selection; a blank path is treated as "no LM".
    lm_path = lm5gram if uselm else None
    if lm_path is not None and not lm_path.strip():
        lm_path = None

    # Fall back to the defaults when they are requested, and also when no
    # explicit value was supplied, so the decoder never receives ``None``.
    if wscore_usedefault or wscore is None:
        wscore = (
            WORD_SCORE_DEFAULT_IF_LM
            if lm_path is not None
            else WORD_SCORE_DEFAULT_IF_NOLM
        )
    if lmscore_usedefault or lmscore is None:
        lmscore = LM_SCORE_DEFAULT if lm_path is not None else 0

    beam_search_decoder = ctc_decoder(
        lexicon=lexicon_file,
        tokens=token_file,
        lm=lm_path,
        nbest=1,
        beam_size=500,
        beam_size_token=50,
        lm_weight=lmscore,
        word_score=wscore,
        sil_score=0,
        blank_token="<s>",
    )

    # The decoder is called with the logits moved to the CPU.
    beam_search_result = beam_search_decoder(outputs.to("cpu"))

    # One utterance in, so only the first entry matters; guard against the
    # decoder returning no hypothesis for it.
    hypotheses = beam_search_result[0] if beam_search_result else []
    transcription = " ".join(hypotheses[0].words).strip() if hypotheses else ""

    yield transcription
|
119 |
+
|
120 |
+
styles = """
|
121 |
+
.thaana textarea {
|
122 |
+
font-size: 18px !important;
|
123 |
+
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
|
124 |
+
line-height: 1.8 !important;
|
125 |
+
}
|
126 |
+
.textbox2 textarea {
|
127 |
+
display: none;
|
128 |
+
}
|
129 |
+
"""
|
130 |
+
|
131 |
+
# Gradio UI: audio input and decoding settings on the left, transcript on the
# right, followed by the click handler and the bundled example clips.
with gr.Blocks(css=styles) as demo:
    gr.Markdown("# <center> Transcribe Dhivehi Audio with MMS-ZEROSHOT</center>")
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(
                label="Audio Input\n(use microphone or upload a file)",
                min_length=1,
                max_length=60,
            )

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row():
                    with gr.Column():
                        wscore_usedefault = gr.Checkbox(
                            label="Use Default Word Insertion Score", value=True
                        )
                        wscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=WORD_SCORE_DEFAULT_IF_LM,
                            step=0.1,
                            interactive=False,
                            label="Word Insertion Score",
                        )

                    with gr.Column():
                        lmscore_usedefault = gr.Checkbox(
                            label="Use Default Language Model Score", value=True
                        )
                        lmscore = gr.Slider(
                            minimum=-10.0,
                            maximum=10.0,
                            value=LM_SCORE_DEFAULT,
                            step=0.1,
                            interactive=False,
                            label="Language Model Score",
                        )
                    with gr.Column():
                        uselm = gr.Checkbox(
                            label="Use LM",
                            value=True,
                        )
            btn = gr.Button("Submit", elem_id="submit")

            # No explicit triggers are given, so the callback is driven by
            # the listed inputs (the three checkboxes).
            @gr.on(
                inputs=[wscore_usedefault, lmscore_usedefault, uselm],
                outputs=[wscore, lmscore],
            )
            def update_slider(ws, ls, lm, alm=False):
                """Rebuild both score sliders from the current checkbox state.

                Args:
                    ws: "Use Default Word Insertion Score" checkbox value.
                    ls: "Use Default Language Model Score" checkbox value.
                    lm: "Use LM" checkbox value.
                    alm: Optional extra "LM in use" flag. Only three inputs
                        are wired, so it defaults to False; it is kept so
                        four-argument calls keep working.

                Returns:
                    The word-insertion-score slider and the
                    language-model-score slider, in the order of ``outputs``.
                """
                # The checkbox delivers a boolean, so test truthiness; an
                # ``is not None`` test would always report the LM as enabled.
                lm_enabled = bool(lm) or bool(alm)

                # Each slider shows the default transcribe() would use for the
                # current LM choice and is editable only when its
                # "use default" box is unticked.
                ws_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=(
                        WORD_SCORE_DEFAULT_IF_LM
                        if lm_enabled
                        else WORD_SCORE_DEFAULT_IF_NOLM
                    ),
                    step=0.1,
                    interactive=not ws,
                    label="Word Insertion Score",
                )
                ls_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=LM_SCORE_DEFAULT if lm_enabled else 0,
                    step=0.1,
                    interactive=not ls,
                    label="Language Model Score",
                )
                return ws_slider, ls_slider

        with gr.Column():
            text = gr.Textbox(label="Transcript", rtl=True, elem_classes="thaana")

            # Hidden field; receives the reference text of a selected example.
            reference = gr.Textbox(label="Reference Transcript", visible=False)

    btn.click(
        transcribe,
        inputs=[
            audio,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
            uselm,
            reference,
        ],
        outputs=[text],
    )

    # Examples
    gr.Examples(
        examples=[
            [
                "samples/audio1.mp3",
                "އަޅުގަނޑުވެސް ދާކަށް ބޭނުމެއްނުވި",
            ],
            [
                "samples/audio2.wav",
                "ރަނގަޅަށްވިއްޔާ އެވާނީ މުސްކުޅި ކުރެހުމަކަށް",
            ],
            [
                "samples/audio3.wav",
                "އެއީ ޞަހްޔޫނީންގެ ޒަމާންވީ ރޭވުމެއްގެ ދަށުން މެދުނުކެނޑި ކުރިއަށްވާ ޕްރޮގްރާމެއް",
            ],
        ],
        inputs=[audio, reference],
        label="Dhivehi Audio Samples",
    )

demo.launch(show_api=False)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
flashlight-text
|
3 |
+
librosa
|
4 |
+
torchaudio
|
samples/audio1.mp3
ADDED
Binary file (21.9 kB). View file
|
|
samples/audio2.wav
ADDED
Binary file (194 kB). View file
|
|
samples/audio3.wav
ADDED
Binary file (267 kB). View file
|
|