NorHsangPha committed on
Commit
772adb0
1 Parent(s): b39be5b

Initial: initial commit

Files changed (7)
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. app.py +52 -0
  4. asr.py +74 -0
  5. requirements.txt +5 -0
  6. upload/sample1.wav +3 -0
  7. upload/sample2.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ upload/sample2.wav filter=lfs diff=lfs merge=lfs -text
+ upload/sample1.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
app.py ADDED
@@ -0,0 +1,52 @@
+ import gradio as gr
+ from asr import transcribe, ASR_EXAMPLES
+
+ mms_select_source_trans = gr.Radio(
+     ["Record from Mic", "Upload audio"],
+     label="Audio input",
+     value="Record from Mic",
+ )
+ mms_mic_source_trans = gr.Audio(
+     sources=["microphone"], type="filepath", label="Use mic"
+ )
+ mms_upload_source_trans = gr.Audio(
+     sources=["upload"], type="filepath", label="Upload file", visible=False
+ )
+
+ mms_transcribe = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Dropdown(
+             [
+                 "original",
+                 "finetune",
+             ],
+             label="Model",
+             value="finetune",
+         ),
+         mms_select_source_trans,
+         mms_mic_source_trans,
+         mms_upload_source_trans,
+     ],
+     outputs="text",
+     examples=ASR_EXAMPLES,
+     title="Auto Speech Recognition Demo",
+     description=(
+         "Transcribe audio from a microphone or input file in your desired language."
+     ),
+     allow_flagging="never",
+ )
+
+ with gr.Blocks() as demo:
+     mms_transcribe.render()
+     mms_select_source_trans.change(
+         lambda x: [
+             gr.update(visible=True if x == "Record from Mic" else False),
+             gr.update(visible=True if x == "Upload audio" else False),
+         ],
+         inputs=[mms_select_source_trans],
+         outputs=[mms_mic_source_trans, mms_upload_source_trans],
+         queue=False,
+     )
+
+ demo.launch()
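Note on the UI wiring above: the `.change()` handler returns one `gr.update` per output component to swap visibility between the mic and upload inputs. A minimal sketch of the same callback as a named function (behaviourally equivalent to the lambda in app.py, shown only for illustration; `toggle_audio_source` is a hypothetical name, not part of this commit):

```python
import gradio as gr

# Sketch of the visibility toggle used by mms_select_source_trans.change in app.py.
# Returns one gr.update per output component, in the same order as `outputs`.
def toggle_audio_source(choice: str):
    return [
        gr.update(visible=(choice == "Record from Mic")),  # mic input
        gr.update(visible=(choice == "Upload audio")),  # upload input
    ]
```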
asr.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import librosa
+ from transformers import Wav2Vec2ForCTC, AutoProcessor
+ import torch
+
+ ASR_SAMPLING_RATE = 16_000
+
+
+ def transcribe(model_name: str, audio_source=None, microphone=None, file_upload=None):
+     if type(microphone) is dict:
+         microphone = microphone["name"]
+
+     audio_fp = (
+         file_upload if "upload" in str(audio_source or "").lower() else microphone
+     )
+
+     if audio_fp is None:
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     audio_samples = librosa.load(audio_fp, sr=ASR_SAMPLING_RATE, mono=True)[0]
+
+     model_id = {
+         "original": "facebook/mms-1b-all",
+         "finetune": "NorHsangPha/wav2vec2-large-mms-1b-shan",
+     }[model_name]
+
+     auth_token = os.environ.get("TOKEN_READ_SECRET") or True
+
+     if model_name == "original":
+         model = Wav2Vec2ForCTC.from_pretrained(model_id)
+         processor = AutoProcessor.from_pretrained(model_id)
+         processor.tokenizer.set_target_lang("shn")
+         model.load_adapter("shn")
+     elif model_name == "finetune":
+         model = Wav2Vec2ForCTC.from_pretrained(
+             model_id, target_lang="shn", ignore_mismatched_sizes=True, token=auth_token
+         )
+         processor = AutoProcessor.from_pretrained(model_id, token=auth_token)
+     else:
+         return "ERROR: Wrong model name, or model not available please restart."
+
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+     elif (
+         hasattr(torch.backends, "mps")
+         and torch.backends.mps.is_available()
+         and torch.backends.mps.is_built()
+     ):
+         device = torch.device("mps")
+     else:
+         device = torch.device("cpu")
+
+     model.to(device)
+
+     inputs = processor(
+         audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
+     )
+     inputs = inputs.to(device)
+
+     with torch.no_grad():
+         outputs = model(**inputs).logits
+
+     ids = torch.argmax(outputs, dim=-1)[0]
+     transcription = processor.decode(ids)
+
+     return transcription
+
+
+ ASR_EXAMPLES = [
+     ["finetune", "Upload audio", None, "upload/sample1.wav"],
+     ["finetune", "Upload audio", None, "upload/sample2.wav"],
+     ["original", "Upload audio", None, "upload/sample1.wav"],
+     ["original", "Upload audio", None, "upload/sample2.wav"],
+ ]
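For quick local testing outside the Gradio UI, `transcribe()` can be called directly with the same argument order used in `ASR_EXAMPLES` (model name, audio source label, microphone path, uploaded file path). A minimal sketch, assuming the dependencies from requirements.txt are installed and the Git LFS samples have been pulled:

```python
# Sketch: run asr.transcribe on one of the bundled samples without launching app.py.
# Assumes `git lfs pull` has fetched upload/sample1.wav locally.
from asr import transcribe

text = transcribe("finetune", "Upload audio", None, "upload/sample1.wav")
print(text)
```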
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ librosa
+ transformers
+ torch
+ torchaudio
upload/sample1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00ba47c1cebd97baa03b7dd33716dd5049cf0328780447bb37fc3a0f74fe19da
+ size 2218566
upload/sample2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0dc22f6c9a97bf3cfb5025b3b68b1dc3814822ad4acfb04d7d914f9a86eadeb0
+ size 260808