vumichien committed on
Commit
4671727
1 Parent(s): 947682e

Create new file

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+ import torch
6
+ import librosa
7
+
8
# Load the speech-recognition pair: `processor` turns raw audio into model
# inputs and decodes predicted ids back to text; `model` produces CTC logits.
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Load the translation pair used by `translate`.
# The original referenced `ORTModelForSeq2SeqLM` without importing it, which
# raised NameError as soon as the module was loaded. Import it here, right
# where it is needed. If Optimum is not installed, fall back to the
# `AutoModelForSeq2SeqLM` class this file already imports.
# NOTE(review): the fallback only works if the checkpoint also ships PyTorch
# weights - confirm, or make `optimum[onnxruntime]` a hard requirement.
try:
    from optimum.onnxruntime import ORTModelForSeq2SeqLM as _Seq2SeqModel
except ImportError:
    _Seq2SeqModel = AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("icon-it-tdtu/mt-en-vi-optimum")
model_lm = _Seq2SeqModel.from_pretrained("icon-it-tdtu/mt-en-vi-optimum")
14
+
15
def process_audio_file(file):
    """Load an audio file and convert it into inputs for the wav2vec2 model.

    Args:
        file: Path to the audio file (anything ``librosa.load`` accepts).

    Returns:
        The result of calling the module-level ``processor`` on the 16 kHz
        waveform with ``return_tensors="pt"`` and ``padding=True``.
    """
    target_sr = 16000
    # Ask librosa to resample while loading. The original loaded at librosa's
    # default rate and then called `librosa.resample(data, sr, 16000)`:
    # that resampled the audio twice, and librosa >= 0.10 rejects the
    # positional rate arguments (they are keyword-only there).
    data, _ = librosa.load(file, sr=target_sr)
    return processor(data, sampling_rate=target_sr, return_tensors="pt", padding=True)
21
+
22
+
23
def transcribe(file, state=""):
    """Transcribe one audio chunk, translate it, and extend the transcript.

    Args:
        file: Path to the recorded audio chunk. ``None`` means "no audio" and
            leaves the transcript unchanged.
        state: Text accumulated by previous calls. ``None`` is treated as an
            empty transcript.

    Returns:
        A ``(transcript, transcript)`` tuple: the same string twice, once for
        the visible textbox output and once for the "state" output.
    """
    # Normalise a missing state so the string concatenation below cannot fail.
    state = state or ""
    # Nothing to transcribe: return the transcript untouched instead of
    # crashing while trying to load a non-existent file.
    if file is None:
        return state, state

    inputs = process_audio_file(file)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy decoding: take the highest-scoring token id at every position.
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(pred_ids)[0].lower()
    print(text)  # log the recognised source text before translating it
    state += translate(text) + " "
    return state, state
33
+
34
+
35
def translate(text):
    """Translate ``text`` with the module-level ``tokenizer`` and ``model_lm``.

    Args:
        text: The source sentence, translated as a batch of one.
            (Presumably English, judging by the checkpoint name - confirm.)

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    encoded = tokenizer([text], return_tensors="pt")
    output_ids = model_lm.generate(**encoded)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
40
+
41
+
42
# Build and launch the live demo. Microphone audio is streamed to `transcribe`
# as a file path, and the "state" input/output pair carries the running
# transcript from one call to the next. `transcribe` declares `state=""` as
# its default.
microphone = gr.Audio(source="microphone", type="filepath", streaming=True)
demo = gr.Interface(
    fn=transcribe,
    inputs=[microphone, "state"],
    outputs=["textbox", "state"],
    live=True,
)
demo.launch(debug=True)