RikeshSilwal committed on
Commit
6308522
1 Parent(s): 1233998

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
4
+ from datasets import Audio, load_dataset
5
+
6
+ import torchaudio
7
+ from torchaudio.transforms import Resample
8
+
9
+
10
+ # load model and processor
11
+ processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
12
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
13
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="hindi", task="transcribe")
14
+
15
+
16
+
17
+ def transcribe_audio(audio_file):
18
+ input_arr, sampling_rate =torchaudio.load(audio_file)
19
+ input_arr = input_arr[0].numpy()
20
+
21
+ if sampling_rate != 16000:
22
+ resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
23
+ input_arr = resampler(input_arr).squeeze().numpy()
24
+ sampling_rate = 16000
25
+ input_features = processor(input_arr, sampling_rate=sampling_rate, return_tensors="pt").input_features
26
+
27
+ # generate token ids
28
+ predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
29
+ # decode token ids to text
30
+ transcription = processor.batch_decode(predicted_ids)
31
+
32
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
33
+ print(transcription)
34
+ return transcription[0]
35
+
36
+ audio_input = gr.inputs.Audio(source="upload", type="filepath")
37
+
38
+ iface = gr.Interface(fn=transcribe_audio, inputs=audio_input,
39
+ outputs=["textbox"], title="Speech To Text",
40
+ description="Upload an audio file and hit the 'Submit'\
41
+ button")
42
+
43
+ iface.launch(inline=False)
44
+