RikeshSilwal committed
Commit 5cacb9c
1 Parent(s): 6308522

Update app.py

Files changed (1)
  1. app.py  +25 -23
app.py CHANGED
@@ -1,37 +1,39 @@
import gradio as gr

- from transformers import WhisperProcessor, WhisperForConditionalGeneration
- from datasets import Audio, load_dataset
-
import torchaudio
from torchaudio.transforms import Resample


- # load model and processor
- processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
- model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
- forced_decoder_ids = processor.get_decoder_prompt_ids(language="hindi", task="transcribe")



def transcribe_audio(audio_file):
    input_arr, sampling_rate = torchaudio.load(audio_file)
-     input_arr = input_arr[0].numpy()
-
-     if sampling_rate != 16000:
-         resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
-         input_arr = resampler(input_arr).squeeze().numpy()
-         sampling_rate = 16000
-     input_features = processor(input_arr, sampling_rate=sampling_rate, return_tensors="pt").input_features
-
-     # generate token ids
-     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
-     # decode token ids to text
-     transcription = processor.batch_decode(predicted_ids)
-
-     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-     print(transcription)
-     return transcription[0]

audio_input = gr.inputs.Audio(source="upload", type="filepath")
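One detail of the removed version worth flagging: `input_arr` is converted to a NumPy array right after `torchaudio.load`, but `torchaudio.transforms.Resample` operates on `torch.Tensor`s, so the resampling branch would fail at runtime. A minimal sketch of the tensor-first ordering (the file path is a placeholder):

import torchaudio
from torchaudio.transforms import Resample

# torchaudio.load returns a (channels, frames) float tensor and the native rate
waveform, sampling_rate = torchaudio.load("sample.wav")  # placeholder path

if sampling_rate != 16000:
    # resample while the audio is still a tensor; Resample is an nn.Module
    waveform = Resample(orig_freq=sampling_rate, new_freq=16000)(waveform)
    sampling_rate = 16000

# convert to NumPy only after resampling, since the processor expects an array
input_arr = waveform.squeeze().numpy()

The updated version below replaces this path entirely.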
 
 
import gradio as gr

+ import torch
import torchaudio
+ from datasets import load_dataset
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+
+
+ processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
+ model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
+
from torchaudio.transforms import Resample
+ import numpy as np



def transcribe_audio(audio_file):
    input_arr, sampling_rate = torchaudio.load(audio_file)
+     resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
+     input_arr = resampler(input_arr).squeeze().numpy()
+     sampling_rate = 16000
+     inputs = processor(input_arr, sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     with torch.no_grad():
+         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+     predicted_ids = torch.argmax(logits, dim=-1)
+
+     predicted_words = processor.batch_decode(predicted_ids)
+
+     return predicted_words[0]


audio_input = gr.inputs.Audio(source="upload", type="filepath")
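The diff cuts off before the last line of the new file (line 39), so the interface construction itself is not visible. A minimal sketch of the usual wiring, assuming the standard Gradio Interface pattern around the names defined above:

# assumed continuation of app.py: connect the upload component to the function
interface = gr.Interface(
    fn=transcribe_audio,  # transcription function defined above
    inputs=audio_input,   # gr.inputs.Audio(source="upload", type="filepath")
    outputs="text",       # the decoded transcription string
)
interface.launch()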