Amir Zait committed on
Commit
f7c2e78
1 Parent(s): 0fbdf8e

added files

Browse files
Files changed (3) hide show
  1. app.py +75 -0
  2. packages.txt +3 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoProcessor, AutoModelForCTC
2
+ from transformers import pipeline
3
+
4
+ import soundfile as sf
5
+ import gradio as gr
6
+ import librosa
7
+ import torch
8
+ import sox
9
+ import os
10
+
11
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read from the environment; None when API_TOKEN is not set.
api_token = os.getenv("API_TOKEN")

# Single checkpoint id shared by the processor and the CTC model so the two
# can never drift apart.
ASR_CHECKPOINT = "imvladikon/wav2vec2-xls-r-300m-hebrew"
asr_processor = AutoProcessor.from_pretrained(ASR_CHECKPOINT)
asr_model = AutoModelForCTC.from_pretrained(ASR_CHECKPOINT)

# transformers only ships default translation models for en->fr/de/ro, so the
# "translation_en_to_he" task raises ValueError unless a model is given
# explicitly.
EN_HE_CHECKPOINT = "Helsinki-NLP/opus-mt-en-he"
en_he_translator = pipeline("translation_en_to_he", model=EN_HE_CHECKPOINT)
18
+
19
+
20
def process_audio_file(file, sampling_rate=16_000):
    """Load an audio file and turn it into model-ready input values.

    Args:
        file: Anything ``librosa.load`` accepts (the original code passes the
            object received from the Gradio audio input straight through).
        sampling_rate: Rate, in Hz, the audio is resampled to while loading
            and that is reported to the processor. Defaults to 16 kHz.

    Returns:
        The ``input_values`` tensor produced by ``asr_processor`` with
        ``return_tensors="pt"``.
    """
    # Let librosa resample once while loading instead of loading at its
    # default rate and resampling a second time; this also avoids the
    # positional ``librosa.resample`` call that newer librosa releases reject.
    data, _ = librosa.load(file, sr=sampling_rate)

    # The module defines ``asr_processor``; the bare name ``processor`` used
    # previously does not exist and raised NameError.
    features = asr_processor(data, sampling_rate=sampling_rate, return_tensors="pt")
    return features.input_values
27
+
28
def transcribe(file_mic, file_upload):
    """Transcribe audio coming from the microphone or from an upload.

    Args:
        file_mic: Audio recorded with the microphone, or ``None``.
        file_upload: Uploaded audio file, or ``None``.

    Returns:
        The transcription as a string. When both inputs are supplied the
        microphone recording is used and the text is prefixed with a warning;
        when neither is supplied an error message is returned instead.
    """
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"

    # The microphone recording always wins when it is present.
    file = file_mic if file_mic is not None else file_upload

    input_values = process_audio_file(file)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # ``asr_model`` / ``asr_processor`` are the names defined at module
        # level; the previous ``model`` / ``processor`` were undefined.
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)
    return warn_output + transcription
45
+
46
def convert(inputfile, outfile):
    """Re-encode ``inputfile`` with sox and write the result to ``outfile``.

    The output format is fixed: WAV container, one channel, 16-bit
    signed-integer samples at 16000 Hz.
    """
    output_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**output_format)
    transformer.build(inputfile, outfile)
52
+
53
def parse_transcription(wav_file):
    """Convert a recorded audio file to 16 kHz WAV and transcribe it.

    Args:
        wav_file: File object exposing a ``name`` attribute that holds the
            path of the recorded audio.

    Returns:
        The decoded transcription string.
    """
    # splitext only strips the final extension, so a dot inside a directory
    # name no longer truncates the path the way ``split('.')[0]`` did.
    stem, _ = os.path.splitext(wav_file.name)
    converted_path = stem + "16k.wav"
    convert(wav_file.name, converted_path)
    speech, _ = sf.read(converted_path)

    # Use the module-level ``asr_processor`` / ``asr_model``; the previous
    # ``trans_processor`` / ``trans_model`` names were never defined.
    input_values = asr_processor(
        speech, sampling_rate=16_000, return_tensors="pt"
    ).input_values
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0], skip_special_tokens=True)

    # NOTE(review): the earlier en->he translation call referenced a
    # misspelled variable and its result was never returned, so it is left
    # out here; re-add it and return its output if translation is intended.
    return transcription
64
+
65
# Widgets for the demo: one text output and two optional audio inputs
# (only the microphone input is wired into the interface below).
output = gr.outputs.Textbox(label="TEXT")
input_mic = gr.inputs.Audio(source="microphone", type="file", optional=True)
input_upload = gr.inputs.Audio(source="upload", type="file", optional=True)

# Presentation and runtime options for the interface, gathered in one place.
interface_options = {
    "analytics_enabled": False,
    "show_tips": False,
    "theme": 'huggingface',
    "layout": 'horizontal',
    "title": "Draw Me A Ship in Hebrew",
    "enable_queue": True,
}

# Build the interface around parse_transcription and start serving it.
gr.Interface(
    parse_transcription,
    inputs=[input_mic],
    outputs=output,
    **interface_options,
).launch(inline=False)
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ libsndfile1
2
+ sox
3
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ librosa
3
+ soundfile
4
+ torch
5
+ transformers
6
+ sox
7
+ sentencepiece