Files changed (1) hide show
  1. app.py +0 -71
app.py CHANGED
@@ -1,71 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import librosa
4
- import soundfile
5
- import nemo.collections.asr as nemo_asr
6
- import tempfile
7
- import os
8
- import uuid
9
-
10
# Sampling rate (Hz) used for the audio that is written out and transcribed.
SAMPLE_RATE = 16000

# Load the pretrained English Conformer-Transducer checkpoint once, at import
# time, so that every request reuses the same model instance.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "stt_en_conformer_transducer_large"
)
# NOTE(review): passing None presumably selects the model's default decoding
# configuration -- confirm against the NeMo version in use.
model.change_decoding_strategy(None)
# Inference only: switch the network to evaluation mode.
model.eval()
15
-
16
-
17
def process_audio_file(file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    The target rate is handed straight to ``librosa.load`` so the audio is
    resampled at most once (from its native rate to ``SAMPLE_RATE``) instead
    of first being converted to librosa's default rate and then resampled a
    second time. This also avoids calling ``librosa.resample`` with
    positional sample-rate arguments, which newer librosa releases reject.

    Args:
        file: Path (or other source accepted by ``librosa.load``) of the
            audio to read.

    Returns:
        The waveform returned by ``librosa.load``: mono audio samples at
        ``SAMPLE_RATE`` Hz.
    """
    # mono=True down-mixes multi-channel input; sr=SAMPLE_RATE resamples.
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data
26
-
27
-
28
def transcribe(Microphone, File_Upload):
    """Transcribe speech from a microphone recording or an uploaded file.

    If both inputs are supplied, the microphone recording is used and a
    warning is prepended to the returned text. If neither is supplied, an
    error message is returned instead of a transcription.

    Args:
        Microphone: File path of the microphone recording, or ``None``.
        File_Upload: File path of the uploaded audio, or ``None``.

    Returns:
        A string: the (optional) warning followed by the first transcription
        produced by the model, or an error message when no audio was given.
    """
    # Guard clause: nothing to transcribe.
    if Microphone is None and File_Upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if Microphone is not None and File_Upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence whenever it is present.
    file = Microphone if Microphone is not None else File_Upload
    audio_data = process_audio_file(file)

    # The model is given a path, so the processed audio is written to a
    # temporary WAV file that is removed as soon as transcription finishes.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]
56
-
57
-
58
# Web UI: two optional audio inputs (microphone recording and file upload),
# both delivered to `transcribe` as file paths, and one plain-text output.
iface = gr.Interface(
    title="NeMo Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    allow_flagging='never',
)

# Start serving the demo with request queueing enabled.
iface.launch(enable_queue=True)