Spaces:

smajumdar
/

nemo_conformer_rnnt_large

Sleeping

App Files Files Community

smajumdar committed on Mar 27, 2022

Commit

75e1446

•

1 Parent(s): 729499b

Add nemo inference code

Browse files

Files changed (4) hide show

README.md +1 -2
app.py +71 -0
packages.txt +2 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,10 +1,9 @@
 ---
 title: Nemo_conformer_rnnt_large
-emoji: 📉
 colorFrom: green
 colorTo: red
 sdk: gradio
-sdk_version: 2.8.14
 app_file: app.py
 pinned: false
 license: apache-2.0

 ---
 title: Nemo_conformer_rnnt_large
+emoji: 🐠
 colorFrom: green
 colorTo: red
 sdk: gradio
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import gradio as gr
+import torch
+import librosa
+import soundfile
+import nemo.collections.asr as nemo_asr
+import tempfile
+import os
+import uuid
+SAMPLE_RATE = 16000  # sample rate (Hz) used when resampling audio and when writing the temporary WAV file
+model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("stt_en_conformer_transducer_large")  # loaded once at import time and shared by every transcribe() call
+model.change_decoding_strategy(None)  # NOTE(review): presumably resets decoding to the model's default configuration - confirm against the NeMo docs
+model.eval()  # put the model in evaluation mode before serving requests
+def process_audio_file(file):
+    data, sr = librosa.load(file)
+    if sr != SAMPLE_RATE:
+        data = librosa.resample(data, sr, SAMPLE_RATE)
+    # monochannel
+    data = librosa.to_mono(data)
+    return data
+def transcribe(file_mic, file_upload):
+    warn_output = ""
+    if (file_mic is not None) and (file_upload is not None):
+        warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
+                      "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        file = file_mic
+    elif (file_mic is None) and (file_upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
+    elif file_mic is not None:
+        file = file_mic
+    else:
+        file = file_upload
+    audio_data = process_audio_file(file)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
+        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
+        transcriptions = model.transcribe([audio_path])
+        # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
+        if type(transcriptions) == tuple and len(transcriptions) == 2:
+            transcriptions = transcriptions[0]
+    return warn_output + transcriptions[0]
+iface = gr.Interface(
+    fn=transcribe,  # called with one value per entry in `inputs`, in order
+    inputs=[
+        gr.inputs.Audio(source="microphone", type='filepath', optional=True),  # becomes transcribe's file_mic argument
+        gr.inputs.Audio(source="upload", type='filepath', optional=True),  # becomes transcribe's file_upload argument
+    ],
+    outputs="text",  # transcribe returns a single string
+    layout="horizontal",
+    theme="huggingface",
+    title="NeMo Conformer Transducer Large",
+    description="Demo for speech recognition using Conformers",
+    enable_queue=True,
+    allow_flagging=False,
+)
+iface.launch()  # runs at import time; there is no __main__ guard in this script

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ libsndfile1

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ nemo_toolkit[asr]