"""Gradio demo: English speech-to-text with NVIDIA NeMo QuartzNet15x5.

Offers two tabs, one for an uploaded audio file and one for a microphone
recording. Both send the audio file path to the same pretrained CTC model.
"""
import os
import subprocess
import sys

# Install NeMo at start-up, before it is imported below. Using the running
# interpreter's own pip (and no shell) makes sure the package lands in the
# environment that performs the import. The exit status is deliberately
# ignored, as before: if the install fails, the import reports the problem.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "nemo_toolkit[all]"],
    check=False,
)

import gradio as gr
import nemo.collections.asr as nemo_asr

# Loaded once at import time and shared by both transcription callbacks.
model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)


def _transcribe(audio_path):
    """Return the transcript of the audio file at *audio_path* as a string.

    Args:
        audio_path: Path to the audio file as handed over by the
            ``gr.Audio(type="filepath")`` component, or ``None``/empty when
            the user has not provided any audio.

    Returns:
        The transcript text, or an empty string when there is no input or
        the model returns no result.
    """
    if not audio_path:
        # Nothing uploaded/recorded: avoid passing the bogus path "None".
        return ""

    results = model.transcribe([f"{audio_path}"])
    if not results:
        return ""

    # One file in, so only the first result matters.
    # NOTE(review): older NeMo releases return plain strings, newer ones may
    # return objects exposing ``.text`` - confirm against the installed version.
    first = results[0]
    return getattr(first, "text", first)


def speech_file(x):
    """Transcribe an uploaded audio file; *x* is its file path."""
    return _transcribe(x)


def speech_record(x):
    """Transcribe a microphone recording; *x* is its file path."""
    return _transcribe(x)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## Speech to Text - NVIDIA QuartzNet15x5 (English)
        """
    )

    with gr.Tab("Audio File"):
        with gr.Row().style(equal_height=True):
            audio_input2 = gr.Audio(label="Audio File", type="filepath")
            text_output2 = gr.Textbox(label="Transcription", show_label=False)
        file_button = gr.Button("Transcribe")

    with gr.Tab("Record"):
        with gr.Row().style(equal_height=True):
            audio_input3 = gr.Audio(
                label="Input Audio", source="microphone", type="filepath"
            )
            text_output3 = gr.Textbox(label="Transcription", show_label=False)
        rec_button = gr.Button("Transcribe")

    # Event wiring has to happen inside the Blocks context.
    file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
    rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)

demo.launch()