Spaces:

rayl-aoit
/

translate_text_and_speech

Running

App Files Files Community

rayl-aoit committed on Jul 8

Commit

1180d04

•

1 Parent(s): 79b79bf

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -22

app.py CHANGED Viewed

@@ -23,10 +23,8 @@ decode_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decode_cfg)
 # load TTS model
-tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
-tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
-tts_fra_model = VitsModel.from_pretrained("facebook/mms-tts-fra")
-tts_fra_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-fra")
 # Function to convert audio to text using ASR
 def gen_text(audio_filepath, action):
@@ -56,38 +54,49 @@ def gen_text(audio_filepath, action):
         manifest_filepath = os.path.join(tmpdir, f"{utt_id}.json")
         with open(manifest_filepath, 'w') as fout:
             fout.write(json.dumps(manifest_data))
-        if duration < 40:
-            predicted_text = canary_model.transcribe(manifest_filepath)[0]
-        else:
-            predicted_text = get_buffered_pred_feat_multitaskAED(
-                frame_asr,
-                canary_model.cfg.preprocessor,
-                model_stride_in_secs,
-                canary_model.device,
-                manifest=manifest_filepath,
-            )[0].text
     return predicted_text
 # Function to convert text to speech using TTS
-def gen_speech(text):
     set_seed(555)  # Make it deterministic
-    input_text = tts_fra_tokenizer(text, return_tensors="pt")
     with torch.no_grad():
-        outputs = tts_fra_model(**input_text)
     waveform_np = outputs.waveform[0].cpu().numpy()
     output_file = f"{str(uuid.uuid4())}.wav"
     wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
     return output_file
 # Root function for Gradio interface
-def start_process(audio_filepath):
     transcription = gen_text(audio_filepath, "asr")
     print("Done transcribing")
     translation = gen_text(audio_filepath, "s2t_translation")
     print("Done translation")
-    audio_output_filepath = gen_speech(transcription)
     print("Done speaking")
     return transcription, translation, audio_output_filepath
@@ -123,7 +132,7 @@ with playground:
         with gr.Column():
             submit_button = gr.Button(value="Start Process", variant="primary")
         with gr.Column():
-            clear_button = gr.ClearButton(components=[input_audio, transcipted_text, translated_speech, translated_text, source_lang, target_lang], value="Clear")
     # with gr.Row():
     #     gr.Examples(
@@ -133,6 +142,6 @@ with playground:
     #         run_on_click=True, cache_examples=True, fn=start_process
     #     )
-    submit_button.click(start_process, inputs=[input_audio], outputs=[transcipted_text, translated_text, translated_speech])
 playground.launch()

 canary_model.change_decoding_strategy(decode_cfg)
 # load TTS model
+# tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+# tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 # Function to convert audio to text using ASR
 def gen_text(audio_filepath, action):
         manifest_filepath = os.path.join(tmpdir, f"{utt_id}.json")
         with open(manifest_filepath, 'w') as fout:
             fout.write(json.dumps(manifest_data))
+        predicted_text = canary_model.transcribe(manifest_filepath)[0]
+        # if duration < 40:
+        #     predicted_text = canary_model.transcribe(manifest_filepath)[0]
+        # else:
+        #     predicted_text = get_buffered_pred_feat_multitaskAED(
+        #         frame_asr,
+        #         canary_model.cfg.preprocessor,
+        #         model_stride_in_secs,
+        #         canary_model.device,
+        #         manifest=manifest_filepath,
+        #     )[0].text
     return predicted_text
 # Function to convert text to speech using TTS
# Maps the language codes accepted by gen_speech to their MMS-TTS checkpoints.
_TTS_CHECKPOINTS = {
    "en": "facebook/mms-tts-eng",
    "fr": "facebook/mms-tts-fra",
}

# Loaded (model, tokenizer) pairs keyed by checkpoint name, filled lazily so
# each checkpoint is fetched/instantiated at most once per process.
_TTS_CACHE = {}


def gen_speech(text, lang):
    """Synthesize *text* to a WAV file with the TTS model for *lang*.

    Args:
        text: The text to speak.
        lang: Language code selecting the checkpoint; "en" or "fr".

    Returns:
        The path of the newly written ``<uuid4>.wav`` file (relative to the
        current working directory).

    Raises:
        ValueError: If *lang* is not one of the supported language codes.
    """
    # Validate first: an unknown code (e.g. None after the inputs are cleared)
    # previously fell through both branches and crashed on an unbound name.
    checkpoint = _TTS_CHECKPOINTS.get(lang)
    if checkpoint is None:
        supported = ", ".join(sorted(_TTS_CHECKPOINTS))
        raise ValueError(
            f"Unsupported TTS language {lang!r}; expected one of: {supported}"
        )

    # Reuse the model/tokenizer across requests instead of reloading per call.
    if checkpoint not in _TTS_CACHE:
        _TTS_CACHE[checkpoint] = (
            VitsModel.from_pretrained(checkpoint),
            AutoTokenizer.from_pretrained(checkpoint),
        )
    tts_model, tts_tokenizer = _TTS_CACHE[checkpoint]

    # Seed after loading so the RNG state at inference is the same whether the
    # model was just loaded or served from the cache.
    set_seed(555)  # Make it deterministic
    input_text = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = tts_model(**input_text)
    waveform_np = outputs.waveform[0].cpu().numpy()

    output_file = f"{str(uuid.uuid4())}.wav"
    wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
    return output_file
 # Root function for Gradio interface
def start_process(audio_filepath, source_lang, target_lang):
    """Transcribe, translate, and speak the translation of an audio file.

    Args:
        audio_filepath: Path of the input audio, passed to ``gen_text``.
        source_lang: Language selected for the input audio.
            NOTE(review): currently unused here; ``gen_text`` is called
            without any language argument. Confirm how the source/target
            languages are meant to reach the ASR/translation step.
        target_lang: Language code handed to ``gen_speech`` to pick the
            TTS voice.

    Returns:
        A ``(transcription, translation, audio_output_filepath)`` tuple, in
        the order the Gradio click handler maps onto its
        transcribed-text, translated-text and translated-speech outputs.
    """
    transcription = gen_text(audio_filepath, "asr")
    print("Done transcribing")
    translation = gen_text(audio_filepath, "s2t_translation")
    print("Done translation")
    # Speak the translated text: the audio feeds the "translated speech"
    # output and is voiced with the target-language model, so feeding it the
    # source-language transcription would mismatch text and voice.
    audio_output_filepath = gen_speech(translation, target_lang)
    print("Done speaking")
    return transcription, translation, audio_output_filepath
         with gr.Column():
             submit_button = gr.Button(value="Start Process", variant="primary")
         with gr.Column():
+            clear_button = gr.ClearButton(components=[input_audio, source_lang, target_lang, transcipted_text, translated_text, translated_speech], value="Clear")
     # with gr.Row():
     #     gr.Examples(
     #         run_on_click=True, cache_examples=True, fn=start_process
     #     )
+    submit_button.click(start_process, inputs=[input_audio, source_lang, target_lang], outputs=[transcipted_text, translated_text, translated_speech])
 playground.launch()