Spaces:

k2-fsa
/

automatic-speech-recognition

Running

App Files Community

csukuangfj committed on Apr 13, 2024

Commit

09ae8c3

1 Parent(s): 1420134

add punctuations

Browse files

Files changed (3) hide show

app.py +28 -1
model.py +15 -0
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -32,7 +32,13 @@ import torch
 import torchaudio
 from examples import examples
-from model import decode, get_pretrained_model, language_to_models, sample_rate
 languages = list(language_to_models.keys())
@@ -65,6 +71,7 @@ def process_url(
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
     url: str,
 ):
     logging.info(f"Processing URL: {url}")
@@ -78,6 +85,7 @@ def process_url(
                 repo_id=repo_id,
                 decoding_method=decoding_method,
                 num_active_paths=num_active_paths,
             )
         except Exception as e:
             logging.info(str(e))
@@ -89,6 +97,7 @@ def process_uploaded_file(
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
     in_filename: str,
 ):
     if in_filename is None or in_filename == "":
@@ -106,6 +115,7 @@ def process_uploaded_file(
             repo_id=repo_id,
             decoding_method=decoding_method,
             num_active_paths=num_active_paths,
         )
     except Exception as e:
         logging.info(str(e))
@@ -117,6 +127,7 @@ def process_microphone(
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
     in_filename: str,
 ):
     if in_filename is None or in_filename == "":
@@ -135,6 +146,7 @@ def process_microphone(
             repo_id=repo_id,
             decoding_method=decoding_method,
             num_active_paths=num_active_paths,
         )
     except Exception as e:
         logging.info(str(e))
@@ -147,6 +159,7 @@ def process(
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
     in_filename: str,
 ):
     logging.info(f"language: {language}")
@@ -170,6 +183,9 @@ def process(
     )
     text = decode(recognizer, filename)
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
@@ -277,6 +293,12 @@ with demo:
         label="Number of active paths for modified_beam_search",
     )
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
             uploaded_file = gr.Audio(
@@ -295,6 +317,7 @@ with demo:
                     model_dropdown,
                     decoding_method_radio,
                     num_active_paths_slider,
                     uploaded_file,
                 ],
                 outputs=[uploaded_output, uploaded_html_info],
@@ -319,6 +342,7 @@ with demo:
                     model_dropdown,
                     decoding_method_radio,
                     num_active_paths_slider,
                     microphone,
                 ],
                 outputs=[recorded_output, recorded_html_info],
@@ -344,6 +368,7 @@ with demo:
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
                 uploaded_file,
             ],
             outputs=[uploaded_output, uploaded_html_info],
@@ -356,6 +381,7 @@ with demo:
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
                 microphone,
             ],
             outputs=[recorded_output, recorded_html_info],
@@ -368,6 +394,7 @@ with demo:
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
                 url_textbox,
             ],
             outputs=[url_output, url_html_info],

 import torchaudio
 from examples import examples
+from model import (
+    decode,
+    get_pretrained_model,
+    get_punct_model,
+    language_to_models,
+    sample_rate,
+)
 languages = list(language_to_models.keys())
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
+    add_punct: str,
     url: str,
 ):
     logging.info(f"Processing URL: {url}")
                 repo_id=repo_id,
                 decoding_method=decoding_method,
                 num_active_paths=num_active_paths,
+                add_punct=add_punct,
             )
         except Exception as e:
             logging.info(str(e))
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
+    add_punct: str,
     in_filename: str,
 ):
     if in_filename is None or in_filename == "":
             repo_id=repo_id,
             decoding_method=decoding_method,
             num_active_paths=num_active_paths,
+            add_punct=add_punct,
         )
     except Exception as e:
         logging.info(str(e))
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
+    add_punct: str,
     in_filename: str,
 ):
     if in_filename is None or in_filename == "":
             repo_id=repo_id,
             decoding_method=decoding_method,
             num_active_paths=num_active_paths,
+            add_punct=add_punct,
         )
     except Exception as e:
         logging.info(str(e))
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
+    add_punct: str,
     in_filename: str,
 ):
     logging.info(f"language: {language}")
     )
     text = decode(recognizer, filename)
+    if add_punct == "Yes":
+        punct = get_punct_model()
+        text = punct.add_punctuation(text)
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
         label="Number of active paths for modified_beam_search",
     )
+    punct_radio = gr.Radio(
+        label="Whether to add punctuation (Only for Chinese and English)",
+        choices=["Yes", "No"],
+        value="Yes",
+    )
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
             uploaded_file = gr.Audio(
                     model_dropdown,
                     decoding_method_radio,
                     num_active_paths_slider,
+                    punct_radio,
                     uploaded_file,
                 ],
                 outputs=[uploaded_output, uploaded_html_info],
                     model_dropdown,
                     decoding_method_radio,
                     num_active_paths_slider,
+                    punct_radio,
                     microphone,
                 ],
                 outputs=[recorded_output, recorded_html_info],
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
+                punct_radio,
                 uploaded_file,
             ],
             outputs=[uploaded_output, uploaded_html_info],
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
+                punct_radio,
                 microphone,
             ],
             outputs=[recorded_output, recorded_html_info],
                 model_dropdown,
                 decoding_method_radio,
                 num_active_paths_slider,
+                punct_radio,
                 url_textbox,
             ],
             outputs=[url_output, url_html_info],

model.py CHANGED Viewed

@@ -1182,6 +1182,21 @@ def _get_aishell_pre_trained_model(
     return recognizer
 def _get_multi_zh_hans_pre_trained_model(
     repo_id: str,
     decoding_method: str,

     return recognizer
+@lru_cache(maxsize=2)
+def get_punct_model() -> sherpa_onnx.OfflinePunctuation:
+    model = _get_nn_model_filename(
+        repo_id="csukuangfj/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12",
+        filename="model.onnx",
+        subfolder=".",
+    )
+    config = sherpa_onnx.OfflinePunctuationConfig(
+        model=sherpa_onnx.OfflinePunctuationModelConfig(ct_transformer=model),
+    )
+    punct = sherpa_onnx.OfflinePunctuation(config)
+    return punct
 def _get_multi_zh_hans_pre_trained_model(
     repo_id: str,
     decoding_method: str,

requirements.txt CHANGED Viewed

@@ -9,4 +9,4 @@ sentencepiece>=0.1.96
 numpy
 huggingface_hub
-sherpa-onnx

 numpy
 huggingface_hub
+sherpa-onnx>=1.9.19