csukuangfj committed
Commit 2e7292b · Parent: a73e88b

first commit

Files changed (3)
  1. README.md +7 -2
  2. app.py +297 -0
  3. model.py +114 -0
README.md CHANGED
@@ -4,10 +4,15 @@ emoji: 📈
  colorFrom: blue
  colorTo: purple
  sdk: gradio
- sdk_version: 4.26.0
+ sdk_version: 4.14.0
+ python_version: 3.8.9
  app_file: app.py
  pinned: false
  license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Please see
+
+ https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html
+
+ for more information.
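For local experimentation along the lines of that page, here is a minimal sketch that mirrors the model setup in model.py from this commit (the repo and file names are the ones used below; the sherpa-onnx calls are the same ones model.py relies on):

```python
# Minimal local sketch, mirroring model.py from this commit.
import sherpa_onnx
from huggingface_hub import hf_hub_download

repo_id = "k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09"
model = hf_hub_download(repo_id, "model.int8.onnx")
labels = hf_hub_download(repo_id, "class_labels_indices.csv")

config = sherpa_onnx.AudioTaggingConfig(
    model=sherpa_onnx.AudioTaggingModelConfig(
        zipformer=sherpa_onnx.OfflineZipformerAudioTaggingModelConfig(model=model),
        num_threads=1,
        provider="cpu",
    ),
    labels=labels,
    top_k=5,
)
tagger = sherpa_onnx.AudioTagging(config)
```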
app.py ADDED
@@ -0,0 +1,297 @@
+ #!/usr/bin/env python3
+ #
+ # Copyright 2022-2024 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # References:
+ # https://gradio.app/docs/#dropdown
+
+ import logging
+ import os
+ import tempfile
+ import time
+ import urllib.request
+ from datetime import datetime
+
+ import gradio as gr
+ import soundfile as sf
+
+ from examples import examples
+ from model import decode, get_pretrained_model, models
+
+
+ def convert_to_wav(in_filename: str) -> str:
+     """Convert the input audio file to a wave file"""
+     out_filename = in_filename + ".wav"
+     logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+
+     # -ar 16000 -ac 1: resample to 16 kHz mono; -y: overwrite the output file
+     _ = os.system(
+         f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}' -y"
+     )
+
+     return out_filename
+
+
+ def build_html_output(s: str, style: str = "result_item_success"):
+     return f"""
+     <div class='result'>
+         <div class='result_item {style}'>
+         {s}
+         </div>
+     </div>
+     """
+
+
+ def process_url(
+     repo_id: str,
+     url: str,
+ ):
+     logging.info(f"Processing URL: {url}")
+     with tempfile.NamedTemporaryFile() as f:
+         try:
+             urllib.request.urlretrieve(url, f.name)
+
+             return process(
+                 in_filename=f.name,
+                 repo_id=repo_id,
+             )
+         except Exception as e:
+             logging.info(str(e))
+             return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_uploaded_file(
+     repo_id: str,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first upload a file and then click "
+             'the button "Submit for recognition"',
+             "result_item_error",
+         )
+
+     logging.info(f"Processing uploaded file: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             repo_id=repo_id,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_microphone(
+     repo_id: str,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first click 'Record from microphone', speak, "
+             "click 'Stop recording', and then "
+             "click the button 'Submit for recognition'",
+             "result_item_error",
+         )
+
+     logging.info(f"Processing microphone: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             repo_id=repo_id,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ def process(
+     repo_id: str,
+     in_filename: str,
+ ):
+     logging.info(f"repo_id: {repo_id}")
+     logging.info(f"in_filename: {in_filename}")
+
+     filename = convert_to_wav(in_filename)
+
+     now = datetime.now()
+     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+     logging.info(f"Started at {date_time}")
+
+     start = time.time()
+
+     tagger = get_pretrained_model(repo_id)
+
+     events = decode(tagger, filename)
+
+     date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+     end = time.time()
+
+     info = sf.info(filename)
+     duration = info.duration
+
+     elapsed = end - start
+     # RTF (real-time factor) = processing time / audio duration;
+     # values below 1 mean faster than real time.
+     rtf = elapsed / duration
+
+     logging.info(f"Finished at {date_time}. Elapsed: {elapsed:.3f} s")
+
+     info = f"""
+     Wave duration  : {duration:.3f} s <br/>
+     Processing time: {elapsed:.3f} s <br/>
+     RTF: {elapsed:.3f}/{duration:.3f} = {rtf:.3f} <br/>
+     """
+     if rtf > 1:
+         info += (
+             "<br/>We are loading the model for the first run. "
+             "Please run again to measure the real RTF.<br/>"
+         )
+
+     logging.info(info)
+     logging.info(f"\nrepo_id: {repo_id}\nDetected events: {events}")
+
+     # Convert the decoded events into the dict format that gr.Dataframe
+     # expects; each sherpa_onnx.AudioEvent exposes .name and .prob.
+     events = {
+         "headers": ["Event name", "Probability"],
+         "data": [[e.name, f"{e.prob:.3f}"] for e in events],
+     }
+
+     return events, build_html_output(info)
+
+
+ title = "# Audio tagging with [Next-gen Kaldi](https://github.com/k2-fsa)"
+ description = """
+ This space shows how to do audio tagging with [Next-gen Kaldi](https://github.com/k2-fsa).
+
+ It is running on a machine with 2 vCPUs and 16 GB RAM within a docker container provided by Hugging Face.
+
+ See more information by visiting the following links:
+
+ - <https://github.com/k2-fsa/sherpa-onnx>
+
+ If you want to deploy it locally, please see
+ <https://k2-fsa.github.io/sherpa/onnx>
+ """
+
+ # css style is copied from
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+ css = """
+ .result {display:flex;flex-direction:column}
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
+ """
+
+
+ demo = gr.Blocks(css=css)
+
+
+ with demo:
+     gr.Markdown(title)
+     model_choices = list(models.keys())
+
+     model_dropdown = gr.Dropdown(
+         choices=model_choices,
+         label="Select a model",
+         value=model_choices[0],
+     )
+
+     with gr.Tabs():
+         with gr.TabItem("Upload from disk"):
+             uploaded_file = gr.Audio(
+                 sources=["upload"],  # Choose between "microphone", "upload"
+                 type="filepath",
+                 label="Upload from disk",
+             )
+             upload_button = gr.Button("Submit for recognition")
+             uploaded_output = gr.Dataframe(label="Detected events")
+             uploaded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     model_dropdown,
+                     uploaded_file,
+                 ],
+                 outputs=[uploaded_output, uploaded_html_info],
+                 fn=process_uploaded_file,
+             )
+
+         with gr.TabItem("Record from microphone"):
+             microphone = gr.Audio(
+                 sources=["microphone"],  # Choose between "microphone", "upload"
+                 type="filepath",
+                 label="Record from microphone",
+             )
+
+             record_button = gr.Button("Submit for recognition")
+             recorded_output = gr.Dataframe(label="Detected events")
+             recorded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     model_dropdown,
+                     microphone,
+                 ],
+                 outputs=[recorded_output, recorded_html_info],
+                 fn=process_microphone,
+             )
+
+         with gr.TabItem("From URL"):
+             url_textbox = gr.Textbox(
+                 max_lines=1,
+                 placeholder="URL to an audio file",
+                 label="URL",
+                 interactive=True,
+             )
+
+             url_button = gr.Button("Submit for recognition")
+             url_output = gr.Dataframe(label="Detected events")
+             url_html_info = gr.HTML(label="Info")
+
+     upload_button.click(
+         process_uploaded_file,
+         inputs=[
+             model_dropdown,
+             uploaded_file,
+         ],
+         outputs=[uploaded_output, uploaded_html_info],
+     )
+
+     record_button.click(
+         process_microphone,
+         inputs=[
+             model_dropdown,
+             microphone,
+         ],
+         outputs=[recorded_output, recorded_html_info],
+     )
+
+     url_button.click(
+         process_url,
+         inputs=[
+             model_dropdown,
+             url_textbox,
+         ],
+         outputs=[url_output, url_html_info],
+     )
+
+     gr.Markdown(description)
+
+ if __name__ == "__main__":
+     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+     logging.basicConfig(format=formatter, level=logging.INFO)
+
+     demo.launch()
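For quick checks outside the browser, process() can also be called directly. A sketch, assuming ffmpeg is on PATH, the examples module that app.py imports is present, and a local test.wav exists (all three are assumptions, not part of this commit):

```python
# Headless smoke test for app.process(); "test.wav" is a hypothetical file.
from app import process

events, html = process(
    repo_id="k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09",
    in_filename="test.wav",
)
print(events["data"])  # [[event name, probability], ...]
```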
model.py ADDED
@@ -0,0 +1,114 @@
+ # Copyright 2022-2024 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import wave
+ from functools import lru_cache
+ from typing import List, Tuple
+
+ import numpy as np
+ import sherpa_onnx
+ from huggingface_hub import hf_hub_download
+
+ sample_rate = 16000
+
+
+ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+     """
+     Args:
+       wave_filename:
+         Path to a wave file. It should be single channel and each sample
+         should be 16-bit. Its sample rate does not need to be 16 kHz.
+     Returns:
+       Return a tuple containing:
+        - A 1-D array of dtype np.float32 containing the samples, which are
+          normalized to the range [-1, 1].
+        - The sample rate of the wave file.
+     """
+
+     with wave.open(wave_filename) as f:
+         assert f.getnchannels() == 1, f.getnchannels()
+         assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
+         num_samples = f.getnframes()
+         samples = f.readframes(num_samples)
+         samples_int16 = np.frombuffer(samples, dtype=np.int16)
+         samples_float32 = samples_int16.astype(np.float32)
+
+         samples_float32 = samples_float32 / 32768
+         return samples_float32, f.getframerate()
+
+
+ def decode(
+     tagger: sherpa_onnx.AudioTagging,
+     filename: str,
+     top_k: int = -1,
+ ) -> List[sherpa_onnx.AudioEvent]:
+     s = tagger.create_stream()
+     samples, sample_rate = read_wave(filename)
+     s.accept_waveform(sample_rate, samples)
+     events = tagger.compute(s, top_k)
+     return events
+
+
+ def _get_nn_model_filename(
+     repo_id: str,
+     filename: str,
+     subfolder: str = ".",
+ ) -> str:
+     nn_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return nn_model_filename
+
+
+ # Cache the constructed taggers so each model is downloaded and
+ # initialized at most once per process.
+ @lru_cache(maxsize=8)
+ def get_pretrained_model(repo_id: str) -> sherpa_onnx.AudioTagging:
+     assert repo_id in (
+         "k2-fsa/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15",
+         "k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09",
+     ), repo_id
+
+     model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.int8.onnx",
+     )
+
+     labels = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="class_labels_indices.csv",
+     )
+
+     config = sherpa_onnx.AudioTaggingConfig(
+         model=sherpa_onnx.AudioTaggingModelConfig(
+             zipformer=sherpa_onnx.OfflineZipformerAudioTaggingModelConfig(
+                 model=model,
+             ),
+             num_threads=1,
+             debug=True,
+             provider="cpu",
+         ),
+         labels=labels,
+         top_k=5,
+     )
+     return sherpa_onnx.AudioTagging(config)
+
+
+ models = {
+     "k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09": get_pretrained_model,
+     "k2-fsa/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15": get_pretrained_model,
+ }
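model.py exposes two entry points that compose directly. A short usage sketch (test.wav is a stand-in for any 16-bit mono wave file; the .name/.prob fields are the ones app.py reads from each returned AudioEvent):

```python
# Usage sketch for model.py; "test.wav" is a hypothetical local file.
from model import decode, get_pretrained_model

tagger = get_pretrained_model(
    "k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09"
)
for event in decode(tagger, "test.wav"):
    print(event.name, event.prob)
```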