Spaces:

JacobLinCool
/

TWASR

Sleeping

File size: 2,922 Bytes

import gradio as gr
from huggingface_hub.utils import get_token
import requests
import base64
from model import model_id, transcribe_audio_local

# Hugging Face access token, resolved once at import time. It is sent as the
# Bearer credential on the Inference API request made by transcribe_audio().
# NOTE(review): get_token() presumably returns None when no token is
# configured — confirm, since the value is interpolated into a header as-is.
token = get_token()


def read_file_as_base64(file_path: str) -> str:
    """Return the full contents of *file_path* as a Base64 text string.

    The file is opened in binary mode and read in one go, so the whole
    payload is held in memory before being encoded.
    """
    with open(file_path, "rb") as stream:
        raw_bytes = stream.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()


def _resampled_path(audio: str) -> str:
    """Return the path of the resampled MP3 written next to *audio*.

    The extension is swapped out instead of doing a substring replace, so a
    non-MP3 input (e.g. a ``.wav`` file) never maps back onto the input path
    itself, and a ``.mp3`` occurring earlier in the path is left untouched.
    """
    import os

    root, _ext = os.path.splitext(audio)
    return f"{root}_resampled.mp3"


def transcribe_audio(audio: str) -> str:
    """Transcribe an audio file through the Hugging Face Inference API.

    The input is converted with ``ffmpeg`` to a 16 kHz mono MP3 to shrink the
    upload, Base64-encoded, and posted as JSON to the endpoint for
    ``model_id`` with timestamps requested.

    Args:
        audio: Path to the audio file, or ``None`` when no upload is
            available yet.

    Returns:
        The ``"text"`` field of the API's JSON response.

    Raises:
        gr.Error: If no audio is available, the ffmpeg conversion fails, or
            the API response does not contain a transcription.
    """
    import os
    import subprocess

    print(f"{audio=}")

    if audio is None:
        raise gr.Error(
            "Please wait a moment for the audio to be uploaded, then click the button again."
        )

    # resample to 16k mono to reduce file size
    audio_resampled = _resampled_path(audio)
    try:
        subprocess.run(
            [
                "ffmpeg",
                # Overwrite flag goes first so it is not a trailing option
                # after the output file.
                "-y",
                "-i",
                audio,
                "-ac",
                "1",
                "-ar",
                "16000",
                audio_resampled,
            ],
            check=True,
        )
        b64 = read_file_as_base64(audio_resampled)
    except subprocess.CalledProcessError as err:
        raise gr.Error("Failed to convert the audio with ffmpeg.") from err
    finally:
        # The resampled copy only exists to build the request payload; remove
        # it so repeated requests do not accumulate files on disk.
        if os.path.exists(audio_resampled):
            os.remove(audio_resampled)

    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {
        "Content-Type": "application/json",
        "x-wait-for-model": "true",
    }
    # Only send credentials when a token is actually configured; otherwise
    # the header would carry the literal string "Bearer None".
    if token:
        headers["Authorization"] = f"Bearer {token}"
    data = {
        "inputs": b64,
        "parameters": {
            "generate_kwargs": {
                "return_timestamps": True,
            }
        },
    }
    response = requests.post(url, headers=headers, json=data)
    print(f"{response.text=}")
    try:
        out = response.json()
    except ValueError as err:
        raise gr.Error(
            f"Inference API returned a non-JSON response (HTTP {response.status_code})."
        ) from err
    print(f"{out=}")

    # Failed requests come back without a "text" field; surface whatever
    # detail the payload carries instead of failing with a bare KeyError.
    if not isinstance(out, dict) or "text" not in out:
        detail = out.get("error", out) if isinstance(out, dict) else out
        raise gr.Error(f"Inference API request failed: {detail}")

    return out["text"]


# Build the Gradio UI: one audio input, one text output, and two buttons that
# run the same input through either the local model or the Inference API.
with gr.Blocks() as demo:
    gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition.")
    gr.Markdown("Upload an audio file or record your voice to transcribe it to text.")
    gr.Markdown(
        "First load may take a while to initialize the model, following requests will be faster."
    )

    with gr.Row():
        # type="filepath" hands the callbacks a path string rather than raw samples.
        audio_input = gr.Audio(
            label="Audio", type="filepath", show_download_button=True
        )
        text_output = gr.Textbox(label="Transcription")

    transcribe_local_button = gr.Button(
        "Transcribe with Transformers", variant="primary"
    )
    transcribe_button = gr.Button("Transcribe with Inference API", variant="secondary")

    # Local transcription via the function imported from the `model` module.
    transcribe_local_button.click(
        fn=transcribe_audio_local, inputs=[audio_input], outputs=[text_output]
    )
    # Remote transcription via the hosted Inference API.
    transcribe_button.click(
        fn=transcribe_audio, inputs=[audio_input], outputs=[text_output]
    )

    # Bundled sample clips, transcribed with the local model.
    gr.Examples(
        [
            ["./examples/audio1.mp3"],
            ["./examples/audio2.mp3"],
        ],
        inputs=[audio_input],
        outputs=[text_output],
        fn=transcribe_audio_local,
        cache_examples=True,
        cache_mode="lazy",
        run_on_click=True,
    )

    # Link to the model page (the host previously had a stray trailing dot).
    gr.Markdown(
        f"Current model: {model_id}. For more information, visit the [model hub](https://huggingface.co/{model_id})."
    )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()