Spaces:

VinayHajare
/

Marathi-Audio-Transcriber-and-Translator

Sleeping

Marathi-Audio-Transcriber-and-Translator

File size: 5,217 Bytes

import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr
import pytube as pt

# Checkpoint identifier handed to `pipeline(model=...)` and shown in the UI descriptions.
MODEL_NAME = "VinayHajare/whisper-small-finetuned-common-voice-mr"
# Forwarded as `batch_size` on every call to `pipe`.
BATCH_SIZE = 8
# Language code used to build the decoder prompt below ("mr" — Marathi, per the UI titles).
LANG = "mr"
# Use CUDA device index 0 when a GPU is available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Module-level speech-recognition pipeline shared by all handlers in this file.
# NOTE(review): chunk_length_s=30 presumably makes the pipeline process long audio
# in 30-second chunks — confirm against the transformers ASR pipeline documentation.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Store the tokenizer's decoder prompt ids for LANG on the model config.
# NOTE(review): presumably this pins generation to LANG instead of relying on
# language auto-detection — confirm.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=LANG)

# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    """Render a time offset in seconds as ``[HH:]MM:SS<marker>mmm``.

    Args:
        seconds: Offset in seconds, or ``None`` for a malformed timestamp.
        always_include_hours: Emit the ``HH:`` prefix even when it is zero.
        decimal_marker: Separator placed between seconds and milliseconds.

    Returns:
        The formatted string, or ``None`` unchanged when ``seconds`` is ``None``.
    """
    if seconds is None:
        # Malformed timestamp: hand it back untouched.
        return seconds

    total_ms = round(seconds * 1000.0)

    # Peel off hours, minutes and seconds; what is left is the millisecond part.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)

    if always_include_hours or hours > 0:
        hours_prefix = f"{hours:02d}:"
    else:
        hours_prefix = ""

    return f"{hours_prefix}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{millis:03d}"

def transcribe(file, task, return_timestamps):
    """Run the speech-recognition pipeline on an audio file and return its text.

    Args:
        file: Path of the audio file to process (the UI passes a filepath), or
            ``None`` when the user submitted without providing audio.
        task: Value forwarded as the ``task`` generation kwarg; the UI offers
            ``"transcribe"`` and ``"translate"``.
        return_timestamps: When truthy, return one ``[start -> end] text`` line
            per chunk instead of the plain text.

    Returns:
        The recognised text, or the newline-joined timestamped chunks.

    Raises:
        gr.Error: If no audio was submitted.
    """
    # Without this guard the pipeline receives None and fails with an opaque
    # error; gr.Error surfaces a readable message in the UI instead.
    if file is None:
        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

    outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
    text = outputs["text"]
    if return_timestamps:
        # Each chunk carries a (start, end) pair under "timestamp" plus its text.
        text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in outputs["chunks"]
        )
    return text

def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds the YouTube video behind ``yt_url``.

    The video id is taken from, in order of preference:
      * the ``v`` query parameter (``watch?v=ID``, in any parameter position),
      * the first path segment of a ``youtu.be`` short link,
      * the segment after ``/embed/``, ``/shorts/``, ``/live/`` or ``/v/``.
    If none of these match, the text after the last ``?v=`` is used.

    Args:
        yt_url: URL string supplied by the user.

    Returns:
        A ``<center><iframe ...></iframe></center>`` HTML string.
    """
    # Function-scope imports: stdlib only, keeps this helper self-contained.
    from html import escape
    from urllib.parse import parse_qs, urlparse

    url = yt_url.strip()
    video_id = ""
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        segments = [segment for segment in parsed.path.split("/") if segment]
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            # Extra parameters such as "&t=42s" or "&list=..." are not part of the id.
            video_id = query_ids[0]
        elif host == "youtu.be" or host.endswith(".youtu.be"):
            if segments:
                video_id = segments[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]
    except ValueError:
        # urlparse rejects some malformed URLs; fall through to the plain split.
        video_id = ""

    if not video_id:
        video_id = url.split("?v=")[-1]

    # The value comes from user input and lands inside an HTML attribute, so
    # escape it to stop it from breaking out of the src="..." quotes.
    video_id = escape(video_id, quote=True)

    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
    
def yt_transcribe(yt_url, task, return_timestamps):
    """Download a YouTube video's audio and run the speech-recognition pipeline on it.

    Args:
        yt_url: URL of the YouTube video, as typed by the user.
        task: Value forwarded as the ``task`` generation kwarg; the UI offers
            ``"transcribe"`` and ``"translate"``.
        return_timestamps: When truthy, return one ``[start -> end] text`` line
            per chunk instead of the plain text.

    Returns:
        A ``(html_embed_str, text)`` tuple: the HTML that embeds the video and
        the recognised text (or the newline-joined timestamped chunks).

    Raises:
        gr.Error: If no URL was submitted or the video exposes no audio stream.
    """
    # Function-scope import: stdlib only, keeps this block self-contained.
    import tempfile

    if not yt_url or not yt_url.strip():
        raise gr.Error("No YouTube URL submitted! Please paste a video URL before submitting your request.")

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    # first() yields None instead of raising IndexError when nothing matches.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise gr.Error("No audio stream is available for this YouTube video.")

    # Download into a per-request temporary directory: a fixed "audio.mp3" in the
    # working directory would be shared between requests and never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        # The pipeline must run while the temporary file still exists.
        outputs = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)

    text = outputs["text"]
    if return_timestamps:
        # Each chunk carries a (start, end) pair under "timestamp" plus its text.
        text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in outputs["chunks"]
        )
    return html_embed_str, text

# Top-level container that hosts the tabbed UI.
demo = gr.Blocks()

# Microphone tab: records audio and passes its filepath, the chosen task and the
# timestamp flag to `transcribe`.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Demo: Transcribe Marathi Audio",
    # Link host fixed ("huggingface.co./" -> "huggingface.co/") and the garbled
    # emoji bytes restored to the intended character.
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Upload tab: passes the uploaded file's path, the chosen task and the timestamp
# flag to `transcribe`.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", label="Audio file", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Demo: Transcribe Marathi Audio",
    # Link host fixed ("huggingface.co./" -> "huggingface.co/") and the garbled
    # emoji bytes restored to the intended character.
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    cache_examples=True,
    allow_flagging="never",
)

# YouTube tab: passes the pasted URL, the chosen task and the timestamp flag to the
# `yt_transcribe` function and shows the returned HTML embed and text.
# NOTE: this assignment rebinds the module-level name `yt_transcribe` from the
# handler function to the Interface; `fn=` captures the function before the rebind.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube Video URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Demo: Transcribe Marathi YouTube Video",
    # Duplicated word "the the" removed, link host fixed ("huggingface.co./" ->
    # "huggingface.co/") and the garbled emoji bytes restored.
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Mount the three interfaces as tabs inside the shared Blocks container; the
# second list supplies the tab labels in the same order.
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe, yt_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File", "Transcribe YouTube Video"],
    )

# Enable request queuing with max_size=10, then start the app.
# NOTE(review): max_size presumably caps how many requests may wait in the queue — confirm.
demo.queue(max_size=10)
demo.launch()