import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
import spaces
import re

# Initialize devices
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and processor
processor = WhisperProcessor.from_pretrained("aiola/whisper-ner-v1")
model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-v1")
model = model.to(device)


examples = [
    [
        "audio/sports.wav",
        "football-club, football-player, action"
    ],
    [
        "audio/entertainment.wav",
        "movie, date, actor, tv-show, musician"
    ],
    [
        "audio/672-122797-0026.wav",
        "biological-classification, desire, demographic-group, object-category, relationship-role, reflexive-pronoun, furniture-type"
    ],
    [
        "audio/7021-85628-0025.wav",
        "action-goal, person's-title, emotional-connection, personal-qualities, pronoun-target, assignmentaction, physical-action, family-role"
    ],
    [
        "audio/672-122797-0024.wav",
        "health-warning, importance-indicator, event, sentiment"
    ],
    [
        "audio/672-122797-0027.wav",
        "action, emotional-resilience, comparative-path-characteristic, social-role"
    ],
    [
        "audio/672-122797-0048.wav",
        "weapon, emotional-state, household-chore, atmosphere-quality"
    ],
]


def unify_ner_text(text, symbols_to_replace=("/", " ", ":", "_")):
    """Process and standardize entity text by replacing certain symbols and normalizing spaces."""
    text = " ".join(text.split())
    for symbol in symbols_to_replace:
        text = text.replace(symbol, "-")
    return text.lower()
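
# Illustrative, hand-checked examples of the normalization above (not part of the app's runtime path):
#   unify_ner_text("Football Club")  -> "football-club"
#   unify_ner_text("person_title:x") -> "person-title-x"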


def extract_entities_and_clean_text_fixed(text):
    """Parse entities tagged as <type>text<type>> out of the transcription and return the untagged text plus entity spans."""
    entity_pattern = r"<(.*?)>(.*?)<\1>>"
    entities = []
    clean_text = []
    current_pos = 0

    # Iterate through the matches for entity tags
    for match in re.finditer(entity_pattern, text):
        # Add text before the entity to the clean text
        clean_text.append(text[current_pos:match.start()])

        entity_type = match.group(1)
        entity_text = match.group(2)
        start_pos = len("".join(clean_text))  # Start position in the clean text
        end_pos = start_pos + len(entity_text)

        # Append the entity text to the clean text
        clean_text.append(entity_text)

        # Add the entity details to the list
        entities.append({
            "entity": entity_type,
            "text": entity_text,
            "start": start_pos,
            "end": end_pos
        })

        # Update the current position to the end of the match
        current_pos = match.end()

    # Append the remaining part of the text after the last entity
    clean_text.append(text[current_pos:])

    # Join all parts of the clean text
    clean_text_str = "".join(clean_text)

    return clean_text_str, entities
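
# Example of the expected tag format (inferred from the regex above, shown here only for illustration):
#   extract_entities_and_clean_text_fixed("I met <person>John<person>> today")
#   -> ("I met John today",
#       [{"entity": "person", "text": "John", "start": 6, "end": 10}])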


@spaces.GPU  # This decorator ensures your function can use GPU on Hugging Face Spaces
def transcribe_and_recognize_entities(audio_file, prompt):
    # Load the audio, resample it to the 16 kHz rate Whisper expects,
    # and downmix multi-channel recordings to mono.
    target_sample_rate = 16000
    signal, sampling_rate = torchaudio.load(audio_file)
    if sampling_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sample_rate)
        signal = resampler(signal)
    if signal.ndim == 2:
        signal = torch.mean(signal, dim=0)

    input_features = processor(signal, sampling_rate=target_sample_rate, return_tensors="pt").input_features
    input_features = input_features.to(device)

    # Normalize the comma-separated entity labels and rebuild the prompt string.
    ner_types = prompt.split(',')
    processed_ner_types = [unify_ner_text(ner_type.strip()) for ner_type in ner_types]
    prompt = ", ".join(processed_ner_types)

    print(f"Prompt after unify_ner_text: {prompt}")
    prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt")
    prompt_ids = prompt_ids.to(device)

    predicted_ids = model.generate(
        input_features,
        max_new_tokens=256,
        prompt_ids=prompt_ids,
        language='en',
        generation_config=model.generation_config,
    )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    clean_text_fixed, extracted_entities_fixed = extract_entities_and_clean_text_fixed(transcription)

    return transcription, {"text": clean_text_fixed, "entities": extracted_entities_fixed}
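
# Standalone usage sketch (outside the Gradio UI), using the first example pair defined above;
# shown for illustration only and not executed by the app:
#   transcription, highlights = transcribe_and_recognize_entities(
#       "audio/sports.wav", "football-club, football-player, action"
#   )
#   print(transcription)           # raw output with inline <entity>...<entity>> tags
#   print(highlights["entities"])  # parsed entity spans consumed by gr.HighlightedText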


with gr.Blocks(title="WhisperNER v1") as demo:

    gr.Markdown(
        """
        # Whisper-NER: ASR with zero-shot NER

        WhisperNER is a unified model for automatic speech recognition (ASR) and named entity recognition (NER), with zero-shot capabilities.
        The WhisperNER model is designed as a strong base model for the downstream task of ASR with NER, and can be fine-tuned on specific datasets for improved performance.

        ## Links

        * Paper: [WhisperNER: Unified Open Named Entity and Speech Recognition](https://arxiv.org/abs/2409.08107).
        * Model: https://huggingface.co./aiola/whisper-ner-v1
        * Code: https://github.com/aiola-lab/whisper-ner
        """
    )

    with gr.Row() as row1:
        with gr.Column() as col1:
            audio_input = gr.Audio(label="Audio Example", type="filepath")
        with gr.Column() as col2:
            label_input = gr.Textbox(label="Entity Labels")

    submit_btn = gr.Button("Submit")
    
    gr.Markdown("## Output")

    with gr.Row() as row3:
        transcript_output = gr.Textbox(label="Transcription and Entities")

    with gr.Row() as row4:
        highlighted_text_output = gr.HighlightedText(label="Predicted Highlighted Entities")

    examples = gr.Examples(
        examples,
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
        cache_examples=True,
        run_on_click=True,
    )

    # Submitting
    label_input.submit(
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
    )
    submit_btn.click(
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
    )

demo.launch()