import gradio as gr
from transformers import pipeline

# Module-level token-classification pipeline, created once at import time and
# reused by every `predict` call. `grouped_entities=True` asks the pipeline to
# return grouped results; `predict` reads the "entity_group", "start" and "end"
# keys from each returned item.
# NOTE(review): newer transformers releases reportedly deprecate
# `grouped_entities` in favour of `aggregation_strategy` — confirm against the
# pinned transformers version before changing it.
pipe = pipeline("token-classification", model="cogniveon/nlpcw_bert-base-uncased-abbr", grouped_entities=True)

def predict(input) -> list[tuple[str, str | float | None]] | dict | None:
    """Split *input* into labelled and unlabelled segments for highlighting.

    The text is run through the module-level ``pipe`` and the returned
    entities are turned into a flat list of ``(text, label)`` tuples that
    together reproduce the input in order. ``label`` is the entity's
    ``"entity_group"`` for tagged spans and ``None`` for the text in between.

    Args:
        input: The raw text to analyse.

    Returns:
        ``None`` when *input* is empty; otherwise a list of ``(text, label)``
        tuples. When the model tags nothing, the list holds the whole input
        as a single unlabelled segment.
    """
    # Nothing to tag: skip the model call entirely.
    if not input:
        return None

    output = pipe(input)

    highlighted_text = []
    cursor = 0  # Offset in `input` up to which text has already been emitted.

    # Entities are consumed in the order the pipeline returns them. A running
    # cursor (rather than indexing the first entity up front) means an empty
    # result falls through to the final append instead of raising IndexError.
    for entity in output:
        start, end = entity["start"], entity["end"]
        # Unlabelled text between the previous entity (or the beginning of
        # the input) and this one.
        highlighted_text.append((input[cursor:start], None))
        highlighted_text.append((input[start:end], entity["entity_group"]))
        cursor = end

    # Remaining text after the last entity, or the whole input if there
    # were no entities at all.
    highlighted_text.append((input[cursor:], None))

    return highlighted_text


# Build the web UI and start serving it as soon as this module is executed:
# a three-line text box feeds `predict`, whose (text, label) segments are
# rendered by a HighlightedText component with a legend.
# NOTE: `demo` is bound to whatever `.launch()` returns, not to the
# `gr.Interface` object itself.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, label="Input"),
    outputs=gr.HighlightedText(
        show_legend=True,
        combine_adjacent=True,
        label="Output",
    ),
    # One sample sentence offered as a clickable example in the UI.
    examples=[
        [
            "We developed a variant of gene set enrichment analysis (GSEA) to determine whether a genetic pathway shows evidence for age regulation [23].",
        ],
    ],
).launch()