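"""Tokenizer Comparison: a small Gradio app that tokenizes input text with
several Hugging Face tokenizers side by side, renders each token as a colored
HTML span with hover details, and reports basic per-tokenizer metrics."""
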
import html
import random

import gradio as gr
from transformers import AutoTokenizer, GPT2TokenizerFast

# List of available tokenizers
tokenizers = [
    "allenai/longformer-base-4096",
    "gpt2",
    "roberta-base",
    "Xenova/gpt-4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "Xenova/mistral-tokenizer-v3",
    # "Xenova/gemma-2-tokenizer",
    "Xenova/llama3-tokenizer-new",
    # "Xenova/Phi-3-mini-4k-instruct-hf",
]
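
# A minimal sketch, not wired into the app below: tokenize_text reloads each
# tokenizer from the Hub on every call, so repeated comparisons pay the load
# cost each time. Memoizing the loader with functools.lru_cache is one way to
# avoid that. The helper name load_tokenizer_cached is an assumption
# introduced here for illustration.
from functools import lru_cache

@lru_cache(maxsize=None)
def load_tokenizer_cached(tokenizer_name):
    # Mirrors the loading logic in tokenize_text below: Xenova repos are
    # loaded through GPT2TokenizerFast, everything else via AutoTokenizer.
    if tokenizer_name.startswith("Xenova/"):
        return GPT2TokenizerFast.from_pretrained(tokenizer_name)
    return AutoTokenizer.from_pretrained(tokenizer_name)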

def generate_colored_html(tokens, decoded_tokens):
    """Render each (token id, decoded string) pair as a colored HTML span."""
    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
    text_color = "#000000"
    last_color = None
    background_color = "#F0F0F0"
    html_tokens = []

    # Human-readable labels for common special tokens.
    special_token_replacements = {
        '<pad>': '[Padding]',
        '<s>': '[Start of Sentence]',
        '</s>': '[End of Sentence]',
        '<unk>': '[Unknown]',
        '<mask>': '[Masked]',
        '[CLS]': '[Class]',
        '[SEP]': '[Separator]'
    }

    for i, (token, decoded_token) in enumerate(zip(tokens, decoded_tokens)):
        for special_token, replacement in special_token_replacements.items():
            if special_token in decoded_token:
                decoded_token = decoded_token.replace(special_token, replacement)

        # Escape the token text so characters like <, > or quotes cannot
        # break the generated markup or the title attribute.
        decoded_token = html.escape(decoded_token)
        hover_info = f"Token Index: {i}, Token: {decoded_token}, Token ID: {token}"

        # Pick a color different from the previous token's so adjacent
        # tokens stay visually distinguishable.
        color = random.choice([c for c in colors if c != last_color])
        last_color = color

        if '\n' in decoded_token:
            # Show newlines explicitly and insert a line break.
            html_tokens.append(
                f"<span style='background-color: {color}; color: {text_color};' title='{hover_info}'>[NEWLINE]</span><br>"
            )
        else:
            html_tokens.append(
                f'<span style="background-color: {color}; color: {text_color}; text-decoration: none;" title="{hover_info}">{decoded_token}</span>'
            )

    # Tokens are joined with spaces for readability; the colored backgrounds
    # mark the actual token boundaries.
    html_output = " ".join(html_tokens)
    return f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
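
# Illustrative call (the ids and strings here are hypothetical, not produced
# by the app): generate_colored_html([101, 7592], ["[CLS]", "hello"]) returns
# a single <div> whose spans carry hover titles like
# "Token Index: 0, Token: [Class], Token ID: 101".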

def tokenize_text(text, tokenizer_name):
    # Xenova repos publish bare tokenizer files (for transformers.js) rather
    # than full model configs, so they are loaded through GPT2TokenizerFast;
    # all other names resolve normally via AutoTokenizer.
    if tokenizer_name.startswith("Xenova/"):
        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_name)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    tokens = tokenizer.encode(text, add_special_tokens=True)
    # Decode each id individually so every token gets its own colored span.
    decoded_tokens = [tokenizer.decode(token) for token in tokens]
    html_output = generate_colored_html(tokens, decoded_tokens)
    
    # Calculate tokenizer metrics
    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    vocab_size = len(tokenizer.get_vocab())
    avg_token_length = sum(len(t) for t in decoded_tokens) / total_tokens if total_tokens > 0 else 0
    
    metrics = f"""
    <h4>Tokenizer Metrics:</h4>
    <ul>
        <li>Total Tokens: {total_tokens}</li>
        <li>Unique Tokens: {unique_tokens}</li>
        <li>Vocabulary Size: {vocab_size}</li>
        <li>Average Token Length: {avg_token_length:.2f} characters</li>
    </ul>
    """
    
    return html_output, metrics
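
# tokenize_text can also be used outside Gradio; it returns a pair of HTML
# strings (token visualization, metrics block), e.g.:
# viz, stats = tokenize_text("hello world", "gpt2")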

def compare_tokenizers(text, selected_tokenizers):
    # Build one section per selected tokenizer: heading, colored tokens, metrics.
    results = ""
    for tokenizer_name in selected_tokenizers:
        results += f"<h3>{tokenizer_name}</h3>"
        html_output, metrics = tokenize_text(text, tokenizer_name)
        results += html_output
        results += metrics
        results += "<hr>"
    return results

# Create the Gradio interface
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=[
        gr.Textbox(label="Enter text to tokenize"),
        gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers", value=["allenai/longformer-base-4096", "gpt2"])
    ],
    outputs=gr.HTML(label="Tokenization Results"),
    title="Tokenizer Comparison",
    description="Compare tokenization results and metrics from different tokenizers.",
    examples=[
        ["Allergies: \nNo Known Allergies / Adverse Drug Reactions\n\nChief Complaint: \nRight hand erythema/edema\n\nHistory of Present Illness: \nThe patient is a ___ y/o M with PMHx significant", ["allenai/longformer-base-4096", "gpt2"]],
        ["In the ED, initial vitals: 98.3 80 144/90 18 96% ra  \nLabs notable for: Leukocytosis to 12.5.  The patient endorsed \nsevere R hand tenderness but otherwise denied any fevers, \nchills, nausea, vomiting, numbness, tingling or weakness.  Hand \nsurgery was consulted and recommended admission to medicine for \nIV antibiotics, possible ultrasound to evaluate for \nthrombophlebitis and pain control.  In the ED, patient received \nZosyn and IV dilaudid 0.5mg x1.\n\nUpon arrival to the floor, patient endorses R hand pain- stable \nfrom prior.  \n\nROS:  Per HPI \n\n\nPast Medical History: \nCrohn's disease\nAnal fissure\nh/o DVT on coumadin\nRotator cuff repair in ___\n\n\nFamily History: \nNo family h/o IBD\n\n\nMedications on Admission:  EXAMINATION:  UNILAT UP EXT VEINS US RIGHT\n\nINDICATION:  ___ year old man with concern for R hand cellulitis vs.\nthrombophlebitis ", ["roberta-base", "allenai/longformer-base-4096"]],
    ],
    allow_flagging="never"
)

# Launch the app
iface.launch()
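
# launch() starts a local server; gr.Interface.launch also accepts
# share=True (a standard Gradio option, not used in the original app)
# to expose a temporary public URL:
# iface.launch(share=True)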