from transformers import AutoTokenizer
import gradio as gr
import os

# Retrieve the Hugging Face token from the environment (e.g. a Space secret)
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Load the tokenizers once at module level so tokenize() can always reach them,
# even when this module is imported rather than run as a script
palmyra_x_003_tokenizer = AutoTokenizer.from_pretrained("wassemgtk/palmyra-x-003-tokenizer", token=huggingface_token)
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
palmyra_x_004_tokenizer = AutoTokenizer.from_pretrained("wassemgtk/palmyra-x-004-tokenizer", token=huggingface_token)

def tokenize(input_text):
    # Count the tokens (including special tokens) each tokenizer produces
    palmyra_x_003_tokens = len(palmyra_x_003_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    gpt2_tokens = len(gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"])
    palmyra_x_004_tokens = len(palmyra_x_004_tokenizer(input_text, add_special_tokens=True)["input_ids"])

    # Token counts keyed by the model label shown in the UI
    results = {
        "Palmyra-X-004": palmyra_x_004_tokens,
        "Palmyra-Fin & Med": palmyra_x_003_tokens,
        "Palmyra-X-003": gpt2_tokens,
    }

    # Sort the results in descending order by token count
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

    # One "model: count" line per tokenizer, largest first
    return "\n".join(f"{model}: {tokens}" for model, tokens in sorted_results)


if __name__ == "__main__":
    # Minimal Gradio UI: paste text, get back per-model token counts
    iface = gr.Interface(fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=19), outputs="text")
    iface.launch()
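
# Quick sanity check without launching the UI (a sketch, assuming this file is
# saved as app.py and HUGGINGFACE_TOKEN is set; the sample text is illustrative):
#
#     from app import tokenize
#     print(tokenize("Tokenizer vocabularies differ across models."))
#
# Prints one "model: count" line per tokenizer, sorted largest-first.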