|
|
|
|
|
|
|
|
|
""" |
|
|
|
plots |
|
|
|
table |
|
|
|
## related demo |
|
http://text-processing.com/demo/tokenize/ |
|
|
|
## Visualization |
|
|
|
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ] |
|
""" |
|
|
|
import json |
|
import pandas as pd |
|
import gradio as gr |
|
|
|
from vocab import all_tokenizers, load_tokener |
|
|
|
|
|
|
|
# Custom stylesheet passed to ``gr.Blocks(css=...)``.
#   .space-show / .cell-wrap -> ``white-space: pre-wrap`` so consecutive
#                               spaces and line breaks are rendered as-is
#                               instead of being collapsed by the browser.
#   .category-legend         -> hidden.
# The stray ``|`` characters that had leaked into the literal are removed:
# they are not CSS and the one after the closing quotes was a syntax error.
css = """
.space-show {white-space: pre-wrap;}
.cell-wrap {white-space: pre-wrap;}
.category-legend {display: none !important}
"""
|
|
|
# Default contents of the input text box. One sample per line:
#   1. Chinese text (simplified and traditional characters),
#   2. punctuation,
#   3. whitespace runs -- the runs are exactly 2 and 8 spaces wide so that
#      they match the "2个空格" / "8个空格" labels that follow them,
#   4. digits.
# Keep the whitespace on line 3 intact when editing; it is the test data.
example_text = """中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
标点测试:,。!?;
空格测试:  2个空格        8个空格
数字测试:(10086 + 98) = 100184"""
|
|
|
|
|
def _token_str_and_bytes(token):
    """Return ``(text, raw_bytes)`` for a single vocabulary entry.

    * ``bytes`` tokens are decoded as UTF-8 with undecodable bytes replaced
      by U+FFFD, since a bytes token is not guaranteed to be valid UTF-8 on
      its own; a strict decode would raise ``UnicodeDecodeError``.
    * ``str`` tokens are encoded as UTF-8 to obtain their byte form.
    * Any other value (for example ``None``) is rendered through ``str()``
      so that one unexpected token cannot abort the whole table.
    """
    if isinstance(token, bytes):
        return token.decode("utf-8", errors="replace"), token
    if not isinstance(token, str):
        token = str(token)
    return token, token.encode("utf-8")


def tokenize(text, tokenizer_type):
    """Tokenize *text* with the tokenizer named *tokenizer_type*.

    Args:
        text: The string to encode.
        tokenizer_type: Name handed to ``load_tokener`` to obtain the
            tokenizer object.

    Returns:
        A ``(pos_tokens, table_df)`` tuple.

        ``pos_tokens`` is a list of ``(decoded_text, label)`` pairs, one per
        token, where ``label`` cycles through ``"0"``, ``"1"``, ``"2"`` by
        token position so that neighbouring tokens get different labels.

        ``table_df`` is a ``pandas.DataFrame`` with one row per token and
        the columns ``TokenID``, ``Token``, ``Text`` and ``Bytes``. The
        columns are present even when *text* encodes to zero tokens.
    """
    # Debug trace of every request (kept from the original implementation).
    print(text, tokenizer_type)

    tokenizer = load_tokener(tokenizer_type)
    encoding = tokenizer.encode(text)

    pos_tokens = []
    table = []
    for idx, token_id in enumerate(encoding):
        # Text obtained by decoding this single id on its own.
        decode_text = tokenizer.decode([token_id])
        pos_tokens.append((decode_text, str(idx % 3)))

        # Vocabulary entry for the id; may be ``str`` or ``bytes``.
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        token_str, token_bytes = _token_str_and_bytes(token)

        table.append(
            {
                "TokenID": token_id,
                "Token": token_str,
                "Text": decode_text,
                "Bytes": str(token_bytes),
            }
        )

    # Explicit columns keep the frame's shape stable for empty input.
    table_df = pd.DataFrame(table, columns=["TokenID", "Token", "Text", "Bytes"])
    print(table)
    print(table_df)

    return pos_tokens, table_df
|
|
|
|
|
def test_coding():
    """Print a sample ``bytes`` literal to show how Python renders it.

    The literal is the three-byte UTF-8 encoding of the character "中".
    The function takes no arguments and returns ``None``.
    """
    sample = b'\xe4\xb8\xad'
    print(sample)
|
|
|
|
|
# Build the UI: one shared input box on top and two side-by-side columns,
# each with its own tokenizer selector, highlighted tokenization and
# per-token table, so two tokenizers can be compared on the same text.
with gr.Blocks(css=css) as demo:
    gr.HTML("""<h1 align="center">Tokenizer Arena</h1>""")

    user_input = gr.Textbox(
        value=example_text,
        lines=5,
    )

    with gr.Row():
        with gr.Column():
            tokenizer_type_1 = gr.Dropdown(
                all_tokenizers, value="llama", label="tokenizer"
            )
            # Currently unused; kept so the module-level name still exists.
            token_counter_1 = None
            # ``HighlightedText`` is the supported class name;
            # ``Highlightedtext`` is only a deprecated alias.
            output_text_1 = gr.HighlightedText(
                label="Tokenization",
                show_legend=True,
                elem_classes="space-show",
            )
            # Headers mirror the columns of the DataFrame that
            # ``tokenize`` returns.
            output_table_1 = gr.Dataframe(
                headers=["TokenID", "Token", "Text", "Bytes"],
                datatype=["str", "str", "str", "str"],
            )

        with gr.Column():
            tokenizer_type_2 = gr.Dropdown(
                all_tokenizers, value="baichuan_7b", label="tokenizer"
            )
            # Currently unused; kept so the module-level name still exists.
            token_counter_2 = None
            output_text_2 = gr.HighlightedText(
                label="Tokenization",
                show_legend=True,
                elem_classes="space-show",
            )
            output_table_2 = gr.Dataframe(
                headers=["TokenID", "Token", "Text", "Bytes"],
                datatype=["str", "str", "str", "str"],
            )

    # Re-run ``tokenize`` for a column whenever the shared text or that
    # column's tokenizer selection changes. Registration order is the same
    # as before: column 1 (text, dropdown), then column 2 (text, dropdown).
    for _dropdown, _text_out, _table_out in (
        (tokenizer_type_1, output_text_1, output_table_1),
        (tokenizer_type_2, output_text_2, output_table_2),
    ):
        user_input.change(tokenize, [user_input, _dropdown], [_text_out, _table_out])
        _dropdown.change(tokenize, [user_input, _dropdown], [_text_out, _table_out])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Launch the app only when this file is executed as a script, so importing
# the module (e.g. to reuse ``demo`` or ``tokenize``) does not start a server.
if __name__ == "__main__":
    demo.launch()
|
|