# Example inputs for comparing tokenizers side by side.
#
# `examples` maps a language code to rows of the form
# [text, name, name]; the two names look like the tokenizers to compare
# (presumably consumed by a UI elsewhere -- verify against the caller).
# NOTE(review): the "2spaces"/"8spaces" labels suggest runs of 2 and 8 spaces,
# but the literals below contain single spaces; confirm the whitespace was not
# collapsed somewhere upstream before relying on these as whitespace tests.
examples = {
    "en": [
        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"],  # chatglm has blank_n
        # Punctuation reference set:
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
        ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["digits: (10086 + 98) = 100184", "baichuan", "llama"],
    ],
    "zh": [
        ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
        ["标点测试:,。!?;", "baichuan_7b", "llama"],
        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ],
}

# Rows of (tokenizer1, tokenizer2, text) used by get_more_example() to build
# shareable comparison links.
more_examples = [
    # bert VS clue
    # BERT family
    ("bert_base_cased", "bert_base_uncased", ""),  # # clue VS kplug, bert VS clue
    # LLaMA family (sentencepiece-based)
    ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
    ("llama", "chinese_llama2", ""),
    ("chinese_llama", "chinese_llama2", ""),
    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", ""),
    ("chatglm1", "chatglm2", ""),
    # GPT-2 family
    ("gpt2", "moss", ""),
    # NOTE(review): placeholder row; it yields a link with empty tokenizer names.
    ("", "", ""),
    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", ""),
]


def example_fn(example_idx: int) -> list:
    """Return the English example row at position *example_idx*.

    Raises:
        IndexError: if *example_idx* is outside ``examples["en"]``.
    """
    return examples["en"][example_idx]


def get_more_example():
    """Print one comparison URL per row of ``more_examples``.

    Each URL carries the two tokenizer names and the percent-encoded sample
    text as query parameters. Nothing is returned.
    """
    import urllib.parse

    # Canonical host: no trailing dot after ".co".
    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    quote = urllib.parse.quote
    for tokenizer1, tokenizer2, text in more_examples:
        # Names are quoted too so an unusual name cannot break the query
        # string; for the plain identifiers above this is a no-op.
        full_url = (
            f"{url_prefix}?tokenizer1={quote(tokenizer1)}"
            f"&tokenizer2={quote(tokenizer2)}&text={quote(text)}"
        )
        print(full_url)


if __name__ == "__main__":
    get_more_example()