llama.cpp/convert_hf_to_gguf_update.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
#   python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#

import logging
import os
import pathlib
import re

import requests
import sys
import json
import shutil

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()


class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
    UGM = auto()


# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
        sys.exit(1)
else:
    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
    {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", },  # Also used for Viking 13B and 33B
    {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
    {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
    {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
    {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
]
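# NOTE: "repo" may also be a local directory containing the tokenizer files; download_model()
# below copies the files instead of downloading them in that case.
# Hypothetical example of such an entry (name and path are illustrative, not from the list above):
#   {"name": "my-local-model", "tokt": TOKENIZER_TYPE.BPE, "repo": "/path/to/tokenizer/dir", },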


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as downloaded_file:
        downloaded_file.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


def download_model(model):
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]

    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

    if os.path.isdir(repo):
        # If repo is a path on the file system, copy the directory
        for file in files:
            src_path = os.path.join(repo, file)
            dst_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(dst_path):
                logger.info(f"{name}: File {dst_path} already exists - skipping")
                continue
            if os.path.isfile(src_path):
                shutil.copy2(src_path, dst_path)
                logger.info(f"{name}: Copied {src_path} to {dst_path}")
            else:
                logger.warning(f"{name}: Source file {src_path} does not exist")
    else:
        # If repo is a URL, download the files
        for file in files:
            save_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(save_path):
                logger.info(f"{name}: File {save_path} already exists - skipping")
                continue
            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

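# the chkhsh fingerprint computed below is the SHA-256 of the stringified token ids obtained by
# encoding CHK_TXT; get_vocab_base_pre() compares against these hashes at convert time to pick
# the matching pre-tokenizer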
src_ifs = ""
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded

    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
    logger.info(f"tokt: {tokt}")
    logger.info(f"repo: {model['repo']}")
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        normalizer = cfg["normalizer"]
        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
        if "ignore_merges" in cfg["model"]:
            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

    logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"
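    # each model appends a block of this shape to the generated function (values are placeholders):
    #     if chkhsh == "<sha256 of the stringified token ids>":
    #         # ref: <model repo>
    #         res = "<model name>"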

src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

        chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        logger.debug(f"chktok: {{chktok}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        res = None

        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        # or pull the latest version of the model from Huggingface
        # don't edit the hashes manually!
{src_ifs}
        if res is None:
            logger.warning("\\n")
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("** There are 2 possible reasons for this:")
            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("** - the pre-tokenization config has changed upstream")
            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh: {{chkhsh}}")
            logger.warning("**************************************************************************************")
            logger.warning("\\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        return res
"""

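# splice the generated function into convert_hf_to_gguf.py, replacing whatever currently sits
# between the "# Marker: Start get_vocab_base_pre" and "# Marker: End get_vocab_base_pre" lines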
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)

convert_py_pth.write_text(convert_py, encoding="utf-8")

logger.info("+++ convert_hf_to_gguf.py was updated")

# generate tests for each tokenizer model

tests = [
    "ied 4 ½ months",
    "Führer",
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "!!!!!!",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
    "Cửa Việt",  # llama-bpe fails on this
    " discards",
    CHK_TXT,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#

# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line

for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
            f.write("\n__ggml_vocab_test__\n")

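    # note: add_special_tokens=False below, so the reference tokens are the raw tokenization
    # without any added special tokens (e.g. BOS)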
    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
        for text in tests:
            res = tokenizer.encode(text, add_special_tokens=False)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

logger.info("\n")