Arrcttacsrks committed on
Commit cf00d9f · verified · 1 Parent(s): a179517

Upload llama.cpp/convert_hf_to_gguf_update.py with huggingface_hub

llama.cpp/convert_hf_to_gguf_update.py ADDED
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
#       python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#
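# Example end-to-end run (illustrative sketch only: the token value is a placeholder,
# and "llama-bpe" is just one entry from the "models" list defined below):
#
#       python3 convert_hf_to_gguf_update.py hf_XXXXXXXXXXXXXXXXXXXX
#       python3 convert_hf_to_gguf.py models/tokenizers/llama-bpe/ --outfile models/ggml-vocab-llama-bpe.gguf --vocab-only
#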

import logging
import os
import pathlib
import re

import requests
import sys
import json
import shutil

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()


class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
    UGM = auto()


# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
        sys.exit(1)
else:
    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
    {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", },  # Also used for Viking 13B and 33B
    {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
    {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
    {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
    {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
]
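
# Illustrative entry shape only (placeholder name and path, not part of the list above):
# "repo" may also point at a local directory instead of a URL - download_model() below
# copies the tokenizer files from disk in that case, e.g.:
#
#   {"name": "my-model", "tokt": TOKENIZER_TYPE.BPE, "repo": "/path/to/local/hf/checkout", },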


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as downloaded_file:
        downloaded_file.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


def download_model(model):
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]

    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

    if os.path.isdir(repo):
        # If repo is a path on the file system, copy the directory
        for file in files:
            src_path = os.path.join(repo, file)
            dst_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(dst_path):
                logger.info(f"{name}: File {dst_path} already exists - skipping")
                continue
            if os.path.isfile(src_path):
                shutil.copy2(src_path, dst_path)
                logger.info(f"{name}: Copied {src_path} to {dst_path}")
            else:
                logger.warning(f"{name}: Source file {src_path} does not exist")
    else:
        # If repo is a URL, download the files
        for file in files:
            save_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(save_path):
                logger.info(f"{name}: File {save_path} already exists - skipping")
                continue
            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

src_ifs = ""
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded

    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
    logger.info(f"tokt: {tokt}")
    logger.info(f"repo: {model['repo']}")
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        normalizer = cfg["normalizer"]
        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
        if "ignore_merges" in cfg["model"]:
            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

    logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

        chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        logger.debug(f"chktok: {{chktok}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        res = None

        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        # or pull the latest version of the model from Huggingface
        # don't edit the hashes manually!
{src_ifs}
        if res is None:
            logger.warning("\\n")
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("** There are 2 possible reasons for this:")
            logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("** - the pre-tokenization config has changed upstream")
            logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh: {{chkhsh}}")
            logger.warning("**************************************************************************************")
            logger.warning("\\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        return res
"""

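# For orientation, a sketch (not verbatim) of the region in convert_hf_to_gguf.py that the
# re.sub() below rewrites - everything between the two markers is replaced with the freshly
# generated src_func:
#
#   # Marker: Start get_vocab_base_pre
#   def get_vocab_base_pre(self, tokenizer) -> str:
#       ...
#   # Marker: End get_vocab_base_pre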
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)

convert_py_pth.write_text(convert_py, encoding="utf-8")

logger.info("+++ convert_hf_to_gguf.py was updated")

# generate tests for each tokenizer model

tests = [
    "ied 4 ½ months",
    "Führer",
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "!!!!!!",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
    "Cửa Việt",  # llama-bpe fails on this
    " discards",
    CHK_TXT,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#

# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line

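# Illustration only (the token ids below are made-up placeholders, not real tokenizer output):
#
#   ggml-vocab-<name>.gguf.inp starts like:
#       ied 4 ½ months
#       __ggml_vocab_test__
#       Führer
#       __ggml_vocab_test__
#
#   and the matching ggml-vocab-<name>.gguf.out has one line of ids per test:
#        101 102 103
#        104
#
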
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
            f.write("\n__ggml_vocab_test__\n")

    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
        for text in tests:
            res = tokenizer.encode(text, add_special_tokens=False)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

logger.info("\n")
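

# Minimal sketch (an assumption, not part of the upstream workflow, and never called here)
# of how a freshly written *.gguf.out file could be cross-checked against its tokenizer,
# e.g. check_vocab_out("llama-bpe", tokenizer):
def check_vocab_out(name: str, tokenizer) -> bool:
    # re-encode every test string and compare with the token ids stored on disk
    with open(f"models/ggml-vocab-{name}.gguf.out", encoding="utf-8") as f:
        stored = [[int(tok) for tok in line.split()] for line in f]
    fresh = [tokenizer.encode(text, add_special_tokens=False) for text in tests]
    return stored == fresh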