mtasic85 committed on
Commit
baab2e1
·
1 Parent(s): 142238a

train tokenizer

Browse files
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/train_tokenizer.py CHANGED
@@ -1,6 +1,6 @@
1
  import gc
2
  import sys
3
- import string
4
 
5
  from datasets import load_dataset
6
  from transformers import PreTrainedTokenizerFast
@@ -183,15 +183,15 @@ def batch_iterator():
183
  # gc.collect()
184
 
185
 
186
- bpe = BPE(unk_token='<unk>', fuse_unk=False, byte_fallback=False)
187
  tokenizer = Tokenizer(bpe)
188
 
189
  special_tokens = [
190
  '<unk>',
191
  '<s>',
192
  '</s>',
193
- '<|im_end|>',
194
  '<|im_start|>',
 
195
  '<tools>',
196
  '</tools>',
197
  '<tool_call>',
@@ -202,6 +202,19 @@ special_tokens = [
202
  'user',
203
  'assistant',
204
  'tool',
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  ]
206
 
207
  for i in range(2, 25):
@@ -210,21 +223,24 @@ for i in range(2, 25):
210
  for i in range(64 - len(special_tokens)):
211
  special_tokens.append(f'<|reserved_{i}|>')
212
 
213
- # ascii
214
- ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
215
 
216
  # emoji
217
  dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
218
  emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
219
  del dataset
220
 
 
 
 
 
 
221
  # programming languages keywords
222
  dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
223
  code_keywords = [n for row in dataset for n in row['keywords']]
224
  del dataset
225
 
226
- tokenizer.normalizer = normalizers.NFC()
227
-
228
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
229
 
230
  tokenizer.post_processor = TemplateProcessing(
@@ -237,11 +253,9 @@ tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True
237
 
238
  trainer = BpeTrainer(
239
  vocab_size=32000,
240
- # min_frequency=2,
241
  special_tokens=special_tokens,
242
- initial_alphabet=ascii_chars + emoji_chars + code_keywords,
243
- # continuing_subword_prefix=None,
244
- # end_of_word_suffix=None,
245
  )
246
 
247
  tokenizer.train_from_iterator(batch_iterator(), trainer)
@@ -261,12 +275,10 @@ fast_tokenizer = PreTrainedTokenizerFast(
261
  tokenizer_object=tokenizer,
262
  chat_template=CHATML_CHAT_TEMPLATE,
263
  bos_token='<s>',
264
- eos_token='<|im_end|>',
265
  unk_token='<unk>',
266
  pad_token='</s>',
267
  clean_up_tokenization_spaces=False,
268
- # spaces_between_special_tokens=False,
269
- # use_default_system_prompt=False,
270
  )
271
 
272
- fast_tokenizer.save_pretrained('../')
 
1
  import gc
2
  import sys
3
+ # import string
4
 
5
  from datasets import load_dataset
6
  from transformers import PreTrainedTokenizerFast
 
183
  # gc.collect()
184
 
185
 
186
+ bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
187
  tokenizer = Tokenizer(bpe)
188
 
189
  special_tokens = [
190
  '<unk>',
191
  '<s>',
192
  '</s>',
 
193
  '<|im_start|>',
194
+ '<|im_end|>',
195
  '<tools>',
196
  '</tools>',
197
  '<tool_call>',
 
202
  'user',
203
  'assistant',
204
  'tool',
205
+
206
+ '"arguments"',
207
+ '<arguments>',
208
+ '<argument>',
209
+ '<argument-name>',
210
+ '<parameter>',
211
+ '<parameter-name>',
212
+ '<value>',
213
+ '<argument-value>',
214
+ '<parameter-value>',
215
+ '"name"',
216
+ '<function>',
217
+ '<function-name>',
218
  ]
219
 
220
  for i in range(2, 25):
 
223
  for i in range(64 - len(special_tokens)):
224
  special_tokens.append(f'<|reserved_{i}|>')
225
 
226
+ ## ascii
227
+ # ascii_chars = list(string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation)
228
 
229
  # emoji
230
  dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
231
  emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
232
  del dataset
233
 
234
+ # programming languages
235
+ dataset = load_dataset('Tanvir1337/programming-languages', split='train')
236
+ programming_languages = [n for row in dataset for n in row['text']]
237
+ del dataset
238
+
239
  # programming languages keywords
240
  dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
241
  code_keywords = [n for row in dataset for n in row['keywords']]
242
  del dataset
243
 
 
 
244
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
245
 
246
  tokenizer.post_processor = TemplateProcessing(
 
253
 
254
  trainer = BpeTrainer(
255
  vocab_size=32000,
256
+ min_frequency=2,
257
  special_tokens=special_tokens,
258
+ initial_alphabet=emoji_chars + programming_languages + code_keywords,
 
 
259
  )
260
 
261
  tokenizer.train_from_iterator(batch_iterator(), trainer)
 
275
  tokenizer_object=tokenizer,
276
  chat_template=CHATML_CHAT_TEMPLATE,
277
  bos_token='<s>',
278
+ eos_token='</s>',
279
  unk_token='<unk>',
280
  pad_token='</s>',
281
  clean_up_tokenization_spaces=False,
 
 
282
  )
283
 
284
+ fast_tokenizer.save_pretrained('../')
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "bos_token": "<s>",
3
- "eos_token": "<|im_end|>",
4
  "pad_token": "</s>",
5
  "unk_token": "<unk>"
6
  }
 
1
  {
2
  "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
  "pad_token": "</s>",
5
  "unk_token": "<unk>"
6
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -25,7 +25,7 @@
25
  "special": true
26
  },
27
  "3": {
28
- "content": "<|im_end|>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
@@ -33,7 +33,7 @@
33
  "special": true
34
  },
35
  "4": {
36
- "content": "<|im_start|>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
@@ -121,7 +121,7 @@
121
  "special": true
122
  },
123
  "15": {
124
- "content": " ",
125
  "lstrip": false,
126
  "normalized": false,
127
  "rstrip": false,
@@ -129,7 +129,7 @@
129
  "special": true
130
  },
131
  "16": {
132
- "content": " ",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
@@ -137,7 +137,7 @@
137
  "special": true
138
  },
139
  "17": {
140
- "content": " ",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
@@ -145,7 +145,7 @@
145
  "special": true
146
  },
147
  "18": {
148
- "content": " ",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
@@ -153,7 +153,7 @@
153
  "special": true
154
  },
155
  "19": {
156
- "content": " ",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
@@ -161,7 +161,7 @@
161
  "special": true
162
  },
163
  "20": {
164
- "content": " ",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
@@ -169,7 +169,7 @@
169
  "special": true
170
  },
171
  "21": {
172
- "content": " ",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
@@ -177,7 +177,7 @@
177
  "special": true
178
  },
179
  "22": {
180
- "content": " ",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
@@ -185,7 +185,7 @@
185
  "special": true
186
  },
187
  "23": {
188
- "content": " ",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
@@ -193,7 +193,7 @@
193
  "special": true
194
  },
195
  "24": {
196
- "content": " ",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
@@ -201,7 +201,7 @@
201
  "special": true
202
  },
203
  "25": {
204
- "content": " ",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
@@ -209,7 +209,7 @@
209
  "special": true
210
  },
211
  "26": {
212
- "content": " ",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
@@ -217,7 +217,7 @@
217
  "special": true
218
  },
219
  "27": {
220
- "content": " ",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
@@ -225,7 +225,7 @@
225
  "special": true
226
  },
227
  "28": {
228
- "content": " ",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
@@ -233,7 +233,7 @@
233
  "special": true
234
  },
235
  "29": {
236
- "content": " ",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
@@ -241,7 +241,7 @@
241
  "special": true
242
  },
243
  "30": {
244
- "content": " ",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
@@ -249,7 +249,7 @@
249
  "special": true
250
  },
251
  "31": {
252
- "content": " ",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
@@ -257,7 +257,7 @@
257
  "special": true
258
  },
259
  "32": {
260
- "content": " ",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
@@ -265,7 +265,7 @@
265
  "special": true
266
  },
267
  "33": {
268
- "content": " ",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
@@ -273,7 +273,7 @@
273
  "special": true
274
  },
275
  "34": {
276
- "content": " ",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
@@ -281,7 +281,7 @@
281
  "special": true
282
  },
283
  "35": {
284
- "content": " ",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
@@ -289,7 +289,7 @@
289
  "special": true
290
  },
291
  "36": {
292
- "content": " ",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
@@ -297,7 +297,7 @@
297
  "special": true
298
  },
299
  "37": {
300
- "content": " ",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
@@ -305,7 +305,7 @@
305
  "special": true
306
  },
307
  "38": {
308
- "content": "<|reserved_0|>",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
@@ -313,7 +313,7 @@
313
  "special": true
314
  },
315
  "39": {
316
- "content": "<|reserved_1|>",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
@@ -321,7 +321,7 @@
321
  "special": true
322
  },
323
  "40": {
324
- "content": "<|reserved_2|>",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
@@ -329,7 +329,7 @@
329
  "special": true
330
  },
331
  "41": {
332
- "content": "<|reserved_3|>",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
@@ -337,7 +337,7 @@
337
  "special": true
338
  },
339
  "42": {
340
- "content": "<|reserved_4|>",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
@@ -345,7 +345,7 @@
345
  "special": true
346
  },
347
  "43": {
348
- "content": "<|reserved_5|>",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
@@ -353,7 +353,7 @@
353
  "special": true
354
  },
355
  "44": {
356
- "content": "<|reserved_6|>",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
@@ -361,7 +361,7 @@
361
  "special": true
362
  },
363
  "45": {
364
- "content": "<|reserved_7|>",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
@@ -369,7 +369,7 @@
369
  "special": true
370
  },
371
  "46": {
372
- "content": "<|reserved_8|>",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
@@ -377,7 +377,7 @@
377
  "special": true
378
  },
379
  "47": {
380
- "content": "<|reserved_9|>",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
@@ -385,7 +385,7 @@
385
  "special": true
386
  },
387
  "48": {
388
- "content": "<|reserved_10|>",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
@@ -393,7 +393,7 @@
393
  "special": true
394
  },
395
  "49": {
396
- "content": "<|reserved_11|>",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
@@ -401,7 +401,7 @@
401
  "special": true
402
  },
403
  "50": {
404
- "content": "<|reserved_12|>",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
@@ -409,7 +409,7 @@
409
  "special": true
410
  },
411
  "51": {
412
- "content": "<|reserved_13|>",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
@@ -417,7 +417,7 @@
417
  "special": true
418
  },
419
  "52": {
420
- "content": "<|reserved_14|>",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
@@ -425,7 +425,7 @@
425
  "special": true
426
  },
427
  "53": {
428
- "content": "<|reserved_15|>",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
@@ -433,7 +433,7 @@
433
  "special": true
434
  },
435
  "54": {
436
- "content": "<|reserved_16|>",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
@@ -441,7 +441,7 @@
441
  "special": true
442
  },
443
  "55": {
444
- "content": "<|reserved_17|>",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
@@ -449,7 +449,7 @@
449
  "special": true
450
  },
451
  "56": {
452
- "content": "<|reserved_18|>",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
@@ -457,7 +457,7 @@
457
  "special": true
458
  },
459
  "57": {
460
- "content": "<|reserved_19|>",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
@@ -465,7 +465,7 @@
465
  "special": true
466
  },
467
  "58": {
468
- "content": "<|reserved_20|>",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
@@ -473,7 +473,7 @@
473
  "special": true
474
  },
475
  "59": {
476
- "content": "<|reserved_21|>",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
@@ -481,7 +481,7 @@
481
  "special": true
482
  },
483
  "60": {
484
- "content": "<|reserved_22|>",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
@@ -489,7 +489,7 @@
489
  "special": true
490
  },
491
  "61": {
492
- "content": "<|reserved_23|>",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
@@ -497,7 +497,7 @@
497
  "special": true
498
  },
499
  "62": {
500
- "content": "<|reserved_24|>",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
@@ -505,7 +505,7 @@
505
  "special": true
506
  },
507
  "63": {
508
- "content": "<|reserved_25|>",
509
  "lstrip": false,
510
  "normalized": false,
511
  "rstrip": false,
@@ -516,7 +516,7 @@
516
  "bos_token": "<s>",
517
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
518
  "clean_up_tokenization_spaces": false,
519
- "eos_token": "<|im_end|>",
520
  "model_max_length": 1000000000000000019884624838656,
521
  "pad_token": "</s>",
522
  "tokenizer_class": "PreTrainedTokenizerFast",
 
25
  "special": true
26
  },
27
  "3": {
28
+ "content": "<|im_start|>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
33
  "special": true
34
  },
35
  "4": {
36
+ "content": "<|im_end|>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
 
121
  "special": true
122
  },
123
  "15": {
124
+ "content": "\"arguments\"",
125
  "lstrip": false,
126
  "normalized": false,
127
  "rstrip": false,
 
129
  "special": true
130
  },
131
  "16": {
132
+ "content": "<arguments>",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
 
137
  "special": true
138
  },
139
  "17": {
140
+ "content": "<argument>",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
 
145
  "special": true
146
  },
147
  "18": {
148
+ "content": "<argument-name>",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
 
153
  "special": true
154
  },
155
  "19": {
156
+ "content": "<parameter>",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
 
161
  "special": true
162
  },
163
  "20": {
164
+ "content": "<parameter-name>",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
 
169
  "special": true
170
  },
171
  "21": {
172
+ "content": "<value>",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
 
177
  "special": true
178
  },
179
  "22": {
180
+ "content": "<argument-value>",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
 
185
  "special": true
186
  },
187
  "23": {
188
+ "content": "<parameter-value>",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
 
193
  "special": true
194
  },
195
  "24": {
196
+ "content": "\"name\"",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
 
201
  "special": true
202
  },
203
  "25": {
204
+ "content": "<function>",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
 
209
  "special": true
210
  },
211
  "26": {
212
+ "content": "<function-name>",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
 
217
  "special": true
218
  },
219
  "27": {
220
+ "content": " ",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
 
225
  "special": true
226
  },
227
  "28": {
228
+ "content": " ",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
 
233
  "special": true
234
  },
235
  "29": {
236
+ "content": " ",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
 
241
  "special": true
242
  },
243
  "30": {
244
+ "content": " ",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
 
249
  "special": true
250
  },
251
  "31": {
252
+ "content": " ",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
 
257
  "special": true
258
  },
259
  "32": {
260
+ "content": " ",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
 
265
  "special": true
266
  },
267
  "33": {
268
+ "content": " ",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
 
273
  "special": true
274
  },
275
  "34": {
276
+ "content": " ",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
 
281
  "special": true
282
  },
283
  "35": {
284
+ "content": " ",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
 
289
  "special": true
290
  },
291
  "36": {
292
+ "content": " ",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
 
297
  "special": true
298
  },
299
  "37": {
300
+ "content": " ",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
 
305
  "special": true
306
  },
307
  "38": {
308
+ "content": " ",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
 
313
  "special": true
314
  },
315
  "39": {
316
+ "content": " ",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
 
321
  "special": true
322
  },
323
  "40": {
324
+ "content": " ",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
 
329
  "special": true
330
  },
331
  "41": {
332
+ "content": " ",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
 
337
  "special": true
338
  },
339
  "42": {
340
+ "content": " ",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
 
345
  "special": true
346
  },
347
  "43": {
348
+ "content": " ",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
 
353
  "special": true
354
  },
355
  "44": {
356
+ "content": " ",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
 
361
  "special": true
362
  },
363
  "45": {
364
+ "content": " ",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
 
369
  "special": true
370
  },
371
  "46": {
372
+ "content": " ",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
 
377
  "special": true
378
  },
379
  "47": {
380
+ "content": " ",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
 
385
  "special": true
386
  },
387
  "48": {
388
+ "content": " ",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
 
393
  "special": true
394
  },
395
  "49": {
396
+ "content": " ",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
 
401
  "special": true
402
  },
403
  "50": {
404
+ "content": "<|reserved_0|>",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
 
409
  "special": true
410
  },
411
  "51": {
412
+ "content": "<|reserved_1|>",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
 
417
  "special": true
418
  },
419
  "52": {
420
+ "content": "<|reserved_2|>",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
 
425
  "special": true
426
  },
427
  "53": {
428
+ "content": "<|reserved_3|>",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
 
433
  "special": true
434
  },
435
  "54": {
436
+ "content": "<|reserved_4|>",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
 
441
  "special": true
442
  },
443
  "55": {
444
+ "content": "<|reserved_5|>",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
 
449
  "special": true
450
  },
451
  "56": {
452
+ "content": "<|reserved_6|>",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
 
457
  "special": true
458
  },
459
  "57": {
460
+ "content": "<|reserved_7|>",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
 
465
  "special": true
466
  },
467
  "58": {
468
+ "content": "<|reserved_8|>",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
 
473
  "special": true
474
  },
475
  "59": {
476
+ "content": "<|reserved_9|>",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
 
481
  "special": true
482
  },
483
  "60": {
484
+ "content": "<|reserved_10|>",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
 
489
  "special": true
490
  },
491
  "61": {
492
+ "content": "<|reserved_11|>",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
 
497
  "special": true
498
  },
499
  "62": {
500
+ "content": "<|reserved_12|>",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
 
505
  "special": true
506
  },
507
  "63": {
508
+ "content": "<|reserved_13|>",
509
  "lstrip": false,
510
  "normalized": false,
511
  "rstrip": false,
 
516
  "bos_token": "<s>",
517
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
518
  "clean_up_tokenization_spaces": false,
519
+ "eos_token": "</s>",
520
  "model_max_length": 1000000000000000019884624838656,
521
  "pad_token": "</s>",
522
  "tokenizer_class": "PreTrainedTokenizerFast",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff