mtasic85 committed on
Commit
8c82fc3
1 Parent(s): 38a72ab
Files changed (5) hide show
  1. merges.txt +0 -0
  2. scripts/train_tokenizer.py +22 -36
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +48 -464
  5. vocab.json +0 -0
merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
scripts/train_tokenizer.py CHANGED
@@ -1,10 +1,10 @@
1
  import string
2
 
3
  from datasets import load_dataset
4
- from tokenizers import ByteLevelBPETokenizer
 
5
  from transformers import PreTrainedTokenizerFast
6
 
7
-
8
  # dataset_0 = (
9
  # load_dataset('wikimedia/wikisource', lang, split='train')
10
  # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
@@ -38,64 +38,42 @@ def batch_iterator():
38
  # for d in dataset_0:
39
  # for row in d['text']:
40
  # yield row
41
- # break
42
- #
43
- # break
44
 
45
  for d in dataset_1:
46
  for row in d['text']:
47
  yield row
48
- # break
49
-
50
- # break
51
 
52
  for d in dataset_2:
53
  for row in d['text']:
54
  yield row
55
- # break
56
-
57
- # break
58
 
59
  # for row in dataset_3['text']:
60
  # yield row
61
- # break
62
 
63
  for row in dataset_4:
64
  yield row['query'] + '\n' + row['answer']
65
- # break
66
 
67
  for row in dataset_5:
68
  yield row['prompt'] + '\n' + row['response']
69
- # break
70
 
71
  # for row in dataset_6:
72
  # yield row['instruction'] + '\n' + row['output']
73
- # break
74
 
75
  for row in dataset_7:
76
  yield row['question'] + '\n' + row['answer']
77
- # break
78
 
79
  for row in dataset_8['conversations']:
80
  yield '\n'.join(n['value'] for n in row)
81
- # break
82
 
83
  for row in dataset_9['conversations']:
84
  yield '\n'.join(n['value'] for n in row)
85
- # break
86
 
87
  for d in dataset_10:
88
  for row in d['messages']:
89
  yield '\n'.join(n['content'] for n in row)
90
- # break
91
 
92
  for row in dataset_11:
93
  yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
94
- # break
95
-
96
-
97
- # for row in batch_iterator():
98
- # print(f'{row = }')
99
 
100
 
101
  special_tokens = [
@@ -115,32 +93,40 @@ special_tokens = [
115
  'system',
116
  'user',
117
  'assistant',
118
- *list(string.printable),
119
  ]
120
 
 
 
 
121
  for i in range(64 - len(special_tokens)):
122
  special_tokens.append(f'<|reserved_{i}|>')
123
 
124
  ascii_chars = string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation
125
 
126
- tokenizer = ByteLevelBPETokenizer()
 
 
 
127
 
128
- tokenizer.train_from_iterator(
129
- [ascii_chars],
130
- vocab_size=len(ascii_chars),
131
- min_frequency=1,
132
- special_tokens=[],
133
- )
134
 
135
- tokenizer.train_from_iterator(
136
- batch_iterator(),
137
  vocab_size=32064,
138
- min_frequency=2,
139
  special_tokens=special_tokens,
 
140
  )
141
 
142
- tokenizer.save_model('..')
 
143
 
 
 
 
144
  CHATML_CHAT_TEMPLATE = (
145
  "{% for message in messages %}"
146
  "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
 
1
  import string
2
 
3
  from datasets import load_dataset
4
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers
5
+ from tokenizers.processors import TemplateProcessing
6
  from transformers import PreTrainedTokenizerFast
7
 
 
8
  # dataset_0 = (
9
  # load_dataset('wikimedia/wikisource', lang, split='train')
10
  # for lang in ['20231201.ar', '20231201.as', '20231201.az', '20231201.ban', '20231201.be', '20231201.bg', '20231201.bn', '20231201.br', '20231201.bs', '20231201.ca', '20231201.cs', '20231201.cy', '20231201.da', '20231201.de', '20231201.el', '20231201.en', '20231201.eo', '20231201.es', '20231201.et', '20231201.eu', '20231201.fa', '20231201.fi', '20231201.fo', '20231201.fr', '20231201.gl', '20231201.gu', '20231201.he', '20231201.hi', '20231201.hr', '20231201.hu', '20231201.hy', '20231201.id', '20231201.is', '20231201.it', '20231201.ja', '20231201.jv', '20231201.kn', '20231201.ko', '20231201.la', '20231201.li', '20231201.lij', '20231201.lt', '20231201.mk', '20231201.ml', '20231201.mr', '20231201.nap', '20231201.nl', '20231201.no', '20231201.or', '20231201.pa', '20231201.pl', '20231201.pms', '20231201.pt', '20231201.ro', '20231201.ru', '20231201.sa', '20231201.sah', '20231201.sk', '20231201.sl', '20231201.sr', '20231201.su', '20231201.sv', '20231201.ta', '20231201.te', '20231201.th', '20231201.tr', '20231201.uk', '20231201.vec', '20231201.vi', '20231201.wa', '20231201.yi', '20231201.zh', '20231201.zh-min-nan']
 
38
  # for d in dataset_0:
39
  # for row in d['text']:
40
  # yield row
 
 
 
41
 
42
  for d in dataset_1:
43
  for row in d['text']:
44
  yield row
 
 
 
45
 
46
  for d in dataset_2:
47
  for row in d['text']:
48
  yield row
 
 
 
49
 
50
  # for row in dataset_3['text']:
51
  # yield row
 
52
 
53
  for row in dataset_4:
54
  yield row['query'] + '\n' + row['answer']
 
55
 
56
  for row in dataset_5:
57
  yield row['prompt'] + '\n' + row['response']
 
58
 
59
  # for row in dataset_6:
60
  # yield row['instruction'] + '\n' + row['output']
 
61
 
62
  for row in dataset_7:
63
  yield row['question'] + '\n' + row['answer']
 
64
 
65
  for row in dataset_8['conversations']:
66
  yield '\n'.join(n['value'] for n in row)
 
67
 
68
  for row in dataset_9['conversations']:
69
  yield '\n'.join(n['value'] for n in row)
 
70
 
71
  for d in dataset_10:
72
  for row in d['messages']:
73
  yield '\n'.join(n['content'] for n in row)
 
74
 
75
  for row in dataset_11:
76
  yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
 
 
 
 
 
77
 
78
 
79
  special_tokens = [
 
93
  'system',
94
  'user',
95
  'assistant',
 
96
  ]
97
 
98
+ for i in range(2, 25):
99
+ special_tokens.append(' ' * i)
100
+
101
  for i in range(64 - len(special_tokens)):
102
  special_tokens.append(f'<|reserved_{i}|>')
103
 
104
  ascii_chars = string.ascii_letters + string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation
105
 
106
+ #
107
+ # tokenizer
108
+ #
109
+ tokenizer = Tokenizer(models.BPE())
110
 
111
+ # set up pre-tokenizer to split on whitespace and punctuation
112
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
113
+ pre_tokenizers.WhitespaceSplit(),
114
+ pre_tokenizers.Punctuation(),
115
+ ])
 
116
 
117
+ # trainer
118
+ trainer = trainers.BpeTrainer(
119
  vocab_size=32064,
 
120
  special_tokens=special_tokens,
121
+ initial_alphabet=list(ascii_chars),
122
  )
123
 
124
+ # train the tokenizer
125
+ tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
126
 
127
+ #
128
+ # fast_tokenizer
129
+ #
130
  CHATML_CHAT_TEMPLATE = (
131
  "{% for message in messages %}"
132
  "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -129,7 +129,7 @@
129
  "special": true
130
  },
131
  "16": {
132
- "content": "0",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
@@ -137,7 +137,7 @@
137
  "special": true
138
  },
139
  "17": {
140
- "content": "1",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
@@ -145,7 +145,7 @@
145
  "special": true
146
  },
147
  "18": {
148
- "content": "2",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
@@ -153,7 +153,7 @@
153
  "special": true
154
  },
155
  "19": {
156
- "content": "3",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
@@ -161,7 +161,7 @@
161
  "special": true
162
  },
163
  "20": {
164
- "content": "4",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
@@ -169,7 +169,7 @@
169
  "special": true
170
  },
171
  "21": {
172
- "content": "5",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
@@ -177,7 +177,7 @@
177
  "special": true
178
  },
179
  "22": {
180
- "content": "6",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
@@ -185,7 +185,7 @@
185
  "special": true
186
  },
187
  "23": {
188
- "content": "7",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
@@ -193,7 +193,7 @@
193
  "special": true
194
  },
195
  "24": {
196
- "content": "8",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
@@ -201,7 +201,7 @@
201
  "special": true
202
  },
203
  "25": {
204
- "content": "9",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
@@ -209,7 +209,7 @@
209
  "special": true
210
  },
211
  "26": {
212
- "content": "a",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
@@ -217,7 +217,7 @@
217
  "special": true
218
  },
219
  "27": {
220
- "content": "b",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
@@ -225,7 +225,7 @@
225
  "special": true
226
  },
227
  "28": {
228
- "content": "c",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
@@ -233,7 +233,7 @@
233
  "special": true
234
  },
235
  "29": {
236
- "content": "d",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
@@ -241,7 +241,7 @@
241
  "special": true
242
  },
243
  "30": {
244
- "content": "e",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
@@ -249,7 +249,7 @@
249
  "special": true
250
  },
251
  "31": {
252
- "content": "f",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
@@ -257,7 +257,7 @@
257
  "special": true
258
  },
259
  "32": {
260
- "content": "g",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
@@ -265,7 +265,7 @@
265
  "special": true
266
  },
267
  "33": {
268
- "content": "h",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
@@ -273,7 +273,7 @@
273
  "special": true
274
  },
275
  "34": {
276
- "content": "i",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
@@ -281,7 +281,7 @@
281
  "special": true
282
  },
283
  "35": {
284
- "content": "j",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
@@ -289,7 +289,7 @@
289
  "special": true
290
  },
291
  "36": {
292
- "content": "k",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
@@ -297,7 +297,7 @@
297
  "special": true
298
  },
299
  "37": {
300
- "content": "l",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
@@ -305,7 +305,7 @@
305
  "special": true
306
  },
307
  "38": {
308
- "content": "m",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
@@ -313,7 +313,7 @@
313
  "special": true
314
  },
315
  "39": {
316
- "content": "n",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
@@ -321,7 +321,7 @@
321
  "special": true
322
  },
323
  "40": {
324
- "content": "o",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
@@ -329,7 +329,7 @@
329
  "special": true
330
  },
331
  "41": {
332
- "content": "p",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
@@ -337,7 +337,7 @@
337
  "special": true
338
  },
339
  "42": {
340
- "content": "q",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
@@ -345,7 +345,7 @@
345
  "special": true
346
  },
347
  "43": {
348
- "content": "r",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
@@ -353,7 +353,7 @@
353
  "special": true
354
  },
355
  "44": {
356
- "content": "s",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
@@ -361,7 +361,7 @@
361
  "special": true
362
  },
363
  "45": {
364
- "content": "t",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
@@ -369,7 +369,7 @@
369
  "special": true
370
  },
371
  "46": {
372
- "content": "u",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
@@ -377,7 +377,7 @@
377
  "special": true
378
  },
379
  "47": {
380
- "content": "v",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
@@ -385,7 +385,7 @@
385
  "special": true
386
  },
387
  "48": {
388
- "content": "w",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
@@ -393,7 +393,7 @@
393
  "special": true
394
  },
395
  "49": {
396
- "content": "x",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
@@ -401,7 +401,7 @@
401
  "special": true
402
  },
403
  "50": {
404
- "content": "y",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
@@ -409,7 +409,7 @@
409
  "special": true
410
  },
411
  "51": {
412
- "content": "z",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
@@ -417,7 +417,7 @@
417
  "special": true
418
  },
419
  "52": {
420
- "content": "A",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
@@ -425,7 +425,7 @@
425
  "special": true
426
  },
427
  "53": {
428
- "content": "B",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
@@ -433,7 +433,7 @@
433
  "special": true
434
  },
435
  "54": {
436
- "content": "C",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
@@ -441,7 +441,7 @@
441
  "special": true
442
  },
443
  "55": {
444
- "content": "D",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
@@ -449,7 +449,7 @@
449
  "special": true
450
  },
451
  "56": {
452
- "content": "E",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
@@ -457,7 +457,7 @@
457
  "special": true
458
  },
459
  "57": {
460
- "content": "F",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
@@ -465,7 +465,7 @@
465
  "special": true
466
  },
467
  "58": {
468
- "content": "G",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
@@ -473,7 +473,7 @@
473
  "special": true
474
  },
475
  "59": {
476
- "content": "H",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
@@ -481,7 +481,7 @@
481
  "special": true
482
  },
483
  "60": {
484
- "content": "I",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
@@ -489,7 +489,7 @@
489
  "special": true
490
  },
491
  "61": {
492
- "content": "J",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
@@ -497,7 +497,7 @@
497
  "special": true
498
  },
499
  "62": {
500
- "content": "K",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
@@ -505,423 +505,7 @@
505
  "special": true
506
  },
507
  "63": {
508
- "content": "L",
509
- "lstrip": false,
510
- "normalized": false,
511
- "rstrip": false,
512
- "single_word": false,
513
- "special": true
514
- },
515
- "64": {
516
- "content": "M",
517
- "lstrip": false,
518
- "normalized": false,
519
- "rstrip": false,
520
- "single_word": false,
521
- "special": true
522
- },
523
- "65": {
524
- "content": "N",
525
- "lstrip": false,
526
- "normalized": false,
527
- "rstrip": false,
528
- "single_word": false,
529
- "special": true
530
- },
531
- "66": {
532
- "content": "O",
533
- "lstrip": false,
534
- "normalized": false,
535
- "rstrip": false,
536
- "single_word": false,
537
- "special": true
538
- },
539
- "67": {
540
- "content": "P",
541
- "lstrip": false,
542
- "normalized": false,
543
- "rstrip": false,
544
- "single_word": false,
545
- "special": true
546
- },
547
- "68": {
548
- "content": "Q",
549
- "lstrip": false,
550
- "normalized": false,
551
- "rstrip": false,
552
- "single_word": false,
553
- "special": true
554
- },
555
- "69": {
556
- "content": "R",
557
- "lstrip": false,
558
- "normalized": false,
559
- "rstrip": false,
560
- "single_word": false,
561
- "special": true
562
- },
563
- "70": {
564
- "content": "S",
565
- "lstrip": false,
566
- "normalized": false,
567
- "rstrip": false,
568
- "single_word": false,
569
- "special": true
570
- },
571
- "71": {
572
- "content": "T",
573
- "lstrip": false,
574
- "normalized": false,
575
- "rstrip": false,
576
- "single_word": false,
577
- "special": true
578
- },
579
- "72": {
580
- "content": "U",
581
- "lstrip": false,
582
- "normalized": false,
583
- "rstrip": false,
584
- "single_word": false,
585
- "special": true
586
- },
587
- "73": {
588
- "content": "V",
589
- "lstrip": false,
590
- "normalized": false,
591
- "rstrip": false,
592
- "single_word": false,
593
- "special": true
594
- },
595
- "74": {
596
- "content": "W",
597
- "lstrip": false,
598
- "normalized": false,
599
- "rstrip": false,
600
- "single_word": false,
601
- "special": true
602
- },
603
- "75": {
604
- "content": "X",
605
- "lstrip": false,
606
- "normalized": false,
607
- "rstrip": false,
608
- "single_word": false,
609
- "special": true
610
- },
611
- "76": {
612
- "content": "Y",
613
- "lstrip": false,
614
- "normalized": false,
615
- "rstrip": false,
616
- "single_word": false,
617
- "special": true
618
- },
619
- "77": {
620
- "content": "Z",
621
- "lstrip": false,
622
- "normalized": false,
623
- "rstrip": false,
624
- "single_word": false,
625
- "special": true
626
- },
627
- "78": {
628
- "content": "!",
629
- "lstrip": false,
630
- "normalized": false,
631
- "rstrip": false,
632
- "single_word": false,
633
- "special": true
634
- },
635
- "79": {
636
- "content": "\"",
637
- "lstrip": false,
638
- "normalized": false,
639
- "rstrip": false,
640
- "single_word": false,
641
- "special": true
642
- },
643
- "80": {
644
- "content": "#",
645
- "lstrip": false,
646
- "normalized": false,
647
- "rstrip": false,
648
- "single_word": false,
649
- "special": true
650
- },
651
- "81": {
652
- "content": "$",
653
- "lstrip": false,
654
- "normalized": false,
655
- "rstrip": false,
656
- "single_word": false,
657
- "special": true
658
- },
659
- "82": {
660
- "content": "%",
661
- "lstrip": false,
662
- "normalized": false,
663
- "rstrip": false,
664
- "single_word": false,
665
- "special": true
666
- },
667
- "83": {
668
- "content": "&",
669
- "lstrip": false,
670
- "normalized": false,
671
- "rstrip": false,
672
- "single_word": false,
673
- "special": true
674
- },
675
- "84": {
676
- "content": "'",
677
- "lstrip": false,
678
- "normalized": false,
679
- "rstrip": false,
680
- "single_word": false,
681
- "special": true
682
- },
683
- "85": {
684
- "content": "(",
685
- "lstrip": false,
686
- "normalized": false,
687
- "rstrip": false,
688
- "single_word": false,
689
- "special": true
690
- },
691
- "86": {
692
- "content": ")",
693
- "lstrip": false,
694
- "normalized": false,
695
- "rstrip": false,
696
- "single_word": false,
697
- "special": true
698
- },
699
- "87": {
700
- "content": "*",
701
- "lstrip": false,
702
- "normalized": false,
703
- "rstrip": false,
704
- "single_word": false,
705
- "special": true
706
- },
707
- "88": {
708
- "content": "+",
709
- "lstrip": false,
710
- "normalized": false,
711
- "rstrip": false,
712
- "single_word": false,
713
- "special": true
714
- },
715
- "89": {
716
- "content": ",",
717
- "lstrip": false,
718
- "normalized": false,
719
- "rstrip": false,
720
- "single_word": false,
721
- "special": true
722
- },
723
- "90": {
724
- "content": "-",
725
- "lstrip": false,
726
- "normalized": false,
727
- "rstrip": false,
728
- "single_word": false,
729
- "special": true
730
- },
731
- "91": {
732
- "content": ".",
733
- "lstrip": false,
734
- "normalized": false,
735
- "rstrip": false,
736
- "single_word": false,
737
- "special": true
738
- },
739
- "92": {
740
- "content": "/",
741
- "lstrip": false,
742
- "normalized": false,
743
- "rstrip": false,
744
- "single_word": false,
745
- "special": true
746
- },
747
- "93": {
748
- "content": ":",
749
- "lstrip": false,
750
- "normalized": false,
751
- "rstrip": false,
752
- "single_word": false,
753
- "special": true
754
- },
755
- "94": {
756
- "content": ";",
757
- "lstrip": false,
758
- "normalized": false,
759
- "rstrip": false,
760
- "single_word": false,
761
- "special": true
762
- },
763
- "95": {
764
- "content": "<",
765
- "lstrip": false,
766
- "normalized": false,
767
- "rstrip": false,
768
- "single_word": false,
769
- "special": true
770
- },
771
- "96": {
772
- "content": "=",
773
- "lstrip": false,
774
- "normalized": false,
775
- "rstrip": false,
776
- "single_word": false,
777
- "special": true
778
- },
779
- "97": {
780
- "content": ">",
781
- "lstrip": false,
782
- "normalized": false,
783
- "rstrip": false,
784
- "single_word": false,
785
- "special": true
786
- },
787
- "98": {
788
- "content": "?",
789
- "lstrip": false,
790
- "normalized": false,
791
- "rstrip": false,
792
- "single_word": false,
793
- "special": true
794
- },
795
- "99": {
796
- "content": "@",
797
- "lstrip": false,
798
- "normalized": false,
799
- "rstrip": false,
800
- "single_word": false,
801
- "special": true
802
- },
803
- "100": {
804
- "content": "[",
805
- "lstrip": false,
806
- "normalized": false,
807
- "rstrip": false,
808
- "single_word": false,
809
- "special": true
810
- },
811
- "101": {
812
- "content": "\\",
813
- "lstrip": false,
814
- "normalized": false,
815
- "rstrip": false,
816
- "single_word": false,
817
- "special": true
818
- },
819
- "102": {
820
- "content": "]",
821
- "lstrip": false,
822
- "normalized": false,
823
- "rstrip": false,
824
- "single_word": false,
825
- "special": true
826
- },
827
- "103": {
828
- "content": "^",
829
- "lstrip": false,
830
- "normalized": false,
831
- "rstrip": false,
832
- "single_word": false,
833
- "special": true
834
- },
835
- "104": {
836
- "content": "_",
837
- "lstrip": false,
838
- "normalized": false,
839
- "rstrip": false,
840
- "single_word": false,
841
- "special": true
842
- },
843
- "105": {
844
- "content": "`",
845
- "lstrip": false,
846
- "normalized": false,
847
- "rstrip": false,
848
- "single_word": false,
849
- "special": true
850
- },
851
- "106": {
852
- "content": "{",
853
- "lstrip": false,
854
- "normalized": false,
855
- "rstrip": false,
856
- "single_word": false,
857
- "special": true
858
- },
859
- "107": {
860
- "content": "|",
861
- "lstrip": false,
862
- "normalized": false,
863
- "rstrip": false,
864
- "single_word": false,
865
- "special": true
866
- },
867
- "108": {
868
- "content": "}",
869
- "lstrip": false,
870
- "normalized": false,
871
- "rstrip": false,
872
- "single_word": false,
873
- "special": true
874
- },
875
- "109": {
876
- "content": "~",
877
- "lstrip": false,
878
- "normalized": false,
879
- "rstrip": false,
880
- "single_word": false,
881
- "special": true
882
- },
883
- "110": {
884
- "content": " ",
885
- "lstrip": false,
886
- "normalized": false,
887
- "rstrip": false,
888
- "single_word": false,
889
- "special": true
890
- },
891
- "111": {
892
- "content": "\t",
893
- "lstrip": false,
894
- "normalized": false,
895
- "rstrip": false,
896
- "single_word": false,
897
- "special": true
898
- },
899
- "112": {
900
- "content": "\n",
901
- "lstrip": false,
902
- "normalized": false,
903
- "rstrip": false,
904
- "single_word": false,
905
- "special": true
906
- },
907
- "113": {
908
- "content": "\r",
909
- "lstrip": false,
910
- "normalized": false,
911
- "rstrip": false,
912
- "single_word": false,
913
- "special": true
914
- },
915
- "114": {
916
- "content": "\u000b",
917
- "lstrip": false,
918
- "normalized": false,
919
- "rstrip": false,
920
- "single_word": false,
921
- "special": true
922
- },
923
- "115": {
924
- "content": "\f",
925
  "lstrip": false,
926
  "normalized": false,
927
  "rstrip": false,
 
129
  "special": true
130
  },
131
  "16": {
132
+ "content": " ",
133
  "lstrip": false,
134
  "normalized": false,
135
  "rstrip": false,
 
137
  "special": true
138
  },
139
  "17": {
140
+ "content": " ",
141
  "lstrip": false,
142
  "normalized": false,
143
  "rstrip": false,
 
145
  "special": true
146
  },
147
  "18": {
148
+ "content": " ",
149
  "lstrip": false,
150
  "normalized": false,
151
  "rstrip": false,
 
153
  "special": true
154
  },
155
  "19": {
156
+ "content": " ",
157
  "lstrip": false,
158
  "normalized": false,
159
  "rstrip": false,
 
161
  "special": true
162
  },
163
  "20": {
164
+ "content": " ",
165
  "lstrip": false,
166
  "normalized": false,
167
  "rstrip": false,
 
169
  "special": true
170
  },
171
  "21": {
172
+ "content": " ",
173
  "lstrip": false,
174
  "normalized": false,
175
  "rstrip": false,
 
177
  "special": true
178
  },
179
  "22": {
180
+ "content": " ",
181
  "lstrip": false,
182
  "normalized": false,
183
  "rstrip": false,
 
185
  "special": true
186
  },
187
  "23": {
188
+ "content": " ",
189
  "lstrip": false,
190
  "normalized": false,
191
  "rstrip": false,
 
193
  "special": true
194
  },
195
  "24": {
196
+ "content": " ",
197
  "lstrip": false,
198
  "normalized": false,
199
  "rstrip": false,
 
201
  "special": true
202
  },
203
  "25": {
204
+ "content": " ",
205
  "lstrip": false,
206
  "normalized": false,
207
  "rstrip": false,
 
209
  "special": true
210
  },
211
  "26": {
212
+ "content": " ",
213
  "lstrip": false,
214
  "normalized": false,
215
  "rstrip": false,
 
217
  "special": true
218
  },
219
  "27": {
220
+ "content": " ",
221
  "lstrip": false,
222
  "normalized": false,
223
  "rstrip": false,
 
225
  "special": true
226
  },
227
  "28": {
228
+ "content": " ",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
 
233
  "special": true
234
  },
235
  "29": {
236
+ "content": " ",
237
  "lstrip": false,
238
  "normalized": false,
239
  "rstrip": false,
 
241
  "special": true
242
  },
243
  "30": {
244
+ "content": " ",
245
  "lstrip": false,
246
  "normalized": false,
247
  "rstrip": false,
 
249
  "special": true
250
  },
251
  "31": {
252
+ "content": " ",
253
  "lstrip": false,
254
  "normalized": false,
255
  "rstrip": false,
 
257
  "special": true
258
  },
259
  "32": {
260
+ "content": " ",
261
  "lstrip": false,
262
  "normalized": false,
263
  "rstrip": false,
 
265
  "special": true
266
  },
267
  "33": {
268
+ "content": " ",
269
  "lstrip": false,
270
  "normalized": false,
271
  "rstrip": false,
 
273
  "special": true
274
  },
275
  "34": {
276
+ "content": " ",
277
  "lstrip": false,
278
  "normalized": false,
279
  "rstrip": false,
 
281
  "special": true
282
  },
283
  "35": {
284
+ "content": " ",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
 
289
  "special": true
290
  },
291
  "36": {
292
+ "content": " ",
293
  "lstrip": false,
294
  "normalized": false,
295
  "rstrip": false,
 
297
  "special": true
298
  },
299
  "37": {
300
+ "content": " ",
301
  "lstrip": false,
302
  "normalized": false,
303
  "rstrip": false,
 
305
  "special": true
306
  },
307
  "38": {
308
+ "content": " ",
309
  "lstrip": false,
310
  "normalized": false,
311
  "rstrip": false,
 
313
  "special": true
314
  },
315
  "39": {
316
+ "content": "<|reserved_0|>",
317
  "lstrip": false,
318
  "normalized": false,
319
  "rstrip": false,
 
321
  "special": true
322
  },
323
  "40": {
324
+ "content": "<|reserved_1|>",
325
  "lstrip": false,
326
  "normalized": false,
327
  "rstrip": false,
 
329
  "special": true
330
  },
331
  "41": {
332
+ "content": "<|reserved_2|>",
333
  "lstrip": false,
334
  "normalized": false,
335
  "rstrip": false,
 
337
  "special": true
338
  },
339
  "42": {
340
+ "content": "<|reserved_3|>",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
 
345
  "special": true
346
  },
347
  "43": {
348
+ "content": "<|reserved_4|>",
349
  "lstrip": false,
350
  "normalized": false,
351
  "rstrip": false,
 
353
  "special": true
354
  },
355
  "44": {
356
+ "content": "<|reserved_5|>",
357
  "lstrip": false,
358
  "normalized": false,
359
  "rstrip": false,
 
361
  "special": true
362
  },
363
  "45": {
364
+ "content": "<|reserved_6|>",
365
  "lstrip": false,
366
  "normalized": false,
367
  "rstrip": false,
 
369
  "special": true
370
  },
371
  "46": {
372
+ "content": "<|reserved_7|>",
373
  "lstrip": false,
374
  "normalized": false,
375
  "rstrip": false,
 
377
  "special": true
378
  },
379
  "47": {
380
+ "content": "<|reserved_8|>",
381
  "lstrip": false,
382
  "normalized": false,
383
  "rstrip": false,
 
385
  "special": true
386
  },
387
  "48": {
388
+ "content": "<|reserved_9|>",
389
  "lstrip": false,
390
  "normalized": false,
391
  "rstrip": false,
 
393
  "special": true
394
  },
395
  "49": {
396
+ "content": "<|reserved_10|>",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
 
401
  "special": true
402
  },
403
  "50": {
404
+ "content": "<|reserved_11|>",
405
  "lstrip": false,
406
  "normalized": false,
407
  "rstrip": false,
 
409
  "special": true
410
  },
411
  "51": {
412
+ "content": "<|reserved_12|>",
413
  "lstrip": false,
414
  "normalized": false,
415
  "rstrip": false,
 
417
  "special": true
418
  },
419
  "52": {
420
+ "content": "<|reserved_13|>",
421
  "lstrip": false,
422
  "normalized": false,
423
  "rstrip": false,
 
425
  "special": true
426
  },
427
  "53": {
428
+ "content": "<|reserved_14|>",
429
  "lstrip": false,
430
  "normalized": false,
431
  "rstrip": false,
 
433
  "special": true
434
  },
435
  "54": {
436
+ "content": "<|reserved_15|>",
437
  "lstrip": false,
438
  "normalized": false,
439
  "rstrip": false,
 
441
  "special": true
442
  },
443
  "55": {
444
+ "content": "<|reserved_16|>",
445
  "lstrip": false,
446
  "normalized": false,
447
  "rstrip": false,
 
449
  "special": true
450
  },
451
  "56": {
452
+ "content": "<|reserved_17|>",
453
  "lstrip": false,
454
  "normalized": false,
455
  "rstrip": false,
 
457
  "special": true
458
  },
459
  "57": {
460
+ "content": "<|reserved_18|>",
461
  "lstrip": false,
462
  "normalized": false,
463
  "rstrip": false,
 
465
  "special": true
466
  },
467
  "58": {
468
+ "content": "<|reserved_19|>",
469
  "lstrip": false,
470
  "normalized": false,
471
  "rstrip": false,
 
473
  "special": true
474
  },
475
  "59": {
476
+ "content": "<|reserved_20|>",
477
  "lstrip": false,
478
  "normalized": false,
479
  "rstrip": false,
 
481
  "special": true
482
  },
483
  "60": {
484
+ "content": "<|reserved_21|>",
485
  "lstrip": false,
486
  "normalized": false,
487
  "rstrip": false,
 
489
  "special": true
490
  },
491
  "61": {
492
+ "content": "<|reserved_22|>",
493
  "lstrip": false,
494
  "normalized": false,
495
  "rstrip": false,
 
497
  "special": true
498
  },
499
  "62": {
500
+ "content": "<|reserved_23|>",
501
  "lstrip": false,
502
  "normalized": false,
503
  "rstrip": false,
 
505
  "special": true
506
  },
507
  "63": {
508
+ "content": "<|reserved_24|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  "lstrip": false,
510
  "normalized": false,
511
  "rstrip": false,
vocab.json DELETED
The diff for this file is too large to render. See raw diff