Modified gitignore
Browse files- .gitignore +1 -1
- tokenizer/merges.txt +244 -0
- tokenizer/vocab.txt +0 -0
.gitignore
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
tokenizer
|
2 |
tokenizer/README.md
|
3 |
__pycache__
|
|
|
1 |
+
tokenizer/cindrella_stories.txt
|
2 |
tokenizer/README.md
|
3 |
__pycache__
|
tokenizer/merges.txt
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
101 32
|
2 |
+
226 128
|
3 |
+
32 116
|
4 |
+
101 114
|
5 |
+
100 32
|
6 |
+
258 104
|
7 |
+
105 110
|
8 |
+
108 108
|
9 |
+
115 116
|
10 |
+
32 97
|
11 |
+
257 156
|
12 |
+
32 119
|
13 |
+
261 256
|
14 |
+
257 157
|
15 |
+
46 32
|
16 |
+
10 10
|
17 |
+
111 32
|
18 |
+
115 32
|
19 |
+
111 117
|
20 |
+
97 105
|
21 |
+
270 32
|
22 |
+
116 104
|
23 |
+
32 115
|
24 |
+
259 101
|
25 |
+
116 32
|
26 |
+
271 266
|
27 |
+
101 97
|
28 |
+
262 100
|
29 |
+
263 97
|
30 |
+
67 283
|
31 |
+
285 279
|
32 |
+
286 284
|
33 |
+
32 99
|
34 |
+
111 110
|
35 |
+
101 260
|
36 |
+
101 112
|
37 |
+
44 32
|
38 |
+
264 291
|
39 |
+
111 114
|
40 |
+
110 260
|
41 |
+
269 278
|
42 |
+
32 104
|
43 |
+
115 105
|
44 |
+
101 110
|
45 |
+
296 275
|
46 |
+
109 111
|
47 |
+
267 97
|
48 |
+
262 103
|
49 |
+
111 111
|
50 |
+
84 104
|
51 |
+
109 101
|
52 |
+
32 98
|
53 |
+
105 114
|
54 |
+
259 32
|
55 |
+
258 272
|
56 |
+
100 268
|
57 |
+
97 263
|
58 |
+
256 116
|
59 |
+
111 102
|
60 |
+
301 277
|
61 |
+
103 111
|
62 |
+
110 111
|
63 |
+
298 264
|
64 |
+
32 100
|
65 |
+
97 114
|
66 |
+
265 295
|
67 |
+
46 281
|
68 |
+
103 104
|
69 |
+
275 114
|
70 |
+
269 281
|
71 |
+
32 108
|
72 |
+
293 318
|
73 |
+
104 97
|
74 |
+
315 259
|
75 |
+
327 259
|
76 |
+
121 274
|
77 |
+
114 105
|
78 |
+
260 287
|
79 |
+
276 266
|
80 |
+
268 98
|
81 |
+
323 116
|
82 |
+
324 121
|
83 |
+
304 107
|
84 |
+
101 115
|
85 |
+
261 101
|
86 |
+
102 294
|
87 |
+
313 272
|
88 |
+
259 256
|
89 |
+
117 116
|
90 |
+
300 333
|
91 |
+
97 110
|
92 |
+
32 73
|
93 |
+
300 311
|
94 |
+
267 105
|
95 |
+
70 337
|
96 |
+
302 273
|
97 |
+
267 104
|
98 |
+
258 111
|
99 |
+
32 109
|
100 |
+
105 116
|
101 |
+
305 256
|
102 |
+
97 121
|
103 |
+
101 44
|
104 |
+
118 256
|
105 |
+
119 110
|
106 |
+
114 282
|
107 |
+
335 312
|
108 |
+
97 103
|
109 |
+
262 265
|
110 |
+
348 350
|
111 |
+
32 103
|
112 |
+
293 329
|
113 |
+
107 256
|
114 |
+
117 112
|
115 |
+
268 102
|
116 |
+
112 111
|
117 |
+
274 108
|
118 |
+
114 111
|
119 |
+
115 104
|
120 |
+
32 306
|
121 |
+
33 269
|
122 |
+
32 32
|
123 |
+
282 114
|
124 |
+
105 102
|
125 |
+
100 111
|
126 |
+
89 274
|
127 |
+
261 97
|
128 |
+
87 104
|
129 |
+
63 325
|
130 |
+
353 362
|
131 |
+
110 105
|
132 |
+
320 332
|
133 |
+
105 100
|
134 |
+
32 71
|
135 |
+
389 111
|
136 |
+
390 100
|
137 |
+
105 273
|
138 |
+
308 108
|
139 |
+
119 105
|
140 |
+
99 111
|
141 |
+
100 310
|
142 |
+
316 303
|
143 |
+
372 260
|
144 |
+
317 280
|
145 |
+
288 108
|
146 |
+
375 115
|
147 |
+
330 273
|
148 |
+
288 312
|
149 |
+
115 265
|
150 |
+
267 343
|
151 |
+
263 32
|
152 |
+
326 97
|
153 |
+
117 108
|
154 |
+
105 260
|
155 |
+
297 97
|
156 |
+
83 104
|
157 |
+
115 101
|
158 |
+
32 102
|
159 |
+
114 339
|
160 |
+
10 266
|
161 |
+
334 383
|
162 |
+
288 346
|
163 |
+
386 336
|
164 |
+
387 363
|
165 |
+
419 256
|
166 |
+
112 112
|
167 |
+
421 290
|
168 |
+
117 264
|
169 |
+
98 108
|
170 |
+
108 121
|
171 |
+
46 325
|
172 |
+
371 115
|
173 |
+
365 391
|
174 |
+
108 338
|
175 |
+
97 109
|
176 |
+
290 287
|
177 |
+
326 105
|
178 |
+
394 277
|
179 |
+
297 309
|
180 |
+
258 119
|
181 |
+
435 272
|
182 |
+
330 115
|
183 |
+
276 73
|
184 |
+
287 352
|
185 |
+
439 272
|
186 |
+
299 32
|
187 |
+
370 308
|
188 |
+
87 328
|
189 |
+
116 265
|
190 |
+
265 114
|
191 |
+
108 97
|
192 |
+
340 121
|
193 |
+
104 309
|
194 |
+
46 269
|
195 |
+
105 103
|
196 |
+
340 308
|
197 |
+
105 109
|
198 |
+
33 377
|
199 |
+
406 314
|
200 |
+
267 398
|
201 |
+
307 282
|
202 |
+
456 344
|
203 |
+
457 379
|
204 |
+
458 408
|
205 |
+
32 316
|
206 |
+
256 277
|
207 |
+
32 317
|
208 |
+
276 411
|
209 |
+
259 276
|
210 |
+
315 309
|
211 |
+
292 266
|
212 |
+
349 263
|
213 |
+
63 269
|
214 |
+
99 107
|
215 |
+
101 116
|
216 |
+
104 259
|
217 |
+
367 321
|
218 |
+
46 271
|
219 |
+
99 420
|
220 |
+
104 111
|
221 |
+
66 344
|
222 |
+
280 314
|
223 |
+
427 298
|
224 |
+
478 424
|
225 |
+
388 418
|
226 |
+
79 78
|
227 |
+
366 393
|
228 |
+
32 110
|
229 |
+
290 433
|
230 |
+
265 110
|
231 |
+
438 116
|
232 |
+
369 32
|
233 |
+
99 104
|
234 |
+
355 351
|
235 |
+
320 107
|
236 |
+
116 442
|
237 |
+
101 276
|
238 |
+
101 311
|
239 |
+
270 356
|
240 |
+
400 282
|
241 |
+
495 110
|
242 |
+
97 374
|
243 |
+
259 273
|
244 |
+
98 121
|
tokenizer/vocab.txt
ADDED
Binary file (3.93 kB). View file
|
|