guipenedo HF staff committed on
Commit
79ddb0e
1 Parent(s): 3a9bfe9

cleaned up unused parent tokenizers

Browse files
data/Afro-Asiatic.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/Austro-Asiatic.json CHANGED
@@ -36,35 +36,13 @@
36
  "name": "Aslian",
37
  "iso_1_code": null,
38
  "iso_3_code": null,
39
- "tokenizers": {
40
- "Latn": {
41
- "full_object": "SpaCyTokenizer(\"vi\")",
42
- "original_lang_name": "vietnamese",
43
- "original_lang_code": "vie",
44
- "scripts": [
45
- "Latn"
46
- ],
47
- "class_name": "SpaCyTokenizer",
48
- "macrolanguage": false
49
- }
50
- },
51
  "children": [
52
  {
53
  "name": "Jah Hut",
54
  "iso_1_code": null,
55
  "iso_3_code": null,
56
- "tokenizers": {
57
- "Latn": {
58
- "full_object": "SpaCyTokenizer(\"vi\")",
59
- "original_lang_name": "vietnamese",
60
- "original_lang_code": "vie",
61
- "scripts": [
62
- "Latn"
63
- ],
64
- "class_name": "SpaCyTokenizer",
65
- "macrolanguage": false
66
- }
67
- },
68
  "children": [
69
  {
70
  "name": "Jah Hut",
@@ -85,35 +63,13 @@
85
  "name": "North Aslian",
86
  "iso_1_code": null,
87
  "iso_3_code": null,
88
- "tokenizers": {
89
- "Latn": {
90
- "full_object": "SpaCyTokenizer(\"vi\")",
91
- "original_lang_name": "vietnamese",
92
- "original_lang_code": "vie",
93
- "scripts": [
94
- "Latn"
95
- ],
96
- "class_name": "SpaCyTokenizer",
97
- "macrolanguage": false
98
- }
99
- },
100
  "children": [
101
  {
102
  "name": "Chewong",
103
  "iso_1_code": null,
104
  "iso_3_code": null,
105
- "tokenizers": {
106
- "Latn": {
107
- "full_object": "SpaCyTokenizer(\"vi\")",
108
- "original_lang_name": "vietnamese",
109
- "original_lang_code": "vie",
110
- "scripts": [
111
- "Latn"
112
- ],
113
- "class_name": "SpaCyTokenizer",
114
- "macrolanguage": false
115
- }
116
- },
117
  "children": [
118
  {
119
  "name": "Cheq Wong",
@@ -134,18 +90,7 @@
134
  "name": "Eastern",
135
  "iso_1_code": null,
136
  "iso_3_code": null,
137
- "tokenizers": {
138
- "Latn": {
139
- "full_object": "SpaCyTokenizer(\"vi\")",
140
- "original_lang_name": "vietnamese",
141
- "original_lang_code": "vie",
142
- "scripts": [
143
- "Latn"
144
- ],
145
- "class_name": "SpaCyTokenizer",
146
- "macrolanguage": false
147
- }
148
- },
149
  "children": [
150
  {
151
  "name": "Batek",
@@ -196,18 +141,7 @@
196
  "name": "Tonga",
197
  "iso_1_code": null,
198
  "iso_3_code": null,
199
- "tokenizers": {
200
- "Latn": {
201
- "full_object": "SpaCyTokenizer(\"vi\")",
202
- "original_lang_name": "vietnamese",
203
- "original_lang_code": "vie",
204
- "scripts": [
205
- "Latn"
206
- ],
207
- "class_name": "SpaCyTokenizer",
208
- "macrolanguage": false
209
- }
210
- },
211
  "children": [
212
  {
213
  "name": "Ten\u2019edn",
@@ -228,18 +162,7 @@
228
  "name": "Western",
229
  "iso_1_code": null,
230
  "iso_3_code": null,
231
- "tokenizers": {
232
- "Latn": {
233
- "full_object": "SpaCyTokenizer(\"vi\")",
234
- "original_lang_name": "vietnamese",
235
- "original_lang_code": "vie",
236
- "scripts": [
237
- "Latn"
238
- ],
239
- "class_name": "SpaCyTokenizer",
240
- "macrolanguage": false
241
- }
242
- },
243
  "children": [
244
  {
245
  "name": "Kintaq",
@@ -275,18 +198,7 @@
275
  "name": "Senoic",
276
  "iso_1_code": null,
277
  "iso_3_code": null,
278
- "tokenizers": {
279
- "Latn": {
280
- "full_object": "SpaCyTokenizer(\"vi\")",
281
- "original_lang_name": "vietnamese",
282
- "original_lang_code": "vie",
283
- "scripts": [
284
- "Latn"
285
- ],
286
- "class_name": "SpaCyTokenizer",
287
- "macrolanguage": false
288
- }
289
- },
290
  "children": [
291
  {
292
  "name": "Lanoh",
@@ -347,18 +259,7 @@
347
  "name": "South Aslian",
348
  "iso_1_code": null,
349
  "iso_3_code": null,
350
- "tokenizers": {
351
- "Latn": {
352
- "full_object": "SpaCyTokenizer(\"vi\")",
353
- "original_lang_name": "vietnamese",
354
- "original_lang_code": "vie",
355
- "scripts": [
356
- "Latn"
357
- ],
358
- "class_name": "SpaCyTokenizer",
359
- "macrolanguage": false
360
- }
361
- },
362
  "children": [
363
  {
364
  "name": "Mah Meri",
@@ -523,18 +424,7 @@
523
  "name": "East Bahnaric",
524
  "iso_1_code": null,
525
  "iso_3_code": null,
526
- "tokenizers": {
527
- "Latn": {
528
- "full_object": "SpaCyTokenizer(\"vi\")",
529
- "original_lang_name": "vietnamese",
530
- "original_lang_code": "vie",
531
- "scripts": [
532
- "Latn"
533
- ],
534
- "class_name": "SpaCyTokenizer",
535
- "macrolanguage": false
536
- }
537
- },
538
  "children": [
539
  {
540
  "name": "Cua",
@@ -555,18 +445,7 @@
555
  "name": "North Bahnaric",
556
  "iso_1_code": null,
557
  "iso_3_code": null,
558
- "tokenizers": {
559
- "Latn": {
560
- "full_object": "SpaCyTokenizer(\"vi\")",
561
- "original_lang_name": "vietnamese",
562
- "original_lang_code": "vie",
563
- "scripts": [
564
- "Latn"
565
- ],
566
- "class_name": "SpaCyTokenizer",
567
- "macrolanguage": false
568
- }
569
- },
570
  "children": [
571
  {
572
  "name": "Katua",
@@ -592,18 +471,7 @@
592
  "name": "East",
593
  "iso_1_code": null,
594
  "iso_3_code": null,
595
- "tokenizers": {
596
- "Latn": {
597
- "full_object": "SpaCyTokenizer(\"vi\")",
598
- "original_lang_name": "vietnamese",
599
- "original_lang_code": "vie",
600
- "scripts": [
601
- "Latn"
602
- ],
603
- "class_name": "SpaCyTokenizer",
604
- "macrolanguage": false
605
- }
606
- },
607
  "children": [
608
  {
609
  "name": "Kayong",
@@ -634,18 +502,7 @@
634
  "name": "West",
635
  "iso_1_code": null,
636
  "iso_3_code": null,
637
- "tokenizers": {
638
- "Latn": {
639
- "full_object": "SpaCyTokenizer(\"vi\")",
640
- "original_lang_name": "vietnamese",
641
- "original_lang_code": "vie",
642
- "scripts": [
643
- "Latn"
644
- ],
645
- "class_name": "SpaCyTokenizer",
646
- "macrolanguage": false
647
- }
648
- },
649
  "children": [
650
  {
651
  "name": "Trieng",
@@ -671,18 +528,7 @@
671
  "name": "Duan",
672
  "iso_1_code": null,
673
  "iso_3_code": null,
674
- "tokenizers": {
675
- "Latn": {
676
- "full_object": "SpaCyTokenizer(\"vi\")",
677
- "original_lang_name": "vietnamese",
678
- "original_lang_code": "vie",
679
- "scripts": [
680
- "Latn"
681
- ],
682
- "class_name": "SpaCyTokenizer",
683
- "macrolanguage": false
684
- }
685
- },
686
  "children": [
687
  {
688
  "name": "Halang Doan",
@@ -703,18 +549,7 @@
703
  "name": "Jeh-Halang",
704
  "iso_1_code": null,
705
  "iso_3_code": null,
706
- "tokenizers": {
707
- "Latn": {
708
- "full_object": "SpaCyTokenizer(\"vi\")",
709
- "original_lang_name": "vietnamese",
710
- "original_lang_code": "vie",
711
- "scripts": [
712
- "Latn"
713
- ],
714
- "class_name": "SpaCyTokenizer",
715
- "macrolanguage": false
716
- }
717
- },
718
  "children": [
719
  {
720
  "name": "Halang",
@@ -745,18 +580,7 @@
745
  "name": "Rengao",
746
  "iso_1_code": null,
747
  "iso_3_code": null,
748
- "tokenizers": {
749
- "Latn": {
750
- "full_object": "SpaCyTokenizer(\"vi\")",
751
- "original_lang_name": "vietnamese",
752
- "original_lang_code": "vie",
753
- "scripts": [
754
- "Latn"
755
- ],
756
- "class_name": "SpaCyTokenizer",
757
- "macrolanguage": false
758
- }
759
- },
760
  "children": [
761
  {
762
  "name": "Rengao",
@@ -777,35 +601,13 @@
777
  "name": "Sedang-Todrah",
778
  "iso_1_code": null,
779
  "iso_3_code": null,
780
- "tokenizers": {
781
- "Latn": {
782
- "full_object": "SpaCyTokenizer(\"vi\")",
783
- "original_lang_name": "vietnamese",
784
- "original_lang_code": "vie",
785
- "scripts": [
786
- "Latn"
787
- ],
788
- "class_name": "SpaCyTokenizer",
789
- "macrolanguage": false
790
- }
791
- },
792
  "children": [
793
  {
794
  "name": "Sedang",
795
  "iso_1_code": null,
796
  "iso_3_code": null,
797
- "tokenizers": {
798
- "Latn": {
799
- "full_object": "SpaCyTokenizer(\"vi\")",
800
- "original_lang_name": "vietnamese",
801
- "original_lang_code": "vie",
802
- "scripts": [
803
- "Latn"
804
- ],
805
- "class_name": "SpaCyTokenizer",
806
- "macrolanguage": false
807
- }
808
- },
809
  "children": [
810
  {
811
  "name": "Hre",
@@ -836,18 +638,7 @@
836
  "name": "Todrah-Monom",
837
  "iso_1_code": null,
838
  "iso_3_code": null,
839
- "tokenizers": {
840
- "Latn": {
841
- "full_object": "SpaCyTokenizer(\"vi\")",
842
- "original_lang_name": "vietnamese",
843
- "original_lang_code": "vie",
844
- "scripts": [
845
- "Latn"
846
- ],
847
- "class_name": "SpaCyTokenizer",
848
- "macrolanguage": false
849
- }
850
- },
851
  "children": [
852
  {
853
  "name": "Monom",
@@ -954,18 +745,7 @@
954
  "name": "Eastern Mnong",
955
  "iso_1_code": null,
956
  "iso_3_code": null,
957
- "tokenizers": {
958
- "Latn": {
959
- "full_object": "SpaCyTokenizer(\"vi\")",
960
- "original_lang_name": "vietnamese",
961
- "original_lang_code": "vie",
962
- "scripts": [
963
- "Latn"
964
- ],
965
- "class_name": "SpaCyTokenizer",
966
- "macrolanguage": false
967
- }
968
- },
969
  "children": [
970
  {
971
  "name": "Mnong, Eastern",
@@ -1057,18 +837,7 @@
1057
  "name": "Sre",
1058
  "iso_1_code": null,
1059
  "iso_3_code": null,
1060
- "tokenizers": {
1061
- "Latn": {
1062
- "full_object": "SpaCyTokenizer(\"vi\")",
1063
- "original_lang_name": "vietnamese",
1064
- "original_lang_code": "vie",
1065
- "scripts": [
1066
- "Latn"
1067
- ],
1068
- "class_name": "SpaCyTokenizer",
1069
- "macrolanguage": false
1070
- }
1071
- },
1072
  "children": [
1073
  {
1074
  "name": "Maa",
@@ -1104,18 +873,7 @@
1104
  "name": "Stieng-Chrau",
1105
  "iso_1_code": null,
1106
  "iso_3_code": null,
1107
- "tokenizers": {
1108
- "Latn": {
1109
- "full_object": "SpaCyTokenizer(\"vi\")",
1110
- "original_lang_name": "vietnamese",
1111
- "original_lang_code": "vie",
1112
- "scripts": [
1113
- "Latn"
1114
- ],
1115
- "class_name": "SpaCyTokenizer",
1116
- "macrolanguage": false
1117
- }
1118
- },
1119
  "children": [
1120
  {
1121
  "name": "Chrau",
@@ -1161,18 +919,7 @@
1161
  "name": "West Bahnaric",
1162
  "iso_1_code": null,
1163
  "iso_3_code": null,
1164
- "tokenizers": {
1165
- "Latn": {
1166
- "full_object": "SpaCyTokenizer(\"vi\")",
1167
- "original_lang_name": "vietnamese",
1168
- "original_lang_code": "vie",
1169
- "scripts": [
1170
- "Latn"
1171
- ],
1172
- "class_name": "SpaCyTokenizer",
1173
- "macrolanguage": false
1174
- }
1175
- },
1176
  "children": [
1177
  {
1178
  "name": "Lavi",
@@ -1188,18 +935,7 @@
1188
  "name": "Brao-Kravet",
1189
  "iso_1_code": null,
1190
  "iso_3_code": null,
1191
- "tokenizers": {
1192
- "Latn": {
1193
- "full_object": "SpaCyTokenizer(\"vi\")",
1194
- "original_lang_name": "vietnamese",
1195
- "original_lang_code": "vie",
1196
- "scripts": [
1197
- "Latn"
1198
- ],
1199
- "class_name": "SpaCyTokenizer",
1200
- "macrolanguage": false
1201
- }
1202
- },
1203
  "children": [
1204
  {
1205
  "name": "Brao",
@@ -1250,18 +986,7 @@
1250
  "name": "Laven",
1251
  "iso_1_code": null,
1252
  "iso_3_code": null,
1253
- "tokenizers": {
1254
- "Latn": {
1255
- "full_object": "SpaCyTokenizer(\"vi\")",
1256
- "original_lang_name": "vietnamese",
1257
- "original_lang_code": "vie",
1258
- "scripts": [
1259
- "Latn"
1260
- ],
1261
- "class_name": "SpaCyTokenizer",
1262
- "macrolanguage": false
1263
- }
1264
- },
1265
  "children": [
1266
  {
1267
  "name": "Laven",
@@ -1282,18 +1007,7 @@
1282
  "name": "Nyaheun",
1283
  "iso_1_code": null,
1284
  "iso_3_code": null,
1285
- "tokenizers": {
1286
- "Latn": {
1287
- "full_object": "SpaCyTokenizer(\"vi\")",
1288
- "original_lang_name": "vietnamese",
1289
- "original_lang_code": "vie",
1290
- "scripts": [
1291
- "Latn"
1292
- ],
1293
- "class_name": "SpaCyTokenizer",
1294
- "macrolanguage": false
1295
- }
1296
- },
1297
  "children": [
1298
  {
1299
  "name": "Nyaheun",
@@ -1314,18 +1028,7 @@
1314
  "name": "Oi-The",
1315
  "iso_1_code": null,
1316
  "iso_3_code": null,
1317
- "tokenizers": {
1318
- "Latn": {
1319
- "full_object": "SpaCyTokenizer(\"vi\")",
1320
- "original_lang_name": "vietnamese",
1321
- "original_lang_code": "vie",
1322
- "scripts": [
1323
- "Latn"
1324
- ],
1325
- "class_name": "SpaCyTokenizer",
1326
- "macrolanguage": false
1327
- }
1328
- },
1329
  "children": [
1330
  {
1331
  "name": "Oy",
@@ -1383,35 +1086,13 @@
1383
  "name": "Central Katuic",
1384
  "iso_1_code": null,
1385
  "iso_3_code": null,
1386
- "tokenizers": {
1387
- "Latn": {
1388
- "full_object": "SpaCyTokenizer(\"vi\")",
1389
- "original_lang_name": "vietnamese",
1390
- "original_lang_code": "vie",
1391
- "scripts": [
1392
- "Latn"
1393
- ],
1394
- "class_name": "SpaCyTokenizer",
1395
- "macrolanguage": false
1396
- }
1397
- },
1398
  "children": [
1399
  {
1400
  "name": "Ta\u2019oih",
1401
  "iso_1_code": null,
1402
  "iso_3_code": null,
1403
- "tokenizers": {
1404
- "Latn": {
1405
- "full_object": "SpaCyTokenizer(\"vi\")",
1406
- "original_lang_name": "vietnamese",
1407
- "original_lang_code": "vie",
1408
- "scripts": [
1409
- "Latn"
1410
- ],
1411
- "class_name": "SpaCyTokenizer",
1412
- "macrolanguage": false
1413
- }
1414
- },
1415
  "children": [
1416
  {
1417
  "name": "Ir",
@@ -1467,35 +1148,13 @@
1467
  "name": "East Katuic",
1468
  "iso_1_code": null,
1469
  "iso_3_code": null,
1470
- "tokenizers": {
1471
- "Latn": {
1472
- "full_object": "SpaCyTokenizer(\"vi\")",
1473
- "original_lang_name": "vietnamese",
1474
- "original_lang_code": "vie",
1475
- "scripts": [
1476
- "Latn"
1477
- ],
1478
- "class_name": "SpaCyTokenizer",
1479
- "macrolanguage": false
1480
- }
1481
- },
1482
  "children": [
1483
  {
1484
  "name": "Katu-Pacoh",
1485
  "iso_1_code": null,
1486
  "iso_3_code": null,
1487
- "tokenizers": {
1488
- "Latn": {
1489
- "full_object": "SpaCyTokenizer(\"vi\")",
1490
- "original_lang_name": "vietnamese",
1491
- "original_lang_code": "vie",
1492
- "scripts": [
1493
- "Latn"
1494
- ],
1495
- "class_name": "SpaCyTokenizer",
1496
- "macrolanguage": false
1497
- }
1498
- },
1499
  "children": [
1500
  {
1501
  "name": "Katu, Eastern",
@@ -1556,18 +1215,7 @@
1556
  "name": "Ngeq-Nkriang",
1557
  "iso_1_code": null,
1558
  "iso_3_code": null,
1559
- "tokenizers": {
1560
- "Latn": {
1561
- "full_object": "SpaCyTokenizer(\"vi\")",
1562
- "original_lang_name": "vietnamese",
1563
- "original_lang_code": "vie",
1564
- "scripts": [
1565
- "Latn"
1566
- ],
1567
- "class_name": "SpaCyTokenizer",
1568
- "macrolanguage": false
1569
- }
1570
- },
1571
  "children": [
1572
  {
1573
  "name": "Kriang",
@@ -1707,18 +1355,7 @@
1707
  "name": "Kuay",
1708
  "iso_1_code": null,
1709
  "iso_3_code": null,
1710
- "tokenizers": {
1711
- "Latn": {
1712
- "full_object": "SpaCyTokenizer(\"vi\")",
1713
- "original_lang_name": "vietnamese",
1714
- "original_lang_code": "vie",
1715
- "scripts": [
1716
- "Latn"
1717
- ],
1718
- "class_name": "SpaCyTokenizer",
1719
- "macrolanguage": false
1720
- }
1721
- },
1722
  "children": [
1723
  {
1724
  "name": "Kuay",
@@ -1759,18 +1396,7 @@
1759
  "name": "Khmer",
1760
  "iso_1_code": null,
1761
  "iso_3_code": null,
1762
- "tokenizers": {
1763
- "Latn": {
1764
- "full_object": "SpaCyTokenizer(\"vi\")",
1765
- "original_lang_name": "vietnamese",
1766
- "original_lang_code": "vie",
1767
- "scripts": [
1768
- "Latn"
1769
- ],
1770
- "class_name": "SpaCyTokenizer",
1771
- "macrolanguage": false
1772
- }
1773
- },
1774
  "children": [
1775
  {
1776
  "name": "Khmer",
@@ -1805,35 +1431,13 @@
1805
  "name": "Pearic",
1806
  "iso_1_code": null,
1807
  "iso_3_code": null,
1808
- "tokenizers": {
1809
- "Latn": {
1810
- "full_object": "SpaCyTokenizer(\"vi\")",
1811
- "original_lang_name": "vietnamese",
1812
- "original_lang_code": "vie",
1813
- "scripts": [
1814
- "Latn"
1815
- ],
1816
- "class_name": "SpaCyTokenizer",
1817
- "macrolanguage": false
1818
- }
1819
- },
1820
  "children": [
1821
  {
1822
  "name": "Eastern",
1823
  "iso_1_code": null,
1824
  "iso_3_code": null,
1825
- "tokenizers": {
1826
- "Latn": {
1827
- "full_object": "SpaCyTokenizer(\"vi\")",
1828
- "original_lang_name": "vietnamese",
1829
- "original_lang_code": "vie",
1830
- "scripts": [
1831
- "Latn"
1832
- ],
1833
- "class_name": "SpaCyTokenizer",
1834
- "macrolanguage": false
1835
- }
1836
- },
1837
  "children": [
1838
  {
1839
  "name": "Pear",
@@ -1854,35 +1458,13 @@
1854
  "name": "Western",
1855
  "iso_1_code": null,
1856
  "iso_3_code": null,
1857
- "tokenizers": {
1858
- "Latn": {
1859
- "full_object": "SpaCyTokenizer(\"vi\")",
1860
- "original_lang_name": "vietnamese",
1861
- "original_lang_code": "vie",
1862
- "scripts": [
1863
- "Latn"
1864
- ],
1865
- "class_name": "SpaCyTokenizer",
1866
- "macrolanguage": false
1867
- }
1868
- },
1869
  "children": [
1870
  {
1871
  "name": "Chong",
1872
  "iso_1_code": null,
1873
  "iso_3_code": null,
1874
- "tokenizers": {
1875
- "Latn": {
1876
- "full_object": "SpaCyTokenizer(\"vi\")",
1877
- "original_lang_name": "vietnamese",
1878
- "original_lang_code": "vie",
1879
- "scripts": [
1880
- "Latn"
1881
- ],
1882
- "class_name": "SpaCyTokenizer",
1883
- "macrolanguage": false
1884
- }
1885
- },
1886
  "children": [
1887
  {
1888
  "name": "Chong",
@@ -1913,18 +1495,7 @@
1913
  "name": "Samre",
1914
  "iso_1_code": null,
1915
  "iso_3_code": null,
1916
- "tokenizers": {
1917
- "Latn": {
1918
- "full_object": "SpaCyTokenizer(\"vi\")",
1919
- "original_lang_name": "vietnamese",
1920
- "original_lang_code": "vie",
1921
- "scripts": [
1922
- "Latn"
1923
- ],
1924
- "class_name": "SpaCyTokenizer",
1925
- "macrolanguage": false
1926
- }
1927
- },
1928
  "children": [
1929
  {
1930
  "name": "Somray",
@@ -1946,27 +1517,16 @@
1946
  "scripts": [],
1947
  "own_tokenizer": false
1948
  }
1949
- ],
1950
- "node_i": "1292",
1951
- "scripts": [],
1952
- "own_tokenizer": false
1953
- },
1954
- {
1955
- "name": "Suoy",
1956
- "iso_1_code": null,
1957
- "iso_3_code": null,
1958
- "tokenizers": {
1959
- "Latn": {
1960
- "full_object": "SpaCyTokenizer(\"vi\")",
1961
- "original_lang_name": "vietnamese",
1962
- "original_lang_code": "vie",
1963
- "scripts": [
1964
- "Latn"
1965
- ],
1966
- "class_name": "SpaCyTokenizer",
1967
- "macrolanguage": false
1968
- }
1969
- },
1970
  "children": [
1971
  {
1972
  "name": "Su\u2019ung",
@@ -2002,18 +1562,7 @@
2002
  "name": "Monic",
2003
  "iso_1_code": null,
2004
  "iso_3_code": null,
2005
- "tokenizers": {
2006
- "Latn": {
2007
- "full_object": "SpaCyTokenizer(\"vi\")",
2008
- "original_lang_name": "vietnamese",
2009
- "original_lang_code": "vie",
2010
- "scripts": [
2011
- "Latn"
2012
- ],
2013
- "class_name": "SpaCyTokenizer",
2014
- "macrolanguage": false
2015
- }
2016
- },
2017
  "children": [
2018
  {
2019
  "name": "Mon",
@@ -2098,18 +1647,7 @@
2098
  "name": "Chowra-Teressa",
2099
  "iso_1_code": null,
2100
  "iso_3_code": null,
2101
- "tokenizers": {
2102
- "Latn": {
2103
- "full_object": "SpaCyTokenizer(\"vi\")",
2104
- "original_lang_name": "vietnamese",
2105
- "original_lang_code": "vie",
2106
- "scripts": [
2107
- "Latn"
2108
- ],
2109
- "class_name": "SpaCyTokenizer",
2110
- "macrolanguage": false
2111
- }
2112
- },
2113
  "children": [
2114
  {
2115
  "name": "Chaura",
@@ -2140,18 +1678,7 @@
2140
  "name": "Great Nicobar",
2141
  "iso_1_code": null,
2142
  "iso_3_code": null,
2143
- "tokenizers": {
2144
- "Latn": {
2145
- "full_object": "SpaCyTokenizer(\"vi\")",
2146
- "original_lang_name": "vietnamese",
2147
- "original_lang_code": "vie",
2148
- "scripts": [
2149
- "Latn"
2150
- ],
2151
- "class_name": "SpaCyTokenizer",
2152
- "macrolanguage": false
2153
- }
2154
- },
2155
  "children": [
2156
  {
2157
  "name": "Nicobarese, Southern",
@@ -2172,18 +1699,7 @@
2172
  "name": "Nancowry",
2173
  "iso_1_code": null,
2174
  "iso_3_code": null,
2175
- "tokenizers": {
2176
- "Latn": {
2177
- "full_object": "SpaCyTokenizer(\"vi\")",
2178
- "original_lang_name": "vietnamese",
2179
- "original_lang_code": "vie",
2180
- "scripts": [
2181
- "Latn"
2182
- ],
2183
- "class_name": "SpaCyTokenizer",
2184
- "macrolanguage": false
2185
- }
2186
- },
2187
  "children": [
2188
  {
2189
  "name": "Nicobarese, Central",
@@ -2204,18 +1720,7 @@
2204
  "name": "Shom Peng",
2205
  "iso_1_code": null,
2206
  "iso_3_code": null,
2207
- "tokenizers": {
2208
- "Latn": {
2209
- "full_object": "SpaCyTokenizer(\"vi\")",
2210
- "original_lang_name": "vietnamese",
2211
- "original_lang_code": "vie",
2212
- "scripts": [
2213
- "Latn"
2214
- ],
2215
- "class_name": "SpaCyTokenizer",
2216
- "macrolanguage": false
2217
- }
2218
- },
2219
  "children": [
2220
  {
2221
  "name": "Shom Peng",
@@ -2333,35 +1838,13 @@
2333
  "name": "Khmuic",
2334
  "iso_1_code": null,
2335
  "iso_3_code": null,
2336
- "tokenizers": {
2337
- "Latn": {
2338
- "full_object": "SpaCyTokenizer(\"vi\")",
2339
- "original_lang_name": "vietnamese",
2340
- "original_lang_code": "vie",
2341
- "scripts": [
2342
- "Latn"
2343
- ],
2344
- "class_name": "SpaCyTokenizer",
2345
- "macrolanguage": false
2346
- }
2347
- },
2348
  "children": [
2349
  {
2350
  "name": "Khao",
2351
  "iso_1_code": null,
2352
  "iso_3_code": null,
2353
- "tokenizers": {
2354
- "Latn": {
2355
- "full_object": "SpaCyTokenizer(\"vi\")",
2356
- "original_lang_name": "vietnamese",
2357
- "original_lang_code": "vie",
2358
- "scripts": [
2359
- "Latn"
2360
- ],
2361
- "class_name": "SpaCyTokenizer",
2362
- "macrolanguage": false
2363
- }
2364
- },
2365
  "children": [
2366
  {
2367
  "name": "Khao",
@@ -2382,35 +1865,13 @@
2382
  "name": "Mal-Khmu\u2019",
2383
  "iso_1_code": null,
2384
  "iso_3_code": null,
2385
- "tokenizers": {
2386
- "Latn": {
2387
- "full_object": "SpaCyTokenizer(\"vi\")",
2388
- "original_lang_name": "vietnamese",
2389
- "original_lang_code": "vie",
2390
- "scripts": [
2391
- "Latn"
2392
- ],
2393
- "class_name": "SpaCyTokenizer",
2394
- "macrolanguage": false
2395
- }
2396
- },
2397
  "children": [
2398
  {
2399
  "name": "Khmu\u2019",
2400
  "iso_1_code": null,
2401
  "iso_3_code": null,
2402
- "tokenizers": {
2403
- "Latn": {
2404
- "full_object": "SpaCyTokenizer(\"vi\")",
2405
- "original_lang_name": "vietnamese",
2406
- "original_lang_code": "vie",
2407
- "scripts": [
2408
- "Latn"
2409
- ],
2410
- "class_name": "SpaCyTokenizer",
2411
- "macrolanguage": false
2412
- }
2413
- },
2414
  "children": [
2415
  {
2416
  "name": "Khuen",
@@ -2451,18 +1912,7 @@
2451
  "name": "Mal-Prai",
2452
  "iso_1_code": null,
2453
  "iso_3_code": null,
2454
- "tokenizers": {
2455
- "Latn": {
2456
- "full_object": "SpaCyTokenizer(\"vi\")",
2457
- "original_lang_name": "vietnamese",
2458
- "original_lang_code": "vie",
2459
- "scripts": [
2460
- "Latn"
2461
- ],
2462
- "class_name": "SpaCyTokenizer",
2463
- "macrolanguage": false
2464
- }
2465
- },
2466
  "children": [
2467
  {
2468
  "name": "Mal",
@@ -2498,18 +1948,7 @@
2498
  "name": "Mlabri",
2499
  "iso_1_code": null,
2500
  "iso_3_code": null,
2501
- "tokenizers": {
2502
- "Latn": {
2503
- "full_object": "SpaCyTokenizer(\"vi\")",
2504
- "original_lang_name": "vietnamese",
2505
- "original_lang_code": "vie",
2506
- "scripts": [
2507
- "Latn"
2508
- ],
2509
- "class_name": "SpaCyTokenizer",
2510
- "macrolanguage": false
2511
- }
2512
- },
2513
  "children": [
2514
  {
2515
  "name": "Mlabri",
@@ -2530,18 +1969,7 @@
2530
  "name": "Xinh Mul",
2531
  "iso_1_code": null,
2532
  "iso_3_code": null,
2533
- "tokenizers": {
2534
- "Latn": {
2535
- "full_object": "SpaCyTokenizer(\"vi\")",
2536
- "original_lang_name": "vietnamese",
2537
- "original_lang_code": "vie",
2538
- "scripts": [
2539
- "Latn"
2540
- ],
2541
- "class_name": "SpaCyTokenizer",
2542
- "macrolanguage": false
2543
- }
2544
- },
2545
  "children": [
2546
  {
2547
  "name": "Phong-Kniang",
@@ -2577,18 +2005,7 @@
2577
  "name": "Mang",
2578
  "iso_1_code": null,
2579
  "iso_3_code": null,
2580
- "tokenizers": {
2581
- "Latn": {
2582
- "full_object": "SpaCyTokenizer(\"vi\")",
2583
- "original_lang_name": "vietnamese",
2584
- "original_lang_code": "vie",
2585
- "scripts": [
2586
- "Latn"
2587
- ],
2588
- "class_name": "SpaCyTokenizer",
2589
- "macrolanguage": false
2590
- }
2591
- },
2592
  "children": [
2593
  {
2594
  "name": "Mang",
@@ -2643,18 +2060,7 @@
2643
  "name": "Angkuic",
2644
  "iso_1_code": null,
2645
  "iso_3_code": null,
2646
- "tokenizers": {
2647
- "Latn": {
2648
- "full_object": "SpaCyTokenizer(\"vi\")",
2649
- "original_lang_name": "vietnamese",
2650
- "original_lang_code": "vie",
2651
- "scripts": [
2652
- "Latn"
2653
- ],
2654
- "class_name": "SpaCyTokenizer",
2655
- "macrolanguage": false
2656
- }
2657
- },
2658
  "children": [
2659
  {
2660
  "name": "Hu",
@@ -2755,18 +2161,7 @@
2755
  "name": "Bit-Khang",
2756
  "iso_1_code": null,
2757
  "iso_3_code": null,
2758
- "tokenizers": {
2759
- "Latn": {
2760
- "full_object": "SpaCyTokenizer(\"vi\")",
2761
- "original_lang_name": "vietnamese",
2762
- "original_lang_code": "vie",
2763
- "scripts": [
2764
- "Latn"
2765
- ],
2766
- "class_name": "SpaCyTokenizer",
2767
- "macrolanguage": false
2768
- }
2769
- },
2770
  "children": [
2771
  {
2772
  "name": "Bit",
@@ -2807,18 +2202,7 @@
2807
  "name": "Lametic",
2808
  "iso_1_code": null,
2809
  "iso_3_code": null,
2810
- "tokenizers": {
2811
- "Latn": {
2812
- "full_object": "SpaCyTokenizer(\"vi\")",
2813
- "original_lang_name": "vietnamese",
2814
- "original_lang_code": "vie",
2815
- "scripts": [
2816
- "Latn"
2817
- ],
2818
- "class_name": "SpaCyTokenizer",
2819
- "macrolanguage": false
2820
- }
2821
- },
2822
  "children": [
2823
  {
2824
  "name": "Con",
@@ -2866,18 +2250,7 @@
2866
  "name": "Bulang",
2867
  "iso_1_code": null,
2868
  "iso_3_code": null,
2869
- "tokenizers": {
2870
- "Latn": {
2871
- "full_object": "SpaCyTokenizer(\"vi\")",
2872
- "original_lang_name": "vietnamese",
2873
- "original_lang_code": "vie",
2874
- "scripts": [
2875
- "Latn"
2876
- ],
2877
- "class_name": "SpaCyTokenizer",
2878
- "macrolanguage": false
2879
- }
2880
- },
2881
  "children": [
2882
  {
2883
  "name": "Blang",
@@ -2898,18 +2271,7 @@
2898
  "name": "Lawa",
2899
  "iso_1_code": null,
2900
  "iso_3_code": null,
2901
- "tokenizers": {
2902
- "Latn": {
2903
- "full_object": "SpaCyTokenizer(\"vi\")",
2904
- "original_lang_name": "vietnamese",
2905
- "original_lang_code": "vie",
2906
- "scripts": [
2907
- "Latn"
2908
- ],
2909
- "class_name": "SpaCyTokenizer",
2910
- "macrolanguage": false
2911
- }
2912
- },
2913
  "children": [
2914
  {
2915
  "name": "Lawa, Western",
@@ -3017,35 +2379,13 @@
3017
  "name": "Western Palaungic",
3018
  "iso_1_code": null,
3019
  "iso_3_code": null,
3020
- "tokenizers": {
3021
- "Latn": {
3022
- "full_object": "SpaCyTokenizer(\"vi\")",
3023
- "original_lang_name": "vietnamese",
3024
- "original_lang_code": "vie",
3025
- "scripts": [
3026
- "Latn"
3027
- ],
3028
- "class_name": "SpaCyTokenizer",
3029
- "macrolanguage": false
3030
- }
3031
- },
3032
  "children": [
3033
  {
3034
  "name": "Danau",
3035
  "iso_1_code": null,
3036
  "iso_3_code": null,
3037
- "tokenizers": {
3038
- "Latn": {
3039
- "full_object": "SpaCyTokenizer(\"vi\")",
3040
- "original_lang_name": "vietnamese",
3041
- "original_lang_code": "vie",
3042
- "scripts": [
3043
- "Latn"
3044
- ],
3045
- "class_name": "SpaCyTokenizer",
3046
- "macrolanguage": false
3047
- }
3048
- },
3049
  "children": [
3050
  {
3051
  "name": "Danau",
@@ -3066,18 +2406,7 @@
3066
  "name": "Palaung",
3067
  "iso_1_code": null,
3068
  "iso_3_code": null,
3069
- "tokenizers": {
3070
- "Latn": {
3071
- "full_object": "SpaCyTokenizer(\"vi\")",
3072
- "original_lang_name": "vietnamese",
3073
- "original_lang_code": "vie",
3074
- "scripts": [
3075
- "Latn"
3076
- ],
3077
- "class_name": "SpaCyTokenizer",
3078
- "macrolanguage": false
3079
- }
3080
- },
3081
  "children": [
3082
  {
3083
  "name": "Palaung, Ruching",
@@ -3118,18 +2447,7 @@
3118
  "name": "Riang",
3119
  "iso_1_code": null,
3120
  "iso_3_code": null,
3121
- "tokenizers": {
3122
- "Latn": {
3123
- "full_object": "SpaCyTokenizer(\"vi\")",
3124
- "original_lang_name": "vietnamese",
3125
- "original_lang_code": "vie",
3126
- "scripts": [
3127
- "Latn"
3128
- ],
3129
- "class_name": "SpaCyTokenizer",
3130
- "macrolanguage": false
3131
- }
3132
- },
3133
  "children": [
3134
  {
3135
  "name": "Riang Lang",
@@ -3175,18 +2493,7 @@
3175
  "name": "Palyu",
3176
  "iso_1_code": null,
3177
  "iso_3_code": null,
3178
- "tokenizers": {
3179
- "Latn": {
3180
- "full_object": "SpaCyTokenizer(\"vi\")",
3181
- "original_lang_name": "vietnamese",
3182
- "original_lang_code": "vie",
3183
- "scripts": [
3184
- "Latn"
3185
- ],
3186
- "class_name": "SpaCyTokenizer",
3187
- "macrolanguage": false
3188
- }
3189
- },
3190
  "children": [
3191
  {
3192
  "name": "Bugan",
@@ -3217,18 +2524,7 @@
3217
  "name": "Southern Monic",
3218
  "iso_1_code": null,
3219
  "iso_3_code": null,
3220
- "tokenizers": {
3221
- "Latn": {
3222
- "full_object": "SpaCyTokenizer(\"vi\")",
3223
- "original_lang_name": "vietnamese",
3224
- "original_lang_code": "vie",
3225
- "scripts": [
3226
- "Latn"
3227
- ],
3228
- "class_name": "SpaCyTokenizer",
3229
- "macrolanguage": false
3230
- }
3231
- },
3232
  "children": [
3233
  {
3234
  "name": "Nyahkur",
@@ -3249,18 +2545,7 @@
3249
  "name": "Unclassified",
3250
  "iso_1_code": null,
3251
  "iso_3_code": null,
3252
- "tokenizers": {
3253
- "Latn": {
3254
- "full_object": "SpaCyTokenizer(\"vi\")",
3255
- "original_lang_name": "vietnamese",
3256
- "original_lang_code": "vie",
3257
- "scripts": [
3258
- "Latn"
3259
- ],
3260
- "class_name": "SpaCyTokenizer",
3261
- "macrolanguage": false
3262
- }
3263
- },
3264
  "children": [
3265
  {
3266
  "name": "Kemiehua",
@@ -3308,18 +2593,7 @@
3308
  "name": "Chut",
3309
  "iso_1_code": null,
3310
  "iso_3_code": null,
3311
- "tokenizers": {
3312
- "Latn": {
3313
- "full_object": "SpaCyTokenizer(\"vi\")",
3314
- "original_lang_name": "vietnamese",
3315
- "original_lang_code": "vie",
3316
- "scripts": [
3317
- "Latn"
3318
- ],
3319
- "class_name": "SpaCyTokenizer",
3320
- "macrolanguage": false
3321
- }
3322
- },
3323
  "children": [
3324
  {
3325
  "name": "Arem",
@@ -3360,18 +2634,7 @@
3360
  "name": "Cuoi",
3361
  "iso_1_code": null,
3362
  "iso_3_code": null,
3363
- "tokenizers": {
3364
- "Latn": {
3365
- "full_object": "SpaCyTokenizer(\"vi\")",
3366
- "original_lang_name": "vietnamese",
3367
- "original_lang_code": "vie",
3368
- "scripts": [
3369
- "Latn"
3370
- ],
3371
- "class_name": "SpaCyTokenizer",
3372
- "macrolanguage": false
3373
- }
3374
- },
3375
  "children": [
3376
  {
3377
  "name": "Hung",
@@ -3402,18 +2665,7 @@
3402
  "name": "Muong",
3403
  "iso_1_code": null,
3404
  "iso_3_code": null,
3405
- "tokenizers": {
3406
- "Latn": {
3407
- "full_object": "SpaCyTokenizer(\"vi\")",
3408
- "original_lang_name": "vietnamese",
3409
- "original_lang_code": "vie",
3410
- "scripts": [
3411
- "Latn"
3412
- ],
3413
- "class_name": "SpaCyTokenizer",
3414
- "macrolanguage": false
3415
- }
3416
- },
3417
  "children": [
3418
  {
3419
  "name": "Bo",
@@ -3454,18 +2706,7 @@
3454
  "name": "Thavung",
3455
  "iso_1_code": null,
3456
  "iso_3_code": null,
3457
- "tokenizers": {
3458
- "Latn": {
3459
- "full_object": "SpaCyTokenizer(\"vi\")",
3460
- "original_lang_name": "vietnamese",
3461
- "original_lang_code": "vie",
3462
- "scripts": [
3463
- "Latn"
3464
- ],
3465
- "class_name": "SpaCyTokenizer",
3466
- "macrolanguage": false
3467
- }
3468
- },
3469
  "children": [
3470
  {
3471
  "name": "Aheu",
@@ -3809,18 +3050,7 @@
3809
  "name": "Korku",
3810
  "iso_1_code": null,
3811
  "iso_3_code": null,
3812
- "tokenizers": {
3813
- "Latn": {
3814
- "full_object": "SpaCyTokenizer(\"vi\")",
3815
- "original_lang_name": "vietnamese",
3816
- "original_lang_code": "vie",
3817
- "scripts": [
3818
- "Latn"
3819
- ],
3820
- "class_name": "SpaCyTokenizer",
3821
- "macrolanguage": false
3822
- }
3823
- },
3824
  "children": [
3825
  {
3826
  "name": "Korku",
@@ -3846,35 +3076,13 @@
3846
  "name": "South Munda",
3847
  "iso_1_code": null,
3848
  "iso_3_code": null,
3849
- "tokenizers": {
3850
- "Latn": {
3851
- "full_object": "SpaCyTokenizer(\"vi\")",
3852
- "original_lang_name": "vietnamese",
3853
- "original_lang_code": "vie",
3854
- "scripts": [
3855
- "Latn"
3856
- ],
3857
- "class_name": "SpaCyTokenizer",
3858
- "macrolanguage": false
3859
- }
3860
- },
3861
  "children": [
3862
  {
3863
  "name": "Kharia-Juang",
3864
  "iso_1_code": null,
3865
  "iso_3_code": null,
3866
- "tokenizers": {
3867
- "Latn": {
3868
- "full_object": "SpaCyTokenizer(\"vi\")",
3869
- "original_lang_name": "vietnamese",
3870
- "original_lang_code": "vie",
3871
- "scripts": [
3872
- "Latn"
3873
- ],
3874
- "class_name": "SpaCyTokenizer",
3875
- "macrolanguage": false
3876
- }
3877
- },
3878
  "children": [
3879
  {
3880
  "name": "Juang",
@@ -3907,52 +3115,19 @@
3907
  "name": "Koraput Munda",
3908
  "iso_1_code": null,
3909
  "iso_3_code": null,
3910
- "tokenizers": {
3911
- "Latn": {
3912
- "full_object": "SpaCyTokenizer(\"vi\")",
3913
- "original_lang_name": "vietnamese",
3914
- "original_lang_code": "vie",
3915
- "scripts": [
3916
- "Latn"
3917
- ],
3918
- "class_name": "SpaCyTokenizer",
3919
- "macrolanguage": false
3920
- }
3921
- },
3922
  "children": [
3923
  {
3924
  "name": "Gutob-Remo-Geta\u2019",
3925
  "iso_1_code": null,
3926
  "iso_3_code": null,
3927
- "tokenizers": {
3928
- "Latn": {
3929
- "full_object": "SpaCyTokenizer(\"vi\")",
3930
- "original_lang_name": "vietnamese",
3931
- "original_lang_code": "vie",
3932
- "scripts": [
3933
- "Latn"
3934
- ],
3935
- "class_name": "SpaCyTokenizer",
3936
- "macrolanguage": false
3937
- }
3938
- },
3939
  "children": [
3940
  {
3941
  "name": "Geta\u2019",
3942
  "iso_1_code": null,
3943
  "iso_3_code": null,
3944
- "tokenizers": {
3945
- "Latn": {
3946
- "full_object": "SpaCyTokenizer(\"vi\")",
3947
- "original_lang_name": "vietnamese",
3948
- "original_lang_code": "vie",
3949
- "scripts": [
3950
- "Latn"
3951
- ],
3952
- "class_name": "SpaCyTokenizer",
3953
- "macrolanguage": false
3954
- }
3955
- },
3956
  "children": [
3957
  {
3958
  "name": "Gata\u2019",
@@ -3973,18 +3148,7 @@
3973
  "name": "Gutob-Remo",
3974
  "iso_1_code": null,
3975
  "iso_3_code": null,
3976
- "tokenizers": {
3977
- "Latn": {
3978
- "full_object": "SpaCyTokenizer(\"vi\")",
3979
- "original_lang_name": "vietnamese",
3980
- "original_lang_code": "vie",
3981
- "scripts": [
3982
- "Latn"
3983
- ],
3984
- "class_name": "SpaCyTokenizer",
3985
- "macrolanguage": false
3986
- }
3987
- },
3988
  "children": [
3989
  {
3990
  "name": "Bondo",
@@ -4020,35 +3184,13 @@
4020
  "name": "Sora-Juray-Gorum",
4021
  "iso_1_code": null,
4022
  "iso_3_code": null,
4023
- "tokenizers": {
4024
- "Latn": {
4025
- "full_object": "SpaCyTokenizer(\"vi\")",
4026
- "original_lang_name": "vietnamese",
4027
- "original_lang_code": "vie",
4028
- "scripts": [
4029
- "Latn"
4030
- ],
4031
- "class_name": "SpaCyTokenizer",
4032
- "macrolanguage": false
4033
- }
4034
- },
4035
  "children": [
4036
  {
4037
  "name": "Gorum",
4038
  "iso_1_code": null,
4039
  "iso_3_code": null,
4040
- "tokenizers": {
4041
- "Latn": {
4042
- "full_object": "SpaCyTokenizer(\"vi\")",
4043
- "original_lang_name": "vietnamese",
4044
- "original_lang_code": "vie",
4045
- "scripts": [
4046
- "Latn"
4047
- ],
4048
- "class_name": "SpaCyTokenizer",
4049
- "macrolanguage": false
4050
- }
4051
- },
4052
  "children": [
4053
  {
4054
  "name": "Parenga",
@@ -4069,18 +3211,7 @@
4069
  "name": "Sora-Juray",
4070
  "iso_1_code": null,
4071
  "iso_3_code": null,
4072
- "tokenizers": {
4073
- "Latn": {
4074
- "full_object": "SpaCyTokenizer(\"vi\")",
4075
- "original_lang_name": "vietnamese",
4076
- "original_lang_code": "vie",
4077
- "scripts": [
4078
- "Latn"
4079
- ],
4080
- "class_name": "SpaCyTokenizer",
4081
- "macrolanguage": false
4082
- }
4083
- },
4084
  "children": [
4085
  {
4086
  "name": "Juray",
 
36
  "name": "Aslian",
37
  "iso_1_code": null,
38
  "iso_3_code": null,
39
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
40
  "children": [
41
  {
42
  "name": "Jah Hut",
43
  "iso_1_code": null,
44
  "iso_3_code": null,
45
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
46
  "children": [
47
  {
48
  "name": "Jah Hut",
 
63
  "name": "North Aslian",
64
  "iso_1_code": null,
65
  "iso_3_code": null,
66
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
67
  "children": [
68
  {
69
  "name": "Chewong",
70
  "iso_1_code": null,
71
  "iso_3_code": null,
72
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
73
  "children": [
74
  {
75
  "name": "Cheq Wong",
 
90
  "name": "Eastern",
91
  "iso_1_code": null,
92
  "iso_3_code": null,
93
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
94
  "children": [
95
  {
96
  "name": "Batek",
 
141
  "name": "Tonga",
142
  "iso_1_code": null,
143
  "iso_3_code": null,
144
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
145
  "children": [
146
  {
147
  "name": "Ten\u2019edn",
 
162
  "name": "Western",
163
  "iso_1_code": null,
164
  "iso_3_code": null,
165
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
166
  "children": [
167
  {
168
  "name": "Kintaq",
 
198
  "name": "Senoic",
199
  "iso_1_code": null,
200
  "iso_3_code": null,
201
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
202
  "children": [
203
  {
204
  "name": "Lanoh",
 
259
  "name": "South Aslian",
260
  "iso_1_code": null,
261
  "iso_3_code": null,
262
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
263
  "children": [
264
  {
265
  "name": "Mah Meri",
 
424
  "name": "East Bahnaric",
425
  "iso_1_code": null,
426
  "iso_3_code": null,
427
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
428
  "children": [
429
  {
430
  "name": "Cua",
 
445
  "name": "North Bahnaric",
446
  "iso_1_code": null,
447
  "iso_3_code": null,
448
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
449
  "children": [
450
  {
451
  "name": "Katua",
 
471
  "name": "East",
472
  "iso_1_code": null,
473
  "iso_3_code": null,
474
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
475
  "children": [
476
  {
477
  "name": "Kayong",
 
502
  "name": "West",
503
  "iso_1_code": null,
504
  "iso_3_code": null,
505
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
506
  "children": [
507
  {
508
  "name": "Trieng",
 
528
  "name": "Duan",
529
  "iso_1_code": null,
530
  "iso_3_code": null,
531
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
532
  "children": [
533
  {
534
  "name": "Halang Doan",
 
549
  "name": "Jeh-Halang",
550
  "iso_1_code": null,
551
  "iso_3_code": null,
552
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
553
  "children": [
554
  {
555
  "name": "Halang",
 
580
  "name": "Rengao",
581
  "iso_1_code": null,
582
  "iso_3_code": null,
583
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
584
  "children": [
585
  {
586
  "name": "Rengao",
 
601
  "name": "Sedang-Todrah",
602
  "iso_1_code": null,
603
  "iso_3_code": null,
604
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
605
  "children": [
606
  {
607
  "name": "Sedang",
608
  "iso_1_code": null,
609
  "iso_3_code": null,
610
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
611
  "children": [
612
  {
613
  "name": "Hre",
 
638
  "name": "Todrah-Monom",
639
  "iso_1_code": null,
640
  "iso_3_code": null,
641
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
642
  "children": [
643
  {
644
  "name": "Monom",
 
745
  "name": "Eastern Mnong",
746
  "iso_1_code": null,
747
  "iso_3_code": null,
748
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
749
  "children": [
750
  {
751
  "name": "Mnong, Eastern",
 
837
  "name": "Sre",
838
  "iso_1_code": null,
839
  "iso_3_code": null,
840
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
841
  "children": [
842
  {
843
  "name": "Maa",
 
873
  "name": "Stieng-Chrau",
874
  "iso_1_code": null,
875
  "iso_3_code": null,
876
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
877
  "children": [
878
  {
879
  "name": "Chrau",
 
919
  "name": "West Bahnaric",
920
  "iso_1_code": null,
921
  "iso_3_code": null,
922
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
923
  "children": [
924
  {
925
  "name": "Lavi",
 
935
  "name": "Brao-Kravet",
936
  "iso_1_code": null,
937
  "iso_3_code": null,
938
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
939
  "children": [
940
  {
941
  "name": "Brao",
 
986
  "name": "Laven",
987
  "iso_1_code": null,
988
  "iso_3_code": null,
989
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
990
  "children": [
991
  {
992
  "name": "Laven",
 
1007
  "name": "Nyaheun",
1008
  "iso_1_code": null,
1009
  "iso_3_code": null,
1010
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1011
  "children": [
1012
  {
1013
  "name": "Nyaheun",
 
1028
  "name": "Oi-The",
1029
  "iso_1_code": null,
1030
  "iso_3_code": null,
1031
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1032
  "children": [
1033
  {
1034
  "name": "Oy",
 
1086
  "name": "Central Katuic",
1087
  "iso_1_code": null,
1088
  "iso_3_code": null,
1089
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1090
  "children": [
1091
  {
1092
  "name": "Ta\u2019oih",
1093
  "iso_1_code": null,
1094
  "iso_3_code": null,
1095
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1096
  "children": [
1097
  {
1098
  "name": "Ir",
 
1148
  "name": "East Katuic",
1149
  "iso_1_code": null,
1150
  "iso_3_code": null,
1151
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1152
  "children": [
1153
  {
1154
  "name": "Katu-Pacoh",
1155
  "iso_1_code": null,
1156
  "iso_3_code": null,
1157
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1158
  "children": [
1159
  {
1160
  "name": "Katu, Eastern",
 
1215
  "name": "Ngeq-Nkriang",
1216
  "iso_1_code": null,
1217
  "iso_3_code": null,
1218
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1219
  "children": [
1220
  {
1221
  "name": "Kriang",
 
1355
  "name": "Kuay",
1356
  "iso_1_code": null,
1357
  "iso_3_code": null,
1358
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1359
  "children": [
1360
  {
1361
  "name": "Kuay",
 
1396
  "name": "Khmer",
1397
  "iso_1_code": null,
1398
  "iso_3_code": null,
1399
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1400
  "children": [
1401
  {
1402
  "name": "Khmer",
 
1431
  "name": "Pearic",
1432
  "iso_1_code": null,
1433
  "iso_3_code": null,
1434
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1435
  "children": [
1436
  {
1437
  "name": "Eastern",
1438
  "iso_1_code": null,
1439
  "iso_3_code": null,
1440
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1441
  "children": [
1442
  {
1443
  "name": "Pear",
 
1458
  "name": "Western",
1459
  "iso_1_code": null,
1460
  "iso_3_code": null,
1461
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1462
  "children": [
1463
  {
1464
  "name": "Chong",
1465
  "iso_1_code": null,
1466
  "iso_3_code": null,
1467
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1468
  "children": [
1469
  {
1470
  "name": "Chong",
 
1495
  "name": "Samre",
1496
  "iso_1_code": null,
1497
  "iso_3_code": null,
1498
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1499
  "children": [
1500
  {
1501
  "name": "Somray",
 
1517
  "scripts": [],
1518
  "own_tokenizer": false
1519
  }
1520
+ ],
1521
+ "node_i": "1292",
1522
+ "scripts": [],
1523
+ "own_tokenizer": false
1524
+ },
1525
+ {
1526
+ "name": "Suoy",
1527
+ "iso_1_code": null,
1528
+ "iso_3_code": null,
1529
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1530
  "children": [
1531
  {
1532
  "name": "Su\u2019ung",
 
1562
  "name": "Monic",
1563
  "iso_1_code": null,
1564
  "iso_3_code": null,
1565
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1566
  "children": [
1567
  {
1568
  "name": "Mon",
 
1647
  "name": "Chowra-Teressa",
1648
  "iso_1_code": null,
1649
  "iso_3_code": null,
1650
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1651
  "children": [
1652
  {
1653
  "name": "Chaura",
 
1678
  "name": "Great Nicobar",
1679
  "iso_1_code": null,
1680
  "iso_3_code": null,
1681
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1682
  "children": [
1683
  {
1684
  "name": "Nicobarese, Southern",
 
1699
  "name": "Nancowry",
1700
  "iso_1_code": null,
1701
  "iso_3_code": null,
1702
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1703
  "children": [
1704
  {
1705
  "name": "Nicobarese, Central",
 
1720
  "name": "Shom Peng",
1721
  "iso_1_code": null,
1722
  "iso_3_code": null,
1723
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1724
  "children": [
1725
  {
1726
  "name": "Shom Peng",
 
1838
  "name": "Khmuic",
1839
  "iso_1_code": null,
1840
  "iso_3_code": null,
1841
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1842
  "children": [
1843
  {
1844
  "name": "Khao",
1845
  "iso_1_code": null,
1846
  "iso_3_code": null,
1847
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1848
  "children": [
1849
  {
1850
  "name": "Khao",
 
1865
  "name": "Mal-Khmu\u2019",
1866
  "iso_1_code": null,
1867
  "iso_3_code": null,
1868
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1869
  "children": [
1870
  {
1871
  "name": "Khmu\u2019",
1872
  "iso_1_code": null,
1873
  "iso_3_code": null,
1874
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1875
  "children": [
1876
  {
1877
  "name": "Khuen",
 
1912
  "name": "Mal-Prai",
1913
  "iso_1_code": null,
1914
  "iso_3_code": null,
1915
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1916
  "children": [
1917
  {
1918
  "name": "Mal",
 
1948
  "name": "Mlabri",
1949
  "iso_1_code": null,
1950
  "iso_3_code": null,
1951
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1952
  "children": [
1953
  {
1954
  "name": "Mlabri",
 
1969
  "name": "Xinh Mul",
1970
  "iso_1_code": null,
1971
  "iso_3_code": null,
1972
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1973
  "children": [
1974
  {
1975
  "name": "Phong-Kniang",
 
2005
  "name": "Mang",
2006
  "iso_1_code": null,
2007
  "iso_3_code": null,
2008
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2009
  "children": [
2010
  {
2011
  "name": "Mang",
 
2060
  "name": "Angkuic",
2061
  "iso_1_code": null,
2062
  "iso_3_code": null,
2063
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2064
  "children": [
2065
  {
2066
  "name": "Hu",
 
2161
  "name": "Bit-Khang",
2162
  "iso_1_code": null,
2163
  "iso_3_code": null,
2164
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2165
  "children": [
2166
  {
2167
  "name": "Bit",
 
2202
  "name": "Lametic",
2203
  "iso_1_code": null,
2204
  "iso_3_code": null,
2205
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2206
  "children": [
2207
  {
2208
  "name": "Con",
 
2250
  "name": "Bulang",
2251
  "iso_1_code": null,
2252
  "iso_3_code": null,
2253
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2254
  "children": [
2255
  {
2256
  "name": "Blang",
 
2271
  "name": "Lawa",
2272
  "iso_1_code": null,
2273
  "iso_3_code": null,
2274
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2275
  "children": [
2276
  {
2277
  "name": "Lawa, Western",
 
2379
  "name": "Western Palaungic",
2380
  "iso_1_code": null,
2381
  "iso_3_code": null,
2382
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2383
  "children": [
2384
  {
2385
  "name": "Danau",
2386
  "iso_1_code": null,
2387
  "iso_3_code": null,
2388
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2389
  "children": [
2390
  {
2391
  "name": "Danau",
 
2406
  "name": "Palaung",
2407
  "iso_1_code": null,
2408
  "iso_3_code": null,
2409
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2410
  "children": [
2411
  {
2412
  "name": "Palaung, Ruching",
 
2447
  "name": "Riang",
2448
  "iso_1_code": null,
2449
  "iso_3_code": null,
2450
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2451
  "children": [
2452
  {
2453
  "name": "Riang Lang",
 
2493
  "name": "Palyu",
2494
  "iso_1_code": null,
2495
  "iso_3_code": null,
2496
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2497
  "children": [
2498
  {
2499
  "name": "Bugan",
 
2524
  "name": "Southern Monic",
2525
  "iso_1_code": null,
2526
  "iso_3_code": null,
2527
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2528
  "children": [
2529
  {
2530
  "name": "Nyahkur",
 
2545
  "name": "Unclassified",
2546
  "iso_1_code": null,
2547
  "iso_3_code": null,
2548
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2549
  "children": [
2550
  {
2551
  "name": "Kemiehua",
 
2593
  "name": "Chut",
2594
  "iso_1_code": null,
2595
  "iso_3_code": null,
2596
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2597
  "children": [
2598
  {
2599
  "name": "Arem",
 
2634
  "name": "Cuoi",
2635
  "iso_1_code": null,
2636
  "iso_3_code": null,
2637
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2638
  "children": [
2639
  {
2640
  "name": "Hung",
 
2665
  "name": "Muong",
2666
  "iso_1_code": null,
2667
  "iso_3_code": null,
2668
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2669
  "children": [
2670
  {
2671
  "name": "Bo",
 
2706
  "name": "Thavung",
2707
  "iso_1_code": null,
2708
  "iso_3_code": null,
2709
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
2710
  "children": [
2711
  {
2712
  "name": "Aheu",
 
3050
  "name": "Korku",
3051
  "iso_1_code": null,
3052
  "iso_3_code": null,
3053
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3054
  "children": [
3055
  {
3056
  "name": "Korku",
 
3076
  "name": "South Munda",
3077
  "iso_1_code": null,
3078
  "iso_3_code": null,
3079
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3080
  "children": [
3081
  {
3082
  "name": "Kharia-Juang",
3083
  "iso_1_code": null,
3084
  "iso_3_code": null,
3085
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3086
  "children": [
3087
  {
3088
  "name": "Juang",
 
3115
  "name": "Koraput Munda",
3116
  "iso_1_code": null,
3117
  "iso_3_code": null,
3118
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3119
  "children": [
3120
  {
3121
  "name": "Gutob-Remo-Geta\u2019",
3122
  "iso_1_code": null,
3123
  "iso_3_code": null,
3124
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3125
  "children": [
3126
  {
3127
  "name": "Geta\u2019",
3128
  "iso_1_code": null,
3129
  "iso_3_code": null,
3130
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3131
  "children": [
3132
  {
3133
  "name": "Gata\u2019",
 
3148
  "name": "Gutob-Remo",
3149
  "iso_1_code": null,
3150
  "iso_3_code": null,
3151
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3152
  "children": [
3153
  {
3154
  "name": "Bondo",
 
3184
  "name": "Sora-Juray-Gorum",
3185
  "iso_1_code": null,
3186
  "iso_3_code": null,
3187
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3188
  "children": [
3189
  {
3190
  "name": "Gorum",
3191
  "iso_1_code": null,
3192
  "iso_3_code": null,
3193
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3194
  "children": [
3195
  {
3196
  "name": "Parenga",
 
3211
  "name": "Sora-Juray",
3212
  "iso_1_code": null,
3213
  "iso_3_code": null,
3214
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
3215
  "children": [
3216
  {
3217
  "name": "Juray",
data/Austronesian.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/Creole.json CHANGED
@@ -8,9 +8,9 @@
8
  "original_lang_name": "malay",
9
  "original_lang_code": "msa",
10
  "scripts": [
 
11
  "Arab",
12
- "Thai",
13
- "Latn"
14
  ],
15
  "class_name": "SpaCyTokenizer",
16
  "macrolanguage": true
@@ -20,9 +20,9 @@
20
  "original_lang_name": "malay",
21
  "original_lang_code": "msa",
22
  "scripts": [
 
23
  "Arab",
24
- "Thai",
25
- "Latn"
26
  ],
27
  "class_name": "SpaCyTokenizer",
28
  "macrolanguage": true
@@ -32,9 +32,9 @@
32
  "original_lang_name": "malay",
33
  "original_lang_code": "msa",
34
  "scripts": [
 
35
  "Arab",
36
- "Thai",
37
- "Latn"
38
  ],
39
  "class_name": "SpaCyTokenizer",
40
  "macrolanguage": true
@@ -45,44 +45,7 @@
45
  "name": "Afrikaans based",
46
  "iso_1_code": null,
47
  "iso_3_code": null,
48
- "tokenizers": {
49
- "Arab": {
50
- "full_object": "SpaCyTokenizer(\"ms\")",
51
- "original_lang_name": "malay",
52
- "original_lang_code": "msa",
53
- "scripts": [
54
- "Arab",
55
- "Thai",
56
- "Latn"
57
- ],
58
- "class_name": "SpaCyTokenizer",
59
- "macrolanguage": true
60
- },
61
- "Latn": {
62
- "full_object": "SpaCyTokenizer(\"ms\")",
63
- "original_lang_name": "malay",
64
- "original_lang_code": "msa",
65
- "scripts": [
66
- "Arab",
67
- "Thai",
68
- "Latn"
69
- ],
70
- "class_name": "SpaCyTokenizer",
71
- "macrolanguage": true
72
- },
73
- "Thai": {
74
- "full_object": "SpaCyTokenizer(\"ms\")",
75
- "original_lang_name": "malay",
76
- "original_lang_code": "msa",
77
- "scripts": [
78
- "Arab",
79
- "Thai",
80
- "Latn"
81
- ],
82
- "class_name": "SpaCyTokenizer",
83
- "macrolanguage": true
84
- }
85
- },
86
  "children": [
87
  {
88
  "name": "Flaaitaal",
@@ -123,30 +86,6 @@
123
  ],
124
  "class_name": "SpaCyTokenizer",
125
  "macrolanguage": true
126
- },
127
- "Latn": {
128
- "full_object": "SpaCyTokenizer(\"ms\")",
129
- "original_lang_name": "malay",
130
- "original_lang_code": "msa",
131
- "scripts": [
132
- "Arab",
133
- "Thai",
134
- "Latn"
135
- ],
136
- "class_name": "SpaCyTokenizer",
137
- "macrolanguage": true
138
- },
139
- "Thai": {
140
- "full_object": "SpaCyTokenizer(\"ms\")",
141
- "original_lang_name": "malay",
142
- "original_lang_code": "msa",
143
- "scripts": [
144
- "Arab",
145
- "Thai",
146
- "Latn"
147
- ],
148
- "class_name": "SpaCyTokenizer",
149
- "macrolanguage": true
150
  }
151
  },
152
  "children": [
@@ -190,44 +129,7 @@
190
  "name": "Assamese based",
191
  "iso_1_code": null,
192
  "iso_3_code": null,
193
- "tokenizers": {
194
- "Arab": {
195
- "full_object": "SpaCyTokenizer(\"ms\")",
196
- "original_lang_name": "malay",
197
- "original_lang_code": "msa",
198
- "scripts": [
199
- "Arab",
200
- "Thai",
201
- "Latn"
202
- ],
203
- "class_name": "SpaCyTokenizer",
204
- "macrolanguage": true
205
- },
206
- "Latn": {
207
- "full_object": "SpaCyTokenizer(\"ms\")",
208
- "original_lang_name": "malay",
209
- "original_lang_code": "msa",
210
- "scripts": [
211
- "Arab",
212
- "Thai",
213
- "Latn"
214
- ],
215
- "class_name": "SpaCyTokenizer",
216
- "macrolanguage": true
217
- },
218
- "Thai": {
219
- "full_object": "SpaCyTokenizer(\"ms\")",
220
- "original_lang_name": "malay",
221
- "original_lang_code": "msa",
222
- "scripts": [
223
- "Arab",
224
- "Thai",
225
- "Latn"
226
- ],
227
- "class_name": "SpaCyTokenizer",
228
- "macrolanguage": true
229
- }
230
- },
231
  "children": [
232
  {
233
  "name": "Nagamese",
@@ -248,44 +150,7 @@
248
  "name": "Dutch based",
249
  "iso_1_code": null,
250
  "iso_3_code": null,
251
- "tokenizers": {
252
- "Arab": {
253
- "full_object": "SpaCyTokenizer(\"ms\")",
254
- "original_lang_name": "malay",
255
- "original_lang_code": "msa",
256
- "scripts": [
257
- "Arab",
258
- "Thai",
259
- "Latn"
260
- ],
261
- "class_name": "SpaCyTokenizer",
262
- "macrolanguage": true
263
- },
264
- "Latn": {
265
- "full_object": "SpaCyTokenizer(\"ms\")",
266
- "original_lang_name": "malay",
267
- "original_lang_code": "msa",
268
- "scripts": [
269
- "Arab",
270
- "Thai",
271
- "Latn"
272
- ],
273
- "class_name": "SpaCyTokenizer",
274
- "macrolanguage": true
275
- },
276
- "Thai": {
277
- "full_object": "SpaCyTokenizer(\"ms\")",
278
- "original_lang_name": "malay",
279
- "original_lang_code": "msa",
280
- "scripts": [
281
- "Arab",
282
- "Thai",
283
- "Latn"
284
- ],
285
- "class_name": "SpaCyTokenizer",
286
- "macrolanguage": true
287
- }
288
- },
289
  "children": [
290
  {
291
  "name": "Berbice Dutch Creole",
@@ -356,30 +221,6 @@
356
  ],
357
  "class_name": "StanzaTokenizer",
358
  "macrolanguage": false
359
- },
360
- "Arab": {
361
- "full_object": "SpaCyTokenizer(\"ms\")",
362
- "original_lang_name": "malay",
363
- "original_lang_code": "msa",
364
- "scripts": [
365
- "Arab",
366
- "Thai",
367
- "Latn"
368
- ],
369
- "class_name": "SpaCyTokenizer",
370
- "macrolanguage": true
371
- },
372
- "Thai": {
373
- "full_object": "SpaCyTokenizer(\"ms\")",
374
- "original_lang_name": "malay",
375
- "original_lang_code": "msa",
376
- "scripts": [
377
- "Arab",
378
- "Thai",
379
- "Latn"
380
- ],
381
- "class_name": "SpaCyTokenizer",
382
- "macrolanguage": true
383
  }
384
  },
385
  "children": [
@@ -420,30 +261,6 @@
420
  ],
421
  "class_name": "StanzaTokenizer",
422
  "macrolanguage": false
423
- },
424
- "Arab": {
425
- "full_object": "SpaCyTokenizer(\"ms\")",
426
- "original_lang_name": "malay",
427
- "original_lang_code": "msa",
428
- "scripts": [
429
- "Arab",
430
- "Thai",
431
- "Latn"
432
- ],
433
- "class_name": "SpaCyTokenizer",
434
- "macrolanguage": true
435
- },
436
- "Thai": {
437
- "full_object": "SpaCyTokenizer(\"ms\")",
438
- "original_lang_name": "malay",
439
- "original_lang_code": "msa",
440
- "scripts": [
441
- "Arab",
442
- "Thai",
443
- "Latn"
444
- ],
445
- "class_name": "SpaCyTokenizer",
446
- "macrolanguage": true
447
  }
448
  },
449
  "children": [
@@ -461,30 +278,6 @@
461
  ],
462
  "class_name": "StanzaTokenizer",
463
  "macrolanguage": false
464
- },
465
- "Arab": {
466
- "full_object": "SpaCyTokenizer(\"ms\")",
467
- "original_lang_name": "malay",
468
- "original_lang_code": "msa",
469
- "scripts": [
470
- "Arab",
471
- "Thai",
472
- "Latn"
473
- ],
474
- "class_name": "SpaCyTokenizer",
475
- "macrolanguage": true
476
- },
477
- "Thai": {
478
- "full_object": "SpaCyTokenizer(\"ms\")",
479
- "original_lang_name": "malay",
480
- "original_lang_code": "msa",
481
- "scripts": [
482
- "Arab",
483
- "Thai",
484
- "Latn"
485
- ],
486
- "class_name": "SpaCyTokenizer",
487
- "macrolanguage": true
488
  }
489
  },
490
  "children": [
@@ -512,30 +305,6 @@
512
  ],
513
  "class_name": "StanzaTokenizer",
514
  "macrolanguage": false
515
- },
516
- "Arab": {
517
- "full_object": "SpaCyTokenizer(\"ms\")",
518
- "original_lang_name": "malay",
519
- "original_lang_code": "msa",
520
- "scripts": [
521
- "Arab",
522
- "Thai",
523
- "Latn"
524
- ],
525
- "class_name": "SpaCyTokenizer",
526
- "macrolanguage": true
527
- },
528
- "Thai": {
529
- "full_object": "SpaCyTokenizer(\"ms\")",
530
- "original_lang_name": "malay",
531
- "original_lang_code": "msa",
532
- "scripts": [
533
- "Arab",
534
- "Thai",
535
- "Latn"
536
- ],
537
- "class_name": "SpaCyTokenizer",
538
- "macrolanguage": true
539
  }
540
  },
541
  "children": [
@@ -591,42 +360,7 @@
591
  "name": "Southern",
592
  "iso_1_code": null,
593
  "iso_3_code": null,
594
- "tokenizers": {
595
- "Latn": {
596
- "full_object": "StanzaTokenizer(\"pcm\")",
597
- "original_lang_name": "nigerian_pidgin",
598
- "original_lang_code": "pcm",
599
- "scripts": [
600
- "Latn"
601
- ],
602
- "class_name": "StanzaTokenizer",
603
- "macrolanguage": false
604
- },
605
- "Arab": {
606
- "full_object": "SpaCyTokenizer(\"ms\")",
607
- "original_lang_name": "malay",
608
- "original_lang_code": "msa",
609
- "scripts": [
610
- "Arab",
611
- "Thai",
612
- "Latn"
613
- ],
614
- "class_name": "SpaCyTokenizer",
615
- "macrolanguage": true
616
- },
617
- "Thai": {
618
- "full_object": "SpaCyTokenizer(\"ms\")",
619
- "original_lang_name": "malay",
620
- "original_lang_code": "msa",
621
- "scripts": [
622
- "Arab",
623
- "Thai",
624
- "Latn"
625
- ],
626
- "class_name": "SpaCyTokenizer",
627
- "macrolanguage": true
628
- }
629
- },
630
  "children": [
631
  {
632
  "name": "Leeward Caribbean English Creole",
@@ -732,30 +466,6 @@
732
  ],
733
  "class_name": "StanzaTokenizer",
734
  "macrolanguage": false
735
- },
736
- "Arab": {
737
- "full_object": "SpaCyTokenizer(\"ms\")",
738
- "original_lang_name": "malay",
739
- "original_lang_code": "msa",
740
- "scripts": [
741
- "Arab",
742
- "Thai",
743
- "Latn"
744
- ],
745
- "class_name": "SpaCyTokenizer",
746
- "macrolanguage": true
747
- },
748
- "Thai": {
749
- "full_object": "SpaCyTokenizer(\"ms\")",
750
- "original_lang_name": "malay",
751
- "original_lang_code": "msa",
752
- "scripts": [
753
- "Arab",
754
- "Thai",
755
- "Latn"
756
- ],
757
- "class_name": "SpaCyTokenizer",
758
- "macrolanguage": true
759
  }
760
  },
761
  "children": [
@@ -867,30 +577,6 @@
867
  ],
868
  "class_name": "StanzaTokenizer",
869
  "macrolanguage": false
870
- },
871
- "Arab": {
872
- "full_object": "SpaCyTokenizer(\"ms\")",
873
- "original_lang_name": "malay",
874
- "original_lang_code": "msa",
875
- "scripts": [
876
- "Arab",
877
- "Thai",
878
- "Latn"
879
- ],
880
- "class_name": "SpaCyTokenizer",
881
- "macrolanguage": true
882
- },
883
- "Thai": {
884
- "full_object": "SpaCyTokenizer(\"ms\")",
885
- "original_lang_name": "malay",
886
- "original_lang_code": "msa",
887
- "scripts": [
888
- "Arab",
889
- "Thai",
890
- "Latn"
891
- ],
892
- "class_name": "SpaCyTokenizer",
893
- "macrolanguage": true
894
  }
895
  },
896
  "children": [
@@ -931,30 +617,6 @@
931
  ],
932
  "class_name": "StanzaTokenizer",
933
  "macrolanguage": false
934
- },
935
- "Arab": {
936
- "full_object": "SpaCyTokenizer(\"ms\")",
937
- "original_lang_name": "malay",
938
- "original_lang_code": "msa",
939
- "scripts": [
940
- "Arab",
941
- "Thai",
942
- "Latn"
943
- ],
944
- "class_name": "SpaCyTokenizer",
945
- "macrolanguage": true
946
- },
947
- "Thai": {
948
- "full_object": "SpaCyTokenizer(\"ms\")",
949
- "original_lang_name": "malay",
950
- "original_lang_code": "msa",
951
- "scripts": [
952
- "Arab",
953
- "Thai",
954
- "Latn"
955
- ],
956
- "class_name": "SpaCyTokenizer",
957
- "macrolanguage": true
958
  }
959
  },
960
  "children": [
@@ -1015,30 +677,6 @@
1015
  ],
1016
  "class_name": "StanzaTokenizer",
1017
  "macrolanguage": false
1018
- },
1019
- "Arab": {
1020
- "full_object": "SpaCyTokenizer(\"ms\")",
1021
- "original_lang_name": "malay",
1022
- "original_lang_code": "msa",
1023
- "scripts": [
1024
- "Arab",
1025
- "Thai",
1026
- "Latn"
1027
- ],
1028
- "class_name": "SpaCyTokenizer",
1029
- "macrolanguage": true
1030
- },
1031
- "Thai": {
1032
- "full_object": "SpaCyTokenizer(\"ms\")",
1033
- "original_lang_name": "malay",
1034
- "original_lang_code": "msa",
1035
- "scripts": [
1036
- "Arab",
1037
- "Thai",
1038
- "Latn"
1039
- ],
1040
- "class_name": "SpaCyTokenizer",
1041
- "macrolanguage": true
1042
  }
1043
  },
1044
  "children": [
@@ -1145,30 +783,6 @@
1145
  ],
1146
  "class_name": "StanzaTokenizer",
1147
  "macrolanguage": false
1148
- },
1149
- "Arab": {
1150
- "full_object": "SpaCyTokenizer(\"ms\")",
1151
- "original_lang_name": "malay",
1152
- "original_lang_code": "msa",
1153
- "scripts": [
1154
- "Arab",
1155
- "Thai",
1156
- "Latn"
1157
- ],
1158
- "class_name": "SpaCyTokenizer",
1159
- "macrolanguage": true
1160
- },
1161
- "Thai": {
1162
- "full_object": "SpaCyTokenizer(\"ms\")",
1163
- "original_lang_name": "malay",
1164
- "original_lang_code": "msa",
1165
- "scripts": [
1166
- "Arab",
1167
- "Thai",
1168
- "Latn"
1169
- ],
1170
- "class_name": "SpaCyTokenizer",
1171
- "macrolanguage": true
1172
  }
1173
  },
1174
  "children": [
@@ -1350,9 +964,9 @@
1350
  "original_lang_name": "malay",
1351
  "original_lang_code": "msa",
1352
  "scripts": [
 
1353
  "Arab",
1354
- "Thai",
1355
- "Latn"
1356
  ],
1357
  "class_name": "SpaCyTokenizer",
1358
  "macrolanguage": true
@@ -1362,9 +976,9 @@
1362
  "original_lang_name": "malay",
1363
  "original_lang_code": "msa",
1364
  "scripts": [
 
1365
  "Arab",
1366
- "Thai",
1367
- "Latn"
1368
  ],
1369
  "class_name": "SpaCyTokenizer",
1370
  "macrolanguage": true
@@ -1374,9 +988,9 @@
1374
  "original_lang_name": "malay",
1375
  "original_lang_code": "msa",
1376
  "scripts": [
 
1377
  "Arab",
1378
- "Thai",
1379
- "Latn"
1380
  ],
1381
  "class_name": "SpaCyTokenizer",
1382
  "macrolanguage": true
@@ -1393,9 +1007,9 @@
1393
  "original_lang_name": "malay",
1394
  "original_lang_code": "msa",
1395
  "scripts": [
 
1396
  "Arab",
1397
- "Thai",
1398
- "Latn"
1399
  ],
1400
  "class_name": "SpaCyTokenizer",
1401
  "macrolanguage": true
@@ -1428,9 +1042,9 @@
1428
  "original_lang_name": "malay",
1429
  "original_lang_code": "msa",
1430
  "scripts": [
 
1431
  "Arab",
1432
- "Thai",
1433
- "Latn"
1434
  ],
1435
  "class_name": "SpaCyTokenizer",
1436
  "macrolanguage": true
@@ -1453,9 +1067,9 @@
1453
  "original_lang_name": "malay",
1454
  "original_lang_code": "msa",
1455
  "scripts": [
 
1456
  "Arab",
1457
- "Thai",
1458
- "Latn"
1459
  ],
1460
  "class_name": "SpaCyTokenizer",
1461
  "macrolanguage": true
@@ -1478,9 +1092,9 @@
1478
  "original_lang_name": "malay",
1479
  "original_lang_code": "msa",
1480
  "scripts": [
 
1481
  "Arab",
1482
- "Thai",
1483
- "Latn"
1484
  ],
1485
  "class_name": "SpaCyTokenizer",
1486
  "macrolanguage": true
@@ -1503,9 +1117,9 @@
1503
  "original_lang_name": "malay",
1504
  "original_lang_code": "msa",
1505
  "scripts": [
 
1506
  "Arab",
1507
- "Thai",
1508
- "Latn"
1509
  ],
1510
  "class_name": "SpaCyTokenizer",
1511
  "macrolanguage": true
@@ -1548,9 +1162,9 @@
1548
  "original_lang_name": "malay",
1549
  "original_lang_code": "msa",
1550
  "scripts": [
 
1551
  "Arab",
1552
- "Thai",
1553
- "Latn"
1554
  ],
1555
  "class_name": "SpaCyTokenizer",
1556
  "macrolanguage": true
@@ -1573,9 +1187,9 @@
1573
  "original_lang_name": "malay",
1574
  "original_lang_code": "msa",
1575
  "scripts": [
 
1576
  "Arab",
1577
- "Thai",
1578
- "Latn"
1579
  ],
1580
  "class_name": "SpaCyTokenizer",
1581
  "macrolanguage": true
@@ -1607,44 +1221,7 @@
1607
  "name": "German based",
1608
  "iso_1_code": null,
1609
  "iso_3_code": null,
1610
- "tokenizers": {
1611
- "Arab": {
1612
- "full_object": "SpaCyTokenizer(\"ms\")",
1613
- "original_lang_name": "malay",
1614
- "original_lang_code": "msa",
1615
- "scripts": [
1616
- "Arab",
1617
- "Thai",
1618
- "Latn"
1619
- ],
1620
- "class_name": "SpaCyTokenizer",
1621
- "macrolanguage": true
1622
- },
1623
- "Latn": {
1624
- "full_object": "SpaCyTokenizer(\"ms\")",
1625
- "original_lang_name": "malay",
1626
- "original_lang_code": "msa",
1627
- "scripts": [
1628
- "Arab",
1629
- "Thai",
1630
- "Latn"
1631
- ],
1632
- "class_name": "SpaCyTokenizer",
1633
- "macrolanguage": true
1634
- },
1635
- "Thai": {
1636
- "full_object": "SpaCyTokenizer(\"ms\")",
1637
- "original_lang_name": "malay",
1638
- "original_lang_code": "msa",
1639
- "scripts": [
1640
- "Arab",
1641
- "Thai",
1642
- "Latn"
1643
- ],
1644
- "class_name": "SpaCyTokenizer",
1645
- "macrolanguage": true
1646
- }
1647
- },
1648
  "children": [
1649
  {
1650
  "name": "Unserdeutsch",
@@ -1665,44 +1242,7 @@
1665
  "name": "Hindi based",
1666
  "iso_1_code": null,
1667
  "iso_3_code": null,
1668
- "tokenizers": {
1669
- "Arab": {
1670
- "full_object": "SpaCyTokenizer(\"ms\")",
1671
- "original_lang_name": "malay",
1672
- "original_lang_code": "msa",
1673
- "scripts": [
1674
- "Arab",
1675
- "Thai",
1676
- "Latn"
1677
- ],
1678
- "class_name": "SpaCyTokenizer",
1679
- "macrolanguage": true
1680
- },
1681
- "Latn": {
1682
- "full_object": "SpaCyTokenizer(\"ms\")",
1683
- "original_lang_name": "malay",
1684
- "original_lang_code": "msa",
1685
- "scripts": [
1686
- "Arab",
1687
- "Thai",
1688
- "Latn"
1689
- ],
1690
- "class_name": "SpaCyTokenizer",
1691
- "macrolanguage": true
1692
- },
1693
- "Thai": {
1694
- "full_object": "SpaCyTokenizer(\"ms\")",
1695
- "original_lang_name": "malay",
1696
- "original_lang_code": "msa",
1697
- "scripts": [
1698
- "Arab",
1699
- "Thai",
1700
- "Latn"
1701
- ],
1702
- "class_name": "SpaCyTokenizer",
1703
- "macrolanguage": true
1704
- }
1705
- },
1706
  "children": [
1707
  {
1708
  "name": "Andaman Hindi Creole",
@@ -1729,9 +1269,9 @@
1729
  "original_lang_name": "malay",
1730
  "original_lang_code": "msa",
1731
  "scripts": [
 
1732
  "Arab",
1733
- "Thai",
1734
- "Latn"
1735
  ],
1736
  "class_name": "SpaCyTokenizer",
1737
  "macrolanguage": true
@@ -1741,9 +1281,9 @@
1741
  "original_lang_name": "malay",
1742
  "original_lang_code": "msa",
1743
  "scripts": [
 
1744
  "Arab",
1745
- "Thai",
1746
- "Latn"
1747
  ],
1748
  "class_name": "SpaCyTokenizer",
1749
  "macrolanguage": true
@@ -1753,9 +1293,9 @@
1753
  "original_lang_name": "malay",
1754
  "original_lang_code": "msa",
1755
  "scripts": [
 
1756
  "Arab",
1757
- "Thai",
1758
- "Latn"
1759
  ],
1760
  "class_name": "SpaCyTokenizer",
1761
  "macrolanguage": true
@@ -1772,9 +1312,9 @@
1772
  "original_lang_name": "malay",
1773
  "original_lang_code": "msa",
1774
  "scripts": [
 
1775
  "Arab",
1776
- "Thai",
1777
- "Latn"
1778
  ],
1779
  "class_name": "SpaCyTokenizer",
1780
  "macrolanguage": true
@@ -1796,44 +1336,7 @@
1796
  "name": "Japanese-based",
1797
  "iso_1_code": null,
1798
  "iso_3_code": null,
1799
- "tokenizers": {
1800
- "Arab": {
1801
- "full_object": "SpaCyTokenizer(\"ms\")",
1802
- "original_lang_name": "malay",
1803
- "original_lang_code": "msa",
1804
- "scripts": [
1805
- "Arab",
1806
- "Thai",
1807
- "Latn"
1808
- ],
1809
- "class_name": "SpaCyTokenizer",
1810
- "macrolanguage": true
1811
- },
1812
- "Latn": {
1813
- "full_object": "SpaCyTokenizer(\"ms\")",
1814
- "original_lang_name": "malay",
1815
- "original_lang_code": "msa",
1816
- "scripts": [
1817
- "Arab",
1818
- "Thai",
1819
- "Latn"
1820
- ],
1821
- "class_name": "SpaCyTokenizer",
1822
- "macrolanguage": true
1823
- },
1824
- "Thai": {
1825
- "full_object": "SpaCyTokenizer(\"ms\")",
1826
- "original_lang_name": "malay",
1827
- "original_lang_code": "msa",
1828
- "scripts": [
1829
- "Arab",
1830
- "Thai",
1831
- "Latn"
1832
- ],
1833
- "class_name": "SpaCyTokenizer",
1834
- "macrolanguage": true
1835
- }
1836
- },
1837
  "children": [
1838
  {
1839
  "name": "Yilan Creole",
@@ -1860,9 +1363,9 @@
1860
  "original_lang_name": "malay",
1861
  "original_lang_code": "msa",
1862
  "scripts": [
 
1863
  "Arab",
1864
- "Thai",
1865
- "Latn"
1866
  ],
1867
  "class_name": "SpaCyTokenizer",
1868
  "macrolanguage": true
@@ -1872,9 +1375,9 @@
1872
  "original_lang_name": "malay",
1873
  "original_lang_code": "msa",
1874
  "scripts": [
 
1875
  "Arab",
1876
- "Thai",
1877
- "Latn"
1878
  ],
1879
  "class_name": "SpaCyTokenizer",
1880
  "macrolanguage": true
@@ -1884,9 +1387,9 @@
1884
  "original_lang_name": "malay",
1885
  "original_lang_code": "msa",
1886
  "scripts": [
 
1887
  "Arab",
1888
- "Thai",
1889
- "Latn"
1890
  ],
1891
  "class_name": "SpaCyTokenizer",
1892
  "macrolanguage": true
@@ -1903,9 +1406,9 @@
1903
  "original_lang_name": "malay",
1904
  "original_lang_code": "msa",
1905
  "scripts": [
 
1906
  "Arab",
1907
- "Thai",
1908
- "Latn"
1909
  ],
1910
  "class_name": "SpaCyTokenizer",
1911
  "macrolanguage": true
@@ -1938,38 +1441,38 @@
1938
  "iso_1_code": null,
1939
  "iso_3_code": null,
1940
  "tokenizers": {
1941
- "Arab": {
1942
  "full_object": "SpaCyTokenizer(\"ms\")",
1943
  "original_lang_name": "malay",
1944
  "original_lang_code": "msa",
1945
  "scripts": [
 
1946
  "Arab",
1947
- "Thai",
1948
- "Latn"
1949
  ],
1950
  "class_name": "SpaCyTokenizer",
1951
  "macrolanguage": true
1952
  },
1953
- "Thai": {
1954
  "full_object": "SpaCyTokenizer(\"ms\")",
1955
  "original_lang_name": "malay",
1956
  "original_lang_code": "msa",
1957
  "scripts": [
 
1958
  "Arab",
1959
- "Thai",
1960
- "Latn"
1961
  ],
1962
  "class_name": "SpaCyTokenizer",
1963
  "macrolanguage": true
1964
  },
1965
- "Latn": {
1966
  "full_object": "SpaCyTokenizer(\"ms\")",
1967
  "original_lang_name": "malay",
1968
  "original_lang_code": "msa",
1969
  "scripts": [
 
1970
  "Arab",
1971
- "Thai",
1972
- "Latn"
1973
  ],
1974
  "class_name": "SpaCyTokenizer",
1975
  "macrolanguage": true
@@ -1986,9 +1489,9 @@
1986
  "original_lang_name": "malay",
1987
  "original_lang_code": "msa",
1988
  "scripts": [
 
1989
  "Arab",
1990
- "Thai",
1991
- "Latn"
1992
  ],
1993
  "class_name": "SpaCyTokenizer",
1994
  "macrolanguage": true
@@ -2011,9 +1514,9 @@
2011
  "original_lang_name": "malay",
2012
  "original_lang_code": "msa",
2013
  "scripts": [
 
2014
  "Arab",
2015
- "Thai",
2016
- "Latn"
2017
  ],
2018
  "class_name": "SpaCyTokenizer",
2019
  "macrolanguage": true
@@ -2051,38 +1554,38 @@
2051
  "iso_1_code": "ms",
2052
  "iso_3_code": "coa",
2053
  "tokenizers": {
2054
- "Arab": {
2055
  "full_object": "SpaCyTokenizer(\"ms\")",
2056
  "original_lang_name": "malay",
2057
  "original_lang_code": "msa",
2058
  "scripts": [
 
2059
  "Arab",
2060
- "Thai",
2061
- "Latn"
2062
  ],
2063
  "class_name": "SpaCyTokenizer",
2064
  "macrolanguage": true
2065
  },
2066
- "Thai": {
2067
  "full_object": "SpaCyTokenizer(\"ms\")",
2068
  "original_lang_name": "malay",
2069
  "original_lang_code": "msa",
2070
  "scripts": [
 
2071
  "Arab",
2072
- "Thai",
2073
- "Latn"
2074
  ],
2075
  "class_name": "SpaCyTokenizer",
2076
  "macrolanguage": true
2077
  },
2078
- "Latn": {
2079
  "full_object": "SpaCyTokenizer(\"ms\")",
2080
  "original_lang_name": "malay",
2081
  "original_lang_code": "msa",
2082
  "scripts": [
 
2083
  "Arab",
2084
- "Thai",
2085
- "Latn"
2086
  ],
2087
  "class_name": "SpaCyTokenizer",
2088
  "macrolanguage": true
@@ -2108,38 +1611,38 @@
2108
  "iso_1_code": "ms",
2109
  "iso_3_code": "max",
2110
  "tokenizers": {
2111
- "Arab": {
2112
  "full_object": "SpaCyTokenizer(\"ms\")",
2113
  "original_lang_name": "malay",
2114
  "original_lang_code": "msa",
2115
  "scripts": [
 
2116
  "Arab",
2117
- "Thai",
2118
- "Latn"
2119
  ],
2120
  "class_name": "SpaCyTokenizer",
2121
  "macrolanguage": true
2122
  },
2123
- "Thai": {
2124
  "full_object": "SpaCyTokenizer(\"ms\")",
2125
  "original_lang_name": "malay",
2126
  "original_lang_code": "msa",
2127
  "scripts": [
 
2128
  "Arab",
2129
- "Thai",
2130
- "Latn"
2131
  ],
2132
  "class_name": "SpaCyTokenizer",
2133
  "macrolanguage": true
2134
  },
2135
- "Latn": {
2136
  "full_object": "SpaCyTokenizer(\"ms\")",
2137
  "original_lang_name": "malay",
2138
  "original_lang_code": "msa",
2139
  "scripts": [
 
2140
  "Arab",
2141
- "Thai",
2142
- "Latn"
2143
  ],
2144
  "class_name": "SpaCyTokenizer",
2145
  "macrolanguage": true
@@ -2162,9 +1665,9 @@
2162
  "original_lang_name": "malay",
2163
  "original_lang_code": "msa",
2164
  "scripts": [
 
2165
  "Arab",
2166
- "Thai",
2167
- "Latn"
2168
  ],
2169
  "class_name": "SpaCyTokenizer",
2170
  "macrolanguage": true
@@ -2197,9 +1700,9 @@
2197
  "original_lang_name": "malay",
2198
  "original_lang_code": "msa",
2199
  "scripts": [
 
2200
  "Arab",
2201
- "Thai",
2202
- "Latn"
2203
  ],
2204
  "class_name": "SpaCyTokenizer",
2205
  "macrolanguage": true
@@ -2247,38 +1750,38 @@
2247
  "iso_1_code": "ms",
2248
  "iso_3_code": "xmm",
2249
  "tokenizers": {
2250
- "Arab": {
2251
  "full_object": "SpaCyTokenizer(\"ms\")",
2252
  "original_lang_name": "malay",
2253
  "original_lang_code": "msa",
2254
  "scripts": [
 
2255
  "Arab",
2256
- "Thai",
2257
- "Latn"
2258
  ],
2259
  "class_name": "SpaCyTokenizer",
2260
  "macrolanguage": true
2261
  },
2262
- "Thai": {
2263
  "full_object": "SpaCyTokenizer(\"ms\")",
2264
  "original_lang_name": "malay",
2265
  "original_lang_code": "msa",
2266
  "scripts": [
 
2267
  "Arab",
2268
- "Thai",
2269
- "Latn"
2270
  ],
2271
  "class_name": "SpaCyTokenizer",
2272
  "macrolanguage": true
2273
  },
2274
- "Latn": {
2275
  "full_object": "SpaCyTokenizer(\"ms\")",
2276
  "original_lang_name": "malay",
2277
  "original_lang_code": "msa",
2278
  "scripts": [
 
2279
  "Arab",
2280
- "Thai",
2281
- "Latn"
2282
  ],
2283
  "class_name": "SpaCyTokenizer",
2284
  "macrolanguage": true
@@ -2306,9 +1809,9 @@
2306
  "original_lang_name": "malay",
2307
  "original_lang_code": "msa",
2308
  "scripts": [
 
2309
  "Arab",
2310
- "Thai",
2311
- "Latn"
2312
  ],
2313
  "class_name": "SpaCyTokenizer",
2314
  "macrolanguage": true
@@ -2318,9 +1821,9 @@
2318
  "original_lang_name": "malay",
2319
  "original_lang_code": "msa",
2320
  "scripts": [
 
2321
  "Arab",
2322
- "Thai",
2323
- "Latn"
2324
  ],
2325
  "class_name": "SpaCyTokenizer",
2326
  "macrolanguage": true
@@ -2330,9 +1833,9 @@
2330
  "original_lang_name": "malay",
2331
  "original_lang_code": "msa",
2332
  "scripts": [
 
2333
  "Arab",
2334
- "Thai",
2335
- "Latn"
2336
  ],
2337
  "class_name": "SpaCyTokenizer",
2338
  "macrolanguage": true
@@ -2349,9 +1852,9 @@
2349
  "original_lang_name": "malay",
2350
  "original_lang_code": "msa",
2351
  "scripts": [
 
2352
  "Arab",
2353
- "Thai",
2354
- "Latn"
2355
  ],
2356
  "class_name": "SpaCyTokenizer",
2357
  "macrolanguage": true
@@ -2389,9 +1892,9 @@
2389
  "original_lang_name": "malay",
2390
  "original_lang_code": "msa",
2391
  "scripts": [
 
2392
  "Arab",
2393
- "Thai",
2394
- "Latn"
2395
  ],
2396
  "class_name": "SpaCyTokenizer",
2397
  "macrolanguage": true
@@ -2401,9 +1904,9 @@
2401
  "original_lang_name": "malay",
2402
  "original_lang_code": "msa",
2403
  "scripts": [
 
2404
  "Arab",
2405
- "Thai",
2406
- "Latn"
2407
  ],
2408
  "class_name": "SpaCyTokenizer",
2409
  "macrolanguage": true
@@ -2413,9 +1916,9 @@
2413
  "original_lang_name": "malay",
2414
  "original_lang_code": "msa",
2415
  "scripts": [
 
2416
  "Arab",
2417
- "Thai",
2418
- "Latn"
2419
  ],
2420
  "class_name": "SpaCyTokenizer",
2421
  "macrolanguage": true
@@ -2452,9 +1955,9 @@
2452
  "original_lang_name": "malay",
2453
  "original_lang_code": "msa",
2454
  "scripts": [
 
2455
  "Arab",
2456
- "Thai",
2457
- "Latn"
2458
  ],
2459
  "class_name": "SpaCyTokenizer",
2460
  "macrolanguage": true
@@ -2497,9 +2000,9 @@
2497
  "original_lang_name": "malay",
2498
  "original_lang_code": "msa",
2499
  "scripts": [
 
2500
  "Arab",
2501
- "Thai",
2502
- "Latn"
2503
  ],
2504
  "class_name": "SpaCyTokenizer",
2505
  "macrolanguage": true
@@ -2542,9 +2045,9 @@
2542
  "original_lang_name": "malay",
2543
  "original_lang_code": "msa",
2544
  "scripts": [
 
2545
  "Arab",
2546
- "Thai",
2547
- "Latn"
2548
  ],
2549
  "class_name": "SpaCyTokenizer",
2550
  "macrolanguage": true
@@ -2612,9 +2115,9 @@
2612
  "original_lang_name": "malay",
2613
  "original_lang_code": "msa",
2614
  "scripts": [
 
2615
  "Arab",
2616
- "Thai",
2617
- "Latn"
2618
  ],
2619
  "class_name": "SpaCyTokenizer",
2620
  "macrolanguage": true
@@ -2624,9 +2127,9 @@
2624
  "original_lang_name": "malay",
2625
  "original_lang_code": "msa",
2626
  "scripts": [
 
2627
  "Arab",
2628
- "Thai",
2629
- "Latn"
2630
  ],
2631
  "class_name": "SpaCyTokenizer",
2632
  "macrolanguage": true
@@ -2636,9 +2139,9 @@
2636
  "original_lang_name": "malay",
2637
  "original_lang_code": "msa",
2638
  "scripts": [
 
2639
  "Arab",
2640
- "Thai",
2641
- "Latn"
2642
  ],
2643
  "class_name": "SpaCyTokenizer",
2644
  "macrolanguage": true
@@ -2655,9 +2158,9 @@
2655
  "original_lang_name": "malay",
2656
  "original_lang_code": "msa",
2657
  "scripts": [
 
2658
  "Arab",
2659
- "Thai",
2660
- "Latn"
2661
  ],
2662
  "class_name": "SpaCyTokenizer",
2663
  "macrolanguage": true
@@ -2689,44 +2192,7 @@
2689
  "name": "Swahili based",
2690
  "iso_1_code": null,
2691
  "iso_3_code": null,
2692
- "tokenizers": {
2693
- "Arab": {
2694
- "full_object": "SpaCyTokenizer(\"ms\")",
2695
- "original_lang_name": "malay",
2696
- "original_lang_code": "msa",
2697
- "scripts": [
2698
- "Arab",
2699
- "Thai",
2700
- "Latn"
2701
- ],
2702
- "class_name": "SpaCyTokenizer",
2703
- "macrolanguage": true
2704
- },
2705
- "Latn": {
2706
- "full_object": "SpaCyTokenizer(\"ms\")",
2707
- "original_lang_name": "malay",
2708
- "original_lang_code": "msa",
2709
- "scripts": [
2710
- "Arab",
2711
- "Thai",
2712
- "Latn"
2713
- ],
2714
- "class_name": "SpaCyTokenizer",
2715
- "macrolanguage": true
2716
- },
2717
- "Thai": {
2718
- "full_object": "SpaCyTokenizer(\"ms\")",
2719
- "original_lang_name": "malay",
2720
- "original_lang_code": "msa",
2721
- "scripts": [
2722
- "Arab",
2723
- "Thai",
2724
- "Latn"
2725
- ],
2726
- "class_name": "SpaCyTokenizer",
2727
- "macrolanguage": true
2728
- }
2729
- },
2730
  "children": [
2731
  {
2732
  "name": "Cutchi-Swahili",
@@ -2753,9 +2219,9 @@
2753
  "original_lang_name": "malay",
2754
  "original_lang_code": "msa",
2755
  "scripts": [
 
2756
  "Arab",
2757
- "Thai",
2758
- "Latn"
2759
  ],
2760
  "class_name": "SpaCyTokenizer",
2761
  "macrolanguage": true
@@ -2765,9 +2231,9 @@
2765
  "original_lang_name": "malay",
2766
  "original_lang_code": "msa",
2767
  "scripts": [
 
2768
  "Arab",
2769
- "Thai",
2770
- "Latn"
2771
  ],
2772
  "class_name": "SpaCyTokenizer",
2773
  "macrolanguage": true
@@ -2777,9 +2243,9 @@
2777
  "original_lang_name": "malay",
2778
  "original_lang_code": "msa",
2779
  "scripts": [
 
2780
  "Arab",
2781
- "Thai",
2782
- "Latn"
2783
  ],
2784
  "class_name": "SpaCyTokenizer",
2785
  "macrolanguage": true
@@ -2796,9 +2262,9 @@
2796
  "original_lang_name": "malay",
2797
  "original_lang_code": "msa",
2798
  "scripts": [
 
2799
  "Arab",
2800
- "Thai",
2801
- "Latn"
2802
  ],
2803
  "class_name": "SpaCyTokenizer",
2804
  "macrolanguage": true
 
8
  "original_lang_name": "malay",
9
  "original_lang_code": "msa",
10
  "scripts": [
11
+ "Latn",
12
  "Arab",
13
+ "Thai"
 
14
  ],
15
  "class_name": "SpaCyTokenizer",
16
  "macrolanguage": true
 
20
  "original_lang_name": "malay",
21
  "original_lang_code": "msa",
22
  "scripts": [
23
+ "Latn",
24
  "Arab",
25
+ "Thai"
 
26
  ],
27
  "class_name": "SpaCyTokenizer",
28
  "macrolanguage": true
 
32
  "original_lang_name": "malay",
33
  "original_lang_code": "msa",
34
  "scripts": [
35
+ "Latn",
36
  "Arab",
37
+ "Thai"
 
38
  ],
39
  "class_name": "SpaCyTokenizer",
40
  "macrolanguage": true
 
45
  "name": "Afrikaans based",
46
  "iso_1_code": null,
47
  "iso_3_code": null,
48
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "children": [
50
  {
51
  "name": "Flaaitaal",
 
86
  ],
87
  "class_name": "SpaCyTokenizer",
88
  "macrolanguage": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  }
90
  },
91
  "children": [
 
129
  "name": "Assamese based",
130
  "iso_1_code": null,
131
  "iso_3_code": null,
132
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  "children": [
134
  {
135
  "name": "Nagamese",
 
150
  "name": "Dutch based",
151
  "iso_1_code": null,
152
  "iso_3_code": null,
153
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  "children": [
155
  {
156
  "name": "Berbice Dutch Creole",
 
221
  ],
222
  "class_name": "StanzaTokenizer",
223
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  }
225
  },
226
  "children": [
 
261
  ],
262
  "class_name": "StanzaTokenizer",
263
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  }
265
  },
266
  "children": [
 
278
  ],
279
  "class_name": "StanzaTokenizer",
280
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  }
282
  },
283
  "children": [
 
305
  ],
306
  "class_name": "StanzaTokenizer",
307
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  }
309
  },
310
  "children": [
 
360
  "name": "Southern",
361
  "iso_1_code": null,
362
  "iso_3_code": null,
363
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  "children": [
365
  {
366
  "name": "Leeward Caribbean English Creole",
 
466
  ],
467
  "class_name": "StanzaTokenizer",
468
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  }
470
  },
471
  "children": [
 
577
  ],
578
  "class_name": "StanzaTokenizer",
579
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  }
581
  },
582
  "children": [
 
617
  ],
618
  "class_name": "StanzaTokenizer",
619
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  }
621
  },
622
  "children": [
 
677
  ],
678
  "class_name": "StanzaTokenizer",
679
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
  }
681
  },
682
  "children": [
 
783
  ],
784
  "class_name": "StanzaTokenizer",
785
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  }
787
  },
788
  "children": [
 
964
  "original_lang_name": "malay",
965
  "original_lang_code": "msa",
966
  "scripts": [
967
+ "Latn",
968
  "Arab",
969
+ "Thai"
 
970
  ],
971
  "class_name": "SpaCyTokenizer",
972
  "macrolanguage": true
 
976
  "original_lang_name": "malay",
977
  "original_lang_code": "msa",
978
  "scripts": [
979
+ "Latn",
980
  "Arab",
981
+ "Thai"
 
982
  ],
983
  "class_name": "SpaCyTokenizer",
984
  "macrolanguage": true
 
988
  "original_lang_name": "malay",
989
  "original_lang_code": "msa",
990
  "scripts": [
991
+ "Latn",
992
  "Arab",
993
+ "Thai"
 
994
  ],
995
  "class_name": "SpaCyTokenizer",
996
  "macrolanguage": true
 
1007
  "original_lang_name": "malay",
1008
  "original_lang_code": "msa",
1009
  "scripts": [
1010
+ "Latn",
1011
  "Arab",
1012
+ "Thai"
 
1013
  ],
1014
  "class_name": "SpaCyTokenizer",
1015
  "macrolanguage": true
 
1042
  "original_lang_name": "malay",
1043
  "original_lang_code": "msa",
1044
  "scripts": [
1045
+ "Latn",
1046
  "Arab",
1047
+ "Thai"
 
1048
  ],
1049
  "class_name": "SpaCyTokenizer",
1050
  "macrolanguage": true
 
1067
  "original_lang_name": "malay",
1068
  "original_lang_code": "msa",
1069
  "scripts": [
1070
+ "Latn",
1071
  "Arab",
1072
+ "Thai"
 
1073
  ],
1074
  "class_name": "SpaCyTokenizer",
1075
  "macrolanguage": true
 
1092
  "original_lang_name": "malay",
1093
  "original_lang_code": "msa",
1094
  "scripts": [
1095
+ "Latn",
1096
  "Arab",
1097
+ "Thai"
 
1098
  ],
1099
  "class_name": "SpaCyTokenizer",
1100
  "macrolanguage": true
 
1117
  "original_lang_name": "malay",
1118
  "original_lang_code": "msa",
1119
  "scripts": [
1120
+ "Latn",
1121
  "Arab",
1122
+ "Thai"
 
1123
  ],
1124
  "class_name": "SpaCyTokenizer",
1125
  "macrolanguage": true
 
1162
  "original_lang_name": "malay",
1163
  "original_lang_code": "msa",
1164
  "scripts": [
1165
+ "Latn",
1166
  "Arab",
1167
+ "Thai"
 
1168
  ],
1169
  "class_name": "SpaCyTokenizer",
1170
  "macrolanguage": true
 
1187
  "original_lang_name": "malay",
1188
  "original_lang_code": "msa",
1189
  "scripts": [
1190
+ "Latn",
1191
  "Arab",
1192
+ "Thai"
 
1193
  ],
1194
  "class_name": "SpaCyTokenizer",
1195
  "macrolanguage": true
 
1221
  "name": "German based",
1222
  "iso_1_code": null,
1223
  "iso_3_code": null,
1224
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1225
  "children": [
1226
  {
1227
  "name": "Unserdeutsch",
 
1242
  "name": "Hindi based",
1243
  "iso_1_code": null,
1244
  "iso_3_code": null,
1245
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1246
  "children": [
1247
  {
1248
  "name": "Andaman Hindi Creole",
 
1269
  "original_lang_name": "malay",
1270
  "original_lang_code": "msa",
1271
  "scripts": [
1272
+ "Latn",
1273
  "Arab",
1274
+ "Thai"
 
1275
  ],
1276
  "class_name": "SpaCyTokenizer",
1277
  "macrolanguage": true
 
1281
  "original_lang_name": "malay",
1282
  "original_lang_code": "msa",
1283
  "scripts": [
1284
+ "Latn",
1285
  "Arab",
1286
+ "Thai"
 
1287
  ],
1288
  "class_name": "SpaCyTokenizer",
1289
  "macrolanguage": true
 
1293
  "original_lang_name": "malay",
1294
  "original_lang_code": "msa",
1295
  "scripts": [
1296
+ "Latn",
1297
  "Arab",
1298
+ "Thai"
 
1299
  ],
1300
  "class_name": "SpaCyTokenizer",
1301
  "macrolanguage": true
 
1312
  "original_lang_name": "malay",
1313
  "original_lang_code": "msa",
1314
  "scripts": [
1315
+ "Latn",
1316
  "Arab",
1317
+ "Thai"
 
1318
  ],
1319
  "class_name": "SpaCyTokenizer",
1320
  "macrolanguage": true
 
1336
  "name": "Japanese-based",
1337
  "iso_1_code": null,
1338
  "iso_3_code": null,
1339
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1340
  "children": [
1341
  {
1342
  "name": "Yilan Creole",
 
1363
  "original_lang_name": "malay",
1364
  "original_lang_code": "msa",
1365
  "scripts": [
1366
+ "Latn",
1367
  "Arab",
1368
+ "Thai"
 
1369
  ],
1370
  "class_name": "SpaCyTokenizer",
1371
  "macrolanguage": true
 
1375
  "original_lang_name": "malay",
1376
  "original_lang_code": "msa",
1377
  "scripts": [
1378
+ "Latn",
1379
  "Arab",
1380
+ "Thai"
 
1381
  ],
1382
  "class_name": "SpaCyTokenizer",
1383
  "macrolanguage": true
 
1387
  "original_lang_name": "malay",
1388
  "original_lang_code": "msa",
1389
  "scripts": [
1390
+ "Latn",
1391
  "Arab",
1392
+ "Thai"
 
1393
  ],
1394
  "class_name": "SpaCyTokenizer",
1395
  "macrolanguage": true
 
1406
  "original_lang_name": "malay",
1407
  "original_lang_code": "msa",
1408
  "scripts": [
1409
+ "Latn",
1410
  "Arab",
1411
+ "Thai"
 
1412
  ],
1413
  "class_name": "SpaCyTokenizer",
1414
  "macrolanguage": true
 
1441
  "iso_1_code": null,
1442
  "iso_3_code": null,
1443
  "tokenizers": {
1444
+ "Latn": {
1445
  "full_object": "SpaCyTokenizer(\"ms\")",
1446
  "original_lang_name": "malay",
1447
  "original_lang_code": "msa",
1448
  "scripts": [
1449
+ "Latn",
1450
  "Arab",
1451
+ "Thai"
 
1452
  ],
1453
  "class_name": "SpaCyTokenizer",
1454
  "macrolanguage": true
1455
  },
1456
+ "Arab": {
1457
  "full_object": "SpaCyTokenizer(\"ms\")",
1458
  "original_lang_name": "malay",
1459
  "original_lang_code": "msa",
1460
  "scripts": [
1461
+ "Latn",
1462
  "Arab",
1463
+ "Thai"
 
1464
  ],
1465
  "class_name": "SpaCyTokenizer",
1466
  "macrolanguage": true
1467
  },
1468
+ "Thai": {
1469
  "full_object": "SpaCyTokenizer(\"ms\")",
1470
  "original_lang_name": "malay",
1471
  "original_lang_code": "msa",
1472
  "scripts": [
1473
+ "Latn",
1474
  "Arab",
1475
+ "Thai"
 
1476
  ],
1477
  "class_name": "SpaCyTokenizer",
1478
  "macrolanguage": true
 
1489
  "original_lang_name": "malay",
1490
  "original_lang_code": "msa",
1491
  "scripts": [
1492
+ "Latn",
1493
  "Arab",
1494
+ "Thai"
 
1495
  ],
1496
  "class_name": "SpaCyTokenizer",
1497
  "macrolanguage": true
 
1514
  "original_lang_name": "malay",
1515
  "original_lang_code": "msa",
1516
  "scripts": [
1517
+ "Latn",
1518
  "Arab",
1519
+ "Thai"
 
1520
  ],
1521
  "class_name": "SpaCyTokenizer",
1522
  "macrolanguage": true
 
1554
  "iso_1_code": "ms",
1555
  "iso_3_code": "coa",
1556
  "tokenizers": {
1557
+ "Latn": {
1558
  "full_object": "SpaCyTokenizer(\"ms\")",
1559
  "original_lang_name": "malay",
1560
  "original_lang_code": "msa",
1561
  "scripts": [
1562
+ "Latn",
1563
  "Arab",
1564
+ "Thai"
 
1565
  ],
1566
  "class_name": "SpaCyTokenizer",
1567
  "macrolanguage": true
1568
  },
1569
+ "Arab": {
1570
  "full_object": "SpaCyTokenizer(\"ms\")",
1571
  "original_lang_name": "malay",
1572
  "original_lang_code": "msa",
1573
  "scripts": [
1574
+ "Latn",
1575
  "Arab",
1576
+ "Thai"
 
1577
  ],
1578
  "class_name": "SpaCyTokenizer",
1579
  "macrolanguage": true
1580
  },
1581
+ "Thai": {
1582
  "full_object": "SpaCyTokenizer(\"ms\")",
1583
  "original_lang_name": "malay",
1584
  "original_lang_code": "msa",
1585
  "scripts": [
1586
+ "Latn",
1587
  "Arab",
1588
+ "Thai"
 
1589
  ],
1590
  "class_name": "SpaCyTokenizer",
1591
  "macrolanguage": true
 
1611
  "iso_1_code": "ms",
1612
  "iso_3_code": "max",
1613
  "tokenizers": {
1614
+ "Latn": {
1615
  "full_object": "SpaCyTokenizer(\"ms\")",
1616
  "original_lang_name": "malay",
1617
  "original_lang_code": "msa",
1618
  "scripts": [
1619
+ "Latn",
1620
  "Arab",
1621
+ "Thai"
 
1622
  ],
1623
  "class_name": "SpaCyTokenizer",
1624
  "macrolanguage": true
1625
  },
1626
+ "Arab": {
1627
  "full_object": "SpaCyTokenizer(\"ms\")",
1628
  "original_lang_name": "malay",
1629
  "original_lang_code": "msa",
1630
  "scripts": [
1631
+ "Latn",
1632
  "Arab",
1633
+ "Thai"
 
1634
  ],
1635
  "class_name": "SpaCyTokenizer",
1636
  "macrolanguage": true
1637
  },
1638
+ "Thai": {
1639
  "full_object": "SpaCyTokenizer(\"ms\")",
1640
  "original_lang_name": "malay",
1641
  "original_lang_code": "msa",
1642
  "scripts": [
1643
+ "Latn",
1644
  "Arab",
1645
+ "Thai"
 
1646
  ],
1647
  "class_name": "SpaCyTokenizer",
1648
  "macrolanguage": true
 
1665
  "original_lang_name": "malay",
1666
  "original_lang_code": "msa",
1667
  "scripts": [
1668
+ "Latn",
1669
  "Arab",
1670
+ "Thai"
 
1671
  ],
1672
  "class_name": "SpaCyTokenizer",
1673
  "macrolanguage": true
 
1700
  "original_lang_name": "malay",
1701
  "original_lang_code": "msa",
1702
  "scripts": [
1703
+ "Latn",
1704
  "Arab",
1705
+ "Thai"
 
1706
  ],
1707
  "class_name": "SpaCyTokenizer",
1708
  "macrolanguage": true
 
1750
  "iso_1_code": "ms",
1751
  "iso_3_code": "xmm",
1752
  "tokenizers": {
1753
+ "Latn": {
1754
  "full_object": "SpaCyTokenizer(\"ms\")",
1755
  "original_lang_name": "malay",
1756
  "original_lang_code": "msa",
1757
  "scripts": [
1758
+ "Latn",
1759
  "Arab",
1760
+ "Thai"
 
1761
  ],
1762
  "class_name": "SpaCyTokenizer",
1763
  "macrolanguage": true
1764
  },
1765
+ "Arab": {
1766
  "full_object": "SpaCyTokenizer(\"ms\")",
1767
  "original_lang_name": "malay",
1768
  "original_lang_code": "msa",
1769
  "scripts": [
1770
+ "Latn",
1771
  "Arab",
1772
+ "Thai"
 
1773
  ],
1774
  "class_name": "SpaCyTokenizer",
1775
  "macrolanguage": true
1776
  },
1777
+ "Thai": {
1778
  "full_object": "SpaCyTokenizer(\"ms\")",
1779
  "original_lang_name": "malay",
1780
  "original_lang_code": "msa",
1781
  "scripts": [
1782
+ "Latn",
1783
  "Arab",
1784
+ "Thai"
 
1785
  ],
1786
  "class_name": "SpaCyTokenizer",
1787
  "macrolanguage": true
 
1809
  "original_lang_name": "malay",
1810
  "original_lang_code": "msa",
1811
  "scripts": [
1812
+ "Latn",
1813
  "Arab",
1814
+ "Thai"
 
1815
  ],
1816
  "class_name": "SpaCyTokenizer",
1817
  "macrolanguage": true
 
1821
  "original_lang_name": "malay",
1822
  "original_lang_code": "msa",
1823
  "scripts": [
1824
+ "Latn",
1825
  "Arab",
1826
+ "Thai"
 
1827
  ],
1828
  "class_name": "SpaCyTokenizer",
1829
  "macrolanguage": true
 
1833
  "original_lang_name": "malay",
1834
  "original_lang_code": "msa",
1835
  "scripts": [
1836
+ "Latn",
1837
  "Arab",
1838
+ "Thai"
 
1839
  ],
1840
  "class_name": "SpaCyTokenizer",
1841
  "macrolanguage": true
 
1852
  "original_lang_name": "malay",
1853
  "original_lang_code": "msa",
1854
  "scripts": [
1855
+ "Latn",
1856
  "Arab",
1857
+ "Thai"
 
1858
  ],
1859
  "class_name": "SpaCyTokenizer",
1860
  "macrolanguage": true
 
1892
  "original_lang_name": "malay",
1893
  "original_lang_code": "msa",
1894
  "scripts": [
1895
+ "Latn",
1896
  "Arab",
1897
+ "Thai"
 
1898
  ],
1899
  "class_name": "SpaCyTokenizer",
1900
  "macrolanguage": true
 
1904
  "original_lang_name": "malay",
1905
  "original_lang_code": "msa",
1906
  "scripts": [
1907
+ "Latn",
1908
  "Arab",
1909
+ "Thai"
 
1910
  ],
1911
  "class_name": "SpaCyTokenizer",
1912
  "macrolanguage": true
 
1916
  "original_lang_name": "malay",
1917
  "original_lang_code": "msa",
1918
  "scripts": [
1919
+ "Latn",
1920
  "Arab",
1921
+ "Thai"
 
1922
  ],
1923
  "class_name": "SpaCyTokenizer",
1924
  "macrolanguage": true
 
1955
  "original_lang_name": "malay",
1956
  "original_lang_code": "msa",
1957
  "scripts": [
1958
+ "Latn",
1959
  "Arab",
1960
+ "Thai"
 
1961
  ],
1962
  "class_name": "SpaCyTokenizer",
1963
  "macrolanguage": true
 
2000
  "original_lang_name": "malay",
2001
  "original_lang_code": "msa",
2002
  "scripts": [
2003
+ "Latn",
2004
  "Arab",
2005
+ "Thai"
 
2006
  ],
2007
  "class_name": "SpaCyTokenizer",
2008
  "macrolanguage": true
 
2045
  "original_lang_name": "malay",
2046
  "original_lang_code": "msa",
2047
  "scripts": [
2048
+ "Latn",
2049
  "Arab",
2050
+ "Thai"
 
2051
  ],
2052
  "class_name": "SpaCyTokenizer",
2053
  "macrolanguage": true
 
2115
  "original_lang_name": "malay",
2116
  "original_lang_code": "msa",
2117
  "scripts": [
2118
+ "Latn",
2119
  "Arab",
2120
+ "Thai"
 
2121
  ],
2122
  "class_name": "SpaCyTokenizer",
2123
  "macrolanguage": true
 
2127
  "original_lang_name": "malay",
2128
  "original_lang_code": "msa",
2129
  "scripts": [
2130
+ "Latn",
2131
  "Arab",
2132
+ "Thai"
 
2133
  ],
2134
  "class_name": "SpaCyTokenizer",
2135
  "macrolanguage": true
 
2139
  "original_lang_name": "malay",
2140
  "original_lang_code": "msa",
2141
  "scripts": [
2142
+ "Latn",
2143
  "Arab",
2144
+ "Thai"
 
2145
  ],
2146
  "class_name": "SpaCyTokenizer",
2147
  "macrolanguage": true
 
2158
  "original_lang_name": "malay",
2159
  "original_lang_code": "msa",
2160
  "scripts": [
2161
+ "Latn",
2162
  "Arab",
2163
+ "Thai"
 
2164
  ],
2165
  "class_name": "SpaCyTokenizer",
2166
  "macrolanguage": true
 
2192
  "name": "Swahili based",
2193
  "iso_1_code": null,
2194
  "iso_3_code": null,
2195
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2196
  "children": [
2197
  {
2198
  "name": "Cutchi-Swahili",
 
2219
  "original_lang_name": "malay",
2220
  "original_lang_code": "msa",
2221
  "scripts": [
2222
+ "Latn",
2223
  "Arab",
2224
+ "Thai"
 
2225
  ],
2226
  "class_name": "SpaCyTokenizer",
2227
  "macrolanguage": true
 
2231
  "original_lang_name": "malay",
2232
  "original_lang_code": "msa",
2233
  "scripts": [
2234
+ "Latn",
2235
  "Arab",
2236
+ "Thai"
 
2237
  ],
2238
  "class_name": "SpaCyTokenizer",
2239
  "macrolanguage": true
 
2243
  "original_lang_name": "malay",
2244
  "original_lang_code": "msa",
2245
  "scripts": [
2246
+ "Latn",
2247
  "Arab",
2248
+ "Thai"
 
2249
  ],
2250
  "class_name": "SpaCyTokenizer",
2251
  "macrolanguage": true
 
2262
  "original_lang_name": "malay",
2263
  "original_lang_code": "msa",
2264
  "scripts": [
2265
+ "Latn",
2266
  "Arab",
2267
+ "Thai"
 
2268
  ],
2269
  "class_name": "SpaCyTokenizer",
2270
  "macrolanguage": true
data/Dravidian.json CHANGED
@@ -3,24 +3,24 @@
3
  "iso_1_code": null,
4
  "iso_3_code": null,
5
  "tokenizers": {
6
- "Latn": {
7
- "full_object": "SpaCyTokenizer(\"ta\")",
8
- "original_lang_name": "tamil",
9
- "original_lang_code": "tam",
10
  "scripts": [
11
- "Taml",
12
  "Latn"
13
  ],
14
  "class_name": "SpaCyTokenizer",
15
  "macrolanguage": false
16
  },
17
- "Telu": {
18
- "full_object": "SpaCyTokenizer(\"te\")",
19
- "original_lang_name": "telugu",
20
- "original_lang_code": "tel",
21
  "scripts": [
22
  "Latn",
23
- "Telu"
24
  ],
25
  "class_name": "SpaCyTokenizer",
26
  "macrolanguage": false
@@ -52,8 +52,8 @@
52
  "original_lang_name": "tamil",
53
  "original_lang_code": "tam",
54
  "scripts": [
55
- "Taml",
56
- "Latn"
57
  ],
58
  "class_name": "SpaCyTokenizer",
59
  "macrolanguage": false
@@ -64,125 +64,13 @@
64
  "name": "Central",
65
  "iso_1_code": null,
66
  "iso_3_code": null,
67
- "tokenizers": {
68
- "Latn": {
69
- "full_object": "SpaCyTokenizer(\"ta\")",
70
- "original_lang_name": "tamil",
71
- "original_lang_code": "tam",
72
- "scripts": [
73
- "Taml",
74
- "Latn"
75
- ],
76
- "class_name": "SpaCyTokenizer",
77
- "macrolanguage": false
78
- },
79
- "Telu": {
80
- "full_object": "SpaCyTokenizer(\"te\")",
81
- "original_lang_name": "telugu",
82
- "original_lang_code": "tel",
83
- "scripts": [
84
- "Latn",
85
- "Telu"
86
- ],
87
- "class_name": "SpaCyTokenizer",
88
- "macrolanguage": false
89
- },
90
- "Knda": {
91
- "full_object": "SpaCyTokenizer(\"kn\")",
92
- "original_lang_name": "kannada",
93
- "original_lang_code": "kan",
94
- "scripts": [
95
- "Latn",
96
- "Knda"
97
- ],
98
- "class_name": "SpaCyTokenizer",
99
- "macrolanguage": false
100
- },
101
- "Mlym": {
102
- "full_object": "SpaCyTokenizer(\"ml\")",
103
- "original_lang_name": "malayalam",
104
- "original_lang_code": "mal",
105
- "scripts": [
106
- "Latn",
107
- "Mlym"
108
- ],
109
- "class_name": "SpaCyTokenizer",
110
- "macrolanguage": false
111
- },
112
- "Taml": {
113
- "full_object": "SpaCyTokenizer(\"ta\")",
114
- "original_lang_name": "tamil",
115
- "original_lang_code": "tam",
116
- "scripts": [
117
- "Taml",
118
- "Latn"
119
- ],
120
- "class_name": "SpaCyTokenizer",
121
- "macrolanguage": false
122
- }
123
- },
124
  "children": [
125
  {
126
  "name": "Kolami-Naiki",
127
  "iso_1_code": null,
128
  "iso_3_code": null,
129
- "tokenizers": {
130
- "Latn": {
131
- "full_object": "SpaCyTokenizer(\"ta\")",
132
- "original_lang_name": "tamil",
133
- "original_lang_code": "tam",
134
- "scripts": [
135
- "Taml",
136
- "Latn"
137
- ],
138
- "class_name": "SpaCyTokenizer",
139
- "macrolanguage": false
140
- },
141
- "Telu": {
142
- "full_object": "SpaCyTokenizer(\"te\")",
143
- "original_lang_name": "telugu",
144
- "original_lang_code": "tel",
145
- "scripts": [
146
- "Latn",
147
- "Telu"
148
- ],
149
- "class_name": "SpaCyTokenizer",
150
- "macrolanguage": false
151
- },
152
- "Knda": {
153
- "full_object": "SpaCyTokenizer(\"kn\")",
154
- "original_lang_name": "kannada",
155
- "original_lang_code": "kan",
156
- "scripts": [
157
- "Latn",
158
- "Knda"
159
- ],
160
- "class_name": "SpaCyTokenizer",
161
- "macrolanguage": false
162
- },
163
- "Mlym": {
164
- "full_object": "SpaCyTokenizer(\"ml\")",
165
- "original_lang_name": "malayalam",
166
- "original_lang_code": "mal",
167
- "scripts": [
168
- "Latn",
169
- "Mlym"
170
- ],
171
- "class_name": "SpaCyTokenizer",
172
- "macrolanguage": false
173
- },
174
- "Taml": {
175
- "full_object": "SpaCyTokenizer(\"ta\")",
176
- "original_lang_name": "tamil",
177
- "original_lang_code": "tam",
178
- "scripts": [
179
- "Taml",
180
- "Latn"
181
- ],
182
- "class_name": "SpaCyTokenizer",
183
- "macrolanguage": false
184
- }
185
- },
186
  "children": [
187
  {
188
  "name": "Kolami, Northwestern",
@@ -213,63 +101,7 @@
213
  "name": "Parji-Gadaba",
214
  "iso_1_code": null,
215
  "iso_3_code": null,
216
- "tokenizers": {
217
- "Latn": {
218
- "full_object": "SpaCyTokenizer(\"ta\")",
219
- "original_lang_name": "tamil",
220
- "original_lang_code": "tam",
221
- "scripts": [
222
- "Taml",
223
- "Latn"
224
- ],
225
- "class_name": "SpaCyTokenizer",
226
- "macrolanguage": false
227
- },
228
- "Telu": {
229
- "full_object": "SpaCyTokenizer(\"te\")",
230
- "original_lang_name": "telugu",
231
- "original_lang_code": "tel",
232
- "scripts": [
233
- "Latn",
234
- "Telu"
235
- ],
236
- "class_name": "SpaCyTokenizer",
237
- "macrolanguage": false
238
- },
239
- "Knda": {
240
- "full_object": "SpaCyTokenizer(\"kn\")",
241
- "original_lang_name": "kannada",
242
- "original_lang_code": "kan",
243
- "scripts": [
244
- "Latn",
245
- "Knda"
246
- ],
247
- "class_name": "SpaCyTokenizer",
248
- "macrolanguage": false
249
- },
250
- "Mlym": {
251
- "full_object": "SpaCyTokenizer(\"ml\")",
252
- "original_lang_name": "malayalam",
253
- "original_lang_code": "mal",
254
- "scripts": [
255
- "Latn",
256
- "Mlym"
257
- ],
258
- "class_name": "SpaCyTokenizer",
259
- "macrolanguage": false
260
- },
261
- "Taml": {
262
- "full_object": "SpaCyTokenizer(\"ta\")",
263
- "original_lang_name": "tamil",
264
- "original_lang_code": "tam",
265
- "scripts": [
266
- "Taml",
267
- "Latn"
268
- ],
269
- "class_name": "SpaCyTokenizer",
270
- "macrolanguage": false
271
- }
272
- },
273
  "children": [
274
  {
275
  "name": "Gadaba, Mudhili",
@@ -315,63 +147,7 @@
315
  "name": "Northern",
316
  "iso_1_code": null,
317
  "iso_3_code": null,
318
- "tokenizers": {
319
- "Latn": {
320
- "full_object": "SpaCyTokenizer(\"ta\")",
321
- "original_lang_name": "tamil",
322
- "original_lang_code": "tam",
323
- "scripts": [
324
- "Taml",
325
- "Latn"
326
- ],
327
- "class_name": "SpaCyTokenizer",
328
- "macrolanguage": false
329
- },
330
- "Telu": {
331
- "full_object": "SpaCyTokenizer(\"te\")",
332
- "original_lang_name": "telugu",
333
- "original_lang_code": "tel",
334
- "scripts": [
335
- "Latn",
336
- "Telu"
337
- ],
338
- "class_name": "SpaCyTokenizer",
339
- "macrolanguage": false
340
- },
341
- "Knda": {
342
- "full_object": "SpaCyTokenizer(\"kn\")",
343
- "original_lang_name": "kannada",
344
- "original_lang_code": "kan",
345
- "scripts": [
346
- "Latn",
347
- "Knda"
348
- ],
349
- "class_name": "SpaCyTokenizer",
350
- "macrolanguage": false
351
- },
352
- "Mlym": {
353
- "full_object": "SpaCyTokenizer(\"ml\")",
354
- "original_lang_name": "malayalam",
355
- "original_lang_code": "mal",
356
- "scripts": [
357
- "Latn",
358
- "Mlym"
359
- ],
360
- "class_name": "SpaCyTokenizer",
361
- "macrolanguage": false
362
- },
363
- "Taml": {
364
- "full_object": "SpaCyTokenizer(\"ta\")",
365
- "original_lang_name": "tamil",
366
- "original_lang_code": "tam",
367
- "scripts": [
368
- "Taml",
369
- "Latn"
370
- ],
371
- "class_name": "SpaCyTokenizer",
372
- "macrolanguage": false
373
- }
374
- },
375
  "children": [
376
  {
377
  "name": "Brahui",
@@ -437,57 +213,24 @@
437
  "iso_1_code": null,
438
  "iso_3_code": null,
439
  "tokenizers": {
440
- "Latn": {
441
  "full_object": "SpaCyTokenizer(\"te\")",
442
  "original_lang_name": "telugu",
443
  "original_lang_code": "tel",
444
  "scripts": [
445
- "Latn",
446
- "Telu"
447
  ],
448
  "class_name": "SpaCyTokenizer",
449
  "macrolanguage": false
450
  },
451
- "Telu": {
452
  "full_object": "SpaCyTokenizer(\"te\")",
453
  "original_lang_name": "telugu",
454
  "original_lang_code": "tel",
455
  "scripts": [
456
- "Latn",
457
- "Telu"
458
- ],
459
- "class_name": "SpaCyTokenizer",
460
- "macrolanguage": false
461
- },
462
- "Knda": {
463
- "full_object": "SpaCyTokenizer(\"kn\")",
464
- "original_lang_name": "kannada",
465
- "original_lang_code": "kan",
466
- "scripts": [
467
- "Latn",
468
- "Knda"
469
- ],
470
- "class_name": "SpaCyTokenizer",
471
- "macrolanguage": false
472
- },
473
- "Mlym": {
474
- "full_object": "SpaCyTokenizer(\"ml\")",
475
- "original_lang_name": "malayalam",
476
- "original_lang_code": "mal",
477
- "scripts": [
478
- "Latn",
479
- "Mlym"
480
- ],
481
- "class_name": "SpaCyTokenizer",
482
- "macrolanguage": false
483
- },
484
- "Taml": {
485
- "full_object": "SpaCyTokenizer(\"ta\")",
486
- "original_lang_name": "tamil",
487
- "original_lang_code": "tam",
488
- "scripts": [
489
- "Taml",
490
- "Latn"
491
  ],
492
  "class_name": "SpaCyTokenizer",
493
  "macrolanguage": false
@@ -499,56 +242,23 @@
499
  "iso_1_code": null,
500
  "iso_3_code": null,
501
  "tokenizers": {
502
- "Latn": {
503
  "full_object": "SpaCyTokenizer(\"te\")",
504
  "original_lang_name": "telugu",
505
  "original_lang_code": "tel",
506
  "scripts": [
507
- "Latn",
508
- "Telu"
509
  ],
510
  "class_name": "SpaCyTokenizer",
511
  "macrolanguage": false
512
  },
513
- "Telu": {
514
  "full_object": "SpaCyTokenizer(\"te\")",
515
  "original_lang_name": "telugu",
516
  "original_lang_code": "tel",
517
  "scripts": [
518
- "Latn",
519
- "Telu"
520
- ],
521
- "class_name": "SpaCyTokenizer",
522
- "macrolanguage": false
523
- },
524
- "Knda": {
525
- "full_object": "SpaCyTokenizer(\"kn\")",
526
- "original_lang_name": "kannada",
527
- "original_lang_code": "kan",
528
- "scripts": [
529
- "Latn",
530
- "Knda"
531
- ],
532
- "class_name": "SpaCyTokenizer",
533
- "macrolanguage": false
534
- },
535
- "Mlym": {
536
- "full_object": "SpaCyTokenizer(\"ml\")",
537
- "original_lang_name": "malayalam",
538
- "original_lang_code": "mal",
539
- "scripts": [
540
- "Latn",
541
- "Mlym"
542
- ],
543
- "class_name": "SpaCyTokenizer",
544
- "macrolanguage": false
545
- },
546
- "Taml": {
547
- "full_object": "SpaCyTokenizer(\"ta\")",
548
- "original_lang_name": "tamil",
549
- "original_lang_code": "tam",
550
- "scripts": [
551
- "Taml",
552
  "Latn"
553
  ],
554
  "class_name": "SpaCyTokenizer",
@@ -561,56 +271,23 @@
561
  "iso_1_code": null,
562
  "iso_3_code": null,
563
  "tokenizers": {
564
- "Latn": {
565
  "full_object": "SpaCyTokenizer(\"te\")",
566
  "original_lang_name": "telugu",
567
  "original_lang_code": "tel",
568
  "scripts": [
569
- "Latn",
570
- "Telu"
571
  ],
572
  "class_name": "SpaCyTokenizer",
573
  "macrolanguage": false
574
  },
575
- "Telu": {
576
  "full_object": "SpaCyTokenizer(\"te\")",
577
  "original_lang_name": "telugu",
578
  "original_lang_code": "tel",
579
  "scripts": [
580
- "Latn",
581
- "Telu"
582
- ],
583
- "class_name": "SpaCyTokenizer",
584
- "macrolanguage": false
585
- },
586
- "Knda": {
587
- "full_object": "SpaCyTokenizer(\"kn\")",
588
- "original_lang_name": "kannada",
589
- "original_lang_code": "kan",
590
- "scripts": [
591
- "Latn",
592
- "Knda"
593
- ],
594
- "class_name": "SpaCyTokenizer",
595
- "macrolanguage": false
596
- },
597
- "Mlym": {
598
- "full_object": "SpaCyTokenizer(\"ml\")",
599
- "original_lang_name": "malayalam",
600
- "original_lang_code": "mal",
601
- "scripts": [
602
- "Latn",
603
- "Mlym"
604
- ],
605
- "class_name": "SpaCyTokenizer",
606
- "macrolanguage": false
607
- },
608
- "Taml": {
609
- "full_object": "SpaCyTokenizer(\"ta\")",
610
- "original_lang_name": "tamil",
611
- "original_lang_code": "tam",
612
- "scripts": [
613
- "Taml",
614
  "Latn"
615
  ],
616
  "class_name": "SpaCyTokenizer",
@@ -730,8 +407,8 @@
730
  "original_lang_name": "telugu",
731
  "original_lang_code": "tel",
732
  "scripts": [
733
- "Latn",
734
- "Telu"
735
  ],
736
  "class_name": "SpaCyTokenizer",
737
  "macrolanguage": false
@@ -754,56 +431,23 @@
754
  "iso_1_code": null,
755
  "iso_3_code": null,
756
  "tokenizers": {
757
- "Latn": {
758
  "full_object": "SpaCyTokenizer(\"te\")",
759
  "original_lang_name": "telugu",
760
  "original_lang_code": "tel",
761
  "scripts": [
762
- "Latn",
763
- "Telu"
764
  ],
765
  "class_name": "SpaCyTokenizer",
766
  "macrolanguage": false
767
  },
768
- "Telu": {
769
  "full_object": "SpaCyTokenizer(\"te\")",
770
  "original_lang_name": "telugu",
771
  "original_lang_code": "tel",
772
  "scripts": [
773
- "Latn",
774
- "Telu"
775
- ],
776
- "class_name": "SpaCyTokenizer",
777
- "macrolanguage": false
778
- },
779
- "Knda": {
780
- "full_object": "SpaCyTokenizer(\"kn\")",
781
- "original_lang_name": "kannada",
782
- "original_lang_code": "kan",
783
- "scripts": [
784
- "Latn",
785
- "Knda"
786
- ],
787
- "class_name": "SpaCyTokenizer",
788
- "macrolanguage": false
789
- },
790
- "Mlym": {
791
- "full_object": "SpaCyTokenizer(\"ml\")",
792
- "original_lang_name": "malayalam",
793
- "original_lang_code": "mal",
794
- "scripts": [
795
- "Latn",
796
- "Mlym"
797
- ],
798
- "class_name": "SpaCyTokenizer",
799
- "macrolanguage": false
800
- },
801
- "Taml": {
802
- "full_object": "SpaCyTokenizer(\"ta\")",
803
- "original_lang_name": "tamil",
804
- "original_lang_code": "tam",
805
- "scripts": [
806
- "Taml",
807
  "Latn"
808
  ],
809
  "class_name": "SpaCyTokenizer",
@@ -815,63 +459,7 @@
815
  "name": "Konda",
816
  "iso_1_code": null,
817
  "iso_3_code": null,
818
- "tokenizers": {
819
- "Latn": {
820
- "full_object": "SpaCyTokenizer(\"te\")",
821
- "original_lang_name": "telugu",
822
- "original_lang_code": "tel",
823
- "scripts": [
824
- "Latn",
825
- "Telu"
826
- ],
827
- "class_name": "SpaCyTokenizer",
828
- "macrolanguage": false
829
- },
830
- "Telu": {
831
- "full_object": "SpaCyTokenizer(\"te\")",
832
- "original_lang_name": "telugu",
833
- "original_lang_code": "tel",
834
- "scripts": [
835
- "Latn",
836
- "Telu"
837
- ],
838
- "class_name": "SpaCyTokenizer",
839
- "macrolanguage": false
840
- },
841
- "Knda": {
842
- "full_object": "SpaCyTokenizer(\"kn\")",
843
- "original_lang_name": "kannada",
844
- "original_lang_code": "kan",
845
- "scripts": [
846
- "Latn",
847
- "Knda"
848
- ],
849
- "class_name": "SpaCyTokenizer",
850
- "macrolanguage": false
851
- },
852
- "Mlym": {
853
- "full_object": "SpaCyTokenizer(\"ml\")",
854
- "original_lang_name": "malayalam",
855
- "original_lang_code": "mal",
856
- "scripts": [
857
- "Latn",
858
- "Mlym"
859
- ],
860
- "class_name": "SpaCyTokenizer",
861
- "macrolanguage": false
862
- },
863
- "Taml": {
864
- "full_object": "SpaCyTokenizer(\"ta\")",
865
- "original_lang_name": "tamil",
866
- "original_lang_code": "tam",
867
- "scripts": [
868
- "Taml",
869
- "Latn"
870
- ],
871
- "class_name": "SpaCyTokenizer",
872
- "macrolanguage": false
873
- }
874
- },
875
  "children": [
876
  {
877
  "name": "Konda-Dora",
@@ -903,56 +491,23 @@
903
  "iso_1_code": null,
904
  "iso_3_code": null,
905
  "tokenizers": {
906
- "Latn": {
907
  "full_object": "SpaCyTokenizer(\"te\")",
908
  "original_lang_name": "telugu",
909
  "original_lang_code": "tel",
910
  "scripts": [
911
- "Latn",
912
- "Telu"
913
  ],
914
  "class_name": "SpaCyTokenizer",
915
  "macrolanguage": false
916
  },
917
- "Telu": {
918
  "full_object": "SpaCyTokenizer(\"te\")",
919
  "original_lang_name": "telugu",
920
  "original_lang_code": "tel",
921
  "scripts": [
922
- "Latn",
923
- "Telu"
924
- ],
925
- "class_name": "SpaCyTokenizer",
926
- "macrolanguage": false
927
- },
928
- "Knda": {
929
- "full_object": "SpaCyTokenizer(\"kn\")",
930
- "original_lang_name": "kannada",
931
- "original_lang_code": "kan",
932
- "scripts": [
933
- "Latn",
934
- "Knda"
935
- ],
936
- "class_name": "SpaCyTokenizer",
937
- "macrolanguage": false
938
- },
939
- "Mlym": {
940
- "full_object": "SpaCyTokenizer(\"ml\")",
941
- "original_lang_name": "malayalam",
942
- "original_lang_code": "mal",
943
- "scripts": [
944
- "Latn",
945
- "Mlym"
946
- ],
947
- "class_name": "SpaCyTokenizer",
948
- "macrolanguage": false
949
- },
950
- "Taml": {
951
- "full_object": "SpaCyTokenizer(\"ta\")",
952
- "original_lang_name": "tamil",
953
- "original_lang_code": "tam",
954
- "scripts": [
955
- "Taml",
956
  "Latn"
957
  ],
958
  "class_name": "SpaCyTokenizer",
@@ -965,57 +520,24 @@
965
  "iso_1_code": null,
966
  "iso_3_code": null,
967
  "tokenizers": {
968
- "Latn": {
969
  "full_object": "SpaCyTokenizer(\"te\")",
970
  "original_lang_name": "telugu",
971
  "original_lang_code": "tel",
972
  "scripts": [
973
- "Latn",
974
- "Telu"
975
  ],
976
  "class_name": "SpaCyTokenizer",
977
  "macrolanguage": false
978
  },
979
- "Telu": {
980
  "full_object": "SpaCyTokenizer(\"te\")",
981
  "original_lang_name": "telugu",
982
  "original_lang_code": "tel",
983
  "scripts": [
984
- "Latn",
985
- "Telu"
986
- ],
987
- "class_name": "SpaCyTokenizer",
988
- "macrolanguage": false
989
- },
990
- "Knda": {
991
- "full_object": "SpaCyTokenizer(\"kn\")",
992
- "original_lang_name": "kannada",
993
- "original_lang_code": "kan",
994
- "scripts": [
995
- "Latn",
996
- "Knda"
997
- ],
998
- "class_name": "SpaCyTokenizer",
999
- "macrolanguage": false
1000
- },
1001
- "Mlym": {
1002
- "full_object": "SpaCyTokenizer(\"ml\")",
1003
- "original_lang_name": "malayalam",
1004
- "original_lang_code": "mal",
1005
- "scripts": [
1006
- "Latn",
1007
- "Mlym"
1008
- ],
1009
- "class_name": "SpaCyTokenizer",
1010
- "macrolanguage": false
1011
- },
1012
- "Taml": {
1013
- "full_object": "SpaCyTokenizer(\"ta\")",
1014
- "original_lang_name": "tamil",
1015
- "original_lang_code": "tam",
1016
- "scripts": [
1017
- "Taml",
1018
- "Latn"
1019
  ],
1020
  "class_name": "SpaCyTokenizer",
1021
  "macrolanguage": false
@@ -1042,8 +564,8 @@
1042
  "original_lang_name": "telugu",
1043
  "original_lang_code": "tel",
1044
  "scripts": [
1045
- "Latn",
1046
- "Telu"
1047
  ],
1048
  "class_name": "SpaCyTokenizer",
1049
  "macrolanguage": false
@@ -1085,63 +607,7 @@
1085
  "name": "Manda-Pengo",
1086
  "iso_1_code": null,
1087
  "iso_3_code": null,
1088
- "tokenizers": {
1089
- "Latn": {
1090
- "full_object": "SpaCyTokenizer(\"te\")",
1091
- "original_lang_name": "telugu",
1092
- "original_lang_code": "tel",
1093
- "scripts": [
1094
- "Latn",
1095
- "Telu"
1096
- ],
1097
- "class_name": "SpaCyTokenizer",
1098
- "macrolanguage": false
1099
- },
1100
- "Telu": {
1101
- "full_object": "SpaCyTokenizer(\"te\")",
1102
- "original_lang_name": "telugu",
1103
- "original_lang_code": "tel",
1104
- "scripts": [
1105
- "Latn",
1106
- "Telu"
1107
- ],
1108
- "class_name": "SpaCyTokenizer",
1109
- "macrolanguage": false
1110
- },
1111
- "Knda": {
1112
- "full_object": "SpaCyTokenizer(\"kn\")",
1113
- "original_lang_name": "kannada",
1114
- "original_lang_code": "kan",
1115
- "scripts": [
1116
- "Latn",
1117
- "Knda"
1118
- ],
1119
- "class_name": "SpaCyTokenizer",
1120
- "macrolanguage": false
1121
- },
1122
- "Mlym": {
1123
- "full_object": "SpaCyTokenizer(\"ml\")",
1124
- "original_lang_name": "malayalam",
1125
- "original_lang_code": "mal",
1126
- "scripts": [
1127
- "Latn",
1128
- "Mlym"
1129
- ],
1130
- "class_name": "SpaCyTokenizer",
1131
- "macrolanguage": false
1132
- },
1133
- "Taml": {
1134
- "full_object": "SpaCyTokenizer(\"ta\")",
1135
- "original_lang_name": "tamil",
1136
- "original_lang_code": "tam",
1137
- "scripts": [
1138
- "Taml",
1139
- "Latn"
1140
- ],
1141
- "class_name": "SpaCyTokenizer",
1142
- "macrolanguage": false
1143
- }
1144
- },
1145
  "children": [
1146
  {
1147
  "name": "Manda",
@@ -1188,56 +654,23 @@
1188
  "iso_1_code": null,
1189
  "iso_3_code": null,
1190
  "tokenizers": {
1191
- "Latn": {
1192
  "full_object": "SpaCyTokenizer(\"te\")",
1193
  "original_lang_name": "telugu",
1194
  "original_lang_code": "tel",
1195
  "scripts": [
1196
- "Latn",
1197
- "Telu"
1198
  ],
1199
  "class_name": "SpaCyTokenizer",
1200
  "macrolanguage": false
1201
  },
1202
- "Telu": {
1203
  "full_object": "SpaCyTokenizer(\"te\")",
1204
  "original_lang_name": "telugu",
1205
  "original_lang_code": "tel",
1206
  "scripts": [
1207
- "Latn",
1208
- "Telu"
1209
- ],
1210
- "class_name": "SpaCyTokenizer",
1211
- "macrolanguage": false
1212
- },
1213
- "Knda": {
1214
- "full_object": "SpaCyTokenizer(\"kn\")",
1215
- "original_lang_name": "kannada",
1216
- "original_lang_code": "kan",
1217
- "scripts": [
1218
- "Latn",
1219
- "Knda"
1220
- ],
1221
- "class_name": "SpaCyTokenizer",
1222
- "macrolanguage": false
1223
- },
1224
- "Mlym": {
1225
- "full_object": "SpaCyTokenizer(\"ml\")",
1226
- "original_lang_name": "malayalam",
1227
- "original_lang_code": "mal",
1228
- "scripts": [
1229
- "Latn",
1230
- "Mlym"
1231
- ],
1232
- "class_name": "SpaCyTokenizer",
1233
- "macrolanguage": false
1234
- },
1235
- "Taml": {
1236
- "full_object": "SpaCyTokenizer(\"ta\")",
1237
- "original_lang_name": "tamil",
1238
- "original_lang_code": "tam",
1239
- "scripts": [
1240
- "Taml",
1241
  "Latn"
1242
  ],
1243
  "class_name": "SpaCyTokenizer",
@@ -1270,24 +703,24 @@
1270
  "iso_1_code": "te",
1271
  "iso_3_code": "tel",
1272
  "tokenizers": {
1273
- "Latn": {
1274
  "full_object": "SpaCyTokenizer(\"te\")",
1275
  "original_lang_name": "telugu",
1276
  "original_lang_code": "tel",
1277
  "scripts": [
1278
- "Latn",
1279
- "Telu"
1280
  ],
1281
  "class_name": "SpaCyTokenizer",
1282
  "macrolanguage": false
1283
  },
1284
- "Telu": {
1285
  "full_object": "SpaCyTokenizer(\"te\")",
1286
  "original_lang_name": "telugu",
1287
  "original_lang_code": "tel",
1288
  "scripts": [
1289
- "Latn",
1290
- "Telu"
1291
  ],
1292
  "class_name": "SpaCyTokenizer",
1293
  "macrolanguage": false
@@ -1331,8 +764,8 @@
1331
  "original_lang_name": "tamil",
1332
  "original_lang_code": "tam",
1333
  "scripts": [
1334
- "Taml",
1335
- "Latn"
1336
  ],
1337
  "class_name": "SpaCyTokenizer",
1338
  "macrolanguage": false
@@ -1363,20 +796,9 @@
1363
  "full_object": "SpaCyTokenizer(\"ta\")",
1364
  "original_lang_name": "tamil",
1365
  "original_lang_code": "tam",
1366
- "scripts": [
1367
- "Taml",
1368
- "Latn"
1369
- ],
1370
- "class_name": "SpaCyTokenizer",
1371
- "macrolanguage": false
1372
- },
1373
- "Telu": {
1374
- "full_object": "SpaCyTokenizer(\"te\")",
1375
- "original_lang_name": "telugu",
1376
- "original_lang_code": "tel",
1377
  "scripts": [
1378
  "Latn",
1379
- "Telu"
1380
  ],
1381
  "class_name": "SpaCyTokenizer",
1382
  "macrolanguage": false
@@ -1463,8 +885,8 @@
1463
  "original_lang_name": "tamil",
1464
  "original_lang_code": "tam",
1465
  "scripts": [
1466
- "Taml",
1467
- "Latn"
1468
  ],
1469
  "class_name": "SpaCyTokenizer",
1470
  "macrolanguage": false
@@ -1495,20 +917,9 @@
1495
  "full_object": "SpaCyTokenizer(\"ta\")",
1496
  "original_lang_name": "tamil",
1497
  "original_lang_code": "tam",
1498
- "scripts": [
1499
- "Taml",
1500
- "Latn"
1501
- ],
1502
- "class_name": "SpaCyTokenizer",
1503
- "macrolanguage": false
1504
- },
1505
- "Telu": {
1506
- "full_object": "SpaCyTokenizer(\"te\")",
1507
- "original_lang_name": "telugu",
1508
- "original_lang_code": "tel",
1509
  "scripts": [
1510
  "Latn",
1511
- "Telu"
1512
  ],
1513
  "class_name": "SpaCyTokenizer",
1514
  "macrolanguage": false
@@ -1541,39 +952,6 @@
1541
  ],
1542
  "class_name": "SpaCyTokenizer",
1543
  "macrolanguage": false
1544
- },
1545
- "Mlym": {
1546
- "full_object": "SpaCyTokenizer(\"ml\")",
1547
- "original_lang_name": "malayalam",
1548
- "original_lang_code": "mal",
1549
- "scripts": [
1550
- "Latn",
1551
- "Mlym"
1552
- ],
1553
- "class_name": "SpaCyTokenizer",
1554
- "macrolanguage": false
1555
- },
1556
- "Taml": {
1557
- "full_object": "SpaCyTokenizer(\"ta\")",
1558
- "original_lang_name": "tamil",
1559
- "original_lang_code": "tam",
1560
- "scripts": [
1561
- "Taml",
1562
- "Latn"
1563
- ],
1564
- "class_name": "SpaCyTokenizer",
1565
- "macrolanguage": false
1566
- },
1567
- "Telu": {
1568
- "full_object": "SpaCyTokenizer(\"te\")",
1569
- "original_lang_name": "telugu",
1570
- "original_lang_code": "tel",
1571
- "scripts": [
1572
- "Latn",
1573
- "Telu"
1574
- ],
1575
- "class_name": "SpaCyTokenizer",
1576
- "macrolanguage": false
1577
  }
1578
  },
1579
  "children": [
@@ -1658,8 +1036,8 @@
1658
  "original_lang_name": "tamil",
1659
  "original_lang_code": "tam",
1660
  "scripts": [
1661
- "Taml",
1662
- "Latn"
1663
  ],
1664
  "class_name": "SpaCyTokenizer",
1665
  "macrolanguage": false
@@ -1679,31 +1057,9 @@
1679
  "full_object": "SpaCyTokenizer(\"ta\")",
1680
  "original_lang_name": "tamil",
1681
  "original_lang_code": "tam",
1682
- "scripts": [
1683
- "Taml",
1684
- "Latn"
1685
- ],
1686
- "class_name": "SpaCyTokenizer",
1687
- "macrolanguage": false
1688
- },
1689
- "Knda": {
1690
- "full_object": "SpaCyTokenizer(\"kn\")",
1691
- "original_lang_name": "kannada",
1692
- "original_lang_code": "kan",
1693
- "scripts": [
1694
- "Latn",
1695
- "Knda"
1696
- ],
1697
- "class_name": "SpaCyTokenizer",
1698
- "macrolanguage": false
1699
- },
1700
- "Telu": {
1701
- "full_object": "SpaCyTokenizer(\"te\")",
1702
- "original_lang_name": "telugu",
1703
- "original_lang_code": "tel",
1704
  "scripts": [
1705
  "Latn",
1706
- "Telu"
1707
  ],
1708
  "class_name": "SpaCyTokenizer",
1709
  "macrolanguage": false
@@ -1714,83 +1070,27 @@
1714
  "name": "Kodagu",
1715
  "iso_1_code": null,
1716
  "iso_3_code": null,
1717
- "tokenizers": {
1718
- "Latn": {
1719
- "full_object": "SpaCyTokenizer(\"ta\")",
1720
- "original_lang_name": "tamil",
1721
- "original_lang_code": "tam",
1722
- "scripts": [
1723
- "Taml",
1724
- "Latn"
1725
- ],
1726
- "class_name": "SpaCyTokenizer",
1727
- "macrolanguage": false
1728
- },
1729
- "Mlym": {
1730
- "full_object": "SpaCyTokenizer(\"ml\")",
1731
- "original_lang_name": "malayalam",
1732
- "original_lang_code": "mal",
1733
- "scripts": [
1734
- "Latn",
1735
- "Mlym"
1736
- ],
1737
- "class_name": "SpaCyTokenizer",
1738
- "macrolanguage": false
1739
  },
1740
- "Taml": {
1741
- "full_object": "SpaCyTokenizer(\"ta\")",
1742
- "original_lang_name": "tamil",
1743
- "original_lang_code": "tam",
1744
- "scripts": [
1745
- "Taml",
1746
- "Latn"
1747
- ],
1748
- "class_name": "SpaCyTokenizer",
1749
- "macrolanguage": false
1750
- },
1751
- "Knda": {
1752
- "full_object": "SpaCyTokenizer(\"kn\")",
1753
- "original_lang_name": "kannada",
1754
- "original_lang_code": "kan",
1755
- "scripts": [
1756
- "Latn",
1757
- "Knda"
1758
- ],
1759
- "class_name": "SpaCyTokenizer",
1760
- "macrolanguage": false
1761
- },
1762
- "Telu": {
1763
- "full_object": "SpaCyTokenizer(\"te\")",
1764
- "original_lang_name": "telugu",
1765
- "original_lang_code": "tel",
1766
- "scripts": [
1767
- "Latn",
1768
- "Telu"
1769
- ],
1770
- "class_name": "SpaCyTokenizer",
1771
- "macrolanguage": false
1772
- }
1773
- },
1774
- "children": [
1775
- {
1776
- "name": "Kodava",
1777
- "iso_1_code": null,
1778
- "iso_3_code": "kfa",
1779
- "tokenizers": {},
1780
- "children": [],
1781
- "node_i": "3664",
1782
- "scripts": [],
1783
- "own_tokenizer": false
1784
- },
1785
- {
1786
- "name": "Kurumba, Kannada",
1787
- "iso_1_code": null,
1788
- "iso_3_code": "kfi",
1789
- "tokenizers": {},
1790
- "children": [],
1791
- "node_i": "3665",
1792
- "scripts": [],
1793
- "own_tokenizer": false
1794
  },
1795
  {
1796
  "name": "Kurumba, Mullu",
@@ -1837,8 +1137,8 @@
1837
  "original_lang_name": "tamil",
1838
  "original_lang_code": "tam",
1839
  "scripts": [
1840
- "Taml",
1841
- "Latn"
1842
  ],
1843
  "class_name": "SpaCyTokenizer",
1844
  "macrolanguage": false
@@ -1858,31 +1158,9 @@
1858
  "full_object": "SpaCyTokenizer(\"ta\")",
1859
  "original_lang_name": "tamil",
1860
  "original_lang_code": "tam",
1861
- "scripts": [
1862
- "Taml",
1863
- "Latn"
1864
- ],
1865
- "class_name": "SpaCyTokenizer",
1866
- "macrolanguage": false
1867
- },
1868
- "Knda": {
1869
- "full_object": "SpaCyTokenizer(\"kn\")",
1870
- "original_lang_name": "kannada",
1871
- "original_lang_code": "kan",
1872
  "scripts": [
1873
  "Latn",
1874
- "Knda"
1875
- ],
1876
- "class_name": "SpaCyTokenizer",
1877
- "macrolanguage": false
1878
- },
1879
- "Telu": {
1880
- "full_object": "SpaCyTokenizer(\"te\")",
1881
- "original_lang_name": "telugu",
1882
- "original_lang_code": "tel",
1883
- "scripts": [
1884
- "Latn",
1885
- "Telu"
1886
  ],
1887
  "class_name": "SpaCyTokenizer",
1888
  "macrolanguage": false
@@ -1925,39 +1203,6 @@
1925
  ],
1926
  "class_name": "SpaCyTokenizer",
1927
  "macrolanguage": false
1928
- },
1929
- "Taml": {
1930
- "full_object": "SpaCyTokenizer(\"ta\")",
1931
- "original_lang_name": "tamil",
1932
- "original_lang_code": "tam",
1933
- "scripts": [
1934
- "Taml",
1935
- "Latn"
1936
- ],
1937
- "class_name": "SpaCyTokenizer",
1938
- "macrolanguage": false
1939
- },
1940
- "Knda": {
1941
- "full_object": "SpaCyTokenizer(\"kn\")",
1942
- "original_lang_name": "kannada",
1943
- "original_lang_code": "kan",
1944
- "scripts": [
1945
- "Latn",
1946
- "Knda"
1947
- ],
1948
- "class_name": "SpaCyTokenizer",
1949
- "macrolanguage": false
1950
- },
1951
- "Telu": {
1952
- "full_object": "SpaCyTokenizer(\"te\")",
1953
- "original_lang_name": "telugu",
1954
- "original_lang_code": "tel",
1955
- "scripts": [
1956
- "Latn",
1957
- "Telu"
1958
- ],
1959
- "class_name": "SpaCyTokenizer",
1960
- "macrolanguage": false
1961
  }
1962
  },
1963
  "children": [
@@ -2087,57 +1332,24 @@
2087
  "iso_1_code": null,
2088
  "iso_3_code": null,
2089
  "tokenizers": {
2090
- "Taml": {
2091
  "full_object": "SpaCyTokenizer(\"ta\")",
2092
  "original_lang_name": "tamil",
2093
  "original_lang_code": "tam",
2094
  "scripts": [
2095
- "Taml",
2096
- "Latn"
2097
  ],
2098
  "class_name": "SpaCyTokenizer",
2099
  "macrolanguage": false
2100
  },
2101
- "Latn": {
2102
  "full_object": "SpaCyTokenizer(\"ta\")",
2103
  "original_lang_name": "tamil",
2104
  "original_lang_code": "tam",
2105
- "scripts": [
2106
- "Taml",
2107
- "Latn"
2108
- ],
2109
- "class_name": "SpaCyTokenizer",
2110
- "macrolanguage": false
2111
- },
2112
- "Mlym": {
2113
- "full_object": "SpaCyTokenizer(\"ml\")",
2114
- "original_lang_name": "malayalam",
2115
- "original_lang_code": "mal",
2116
- "scripts": [
2117
- "Latn",
2118
- "Mlym"
2119
- ],
2120
- "class_name": "SpaCyTokenizer",
2121
- "macrolanguage": false
2122
- },
2123
- "Knda": {
2124
- "full_object": "SpaCyTokenizer(\"kn\")",
2125
- "original_lang_name": "kannada",
2126
- "original_lang_code": "kan",
2127
  "scripts": [
2128
  "Latn",
2129
- "Knda"
2130
- ],
2131
- "class_name": "SpaCyTokenizer",
2132
- "macrolanguage": false
2133
- },
2134
- "Telu": {
2135
- "full_object": "SpaCyTokenizer(\"te\")",
2136
- "original_lang_name": "telugu",
2137
- "original_lang_code": "tel",
2138
- "scripts": [
2139
- "Latn",
2140
- "Telu"
2141
  ],
2142
  "class_name": "SpaCyTokenizer",
2143
  "macrolanguage": false
@@ -2209,24 +1421,24 @@
2209
  "iso_1_code": "ta",
2210
  "iso_3_code": "tam",
2211
  "tokenizers": {
2212
- "Taml": {
2213
  "full_object": "SpaCyTokenizer(\"ta\")",
2214
  "original_lang_name": "tamil",
2215
  "original_lang_code": "tam",
2216
  "scripts": [
2217
- "Taml",
2218
- "Latn"
2219
  ],
2220
  "class_name": "SpaCyTokenizer",
2221
  "macrolanguage": false
2222
  },
2223
- "Latn": {
2224
  "full_object": "SpaCyTokenizer(\"ta\")",
2225
  "original_lang_name": "tamil",
2226
  "original_lang_code": "tam",
2227
  "scripts": [
2228
- "Taml",
2229
- "Latn"
2230
  ],
2231
  "class_name": "SpaCyTokenizer",
2232
  "macrolanguage": false
@@ -2274,63 +1486,7 @@
2274
  "name": "Toda-Kota",
2275
  "iso_1_code": null,
2276
  "iso_3_code": null,
2277
- "tokenizers": {
2278
- "Latn": {
2279
- "full_object": "SpaCyTokenizer(\"ta\")",
2280
- "original_lang_name": "tamil",
2281
- "original_lang_code": "tam",
2282
- "scripts": [
2283
- "Taml",
2284
- "Latn"
2285
- ],
2286
- "class_name": "SpaCyTokenizer",
2287
- "macrolanguage": false
2288
- },
2289
- "Mlym": {
2290
- "full_object": "SpaCyTokenizer(\"ml\")",
2291
- "original_lang_name": "malayalam",
2292
- "original_lang_code": "mal",
2293
- "scripts": [
2294
- "Latn",
2295
- "Mlym"
2296
- ],
2297
- "class_name": "SpaCyTokenizer",
2298
- "macrolanguage": false
2299
- },
2300
- "Taml": {
2301
- "full_object": "SpaCyTokenizer(\"ta\")",
2302
- "original_lang_name": "tamil",
2303
- "original_lang_code": "tam",
2304
- "scripts": [
2305
- "Taml",
2306
- "Latn"
2307
- ],
2308
- "class_name": "SpaCyTokenizer",
2309
- "macrolanguage": false
2310
- },
2311
- "Knda": {
2312
- "full_object": "SpaCyTokenizer(\"kn\")",
2313
- "original_lang_name": "kannada",
2314
- "original_lang_code": "kan",
2315
- "scripts": [
2316
- "Latn",
2317
- "Knda"
2318
- ],
2319
- "class_name": "SpaCyTokenizer",
2320
- "macrolanguage": false
2321
- },
2322
- "Telu": {
2323
- "full_object": "SpaCyTokenizer(\"te\")",
2324
- "original_lang_name": "telugu",
2325
- "original_lang_code": "tel",
2326
- "scripts": [
2327
- "Latn",
2328
- "Telu"
2329
- ],
2330
- "class_name": "SpaCyTokenizer",
2331
- "macrolanguage": false
2332
- }
2333
- },
2334
  "children": [
2335
  {
2336
  "name": "Kota",
@@ -2366,63 +1522,7 @@
2366
  "name": "Unclassified",
2367
  "iso_1_code": null,
2368
  "iso_3_code": null,
2369
- "tokenizers": {
2370
- "Latn": {
2371
- "full_object": "SpaCyTokenizer(\"ta\")",
2372
- "original_lang_name": "tamil",
2373
- "original_lang_code": "tam",
2374
- "scripts": [
2375
- "Taml",
2376
- "Latn"
2377
- ],
2378
- "class_name": "SpaCyTokenizer",
2379
- "macrolanguage": false
2380
- },
2381
- "Knda": {
2382
- "full_object": "SpaCyTokenizer(\"kn\")",
2383
- "original_lang_name": "kannada",
2384
- "original_lang_code": "kan",
2385
- "scripts": [
2386
- "Latn",
2387
- "Knda"
2388
- ],
2389
- "class_name": "SpaCyTokenizer",
2390
- "macrolanguage": false
2391
- },
2392
- "Mlym": {
2393
- "full_object": "SpaCyTokenizer(\"ml\")",
2394
- "original_lang_name": "malayalam",
2395
- "original_lang_code": "mal",
2396
- "scripts": [
2397
- "Latn",
2398
- "Mlym"
2399
- ],
2400
- "class_name": "SpaCyTokenizer",
2401
- "macrolanguage": false
2402
- },
2403
- "Taml": {
2404
- "full_object": "SpaCyTokenizer(\"ta\")",
2405
- "original_lang_name": "tamil",
2406
- "original_lang_code": "tam",
2407
- "scripts": [
2408
- "Taml",
2409
- "Latn"
2410
- ],
2411
- "class_name": "SpaCyTokenizer",
2412
- "macrolanguage": false
2413
- },
2414
- "Telu": {
2415
- "full_object": "SpaCyTokenizer(\"te\")",
2416
- "original_lang_name": "telugu",
2417
- "original_lang_code": "tel",
2418
- "scripts": [
2419
- "Latn",
2420
- "Telu"
2421
- ],
2422
- "class_name": "SpaCyTokenizer",
2423
- "macrolanguage": false
2424
- }
2425
- },
2426
  "children": [
2427
  {
2428
  "name": "Chetti, Wayanad",
@@ -2449,17 +1549,6 @@
2449
  "iso_1_code": null,
2450
  "iso_3_code": null,
2451
  "tokenizers": {
2452
- "Latn": {
2453
- "full_object": "SpaCyTokenizer(\"ta\")",
2454
- "original_lang_name": "tamil",
2455
- "original_lang_code": "tam",
2456
- "scripts": [
2457
- "Taml",
2458
- "Latn"
2459
- ],
2460
- "class_name": "SpaCyTokenizer",
2461
- "macrolanguage": false
2462
- },
2463
  "Knda": {
2464
  "full_object": "SpaCyTokenizer(\"kn\")",
2465
  "original_lang_name": "kannada",
@@ -2470,39 +1559,6 @@
2470
  ],
2471
  "class_name": "SpaCyTokenizer",
2472
  "macrolanguage": false
2473
- },
2474
- "Mlym": {
2475
- "full_object": "SpaCyTokenizer(\"ml\")",
2476
- "original_lang_name": "malayalam",
2477
- "original_lang_code": "mal",
2478
- "scripts": [
2479
- "Latn",
2480
- "Mlym"
2481
- ],
2482
- "class_name": "SpaCyTokenizer",
2483
- "macrolanguage": false
2484
- },
2485
- "Taml": {
2486
- "full_object": "SpaCyTokenizer(\"ta\")",
2487
- "original_lang_name": "tamil",
2488
- "original_lang_code": "tam",
2489
- "scripts": [
2490
- "Taml",
2491
- "Latn"
2492
- ],
2493
- "class_name": "SpaCyTokenizer",
2494
- "macrolanguage": false
2495
- },
2496
- "Telu": {
2497
- "full_object": "SpaCyTokenizer(\"te\")",
2498
- "original_lang_name": "telugu",
2499
- "original_lang_code": "tel",
2500
- "scripts": [
2501
- "Latn",
2502
- "Telu"
2503
- ],
2504
- "class_name": "SpaCyTokenizer",
2505
- "macrolanguage": false
2506
  }
2507
  },
2508
  "children": [
@@ -2554,63 +1610,7 @@
2554
  "name": "Koraga",
2555
  "iso_1_code": null,
2556
  "iso_3_code": null,
2557
- "tokenizers": {
2558
- "Latn": {
2559
- "full_object": "SpaCyTokenizer(\"ta\")",
2560
- "original_lang_name": "tamil",
2561
- "original_lang_code": "tam",
2562
- "scripts": [
2563
- "Taml",
2564
- "Latn"
2565
- ],
2566
- "class_name": "SpaCyTokenizer",
2567
- "macrolanguage": false
2568
- },
2569
- "Knda": {
2570
- "full_object": "SpaCyTokenizer(\"kn\")",
2571
- "original_lang_name": "kannada",
2572
- "original_lang_code": "kan",
2573
- "scripts": [
2574
- "Latn",
2575
- "Knda"
2576
- ],
2577
- "class_name": "SpaCyTokenizer",
2578
- "macrolanguage": false
2579
- },
2580
- "Mlym": {
2581
- "full_object": "SpaCyTokenizer(\"ml\")",
2582
- "original_lang_name": "malayalam",
2583
- "original_lang_code": "mal",
2584
- "scripts": [
2585
- "Latn",
2586
- "Mlym"
2587
- ],
2588
- "class_name": "SpaCyTokenizer",
2589
- "macrolanguage": false
2590
- },
2591
- "Taml": {
2592
- "full_object": "SpaCyTokenizer(\"ta\")",
2593
- "original_lang_name": "tamil",
2594
- "original_lang_code": "tam",
2595
- "scripts": [
2596
- "Taml",
2597
- "Latn"
2598
- ],
2599
- "class_name": "SpaCyTokenizer",
2600
- "macrolanguage": false
2601
- },
2602
- "Telu": {
2603
- "full_object": "SpaCyTokenizer(\"te\")",
2604
- "original_lang_name": "telugu",
2605
- "original_lang_code": "tel",
2606
- "scripts": [
2607
- "Latn",
2608
- "Telu"
2609
- ],
2610
- "class_name": "SpaCyTokenizer",
2611
- "macrolanguage": false
2612
- }
2613
- },
2614
  "children": [
2615
  {
2616
  "name": "Koraga, Korra",
@@ -2646,63 +1646,7 @@
2646
  "name": "Unclassified",
2647
  "iso_1_code": null,
2648
  "iso_3_code": null,
2649
- "tokenizers": {
2650
- "Latn": {
2651
- "full_object": "SpaCyTokenizer(\"ta\")",
2652
- "original_lang_name": "tamil",
2653
- "original_lang_code": "tam",
2654
- "scripts": [
2655
- "Taml",
2656
- "Latn"
2657
- ],
2658
- "class_name": "SpaCyTokenizer",
2659
- "macrolanguage": false
2660
- },
2661
- "Knda": {
2662
- "full_object": "SpaCyTokenizer(\"kn\")",
2663
- "original_lang_name": "kannada",
2664
- "original_lang_code": "kan",
2665
- "scripts": [
2666
- "Latn",
2667
- "Knda"
2668
- ],
2669
- "class_name": "SpaCyTokenizer",
2670
- "macrolanguage": false
2671
- },
2672
- "Mlym": {
2673
- "full_object": "SpaCyTokenizer(\"ml\")",
2674
- "original_lang_name": "malayalam",
2675
- "original_lang_code": "mal",
2676
- "scripts": [
2677
- "Latn",
2678
- "Mlym"
2679
- ],
2680
- "class_name": "SpaCyTokenizer",
2681
- "macrolanguage": false
2682
- },
2683
- "Taml": {
2684
- "full_object": "SpaCyTokenizer(\"ta\")",
2685
- "original_lang_name": "tamil",
2686
- "original_lang_code": "tam",
2687
- "scripts": [
2688
- "Taml",
2689
- "Latn"
2690
- ],
2691
- "class_name": "SpaCyTokenizer",
2692
- "macrolanguage": false
2693
- },
2694
- "Telu": {
2695
- "full_object": "SpaCyTokenizer(\"te\")",
2696
- "original_lang_name": "telugu",
2697
- "original_lang_code": "tel",
2698
- "scripts": [
2699
- "Latn",
2700
- "Telu"
2701
- ],
2702
- "class_name": "SpaCyTokenizer",
2703
- "macrolanguage": false
2704
- }
2705
- },
2706
  "children": [
2707
  {
2708
  "name": "Mala Malasar",
@@ -2758,63 +1702,7 @@
2758
  "name": "Unclassified",
2759
  "iso_1_code": null,
2760
  "iso_3_code": null,
2761
- "tokenizers": {
2762
- "Latn": {
2763
- "full_object": "SpaCyTokenizer(\"ta\")",
2764
- "original_lang_name": "tamil",
2765
- "original_lang_code": "tam",
2766
- "scripts": [
2767
- "Taml",
2768
- "Latn"
2769
- ],
2770
- "class_name": "SpaCyTokenizer",
2771
- "macrolanguage": false
2772
- },
2773
- "Telu": {
2774
- "full_object": "SpaCyTokenizer(\"te\")",
2775
- "original_lang_name": "telugu",
2776
- "original_lang_code": "tel",
2777
- "scripts": [
2778
- "Latn",
2779
- "Telu"
2780
- ],
2781
- "class_name": "SpaCyTokenizer",
2782
- "macrolanguage": false
2783
- },
2784
- "Knda": {
2785
- "full_object": "SpaCyTokenizer(\"kn\")",
2786
- "original_lang_name": "kannada",
2787
- "original_lang_code": "kan",
2788
- "scripts": [
2789
- "Latn",
2790
- "Knda"
2791
- ],
2792
- "class_name": "SpaCyTokenizer",
2793
- "macrolanguage": false
2794
- },
2795
- "Mlym": {
2796
- "full_object": "SpaCyTokenizer(\"ml\")",
2797
- "original_lang_name": "malayalam",
2798
- "original_lang_code": "mal",
2799
- "scripts": [
2800
- "Latn",
2801
- "Mlym"
2802
- ],
2803
- "class_name": "SpaCyTokenizer",
2804
- "macrolanguage": false
2805
- },
2806
- "Taml": {
2807
- "full_object": "SpaCyTokenizer(\"ta\")",
2808
- "original_lang_name": "tamil",
2809
- "original_lang_code": "tam",
2810
- "scripts": [
2811
- "Taml",
2812
- "Latn"
2813
- ],
2814
- "class_name": "SpaCyTokenizer",
2815
- "macrolanguage": false
2816
- }
2817
- },
2818
  "children": [
2819
  {
2820
  "name": "Allar",
 
3
  "iso_1_code": null,
4
  "iso_3_code": null,
5
  "tokenizers": {
6
+ "Telu": {
7
+ "full_object": "SpaCyTokenizer(\"te\")",
8
+ "original_lang_name": "telugu",
9
+ "original_lang_code": "tel",
10
  "scripts": [
11
+ "Telu",
12
  "Latn"
13
  ],
14
  "class_name": "SpaCyTokenizer",
15
  "macrolanguage": false
16
  },
17
+ "Latn": {
18
+ "full_object": "SpaCyTokenizer(\"ta\")",
19
+ "original_lang_name": "tamil",
20
+ "original_lang_code": "tam",
21
  "scripts": [
22
  "Latn",
23
+ "Taml"
24
  ],
25
  "class_name": "SpaCyTokenizer",
26
  "macrolanguage": false
 
52
  "original_lang_name": "tamil",
53
  "original_lang_code": "tam",
54
  "scripts": [
55
+ "Latn",
56
+ "Taml"
57
  ],
58
  "class_name": "SpaCyTokenizer",
59
  "macrolanguage": false
 
64
  "name": "Central",
65
  "iso_1_code": null,
66
  "iso_3_code": null,
67
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "children": [
69
  {
70
  "name": "Kolami-Naiki",
71
  "iso_1_code": null,
72
  "iso_3_code": null,
73
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "children": [
75
  {
76
  "name": "Kolami, Northwestern",
 
101
  "name": "Parji-Gadaba",
102
  "iso_1_code": null,
103
  "iso_3_code": null,
104
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  "children": [
106
  {
107
  "name": "Gadaba, Mudhili",
 
147
  "name": "Northern",
148
  "iso_1_code": null,
149
  "iso_3_code": null,
150
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  "children": [
152
  {
153
  "name": "Brahui",
 
213
  "iso_1_code": null,
214
  "iso_3_code": null,
215
  "tokenizers": {
216
+ "Telu": {
217
  "full_object": "SpaCyTokenizer(\"te\")",
218
  "original_lang_name": "telugu",
219
  "original_lang_code": "tel",
220
  "scripts": [
221
+ "Telu",
222
+ "Latn"
223
  ],
224
  "class_name": "SpaCyTokenizer",
225
  "macrolanguage": false
226
  },
227
+ "Latn": {
228
  "full_object": "SpaCyTokenizer(\"te\")",
229
  "original_lang_name": "telugu",
230
  "original_lang_code": "tel",
231
  "scripts": [
232
+ "Telu",
233
+ "Latn"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  ],
235
  "class_name": "SpaCyTokenizer",
236
  "macrolanguage": false
 
242
  "iso_1_code": null,
243
  "iso_3_code": null,
244
  "tokenizers": {
245
+ "Telu": {
246
  "full_object": "SpaCyTokenizer(\"te\")",
247
  "original_lang_name": "telugu",
248
  "original_lang_code": "tel",
249
  "scripts": [
250
+ "Telu",
251
+ "Latn"
252
  ],
253
  "class_name": "SpaCyTokenizer",
254
  "macrolanguage": false
255
  },
256
+ "Latn": {
257
  "full_object": "SpaCyTokenizer(\"te\")",
258
  "original_lang_name": "telugu",
259
  "original_lang_code": "tel",
260
  "scripts": [
261
+ "Telu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  "Latn"
263
  ],
264
  "class_name": "SpaCyTokenizer",
 
271
  "iso_1_code": null,
272
  "iso_3_code": null,
273
  "tokenizers": {
274
+ "Telu": {
275
  "full_object": "SpaCyTokenizer(\"te\")",
276
  "original_lang_name": "telugu",
277
  "original_lang_code": "tel",
278
  "scripts": [
279
+ "Telu",
280
+ "Latn"
281
  ],
282
  "class_name": "SpaCyTokenizer",
283
  "macrolanguage": false
284
  },
285
+ "Latn": {
286
  "full_object": "SpaCyTokenizer(\"te\")",
287
  "original_lang_name": "telugu",
288
  "original_lang_code": "tel",
289
  "scripts": [
290
+ "Telu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  "Latn"
292
  ],
293
  "class_name": "SpaCyTokenizer",
 
407
  "original_lang_name": "telugu",
408
  "original_lang_code": "tel",
409
  "scripts": [
410
+ "Telu",
411
+ "Latn"
412
  ],
413
  "class_name": "SpaCyTokenizer",
414
  "macrolanguage": false
 
431
  "iso_1_code": null,
432
  "iso_3_code": null,
433
  "tokenizers": {
434
+ "Telu": {
435
  "full_object": "SpaCyTokenizer(\"te\")",
436
  "original_lang_name": "telugu",
437
  "original_lang_code": "tel",
438
  "scripts": [
439
+ "Telu",
440
+ "Latn"
441
  ],
442
  "class_name": "SpaCyTokenizer",
443
  "macrolanguage": false
444
  },
445
+ "Latn": {
446
  "full_object": "SpaCyTokenizer(\"te\")",
447
  "original_lang_name": "telugu",
448
  "original_lang_code": "tel",
449
  "scripts": [
450
+ "Telu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  "Latn"
452
  ],
453
  "class_name": "SpaCyTokenizer",
 
459
  "name": "Konda",
460
  "iso_1_code": null,
461
  "iso_3_code": null,
462
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  "children": [
464
  {
465
  "name": "Konda-Dora",
 
491
  "iso_1_code": null,
492
  "iso_3_code": null,
493
  "tokenizers": {
494
+ "Telu": {
495
  "full_object": "SpaCyTokenizer(\"te\")",
496
  "original_lang_name": "telugu",
497
  "original_lang_code": "tel",
498
  "scripts": [
499
+ "Telu",
500
+ "Latn"
501
  ],
502
  "class_name": "SpaCyTokenizer",
503
  "macrolanguage": false
504
  },
505
+ "Latn": {
506
  "full_object": "SpaCyTokenizer(\"te\")",
507
  "original_lang_name": "telugu",
508
  "original_lang_code": "tel",
509
  "scripts": [
510
+ "Telu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  "Latn"
512
  ],
513
  "class_name": "SpaCyTokenizer",
 
520
  "iso_1_code": null,
521
  "iso_3_code": null,
522
  "tokenizers": {
523
+ "Telu": {
524
  "full_object": "SpaCyTokenizer(\"te\")",
525
  "original_lang_name": "telugu",
526
  "original_lang_code": "tel",
527
  "scripts": [
528
+ "Telu",
529
+ "Latn"
530
  ],
531
  "class_name": "SpaCyTokenizer",
532
  "macrolanguage": false
533
  },
534
+ "Latn": {
535
  "full_object": "SpaCyTokenizer(\"te\")",
536
  "original_lang_name": "telugu",
537
  "original_lang_code": "tel",
538
  "scripts": [
539
+ "Telu",
540
+ "Latn"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  ],
542
  "class_name": "SpaCyTokenizer",
543
  "macrolanguage": false
 
564
  "original_lang_name": "telugu",
565
  "original_lang_code": "tel",
566
  "scripts": [
567
+ "Telu",
568
+ "Latn"
569
  ],
570
  "class_name": "SpaCyTokenizer",
571
  "macrolanguage": false
 
607
  "name": "Manda-Pengo",
608
  "iso_1_code": null,
609
  "iso_3_code": null,
610
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  "children": [
612
  {
613
  "name": "Manda",
 
654
  "iso_1_code": null,
655
  "iso_3_code": null,
656
  "tokenizers": {
657
+ "Telu": {
658
  "full_object": "SpaCyTokenizer(\"te\")",
659
  "original_lang_name": "telugu",
660
  "original_lang_code": "tel",
661
  "scripts": [
662
+ "Telu",
663
+ "Latn"
664
  ],
665
  "class_name": "SpaCyTokenizer",
666
  "macrolanguage": false
667
  },
668
+ "Latn": {
669
  "full_object": "SpaCyTokenizer(\"te\")",
670
  "original_lang_name": "telugu",
671
  "original_lang_code": "tel",
672
  "scripts": [
673
+ "Telu",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
  "Latn"
675
  ],
676
  "class_name": "SpaCyTokenizer",
 
703
  "iso_1_code": "te",
704
  "iso_3_code": "tel",
705
  "tokenizers": {
706
+ "Telu": {
707
  "full_object": "SpaCyTokenizer(\"te\")",
708
  "original_lang_name": "telugu",
709
  "original_lang_code": "tel",
710
  "scripts": [
711
+ "Telu",
712
+ "Latn"
713
  ],
714
  "class_name": "SpaCyTokenizer",
715
  "macrolanguage": false
716
  },
717
+ "Latn": {
718
  "full_object": "SpaCyTokenizer(\"te\")",
719
  "original_lang_name": "telugu",
720
  "original_lang_code": "tel",
721
  "scripts": [
722
+ "Telu",
723
+ "Latn"
724
  ],
725
  "class_name": "SpaCyTokenizer",
726
  "macrolanguage": false
 
764
  "original_lang_name": "tamil",
765
  "original_lang_code": "tam",
766
  "scripts": [
767
+ "Latn",
768
+ "Taml"
769
  ],
770
  "class_name": "SpaCyTokenizer",
771
  "macrolanguage": false
 
796
  "full_object": "SpaCyTokenizer(\"ta\")",
797
  "original_lang_name": "tamil",
798
  "original_lang_code": "tam",
 
 
 
 
 
 
 
 
 
 
 
799
  "scripts": [
800
  "Latn",
801
+ "Taml"
802
  ],
803
  "class_name": "SpaCyTokenizer",
804
  "macrolanguage": false
 
885
  "original_lang_name": "tamil",
886
  "original_lang_code": "tam",
887
  "scripts": [
888
+ "Latn",
889
+ "Taml"
890
  ],
891
  "class_name": "SpaCyTokenizer",
892
  "macrolanguage": false
 
917
  "full_object": "SpaCyTokenizer(\"ta\")",
918
  "original_lang_name": "tamil",
919
  "original_lang_code": "tam",
 
 
 
 
 
 
 
 
 
 
 
920
  "scripts": [
921
  "Latn",
922
+ "Taml"
923
  ],
924
  "class_name": "SpaCyTokenizer",
925
  "macrolanguage": false
 
952
  ],
953
  "class_name": "SpaCyTokenizer",
954
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
  }
956
  },
957
  "children": [
 
1036
  "original_lang_name": "tamil",
1037
  "original_lang_code": "tam",
1038
  "scripts": [
1039
+ "Latn",
1040
+ "Taml"
1041
  ],
1042
  "class_name": "SpaCyTokenizer",
1043
  "macrolanguage": false
 
1057
  "full_object": "SpaCyTokenizer(\"ta\")",
1058
  "original_lang_name": "tamil",
1059
  "original_lang_code": "tam",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1060
  "scripts": [
1061
  "Latn",
1062
+ "Taml"
1063
  ],
1064
  "class_name": "SpaCyTokenizer",
1065
  "macrolanguage": false
 
1070
  "name": "Kodagu",
1071
  "iso_1_code": null,
1072
  "iso_3_code": null,
1073
+ "tokenizers": {},
1074
+ "children": [
1075
+ {
1076
+ "name": "Kodava",
1077
+ "iso_1_code": null,
1078
+ "iso_3_code": "kfa",
1079
+ "tokenizers": {},
1080
+ "children": [],
1081
+ "node_i": "3664",
1082
+ "scripts": [],
1083
+ "own_tokenizer": false
 
 
 
 
 
 
 
 
 
 
 
1084
  },
1085
+ {
1086
+ "name": "Kurumba, Kannada",
1087
+ "iso_1_code": null,
1088
+ "iso_3_code": "kfi",
1089
+ "tokenizers": {},
1090
+ "children": [],
1091
+ "node_i": "3665",
1092
+ "scripts": [],
1093
+ "own_tokenizer": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1094
  },
1095
  {
1096
  "name": "Kurumba, Mullu",
 
1137
  "original_lang_name": "tamil",
1138
  "original_lang_code": "tam",
1139
  "scripts": [
1140
+ "Latn",
1141
+ "Taml"
1142
  ],
1143
  "class_name": "SpaCyTokenizer",
1144
  "macrolanguage": false
 
1158
  "full_object": "SpaCyTokenizer(\"ta\")",
1159
  "original_lang_name": "tamil",
1160
  "original_lang_code": "tam",
 
 
 
 
 
 
 
 
 
 
 
1161
  "scripts": [
1162
  "Latn",
1163
+ "Taml"
 
 
 
 
 
 
 
 
 
 
 
1164
  ],
1165
  "class_name": "SpaCyTokenizer",
1166
  "macrolanguage": false
 
1203
  ],
1204
  "class_name": "SpaCyTokenizer",
1205
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1206
  }
1207
  },
1208
  "children": [
 
1332
  "iso_1_code": null,
1333
  "iso_3_code": null,
1334
  "tokenizers": {
1335
+ "Latn": {
1336
  "full_object": "SpaCyTokenizer(\"ta\")",
1337
  "original_lang_name": "tamil",
1338
  "original_lang_code": "tam",
1339
  "scripts": [
1340
+ "Latn",
1341
+ "Taml"
1342
  ],
1343
  "class_name": "SpaCyTokenizer",
1344
  "macrolanguage": false
1345
  },
1346
+ "Taml": {
1347
  "full_object": "SpaCyTokenizer(\"ta\")",
1348
  "original_lang_name": "tamil",
1349
  "original_lang_code": "tam",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1350
  "scripts": [
1351
  "Latn",
1352
+ "Taml"
 
 
 
 
 
 
 
 
 
 
 
1353
  ],
1354
  "class_name": "SpaCyTokenizer",
1355
  "macrolanguage": false
 
1421
  "iso_1_code": "ta",
1422
  "iso_3_code": "tam",
1423
  "tokenizers": {
1424
+ "Latn": {
1425
  "full_object": "SpaCyTokenizer(\"ta\")",
1426
  "original_lang_name": "tamil",
1427
  "original_lang_code": "tam",
1428
  "scripts": [
1429
+ "Latn",
1430
+ "Taml"
1431
  ],
1432
  "class_name": "SpaCyTokenizer",
1433
  "macrolanguage": false
1434
  },
1435
+ "Taml": {
1436
  "full_object": "SpaCyTokenizer(\"ta\")",
1437
  "original_lang_name": "tamil",
1438
  "original_lang_code": "tam",
1439
  "scripts": [
1440
+ "Latn",
1441
+ "Taml"
1442
  ],
1443
  "class_name": "SpaCyTokenizer",
1444
  "macrolanguage": false
 
1486
  "name": "Toda-Kota",
1487
  "iso_1_code": null,
1488
  "iso_3_code": null,
1489
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1490
  "children": [
1491
  {
1492
  "name": "Kota",
 
1522
  "name": "Unclassified",
1523
  "iso_1_code": null,
1524
  "iso_3_code": null,
1525
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1526
  "children": [
1527
  {
1528
  "name": "Chetti, Wayanad",
 
1549
  "iso_1_code": null,
1550
  "iso_3_code": null,
1551
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
 
1552
  "Knda": {
1553
  "full_object": "SpaCyTokenizer(\"kn\")",
1554
  "original_lang_name": "kannada",
 
1559
  ],
1560
  "class_name": "SpaCyTokenizer",
1561
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1562
  }
1563
  },
1564
  "children": [
 
1610
  "name": "Koraga",
1611
  "iso_1_code": null,
1612
  "iso_3_code": null,
1613
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1614
  "children": [
1615
  {
1616
  "name": "Koraga, Korra",
 
1646
  "name": "Unclassified",
1647
  "iso_1_code": null,
1648
  "iso_3_code": null,
1649
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1650
  "children": [
1651
  {
1652
  "name": "Mala Malasar",
 
1702
  "name": "Unclassified",
1703
  "iso_1_code": null,
1704
  "iso_3_code": null,
1705
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1706
  "children": [
1707
  {
1708
  "name": "Allar",
data/Indo-European.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/Japonic.json CHANGED
@@ -42,52 +42,19 @@
42
  "name": "Ryukyuan",
43
  "iso_1_code": null,
44
  "iso_3_code": null,
45
- "tokenizers": {
46
- "Jpan": {
47
- "full_object": "SpaCyTokenizer(\"ja\"), ",
48
- "original_lang_name": "japanese",
49
- "original_lang_code": "jpn",
50
- "scripts": [
51
- "Jpan"
52
- ],
53
- "class_name": "SpaCyTokenizer",
54
- "macrolanguage": false
55
- }
56
- },
57
  "children": [
58
  {
59
  "name": "Amami-Okinawan",
60
  "iso_1_code": null,
61
  "iso_3_code": null,
62
- "tokenizers": {
63
- "Jpan": {
64
- "full_object": "SpaCyTokenizer(\"ja\"), ",
65
- "original_lang_name": "japanese",
66
- "original_lang_code": "jpn",
67
- "scripts": [
68
- "Jpan"
69
- ],
70
- "class_name": "SpaCyTokenizer",
71
- "macrolanguage": false
72
- }
73
- },
74
  "children": [
75
  {
76
  "name": "Northern Amami-Okinawan",
77
  "iso_1_code": null,
78
  "iso_3_code": null,
79
- "tokenizers": {
80
- "Jpan": {
81
- "full_object": "SpaCyTokenizer(\"ja\"), ",
82
- "original_lang_name": "japanese",
83
- "original_lang_code": "jpn",
84
- "scripts": [
85
- "Jpan"
86
- ],
87
- "class_name": "SpaCyTokenizer",
88
- "macrolanguage": false
89
- }
90
- },
91
  "children": [
92
  {
93
  "name": "Amami-Oshima, Southern",
@@ -138,18 +105,7 @@
138
  "name": "Southern Amami-Okinawan",
139
  "iso_1_code": null,
140
  "iso_3_code": null,
141
- "tokenizers": {
142
- "Jpan": {
143
- "full_object": "SpaCyTokenizer(\"ja\"), ",
144
- "original_lang_name": "japanese",
145
- "original_lang_code": "jpn",
146
- "scripts": [
147
- "Jpan"
148
- ],
149
- "class_name": "SpaCyTokenizer",
150
- "macrolanguage": false
151
- }
152
- },
153
  "children": [
154
  {
155
  "name": "Oki-No-Erabu",
@@ -205,18 +161,7 @@
205
  "name": "Sakishima",
206
  "iso_1_code": null,
207
  "iso_3_code": null,
208
- "tokenizers": {
209
- "Jpan": {
210
- "full_object": "SpaCyTokenizer(\"ja\"), ",
211
- "original_lang_name": "japanese",
212
- "original_lang_code": "jpn",
213
- "scripts": [
214
- "Jpan"
215
- ],
216
- "class_name": "SpaCyTokenizer",
217
- "macrolanguage": false
218
- }
219
- },
220
  "children": [
221
  {
222
  "name": "Miyako",
 
42
  "name": "Ryukyuan",
43
  "iso_1_code": null,
44
  "iso_3_code": null,
45
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
46
  "children": [
47
  {
48
  "name": "Amami-Okinawan",
49
  "iso_1_code": null,
50
  "iso_3_code": null,
51
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
52
  "children": [
53
  {
54
  "name": "Northern Amami-Okinawan",
55
  "iso_1_code": null,
56
  "iso_3_code": null,
57
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
58
  "children": [
59
  {
60
  "name": "Amami-Oshima, Southern",
 
105
  "name": "Southern Amami-Okinawan",
106
  "iso_1_code": null,
107
  "iso_3_code": null,
108
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
109
  "children": [
110
  {
111
  "name": "Oki-No-Erabu",
 
161
  "name": "Sakishima",
162
  "iso_1_code": null,
163
  "iso_3_code": null,
164
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
165
  "children": [
166
  {
167
  "name": "Miyako",
data/Kra-Dai.json CHANGED
@@ -29,18 +29,7 @@
29
  "name": "Hlai",
30
  "iso_1_code": null,
31
  "iso_3_code": null,
32
- "tokenizers": {
33
- "Thai": {
34
- "full_object": "ThaiTokenizer()",
35
- "original_lang_name": "thai",
36
- "original_lang_code": "tha",
37
- "scripts": [
38
- "Thai"
39
- ],
40
- "class_name": "ThaiTokenizer",
41
- "macrolanguage": false
42
- }
43
- },
44
  "children": [
45
  {
46
  "name": "Jiamao",
@@ -88,18 +77,7 @@
88
  "name": "Kam-Sui",
89
  "iso_1_code": null,
90
  "iso_3_code": null,
91
- "tokenizers": {
92
- "Thai": {
93
- "full_object": "ThaiTokenizer()",
94
- "original_lang_name": "thai",
95
- "original_lang_code": "tha",
96
- "scripts": [
97
- "Thai"
98
- ],
99
- "class_name": "ThaiTokenizer",
100
- "macrolanguage": false
101
- }
102
- },
103
  "children": [
104
  {
105
  "name": "Ai-Cham",
@@ -230,18 +208,7 @@
230
  "name": "Lakkja",
231
  "iso_1_code": null,
232
  "iso_3_code": null,
233
- "tokenizers": {
234
- "Thai": {
235
- "full_object": "ThaiTokenizer()",
236
- "original_lang_name": "thai",
237
- "original_lang_code": "tha",
238
- "scripts": [
239
- "Thai"
240
- ],
241
- "class_name": "ThaiTokenizer",
242
- "macrolanguage": false
243
- }
244
- },
245
  "children": [
246
  {
247
  "name": "Lakkia",
@@ -329,18 +296,7 @@
329
  "name": "Central",
330
  "iso_1_code": null,
331
  "iso_3_code": null,
332
- "tokenizers": {
333
- "Thai": {
334
- "full_object": "ThaiTokenizer()",
335
- "original_lang_name": "thai",
336
- "original_lang_code": "tha",
337
- "scripts": [
338
- "Thai"
339
- ],
340
- "class_name": "ThaiTokenizer",
341
- "macrolanguage": false
342
- }
343
- },
344
  "children": [
345
  {
346
  "name": "Cao Lan",
@@ -451,18 +407,7 @@
451
  "name": "Northern",
452
  "iso_1_code": null,
453
  "iso_3_code": null,
454
- "tokenizers": {
455
- "Thai": {
456
- "full_object": "ThaiTokenizer()",
457
- "original_lang_name": "thai",
458
- "original_lang_code": "tha",
459
- "scripts": [
460
- "Thai"
461
- ],
462
- "class_name": "ThaiTokenizer",
463
- "macrolanguage": false
464
- }
465
- },
466
  "children": [
467
  {
468
  "name": "Bouyei",
@@ -979,35 +924,13 @@
979
  "name": "Kra",
980
  "iso_1_code": null,
981
  "iso_3_code": null,
982
- "tokenizers": {
983
- "Thai": {
984
- "full_object": "ThaiTokenizer()",
985
- "original_lang_name": "thai",
986
- "original_lang_code": "tha",
987
- "scripts": [
988
- "Thai"
989
- ],
990
- "class_name": "ThaiTokenizer",
991
- "macrolanguage": false
992
- }
993
- },
994
  "children": [
995
  {
996
  "name": "Central Kra",
997
  "iso_1_code": null,
998
  "iso_3_code": null,
999
- "tokenizers": {
1000
- "Thai": {
1001
- "full_object": "ThaiTokenizer()",
1002
- "original_lang_name": "thai",
1003
- "original_lang_code": "tha",
1004
- "scripts": [
1005
- "Thai"
1006
- ],
1007
- "class_name": "ThaiTokenizer",
1008
- "macrolanguage": false
1009
- }
1010
- },
1011
  "children": [
1012
  {
1013
  "name": "Buyang, Baha",
@@ -1028,18 +951,7 @@
1028
  "name": "Eastern Kra",
1029
  "iso_1_code": null,
1030
  "iso_3_code": null,
1031
- "tokenizers": {
1032
- "Thai": {
1033
- "full_object": "ThaiTokenizer()",
1034
- "original_lang_name": "thai",
1035
- "original_lang_code": "tha",
1036
- "scripts": [
1037
- "Thai"
1038
- ],
1039
- "class_name": "ThaiTokenizer",
1040
- "macrolanguage": false
1041
- }
1042
- },
1043
  "children": [
1044
  {
1045
  "name": "Cun",
@@ -1120,18 +1032,7 @@
1120
  "name": "Western Kra",
1121
  "iso_1_code": null,
1122
  "iso_3_code": null,
1123
- "tokenizers": {
1124
- "Thai": {
1125
- "full_object": "ThaiTokenizer()",
1126
- "original_lang_name": "thai",
1127
- "original_lang_code": "tha",
1128
- "scripts": [
1129
- "Thai"
1130
- ],
1131
- "class_name": "ThaiTokenizer",
1132
- "macrolanguage": false
1133
- }
1134
- },
1135
  "children": [
1136
  {
1137
  "name": "A\u2019ou",
 
29
  "name": "Hlai",
30
  "iso_1_code": null,
31
  "iso_3_code": null,
32
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
33
  "children": [
34
  {
35
  "name": "Jiamao",
 
77
  "name": "Kam-Sui",
78
  "iso_1_code": null,
79
  "iso_3_code": null,
80
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
81
  "children": [
82
  {
83
  "name": "Ai-Cham",
 
208
  "name": "Lakkja",
209
  "iso_1_code": null,
210
  "iso_3_code": null,
211
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
212
  "children": [
213
  {
214
  "name": "Lakkia",
 
296
  "name": "Central",
297
  "iso_1_code": null,
298
  "iso_3_code": null,
299
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
300
  "children": [
301
  {
302
  "name": "Cao Lan",
 
407
  "name": "Northern",
408
  "iso_1_code": null,
409
  "iso_3_code": null,
410
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
411
  "children": [
412
  {
413
  "name": "Bouyei",
 
924
  "name": "Kra",
925
  "iso_1_code": null,
926
  "iso_3_code": null,
927
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
928
  "children": [
929
  {
930
  "name": "Central Kra",
931
  "iso_1_code": null,
932
  "iso_3_code": null,
933
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
934
  "children": [
935
  {
936
  "name": "Buyang, Baha",
 
951
  "name": "Eastern Kra",
952
  "iso_1_code": null,
953
  "iso_3_code": null,
954
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
955
  "children": [
956
  {
957
  "name": "Cun",
 
1032
  "name": "Western Kra",
1033
  "iso_1_code": null,
1034
  "iso_3_code": null,
1035
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
1036
  "children": [
1037
  {
1038
  "name": "A\u2019ou",
data/Mongolic.json CHANGED
@@ -36,18 +36,7 @@
36
  "name": "Dagur",
37
  "iso_1_code": null,
38
  "iso_3_code": null,
39
- "tokenizers": {
40
- "Cyrl": {
41
- "full_object": "StanzaTokenizer(\"bxr\")",
42
- "original_lang_name": "russia_buriat",
43
- "original_lang_code": "bxr",
44
- "scripts": [
45
- "Cyrl"
46
- ],
47
- "class_name": "StanzaTokenizer",
48
- "macrolanguage": false
49
- }
50
- },
51
  "children": [
52
  {
53
  "name": "Daur",
@@ -68,18 +57,7 @@
68
  "name": "Mongour",
69
  "iso_1_code": null,
70
  "iso_3_code": null,
71
- "tokenizers": {
72
- "Cyrl": {
73
- "full_object": "StanzaTokenizer(\"bxr\")",
74
- "original_lang_name": "russia_buriat",
75
- "original_lang_code": "bxr",
76
- "scripts": [
77
- "Cyrl"
78
- ],
79
- "class_name": "StanzaTokenizer",
80
- "macrolanguage": false
81
- }
82
- },
83
  "children": [
84
  {
85
  "name": "Kangjia",
@@ -364,18 +342,7 @@
364
  "name": "Western",
365
  "iso_1_code": null,
366
  "iso_3_code": null,
367
- "tokenizers": {
368
- "Cyrl": {
369
- "full_object": "StanzaTokenizer(\"bxr\")",
370
- "original_lang_name": "russia_buriat",
371
- "original_lang_code": "bxr",
372
- "scripts": [
373
- "Cyrl"
374
- ],
375
- "class_name": "StanzaTokenizer",
376
- "macrolanguage": false
377
- }
378
- },
379
  "children": [
380
  {
381
  "name": "Mogholi",
 
36
  "name": "Dagur",
37
  "iso_1_code": null,
38
  "iso_3_code": null,
39
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
40
  "children": [
41
  {
42
  "name": "Daur",
 
57
  "name": "Mongour",
58
  "iso_1_code": null,
59
  "iso_3_code": null,
60
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
61
  "children": [
62
  {
63
  "name": "Kangjia",
 
342
  "name": "Western",
343
  "iso_1_code": null,
344
  "iso_3_code": null,
345
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
346
  "children": [
347
  {
348
  "name": "Mogholi",
data/Niger-Congo.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/Sino-Tibetan.json CHANGED
@@ -1029,30 +1029,7 @@
1029
  "name": "Digarish",
1030
  "iso_1_code": null,
1031
  "iso_3_code": null,
1032
- "tokenizers": {
1033
- "Latn": {
1034
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1035
- "original_lang_name": "chinese",
1036
- "original_lang_code": "zho",
1037
- "scripts": [
1038
- "Latn",
1039
- "Hani"
1040
- ],
1041
- "class_name": "SpaCyTokenizer",
1042
- "macrolanguage": true
1043
- },
1044
- "Hani": {
1045
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1046
- "original_lang_name": "chinese",
1047
- "original_lang_code": "zho",
1048
- "scripts": [
1049
- "Latn",
1050
- "Hani"
1051
- ],
1052
- "class_name": "SpaCyTokenizer",
1053
- "macrolanguage": true
1054
- }
1055
- },
1056
  "children": [
1057
  {
1058
  "name": "Idu-Mishmi",
@@ -1083,30 +1060,7 @@
1083
  "name": "Hrusish",
1084
  "iso_1_code": null,
1085
  "iso_3_code": null,
1086
- "tokenizers": {
1087
- "Latn": {
1088
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1089
- "original_lang_name": "chinese",
1090
- "original_lang_code": "zho",
1091
- "scripts": [
1092
- "Latn",
1093
- "Hani"
1094
- ],
1095
- "class_name": "SpaCyTokenizer",
1096
- "macrolanguage": true
1097
- },
1098
- "Hani": {
1099
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1100
- "original_lang_name": "chinese",
1101
- "original_lang_code": "zho",
1102
- "scripts": [
1103
- "Latn",
1104
- "Hani"
1105
- ],
1106
- "class_name": "SpaCyTokenizer",
1107
- "macrolanguage": true
1108
- }
1109
- },
1110
  "children": [
1111
  {
1112
  "name": "Hruso",
@@ -1137,30 +1091,7 @@
1137
  "name": "Keman",
1138
  "iso_1_code": null,
1139
  "iso_3_code": null,
1140
- "tokenizers": {
1141
- "Latn": {
1142
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1143
- "original_lang_name": "chinese",
1144
- "original_lang_code": "zho",
1145
- "scripts": [
1146
- "Latn",
1147
- "Hani"
1148
- ],
1149
- "class_name": "SpaCyTokenizer",
1150
- "macrolanguage": true
1151
- },
1152
- "Hani": {
1153
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1154
- "original_lang_name": "chinese",
1155
- "original_lang_code": "zho",
1156
- "scripts": [
1157
- "Latn",
1158
- "Hani"
1159
- ],
1160
- "class_name": "SpaCyTokenizer",
1161
- "macrolanguage": true
1162
- }
1163
- },
1164
  "children": [
1165
  {
1166
  "name": "Zakhring",
@@ -1181,30 +1112,7 @@
1181
  "name": "Kho-Bwa",
1182
  "iso_1_code": null,
1183
  "iso_3_code": null,
1184
- "tokenizers": {
1185
- "Latn": {
1186
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1187
- "original_lang_name": "chinese",
1188
- "original_lang_code": "zho",
1189
- "scripts": [
1190
- "Latn",
1191
- "Hani"
1192
- ],
1193
- "class_name": "SpaCyTokenizer",
1194
- "macrolanguage": true
1195
- },
1196
- "Hani": {
1197
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1198
- "original_lang_name": "chinese",
1199
- "original_lang_code": "zho",
1200
- "scripts": [
1201
- "Latn",
1202
- "Hani"
1203
- ],
1204
- "class_name": "SpaCyTokenizer",
1205
- "macrolanguage": true
1206
- }
1207
- },
1208
  "children": [
1209
  {
1210
  "name": "Bugun",
@@ -1275,30 +1183,7 @@
1275
  "name": "Lepcha",
1276
  "iso_1_code": null,
1277
  "iso_3_code": null,
1278
- "tokenizers": {
1279
- "Latn": {
1280
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1281
- "original_lang_name": "chinese",
1282
- "original_lang_code": "zho",
1283
- "scripts": [
1284
- "Latn",
1285
- "Hani"
1286
- ],
1287
- "class_name": "SpaCyTokenizer",
1288
- "macrolanguage": true
1289
- },
1290
- "Hani": {
1291
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1292
- "original_lang_name": "chinese",
1293
- "original_lang_code": "zho",
1294
- "scripts": [
1295
- "Latn",
1296
- "Hani"
1297
- ],
1298
- "class_name": "SpaCyTokenizer",
1299
- "macrolanguage": true
1300
- }
1301
- },
1302
  "children": [
1303
  {
1304
  "name": "Lepcha",
@@ -1319,30 +1204,7 @@
1319
  "name": "Mijish",
1320
  "iso_1_code": null,
1321
  "iso_3_code": null,
1322
- "tokenizers": {
1323
- "Latn": {
1324
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1325
- "original_lang_name": "chinese",
1326
- "original_lang_code": "zho",
1327
- "scripts": [
1328
- "Latn",
1329
- "Hani"
1330
- ],
1331
- "class_name": "SpaCyTokenizer",
1332
- "macrolanguage": true
1333
- },
1334
- "Hani": {
1335
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1336
- "original_lang_name": "chinese",
1337
- "original_lang_code": "zho",
1338
- "scripts": [
1339
- "Latn",
1340
- "Hani"
1341
- ],
1342
- "class_name": "SpaCyTokenizer",
1343
- "macrolanguage": true
1344
- }
1345
- },
1346
  "children": [
1347
  {
1348
  "name": "Miju-Mishmi",
@@ -1851,30 +1713,7 @@
1851
  "name": "Northern",
1852
  "iso_1_code": null,
1853
  "iso_3_code": null,
1854
- "tokenizers": {
1855
- "Latn": {
1856
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1857
- "original_lang_name": "chinese",
1858
- "original_lang_code": "zho",
1859
- "scripts": [
1860
- "Latn",
1861
- "Hani"
1862
- ],
1863
- "class_name": "SpaCyTokenizer",
1864
- "macrolanguage": true
1865
- },
1866
- "Hani": {
1867
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1868
- "original_lang_name": "chinese",
1869
- "original_lang_code": "zho",
1870
- "scripts": [
1871
- "Latn",
1872
- "Hani"
1873
- ],
1874
- "class_name": "SpaCyTokenizer",
1875
- "macrolanguage": true
1876
- }
1877
- },
1878
  "children": [
1879
  {
1880
  "name": "Lahta",
@@ -1935,30 +1774,7 @@
1935
  "name": "Peripheral",
1936
  "iso_1_code": null,
1937
  "iso_3_code": null,
1938
- "tokenizers": {
1939
- "Latn": {
1940
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1941
- "original_lang_name": "chinese",
1942
- "original_lang_code": "zho",
1943
- "scripts": [
1944
- "Latn",
1945
- "Hani"
1946
- ],
1947
- "class_name": "SpaCyTokenizer",
1948
- "macrolanguage": true
1949
- },
1950
- "Hani": {
1951
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
1952
- "original_lang_name": "chinese",
1953
- "original_lang_code": "zho",
1954
- "scripts": [
1955
- "Latn",
1956
- "Hani"
1957
- ],
1958
- "class_name": "SpaCyTokenizer",
1959
- "macrolanguage": true
1960
- }
1961
- },
1962
  "children": [
1963
  {
1964
  "name": "Pa\u2019o",
@@ -2023,30 +1839,7 @@
2023
  "name": "Southern",
2024
  "iso_1_code": null,
2025
  "iso_3_code": null,
2026
- "tokenizers": {
2027
- "Latn": {
2028
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
2029
- "original_lang_name": "chinese",
2030
- "original_lang_code": "zho",
2031
- "scripts": [
2032
- "Latn",
2033
- "Hani"
2034
- ],
2035
- "class_name": "SpaCyTokenizer",
2036
- "macrolanguage": true
2037
- },
2038
- "Hani": {
2039
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
2040
- "original_lang_name": "chinese",
2041
- "original_lang_code": "zho",
2042
- "scripts": [
2043
- "Latn",
2044
- "Hani"
2045
- ],
2046
- "class_name": "SpaCyTokenizer",
2047
- "macrolanguage": true
2048
- }
2049
- },
2050
  "children": [
2051
  {
2052
  "name": "Karen, Mobwa",
@@ -2564,30 +2357,7 @@
2564
  "name": "Mara",
2565
  "iso_1_code": null,
2566
  "iso_3_code": null,
2567
- "tokenizers": {
2568
- "Latn": {
2569
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
2570
- "original_lang_name": "chinese",
2571
- "original_lang_code": "zho",
2572
- "scripts": [
2573
- "Latn",
2574
- "Hani"
2575
- ],
2576
- "class_name": "SpaCyTokenizer",
2577
- "macrolanguage": true
2578
- },
2579
- "Hani": {
2580
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
2581
- "original_lang_name": "chinese",
2582
- "original_lang_code": "zho",
2583
- "scripts": [
2584
- "Latn",
2585
- "Hani"
2586
- ],
2587
- "class_name": "SpaCyTokenizer",
2588
- "macrolanguage": true
2589
- }
2590
- },
2591
  "children": [
2592
  {
2593
  "name": "Chin, Lautu",
@@ -3406,30 +3176,7 @@
3406
  "name": "Asho",
3407
  "iso_1_code": null,
3408
  "iso_3_code": null,
3409
- "tokenizers": {
3410
- "Latn": {
3411
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3412
- "original_lang_name": "chinese",
3413
- "original_lang_code": "zho",
3414
- "scripts": [
3415
- "Latn",
3416
- "Hani"
3417
- ],
3418
- "class_name": "SpaCyTokenizer",
3419
- "macrolanguage": true
3420
- },
3421
- "Hani": {
3422
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3423
- "original_lang_name": "chinese",
3424
- "original_lang_code": "zho",
3425
- "scripts": [
3426
- "Latn",
3427
- "Hani"
3428
- ],
3429
- "class_name": "SpaCyTokenizer",
3430
- "macrolanguage": true
3431
- }
3432
- },
3433
  "children": [
3434
  {
3435
  "name": "Chin, L\u00e4okt\u00fc",
@@ -3810,30 +3557,7 @@
3810
  "name": "Southern",
3811
  "iso_1_code": null,
3812
  "iso_3_code": null,
3813
- "tokenizers": {
3814
- "Latn": {
3815
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3816
- "original_lang_name": "chinese",
3817
- "original_lang_code": "zho",
3818
- "scripts": [
3819
- "Latn",
3820
- "Hani"
3821
- ],
3822
- "class_name": "SpaCyTokenizer",
3823
- "macrolanguage": true
3824
- },
3825
- "Hani": {
3826
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3827
- "original_lang_name": "chinese",
3828
- "original_lang_code": "zho",
3829
- "scripts": [
3830
- "Latn",
3831
- "Hani"
3832
- ],
3833
- "class_name": "SpaCyTokenizer",
3834
- "macrolanguage": true
3835
- }
3836
- },
3837
  "children": [
3838
  {
3839
  "name": "Danu",
@@ -3921,30 +3645,7 @@
3921
  "name": "Mru",
3922
  "iso_1_code": null,
3923
  "iso_3_code": null,
3924
- "tokenizers": {
3925
- "Latn": {
3926
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3927
- "original_lang_name": "chinese",
3928
- "original_lang_code": "zho",
3929
- "scripts": [
3930
- "Latn",
3931
- "Hani"
3932
- ],
3933
- "class_name": "SpaCyTokenizer",
3934
- "macrolanguage": true
3935
- },
3936
- "Hani": {
3937
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
3938
- "original_lang_name": "chinese",
3939
- "original_lang_code": "zho",
3940
- "scripts": [
3941
- "Latn",
3942
- "Hani"
3943
- ],
3944
- "class_name": "SpaCyTokenizer",
3945
- "macrolanguage": true
3946
- }
3947
- },
3948
  "children": [
3949
  {
3950
  "name": "Chin, Anu-Khongso",
@@ -4318,30 +4019,7 @@
4318
  "name": "Northern",
4319
  "iso_1_code": null,
4320
  "iso_3_code": null,
4321
- "tokenizers": {
4322
- "Latn": {
4323
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
4324
- "original_lang_name": "chinese",
4325
- "original_lang_code": "zho",
4326
- "scripts": [
4327
- "Latn",
4328
- "Hani"
4329
- ],
4330
- "class_name": "SpaCyTokenizer",
4331
- "macrolanguage": true
4332
- },
4333
- "Hani": {
4334
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
4335
- "original_lang_name": "chinese",
4336
- "original_lang_code": "zho",
4337
- "scripts": [
4338
- "Latn",
4339
- "Hani"
4340
- ],
4341
- "class_name": "SpaCyTokenizer",
4342
- "macrolanguage": true
4343
- }
4344
- },
4345
  "children": [
4346
  {
4347
  "name": "Nuosu",
@@ -4492,30 +4170,7 @@
4492
  "name": "Southeastern",
4493
  "iso_1_code": null,
4494
  "iso_3_code": null,
4495
- "tokenizers": {
4496
- "Latn": {
4497
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
4498
- "original_lang_name": "chinese",
4499
- "original_lang_code": "zho",
4500
- "scripts": [
4501
- "Latn",
4502
- "Hani"
4503
- ],
4504
- "class_name": "SpaCyTokenizer",
4505
- "macrolanguage": true
4506
- },
4507
- "Hani": {
4508
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
4509
- "original_lang_name": "chinese",
4510
- "original_lang_code": "zho",
4511
- "scripts": [
4512
- "Latn",
4513
- "Hani"
4514
- ],
4515
- "class_name": "SpaCyTokenizer",
4516
- "macrolanguage": true
4517
- }
4518
- },
4519
  "children": [
4520
  {
4521
  "name": "Alugu",
@@ -5113,30 +4768,7 @@
5113
  "name": "Bisoid",
5114
  "iso_1_code": null,
5115
  "iso_3_code": null,
5116
- "tokenizers": {
5117
- "Latn": {
5118
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5119
- "original_lang_name": "chinese",
5120
- "original_lang_code": "zho",
5121
- "scripts": [
5122
- "Latn",
5123
- "Hani"
5124
- ],
5125
- "class_name": "SpaCyTokenizer",
5126
- "macrolanguage": true
5127
- },
5128
- "Hani": {
5129
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5130
- "original_lang_name": "chinese",
5131
- "original_lang_code": "zho",
5132
- "scripts": [
5133
- "Latn",
5134
- "Hani"
5135
- ],
5136
- "class_name": "SpaCyTokenizer",
5137
- "macrolanguage": true
5138
- }
5139
- },
5140
  "children": [
5141
  {
5142
  "name": "Bisu",
@@ -5194,30 +4826,7 @@
5194
  "name": "Unclassified",
5195
  "iso_1_code": null,
5196
  "iso_3_code": null,
5197
- "tokenizers": {
5198
- "Latn": {
5199
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5200
- "original_lang_name": "chinese",
5201
- "original_lang_code": "zho",
5202
- "scripts": [
5203
- "Latn",
5204
- "Hani"
5205
- ],
5206
- "class_name": "SpaCyTokenizer",
5207
- "macrolanguage": true
5208
- },
5209
- "Hani": {
5210
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5211
- "original_lang_name": "chinese",
5212
- "original_lang_code": "zho",
5213
- "scripts": [
5214
- "Latn",
5215
- "Hani"
5216
- ],
5217
- "class_name": "SpaCyTokenizer",
5218
- "macrolanguage": true
5219
- }
5220
- },
5221
  "children": [
5222
  {
5223
  "name": "Laopang",
@@ -5258,59 +4867,13 @@
5258
  "name": "Northeastern Tibeto-Burman",
5259
  "iso_1_code": null,
5260
  "iso_3_code": null,
5261
- "tokenizers": {
5262
- "Latn": {
5263
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5264
- "original_lang_name": "chinese",
5265
- "original_lang_code": "zho",
5266
- "scripts": [
5267
- "Latn",
5268
- "Hani"
5269
- ],
5270
- "class_name": "SpaCyTokenizer",
5271
- "macrolanguage": true
5272
- },
5273
- "Hani": {
5274
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5275
- "original_lang_name": "chinese",
5276
- "original_lang_code": "zho",
5277
- "scripts": [
5278
- "Latn",
5279
- "Hani"
5280
- ],
5281
- "class_name": "SpaCyTokenizer",
5282
- "macrolanguage": true
5283
- }
5284
- },
5285
  "children": [
5286
  {
5287
  "name": "Bai",
5288
  "iso_1_code": null,
5289
  "iso_3_code": null,
5290
- "tokenizers": {
5291
- "Latn": {
5292
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5293
- "original_lang_name": "chinese",
5294
- "original_lang_code": "zho",
5295
- "scripts": [
5296
- "Latn",
5297
- "Hani"
5298
- ],
5299
- "class_name": "SpaCyTokenizer",
5300
- "macrolanguage": true
5301
- },
5302
- "Hani": {
5303
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5304
- "original_lang_name": "chinese",
5305
- "original_lang_code": "zho",
5306
- "scripts": [
5307
- "Latn",
5308
- "Hani"
5309
- ],
5310
- "class_name": "SpaCyTokenizer",
5311
- "macrolanguage": true
5312
- }
5313
- },
5314
  "children": [
5315
  {
5316
  "name": "Bai, Central",
@@ -5361,30 +4924,7 @@
5361
  "name": "Baima",
5362
  "iso_1_code": null,
5363
  "iso_3_code": null,
5364
- "tokenizers": {
5365
- "Latn": {
5366
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5367
- "original_lang_name": "chinese",
5368
- "original_lang_code": "zho",
5369
- "scripts": [
5370
- "Latn",
5371
- "Hani"
5372
- ],
5373
- "class_name": "SpaCyTokenizer",
5374
- "macrolanguage": true
5375
- },
5376
- "Hani": {
5377
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5378
- "original_lang_name": "chinese",
5379
- "original_lang_code": "zho",
5380
- "scripts": [
5381
- "Latn",
5382
- "Hani"
5383
- ],
5384
- "class_name": "SpaCyTokenizer",
5385
- "macrolanguage": true
5386
- }
5387
- },
5388
  "children": [
5389
  {
5390
  "name": "Baima",
@@ -5405,30 +4945,7 @@
5405
  "name": "Ersuish",
5406
  "iso_1_code": null,
5407
  "iso_3_code": null,
5408
- "tokenizers": {
5409
- "Latn": {
5410
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5411
- "original_lang_name": "chinese",
5412
- "original_lang_code": "zho",
5413
- "scripts": [
5414
- "Latn",
5415
- "Hani"
5416
- ],
5417
- "class_name": "SpaCyTokenizer",
5418
- "macrolanguage": true
5419
- },
5420
- "Hani": {
5421
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5422
- "original_lang_name": "chinese",
5423
- "original_lang_code": "zho",
5424
- "scripts": [
5425
- "Latn",
5426
- "Hani"
5427
- ],
5428
- "class_name": "SpaCyTokenizer",
5429
- "macrolanguage": true
5430
- }
5431
- },
5432
  "children": [
5433
  {
5434
  "name": "Ersu",
@@ -5449,30 +4966,7 @@
5449
  "name": "Naic",
5450
  "iso_1_code": null,
5451
  "iso_3_code": null,
5452
- "tokenizers": {
5453
- "Latn": {
5454
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5455
- "original_lang_name": "chinese",
5456
- "original_lang_code": "zho",
5457
- "scripts": [
5458
- "Latn",
5459
- "Hani"
5460
- ],
5461
- "class_name": "SpaCyTokenizer",
5462
- "macrolanguage": true
5463
- },
5464
- "Hani": {
5465
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5466
- "original_lang_name": "chinese",
5467
- "original_lang_code": "zho",
5468
- "scripts": [
5469
- "Latn",
5470
- "Hani"
5471
- ],
5472
- "class_name": "SpaCyTokenizer",
5473
- "macrolanguage": true
5474
- }
5475
- },
5476
  "children": [
5477
  {
5478
  "name": "Namuyi",
@@ -5523,30 +5017,7 @@
5523
  "name": "Qiangic",
5524
  "iso_1_code": null,
5525
  "iso_3_code": null,
5526
- "tokenizers": {
5527
- "Latn": {
5528
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5529
- "original_lang_name": "chinese",
5530
- "original_lang_code": "zho",
5531
- "scripts": [
5532
- "Latn",
5533
- "Hani"
5534
- ],
5535
- "class_name": "SpaCyTokenizer",
5536
- "macrolanguage": true
5537
- },
5538
- "Hani": {
5539
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5540
- "original_lang_name": "chinese",
5541
- "original_lang_code": "zho",
5542
- "scripts": [
5543
- "Latn",
5544
- "Hani"
5545
- ],
5546
- "class_name": "SpaCyTokenizer",
5547
- "macrolanguage": true
5548
- }
5549
- },
5550
  "children": [
5551
  {
5552
  "name": "Qiang, Northern",
@@ -5647,30 +5118,7 @@
5647
  "name": "rGyalrongic",
5648
  "iso_1_code": null,
5649
  "iso_3_code": null,
5650
- "tokenizers": {
5651
- "Latn": {
5652
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5653
- "original_lang_name": "chinese",
5654
- "original_lang_code": "zho",
5655
- "scripts": [
5656
- "Latn",
5657
- "Hani"
5658
- ],
5659
- "class_name": "SpaCyTokenizer",
5660
- "macrolanguage": true
5661
- },
5662
- "Hani": {
5663
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5664
- "original_lang_name": "chinese",
5665
- "original_lang_code": "zho",
5666
- "scripts": [
5667
- "Latn",
5668
- "Hani"
5669
- ],
5670
- "class_name": "SpaCyTokenizer",
5671
- "macrolanguage": true
5672
- }
5673
- },
5674
  "children": [
5675
  {
5676
  "name": "Horpa",
@@ -5721,30 +5169,7 @@
5721
  "name": "Tujia",
5722
  "iso_1_code": null,
5723
  "iso_3_code": null,
5724
- "tokenizers": {
5725
- "Latn": {
5726
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5727
- "original_lang_name": "chinese",
5728
- "original_lang_code": "zho",
5729
- "scripts": [
5730
- "Latn",
5731
- "Hani"
5732
- ],
5733
- "class_name": "SpaCyTokenizer",
5734
- "macrolanguage": true
5735
- },
5736
- "Hani": {
5737
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
5738
- "original_lang_name": "chinese",
5739
- "original_lang_code": "zho",
5740
- "scripts": [
5741
- "Latn",
5742
- "Hani"
5743
- ],
5744
- "class_name": "SpaCyTokenizer",
5745
- "macrolanguage": true
5746
- }
5747
- },
5748
  "children": [
5749
  {
5750
  "name": "Tujia, Northern",
@@ -6143,30 +5568,7 @@
6143
  "name": "Koch",
6144
  "iso_1_code": null,
6145
  "iso_3_code": null,
6146
- "tokenizers": {
6147
- "Latn": {
6148
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6149
- "original_lang_name": "chinese",
6150
- "original_lang_code": "zho",
6151
- "scripts": [
6152
- "Latn",
6153
- "Hani"
6154
- ],
6155
- "class_name": "SpaCyTokenizer",
6156
- "macrolanguage": true
6157
- },
6158
- "Hani": {
6159
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6160
- "original_lang_name": "chinese",
6161
- "original_lang_code": "zho",
6162
- "scripts": [
6163
- "Latn",
6164
- "Hani"
6165
- ],
6166
- "class_name": "SpaCyTokenizer",
6167
- "macrolanguage": true
6168
- }
6169
- },
6170
  "children": [
6171
  {
6172
  "name": "Atong",
@@ -6484,30 +5886,7 @@
6484
  "name": "Dhimalish",
6485
  "iso_1_code": null,
6486
  "iso_3_code": null,
6487
- "tokenizers": {
6488
- "Latn": {
6489
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6490
- "original_lang_name": "chinese",
6491
- "original_lang_code": "zho",
6492
- "scripts": [
6493
- "Latn",
6494
- "Hani"
6495
- ],
6496
- "class_name": "SpaCyTokenizer",
6497
- "macrolanguage": true
6498
- },
6499
- "Hani": {
6500
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6501
- "original_lang_name": "chinese",
6502
- "original_lang_code": "zho",
6503
- "scripts": [
6504
- "Latn",
6505
- "Hani"
6506
- ],
6507
- "class_name": "SpaCyTokenizer",
6508
- "macrolanguage": true
6509
- }
6510
- },
6511
  "children": [
6512
  {
6513
  "name": "Dhimal",
@@ -6567,30 +5946,7 @@
6567
  "name": "Asakian",
6568
  "iso_1_code": null,
6569
  "iso_3_code": null,
6570
- "tokenizers": {
6571
- "Latn": {
6572
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6573
- "original_lang_name": "chinese",
6574
- "original_lang_code": "zho",
6575
- "scripts": [
6576
- "Latn",
6577
- "Hani"
6578
- ],
6579
- "class_name": "SpaCyTokenizer",
6580
- "macrolanguage": true
6581
- },
6582
- "Hani": {
6583
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6584
- "original_lang_name": "chinese",
6585
- "original_lang_code": "zho",
6586
- "scripts": [
6587
- "Latn",
6588
- "Hani"
6589
- ],
6590
- "class_name": "SpaCyTokenizer",
6591
- "macrolanguage": true
6592
- }
6593
- },
6594
  "children": [
6595
  {
6596
  "name": "Chak",
@@ -7191,59 +6547,13 @@
7191
  "name": "Western Tibeto-Burman",
7192
  "iso_1_code": null,
7193
  "iso_3_code": null,
7194
- "tokenizers": {
7195
- "Latn": {
7196
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7197
- "original_lang_name": "chinese",
7198
- "original_lang_code": "zho",
7199
- "scripts": [
7200
- "Latn",
7201
- "Hani"
7202
- ],
7203
- "class_name": "SpaCyTokenizer",
7204
- "macrolanguage": true
7205
- },
7206
- "Hani": {
7207
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7208
- "original_lang_name": "chinese",
7209
- "original_lang_code": "zho",
7210
- "scripts": [
7211
- "Latn",
7212
- "Hani"
7213
- ],
7214
- "class_name": "SpaCyTokenizer",
7215
- "macrolanguage": true
7216
- }
7217
- },
7218
  "children": [
7219
  {
7220
  "name": "Bodish",
7221
  "iso_1_code": null,
7222
  "iso_3_code": null,
7223
- "tokenizers": {
7224
- "Latn": {
7225
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7226
- "original_lang_name": "chinese",
7227
- "original_lang_code": "zho",
7228
- "scripts": [
7229
- "Latn",
7230
- "Hani"
7231
- ],
7232
- "class_name": "SpaCyTokenizer",
7233
- "macrolanguage": true
7234
- },
7235
- "Hani": {
7236
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7237
- "original_lang_name": "chinese",
7238
- "original_lang_code": "zho",
7239
- "scripts": [
7240
- "Latn",
7241
- "Hani"
7242
- ],
7243
- "class_name": "SpaCyTokenizer",
7244
- "macrolanguage": true
7245
- }
7246
- },
7247
  "children": [
7248
  {
7249
  "name": "Gongduk",
@@ -7299,30 +6609,7 @@
7299
  "name": "Central Bodish",
7300
  "iso_1_code": null,
7301
  "iso_3_code": null,
7302
- "tokenizers": {
7303
- "Latn": {
7304
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7305
- "original_lang_name": "chinese",
7306
- "original_lang_code": "zho",
7307
- "scripts": [
7308
- "Latn",
7309
- "Hani"
7310
- ],
7311
- "class_name": "SpaCyTokenizer",
7312
- "macrolanguage": true
7313
- },
7314
- "Hani": {
7315
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7316
- "original_lang_name": "chinese",
7317
- "original_lang_code": "zho",
7318
- "scripts": [
7319
- "Latn",
7320
- "Hani"
7321
- ],
7322
- "class_name": "SpaCyTokenizer",
7323
- "macrolanguage": true
7324
- }
7325
- },
7326
  "children": [
7327
  {
7328
  "name": "Choni",
@@ -7348,30 +6635,7 @@
7348
  "name": "Amdo",
7349
  "iso_1_code": null,
7350
  "iso_3_code": null,
7351
- "tokenizers": {
7352
- "Latn": {
7353
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7354
- "original_lang_name": "chinese",
7355
- "original_lang_code": "zho",
7356
- "scripts": [
7357
- "Latn",
7358
- "Hani"
7359
- ],
7360
- "class_name": "SpaCyTokenizer",
7361
- "macrolanguage": true
7362
- },
7363
- "Hani": {
7364
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7365
- "original_lang_name": "chinese",
7366
- "original_lang_code": "zho",
7367
- "scripts": [
7368
- "Latn",
7369
- "Hani"
7370
- ],
7371
- "class_name": "SpaCyTokenizer",
7372
- "macrolanguage": true
7373
- }
7374
- },
7375
  "children": [
7376
  {
7377
  "name": "Tibetan, Amdo",
@@ -7392,30 +6656,7 @@
7392
  "name": "Central",
7393
  "iso_1_code": null,
7394
  "iso_3_code": null,
7395
- "tokenizers": {
7396
- "Latn": {
7397
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7398
- "original_lang_name": "chinese",
7399
- "original_lang_code": "zho",
7400
- "scripts": [
7401
- "Latn",
7402
- "Hani"
7403
- ],
7404
- "class_name": "SpaCyTokenizer",
7405
- "macrolanguage": true
7406
- },
7407
- "Hani": {
7408
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7409
- "original_lang_name": "chinese",
7410
- "original_lang_code": "zho",
7411
- "scripts": [
7412
- "Latn",
7413
- "Hani"
7414
- ],
7415
- "class_name": "SpaCyTokenizer",
7416
- "macrolanguage": true
7417
- }
7418
- },
7419
  "children": [
7420
  {
7421
  "name": "Tibetan, Central",
@@ -7433,30 +6674,7 @@
7433
  "name": "gTsang",
7434
  "iso_1_code": null,
7435
  "iso_3_code": null,
7436
- "tokenizers": {
7437
- "Latn": {
7438
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7439
- "original_lang_name": "chinese",
7440
- "original_lang_code": "zho",
7441
- "scripts": [
7442
- "Latn",
7443
- "Hani"
7444
- ],
7445
- "class_name": "SpaCyTokenizer",
7446
- "macrolanguage": true
7447
- },
7448
- "Hani": {
7449
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7450
- "original_lang_name": "chinese",
7451
- "original_lang_code": "zho",
7452
- "scripts": [
7453
- "Latn",
7454
- "Hani"
7455
- ],
7456
- "class_name": "SpaCyTokenizer",
7457
- "macrolanguage": true
7458
- }
7459
- },
7460
  "children": [
7461
  {
7462
  "name": "Dolpo",
@@ -7609,30 +6827,7 @@
7609
  "name": "Southern",
7610
  "iso_1_code": null,
7611
  "iso_3_code": null,
7612
- "tokenizers": {
7613
- "Latn": {
7614
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7615
- "original_lang_name": "chinese",
7616
- "original_lang_code": "zho",
7617
- "scripts": [
7618
- "Latn",
7619
- "Hani"
7620
- ],
7621
- "class_name": "SpaCyTokenizer",
7622
- "macrolanguage": true
7623
- },
7624
- "Hani": {
7625
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7626
- "original_lang_name": "chinese",
7627
- "original_lang_code": "zho",
7628
- "scripts": [
7629
- "Latn",
7630
- "Hani"
7631
- ],
7632
- "class_name": "SpaCyTokenizer",
7633
- "macrolanguage": true
7634
- }
7635
- },
7636
  "children": [
7637
  {
7638
  "name": "Brokkat",
@@ -7735,30 +6930,7 @@
7735
  "name": "Western",
7736
  "iso_1_code": null,
7737
  "iso_3_code": null,
7738
- "tokenizers": {
7739
- "Latn": {
7740
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7741
- "original_lang_name": "chinese",
7742
- "original_lang_code": "zho",
7743
- "scripts": [
7744
- "Latn",
7745
- "Hani"
7746
- ],
7747
- "class_name": "SpaCyTokenizer",
7748
- "macrolanguage": true
7749
- },
7750
- "Hani": {
7751
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7752
- "original_lang_name": "chinese",
7753
- "original_lang_code": "zho",
7754
- "scripts": [
7755
- "Latn",
7756
- "Hani"
7757
- ],
7758
- "class_name": "SpaCyTokenizer",
7759
- "macrolanguage": true
7760
- }
7761
- },
7762
  "children": [
7763
  {
7764
  "name": "Jad",
@@ -7804,30 +6976,7 @@
7804
  "name": "Khams",
7805
  "iso_1_code": null,
7806
  "iso_3_code": null,
7807
- "tokenizers": {
7808
- "Latn": {
7809
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7810
- "original_lang_name": "chinese",
7811
- "original_lang_code": "zho",
7812
- "scripts": [
7813
- "Latn",
7814
- "Hani"
7815
- ],
7816
- "class_name": "SpaCyTokenizer",
7817
- "macrolanguage": true
7818
- },
7819
- "Hani": {
7820
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7821
- "original_lang_name": "chinese",
7822
- "original_lang_code": "zho",
7823
- "scripts": [
7824
- "Latn",
7825
- "Hani"
7826
- ],
7827
- "class_name": "SpaCyTokenizer",
7828
- "macrolanguage": true
7829
- }
7830
- },
7831
  "children": [
7832
  {
7833
  "name": "Tibetan, Khams",
@@ -7848,30 +6997,7 @@
7848
  "name": "Unclassified",
7849
  "iso_1_code": null,
7850
  "iso_3_code": null,
7851
- "tokenizers": {
7852
- "Latn": {
7853
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7854
- "original_lang_name": "chinese",
7855
- "original_lang_code": "zho",
7856
- "scripts": [
7857
- "Latn",
7858
- "Hani"
7859
- ],
7860
- "class_name": "SpaCyTokenizer",
7861
- "macrolanguage": true
7862
- },
7863
- "Hani": {
7864
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7865
- "original_lang_name": "chinese",
7866
- "original_lang_code": "zho",
7867
- "scripts": [
7868
- "Latn",
7869
- "Hani"
7870
- ],
7871
- "class_name": "SpaCyTokenizer",
7872
- "macrolanguage": true
7873
- }
7874
- },
7875
  "children": [
7876
  {
7877
  "name": "Naaba",
@@ -7902,30 +7028,7 @@
7902
  "name": "Western",
7903
  "iso_1_code": null,
7904
  "iso_3_code": null,
7905
- "tokenizers": {
7906
- "Latn": {
7907
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7908
- "original_lang_name": "chinese",
7909
- "original_lang_code": "zho",
7910
- "scripts": [
7911
- "Latn",
7912
- "Hani"
7913
- ],
7914
- "class_name": "SpaCyTokenizer",
7915
- "macrolanguage": true
7916
- },
7917
- "Hani": {
7918
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7919
- "original_lang_name": "chinese",
7920
- "original_lang_code": "zho",
7921
- "scripts": [
7922
- "Latn",
7923
- "Hani"
7924
- ],
7925
- "class_name": "SpaCyTokenizer",
7926
- "macrolanguage": true
7927
- }
7928
- },
7929
  "children": [
7930
  {
7931
  "name": "Balti",
@@ -7993,30 +7096,7 @@
7993
  "name": "East Bodish",
7994
  "iso_1_code": null,
7995
  "iso_3_code": null,
7996
- "tokenizers": {
7997
- "Latn": {
7998
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
7999
- "original_lang_name": "chinese",
8000
- "original_lang_code": "zho",
8001
- "scripts": [
8002
- "Latn",
8003
- "Hani"
8004
- ],
8005
- "class_name": "SpaCyTokenizer",
8006
- "macrolanguage": true
8007
- },
8008
- "Hani": {
8009
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8010
- "original_lang_name": "chinese",
8011
- "original_lang_code": "zho",
8012
- "scripts": [
8013
- "Latn",
8014
- "Hani"
8015
- ],
8016
- "class_name": "SpaCyTokenizer",
8017
- "macrolanguage": true
8018
- }
8019
- },
8020
  "children": [
8021
  {
8022
  "name": "Dakpakha",
@@ -8042,40 +7122,17 @@
8042
  "name": "Bumthang",
8043
  "iso_1_code": null,
8044
  "iso_3_code": null,
8045
- "tokenizers": {
8046
- "Latn": {
8047
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8048
- "original_lang_name": "chinese",
8049
- "original_lang_code": "zho",
8050
- "scripts": [
8051
- "Latn",
8052
- "Hani"
8053
- ],
8054
- "class_name": "SpaCyTokenizer",
8055
- "macrolanguage": true
8056
- },
8057
- "Hani": {
8058
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8059
- "original_lang_name": "chinese",
8060
- "original_lang_code": "zho",
8061
- "scripts": [
8062
- "Latn",
8063
- "Hani"
8064
- ],
8065
- "class_name": "SpaCyTokenizer",
8066
- "macrolanguage": true
8067
- }
8068
- },
8069
- "children": [
8070
- {
8071
- "name": "Dzalakha",
8072
- "iso_1_code": null,
8073
- "iso_3_code": "dzl",
8074
- "tokenizers": {},
8075
- "children": [],
8076
- "node_i": "9374",
8077
- "scripts": [],
8078
- "own_tokenizer": false
8079
  },
8080
  {
8081
  "name": "Bumthangkha",
@@ -8151,30 +7208,7 @@
8151
  "name": "West Bodish",
8152
  "iso_1_code": null,
8153
  "iso_3_code": null,
8154
- "tokenizers": {
8155
- "Latn": {
8156
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8157
- "original_lang_name": "chinese",
8158
- "original_lang_code": "zho",
8159
- "scripts": [
8160
- "Latn",
8161
- "Hani"
8162
- ],
8163
- "class_name": "SpaCyTokenizer",
8164
- "macrolanguage": true
8165
- },
8166
- "Hani": {
8167
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8168
- "original_lang_name": "chinese",
8169
- "original_lang_code": "zho",
8170
- "scripts": [
8171
- "Latn",
8172
- "Hani"
8173
- ],
8174
- "class_name": "SpaCyTokenizer",
8175
- "macrolanguage": true
8176
- }
8177
- },
8178
  "children": [
8179
  {
8180
  "name": "Dura",
@@ -8200,30 +7234,7 @@
8200
  "name": "Ghale",
8201
  "iso_1_code": null,
8202
  "iso_3_code": null,
8203
- "tokenizers": {
8204
- "Latn": {
8205
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8206
- "original_lang_name": "chinese",
8207
- "original_lang_code": "zho",
8208
- "scripts": [
8209
- "Latn",
8210
- "Hani"
8211
- ],
8212
- "class_name": "SpaCyTokenizer",
8213
- "macrolanguage": true
8214
- },
8215
- "Hani": {
8216
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8217
- "original_lang_name": "chinese",
8218
- "original_lang_code": "zho",
8219
- "scripts": [
8220
- "Latn",
8221
- "Hani"
8222
- ],
8223
- "class_name": "SpaCyTokenizer",
8224
- "macrolanguage": true
8225
- }
8226
- },
8227
  "children": [
8228
  {
8229
  "name": "Ghale, Southern",
@@ -8266,59 +7277,13 @@
8266
  "name": "Gurung-Tamang",
8267
  "iso_1_code": null,
8268
  "iso_3_code": null,
8269
- "tokenizers": {
8270
- "Latn": {
8271
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8272
- "original_lang_name": "chinese",
8273
- "original_lang_code": "zho",
8274
- "scripts": [
8275
- "Latn",
8276
- "Hani"
8277
- ],
8278
- "class_name": "SpaCyTokenizer",
8279
- "macrolanguage": true
8280
- },
8281
- "Hani": {
8282
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8283
- "original_lang_name": "chinese",
8284
- "original_lang_code": "zho",
8285
- "scripts": [
8286
- "Latn",
8287
- "Hani"
8288
- ],
8289
- "class_name": "SpaCyTokenizer",
8290
- "macrolanguage": true
8291
- }
8292
- },
8293
  "children": [
8294
  {
8295
  "name": "Gurungic",
8296
  "iso_1_code": null,
8297
  "iso_3_code": null,
8298
- "tokenizers": {
8299
- "Latn": {
8300
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8301
- "original_lang_name": "chinese",
8302
- "original_lang_code": "zho",
8303
- "scripts": [
8304
- "Latn",
8305
- "Hani"
8306
- ],
8307
- "class_name": "SpaCyTokenizer",
8308
- "macrolanguage": true
8309
- },
8310
- "Hani": {
8311
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8312
- "original_lang_name": "chinese",
8313
- "original_lang_code": "zho",
8314
- "scripts": [
8315
- "Latn",
8316
- "Hani"
8317
- ],
8318
- "class_name": "SpaCyTokenizer",
8319
- "macrolanguage": true
8320
- }
8321
- },
8322
  "children": [
8323
  {
8324
  "name": "Chantyal",
@@ -8389,30 +7354,7 @@
8389
  "name": "Tamang",
8390
  "iso_1_code": null,
8391
  "iso_3_code": null,
8392
- "tokenizers": {
8393
- "Latn": {
8394
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8395
- "original_lang_name": "chinese",
8396
- "original_lang_code": "zho",
8397
- "scripts": [
8398
- "Latn",
8399
- "Hani"
8400
- ],
8401
- "class_name": "SpaCyTokenizer",
8402
- "macrolanguage": true
8403
- },
8404
- "Hani": {
8405
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8406
- "original_lang_name": "chinese",
8407
- "original_lang_code": "zho",
8408
- "scripts": [
8409
- "Latn",
8410
- "Hani"
8411
- ],
8412
- "class_name": "SpaCyTokenizer",
8413
- "macrolanguage": true
8414
- }
8415
- },
8416
  "children": [
8417
  {
8418
  "name": "Tamang, Eastern",
@@ -8465,59 +7407,13 @@
8465
  "name": "West Himalayish",
8466
  "iso_1_code": null,
8467
  "iso_3_code": null,
8468
- "tokenizers": {
8469
- "Latn": {
8470
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8471
- "original_lang_name": "chinese",
8472
- "original_lang_code": "zho",
8473
- "scripts": [
8474
- "Latn",
8475
- "Hani"
8476
- ],
8477
- "class_name": "SpaCyTokenizer",
8478
- "macrolanguage": true
8479
- },
8480
- "Hani": {
8481
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8482
- "original_lang_name": "chinese",
8483
- "original_lang_code": "zho",
8484
- "scripts": [
8485
- "Latn",
8486
- "Hani"
8487
- ],
8488
- "class_name": "SpaCyTokenizer",
8489
- "macrolanguage": true
8490
- }
8491
- },
8492
  "children": [
8493
  {
8494
  "name": "Almora",
8495
  "iso_1_code": null,
8496
  "iso_3_code": null,
8497
- "tokenizers": {
8498
- "Latn": {
8499
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8500
- "original_lang_name": "chinese",
8501
- "original_lang_code": "zho",
8502
- "scripts": [
8503
- "Latn",
8504
- "Hani"
8505
- ],
8506
- "class_name": "SpaCyTokenizer",
8507
- "macrolanguage": true
8508
- },
8509
- "Hani": {
8510
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8511
- "original_lang_name": "chinese",
8512
- "original_lang_code": "zho",
8513
- "scripts": [
8514
- "Latn",
8515
- "Hani"
8516
- ],
8517
- "class_name": "SpaCyTokenizer",
8518
- "macrolanguage": true
8519
- }
8520
- },
8521
  "children": [
8522
  {
8523
  "name": "Byangsi",
@@ -8568,30 +7464,7 @@
8568
  "name": "Kinauri",
8569
  "iso_1_code": null,
8570
  "iso_3_code": null,
8571
- "tokenizers": {
8572
- "Latn": {
8573
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8574
- "original_lang_name": "chinese",
8575
- "original_lang_code": "zho",
8576
- "scripts": [
8577
- "Latn",
8578
- "Hani"
8579
- ],
8580
- "class_name": "SpaCyTokenizer",
8581
- "macrolanguage": true
8582
- },
8583
- "Hani": {
8584
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8585
- "original_lang_name": "chinese",
8586
- "original_lang_code": "zho",
8587
- "scripts": [
8588
- "Latn",
8589
- "Hani"
8590
- ],
8591
- "class_name": "SpaCyTokenizer",
8592
- "macrolanguage": true
8593
- }
8594
- },
8595
  "children": [
8596
  {
8597
  "name": "Gahri",
@@ -8732,88 +7605,19 @@
8732
  "name": "Himalayan",
8733
  "iso_1_code": null,
8734
  "iso_3_code": null,
8735
- "tokenizers": {
8736
- "Latn": {
8737
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8738
- "original_lang_name": "chinese",
8739
- "original_lang_code": "zho",
8740
- "scripts": [
8741
- "Latn",
8742
- "Hani"
8743
- ],
8744
- "class_name": "SpaCyTokenizer",
8745
- "macrolanguage": true
8746
- },
8747
- "Hani": {
8748
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8749
- "original_lang_name": "chinese",
8750
- "original_lang_code": "zho",
8751
- "scripts": [
8752
- "Latn",
8753
- "Hani"
8754
- ],
8755
- "class_name": "SpaCyTokenizer",
8756
- "macrolanguage": true
8757
- }
8758
- },
8759
- "children": [
8760
- {
8761
- "name": "Central Himalayan",
8762
- "iso_1_code": null,
8763
- "iso_3_code": null,
8764
- "tokenizers": {
8765
- "Latn": {
8766
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8767
- "original_lang_name": "chinese",
8768
- "original_lang_code": "zho",
8769
- "scripts": [
8770
- "Latn",
8771
- "Hani"
8772
- ],
8773
- "class_name": "SpaCyTokenizer",
8774
- "macrolanguage": true
8775
- },
8776
- "Hani": {
8777
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8778
- "original_lang_name": "chinese",
8779
- "original_lang_code": "zho",
8780
- "scripts": [
8781
- "Latn",
8782
- "Hani"
8783
- ],
8784
- "class_name": "SpaCyTokenizer",
8785
- "macrolanguage": true
8786
- }
8787
- },
8788
  "children": [
8789
  {
8790
  "name": "Chepang-Bhujel",
8791
  "iso_1_code": null,
8792
  "iso_3_code": null,
8793
- "tokenizers": {
8794
- "Latn": {
8795
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8796
- "original_lang_name": "chinese",
8797
- "original_lang_code": "zho",
8798
- "scripts": [
8799
- "Latn",
8800
- "Hani"
8801
- ],
8802
- "class_name": "SpaCyTokenizer",
8803
- "macrolanguage": true
8804
- },
8805
- "Hani": {
8806
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8807
- "original_lang_name": "chinese",
8808
- "original_lang_code": "zho",
8809
- "scripts": [
8810
- "Latn",
8811
- "Hani"
8812
- ],
8813
- "class_name": "SpaCyTokenizer",
8814
- "macrolanguage": true
8815
- }
8816
- },
8817
  "children": [
8818
  {
8819
  "name": "Bhujel",
@@ -8844,59 +7648,13 @@
8844
  "name": "Kham-Magar",
8845
  "iso_1_code": null,
8846
  "iso_3_code": null,
8847
- "tokenizers": {
8848
- "Latn": {
8849
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8850
- "original_lang_name": "chinese",
8851
- "original_lang_code": "zho",
8852
- "scripts": [
8853
- "Latn",
8854
- "Hani"
8855
- ],
8856
- "class_name": "SpaCyTokenizer",
8857
- "macrolanguage": true
8858
- },
8859
- "Hani": {
8860
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8861
- "original_lang_name": "chinese",
8862
- "original_lang_code": "zho",
8863
- "scripts": [
8864
- "Latn",
8865
- "Hani"
8866
- ],
8867
- "class_name": "SpaCyTokenizer",
8868
- "macrolanguage": true
8869
- }
8870
- },
8871
  "children": [
8872
  {
8873
  "name": "Kham",
8874
  "iso_1_code": null,
8875
  "iso_3_code": null,
8876
- "tokenizers": {
8877
- "Latn": {
8878
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8879
- "original_lang_name": "chinese",
8880
- "original_lang_code": "zho",
8881
- "scripts": [
8882
- "Latn",
8883
- "Hani"
8884
- ],
8885
- "class_name": "SpaCyTokenizer",
8886
- "macrolanguage": true
8887
- },
8888
- "Hani": {
8889
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8890
- "original_lang_name": "chinese",
8891
- "original_lang_code": "zho",
8892
- "scripts": [
8893
- "Latn",
8894
- "Hani"
8895
- ],
8896
- "class_name": "SpaCyTokenizer",
8897
- "macrolanguage": true
8898
- }
8899
- },
8900
  "children": [
8901
  {
8902
  "name": "Kham, Gamal",
@@ -8947,30 +7705,7 @@
8947
  "name": "Magar",
8948
  "iso_1_code": null,
8949
  "iso_3_code": null,
8950
- "tokenizers": {
8951
- "Latn": {
8952
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8953
- "original_lang_name": "chinese",
8954
- "original_lang_code": "zho",
8955
- "scripts": [
8956
- "Latn",
8957
- "Hani"
8958
- ],
8959
- "class_name": "SpaCyTokenizer",
8960
- "macrolanguage": true
8961
- },
8962
- "Hani": {
8963
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
8964
- "original_lang_name": "chinese",
8965
- "original_lang_code": "zho",
8966
- "scripts": [
8967
- "Latn",
8968
- "Hani"
8969
- ],
8970
- "class_name": "SpaCyTokenizer",
8971
- "macrolanguage": true
8972
- }
8973
- },
8974
  "children": [
8975
  {
8976
  "name": "Magar, Eastern",
@@ -9006,30 +7741,7 @@
9006
  "name": "Newar",
9007
  "iso_1_code": null,
9008
  "iso_3_code": null,
9009
- "tokenizers": {
9010
- "Latn": {
9011
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9012
- "original_lang_name": "chinese",
9013
- "original_lang_code": "zho",
9014
- "scripts": [
9015
- "Latn",
9016
- "Hani"
9017
- ],
9018
- "class_name": "SpaCyTokenizer",
9019
- "macrolanguage": true
9020
- },
9021
- "Hani": {
9022
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9023
- "original_lang_name": "chinese",
9024
- "original_lang_code": "zho",
9025
- "scripts": [
9026
- "Latn",
9027
- "Hani"
9028
- ],
9029
- "class_name": "SpaCyTokenizer",
9030
- "macrolanguage": true
9031
- }
9032
- },
9033
  "children": [
9034
  {
9035
  "name": "Newar",
@@ -9062,30 +7774,7 @@
9062
  "name": "Raute-Raji",
9063
  "iso_1_code": null,
9064
  "iso_3_code": null,
9065
- "tokenizers": {
9066
- "Latn": {
9067
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9068
- "original_lang_name": "chinese",
9069
- "original_lang_code": "zho",
9070
- "scripts": [
9071
- "Latn",
9072
- "Hani"
9073
- ],
9074
- "class_name": "SpaCyTokenizer",
9075
- "macrolanguage": true
9076
- },
9077
- "Hani": {
9078
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9079
- "original_lang_name": "chinese",
9080
- "original_lang_code": "zho",
9081
- "scripts": [
9082
- "Latn",
9083
- "Hani"
9084
- ],
9085
- "class_name": "SpaCyTokenizer",
9086
- "macrolanguage": true
9087
- }
9088
- },
9089
  "children": [
9090
  {
9091
  "name": "Rawat",
@@ -9126,30 +7815,7 @@
9126
  "name": "Thangmi-Baraamu",
9127
  "iso_1_code": null,
9128
  "iso_3_code": null,
9129
- "tokenizers": {
9130
- "Latn": {
9131
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9132
- "original_lang_name": "chinese",
9133
- "original_lang_code": "zho",
9134
- "scripts": [
9135
- "Latn",
9136
- "Hani"
9137
- ],
9138
- "class_name": "SpaCyTokenizer",
9139
- "macrolanguage": true
9140
- },
9141
- "Hani": {
9142
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9143
- "original_lang_name": "chinese",
9144
- "original_lang_code": "zho",
9145
- "scripts": [
9146
- "Latn",
9147
- "Hani"
9148
- ],
9149
- "class_name": "SpaCyTokenizer",
9150
- "macrolanguage": true
9151
- }
9152
- },
9153
  "children": [
9154
  {
9155
  "name": "Baram",
@@ -9185,59 +7851,13 @@
9185
  "name": "Kiranti",
9186
  "iso_1_code": null,
9187
  "iso_3_code": null,
9188
- "tokenizers": {
9189
- "Latn": {
9190
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9191
- "original_lang_name": "chinese",
9192
- "original_lang_code": "zho",
9193
- "scripts": [
9194
- "Latn",
9195
- "Hani"
9196
- ],
9197
- "class_name": "SpaCyTokenizer",
9198
- "macrolanguage": true
9199
- },
9200
- "Hani": {
9201
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9202
- "original_lang_name": "chinese",
9203
- "original_lang_code": "zho",
9204
- "scripts": [
9205
- "Latn",
9206
- "Hani"
9207
- ],
9208
- "class_name": "SpaCyTokenizer",
9209
- "macrolanguage": true
9210
- }
9211
- },
9212
  "children": [
9213
  {
9214
  "name": "Eastern",
9215
  "iso_1_code": null,
9216
  "iso_3_code": null,
9217
- "tokenizers": {
9218
- "Latn": {
9219
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9220
- "original_lang_name": "chinese",
9221
- "original_lang_code": "zho",
9222
- "scripts": [
9223
- "Latn",
9224
- "Hani"
9225
- ],
9226
- "class_name": "SpaCyTokenizer",
9227
- "macrolanguage": true
9228
- },
9229
- "Hani": {
9230
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9231
- "original_lang_name": "chinese",
9232
- "original_lang_code": "zho",
9233
- "scripts": [
9234
- "Latn",
9235
- "Hani"
9236
- ],
9237
- "class_name": "SpaCyTokenizer",
9238
- "macrolanguage": true
9239
- }
9240
- },
9241
  "children": [
9242
  {
9243
  "name": "Athpariya",
@@ -9485,30 +8105,7 @@
9485
  "name": "Western",
9486
  "iso_1_code": null,
9487
  "iso_3_code": null,
9488
- "tokenizers": {
9489
- "Latn": {
9490
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9491
- "original_lang_name": "chinese",
9492
- "original_lang_code": "zho",
9493
- "scripts": [
9494
- "Latn",
9495
- "Hani"
9496
- ],
9497
- "class_name": "SpaCyTokenizer",
9498
- "macrolanguage": true
9499
- },
9500
- "Hani": {
9501
- "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
9502
- "original_lang_name": "chinese",
9503
- "original_lang_code": "zho",
9504
- "scripts": [
9505
- "Latn",
9506
- "Hani"
9507
- ],
9508
- "class_name": "SpaCyTokenizer",
9509
- "macrolanguage": true
9510
- }
9511
- },
9512
  "children": [
9513
  {
9514
  "name": "Bahing",
 
1029
  "name": "Digarish",
1030
  "iso_1_code": null,
1031
  "iso_3_code": null,
1032
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1033
  "children": [
1034
  {
1035
  "name": "Idu-Mishmi",
 
1060
  "name": "Hrusish",
1061
  "iso_1_code": null,
1062
  "iso_3_code": null,
1063
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1064
  "children": [
1065
  {
1066
  "name": "Hruso",
 
1091
  "name": "Keman",
1092
  "iso_1_code": null,
1093
  "iso_3_code": null,
1094
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1095
  "children": [
1096
  {
1097
  "name": "Zakhring",
 
1112
  "name": "Kho-Bwa",
1113
  "iso_1_code": null,
1114
  "iso_3_code": null,
1115
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1116
  "children": [
1117
  {
1118
  "name": "Bugun",
 
1183
  "name": "Lepcha",
1184
  "iso_1_code": null,
1185
  "iso_3_code": null,
1186
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1187
  "children": [
1188
  {
1189
  "name": "Lepcha",
 
1204
  "name": "Mijish",
1205
  "iso_1_code": null,
1206
  "iso_3_code": null,
1207
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1208
  "children": [
1209
  {
1210
  "name": "Miju-Mishmi",
 
1713
  "name": "Northern",
1714
  "iso_1_code": null,
1715
  "iso_3_code": null,
1716
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1717
  "children": [
1718
  {
1719
  "name": "Lahta",
 
1774
  "name": "Peripheral",
1775
  "iso_1_code": null,
1776
  "iso_3_code": null,
1777
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1778
  "children": [
1779
  {
1780
  "name": "Pa\u2019o",
 
1839
  "name": "Southern",
1840
  "iso_1_code": null,
1841
  "iso_3_code": null,
1842
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1843
  "children": [
1844
  {
1845
  "name": "Karen, Mobwa",
 
2357
  "name": "Mara",
2358
  "iso_1_code": null,
2359
  "iso_3_code": null,
2360
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2361
  "children": [
2362
  {
2363
  "name": "Chin, Lautu",
 
3176
  "name": "Asho",
3177
  "iso_1_code": null,
3178
  "iso_3_code": null,
3179
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3180
  "children": [
3181
  {
3182
  "name": "Chin, L\u00e4okt\u00fc",
 
3557
  "name": "Southern",
3558
  "iso_1_code": null,
3559
  "iso_3_code": null,
3560
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3561
  "children": [
3562
  {
3563
  "name": "Danu",
 
3645
  "name": "Mru",
3646
  "iso_1_code": null,
3647
  "iso_3_code": null,
3648
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3649
  "children": [
3650
  {
3651
  "name": "Chin, Anu-Khongso",
 
4019
  "name": "Northern",
4020
  "iso_1_code": null,
4021
  "iso_3_code": null,
4022
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4023
  "children": [
4024
  {
4025
  "name": "Nuosu",
 
4170
  "name": "Southeastern",
4171
  "iso_1_code": null,
4172
  "iso_3_code": null,
4173
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4174
  "children": [
4175
  {
4176
  "name": "Alugu",
 
4768
  "name": "Bisoid",
4769
  "iso_1_code": null,
4770
  "iso_3_code": null,
4771
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4772
  "children": [
4773
  {
4774
  "name": "Bisu",
 
4826
  "name": "Unclassified",
4827
  "iso_1_code": null,
4828
  "iso_3_code": null,
4829
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4830
  "children": [
4831
  {
4832
  "name": "Laopang",
 
4867
  "name": "Northeastern Tibeto-Burman",
4868
  "iso_1_code": null,
4869
  "iso_3_code": null,
4870
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4871
  "children": [
4872
  {
4873
  "name": "Bai",
4874
  "iso_1_code": null,
4875
  "iso_3_code": null,
4876
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4877
  "children": [
4878
  {
4879
  "name": "Bai, Central",
 
4924
  "name": "Baima",
4925
  "iso_1_code": null,
4926
  "iso_3_code": null,
4927
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4928
  "children": [
4929
  {
4930
  "name": "Baima",
 
4945
  "name": "Ersuish",
4946
  "iso_1_code": null,
4947
  "iso_3_code": null,
4948
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4949
  "children": [
4950
  {
4951
  "name": "Ersu",
 
4966
  "name": "Naic",
4967
  "iso_1_code": null,
4968
  "iso_3_code": null,
4969
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4970
  "children": [
4971
  {
4972
  "name": "Namuyi",
 
5017
  "name": "Qiangic",
5018
  "iso_1_code": null,
5019
  "iso_3_code": null,
5020
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5021
  "children": [
5022
  {
5023
  "name": "Qiang, Northern",
 
5118
  "name": "rGyalrongic",
5119
  "iso_1_code": null,
5120
  "iso_3_code": null,
5121
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5122
  "children": [
5123
  {
5124
  "name": "Horpa",
 
5169
  "name": "Tujia",
5170
  "iso_1_code": null,
5171
  "iso_3_code": null,
5172
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5173
  "children": [
5174
  {
5175
  "name": "Tujia, Northern",
 
5568
  "name": "Koch",
5569
  "iso_1_code": null,
5570
  "iso_3_code": null,
5571
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5572
  "children": [
5573
  {
5574
  "name": "Atong",
 
5886
  "name": "Dhimalish",
5887
  "iso_1_code": null,
5888
  "iso_3_code": null,
5889
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5890
  "children": [
5891
  {
5892
  "name": "Dhimal",
 
5946
  "name": "Asakian",
5947
  "iso_1_code": null,
5948
  "iso_3_code": null,
5949
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5950
  "children": [
5951
  {
5952
  "name": "Chak",
 
6547
  "name": "Western Tibeto-Burman",
6548
  "iso_1_code": null,
6549
  "iso_3_code": null,
6550
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6551
  "children": [
6552
  {
6553
  "name": "Bodish",
6554
  "iso_1_code": null,
6555
  "iso_3_code": null,
6556
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6557
  "children": [
6558
  {
6559
  "name": "Gongduk",
 
6609
  "name": "Central Bodish",
6610
  "iso_1_code": null,
6611
  "iso_3_code": null,
6612
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6613
  "children": [
6614
  {
6615
  "name": "Choni",
 
6635
  "name": "Amdo",
6636
  "iso_1_code": null,
6637
  "iso_3_code": null,
6638
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6639
  "children": [
6640
  {
6641
  "name": "Tibetan, Amdo",
 
6656
  "name": "Central",
6657
  "iso_1_code": null,
6658
  "iso_3_code": null,
6659
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6660
  "children": [
6661
  {
6662
  "name": "Tibetan, Central",
 
6674
  "name": "gTsang",
6675
  "iso_1_code": null,
6676
  "iso_3_code": null,
6677
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6678
  "children": [
6679
  {
6680
  "name": "Dolpo",
 
6827
  "name": "Southern",
6828
  "iso_1_code": null,
6829
  "iso_3_code": null,
6830
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6831
  "children": [
6832
  {
6833
  "name": "Brokkat",
 
6930
  "name": "Western",
6931
  "iso_1_code": null,
6932
  "iso_3_code": null,
6933
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6934
  "children": [
6935
  {
6936
  "name": "Jad",
 
6976
  "name": "Khams",
6977
  "iso_1_code": null,
6978
  "iso_3_code": null,
6979
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6980
  "children": [
6981
  {
6982
  "name": "Tibetan, Khams",
 
6997
  "name": "Unclassified",
6998
  "iso_1_code": null,
6999
  "iso_3_code": null,
7000
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7001
  "children": [
7002
  {
7003
  "name": "Naaba",
 
7028
  "name": "Western",
7029
  "iso_1_code": null,
7030
  "iso_3_code": null,
7031
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7032
  "children": [
7033
  {
7034
  "name": "Balti",
 
7096
  "name": "East Bodish",
7097
  "iso_1_code": null,
7098
  "iso_3_code": null,
7099
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7100
  "children": [
7101
  {
7102
  "name": "Dakpakha",
 
7122
  "name": "Bumthang",
7123
  "iso_1_code": null,
7124
  "iso_3_code": null,
7125
+ "tokenizers": {},
7126
+ "children": [
7127
+ {
7128
+ "name": "Dzalakha",
7129
+ "iso_1_code": null,
7130
+ "iso_3_code": "dzl",
7131
+ "tokenizers": {},
7132
+ "children": [],
7133
+ "node_i": "9374",
7134
+ "scripts": [],
7135
+ "own_tokenizer": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7136
  },
7137
  {
7138
  "name": "Bumthangkha",
 
7208
  "name": "West Bodish",
7209
  "iso_1_code": null,
7210
  "iso_3_code": null,
7211
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7212
  "children": [
7213
  {
7214
  "name": "Dura",
 
7234
  "name": "Ghale",
7235
  "iso_1_code": null,
7236
  "iso_3_code": null,
7237
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7238
  "children": [
7239
  {
7240
  "name": "Ghale, Southern",
 
7277
  "name": "Gurung-Tamang",
7278
  "iso_1_code": null,
7279
  "iso_3_code": null,
7280
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7281
  "children": [
7282
  {
7283
  "name": "Gurungic",
7284
  "iso_1_code": null,
7285
  "iso_3_code": null,
7286
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7287
  "children": [
7288
  {
7289
  "name": "Chantyal",
 
7354
  "name": "Tamang",
7355
  "iso_1_code": null,
7356
  "iso_3_code": null,
7357
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7358
  "children": [
7359
  {
7360
  "name": "Tamang, Eastern",
 
7407
  "name": "West Himalayish",
7408
  "iso_1_code": null,
7409
  "iso_3_code": null,
7410
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7411
  "children": [
7412
  {
7413
  "name": "Almora",
7414
  "iso_1_code": null,
7415
  "iso_3_code": null,
7416
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7417
  "children": [
7418
  {
7419
  "name": "Byangsi",
 
7464
  "name": "Kinauri",
7465
  "iso_1_code": null,
7466
  "iso_3_code": null,
7467
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7468
  "children": [
7469
  {
7470
  "name": "Gahri",
 
7605
  "name": "Himalayan",
7606
  "iso_1_code": null,
7607
  "iso_3_code": null,
7608
+ "tokenizers": {},
7609
+ "children": [
7610
+ {
7611
+ "name": "Central Himalayan",
7612
+ "iso_1_code": null,
7613
+ "iso_3_code": null,
7614
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7615
  "children": [
7616
  {
7617
  "name": "Chepang-Bhujel",
7618
  "iso_1_code": null,
7619
  "iso_3_code": null,
7620
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7621
  "children": [
7622
  {
7623
  "name": "Bhujel",
 
7648
  "name": "Kham-Magar",
7649
  "iso_1_code": null,
7650
  "iso_3_code": null,
7651
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7652
  "children": [
7653
  {
7654
  "name": "Kham",
7655
  "iso_1_code": null,
7656
  "iso_3_code": null,
7657
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7658
  "children": [
7659
  {
7660
  "name": "Kham, Gamal",
 
7705
  "name": "Magar",
7706
  "iso_1_code": null,
7707
  "iso_3_code": null,
7708
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7709
  "children": [
7710
  {
7711
  "name": "Magar, Eastern",
 
7741
  "name": "Newar",
7742
  "iso_1_code": null,
7743
  "iso_3_code": null,
7744
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7745
  "children": [
7746
  {
7747
  "name": "Newar",
 
7774
  "name": "Raute-Raji",
7775
  "iso_1_code": null,
7776
  "iso_3_code": null,
7777
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7778
  "children": [
7779
  {
7780
  "name": "Rawat",
 
7815
  "name": "Thangmi-Baraamu",
7816
  "iso_1_code": null,
7817
  "iso_3_code": null,
7818
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7819
  "children": [
7820
  {
7821
  "name": "Baram",
 
7851
  "name": "Kiranti",
7852
  "iso_1_code": null,
7853
  "iso_3_code": null,
7854
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7855
  "children": [
7856
  {
7857
  "name": "Eastern",
7858
  "iso_1_code": null,
7859
  "iso_3_code": null,
7860
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7861
  "children": [
7862
  {
7863
  "name": "Athpariya",
 
8105
  "name": "Western",
8106
  "iso_1_code": null,
8107
  "iso_3_code": null,
8108
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8109
  "children": [
8110
  {
8111
  "name": "Bahing",
data/Turkic.json CHANGED
@@ -13,7 +13,7 @@
13
  "class_name": "SpaCyTokenizer",
14
  "macrolanguage": false
15
  },
16
- "Arab": {
17
  "full_object": "SpaCyTokenizer(\"az\")",
18
  "original_lang_name": "azerbaijani",
19
  "original_lang_code": "aze",
@@ -25,7 +25,7 @@
25
  "class_name": "SpaCyTokenizer",
26
  "macrolanguage": true
27
  },
28
- "Cyrl": {
29
  "full_object": "SpaCyTokenizer(\"az\")",
30
  "original_lang_name": "azerbaijani",
31
  "original_lang_code": "aze",
@@ -54,17 +54,7 @@
54
  "iso_1_code": null,
55
  "iso_3_code": null,
56
  "tokenizers": {
57
- "Latn": {
58
- "full_object": "SpaCyTokenizer(\"tr\")",
59
- "original_lang_name": "turkish",
60
- "original_lang_code": "tur",
61
- "scripts": [
62
- "Latn"
63
- ],
64
- "class_name": "SpaCyTokenizer",
65
- "macrolanguage": false
66
- },
67
- "Arab": {
68
  "full_object": "SpaCyTokenizer(\"az\")",
69
  "original_lang_name": "azerbaijani",
70
  "original_lang_code": "aze",
@@ -76,7 +66,7 @@
76
  "class_name": "SpaCyTokenizer",
77
  "macrolanguage": true
78
  },
79
- "Cyrl": {
80
  "full_object": "SpaCyTokenizer(\"az\")",
81
  "original_lang_name": "azerbaijani",
82
  "original_lang_code": "aze",
@@ -131,32 +121,32 @@
131
  "original_lang_code": "uig",
132
  "scripts": [
133
  "Latn",
134
- "Arab",
135
- "Cyrl"
136
  ],
137
  "class_name": "StanzaTokenizer",
138
  "macrolanguage": false
139
  },
140
- "Arab": {
141
  "full_object": "StanzaTokenizer(\"ug\")",
142
  "original_lang_name": "uyghur",
143
  "original_lang_code": "uig",
144
  "scripts": [
145
  "Latn",
146
- "Arab",
147
- "Cyrl"
148
  ],
149
  "class_name": "StanzaTokenizer",
150
  "macrolanguage": false
151
  },
152
- "Cyrl": {
153
  "full_object": "StanzaTokenizer(\"ug\")",
154
  "original_lang_name": "uyghur",
155
  "original_lang_code": "uig",
156
  "scripts": [
157
  "Latn",
158
- "Arab",
159
- "Cyrl"
160
  ],
161
  "class_name": "StanzaTokenizer",
162
  "macrolanguage": false
@@ -204,32 +194,32 @@
204
  "original_lang_code": "uig",
205
  "scripts": [
206
  "Latn",
207
- "Arab",
208
- "Cyrl"
209
  ],
210
  "class_name": "StanzaTokenizer",
211
  "macrolanguage": false
212
  },
213
- "Arab": {
214
  "full_object": "StanzaTokenizer(\"ug\")",
215
  "original_lang_name": "uyghur",
216
  "original_lang_code": "uig",
217
  "scripts": [
218
  "Latn",
219
- "Arab",
220
- "Cyrl"
221
  ],
222
  "class_name": "StanzaTokenizer",
223
  "macrolanguage": false
224
  },
225
- "Cyrl": {
226
  "full_object": "StanzaTokenizer(\"ug\")",
227
  "original_lang_name": "uyghur",
228
  "original_lang_code": "uig",
229
  "scripts": [
230
  "Latn",
231
- "Arab",
232
- "Cyrl"
233
  ],
234
  "class_name": "StanzaTokenizer",
235
  "macrolanguage": false
@@ -255,8 +245,8 @@
255
  "original_lang_code": "uig",
256
  "scripts": [
257
  "Latn",
258
- "Arab",
259
- "Cyrl"
260
  ],
261
  "class_name": "StanzaTokenizer",
262
  "macrolanguage": false
@@ -267,8 +257,8 @@
267
  "original_lang_code": "uig",
268
  "scripts": [
269
  "Latn",
270
- "Arab",
271
- "Cyrl"
272
  ],
273
  "class_name": "StanzaTokenizer",
274
  "macrolanguage": false
@@ -293,8 +283,8 @@
293
  "original_lang_code": "uig",
294
  "scripts": [
295
  "Latn",
296
- "Arab",
297
- "Cyrl"
298
  ],
299
  "class_name": "StanzaTokenizer",
300
  "macrolanguage": false
@@ -327,17 +317,7 @@
327
  "iso_1_code": null,
328
  "iso_3_code": null,
329
  "tokenizers": {
330
- "Latn": {
331
- "full_object": "SpaCyTokenizer(\"tr\")",
332
- "original_lang_name": "turkish",
333
- "original_lang_code": "tur",
334
- "scripts": [
335
- "Latn"
336
- ],
337
- "class_name": "SpaCyTokenizer",
338
- "macrolanguage": false
339
- },
340
- "Arab": {
341
  "full_object": "SpaCyTokenizer(\"az\")",
342
  "original_lang_name": "azerbaijani",
343
  "original_lang_code": "aze",
@@ -349,7 +329,7 @@
349
  "class_name": "SpaCyTokenizer",
350
  "macrolanguage": true
351
  },
352
- "Cyrl": {
353
  "full_object": "SpaCyTokenizer(\"az\")",
354
  "original_lang_name": "azerbaijani",
355
  "original_lang_code": "aze",
@@ -1027,18 +1007,6 @@
1027
  ],
1028
  "class_name": "SpaCyTokenizer",
1029
  "macrolanguage": false
1030
- },
1031
- "Arab": {
1032
- "full_object": "SpaCyTokenizer(\"az\")",
1033
- "original_lang_name": "azerbaijani",
1034
- "original_lang_code": "aze",
1035
- "scripts": [
1036
- "Arab",
1037
- "Latn",
1038
- "Cyrl"
1039
- ],
1040
- "class_name": "SpaCyTokenizer",
1041
- "macrolanguage": true
1042
  }
1043
  },
1044
  "children": [
@@ -1067,18 +1035,6 @@
1067
  ],
1068
  "class_name": "SpaCyTokenizer",
1069
  "macrolanguage": false
1070
- },
1071
- "Arab": {
1072
- "full_object": "SpaCyTokenizer(\"az\")",
1073
- "original_lang_name": "azerbaijani",
1074
- "original_lang_code": "aze",
1075
- "scripts": [
1076
- "Arab",
1077
- "Latn",
1078
- "Cyrl"
1079
- ],
1080
- "class_name": "SpaCyTokenizer",
1081
- "macrolanguage": true
1082
  }
1083
  },
1084
  "children": [
@@ -1226,18 +1182,6 @@
1226
  ],
1227
  "class_name": "SpaCyTokenizer",
1228
  "macrolanguage": false
1229
- },
1230
- "Arab": {
1231
- "full_object": "SpaCyTokenizer(\"az\")",
1232
- "original_lang_name": "azerbaijani",
1233
- "original_lang_code": "aze",
1234
- "scripts": [
1235
- "Arab",
1236
- "Latn",
1237
- "Cyrl"
1238
- ],
1239
- "class_name": "SpaCyTokenizer",
1240
- "macrolanguage": true
1241
  }
1242
  },
1243
  "children": [
@@ -1352,18 +1296,6 @@
1352
  ],
1353
  "class_name": "SpaCyTokenizer",
1354
  "macrolanguage": false
1355
- },
1356
- "Arab": {
1357
- "full_object": "SpaCyTokenizer(\"az\")",
1358
- "original_lang_name": "azerbaijani",
1359
- "original_lang_code": "aze",
1360
- "scripts": [
1361
- "Arab",
1362
- "Latn",
1363
- "Cyrl"
1364
- ],
1365
- "class_name": "SpaCyTokenizer",
1366
- "macrolanguage": true
1367
  }
1368
  },
1369
  "children": [
 
13
  "class_name": "SpaCyTokenizer",
14
  "macrolanguage": false
15
  },
16
+ "Cyrl": {
17
  "full_object": "SpaCyTokenizer(\"az\")",
18
  "original_lang_name": "azerbaijani",
19
  "original_lang_code": "aze",
 
25
  "class_name": "SpaCyTokenizer",
26
  "macrolanguage": true
27
  },
28
+ "Arab": {
29
  "full_object": "SpaCyTokenizer(\"az\")",
30
  "original_lang_name": "azerbaijani",
31
  "original_lang_code": "aze",
 
54
  "iso_1_code": null,
55
  "iso_3_code": null,
56
  "tokenizers": {
57
+ "Cyrl": {
 
 
 
 
 
 
 
 
 
 
58
  "full_object": "SpaCyTokenizer(\"az\")",
59
  "original_lang_name": "azerbaijani",
60
  "original_lang_code": "aze",
 
66
  "class_name": "SpaCyTokenizer",
67
  "macrolanguage": true
68
  },
69
+ "Arab": {
70
  "full_object": "SpaCyTokenizer(\"az\")",
71
  "original_lang_name": "azerbaijani",
72
  "original_lang_code": "aze",
 
121
  "original_lang_code": "uig",
122
  "scripts": [
123
  "Latn",
124
+ "Cyrl",
125
+ "Arab"
126
  ],
127
  "class_name": "StanzaTokenizer",
128
  "macrolanguage": false
129
  },
130
+ "Cyrl": {
131
  "full_object": "StanzaTokenizer(\"ug\")",
132
  "original_lang_name": "uyghur",
133
  "original_lang_code": "uig",
134
  "scripts": [
135
  "Latn",
136
+ "Cyrl",
137
+ "Arab"
138
  ],
139
  "class_name": "StanzaTokenizer",
140
  "macrolanguage": false
141
  },
142
+ "Arab": {
143
  "full_object": "StanzaTokenizer(\"ug\")",
144
  "original_lang_name": "uyghur",
145
  "original_lang_code": "uig",
146
  "scripts": [
147
  "Latn",
148
+ "Cyrl",
149
+ "Arab"
150
  ],
151
  "class_name": "StanzaTokenizer",
152
  "macrolanguage": false
 
194
  "original_lang_code": "uig",
195
  "scripts": [
196
  "Latn",
197
+ "Cyrl",
198
+ "Arab"
199
  ],
200
  "class_name": "StanzaTokenizer",
201
  "macrolanguage": false
202
  },
203
+ "Cyrl": {
204
  "full_object": "StanzaTokenizer(\"ug\")",
205
  "original_lang_name": "uyghur",
206
  "original_lang_code": "uig",
207
  "scripts": [
208
  "Latn",
209
+ "Cyrl",
210
+ "Arab"
211
  ],
212
  "class_name": "StanzaTokenizer",
213
  "macrolanguage": false
214
  },
215
+ "Arab": {
216
  "full_object": "StanzaTokenizer(\"ug\")",
217
  "original_lang_name": "uyghur",
218
  "original_lang_code": "uig",
219
  "scripts": [
220
  "Latn",
221
+ "Cyrl",
222
+ "Arab"
223
  ],
224
  "class_name": "StanzaTokenizer",
225
  "macrolanguage": false
 
245
  "original_lang_code": "uig",
246
  "scripts": [
247
  "Latn",
248
+ "Cyrl",
249
+ "Arab"
250
  ],
251
  "class_name": "StanzaTokenizer",
252
  "macrolanguage": false
 
257
  "original_lang_code": "uig",
258
  "scripts": [
259
  "Latn",
260
+ "Cyrl",
261
+ "Arab"
262
  ],
263
  "class_name": "StanzaTokenizer",
264
  "macrolanguage": false
 
283
  "original_lang_code": "uig",
284
  "scripts": [
285
  "Latn",
286
+ "Cyrl",
287
+ "Arab"
288
  ],
289
  "class_name": "StanzaTokenizer",
290
  "macrolanguage": false
 
317
  "iso_1_code": null,
318
  "iso_3_code": null,
319
  "tokenizers": {
320
+ "Cyrl": {
 
 
 
 
 
 
 
 
 
 
321
  "full_object": "SpaCyTokenizer(\"az\")",
322
  "original_lang_name": "azerbaijani",
323
  "original_lang_code": "aze",
 
329
  "class_name": "SpaCyTokenizer",
330
  "macrolanguage": true
331
  },
332
+ "Arab": {
333
  "full_object": "SpaCyTokenizer(\"az\")",
334
  "original_lang_name": "azerbaijani",
335
  "original_lang_code": "aze",
 
1007
  ],
1008
  "class_name": "SpaCyTokenizer",
1009
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
1010
  }
1011
  },
1012
  "children": [
 
1035
  ],
1036
  "class_name": "SpaCyTokenizer",
1037
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
1038
  }
1039
  },
1040
  "children": [
 
1182
  ],
1183
  "class_name": "SpaCyTokenizer",
1184
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
1185
  }
1186
  },
1187
  "children": [
 
1296
  ],
1297
  "class_name": "SpaCyTokenizer",
1298
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
 
 
1299
  }
1300
  },
1301
  "children": [
data/Uralic.json CHANGED
@@ -108,16 +108,6 @@
108
  ],
109
  "class_name": "SpaCyTokenizer",
110
  "macrolanguage": false
111
- },
112
- "Cyrl": {
113
- "full_object": "StanzaTokenizer(\"myv\")",
114
- "original_lang_name": "erzya",
115
- "original_lang_code": "myv",
116
- "scripts": [
117
- "Cyrl"
118
- ],
119
- "class_name": "StanzaTokenizer",
120
- "macrolanguage": false
121
  }
122
  },
123
  "children": [
@@ -404,16 +394,6 @@
404
  "iso_1_code": null,
405
  "iso_3_code": null,
406
  "tokenizers": {
407
- "Latn": {
408
- "full_object": "SpaCyTokenizer(\"hu\")",
409
- "original_lang_name": "hungarian",
410
- "original_lang_code": "hun",
411
- "scripts": [
412
- "Latn"
413
- ],
414
- "class_name": "SpaCyTokenizer",
415
- "macrolanguage": false
416
- },
417
  "Cyrl": {
418
  "full_object": "StanzaTokenizer(\"myv\")",
419
  "original_lang_name": "erzya",
@@ -491,16 +471,6 @@
491
  ],
492
  "class_name": "StanzaTokenizer",
493
  "macrolanguage": false
494
- },
495
- "Latn": {
496
- "full_object": "SpaCyTokenizer(\"hu\")",
497
- "original_lang_name": "hungarian",
498
- "original_lang_code": "hun",
499
- "scripts": [
500
- "Latn"
501
- ],
502
- "class_name": "SpaCyTokenizer",
503
- "macrolanguage": false
504
  }
505
  },
506
  "children": [
@@ -560,16 +530,6 @@
560
  "iso_1_code": null,
561
  "iso_3_code": null,
562
  "tokenizers": {
563
- "Latn": {
564
- "full_object": "SpaCyTokenizer(\"hu\")",
565
- "original_lang_name": "hungarian",
566
- "original_lang_code": "hun",
567
- "scripts": [
568
- "Latn"
569
- ],
570
- "class_name": "SpaCyTokenizer",
571
- "macrolanguage": false
572
- },
573
  "Cyrl": {
574
  "full_object": "StanzaTokenizer(\"myv\")",
575
  "original_lang_name": "erzya",
@@ -610,16 +570,6 @@
610
  "iso_1_code": null,
611
  "iso_3_code": null,
612
  "tokenizers": {
613
- "Latn": {
614
- "full_object": "SpaCyTokenizer(\"hu\")",
615
- "original_lang_name": "hungarian",
616
- "original_lang_code": "hun",
617
- "scripts": [
618
- "Latn"
619
- ],
620
- "class_name": "SpaCyTokenizer",
621
- "macrolanguage": false
622
- },
623
  "Cyrl": {
624
  "full_object": "StanzaTokenizer(\"myv\")",
625
  "original_lang_name": "erzya",
@@ -702,16 +652,6 @@
702
  ],
703
  "class_name": "StanzaTokenizer",
704
  "macrolanguage": false
705
- },
706
- "Cyrl": {
707
- "full_object": "StanzaTokenizer(\"myv\")",
708
- "original_lang_name": "erzya",
709
- "original_lang_code": "myv",
710
- "scripts": [
711
- "Cyrl"
712
- ],
713
- "class_name": "StanzaTokenizer",
714
- "macrolanguage": false
715
  }
716
  },
717
  "children": [
@@ -729,16 +669,6 @@
729
  ],
730
  "class_name": "StanzaTokenizer",
731
  "macrolanguage": false
732
- },
733
- "Cyrl": {
734
- "full_object": "StanzaTokenizer(\"myv\")",
735
- "original_lang_name": "erzya",
736
- "original_lang_code": "myv",
737
- "scripts": [
738
- "Cyrl"
739
- ],
740
- "class_name": "StanzaTokenizer",
741
- "macrolanguage": false
742
  }
743
  },
744
  "children": [
@@ -847,16 +777,6 @@
847
  ],
848
  "class_name": "StanzaTokenizer",
849
  "macrolanguage": false
850
- },
851
- "Cyrl": {
852
- "full_object": "StanzaTokenizer(\"myv\")",
853
- "original_lang_name": "erzya",
854
- "original_lang_code": "myv",
855
- "scripts": [
856
- "Cyrl"
857
- ],
858
- "class_name": "StanzaTokenizer",
859
- "macrolanguage": false
860
  }
861
  },
862
  "children": [
@@ -902,16 +822,6 @@
902
  ],
903
  "class_name": "StanzaTokenizer",
904
  "macrolanguage": false
905
- },
906
- "Cyrl": {
907
- "full_object": "StanzaTokenizer(\"myv\")",
908
- "original_lang_name": "erzya",
909
- "original_lang_code": "myv",
910
- "scripts": [
911
- "Cyrl"
912
- ],
913
- "class_name": "StanzaTokenizer",
914
- "macrolanguage": false
915
  }
916
  },
917
  "children": [
@@ -929,16 +839,6 @@
929
  ],
930
  "class_name": "StanzaTokenizer",
931
  "macrolanguage": false
932
- },
933
- "Cyrl": {
934
- "full_object": "StanzaTokenizer(\"myv\")",
935
- "original_lang_name": "erzya",
936
- "original_lang_code": "myv",
937
- "scripts": [
938
- "Cyrl"
939
- ],
940
- "class_name": "StanzaTokenizer",
941
- "macrolanguage": false
942
  }
943
  },
944
  "children": [
@@ -1017,16 +917,6 @@
1017
  ],
1018
  "class_name": "StanzaTokenizer",
1019
  "macrolanguage": false
1020
- },
1021
- "Cyrl": {
1022
- "full_object": "StanzaTokenizer(\"myv\")",
1023
- "original_lang_name": "erzya",
1024
- "original_lang_code": "myv",
1025
- "scripts": [
1026
- "Cyrl"
1027
- ],
1028
- "class_name": "StanzaTokenizer",
1029
- "macrolanguage": false
1030
  }
1031
  },
1032
  "children": [
@@ -1073,16 +963,6 @@
1073
  "iso_1_code": null,
1074
  "iso_3_code": null,
1075
  "tokenizers": {
1076
- "Latn": {
1077
- "full_object": "SpaCyTokenizer(\"hu\")",
1078
- "original_lang_name": "hungarian",
1079
- "original_lang_code": "hun",
1080
- "scripts": [
1081
- "Latn"
1082
- ],
1083
- "class_name": "SpaCyTokenizer",
1084
- "macrolanguage": false
1085
- },
1086
  "Cyrl": {
1087
  "full_object": "StanzaTokenizer(\"myv\")",
1088
  "original_lang_name": "erzya",
@@ -1110,16 +990,6 @@
1110
  "iso_1_code": null,
1111
  "iso_3_code": null,
1112
  "tokenizers": {
1113
- "Latn": {
1114
- "full_object": "SpaCyTokenizer(\"hu\")",
1115
- "original_lang_name": "hungarian",
1116
- "original_lang_code": "hun",
1117
- "scripts": [
1118
- "Latn"
1119
- ],
1120
- "class_name": "SpaCyTokenizer",
1121
- "macrolanguage": false
1122
- },
1123
  "Cyrl": {
1124
  "full_object": "StanzaTokenizer(\"myv\")",
1125
  "original_lang_name": "erzya",
@@ -1182,28 +1052,7 @@
1182
  "name": "Enets",
1183
  "iso_1_code": null,
1184
  "iso_3_code": null,
1185
- "tokenizers": {
1186
- "Latn": {
1187
- "full_object": "SpaCyTokenizer(\"hu\")",
1188
- "original_lang_name": "hungarian",
1189
- "original_lang_code": "hun",
1190
- "scripts": [
1191
- "Latn"
1192
- ],
1193
- "class_name": "SpaCyTokenizer",
1194
- "macrolanguage": false
1195
- },
1196
- "Cyrl": {
1197
- "full_object": "StanzaTokenizer(\"myv\")",
1198
- "original_lang_name": "erzya",
1199
- "original_lang_code": "myv",
1200
- "scripts": [
1201
- "Cyrl"
1202
- ],
1203
- "class_name": "StanzaTokenizer",
1204
- "macrolanguage": false
1205
- }
1206
- },
1207
  "children": [
1208
  {
1209
  "name": "Enets, Forest",
@@ -1250,16 +1099,6 @@
1250
  "iso_1_code": null,
1251
  "iso_3_code": null,
1252
  "tokenizers": {
1253
- "Latn": {
1254
- "full_object": "SpaCyTokenizer(\"hu\")",
1255
- "original_lang_name": "hungarian",
1256
- "original_lang_code": "hun",
1257
- "scripts": [
1258
- "Latn"
1259
- ],
1260
- "class_name": "SpaCyTokenizer",
1261
- "macrolanguage": false
1262
- },
1263
  "Cyrl": {
1264
  "full_object": "StanzaTokenizer(\"myv\")",
1265
  "original_lang_name": "erzya",
 
108
  ],
109
  "class_name": "SpaCyTokenizer",
110
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
111
  }
112
  },
113
  "children": [
 
394
  "iso_1_code": null,
395
  "iso_3_code": null,
396
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
397
  "Cyrl": {
398
  "full_object": "StanzaTokenizer(\"myv\")",
399
  "original_lang_name": "erzya",
 
471
  ],
472
  "class_name": "StanzaTokenizer",
473
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
474
  }
475
  },
476
  "children": [
 
530
  "iso_1_code": null,
531
  "iso_3_code": null,
532
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
533
  "Cyrl": {
534
  "full_object": "StanzaTokenizer(\"myv\")",
535
  "original_lang_name": "erzya",
 
570
  "iso_1_code": null,
571
  "iso_3_code": null,
572
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
573
  "Cyrl": {
574
  "full_object": "StanzaTokenizer(\"myv\")",
575
  "original_lang_name": "erzya",
 
652
  ],
653
  "class_name": "StanzaTokenizer",
654
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
655
  }
656
  },
657
  "children": [
 
669
  ],
670
  "class_name": "StanzaTokenizer",
671
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
672
  }
673
  },
674
  "children": [
 
777
  ],
778
  "class_name": "StanzaTokenizer",
779
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
780
  }
781
  },
782
  "children": [
 
822
  ],
823
  "class_name": "StanzaTokenizer",
824
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
825
  }
826
  },
827
  "children": [
 
839
  ],
840
  "class_name": "StanzaTokenizer",
841
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
842
  }
843
  },
844
  "children": [
 
917
  ],
918
  "class_name": "StanzaTokenizer",
919
  "macrolanguage": false
 
 
 
 
 
 
 
 
 
 
920
  }
921
  },
922
  "children": [
 
963
  "iso_1_code": null,
964
  "iso_3_code": null,
965
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
966
  "Cyrl": {
967
  "full_object": "StanzaTokenizer(\"myv\")",
968
  "original_lang_name": "erzya",
 
990
  "iso_1_code": null,
991
  "iso_3_code": null,
992
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
993
  "Cyrl": {
994
  "full_object": "StanzaTokenizer(\"myv\")",
995
  "original_lang_name": "erzya",
 
1052
  "name": "Enets",
1053
  "iso_1_code": null,
1054
  "iso_3_code": null,
1055
+ "tokenizers": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1056
  "children": [
1057
  {
1058
  "name": "Enets, Forest",
 
1099
  "iso_1_code": null,
1100
  "iso_3_code": null,
1101
  "tokenizers": {
 
 
 
 
 
 
 
 
 
 
1102
  "Cyrl": {
1103
  "full_object": "StanzaTokenizer(\"myv\")",
1104
  "original_lang_name": "erzya",