jstzwj committed
Commit: ab790b6
1 Parent(s): d2529d1
README.md CHANGED
@@ -1,3 +1,8 @@
 ---
 license: apache-2.0
 ---
+
+Shami Tokenizer
+===
+
+This is the tokenizer of the Shami Model.
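For orientation, here is a hedged sketch of how a tokenizer published with these files is typically loaded through transformers; the repository id below is a placeholder, not something stated in this commit.

```python
# Minimal usage sketch. "jstzwj/shami-tokenizer" is a hypothetical repo id;
# substitute the real Hub id or a local directory containing these files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("jstzwj/shami-tokenizer")
ids = tokenizer.encode("This is the tokenizer of the Shami Model.")
print(ids)
print(tokenizer.decode(ids))
```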
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,3 @@
+transformers==4.29.2
+datasets==2.12.0
+apache-beam[gcp]
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+{
+  "add_prefix_space": false,
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
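Together, special_tokens_map.json and tokenizer_config.json declare a GPT2Tokenizer whose bos/eos/unk tokens all map to <|endoftext|> and whose model_max_length is 1024. A small sketch to confirm the loaded configuration, assuming the files sit in the "tokenizer-shami" directory written by train_tokenizer.py below:

```python
# Sketch: verify the special-token setup after loading the files above.
# "tokenizer-shami" is the output directory used by train_tokenizer.py;
# adjust the path if the files live elsewhere.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("tokenizer-shami")
assert tok.bos_token == tok.eos_token == tok.unk_token == "<|endoftext|>"
print(tok.model_max_length)  # 1024, per tokenizer_config.json
```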
train_tokenizer.py ADDED
@@ -0,0 +1,28 @@
+import json
+
+from transformers import AutoTokenizer
+old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+import os
+from datasets import load_dataset
+
+langs = ["en", "ja", "ko", "zh-cn", "zh-tw"]
+raw_datasets = [
+    load_dataset("wiki40b", lang, beam_runner='DirectRunner')
+    for lang in langs
+]
+
+total_line = 0
+for training_dataset in raw_datasets:
+    for line in training_dataset["train"]:
+        total_line += 1
+
+def training_dataset_iterator():
+    for training_dataset in raw_datasets:
+        for line in training_dataset["train"]:
+            yield line['text']
+
+# tokenizer.train(training_files, trainer)
+tokenizer = old_tokenizer.train_new_from_iterator(training_dataset_iterator(), 102000, total_line)
+
+tokenizer.save_pretrained("tokenizer-shami")
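The script above retrains the gpt2 tokenizer on the wiki40b train splits for the five listed languages with a target vocabulary of 102,000 tokens, then saves the result to "tokenizer-shami". A hedged, self-contained sketch for sanity-checking the retrained tokenizer against the original gpt2 one (the sample sentences are illustrative only):

```python
# Sketch: compare the retrained tokenizer with the original gpt2 tokenizer.
# Assumes train_tokenizer.py has already written the "tokenizer-shami"
# directory; the example texts are arbitrary.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = AutoTokenizer.from_pretrained("tokenizer-shami")

for text in ["Hello, world!", "自然言語処理", "자연어 처리", "自然语言处理"]:
    print(text)
    print("  gpt2 :", old_tokenizer.tokenize(text))
    print("  shami:", new_tokenizer.tokenize(text))
```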
vocab.json ADDED
The diff for this file is too large to render. See raw diff