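"""Train a byte-level BPE tokenizer with the Hugging Face `tokenizers` library.

Reads lm_dataformat archives (or raw .txt files) from --base_dir, dumps them to
plain text, trains on a random subsample, and saves the tokenizer as JSON.
"""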
import os
import random
import argparse
import shutil
from glob import glob
from pathlib import Path

from lm_dataformat import Reader
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC
from tqdm import tqdm

# parser

parser = argparse.ArgumentParser()
parser.add_argument("--base_dir", type=str, help="Path to where your files are located. Files ending in .zst are treated as \
                    archives, all others as raw text.")
parser.add_argument("--output_dir", type=str, default="tokenizers", help="Where to put the tokenizer")
parser.add_argument("--file_type", type=str, choices=["xz", "txt"], default="xz", help="Extension of file to parse")
parser.add_argument("--vocab_size", type=int, help="Size of vocabulary", required = True)
args = parser.parse_args()

# main script

data_path = Path(args.base_dir)
archives = glob(str(data_path / f"*.{args.file_type}"))

out_path = Path(args.output_dir)

# Start from a clean output directory so stale text dumps don't leak in.
if out_path.exists():
    shutil.rmtree(out_path)
out_path.mkdir()

# Convert every archive into a plain .txt file the trainer can read.
for arch in tqdm(archives):
    name = os.path.basename(arch).split(".")[0] + ".txt"
    fp = out_path / name

    if args.file_type == "xz":
        # lm_dataformat streams individual documents out of the archive.
        g = Reader(arch).stream_data()

        with open(fp, "w") as f:
            for s in g:
                f.write(s)
                f.write("\n\n")
    elif args.file_type == "txt":
        shutil.copyfile(str(arch), str(fp))

data_files = glob(str(out_path / "*.txt"))
assert len(data_files) > 0, "No data files found"

# Train on a random ~20% subsample (but at least one file) to keep training time down.
data_files = random.sample(data_files, max(1, int(0.2 * len(data_files))))

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding: byte-level BPE operates on raw bytes,
# so any input string can be encoded; NFKC normalization runs before pre-tokenization
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.normalizer = NFKC()

# And then train (in tokenizers >= 0.10 the file list comes first, then the trainer;
# min_frequency=2 skips merges for pairs seen only once)
trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
tokenizer.train(data_files, trainer)

# And save it
tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
tokenizer.save(str(tokenizer_path), pretty=True)

print(f"tokenizer saved at {tokenizer_path}")
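
# Example usage (script name and paths are illustrative):
#   python train_tokenizer.py --base_dir ./data --file_type xz --vocab_size 52000
#
# Optional sanity check, reloading the saved tokenizer:
#   from tokenizers import Tokenizer
#   loaded = Tokenizer.from_file(str(tokenizer_path))
#   print(loaded.encode("Hello world").tokens)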