# tangled-llama-m-128k-v0.1/scripts/prepare_pretrain_datasets.py

from functools import partial
from litgpt.tokenizer import Tokenizer
from litdata import optimize, TokensLoader, StreamingDataset
from transformers import AutoTokenizer
from utils import tokenize_text_fn
from pretrain_datasets import pretrain_datasets
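
# `tokenize_text_fn` and `pretrain_datasets` come from local modules next to this
# script (utils.py and pretrain_datasets.py) and are not shown here. litdata's
# `optimize` calls `fn` once per entry of `inputs` and expects it to yield 1D
# tensors of token ids, which the TokensLoader used at read time treats as one
# contiguous token stream. A rough sketch of that contract (assumed shape only,
# not the actual utils.py code):
#
# def tokenize_text_fn(entry, hf_tokenizer=None, tokenizer=None):
#     for text in iterate_texts(entry):           # hypothetical helper
#         yield tokenizer.encode(text, eos=True)  # litgpt Tokenizer -> 1D tensor of ids
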
#
# optimize datasets
#
# Each (block_size, subchunk_size) pair produces one optimized dataset directory.
# block_size = 4097 is presumably a 4,096-token training window plus one token,
# so shifted input/target pairs can be formed later.
# for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
    chunk_size = block_size * subchunk_size  # 4097 * 4000 = 16,388,000 tokens per chunk
    output_dir = f'../pretrain-data-{i}-{block_size}-{chunk_size}'

    outputs = optimize(
        fn=partial(
            tokenize_text_fn,
            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
            tokenizer=Tokenizer('..'),
        ),
        inputs=pretrain_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,  # Number of tokens to store per chunk. This is roughly 64 MB of tokens per chunk.
        num_workers=32,
        reorder_files=False,
        ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
        ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
        # item_loader=TokensLoader(block_size=block_size),
    )

#
# total number of samples (and tokens) in the optimized datasets
#
# for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
    chunk_size = block_size * subchunk_size
    input_dir = f'../pretrain-data-{i}-{block_size}-{chunk_size}'

    # With TokensLoader, len(dataset) is the number of block_size-token samples,
    # so len(dataset) * block_size is the total token count of the optimized dataset.
    dataset = StreamingDataset(
        input_dir=input_dir,
        item_loader=TokensLoader(block_size=block_size),
    )

    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
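
#
# How the optimized data would typically be consumed for pretraining (sketch,
# not part of this script): litdata's StreamingDataLoader over the same
# StreamingDataset/TokensLoader pair. The batch size, worker count, and shift
# logic below are illustrative assumptions, not values from this repo.
#
# from litdata import StreamingDataLoader
#
# train_dataset = StreamingDataset(
#     input_dir='../pretrain-data-0-4097-16388000',
#     item_loader=TokensLoader(block_size=4097),
#     shuffle=True,
#     drop_last=True,
# )
# train_dataloader = StreamingDataLoader(train_dataset, batch_size=1, num_workers=4)
#
# for batch in train_dataloader:
#     input_ids, targets = batch[:, :-1], batch[:, 1:]  # shift for next-token prediction
#     ...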