mtasic85's picture
tokenizer, pretrain dataset
36f3948
metadata
license: apache-2.0

# tangled-llama-n-128k-v0.1

Train the tokenizer and prepare the pretraining datasets:

```bash
time python -B train_tokenizer.py
time python -B prepare_pretrain_datasets.py
```