train tokenizer 128k

Browse files

Files changed (13) hide show

TRAIN.md +113 -0
config.json +40 -0
misc/logo.png +3 -0
scripts/contrain-model-0.yaml +156 -0
scripts/contrain_datasets.py +204 -0
scripts/prepare_contrain_datasets.py +49 -0
scripts/prepare_pretrain_datasets.py +50 -0
scripts/pretrain-model-0.yaml +156 -0
scripts/pretrain_datasets.py +73 -0
scripts/requirements.in +22 -0
scripts/tokenizer_datasets.py +48 -0
scripts/train_tokenizer.py +252 -0
scripts/utils.py +143 -0

TRAIN.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# Train
+## Environment
+```bash
+cd scripts
+python -m venv venv
+source venv/bin/activate
+pip install -U -r requirements.in
+```
+## Train Tokenizer
+```bash
+time python -B train_tokenizer.py
+```
+## Pretrain
+```bash
+python -B prepare_pretrain_datasets.py
+```
+```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-0.yaml
+litgpt convert_pretrained_checkpoint ../out/pretrain-0/final/ ../out/pretrain-0-final-checkpoint
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-1.yaml
+litgpt convert_pretrained_checkpoint ../out/pretrain-1/final/ ../out/pretrain-1-final-checkpoint
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-2.yaml
+litgpt convert_pretrained_checkpoint ../out/pretrain-2/final/ ../out/pretrain-2-final-checkpoint
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-3.yaml
+litgpt convert_pretrained_checkpoint ../out/pretrain-3/final/ ../out/pretrain-3-final-checkpoint
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-4.yaml
+litgpt convert_pretrained_checkpoint ../out/pretrain-4/final/ ../out/pretrain-4-final-checkpoint
+# NOTE: unused
+#   CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-5.yaml
+#   litgpt convert_pretrained_checkpoint ../out/pretrain-5/final/ ../out/pretrain-5-final-checkpoint
+```
+### Continued Pretraining
+```bash
+python -B prepare_contrain_datasets.py
+```
+```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-0.yaml
+litgpt convert_pretrained_checkpoint ../out/contrain-0/final/ ../out/contrain-0-final-checkpoint
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-1.yaml
+litgpt convert_pretrained_checkpoint ../out/contrain-1/final/ ../out/contrain-1-final-checkpoint
+```
+## Chat with Pretrained model
+```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-0/final/
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-1/final/
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-2/final/
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-3/final/
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-4/final/
+# CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-5/final/
+```
+<!-- OLD -->
+## Model
+### Pretraining
+```bash
+litgpt pretrain --config ./pretrain-model.yaml
+litgpt convert_from_litgpt out/pretrain/final/ out/converted_pretrain
+cp config.json out/pretrain/final/
+cp config.json out/converted_pretrain/
+```
+```python
+import torch
+from safetensors.torch import save_file
+state_dict = torch.load('out/converted_pretrain/model.pth', map_location='cpu')
+save_file(state_dict, 'out/converted_pretrain/model.safetensors')
+```
+### Continued Pretraining
+```bash
+litgpt convert_pretrained_checkpoint out/pretrain/final/ out/pretrain_checkpoint/final/
+cp config.json out/pretrain_checkpoint/final/
+litgpt pretrain --config ./contrain-model.yaml
+litgpt convert_from_litgpt out/contrain/final/ out/converted_contrain
+cp config.json out/converted_contrain/
+```
+```python
+import torch
+from safetensors.torch import save_file
+state_dict = torch.load('out/converted_contrain/model.pth', map_location='cpu')
+save_file(state_dict, 'out/converted_contrain/model.safetensors')
+```
+```bash
+cp out/converted_contrain/model.pth ./
+cp out/converted_contrain/model.safetensors ./
+```

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "tangledgroup/tangled-llama-j-128k-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    1,
+    4,
+    5
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 1000000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0.dev0",
+  "use_cache": true,
+  "vocab_size": 65536
+}

misc/logo.png ADDED Viewed

Git LFS Details

SHA256: 591bee6fa56315a84eeec47c5e04ff6331f842c773ce50b7a59e508b4d2904cf
Pointer size: 131 Bytes
Size of remote file: 684 kB

scripts/contrain-model-0.yaml ADDED Viewed

	@@ -0,0 +1,156 @@

+# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
+# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+# ``model_config``. (type: Optional[str], default: null)
+model_name: "Llama-3.2-1B"
+# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+# ``model_name``. (type: Optional[Config], default: null)
+model_config:
+  padded_vocab_size: 65536
+  vocab_size: 65536
+  block_size: 131072
+  n_layer: 32
+  n_head: 16
+  head_size: 64
+  n_embd: 768
+  n_query_groups: 4
+  rotary_percentage: 1.0
+  parallel_residual: false
+  shared_attention_norm: false
+  bias: false
+  # attn_bias: true # qwen 2.5
+  norm_class_name: "RMSNorm"
+  mlp_class_name: "LLaMAMLP"
+  intermediate_size: 2048
+  # rope_base: 500000 # llama 3.2
+  rope_base: 1000000 # qwen 2.5
+  rope_adjustments: # llama 3.2
+    factor: 32.0
+    low_freq_factor: 1.0
+    high_freq_factor: 4.0
+    original_max_seq_len: 8192
+# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+out_dir: "../out/contrain-0/"
+# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+# precision: bf16-mixed
+precision: bf16-true
+# Optional path to a checkpoint directory to initialize the model from.
+# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+initial_checkpoint_dir: "../out/pretrain-4-final-checkpoint/"
+# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+# (type: Union[bool, Literal["auto"], Path], default: False)
+# resume:
+# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+data:
+  class_path: LitData
+  init_args:
+    data_path: "../contrain-data-0-4097-16388000/"
+    num_workers: 32
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 100
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+  global_batch_size: 512
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 3
+  # Number of iterations with learning rate warmup active (type: int, default: 2000)
+  lr_warmup_steps: 0
+  # Number of epochs to train on (type: Optional[int], default: null)
+  epochs:
+  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+  max_tokens: 1527816367 # 4_097 * 372_911
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 4097
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+  tie_embeddings: true
+  #   (type: Optional[float], default: 1.0)
+  max_norm: 1.0
+  #   (type: float, default: 4e-05)
+  min_lr: 1e-06
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+  interval: 25
+  # Number of tokens to generate (type: Optional[int], default: null)
+  max_new_tokens:
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+# Optimizer-related arguments
+optimizer:
+  class_path: grokadamw.GrokAdamW
+  init_args:
+    # (type: float, default: 0.001)
+    lr: 1e-05
+    # (type: float, default: 0.01)
+    weight_decay: 1e-2
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.999
+# optimizer:
+#   class_path: sophia_opt.SophiaG
+#
+#   init_args:
+#     lr: 4e-4
+#     betas:
+#       - 0.965
+#       - 0.99
+#     rho: 0.01
+#     weight_decay: 1e-1
+# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+devices: auto
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+# modules require this. (type: Optional[Path], default: null)
+tokenizer_dir: "../"
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+logger_name: "wandb"
+# The random seed to use for reproducibility. (type: int, default: 42)
+seed: 23

scripts/contrain_datasets.py ADDED Viewed

	@@ -0,0 +1,204 @@

+# Maps the speaker labels found in the source datasets ('from' values of
+# ShareGPT-style messages) onto the three chat roles emitted by the transforms
+# in contrain_datasets: 'system', 'user' and 'assistant'.
+# The transforms index this dict directly (roles_map[m['from']]), so a label
+# that is not listed here raises KeyError.
+# NOTE(review): datasets with tool/function turns may carry other labels
+# (e.g. 'tool') - confirm how the consumer in utils.py handles such rows.
+roles_map = {
+    'system': 'system',
+    'user': 'user',
+    'human': 'user',
+    'assistant': 'assistant',
+    'gpt': 'assistant',
+    'AI': 'assistant',
+}
+# Chat/instruct sources for continued pretraining. prepare_contrain_datasets.py
+# imports this list and passes it as `inputs` to litdata `optimize`.
+# Each entry is a dict with:
+#   'path'       - dataset identifier (Hugging Face-style repo id)
+#   'split'      - optional split expression; the large sets are cut into 20%
+#                  slices so that every slice becomes its own entry
+#   'data_files' - optional file name inside the dataset repo
+#   'field'      - optional column name; entries that set it use a transform
+#                  that iterates a list of messages (presumably the value of
+#                  that column - confirm against tokenize_chat_fn in utils.py)
+#   'transform'  - optional callable returning a list of
+#                  {'role': ..., 'content': ...} messages; entries without
+#                  'field' receive the whole row and read named columns
+# The size/row-count figures in the comments below are the author's notes.
+contrain_datasets = [
+    #
+    # general instructs
+    #
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    #   meta-math/MetaMathQA 	395,000
+    #   openbmb/UltraInteract_sft 	288,579
+    #   HuggingFaceH4/ultrachat_200k 	207,865
+    #   microsoft/orca-math-word-problems-200k 	200,035
+    #   HuggingFaceH4/ultrafeedback_binarized 	187,405
+    #   theblackcat102/evol-codealpaca-v1 	111,272
+    #   Post-training-Data-Flywheel/AutoIF-instruct-61k 	61,492
+    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 	57,362
+    *[
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+    # arcee-ai/The-Tome - 4.58 GB, 1,752,473
+    # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
+    # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
+    # - jondurbin/airoboros-3.2
+    # - gardner/glaive-function-calling-v2-sharegpt
+    # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
+    # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
+    # - cognitivecomputations/ultrainteract_trajectories_sharegpt
+    # - cognitivecomputations/SystemChat-2.0
+    # - arcee-ai/qwen2-72b-magpie-en
+    *[
+        {'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+    # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
+    # Science:
+    #     antiven0m/physical-reasoning-dpoScience
+    #     LawalAfeez/science-dataset
+    # Social media:
+    #     Kyle1668/AG-Tweets
+    #     euclaise/reddit-instruct-curated
+    # General Knowledge:
+    #     NousResearch/CharacterCodex_Characters
+    #     jstet/quotes-500k_Famous_Quotes
+    #     FronkonGames/steam-games-dataset_Video_Games
+    #     totuta_youtube_subs_howto100M_HowTo
+    # Multi-lingual:
+    #     Amani27/massive_translation_dataset
+    #     udmurtNLP/udmurt-russian-english-labse
+    #     grosenthal/latin_english
+    #     msarmi9/korean-english-multitarget-ted-talks-task
+    #     HaiderSultanArc/MT-Urdu-English_Translate
+    #     Garsa3112/ChineseEnglishTranslationDataset
+    # Cooking:
+    #     andrewsiah/se_cooking_preference_sft
+    #     Hieu-Phamkaggle/food_recipes
+    # Writing:
+    #     shahules786/PoetryFoundationData
+    #     euclaise/writingprompts
+    #     qwedsacf/ivypanda-essaysEssay
+    # Medicine:
+    #     keivalya/MedQuad-MedicalQnADataset
+    #     nuvocare/MSD
+    # History:
+    #     ambrosfitz10k/history_data_v4
+    # Law:
+    #     dzunggg/legal-qa-v1
+    # Role-Play:
+    #     roleplay4/fun_CoupleRP
+    #     Undi95andrijdavid/roleplay-conversation-sharegpt
+    # News:
+    #     RealTimeData/bbc_news_alltime
+    # Coding: (rombodawg/code_bagel)
+    #     layoric/tiny-codes-alpaca
+    #     glaiveai/glaive-code-assistant-v3
+    #     ajibawa-2023/Code-290k-ShareGPT
+    #     chargoddard/commitpack-ft-instruct-rated
+    #     iamtarun/code_instructions_120k_alpaca
+    #     ise-uiuc/Magicoder-Evol-Instruct-110K
+    #     cognitivecomputations/dolphin-coder
+    #     nickrosh/Evol-Instruct-Code-80k-v1
+    #     coseal/CodeUltraFeedback_binarized
+    #     CyberNative/Code_Vulnerability_Security_DPO
+    # Math: (rombodawg/code_bagel)
+    #     TIGER-Lab/MathInstruct
+    # Function calling: (rombodawg/code_bagel)
+    #     glaiveai/glaive-function-calling-v2
+    # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
+    #     teknium/OpenHermes-2.5
+    # No 'field' here: the transform reads the instruction/input/output columns
+    # of the row and emits them as system/user/assistant turns.
+    *[
+        {'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 20}%]', 'transform': lambda r: [
+            {'role': 'system', 'content': r['instruction']},
+            {'role': 'user', 'content': r['input']},
+            {'role': 'assistant', 'content': r['output']},
+        ]}
+        for i in range(0, 100, 20)
+    ],
+    #
+    # tool/function calling
+    #
+    # 65.7 MB, 11,578
+    {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
+        {'role': roles_map[m['from']], 'content': m['value']}
+        for m in msgs
+    ]},
+    #
+    # agent
+    #
+    # 1.51 GB, 485,874
+    *[
+        {'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+    #
+    # general reasoning
+    #
+    *[
+        # 10.8 MB, 15,770
+        {'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'transform': lambda r: [
+            {'role': 'user', 'content': r['Prompt']},
+            {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
+        ]},
+    ],
+    #
+    # math reasoning
+    #
+    # 8.99 MB, 6,914
+    # 'generation' falls back to '' when it is empty/None; the other two columns are used as-is.
+    {'path': 'thesven/gsm8k-reasoning', 'transform': lambda r: [
+        {'role': 'user', 'content': r['question']},
+        {'role': 'assistant', 'content': (r['generation'] or '') + '\n' + r['answer'] + '\n' + r['short_answer']},
+    ]},
+    # 1.79 MB, 3,963
+    {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'transform': lambda r: [
+        {'role': 'user', 'content': r['informal_statement']},
+        {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
+    ]},
+    # 307 MB, 19,944
+    {'path': 'KingNish/reasoning-base-20k', 'transform': lambda r: [
+        {'role': 'user', 'content': r['user']},
+        {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
+    ]},
+    # 9.45 MB, 10,000
+    {'path': 'Aarushhh/math-reasoning-10k', 'transform': lambda r: [
+        {'role': 'user', 'content': r['problem']},
+        {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
+    ]},
+    #
+    # reflection
+    #
+    # 4.17 MB, 1,000
+    {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
+        {'role': 'system', 'content': r['system']},
+        {'role': 'user', 'content': r['prompt']},
+        {'role': 'assistant', 'content': r['response']},
+    ]},
+    # 12.4 MB, 3,000
+    {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
+        {'role': 'system', 'content': r['system']},
+        {'role': 'user', 'content': r['prompt']},
+        {'role': 'assistant', 'content': r['response']},
+    ]},
+    # 70.8 MB, 36,549
+    {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
+        {'role': 'system', 'content': r['system']},
+        {'role': 'user', 'content': r['prompt']},
+        {'role': 'assistant', 'content': r['response']},
+    ]},
+    # 30.6 MB, 25,391
+    # NOTE(review): the first element of r['system'] is passed through unchanged,
+    # presumably already a {'role', 'content'} message - confirm against the dataset schema.
+    {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
+        r['system'][0],
+        {'role': 'user', 'content': r['input']},
+        {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
+    ]},
+    #
+    # general instructs
+    #
+    # 971 MB, 484,570
+    # No transform: the 'messages' column is used as given.
+    {'path': 'HuggingFaceTB/smol-smoltalk', 'field': 'messages'},
+]

scripts/prepare_contrain_datasets.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from functools import partial
+from litgpt.tokenizer import Tokenizer
+from litdata import optimize, TokensLoader, StreamingDataset
+from transformers import AutoTokenizer
+from utils import tokenize_chat_fn
+from contrain_datasets import contrain_datasets
+#
+# optimize datasets
+#
+for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
+    # i = 0
+    # block_size = 8193
+    # chunk_size = block_size * 2000
+    chunk_size = block_size * subchunk_size
+    output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
+    outputs = optimize(
+        fn=partial(
+            tokenize_chat_fn,
+            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
+            tokenizer=Tokenizer('..'),
+        ),
+        inputs=contrain_datasets,
+        output_dir=output_dir,
+        chunk_size=chunk_size, # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
+        num_workers=32,
+        reorder_files=False,
+    )
+#
+# total number of chunks in datasets
+#
+for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
+    # i = 0
+    # block_size = 8193
+    # chunk_size = block_size * 2000
+    chunk_size = block_size * subchunk_size
+    input_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
+    dataset = StreamingDataset(
+        input_dir=input_dir,
+        item_loader=TokensLoader(block_size=block_size),
+    )
+    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')

scripts/prepare_pretrain_datasets.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from functools import partial
+from litgpt.tokenizer import Tokenizer
+from litdata import optimize, TokensLoader, StreamingDataset
+from utils import tokenize_text_fn
+from pretrain_datasets import pretrain_datasets
+#
+# optimize datasets
+#
+for i, (b, e) in enumerate([(0, 513), (512, 1025), (1024, 2049), (2048, 4097), (4096, 8192), (8192, 1024 ** 3)]):
+    if e <= 8192:
+        block_size = (64 * 1024 * 1024) // (4 * e)
+        chunk_size = e * block_size
+    else:
+        block_size = 2048
+        chunk_size = b * block_size
+    output_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'
+    outputs = optimize(
+        fn=partial(tokenize_text_fn, tokenizer=Tokenizer('..'), min_len=b, max_len=e),
+        inputs=pretrain_datasets,
+        output_dir=output_dir,
+        chunk_size=chunk_size, # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
+        num_workers=32,
+        reorder_files=False,
+    )
+#
+# total number of chunks in datasets
+#
+for i, (b, e) in enumerate([(0, 513), (512, 1025), (1024, 2049), (2048, 4097), (4096, 8192), (8192, 1024 ** 3)]):
+    if e <= 8192:
+        block_size = (64 * 1024 * 1024) // (4 * e)
+        chunk_size = e * block_size
+    else:
+        block_size = 2048
+        chunk_size = b * block_size
+    input_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'
+    dataset = StreamingDataset(
+        input_dir=input_dir,
+        item_loader=TokensLoader(block_size=block_size),
+    )
+    print(f'{i=}, {b=}, {e=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {e * len(dataset)=}')

scripts/pretrain-model-0.yaml ADDED Viewed

	@@ -0,0 +1,156 @@

+# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
+# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+# ``model_config``. (type: Optional[str], default: null)
+model_name: "Llama-3.2-1B"
+# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+# ``model_name``. (type: Optional[Config], default: null)
+model_config:
+  padded_vocab_size: 65536
+  vocab_size: 65536
+  block_size: 131072
+  n_layer: 32
+  n_head: 16
+  head_size: 64
+  n_embd: 768
+  n_query_groups: 4
+  rotary_percentage: 1.0
+  parallel_residual: false
+  shared_attention_norm: false
+  bias: false
+  # attn_bias: true # qwen 2.5
+  norm_class_name: "RMSNorm"
+  mlp_class_name: "LLaMAMLP"
+  intermediate_size: 2048
+  # rope_base: 500000 # llama 3.2
+  rope_base: 1000000 # qwen 2.5
+  rope_adjustments: # llama 3.2
+    factor: 32.0
+    low_freq_factor: 1.0
+    high_freq_factor: 4.0
+    original_max_seq_len: 8192
+# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+out_dir: "../out/pretrain-0/"
+# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+# precision: bf16-mixed
+precision: bf16-true
+# Optional path to a checkpoint directory to initialize the model from.
+# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+initial_checkpoint_dir:
+# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+# (type: Union[bool, Literal["auto"], Path], default: False)
+resume: "auto"
+# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+data:
+  class_path: LitData
+  init_args:
+    data_path: "../pretrain-data-0-0-513-32704-16777152/"
+    num_workers: 32
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 100
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+  global_batch_size: 512
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 24
+  # Number of iterations with learning rate warmup active (type: int, default: 2000)
+  lr_warmup_steps: 0
+  # Number of epochs to train on (type: Optional[int], default: null)
+  epochs:
+  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+  max_tokens: 1945266624 # 32_704 * 59_481
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 513
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+  tie_embeddings: true
+  #   (type: Optional[float], default: 1.0)
+  max_norm: 1.0
+  #   (type: float, default: 4e-05)
+  min_lr: 1e-05
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+  interval: 100
+  # Number of tokens to generate (type: Optional[int], default: null)
+  max_new_tokens:
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+# Optimizer-related arguments
+optimizer:
+  class_path: grokadamw.GrokAdamW
+  init_args:
+    # (type: float, default: 0.001)
+    lr: 1e-04
+    # (type: float, default: 0.01)
+    weight_decay: 1e-2
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.999
+# optimizer:
+#   class_path: sophia_opt.SophiaG
+#
+#   init_args:
+#     lr: 4e-4
+#     betas:
+#       - 0.965
+#       - 0.99
+#     rho: 0.01
+#     weight_decay: 1e-1
+# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+devices: auto
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+# modules require this. (type: Optional[Path], default: null)
+tokenizer_dir: "../"
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+logger_name: "wandb"
+# The random seed to use for reproducibility. (type: int, default: 42)
+seed: 23

scripts/pretrain_datasets.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# Text sources for pretraining. prepare_pretrain_datasets.py imports this list
+# and passes it as `inputs` to litdata `optimize`.
+# Each entry is a dict with:
+#   'path'   - dataset identifier (Hugging Face-style repo id)
+#   'split'  - optional split expression; the large sets are cut into 5% slices
+#              so that every slice becomes its own entry
+#   'format' - either a callable that takes a row and returns its text, or a
+#              str template with {column} placeholders (how the template is
+#              filled is decided by the consumer in utils.py, not shown here)
+# The size/row-count figures in the comments below are the author's notes.
+pretrain_datasets = [
+    #
+    # multilingual
+    #
+    # 3.17 GB, 2,226,907
+    *[
+        {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # 1.64 GB, 1,001,000
+    *[
+        {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # general knowledge
+    #
+    # 65.1 MB, 7,819
+    {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
+    # 135 MB, 1,795
+    {'path': 'open-phi/textbooks', 'format': lambda n: n['markdown']},
+    # 631 MB, 111,048
+    {'path': 'open-phi/programming_books_llama', 'format': lambda n: n['markdown']},
+    #
+    # misc
+    #
+    # 472 KB, 5,034
+    {'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
+    #
+    # math
+    #
+    # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
+    *[
+        {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 5}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # stem
+    #
+    # 1.44 GB, 63,357
+    # the same dataset is listed twice: once for the abstract column, once for the markdown column
+    *[
+        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
+        for i in range(0, 100, 5)
+    ],
+    *[
+        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # code
+    #
+    # 7.81 GB, ~2,804,025
+    *[
+        {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 5}%]', 'format': '{input} {output}'}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # general knowledge
+    #
+    # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
+    *[
+        {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # the validation and test splits are included in the training text as well
+    {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+    {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+]

scripts/requirements.in ADDED Viewed

	@@ -0,0 +1,22 @@

+# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+torch>=2.2.0,<=2.4.1
+numpy<2.0
+tqdm
+datasets
+jinja2
+transformers
+wandb
+# litgpt[all]
+litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
+# litgpt @ git+https://github.com/Lightning-AI/litgpt.git
+# litdata
+# litdata @ git+https://github.com/Lightning-AI/litdata.git
+lm_eval[ifeval,math]
+grokadamw
+# bitsandbytes
+# pyzstd
+# zstd
+Pillow
+sophia-opt

scripts/tokenizer_datasets.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Text sources for tokenizer training; train_tokenizer.py imports this list.
+# The entries use the same shape as those in pretrain_datasets.py:
+#   'path'   - dataset identifier (Hugging Face-style repo id)
+#   'split'  - optional split expression; the large sets are cut into 5% slices
+#   'format' - either a callable that takes a row and returns its text, or a
+#              str template with {column} placeholders (how the template is
+#              filled is decided by the consumer in utils.py, not shown here)
+# The size/row-count figures in the comments below are the author's notes.
+tokenizer_datasets = [
+    #
+    # multilingual
+    #
+    # 3.17 GB, 2,226,907
+    *[
+        {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # 1.64 GB, 1,001,000
+    *[
+        {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # stem
+    #
+    # 1.44 GB, 63,357
+    # the same dataset is listed twice: once for the abstract column, once for the markdown column
+    *[
+        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
+        for i in range(0, 100, 5)
+    ],
+    *[
+        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # code
+    #
+    # 7.81 GB, ~2,804,025
+    *[
+        {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 5}%]', 'format': '{input} {output}'}
+        for i in range(0, 100, 5)
+    ],
+    #
+    # general knowledge
+    #
+    # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
+    *[
+        {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
+    {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
+]

scripts/train_tokenizer.py ADDED Viewed

	@@ -0,0 +1,252 @@

+from transformers import PreTrainedTokenizerFast
+from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from utils import batch_text_iterator
+from tokenizer_datasets import tokenizer_datasets
+#
+# special_tokens
+#
+bos_token = '<|begin_of_text|>'
+eos_token = '<|end_of_text|>'
+def _xml_tags(*names):
+    """Return ['<a>', '</a>', '<b>', '</b>', ...] for the given tag names."""
+    return [tag for name in names for tag in (f'<{name}>', f'</{name}>')]
+def _json_keys(*names):
+    """Return every name wrapped in double quotes, e.g. type -> "type"."""
+    return [f'"{name}"' for name in names]
+# The order of this list is significant: it is passed as-is to the trainer.
+special_tokens = [
+    bos_token,
+    eos_token,
+    '<|start_header_id|>',
+    '<|end_header_id|>',
+    '<|eom_id|>',
+    '<|eot_id|>',
+    # chat roles
+    'system',
+    'user',
+    'assistant',
+    # tool/function calling
+    *_xml_tags('tools', 'tool', 'tool_call', 'tool_response'),
+    *_json_keys('name', 'arguments'),
+    #
+    # JSON Schema
+    #
+    # General Metadata Keywords
+    *_json_keys(
+        '$schema', '$id', '$ref', '$defs', '$anchor',
+        '$dynamicAnchor', '$dynamicRef', '$vocabulary', '$comment',
+    ),
+    # Data Types
+    *_json_keys('null', 'boolean', 'object', 'array', 'number', 'string', 'integer'),
+    # Validation Keywords
+    *_json_keys(
+        'type', 'enum', 'const',
+        'multipleOf', 'maximum', 'exclusiveMaximum', 'minimum', 'exclusiveMinimum',
+        'maxLength', 'minLength', 'pattern',
+        'additionalItems', 'items', 'prefixItems', 'contains',
+        'maxItems', 'minItems', 'uniqueItems',
+        'maxProperties', 'minProperties', 'required',
+        'properties', 'patternProperties', 'additionalProperties',
+        'dependentRequired', 'dependentSchemas', 'propertyNames',
+    ),
+    # Conditional Keywords
+    *_json_keys('if', 'then', 'else', 'allOf', 'anyOf', 'oneOf', 'not'),
+    # Additional Keywords for Evaluation Control
+    *_json_keys('unevaluatedItems', 'unevaluatedProperties'),
+    # Informational Keywords
+    *_json_keys('title', 'description', 'default', 'deprecated', 'readOnly', 'writeOnly', 'examples'),
+    # Content-Related Keywords
+    *_json_keys('contentEncoding', 'contentMediaType', 'contentSchema'),
+    # Additional Keywords: "next" (linked/next items), "value" (value of a property or item)
+    *_json_keys('next', 'value'),
+    # misc
+    *_xml_tags('input', 'output', 'query', 'key', 'value', 'text', 'code', 'image', 'file'),
+    # qa
+    *_xml_tags('question', 'answer'),
+    # thought
+    *_xml_tags('thought', 'plan', 'vote', 'passage'),
+    # reasoning
+    *_xml_tags('reasoning', 'acting', 'action', 'observation', 'claim'),
+    # reflection
+    *_xml_tags('thinking', 'reflection', 'step'),
+    # graph ('value' is not repeated here; it is already listed under misc)
+    *_xml_tags('graph', 'edge', 'source', 'destination', 'relation'),
+]
+# Pad the named tokens with reserved placeholders up to 256 entries ...
+special_tokens.extend(
+    f'<|reserved_special_token_{n}|>' for n in range(256 - len(special_tokens))
+)
+# ... then append one token per byte value, <0x00> through <0xFF>.
+special_tokens.extend(f'<0x{byte:02X}>' for byte in range(256))
+#
+# BPE Tokenizer
+#
+# BPE model created without an unknown token and with byte_fallback enabled.
+bpe = BPE(unk_token=None, byte_fallback=True)
+tokenizer = Tokenizer(bpe)
+# normalizer: explicitly none
+tokenizer.normalizer = None
+# pre-tokenizer
+# NOTE(review): the three ByteLevel components below are configured with
+# different add_prefix_space / trim_offsets values - confirm this is intended.
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
+# post-processor
+tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)
+# decoder
+tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
+#
+# BPE Trainer
+#
+# special_tokens is the list built above; it is handed to the trainer unchanged.
+trainer = BpeTrainer(
+    vocab_size=131072, # 128 * 1024
+    min_frequency=3,
+    special_tokens=special_tokens,
+    max_token_length=16,
+)
+# Train on the text stream produced from every entry of tokenizer_datasets.
+tokenizer.train_from_iterator(
+    batch_text_iterator(tokenizer_datasets),
+    trainer,
+)
+# Persist the trained tokenizer: the full definition as ../tokenizer.json, plus
+# the model's own files in the parent directory.
+tokenizer.save('../tokenizer.json')
+tokenizer.model.save('../')
+#
+# PreTrainedTokenizerFast
+#
+# Jinja chat template, assembled from its fragments without any separator:
+# BOS first, then each message as
+#   <|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>
+# and finally either an opened assistant header (add_generation_prompt) or EOS.
+_chat_template_parts = [
+    "{{ bos_token }}",
+    "{% for message in messages %}",
+    "{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}",
+    "{% endfor %}",
+    "{% if add_generation_prompt %}",
+    "{{ '<|start_header_id|>assistant<|end_header_id|>' }}",
+    "{% else %}",
+    "{{ eos_token }}",
+    "{% endif %}",
+]
+CHAT_TEMPLATE = ''.join(_chat_template_parts)
+# Wrap the trained tokenizer for use through the transformers API.
+_fast_tokenizer_options = {
+    'tokenizer_object': tokenizer,
+    'chat_template': CHAT_TEMPLATE,
+    'bos_token': bos_token,
+    'eos_token': eos_token,
+    'clean_up_tokenization_spaces': False,
+}
+fast_tokenizer = PreTrainedTokenizerFast(**_fast_tokenizer_options)
+fast_tokenizer.save_pretrained('../')

scripts/utils.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import gc
+from typing import Union, Optional, Iterator, Callable
+import torch
+from datasets import load_dataset
+from litgpt.tokenizer import Tokenizer
+from transformers import AutoTokenizer
+def _batch_text_iterator(path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         format: Optional[Callable|str]=None) -> Iterator[str]:
+    """Load one dataset and yield one text per row.
+
+    All arguments except ``format`` are forwarded to ``load_dataset`` (which is
+    always called with ``trust_remote_code=True``).
+
+    Args:
+        format: Required despite the ``None`` default. Either a callable that
+            receives the row and returns the text, or a ``str.format`` template
+            filled with the row's fields (``format.format(**row)``).
+
+    Yields:
+        The text produced for each row, in dataset order.
+
+    Raises:
+        TypeError: If ``format`` is neither a string nor a callable. Raised on
+            the first iteration, before the dataset is loaded.
+    """
+    # Explicit check instead of ``assert`` so it is not stripped under ``-O``.
+    if not (isinstance(format, str) or callable(format)):
+        raise TypeError(f'format must be a str template or a callable, got {format!r}')
+    dataset = load_dataset(path=path,
+                           name=name,
+                           data_dir=data_dir,
+                           data_files=data_files,
+                           keep_in_memory=keep_in_memory,
+                           revision=revision,
+                           split=split,
+                           trust_remote_code=True,
+                           num_proc=num_proc)
+    try:
+        if callable(format):
+            for row in dataset:
+                yield format(row)
+        else:
+            for row in dataset:
+                yield format.format(**row)
+    finally:
+        # Runs on exhaustion, on early close of the generator, and on errors,
+        # so the dataset is always released before the next one is loaded.
+        del dataset
+        gc.collect()
+def _batch_chat_iterator(path: str,
+                         name: Optional[str]=None,
+                         data_dir: Optional[str]=None,
+                         data_files: Optional[str]=None,
+                         keep_in_memory: bool=False,
+                         revision: Optional[str]=None,
+                         split: str='train',
+                         num_proc: Optional[int]=None,
+                         field: Optional[str]=None,
+                         transform: Optional[Callable]=None) -> Iterator[list[dict[str, str]]]:
+    """Load one dataset and yield one list of chat messages per row.
+
+    All arguments except ``field`` and ``transform`` are forwarded to
+    ``load_dataset`` (which is always called with ``trust_remote_code=True``).
+
+    Args:
+        field: Row key holding the messages. When empty/None the whole row is
+            handed to ``transform``.
+        transform: Optional callable applied to ``row[field]`` (or to the row
+            when no field is given); its return value is yielded.
+
+    Yields:
+        The messages obtained for each row, in dataset order.
+
+    Raises:
+        ValueError: If neither ``field`` nor a callable ``transform`` is given.
+            Raised on the first iteration, before the dataset is loaded.
+    """
+    has_transform = callable(transform)
+    # Fail before the (potentially large) dataset is loaded.
+    if not has_transform and not field:
+        raise ValueError(f'either field or a callable transform is required, got field={field!r}')
+    dataset = load_dataset(path=path,
+                           name=name,
+                           data_dir=data_dir,
+                           data_files=data_files,
+                           keep_in_memory=keep_in_memory,
+                           revision=revision,
+                           split=split,
+                           trust_remote_code=True,
+                           num_proc=num_proc)
+    try:
+        for row in dataset:
+            messages = row[field] if field else row
+            if has_transform:
+                messages = transform(messages)
+            yield messages
+    finally:
+        # Runs on exhaustion, on early close of the generator, and on errors,
+        # so the dataset is always released before the next one is loaded.
+        del dataset
+        gc.collect()
+def batch_text_iterator(dataset_config: Union[list, dict]) -> Iterator[str]:
+    """Yield the texts of one dataset config (dict) or of several in order (list).
+
+    Each config dict is expanded into keyword arguments of
+    ``_batch_text_iterator``.
+    """
+    assert isinstance(dataset_config, (dict, list)), dataset_config
+    # Treat a single config as a one-element list so both shapes share a loop.
+    configs = [dataset_config] if isinstance(dataset_config, dict) else dataset_config
+    for config in configs:
+        yield from _batch_text_iterator(**config)
+def batch_chat_iterator(dataset_config: Union[list, dict]) -> Iterator[list[dict[str, str]]]:
+    """Yield the chat messages of one dataset config (dict) or of several in order (list).
+
+    Each config dict is expanded into keyword arguments of
+    ``_batch_chat_iterator``.
+    """
+    assert isinstance(dataset_config, (dict, list)), dataset_config
+    # Treat a single config as a one-element list so both shapes share a loop.
+    configs = [dataset_config] if isinstance(dataset_config, dict) else dataset_config
+    for config in configs:
+        yield from _batch_chat_iterator(**config)
+def tokenize_text_fn(dataset_config: list, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
+    """Tokenize every text of ``dataset_config`` and yield those within the length bounds.
+
+    Args:
+        dataset_config: Dataset config(s) passed to ``batch_text_iterator``.
+        tokenizer: Object whose ``encode(text, bos=False, eos=True)`` returns
+            the token ids of a text.
+        min_len: Inclusive lower bound on the number of ids; ``None`` disables it.
+        max_len: Inclusive upper bound on the number of ids; ``None`` disables it.
+
+    Yields:
+        The encoded ids of each text that satisfies both bounds, exactly once.
+    """
+    for text in batch_text_iterator(dataset_config):
+        text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
+        n_ids = len(text_ids)
+        # The bounds are checked per sample and never reassigned: overwriting
+        # max_len with the first sample's length would filter out every longer
+        # sample that follows, and would also emit the first sample twice.
+        if min_len is not None and n_ids < min_len:
+            continue
+        if max_len is not None and n_ids > max_len:
+            continue
+        yield text_ids
+def tokenize_chat_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
+    """Render every chat of ``dataset_config`` to text, tokenize it and yield those within the length bounds.
+
+    Args:
+        dataset_config: Dataset config(s) passed to ``batch_chat_iterator``.
+        hf_tokenizer: Object whose ``apply_chat_template(messages, tokenize=False)``
+            renders the messages to a single string.
+        tokenizer: Object whose ``encode(text, bos=False, eos=False)`` returns
+            the token ids of that string.
+        min_len: Inclusive lower bound on the number of ids; ``None`` disables it.
+        max_len: Inclusive upper bound on the number of ids; ``None`` disables it.
+
+    Yields:
+        The encoded ids of each chat that satisfies both bounds, exactly once.
+    """
+    for messages in batch_chat_iterator(dataset_config):
+        # bos/eos are not added by encode(); the text comes fully formed from
+        # the chat template.
+        text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
+        text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
+        n_ids = len(text_ids)
+        # The bounds are checked per sample and never reassigned: overwriting
+        # max_len with the first sample's length would filter out every longer
+        # sample that follows, and would also emit the first sample twice.
+        if min_len is not None and n_ids < min_len:
+            continue
+        if max_len is not None and n_ids > max_len:
+            continue
+        yield text_ids