mtasic85 committed

Commit 5e3ba51 · 1 Parent(s): aab4d25

train tokenizer 128k
TRAIN.md ADDED
@@ -0,0 +1,113 @@
1
+ # Train
2
+
3
+ ## Environment
4
+
5
+ ```bash
6
+ cd scripts
7
+ python -m venv venv
8
+ source venv/bin/activate
9
+ pip install -U -r requirements.in
10
+ ```
11
+
12
+ ## Train Tokenizer
13
+
14
+ ```bash
15
+ time python -B train_tokenizer.py
16
+ ```
17
+
18
+ ## Pretrain
19
+
20
+ ```bash
21
+ python -B prepare_pretrain_datasets.py
22
+ ```
23
+
24
+ ```bash
25
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-0.yaml
26
+ litgpt convert_pretrained_checkpoint ../out/pretrain-0/final/ ../out/pretrain-0-final-checkpoint
27
+
28
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-1.yaml
29
+ litgpt convert_pretrained_checkpoint ../out/pretrain-1/final/ ../out/pretrain-1-final-checkpoint
30
+
31
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-2.yaml
32
+ litgpt convert_pretrained_checkpoint ../out/pretrain-2/final/ ../out/pretrain-2-final-checkpoint
33
+
34
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-3.yaml
35
+ litgpt convert_pretrained_checkpoint ../out/pretrain-3/final/ ../out/pretrain-3-final-checkpoint
36
+
37
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-4.yaml
38
+ litgpt convert_pretrained_checkpoint ../out/pretrain-4/final/ ../out/pretrain-4-final-checkpoint
39
+
40
+ # NOTE: unused
41
+ # CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-5.yaml
42
+ # litgpt convert_pretrained_checkpoint ../out/pretrain-5/final/ ../out/pretrain-5-final-checkpoint
43
+ ```
44
+
45
+ ### Continued Pretraining
46
+
47
+ ```bash
48
+ python -B prepare_contrain_datasets.py
49
+ ```
50
+
51
+ ```bash
52
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-0.yaml
53
+ litgpt convert_pretrained_checkpoint ../out/contrain-0/final/ ../out/contrain-0-final-checkpoint
54
+
55
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-1.yaml
56
+ litgpt convert_pretrained_checkpoint ../out/contrain-1/final/ ../out/contrain-1-final-checkpoint
57
+ ```
58
+
59
+ ## Chat with Pretrained Model
60
+
61
+ ```bash
62
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-0/final/
63
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-1/final/
64
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-2/final/
65
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-3/final/
66
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-4/final/
67
+ # CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat out/pretrain-5/final/
68
+ ```
69
+
70
+ <!-- OLD -->
71
+
72
+ ## Model
73
+
74
+ ### Pretraining
75
+
76
+ ```bash
77
+ litgpt pretrain --config ./pretrain-model.yaml
78
+ litgpt convert_from_litgpt out/pretrain/final/ out/converted_pretrain
79
+ cp config.json out/pretrain/final/
80
+ cp config.json out/converted_pretrain/
81
+ ```
82
+
83
+ ```python
84
+ import torch
85
+ from safetensors.torch import save_file
86
+
87
+ state_dict = torch.load('out/converted_pretrain/model.pth', map_location='cpu')
88
+ save_file(state_dict, 'out/converted_pretrain/model.safetensors')
89
+ ```
90
+
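If the `.pth` → `.safetensors` conversion needs a sanity check, a minimal sketch (assuming the paths produced by the block above) reloads both files and compares the tensors:

```python
# Optional sanity check: reload both converted files and confirm they carry
# the same tensors. Paths are the ones written by the snippet above.
import torch
from safetensors.torch import load_file

pth_state = torch.load('out/converted_pretrain/model.pth', map_location='cpu')
sft_state = load_file('out/converted_pretrain/model.safetensors')

# same parameter names in both files
assert pth_state.keys() == sft_state.keys()

# identical values for every tensor
for name, tensor in pth_state.items():
    assert torch.equal(tensor, sft_state[name]), name

print(f'ok: {len(sft_state)} tensors match')
```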
91
+ ### Continued Pretraining
92
+
93
+ ```bash
94
+ litgpt convert_pretrained_checkpoint out/pretrain/final/ out/pretrain_checkpoint/final/
95
+ cp config.json out/pretrain_checkpoint/final/
96
+
97
+ litgpt pretrain --config ./contrain-model.yaml
98
+ litgpt convert_from_litgpt out/contrain/final/ out/converted_contrain
99
+ cp config.json out/converted_contrain/
100
+ ```
101
+
102
+ ```python
103
+ import torch
104
+ from safetensors.torch import save_file
105
+
106
+ state_dict = torch.load('out/converted_contrain/model.pth', map_location='cpu')
107
+ save_file(state_dict, 'out/converted_contrain/model.safetensors')
108
+ ```
109
+
110
+ ```bash
111
+ cp out/converted_contrain/model.pth ./
112
+ cp out/converted_contrain/model.safetensors ./
113
+ ```
config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "_name_or_path": "tangledgroup/tangled-llama-j-128k-v0.1",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "eos_token_id": [
10
+ 1,
11
+ 4,
12
+ 5
13
+ ],
14
+ "head_dim": 64,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 768,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 2048,
19
+ "max_position_embeddings": 131072,
20
+ "mlp_bias": false,
21
+ "model_type": "llama",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 32,
24
+ "num_key_value_heads": 4,
25
+ "pretraining_tp": 1,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "factor": 32.0,
29
+ "high_freq_factor": 4.0,
30
+ "low_freq_factor": 1.0,
31
+ "original_max_position_embeddings": 8192,
32
+ "rope_type": "llama3"
33
+ },
34
+ "rope_theta": 1000000.0,
35
+ "tie_word_embeddings": true,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.45.0.dev0",
38
+ "use_cache": true,
39
+ "vocab_size": 65536
40
+ }
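For reference, this config can be loaded back with `transformers` to inspect the fields the litgpt conversion is expected to produce. A minimal sketch, assuming `transformers` is installed and `config.json` sits in the current directory:

```python
# Load the HF config above and print the fields that define the architecture.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('.')  # reads ./config.json

print(config.model_type)                   # llama
print(config.hidden_size, config.head_dim) # 768, 64 (head_dim is set explicitly)
print(config.num_attention_heads,          # 16 query heads ...
      config.num_key_value_heads)          # ... grouped over 4 KV heads (GQA)
print(config.rope_scaling)                 # llama3-style scaling from 8192 tokens
print(config.max_position_embeddings)      # 131072
```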
misc/logo.png ADDED

Git LFS Details

  • SHA256: 591bee6fa56315a84eeec47c5e04ff6331f842c773ce50b7a59e508b4d2904cf
  • Pointer size: 131 Bytes
  • Size of remote file: 684 kB
scripts/contrain-model-0.yaml ADDED
@@ -0,0 +1,156 @@
1
+ # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
2
+
3
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
4
+ # ``model_config``. (type: Optional[str], default: null)
5
+ model_name: "Llama-3.2-1B"
6
+
7
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
8
+ # ``model_name``. (type: Optional[Config], default: null)
9
+ model_config:
10
+ padded_vocab_size: 65536
11
+ vocab_size: 65536
12
+ block_size: 131072
13
+ n_layer: 32
14
+ n_head: 16
15
+ head_size: 64
16
+ n_embd: 768
17
+ n_query_groups: 4
18
+ rotary_percentage: 1.0
19
+ parallel_residual: false
20
+ shared_attention_norm: false
21
+ bias: false
22
+ # attn_bias: true # qwen 2.5
23
+ norm_class_name: "RMSNorm"
24
+ mlp_class_name: "LLaMAMLP"
25
+ intermediate_size: 2048
26
+ # rope_base: 500000 # llama 3.2
27
+ rope_base: 1000000 # qwen 2.5
28
+ rope_adjustments: # llama 3.2
29
+ factor: 32.0
30
+ low_freq_factor: 1.0
31
+ high_freq_factor: 4.0
32
+ original_max_seq_len: 8192
33
+
34
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
35
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
36
+ out_dir: "../out/contrain-0/"
37
+
38
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
39
+ # precision: bf16-mixed
40
+ precision: bf16-true
41
+
42
+ # Optional path to a checkpoint directory to initialize the model from.
43
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
44
+ initial_checkpoint_dir: "../out/pretrain-4-final-checkpoint/"
45
+
46
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
47
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
48
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
49
+ # (type: Union[bool, Literal["auto"], Path], default: False)
50
+ # resume:
51
+
52
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
53
+ data:
54
+ class_path: LitData
55
+
56
+ init_args:
57
+ data_path: "../contrain-data-0-4097-16388000/"
58
+ num_workers: 32
59
+
60
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
61
+ train:
62
+ # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
63
+ save_interval: 100
64
+
65
+ # Number of iterations between logging calls (type: int, default: 1)
66
+ log_interval: 1
67
+
68
+ # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
69
+ global_batch_size: 512
70
+
71
+ # Number of samples per data-parallel rank (type: int, default: 4)
72
+ micro_batch_size: 3
73
+
74
+ # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
+ lr_warmup_steps: 0
76
+
77
+ # Number of epochs to train on (type: Optional[int], default: null)
78
+ epochs:
79
+
80
+ # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
81
+ max_tokens: 1527816367 # 4_097 * 372_911
82
+
83
+ # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
84
+ max_steps:
85
+
86
+ # Limits the length of samples. Off by default (type: Optional[int], default: null)
87
+ max_seq_length: 4097
88
+
89
+ # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
90
+ tie_embeddings: true
91
+
92
+ # (type: Optional[float], default: 1.0)
93
+ max_norm: 1.0
94
+
95
+ # (type: float, default: 4e-05)
96
+ min_lr: 1e-06
97
+
98
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
99
+ eval:
100
+ # Number of optimizer steps between evaluation calls (type: int, default: 1000)
101
+ interval: 25
102
+
103
+ # Number of tokens to generate (type: Optional[int], default: null)
104
+ max_new_tokens:
105
+
106
+ # Number of iterations (type: int, default: 100)
107
+ max_iters: 100
108
+
109
+ # Whether to evaluate on the validation set at the beginning of the training
110
+ initial_validation: false
111
+
112
+ # Whether to evaluate on the validation set at the end of the training
113
+ final_validation: true
114
+
115
+ # Optimizer-related arguments
116
+ optimizer:
117
+ class_path: grokadamw.GrokAdamW
118
+
119
+ init_args:
120
+ # (type: float, default: 0.001)
121
+ lr: 1e-05
122
+
123
+ # (type: float, default: 0.01)
124
+ weight_decay: 1e-2
125
+
126
+ # (type: tuple, default: (0.9,0.999))
127
+ betas:
128
+ - 0.9
129
+ - 0.999
130
+
131
+ # optimizer:
132
+ # class_path: sophia_opt.SophiaG
133
+ #
134
+ # init_args:
135
+ # lr: 4e-4
136
+ # betas:
137
+ # - 0.965
138
+ # - 0.99
139
+ # rho: 0.01
140
+ # weight_decay: 1e-1
141
+
142
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
143
+ devices: auto
144
+
145
+ # How many nodes to use. (type: int, default: 1)
146
+ num_nodes: 1
147
+
148
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
149
+ # modules require this. (type: Optional[Path], default: null)
150
+ tokenizer_dir: "../"
151
+
152
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
153
+ logger_name: "wandb"
154
+
155
+ # The random seed to use for reproducibility. (type: int, default: 42)
156
+ seed: 23
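The `max_tokens` value above appears to be derived as the number of blocks in the optimized LitData directory times the block size (the comment spells it out as 4_097 * 372_911). A minimal sketch of that arithmetic, assuming `../contrain-data-0-4097-16388000/` was produced by `prepare_contrain_datasets.py`:

```python
# Recompute max_tokens from the optimized dataset, mirroring the check that
# prepare_contrain_datasets.py already prints.
from litdata import StreamingDataset, TokensLoader

block_size = 4097
dataset = StreamingDataset(
    input_dir='../contrain-data-0-4097-16388000/',
    item_loader=TokensLoader(block_size=block_size),
)

max_tokens = len(dataset) * block_size
print(max_tokens)  # expected: 1527816367 for 372_911 blocks of 4_097 tokens
```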
scripts/contrain_datasets.py ADDED
@@ -0,0 +1,204 @@
1
+ roles_map = {
2
+ 'system': 'system',
3
+ 'user': 'user',
4
+ 'human': 'user',
5
+ 'assistant': 'assistant',
6
+ 'gpt': 'assistant',
7
+ 'AI': 'assistant',
8
+ }
9
+
10
+
11
+ contrain_datasets = [
12
+ #
13
+ # general instructs
14
+ #
15
+ # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
16
+ # meta-math/MetaMathQA 395,000
17
+ # openbmb/UltraInteract_sft 288,579
18
+ # HuggingFaceH4/ultrachat_200k 207,865
19
+ # microsoft/orca-math-word-problems-200k 200,035
20
+ # HuggingFaceH4/ultrafeedback_binarized 187,405
21
+ # theblackcat102/evol-codealpaca-v1 111,272
22
+ # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
23
+ # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
24
+ *[
25
+ {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
26
+ {'role': roles_map[m['from']], 'content': m['value']}
27
+ for m in msgs
28
+ ]}
29
+ for i in range(0, 100, 20)
30
+ ],
31
+ # arcee-ai/The-Tome - 4.58 GB, 1,752,473
32
+ # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
33
+ # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
34
+ # - jondurbin/airoboros-3.2
35
+ # - gardner/glaive-function-calling-v2-sharegpt
36
+ # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
37
+ # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
38
+ # - cognitivecomputations/ultrainteract_trajectories_sharegpt
39
+ # - cognitivecomputations/SystemChat-2.0
40
+ # - arcee-ai/qwen2-72b-magpie-en
41
+ *[
42
+ {'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
43
+ {'role': roles_map[m['from']], 'content': m['value']}
44
+ for m in msgs
45
+ ]}
46
+ for i in range(0, 100, 20)
47
+ ],
48
+ # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
49
+ # Science:
50
+ # antiven0m/physical-reasoning-dpoScience
51
+ # LawalAfeez/science-dataset
52
+ # Social media:
53
+ # Kyle1668/AG-Tweets
54
+ # euclaise/reddit-instruct-curated
55
+ # General Knowledge:
56
+ # NousResearch/CharacterCodex_Characters
57
+ # jstet/quotes-500k_Famous_Quotes
58
+ # FronkonGames/steam-games-dataset_Video_Games
59
+ # totuta_youtube_subs_howto100M_HowTo
60
+ # Multi-lingual:
61
+ # Amani27/massive_translation_dataset
62
+ # udmurtNLP/udmurt-russian-english-labse
63
+ # grosenthal/latin_english
64
+ # msarmi9/korean-english-multitarget-ted-talks-task
65
+ # HaiderSultanArc/MT-Urdu-English_Translate
66
+ # Garsa3112/ChineseEnglishTranslationDataset
67
+ # Cooking:
68
+ # andrewsiah/se_cooking_preference_sft
69
+ # Hieu-Phamkaggle/food_recipes
70
+ # Writing:
71
+ # shahules786/PoetryFoundationData
72
+ # euclaise/writingprompts
73
+ # qwedsacf/ivypanda-essaysEssay
74
+ # Medicine:
75
+ # keivalya/MedQuad-MedicalQnADataset
76
+ # nuvocare/MSD
77
+ # History:
78
+ # ambrosfitz10k/history_data_v4
79
+ # Law:
80
+ # dzunggg/legal-qa-v1
81
+ # Role-Play:
82
+ # roleplay4/fun_CoupleRP
83
+ # Undi95andrijdavid/roleplay-conversation-sharegpt
84
+ # News:
85
+ # RealTimeData/bbc_news_alltime
86
+ # Coding: (rombodawg/code_bagel)
87
+ # layoric/tiny-codes-alpaca
88
+ # glaiveai/glaive-code-assistant-v3
89
+ # ajibawa-2023/Code-290k-ShareGPT
90
+ # chargoddard/commitpack-ft-instruct-rated
91
+ # iamtarun/code_instructions_120k_alpaca
92
+ # ise-uiuc/Magicoder-Evol-Instruct-110K
93
+ # cognitivecomputations/dolphin-coder
94
+ # nickrosh/Evol-Instruct-Code-80k-v1
95
+ # coseal/CodeUltraFeedback_binarized
96
+ # CyberNative/Code_Vulnerability_Security_DPO
97
+ # Math: (rombodawg/code_bagel)
98
+ # TIGER-Lab/MathInstruct
99
+ # Function calling: (rombodawg/code_bagel)
100
+ # glaiveai/glaive-function-calling-v2
101
+ # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
102
+ # teknium/OpenHermes-2.5
103
+ *[
104
+ {'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 20}%]', 'transform': lambda r: [
105
+ {'role': 'system', 'content': r['instruction']},
106
+ {'role': 'user', 'content': r['input']},
107
+ {'role': 'assistant', 'content': r['output']},
108
+ ]}
109
+ for i in range(0, 100, 20)
110
+ ],
111
+
112
+ #
113
+ # tool/function calling
114
+ #
115
+ # 65.7 MB, 11,578
116
+ {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
117
+ {'role': roles_map[m['from']], 'content': m['value']}
118
+ for m in msgs
119
+ ]},
120
+
121
+ #
122
+ # agent
123
+ #
124
+ # 1.51 GB, 485,874
125
+ *[
126
+ {'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
127
+ {'role': roles_map[m['from']], 'content': m['value']}
128
+ for m in msgs
129
+ ]}
130
+ for i in range(0, 100, 20)
131
+ ],
132
+
133
+ #
134
+ # general reasoning
135
+ #
136
+ *[
137
+ # 10.8 MB, 15,770
138
+ {'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'transform': lambda r: [
139
+ {'role': 'user', 'content': r['Prompt']},
140
+ {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
141
+ ]},
142
+ ],
143
+
144
+ #
145
+ # math reasoning
146
+ #
147
+ # 8.99 MB, 6,914
148
+ {'path': 'thesven/gsm8k-reasoning', 'transform': lambda r: [
149
+ {'role': 'user', 'content': r['question']},
150
+ {'role': 'assistant', 'content': (r['generation'] or '') + '\n' + r['answer'] + '\n' + r['short_answer']},
151
+ ]},
152
+
153
+ # 1.79 MB, 3,963
154
+ {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'transform': lambda r: [
155
+ {'role': 'user', 'content': r['informal_statement']},
156
+ {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
157
+ ]},
158
+
159
+ # 307 MB, 19,944
160
+ {'path': 'KingNish/reasoning-base-20k', 'transform': lambda r: [
161
+ {'role': 'user', 'content': r['user']},
162
+ {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
163
+ ]},
164
+
165
+ # 9.45 MB, 10,000
166
+ {'path': 'Aarushhh/math-reasoning-10k', 'transform': lambda r: [
167
+ {'role': 'user', 'content': r['problem']},
168
+ {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
169
+ ]},
170
+
171
+ #
172
+ # reflection
173
+ #
174
+ # 4.17 MB, 1,000
175
+ {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
176
+ {'role': 'system', 'content': r['system']},
177
+ {'role': 'user', 'content': r['prompt']},
178
+ {'role': 'assistant', 'content': r['response']},
179
+ ]},
180
+ # 12.4 MB, 3,000
181
+ {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
182
+ {'role': 'system', 'content': r['system']},
183
+ {'role': 'user', 'content': r['prompt']},
184
+ {'role': 'assistant', 'content': r['response']},
185
+ ]},
186
+ # 70.8 MB, 36,549
187
+ {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
188
+ {'role': 'system', 'content': r['system']},
189
+ {'role': 'user', 'content': r['prompt']},
190
+ {'role': 'assistant', 'content': r['response']},
191
+ ]},
192
+ # 30.6 MB, 25,391
193
+ {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
194
+ r['system'][0],
195
+ {'role': 'user', 'content': r['input']},
196
+ {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
197
+ ]},
198
+
199
+ #
200
+ # general instructs
201
+ #
202
+ # 971 MB, 484,570
203
+ {'path': 'HuggingFaceTB/smol-smoltalk', 'field': 'messages'},
204
+ ]
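Most entries above carry ShareGPT-style `conversations` and a `transform` that remaps `from`/`value` pairs onto `role`/`content` via `roles_map`. A self-contained sketch with a made-up row (real rows come from the listed datasets):

```python
# What the ShareGPT-style transforms above produce for one row.
roles_map = {
    'system': 'system', 'user': 'user', 'human': 'user',
    'assistant': 'assistant', 'gpt': 'assistant', 'AI': 'assistant',
}

row = {'conversations': [
    {'from': 'human', 'value': 'What is 2 + 2?'},
    {'from': 'gpt', 'value': '4'},
]}

transform = lambda msgs: [
    {'role': roles_map[m['from']], 'content': m['value']}
    for m in msgs
]

print(transform(row['conversations']))
# [{'role': 'user', 'content': 'What is 2 + 2?'},
#  {'role': 'assistant', 'content': '4'}]
```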
scripts/prepare_contrain_datasets.py ADDED
@@ -0,0 +1,49 @@
1
+ from functools import partial
2
+
3
+ from litgpt.tokenizer import Tokenizer
4
+ from litdata import optimize, TokensLoader, StreamingDataset
5
+ from transformers import AutoTokenizer
6
+
7
+ from utils import tokenize_chat_fn
8
+ from contrain_datasets import contrain_datasets
9
+
10
+
11
+ #
12
+ # optimize datasets
13
+ #
14
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
15
+ # i = 0
16
+ # block_size = 8193
17
+ # chunk_size = block_size * 2000
18
+ chunk_size = block_size * subchunk_size
19
+ output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
20
+
21
+ outputs = optimize(
22
+ fn=partial(
23
+ tokenize_chat_fn,
24
+ hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
25
+ tokenizer=Tokenizer('..'),
26
+ ),
27
+ inputs=contrain_datasets,
28
+ output_dir=output_dir,
29
+ chunk_size=chunk_size, # Number of tokens to store per chunk; roughly 64 MB of tokens.
30
+ num_workers=32,
31
+ reorder_files=False,
32
+ )
33
+
34
+ #
35
+ # total number of chunks in datasets
36
+ #
37
+ for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
38
+ # i = 0
39
+ # block_size = 8193
40
+ # chunk_size = block_size * 2000
41
+ chunk_size = block_size * subchunk_size
42
+ input_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
43
+
44
+ dataset = StreamingDataset(
45
+ input_dir=input_dir,
46
+ item_loader=TokensLoader(block_size=block_size),
47
+ )
48
+
49
+ print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
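The optimized directories are read back during training by litgpt's `LitData` data module. A minimal sketch of doing the same read manually, assuming the litdata `StreamingDataLoader` API and the directory produced for the first block size:

```python
# Read one micro batch of token blocks back from the optimized dataset.
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

dataset = StreamingDataset(
    input_dir='../contrain-data-0-4097-16388000/',
    item_loader=TokensLoader(block_size=4097),
)

# batch_size matches micro_batch_size: 3 in contrain-model-0.yaml
loader = StreamingDataLoader(dataset, batch_size=3)

batch = next(iter(loader))
print(batch.shape)  # -> torch.Size([3, 4097]) of token ids
```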
scripts/prepare_pretrain_datasets.py ADDED
@@ -0,0 +1,50 @@
1
+ from functools import partial
2
+
3
+ from litgpt.tokenizer import Tokenizer
4
+ from litdata import optimize, TokensLoader, StreamingDataset
5
+
6
+ from utils import tokenize_text_fn
7
+ from pretrain_datasets import pretrain_datasets
8
+
9
+
10
+ #
11
+ # optimize datasets
12
+ #
13
+ for i, (b, e) in enumerate([(0, 513), (512, 1025), (1024, 2049), (2048, 4097), (4096, 8192), (8192, 1024 ** 3)]):
14
+ if e <= 8192:
15
+ block_size = (64 * 1024 * 1024) // (4 * e)
16
+ chunk_size = e * block_size
17
+ else:
18
+ block_size = 2048
19
+ chunk_size = b * block_size
20
+
21
+ output_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'
22
+
23
+ outputs = optimize(
24
+ fn=partial(tokenize_text_fn, tokenizer=Tokenizer('..'), min_len=b, max_len=e),
25
+ inputs=pretrain_datasets,
26
+ output_dir=output_dir,
27
+ chunk_size=chunk_size, # Number of tokens to store per chunk; roughly 64 MB of tokens.
28
+ num_workers=32,
29
+ reorder_files=False,
30
+ )
31
+
32
+ #
33
+ # total number of chunks in datasets
34
+ #
35
+ for i, (b, e) in enumerate([(0, 513), (512, 1025), (1024, 2049), (2048, 4097), (4096, 8192), (8192, 1024 ** 3)]):
36
+ if e <= 8192:
37
+ block_size = (64 * 1024 * 1024) // (4 * e)
38
+ chunk_size = e * block_size
39
+ else:
40
+ block_size = 2048
41
+ chunk_size = b * block_size
42
+
43
+ input_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'
44
+
45
+ dataset = StreamingDataset(
46
+ input_dir=input_dir,
47
+ item_loader=TokensLoader(block_size=block_size),
48
+ )
49
+
50
+ print(f'{i=}, {b=}, {e=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {e * len(dataset)=}')
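The output directory names encode the bucketing arithmetic used above. A sketch for the first bucket (`b=0`, `e=513`), where a chunk is sized to hold roughly 64 MB of int32 token ids:

```python
# Reproduce the name of the first pretrain data directory from the bucket bounds.
e = 513
block_size = (64 * 1024 * 1024) // (4 * e)  # 32_704
chunk_size = e * block_size                 # 16_777_152 tokens per chunk

print(f'../pretrain-data-0-0-{e}-{block_size}-{chunk_size}/')
# -> ../pretrain-data-0-0-513-32704-16777152/
# the same path that pretrain-model-0.yaml points its data_path at
```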
scripts/pretrain-model-0.yaml ADDED
@@ -0,0 +1,156 @@
1
+ # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
2
+
3
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
4
+ # ``model_config``. (type: Optional[str], default: null)
5
+ model_name: "Llama-3.2-1B"
6
+
7
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
8
+ # ``model_name``. (type: Optional[Config], default: null)
9
+ model_config:
10
+ padded_vocab_size: 65536
11
+ vocab_size: 65536
12
+ block_size: 131072
13
+ n_layer: 32
14
+ n_head: 16
15
+ head_size: 64
16
+ n_embd: 768
17
+ n_query_groups: 4
18
+ rotary_percentage: 1.0
19
+ parallel_residual: false
20
+ shared_attention_norm: false
21
+ bias: false
22
+ # attn_bias: true # qwen 2.5
23
+ norm_class_name: "RMSNorm"
24
+ mlp_class_name: "LLaMAMLP"
25
+ intermediate_size: 2048
26
+ # rope_base: 500000 # llama 3.2
27
+ rope_base: 1000000 # qwen 2.5
28
+ rope_adjustments: # llama 3.2
29
+ factor: 32.0
30
+ low_freq_factor: 1.0
31
+ high_freq_factor: 4.0
32
+ original_max_seq_len: 8192
33
+
34
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
35
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
36
+ out_dir: "../out/pretrain-0/"
37
+
38
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
39
+ # precision: bf16-mixed
40
+ precision: bf16-true
41
+
42
+ # Optional path to a checkpoint directory to initialize the model from.
43
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
44
+ initial_checkpoint_dir:
45
+
46
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
47
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
48
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
49
+ # (type: Union[bool, Literal["auto"], Path], default: False)
50
+ resume: "auto"
51
+
52
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
53
+ data:
54
+ class_path: LitData
55
+
56
+ init_args:
57
+ data_path: "../pretrain-data-0-0-513-32704-16777152/"
58
+ num_workers: 32
59
+
60
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
61
+ train:
62
+ # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
63
+ save_interval: 100
64
+
65
+ # Number of iterations between logging calls (type: int, default: 1)
66
+ log_interval: 1
67
+
68
+ # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
69
+ global_batch_size: 512
70
+
71
+ # Number of samples per data-parallel rank (type: int, default: 4)
72
+ micro_batch_size: 24
73
+
74
+ # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
+ lr_warmup_steps: 0
76
+
77
+ # Number of epochs to train on (type: Optional[int], default: null)
78
+ epochs:
79
+
80
+ # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
81
+ max_tokens: 1945266624 # 32_704 * 59_481
82
+
83
+ # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
84
+ max_steps:
85
+
86
+ # Limits the length of samples. Off by default (type: Optional[int], default: null)
87
+ max_seq_length: 513
88
+
89
+ # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
90
+ tie_embeddings: true
91
+
92
+ # (type: Optional[float], default: 1.0)
93
+ max_norm: 1.0
94
+
95
+ # (type: float, default: 4e-05)
96
+ min_lr: 1e-05
97
+
98
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
99
+ eval:
100
+ # Number of optimizer steps between evaluation calls (type: int, default: 1000)
101
+ interval: 100
102
+
103
+ # Number of tokens to generate (type: Optional[int], default: null)
104
+ max_new_tokens:
105
+
106
+ # Number of iterations (type: int, default: 100)
107
+ max_iters: 100
108
+
109
+ # Whether to evaluate on the validation set at the beginning of the training
110
+ initial_validation: false
111
+
112
+ # Whether to evaluate on the validation set at the end of the training
113
+ final_validation: true
114
+
115
+ # Optimizer-related arguments
116
+ optimizer:
117
+ class_path: grokadamw.GrokAdamW
118
+
119
+ init_args:
120
+ # (type: float, default: 0.001)
121
+ lr: 1e-04
122
+
123
+ # (type: float, default: 0.01)
124
+ weight_decay: 1e-2
125
+
126
+ # (type: tuple, default: (0.9,0.999))
127
+ betas:
128
+ - 0.9
129
+ - 0.999
130
+
131
+ # optimizer:
132
+ # class_path: sophia_opt.SophiaG
133
+ #
134
+ # init_args:
135
+ # lr: 4e-4
136
+ # betas:
137
+ # - 0.965
138
+ # - 0.99
139
+ # rho: 0.01
140
+ # weight_decay: 1e-1
141
+
142
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
143
+ devices: auto
144
+
145
+ # How many nodes to use. (type: int, default: 1)
146
+ num_nodes: 1
147
+
148
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
149
+ # modules require this. (type: Optional[Path], default: null)
150
+ tokenizer_dir: "../"
151
+
152
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
153
+ logger_name: "wandb"
154
+
155
+ # The random seed to use for reproducibility. (type: int, default: 42)
156
+ seed: 23
scripts/pretrain_datasets.py ADDED
@@ -0,0 +1,73 @@
1
+ pretrain_datasets = [
2
+ #
3
+ # multilingual
4
+ #
5
+ # 3.17 GB, 2,226,907
6
+ *[
7
+ {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
8
+ for i in range(0, 100, 5)
9
+ ],
10
+ # 1.64 GB, 1,001,000
11
+ *[
12
+ {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
13
+ for i in range(0, 100, 5)
14
+ ],
15
+
16
+ #
17
+ # general knowledge
18
+ #
19
+ # 65.1 MB, 7,819
20
+ {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
21
+ # 135 MB, 1,795
22
+ {'path': 'open-phi/textbooks', 'format': lambda n: n['markdown']},
23
+ # 631 MB, 111,048
24
+ {'path': 'open-phi/programming_books_llama', 'format': lambda n: n['markdown']},
25
+
26
+ #
27
+ # misc
28
+ #
29
+ # 472 KB, 5,034
30
+ {'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
31
+
32
+ #
33
+ # math
34
+ #
35
+ # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
36
+ *[
37
+ {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 5}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
38
+ for i in range(0, 100, 5)
39
+ ],
40
+
41
+ #
42
+ # stem
43
+ #
44
+ # 1.44 GB, 63,357
45
+ *[
46
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
47
+ for i in range(0, 100, 5)
48
+ ],
49
+ *[
50
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
51
+ for i in range(0, 100, 5)
52
+ ],
53
+
54
+ #
55
+ # code
56
+ #
57
+ # 7.81 GB, ~2,804,025
58
+ *[
59
+ {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 5}%]', 'format': '{input} {output}'}
60
+ for i in range(0, 100, 5)
61
+ ],
62
+
63
+ #
64
+ # general knowledge
65
+ #
66
+ # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
67
+ *[
68
+ {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
69
+ for i in range(0, 100, 5)
70
+ ],
71
+ {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
72
+ {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
73
+ ]
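Each entry's `format` field is either a callable or a `str.format` template; both are applied per row by `utils.batch_text_iterator`. A sketch with a made-up row shaped like the `nvidia/OpenMathInstruct-2` fields used above:

```python
# The two 'format' conventions accepted by batch_text_iterator.
row = {
    'problem': '1 + 1 = ?',
    'generated_solution': 'Add the ones.',
    'expected_answer': '2',
}

fmt_template = '{problem} {generated_solution} {expected_answer}'
fmt_callable = lambda n: n['problem']

print(fmt_template.format(**row))  # '1 + 1 = ? Add the ones. 2'
print(fmt_callable(row))           # '1 + 1 = ?'
```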
scripts/requirements.in ADDED
@@ -0,0 +1,22 @@
1
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
2
+ torch>=2.2.0,<=2.4.1
3
+ numpy<2.0
4
+
5
+ tqdm
6
+ datasets
7
+ jinja2
8
+ transformers
9
+ wandb
10
+ # litgpt[all]
11
+ litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
12
+ # litgpt @ git+https://github.com/Lightning-AI/litgpt.git
13
+ # litdata
14
+ # litdata @ git+https://github.com/Lightning-AI/litdata.git
15
+ lm_eval[ifeval,math]
16
+ grokadamw
17
+ # bitsandbytes
18
+ # pyzstd
19
+ # zstd
20
+ Pillow
21
+
22
+ sophia-opt
scripts/tokenizer_datasets.py ADDED
@@ -0,0 +1,48 @@
1
+ tokenizer_datasets = [
2
+ #
3
+ # multilingual
4
+ #
5
+ # 3.17 GB, 2,226,907
6
+ *[
7
+ {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
8
+ for i in range(0, 100, 5)
9
+ ],
10
+ # 1.64 GB, 1,001,000
11
+ *[
12
+ {'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
13
+ for i in range(0, 100, 5)
14
+ ],
15
+
16
+ #
17
+ # stem
18
+ #
19
+ # 1.44 GB, 63,357
20
+ *[
21
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
22
+ for i in range(0, 100, 5)
23
+ ],
24
+ *[
25
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
26
+ for i in range(0, 100, 5)
27
+ ],
28
+
29
+ #
30
+ # code
31
+ #
32
+ # 7.81 GB, ~2,804,025
33
+ *[
34
+ {'path': 'rombodawg/code_bagel_hermes-2.5', 'split': f'train[{i}%:{i + 5}%]', 'format': '{input} {output}'}
35
+ for i in range(0, 100, 5)
36
+ ],
37
+
38
+ #
39
+ # general knowledge
40
+ #
41
+ # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
42
+ *[
43
+ {'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
44
+ for i in range(0, 100, 5)
45
+ ],
46
+ {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
47
+ {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
48
+ ]
scripts/train_tokenizer.py ADDED
@@ -0,0 +1,252 @@
1
+ from transformers import PreTrainedTokenizerFast
2
+ from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
3
+ from tokenizers.models import BPE
4
+ from tokenizers.trainers import BpeTrainer
5
+
6
+ from utils import batch_text_iterator
7
+ from tokenizer_datasets import tokenizer_datasets
8
+
9
+ #
10
+ # special_tokens
11
+ #
12
+ bos_token = '<|begin_of_text|>'
13
+ eos_token = '<|end_of_text|>'
14
+
15
+ special_tokens = [
16
+ bos_token,
17
+ eos_token,
18
+ '<|start_header_id|>',
19
+ '<|end_header_id|>',
20
+ '<|eom_id|>',
21
+ '<|eot_id|>',
22
+ 'system',
23
+ 'user',
24
+ 'assistant',
25
+
26
+ # tool/function calling
27
+ '<tools>',
28
+ '</tools>',
29
+ '<tool>',
30
+ '</tool>',
31
+ '<tool_call>',
32
+ '</tool_call>',
33
+ '<tool_response>',
34
+ '</tool_response>',
35
+ '"name"',
36
+ '"arguments"',
37
+
38
+ #
39
+ # JSON Schema
40
+ #
41
+ # General Metadata Keywords
42
+ '"$schema"',
43
+ '"$id"',
44
+ '"$ref"',
45
+ '"$defs"',
46
+ '"$anchor"',
47
+ '"$dynamicAnchor"',
48
+ '"$dynamicRef"',
49
+ '"$vocabulary"',
50
+ '"$comment"',
51
+ # Data Types
52
+ '"null"',
53
+ '"boolean"',
54
+ '"object"',
55
+ '"array"',
56
+ '"number"',
57
+ '"string"',
58
+ '"integer"',
59
+ # Validation Keywords
60
+ '"type"',
61
+ '"enum"',
62
+ '"const"',
63
+ '"multipleOf"',
64
+ '"maximum"',
65
+ '"exclusiveMaximum"',
66
+ '"minimum"',
67
+ '"exclusiveMinimum"',
68
+ '"maxLength"',
69
+ '"minLength"',
70
+ '"pattern"',
71
+ '"additionalItems"',
72
+ '"items"',
73
+ '"prefixItems"',
74
+ '"contains"',
75
+ '"maxItems"',
76
+ '"minItems"',
77
+ '"uniqueItems"',
78
+ '"maxProperties"',
79
+ '"minProperties"',
80
+ '"required"',
81
+ '"properties"',
82
+ '"patternProperties"',
83
+ '"additionalProperties"',
84
+ '"dependentRequired"',
85
+ '"dependentSchemas"',
86
+ '"propertyNames"',
87
+ # Conditional Keywords
88
+ '"if"',
89
+ '"then"',
90
+ '"else"',
91
+ '"allOf"',
92
+ '"anyOf"',
93
+ '"oneOf"',
94
+ '"not"',
95
+ # Additional Keywords for Evaluation Control
96
+ '"unevaluatedItems"',
97
+ '"unevaluatedProperties"',
98
+ # Informational Keywords
99
+ '"title"',
100
+ '"description"',
101
+ '"default"',
102
+ '"deprecated"',
103
+ '"readOnly"',
104
+ '"writeOnly"',
105
+ '"examples"',
106
+ # Content-Related Keywords
107
+ '"contentEncoding"',
108
+ '"contentMediaType"',
109
+ '"contentSchema"',
110
+ # Additional Keywords
111
+ '"next"', # Typically used in reference to linked or next items
112
+ '"value"', # Represents the value of a property or item
113
+
114
+ # misc
115
+ '<input>',
116
+ '</input>',
117
+ '<output>',
118
+ '</output>',
119
+ '<query>',
120
+ '</query>',
121
+ '<key>',
122
+ '</key>',
123
+ '<value>',
124
+ '</value>',
125
+ '<text>',
126
+ '</text>',
127
+ '<code>',
128
+ '</code>',
129
+ '<image>',
130
+ '</image>',
131
+ '<file>',
132
+ '</file>',
133
+
134
+ # qa
135
+ '<question>',
136
+ '</question>',
137
+ '<answer>',
138
+ '</answer>',
139
+
140
+ # thought
141
+ '<thought>',
142
+ '</thought>',
143
+ '<plan>',
144
+ '</plan>',
145
+ '<vote>',
146
+ '</vote>',
147
+ '<passage>',
148
+ '</passage>',
149
+
150
+ # reasoning
151
+ '<reasoning>',
152
+ '</reasoning>',
153
+ '<acting>',
154
+ '</acting>',
155
+ '<action>',
156
+ '</action>',
157
+ '<observation>',
158
+ '</observation>',
159
+ '<claim>',
160
+ '</claim>',
161
+
162
+ # reflection
163
+ '<thinking>',
164
+ '</thinking>',
165
+ '<reflection>',
166
+ '</reflection>',
167
+ '<step>',
168
+ '</step>',
169
+
170
+ # graph
171
+ '<graph>',
172
+ '</graph>',
173
+ '<edge>',
174
+ '</edge>',
175
+ '<source>',
176
+ '</source>',
177
+ '<destination>',
178
+ '</destination>',
179
+ '<relation>',
180
+ '</relation>',
181
+ # '<value>',
182
+ # '</value>',
183
+ ]
184
+
185
+ for i in range(256 - len(special_tokens)):
186
+ special_tokens.append(f'<|reserved_special_token_{i}|>')
187
+
188
+ for i in range(256):
189
+ special_tokens.append(f'<0x{i:02X}>')
190
+
191
+ #
192
+ # BPE Tokenizer
193
+ #
194
+ bpe = BPE(unk_token=None, byte_fallback=True)
195
+ tokenizer = Tokenizer(bpe)
196
+
197
+ # normalizer
198
+ tokenizer.normalizer = None
199
+
200
+ # pre-tokenizer
201
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
202
+
203
+ # post-processor
204
+ tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)
205
+
206
+ # decoder
207
+ tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
208
+
209
+ #
210
+ # BPE Trainer
211
+ #
212
+ trainer = BpeTrainer(
213
+ vocab_size=131072, # 128 * 1024
214
+ min_frequency=3,
215
+ special_tokens=special_tokens,
216
+ max_token_length=16,
217
+ )
218
+
219
+ tokenizer.train_from_iterator(
220
+ batch_text_iterator(tokenizer_datasets),
221
+ trainer,
222
+ )
223
+
224
+ tokenizer.save('../tokenizer.json')
225
+ tokenizer.model.save('../')
226
+
227
+ #
228
+ # PreTrainedTokenizerFast
229
+ #
230
+ CHAT_TEMPLATE = (
231
+ "{{ bos_token }}"
232
+
233
+ "{% for message in messages %}"
234
+ "{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}"
235
+ "{% endfor %}"
236
+
237
+ "{% if add_generation_prompt %}"
238
+ "{{ '<|start_header_id|>assistant<|end_header_id|>' }}"
239
+ "{% else %}"
240
+ "{{ eos_token }}"
241
+ "{% endif %}"
242
+ )
243
+
244
+ fast_tokenizer = PreTrainedTokenizerFast(
245
+ tokenizer_object=tokenizer,
246
+ chat_template=CHAT_TEMPLATE,
247
+ bos_token=bos_token,
248
+ eos_token=eos_token,
249
+ clean_up_tokenization_spaces=False,
250
+ )
251
+
252
+ fast_tokenizer.save_pretrained('../')
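A quick round trip of the saved tokenizer and its chat template, as a sketch run from the `scripts/` directory (it assumes the files written above exist in the repository root):

```python
# Load the tokenizer saved by this script and render one chat turn.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('..', use_fast=True)

messages = [{'role': 'user', 'content': 'Hello!'}]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
print(len(ids))  # each special token above should map to a single id
```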
scripts/utils.py ADDED
@@ -0,0 +1,143 @@
1
+ import gc
2
+ from typing import Union, Optional, Iterator, Callable
3
+
4
+ import torch
5
+ from datasets import load_dataset
6
+ from litgpt.tokenizer import Tokenizer
7
+ from transformers import AutoTokenizer
8
+
9
+ def _batch_text_iterator(path: str,
10
+ name: Optional[str]=None,
11
+ data_dir: Optional[str]=None,
12
+ data_files: Optional[str]=None,
13
+ keep_in_memory: bool=False,
14
+ revision: Optional[str]=None,
15
+ split: str='train',
16
+ num_proc: Optional[int]=None,
17
+ format: Optional[Callable|str]=None) -> Iterator[str]:
18
+ assert isinstance(format, str) or callable(format), repr(format)
19
+
20
+ dataset = load_dataset(path=path,
21
+ name=name,
22
+ data_dir=data_dir,
23
+ data_files=data_files,
24
+ keep_in_memory=keep_in_memory,
25
+ revision=revision,
26
+ split=split,
27
+ trust_remote_code=True,
28
+ num_proc=num_proc)
29
+
30
+ if callable(format):
31
+ for row in dataset:
32
+ text = format(row)
33
+ yield text
34
+ else:
35
+ for row in dataset:
36
+ text = format.format(**row)
37
+ yield text
38
+
39
+ del dataset
40
+ gc.collect()
41
+
42
+
43
+ def _batch_chat_iterator(path: str,
44
+ name: Optional[str]=None,
45
+ data_dir: Optional[str]=None,
46
+ data_files: Optional[str]=None,
47
+ keep_in_memory: bool=False,
48
+ revision: Optional[str]=None,
49
+ split: str='train',
50
+ num_proc: Optional[int]=None,
51
+ field: Optional[str]=None,
52
+ transform: Optional[Callable]=None) -> Iterator[list[dict[str, str]]]:
53
+
54
+ dataset = load_dataset(path=path,
55
+ name=name,
56
+ data_dir=data_dir,
57
+ data_files=data_files,
58
+ keep_in_memory=keep_in_memory,
59
+ revision=revision,
60
+ split=split,
61
+ trust_remote_code=True,
62
+ num_proc=num_proc)
63
+
64
+ if callable(transform):
65
+ for row in dataset:
66
+ if field:
67
+ messages = transform(row[field])
68
+ else:
69
+ messages = transform(row)
70
+
71
+ yield messages
72
+ else:
73
+ for row in dataset:
74
+ if field:
75
+ messages = row[field]
76
+ else:
77
+ raise ValueError(field)
78
+
79
+ yield messages
80
+
81
+ del dataset
82
+ gc.collect()
83
+
84
+
85
+ def batch_text_iterator(dataset_config: Union[list, dict]) -> Iterator[str]:
86
+ assert isinstance(dataset_config, (dict, list)), dataset_config
87
+
88
+ if isinstance(dataset_config, dict):
89
+ for text in _batch_text_iterator(**dataset_config):
90
+ yield text
91
+ elif isinstance(dataset_config, list):
92
+ for dc in dataset_config:
93
+ for text in _batch_text_iterator(**dc):
94
+ yield text
95
+
96
+
97
+ def batch_chat_iterator(dataset_config: Union[list, dict]) -> Iterator[list[dict[str, str]]]:
98
+ assert isinstance(dataset_config, (dict, list)), dataset_config
99
+
100
+ if isinstance(dataset_config, dict):
101
+ for messages in _batch_chat_iterator(**dataset_config):
102
+ yield messages
103
+ elif isinstance(dataset_config, list):
104
+ for dc in dataset_config:
105
+ for messages in _batch_chat_iterator(**dc):
106
+ yield messages
107
+
108
+
109
+ def tokenize_text_fn(dataset_config: list, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
110
+ for text in batch_text_iterator(dataset_config):
111
+ text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
112
+
113
+ # filter by length; use local bounds so the min_len/max_len parameters
+ # are not mutated across iterations and each sample is yielded at most once
+ lo = 0 if min_len is None else min_len
+ hi = len(text_ids) if max_len is None else max_len
+
+ if lo <= len(text_ids) <= hi:
123
+ yield text_ids
124
+
125
+
126
+ def tokenize_chat_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
127
+ for messages in batch_chat_iterator(dataset_config):
128
+ # text_ids: torch.Tensor = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors='pt')
129
+ # text_ids = text_ids.to(torch.int)
130
+ text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
131
+ text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
132
+
133
+ # filter by length; use local bounds so the min_len/max_len parameters
+ # are not mutated across iterations and each sample is yielded at most once
+ lo = 0 if min_len is None else min_len
+ hi = len(text_ids) if max_len is None else max_len
+
+ if lo <= len(text_ids) <= hi:
143
+ yield text_ids