train tokenizer 128k
Browse files- TRAIN.md +113 -0
- config.json +40 -0
- misc/logo.png +3 -0
- scripts/contrain-model-0.yaml +156 -0
- scripts/contrain_datasets.py +204 -0
- scripts/prepare_contrain_datasets.py +49 -0
- scripts/prepare_pretrain_datasets.py +50 -0
- scripts/pretrain-model-0.yaml +156 -0
- scripts/pretrain_datasets.py +73 -0
- scripts/requirements.in +22 -0
- scripts/tokenizer_datasets.py +48 -0
- scripts/train_tokenizer.py +252 -0
- scripts/utils.py +143 -0
TRAIN.md
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Train
|
2 |
+
|
3 |
+
## Environment
|
4 |
+
|
5 |
+
```bash
|
6 |
+
cd scripts
|
7 |
+
python -m venv venv
|
8 |
+
source venv/bin/activate
|
9 |
+
pip install -U -r requirements.in
|
10 |
+
```
|
11 |
+
|
12 |
+
## Train Tokenizer
|
13 |
+
|
14 |
+
```bash
|
15 |
+
time python -B train_tokenizer.py
|
16 |
+
```
|
17 |
+
|
18 |
+
## Pretrain
|
19 |
+
|
20 |
+
```bash
|
21 |
+
python -B prepare_pretrain_datasets.py
|
22 |
+
```
|
23 |
+
|
24 |
+
```bash
|
25 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-0.yaml
|
26 |
+
litgpt convert_pretrained_checkpoint ../out/pretrain-0/final/ ../out/pretrain-0-final-checkpoint
|
27 |
+
|
28 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-1.yaml
|
29 |
+
litgpt convert_pretrained_checkpoint ../out/pretrain-1/final/ ../out/pretrain-1-final-checkpoint
|
30 |
+
|
31 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-2.yaml
|
32 |
+
litgpt convert_pretrained_checkpoint ../out/pretrain-2/final/ ../out/pretrain-2-final-checkpoint
|
33 |
+
|
34 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-3.yaml
|
35 |
+
litgpt convert_pretrained_checkpoint ../out/pretrain-3/final/ ../out/pretrain-3-final-checkpoint
|
36 |
+
|
37 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-4.yaml
|
38 |
+
litgpt convert_pretrained_checkpoint ../out/pretrain-4/final/ ../out/pretrain-4-final-checkpoint
|
39 |
+
|
40 |
+
# NOTE: unused
|
41 |
+
# CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain-model-5.yaml
|
42 |
+
# litgpt convert_pretrained_checkpoint ../out/pretrain-5/final/ ../out/pretrain-5-final-checkpoint
|
43 |
+
```
|
44 |
+
|
45 |
+
### Continued Pretraining
|
46 |
+
|
47 |
+
```bash
|
48 |
+
python -B prepare_contrain_datasets.py
|
49 |
+
```
|
50 |
+
|
51 |
+
```bash
|
52 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-0.yaml
|
53 |
+
litgpt convert_pretrained_checkpoint ../out/contrain-0/final/ ../out/contrain-0-final-checkpoint
|
54 |
+
|
55 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config contrain-model-1.yaml
|
56 |
+
litgpt convert_pretrained_checkpoint ../out/contrain-1/final/ ../out/contrain-1-final-checkpoint
|
57 |
+
```
|
58 |
+
|
59 |
+
## Chat with Pretrained model
|
60 |
+
|
61 |
+
```bash
|
62 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-0/final/
|
63 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-1/final/
|
64 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-2/final/
|
65 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-3/final/
|
66 |
+
CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-4/final/
|
67 |
+
# CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-5/final/
|
68 |
+
```
|
69 |
+
|
70 |
+
<!-- OLD -->
|
71 |
+
|
72 |
+
## Model
|
73 |
+
|
74 |
+
### Pretraining
|
75 |
+
|
76 |
+
```bash
|
77 |
+
litgpt pretrain --config ./pretrain-model.yaml
|
78 |
+
litgpt convert_from_litgpt out/pretrain/final/ out/converted_pretrain
|
79 |
+
cp config.json out/pretrain/final/
|
80 |
+
cp config.json out/converted_pretrain/
|
81 |
+
```
|
82 |
+
|
83 |
+
```python
|
84 |
+
import torch
|
85 |
+
from safetensors.torch import save_file
|
86 |
+
|
87 |
+
state_dict = torch.load('out/converted_pretrain/model.pth', map_location='cpu')
|
88 |
+
save_file(state_dict, 'out/converted_pretrain/model.safetensors')
|
89 |
+
```
|
90 |
+
|
91 |
+
### Continued Pretraining
|
92 |
+
|
93 |
+
```bash
|
94 |
+
litgpt convert_pretrained_checkpoint out/pretrain/final/ out/pretrain_checkpoint/final/
|
95 |
+
cp config.json out/pretrain_checkpoint/final/
|
96 |
+
|
97 |
+
litgpt pretrain --config ./contrain-model.yaml
|
98 |
+
litgpt convert_from_litgpt out/contrain/final/ out/converted_contrain
|
99 |
+
cp config.json out/converted_contrain/
|
100 |
+
```
|
101 |
+
|
102 |
+
```python
|
103 |
+
import torch
|
104 |
+
from safetensors.torch import save_file
|
105 |
+
|
106 |
+
state_dict = torch.load('out/converted_contrain/model.pth', map_location='cpu')
|
107 |
+
save_file(state_dict, 'out/converted_contrain/model.safetensors')
|
108 |
+
```
|
109 |
+
|
110 |
+
```bash
|
111 |
+
cp out/converted_contrain/model.pth ./
|
112 |
+
cp out/converted_contrain/model.safetensors ./
|
113 |
+
```
|
config.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "tangledgroup/tangled-llama-j-128k-v0.1",
|
3 |
+
"architectures": [
|
4 |
+
"LlamaForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"eos_token_id": [
|
10 |
+
1,
|
11 |
+
4,
|
12 |
+
5
|
13 |
+
],
|
14 |
+
"head_dim": 64,
|
15 |
+
"hidden_act": "silu",
|
16 |
+
"hidden_size": 768,
|
17 |
+
"initializer_range": 0.02,
|
18 |
+
"intermediate_size": 2048,
|
19 |
+
"max_position_embeddings": 131072,
|
20 |
+
"mlp_bias": false,
|
21 |
+
"model_type": "llama",
|
22 |
+
"num_attention_heads": 16,
|
23 |
+
"num_hidden_layers": 32,
|
24 |
+
"num_key_value_heads": 4,
|
25 |
+
"pretraining_tp": 1,
|
26 |
+
"rms_norm_eps": 1e-05,
|
27 |
+
"rope_scaling": {
|
28 |
+
"factor": 32.0,
|
29 |
+
"high_freq_factor": 4.0,
|
30 |
+
"low_freq_factor": 1.0,
|
31 |
+
"original_max_position_embeddings": 8192,
|
32 |
+
"rope_type": "llama3"
|
33 |
+
},
|
34 |
+
"rope_theta": 1000000.0,
|
35 |
+
"tie_word_embeddings": true,
|
36 |
+
"torch_dtype": "bfloat16",
|
37 |
+
"transformers_version": "4.45.0.dev0",
|
38 |
+
"use_cache": true,
|
39 |
+
"vocab_size": 65536
|
40 |
+
}
|
misc/logo.png
ADDED
Git LFS Details
|
scripts/contrain-model-0.yaml
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
|
2 |
+
|
3 |
+
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
4 |
+
# ``model_config``. (type: Optional[str], default: null)
|
5 |
+
model_name: "Llama-3.2-1B"
|
6 |
+
|
7 |
+
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
8 |
+
# ``model_name``. (type: Optional[Config], default: null)
|
9 |
+
model_config:
|
10 |
+
padded_vocab_size: 65536
|
11 |
+
vocab_size: 65536
|
12 |
+
block_size: 131072
|
13 |
+
n_layer: 32
|
14 |
+
n_head: 16
|
15 |
+
head_size: 64
|
16 |
+
n_embd: 768
|
17 |
+
n_query_groups: 4
|
18 |
+
rotary_percentage: 1.0
|
19 |
+
parallel_residual: false
|
20 |
+
shared_attention_norm: false
|
21 |
+
bias: false
|
22 |
+
# attn_bias: true # qwen 2.5
|
23 |
+
norm_class_name: "RMSNorm"
|
24 |
+
mlp_class_name: "LLaMAMLP"
|
25 |
+
intermediate_size: 2048
|
26 |
+
# rope_base: 500000 # llama 3.2
|
27 |
+
rope_base: 1000000 # qwen 2.5
|
28 |
+
rope_adjustments: # llama 3.2
|
29 |
+
factor: 32.0
|
30 |
+
low_freq_factor: 1.0
|
31 |
+
high_freq_factor: 4.0
|
32 |
+
original_max_seq_len: 8192
|
33 |
+
|
34 |
+
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
35 |
+
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
|
36 |
+
out_dir: "../out/contrain-0/"
|
37 |
+
|
38 |
+
# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
|
39 |
+
# precision: bf16-mixed
|
40 |
+
precision: bf16-true
|
41 |
+
|
42 |
+
# Optional path to a checkpoint directory to initialize the model from.
|
43 |
+
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
|
44 |
+
initial_checkpoint_dir: "../out/pretrain-4-final-checkpoint/"
|
45 |
+
|
46 |
+
# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
|
47 |
+
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
|
48 |
+
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
|
49 |
+
# (type: Union[bool, Literal["auto"], Path], default: False)
|
50 |
+
# resume:
|
51 |
+
|
52 |
+
# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
|
53 |
+
data:
|
54 |
+
class_path: LitData
|
55 |
+
|
56 |
+
init_args:
|
57 |
+
data_path: "../contrain-data-0-4097-16388000/"
|
58 |
+
num_workers: 32
|
59 |
+
|
60 |
+
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
|
61 |
+
train:
|
62 |
+
# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
|
63 |
+
save_interval: 100
|
64 |
+
|
65 |
+
# Number of iterations between logging calls (type: int, default: 1)
|
66 |
+
log_interval: 1
|
67 |
+
|
68 |
+
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
69 |
+
global_batch_size: 512
|
70 |
+
|
71 |
+
# Number of samples per data-parallel rank (type: int, default: 4)
|
72 |
+
micro_batch_size: 3
|
73 |
+
|
74 |
+
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
75 |
+
lr_warmup_steps: 0
|
76 |
+
|
77 |
+
# Number of epochs to train on (type: Optional[int], default: null)
|
78 |
+
epochs:
|
79 |
+
|
80 |
+
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
81 |
+
max_tokens: 1527816367 # 4_097 * 372_911
|
82 |
+
|
83 |
+
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
84 |
+
max_steps:
|
85 |
+
|
86 |
+
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
87 |
+
max_seq_length: 4097
|
88 |
+
|
89 |
+
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
90 |
+
tie_embeddings: true
|
91 |
+
|
92 |
+
# (type: Optional[float], default: 1.0)
|
93 |
+
max_norm: 1.0
|
94 |
+
|
95 |
+
# (type: float, default: 4e-05)
|
96 |
+
min_lr: 1e-06
|
97 |
+
|
98 |
+
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
99 |
+
eval:
|
100 |
+
# Number of optimizer steps between evaluation calls (type: int, default: 1000)
|
101 |
+
interval: 25
|
102 |
+
|
103 |
+
# Number of tokens to generate (type: Optional[int], default: null)
|
104 |
+
max_new_tokens:
|
105 |
+
|
106 |
+
# Number of iterations (type: int, default: 100)
|
107 |
+
max_iters: 100
|
108 |
+
|
109 |
+
# Whether to evaluate on the validation set at the beginning of the training
|
110 |
+
initial_validation: false
|
111 |
+
|
112 |
+
# Whether to evaluate on the validation set at the end of the training
|
113 |
+
final_validation: true
|
114 |
+
|
115 |
+
# Optimizer-related arguments
|
116 |
+
optimizer:
|
117 |
+
class_path: grokadamw.GrokAdamW
|
118 |
+
|
119 |
+
init_args:
|
120 |
+
# (type: float, default: 0.001)
|
121 |
+
lr: 1e-05
|
122 |
+
|
123 |
+
# (type: float, default: 0.01)
|
124 |
+
weight_decay: 1e-2
|
125 |
+
|
126 |
+
# (type: tuple, default: (0.9,0.999))
|
127 |
+
betas:
|
128 |
+
- 0.9
|
129 |
+
- 0.999
|
130 |
+
|
131 |
+
# optimizer:
|
132 |
+
# class_path: sophia_opt.SophiaG
|
133 |
+
#
|
134 |
+
# init_args:
|
135 |
+
# lr: 4e-4
|
136 |
+
# betas:
|
137 |
+
# - 0.965
|
138 |
+
# - 0.99
|
139 |
+
# rho: 0.01
|
140 |
+
# weight_decay: 1e-1
|
141 |
+
|
142 |
+
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
143 |
+
devices: auto
|
144 |
+
|
145 |
+
# How many nodes to use. (type: int, default: 1)
|
146 |
+
num_nodes: 1
|
147 |
+
|
148 |
+
# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
|
149 |
+
# modules require this. (type: Optional[Path], default: null)
|
150 |
+
tokenizer_dir: "../"
|
151 |
+
|
152 |
+
# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
|
153 |
+
logger_name: "wandb"
|
154 |
+
|
155 |
+
# The random seed to use for reproducibility. (type: int, default: 42)
|
156 |
+
seed: 23
|
scripts/contrain_datasets.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Speaker tags used by the source datasets, normalised to the three chat roles.
roles_map = {
    alias: role
    for role, aliases in (
        ('system', ('system',)),
        ('user', ('user', 'human')),
        ('assistant', ('assistant', 'gpt', 'AI')),
    )
    for alias in aliases
}


def _to_messages(turns):
    """Map ``{'from', 'value'}`` turns to ``{'role', 'content'}`` messages via ``roles_map``."""
    return [{'role': roles_map[turn['from']], 'content': turn['value']} for turn in turns]


def _system_prompt_response(row):
    """Build a system/user/assistant exchange from a row's ``system``, ``prompt`` and ``response``."""
    return [
        {'role': 'system', 'content': row['system']},
        {'role': 'user', 'content': row['prompt']},
        {'role': 'assistant', 'content': row['response']},
    ]


def _train_fifths(path, **spec):
    """Return five dataset specs for ``path``, one per consecutive 20% slice of the train split."""
    return [
        {'path': path, 'split': f'train[{start}%:{start + 20}%]', **spec}
        for start in range(0, 100, 20)
    ]


# Every entry is a dataset spec: 'path' plus optional 'split' / 'data_files',
# an optional 'field' naming the column to read, and an optional 'transform'
# callable that returns a list of {'role', 'content'} messages.
contrain_datasets = [
    #
    # general instructs
    #
    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
    #   meta-math/MetaMathQA 395,000
    #   openbmb/UltraInteract_sft 288,579
    #   HuggingFaceH4/ultrachat_200k 207,865
    #   microsoft/orca-math-word-problems-200k 200,035
    #   HuggingFaceH4/ultrafeedback_binarized 187,405
    #   theblackcat102/evol-codealpaca-v1 111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
    *_train_fifths('mlabonne/open-perfectblend', field='conversations', transform=_to_messages),
    # arcee-ai/The-Tome - 4.58 GB, 1,752,473
    #   - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
    #   - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
    #   - jondurbin/airoboros-3.2
    #   - gardner/glaive-function-calling-v2-sharegpt
    #   - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
    #   - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
    #   - cognitivecomputations/ultrainteract_trajectories_sharegpt
    #   - cognitivecomputations/SystemChat-2.0
    #   - arcee-ai/qwen2-72b-magpie-en
    *_train_fifths('arcee-ai/The-Tome', field='conversations', transform=_to_messages),
    # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
    #   Science:
    #     antiven0m/physical-reasoning-dpoScience
    #     LawalAfeez/science-dataset
    #   Social media:
    #     Kyle1668/AG-Tweets
    #     euclaise/reddit-instruct-curated
    #   General Knowledge:
    #     NousResearch/CharacterCodex_Characters
    #     jstet/quotes-500k_Famous_Quotes
    #     FronkonGames/steam-games-dataset_Video_Games
    #     totuta_youtube_subs_howto100M_HowTo
    #   Multi-lingual:
    #     Amani27/massive_translation_dataset
    #     udmurtNLP/udmurt-russian-english-labse
    #     grosenthal/latin_english
    #     msarmi9/korean-english-multitarget-ted-talks-task
    #     HaiderSultanArc/MT-Urdu-English_Translate
    #     Garsa3112/ChineseEnglishTranslationDataset
    #   Cooking:
    #     andrewsiah/se_cooking_preference_sft
    #     Hieu-Phamkaggle/food_recipes
    #   Writing:
    #     shahules786/PoetryFoundationData
    #     euclaise/writingprompts
    #     qwedsacf/ivypanda-essaysEssay
    #   Medicine:
    #     keivalya/MedQuad-MedicalQnADataset
    #     nuvocare/MSD
    #   History:
    #     ambrosfitz10k/history_data_v4
    #   Law:
    #     dzunggg/legal-qa-v1
    #   Role-Play:
    #     roleplay4/fun_CoupleRP
    #     Undi95andrijdavid/roleplay-conversation-sharegpt
    #   News:
    #     RealTimeData/bbc_news_alltime
    #   Coding: (rombodawg/code_bagel)
    #     layoric/tiny-codes-alpaca
    #     glaiveai/glaive-code-assistant-v3
    #     ajibawa-2023/Code-290k-ShareGPT
    #     chargoddard/commitpack-ft-instruct-rated
    #     iamtarun/code_instructions_120k_alpaca
    #     ise-uiuc/Magicoder-Evol-Instruct-110K
    #     cognitivecomputations/dolphin-coder
    #     nickrosh/Evol-Instruct-Code-80k-v1
    #     coseal/CodeUltraFeedback_binarized
    #     CyberNative/Code_Vulnerability_Security_DPO
    #   Math: (rombodawg/code_bagel)
    #     TIGER-Lab/MathInstruct
    #   Function calling: (rombodawg/code_bagel)
    #     glaiveai/glaive-function-calling-v2
    #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #     teknium/OpenHermes-2.5
    *_train_fifths(
        'rombodawg/Everything_Instruct_Multilingual',
        transform=lambda row: [
            {'role': 'system', 'content': row['instruction']},
            {'role': 'user', 'content': row['input']},
            {'role': 'assistant', 'content': row['output']},
        ],
    ),

    #
    # tool/function calling
    #
    # 65.7 MB, 11,578
    {
        'path': 'NousResearch/hermes-function-calling-v1',
        'field': 'conversations',
        'transform': _to_messages,
    },

    #
    # agent
    #
    # 1.51 GB, 485,874
    *_train_fifths('arcee-ai/agent-data', field='conversations', transform=_to_messages),

    #
    # general reasoning
    #
    # 10.8 MB, 15,770
    {
        'path': 'AtlasUnified/Atlas-Reasoning',
        'data_files': 'reasoning.csv',
        'transform': lambda row: [
            {'role': 'user', 'content': row['Prompt']},
            {'role': 'assistant', 'content': row['Step-by-step reasoning'] + '\n' + row['Solution']},
        ],
    },

    #
    # math reasoning
    #
    # 8.99 MB, 6,914
    {
        'path': 'thesven/gsm8k-reasoning',
        'transform': lambda row: [
            {'role': 'user', 'content': row['question']},
            # 'generation' may be None, hence the empty-string fallback.
            {'role': 'assistant', 'content': (row['generation'] or '') + '\n' + row['answer'] + '\n' + row['short_answer']},
        ],
    },

    # 1.79 MB, 3,963
    {
        'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track',
        'transform': lambda row: [
            {'role': 'user', 'content': row['informal_statement']},
            {'role': 'assistant', 'content': row['informal_proof'] + '\n' + row['formal_proof']},
        ],
    },

    # 307 MB, 19,944
    {
        'path': 'KingNish/reasoning-base-20k',
        'transform': lambda row: [
            {'role': 'user', 'content': row['user']},
            {'role': 'assistant', 'content': row['reasoning'] + '\n' + row['assistant']},
        ],
    },

    # 9.45 MB, 10,000
    {
        'path': 'Aarushhh/math-reasoning-10k',
        'transform': lambda row: [
            {'role': 'user', 'content': row['problem']},
            {'role': 'assistant', 'content': row['plan'] + '\n' + row['solution']},
        ],
    },

    #
    # reflection
    #
    # 4.17 MB, 1,000
    {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': _system_prompt_response},
    # 12.4 MB, 3,000
    {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': _system_prompt_response},
    # 70.8 MB, 36,549
    {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': _system_prompt_response},
    # 30.6 MB, 25,391
    {
        'path': 'flozi00/reflection-qwen2.5-72b-260924',
        'transform': lambda row: [
            # The first element of the row's 'system' list is passed through unchanged.
            row['system'][0],
            {'role': 'user', 'content': row['input']},
            {'role': 'assistant', 'content': row['reflection'] + '\n' + row['output']},
        ],
    },

    #
    # general instructs
    #
    # 971 MB, 484,570
    {'path': 'HuggingFaceTB/smol-smoltalk', 'field': 'messages'},
]
|
scripts/prepare_contrain_datasets.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
|
3 |
+
from litgpt.tokenizer import Tokenizer
|
4 |
+
from litdata import optimize, TokensLoader, StreamingDataset
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
|
7 |
+
from utils import tokenize_chat_fn
|
8 |
+
from contrain_datasets import contrain_datasets
|
9 |
+
|
10 |
+
|
11 |
+
#
|
12 |
+
# optimize datasets
|
13 |
+
#
|
14 |
+
# (block_size, blocks_per_chunk) for each dataset variant. Shared by both loops
# below so the directories that are written and the directories that are read
# back are always derived from the same values.
_VARIANTS = [(4097, 4000), (8193, 2000)]

# The tokenizers do not depend on the variant, so load them once instead of
# once per loop iteration.
_hf_tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
_tokenizer = Tokenizer('..')

for i, (block_size, subchunk_size) in enumerate(_VARIANTS):
    chunk_size = block_size * subchunk_size
    output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'

    outputs = optimize(
        fn=partial(
            tokenize_chat_fn,
            hf_tokenizer=_hf_tokenizer,
            tokenizer=_tokenizer,
        ),
        inputs=contrain_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
        num_workers=32,
        reorder_files=False,
    )

#
# report the length of each optimized dataset and the derived token count
#
for i, (block_size, subchunk_size) in enumerate(_VARIANTS):
    chunk_size = block_size * subchunk_size
    # Must match the output_dir pattern used by the loop above.
    input_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'

    dataset = StreamingDataset(
        input_dir=input_dir,
        item_loader=TokensLoader(block_size=block_size),
    )

    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
|
scripts/prepare_pretrain_datasets.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
|
3 |
+
from litgpt.tokenizer import Tokenizer
|
4 |
+
from litdata import optimize, TokensLoader, StreamingDataset
|
5 |
+
|
6 |
+
from utils import tokenize_text_fn
|
7 |
+
from pretrain_datasets import pretrain_datasets
|
8 |
+
|
9 |
+
|
10 |
+
#
|
11 |
+
# optimize datasets
|
12 |
+
#
|
13 |
+
# (min_len, max_len) buckets passed to tokenize_text_fn; one optimized dataset
# is written per bucket. Shared by both loops below so they cannot drift apart.
_BUCKETS = [(0, 513), (512, 1025), (1024, 2049), (2048, 4097), (4096, 8192), (8192, 1024 ** 3)]


def _layout(b, e):
    """Return ``(block_size, chunk_size)`` for the bucket ``(b, e)``.

    For ``e <= 8192`` the block size is ``64 * 1024 * 1024 // (4 * e)`` and the
    chunk size is ``e * block_size``. For the open-ended last bucket the block
    size is fixed at 2048 and the chunk size is ``b * block_size``.
    """
    if e <= 8192:
        block_size = (64 * 1024 * 1024) // (4 * e)
        return block_size, e * block_size

    block_size = 2048
    return block_size, b * block_size


# The tokenizer does not depend on the bucket, so load it once.
_tokenizer = Tokenizer('..')

#
# optimize datasets
#
for i, (b, e) in enumerate(_BUCKETS):
    block_size, chunk_size = _layout(b, e)
    output_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'

    outputs = optimize(
        fn=partial(tokenize_text_fn, tokenizer=_tokenizer, min_len=b, max_len=e),
        inputs=pretrain_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
        num_workers=32,
        reorder_files=False,
    )

#
# report the length of each optimized dataset
#
for i, (b, e) in enumerate(_BUCKETS):
    block_size, chunk_size = _layout(b, e)
    # Must match the output_dir pattern used by the loop above.
    input_dir = f'../pretrain-data-{i}-{b}-{e}-{block_size}-{chunk_size}'

    dataset = StreamingDataset(
        input_dir=input_dir,
        item_loader=TokensLoader(block_size=block_size),
    )

    # NOTE(review): the dataset is loaded with block_size-token items, so
    # e * len(dataset) may not be the total token count - confirm whether
    # block_size * len(dataset) was intended.
    print(f'{i=}, {b=}, {e=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {e * len(dataset)=}')
|
scripts/pretrain-model-0.yaml
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/config.json
|
2 |
+
|
3 |
+
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
4 |
+
# ``model_config``. (type: Optional[str], default: null)
|
5 |
+
model_name: "Llama-3.2-1B"
|
6 |
+
|
7 |
+
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
8 |
+
# ``model_name``. (type: Optional[Config], default: null)
|
9 |
+
model_config:
|
10 |
+
padded_vocab_size: 65536
|
11 |
+
vocab_size: 65536
|
12 |
+
block_size: 131072
|
13 |
+
n_layer: 32
|
14 |
+
n_head: 16
|
15 |
+
head_size: 64
|
16 |
+
n_embd: 768
|
17 |
+
n_query_groups: 4
|
18 |
+
rotary_percentage: 1.0
|
19 |
+
parallel_residual: false
|
20 |
+
shared_attention_norm: false
|
21 |
+
bias: false
|
22 |
+
# attn_bias: true # qwen 2.5
|
23 |
+
norm_class_name: "RMSNorm"
|
24 |
+
mlp_class_name: "LLaMAMLP"
|
25 |
+
intermediate_size: 2048
|
26 |
+
# rope_base: 500000 # llama 3.2
|
27 |
+
rope_base: 1000000 # qwen 2.5
|
28 |
+
rope_adjustments: # llama 3.2
|
29 |
+
factor: 32.0
|
30 |
+
low_freq_factor: 1.0
|
31 |
+
high_freq_factor: 4.0
|
32 |
+
original_max_seq_len: 8192
|
33 |
+
|
34 |
+
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
35 |
+
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
|
36 |
+
out_dir: "../out/pretrain-0/"
|
37 |
+
|
38 |
+
# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
|
39 |
+
# precision: bf16-mixed
|
40 |
+
precision: bf16-true
|
41 |
+
|
42 |
+
# Optional path to a checkpoint directory to initialize the model from.
|
43 |
+
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
|
44 |
+
initial_checkpoint_dir:
|
45 |
+
|
46 |
+
# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
|
47 |
+
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
|
48 |
+
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
|
49 |
+
# (type: Union[bool, Literal["auto"], Path], default: False)
|
50 |
+
resume: "auto"
|
51 |
+
|
52 |
+
# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
|
53 |
+
data:
|
54 |
+
class_path: LitData
|
55 |
+
|
56 |
+
init_args:
|
57 |
+
data_path: "../pretrain-data-0-0-513-32704-16777152/"
|
58 |
+
num_workers: 32
|
59 |
+
|
60 |
+
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
|
61 |
+
train:
|
62 |
+
# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
|
63 |
+
save_interval: 100
|
64 |
+
|
65 |
+
# Number of iterations between logging calls (type: int, default: 1)
|
66 |
+
log_interval: 1
|
67 |
+
|
68 |
+
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
69 |
+
global_batch_size: 512
|
70 |
+
|
71 |
+
# Number of samples per data-parallel rank (type: int, default: 4)
|
72 |
+
micro_batch_size: 24
|
73 |
+
|
74 |
+
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
75 |
+
lr_warmup_steps: 0
|
76 |
+
|
77 |
+
# Number of epochs to train on (type: Optional[int], default: null)
|
78 |
+
epochs:
|
79 |
+
|
80 |
+
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
81 |
+
max_tokens: 1945266624 # 32_704 * 59_481
|
82 |
+
|
83 |
+
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
84 |
+
max_steps:
|
85 |
+
|
86 |
+
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
87 |
+
max_seq_length: 513
|
88 |
+
|
89 |
+
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
90 |
+
tie_embeddings: true
|
91 |
+
|
92 |
+
# (type: Optional[float], default: 1.0)
|
93 |
+
max_norm: 1.0
|
94 |
+
|
95 |
+
# (type: float, default: 4e-05)
|
96 |
+
min_lr: 1e-05
|
97 |
+
|
98 |
+
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
|
99 |
+
eval:
|
100 |
+
# Number of optimizer steps between evaluation calls (type: int, default: 1000)
|
101 |
+
interval: 100
|
102 |
+
|
103 |
+
# Number of tokens to generate (type: Optional[int], default: null)
|
104 |
+
max_new_tokens:
|
105 |
+
|
106 |
+
# Number of iterations (type: int, default: 100)
|
107 |
+
max_iters: 100
|
108 |
+
|
109 |
+
# Whether to evaluate on the validation set at the beginning of the training
|
110 |
+
initial_validation: false
|
111 |
+
|
112 |
+
# Whether to evaluate on the validation set at the end of the training
|
113 |
+
final_validation: true
|
114 |
+
|
115 |
+
# Optimizer-related arguments
|
116 |
+
optimizer:
|
117 |
+
class_path: grokadamw.GrokAdamW
|
118 |
+
|
119 |
+
init_args:
|
120 |
+
# (type: float, default: 0.001)
|
121 |
+
lr: 1e-04
|
122 |
+
|
123 |
+
# (type: float, default: 0.01)
|
124 |
+
weight_decay: 1e-2
|
125 |
+
|
126 |
+
# (type: tuple, default: (0.9,0.999))
|
127 |
+
betas:
|
128 |
+
- 0.9
|
129 |
+
- 0.999
|
130 |
+
|
131 |
+
# optimizer:
|
132 |
+
# class_path: sophia_opt.SophiaG
|
133 |
+
#
|
134 |
+
# init_args:
|
135 |
+
# lr: 4e-4
|
136 |
+
# betas:
|
137 |
+
# - 0.965
|
138 |
+
# - 0.99
|
139 |
+
# rho: 0.01
|
140 |
+
# weight_decay: 1e-1
|
141 |
+
|
142 |
+
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
143 |
+
devices: auto
|
144 |
+
|
145 |
+
# How many nodes to use. (type: int, default: 1)
|
146 |
+
num_nodes: 1
|
147 |
+
|
148 |
+
# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
|
149 |
+
# modules require this. (type: Optional[Path], default: null)
|
150 |
+
tokenizer_dir: "../"
|
151 |
+
|
152 |
+
# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
|
153 |
+
logger_name: "wandb"
|
154 |
+
|
155 |
+
# The random seed to use for reproducibility. (type: int, default: 42)
|
156 |
+
seed: 23
|
scripts/pretrain_datasets.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def _five_percent_slices(path, fmt, split='train'):
    """Describe `path` as twenty consecutive 5% slices of `split`.

    Every returned config shares the same `fmt` (a callable applied to a row,
    or a `str.format` template) and carries the keys 'path', 'split', 'format'.
    """
    return [
        {'path': path, 'split': f'{split}[{lo}%:{lo + 5}%]', 'format': fmt}
        for lo in range(0, 100, 5)
    ]


# Dataset configs used for pretraining. Each entry is a dict of keyword
# arguments; 'format' is either a callable taking a row or a format template.
pretrain_datasets = [
    #
    # multilingual
    #
    # 3.17 GB, 2,226,907
    *_five_percent_slices('ontocord/fineweb-permissive-multilingual-2m', lambda row: row['text']),
    # 1.64 GB, 1,001,000
    *_five_percent_slices('distily/c4_multilingual_1M', lambda row: row['text']),

    #
    # general knowledge
    #
    # 65.1 MB, 7,819
    {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda row: row['wikipedia_content']},
    # 135 MB, 1,795
    {'path': 'open-phi/textbooks', 'format': lambda row: row['markdown']},
    # 631 MB, 111,048
    {'path': 'open-phi/programming_books_llama', 'format': lambda row: row['markdown']},

    #
    # misc
    #
    # 472 KB, 5,034
    {'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},

    #
    # math
    #
    # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
    *_five_percent_slices(
        'nvidia/OpenMathInstruct-2',
        '{problem} {generated_solution} {expected_answer}',
        split='train_1M',
    ),

    #
    # stem
    #
    # 1.44 GB, 63,357
    # The same dataset is listed twice: once for abstracts, once for full markdown.
    *_five_percent_slices('neuralwork/arxiver', lambda row: row['abstract']),
    *_five_percent_slices('neuralwork/arxiver', lambda row: row['markdown']),

    #
    # code
    #
    # 7.81 GB, ~2,804,025
    *_five_percent_slices('rombodawg/code_bagel_hermes-2.5', '{input} {output}'),

    #
    # general knowledge
    #
    # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
    *_five_percent_slices('JeanKaddour/minipile', lambda row: row['text']),
    {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda row: row['text']},
    {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda row: row['text']},
]
|
scripts/requirements.in
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
2 |
+
torch>=2.2.0,<=2.4.1
|
3 |
+
numpy<2.0
|
4 |
+
|
5 |
+
tqdm
|
6 |
+
datasets
|
7 |
+
jinja2
|
8 |
+
transformers
|
9 |
+
wandb
|
10 |
+
# litgpt[all]
|
11 |
+
litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
|
12 |
+
# litgpt @ git+https://github.com/Lightning-AI/litgpt.git
|
13 |
+
# litdata
|
14 |
+
# litdata @ git+https://github.com/Lightning-AI/litdata.git
|
15 |
+
lm_eval[ifeval,math]
|
16 |
+
grokadamw
|
17 |
+
# bitsandbytes
|
18 |
+
# pyzstd
|
19 |
+
# zstd
|
20 |
+
Pillow
|
21 |
+
|
22 |
+
sophia-opt
|
scripts/tokenizer_datasets.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def _train_slices(path, fmt):
    """Split the 'train' split of `path` into twenty 5% slices sharing `fmt`.

    `fmt` is either a callable applied to a row or a `str.format` template.
    """
    configs = []
    for start in range(0, 100, 5):
        configs.append({'path': path, 'split': f'train[{start}%:{start + 5}%]', 'format': fmt})
    return configs


# Dataset configs the tokenizer is trained on.
tokenizer_datasets = [
    #
    # multilingual
    #
    # 3.17 GB, 2,226,907
    *_train_slices('ontocord/fineweb-permissive-multilingual-2m', lambda row: row['text']),
    # 1.64 GB, 1,001,000
    *_train_slices('distily/c4_multilingual_1M', lambda row: row['text']),

    #
    # stem
    #
    # 1.44 GB, 63,357
    # Listed twice on purpose: abstracts first, then the full markdown.
    *_train_slices('neuralwork/arxiver', lambda row: row['abstract']),
    *_train_slices('neuralwork/arxiver', lambda row: row['markdown']),

    #
    # code
    #
    # 7.81 GB, ~2,804,025
    *_train_slices('rombodawg/code_bagel_hermes-2.5', '{input} {output}'),

    #
    # general knowledge
    #
    # 3.18 GB, 1,010,500 - paper says that extracted is 6GB
    *_train_slices('JeanKaddour/minipile', lambda row: row['text']),
    {'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda row: row['text']},
    {'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda row: row['text']},
]
|
scripts/train_tokenizer.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PreTrainedTokenizerFast
|
2 |
+
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
|
3 |
+
from tokenizers.models import BPE
|
4 |
+
from tokenizers.trainers import BpeTrainer
|
5 |
+
|
6 |
+
from utils import batch_text_iterator
|
7 |
+
from tokenizer_datasets import tokenizer_datasets
|
8 |
+
|
9 |
+
#
|
10 |
+
# special_tokens
|
11 |
+
#
|
12 |
+
bos_token = '<|begin_of_text|>'
eos_token = '<|end_of_text|>'


def _tag_pairs(*names):
    """Expand each name into its opening tag followed by its closing tag."""
    tags = []
    for name in names:
        tags += [f'<{name}>', f'</{name}>']
    return tags


special_tokens = [
    # chat framing and role names
    bos_token,
    eos_token,
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|eom_id|>',
    '<|eot_id|>',
    'system',
    'user',
    'assistant',

    # tool/function calling
    *_tag_pairs('tools', 'tool', 'tool_call', 'tool_response'),
    '"name"',
    '"arguments"',

    #
    # JSON Schema
    #
    # General Metadata Keywords
    '"$schema"',
    '"$id"',
    '"$ref"',
    '"$defs"',
    '"$anchor"',
    '"$dynamicAnchor"',
    '"$dynamicRef"',
    '"$vocabulary"',
    '"$comment"',
    # Data Types
    '"null"',
    '"boolean"',
    '"object"',
    '"array"',
    '"number"',
    '"string"',
    '"integer"',
    # Validation Keywords
    '"type"',
    '"enum"',
    '"const"',
    '"multipleOf"',
    '"maximum"',
    '"exclusiveMaximum"',
    '"minimum"',
    '"exclusiveMinimum"',
    '"maxLength"',
    '"minLength"',
    '"pattern"',
    '"additionalItems"',
    '"items"',
    '"prefixItems"',
    '"contains"',
    '"maxItems"',
    '"minItems"',
    '"uniqueItems"',
    '"maxProperties"',
    '"minProperties"',
    '"required"',
    '"properties"',
    '"patternProperties"',
    '"additionalProperties"',
    '"dependentRequired"',
    '"dependentSchemas"',
    '"propertyNames"',
    # Conditional Keywords
    '"if"',
    '"then"',
    '"else"',
    '"allOf"',
    '"anyOf"',
    '"oneOf"',
    '"not"',
    # Additional Keywords for Evaluation Control
    '"unevaluatedItems"',
    '"unevaluatedProperties"',
    # Informational Keywords
    '"title"',
    '"description"',
    '"default"',
    '"deprecated"',
    '"readOnly"',
    '"writeOnly"',
    '"examples"',
    # Content-Related Keywords
    '"contentEncoding"',
    '"contentMediaType"',
    '"contentSchema"',
    # Additional Keywords
    '"next"',   # Typically used in reference to linked or next items
    '"value"',  # Represents the value of a property or item

    # misc
    *_tag_pairs('input', 'output', 'query', 'key', 'value', 'text', 'code', 'image', 'file'),

    # qa
    *_tag_pairs('question', 'answer'),

    # thought
    *_tag_pairs('thought', 'plan', 'vote', 'passage'),

    # reasoning
    *_tag_pairs('reasoning', 'acting', 'action', 'observation', 'claim'),

    # reflection
    *_tag_pairs('thinking', 'reflection', 'step'),

    # graph (the 'value' tag pair is already listed under misc)
    *_tag_pairs('graph', 'edge', 'source', 'destination', 'relation'),
]

# Pad the hand-picked tokens with numbered reserved placeholders so that the
# first 256 entries are all taken; nothing is added if the list is already
# that long.
reserved_count = 256 - len(special_tokens)
special_tokens.extend(f'<|reserved_special_token_{slot}|>' for slot in range(reserved_count))

# One token per byte value, written as two upper-case hex digits.
special_tokens.extend(f'<0x{byte:02X}>' for byte in range(256))
|
190 |
+
|
191 |
+
#
|
192 |
+
# BPE Tokenizer
|
193 |
+
#
|
194 |
+
# BPE model with no unknown token and byte fallback enabled.
bpe = BPE(unk_token=None, byte_fallback=True)
tokenizer = Tokenizer(bpe)

# No text normalization is applied before pre-tokenization.
tokenizer.normalizer = None

# Byte-level pipeline: pre-tokenizer, post-processor and decoder. Note that
# the three stages deliberately use different flag combinations.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
    add_prefix_space=False,
    trim_offsets=True,
    use_regex=True,
)
tokenizer.post_processor = processors.ByteLevel(
    add_prefix_space=True,
    trim_offsets=False,
    use_regex=True,
)
tokenizer.decoder = decoders.ByteLevel(
    add_prefix_space=True,
    trim_offsets=True,
    use_regex=True,
)

#
# BPE Trainer
#
trainer = BpeTrainer(
    vocab_size=128 * 1024,  # 131072
    min_frequency=3,
    special_tokens=special_tokens,
    max_token_length=16,
)

# Stream every configured dataset through the trainer.
training_texts = batch_text_iterator(tokenizer_datasets)
tokenizer.train_from_iterator(training_texts, trainer)

# Write the full tokenizer definition, then the bare model files, to the
# parent directory.
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')
|
226 |
+
|
227 |
+
#
|
228 |
+
# PreTrainedTokenizerFast
|
229 |
+
#
|
230 |
+
# Jinja chat template, assembled from its pieces without any separators:
# the BOS token, then each message as header(role) + content + end-of-turn,
# then either an open assistant header (when a generation prompt is requested)
# or the EOS token.
CHAT_TEMPLATE = ''.join([
    "{{ bos_token }}",

    "{% for message in messages %}",
    "{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}",
    "{% endfor %}",

    "{% if add_generation_prompt %}",
    "{{ '<|start_header_id|>assistant<|end_header_id|>' }}",
    "{% else %}",
    "{{ eos_token }}",
    "{% endif %}",
])
|
243 |
+
|
244 |
+
# Wrap the trained tokenizer in the transformers fast-tokenizer class, attach
# the chat template and BOS/EOS tokens, and save it to the parent directory.
fast_tokenizer_kwargs = dict(
    tokenizer_object=tokenizer,
    chat_template=CHAT_TEMPLATE,
    bos_token=bos_token,
    eos_token=eos_token,
    clean_up_tokenization_spaces=False,
)
fast_tokenizer = PreTrainedTokenizerFast(**fast_tokenizer_kwargs)
fast_tokenizer.save_pretrained('../')
|
scripts/utils.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
from typing import Union, Optional, Iterator, Callable
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from datasets import load_dataset
|
6 |
+
from litgpt.tokenizer import Tokenizer
|
7 |
+
from transformers import AutoTokenizer
|
8 |
+
|
9 |
+
def _batch_text_iterator(path: str,
                         name: Optional[str]=None,
                         data_dir: Optional[str]=None,
                         data_files: Optional[str]=None,
                         keep_in_memory: bool=False,
                         revision: Optional[str]=None,
                         split: str='train',
                         num_proc: Optional[int]=None,
                         format: Optional[Callable|str]=None) -> Iterator[str]:
    """Yield one formatted text per row of a dataset loaded with `load_dataset`.

    All parameters except `format` are forwarded to `load_dataset` (which is
    always called with `trust_remote_code=True`).

    `format` decides how a row becomes text:
      * a callable is invoked with the row and its result is yielded;
      * a string is used as a `str.format` template, with the row's columns
        passed as keyword arguments.

    Raises:
        AssertionError: if `format` is neither a string nor a callable (this
            includes the default `None`). Because this is a generator, the
            check runs when the first item is requested.
    """
    assert isinstance(format, str) or callable(format), repr(format)

    rows = load_dataset(
        path=path,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        keep_in_memory=keep_in_memory,
        revision=revision,
        split=split,
        trust_remote_code=True,
        num_proc=num_proc,
    )

    if callable(format):
        render = format
    else:
        def render(row):
            return format.format(**row)

    for row in rows:
        yield render(row)

    # Once exhausted, drop the dataset reference and force a collection
    # before the next dataset is loaded.
    del rows
    gc.collect()
|
41 |
+
|
42 |
+
|
43 |
+
def _batch_chat_iterator(path: str,
                         name: Optional[str]=None,
                         data_dir: Optional[str]=None,
                         data_files: Optional[str]=None,
                         keep_in_memory: bool=False,
                         revision: Optional[str]=None,
                         split: str='train',
                         num_proc: Optional[int]=None,
                         field: Optional[str]=None,
                         transform: Optional[Callable]=None) -> Iterator[list[dict[str, str]]]:
    """Yield one message list per row of a dataset loaded with `load_dataset`.

    All parameters except `field` and `transform` are forwarded to
    `load_dataset` (which is always called with `trust_remote_code=True`).

    How a row becomes messages:
      * `transform` callable, `field` set: `transform(row[field])`;
      * `transform` callable, no `field`:  `transform(row)`;
      * no `transform`, `field` set:       `row[field]` as-is.

    Raises:
        ValueError: if neither a callable `transform` nor a `field` is given.
            It is raised while processing the first row, so an empty dataset
            does not trigger it.
    """
    rows = load_dataset(
        path=path,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        keep_in_memory=keep_in_memory,
        revision=revision,
        split=split,
        trust_remote_code=True,
        num_proc=num_proc,
    )

    for row in rows:
        if callable(transform):
            yield transform(row[field] if field else row)
        elif field:
            yield row[field]
        else:
            raise ValueError(field)

    # Once exhausted, drop the dataset reference and force a collection
    # before the next dataset is loaded.
    del rows
    gc.collect()
|
83 |
+
|
84 |
+
|
85 |
+
def batch_text_iterator(dataset_config: Union[list, dict]) -> Iterator[str]:
    """Yield texts from one dataset config or, in order, from a list of them.

    Each config is a dict of keyword arguments for `_batch_text_iterator`.

    Raises:
        AssertionError: if `dataset_config` is neither a dict nor a list
            (checked when the first item is requested).
    """
    assert isinstance(dataset_config, (dict, list)), dataset_config

    # Treat a single config as a one-element list so both shapes share a loop.
    configs = [dataset_config] if isinstance(dataset_config, dict) else dataset_config

    for config in configs:
        yield from _batch_text_iterator(**config)
|
95 |
+
|
96 |
+
|
97 |
+
def batch_chat_iterator(dataset_config: Union[list, dict]) -> Iterator[list[dict[str, str]]]:
    """Yield message lists from one dataset config or, in order, from a list of them.

    Each config is a dict of keyword arguments for `_batch_chat_iterator`.

    Raises:
        AssertionError: if `dataset_config` is neither a dict nor a list
            (checked when the first item is requested).
    """
    assert isinstance(dataset_config, (dict, list)), dataset_config

    # Treat a single config as a one-element list so both shapes share a loop.
    configs = [dataset_config] if isinstance(dataset_config, dict) else dataset_config

    for config in configs:
        yield from _batch_chat_iterator(**config)
|
107 |
+
|
108 |
+
|
109 |
+
def tokenize_text_fn(dataset_config: list, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
    """Tokenize every text produced by `batch_text_iterator(dataset_config)`.

    Each text is encoded with `tokenizer.encode(text, bos=False, eos=True)`
    and yielded exactly once if its token count lies within the requested
    bounds.

    Args:
        dataset_config: dataset config(s) passed to `batch_text_iterator`.
        tokenizer: object whose `encode(text, bos=..., eos=...)` returns the
            token ids.
        min_len: inclusive lower bound on the token count; `None` disables it.
        max_len: inclusive upper bound on the token count; `None` disables it.

    Yields:
        The encoded token ids of each text that passes the length filter.
    """
    for text in batch_text_iterator(dataset_config):
        text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=True)
        length = len(text_ids)

        # The bounds are checked without ever reassigning `min_len`/`max_len`.
        # Filling in a missing bound from the current sample inside the loop
        # would leak into later iterations (capping every following sample at
        # the first sample's length) and would also emit the first sample
        # twice when no bounds are given.
        if min_len is not None and length < min_len:
            continue

        if max_len is not None and length > max_len:
            continue

        yield text_ids
|
124 |
+
|
125 |
+
|
126 |
+
def tokenize_chat_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
    """Tokenize every conversation produced by `batch_chat_iterator(dataset_config)`.

    Each message list is first rendered to plain text with
    `hf_tokenizer.apply_chat_template(messages, tokenize=False)` and then
    encoded with `tokenizer.encode(text, bos=False, eos=False)`. The result is
    yielded exactly once if its token count lies within the requested bounds.

    Args:
        dataset_config: dataset config(s) passed to `batch_chat_iterator`.
        hf_tokenizer: tokenizer providing `apply_chat_template`.
        tokenizer: object whose `encode(text, bos=..., eos=...)` returns the
            token ids.
        min_len: inclusive lower bound on the token count; `None` disables it.
        max_len: inclusive upper bound on the token count; `None` disables it.

    Yields:
        The encoded token ids of each conversation that passes the length filter.
    """
    for messages in batch_chat_iterator(dataset_config):
        # NOTE(review): bos/eos are not added here, presumably because the
        # chat template already emits them - confirm against the template.
        text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
        text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
        length = len(text_ids)

        # The bounds are checked without ever reassigning `min_len`/`max_len`.
        # Filling in a missing bound from the current sample inside the loop
        # would leak into later iterations (capping every following sample at
        # the first sample's length) and would also emit the first sample
        # twice when no bounds are given.
        if min_len is not None and length < min_len:
            continue

        if max_len is not None and length > max_len:
            continue

        yield text_ids
|