Andrew DalPino committed · Commit 160e81f · Parent(s): 8c4359f

Cleanup checkpointing history

Files changed:
- README.md +3 -3
- model.py +1 -1
- pre-train.py +1 -9
README.md
CHANGED
@@ -45,10 +45,10 @@ python pre-train.py
 
 > Note that it will take a while to download and pre-process the dataset the first time that the training script is run.
 
-To customize the default "lightgpt-small" architecture, you can adjust the `block_size`, `embedding_dimensions`, and `num_hidden_layers` arguments of the pre-training script.
+To customize the default "lightgpt-small" architecture, you can adjust the `block_size`, `embedding_dimensions`, `num_hidden_layers`, and `num_attention_heads` arguments of the pre-training script. Refer to the `model_sizing.ipynb` notebook for an estimate of the memory and compute requirements of your chosen architecture.
 
 ```
-python pre-train.py --block_size=2048 --embedding_dimensions=4096 --num_hidden_layers=64
+python pre-train.py --block_size=2048 --embedding_dimensions=4096 --num_hidden_layers=64 --num_attention_heads=64
 ```
 
 You can also adjust the `batch_size`, `learning_rate`, and `gradient_accumulation_steps` to suit your training setup.
@@ -57,7 +57,7 @@ You can also adjust the `batch_size`, `learning_rate`, and `gradient_accumulation_steps` to suit your training setup.
 python pre-train.py --batch_size=32 --learning_rate=0.01 --gradient_accumulation_steps=128
 ```
 
-For distributed training use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel session. The example below is for executing the training script on a single node with 8 individual GPUs.
+For distributed training, use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel session. The example below is for executing the training script on a single node with 8 individual GPUs.
 
 ```
 torchrun --standalone --nnodes=1 --nproc-per-node=8 pre-train.py --batch_size=16 --gradient_accumulation_steps=128
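The `batch_size`, `gradient_accumulation_steps`, and `--nproc-per-node` flags above jointly determine how many sequences contribute to each optimizer update. A back-of-the-envelope sketch, assuming `batch_size` is per process and that torchrun starts one process per GPU (the usual DDP convention, not something this diff confirms):

```
# Rough effective-batch-size math for the torchrun example above
# (a sketch, not code from the repository).
batch_size = 16                    # --batch_size, per process (assumed)
gradient_accumulation_steps = 128  # --gradient_accumulation_steps
world_size = 8                     # --nproc-per-node
block_size = 1024                  # assumed context length (model.py default)

sequences_per_update = batch_size * gradient_accumulation_steps * world_size
tokens_per_update = sequences_per_update * block_size

print(sequences_per_update)  # 16384 sequences per optimizer step
print(tokens_per_update)     # 16777216 tokens per optimizer step
```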
model.py
CHANGED
@@ -34,7 +34,7 @@ class GPT(Module):
         block_size: int = 1024,
         embedding_dimensions: int = 1024,
         num_heads: int = 16,
-        num_layers: int =
+        num_layers: int = 32,
         dropout: float = 0.1,
         activation_checkpointing: bool = False,
         vocabulary_size: int = 50257,
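For context, a minimal construction sketch using the defaults visible in this hunk. The `from model import GPT` import path is assumed from the file name, and only parameters shown in the diff are passed:

```
# A sketch of constructing the model with the defaults shown above.
# The import path is assumed; the class internals are not part of this diff.
from model import GPT

model = GPT(
    block_size=1024,
    embedding_dimensions=1024,
    num_heads=16,
    num_layers=32,  # new default set by this commit
    dropout=0.1,
    activation_checkpointing=False,
    vocabulary_size=50257,
)
```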
pre-train.py
CHANGED
@@ -54,7 +54,6 @@ def main():
     parser.add_argument("--eval_interval", default=10, type=int)
     parser.add_argument("--checkpoint_interval", default=20, type=int)
     parser.add_argument("--checkpoint_path", default="./out/checkpoint.pt", type=str)
-    parser.add_argument("--checkpoint_history", action="store_true")
     parser.add_argument("--resume", action="store_true")
     parser.add_argument("--dataset_path", default="./dataset", type=str)
     parser.add_argument("--num_dataset_processes", default=8, type=int)
@@ -290,14 +289,7 @@ def main():
                 "optimizer": optimizer.state_dict(),
             }
 
-            if args.checkpoint_history:
-                root, ext = path.splitext(args.checkpoint_path)
-
-                checkpoint_path = f"{root}-{epoch}{ext}"
-            else:
-                checkpoint_path = args.checkpoint_path
-
-            torch.save(checkpoint, checkpoint_path)
+            torch.save(checkpoint, args.checkpoint_path)
 
             print("Checkpoint saved")
 
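Taken together, the pre-train.py changes drop the per-epoch checkpoint history: the `--checkpoint_history` flag and the `-{epoch}` suffixed filenames are gone, and a single file at `--checkpoint_path` is simply overwritten each checkpoint interval. A minimal sketch of that save/resume round trip, assuming a `"model"` state-dict key alongside the `"optimizer"` key visible in the diff; the stand-in model and optimizer below are illustrative, not the repository's:

```
import os

import torch
from torch.nn import Linear
from torch.optim import AdamW

# Stand-ins for the real model and optimizer built in pre-train.py.
model = Linear(8, 8)
optimizer = AdamW(model.parameters(), lr=1e-3)

checkpoint_path = "./out/checkpoint.pt"  # the --checkpoint_path default

os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

checkpoint = {
    "model": model.state_dict(),        # assumed key; only "optimizer" appears in the diff
    "optimizer": optimizer.state_dict(),
}

# After this commit the same path is overwritten every checkpoint interval
# instead of writing a new epoch-suffixed file.
torch.save(checkpoint, checkpoint_path)

# Resuming (the --resume flag) would then load the same file back.
checkpoint = torch.load(checkpoint_path, weights_only=True)

model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
```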