Andrew DalPino committed on
Commit 160e81f · 1 Parent(s): 8c4359f

Cleanup checkpointing history

Files changed (3):
  1. README.md +3 -3
  2. model.py +1 -1
  3. pre-train.py +1 -9
README.md CHANGED
@@ -45,10 +45,10 @@ python pre-train.py
 
 > Note that it will take a while to download and pre-process the dataset the first time that the training script is run.
 
-To customize the default "lightgpt-small" architecture you can adjust the `block_size`, `embedding_dimensions`, and `num_layers` arguments of the pre-training script. Refer to the `model_sizing.ipynb` notebook for an estimation of the memory and compute requirements for a particular architecture.
+To customize the default "lightgpt-small" architecture you can adjust the `block_size`, `embedding_dimensions`, `num_hidden_layers`, and `num_attention_heads` arguments of the pre-training script. Refer to the `model_sizing.ipynb` notebook for an estimation of the memory and compute requirements for your chosen architecture.
 
 ```
-python pre-train.py --block_size=2048 --embedding_dimensions=4096 --num_hidden_layers=64
+python pre-train.py --block_size=2048 --embedding_dimensions=4096 --num_hidden_layers=64 --num_attention_heads=64
 ```
 
 You can also adjust the `batch_size`, `learning_rate`, and `gradient_accumulation_steps` to suite your training setup.
@@ -57,7 +57,7 @@ You can also adjust the `batch_size`, `learning_rate`, and `gradient_accumulatio
 python pre-train.py --batch_size=32 --learning_rate=0.01 --gradient_accumulation_steps=128
 ```
 
-For distributed training use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel session. The example below is for executing the training script on a single node with individual 8 GPUs.
+For distributed training, use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel session. The example below is for executing the training script on a single node with individual 8 GPUs.
 
 ```
 torchrun --standalone --nnodes=1 --nproc-per-node=8 pre-train.py --batch_size=16 --gradient_accumulation_steps=128
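
For orientation, here is a minimal sketch of how the flags documented above could be wired from `argparse` into the model constructor. The flag names follow the README and the defaults mirror the `model.py` signature below, but the mapping of `--num_attention_heads` onto the model's `num_heads` parameter, the defaults, and the import path are assumptions for illustration, not the repository's actual wiring.

```
# Hypothetical wiring of the README's architecture flags into the GPT constructor.
# The --num_attention_heads -> num_heads mapping, the defaults, and the import
# path are assumptions; only the flag and parameter names come from this commit.
from argparse import ArgumentParser

from model import GPT

parser = ArgumentParser()

parser.add_argument("--block_size", default=1024, type=int)
parser.add_argument("--embedding_dimensions", default=1024, type=int)
parser.add_argument("--num_hidden_layers", default=32, type=int)
parser.add_argument("--num_attention_heads", default=16, type=int)

args = parser.parse_args()

model = GPT(
    block_size=args.block_size,
    embedding_dimensions=args.embedding_dimensions,
    num_heads=args.num_attention_heads,  # assumed mapping onto model.py's num_heads
    num_layers=args.num_hidden_layers,
)
```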
model.py CHANGED
@@ -34,7 +34,7 @@ class GPT(Module):
         block_size: int = 1024,
         embedding_dimensions: int = 1024,
         num_heads: int = 16,
-        num_layers: int = 24,
+        num_layers: int = 32,
         dropout: float = 0.1,
         activation_checkpointing: bool = False,
         vocabulary_size: int = 50257,
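
As a quick sanity check of the new default depth, a hedged sketch that constructs the model with the signature shown above and counts its parameters; the keyword names and defaults come from the diff, while the import path and the parameter count snippet are assumptions.

```
# Sketch only: build GPT with the defaults visible in the diff above.
# The `from model import GPT` path is assumed; keyword names match the signature.
from model import GPT

model = GPT(
    block_size=1024,
    embedding_dimensions=1024,
    num_heads=16,
    num_layers=32,  # new default depth introduced in this commit
    dropout=0.1,
    activation_checkpointing=False,
    vocabulary_size=50257,  # GPT-2 BPE vocabulary
)

# Rough parameter count, useful alongside the model_sizing.ipynb estimates.
num_params = sum(p.numel() for p in model.parameters())

print(f"Parameters: {num_params:,}")
```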
pre-train.py CHANGED
@@ -54,7 +54,6 @@ def main():
     parser.add_argument("--eval_interval", default=10, type=int)
     parser.add_argument("--checkpoint_interval", default=20, type=int)
     parser.add_argument("--checkpoint_path", default="./out/checkpoint.pt", type=str)
-    parser.add_argument("--checkpoint_history", action="store_true")
     parser.add_argument("--resume", action="store_true")
     parser.add_argument("--dataset_path", default="./dataset", type=str)
     parser.add_argument("--num_dataset_processes", default=8, type=int)
@@ -290,14 +289,7 @@ def main():
                 "optimizer": optimizer.state_dict(),
             }
 
-            if args.checkpoint_history:
-                root, ext = path.splitext(args.checkpoint_path)
-
-                checkpoint_path = f"{root}-{epoch}{ext}"
-            else:
-                checkpoint_path = args.checkpoint_path
-
-            torch.save(checkpoint, checkpoint_path)
+            torch.save(checkpoint, args.checkpoint_path)
 
             print("Checkpoint saved")
 
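
To make the simplified behavior concrete, here is a minimal sketch of saving and resuming a single rolling checkpoint at a fixed path, in the spirit of the change above. Only `checkpoint_path`, the `"optimizer"` state dict entry, and the "Checkpoint saved" message are taken from the diff; the `"model"` entry, the function names, and the resume logic are assumptions.

```
# Minimal sketch: one rolling checkpoint file, overwritten at every checkpoint
# interval, replacing the removed per-epoch --checkpoint_history behavior.
# Function names and the "model" key are assumptions for illustration.
import torch


def save_checkpoint(model, optimizer, checkpoint_path="./out/checkpoint.pt"):
    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }

    torch.save(checkpoint, checkpoint_path)

    print("Checkpoint saved")


def resume_from_checkpoint(model, optimizer, checkpoint_path="./out/checkpoint.pt"):
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
```

Overwriting a single file keeps disk usage bounded; the trade-off is losing the ability to roll back to an earlier epoch, which is exactly what the removed `--checkpoint_history` flag used to provide.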