mtasic85 committed
Commit bf45302
1 Parent(s): a07758a
Files changed (1)
  1. scripts/model.yaml +130 -0
scripts/model.yaml ADDED
@@ -0,0 +1,130 @@
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+ # ``model_config``. (type: Optional[str], default: null)
+ model_name: "tiny-llama-1.1b"
+
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+ # ``model_name``. (type: Optional[Config], default: null)
+ model_config:
+   padded_vocab_size: 32768
+   vocab_size: 32768
+   block_size: 32768
+   n_layer: 10
+   n_head: 12
+   head_size: null
+   n_embd: 312
+   n_query_groups: 4
+   rotary_percentage: 1.0
+   parallel_residual: false
+   bias: false
+   norm_class_name: "RMSNorm"
+   norm_eps: 1e-05
+   mlp_class_name: "LLaMAMLP"
+   intermediate_size: 1092
+   rope_base: 500000
+
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+ out_dir: out/pretrain/
+
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+ precision: bf16-mixed
+
+ # Optional path to a checkpoint directory to initialize the model from.
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+ initial_checkpoint_dir:
+
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+ # (type: Union[bool, Literal["auto"], Path], default: False)
+ resume: false
+
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+ # data: LitData
+
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+ train:
+   # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+   save_interval: 1000
+
+   # Number of iterations between logging calls (type: int, default: 1)
+   log_interval: 1
+
+   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+   global_batch_size: 512
+
+   # Number of samples per data-parallel rank (type: int, default: 4)
+   micro_batch_size: 4
+
+   # Number of iterations with learning rate warmup active (type: int, default: 2000)
+   lr_warmup_steps: 2000
+
+   # Number of epochs to train on (type: Optional[int], default: null)
+   epochs:
+
+   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+   max_tokens: 3000000000000
+
+   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+   max_steps:
+
+   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+   max_seq_length: 2048
+
+   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+   tie_embeddings:
+
+   # (type: Optional[float], default: 1.0)
+   max_norm: 1.0
+
+   # (type: float, default: 4e-05)
+   min_lr: 4.0e-05
+
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+ eval:
+   # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+   interval: 1000
+
+   # Number of tokens to generate (type: Optional[int], default: null)
+   max_new_tokens:
+
+   # Number of iterations (type: int, default: 100)
+   max_iters: 100
+
+   # Whether to evaluate on the validation set at the beginning of training
+   initial_validation: false
+
+   # Whether to evaluate on the validation set at the end of training
+   final_validation: false
+
+ # Optimizer-related arguments
+ optimizer:
+   class_path: torch.optim.AdamW
+
+   init_args:
+     # (type: float, default: 0.001)
+     lr: 5e-5
+
+     # (type: float, default: 0.01)
+     weight_decay: 0.1
+
+     # (type: tuple, default: (0.9,0.999))
+     betas:
+       - 0.9
+       - 0.95
+
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+ devices: auto
+
+ # How many nodes to use. (type: int, default: 1)
+ num_nodes: 1
+
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+ # modules require this. (type: Optional[Path], default: null)
+ tokenizer_dir: "../"
+
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+ logger_name: "wandb"
+
+ # The random seed to use for reproducibility. (type: int, default: 42)
+ seed: 42
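
For reference, here is a minimal sketch (not part of this commit) of how the ``model_config`` block above maps onto ``litgpt.config.Config``. It assumes ``litgpt`` and ``PyYAML`` are installed and that the file sits at ``scripts/model.yaml`` as in this diff; the config itself would normally be consumed by litgpt's launcher, e.g. ``litgpt pretrain --config scripts/model.yaml``.

    # Sketch only: load scripts/model.yaml and inspect the architecture it defines.
    # Assumes litgpt and PyYAML are installed; key names follow the YAML above.
    import yaml
    from litgpt.config import Config

    with open("scripts/model.yaml") as f:
        cfg = yaml.safe_load(f)

    # Build the litgpt Config from the ``model_config`` section
    # (10 layers, 312-dim embeddings, 12 heads in 4 query groups, 32k vocab and context).
    model_config = Config(**cfg["model_config"])
    print(model_config.n_layer, model_config.n_embd, model_config.n_query_groups)

litgpt's CLI parses and validates this file itself, so the snippet is only useful for inspecting the architecture outside of a training run.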