model
Browse files- scripts/model.yaml +4 -2
scripts/model.yaml
CHANGED
@@ -56,7 +56,8 @@ train:
|
|
56 |
global_batch_size: 512
|
57 |
|
58 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
59 |
-
micro_batch_size: 16
|
|
|
60 |
|
61 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
62 |
lr_warmup_steps: 2000
|
@@ -102,7 +103,8 @@ eval:
|
|
102 |
|
103 |
# Optimizer-related arguments
|
104 |
optimizer:
|
105 |
-
class_path: torch.optim.AdamW
|
|
|
106 |
# class_path: bitsandbytes.optim.PagedAdamW
|
107 |
# class_path: bitsandbytes.optim.AdamW8bit
|
108 |
# class_path: bitsandbytes.optim.PagedAdamW8bit
|
|
|
56 |
global_batch_size: 512
|
57 |
|
58 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
59 |
+
# micro_batch_size: 16
|
60 |
+
micro_batch_size: 32
|
61 |
|
62 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
63 |
lr_warmup_steps: 2000
|
|
|
103 |
|
104 |
# Optimizer-related arguments
|
105 |
optimizer:
|
106 |
+
# class_path: torch.optim.AdamW
|
107 |
+
class_path: bitsandbytes.optim.PagedAdamW
|
108 |
# class_path: bitsandbytes.optim.PagedAdamW
|
109 |
# class_path: bitsandbytes.optim.AdamW8bit
|
110 |
# class_path: bitsandbytes.optim.PagedAdamW8bit
|