|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.995433789954339, |
|
"eval_steps": 500, |
|
"global_step": 1204, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.182648401826484, |
|
"grad_norm": 0.4084379971027374, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8312, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.365296803652968, |
|
"grad_norm": 0.24725936353206635, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1547, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.547945205479452, |
|
"grad_norm": 0.1690889149904251, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0644, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.730593607305936, |
|
"grad_norm": 0.09192364662885666, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0466, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.91324200913242, |
|
"grad_norm": 0.08266641944646835, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0385, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.095890410958904, |
|
"grad_norm": 0.10168185085058212, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0379, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.278538812785388, |
|
"grad_norm": 0.10715723037719727, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0337, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.461187214611872, |
|
"grad_norm": 0.08185174316167831, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0304, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.643835616438356, |
|
"grad_norm": 0.0720980241894722, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0342, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.82648401826484, |
|
"grad_norm": 0.07974616438150406, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0312, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.009132420091324, |
|
"grad_norm": 0.08611268550157547, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0315, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.191780821917808, |
|
"grad_norm": 0.06699004024267197, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0267, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.374429223744292, |
|
"grad_norm": 0.1077587902545929, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0246, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.557077625570776, |
|
"grad_norm": 0.10352851450443268, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0267, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.73972602739726, |
|
"grad_norm": 0.08488716930150986, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0297, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.922374429223744, |
|
"grad_norm": 0.08407847583293915, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0269, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.105022831050228, |
|
"grad_norm": 0.0976366400718689, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0251, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.287671232876712, |
|
"grad_norm": 0.08240761607885361, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0229, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.470319634703196, |
|
"grad_norm": 0.0689239650964737, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0232, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.65296803652968, |
|
"grad_norm": 0.0607539638876915, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0231, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.8356164383561646, |
|
"grad_norm": 0.06858925521373749, |
|
"learning_rate": 0.0001, |
|
"loss": 0.023, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.018264840182648, |
|
"grad_norm": 0.04049643874168396, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0231, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.200913242009133, |
|
"grad_norm": 0.08556920289993286, |
|
"learning_rate": 0.0001, |
|
"loss": 0.018, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.383561643835616, |
|
"grad_norm": 0.05961354076862335, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0183, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.566210045662101, |
|
"grad_norm": 0.05691586434841156, |
|
"learning_rate": 0.0001, |
|
"loss": 0.02, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.748858447488584, |
|
"grad_norm": 0.05423538759350777, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0196, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.931506849315069, |
|
"grad_norm": 0.10058747231960297, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0206, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.114155251141552, |
|
"grad_norm": 0.064676932990551, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0177, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 5.296803652968037, |
|
"grad_norm": 0.08128379285335541, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0157, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 5.47945205479452, |
|
"grad_norm": 0.10474538058042526, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0169, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 5.662100456621005, |
|
"grad_norm": 0.09420209378004074, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0207, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 5.844748858447488, |
|
"grad_norm": 0.07704417407512665, |
|
"learning_rate": 0.0001, |
|
"loss": 0.018, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 6.027397260273973, |
|
"grad_norm": 0.044411078095436096, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0168, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 6.210045662100456, |
|
"grad_norm": 0.09763959795236588, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0131, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 6.392694063926941, |
|
"grad_norm": 0.08706251531839371, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0146, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 6.575342465753424, |
|
"grad_norm": 0.10404196381568909, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0169, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 6.757990867579909, |
|
"grad_norm": 0.1037658154964447, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0165, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 6.940639269406392, |
|
"grad_norm": 0.07572110742330551, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0168, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 7.123287671232877, |
|
"grad_norm": 0.06740553677082062, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0139, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 7.30593607305936, |
|
"grad_norm": 0.08043979108333588, |
|
"learning_rate": 0.0001, |
|
"loss": 0.014, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 7.488584474885845, |
|
"grad_norm": 0.06607798486948013, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0136, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 7.671232876712329, |
|
"grad_norm": 0.11705009639263153, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0146, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 7.853881278538813, |
|
"grad_norm": 0.04560132324695587, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0154, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 8.036529680365296, |
|
"grad_norm": 0.05037812143564224, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0129, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 8.219178082191782, |
|
"grad_norm": 0.07135117053985596, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0109, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 8.401826484018265, |
|
"grad_norm": 0.05977578088641167, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0117, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 8.584474885844749, |
|
"grad_norm": 0.07411223649978638, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0111, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 8.767123287671232, |
|
"grad_norm": 0.08515261113643646, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0122, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 8.949771689497716, |
|
"grad_norm": 0.07383166998624802, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0125, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 9.132420091324201, |
|
"grad_norm": 0.041954681277275085, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0105, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 9.315068493150685, |
|
"grad_norm": 0.09089387208223343, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0105, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 9.497716894977168, |
|
"grad_norm": 0.08716876059770584, |
|
"learning_rate": 0.0001, |
|
"loss": 0.011, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 9.680365296803654, |
|
"grad_norm": 0.04927799850702286, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0106, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 9.863013698630137, |
|
"grad_norm": 0.05259260907769203, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0111, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 10.045662100456621, |
|
"grad_norm": 0.04412449151277542, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0106, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 10.228310502283104, |
|
"grad_norm": 0.05673637241125107, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0087, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 10.41095890410959, |
|
"grad_norm": 0.04577219486236572, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0094, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 10.593607305936073, |
|
"grad_norm": 0.05691211298108101, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0098, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 10.776255707762557, |
|
"grad_norm": 0.05354565382003784, |
|
"learning_rate": 0.0001, |
|
"loss": 0.01, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 10.95890410958904, |
|
"grad_norm": 0.06758158653974533, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0104, |
|
"step": 1200 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 10900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.136418741180006e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|