{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 774,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03875968992248062,
      "grad_norm": 2.577054727951972,
      "learning_rate": 2e-06,
      "loss": 0.721,
      "step": 10
    },
    {
      "epoch": 0.07751937984496124,
      "grad_norm": 1.3292498101090573,
      "learning_rate": 2e-06,
      "loss": 0.6438,
      "step": 20
    },
    {
      "epoch": 0.11627906976744186,
      "grad_norm": 2.3414148705752926,
      "learning_rate": 2e-06,
      "loss": 0.6248,
      "step": 30
    },
    {
      "epoch": 0.15503875968992248,
      "grad_norm": 2.621497754252154,
      "learning_rate": 2e-06,
      "loss": 0.6155,
      "step": 40
    },
    {
      "epoch": 0.1937984496124031,
      "grad_norm": 1.642709929025015,
      "learning_rate": 2e-06,
      "loss": 0.6022,
      "step": 50
    },
    {
      "epoch": 0.23255813953488372,
      "grad_norm": 1.4108572822591154,
      "learning_rate": 2e-06,
      "loss": 0.6008,
      "step": 60
    },
    {
      "epoch": 0.2713178294573643,
      "grad_norm": 1.8320230338483412,
      "learning_rate": 2e-06,
      "loss": 0.5995,
      "step": 70
    },
    {
      "epoch": 0.31007751937984496,
      "grad_norm": 2.0509677067609906,
      "learning_rate": 2e-06,
      "loss": 0.5919,
      "step": 80
    },
    {
      "epoch": 0.3488372093023256,
      "grad_norm": 1.8615028380190877,
      "learning_rate": 2e-06,
      "loss": 0.5883,
      "step": 90
    },
    {
      "epoch": 0.3875968992248062,
      "grad_norm": 1.651732467548723,
      "learning_rate": 2e-06,
      "loss": 0.5867,
      "step": 100
    },
    {
      "epoch": 0.4263565891472868,
      "grad_norm": 1.827483286285098,
      "learning_rate": 2e-06,
      "loss": 0.5824,
      "step": 110
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 1.878984672982078,
      "learning_rate": 2e-06,
      "loss": 0.5768,
      "step": 120
    },
    {
      "epoch": 0.5038759689922481,
      "grad_norm": 1.461881748693503,
      "learning_rate": 2e-06,
      "loss": 0.5791,
      "step": 130
    },
    {
      "epoch": 0.5426356589147286,
      "grad_norm": 1.7965397328352164,
      "learning_rate": 2e-06,
      "loss": 0.5769,
      "step": 140
    },
    {
      "epoch": 0.5813953488372093,
      "grad_norm": 1.9228942975753127,
      "learning_rate": 2e-06,
      "loss": 0.5808,
      "step": 150
    },
    {
      "epoch": 0.6201550387596899,
      "grad_norm": 1.1676070598043125,
      "learning_rate": 2e-06,
      "loss": 0.5704,
      "step": 160
    },
    {
      "epoch": 0.6589147286821705,
      "grad_norm": 1.1754765506774811,
      "learning_rate": 2e-06,
      "loss": 0.5689,
      "step": 170
    },
    {
      "epoch": 0.6976744186046512,
      "grad_norm": 1.7812488410180551,
      "learning_rate": 2e-06,
      "loss": 0.5668,
      "step": 180
    },
    {
      "epoch": 0.7364341085271318,
      "grad_norm": 1.808692259046032,
      "learning_rate": 2e-06,
      "loss": 0.5753,
      "step": 190
    },
    {
      "epoch": 0.7751937984496124,
      "grad_norm": 1.8584614057755389,
      "learning_rate": 2e-06,
      "loss": 0.5652,
      "step": 200
    },
    {
      "epoch": 0.813953488372093,
      "grad_norm": 1.599864515399134,
      "learning_rate": 2e-06,
      "loss": 0.5683,
      "step": 210
    },
    {
      "epoch": 0.8527131782945736,
      "grad_norm": 1.6516869387309938,
      "learning_rate": 2e-06,
      "loss": 0.5673,
      "step": 220
    },
    {
      "epoch": 0.8914728682170543,
      "grad_norm": 1.6848137903140608,
      "learning_rate": 2e-06,
      "loss": 0.5643,
      "step": 230
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 1.3032607046739686,
      "learning_rate": 2e-06,
      "loss": 0.5649,
      "step": 240
    },
    {
      "epoch": 0.9689922480620154,
      "grad_norm": 1.5431955138670315,
      "learning_rate": 2e-06,
      "loss": 0.5597,
      "step": 250
    },
    {
      "epoch": 1.0077519379844961,
      "grad_norm": 2.0310291210286238,
      "learning_rate": 2e-06,
      "loss": 0.5571,
      "step": 260
    },
    {
      "epoch": 1.0465116279069768,
      "grad_norm": 1.2551497648955234,
      "learning_rate": 2e-06,
      "loss": 0.5299,
      "step": 270
    },
    {
      "epoch": 1.0852713178294573,
      "grad_norm": 1.1678057917636586,
      "learning_rate": 2e-06,
      "loss": 0.5317,
      "step": 280
    },
    {
      "epoch": 1.124031007751938,
      "grad_norm": 1.125227296422933,
      "learning_rate": 2e-06,
      "loss": 0.5275,
      "step": 290
    },
    {
      "epoch": 1.1627906976744187,
      "grad_norm": 1.3900860537725233,
      "learning_rate": 2e-06,
      "loss": 0.5236,
      "step": 300
    },
    {
      "epoch": 1.2015503875968991,
      "grad_norm": 1.2021576698979493,
      "learning_rate": 2e-06,
      "loss": 0.5306,
      "step": 310
    },
    {
      "epoch": 1.2403100775193798,
      "grad_norm": 1.2053080767276194,
      "learning_rate": 2e-06,
      "loss": 0.5277,
      "step": 320
    },
    {
      "epoch": 1.2790697674418605,
      "grad_norm": 1.3991030309565773,
      "learning_rate": 2e-06,
      "loss": 0.5274,
      "step": 330
    },
    {
      "epoch": 1.3178294573643412,
      "grad_norm": 1.1826629752006035,
      "learning_rate": 2e-06,
      "loss": 0.531,
      "step": 340
    },
    {
      "epoch": 1.3565891472868217,
      "grad_norm": 1.1287968703954268,
      "learning_rate": 2e-06,
      "loss": 0.5275,
      "step": 350
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 1.5127256802296236,
      "learning_rate": 2e-06,
      "loss": 0.5317,
      "step": 360
    },
    {
      "epoch": 1.4341085271317828,
      "grad_norm": 1.5013875357775832,
      "learning_rate": 2e-06,
      "loss": 0.5297,
      "step": 370
    },
    {
      "epoch": 1.4728682170542635,
      "grad_norm": 1.7614327167944768,
      "learning_rate": 2e-06,
      "loss": 0.526,
      "step": 380
    },
    {
      "epoch": 1.5116279069767442,
      "grad_norm": 1.4214027958077065,
      "learning_rate": 2e-06,
      "loss": 0.5293,
      "step": 390
    },
    {
      "epoch": 1.550387596899225,
      "grad_norm": 1.6564120069131323,
      "learning_rate": 2e-06,
      "loss": 0.5255,
      "step": 400
    },
    {
      "epoch": 1.5891472868217056,
      "grad_norm": 1.378206427304318,
      "learning_rate": 2e-06,
      "loss": 0.5211,
      "step": 410
    },
    {
      "epoch": 1.627906976744186,
      "grad_norm": 1.988629014217936,
      "learning_rate": 2e-06,
      "loss": 0.526,
      "step": 420
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.9503056920319735,
      "learning_rate": 2e-06,
      "loss": 0.5288,
      "step": 430
    },
    {
      "epoch": 1.7054263565891472,
      "grad_norm": 1.732782542682626,
      "learning_rate": 2e-06,
      "loss": 0.5241,
      "step": 440
    },
    {
      "epoch": 1.744186046511628,
      "grad_norm": 1.528224415622805,
      "learning_rate": 2e-06,
      "loss": 0.5272,
      "step": 450
    },
    {
      "epoch": 1.7829457364341086,
      "grad_norm": 1.1793820983967487,
      "learning_rate": 2e-06,
      "loss": 0.5226,
      "step": 460
    },
    {
      "epoch": 1.8217054263565893,
      "grad_norm": 1.1809068975224493,
      "learning_rate": 2e-06,
      "loss": 0.5242,
      "step": 470
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 1.379981502891672,
      "learning_rate": 2e-06,
      "loss": 0.5273,
      "step": 480
    },
    {
      "epoch": 1.8992248062015504,
      "grad_norm": 1.8940459651264965,
      "learning_rate": 2e-06,
      "loss": 0.5236,
      "step": 490
    },
    {
      "epoch": 1.937984496124031,
      "grad_norm": 1.898050329402726,
      "learning_rate": 2e-06,
      "loss": 0.5263,
      "step": 500
    },
    {
      "epoch": 1.9767441860465116,
      "grad_norm": 1.6806097447928925,
      "learning_rate": 2e-06,
      "loss": 0.5231,
      "step": 510
    },
    {
      "epoch": 2.0155038759689923,
      "grad_norm": 1.7706824956520133,
      "learning_rate": 2e-06,
      "loss": 0.5092,
      "step": 520
    },
    {
      "epoch": 2.054263565891473,
      "grad_norm": 1.1944302472248438,
      "learning_rate": 2e-06,
      "loss": 0.4867,
      "step": 530
    },
    {
      "epoch": 2.0930232558139537,
      "grad_norm": 1.3491176032576682,
      "learning_rate": 2e-06,
      "loss": 0.4842,
      "step": 540
    },
    {
      "epoch": 2.1317829457364343,
      "grad_norm": 1.182447255831288,
      "learning_rate": 2e-06,
      "loss": 0.4843,
      "step": 550
    },
    {
      "epoch": 2.1705426356589146,
      "grad_norm": 1.0865626136425315,
      "learning_rate": 2e-06,
      "loss": 0.4878,
      "step": 560
    },
    {
      "epoch": 2.2093023255813953,
      "grad_norm": 1.1893887821883726,
      "learning_rate": 2e-06,
      "loss": 0.4852,
      "step": 570
    },
    {
      "epoch": 2.248062015503876,
      "grad_norm": 1.197021602818726,
      "learning_rate": 2e-06,
      "loss": 0.4858,
      "step": 580
    },
    {
      "epoch": 2.2868217054263567,
      "grad_norm": 1.0736236283669167,
      "learning_rate": 2e-06,
      "loss": 0.4893,
      "step": 590
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 1.017760123624255,
      "learning_rate": 2e-06,
      "loss": 0.4858,
      "step": 600
    },
    {
      "epoch": 2.3643410852713176,
      "grad_norm": 1.1866183343445358,
      "learning_rate": 2e-06,
      "loss": 0.4893,
      "step": 610
    },
    {
      "epoch": 2.4031007751937983,
      "grad_norm": 1.0875734211472181,
      "learning_rate": 2e-06,
      "loss": 0.4893,
      "step": 620
    },
    {
      "epoch": 2.441860465116279,
      "grad_norm": 1.4175668384873126,
      "learning_rate": 2e-06,
      "loss": 0.4879,
      "step": 630
    },
    {
      "epoch": 2.4806201550387597,
      "grad_norm": 1.5445789270591548,
      "learning_rate": 2e-06,
      "loss": 0.4905,
      "step": 640
    },
    {
      "epoch": 2.5193798449612403,
      "grad_norm": 1.1527990919509379,
      "learning_rate": 2e-06,
      "loss": 0.4861,
      "step": 650
    },
    {
      "epoch": 2.558139534883721,
      "grad_norm": 1.0983296348010934,
      "learning_rate": 2e-06,
      "loss": 0.4893,
      "step": 660
    },
    {
      "epoch": 2.5968992248062017,
      "grad_norm": 1.7393177878547021,
      "learning_rate": 2e-06,
      "loss": 0.4915,
      "step": 670
    },
    {
      "epoch": 2.6356589147286824,
      "grad_norm": 1.6766013258821928,
      "learning_rate": 2e-06,
      "loss": 0.4847,
      "step": 680
    },
    {
      "epoch": 2.6744186046511627,
      "grad_norm": 1.5674228297707393,
      "learning_rate": 2e-06,
      "loss": 0.4876,
      "step": 690
    },
    {
      "epoch": 2.7131782945736433,
      "grad_norm": 1.5303183442776453,
      "learning_rate": 2e-06,
      "loss": 0.4876,
      "step": 700
    },
    {
      "epoch": 2.751937984496124,
      "grad_norm": 1.830851000547026,
      "learning_rate": 2e-06,
      "loss": 0.4839,
      "step": 710
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 1.7047867407889794,
      "learning_rate": 2e-06,
      "loss": 0.4898,
      "step": 720
    },
    {
      "epoch": 2.8294573643410854,
      "grad_norm": 1.5112038359871813,
      "learning_rate": 2e-06,
      "loss": 0.4899,
      "step": 730
    },
    {
      "epoch": 2.8682170542635657,
      "grad_norm": 1.4351025379161033,
      "learning_rate": 2e-06,
      "loss": 0.4892,
      "step": 740
    },
    {
      "epoch": 2.9069767441860463,
      "grad_norm": 1.222684031513469,
      "learning_rate": 2e-06,
      "loss": 0.4864,
      "step": 750
    },
    {
      "epoch": 2.945736434108527,
      "grad_norm": 1.3407496538423456,
      "learning_rate": 2e-06,
      "loss": 0.4881,
      "step": 760
    },
    {
      "epoch": 2.9844961240310077,
      "grad_norm": 1.5499167243320089,
      "learning_rate": 2e-06,
      "loss": 0.488,
      "step": 770
    },
    {
      "epoch": 3.0,
      "step": 774,
      "total_flos": 2591282618695680.0,
      "train_loss": 0.5347004343372906,
      "train_runtime": 11473.9369,
      "train_samples_per_second": 69.018,
      "train_steps_per_second": 0.067
    }
  ],
  "logging_steps": 10,
  "max_steps": 774,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2591282618695680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}