|
{
  "best_metric": 1.0532242059707642,
  "best_model_checkpoint": "outputs/checkpoint-648",
  "epoch": 19.865047233468285,
  "eval_steps": 500,
  "global_step": 920,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.4318488529014845,
      "grad_norm": 0.8586685061454773,
      "learning_rate": 6e-06,
      "loss": 2.3406,
      "step": 20
    },
    {
      "epoch": 0.863697705802969,
      "grad_norm": 0.6262326240539551,
      "learning_rate": 1.2e-05,
      "loss": 2.2977,
      "step": 40
    },
    {
      "epoch": 0.9932523616734144,
      "eval_loss": 2.046743154525757,
      "eval_runtime": 11.8178,
      "eval_samples_per_second": 31.478,
      "eval_steps_per_second": 3.977,
      "step": 46
    },
    {
      "epoch": 1.2955465587044535,
      "grad_norm": 0.5441118478775024,
      "learning_rate": 1.8e-05,
      "loss": 2.1983,
      "step": 60
    },
    {
      "epoch": 1.7273954116059378,
      "grad_norm": 0.5027536153793335,
      "learning_rate": 2.4e-05,
      "loss": 2.0126,
      "step": 80
    },
    {
      "epoch": 1.9865047233468287,
      "eval_loss": 1.756650686264038,
      "eval_runtime": 11.8327,
      "eval_samples_per_second": 31.438,
      "eval_steps_per_second": 3.972,
      "step": 92
    },
    {
      "epoch": 2.1592442645074224,
      "grad_norm": 0.5985815525054932,
      "learning_rate": 3e-05,
      "loss": 1.9539,
      "step": 100
    },
    {
      "epoch": 2.591093117408907,
      "grad_norm": 0.5876953601837158,
      "learning_rate": 2.9955987017756105e-05,
      "loss": 1.8487,
      "step": 120
    },
    {
      "epoch": 2.979757085020243,
      "eval_loss": 1.5829453468322754,
      "eval_runtime": 11.8342,
      "eval_samples_per_second": 31.434,
      "eval_steps_per_second": 3.972,
      "step": 138
    },
    {
      "epoch": 3.0229419703103915,
      "grad_norm": 0.9397473931312561,
      "learning_rate": 2.982420635670523e-05,
      "loss": 1.838,
      "step": 140
    },
    {
      "epoch": 3.454790823211876,
      "grad_norm": 0.9532083868980408,
      "learning_rate": 2.9605431358166687e-05,
      "loss": 1.7307,
      "step": 160
    },
    {
      "epoch": 3.8866396761133606,
      "grad_norm": 0.9994444847106934,
      "learning_rate": 2.9300945880823957e-05,
      "loss": 1.6956,
      "step": 180
    },
    {
      "epoch": 3.9946018893387314,
      "eval_loss": 1.4417527914047241,
      "eval_runtime": 11.8388,
      "eval_samples_per_second": 31.422,
      "eval_steps_per_second": 3.97,
      "step": 185
    },
    {
      "epoch": 4.318488529014845,
      "grad_norm": 1.2850545644760132,
      "learning_rate": 2.8912536766531424e-05,
      "loss": 1.6058,
      "step": 200
    },
    {
      "epoch": 4.75033738191633,
      "grad_norm": 1.5529141426086426,
      "learning_rate": 2.8442483354415835e-05,
      "loss": 1.5981,
      "step": 220
    },
    {
      "epoch": 4.987854251012146,
      "eval_loss": 1.3265655040740967,
      "eval_runtime": 11.8379,
      "eval_samples_per_second": 31.424,
      "eval_steps_per_second": 3.97,
      "step": 231
    },
    {
      "epoch": 5.182186234817814,
      "grad_norm": 1.335929036140442,
      "learning_rate": 2.789354410480802e-05,
      "loss": 1.5531,
      "step": 240
    },
    {
      "epoch": 5.614035087719298,
      "grad_norm": 1.6453096866607666,
      "learning_rate": 2.7268940411500768e-05,
      "loss": 1.4762,
      "step": 260
    },
    {
      "epoch": 5.98110661268556,
      "eval_loss": 1.2389419078826904,
      "eval_runtime": 11.8303,
      "eval_samples_per_second": 31.445,
      "eval_steps_per_second": 3.973,
      "step": 277
    },
    {
      "epoch": 6.045883940620783,
      "grad_norm": 1.6546735763549805,
      "learning_rate": 2.6572337697329145e-05,
      "loss": 1.4335,
      "step": 280
    },
    {
      "epoch": 6.477732793522267,
      "grad_norm": 1.6672505140304565,
      "learning_rate": 2.5807823904011803e-05,
      "loss": 1.3456,
      "step": 300
    },
    {
      "epoch": 6.909581646423752,
      "grad_norm": 1.872035264968872,
      "learning_rate": 2.497988550248348e-05,
      "loss": 1.3907,
      "step": 320
    },
    {
      "epoch": 6.995951417004049,
      "eval_loss": 1.1727797985076904,
      "eval_runtime": 11.8334,
      "eval_samples_per_second": 31.436,
      "eval_steps_per_second": 3.972,
      "step": 324
    },
    {
      "epoch": 7.341430499325236,
      "grad_norm": 2.2928590774536133,
      "learning_rate": 2.4093381164499572e-05,
      "loss": 1.2972,
      "step": 340
    },
    {
      "epoch": 7.77327935222672,
      "grad_norm": 2.046597957611084,
      "learning_rate": 2.315351325001832e-05,
      "loss": 1.3121,
      "step": 360
    },
    {
      "epoch": 7.989203778677463,
      "eval_loss": 1.1317445039749146,
      "eval_runtime": 11.8354,
      "eval_samples_per_second": 31.431,
      "eval_steps_per_second": 3.971,
      "step": 370
    },
    {
      "epoch": 8.205128205128204,
      "grad_norm": 2.3657939434051514,
      "learning_rate": 2.2165797277683945e-05,
      "loss": 1.2587,
      "step": 380
    },
    {
      "epoch": 8.63697705802969,
      "grad_norm": 2.6628217697143555,
      "learning_rate": 2.11360295575701e-05,
      "loss": 1.2259,
      "step": 400
    },
    {
      "epoch": 8.982456140350877,
      "eval_loss": 1.089838981628418,
      "eval_runtime": 11.8331,
      "eval_samples_per_second": 31.437,
      "eval_steps_per_second": 3.972,
      "step": 416
    },
    {
      "epoch": 9.068825910931174,
      "grad_norm": 2.126490354537964,
      "learning_rate": 2.007025317612754e-05,
      "loss": 1.2231,
      "step": 420
    },
    {
      "epoch": 9.50067476383266,
      "grad_norm": 2.33013653755188,
      "learning_rate": 1.897472253294993e-05,
      "loss": 1.1601,
      "step": 440
    },
    {
      "epoch": 9.932523616734143,
      "grad_norm": 2.1320955753326416,
      "learning_rate": 1.7855866637470027e-05,
      "loss": 1.1858,
      "step": 460
    },
    {
      "epoch": 9.997300944669366,
      "eval_loss": 1.0753756761550903,
      "eval_runtime": 11.8356,
      "eval_samples_per_second": 31.431,
      "eval_steps_per_second": 3.971,
      "step": 463
    },
    {
      "epoch": 10.364372469635628,
      "grad_norm": 2.7270896434783936,
      "learning_rate": 1.6720251380976008e-05,
      "loss": 1.1264,
      "step": 480
    },
    {
      "epoch": 10.796221322537113,
      "grad_norm": 2.817499876022339,
      "learning_rate": 1.557454100535053e-05,
      "loss": 1.0953,
      "step": 500
    },
    {
      "epoch": 10.99055330634278,
      "eval_loss": 1.0628381967544556,
      "eval_runtime": 11.8346,
      "eval_samples_per_second": 31.433,
      "eval_steps_per_second": 3.971,
      "step": 509
    },
    {
      "epoch": 11.228070175438596,
      "grad_norm": 2.4983890056610107,
      "learning_rate": 1.442545899464947e-05,
      "loss": 1.1448,
      "step": 520
    },
    {
      "epoch": 11.65991902834008,
      "grad_norm": 3.187267303466797,
      "learning_rate": 1.3279748619023994e-05,
      "loss": 1.0969,
      "step": 540
    },
    {
      "epoch": 11.983805668016194,
      "eval_loss": 1.0652039051055908,
      "eval_runtime": 11.8314,
      "eval_samples_per_second": 31.442,
      "eval_steps_per_second": 3.972,
      "step": 555
    },
    {
      "epoch": 12.091767881241566,
      "grad_norm": 2.7236690521240234,
      "learning_rate": 1.2144133362529972e-05,
      "loss": 1.0659,
      "step": 560
    },
    {
      "epoch": 12.523616734143049,
      "grad_norm": 2.291569232940674,
      "learning_rate": 1.1025277467050077e-05,
      "loss": 1.0509,
      "step": 580
    },
    {
      "epoch": 12.955465587044534,
      "grad_norm": 3.7598021030426025,
      "learning_rate": 9.929746823872461e-06,
      "loss": 1.0398,
      "step": 600
    },
    {
      "epoch": 12.998650472334683,
      "eval_loss": 1.0660185813903809,
      "eval_runtime": 11.8359,
      "eval_samples_per_second": 31.43,
      "eval_steps_per_second": 3.971,
      "step": 602
    },
    {
      "epoch": 13.387314439946019,
      "grad_norm": 2.636166572570801,
      "learning_rate": 8.863970442429903e-06,
      "loss": 1.0089,
      "step": 620
    },
    {
      "epoch": 13.819163292847504,
      "grad_norm": 2.8337323665618896,
      "learning_rate": 7.834202722316054e-06,
      "loss": 1.0692,
      "step": 640
    },
    {
      "epoch": 13.991902834008098,
      "eval_loss": 1.0532242059707642,
      "eval_runtime": 11.8398,
      "eval_samples_per_second": 31.419,
      "eval_steps_per_second": 3.97,
      "step": 648
    },
    {
      "epoch": 14.251012145748987,
      "grad_norm": 3.053009033203125,
      "learning_rate": 6.846486749981685e-06,
      "loss": 1.0366,
      "step": 660
    },
    {
      "epoch": 14.682860998650472,
      "grad_norm": 2.598278522491455,
      "learning_rate": 5.906618835500434e-06,
      "loss": 0.9994,
      "step": 680
    },
    {
      "epoch": 14.98515519568151,
      "eval_loss": 1.0662660598754883,
      "eval_runtime": 11.8811,
      "eval_samples_per_second": 31.31,
      "eval_steps_per_second": 3.956,
      "step": 694
    },
    {
      "epoch": 15.114709851551957,
      "grad_norm": 3.115391492843628,
      "learning_rate": 5.0201144975165215e-06,
      "loss": 1.0248,
      "step": 700
    },
    {
      "epoch": 15.54655870445344,
      "grad_norm": 2.9552927017211914,
      "learning_rate": 4.192176095988196e-06,
      "loss": 0.9757,
      "step": 720
    },
    {
      "epoch": 15.978407557354926,
      "grad_norm": 2.836552143096924,
      "learning_rate": 3.4276623026708552e-06,
      "loss": 1.0462,
      "step": 740
    },
    {
      "epoch": 16.0,
      "eval_loss": 1.0600887537002563,
      "eval_runtime": 11.8762,
      "eval_samples_per_second": 31.323,
      "eval_steps_per_second": 3.957,
      "step": 741
    },
    {
      "epoch": 16.41025641025641,
      "grad_norm": 2.7339868545532227,
      "learning_rate": 2.7310595884992356e-06,
      "loss": 0.956,
      "step": 760
    },
    {
      "epoch": 16.842105263157894,
      "grad_norm": 2.860797882080078,
      "learning_rate": 2.1064558951919854e-06,
      "loss": 1.0121,
      "step": 780
    },
    {
      "epoch": 16.993252361673413,
      "eval_loss": 1.067551851272583,
      "eval_runtime": 11.9433,
      "eval_samples_per_second": 31.147,
      "eval_steps_per_second": 3.935,
      "step": 787
    },
    {
      "epoch": 17.27395411605938,
      "grad_norm": 3.2689273357391357,
      "learning_rate": 1.5575166455841678e-06,
      "loss": 1.0109,
      "step": 800
    },
    {
      "epoch": 17.705802968960864,
      "grad_norm": 2.9255502223968506,
      "learning_rate": 1.0874632334685808e-06,
      "loss": 0.926,
      "step": 820
    },
    {
      "epoch": 17.98650472334683,
      "eval_loss": 1.0671662092208862,
      "eval_runtime": 11.9354,
      "eval_samples_per_second": 31.168,
      "eval_steps_per_second": 3.938,
      "step": 833
    },
    {
      "epoch": 18.13765182186235,
      "grad_norm": 2.799520492553711,
      "learning_rate": 6.990541191760419e-07,
      "loss": 1.015,
      "step": 840
    },
    {
      "epoch": 18.569500674763834,
      "grad_norm": 3.2285776138305664,
      "learning_rate": 3.945686418333155e-07,
      "loss": 0.9512,
      "step": 860
    },
    {
      "epoch": 18.979757085020243,
      "eval_loss": 1.0708719491958618,
      "eval_runtime": 11.9397,
      "eval_samples_per_second": 31.157,
      "eval_steps_per_second": 3.936,
      "step": 879
    },
    {
      "epoch": 19.00134952766532,
      "grad_norm": 2.990569591522217,
      "learning_rate": 1.7579364329477376e-07,
      "loss": 0.9571,
      "step": 880
    },
    {
      "epoch": 19.4331983805668,
      "grad_norm": 3.1629343032836914,
      "learning_rate": 4.401298224389338e-08,
      "loss": 0.9524,
      "step": 900
    },
    {
      "epoch": 19.865047233468285,
      "grad_norm": 3.00166392326355,
      "learning_rate": 0.0,
      "loss": 0.976,
      "step": 920
    },
    {
      "epoch": 19.865047233468285,
      "eval_loss": 1.0710477828979492,
      "eval_runtime": 11.9108,
      "eval_samples_per_second": 31.232,
      "eval_steps_per_second": 3.946,
      "step": 920
    }
  ],
  "logging_steps": 20,
  "max_steps": 920,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "total_flos": 8.174619616582042e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}