|
{ |
|
"best_metric": 1.0477440357208252, |
|
"best_model_checkpoint": "outputs/checkpoint-555", |
|
"epoch": 18.979757085020243, |
|
"eval_steps": 500, |
|
"global_step": 879, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4318488529014845, |
|
"grad_norm": 0.8887946605682373, |
|
"learning_rate": 6e-06, |
|
"loss": 2.3403, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.863697705802969, |
|
"grad_norm": 0.6425972580909729, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.296, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9932523616734144, |
|
"eval_loss": 2.043026924133301, |
|
"eval_runtime": 11.9177, |
|
"eval_samples_per_second": 31.214, |
|
"eval_steps_per_second": 3.944, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.2955465587044535, |
|
"grad_norm": 0.5528222918510437, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.1948, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.7273954116059378, |
|
"grad_norm": 0.5118728876113892, |
|
"learning_rate": 2.4e-05, |
|
"loss": 2.0099, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.9865047233468287, |
|
"eval_loss": 1.7544503211975098, |
|
"eval_runtime": 11.9277, |
|
"eval_samples_per_second": 31.188, |
|
"eval_steps_per_second": 3.94, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.1592442645074224, |
|
"grad_norm": 0.6005122661590576, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9526, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.591093117408907, |
|
"grad_norm": 0.580225944519043, |
|
"learning_rate": 2.9973151946516027e-05, |
|
"loss": 1.8471, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.979757085020243, |
|
"eval_loss": 1.5811642408370972, |
|
"eval_runtime": 11.9279, |
|
"eval_samples_per_second": 31.187, |
|
"eval_steps_per_second": 3.94, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 3.0229419703103915, |
|
"grad_norm": 0.9076627492904663, |
|
"learning_rate": 2.989270389512756e-05, |
|
"loss": 1.8369, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.454790823211876, |
|
"grad_norm": 0.9181211590766907, |
|
"learning_rate": 2.9758943828979444e-05, |
|
"loss": 1.7294, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.8866396761133606, |
|
"grad_norm": 0.953004002571106, |
|
"learning_rate": 2.957235057439301e-05, |
|
"loss": 1.6939, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.9946018893387314, |
|
"eval_loss": 1.4385555982589722, |
|
"eval_runtime": 11.9317, |
|
"eval_samples_per_second": 31.177, |
|
"eval_steps_per_second": 3.939, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 4.318488529014845, |
|
"grad_norm": 1.2283871173858643, |
|
"learning_rate": 2.9333592086792113e-05, |
|
"loss": 1.6026, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.75033738191633, |
|
"grad_norm": 1.482079267501831, |
|
"learning_rate": 2.904352305959606e-05, |
|
"loss": 1.5949, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.987854251012146, |
|
"eval_loss": 1.3210583925247192, |
|
"eval_runtime": 11.9364, |
|
"eval_samples_per_second": 31.165, |
|
"eval_steps_per_second": 3.938, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 5.182186234817814, |
|
"grad_norm": 1.305906057357788, |
|
"learning_rate": 2.8703181864639013e-05, |
|
"loss": 1.5484, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.614035087719298, |
|
"grad_norm": 1.6025702953338623, |
|
"learning_rate": 2.8313786835068314e-05, |
|
"loss": 1.4699, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.98110661268556, |
|
"eval_loss": 1.2309151887893677, |
|
"eval_runtime": 11.9213, |
|
"eval_samples_per_second": 31.205, |
|
"eval_steps_per_second": 3.943, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 6.045883940620783, |
|
"grad_norm": 1.6253570318222046, |
|
"learning_rate": 2.7876731904027994e-05, |
|
"loss": 1.4256, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.477732793522267, |
|
"grad_norm": 1.6080800294876099, |
|
"learning_rate": 2.7393581614739924e-05, |
|
"loss": 1.3346, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.909581646423752, |
|
"grad_norm": 1.8431881666183472, |
|
"learning_rate": 2.6866065519845124e-05, |
|
"loss": 1.3798, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.995951417004049, |
|
"eval_loss": 1.1634759902954102, |
|
"eval_runtime": 11.927, |
|
"eval_samples_per_second": 31.19, |
|
"eval_steps_per_second": 3.941, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 7.341430499325236, |
|
"grad_norm": 2.2968621253967285, |
|
"learning_rate": 2.6296071990054167e-05, |
|
"loss": 1.2827, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.77327935222672, |
|
"grad_norm": 2.0731184482574463, |
|
"learning_rate": 2.5685641454270172e-05, |
|
"loss": 1.2972, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.989203778677463, |
|
"eval_loss": 1.1176676750183105, |
|
"eval_runtime": 11.9385, |
|
"eval_samples_per_second": 31.16, |
|
"eval_steps_per_second": 3.937, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 2.328958749771118, |
|
"learning_rate": 2.5036959095382875e-05, |
|
"loss": 1.2416, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.63697705802969, |
|
"grad_norm": 2.6954452991485596, |
|
"learning_rate": 2.4352347027881003e-05, |
|
"loss": 1.2072, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.982456140350877, |
|
"eval_loss": 1.084679365158081, |
|
"eval_runtime": 11.9276, |
|
"eval_samples_per_second": 31.188, |
|
"eval_steps_per_second": 3.94, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 9.068825910931174, |
|
"grad_norm": 2.3439438343048096, |
|
"learning_rate": 2.3634255985285104e-05, |
|
"loss": 1.2027, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.50067476383266, |
|
"grad_norm": 2.3041927814483643, |
|
"learning_rate": 2.288525654715757e-05, |
|
"loss": 1.135, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.932523616734143, |
|
"grad_norm": 2.1704723834991455, |
|
"learning_rate": 2.210802993709498e-05, |
|
"loss": 1.1612, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.997300944669366, |
|
"eval_loss": 1.0629429817199707, |
|
"eval_runtime": 11.909, |
|
"eval_samples_per_second": 31.237, |
|
"eval_steps_per_second": 3.947, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 10.364372469635628, |
|
"grad_norm": 2.9438016414642334, |
|
"learning_rate": 2.1305358424643484e-05, |
|
"loss": 1.0958, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 10.796221322537113, |
|
"grad_norm": 2.8956775665283203, |
|
"learning_rate": 2.0480115365495928e-05, |
|
"loss": 1.0673, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.99055330634278, |
|
"eval_loss": 1.0595099925994873, |
|
"eval_runtime": 11.9097, |
|
"eval_samples_per_second": 31.235, |
|
"eval_steps_per_second": 3.946, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 11.228070175438596, |
|
"grad_norm": 2.5965044498443604, |
|
"learning_rate": 1.963525491562421e-05, |
|
"loss": 1.1122, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 11.65991902834008, |
|
"grad_norm": 3.310291290283203, |
|
"learning_rate": 1.877380145616763e-05, |
|
"loss": 1.0611, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.983805668016194, |
|
"eval_loss": 1.0477440357208252, |
|
"eval_runtime": 11.9101, |
|
"eval_samples_per_second": 31.234, |
|
"eval_steps_per_second": 3.946, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 12.091767881241566, |
|
"grad_norm": 2.768397569656372, |
|
"learning_rate": 1.78988387669333e-05, |
|
"loss": 1.0279, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 12.523616734143049, |
|
"grad_norm": 2.6344711780548096, |
|
"learning_rate": 1.7013498987264832e-05, |
|
"loss": 1.0072, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 12.955465587044534, |
|
"grad_norm": 3.8299720287323, |
|
"learning_rate": 1.6120951403796367e-05, |
|
"loss": 0.9972, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.998650472334683, |
|
"eval_loss": 1.049790859222412, |
|
"eval_runtime": 11.9084, |
|
"eval_samples_per_second": 31.239, |
|
"eval_steps_per_second": 3.947, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 13.387314439946019, |
|
"grad_norm": 2.735280990600586, |
|
"learning_rate": 1.5224391105228956e-05, |
|
"loss": 0.9579, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 13.819163292847504, |
|
"grad_norm": 2.9928438663482666, |
|
"learning_rate": 1.4327027544742281e-05, |
|
"loss": 1.0177, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.991902834008098, |
|
"eval_loss": 1.0596399307250977, |
|
"eval_runtime": 11.9225, |
|
"eval_samples_per_second": 31.201, |
|
"eval_steps_per_second": 3.942, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 14.251012145748987, |
|
"grad_norm": 2.9771831035614014, |
|
"learning_rate": 1.3432073050985201e-05, |
|
"loss": 0.9815, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 14.682860998650472, |
|
"grad_norm": 2.8720908164978027, |
|
"learning_rate": 1.2542731328772936e-05, |
|
"loss": 0.9378, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 14.98515519568151, |
|
"eval_loss": 1.0685012340545654, |
|
"eval_runtime": 11.9562, |
|
"eval_samples_per_second": 31.114, |
|
"eval_steps_per_second": 3.931, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 15.114709851551957, |
|
"grad_norm": 3.394012212753296, |
|
"learning_rate": 1.1662185990655285e-05, |
|
"loss": 0.9611, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 15.54655870445344, |
|
"grad_norm": 3.343366861343384, |
|
"learning_rate": 1.079358916040996e-05, |
|
"loss": 0.9022, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 15.978407557354926, |
|
"grad_norm": 3.1342532634735107, |
|
"learning_rate": 9.940050189257552e-06, |
|
"loss": 0.9787, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 1.0815356969833374, |
|
"eval_runtime": 11.9572, |
|
"eval_samples_per_second": 31.111, |
|
"eval_steps_per_second": 3.931, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 16.41025641025641, |
|
"grad_norm": 2.9826908111572266, |
|
"learning_rate": 9.104624525191147e-06, |
|
"loss": 0.8726, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 16.842105263157894, |
|
"grad_norm": 3.0531911849975586, |
|
"learning_rate": 8.290302775265509e-06, |
|
"loss": 0.9296, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 16.993252361673413, |
|
"eval_loss": 1.0938704013824463, |
|
"eval_runtime": 12.0533, |
|
"eval_samples_per_second": 30.863, |
|
"eval_steps_per_second": 3.899, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 17.27395411605938, |
|
"grad_norm": 3.4158248901367188, |
|
"learning_rate": 7.500000000000004e-06, |
|
"loss": 0.9234, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 17.705802968960864, |
|
"grad_norm": 3.574179172515869, |
|
"learning_rate": 6.736545278218464e-06, |
|
"loss": 0.8333, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 17.98650472334683, |
|
"eval_loss": 1.0902316570281982, |
|
"eval_runtime": 12.0644, |
|
"eval_samples_per_second": 30.835, |
|
"eval_steps_per_second": 3.896, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 18.13765182186235, |
|
"grad_norm": 3.0140535831451416, |
|
"learning_rate": 6.0026715796812945e-06, |
|
"loss": 0.9186, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 18.569500674763834, |
|
"grad_norm": 3.737762928009033, |
|
"learning_rate": 5.301005981763007e-06, |
|
"loss": 0.8445, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 18.979757085020243, |
|
"eval_loss": 1.1101101636886597, |
|
"eval_runtime": 12.0551, |
|
"eval_samples_per_second": 30.858, |
|
"eval_steps_per_second": 3.899, |
|
"step": 879 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 500, |
|
"total_flos": 7.822159756807373e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|