|
{ |
|
"best_metric": 1.0839496850967407, |
|
"best_model_checkpoint": "data/Llama-31-8B_task-2_60-samples_config-4_full/checkpoint-267", |
|
"epoch": 99.82608695652173, |
|
"eval_steps": 500, |
|
"global_step": 287, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.5254858136177063, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 1.6001, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.48315590620040894, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.5658, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"eval_loss": 1.5853713750839233, |
|
"eval_runtime": 12.9778, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 0.4942804276943207, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.5728, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"eval_loss": 1.583631992340088, |
|
"eval_runtime": 12.9766, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 0.5288798809051514, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.5934, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"grad_norm": 0.5049395561218262, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.583, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"eval_loss": 1.580271601676941, |
|
"eval_runtime": 12.9795, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 0.468879371881485, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.562, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 3.8260869565217392, |
|
"eval_loss": 1.5752835273742676, |
|
"eval_runtime": 12.9747, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 4.173913043478261, |
|
"grad_norm": 0.45035144686698914, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.5779, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 4.869565217391305, |
|
"grad_norm": 0.44937947392463684, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 1.5687, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 4.869565217391305, |
|
"eval_loss": 1.5688363313674927, |
|
"eval_runtime": 12.9875, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 5.565217391304348, |
|
"grad_norm": 0.439909964799881, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.5495, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 5.913043478260869, |
|
"eval_loss": 1.5600029230117798, |
|
"eval_runtime": 12.9772, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 6.260869565217392, |
|
"grad_norm": 0.4658964276313782, |
|
"learning_rate": 6e-06, |
|
"loss": 1.5541, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 0.5037180185317993, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.5493, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"eval_loss": 1.548214316368103, |
|
"eval_runtime": 12.9784, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 7.6521739130434785, |
|
"grad_norm": 0.47590604424476624, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 1.5379, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.5340198278427124, |
|
"eval_runtime": 12.9772, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 8.347826086956522, |
|
"grad_norm": 0.4758809506893158, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.5155, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"eval_loss": 1.5222147703170776, |
|
"eval_runtime": 12.9749, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 9.043478260869565, |
|
"grad_norm": 0.44333091378211975, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 1.5287, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 9.73913043478261, |
|
"grad_norm": 0.4040922224521637, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.5131, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 9.73913043478261, |
|
"eval_loss": 1.5057040452957153, |
|
"eval_runtime": 12.977, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 10.434782608695652, |
|
"grad_norm": 0.39793452620506287, |
|
"learning_rate": 1e-05, |
|
"loss": 1.4971, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 10.782608695652174, |
|
"eval_loss": 1.4859318733215332, |
|
"eval_runtime": 12.9807, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 11.130434782608695, |
|
"grad_norm": 0.4213944971561432, |
|
"learning_rate": 9.99864620589731e-06, |
|
"loss": 1.4727, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 11.826086956521738, |
|
"grad_norm": 0.4004620909690857, |
|
"learning_rate": 9.994585556692624e-06, |
|
"loss": 1.4675, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 11.826086956521738, |
|
"eval_loss": 1.4652339220046997, |
|
"eval_runtime": 12.9758, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 12.521739130434783, |
|
"grad_norm": 0.35341501235961914, |
|
"learning_rate": 9.987820251299121e-06, |
|
"loss": 1.4518, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 12.869565217391305, |
|
"eval_loss": 1.4474080801010132, |
|
"eval_runtime": 12.9771, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 13.217391304347826, |
|
"grad_norm": 0.31634095311164856, |
|
"learning_rate": 9.978353953249023e-06, |
|
"loss": 1.4225, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 13.91304347826087, |
|
"grad_norm": 0.32467326521873474, |
|
"learning_rate": 9.966191788709716e-06, |
|
"loss": 1.4267, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 13.91304347826087, |
|
"eval_loss": 1.4301291704177856, |
|
"eval_runtime": 12.9757, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 14.608695652173914, |
|
"grad_norm": 0.3214071989059448, |
|
"learning_rate": 9.951340343707852e-06, |
|
"loss": 1.4004, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 14.956521739130435, |
|
"eval_loss": 1.413205623626709, |
|
"eval_runtime": 12.9784, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 15.304347826086957, |
|
"grad_norm": 0.26766690611839294, |
|
"learning_rate": 9.933807660562898e-06, |
|
"loss": 1.3894, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.24698252975940704, |
|
"learning_rate": 9.913603233532067e-06, |
|
"loss": 1.3993, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 1.3975735902786255, |
|
"eval_runtime": 12.9795, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 16.695652173913043, |
|
"grad_norm": 0.2775205671787262, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 1.3748, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 16.695652173913043, |
|
"eval_loss": 1.3880764245986938, |
|
"eval_runtime": 12.9755, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 0.2671767473220825, |
|
"learning_rate": 9.86522435289912e-06, |
|
"loss": 1.3664, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 17.73913043478261, |
|
"eval_loss": 1.3743128776550293, |
|
"eval_runtime": 12.98, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 18.08695652173913, |
|
"grad_norm": 0.23573799431324005, |
|
"learning_rate": 9.83707609731432e-06, |
|
"loss": 1.3404, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 18.782608695652176, |
|
"grad_norm": 0.23537954688072205, |
|
"learning_rate": 9.806308479691595e-06, |
|
"loss": 1.3465, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 18.782608695652176, |
|
"eval_loss": 1.361410140991211, |
|
"eval_runtime": 12.9811, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 19.47826086956522, |
|
"grad_norm": 0.24991613626480103, |
|
"learning_rate": 9.77293816123866e-06, |
|
"loss": 1.3407, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 19.82608695652174, |
|
"eval_loss": 1.3488320112228394, |
|
"eval_runtime": 12.9784, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 20.17391304347826, |
|
"grad_norm": 0.22332234680652618, |
|
"learning_rate": 9.736983212571646e-06, |
|
"loss": 1.3237, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 20.869565217391305, |
|
"grad_norm": 0.225299671292305, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 1.32, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 20.869565217391305, |
|
"eval_loss": 1.3369255065917969, |
|
"eval_runtime": 12.9791, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 21.565217391304348, |
|
"grad_norm": 0.24724063277244568, |
|
"learning_rate": 9.657398694630713e-06, |
|
"loss": 1.305, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 21.91304347826087, |
|
"eval_loss": 1.3247287273406982, |
|
"eval_runtime": 12.9775, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 22.26086956521739, |
|
"grad_norm": 0.24492307007312775, |
|
"learning_rate": 9.613812221777212e-06, |
|
"loss": 1.2923, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 22.956521739130434, |
|
"grad_norm": 0.2448989301919937, |
|
"learning_rate": 9.567727288213005e-06, |
|
"loss": 1.281, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 22.956521739130434, |
|
"eval_loss": 1.3119220733642578, |
|
"eval_runtime": 12.9822, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 23.652173913043477, |
|
"grad_norm": 0.22922120988368988, |
|
"learning_rate": 9.519168849742603e-06, |
|
"loss": 1.2869, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 1.298582911491394, |
|
"eval_runtime": 12.9756, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 24.347826086956523, |
|
"grad_norm": 0.20991288125514984, |
|
"learning_rate": 9.468163201617063e-06, |
|
"loss": 1.2523, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 24.695652173913043, |
|
"eval_loss": 1.2903255224227905, |
|
"eval_runtime": 12.978, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 25.043478260869566, |
|
"grad_norm": 0.22222064435482025, |
|
"learning_rate": 9.414737964294636e-06, |
|
"loss": 1.2628, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 25.73913043478261, |
|
"grad_norm": 0.21490232646465302, |
|
"learning_rate": 9.358922068483813e-06, |
|
"loss": 1.2642, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 25.73913043478261, |
|
"eval_loss": 1.278277039527893, |
|
"eval_runtime": 12.9804, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 26.434782608695652, |
|
"grad_norm": 0.22001975774765015, |
|
"learning_rate": 9.30074573947683e-06, |
|
"loss": 1.2323, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 26.782608695652176, |
|
"eval_loss": 1.265702247619629, |
|
"eval_runtime": 12.9772, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 27.130434782608695, |
|
"grad_norm": 0.20034939050674438, |
|
"learning_rate": 9.24024048078213e-06, |
|
"loss": 1.2395, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 27.82608695652174, |
|
"grad_norm": 0.19895507395267487, |
|
"learning_rate": 9.177439057064684e-06, |
|
"loss": 1.2121, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 27.82608695652174, |
|
"eval_loss": 1.253515362739563, |
|
"eval_runtime": 12.9793, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 28.52173913043478, |
|
"grad_norm": 0.19117023050785065, |
|
"learning_rate": 9.112375476403313e-06, |
|
"loss": 1.1896, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 28.869565217391305, |
|
"eval_loss": 1.2409998178482056, |
|
"eval_runtime": 12.9782, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 29.217391304347824, |
|
"grad_norm": 0.2141137570142746, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 1.2215, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 29.91304347826087, |
|
"grad_norm": 0.23155109584331512, |
|
"learning_rate": 8.97560398247424e-06, |
|
"loss": 1.1678, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 29.91304347826087, |
|
"eval_loss": 1.2283284664154053, |
|
"eval_runtime": 12.9808, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 30.608695652173914, |
|
"grad_norm": 0.2200462818145752, |
|
"learning_rate": 8.903970133383297e-06, |
|
"loss": 1.1768, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 30.956521739130434, |
|
"eval_loss": 1.2153583765029907, |
|
"eval_runtime": 12.9811, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 31.304347826086957, |
|
"grad_norm": 0.29315873980522156, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 1.1623, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.22663454711437225, |
|
"learning_rate": 8.754400164907496e-06, |
|
"loss": 1.1824, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 1.20297372341156, |
|
"eval_runtime": 12.9813, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 32.69565217391305, |
|
"grad_norm": 0.2229231745004654, |
|
"learning_rate": 8.676545040299145e-06, |
|
"loss": 1.1589, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 32.69565217391305, |
|
"eval_loss": 1.1947580575942993, |
|
"eval_runtime": 12.9744, |
|
"eval_samples_per_second": 0.925, |
|
"eval_steps_per_second": 0.925, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 33.391304347826086, |
|
"grad_norm": 0.2454340010881424, |
|
"learning_rate": 8.596699001693257e-06, |
|
"loss": 1.126, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 33.73913043478261, |
|
"eval_loss": 1.1819590330123901, |
|
"eval_runtime": 12.9877, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 34.08695652173913, |
|
"grad_norm": 0.23333889245986938, |
|
"learning_rate": 8.51490528712831e-06, |
|
"loss": 1.1243, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 34.78260869565217, |
|
"grad_norm": 0.23953379690647125, |
|
"learning_rate": 8.43120818934367e-06, |
|
"loss": 1.1059, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 34.78260869565217, |
|
"eval_loss": 1.169416069984436, |
|
"eval_runtime": 12.9903, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 35.47826086956522, |
|
"grad_norm": 0.23915378749370575, |
|
"learning_rate": 8.345653031794292e-06, |
|
"loss": 1.1334, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 35.82608695652174, |
|
"eval_loss": 1.1582151651382446, |
|
"eval_runtime": 12.9858, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 36.17391304347826, |
|
"grad_norm": 0.23581968247890472, |
|
"learning_rate": 8.258286144107277e-06, |
|
"loss": 1.0911, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 36.869565217391305, |
|
"grad_norm": 0.207149475812912, |
|
"learning_rate": 8.16915483699355e-06, |
|
"loss": 1.1081, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 36.869565217391305, |
|
"eval_loss": 1.1483094692230225, |
|
"eval_runtime": 12.9841, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 37.56521739130435, |
|
"grad_norm": 0.20900563895702362, |
|
"learning_rate": 8.078307376628292e-06, |
|
"loss": 1.0794, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 37.91304347826087, |
|
"eval_loss": 1.139157772064209, |
|
"eval_runtime": 12.9828, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 38.26086956521739, |
|
"grad_norm": 0.19496272504329681, |
|
"learning_rate": 7.985792958513932e-06, |
|
"loss": 1.0814, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 38.95652173913044, |
|
"grad_norm": 0.20283730328083038, |
|
"learning_rate": 7.891661680839932e-06, |
|
"loss": 1.0614, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 38.95652173913044, |
|
"eval_loss": 1.1314579248428345, |
|
"eval_runtime": 13.0034, |
|
"eval_samples_per_second": 0.923, |
|
"eval_steps_per_second": 0.923, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 39.65217391304348, |
|
"grad_norm": 0.18000271916389465, |
|
"learning_rate": 7.795964517353734e-06, |
|
"loss": 1.0877, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.1258946657180786, |
|
"eval_runtime": 12.9887, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 40.34782608695652, |
|
"grad_norm": 0.1749521791934967, |
|
"learning_rate": 7.698753289757565e-06, |
|
"loss": 1.0198, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 40.69565217391305, |
|
"eval_loss": 1.122938632965088, |
|
"eval_runtime": 12.988, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 41.04347826086956, |
|
"grad_norm": 0.16494785249233246, |
|
"learning_rate": 7.600080639646077e-06, |
|
"loss": 1.0862, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 41.73913043478261, |
|
"grad_norm": 0.1447688788175583, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.0538, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 41.73913043478261, |
|
"eval_loss": 1.1193265914916992, |
|
"eval_runtime": 12.9856, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 42.43478260869565, |
|
"grad_norm": 0.1363561749458313, |
|
"learning_rate": 7.398565566251232e-06, |
|
"loss": 1.0351, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 42.78260869565217, |
|
"eval_loss": 1.116489291191101, |
|
"eval_runtime": 12.9859, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 43.130434782608695, |
|
"grad_norm": 0.149055615067482, |
|
"learning_rate": 7.295832266935059e-06, |
|
"loss": 1.0684, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 43.82608695652174, |
|
"grad_norm": 0.12285248190164566, |
|
"learning_rate": 7.191855733945388e-06, |
|
"loss": 1.0121, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 43.82608695652174, |
|
"eval_loss": 1.1143923997879028, |
|
"eval_runtime": 12.9844, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 44.52173913043478, |
|
"grad_norm": 0.13907888531684875, |
|
"learning_rate": 7.08669227240909e-06, |
|
"loss": 1.0475, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 44.869565217391305, |
|
"eval_loss": 1.1124660968780518, |
|
"eval_runtime": 12.9826, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 45.21739130434783, |
|
"grad_norm": 0.12828341126441956, |
|
"learning_rate": 6.980398830195785e-06, |
|
"loss": 1.0647, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 45.91304347826087, |
|
"grad_norm": 0.11993666738271713, |
|
"learning_rate": 6.873032967079562e-06, |
|
"loss": 1.035, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 45.91304347826087, |
|
"eval_loss": 1.1105235815048218, |
|
"eval_runtime": 12.9835, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 46.608695652173914, |
|
"grad_norm": 0.11968977749347687, |
|
"learning_rate": 6.7646528235693445e-06, |
|
"loss": 1.0582, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 46.95652173913044, |
|
"eval_loss": 1.1090105772018433, |
|
"eval_runtime": 12.981, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 47.30434782608695, |
|
"grad_norm": 0.12057274580001831, |
|
"learning_rate": 6.655317089424791e-06, |
|
"loss": 1.0257, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.13167637586593628, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 1.029, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 1.10718834400177, |
|
"eval_runtime": 12.9849, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 48.69565217391305, |
|
"grad_norm": 0.11730153858661652, |
|
"learning_rate": 6.434016163555452e-06, |
|
"loss": 1.0353, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 48.69565217391305, |
|
"eval_loss": 1.1063708066940308, |
|
"eval_runtime": 12.9872, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 49.391304347826086, |
|
"grad_norm": 0.1166403666138649, |
|
"learning_rate": 6.322170810186013e-06, |
|
"loss": 1.0203, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 49.73913043478261, |
|
"eval_loss": 1.1048336029052734, |
|
"eval_runtime": 12.9919, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 50.08695652173913, |
|
"grad_norm": 0.13392803072929382, |
|
"learning_rate": 6.209609477998339e-06, |
|
"loss": 1.0411, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 50.78260869565217, |
|
"grad_norm": 0.11808019131422043, |
|
"learning_rate": 6.0963931209395165e-06, |
|
"loss": 1.0313, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 50.78260869565217, |
|
"eval_loss": 1.1034945249557495, |
|
"eval_runtime": 12.9911, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 51.47826086956522, |
|
"grad_norm": 0.12019839882850647, |
|
"learning_rate": 5.982583047664151e-06, |
|
"loss": 1.0473, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 51.82608695652174, |
|
"eval_loss": 1.1025954484939575, |
|
"eval_runtime": 12.987, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 52.17391304347826, |
|
"grad_norm": 0.13823947310447693, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 1.0099, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 52.869565217391305, |
|
"grad_norm": 0.13493500649929047, |
|
"learning_rate": 5.753428561247416e-06, |
|
"loss": 1.0189, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 52.869565217391305, |
|
"eval_loss": 1.1011220216751099, |
|
"eval_runtime": 12.9858, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 53.56521739130435, |
|
"grad_norm": 0.11930125206708908, |
|
"learning_rate": 5.638208239302975e-06, |
|
"loss": 1.0088, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 53.91304347826087, |
|
"eval_loss": 1.100091576576233, |
|
"eval_runtime": 12.9925, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 54.26086956521739, |
|
"grad_norm": 0.1283315271139145, |
|
"learning_rate": 5.522642316338268e-06, |
|
"loss": 1.0471, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 54.95652173913044, |
|
"grad_norm": 0.12327709794044495, |
|
"learning_rate": 5.406793373339292e-06, |
|
"loss": 1.0336, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 54.95652173913044, |
|
"eval_loss": 1.0989197492599487, |
|
"eval_runtime": 12.9836, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 55.65217391304348, |
|
"grad_norm": 0.1234331727027893, |
|
"learning_rate": 5.290724144552379e-06, |
|
"loss": 1.0014, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_loss": 1.098119854927063, |
|
"eval_runtime": 12.9883, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 56.34782608695652, |
|
"grad_norm": 0.1393013894557953, |
|
"learning_rate": 5.174497483512506e-06, |
|
"loss": 1.0036, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 56.69565217391305, |
|
"eval_loss": 1.0972312688827515, |
|
"eval_runtime": 12.9843, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 57.04347826086956, |
|
"grad_norm": 0.12967315316200256, |
|
"learning_rate": 5.0581763290069865e-06, |
|
"loss": 1.0466, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 57.73913043478261, |
|
"grad_norm": 0.1203417107462883, |
|
"learning_rate": 4.941823670993016e-06, |
|
"loss": 1.0266, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 57.73913043478261, |
|
"eval_loss": 1.0962202548980713, |
|
"eval_runtime": 12.984, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 58.43478260869565, |
|
"grad_norm": 0.12664881348609924, |
|
"learning_rate": 4.825502516487497e-06, |
|
"loss": 0.9893, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 58.78260869565217, |
|
"eval_loss": 1.0956058502197266, |
|
"eval_runtime": 12.983, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 59.130434782608695, |
|
"grad_norm": 0.1268896907567978, |
|
"learning_rate": 4.7092758554476215e-06, |
|
"loss": 1.0363, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 59.82608695652174, |
|
"grad_norm": 0.12260066717863083, |
|
"learning_rate": 4.59320662666071e-06, |
|
"loss": 1.0122, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 59.82608695652174, |
|
"eval_loss": 1.0947729349136353, |
|
"eval_runtime": 12.9887, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 60.52173913043478, |
|
"grad_norm": 0.12348895519971848, |
|
"learning_rate": 4.477357683661734e-06, |
|
"loss": 1.0456, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 60.869565217391305, |
|
"eval_loss": 1.0938516855239868, |
|
"eval_runtime": 12.9852, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 61.21739130434783, |
|
"grad_norm": 0.1181364431977272, |
|
"learning_rate": 4.361791760697027e-06, |
|
"loss": 1.0021, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 61.91304347826087, |
|
"grad_norm": 0.12426912784576416, |
|
"learning_rate": 4.246571438752585e-06, |
|
"loss": 0.9873, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 61.91304347826087, |
|
"eval_loss": 1.0932997465133667, |
|
"eval_runtime": 12.9851, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 62.608695652173914, |
|
"grad_norm": 0.12317227572202682, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 1.0189, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 62.95652173913044, |
|
"eval_loss": 1.0926170349121094, |
|
"eval_runtime": 12.9854, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 63.30434782608695, |
|
"grad_norm": 0.14774170517921448, |
|
"learning_rate": 4.017416952335849e-06, |
|
"loss": 0.9927, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"grad_norm": 0.12388727813959122, |
|
"learning_rate": 3.903606879060483e-06, |
|
"loss": 1.0325, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_loss": 1.0917848348617554, |
|
"eval_runtime": 12.981, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 64.69565217391305, |
|
"grad_norm": 0.12087590992450714, |
|
"learning_rate": 3.790390522001662e-06, |
|
"loss": 1.0081, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 64.69565217391305, |
|
"eval_loss": 1.0912106037139893, |
|
"eval_runtime": 12.9845, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 65.3913043478261, |
|
"grad_norm": 0.12642121315002441, |
|
"learning_rate": 3.6778291898139907e-06, |
|
"loss": 0.995, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 65.73913043478261, |
|
"eval_loss": 1.0907906293869019, |
|
"eval_runtime": 12.9837, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 66.08695652173913, |
|
"grad_norm": 0.13900116086006165, |
|
"learning_rate": 3.5659838364445505e-06, |
|
"loss": 1.0352, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 66.78260869565217, |
|
"grad_norm": 0.13428539037704468, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 1.0104, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 66.78260869565217, |
|
"eval_loss": 1.0903018712997437, |
|
"eval_runtime": 12.9828, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 67.47826086956522, |
|
"grad_norm": 0.12461841106414795, |
|
"learning_rate": 3.3446829105752103e-06, |
|
"loss": 0.9979, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 67.82608695652173, |
|
"eval_loss": 1.089614748954773, |
|
"eval_runtime": 12.9828, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 68.17391304347827, |
|
"grad_norm": 0.13673454523086548, |
|
"learning_rate": 3.2353471764306567e-06, |
|
"loss": 1.015, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 68.8695652173913, |
|
"grad_norm": 0.12500186264514923, |
|
"learning_rate": 3.12696703292044e-06, |
|
"loss": 0.9927, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 68.8695652173913, |
|
"eval_loss": 1.0892704725265503, |
|
"eval_runtime": 12.986, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 69.56521739130434, |
|
"grad_norm": 0.12714624404907227, |
|
"learning_rate": 3.019601169804216e-06, |
|
"loss": 0.9898, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 69.91304347826087, |
|
"eval_loss": 1.0887213945388794, |
|
"eval_runtime": 12.9821, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 70.26086956521739, |
|
"grad_norm": 0.1471695601940155, |
|
"learning_rate": 2.9133077275909112e-06, |
|
"loss": 1.0112, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 70.95652173913044, |
|
"grad_norm": 0.12886075675487518, |
|
"learning_rate": 2.8081442660546126e-06, |
|
"loss": 1.0087, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 70.95652173913044, |
|
"eval_loss": 1.0882389545440674, |
|
"eval_runtime": 12.9829, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 71.65217391304348, |
|
"grad_norm": 0.13256506621837616, |
|
"learning_rate": 2.7041677330649408e-06, |
|
"loss": 0.9903, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_loss": 1.0877516269683838, |
|
"eval_runtime": 12.9825, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 72.34782608695652, |
|
"grad_norm": 0.131199449300766, |
|
"learning_rate": 2.601434433748771e-06, |
|
"loss": 1.0198, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 72.69565217391305, |
|
"eval_loss": 1.0877233743667603, |
|
"eval_runtime": 12.9886, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 73.04347826086956, |
|
"grad_norm": 0.12681709229946136, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.9942, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 73.73913043478261, |
|
"grad_norm": 0.12825633585453033, |
|
"learning_rate": 2.3999193603539234e-06, |
|
"loss": 1.0078, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 73.73913043478261, |
|
"eval_loss": 1.0873947143554688, |
|
"eval_runtime": 12.985, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 74.43478260869566, |
|
"grad_norm": 0.12875820696353912, |
|
"learning_rate": 2.3012467102424373e-06, |
|
"loss": 1.0056, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 74.78260869565217, |
|
"eval_loss": 1.086978793144226, |
|
"eval_runtime": 12.9911, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 75.1304347826087, |
|
"grad_norm": 0.16595281660556793, |
|
"learning_rate": 2.204035482646267e-06, |
|
"loss": 0.9857, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 75.82608695652173, |
|
"grad_norm": 0.1281590312719345, |
|
"learning_rate": 2.1083383191600676e-06, |
|
"loss": 1.0114, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 75.82608695652173, |
|
"eval_loss": 1.0867377519607544, |
|
"eval_runtime": 12.9863, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 76.52173913043478, |
|
"grad_norm": 0.1323501169681549, |
|
"learning_rate": 2.0142070414860704e-06, |
|
"loss": 0.9982, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 76.8695652173913, |
|
"eval_loss": 1.0864088535308838, |
|
"eval_runtime": 12.9841, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 77.21739130434783, |
|
"grad_norm": 0.1450796127319336, |
|
"learning_rate": 1.9216926233717087e-06, |
|
"loss": 0.974, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 77.91304347826087, |
|
"grad_norm": 0.13413220643997192, |
|
"learning_rate": 1.8308451630064484e-06, |
|
"loss": 1.0105, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 77.91304347826087, |
|
"eval_loss": 1.086008906364441, |
|
"eval_runtime": 13.0104, |
|
"eval_samples_per_second": 0.922, |
|
"eval_steps_per_second": 0.922, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 78.6086956521739, |
|
"grad_norm": 0.1377415508031845, |
|
"learning_rate": 1.7417138558927244e-06, |
|
"loss": 1.0033, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 78.95652173913044, |
|
"eval_loss": 1.085876226425171, |
|
"eval_runtime": 12.9858, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 79.30434782608695, |
|
"grad_norm": 0.13374638557434082, |
|
"learning_rate": 1.6543469682057105e-06, |
|
"loss": 0.9876, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 0.12589867413043976, |
|
"learning_rate": 1.5687918106563326e-06, |
|
"loss": 1.0024, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 1.0857700109481812, |
|
"eval_runtime": 12.9853, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 80.69565217391305, |
|
"grad_norm": 0.1440684199333191, |
|
"learning_rate": 1.4850947128716914e-06, |
|
"loss": 1.0091, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 80.69565217391305, |
|
"eval_loss": 1.0854946374893188, |
|
"eval_runtime": 12.994, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 81.3913043478261, |
|
"grad_norm": 0.13353127241134644, |
|
"learning_rate": 1.4033009983067454e-06, |
|
"loss": 0.9971, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 81.73913043478261, |
|
"eval_loss": 1.0853348970413208, |
|
"eval_runtime": 12.9853, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 82.08695652173913, |
|
"grad_norm": 0.14419566094875336, |
|
"learning_rate": 1.3234549597008572e-06, |
|
"loss": 0.9801, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 82.78260869565217, |
|
"grad_norm": 0.13215622305870056, |
|
"learning_rate": 1.2455998350925042e-06, |
|
"loss": 0.969, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 82.78260869565217, |
|
"eval_loss": 1.0850697755813599, |
|
"eval_runtime": 12.9889, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 83.47826086956522, |
|
"grad_norm": 0.1365135759115219, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 1.0242, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 83.82608695652173, |
|
"eval_loss": 1.0847272872924805, |
|
"eval_runtime": 12.9926, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 84.17391304347827, |
|
"grad_norm": 0.14472560584545135, |
|
"learning_rate": 1.096029866616704e-06, |
|
"loss": 1.0026, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 84.8695652173913, |
|
"grad_norm": 0.13619466125965118, |
|
"learning_rate": 1.0243960175257605e-06, |
|
"loss": 0.9949, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 84.8695652173913, |
|
"eval_loss": 1.0849597454071045, |
|
"eval_runtime": 12.9855, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 85.56521739130434, |
|
"grad_norm": 0.13221730291843414, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.9715, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 85.91304347826087, |
|
"eval_loss": 1.0846881866455078, |
|
"eval_runtime": 12.9911, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 86.26086956521739, |
|
"grad_norm": 0.14602892100811005, |
|
"learning_rate": 8.876245235966884e-07, |
|
"loss": 0.9918, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 86.95652173913044, |
|
"grad_norm": 0.1370145082473755, |
|
"learning_rate": 8.225609429353187e-07, |
|
"loss": 1.0164, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 86.95652173913044, |
|
"eval_loss": 1.084583044052124, |
|
"eval_runtime": 12.9873, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 87.65217391304348, |
|
"grad_norm": 0.1295238435268402, |
|
"learning_rate": 7.597595192178702e-07, |
|
"loss": 0.9729, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_loss": 1.084546685218811, |
|
"eval_runtime": 12.9858, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 88.34782608695652, |
|
"grad_norm": 0.12774939835071564, |
|
"learning_rate": 6.992542605231739e-07, |
|
"loss": 1.0065, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 88.69565217391305, |
|
"eval_loss": 1.0845379829406738, |
|
"eval_runtime": 12.9859, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 89.04347826086956, |
|
"grad_norm": 0.12875907123088837, |
|
"learning_rate": 6.410779315161885e-07, |
|
"loss": 1.0004, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 89.73913043478261, |
|
"grad_norm": 0.13160941004753113, |
|
"learning_rate": 5.852620357053651e-07, |
|
"loss": 0.994, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 89.73913043478261, |
|
"eval_loss": 1.0845232009887695, |
|
"eval_runtime": 12.9865, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 90.43478260869566, |
|
"grad_norm": 0.1342068612575531, |
|
"learning_rate": 5.318367983829393e-07, |
|
"loss": 0.9852, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 90.78260869565217, |
|
"eval_loss": 1.084337830543518, |
|
"eval_runtime": 12.9887, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 91.1304347826087, |
|
"grad_norm": 0.14868062734603882, |
|
"learning_rate": 4.808311502573976e-07, |
|
"loss": 1.0154, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 91.82608695652173, |
|
"grad_norm": 0.14731329679489136, |
|
"learning_rate": 4.322727117869951e-07, |
|
"loss": 0.9755, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 91.82608695652173, |
|
"eval_loss": 1.0842256546020508, |
|
"eval_runtime": 12.9863, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 92.52173913043478, |
|
"grad_norm": 0.14365264773368835, |
|
"learning_rate": 3.8618777822278854e-07, |
|
"loss": 1.0191, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 92.8695652173913, |
|
"eval_loss": 1.0839496850967407, |
|
"eval_runtime": 12.9888, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 93.21739130434783, |
|
"grad_norm": 0.15139809250831604, |
|
"learning_rate": 3.426013053692878e-07, |
|
"loss": 0.9765, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 93.91304347826087, |
|
"grad_norm": 0.14520440995693207, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.9864, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 93.91304347826087, |
|
"eval_loss": 1.0840731859207153, |
|
"eval_runtime": 12.9887, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 94.6086956521739, |
|
"grad_norm": 0.13488347828388214, |
|
"learning_rate": 2.63016787428354e-07, |
|
"loss": 0.9773, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 94.95652173913044, |
|
"eval_loss": 1.084143042564392, |
|
"eval_runtime": 12.9839, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 95.30434782608695, |
|
"grad_norm": 0.14086610078811646, |
|
"learning_rate": 2.2706183876134047e-07, |
|
"loss": 1.0173, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 0.13791334629058838, |
|
"learning_rate": 1.9369152030840553e-07, |
|
"loss": 0.9869, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_loss": 1.0842411518096924, |
|
"eval_runtime": 12.9864, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 96.69565217391305, |
|
"grad_norm": 0.14270715415477753, |
|
"learning_rate": 1.6292390268568103e-07, |
|
"loss": 0.986, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 96.69565217391305, |
|
"eval_loss": 1.0841034650802612, |
|
"eval_runtime": 12.985, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 97.3913043478261, |
|
"grad_norm": 0.13356775045394897, |
|
"learning_rate": 1.3477564710088097e-07, |
|
"loss": 0.9925, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 97.73913043478261, |
|
"eval_loss": 1.084011197090149, |
|
"eval_runtime": 12.9842, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 98.08695652173913, |
|
"grad_norm": 0.15943579375743866, |
|
"learning_rate": 1.0926199633097156e-07, |
|
"loss": 0.9947, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 98.78260869565217, |
|
"grad_norm": 0.1478174775838852, |
|
"learning_rate": 8.639676646793382e-08, |
|
"loss": 0.9882, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 98.78260869565217, |
|
"eval_loss": 1.0840474367141724, |
|
"eval_runtime": 12.9842, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 99.47826086956522, |
|
"grad_norm": 0.14343644678592682, |
|
"learning_rate": 6.61923394371039e-08, |
|
"loss": 0.9917, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 99.82608695652173, |
|
"eval_loss": 1.0840203762054443, |
|
"eval_runtime": 12.9837, |
|
"eval_samples_per_second": 0.924, |
|
"eval_steps_per_second": 0.924, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 99.82608695652173, |
|
"step": 287, |
|
"total_flos": 4.445754122074849e+17, |
|
"train_loss": 1.1405799933011525, |
|
"train_runtime": 16000.7915, |
|
"train_samples_per_second": 0.431, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.445754122074849e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|