{
  "best_metric": 0.44082728028297424,
  "best_model_checkpoint": "data/Llama-31-8B_task-3_120-samples_config-3/checkpoint-264",
  "epoch": 31.0,
  "eval_steps": 500,
  "global_step": 341,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 3.931244373321533,
      "learning_rate": 6.060606060606061e-08,
      "loss": 3.095,
      "step": 1
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 2.530280828475952,
      "learning_rate": 1.2121212121212122e-07,
      "loss": 2.5399,
      "step": 2
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 2.5952248573303223,
      "learning_rate": 2.4242424242424244e-07,
      "loss": 2.5861,
      "step": 4
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 2.0955796241760254,
      "learning_rate": 3.6363636363636366e-07,
      "loss": 2.4442,
      "step": 6
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 3.4685494899749756,
      "learning_rate": 4.848484848484849e-07,
      "loss": 2.4938,
      "step": 8
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 3.055572509765625,
      "learning_rate": 6.060606060606061e-07,
      "loss": 2.3729,
      "step": 10
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.4972100257873535,
      "eval_runtime": 21.0088,
      "eval_samples_per_second": 1.142,
      "eval_steps_per_second": 1.142,
      "step": 11
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 2.5200321674346924,
      "learning_rate": 7.272727272727273e-07,
      "loss": 2.4004,
      "step": 12
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 2.5053160190582275,
      "learning_rate": 8.484848484848486e-07,
      "loss": 2.3639,
      "step": 14
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 2.5543558597564697,
      "learning_rate": 9.696969696969698e-07,
      "loss": 2.7736,
      "step": 16
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 2.6235134601593018,
      "learning_rate": 1.090909090909091e-06,
      "loss": 2.3351,
      "step": 18
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 2.2782557010650635,
      "learning_rate": 1.2121212121212122e-06,
      "loss": 2.4073,
      "step": 20
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.364241123199463,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 2.6938,
      "step": 22
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.457120180130005,
      "eval_runtime": 20.9973,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 22
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 2.8939199447631836,
      "learning_rate": 1.4545454545454546e-06,
      "loss": 2.6378,
      "step": 24
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 1.724478006362915,
      "learning_rate": 1.5757575757575759e-06,
      "loss": 2.3463,
      "step": 26
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 3.102917194366455,
      "learning_rate": 1.6969696969696973e-06,
      "loss": 2.3407,
      "step": 28
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 2.207613945007324,
      "learning_rate": 1.8181818181818183e-06,
      "loss": 2.4054,
      "step": 30
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 3.5173332691192627,
      "learning_rate": 1.9393939393939395e-06,
      "loss": 2.6474,
      "step": 32
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.3881404399871826,
      "eval_runtime": 20.9969,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 33
    },
    {
      "epoch": 3.090909090909091,
      "grad_norm": 2.588070869445801,
      "learning_rate": 2.0606060606060607e-06,
      "loss": 2.2888,
      "step": 34
    },
    {
      "epoch": 3.2727272727272725,
      "grad_norm": 3.921119451522827,
      "learning_rate": 2.181818181818182e-06,
      "loss": 2.858,
      "step": 36
    },
    {
      "epoch": 3.4545454545454546,
      "grad_norm": 3.1361024379730225,
      "learning_rate": 2.303030303030303e-06,
      "loss": 2.292,
      "step": 38
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 2.6496403217315674,
      "learning_rate": 2.4242424242424244e-06,
      "loss": 2.2874,
      "step": 40
    },
    {
      "epoch": 3.8181818181818183,
      "grad_norm": 1.7044334411621094,
      "learning_rate": 2.5454545454545456e-06,
      "loss": 2.0497,
      "step": 42
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.6162171363830566,
      "learning_rate": 2.666666666666667e-06,
      "loss": 2.2763,
      "step": 44
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.264249801635742,
      "eval_runtime": 21.0004,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 44
    },
    {
      "epoch": 4.181818181818182,
      "grad_norm": 3.5348732471466064,
      "learning_rate": 2.7878787878787885e-06,
      "loss": 2.486,
      "step": 46
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 1.8792097568511963,
      "learning_rate": 2.9090909090909093e-06,
      "loss": 2.0279,
      "step": 48
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 3.1981241703033447,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 2.279,
      "step": 50
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 2.5685250759124756,
      "learning_rate": 3.1515151515151517e-06,
      "loss": 2.1978,
      "step": 52
    },
    {
      "epoch": 4.909090909090909,
      "grad_norm": 2.161813497543335,
      "learning_rate": 3.272727272727273e-06,
      "loss": 2.0268,
      "step": 54
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.069439649581909,
      "eval_runtime": 20.9965,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 55
    },
    {
      "epoch": 5.090909090909091,
      "grad_norm": 2.084135055541992,
      "learning_rate": 3.3939393939393946e-06,
      "loss": 1.9478,
      "step": 56
    },
    {
      "epoch": 5.2727272727272725,
      "grad_norm": 2.176788806915283,
      "learning_rate": 3.5151515151515154e-06,
      "loss": 1.9753,
      "step": 58
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 4.001251220703125,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 2.1318,
      "step": 60
    },
    {
      "epoch": 5.636363636363637,
      "grad_norm": 3.4586737155914307,
      "learning_rate": 3.757575757575758e-06,
      "loss": 1.9426,
      "step": 62
    },
    {
      "epoch": 5.818181818181818,
      "grad_norm": 3.11710786819458,
      "learning_rate": 3.878787878787879e-06,
      "loss": 1.9089,
      "step": 64
    },
    {
      "epoch": 6.0,
      "grad_norm": 2.0227549076080322,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.7309,
      "step": 66
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.7871297597885132,
      "eval_runtime": 20.9996,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 66
    },
    {
      "epoch": 6.181818181818182,
      "grad_norm": 3.2912845611572266,
      "learning_rate": 4.1212121212121215e-06,
      "loss": 1.8532,
      "step": 68
    },
    {
      "epoch": 6.363636363636363,
      "grad_norm": 2.7779335975646973,
      "learning_rate": 4.242424242424243e-06,
      "loss": 1.7371,
      "step": 70
    },
    {
      "epoch": 6.545454545454545,
      "grad_norm": 1.3037850856781006,
      "learning_rate": 4.363636363636364e-06,
      "loss": 1.4541,
      "step": 72
    },
    {
      "epoch": 6.7272727272727275,
      "grad_norm": 2.257749557495117,
      "learning_rate": 4.4848484848484855e-06,
      "loss": 1.5641,
      "step": 74
    },
    {
      "epoch": 6.909090909090909,
      "grad_norm": 2.7067761421203613,
      "learning_rate": 4.606060606060606e-06,
      "loss": 1.4481,
      "step": 76
    },
    {
      "epoch": 7.0,
      "eval_loss": 1.4330077171325684,
      "eval_runtime": 21.0001,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 77
    },
    {
      "epoch": 7.090909090909091,
      "grad_norm": 1.7599331140518188,
      "learning_rate": 4.727272727272728e-06,
      "loss": 1.3604,
      "step": 78
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 1.4922908544540405,
      "learning_rate": 4.848484848484849e-06,
      "loss": 1.3767,
      "step": 80
    },
    {
      "epoch": 7.454545454545454,
      "grad_norm": 2.4537432193756104,
      "learning_rate": 4.9696969696969696e-06,
      "loss": 1.2013,
      "step": 82
    },
    {
      "epoch": 7.636363636363637,
      "grad_norm": 2.7686843872070312,
      "learning_rate": 5.090909090909091e-06,
      "loss": 1.1578,
      "step": 84
    },
    {
      "epoch": 7.818181818181818,
      "grad_norm": 2.3368959426879883,
      "learning_rate": 5.212121212121213e-06,
      "loss": 1.1423,
      "step": 86
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.5859788656234741,
      "learning_rate": 5.333333333333334e-06,
      "loss": 1.0554,
      "step": 88
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.0675058364868164,
      "eval_runtime": 21.0113,
      "eval_samples_per_second": 1.142,
      "eval_steps_per_second": 1.142,
      "step": 88
    },
    {
      "epoch": 8.181818181818182,
      "grad_norm": 1.8273411989212036,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 0.9927,
      "step": 90
    },
    {
      "epoch": 8.363636363636363,
      "grad_norm": 1.7687430381774902,
      "learning_rate": 5.575757575757577e-06,
      "loss": 0.8886,
      "step": 92
    },
    {
      "epoch": 8.545454545454545,
      "grad_norm": 1.9781934022903442,
      "learning_rate": 5.696969696969698e-06,
      "loss": 0.7076,
      "step": 94
    },
    {
      "epoch": 8.727272727272727,
      "grad_norm": 1.8124470710754395,
      "learning_rate": 5.8181818181818185e-06,
      "loss": 0.8414,
      "step": 96
    },
    {
      "epoch": 8.909090909090908,
      "grad_norm": 1.2052216529846191,
      "learning_rate": 5.93939393939394e-06,
      "loss": 0.8392,
      "step": 98
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.756293773651123,
      "eval_runtime": 21.002,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 99
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": 1.1055922508239746,
      "learning_rate": 6.060606060606061e-06,
      "loss": 0.7587,
      "step": 100
    },
    {
      "epoch": 9.272727272727273,
      "grad_norm": 1.4254761934280396,
      "learning_rate": 6.181818181818182e-06,
      "loss": 0.333,
      "step": 102
    },
    {
      "epoch": 9.454545454545455,
      "grad_norm": 1.636863350868225,
      "learning_rate": 6.303030303030303e-06,
      "loss": 1.0304,
      "step": 104
    },
    {
      "epoch": 9.636363636363637,
      "grad_norm": 1.3520318269729614,
      "learning_rate": 6.424242424242425e-06,
      "loss": 0.5626,
      "step": 106
    },
    {
      "epoch": 9.818181818181818,
      "grad_norm": 0.739101231098175,
      "learning_rate": 6.545454545454546e-06,
      "loss": 0.4562,
      "step": 108
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.719274640083313,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.4685,
      "step": 110
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.6436794400215149,
      "eval_runtime": 20.9986,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 110
    },
    {
      "epoch": 10.181818181818182,
      "grad_norm": 0.5967914462089539,
      "learning_rate": 6.787878787878789e-06,
      "loss": 0.4207,
      "step": 112
    },
    {
      "epoch": 10.363636363636363,
      "grad_norm": 0.7981317639350891,
      "learning_rate": 6.90909090909091e-06,
      "loss": 0.7027,
      "step": 114
    },
    {
      "epoch": 10.545454545454545,
      "grad_norm": 0.648346483707428,
      "learning_rate": 7.030303030303031e-06,
      "loss": 0.48,
      "step": 116
    },
    {
      "epoch": 10.727272727272727,
      "grad_norm": 1.0265989303588867,
      "learning_rate": 7.151515151515152e-06,
      "loss": 0.6033,
      "step": 118
    },
    {
      "epoch": 10.909090909090908,
      "grad_norm": 0.6349014639854431,
      "learning_rate": 7.272727272727273e-06,
      "loss": 0.3588,
      "step": 120
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.5850974321365356,
      "eval_runtime": 21.004,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 121
    },
    {
      "epoch": 11.090909090909092,
      "grad_norm": 0.530517041683197,
      "learning_rate": 7.393939393939395e-06,
      "loss": 0.4937,
      "step": 122
    },
    {
      "epoch": 11.272727272727273,
      "grad_norm": 0.9633344411849976,
      "learning_rate": 7.515151515151516e-06,
      "loss": 0.542,
      "step": 124
    },
    {
      "epoch": 11.454545454545455,
      "grad_norm": 0.6681090593338013,
      "learning_rate": 7.636363636363638e-06,
      "loss": 0.4627,
      "step": 126
    },
    {
      "epoch": 11.636363636363637,
      "grad_norm": 0.4449699819087982,
      "learning_rate": 7.757575757575758e-06,
      "loss": 0.4319,
      "step": 128
    },
    {
      "epoch": 11.818181818181818,
      "grad_norm": 0.5469127297401428,
      "learning_rate": 7.87878787878788e-06,
      "loss": 0.3578,
      "step": 130
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.9779024720191956,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.6319,
      "step": 132
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.5406798720359802,
      "eval_runtime": 20.9992,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 132
    },
    {
      "epoch": 12.181818181818182,
      "grad_norm": 0.5859923958778381,
      "learning_rate": 8.121212121212121e-06,
      "loss": 0.3894,
      "step": 134
    },
    {
      "epoch": 12.363636363636363,
      "grad_norm": 0.1749761551618576,
      "learning_rate": 8.242424242424243e-06,
      "loss": 0.2537,
      "step": 136
    },
    {
      "epoch": 12.545454545454545,
      "grad_norm": 1.400254487991333,
      "learning_rate": 8.363636363636365e-06,
      "loss": 0.6108,
      "step": 138
    },
    {
      "epoch": 12.727272727272727,
      "grad_norm": 0.556860089302063,
      "learning_rate": 8.484848484848486e-06,
      "loss": 0.5365,
      "step": 140
    },
    {
      "epoch": 12.909090909090908,
      "grad_norm": 0.4626636207103729,
      "learning_rate": 8.606060606060606e-06,
      "loss": 0.4211,
      "step": 142
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.5247967839241028,
      "eval_runtime": 21.0033,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 143
    },
    {
      "epoch": 13.090909090909092,
      "grad_norm": 0.42664870619773865,
      "learning_rate": 8.727272727272728e-06,
      "loss": 0.4939,
      "step": 144
    },
    {
      "epoch": 13.272727272727273,
      "grad_norm": 0.38396334648132324,
      "learning_rate": 8.84848484848485e-06,
      "loss": 0.3484,
      "step": 146
    },
    {
      "epoch": 13.454545454545455,
      "grad_norm": 0.43991151452064514,
      "learning_rate": 8.969696969696971e-06,
      "loss": 0.4156,
      "step": 148
    },
    {
      "epoch": 13.636363636363637,
      "grad_norm": 0.4237063229084015,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.3955,
      "step": 150
    },
    {
      "epoch": 13.818181818181818,
      "grad_norm": 0.34925854206085205,
      "learning_rate": 9.212121212121213e-06,
      "loss": 0.446,
      "step": 152
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.8939254879951477,
      "learning_rate": 9.333333333333334e-06,
      "loss": 0.495,
      "step": 154
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.5127188563346863,
      "eval_runtime": 20.9989,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 154
    },
    {
      "epoch": 14.181818181818182,
      "grad_norm": 0.4808824062347412,
      "learning_rate": 9.454545454545456e-06,
      "loss": 0.4217,
      "step": 156
    },
    {
      "epoch": 14.363636363636363,
      "grad_norm": 0.3635479807853699,
      "learning_rate": 9.575757575757576e-06,
      "loss": 0.4738,
      "step": 158
    },
    {
      "epoch": 14.545454545454545,
      "grad_norm": 0.2080135941505432,
      "learning_rate": 9.696969696969698e-06,
      "loss": 0.3693,
      "step": 160
    },
    {
      "epoch": 14.727272727272727,
      "grad_norm": 2.0220720767974854,
      "learning_rate": 9.81818181818182e-06,
      "loss": 0.3737,
      "step": 162
    },
    {
      "epoch": 14.909090909090908,
      "grad_norm": 0.3454424738883972,
      "learning_rate": 9.939393939393939e-06,
      "loss": 0.4232,
      "step": 164
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.5019441246986389,
      "eval_runtime": 21.0036,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 165
    },
    {
      "epoch": 15.090909090909092,
      "grad_norm": 0.4212585389614105,
      "learning_rate": 9.999988811118232e-06,
      "loss": 0.4735,
      "step": 166
    },
    {
      "epoch": 15.272727272727273,
      "grad_norm": 0.22414664924144745,
      "learning_rate": 9.999899300364534e-06,
      "loss": 0.2503,
      "step": 168
    },
    {
      "epoch": 15.454545454545455,
      "grad_norm": 1.3229994773864746,
      "learning_rate": 9.999720280459576e-06,
      "loss": 0.4071,
      "step": 170
    },
    {
      "epoch": 15.636363636363637,
      "grad_norm": 0.29747679829597473,
      "learning_rate": 9.999451754608208e-06,
      "loss": 0.3427,
      "step": 172
    },
    {
      "epoch": 15.818181818181818,
      "grad_norm": 0.3459671437740326,
      "learning_rate": 9.99909372761763e-06,
      "loss": 0.4779,
      "step": 174
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.31325361132621765,
      "learning_rate": 9.99864620589731e-06,
      "loss": 0.496,
      "step": 176
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.5102820992469788,
      "eval_runtime": 21.0039,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 176
    },
    {
      "epoch": 16.181818181818183,
      "grad_norm": 0.35442739725112915,
      "learning_rate": 9.998109197458865e-06,
      "loss": 0.2946,
      "step": 178
    },
    {
      "epoch": 16.363636363636363,
      "grad_norm": 1.0050742626190186,
      "learning_rate": 9.997482711915926e-06,
      "loss": 0.5689,
      "step": 180
    },
    {
      "epoch": 16.545454545454547,
      "grad_norm": 0.3000654876232147,
      "learning_rate": 9.996766760483955e-06,
      "loss": 0.3062,
      "step": 182
    },
    {
      "epoch": 16.727272727272727,
      "grad_norm": 0.33660659193992615,
      "learning_rate": 9.995961355980052e-06,
      "loss": 0.3404,
      "step": 184
    },
    {
      "epoch": 16.90909090909091,
      "grad_norm": 0.32371142506599426,
      "learning_rate": 9.99506651282272e-06,
      "loss": 0.3903,
      "step": 186
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.4814320504665375,
      "eval_runtime": 21.0016,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 187
    },
    {
      "epoch": 17.09090909090909,
      "grad_norm": 0.7857639193534851,
      "learning_rate": 9.994082247031613e-06,
      "loss": 0.5145,
      "step": 188
    },
    {
      "epoch": 17.272727272727273,
      "grad_norm": 0.007223070599138737,
      "learning_rate": 9.993008576227248e-06,
      "loss": 0.2366,
      "step": 190
    },
    {
      "epoch": 17.454545454545453,
      "grad_norm": 0.5182167887687683,
      "learning_rate": 9.991845519630679e-06,
      "loss": 0.4549,
      "step": 192
    },
    {
      "epoch": 17.636363636363637,
      "grad_norm": 0.2884611487388611,
      "learning_rate": 9.99059309806317e-06,
      "loss": 0.3861,
      "step": 194
    },
    {
      "epoch": 17.818181818181817,
      "grad_norm": 0.49307528138160706,
      "learning_rate": 9.989251333945813e-06,
      "loss": 0.4248,
      "step": 196
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.48076504468917847,
      "learning_rate": 9.987820251299121e-06,
      "loss": 0.331,
      "step": 198
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.49132904410362244,
      "eval_runtime": 21.0024,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 198
    },
    {
      "epoch": 18.181818181818183,
      "grad_norm": 0.5222908854484558,
      "learning_rate": 9.986299875742612e-06,
      "loss": 0.4817,
      "step": 200
    },
    {
      "epoch": 18.363636363636363,
      "grad_norm": 0.3346542418003082,
      "learning_rate": 9.984690234494338e-06,
      "loss": 0.2685,
      "step": 202
    },
    {
      "epoch": 18.545454545454547,
      "grad_norm": 0.48757171630859375,
      "learning_rate": 9.982991356370404e-06,
      "loss": 0.4127,
      "step": 204
    },
    {
      "epoch": 18.727272727272727,
      "grad_norm": 0.5337949991226196,
      "learning_rate": 9.98120327178445e-06,
      "loss": 0.4215,
      "step": 206
    },
    {
      "epoch": 18.90909090909091,
      "grad_norm": 0.26037395000457764,
      "learning_rate": 9.979326012747106e-06,
      "loss": 0.2403,
      "step": 208
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.4869350492954254,
      "eval_runtime": 21.0006,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 209
    },
    {
      "epoch": 19.09090909090909,
      "grad_norm": 0.31072062253952026,
      "learning_rate": 9.977359612865424e-06,
      "loss": 0.2671,
      "step": 210
    },
    {
      "epoch": 19.272727272727273,
      "grad_norm": 0.5128066539764404,
      "learning_rate": 9.975304107342268e-06,
      "loss": 0.3695,
      "step": 212
    },
    {
      "epoch": 19.454545454545453,
      "grad_norm": 0.585717499256134,
      "learning_rate": 9.973159532975691e-06,
      "loss": 0.5722,
      "step": 214
    },
    {
      "epoch": 19.636363636363637,
      "grad_norm": 0.4827703833580017,
      "learning_rate": 9.970925928158275e-06,
      "loss": 0.3577,
      "step": 216
    },
    {
      "epoch": 19.818181818181817,
      "grad_norm": 0.3153725266456604,
      "learning_rate": 9.968603332876435e-06,
      "loss": 0.2262,
      "step": 218
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.4635719954967499,
      "learning_rate": 9.966191788709716e-06,
      "loss": 0.3563,
      "step": 220
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.471805602312088,
      "eval_runtime": 21.0011,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 220
    },
    {
      "epoch": 20.181818181818183,
      "grad_norm": 0.4292517304420471,
      "learning_rate": 9.963691338830045e-06,
      "loss": 0.2494,
      "step": 222
    },
    {
      "epoch": 20.363636363636363,
      "grad_norm": 0.2989480495452881,
      "learning_rate": 9.961102028000948e-06,
      "loss": 0.2761,
      "step": 224
    },
    {
      "epoch": 20.545454545454547,
      "grad_norm": 0.4395551383495331,
      "learning_rate": 9.958423902576764e-06,
      "loss": 0.5072,
      "step": 226
    },
    {
      "epoch": 20.727272727272727,
      "grad_norm": 0.47977691888809204,
      "learning_rate": 9.955657010501807e-06,
      "loss": 0.2494,
      "step": 228
    },
    {
      "epoch": 20.90909090909091,
      "grad_norm": 0.4911171793937683,
      "learning_rate": 9.952801401309504e-06,
      "loss": 0.4107,
      "step": 230
    },
    {
      "epoch": 21.0,
      "eval_loss": 0.4596048593521118,
      "eval_runtime": 20.9979,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 231
    },
    {
      "epoch": 21.09090909090909,
      "grad_norm": 0.5808133482933044,
      "learning_rate": 9.949857126121519e-06,
      "loss": 0.4717,
      "step": 232
    },
    {
      "epoch": 21.272727272727273,
      "grad_norm": 0.19762107729911804,
      "learning_rate": 9.946824237646823e-06,
      "loss": 0.165,
      "step": 234
    },
    {
      "epoch": 21.454545454545453,
      "grad_norm": 0.5478127598762512,
      "learning_rate": 9.94370279018077e-06,
      "loss": 0.383,
      "step": 236
    },
    {
      "epoch": 21.636363636363637,
      "grad_norm": 0.3956260085105896,
      "learning_rate": 9.940492839604103e-06,
      "loss": 0.3194,
      "step": 238
    },
    {
      "epoch": 21.818181818181817,
      "grad_norm": 0.608517050743103,
      "learning_rate": 9.937194443381972e-06,
      "loss": 0.3811,
      "step": 240
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.35069647431373596,
      "learning_rate": 9.933807660562898e-06,
      "loss": 0.2631,
      "step": 242
    },
    {
      "epoch": 22.0,
      "eval_loss": 0.4477730691432953,
      "eval_runtime": 20.9988,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 242
    },
    {
      "epoch": 22.181818181818183,
      "grad_norm": 0.5358024835586548,
      "learning_rate": 9.930332551777709e-06,
      "loss": 0.24,
      "step": 244
    },
    {
      "epoch": 22.363636363636363,
      "grad_norm": 0.5837539434432983,
      "learning_rate": 9.926769179238467e-06,
      "loss": 0.312,
      "step": 246
    },
    {
      "epoch": 22.545454545454547,
      "grad_norm": 0.4202631711959839,
      "learning_rate": 9.923117606737347e-06,
      "loss": 0.1839,
      "step": 248
    },
    {
      "epoch": 22.727272727272727,
      "grad_norm": 0.7312754988670349,
      "learning_rate": 9.919377899645497e-06,
      "loss": 0.3982,
      "step": 250
    },
    {
      "epoch": 22.90909090909091,
      "grad_norm": 0.6649301648139954,
      "learning_rate": 9.915550124911866e-06,
      "loss": 0.4212,
      "step": 252
    },
    {
      "epoch": 23.0,
      "eval_loss": 0.4496145248413086,
      "eval_runtime": 21.002,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 253
    },
    {
      "epoch": 23.09090909090909,
      "grad_norm": 0.4475117027759552,
      "learning_rate": 9.91163435106201e-06,
      "loss": 0.2837,
      "step": 254
    },
    {
      "epoch": 23.272727272727273,
      "grad_norm": 0.7331680059432983,
      "learning_rate": 9.907630648196857e-06,
      "loss": 0.4256,
      "step": 256
    },
    {
      "epoch": 23.454545454545453,
      "grad_norm": 0.5951094627380371,
      "learning_rate": 9.903539087991462e-06,
      "loss": 0.2876,
      "step": 258
    },
    {
      "epoch": 23.636363636363637,
      "grad_norm": 0.5116230845451355,
      "learning_rate": 9.899359743693715e-06,
      "loss": 0.2226,
      "step": 260
    },
    {
      "epoch": 23.818181818181817,
      "grad_norm": 0.4971112310886383,
      "learning_rate": 9.895092690123036e-06,
      "loss": 0.2511,
      "step": 262
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.0066973976790905,
      "learning_rate": 9.890738003669029e-06,
      "loss": 0.3304,
      "step": 264
    },
    {
      "epoch": 24.0,
      "eval_loss": 0.44082728028297424,
      "eval_runtime": 20.9935,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 264
    },
    {
      "epoch": 24.181818181818183,
      "grad_norm": 0.5074991583824158,
      "learning_rate": 9.886295762290125e-06,
      "loss": 0.1619,
      "step": 266
    },
    {
      "epoch": 24.363636363636363,
      "grad_norm": 0.803401529788971,
      "learning_rate": 9.881766045512176e-06,
      "loss": 0.3071,
      "step": 268
    },
    {
      "epoch": 24.545454545454547,
      "grad_norm": 0.7890399694442749,
      "learning_rate": 9.877148934427037e-06,
      "loss": 0.2939,
      "step": 270
    },
    {
      "epoch": 24.727272727272727,
      "grad_norm": 0.5034437775611877,
      "learning_rate": 9.872444511691108e-06,
      "loss": 0.2662,
      "step": 272
    },
    {
      "epoch": 24.90909090909091,
      "grad_norm": 0.7107832431793213,
      "learning_rate": 9.867652861523866e-06,
      "loss": 0.3296,
      "step": 274
    },
    {
      "epoch": 25.0,
      "eval_loss": 0.4437350928783417,
      "eval_runtime": 21.0123,
      "eval_samples_per_second": 1.142,
      "eval_steps_per_second": 1.142,
      "step": 275
    },
    {
      "epoch": 25.09090909090909,
      "grad_norm": 0.8815945386886597,
      "learning_rate": 9.862774069706346e-06,
      "loss": 0.3628,
      "step": 276
    },
    {
      "epoch": 25.272727272727273,
      "grad_norm": 0.6133561134338379,
      "learning_rate": 9.85780822357961e-06,
      "loss": 0.2528,
      "step": 278
    },
    {
      "epoch": 25.454545454545453,
      "grad_norm": 0.7392721772193909,
      "learning_rate": 9.85275541204318e-06,
      "loss": 0.166,
      "step": 280
    },
    {
      "epoch": 25.636363636363637,
      "grad_norm": 0.7326251268386841,
      "learning_rate": 9.847615725553457e-06,
      "loss": 0.1924,
      "step": 282
    },
    {
      "epoch": 25.818181818181817,
      "grad_norm": 0.9658777117729187,
      "learning_rate": 9.842389256122086e-06,
      "loss": 0.3393,
      "step": 284
    },
    {
      "epoch": 26.0,
      "grad_norm": 0.9744178056716919,
      "learning_rate": 9.83707609731432e-06,
      "loss": 0.3266,
      "step": 286
    },
    {
      "epoch": 26.0,
      "eval_loss": 0.44414618611335754,
      "eval_runtime": 20.9954,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 286
    },
    {
      "epoch": 26.181818181818183,
      "grad_norm": 1.009691834449768,
      "learning_rate": 9.831676344247343e-06,
      "loss": 0.2983,
      "step": 288
    },
    {
      "epoch": 26.363636363636363,
      "grad_norm": 0.9881241917610168,
      "learning_rate": 9.826190093588564e-06,
      "loss": 0.3114,
      "step": 290
    },
    {
      "epoch": 26.545454545454547,
      "grad_norm": 0.6508401036262512,
      "learning_rate": 9.820617443553889e-06,
      "loss": 0.1722,
      "step": 292
    },
    {
      "epoch": 26.727272727272727,
      "grad_norm": 1.0022698640823364,
      "learning_rate": 9.814958493905962e-06,
      "loss": 0.2775,
      "step": 294
    },
    {
      "epoch": 26.90909090909091,
      "grad_norm": 0.8220618963241577,
      "learning_rate": 9.80921334595238e-06,
      "loss": 0.1403,
      "step": 296
    },
    {
      "epoch": 27.0,
      "eval_loss": 0.4495992660522461,
      "eval_runtime": 21.0004,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 297
    },
    {
      "epoch": 27.09090909090909,
      "grad_norm": 0.6972556114196777,
      "learning_rate": 9.80338210254388e-06,
      "loss": 0.2361,
      "step": 298
    },
    {
      "epoch": 27.272727272727273,
      "grad_norm": 1.3660632371902466,
      "learning_rate": 9.797464868072489e-06,
      "loss": 0.1793,
      "step": 300
    },
    {
      "epoch": 27.454545454545453,
      "grad_norm": 0.9514533281326294,
      "learning_rate": 9.791461748469669e-06,
      "loss": 0.346,
      "step": 302
    },
    {
      "epoch": 27.636363636363637,
      "grad_norm": 1.0539543628692627,
      "learning_rate": 9.785372851204415e-06,
      "loss": 0.201,
      "step": 304
    },
    {
      "epoch": 27.818181818181817,
      "grad_norm": 1.111655592918396,
      "learning_rate": 9.779198285281326e-06,
      "loss": 0.2216,
      "step": 306
    },
    {
      "epoch": 28.0,
      "grad_norm": 0.7238084077835083,
      "learning_rate": 9.77293816123866e-06,
      "loss": 0.1732,
      "step": 308
    },
    {
      "epoch": 28.0,
      "eval_loss": 0.45740625262260437,
      "eval_runtime": 21.0057,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 308
    },
    {
      "epoch": 28.181818181818183,
      "grad_norm": 1.3565870523452759,
      "learning_rate": 9.766592591146353e-06,
      "loss": 0.3174,
      "step": 310
    },
    {
      "epoch": 28.363636363636363,
      "grad_norm": 1.3067699670791626,
      "learning_rate": 9.760161688604008e-06,
      "loss": 0.2076,
      "step": 312
    },
    {
      "epoch": 28.545454545454547,
      "grad_norm": 0.9948272705078125,
      "learning_rate": 9.753645568738872e-06,
      "loss": 0.1527,
      "step": 314
    },
    {
      "epoch": 28.727272727272727,
      "grad_norm": 1.3128433227539062,
      "learning_rate": 9.747044348203766e-06,
      "loss": 0.1789,
      "step": 316
    },
    {
      "epoch": 28.90909090909091,
      "grad_norm": 0.9066749215126038,
      "learning_rate": 9.740358145174999e-06,
      "loss": 0.1797,
      "step": 318
    },
    {
      "epoch": 29.0,
      "eval_loss": 0.4808818995952606,
      "eval_runtime": 20.9997,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 319
    },
    {
      "epoch": 29.09090909090909,
      "grad_norm": 1.0640830993652344,
      "learning_rate": 9.733587079350254e-06,
      "loss": 0.1481,
      "step": 320
    },
    {
      "epoch": 29.272727272727273,
      "grad_norm": 1.490294337272644,
      "learning_rate": 9.72673127194644e-06,
      "loss": 0.355,
      "step": 322
    },
    {
      "epoch": 29.454545454545453,
      "grad_norm": 0.6777256727218628,
      "learning_rate": 9.719790845697534e-06,
      "loss": 0.1048,
      "step": 324
    },
    {
      "epoch": 29.636363636363637,
      "grad_norm": 1.5173866748809814,
      "learning_rate": 9.71276592485237e-06,
      "loss": 0.1738,
      "step": 326
    },
    {
      "epoch": 29.818181818181817,
      "grad_norm": 1.211938500404358,
      "learning_rate": 9.705656635172418e-06,
      "loss": 0.0788,
      "step": 328
    },
    {
      "epoch": 30.0,
      "grad_norm": 1.4657930135726929,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.1355,
      "step": 330
    },
    {
      "epoch": 30.0,
      "eval_loss": 0.49904152750968933,
      "eval_runtime": 21.0013,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 330
    },
    {
      "epoch": 30.181818181818183,
      "grad_norm": 1.3539866209030151,
      "learning_rate": 9.69118545990371e-06,
      "loss": 0.1336,
      "step": 332
    },
    {
      "epoch": 30.363636363636363,
      "grad_norm": 1.1103911399841309,
      "learning_rate": 9.683823833380692e-06,
      "loss": 0.1738,
      "step": 334
    },
    {
      "epoch": 30.545454545454547,
      "grad_norm": 1.4360435009002686,
      "learning_rate": 9.676378356149733e-06,
      "loss": 0.0891,
      "step": 336
    },
    {
      "epoch": 30.727272727272727,
      "grad_norm": 1.7939261198043823,
      "learning_rate": 9.668849161501186e-06,
      "loss": 0.2107,
      "step": 338
    },
    {
      "epoch": 30.90909090909091,
      "grad_norm": 1.3665426969528198,
      "learning_rate": 9.66123638422413e-06,
      "loss": 0.1346,
      "step": 340
    },
    {
      "epoch": 31.0,
      "eval_loss": 0.5312966704368591,
      "eval_runtime": 21.0009,
      "eval_samples_per_second": 1.143,
      "eval_steps_per_second": 1.143,
      "step": 341
    },
    {
      "epoch": 31.0,
      "step": 341,
      "total_flos": 2.1292989192077312e+17,
      "train_loss": 0.8120344140050698,
      "train_runtime": 7851.2602,
      "train_samples_per_second": 1.681,
      "train_steps_per_second": 0.21
    }
  ],
  "logging_steps": 2,
  "max_steps": 1650,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 150,
  "save_steps": 25,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 7,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1292989192077312e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}