|
{ |
|
"best_metric": 1.019782543182373, |
|
"best_model_checkpoint": "/scratch/czm5kz/llama2-13b_32_1_0.0003_sequential_RANDOM_25pct/checkpoint-1220", |
|
"epoch": 0.9975062344139651, |
|
"eval_steps": 20, |
|
"global_step": 1400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.6291937232017517, |
|
"learning_rate": 0.0002989308624376336, |
|
"loss": 2.4791, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.013960838317871, |
|
"learning_rate": 0.00029786172487526725, |
|
"loss": 2.2353, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.220999240875244, |
|
"learning_rate": 0.0002967925873129009, |
|
"loss": 2.1904, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8855465650558472, |
|
"learning_rate": 0.00029572344975053457, |
|
"loss": 1.9672, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.7276546955108643, |
|
"eval_runtime": 428.596, |
|
"eval_samples_per_second": 26.199, |
|
"eval_steps_per_second": 3.276, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5181469917297363, |
|
"learning_rate": 0.00029465431218816815, |
|
"loss": 1.8267, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.45906826853752136, |
|
"learning_rate": 0.00029358517462580184, |
|
"loss": 1.7535, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.34283947944641113, |
|
"learning_rate": 0.0002925160370634355, |
|
"loss": 1.6963, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.4426736533641815, |
|
"learning_rate": 0.0002914468995010691, |
|
"loss": 1.6877, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 1.5335692167282104, |
|
"eval_runtime": 429.6602, |
|
"eval_samples_per_second": 26.135, |
|
"eval_steps_per_second": 3.268, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.35644304752349854, |
|
"learning_rate": 0.00029037776193870275, |
|
"loss": 1.6777, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.44168591499328613, |
|
"learning_rate": 0.0002893086243763364, |
|
"loss": 1.6301, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3644905388355255, |
|
"learning_rate": 0.00028823948681397, |
|
"loss": 1.7032, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.2797521650791168, |
|
"learning_rate": 0.0002871703492516037, |
|
"loss": 1.6513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.5024319887161255, |
|
"eval_runtime": 429.903, |
|
"eval_samples_per_second": 26.12, |
|
"eval_steps_per_second": 3.266, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4262318015098572, |
|
"learning_rate": 0.0002861012116892373, |
|
"loss": 1.6956, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4277687966823578, |
|
"learning_rate": 0.000285032074126871, |
|
"loss": 1.6304, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.34024539589881897, |
|
"learning_rate": 0.0002839629365645046, |
|
"loss": 1.6168, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3809501826763153, |
|
"learning_rate": 0.00028289379900213826, |
|
"loss": 1.6026, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.4783284664154053, |
|
"eval_runtime": 429.7998, |
|
"eval_samples_per_second": 26.126, |
|
"eval_steps_per_second": 3.267, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.4502013325691223, |
|
"learning_rate": 0.0002818246614397719, |
|
"loss": 1.6441, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5180115103721619, |
|
"learning_rate": 0.00028075552387740553, |
|
"loss": 1.6145, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2917430102825165, |
|
"learning_rate": 0.00027968638631503917, |
|
"loss": 1.6093, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3517865538597107, |
|
"learning_rate": 0.0002786172487526728, |
|
"loss": 1.623, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 1.477242112159729, |
|
"eval_runtime": 430.0134, |
|
"eval_samples_per_second": 26.113, |
|
"eval_steps_per_second": 3.265, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.4099072515964508, |
|
"learning_rate": 0.00027754811119030644, |
|
"loss": 1.5882, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.45024898648262024, |
|
"learning_rate": 0.0002764789736279401, |
|
"loss": 1.5881, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.31718018651008606, |
|
"learning_rate": 0.00027540983606557377, |
|
"loss": 1.6424, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.41389569640159607, |
|
"learning_rate": 0.0002743406985032074, |
|
"loss": 1.5927, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.4581289291381836, |
|
"eval_runtime": 428.9623, |
|
"eval_samples_per_second": 26.177, |
|
"eval_steps_per_second": 3.273, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.43200165033340454, |
|
"learning_rate": 0.00027327156094084104, |
|
"loss": 1.6016, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.4232298731803894, |
|
"learning_rate": 0.0002722024233784747, |
|
"loss": 1.5898, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.36732131242752075, |
|
"learning_rate": 0.0002711332858161083, |
|
"loss": 1.5221, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6660999059677124, |
|
"learning_rate": 0.00027006414825374195, |
|
"loss": 1.6149, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.4388762712478638, |
|
"eval_runtime": 430.0242, |
|
"eval_samples_per_second": 26.112, |
|
"eval_steps_per_second": 3.265, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7961187958717346, |
|
"learning_rate": 0.00026899501069137564, |
|
"loss": 1.576, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4951170086860657, |
|
"learning_rate": 0.0002679258731290092, |
|
"loss": 1.5753, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.46930018067359924, |
|
"learning_rate": 0.0002668567355666429, |
|
"loss": 1.5677, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.33915072679519653, |
|
"learning_rate": 0.00026578759800427654, |
|
"loss": 1.5954, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.426082968711853, |
|
"eval_runtime": 430.4131, |
|
"eval_samples_per_second": 26.089, |
|
"eval_steps_per_second": 3.262, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.37250855565071106, |
|
"learning_rate": 0.0002647184604419102, |
|
"loss": 1.5982, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.4476079046726227, |
|
"learning_rate": 0.0002636493228795438, |
|
"loss": 1.5739, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6137750744819641, |
|
"learning_rate": 0.00026258018531717745, |
|
"loss": 1.5652, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.49674519896507263, |
|
"learning_rate": 0.0002615110477548111, |
|
"loss": 1.5676, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 1.407203197479248, |
|
"eval_runtime": 429.3415, |
|
"eval_samples_per_second": 26.154, |
|
"eval_steps_per_second": 3.27, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.3989889621734619, |
|
"learning_rate": 0.0002604419101924447, |
|
"loss": 1.5738, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5406049489974976, |
|
"learning_rate": 0.00025937277263007836, |
|
"loss": 1.5633, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.605331301689148, |
|
"learning_rate": 0.000258303635067712, |
|
"loss": 1.6135, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4816053509712219, |
|
"learning_rate": 0.0002572344975053457, |
|
"loss": 1.557, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.38784921169281, |
|
"eval_runtime": 429.6602, |
|
"eval_samples_per_second": 26.135, |
|
"eval_steps_per_second": 3.268, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4018230140209198, |
|
"learning_rate": 0.0002561653599429793, |
|
"loss": 1.516, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5622389912605286, |
|
"learning_rate": 0.00025509622238061296, |
|
"loss": 1.543, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5343061685562134, |
|
"learning_rate": 0.0002540270848182466, |
|
"loss": 1.4964, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6404452919960022, |
|
"learning_rate": 0.00025295794725588023, |
|
"loss": 1.5498, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.357541799545288, |
|
"eval_runtime": 431.0081, |
|
"eval_samples_per_second": 26.053, |
|
"eval_steps_per_second": 3.257, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5417826771736145, |
|
"learning_rate": 0.00025188880969351387, |
|
"loss": 1.5356, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5132070183753967, |
|
"learning_rate": 0.00025081967213114756, |
|
"loss": 1.4853, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5137109160423279, |
|
"learning_rate": 0.00024975053456878114, |
|
"loss": 1.4966, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.538129985332489, |
|
"learning_rate": 0.00024868139700641483, |
|
"loss": 1.5015, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.30915105342865, |
|
"eval_runtime": 429.5601, |
|
"eval_samples_per_second": 26.141, |
|
"eval_steps_per_second": 3.268, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7500670552253723, |
|
"learning_rate": 0.00024761225944404847, |
|
"loss": 1.5566, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8524929285049438, |
|
"learning_rate": 0.0002465431218816821, |
|
"loss": 1.545, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7134194374084473, |
|
"learning_rate": 0.00024547398431931574, |
|
"loss": 1.4856, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6981797814369202, |
|
"learning_rate": 0.0002444048467569494, |
|
"loss": 1.5214, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.2537784576416016, |
|
"eval_runtime": 429.6683, |
|
"eval_samples_per_second": 26.134, |
|
"eval_steps_per_second": 3.268, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6537020802497864, |
|
"learning_rate": 0.000243335709194583, |
|
"loss": 1.4988, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7282711863517761, |
|
"learning_rate": 0.00024226657163221665, |
|
"loss": 1.4872, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.990155816078186, |
|
"learning_rate": 0.0002411974340698503, |
|
"loss": 1.4584, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.578199028968811, |
|
"learning_rate": 0.00024012829650748392, |
|
"loss": 1.4595, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.2257404327392578, |
|
"eval_runtime": 430.6646, |
|
"eval_samples_per_second": 26.074, |
|
"eval_steps_per_second": 3.26, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7419201731681824, |
|
"learning_rate": 0.00023905915894511758, |
|
"loss": 1.4881, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6976954936981201, |
|
"learning_rate": 0.00023799002138275122, |
|
"loss": 1.4743, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6367508769035339, |
|
"learning_rate": 0.00023692088382038488, |
|
"loss": 1.4695, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7029184699058533, |
|
"learning_rate": 0.0002358517462580185, |
|
"loss": 1.4245, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.1911545991897583, |
|
"eval_runtime": 430.7506, |
|
"eval_samples_per_second": 26.068, |
|
"eval_steps_per_second": 3.259, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5411133170127869, |
|
"learning_rate": 0.00023478260869565215, |
|
"loss": 1.4176, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8051818609237671, |
|
"learning_rate": 0.0002337134711332858, |
|
"loss": 1.4264, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6067179441452026, |
|
"learning_rate": 0.00023264433357091945, |
|
"loss": 1.4669, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7595986127853394, |
|
"learning_rate": 0.0002315751960085531, |
|
"loss": 1.4295, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 1.1661920547485352, |
|
"eval_runtime": 430.0879, |
|
"eval_samples_per_second": 26.109, |
|
"eval_steps_per_second": 3.264, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6554045081138611, |
|
"learning_rate": 0.00023050605844618672, |
|
"loss": 1.4938, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.3542299270629883, |
|
"learning_rate": 0.00022943692088382036, |
|
"loss": 1.3983, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7835495471954346, |
|
"learning_rate": 0.00022836778332145402, |
|
"loss": 1.3958, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.695136547088623, |
|
"learning_rate": 0.00022729864575908766, |
|
"loss": 1.45, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.1369729042053223, |
|
"eval_runtime": 429.726, |
|
"eval_samples_per_second": 26.131, |
|
"eval_steps_per_second": 3.267, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8016501069068909, |
|
"learning_rate": 0.00022622950819672127, |
|
"loss": 1.3158, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.617300271987915, |
|
"learning_rate": 0.00022516037063435493, |
|
"loss": 1.4326, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.0931488275527954, |
|
"learning_rate": 0.00022409123307198857, |
|
"loss": 1.4018, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6332559585571289, |
|
"learning_rate": 0.00022302209550962223, |
|
"loss": 1.3883, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 1.1649107933044434, |
|
"eval_runtime": 429.708, |
|
"eval_samples_per_second": 26.132, |
|
"eval_steps_per_second": 3.267, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4458891153335571, |
|
"learning_rate": 0.00022195295794725584, |
|
"loss": 1.4136, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.1108673810958862, |
|
"learning_rate": 0.0002208838203848895, |
|
"loss": 1.4037, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.50286465883255, |
|
"learning_rate": 0.00021981468282252314, |
|
"loss": 1.4646, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6288882493972778, |
|
"learning_rate": 0.0002187455452601568, |
|
"loss": 1.377, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 1.1202141046524048, |
|
"eval_runtime": 430.2691, |
|
"eval_samples_per_second": 26.098, |
|
"eval_steps_per_second": 3.263, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6638579368591309, |
|
"learning_rate": 0.0002176764076977904, |
|
"loss": 1.3928, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6448540091514587, |
|
"learning_rate": 0.00021660727013542407, |
|
"loss": 1.3941, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5767375826835632, |
|
"learning_rate": 0.0002155381325730577, |
|
"loss": 1.3517, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5582290887832642, |
|
"learning_rate": 0.00021446899501069137, |
|
"loss": 1.4041, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.0965207815170288, |
|
"eval_runtime": 429.9294, |
|
"eval_samples_per_second": 26.118, |
|
"eval_steps_per_second": 3.266, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8463812470436096, |
|
"learning_rate": 0.00021339985744832498, |
|
"loss": 1.3467, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5267343521118164, |
|
"learning_rate": 0.00021233071988595865, |
|
"loss": 1.3615, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7553706169128418, |
|
"learning_rate": 0.00021126158232359228, |
|
"loss": 1.3438, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7279307842254639, |
|
"learning_rate": 0.00021019244476122595, |
|
"loss": 1.4185, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.1018279790878296, |
|
"eval_runtime": 429.0802, |
|
"eval_samples_per_second": 26.17, |
|
"eval_steps_per_second": 3.272, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7685341835021973, |
|
"learning_rate": 0.00020912330719885958, |
|
"loss": 1.4186, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7275740504264832, |
|
"learning_rate": 0.0002080541696364932, |
|
"loss": 1.3576, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7372286319732666, |
|
"learning_rate": 0.00020698503207412685, |
|
"loss": 1.3902, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.8976317644119263, |
|
"learning_rate": 0.0002059158945117605, |
|
"loss": 1.404, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 1.09088933467865, |
|
"eval_runtime": 429.9108, |
|
"eval_samples_per_second": 26.119, |
|
"eval_steps_per_second": 3.266, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7758693695068359, |
|
"learning_rate": 0.00020484675694939415, |
|
"loss": 1.3686, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.8729623556137085, |
|
"learning_rate": 0.00020377761938702776, |
|
"loss": 1.3923, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7426387667655945, |
|
"learning_rate": 0.00020270848182466143, |
|
"loss": 1.3244, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5695792436599731, |
|
"learning_rate": 0.00020163934426229506, |
|
"loss": 1.3677, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.0908492803573608, |
|
"eval_runtime": 428.9402, |
|
"eval_samples_per_second": 26.178, |
|
"eval_steps_per_second": 3.273, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5147937536239624, |
|
"learning_rate": 0.00020057020669992872, |
|
"loss": 1.3643, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5052825212478638, |
|
"learning_rate": 0.00019950106913756233, |
|
"loss": 1.3665, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.4269680976867676, |
|
"learning_rate": 0.000198431931575196, |
|
"loss": 1.3191, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.49817460775375366, |
|
"learning_rate": 0.00019736279401282963, |
|
"loss": 1.3423, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 1.081033706665039, |
|
"eval_runtime": 430.1153, |
|
"eval_samples_per_second": 26.107, |
|
"eval_steps_per_second": 3.264, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5547080636024475, |
|
"learning_rate": 0.0001962936564504633, |
|
"loss": 1.3562, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.851958692073822, |
|
"learning_rate": 0.0001952245188880969, |
|
"loss": 1.4167, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8551843166351318, |
|
"learning_rate": 0.00019415538132573057, |
|
"loss": 1.4025, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6752102971076965, |
|
"learning_rate": 0.0001930862437633642, |
|
"loss": 1.3144, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.0697931051254272, |
|
"eval_runtime": 430.0744, |
|
"eval_samples_per_second": 26.109, |
|
"eval_steps_per_second": 3.265, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5209426283836365, |
|
"learning_rate": 0.00019201710620099787, |
|
"loss": 1.405, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5636367201805115, |
|
"learning_rate": 0.00019094796863863148, |
|
"loss": 1.3456, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5399631857872009, |
|
"learning_rate": 0.0001898788310762651, |
|
"loss": 1.3752, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6341372728347778, |
|
"learning_rate": 0.00018880969351389878, |
|
"loss": 1.3685, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 1.0655372142791748, |
|
"eval_runtime": 430.0079, |
|
"eval_samples_per_second": 26.113, |
|
"eval_steps_per_second": 3.265, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.018846869468689, |
|
"learning_rate": 0.0001877405559515324, |
|
"loss": 1.3351, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6910350322723389, |
|
"learning_rate": 0.00018667141838916605, |
|
"loss": 1.3409, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3985555171966553, |
|
"learning_rate": 0.00018560228082679968, |
|
"loss": 1.3792, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9939374327659607, |
|
"learning_rate": 0.00018453314326443335, |
|
"loss": 1.341, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.0706018209457397, |
|
"eval_runtime": 430.0786, |
|
"eval_samples_per_second": 26.109, |
|
"eval_steps_per_second": 3.265, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8195542693138123, |
|
"learning_rate": 0.00018346400570206698, |
|
"loss": 1.3279, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6725456118583679, |
|
"learning_rate": 0.00018239486813970065, |
|
"loss": 1.3687, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5679619908332825, |
|
"learning_rate": 0.00018132573057733425, |
|
"loss": 1.3672, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5750882029533386, |
|
"learning_rate": 0.00018025659301496792, |
|
"loss": 1.3077, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.0635442733764648, |
|
"eval_runtime": 430.3677, |
|
"eval_samples_per_second": 26.092, |
|
"eval_steps_per_second": 3.262, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6369686722755432, |
|
"learning_rate": 0.00017918745545260155, |
|
"loss": 1.3236, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5866047143936157, |
|
"learning_rate": 0.00017811831789023522, |
|
"loss": 1.3253, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5465147495269775, |
|
"learning_rate": 0.00017704918032786883, |
|
"loss": 1.3344, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6831965446472168, |
|
"learning_rate": 0.0001759800427655025, |
|
"loss": 1.3546, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 1.0727200508117676, |
|
"eval_runtime": 430.2107, |
|
"eval_samples_per_second": 26.101, |
|
"eval_steps_per_second": 3.264, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9631245136260986, |
|
"learning_rate": 0.00017491090520313613, |
|
"loss": 1.3735, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8638582825660706, |
|
"learning_rate": 0.0001738417676407698, |
|
"loss": 1.3783, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5413733720779419, |
|
"learning_rate": 0.0001727726300784034, |
|
"loss": 1.3831, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5948848128318787, |
|
"learning_rate": 0.00017170349251603703, |
|
"loss": 1.3589, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 1.0601730346679688, |
|
"eval_runtime": 429.9557, |
|
"eval_samples_per_second": 26.117, |
|
"eval_steps_per_second": 3.265, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7511246800422668, |
|
"learning_rate": 0.0001706343549536707, |
|
"loss": 1.3685, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6298426389694214, |
|
"learning_rate": 0.00016956521739130433, |
|
"loss": 1.309, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6661334037780762, |
|
"learning_rate": 0.00016849607982893797, |
|
"loss": 1.3337, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5155322551727295, |
|
"learning_rate": 0.0001674269422665716, |
|
"loss": 1.3756, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 1.0586997270584106, |
|
"eval_runtime": 430.2762, |
|
"eval_samples_per_second": 26.097, |
|
"eval_steps_per_second": 3.263, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.3205231428146362, |
|
"learning_rate": 0.00016635780470420527, |
|
"loss": 1.3285, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4283859133720398, |
|
"learning_rate": 0.0001652886671418389, |
|
"loss": 1.3028, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1601816415786743, |
|
"learning_rate": 0.00016421952957947254, |
|
"loss": 1.3416, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.44518712162971497, |
|
"learning_rate": 0.00016315039201710618, |
|
"loss": 1.3171, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.0572543144226074, |
|
"eval_runtime": 429.3198, |
|
"eval_samples_per_second": 26.155, |
|
"eval_steps_per_second": 3.27, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7820278406143188, |
|
"learning_rate": 0.00016208125445473984, |
|
"loss": 1.3304, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6720359325408936, |
|
"learning_rate": 0.00016101211689237348, |
|
"loss": 1.3263, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.0143332481384277, |
|
"learning_rate": 0.00015994297933000714, |
|
"loss": 1.332, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5054823160171509, |
|
"learning_rate": 0.00015887384176764075, |
|
"loss": 1.2838, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 1.064550518989563, |
|
"eval_runtime": 429.3314, |
|
"eval_samples_per_second": 26.155, |
|
"eval_steps_per_second": 3.27, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.031082272529602, |
|
"learning_rate": 0.0001578047042052744, |
|
"loss": 1.3094, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9540282487869263, |
|
"learning_rate": 0.00015673556664290805, |
|
"loss": 1.3874, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5223524570465088, |
|
"learning_rate": 0.0001556664290805417, |
|
"loss": 1.3256, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7279605269432068, |
|
"learning_rate": 0.00015459729151817532, |
|
"loss": 1.373, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.0548007488250732, |
|
"eval_runtime": 429.2395, |
|
"eval_samples_per_second": 26.16, |
|
"eval_steps_per_second": 3.271, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5409196019172668, |
|
"learning_rate": 0.00015352815395580896, |
|
"loss": 1.3431, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5752881765365601, |
|
"learning_rate": 0.00015245901639344262, |
|
"loss": 1.3486, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8059548139572144, |
|
"learning_rate": 0.00015138987883107623, |
|
"loss": 1.3471, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.509373664855957, |
|
"learning_rate": 0.0001503207412687099, |
|
"loss": 1.3501, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0632661581039429, |
|
"eval_runtime": 429.2245, |
|
"eval_samples_per_second": 26.161, |
|
"eval_steps_per_second": 3.271, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6135790944099426, |
|
"learning_rate": 0.00014925160370634355, |
|
"loss": 1.3213, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6826092600822449, |
|
"learning_rate": 0.0001481824661439772, |
|
"loss": 1.3623, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6348379850387573, |
|
"learning_rate": 0.00014711332858161083, |
|
"loss": 1.3135, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7005529999732971, |
|
"learning_rate": 0.00014604419101924446, |
|
"loss": 1.3419, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 1.0513877868652344, |
|
"eval_runtime": 430.4426, |
|
"eval_samples_per_second": 26.087, |
|
"eval_steps_per_second": 3.262, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5816593170166016, |
|
"learning_rate": 0.0001449750534568781, |
|
"loss": 1.3094, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6071094274520874, |
|
"learning_rate": 0.00014390591589451173, |
|
"loss": 1.2911, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.0864328145980835, |
|
"learning_rate": 0.0001428367783321454, |
|
"loss": 1.3191, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8440472483634949, |
|
"learning_rate": 0.00014176764076977903, |
|
"loss": 1.2931, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.0493059158325195, |
|
"eval_runtime": 430.1744, |
|
"eval_samples_per_second": 26.103, |
|
"eval_steps_per_second": 3.264, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.238255262374878, |
|
"learning_rate": 0.00014069850320741267, |
|
"loss": 1.2751, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5711309909820557, |
|
"learning_rate": 0.0001396293656450463, |
|
"loss": 1.3271, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7740243077278137, |
|
"learning_rate": 0.00013856022808267997, |
|
"loss": 1.3454, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.4949895441532135, |
|
"learning_rate": 0.0001374910905203136, |
|
"loss": 1.3521, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 1.0512796640396118, |
|
"eval_runtime": 429.3325, |
|
"eval_samples_per_second": 26.155, |
|
"eval_steps_per_second": 3.27, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8579160571098328, |
|
"learning_rate": 0.00013642195295794724, |
|
"loss": 1.2972, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8241709470748901, |
|
"learning_rate": 0.00013535281539558088, |
|
"loss": 1.3556, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7028602957725525, |
|
"learning_rate": 0.00013428367783321454, |
|
"loss": 1.3457, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5605300068855286, |
|
"learning_rate": 0.00013321454027084818, |
|
"loss": 1.3149, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.0340629816055298, |
|
"eval_runtime": 428.4333, |
|
"eval_samples_per_second": 26.209, |
|
"eval_steps_per_second": 3.277, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5241943001747131, |
|
"learning_rate": 0.0001321454027084818, |
|
"loss": 1.3316, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7008923888206482, |
|
"learning_rate": 0.00013107626514611545, |
|
"loss": 1.3343, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7116791009902954, |
|
"learning_rate": 0.0001300071275837491, |
|
"loss": 1.3599, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.47471854090690613, |
|
"learning_rate": 0.00012893799002138275, |
|
"loss": 1.3042, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 1.0408192873001099, |
|
"eval_runtime": 430.9133, |
|
"eval_samples_per_second": 26.059, |
|
"eval_steps_per_second": 3.258, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.4862081706523895, |
|
"learning_rate": 0.00012786885245901638, |
|
"loss": 1.3269, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6292262077331543, |
|
"learning_rate": 0.00012679971489665002, |
|
"loss": 1.3372, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5607986450195312, |
|
"learning_rate": 0.00012573057733428366, |
|
"loss": 1.3225, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.9970059394836426, |
|
"learning_rate": 0.0001246614397719173, |
|
"loss": 1.3126, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.0392816066741943, |
|
"eval_runtime": 429.9226, |
|
"eval_samples_per_second": 26.119, |
|
"eval_steps_per_second": 3.266, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5598148703575134, |
|
"learning_rate": 0.00012359230220955095, |
|
"loss": 1.3226, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7155619859695435, |
|
"learning_rate": 0.0001225231646471846, |
|
"loss": 1.3015, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5198763012886047, |
|
"learning_rate": 0.00012145402708481824, |
|
"loss": 1.3338, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6012284755706787, |
|
"learning_rate": 0.00012038488952245188, |
|
"loss": 1.3306, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.0436369180679321, |
|
"eval_runtime": 430.0367, |
|
"eval_samples_per_second": 26.112, |
|
"eval_steps_per_second": 3.265, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7888386249542236, |
|
"learning_rate": 0.00011931575196008553, |
|
"loss": 1.3209, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.598448634147644, |
|
"learning_rate": 0.00011824661439771916, |
|
"loss": 1.351, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6814375519752502, |
|
"learning_rate": 0.00011717747683535281, |
|
"loss": 1.3323, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5430750846862793, |
|
"learning_rate": 0.00011610833927298645, |
|
"loss": 1.299, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 1.0388227701187134, |
|
"eval_runtime": 429.5954, |
|
"eval_samples_per_second": 26.139, |
|
"eval_steps_per_second": 3.268, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0289543867111206, |
|
"learning_rate": 0.0001150392017106201, |
|
"loss": 1.3167, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.8182229399681091, |
|
"learning_rate": 0.00011397006414825373, |
|
"loss": 1.2939, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5602372288703918, |
|
"learning_rate": 0.00011290092658588738, |
|
"loss": 1.345, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8015730977058411, |
|
"learning_rate": 0.00011183178902352102, |
|
"loss": 1.3186, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 1.0438414812088013, |
|
"eval_runtime": 430.3302, |
|
"eval_samples_per_second": 26.094, |
|
"eval_steps_per_second": 3.263, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5967668890953064, |
|
"learning_rate": 0.00011076265146115467, |
|
"loss": 1.3232, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8282290697097778, |
|
"learning_rate": 0.00010969351389878829, |
|
"loss": 1.2881, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5095836520195007, |
|
"learning_rate": 0.00010862437633642194, |
|
"loss": 1.3026, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5845779776573181, |
|
"learning_rate": 0.00010755523877405558, |
|
"loss": 1.385, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.0384798049926758, |
|
"eval_runtime": 429.8904, |
|
"eval_samples_per_second": 26.121, |
|
"eval_steps_per_second": 3.266, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6281318664550781, |
|
"learning_rate": 0.00010648610121168923, |
|
"loss": 1.3167, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4388660192489624, |
|
"learning_rate": 0.00010541696364932286, |
|
"loss": 1.2848, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5878056883811951, |
|
"learning_rate": 0.00010434782608695651, |
|
"loss": 1.2798, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.0295441150665283, |
|
"learning_rate": 0.00010327868852459015, |
|
"loss": 1.3578, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.036583662033081, |
|
"eval_runtime": 429.2823, |
|
"eval_samples_per_second": 26.158, |
|
"eval_steps_per_second": 3.271, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.169296383857727, |
|
"learning_rate": 0.0001022095509622238, |
|
"loss": 1.3251, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.8990792632102966, |
|
"learning_rate": 0.00010114041339985743, |
|
"loss": 1.3402, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5439760088920593, |
|
"learning_rate": 0.00010007127583749108, |
|
"loss": 1.302, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5038203597068787, |
|
"learning_rate": 9.900213827512472e-05, |
|
"loss": 1.2863, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 1.0299803018569946, |
|
"eval_runtime": 430.5328, |
|
"eval_samples_per_second": 26.082, |
|
"eval_steps_per_second": 3.261, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5802446603775024, |
|
"learning_rate": 9.793300071275837e-05, |
|
"loss": 1.2493, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5156883001327515, |
|
"learning_rate": 9.686386315039202e-05, |
|
"loss": 1.2997, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7282747030258179, |
|
"learning_rate": 9.579472558802566e-05, |
|
"loss": 1.2893, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5055095553398132, |
|
"learning_rate": 9.47255880256593e-05, |
|
"loss": 1.2894, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.0320148468017578, |
|
"eval_runtime": 428.6868, |
|
"eval_samples_per_second": 26.194, |
|
"eval_steps_per_second": 3.275, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7030293345451355, |
|
"learning_rate": 9.365645046329294e-05, |
|
"loss": 1.3313, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4963017404079437, |
|
"learning_rate": 9.258731290092659e-05, |
|
"loss": 1.2798, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1872601509094238, |
|
"learning_rate": 9.151817533856021e-05, |
|
"loss": 1.3095, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.01945960521698, |
|
"learning_rate": 9.044903777619385e-05, |
|
"loss": 1.319, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.037099838256836, |
|
"eval_runtime": 428.5058, |
|
"eval_samples_per_second": 26.205, |
|
"eval_steps_per_second": 3.277, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.9102843999862671, |
|
"learning_rate": 8.93799002138275e-05, |
|
"loss": 1.2645, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.43511050939559937, |
|
"learning_rate": 8.831076265146115e-05, |
|
"loss": 1.3248, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.866431713104248, |
|
"learning_rate": 8.724162508909478e-05, |
|
"loss": 1.2844, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5549300909042358, |
|
"learning_rate": 8.617248752672843e-05, |
|
"loss": 1.2784, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.036270260810852, |
|
"eval_runtime": 430.0845, |
|
"eval_samples_per_second": 26.109, |
|
"eval_steps_per_second": 3.264, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.503010094165802, |
|
"learning_rate": 8.510334996436207e-05, |
|
"loss": 1.2949, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6416985988616943, |
|
"learning_rate": 8.403421240199572e-05, |
|
"loss": 1.3145, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5773264765739441, |
|
"learning_rate": 8.296507483962936e-05, |
|
"loss": 1.2916, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8444538116455078, |
|
"learning_rate": 8.1895937277263e-05, |
|
"loss": 1.2588, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 1.02756929397583, |
|
"eval_runtime": 429.6856, |
|
"eval_samples_per_second": 26.133, |
|
"eval_steps_per_second": 3.268, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6495450735092163, |
|
"learning_rate": 8.082679971489664e-05, |
|
"loss": 1.3093, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.633300244808197, |
|
"learning_rate": 7.975766215253029e-05, |
|
"loss": 1.292, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5709363222122192, |
|
"learning_rate": 7.868852459016393e-05, |
|
"loss": 1.2976, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6210110187530518, |
|
"learning_rate": 7.761938702779758e-05, |
|
"loss": 1.3409, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.0304205417633057, |
|
"eval_runtime": 430.053, |
|
"eval_samples_per_second": 26.111, |
|
"eval_steps_per_second": 3.265, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5457026958465576, |
|
"learning_rate": 7.655024946543121e-05, |
|
"loss": 1.2711, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5168452858924866, |
|
"learning_rate": 7.548111190306486e-05, |
|
"loss": 1.3493, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6571021676063538, |
|
"learning_rate": 7.44119743406985e-05, |
|
"loss": 1.3051, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6311302185058594, |
|
"learning_rate": 7.334283677833213e-05, |
|
"loss": 1.2985, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.0277342796325684, |
|
"eval_runtime": 429.4921, |
|
"eval_samples_per_second": 26.145, |
|
"eval_steps_per_second": 3.269, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5146097540855408, |
|
"learning_rate": 7.227369921596578e-05, |
|
"loss": 1.2888, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9732445478439331, |
|
"learning_rate": 7.120456165359942e-05, |
|
"loss": 1.3134, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.126225233078003, |
|
"learning_rate": 7.013542409123307e-05, |
|
"loss": 1.3117, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6155526041984558, |
|
"learning_rate": 6.90662865288667e-05, |
|
"loss": 1.2754, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 1.0243170261383057, |
|
"eval_runtime": 429.5325, |
|
"eval_samples_per_second": 26.142, |
|
"eval_steps_per_second": 3.269, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5756591558456421, |
|
"learning_rate": 6.799714896650034e-05, |
|
"loss": 1.2867, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.9171560406684875, |
|
"learning_rate": 6.692801140413399e-05, |
|
"loss": 1.3024, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0372343063354492, |
|
"learning_rate": 6.585887384176763e-05, |
|
"loss": 1.3004, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5745114684104919, |
|
"learning_rate": 6.478973627940128e-05, |
|
"loss": 1.3079, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.0274326801300049, |
|
"eval_runtime": 430.2338, |
|
"eval_samples_per_second": 26.1, |
|
"eval_steps_per_second": 3.263, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6692591309547424, |
|
"learning_rate": 6.372059871703493e-05, |
|
"loss": 1.3284, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5350373983383179, |
|
"learning_rate": 6.265146115466856e-05, |
|
"loss": 1.2706, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5629821419715881, |
|
"learning_rate": 6.158232359230221e-05, |
|
"loss": 1.2898, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6059845685958862, |
|
"learning_rate": 6.051318602993584e-05, |
|
"loss": 1.2614, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.0247998237609863, |
|
"eval_runtime": 430.8367, |
|
"eval_samples_per_second": 26.063, |
|
"eval_steps_per_second": 3.259, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9147688150405884, |
|
"learning_rate": 5.9444048467569485e-05, |
|
"loss": 1.3293, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.48272547125816345, |
|
"learning_rate": 5.837491090520313e-05, |
|
"loss": 1.3069, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5300667881965637, |
|
"learning_rate": 5.730577334283677e-05, |
|
"loss": 1.3209, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5542011857032776, |
|
"learning_rate": 5.6236635780470413e-05, |
|
"loss": 1.3116, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 1.027745246887207, |
|
"eval_runtime": 428.9108, |
|
"eval_samples_per_second": 26.18, |
|
"eval_steps_per_second": 3.273, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6744430661201477, |
|
"learning_rate": 5.5167498218104056e-05, |
|
"loss": 1.2887, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.660572350025177, |
|
"learning_rate": 5.40983606557377e-05, |
|
"loss": 1.2296, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6836673021316528, |
|
"learning_rate": 5.302922309337134e-05, |
|
"loss": 1.2773, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5106320381164551, |
|
"learning_rate": 5.196008553100499e-05, |
|
"loss": 1.3101, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.0228785276412964, |
|
"eval_runtime": 430.0103, |
|
"eval_samples_per_second": 26.113, |
|
"eval_steps_per_second": 3.265, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5948910117149353, |
|
"learning_rate": 5.089094796863862e-05, |
|
"loss": 1.3074, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6630853414535522, |
|
"learning_rate": 4.9821810406272264e-05, |
|
"loss": 1.3249, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5019823312759399, |
|
"learning_rate": 4.875267284390591e-05, |
|
"loss": 1.3239, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.524602472782135, |
|
"learning_rate": 4.7683535281539556e-05, |
|
"loss": 1.3213, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 1.028998613357544, |
|
"eval_runtime": 430.7493, |
|
"eval_samples_per_second": 26.069, |
|
"eval_steps_per_second": 3.259, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5343472361564636, |
|
"learning_rate": 4.66143977191732e-05, |
|
"loss": 1.3257, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.6384676098823547, |
|
"learning_rate": 4.554526015680684e-05, |
|
"loss": 1.3164, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.6373520493507385, |
|
"learning_rate": 4.4476122594440485e-05, |
|
"loss": 1.3331, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9878657460212708, |
|
"learning_rate": 4.340698503207413e-05, |
|
"loss": 1.3003, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 1.0289608240127563, |
|
"eval_runtime": 429.2778, |
|
"eval_samples_per_second": 26.158, |
|
"eval_steps_per_second": 3.271, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5717479586601257, |
|
"learning_rate": 4.2337847469707764e-05, |
|
"loss": 1.3245, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.7836593985557556, |
|
"learning_rate": 4.1268709907341407e-05, |
|
"loss": 1.3023, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.53411465883255, |
|
"learning_rate": 4.019957234497505e-05, |
|
"loss": 1.2955, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5131638646125793, |
|
"learning_rate": 3.913043478260869e-05, |
|
"loss": 1.251, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 1.019782543182373, |
|
"eval_runtime": 429.0558, |
|
"eval_samples_per_second": 26.171, |
|
"eval_steps_per_second": 3.272, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5704027414321899, |
|
"learning_rate": 3.8061297220242335e-05, |
|
"loss": 1.3267, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.44694724678993225, |
|
"learning_rate": 3.699215965787598e-05, |
|
"loss": 1.307, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5663137435913086, |
|
"learning_rate": 3.592302209550962e-05, |
|
"loss": 1.3299, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.7042715549468994, |
|
"learning_rate": 3.485388453314326e-05, |
|
"loss": 1.283, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 1.0289275646209717, |
|
"eval_runtime": 429.5115, |
|
"eval_samples_per_second": 26.144, |
|
"eval_steps_per_second": 3.269, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6489189267158508, |
|
"learning_rate": 3.3784746970776906e-05, |
|
"loss": 1.3028, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6258445382118225, |
|
"learning_rate": 3.271560940841055e-05, |
|
"loss": 1.2966, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5730215311050415, |
|
"learning_rate": 3.164647184604419e-05, |
|
"loss": 1.318, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7148025035858154, |
|
"learning_rate": 3.057733428367783e-05, |
|
"loss": 1.263, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.0275732278823853, |
|
"eval_runtime": 430.3813, |
|
"eval_samples_per_second": 26.091, |
|
"eval_steps_per_second": 3.262, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5665017366409302, |
|
"learning_rate": 2.950819672131147e-05, |
|
"loss": 1.2744, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.8906866312026978, |
|
"learning_rate": 2.8439059158945114e-05, |
|
"loss": 1.3154, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7491171956062317, |
|
"learning_rate": 2.736992159657876e-05, |
|
"loss": 1.3158, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5208207964897156, |
|
"learning_rate": 2.63007840342124e-05, |
|
"loss": 1.2488, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 1.02449369430542, |
|
"eval_runtime": 429.3553, |
|
"eval_samples_per_second": 26.153, |
|
"eval_steps_per_second": 3.27, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7041420936584473, |
|
"learning_rate": 2.5231646471846042e-05, |
|
"loss": 1.3145, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5100432634353638, |
|
"learning_rate": 2.4162508909479685e-05, |
|
"loss": 1.3209, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.491322785615921, |
|
"learning_rate": 2.3093371347113328e-05, |
|
"loss": 1.2835, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.4475841224193573, |
|
"learning_rate": 2.2024233784746968e-05, |
|
"loss": 1.3328, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 1.0273866653442383, |
|
"eval_runtime": 429.937, |
|
"eval_samples_per_second": 26.118, |
|
"eval_steps_per_second": 3.266, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.4693332612514496, |
|
"learning_rate": 2.095509622238061e-05, |
|
"loss": 1.284, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5190310478210449, |
|
"learning_rate": 1.9885958660014253e-05, |
|
"loss": 1.2987, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5245566368103027, |
|
"learning_rate": 1.8816821097647896e-05, |
|
"loss": 1.3144, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6682144403457642, |
|
"learning_rate": 1.774768353528154e-05, |
|
"loss": 1.331, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 1.0257986783981323, |
|
"eval_runtime": 429.8676, |
|
"eval_samples_per_second": 26.122, |
|
"eval_steps_per_second": 3.266, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.8946670293807983, |
|
"learning_rate": 1.6678545972915182e-05, |
|
"loss": 1.307, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.4281274378299713, |
|
"learning_rate": 1.560940841054882e-05, |
|
"loss": 1.316, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6014120578765869, |
|
"learning_rate": 1.4540270848182466e-05, |
|
"loss": 1.3436, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.48019564151763916, |
|
"learning_rate": 1.3471133285816107e-05, |
|
"loss": 1.247, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.026702642440796, |
|
"eval_runtime": 429.7224, |
|
"eval_samples_per_second": 26.131, |
|
"eval_steps_per_second": 3.267, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6991472244262695, |
|
"learning_rate": 1.240199572344975e-05, |
|
"loss": 1.3156, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5677585601806641, |
|
"learning_rate": 1.1332858161083391e-05, |
|
"loss": 1.3139, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4951375722885132, |
|
"learning_rate": 1.0263720598717034e-05, |
|
"loss": 1.2654, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4676544666290283, |
|
"learning_rate": 9.194583036350677e-06, |
|
"loss": 1.2781, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 1.0250627994537354, |
|
"eval_runtime": 428.9007, |
|
"eval_samples_per_second": 26.181, |
|
"eval_steps_per_second": 3.273, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5621703267097473, |
|
"learning_rate": 8.12544547398432e-06, |
|
"loss": 1.3214, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5663436055183411, |
|
"learning_rate": 7.0563079116179615e-06, |
|
"loss": 1.3685, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.6462903022766113, |
|
"learning_rate": 5.9871703492516035e-06, |
|
"loss": 1.3161, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5171219706535339, |
|
"learning_rate": 4.9180327868852455e-06, |
|
"loss": 1.2707, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 1.0249730348587036, |
|
"eval_runtime": 429.5565, |
|
"eval_samples_per_second": 26.141, |
|
"eval_steps_per_second": 3.268, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5951040387153625, |
|
"learning_rate": 3.848895224518888e-06, |
|
"loss": 1.3382, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.43818196654319763, |
|
"learning_rate": 2.7797576621525303e-06, |
|
"loss": 1.2831, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.6639280319213867, |
|
"learning_rate": 1.7106200997861725e-06, |
|
"loss": 1.2704, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.45635080337524414, |
|
"learning_rate": 6.414825374198146e-07, |
|
"loss": 1.3303, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.0249310731887817, |
|
"eval_runtime": 429.876, |
|
"eval_samples_per_second": 26.121, |
|
"eval_steps_per_second": 3.266, |
|
"step": 1400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1403, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 5.282295580046131e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|