|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9876543209876543, |
|
"eval_steps": 500, |
|
"global_step": 363, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00823045267489712, |
|
"grad_norm": 1.4866708861109026, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 1.7565, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01646090534979424, |
|
"grad_norm": 1.4564242960595204, |
|
"learning_rate": 1.0810810810810812e-05, |
|
"loss": 1.7428, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024691358024691357, |
|
"grad_norm": 1.5029588271629886, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 1.7399, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03292181069958848, |
|
"grad_norm": 1.4275061812775922, |
|
"learning_rate": 2.1621621621621624e-05, |
|
"loss": 1.7083, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0411522633744856, |
|
"grad_norm": 1.3211892837590813, |
|
"learning_rate": 2.702702702702703e-05, |
|
"loss": 1.6472, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04938271604938271, |
|
"grad_norm": 1.2954095745224536, |
|
"learning_rate": 3.2432432432432436e-05, |
|
"loss": 1.5935, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05761316872427984, |
|
"grad_norm": 0.9115980406962145, |
|
"learning_rate": 3.783783783783784e-05, |
|
"loss": 1.4303, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.06584362139917696, |
|
"grad_norm": 0.7713124020101795, |
|
"learning_rate": 4.324324324324325e-05, |
|
"loss": 1.3661, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.7920851957124333, |
|
"learning_rate": 4.8648648648648654e-05, |
|
"loss": 1.3093, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0823045267489712, |
|
"grad_norm": 0.8823079310051533, |
|
"learning_rate": 5.405405405405406e-05, |
|
"loss": 1.1587, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09053497942386832, |
|
"grad_norm": 0.8557863491129706, |
|
"learning_rate": 5.9459459459459466e-05, |
|
"loss": 1.0485, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09876543209876543, |
|
"grad_norm": 0.8802985405555526, |
|
"learning_rate": 6.486486486486487e-05, |
|
"loss": 0.9429, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.10699588477366255, |
|
"grad_norm": 0.8436384566917278, |
|
"learning_rate": 7.027027027027028e-05, |
|
"loss": 0.8218, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.11522633744855967, |
|
"grad_norm": 0.6676398141327192, |
|
"learning_rate": 7.567567567567568e-05, |
|
"loss": 0.7054, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.6471842089986185, |
|
"learning_rate": 8.108108108108109e-05, |
|
"loss": 0.6758, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.13168724279835392, |
|
"grad_norm": 0.4142479004737112, |
|
"learning_rate": 8.64864864864865e-05, |
|
"loss": 0.6222, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.13991769547325103, |
|
"grad_norm": 0.42850243250289555, |
|
"learning_rate": 9.18918918918919e-05, |
|
"loss": 0.5962, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.5568208902875614, |
|
"learning_rate": 9.729729729729731e-05, |
|
"loss": 0.5984, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.15637860082304528, |
|
"grad_norm": 0.5540687236420326, |
|
"learning_rate": 0.0001027027027027027, |
|
"loss": 0.4938, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1646090534979424, |
|
"grad_norm": 0.33332950971206693, |
|
"learning_rate": 0.00010810810810810812, |
|
"loss": 0.5161, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1728395061728395, |
|
"grad_norm": 0.2653066423031725, |
|
"learning_rate": 0.00011351351351351351, |
|
"loss": 0.5136, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.18106995884773663, |
|
"grad_norm": 0.2272101382108708, |
|
"learning_rate": 0.00011891891891891893, |
|
"loss": 0.4873, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.18930041152263374, |
|
"grad_norm": 0.24006404965358885, |
|
"learning_rate": 0.00012432432432432433, |
|
"loss": 0.4844, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 0.20608476733616163, |
|
"learning_rate": 0.00012972972972972974, |
|
"loss": 0.4821, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.205761316872428, |
|
"grad_norm": 0.16150407452920948, |
|
"learning_rate": 0.00013513513513513514, |
|
"loss": 0.4167, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2139917695473251, |
|
"grad_norm": 0.1538751616515209, |
|
"learning_rate": 0.00014054054054054056, |
|
"loss": 0.446, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.16472183357224798, |
|
"learning_rate": 0.00014594594594594595, |
|
"loss": 0.4142, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.23045267489711935, |
|
"grad_norm": 0.1552702492617925, |
|
"learning_rate": 0.00015135135135135137, |
|
"loss": 0.4909, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.23868312757201646, |
|
"grad_norm": 0.14727903417905372, |
|
"learning_rate": 0.00015675675675675676, |
|
"loss": 0.4065, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.14784235244019464, |
|
"learning_rate": 0.00016216216216216218, |
|
"loss": 0.3886, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2551440329218107, |
|
"grad_norm": 0.1450616035553425, |
|
"learning_rate": 0.00016756756756756757, |
|
"loss": 0.3902, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.26337448559670784, |
|
"grad_norm": 0.153342097650188, |
|
"learning_rate": 0.000172972972972973, |
|
"loss": 0.3888, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2716049382716049, |
|
"grad_norm": 0.15900713930790533, |
|
"learning_rate": 0.00017837837837837839, |
|
"loss": 0.3524, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.27983539094650206, |
|
"grad_norm": 0.1546499892536795, |
|
"learning_rate": 0.0001837837837837838, |
|
"loss": 0.3648, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2880658436213992, |
|
"grad_norm": 0.14320964717963003, |
|
"learning_rate": 0.0001891891891891892, |
|
"loss": 0.3658, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.13103727806567708, |
|
"learning_rate": 0.00019459459459459462, |
|
"loss": 0.3496, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3045267489711934, |
|
"grad_norm": 0.13308383037195082, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3572, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.31275720164609055, |
|
"grad_norm": 0.1387871382009263, |
|
"learning_rate": 0.00019999535665248002, |
|
"loss": 0.3214, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.32098765432098764, |
|
"grad_norm": 0.14270131722019058, |
|
"learning_rate": 0.0001999814270411335, |
|
"loss": 0.3445, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3292181069958848, |
|
"grad_norm": 0.1545337969749864, |
|
"learning_rate": 0.000199958212459561, |
|
"loss": 0.32, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3374485596707819, |
|
"grad_norm": 0.13802182056866283, |
|
"learning_rate": 0.00019992571506363, |
|
"loss": 0.3449, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.345679012345679, |
|
"grad_norm": 0.14162173979281106, |
|
"learning_rate": 0.00019988393787127441, |
|
"loss": 0.3262, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.35390946502057613, |
|
"grad_norm": 0.13594846523403772, |
|
"learning_rate": 0.0001998328847622148, |
|
"loss": 0.2958, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.36213991769547327, |
|
"grad_norm": 0.12596629667831052, |
|
"learning_rate": 0.00019977256047759765, |
|
"loss": 0.286, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.13835455454793713, |
|
"learning_rate": 0.00019970297061955533, |
|
"loss": 0.2878, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3786008230452675, |
|
"grad_norm": 0.14644104388608956, |
|
"learning_rate": 0.00019962412165068573, |
|
"loss": 0.2952, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3868312757201646, |
|
"grad_norm": 0.1361181054099992, |
|
"learning_rate": 0.00019953602089345217, |
|
"loss": 0.267, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3950617283950617, |
|
"grad_norm": 0.1545760777310844, |
|
"learning_rate": 0.0001994386765295032, |
|
"loss": 0.2823, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.40329218106995884, |
|
"grad_norm": 0.16048561280797768, |
|
"learning_rate": 0.00019933209759891317, |
|
"loss": 0.2598, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.411522633744856, |
|
"grad_norm": 0.14585143788717048, |
|
"learning_rate": 0.00019921629399934223, |
|
"loss": 0.2834, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.41975308641975306, |
|
"grad_norm": 0.15590757372746555, |
|
"learning_rate": 0.00019909127648511755, |
|
"loss": 0.2619, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4279835390946502, |
|
"grad_norm": 0.142205678471833, |
|
"learning_rate": 0.0001989570566662345, |
|
"loss": 0.2477, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.43621399176954734, |
|
"grad_norm": 0.14334288636686987, |
|
"learning_rate": 0.00019881364700727823, |
|
"loss": 0.2958, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.1584716514858334, |
|
"learning_rate": 0.0001986610608262665, |
|
"loss": 0.2708, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.45267489711934156, |
|
"grad_norm": 0.15229425624208448, |
|
"learning_rate": 0.00019849931229341258, |
|
"loss": 0.2776, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4609053497942387, |
|
"grad_norm": 0.1497514774482752, |
|
"learning_rate": 0.00019832841642980945, |
|
"loss": 0.2325, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4691358024691358, |
|
"grad_norm": 0.15302882739571017, |
|
"learning_rate": 0.00019814838910603481, |
|
"loss": 0.2755, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.4773662551440329, |
|
"grad_norm": 0.20080266795074445, |
|
"learning_rate": 0.00019795924704067721, |
|
"loss": 0.2421, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.48559670781893005, |
|
"grad_norm": 0.15063814452987448, |
|
"learning_rate": 0.00019776100779878345, |
|
"loss": 0.2152, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 0.1424158898404743, |
|
"learning_rate": 0.00019755368979022732, |
|
"loss": 0.2424, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5020576131687243, |
|
"grad_norm": 0.15708199535695755, |
|
"learning_rate": 0.00019733731226800015, |
|
"loss": 0.2439, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5102880658436214, |
|
"grad_norm": 0.1469505519285418, |
|
"learning_rate": 0.00019711189532642243, |
|
"loss": 0.2174, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.15443629035666892, |
|
"learning_rate": 0.00019687745989927823, |
|
"loss": 0.2201, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5267489711934157, |
|
"grad_norm": 0.16091185410714737, |
|
"learning_rate": 0.00019663402775787066, |
|
"loss": 0.2153, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5349794238683128, |
|
"grad_norm": 0.16812247962269722, |
|
"learning_rate": 0.00019638162150900027, |
|
"loss": 0.2245, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5432098765432098, |
|
"grad_norm": 0.15476006323853086, |
|
"learning_rate": 0.00019612026459286578, |
|
"loss": 0.2168, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.551440329218107, |
|
"grad_norm": 0.14319948869013133, |
|
"learning_rate": 0.00019584998128088684, |
|
"loss": 0.2102, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5596707818930041, |
|
"grad_norm": 0.1416595781349322, |
|
"learning_rate": 0.0001955707966734505, |
|
"loss": 0.2109, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5679012345679012, |
|
"grad_norm": 0.15627936343326146, |
|
"learning_rate": 0.00019528273669757972, |
|
"loss": 0.221, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5761316872427984, |
|
"grad_norm": 0.1355242418955904, |
|
"learning_rate": 0.0001949858281045261, |
|
"loss": 0.1934, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5843621399176955, |
|
"grad_norm": 0.15837694932534985, |
|
"learning_rate": 0.00019468009846728513, |
|
"loss": 0.2106, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.14575785598463487, |
|
"learning_rate": 0.00019436557617803595, |
|
"loss": 0.1958, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6008230452674898, |
|
"grad_norm": 0.15004026647926874, |
|
"learning_rate": 0.00019404229044550433, |
|
"loss": 0.2111, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6090534979423868, |
|
"grad_norm": 0.1531758472218286, |
|
"learning_rate": 0.00019371027129225042, |
|
"loss": 0.1796, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 0.14522275863691966, |
|
"learning_rate": 0.0001933695495518804, |
|
"loss": 0.1879, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6255144032921811, |
|
"grad_norm": 0.14778202822977027, |
|
"learning_rate": 0.00019302015686618326, |
|
"loss": 0.1783, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6337448559670782, |
|
"grad_norm": 0.14202535229938665, |
|
"learning_rate": 0.0001926621256821922, |
|
"loss": 0.1672, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6419753086419753, |
|
"grad_norm": 0.15532295497905474, |
|
"learning_rate": 0.00019229548924917146, |
|
"loss": 0.1894, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.6502057613168725, |
|
"grad_norm": 0.1298933712297695, |
|
"learning_rate": 0.00019192028161552847, |
|
"loss": 0.1626, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6584362139917695, |
|
"grad_norm": 0.15486589167638415, |
|
"learning_rate": 0.0001915365376256519, |
|
"loss": 0.1829, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.13198994845547862, |
|
"learning_rate": 0.00019114429291667583, |
|
"loss": 0.1827, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6748971193415638, |
|
"grad_norm": 0.14486854526136775, |
|
"learning_rate": 0.00019074358391517023, |
|
"loss": 0.1711, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6831275720164609, |
|
"grad_norm": 0.18136392511258384, |
|
"learning_rate": 0.00019033444783375804, |
|
"loss": 0.1852, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.691358024691358, |
|
"grad_norm": 0.12835927358561786, |
|
"learning_rate": 0.00018991692266765947, |
|
"loss": 0.1874, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6995884773662552, |
|
"grad_norm": 0.15923203558527596, |
|
"learning_rate": 0.00018949104719116332, |
|
"loss": 0.1754, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7078189300411523, |
|
"grad_norm": 0.1524830096667991, |
|
"learning_rate": 0.00018905686095402647, |
|
"loss": 0.1772, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.7160493827160493, |
|
"grad_norm": 0.1458630884597557, |
|
"learning_rate": 0.0001886144042778006, |
|
"loss": 0.1884, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7242798353909465, |
|
"grad_norm": 0.14812531898730455, |
|
"learning_rate": 0.00018816371825208789, |
|
"loss": 0.1694, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7325102880658436, |
|
"grad_norm": 0.1371838995231899, |
|
"learning_rate": 0.0001877048447307252, |
|
"loss": 0.175, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.12597672532717027, |
|
"learning_rate": 0.00018723782632789701, |
|
"loss": 0.1663, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7489711934156379, |
|
"grad_norm": 0.13653897293999862, |
|
"learning_rate": 0.00018676270641417822, |
|
"loss": 0.1902, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.757201646090535, |
|
"grad_norm": 0.11918081783294039, |
|
"learning_rate": 0.0001862795291125063, |
|
"loss": 0.1662, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7654320987654321, |
|
"grad_norm": 0.1333312758081247, |
|
"learning_rate": 0.0001857883392940837, |
|
"loss": 0.199, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7736625514403292, |
|
"grad_norm": 0.12930618535907987, |
|
"learning_rate": 0.000185289182574211, |
|
"loss": 0.1697, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7818930041152263, |
|
"grad_norm": 0.1336640654310965, |
|
"learning_rate": 0.0001847821053080505, |
|
"loss": 0.1852, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7901234567901234, |
|
"grad_norm": 0.14974272672832492, |
|
"learning_rate": 0.00018426715458632153, |
|
"loss": 0.1819, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7983539094650206, |
|
"grad_norm": 0.11738767625285426, |
|
"learning_rate": 0.00018374437823092724, |
|
"loss": 0.1628, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.8065843621399177, |
|
"grad_norm": 0.1126501945549117, |
|
"learning_rate": 0.00018321382479051347, |
|
"loss": 0.1574, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 0.14716070780057008, |
|
"learning_rate": 0.00018267554353596025, |
|
"loss": 0.1671, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.823045267489712, |
|
"grad_norm": 0.14236534823623162, |
|
"learning_rate": 0.0001821295844558062, |
|
"loss": 0.179, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.831275720164609, |
|
"grad_norm": 0.14819744720356537, |
|
"learning_rate": 0.0001815759982516061, |
|
"loss": 0.1765, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.8395061728395061, |
|
"grad_norm": 0.1359773653210936, |
|
"learning_rate": 0.00018101483633322255, |
|
"loss": 0.1736, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.8477366255144033, |
|
"grad_norm": 0.12220336917197815, |
|
"learning_rate": 0.00018044615081405153, |
|
"loss": 0.1559, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8559670781893004, |
|
"grad_norm": 0.12734749296356374, |
|
"learning_rate": 0.00017986999450618295, |
|
"loss": 0.1598, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.8641975308641975, |
|
"grad_norm": 0.13635727971275893, |
|
"learning_rate": 0.00017928642091549613, |
|
"loss": 0.1716, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8724279835390947, |
|
"grad_norm": 0.1256041209800328, |
|
"learning_rate": 0.00017869548423669077, |
|
"loss": 0.1694, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8806584362139918, |
|
"grad_norm": 0.156735390007985, |
|
"learning_rate": 0.00017809723934825405, |
|
"loss": 0.1711, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.13613462557969605, |
|
"learning_rate": 0.00017749174180736442, |
|
"loss": 0.1575, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.897119341563786, |
|
"grad_norm": 0.12136550882477753, |
|
"learning_rate": 0.00017687904784473188, |
|
"loss": 0.1541, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.9053497942386831, |
|
"grad_norm": 0.11805606680499708, |
|
"learning_rate": 0.00017625921435937637, |
|
"loss": 0.153, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9135802469135802, |
|
"grad_norm": 0.12306976139021918, |
|
"learning_rate": 0.00017563229891334338, |
|
"loss": 0.1723, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.9218106995884774, |
|
"grad_norm": 0.12639467162562257, |
|
"learning_rate": 0.00017499835972635856, |
|
"loss": 0.1637, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.9300411522633745, |
|
"grad_norm": 0.11341325633796959, |
|
"learning_rate": 0.00017435745567042095, |
|
"loss": 0.1471, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.9382716049382716, |
|
"grad_norm": 0.12238460464162876, |
|
"learning_rate": 0.00017370964626433567, |
|
"loss": 0.1682, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.9465020576131687, |
|
"grad_norm": 0.12574079950832473, |
|
"learning_rate": 0.0001730549916681868, |
|
"loss": 0.1493, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.9547325102880658, |
|
"grad_norm": 0.13251823016582745, |
|
"learning_rate": 0.00017239355267775018, |
|
"loss": 0.1649, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.11699891536949006, |
|
"learning_rate": 0.0001717253907188477, |
|
"loss": 0.1628, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.9711934156378601, |
|
"grad_norm": 0.11316448680114288, |
|
"learning_rate": 0.00017105056784164294, |
|
"loss": 0.1434, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.9794238683127572, |
|
"grad_norm": 0.11531932007410475, |
|
"learning_rate": 0.00017036914671487852, |
|
"loss": 0.1565, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 0.11423778544173263, |
|
"learning_rate": 0.00016968119062005642, |
|
"loss": 0.1481, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9958847736625515, |
|
"grad_norm": 0.10808259099105172, |
|
"learning_rate": 0.00016898676344556118, |
|
"loss": 0.1393, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9958847736625515, |
|
"eval_loss": 0.1582392454147339, |
|
"eval_runtime": 24.3633, |
|
"eval_samples_per_second": 33.534, |
|
"eval_steps_per_second": 1.067, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.0041152263374487, |
|
"grad_norm": 0.12179637517786698, |
|
"learning_rate": 0.00016828592968072678, |
|
"loss": 0.1367, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.0123456790123457, |
|
"grad_norm": 0.12278954055202473, |
|
"learning_rate": 0.00016757875440984768, |
|
"loss": 0.1352, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.0205761316872428, |
|
"grad_norm": 0.11262318129903952, |
|
"learning_rate": 0.0001668653033061347, |
|
"loss": 0.1319, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.02880658436214, |
|
"grad_norm": 0.10685919423230769, |
|
"learning_rate": 0.00016614564262561608, |
|
"loss": 0.1483, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.037037037037037, |
|
"grad_norm": 0.12688594869999706, |
|
"learning_rate": 0.0001654198392009846, |
|
"loss": 0.1345, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.045267489711934, |
|
"grad_norm": 0.11380792656533252, |
|
"learning_rate": 0.0001646879604353908, |
|
"loss": 0.1435, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.0534979423868314, |
|
"grad_norm": 0.11336135523754398, |
|
"learning_rate": 0.00016395007429618382, |
|
"loss": 0.1496, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.0617283950617284, |
|
"grad_norm": 0.11068219074387899, |
|
"learning_rate": 0.00016320624930859904, |
|
"loss": 0.1412, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.0699588477366255, |
|
"grad_norm": 0.10745622525472565, |
|
"learning_rate": 0.00016245655454939474, |
|
"loss": 0.1294, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0781893004115226, |
|
"grad_norm": 0.10201633324761311, |
|
"learning_rate": 0.00016170105964043695, |
|
"loss": 0.1443, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.0864197530864197, |
|
"grad_norm": 0.1137314706455847, |
|
"learning_rate": 0.0001609398347422339, |
|
"loss": 0.1389, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.0946502057613168, |
|
"grad_norm": 0.11204326746406651, |
|
"learning_rate": 0.00016017295054742046, |
|
"loss": 0.1422, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.102880658436214, |
|
"grad_norm": 0.10142584272145459, |
|
"learning_rate": 0.00015940047827419303, |
|
"loss": 0.1301, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.10382986472663408, |
|
"learning_rate": 0.00015862248965969604, |
|
"loss": 0.1388, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.1193415637860082, |
|
"grad_norm": 0.11472916493277631, |
|
"learning_rate": 0.00015783905695335946, |
|
"loss": 0.1406, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.1275720164609053, |
|
"grad_norm": 0.11056711973862805, |
|
"learning_rate": 0.0001570502529101896, |
|
"loss": 0.1295, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.1358024691358024, |
|
"grad_norm": 0.11881161089888494, |
|
"learning_rate": 0.00015625615078401244, |
|
"loss": 0.1491, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.1440329218106995, |
|
"grad_norm": 0.10248132676392517, |
|
"learning_rate": 0.00015545682432067067, |
|
"loss": 0.1235, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.1522633744855968, |
|
"grad_norm": 0.10737832299551002, |
|
"learning_rate": 0.0001546523477511754, |
|
"loss": 0.15, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.1604938271604939, |
|
"grad_norm": 0.10807513117026502, |
|
"learning_rate": 0.00015384279578481221, |
|
"loss": 0.1302, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.168724279835391, |
|
"grad_norm": 0.11235063041052787, |
|
"learning_rate": 0.00015302824360220353, |
|
"loss": 0.1386, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.176954732510288, |
|
"grad_norm": 0.11303603244571206, |
|
"learning_rate": 0.00015220876684832638, |
|
"loss": 0.1354, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.10834449382941344, |
|
"learning_rate": 0.0001513844416254879, |
|
"loss": 0.1329, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.1934156378600824, |
|
"grad_norm": 0.1079013280016857, |
|
"learning_rate": 0.00015055534448625766, |
|
"loss": 0.1395, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.2016460905349795, |
|
"grad_norm": 0.10261607273228789, |
|
"learning_rate": 0.00014972155242635852, |
|
"loss": 0.129, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.2098765432098766, |
|
"grad_norm": 0.11363708966450349, |
|
"learning_rate": 0.0001488831428775164, |
|
"loss": 0.1461, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.2181069958847737, |
|
"grad_norm": 0.11418227389762935, |
|
"learning_rate": 0.00014804019370026926, |
|
"loss": 0.1408, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.2263374485596708, |
|
"grad_norm": 0.11642847183223218, |
|
"learning_rate": 0.00014719278317673655, |
|
"loss": 0.1462, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.2345679012345678, |
|
"grad_norm": 0.11219874980224723, |
|
"learning_rate": 0.0001463409900033493, |
|
"loss": 0.1302, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.242798353909465, |
|
"grad_norm": 0.11039753961672125, |
|
"learning_rate": 0.00014548489328354195, |
|
"loss": 0.1349, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.2510288065843622, |
|
"grad_norm": 0.10772127499535779, |
|
"learning_rate": 0.00014462457252040607, |
|
"loss": 0.134, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"grad_norm": 0.10693684319005581, |
|
"learning_rate": 0.00014376010760930728, |
|
"loss": 0.1314, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.2674897119341564, |
|
"grad_norm": 0.10099951149470732, |
|
"learning_rate": 0.00014289157883046568, |
|
"loss": 0.1314, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.2757201646090535, |
|
"grad_norm": 0.10714073475937398, |
|
"learning_rate": 0.0001420190668415002, |
|
"loss": 0.1168, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.2839506172839505, |
|
"grad_norm": 0.11431029472007842, |
|
"learning_rate": 0.00014114265266993846, |
|
"loss": 0.1457, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.2921810699588478, |
|
"grad_norm": 0.11035801526331707, |
|
"learning_rate": 0.00014026241770569197, |
|
"loss": 0.1496, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.300411522633745, |
|
"grad_norm": 0.10419031063352954, |
|
"learning_rate": 0.00013937844369349734, |
|
"loss": 0.1323, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.308641975308642, |
|
"grad_norm": 0.10823681149718124, |
|
"learning_rate": 0.00013849081272532544, |
|
"loss": 0.1264, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.316872427983539, |
|
"grad_norm": 0.11224858342990347, |
|
"learning_rate": 0.00013759960723275732, |
|
"loss": 0.1494, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3251028806584362, |
|
"grad_norm": 0.11222637320000997, |
|
"learning_rate": 0.00013670490997932922, |
|
"loss": 0.1446, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.11350101469976674, |
|
"learning_rate": 0.00013580680405284664, |
|
"loss": 0.1501, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.3415637860082303, |
|
"grad_norm": 0.11216990215296897, |
|
"learning_rate": 0.00013490537285766808, |
|
"loss": 0.1518, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.3497942386831276, |
|
"grad_norm": 0.10385126129598882, |
|
"learning_rate": 0.00013400070010695966, |
|
"loss": 0.1326, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.3580246913580247, |
|
"grad_norm": 0.09801204121771982, |
|
"learning_rate": 0.00013309286981492085, |
|
"loss": 0.1385, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.3662551440329218, |
|
"grad_norm": 0.10515015522554948, |
|
"learning_rate": 0.00013218196628898233, |
|
"loss": 0.1435, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.374485596707819, |
|
"grad_norm": 0.11167870227165867, |
|
"learning_rate": 0.00013126807412197665, |
|
"loss": 0.1469, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.382716049382716, |
|
"grad_norm": 0.110692438020908, |
|
"learning_rate": 0.0001303512781842824, |
|
"loss": 0.1267, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.3909465020576133, |
|
"grad_norm": 0.11426160677474222, |
|
"learning_rate": 0.00012943166361594242, |
|
"loss": 0.1308, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.3991769547325104, |
|
"grad_norm": 0.11583690935003194, |
|
"learning_rate": 0.00012850931581875723, |
|
"loss": 0.1484, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4074074074074074, |
|
"grad_norm": 0.10018605187361156, |
|
"learning_rate": 0.00012758432044835392, |
|
"loss": 0.141, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.4156378600823045, |
|
"grad_norm": 0.10604765462173282, |
|
"learning_rate": 0.0001266567634062317, |
|
"loss": 0.1291, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.4238683127572016, |
|
"grad_norm": 0.10158287607968718, |
|
"learning_rate": 0.0001257267308317845, |
|
"loss": 0.1276, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.4320987654320987, |
|
"grad_norm": 0.10299808028319524, |
|
"learning_rate": 0.00012479430909430108, |
|
"loss": 0.1317, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.4403292181069958, |
|
"grad_norm": 0.1118568576111719, |
|
"learning_rate": 0.00012385958478494487, |
|
"loss": 0.1288, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.448559670781893, |
|
"grad_norm": 0.10353944126895748, |
|
"learning_rate": 0.00012292264470871182, |
|
"loss": 0.1175, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.4567901234567902, |
|
"grad_norm": 0.11318325883515341, |
|
"learning_rate": 0.00012198357587636957, |
|
"loss": 0.1304, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.4650205761316872, |
|
"grad_norm": 0.11698708468284778, |
|
"learning_rate": 0.00012104246549637683, |
|
"loss": 0.143, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.4732510288065843, |
|
"grad_norm": 0.12408074070049697, |
|
"learning_rate": 0.00012009940096678452, |
|
"loss": 0.1583, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.115849428171122, |
|
"learning_rate": 0.00011915446986711953, |
|
"loss": 0.1401, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.4897119341563787, |
|
"grad_norm": 0.10498841318355592, |
|
"learning_rate": 0.00011820775995025147, |
|
"loss": 0.1404, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.4979423868312758, |
|
"grad_norm": 0.11284118297477287, |
|
"learning_rate": 0.0001172593591342432, |
|
"loss": 0.1479, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.5061728395061729, |
|
"grad_norm": 0.1063063021523787, |
|
"learning_rate": 0.00011630935549418627, |
|
"loss": 0.1223, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.51440329218107, |
|
"grad_norm": 0.09875055061391125, |
|
"learning_rate": 0.00011535783725402163, |
|
"loss": 0.1177, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.522633744855967, |
|
"grad_norm": 0.10114695106701617, |
|
"learning_rate": 0.00011440489277834645, |
|
"loss": 0.1403, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.5308641975308643, |
|
"grad_norm": 0.09017689726172246, |
|
"learning_rate": 0.0001134506105642081, |
|
"loss": 0.1114, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.5390946502057612, |
|
"grad_norm": 0.09727031699745013, |
|
"learning_rate": 0.00011249507923288562, |
|
"loss": 0.1368, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.5473251028806585, |
|
"grad_norm": 0.10131253354694934, |
|
"learning_rate": 0.0001115383875216598, |
|
"loss": 0.1278, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.09766756433474798, |
|
"learning_rate": 0.00011058062427557229, |
|
"loss": 0.1284, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.5637860082304527, |
|
"grad_norm": 0.09911328195816287, |
|
"learning_rate": 0.00010962187843917497, |
|
"loss": 0.1284, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.5720164609053497, |
|
"grad_norm": 0.10809617890378591, |
|
"learning_rate": 0.0001086622390482699, |
|
"loss": 0.1423, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.5802469135802468, |
|
"grad_norm": 0.09456158065899406, |
|
"learning_rate": 0.00010770179522164079, |
|
"loss": 0.1317, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.5884773662551441, |
|
"grad_norm": 0.09799719160217063, |
|
"learning_rate": 0.0001067406361527768, |
|
"loss": 0.1356, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.596707818930041, |
|
"grad_norm": 0.09992479118544549, |
|
"learning_rate": 0.00010577885110158958, |
|
"loss": 0.1292, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.6049382716049383, |
|
"grad_norm": 0.09693089475338096, |
|
"learning_rate": 0.00010481652938612374, |
|
"loss": 0.1391, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.6131687242798354, |
|
"grad_norm": 0.09217578544631694, |
|
"learning_rate": 0.00010385376037426226, |
|
"loss": 0.1152, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.6213991769547325, |
|
"grad_norm": 0.09249375316990154, |
|
"learning_rate": 0.00010289063347542726, |
|
"loss": 0.1154, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.6296296296296298, |
|
"grad_norm": 0.09641811645171969, |
|
"learning_rate": 0.00010192723813227672, |
|
"loss": 0.1182, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.6378600823045266, |
|
"grad_norm": 0.09748485166957398, |
|
"learning_rate": 0.00010096366381239808, |
|
"loss": 0.1338, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.646090534979424, |
|
"grad_norm": 0.09383283234337292, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1235, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.654320987654321, |
|
"grad_norm": 0.09670711594290475, |
|
"learning_rate": 9.903633618760195e-05, |
|
"loss": 0.1222, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.662551440329218, |
|
"grad_norm": 0.09682675756651625, |
|
"learning_rate": 9.807276186772333e-05, |
|
"loss": 0.1271, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.6707818930041154, |
|
"grad_norm": 0.09455470748347092, |
|
"learning_rate": 9.710936652457276e-05, |
|
"loss": 0.1217, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.6790123456790123, |
|
"grad_norm": 0.10988834676545467, |
|
"learning_rate": 9.614623962573776e-05, |
|
"loss": 0.1288, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.6872427983539096, |
|
"grad_norm": 0.10415165974330459, |
|
"learning_rate": 9.518347061387628e-05, |
|
"loss": 0.1307, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.6954732510288066, |
|
"grad_norm": 0.10985786659106267, |
|
"learning_rate": 9.422114889841044e-05, |
|
"loss": 0.1405, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.7037037037037037, |
|
"grad_norm": 0.09921801439791741, |
|
"learning_rate": 9.325936384722321e-05, |
|
"loss": 0.1421, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.7119341563786008, |
|
"grad_norm": 0.09695464903081527, |
|
"learning_rate": 9.229820477835927e-05, |
|
"loss": 0.1261, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.7201646090534979, |
|
"grad_norm": 0.09747696306459716, |
|
"learning_rate": 9.133776095173015e-05, |
|
"loss": 0.1295, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.7283950617283952, |
|
"grad_norm": 0.09028526494291234, |
|
"learning_rate": 9.037812156082504e-05, |
|
"loss": 0.1148, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.736625514403292, |
|
"grad_norm": 0.11512926087416764, |
|
"learning_rate": 8.941937572442773e-05, |
|
"loss": 0.1385, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.7448559670781894, |
|
"grad_norm": 0.09719378966338557, |
|
"learning_rate": 8.846161247834024e-05, |
|
"loss": 0.1308, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.7530864197530864, |
|
"grad_norm": 0.09008557356074057, |
|
"learning_rate": 8.750492076711439e-05, |
|
"loss": 0.1142, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.7613168724279835, |
|
"grad_norm": 0.10577290101156246, |
|
"learning_rate": 8.654938943579194e-05, |
|
"loss": 0.1409, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.7695473251028808, |
|
"grad_norm": 0.10122461461639144, |
|
"learning_rate": 8.55951072216536e-05, |
|
"loss": 0.1314, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.09823573980962558, |
|
"learning_rate": 8.464216274597838e-05, |
|
"loss": 0.1219, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.786008230452675, |
|
"grad_norm": 0.10401921170876789, |
|
"learning_rate": 8.369064450581373e-05, |
|
"loss": 0.144, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.794238683127572, |
|
"grad_norm": 0.09078999043742002, |
|
"learning_rate": 8.274064086575681e-05, |
|
"loss": 0.1146, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.8024691358024691, |
|
"grad_norm": 0.09290736424935679, |
|
"learning_rate": 8.179224004974857e-05, |
|
"loss": 0.1338, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.8106995884773662, |
|
"grad_norm": 0.10089995497723929, |
|
"learning_rate": 8.084553013288048e-05, |
|
"loss": 0.1299, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8189300411522633, |
|
"grad_norm": 0.11086354657368475, |
|
"learning_rate": 7.990059903321553e-05, |
|
"loss": 0.1413, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.8271604938271606, |
|
"grad_norm": 0.10239856014254489, |
|
"learning_rate": 7.89575345036232e-05, |
|
"loss": 0.1357, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.8353909465020575, |
|
"grad_norm": 0.10469014425315878, |
|
"learning_rate": 7.801642412363041e-05, |
|
"loss": 0.1386, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.8436213991769548, |
|
"grad_norm": 0.09567702676769906, |
|
"learning_rate": 7.707735529128819e-05, |
|
"loss": 0.1191, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.09887833758041985, |
|
"learning_rate": 7.614041521505517e-05, |
|
"loss": 0.1219, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.860082304526749, |
|
"grad_norm": 0.11752510500534394, |
|
"learning_rate": 7.520569090569893e-05, |
|
"loss": 0.1436, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.8683127572016462, |
|
"grad_norm": 0.10388061868632578, |
|
"learning_rate": 7.427326916821557e-05, |
|
"loss": 0.1521, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.876543209876543, |
|
"grad_norm": 0.09472507284073291, |
|
"learning_rate": 7.334323659376829e-05, |
|
"loss": 0.1309, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.8847736625514404, |
|
"grad_norm": 0.09368373509928322, |
|
"learning_rate": 7.24156795516461e-05, |
|
"loss": 0.1226, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.8930041152263375, |
|
"grad_norm": 0.09798127382996634, |
|
"learning_rate": 7.149068418124281e-05, |
|
"loss": 0.134, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.9012345679012346, |
|
"grad_norm": 0.09780244211128043, |
|
"learning_rate": 7.056833638405762e-05, |
|
"loss": 0.1139, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.9094650205761317, |
|
"grad_norm": 0.09955863946262253, |
|
"learning_rate": 6.964872181571764e-05, |
|
"loss": 0.1372, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.9176954732510287, |
|
"grad_norm": 0.0999251460303617, |
|
"learning_rate": 6.87319258780234e-05, |
|
"loss": 0.1302, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"grad_norm": 0.10537500302863075, |
|
"learning_rate": 6.781803371101774e-05, |
|
"loss": 0.1361, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.934156378600823, |
|
"grad_norm": 0.10646426579724969, |
|
"learning_rate": 6.690713018507918e-05, |
|
"loss": 0.1382, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.9423868312757202, |
|
"grad_norm": 0.09815506133539018, |
|
"learning_rate": 6.599929989304035e-05, |
|
"loss": 0.1248, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.9506172839506173, |
|
"grad_norm": 0.10476726048050965, |
|
"learning_rate": 6.509462714233195e-05, |
|
"loss": 0.139, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.9588477366255144, |
|
"grad_norm": 0.10022243654627384, |
|
"learning_rate": 6.419319594715339e-05, |
|
"loss": 0.1285, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.9670781893004117, |
|
"grad_norm": 0.09626907084774464, |
|
"learning_rate": 6.32950900206708e-05, |
|
"loss": 0.1345, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.9753086419753085, |
|
"grad_norm": 0.09684684548905444, |
|
"learning_rate": 6.240039276724272e-05, |
|
"loss": 0.1338, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9835390946502058, |
|
"grad_norm": 0.09617321255871392, |
|
"learning_rate": 6.150918727467455e-05, |
|
"loss": 0.1475, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.991769547325103, |
|
"grad_norm": 0.10288685971512879, |
|
"learning_rate": 6.062155630650265e-05, |
|
"loss": 0.13, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.09997416312505185, |
|
"learning_rate": 5.973758229430806e-05, |
|
"loss": 0.1282, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.14014942944049835, |
|
"eval_runtime": 21.1655, |
|
"eval_samples_per_second": 38.601, |
|
"eval_steps_per_second": 1.228, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.0082304526748973, |
|
"grad_norm": 0.0846538233930147, |
|
"learning_rate": 5.885734733006154e-05, |
|
"loss": 0.0975, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.016460905349794, |
|
"grad_norm": 0.09549609421876461, |
|
"learning_rate": 5.798093315849984e-05, |
|
"loss": 0.1102, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.0246913580246915, |
|
"grad_norm": 0.09559068048673255, |
|
"learning_rate": 5.710842116953438e-05, |
|
"loss": 0.1025, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.0329218106995883, |
|
"grad_norm": 0.10047776150134546, |
|
"learning_rate": 5.623989239069275e-05, |
|
"loss": 0.1167, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.0411522633744856, |
|
"grad_norm": 0.10670187619168023, |
|
"learning_rate": 5.537542747959394e-05, |
|
"loss": 0.1115, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.049382716049383, |
|
"grad_norm": 0.10560096207358394, |
|
"learning_rate": 5.451510671645807e-05, |
|
"loss": 0.117, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.05761316872428, |
|
"grad_norm": 0.10122771696509619, |
|
"learning_rate": 5.36590099966507e-05, |
|
"loss": 0.1253, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.065843621399177, |
|
"grad_norm": 0.09738370841133107, |
|
"learning_rate": 5.2807216823263484e-05, |
|
"loss": 0.1188, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.074074074074074, |
|
"grad_norm": 0.09886958144813014, |
|
"learning_rate": 5.1959806299730774e-05, |
|
"loss": 0.1237, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.0823045267489713, |
|
"grad_norm": 0.10219049090054223, |
|
"learning_rate": 5.111685712248364e-05, |
|
"loss": 0.1137, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.090534979423868, |
|
"grad_norm": 0.09836202650897583, |
|
"learning_rate": 5.0278447573641495e-05, |
|
"loss": 0.1196, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.0987654320987654, |
|
"grad_norm": 0.09213881530201376, |
|
"learning_rate": 4.944465551374238e-05, |
|
"loss": 0.0999, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.1069958847736627, |
|
"grad_norm": 0.09541725924400123, |
|
"learning_rate": 4.861555837451213e-05, |
|
"loss": 0.1042, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.1152263374485596, |
|
"grad_norm": 0.09856792963956205, |
|
"learning_rate": 4.779123315167362e-05, |
|
"loss": 0.1139, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.123456790123457, |
|
"grad_norm": 0.09596664358323789, |
|
"learning_rate": 4.6971756397796504e-05, |
|
"loss": 0.1127, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.1316872427983538, |
|
"grad_norm": 0.10398664110407933, |
|
"learning_rate": 4.61572042151878e-05, |
|
"loss": 0.1226, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.139917695473251, |
|
"grad_norm": 0.09339956051495163, |
|
"learning_rate": 4.5347652248824624e-05, |
|
"loss": 0.1099, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.148148148148148, |
|
"grad_norm": 0.09890330904002925, |
|
"learning_rate": 4.4543175679329344e-05, |
|
"loss": 0.1195, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.156378600823045, |
|
"grad_norm": 0.09997598174399476, |
|
"learning_rate": 4.3743849215987595e-05, |
|
"loss": 0.1128, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.1646090534979425, |
|
"grad_norm": 0.09613540261918692, |
|
"learning_rate": 4.294974708981041e-05, |
|
"loss": 0.11, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.1728395061728394, |
|
"grad_norm": 0.09811570643997117, |
|
"learning_rate": 4.216094304664056e-05, |
|
"loss": 0.1221, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.1810699588477367, |
|
"grad_norm": 0.10130508026732313, |
|
"learning_rate": 4.137751034030399e-05, |
|
"loss": 0.1147, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.1893004115226335, |
|
"grad_norm": 0.10155247373344696, |
|
"learning_rate": 4.059952172580694e-05, |
|
"loss": 0.1258, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.197530864197531, |
|
"grad_norm": 0.10114305608112335, |
|
"learning_rate": 3.982704945257957e-05, |
|
"loss": 0.1125, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.205761316872428, |
|
"grad_norm": 0.09834925536076868, |
|
"learning_rate": 3.906016525776611e-05, |
|
"loss": 0.1178, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.213991769547325, |
|
"grad_norm": 0.10693349155074929, |
|
"learning_rate": 3.829894035956306e-05, |
|
"loss": 0.125, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.09467414992649671, |
|
"learning_rate": 3.7543445450605285e-05, |
|
"loss": 0.1054, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.230452674897119, |
|
"grad_norm": 0.09826633722499567, |
|
"learning_rate": 3.6793750691400994e-05, |
|
"loss": 0.1066, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.2386831275720165, |
|
"grad_norm": 0.10348161204951604, |
|
"learning_rate": 3.6049925703816214e-05, |
|
"loss": 0.1144, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.246913580246914, |
|
"grad_norm": 0.10479446585724977, |
|
"learning_rate": 3.53120395646092e-05, |
|
"loss": 0.1135, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.2551440329218106, |
|
"grad_norm": 0.0951654749026979, |
|
"learning_rate": 3.458016079901544e-05, |
|
"loss": 0.1074, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.263374485596708, |
|
"grad_norm": 0.11076793087444725, |
|
"learning_rate": 3.38543573743839e-05, |
|
"loss": 0.1299, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.271604938271605, |
|
"grad_norm": 0.0946857687815803, |
|
"learning_rate": 3.3134696693865316e-05, |
|
"loss": 0.106, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.279835390946502, |
|
"grad_norm": 0.09607169531950183, |
|
"learning_rate": 3.242124559015234e-05, |
|
"loss": 0.0966, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.288065843621399, |
|
"grad_norm": 0.10167592887197249, |
|
"learning_rate": 3.171407031927325e-05, |
|
"loss": 0.1162, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.2962962962962963, |
|
"grad_norm": 0.09726294827804387, |
|
"learning_rate": 3.101323655443882e-05, |
|
"loss": 0.1139, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.3045267489711936, |
|
"grad_norm": 0.09582434907073016, |
|
"learning_rate": 3.031880937994359e-05, |
|
"loss": 0.1084, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.3127572016460904, |
|
"grad_norm": 0.11021289451624465, |
|
"learning_rate": 2.9630853285121508e-05, |
|
"loss": 0.1231, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.3209876543209877, |
|
"grad_norm": 0.09391459178579925, |
|
"learning_rate": 2.894943215835708e-05, |
|
"loss": 0.1047, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.3292181069958846, |
|
"grad_norm": 0.09899700696295954, |
|
"learning_rate": 2.827460928115232e-05, |
|
"loss": 0.1101, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.337448559670782, |
|
"grad_norm": 0.09901505034589045, |
|
"learning_rate": 2.7606447322249872e-05, |
|
"loss": 0.1067, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.3456790123456788, |
|
"grad_norm": 0.10420896589962987, |
|
"learning_rate": 2.6945008331813226e-05, |
|
"loss": 0.1282, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.353909465020576, |
|
"grad_norm": 0.09729393595160353, |
|
"learning_rate": 2.629035373566433e-05, |
|
"loss": 0.1023, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.3621399176954734, |
|
"grad_norm": 0.09829406149572398, |
|
"learning_rate": 2.5642544329579088e-05, |
|
"loss": 0.1155, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.3703703703703702, |
|
"grad_norm": 0.10275430043325989, |
|
"learning_rate": 2.500164027364147e-05, |
|
"loss": 0.1252, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.3786008230452675, |
|
"grad_norm": 0.10367750094719844, |
|
"learning_rate": 2.4367701086656624e-05, |
|
"loss": 0.1166, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.386831275720165, |
|
"grad_norm": 0.10274988439011652, |
|
"learning_rate": 2.3740785640623643e-05, |
|
"loss": 0.1169, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.3950617283950617, |
|
"grad_norm": 0.09057411016580416, |
|
"learning_rate": 2.312095215526814e-05, |
|
"loss": 0.1026, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.403292181069959, |
|
"grad_norm": 0.104243770759663, |
|
"learning_rate": 2.2508258192635612e-05, |
|
"loss": 0.1127, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.411522633744856, |
|
"grad_norm": 0.09595429188209968, |
|
"learning_rate": 2.1902760651745958e-05, |
|
"loss": 0.1117, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.419753086419753, |
|
"grad_norm": 0.10143248145662324, |
|
"learning_rate": 2.1304515763309253e-05, |
|
"loss": 0.111, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.42798353909465, |
|
"grad_norm": 0.1074361000716321, |
|
"learning_rate": 2.0713579084503876e-05, |
|
"loss": 0.1222, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.4362139917695473, |
|
"grad_norm": 0.09389635392057138, |
|
"learning_rate": 2.013000549381706e-05, |
|
"loss": 0.1009, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.4444444444444446, |
|
"grad_norm": 0.0950080424907371, |
|
"learning_rate": 1.9553849185948512e-05, |
|
"loss": 0.1111, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.4526748971193415, |
|
"grad_norm": 0.09761821744754258, |
|
"learning_rate": 1.8985163666777473e-05, |
|
"loss": 0.1013, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.460905349794239, |
|
"grad_norm": 0.10939996129330187, |
|
"learning_rate": 1.8424001748393905e-05, |
|
"loss": 0.1215, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.4691358024691357, |
|
"grad_norm": 0.09848524806568702, |
|
"learning_rate": 1.787041554419381e-05, |
|
"loss": 0.1, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.477366255144033, |
|
"grad_norm": 0.10208382321734329, |
|
"learning_rate": 1.7324456464039752e-05, |
|
"loss": 0.1156, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.48559670781893, |
|
"grad_norm": 0.10860830565047937, |
|
"learning_rate": 1.6786175209486566e-05, |
|
"loss": 0.1135, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.493827160493827, |
|
"grad_norm": 0.11254066618708665, |
|
"learning_rate": 1.6255621769072805e-05, |
|
"loss": 0.1202, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.5020576131687244, |
|
"grad_norm": 0.09949710536532337, |
|
"learning_rate": 1.5732845413678477e-05, |
|
"loss": 0.108, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.5102880658436213, |
|
"grad_norm": 0.09871004589964077, |
|
"learning_rate": 1.521789469194952e-05, |
|
"loss": 0.1048, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.5185185185185186, |
|
"grad_norm": 0.09509589762300931, |
|
"learning_rate": 1.4710817425789014e-05, |
|
"loss": 0.1108, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.526748971193416, |
|
"grad_norm": 0.09874336616077671, |
|
"learning_rate": 1.4211660705916285e-05, |
|
"loss": 0.1075, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.5349794238683128, |
|
"grad_norm": 0.10610686628994136, |
|
"learning_rate": 1.3720470887493719e-05, |
|
"loss": 0.114, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.5432098765432096, |
|
"grad_norm": 0.09371419408053047, |
|
"learning_rate": 1.3237293585821786e-05, |
|
"loss": 0.1, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.551440329218107, |
|
"grad_norm": 0.097056043473284, |
|
"learning_rate": 1.2762173672102996e-05, |
|
"loss": 0.1091, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.5596707818930042, |
|
"grad_norm": 0.10289500706688874, |
|
"learning_rate": 1.2295155269274827e-05, |
|
"loss": 0.108, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.567901234567901, |
|
"grad_norm": 0.10969046879996797, |
|
"learning_rate": 1.1836281747912125e-05, |
|
"loss": 0.1231, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.5761316872427984, |
|
"grad_norm": 0.09910058926653355, |
|
"learning_rate": 1.1385595722199438e-05, |
|
"loss": 0.1132, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.5843621399176957, |
|
"grad_norm": 0.0973576033135528, |
|
"learning_rate": 1.0943139045973549e-05, |
|
"loss": 0.1152, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.5925925925925926, |
|
"grad_norm": 0.10324605327000957, |
|
"learning_rate": 1.050895280883668e-05, |
|
"loss": 0.119, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.60082304526749, |
|
"grad_norm": 0.09940459072200475, |
|
"learning_rate": 1.0083077332340562e-05, |
|
"loss": 0.1086, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.6090534979423867, |
|
"grad_norm": 0.10054320644748665, |
|
"learning_rate": 9.665552166241964e-06, |
|
"loss": 0.1149, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.617283950617284, |
|
"grad_norm": 0.10479612628242368, |
|
"learning_rate": 9.256416084829778e-06, |
|
"loss": 0.1274, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.625514403292181, |
|
"grad_norm": 0.09669311712494659, |
|
"learning_rate": 8.855707083324183e-06, |
|
"loss": 0.1165, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.633744855967078, |
|
"grad_norm": 0.09887270600607351, |
|
"learning_rate": 8.46346237434813e-06, |
|
"loss": 0.1056, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.6419753086419755, |
|
"grad_norm": 0.10455687155682203, |
|
"learning_rate": 8.079718384471557e-06, |
|
"loss": 0.1152, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.6502057613168724, |
|
"grad_norm": 0.10011518680387445, |
|
"learning_rate": 7.704510750828542e-06, |
|
"loss": 0.1056, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.6584362139917697, |
|
"grad_norm": 0.09682701363965382, |
|
"learning_rate": 7.337874317807802e-06, |
|
"loss": 0.1034, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.10000970149912758, |
|
"learning_rate": 6.979843133816743e-06, |
|
"loss": 0.1119, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.674897119341564, |
|
"grad_norm": 0.10465713299866858, |
|
"learning_rate": 6.630450448119618e-06, |
|
"loss": 0.1183, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.6831275720164607, |
|
"grad_norm": 0.10068814150000575, |
|
"learning_rate": 6.289728707749609e-06, |
|
"loss": 0.11, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.691358024691358, |
|
"grad_norm": 0.09310993381960417, |
|
"learning_rate": 5.957709554495683e-06, |
|
"loss": 0.103, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.6995884773662553, |
|
"grad_norm": 0.10472398535370353, |
|
"learning_rate": 5.634423821964074e-06, |
|
"loss": 0.1161, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.707818930041152, |
|
"grad_norm": 0.10645507272996577, |
|
"learning_rate": 5.319901532714877e-06, |
|
"loss": 0.1217, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.7160493827160495, |
|
"grad_norm": 0.10602188133639034, |
|
"learning_rate": 5.014171895473929e-06, |
|
"loss": 0.1144, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.7242798353909468, |
|
"grad_norm": 0.10289659614865782, |
|
"learning_rate": 4.717263302420283e-06, |
|
"loss": 0.1079, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.7325102880658436, |
|
"grad_norm": 0.09777304769239446, |
|
"learning_rate": 4.429203326549525e-06, |
|
"loss": 0.0999, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.7407407407407405, |
|
"grad_norm": 0.1006102230121861, |
|
"learning_rate": 4.1500187191131466e-06, |
|
"loss": 0.099, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.748971193415638, |
|
"grad_norm": 0.10180225031662725, |
|
"learning_rate": 3.879735407134244e-06, |
|
"loss": 0.1173, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.757201646090535, |
|
"grad_norm": 0.10250402250260239, |
|
"learning_rate": 3.6183784909997187e-06, |
|
"loss": 0.1139, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.765432098765432, |
|
"grad_norm": 0.10615733085722019, |
|
"learning_rate": 3.3659722421293783e-06, |
|
"loss": 0.1133, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.7736625514403292, |
|
"grad_norm": 0.10439855178259218, |
|
"learning_rate": 3.1225401007217936e-06, |
|
"loss": 0.1119, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.7818930041152266, |
|
"grad_norm": 0.10526144305704047, |
|
"learning_rate": 2.8881046735775742e-06, |
|
"loss": 0.1219, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.7901234567901234, |
|
"grad_norm": 0.10521105916873999, |
|
"learning_rate": 2.66268773199988e-06, |
|
"loss": 0.1074, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.7983539094650207, |
|
"grad_norm": 0.10748613106096833, |
|
"learning_rate": 2.446310209772684e-06, |
|
"loss": 0.1128, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.8065843621399176, |
|
"grad_norm": 0.09914523804031186, |
|
"learning_rate": 2.2389922012165944e-06, |
|
"loss": 0.1121, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.814814814814815, |
|
"grad_norm": 0.09936886240495578, |
|
"learning_rate": 2.0407529593228116e-06, |
|
"loss": 0.108, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.8230452674897117, |
|
"grad_norm": 0.09150083914451453, |
|
"learning_rate": 1.8516108939651945e-06, |
|
"loss": 0.1033, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.831275720164609, |
|
"grad_norm": 0.10108462806007419, |
|
"learning_rate": 1.6715835701905603e-06, |
|
"loss": 0.1126, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.8395061728395063, |
|
"grad_norm": 0.09387291972239033, |
|
"learning_rate": 1.5006877065874336e-06, |
|
"loss": 0.1033, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.847736625514403, |
|
"grad_norm": 0.10136947076698849, |
|
"learning_rate": 1.3389391737335112e-06, |
|
"loss": 0.1104, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.8559670781893005, |
|
"grad_norm": 0.09974875048963104, |
|
"learning_rate": 1.1863529927217732e-06, |
|
"loss": 0.1043, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.8641975308641974, |
|
"grad_norm": 0.1025752381453182, |
|
"learning_rate": 1.0429433337655115e-06, |
|
"loss": 0.1122, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.8724279835390947, |
|
"grad_norm": 0.10454016619792661, |
|
"learning_rate": 9.087235148824368e-07, |
|
"loss": 0.1184, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.8806584362139915, |
|
"grad_norm": 0.09589319596591109, |
|
"learning_rate": 7.837060006577801e-07, |
|
"loss": 0.1046, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.888888888888889, |
|
"grad_norm": 0.09451088377127861, |
|
"learning_rate": 6.679024010868618e-07, |
|
"loss": 0.1103, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.897119341563786, |
|
"grad_norm": 0.09770471400992828, |
|
"learning_rate": 5.613234704967996e-07, |
|
"loss": 0.1027, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.905349794238683, |
|
"grad_norm": 0.09543457833989027, |
|
"learning_rate": 4.639791065478738e-07, |
|
"loss": 0.1025, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.9135802469135803, |
|
"grad_norm": 0.10382486799240734, |
|
"learning_rate": 3.758783493142737e-07, |
|
"loss": 0.118, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.9218106995884776, |
|
"grad_norm": 0.1030393593220336, |
|
"learning_rate": 2.9702938044468e-07, |
|
"loss": 0.1179, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.9300411522633745, |
|
"grad_norm": 0.10073287500645238, |
|
"learning_rate": 2.2743952240236176e-07, |
|
"loss": 0.1058, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.9382716049382713, |
|
"grad_norm": 0.10064431575396734, |
|
"learning_rate": 1.6711523778520921e-07, |
|
"loss": 0.1061, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.9465020576131686, |
|
"grad_norm": 0.08907660041993512, |
|
"learning_rate": 1.1606212872559141e-07, |
|
"loss": 0.0985, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.954732510288066, |
|
"grad_norm": 0.10069703961300915, |
|
"learning_rate": 7.428493637002821e-08, |
|
"loss": 0.1107, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.09834578070623724, |
|
"learning_rate": 4.178754043898669e-08, |
|
"loss": 0.1056, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.97119341563786, |
|
"grad_norm": 0.10056360393735715, |
|
"learning_rate": 1.8572958866514e-08, |
|
"loss": 0.1123, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.9794238683127574, |
|
"grad_norm": 0.0943621498377698, |
|
"learning_rate": 4.643347520005836e-09, |
|
"loss": 0.1045, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.9876543209876543, |
|
"grad_norm": 0.09983652481727213, |
|
"learning_rate": 0.0, |
|
"loss": 0.1135, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.9876543209876543, |
|
"eval_loss": 0.13842210173606873, |
|
"eval_runtime": 21.2699, |
|
"eval_samples_per_second": 38.411, |
|
"eval_steps_per_second": 1.222, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.9876543209876543, |
|
"step": 363, |
|
"total_flos": 9.618537139981517e+16, |
|
"train_loss": 0.21143763487742953, |
|
"train_runtime": 3172.33, |
|
"train_samples_per_second": 14.668, |
|
"train_steps_per_second": 0.114 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 363, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.618537139981517e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|