|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3998792426790874, |
|
"eval_steps": 2318, |
|
"global_step": 9272, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009919351360676242, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.001, |
|
"loss": 9.0124, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0019838702721352484, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.0009990070799516491, |
|
"loss": 4.9678, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0029758054082028723, |
|
"grad_norm": 0.10400390625, |
|
"learning_rate": 0.0009980141599032983, |
|
"loss": 4.8546, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.003967740544270497, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0009970212398549474, |
|
"loss": 4.7284, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.00495967568033812, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0009960283198065965, |
|
"loss": 4.5063, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.005951610816405745, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0009950353997582456, |
|
"loss": 4.2107, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.006943545952473369, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0009940424797098947, |
|
"loss": 4.0923, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.007935481088540993, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0009930495596615439, |
|
"loss": 3.7695, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.008927416224608617, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.000992056639613193, |
|
"loss": 3.6712, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.00991935136067624, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0009910637195648419, |
|
"loss": 3.5194, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.010911286496743866, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0009900707995164912, |
|
"loss": 3.4792, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.01190322163281149, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0009890778794681401, |
|
"loss": 3.3372, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.012895156768879113, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0009880849594197895, |
|
"loss": 3.2479, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.013887091904946738, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0009870920393714384, |
|
"loss": 3.2265, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.014879027041014362, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0009860991193230877, |
|
"loss": 3.1277, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.015870962177081987, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0009851061992747366, |
|
"loss": 3.0156, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.01686289731314961, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0009841132792263857, |
|
"loss": 2.9304, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.017854832449217234, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0009831203591780348, |
|
"loss": 2.9364, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.01884676758528486, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.000982127439129684, |
|
"loss": 2.8848, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.01983870272135248, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.000981134519081333, |
|
"loss": 2.8368, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.020830637857420106, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0009801415990329822, |
|
"loss": 2.8024, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.02182257299348773, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0009791486789846313, |
|
"loss": 2.7152, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.022814508129555353, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0009781557589362804, |
|
"loss": 2.6818, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.02380644326562298, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0009771628388879296, |
|
"loss": 2.6625, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.024798378401690604, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0009761699188395787, |
|
"loss": 2.5967, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.025790313537758226, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0009751769987912278, |
|
"loss": 2.541, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.02678224867382585, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0009741840787428768, |
|
"loss": 2.5455, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.027774183809893476, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.000973191158694526, |
|
"loss": 2.5566, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.028766118945961098, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.000972198238646175, |
|
"loss": 2.4318, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.029758054082028723, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0009712053185978243, |
|
"loss": 2.4449, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.03074998921809635, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0009702123985494733, |
|
"loss": 2.4232, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.031741924354163974, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0009692194785011224, |
|
"loss": 2.3352, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.032733859490231595, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0009682265584527715, |
|
"loss": 2.332, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.03372579462629922, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0009672336384044207, |
|
"loss": 2.3185, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.034717729762366846, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0009662407183560698, |
|
"loss": 2.2945, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.03570966489843447, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0009652477983077189, |
|
"loss": 2.2821, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.03670160003450209, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.000964254878259368, |
|
"loss": 2.2184, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.03769353517056972, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0009632619582110171, |
|
"loss": 2.2383, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.03868547030663734, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0009622690381626662, |
|
"loss": 2.1755, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.03967740544270496, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0009612761181143154, |
|
"loss": 2.1569, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.04066934057877259, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0009602831980659644, |
|
"loss": 2.1272, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.04166127571484021, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0009592902780176136, |
|
"loss": 2.1358, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.042653210850907834, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0009582973579692626, |
|
"loss": 2.1388, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.04364514598697546, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0009573044379209118, |
|
"loss": 2.0937, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.044637081123043085, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0009563115178725609, |
|
"loss": 2.0818, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.045629016259110707, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0009553185978242101, |
|
"loss": 2.0259, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.046620951395178335, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0009543256777758591, |
|
"loss": 2.0098, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.04761288653124596, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0009533327577275082, |
|
"loss": 2.0276, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.04860482166731358, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0009523398376791573, |
|
"loss": 2.0285, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.04959675680338121, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0009513469176308065, |
|
"loss": 1.9488, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.05058869193944883, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0009503539975824556, |
|
"loss": 1.9922, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 0.05158062707551645, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0009493610775341046, |
|
"loss": 1.9759, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 0.05257256221158408, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0009483681574857538, |
|
"loss": 1.93, |
|
"step": 1219 |
|
}, |
|
{ |
|
"epoch": 0.0535644973476517, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0009473752374374028, |
|
"loss": 1.9264, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 0.05455643248371932, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0009463823173890521, |
|
"loss": 1.8782, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.05554836761978695, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0009453893973407011, |
|
"loss": 1.9079, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.056540302755854574, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0009443964772923502, |
|
"loss": 1.9279, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 0.057532237891922196, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0009434035572439993, |
|
"loss": 1.8501, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 0.058524173027989825, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0009424106371956484, |
|
"loss": 1.8732, |
|
"step": 1357 |
|
}, |
|
{ |
|
"epoch": 0.059516108164057446, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0009414177171472975, |
|
"loss": 1.8397, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.06050804330012507, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0009404247970989467, |
|
"loss": 1.8288, |
|
"step": 1403 |
|
}, |
|
{ |
|
"epoch": 0.0614999784361927, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0009394318770505958, |
|
"loss": 1.8162, |
|
"step": 1426 |
|
}, |
|
{ |
|
"epoch": 0.06249191357226032, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0009384389570022449, |
|
"loss": 1.8204, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 0.06348384870832795, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.0009374460369538939, |
|
"loss": 1.8123, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 0.06447578384439556, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0009364531169055431, |
|
"loss": 1.8225, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.06546771898046319, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0009354601968571922, |
|
"loss": 1.7787, |
|
"step": 1518 |
|
}, |
|
{ |
|
"epoch": 0.06645965411653082, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0009344672768088414, |
|
"loss": 1.7703, |
|
"step": 1541 |
|
}, |
|
{ |
|
"epoch": 0.06745158925259843, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0009334743567604904, |
|
"loss": 1.7955, |
|
"step": 1564 |
|
}, |
|
{ |
|
"epoch": 0.06844352438866606, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0009324814367121396, |
|
"loss": 1.7675, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 0.06943545952473369, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0009314885166637886, |
|
"loss": 1.7548, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.0704273946608013, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0009304955966154378, |
|
"loss": 1.7549, |
|
"step": 1633 |
|
}, |
|
{ |
|
"epoch": 0.07141932979686894, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0009295026765670869, |
|
"loss": 1.7057, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 0.07241126493293656, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.000928509756518736, |
|
"loss": 1.7128, |
|
"step": 1679 |
|
}, |
|
{ |
|
"epoch": 0.07340320006900418, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0009275168364703851, |
|
"loss": 1.7606, |
|
"step": 1702 |
|
}, |
|
{ |
|
"epoch": 0.07439513520507181, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0009265239164220342, |
|
"loss": 1.6989, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.07538707034113944, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0009255309963736834, |
|
"loss": 1.7052, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 0.07637900547720705, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0009245380763253324, |
|
"loss": 1.7021, |
|
"step": 1771 |
|
}, |
|
{ |
|
"epoch": 0.07737094061327468, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0009235451562769816, |
|
"loss": 1.7096, |
|
"step": 1794 |
|
}, |
|
{ |
|
"epoch": 0.07836287574934231, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0009225522362286306, |
|
"loss": 1.6537, |
|
"step": 1817 |
|
}, |
|
{ |
|
"epoch": 0.07935481088540992, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0009215593161802797, |
|
"loss": 1.6359, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.08034674602147755, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0009205663961319288, |
|
"loss": 1.6107, |
|
"step": 1863 |
|
}, |
|
{ |
|
"epoch": 0.08133868115754518, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.000919573476083578, |
|
"loss": 1.6137, |
|
"step": 1886 |
|
}, |
|
{ |
|
"epoch": 0.0823306162936128, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0009185805560352271, |
|
"loss": 1.6428, |
|
"step": 1909 |
|
}, |
|
{ |
|
"epoch": 0.08332255142968042, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.0009175876359868762, |
|
"loss": 1.6146, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 0.08431448656574805, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0009165947159385253, |
|
"loss": 1.5996, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.08530642170181567, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0009156017958901744, |
|
"loss": 1.6153, |
|
"step": 1978 |
|
}, |
|
{ |
|
"epoch": 0.0862983568378833, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0009146088758418235, |
|
"loss": 1.6178, |
|
"step": 2001 |
|
}, |
|
{ |
|
"epoch": 0.08729029197395093, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0009136159557934727, |
|
"loss": 1.5988, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 0.08828222711001854, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0009126230357451217, |
|
"loss": 1.578, |
|
"step": 2047 |
|
}, |
|
{ |
|
"epoch": 0.08927416224608617, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0009116301156967709, |
|
"loss": 1.5755, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.0902660973821538, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0009106371956484199, |
|
"loss": 1.5772, |
|
"step": 2093 |
|
}, |
|
{ |
|
"epoch": 0.09125803251822141, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0009096442756000692, |
|
"loss": 1.5405, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 0.09224996765428904, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0009086513555517182, |
|
"loss": 1.5458, |
|
"step": 2139 |
|
}, |
|
{ |
|
"epoch": 0.09324190279035667, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0009076584355033673, |
|
"loss": 1.5783, |
|
"step": 2162 |
|
}, |
|
{ |
|
"epoch": 0.09423383792642429, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0009066655154550164, |
|
"loss": 1.5237, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.09522577306249191, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0009056725954066655, |
|
"loss": 1.5361, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 0.09621770819855954, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0009046796753583147, |
|
"loss": 1.5493, |
|
"step": 2231 |
|
}, |
|
{ |
|
"epoch": 0.09720964333462716, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0009036867553099638, |
|
"loss": 1.5243, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 0.09820157847069479, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0009026938352616129, |
|
"loss": 1.5061, |
|
"step": 2277 |
|
}, |
|
{ |
|
"epoch": 0.09919351360676242, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.000901700915213262, |
|
"loss": 1.5141, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.09996981066977186, |
|
"eval_runtime": 163.8773, |
|
"eval_samples_per_second": 610.212, |
|
"eval_steps_per_second": 7.628, |
|
"step": 2318 |
|
}, |
|
{ |
|
"epoch": 0.10018544874283003, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0009007079951649111, |
|
"loss": 1.5069, |
|
"step": 2323 |
|
}, |
|
{ |
|
"epoch": 0.10117738387889766, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0008997150751165601, |
|
"loss": 1.513, |
|
"step": 2346 |
|
}, |
|
{ |
|
"epoch": 0.10216931901496529, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0008987221550682093, |
|
"loss": 1.5015, |
|
"step": 2369 |
|
}, |
|
{ |
|
"epoch": 0.1031612541510329, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0008977292350198584, |
|
"loss": 1.4961, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 0.10415318928710053, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008967363149715075, |
|
"loss": 1.4802, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.10514512442316816, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0008957433949231566, |
|
"loss": 1.4725, |
|
"step": 2438 |
|
}, |
|
{ |
|
"epoch": 0.10613705955923577, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0008947504748748057, |
|
"loss": 1.4698, |
|
"step": 2461 |
|
}, |
|
{ |
|
"epoch": 0.1071289946953034, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0008937575548264549, |
|
"loss": 1.4954, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 0.10812092983137103, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.000892764634778104, |
|
"loss": 1.4221, |
|
"step": 2507 |
|
}, |
|
{ |
|
"epoch": 0.10911286496743865, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.000891771714729753, |
|
"loss": 1.4364, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.11010480010350628, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0008907787946814022, |
|
"loss": 1.4479, |
|
"step": 2553 |
|
}, |
|
{ |
|
"epoch": 0.1110967352395739, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0008897858746330512, |
|
"loss": 1.4662, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 0.11208867037564152, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0008887929545847005, |
|
"loss": 1.4463, |
|
"step": 2599 |
|
}, |
|
{ |
|
"epoch": 0.11308060551170915, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0008878000345363495, |
|
"loss": 1.4342, |
|
"step": 2622 |
|
}, |
|
{ |
|
"epoch": 0.11407254064777678, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0008868071144879987, |
|
"loss": 1.4457, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.11506447578384439, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0008858141944396477, |
|
"loss": 1.4413, |
|
"step": 2668 |
|
}, |
|
{ |
|
"epoch": 0.11605641091991202, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.000884821274391297, |
|
"loss": 1.4272, |
|
"step": 2691 |
|
}, |
|
{ |
|
"epoch": 0.11704834605597965, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.000883828354342946, |
|
"loss": 1.4108, |
|
"step": 2714 |
|
}, |
|
{ |
|
"epoch": 0.11804028119204726, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0008828354342945951, |
|
"loss": 1.4226, |
|
"step": 2737 |
|
}, |
|
{ |
|
"epoch": 0.11903221632811489, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0008818425142462442, |
|
"loss": 1.4096, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.12002415146418252, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0008808495941978933, |
|
"loss": 1.3829, |
|
"step": 2783 |
|
}, |
|
{ |
|
"epoch": 0.12101608660025014, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0008798566741495424, |
|
"loss": 1.3841, |
|
"step": 2806 |
|
}, |
|
{ |
|
"epoch": 0.12200802173631776, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0008788637541011916, |
|
"loss": 1.4068, |
|
"step": 2829 |
|
}, |
|
{ |
|
"epoch": 0.1229999568723854, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0008778708340528407, |
|
"loss": 1.3872, |
|
"step": 2852 |
|
}, |
|
{ |
|
"epoch": 0.12399189200845301, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0008768779140044898, |
|
"loss": 1.3999, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.12498382714452064, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0008758849939561388, |
|
"loss": 1.3889, |
|
"step": 2898 |
|
}, |
|
{ |
|
"epoch": 0.12597576228058827, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008748920739077879, |
|
"loss": 1.3763, |
|
"step": 2921 |
|
}, |
|
{ |
|
"epoch": 0.1269676974166559, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.000873899153859437, |
|
"loss": 1.39, |
|
"step": 2944 |
|
}, |
|
{ |
|
"epoch": 0.1279596325527235, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0008729062338110862, |
|
"loss": 1.3858, |
|
"step": 2967 |
|
}, |
|
{ |
|
"epoch": 0.12895156768879112, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0008719133137627353, |
|
"loss": 1.3674, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.12994350282485875, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0008709203937143844, |
|
"loss": 1.3458, |
|
"step": 3013 |
|
}, |
|
{ |
|
"epoch": 0.13093543796092638, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0008699274736660335, |
|
"loss": 1.3549, |
|
"step": 3036 |
|
}, |
|
{ |
|
"epoch": 0.131927373096994, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0008689345536176825, |
|
"loss": 1.3683, |
|
"step": 3059 |
|
}, |
|
{ |
|
"epoch": 0.13291930823306164, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0008679416335693318, |
|
"loss": 1.3625, |
|
"step": 3082 |
|
}, |
|
{ |
|
"epoch": 0.13391124336912924, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0008669487135209808, |
|
"loss": 1.3462, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.13490317850519687, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00086595579347263, |
|
"loss": 1.345, |
|
"step": 3128 |
|
}, |
|
{ |
|
"epoch": 0.1358951136412645, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.000864962873424279, |
|
"loss": 1.3552, |
|
"step": 3151 |
|
}, |
|
{ |
|
"epoch": 0.13688704877733213, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0008639699533759282, |
|
"loss": 1.3356, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 0.13787898391339976, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0008629770333275773, |
|
"loss": 1.3373, |
|
"step": 3197 |
|
}, |
|
{ |
|
"epoch": 0.13887091904946738, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0008619841132792265, |
|
"loss": 1.3514, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.13986285418553498, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0008609911932308755, |
|
"loss": 1.3291, |
|
"step": 3243 |
|
}, |
|
{ |
|
"epoch": 0.1408547893216026, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0008599982731825246, |
|
"loss": 1.3233, |
|
"step": 3266 |
|
}, |
|
{ |
|
"epoch": 0.14184672445767024, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0008590053531341737, |
|
"loss": 1.3174, |
|
"step": 3289 |
|
}, |
|
{ |
|
"epoch": 0.14283865959373787, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0008580124330858229, |
|
"loss": 1.3304, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 0.1438305947298055, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.000857019513037472, |
|
"loss": 1.3287, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.14482252986587313, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0008560265929891211, |
|
"loss": 1.3201, |
|
"step": 3358 |
|
}, |
|
{ |
|
"epoch": 0.14581446500194073, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0008550336729407702, |
|
"loss": 1.3027, |
|
"step": 3381 |
|
}, |
|
{ |
|
"epoch": 0.14680640013800836, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0008540407528924193, |
|
"loss": 1.3128, |
|
"step": 3404 |
|
}, |
|
{ |
|
"epoch": 0.147798335274076, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0008530478328440683, |
|
"loss": 1.3182, |
|
"step": 3427 |
|
}, |
|
{ |
|
"epoch": 0.14879027041014362, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0008520549127957176, |
|
"loss": 1.2906, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.14978220554621124, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0008510619927473666, |
|
"loss": 1.2837, |
|
"step": 3473 |
|
}, |
|
{ |
|
"epoch": 0.15077414068227887, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0008500690726990157, |
|
"loss": 1.3114, |
|
"step": 3496 |
|
}, |
|
{ |
|
"epoch": 0.15176607581834647, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0008490761526506648, |
|
"loss": 1.3047, |
|
"step": 3519 |
|
}, |
|
{ |
|
"epoch": 0.1527580109544141, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.000848083232602314, |
|
"loss": 1.2654, |
|
"step": 3542 |
|
}, |
|
{ |
|
"epoch": 0.15374994609048173, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0008470903125539631, |
|
"loss": 1.2675, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.15474188122654936, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0008460973925056121, |
|
"loss": 1.2671, |
|
"step": 3588 |
|
}, |
|
{ |
|
"epoch": 0.155733816362617, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0008451044724572613, |
|
"loss": 1.2719, |
|
"step": 3611 |
|
}, |
|
{ |
|
"epoch": 0.15672575149868462, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0008441115524089103, |
|
"loss": 1.2755, |
|
"step": 3634 |
|
}, |
|
{ |
|
"epoch": 0.15771768663475222, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0008431186323605595, |
|
"loss": 1.2876, |
|
"step": 3657 |
|
}, |
|
{ |
|
"epoch": 0.15870962177081985, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0008421257123122086, |
|
"loss": 1.2928, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.15970155690688748, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0008411327922638578, |
|
"loss": 1.2855, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 0.1606934920429551, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0008401398722155068, |
|
"loss": 1.2558, |
|
"step": 3726 |
|
}, |
|
{ |
|
"epoch": 0.16168542717902273, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.000839146952167156, |
|
"loss": 1.2561, |
|
"step": 3749 |
|
}, |
|
{ |
|
"epoch": 0.16267736231509036, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.000838154032118805, |
|
"loss": 1.2851, |
|
"step": 3772 |
|
}, |
|
{ |
|
"epoch": 0.16366929745115796, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0008371611120704542, |
|
"loss": 1.2682, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.1646612325872256, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0008361681920221033, |
|
"loss": 1.2709, |
|
"step": 3818 |
|
}, |
|
{ |
|
"epoch": 0.16565316772329322, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0008351752719737524, |
|
"loss": 1.2459, |
|
"step": 3841 |
|
}, |
|
{ |
|
"epoch": 0.16664510285936085, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0008341823519254015, |
|
"loss": 1.2134, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 0.16763703799542848, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0008331894318770506, |
|
"loss": 1.2445, |
|
"step": 3887 |
|
}, |
|
{ |
|
"epoch": 0.1686289731314961, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0008321965118286998, |
|
"loss": 1.2522, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.1696209082675637, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0008312035917803489, |
|
"loss": 1.2296, |
|
"step": 3933 |
|
}, |
|
{ |
|
"epoch": 0.17061284340363134, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0008302106717319979, |
|
"loss": 1.252, |
|
"step": 3956 |
|
}, |
|
{ |
|
"epoch": 0.17160477853969897, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0008292177516836471, |
|
"loss": 1.2501, |
|
"step": 3979 |
|
}, |
|
{ |
|
"epoch": 0.1725967136757666, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0008282248316352961, |
|
"loss": 1.2328, |
|
"step": 4002 |
|
}, |
|
{ |
|
"epoch": 0.17358864881183422, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0008272319115869454, |
|
"loss": 1.2182, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.17458058394790185, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0008262389915385944, |
|
"loss": 1.2515, |
|
"step": 4048 |
|
}, |
|
{ |
|
"epoch": 0.17557251908396945, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0008252460714902435, |
|
"loss": 1.2076, |
|
"step": 4071 |
|
}, |
|
{ |
|
"epoch": 0.17656445422003708, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0008242531514418926, |
|
"loss": 1.2219, |
|
"step": 4094 |
|
}, |
|
{ |
|
"epoch": 0.1775563893561047, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0008232602313935416, |
|
"loss": 1.2307, |
|
"step": 4117 |
|
}, |
|
{ |
|
"epoch": 0.17854832449217234, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0008222673113451908, |
|
"loss": 1.2368, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.17954025962823997, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0008212743912968399, |
|
"loss": 1.2332, |
|
"step": 4163 |
|
}, |
|
{ |
|
"epoch": 0.1805321947643076, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0008202814712484891, |
|
"loss": 1.2086, |
|
"step": 4186 |
|
}, |
|
{ |
|
"epoch": 0.1815241299003752, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0008192885512001381, |
|
"loss": 1.2204, |
|
"step": 4209 |
|
}, |
|
{ |
|
"epoch": 0.18251606503644283, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0008182956311517873, |
|
"loss": 1.2379, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 0.18350800017251045, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0008173027111034363, |
|
"loss": 1.2387, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.18449993530857808, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0008163097910550856, |
|
"loss": 1.2008, |
|
"step": 4278 |
|
}, |
|
{ |
|
"epoch": 0.1854918704446457, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0008153168710067346, |
|
"loss": 1.2191, |
|
"step": 4301 |
|
}, |
|
{ |
|
"epoch": 0.18648380558071334, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0008143239509583837, |
|
"loss": 1.2153, |
|
"step": 4324 |
|
}, |
|
{ |
|
"epoch": 0.18747574071678094, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0008133310309100328, |
|
"loss": 1.2171, |
|
"step": 4347 |
|
}, |
|
{ |
|
"epoch": 0.18846767585284857, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0008123381108616819, |
|
"loss": 1.2154, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.1894596109889162, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0008113451908133311, |
|
"loss": 1.2055, |
|
"step": 4393 |
|
}, |
|
{ |
|
"epoch": 0.19045154612498383, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0008103522707649802, |
|
"loss": 1.2215, |
|
"step": 4416 |
|
}, |
|
{ |
|
"epoch": 0.19144348126105146, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0008093593507166293, |
|
"loss": 1.2035, |
|
"step": 4439 |
|
}, |
|
{ |
|
"epoch": 0.19243541639711909, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0008083664306682784, |
|
"loss": 1.183, |
|
"step": 4462 |
|
}, |
|
{ |
|
"epoch": 0.1934273515331867, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0008073735106199274, |
|
"loss": 1.1892, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.19441928666925432, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0008063805905715767, |
|
"loss": 1.2006, |
|
"step": 4508 |
|
}, |
|
{ |
|
"epoch": 0.19541122180532194, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0008053876705232257, |
|
"loss": 1.2081, |
|
"step": 4531 |
|
}, |
|
{ |
|
"epoch": 0.19640315694138957, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0008043947504748749, |
|
"loss": 1.1878, |
|
"step": 4554 |
|
}, |
|
{ |
|
"epoch": 0.1973950920774572, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0008034018304265239, |
|
"loss": 1.1701, |
|
"step": 4577 |
|
}, |
|
{ |
|
"epoch": 0.19838702721352483, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0008024089103781731, |
|
"loss": 1.179, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.19937896234959243, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0008014159903298221, |
|
"loss": 1.1794, |
|
"step": 4623 |
|
}, |
|
{ |
|
"epoch": 0.1999396213395437, |
|
"eval_runtime": 164.1241, |
|
"eval_samples_per_second": 609.295, |
|
"eval_steps_per_second": 7.616, |
|
"step": 4636 |
|
}, |
|
{ |
|
"epoch": 0.20037089748566006, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0008004230702814712, |
|
"loss": 1.181, |
|
"step": 4646 |
|
}, |
|
{ |
|
"epoch": 0.2013628326217277, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0007994301502331204, |
|
"loss": 1.1706, |
|
"step": 4669 |
|
}, |
|
{ |
|
"epoch": 0.20235476775779532, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0007984372301847694, |
|
"loss": 1.2005, |
|
"step": 4692 |
|
}, |
|
{ |
|
"epoch": 0.20334670289386295, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0007974443101364186, |
|
"loss": 1.1784, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.20433863802993057, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0007964513900880676, |
|
"loss": 1.1774, |
|
"step": 4738 |
|
}, |
|
{ |
|
"epoch": 0.20533057316599818, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0007954584700397169, |
|
"loss": 1.1589, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 0.2063225083020658, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007944655499913659, |
|
"loss": 1.1821, |
|
"step": 4784 |
|
}, |
|
{ |
|
"epoch": 0.20731444343813343, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0007934726299430151, |
|
"loss": 1.1496, |
|
"step": 4807 |
|
}, |
|
{ |
|
"epoch": 0.20830637857420106, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0007924797098946641, |
|
"loss": 1.1666, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.2092983137102687, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0007914867898463132, |
|
"loss": 1.1546, |
|
"step": 4853 |
|
}, |
|
{ |
|
"epoch": 0.21029024884633632, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0007904938697979624, |
|
"loss": 1.2045, |
|
"step": 4876 |
|
}, |
|
{ |
|
"epoch": 0.21128218398240392, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0007895009497496115, |
|
"loss": 1.1757, |
|
"step": 4899 |
|
}, |
|
{ |
|
"epoch": 0.21227411911847155, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0007885080297012606, |
|
"loss": 1.1894, |
|
"step": 4922 |
|
}, |
|
{ |
|
"epoch": 0.21326605425453918, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0007875151096529097, |
|
"loss": 1.174, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.2142579893906068, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0007865221896045588, |
|
"loss": 1.1423, |
|
"step": 4968 |
|
}, |
|
{ |
|
"epoch": 0.21524992452667444, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.000785529269556208, |
|
"loss": 1.1532, |
|
"step": 4991 |
|
}, |
|
{ |
|
"epoch": 0.21624185966274206, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.000784536349507857, |
|
"loss": 1.1688, |
|
"step": 5014 |
|
}, |
|
{ |
|
"epoch": 0.21723379479880967, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0007835434294595062, |
|
"loss": 1.1446, |
|
"step": 5037 |
|
}, |
|
{ |
|
"epoch": 0.2182257299348773, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0007825505094111552, |
|
"loss": 1.1792, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.21921766507094492, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0007815575893628044, |
|
"loss": 1.1516, |
|
"step": 5083 |
|
}, |
|
{ |
|
"epoch": 0.22020960020701255, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0007805646693144534, |
|
"loss": 1.1712, |
|
"step": 5106 |
|
}, |
|
{ |
|
"epoch": 0.22120153534308018, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007795717492661027, |
|
"loss": 1.1562, |
|
"step": 5129 |
|
}, |
|
{ |
|
"epoch": 0.2221934704791478, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0007785788292177517, |
|
"loss": 1.1641, |
|
"step": 5152 |
|
}, |
|
{ |
|
"epoch": 0.2231854056152154, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0007775859091694009, |
|
"loss": 1.1649, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.22417734075128304, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0007765929891210499, |
|
"loss": 1.1684, |
|
"step": 5198 |
|
}, |
|
{ |
|
"epoch": 0.22516927588735067, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007756000690726989, |
|
"loss": 1.1659, |
|
"step": 5221 |
|
}, |
|
{ |
|
"epoch": 0.2261612110234183, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0007746071490243482, |
|
"loss": 1.1683, |
|
"step": 5244 |
|
}, |
|
{ |
|
"epoch": 0.22715314615948592, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0007736142289759972, |
|
"loss": 1.1536, |
|
"step": 5267 |
|
}, |
|
{ |
|
"epoch": 0.22814508129555355, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0007726213089276464, |
|
"loss": 1.1451, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.22913701643162115, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0007716283888792954, |
|
"loss": 1.1453, |
|
"step": 5313 |
|
}, |
|
{ |
|
"epoch": 0.23012895156768878, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0007706354688309446, |
|
"loss": 1.1585, |
|
"step": 5336 |
|
}, |
|
{ |
|
"epoch": 0.2311208867037564, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0007696425487825937, |
|
"loss": 1.1585, |
|
"step": 5359 |
|
}, |
|
{ |
|
"epoch": 0.23211282183982404, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0007686496287342428, |
|
"loss": 1.1396, |
|
"step": 5382 |
|
}, |
|
{ |
|
"epoch": 0.23310475697589167, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0007676567086858919, |
|
"loss": 1.1527, |
|
"step": 5405 |
|
}, |
|
{ |
|
"epoch": 0.2340966921119593, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.000766663788637541, |
|
"loss": 1.1157, |
|
"step": 5428 |
|
}, |
|
{ |
|
"epoch": 0.2350886272480269, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0007656708685891901, |
|
"loss": 1.1232, |
|
"step": 5451 |
|
}, |
|
{ |
|
"epoch": 0.23608056238409453, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007646779485408393, |
|
"loss": 1.1411, |
|
"step": 5474 |
|
}, |
|
{ |
|
"epoch": 0.23707249752016216, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0007636850284924884, |
|
"loss": 1.1329, |
|
"step": 5497 |
|
}, |
|
{ |
|
"epoch": 0.23806443265622979, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0007626921084441375, |
|
"loss": 1.1262, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.23905636779229741, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007616991883957865, |
|
"loss": 1.1346, |
|
"step": 5543 |
|
}, |
|
{ |
|
"epoch": 0.24004830292836504, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0007607062683474357, |
|
"loss": 1.139, |
|
"step": 5566 |
|
}, |
|
{ |
|
"epoch": 0.24104023806443264, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0007597133482990847, |
|
"loss": 1.1318, |
|
"step": 5589 |
|
}, |
|
{ |
|
"epoch": 0.24203217320050027, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.000758720428250734, |
|
"loss": 1.1178, |
|
"step": 5612 |
|
}, |
|
{ |
|
"epoch": 0.2430241083365679, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.000757727508202383, |
|
"loss": 1.1312, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 0.24401604347263553, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0007567345881540322, |
|
"loss": 1.1263, |
|
"step": 5658 |
|
}, |
|
{ |
|
"epoch": 0.24500797860870316, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0007557416681056812, |
|
"loss": 1.1303, |
|
"step": 5681 |
|
}, |
|
{ |
|
"epoch": 0.2459999137447708, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0007547487480573305, |
|
"loss": 1.1365, |
|
"step": 5704 |
|
}, |
|
{ |
|
"epoch": 0.2469918488808384, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0007537558280089795, |
|
"loss": 1.1239, |
|
"step": 5727 |
|
}, |
|
{ |
|
"epoch": 0.24798378401690602, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007527629079606286, |
|
"loss": 1.1212, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.24897571915297365, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.0007517699879122777, |
|
"loss": 1.1128, |
|
"step": 5773 |
|
}, |
|
{ |
|
"epoch": 0.24996765428904127, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007507770678639267, |
|
"loss": 1.1153, |
|
"step": 5796 |
|
}, |
|
{ |
|
"epoch": 0.2509595894251089, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0007497841478155759, |
|
"loss": 1.1191, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 0.25195152456117653, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.000748791227767225, |
|
"loss": 1.1115, |
|
"step": 5842 |
|
}, |
|
{ |
|
"epoch": 0.25294345969724413, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0007477983077188742, |
|
"loss": 1.1377, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 0.2539353948333118, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0007468053876705232, |
|
"loss": 1.1345, |
|
"step": 5888 |
|
}, |
|
{ |
|
"epoch": 0.2549273299693794, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007458124676221723, |
|
"loss": 1.1226, |
|
"step": 5911 |
|
}, |
|
{ |
|
"epoch": 0.255919265105447, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0007448195475738214, |
|
"loss": 1.1163, |
|
"step": 5934 |
|
}, |
|
{ |
|
"epoch": 0.25691120024151465, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0007438266275254706, |
|
"loss": 1.1255, |
|
"step": 5957 |
|
}, |
|
{ |
|
"epoch": 0.25790313537758225, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0007428337074771197, |
|
"loss": 1.1049, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.2588950705136499, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0007418407874287688, |
|
"loss": 1.1294, |
|
"step": 6003 |
|
}, |
|
{ |
|
"epoch": 0.2598870056497175, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007408478673804179, |
|
"loss": 1.1167, |
|
"step": 6026 |
|
}, |
|
{ |
|
"epoch": 0.26087894078578516, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.000739854947332067, |
|
"loss": 1.1041, |
|
"step": 6049 |
|
}, |
|
{ |
|
"epoch": 0.26187087592185276, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.000738862027283716, |
|
"loss": 1.1146, |
|
"step": 6072 |
|
}, |
|
{ |
|
"epoch": 0.26286281105792036, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007378691072353653, |
|
"loss": 1.104, |
|
"step": 6095 |
|
}, |
|
{ |
|
"epoch": 0.263854746193988, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0007368761871870143, |
|
"loss": 1.1091, |
|
"step": 6118 |
|
}, |
|
{ |
|
"epoch": 0.2648466813300556, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0007358832671386635, |
|
"loss": 1.1295, |
|
"step": 6141 |
|
}, |
|
{ |
|
"epoch": 0.2658386164661233, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007348903470903125, |
|
"loss": 1.0913, |
|
"step": 6164 |
|
}, |
|
{ |
|
"epoch": 0.2668305516021909, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0007338974270419618, |
|
"loss": 1.0806, |
|
"step": 6187 |
|
}, |
|
{ |
|
"epoch": 0.2678224867382585, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0007329045069936108, |
|
"loss": 1.099, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.26881442187432614, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00073191158694526, |
|
"loss": 1.0982, |
|
"step": 6233 |
|
}, |
|
{ |
|
"epoch": 0.26980635701039374, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.000730918666896909, |
|
"loss": 1.093, |
|
"step": 6256 |
|
}, |
|
{ |
|
"epoch": 0.2707982921464614, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007299257468485581, |
|
"loss": 1.0773, |
|
"step": 6279 |
|
}, |
|
{ |
|
"epoch": 0.271790227282529, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007289328268002072, |
|
"loss": 1.101, |
|
"step": 6302 |
|
}, |
|
{ |
|
"epoch": 0.27278216241859665, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0007279399067518564, |
|
"loss": 1.1084, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 0.27377409755466425, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0007269469867035055, |
|
"loss": 1.0869, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 0.27476603269073185, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0007259540666551545, |
|
"loss": 1.088, |
|
"step": 6371 |
|
}, |
|
{ |
|
"epoch": 0.2757579678267995, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0007249611466068037, |
|
"loss": 1.1002, |
|
"step": 6394 |
|
}, |
|
{ |
|
"epoch": 0.2767499029628671, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0007239682265584527, |
|
"loss": 1.083, |
|
"step": 6417 |
|
}, |
|
{ |
|
"epoch": 0.27774183809893477, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0007229753065101019, |
|
"loss": 1.0884, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.27873377323500237, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.000721982386461751, |
|
"loss": 1.092, |
|
"step": 6463 |
|
}, |
|
{ |
|
"epoch": 0.27972570837106997, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0007209894664134001, |
|
"loss": 1.0736, |
|
"step": 6486 |
|
}, |
|
{ |
|
"epoch": 0.2807176435071376, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0007199965463650492, |
|
"loss": 1.091, |
|
"step": 6509 |
|
}, |
|
{ |
|
"epoch": 0.2817095786432052, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007190036263166983, |
|
"loss": 1.0893, |
|
"step": 6532 |
|
}, |
|
{ |
|
"epoch": 0.2827015137792729, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0007180107062683475, |
|
"loss": 1.0936, |
|
"step": 6555 |
|
}, |
|
{ |
|
"epoch": 0.2836934489153405, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0007170177862199966, |
|
"loss": 1.0955, |
|
"step": 6578 |
|
}, |
|
{ |
|
"epoch": 0.28468538405140814, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007160248661716456, |
|
"loss": 1.1016, |
|
"step": 6601 |
|
}, |
|
{ |
|
"epoch": 0.28567731918747574, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0007150319461232948, |
|
"loss": 1.102, |
|
"step": 6624 |
|
}, |
|
{ |
|
"epoch": 0.28666925432354334, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0007140390260749438, |
|
"loss": 1.08, |
|
"step": 6647 |
|
}, |
|
{ |
|
"epoch": 0.287661189459611, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.000713046106026593, |
|
"loss": 1.0785, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.2886531245956786, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0007120531859782421, |
|
"loss": 1.0944, |
|
"step": 6693 |
|
}, |
|
{ |
|
"epoch": 0.28964505973174626, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0007110602659298913, |
|
"loss": 1.0851, |
|
"step": 6716 |
|
}, |
|
{ |
|
"epoch": 0.29063699486781386, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0007100673458815403, |
|
"loss": 1.0942, |
|
"step": 6739 |
|
}, |
|
{ |
|
"epoch": 0.29162893000388146, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0007090744258331895, |
|
"loss": 1.0831, |
|
"step": 6762 |
|
}, |
|
{ |
|
"epoch": 0.2926208651399491, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0007080815057848385, |
|
"loss": 1.0724, |
|
"step": 6785 |
|
}, |
|
{ |
|
"epoch": 0.2936128002760167, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0007070885857364877, |
|
"loss": 1.0875, |
|
"step": 6808 |
|
}, |
|
{ |
|
"epoch": 0.2946047354120844, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0007060956656881368, |
|
"loss": 1.0896, |
|
"step": 6831 |
|
}, |
|
{ |
|
"epoch": 0.295596670548152, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0007051027456397859, |
|
"loss": 1.0702, |
|
"step": 6854 |
|
}, |
|
{ |
|
"epoch": 0.29658860568421963, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.000704109825591435, |
|
"loss": 1.0887, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 0.29758054082028723, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0007031169055430841, |
|
"loss": 1.0708, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.29857247595635483, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0007021239854947333, |
|
"loss": 1.0828, |
|
"step": 6923 |
|
}, |
|
{ |
|
"epoch": 0.2995644110924225, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0007011310654463823, |
|
"loss": 1.061, |
|
"step": 6946 |
|
}, |
|
{ |
|
"epoch": 0.2999094320093156, |
|
"eval_runtime": 164.053, |
|
"eval_samples_per_second": 609.559, |
|
"eval_steps_per_second": 7.619, |
|
"step": 6954 |
|
}, |
|
{ |
|
"epoch": 0.3005563462284901, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0007001381453980314, |
|
"loss": 1.0634, |
|
"step": 6969 |
|
}, |
|
{ |
|
"epoch": 0.30154828136455775, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0006991452253496805, |
|
"loss": 1.071, |
|
"step": 6992 |
|
}, |
|
{ |
|
"epoch": 0.30254021650062535, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0006981523053013296, |
|
"loss": 1.0822, |
|
"step": 7015 |
|
}, |
|
{ |
|
"epoch": 0.30353215163669295, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0006971593852529788, |
|
"loss": 1.0618, |
|
"step": 7038 |
|
}, |
|
{ |
|
"epoch": 0.3045240867727606, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0006961664652046279, |
|
"loss": 1.0722, |
|
"step": 7061 |
|
}, |
|
{ |
|
"epoch": 0.3055160219088282, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.000695173545156277, |
|
"loss": 1.0651, |
|
"step": 7084 |
|
}, |
|
{ |
|
"epoch": 0.30650795704489586, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0006941806251079261, |
|
"loss": 1.0648, |
|
"step": 7107 |
|
}, |
|
{ |
|
"epoch": 0.30749989218096346, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0006931877050595752, |
|
"loss": 1.0615, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.3084918273170311, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0006921947850112244, |
|
"loss": 1.0673, |
|
"step": 7153 |
|
}, |
|
{ |
|
"epoch": 0.3094837624530987, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0006912018649628734, |
|
"loss": 1.0596, |
|
"step": 7176 |
|
}, |
|
{ |
|
"epoch": 0.3104756975891663, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0006902089449145226, |
|
"loss": 1.0665, |
|
"step": 7199 |
|
}, |
|
{ |
|
"epoch": 0.311467632725234, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0006892160248661716, |
|
"loss": 1.0784, |
|
"step": 7222 |
|
}, |
|
{ |
|
"epoch": 0.3124595678613016, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0006882231048178208, |
|
"loss": 1.0737, |
|
"step": 7245 |
|
}, |
|
{ |
|
"epoch": 0.31345150299736924, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0006872301847694698, |
|
"loss": 1.0647, |
|
"step": 7268 |
|
}, |
|
{ |
|
"epoch": 0.31444343813343684, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0006862372647211191, |
|
"loss": 1.0859, |
|
"step": 7291 |
|
}, |
|
{ |
|
"epoch": 0.31543537326950444, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0006852443446727681, |
|
"loss": 1.0703, |
|
"step": 7314 |
|
}, |
|
{ |
|
"epoch": 0.3164273084055721, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0006842514246244172, |
|
"loss": 1.0581, |
|
"step": 7337 |
|
}, |
|
{ |
|
"epoch": 0.3174192435416397, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0006832585045760663, |
|
"loss": 1.0557, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.31841117867770735, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0006822655845277154, |
|
"loss": 1.0853, |
|
"step": 7383 |
|
}, |
|
{ |
|
"epoch": 0.31940311381377495, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0006812726644793646, |
|
"loss": 1.0602, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 0.3203950489498426, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0006802797444310137, |
|
"loss": 1.0611, |
|
"step": 7429 |
|
}, |
|
{ |
|
"epoch": 0.3213869840859102, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0006792868243826628, |
|
"loss": 1.0703, |
|
"step": 7452 |
|
}, |
|
{ |
|
"epoch": 0.3223789192219778, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0006782939043343119, |
|
"loss": 1.0357, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 0.32337085435804547, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0006773009842859609, |
|
"loss": 1.0556, |
|
"step": 7498 |
|
}, |
|
{ |
|
"epoch": 0.32436278949411307, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00067630806423761, |
|
"loss": 1.0561, |
|
"step": 7521 |
|
}, |
|
{ |
|
"epoch": 0.3253547246301807, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0006753151441892592, |
|
"loss": 1.0693, |
|
"step": 7544 |
|
}, |
|
{ |
|
"epoch": 0.3263466597662483, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0006743222241409083, |
|
"loss": 1.0668, |
|
"step": 7567 |
|
}, |
|
{ |
|
"epoch": 0.3273385949023159, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0006733293040925574, |
|
"loss": 1.038, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.3283305300383836, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0006723363840442065, |
|
"loss": 1.056, |
|
"step": 7613 |
|
}, |
|
{ |
|
"epoch": 0.3293224651744512, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0006713434639958557, |
|
"loss": 1.0444, |
|
"step": 7636 |
|
}, |
|
{ |
|
"epoch": 0.33031440031051884, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0006703505439475048, |
|
"loss": 1.0534, |
|
"step": 7659 |
|
}, |
|
{ |
|
"epoch": 0.33130633544658644, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0006693576238991539, |
|
"loss": 1.0597, |
|
"step": 7682 |
|
}, |
|
{ |
|
"epoch": 0.3322982705826541, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0006683647038508029, |
|
"loss": 1.0305, |
|
"step": 7705 |
|
}, |
|
{ |
|
"epoch": 0.3332902057187217, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0006673717838024521, |
|
"loss": 1.0632, |
|
"step": 7728 |
|
}, |
|
    {
      "epoch": 0.3342821408547893,
      "grad_norm": 0.341796875,
      "learning_rate": 0.0006663788637541011,
      "loss": 1.0462,
      "step": 7751
    },
    {
      "epoch": 0.33527407599085696,
      "grad_norm": 0.423828125,
      "learning_rate": 0.0006653859437057504,
      "loss": 1.0534,
      "step": 7774
    },
    {
      "epoch": 0.33626601112692456,
      "grad_norm": 0.37109375,
      "learning_rate": 0.0006643930236573994,
      "loss": 1.0468,
      "step": 7797
    },
    {
      "epoch": 0.3372579462629922,
      "grad_norm": 0.37890625,
      "learning_rate": 0.0006634001036090486,
      "loss": 1.0473,
      "step": 7820
    },
    {
      "epoch": 0.3382498813990598,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0006624071835606976,
      "loss": 1.0402,
      "step": 7843
    },
    {
      "epoch": 0.3392418165351274,
      "grad_norm": 0.384765625,
      "learning_rate": 0.0006614142635123467,
      "loss": 1.0713,
      "step": 7866
    },
    {
      "epoch": 0.3402337516711951,
      "grad_norm": 0.388671875,
      "learning_rate": 0.0006604213434639959,
      "loss": 1.0549,
      "step": 7889
    },
    {
      "epoch": 0.3412256868072627,
      "grad_norm": 0.3359375,
      "learning_rate": 0.000659428423415645,
      "loss": 1.0527,
      "step": 7912
    },
    {
      "epoch": 0.34221762194333033,
      "grad_norm": 0.421875,
      "learning_rate": 0.0006584355033672941,
      "loss": 1.0599,
      "step": 7935
    },
    {
      "epoch": 0.34320955707939793,
      "grad_norm": 0.322265625,
      "learning_rate": 0.0006574425833189432,
      "loss": 1.0503,
      "step": 7958
    },
    {
      "epoch": 0.3442014922154656,
      "grad_norm": 0.388671875,
      "learning_rate": 0.0006564496632705923,
      "loss": 1.0465,
      "step": 7981
    },
    {
      "epoch": 0.3451934273515332,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0006554567432222415,
      "loss": 1.0246,
      "step": 8004
    },
    {
      "epoch": 0.3461853624876008,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0006544638231738905,
      "loss": 1.0463,
      "step": 8027
    },
    {
      "epoch": 0.34717729762366845,
      "grad_norm": 0.42578125,
      "learning_rate": 0.0006534709031255397,
      "loss": 1.0471,
      "step": 8050
    },
    {
      "epoch": 0.34816923275973605,
      "grad_norm": 0.470703125,
      "learning_rate": 0.0006524779830771887,
      "loss": 1.0414,
      "step": 8073
    },
    {
      "epoch": 0.3491611678958037,
      "grad_norm": 0.515625,
      "learning_rate": 0.0006514850630288378,
      "loss": 1.0431,
      "step": 8096
    },
    {
      "epoch": 0.3501531030318713,
      "grad_norm": 0.37109375,
      "learning_rate": 0.000650492142980487,
      "loss": 1.0382,
      "step": 8119
    },
    {
      "epoch": 0.3511450381679389,
      "grad_norm": 0.39453125,
      "learning_rate": 0.0006494992229321361,
      "loss": 1.0557,
      "step": 8142
    },
    {
      "epoch": 0.35213697330400656,
      "grad_norm": 0.376953125,
      "learning_rate": 0.0006485063028837852,
      "loss": 1.0414,
      "step": 8165
    },
    {
      "epoch": 0.35312890844007416,
      "grad_norm": 0.365234375,
      "learning_rate": 0.0006475133828354343,
      "loss": 1.0221,
      "step": 8188
    },
    {
      "epoch": 0.3541208435761418,
      "grad_norm": 0.384765625,
      "learning_rate": 0.0006465204627870834,
      "loss": 1.0207,
      "step": 8211
    },
    {
      "epoch": 0.3551127787122094,
      "grad_norm": 0.486328125,
      "learning_rate": 0.0006455275427387324,
      "loss": 1.0399,
      "step": 8234
    },
    {
      "epoch": 0.3561047138482771,
      "grad_norm": 0.43359375,
      "learning_rate": 0.0006445346226903817,
      "loss": 1.0256,
      "step": 8257
    },
    {
      "epoch": 0.3570966489843447,
      "grad_norm": 0.30859375,
      "learning_rate": 0.0006435417026420307,
      "loss": 1.0452,
      "step": 8280
    },
    {
      "epoch": 0.3580885841204123,
      "grad_norm": 0.412109375,
      "learning_rate": 0.0006425487825936799,
      "loss": 1.0388,
      "step": 8303
    },
    {
      "epoch": 0.35908051925647994,
      "grad_norm": 0.3828125,
      "learning_rate": 0.0006415558625453289,
      "loss": 1.0401,
      "step": 8326
    },
    {
      "epoch": 0.36007245439254754,
      "grad_norm": 0.474609375,
      "learning_rate": 0.0006405629424969782,
      "loss": 1.0345,
      "step": 8349
    },
    {
      "epoch": 0.3610643895286152,
      "grad_norm": 0.375,
      "learning_rate": 0.0006395700224486272,
      "loss": 1.0362,
      "step": 8372
    },
    {
      "epoch": 0.3620563246646828,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0006385771024002763,
      "loss": 1.045,
      "step": 8395
    },
    {
      "epoch": 0.3630482598007504,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0006375841823519254,
      "loss": 1.0417,
      "step": 8418
    },
    {
      "epoch": 0.36404019493681805,
      "grad_norm": 0.306640625,
      "learning_rate": 0.0006365912623035745,
      "loss": 1.0388,
      "step": 8441
    },
    {
      "epoch": 0.36503213007288565,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0006355983422552236,
      "loss": 1.0303,
      "step": 8464
    },
    {
      "epoch": 0.3660240652089533,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0006346054222068728,
      "loss": 1.017,
      "step": 8487
    },
    {
      "epoch": 0.3670160003450209,
      "grad_norm": 0.384765625,
      "learning_rate": 0.0006336125021585219,
      "loss": 1.0274,
      "step": 8510
    },
    {
      "epoch": 0.36800793548108857,
      "grad_norm": 0.357421875,
      "learning_rate": 0.000632619582110171,
      "loss": 1.0257,
      "step": 8533
    },
    {
      "epoch": 0.36899987061715617,
      "grad_norm": 0.33203125,
      "learning_rate": 0.0006316266620618201,
      "loss": 1.0389,
      "step": 8556
    },
    {
      "epoch": 0.36999180575322377,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0006306337420134692,
      "loss": 1.0295,
      "step": 8579
    },
    {
      "epoch": 0.3709837408892914,
      "grad_norm": 0.4140625,
      "learning_rate": 0.0006296408219651183,
      "loss": 1.045,
      "step": 8602
    },
    {
      "epoch": 0.371975676025359,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0006286479019167675,
      "loss": 1.0581,
      "step": 8625
    },
    {
      "epoch": 0.3729676111614267,
      "grad_norm": 0.44140625,
      "learning_rate": 0.0006276549818684165,
      "loss": 1.0521,
      "step": 8648
    },
    {
      "epoch": 0.3739595462974943,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0006266620618200657,
      "loss": 1.0355,
      "step": 8671
    },
    {
      "epoch": 0.3749514814335619,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0006256691417717147,
      "loss": 1.0366,
      "step": 8694
    },
    {
      "epoch": 0.37594341656962954,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0006246762217233639,
      "loss": 1.0148,
      "step": 8717
    },
    {
      "epoch": 0.37693535170569714,
      "grad_norm": 0.421875,
      "learning_rate": 0.000623683301675013,
      "loss": 1.0401,
      "step": 8740
    },
    {
      "epoch": 0.3779272868417648,
      "grad_norm": 0.3203125,
      "learning_rate": 0.000622690381626662,
      "loss": 1.0346,
      "step": 8763
    },
    {
      "epoch": 0.3789192219778324,
      "grad_norm": 0.34375,
      "learning_rate": 0.0006216974615783112,
      "loss": 1.0218,
      "step": 8786
    },
    {
      "epoch": 0.37991115711390006,
      "grad_norm": 0.427734375,
      "learning_rate": 0.0006207045415299602,
      "loss": 1.0405,
      "step": 8809
    },
    {
      "epoch": 0.38090309224996766,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0006197116214816095,
      "loss": 1.0253,
      "step": 8832
    },
    {
      "epoch": 0.38189502738603526,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0006187187014332585,
      "loss": 1.0409,
      "step": 8855
    },
    {
      "epoch": 0.3828869625221029,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0006177257813849077,
      "loss": 1.0349,
      "step": 8878
    },
    {
      "epoch": 0.3838788976581705,
      "grad_norm": 0.49609375,
      "learning_rate": 0.0006167328613365567,
      "loss": 1.029,
      "step": 8901
    },
    {
      "epoch": 0.38487083279423817,
      "grad_norm": 0.447265625,
      "learning_rate": 0.0006157399412882058,
      "loss": 1.0338,
      "step": 8924
    },
    {
      "epoch": 0.3858627679303058,
      "grad_norm": 0.416015625,
      "learning_rate": 0.0006147470212398549,
      "loss": 1.0335,
      "step": 8947
    },
    {
      "epoch": 0.3868547030663734,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0006137541011915041,
      "loss": 1.0512,
      "step": 8970
    },
    {
      "epoch": 0.38784663820244103,
      "grad_norm": 0.388671875,
      "learning_rate": 0.0006127611811431532,
      "loss": 1.0187,
      "step": 8993
    },
    {
      "epoch": 0.38883857333850863,
      "grad_norm": 0.298828125,
      "learning_rate": 0.0006117682610948023,
      "loss": 1.0321,
      "step": 9016
    },
    {
      "epoch": 0.3898305084745763,
      "grad_norm": 0.353515625,
      "learning_rate": 0.0006107753410464514,
      "loss": 1.0314,
      "step": 9039
    },
    {
      "epoch": 0.3908224436106439,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0006097824209981005,
      "loss": 1.011,
      "step": 9062
    },
    {
      "epoch": 0.39181437874671154,
      "grad_norm": 0.37890625,
      "learning_rate": 0.0006087895009497497,
      "loss": 1.0127,
      "step": 9085
    },
    {
      "epoch": 0.39280631388277915,
      "grad_norm": 0.40625,
      "learning_rate": 0.0006077965809013988,
      "loss": 1.0045,
      "step": 9108
    },
    {
      "epoch": 0.39379824901884675,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0006068036608530478,
      "loss": 0.9966,
      "step": 9131
    },
    {
      "epoch": 0.3947901841549144,
      "grad_norm": 0.318359375,
      "learning_rate": 0.000605810740804697,
      "loss": 1.0235,
      "step": 9154
    },
    {
      "epoch": 0.395782119290982,
      "grad_norm": 0.427734375,
      "learning_rate": 0.000604817820756346,
      "loss": 1.0448,
      "step": 9177
    },
    {
      "epoch": 0.39677405442704966,
      "grad_norm": 0.337890625,
      "learning_rate": 0.0006038249007079953,
      "loss": 1.0197,
      "step": 9200
    },
    {
      "epoch": 0.39776598956311726,
      "grad_norm": 0.447265625,
      "learning_rate": 0.0006028319806596443,
      "loss": 1.0093,
      "step": 9223
    },
    {
      "epoch": 0.39875792469918486,
      "grad_norm": 0.412109375,
      "learning_rate": 0.0006018390606112935,
      "loss": 1.014,
      "step": 9246
    },
    {
      "epoch": 0.3997498598352525,
      "grad_norm": 0.462890625,
      "learning_rate": 0.0006008461405629425,
      "loss": 1.0269,
      "step": 9269
    },
    {
      "epoch": 0.3998792426790874,
      "eval_runtime": 163.9225,
      "eval_samples_per_second": 610.045,
      "eval_steps_per_second": 7.626,
      "step": 9272
    }
  ],
  "logging_steps": 23,
  "max_steps": 23187,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 2318,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.778106242599485e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}