diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7890 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999106584472438, + "eval_steps": 500, + "global_step": 5596, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001786831055123738, + "grad_norm": 0.6767252087593079, + "learning_rate": 3.5714285714285716e-07, + "loss": 2.5858, + "step": 1 + }, + { + "epoch": 0.000893415527561869, + "grad_norm": 0.8736157417297363, + "learning_rate": 1.7857142857142857e-06, + "loss": 2.8563, + "step": 5 + }, + { + "epoch": 0.001786831055123738, + "grad_norm": 0.9124715328216553, + "learning_rate": 3.5714285714285714e-06, + "loss": 2.7741, + "step": 10 + }, + { + "epoch": 0.002680246582685607, + "grad_norm": 0.8725903034210205, + "learning_rate": 5.357142857142857e-06, + "loss": 2.7188, + "step": 15 + }, + { + "epoch": 0.003573662110247476, + "grad_norm": 0.7513923645019531, + "learning_rate": 7.142857142857143e-06, + "loss": 2.7556, + "step": 20 + }, + { + "epoch": 0.004467077637809345, + "grad_norm": 0.7444078326225281, + "learning_rate": 8.92857142857143e-06, + "loss": 2.7256, + "step": 25 + }, + { + "epoch": 0.005360493165371214, + "grad_norm": 0.8215367794036865, + "learning_rate": 1.0714285714285714e-05, + "loss": 2.7319, + "step": 30 + }, + { + "epoch": 0.006253908692933083, + "grad_norm": 0.7591757774353027, + "learning_rate": 1.25e-05, + "loss": 2.6611, + "step": 35 + }, + { + "epoch": 0.007147324220494952, + "grad_norm": 0.6609904766082764, + "learning_rate": 1.4285714285714285e-05, + "loss": 2.6396, + "step": 40 + }, + { + "epoch": 0.008040739748056821, + "grad_norm": 0.523094117641449, + "learning_rate": 1.6071428571428572e-05, + "loss": 2.5739, + "step": 45 + }, + { + "epoch": 0.00893415527561869, + "grad_norm": 0.4964945614337921, + "learning_rate": 1.785714285714286e-05, + "loss": 2.4766, + "step": 50 + }, + { + "epoch": 0.00982757080318056, + "grad_norm": 0.4061848819255829, + "learning_rate": 1.9642857142857145e-05, + "loss": 2.4273, + "step": 55 + }, + { + "epoch": 0.010720986330742428, + "grad_norm": 0.438088059425354, + "learning_rate": 2.1428571428571428e-05, + "loss": 2.3819, + "step": 60 + }, + { + "epoch": 0.011614401858304297, + "grad_norm": 0.406583696603775, + "learning_rate": 2.3214285714285715e-05, + "loss": 2.3326, + "step": 65 + }, + { + "epoch": 0.012507817385866166, + "grad_norm": 0.38678309321403503, + "learning_rate": 2.5e-05, + "loss": 2.3171, + "step": 70 + }, + { + "epoch": 0.013401232913428035, + "grad_norm": 0.4419669210910797, + "learning_rate": 2.6785714285714288e-05, + "loss": 2.2798, + "step": 75 + }, + { + "epoch": 0.014294648440989904, + "grad_norm": 0.43436020612716675, + "learning_rate": 2.857142857142857e-05, + "loss": 2.23, + "step": 80 + }, + { + "epoch": 0.015188063968551773, + "grad_norm": 0.4241899251937866, + "learning_rate": 3.0357142857142857e-05, + "loss": 2.2421, + "step": 85 + }, + { + "epoch": 0.016081479496113642, + "grad_norm": 0.4129183888435364, + "learning_rate": 3.2142857142857144e-05, + "loss": 2.182, + "step": 90 + }, + { + "epoch": 0.016974895023675513, + "grad_norm": 0.429864764213562, + "learning_rate": 3.392857142857143e-05, + "loss": 2.1112, + "step": 95 + }, + { + "epoch": 0.01786831055123738, + "grad_norm": 0.4998682141304016, + "learning_rate": 3.571428571428572e-05, + "loss": 2.1752, + "step": 100 + }, + { + "epoch": 0.01876172607879925, + "grad_norm": 0.4673575162887573, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.1021, + "step": 105 + }, + { + "epoch": 0.01965514160636112, + "grad_norm": 0.5105161070823669, + "learning_rate": 3.928571428571429e-05, + "loss": 2.075, + "step": 110 + }, + { + "epoch": 0.02054855713392299, + "grad_norm": 0.5451802015304565, + "learning_rate": 4.107142857142857e-05, + "loss": 2.0945, + "step": 115 + }, + { + "epoch": 0.021441972661484857, + "grad_norm": 0.5659247040748596, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.0086, + "step": 120 + }, + { + "epoch": 0.022335388189046727, + "grad_norm": 0.5502987504005432, + "learning_rate": 4.464285714285715e-05, + "loss": 2.0306, + "step": 125 + }, + { + "epoch": 0.023228803716608595, + "grad_norm": 0.5537978410720825, + "learning_rate": 4.642857142857143e-05, + "loss": 1.995, + "step": 130 + }, + { + "epoch": 0.024122219244170465, + "grad_norm": 0.6560796499252319, + "learning_rate": 4.8214285714285716e-05, + "loss": 1.9854, + "step": 135 + }, + { + "epoch": 0.025015634771732333, + "grad_norm": 0.6709268689155579, + "learning_rate": 5e-05, + "loss": 2.0174, + "step": 140 + }, + { + "epoch": 0.025909050299294203, + "grad_norm": 0.6651104092597961, + "learning_rate": 5.1785714285714296e-05, + "loss": 1.9972, + "step": 145 + }, + { + "epoch": 0.02680246582685607, + "grad_norm": 0.6255117058753967, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.9597, + "step": 150 + }, + { + "epoch": 0.02769588135441794, + "grad_norm": 0.7806227207183838, + "learning_rate": 5.535714285714286e-05, + "loss": 1.9863, + "step": 155 + }, + { + "epoch": 0.02858929688197981, + "grad_norm": 0.7957138419151306, + "learning_rate": 5.714285714285714e-05, + "loss": 1.965, + "step": 160 + }, + { + "epoch": 0.02948271240954168, + "grad_norm": 0.8026310205459595, + "learning_rate": 5.8928571428571435e-05, + "loss": 1.9605, + "step": 165 + }, + { + "epoch": 0.030376127937103547, + "grad_norm": 0.724997878074646, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.932, + "step": 170 + }, + { + "epoch": 0.031269543464665414, + "grad_norm": 0.7658754587173462, + "learning_rate": 6.25e-05, + "loss": 1.8975, + "step": 175 + }, + { + "epoch": 0.032162958992227285, + "grad_norm": 0.7666235566139221, + "learning_rate": 6.428571428571429e-05, + "loss": 1.888, + "step": 180 + }, + { + "epoch": 0.033056374519789156, + "grad_norm": 0.8629304766654968, + "learning_rate": 6.607142857142857e-05, + "loss": 1.8824, + "step": 185 + }, + { + "epoch": 0.033949790047351026, + "grad_norm": 0.912053108215332, + "learning_rate": 6.785714285714286e-05, + "loss": 1.8766, + "step": 190 + }, + { + "epoch": 0.03484320557491289, + "grad_norm": 0.8080089688301086, + "learning_rate": 6.964285714285715e-05, + "loss": 1.8944, + "step": 195 + }, + { + "epoch": 0.03573662110247476, + "grad_norm": 0.8864442110061646, + "learning_rate": 7.142857142857143e-05, + "loss": 1.8932, + "step": 200 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 0.9291841387748718, + "learning_rate": 7.321428571428571e-05, + "loss": 1.86, + "step": 205 + }, + { + "epoch": 0.0375234521575985, + "grad_norm": 0.8211831450462341, + "learning_rate": 7.500000000000001e-05, + "loss": 1.8467, + "step": 210 + }, + { + "epoch": 0.038416867685160366, + "grad_norm": 0.9697086215019226, + "learning_rate": 7.67857142857143e-05, + "loss": 1.8802, + "step": 215 + }, + { + "epoch": 0.03931028321272224, + "grad_norm": 1.0050380229949951, + "learning_rate": 7.857142857142858e-05, + "loss": 1.8544, + "step": 220 + }, + { + "epoch": 0.04020369874028411, + "grad_norm": 0.9131980538368225, + "learning_rate": 8.035714285714287e-05, + "loss": 1.8207, + "step": 225 + }, + { + "epoch": 0.04109711426784598, + "grad_norm": 0.9117507338523865, + "learning_rate": 8.214285714285714e-05, + "loss": 1.8391, + "step": 230 + }, + { + "epoch": 0.04199052979540784, + "grad_norm": 0.8625417351722717, + "learning_rate": 8.392857142857144e-05, + "loss": 1.7931, + "step": 235 + }, + { + "epoch": 0.04288394532296971, + "grad_norm": 1.025675654411316, + "learning_rate": 8.571428571428571e-05, + "loss": 1.8135, + "step": 240 + }, + { + "epoch": 0.043777360850531584, + "grad_norm": 0.9845486879348755, + "learning_rate": 8.75e-05, + "loss": 1.8192, + "step": 245 + }, + { + "epoch": 0.044670776378093455, + "grad_norm": 0.9469249844551086, + "learning_rate": 8.92857142857143e-05, + "loss": 1.8107, + "step": 250 + }, + { + "epoch": 0.04556419190565532, + "grad_norm": 0.8901212215423584, + "learning_rate": 9.107142857142857e-05, + "loss": 1.8072, + "step": 255 + }, + { + "epoch": 0.04645760743321719, + "grad_norm": 0.9586057066917419, + "learning_rate": 9.285714285714286e-05, + "loss": 1.7973, + "step": 260 + }, + { + "epoch": 0.04735102296077906, + "grad_norm": 0.9514407515525818, + "learning_rate": 9.464285714285715e-05, + "loss": 1.7815, + "step": 265 + }, + { + "epoch": 0.04824443848834093, + "grad_norm": 0.8750002980232239, + "learning_rate": 9.642857142857143e-05, + "loss": 1.765, + "step": 270 + }, + { + "epoch": 0.049137854015902795, + "grad_norm": 0.966252863407135, + "learning_rate": 9.821428571428572e-05, + "loss": 1.8197, + "step": 275 + }, + { + "epoch": 0.050031269543464665, + "grad_norm": 0.9129286408424377, + "learning_rate": 0.0001, + "loss": 1.7548, + "step": 280 + }, + { + "epoch": 0.050924685071026536, + "grad_norm": 0.9409673810005188, + "learning_rate": 0.00010178571428571428, + "loss": 1.7109, + "step": 285 + }, + { + "epoch": 0.05181810059858841, + "grad_norm": 0.9348879456520081, + "learning_rate": 0.00010357142857142859, + "loss": 1.7618, + "step": 290 + }, + { + "epoch": 0.05271151612615027, + "grad_norm": 0.8221137523651123, + "learning_rate": 0.00010535714285714286, + "loss": 1.734, + "step": 295 + }, + { + "epoch": 0.05360493165371214, + "grad_norm": 0.8188982009887695, + "learning_rate": 0.00010714285714285715, + "loss": 1.7522, + "step": 300 + }, + { + "epoch": 0.05449834718127401, + "grad_norm": 0.7834224700927734, + "learning_rate": 0.00010892857142857142, + "loss": 1.7878, + "step": 305 + }, + { + "epoch": 0.05539176270883588, + "grad_norm": 0.9102511405944824, + "learning_rate": 0.00011071428571428572, + "loss": 1.7572, + "step": 310 + }, + { + "epoch": 0.05628517823639775, + "grad_norm": 0.8769788146018982, + "learning_rate": 0.00011250000000000001, + "loss": 1.7443, + "step": 315 + }, + { + "epoch": 0.05717859376395962, + "grad_norm": 0.9550459980964661, + "learning_rate": 0.00011428571428571428, + "loss": 1.7346, + "step": 320 + }, + { + "epoch": 0.05807200929152149, + "grad_norm": 0.8972039222717285, + "learning_rate": 0.00011607142857142858, + "loss": 1.7377, + "step": 325 + }, + { + "epoch": 0.05896542481908336, + "grad_norm": 0.9489789009094238, + "learning_rate": 0.00011785714285714287, + "loss": 1.7335, + "step": 330 + }, + { + "epoch": 0.05985884034664522, + "grad_norm": 0.9783198833465576, + "learning_rate": 0.00011964285714285714, + "loss": 1.7243, + "step": 335 + }, + { + "epoch": 0.060752255874207094, + "grad_norm": 1.0081501007080078, + "learning_rate": 0.00012142857142857143, + "loss": 1.7717, + "step": 340 + }, + { + "epoch": 0.061645671401768964, + "grad_norm": 0.9436238408088684, + "learning_rate": 0.00012321428571428572, + "loss": 1.7301, + "step": 345 + }, + { + "epoch": 0.06253908692933083, + "grad_norm": 0.8638908863067627, + "learning_rate": 0.000125, + "loss": 1.738, + "step": 350 + }, + { + "epoch": 0.0634325024568927, + "grad_norm": 0.8834109306335449, + "learning_rate": 0.0001267857142857143, + "loss": 1.7379, + "step": 355 + }, + { + "epoch": 0.06432591798445457, + "grad_norm": 1.045885682106018, + "learning_rate": 0.00012857142857142858, + "loss": 1.7175, + "step": 360 + }, + { + "epoch": 0.06521933351201643, + "grad_norm": 0.8442100286483765, + "learning_rate": 0.00013035714285714286, + "loss": 1.6629, + "step": 365 + }, + { + "epoch": 0.06611274903957831, + "grad_norm": 0.948798656463623, + "learning_rate": 0.00013214285714285715, + "loss": 1.6661, + "step": 370 + }, + { + "epoch": 0.06700616456714018, + "grad_norm": 0.9104481935501099, + "learning_rate": 0.00013392857142857144, + "loss": 1.7239, + "step": 375 + }, + { + "epoch": 0.06789958009470205, + "grad_norm": 0.8660710453987122, + "learning_rate": 0.00013571428571428572, + "loss": 1.6696, + "step": 380 + }, + { + "epoch": 0.06879299562226392, + "grad_norm": 0.8465679287910461, + "learning_rate": 0.0001375, + "loss": 1.6838, + "step": 385 + }, + { + "epoch": 0.06968641114982578, + "grad_norm": 0.9764780402183533, + "learning_rate": 0.0001392857142857143, + "loss": 1.7024, + "step": 390 + }, + { + "epoch": 0.07057982667738766, + "grad_norm": 0.8730722665786743, + "learning_rate": 0.00014107142857142858, + "loss": 1.678, + "step": 395 + }, + { + "epoch": 0.07147324220494952, + "grad_norm": 0.8504787087440491, + "learning_rate": 0.00014285714285714287, + "loss": 1.6513, + "step": 400 + }, + { + "epoch": 0.07236665773251139, + "grad_norm": 0.7620259523391724, + "learning_rate": 0.00014464285714285715, + "loss": 1.6478, + "step": 405 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 0.8745117783546448, + "learning_rate": 0.00014642857142857141, + "loss": 1.6717, + "step": 410 + }, + { + "epoch": 0.07415348878763513, + "grad_norm": 0.8818298578262329, + "learning_rate": 0.00014821428571428573, + "loss": 1.5922, + "step": 415 + }, + { + "epoch": 0.075046904315197, + "grad_norm": 0.8444076180458069, + "learning_rate": 0.00015000000000000001, + "loss": 1.6568, + "step": 420 + }, + { + "epoch": 0.07594031984275887, + "grad_norm": 0.7880132794380188, + "learning_rate": 0.00015178571428571427, + "loss": 1.6642, + "step": 425 + }, + { + "epoch": 0.07683373537032073, + "grad_norm": 1.0402840375900269, + "learning_rate": 0.0001535714285714286, + "loss": 1.6942, + "step": 430 + }, + { + "epoch": 0.07772715089788261, + "grad_norm": 0.8606407046318054, + "learning_rate": 0.00015535714285714287, + "loss": 1.6216, + "step": 435 + }, + { + "epoch": 0.07862056642544447, + "grad_norm": 0.8538222312927246, + "learning_rate": 0.00015714285714285716, + "loss": 1.6577, + "step": 440 + }, + { + "epoch": 0.07951398195300634, + "grad_norm": 0.8017067909240723, + "learning_rate": 0.00015892857142857142, + "loss": 1.6588, + "step": 445 + }, + { + "epoch": 0.08040739748056822, + "grad_norm": 0.8014382123947144, + "learning_rate": 0.00016071428571428573, + "loss": 1.6661, + "step": 450 + }, + { + "epoch": 0.08130081300813008, + "grad_norm": 0.8568883538246155, + "learning_rate": 0.00016250000000000002, + "loss": 1.6668, + "step": 455 + }, + { + "epoch": 0.08219422853569196, + "grad_norm": 0.7566272020339966, + "learning_rate": 0.00016428571428571428, + "loss": 1.6403, + "step": 460 + }, + { + "epoch": 0.08308764406325382, + "grad_norm": 0.8573461174964905, + "learning_rate": 0.0001660714285714286, + "loss": 1.6548, + "step": 465 + }, + { + "epoch": 0.08398105959081568, + "grad_norm": 0.8354015350341797, + "learning_rate": 0.00016785714285714288, + "loss": 1.6219, + "step": 470 + }, + { + "epoch": 0.08487447511837756, + "grad_norm": 0.790770947933197, + "learning_rate": 0.00016964285714285714, + "loss": 1.6428, + "step": 475 + }, + { + "epoch": 0.08576789064593943, + "grad_norm": 0.7817040085792542, + "learning_rate": 0.00017142857142857143, + "loss": 1.6103, + "step": 480 + }, + { + "epoch": 0.08666130617350129, + "grad_norm": 0.6936588287353516, + "learning_rate": 0.00017321428571428574, + "loss": 1.6043, + "step": 485 + }, + { + "epoch": 0.08755472170106317, + "grad_norm": 0.8221834897994995, + "learning_rate": 0.000175, + "loss": 1.6187, + "step": 490 + }, + { + "epoch": 0.08844813722862503, + "grad_norm": 0.7943176627159119, + "learning_rate": 0.00017678571428571428, + "loss": 1.6695, + "step": 495 + }, + { + "epoch": 0.08934155275618691, + "grad_norm": 0.7854962944984436, + "learning_rate": 0.0001785714285714286, + "loss": 1.6191, + "step": 500 + }, + { + "epoch": 0.09023496828374877, + "grad_norm": 0.8033050298690796, + "learning_rate": 0.00018035714285714286, + "loss": 1.6066, + "step": 505 + }, + { + "epoch": 0.09112838381131064, + "grad_norm": 0.8427201509475708, + "learning_rate": 0.00018214285714285714, + "loss": 1.6486, + "step": 510 + }, + { + "epoch": 0.09202179933887251, + "grad_norm": 0.7360148429870605, + "learning_rate": 0.00018392857142857143, + "loss": 1.6022, + "step": 515 + }, + { + "epoch": 0.09291521486643438, + "grad_norm": 0.6847764253616333, + "learning_rate": 0.00018571428571428572, + "loss": 1.6158, + "step": 520 + }, + { + "epoch": 0.09380863039399624, + "grad_norm": 0.8051613569259644, + "learning_rate": 0.0001875, + "loss": 1.5828, + "step": 525 + }, + { + "epoch": 0.09470204592155812, + "grad_norm": 0.7814632058143616, + "learning_rate": 0.0001892857142857143, + "loss": 1.6537, + "step": 530 + }, + { + "epoch": 0.09559546144911998, + "grad_norm": 0.6943553686141968, + "learning_rate": 0.00019107142857142858, + "loss": 1.5729, + "step": 535 + }, + { + "epoch": 0.09648887697668186, + "grad_norm": 0.6946085095405579, + "learning_rate": 0.00019285714285714286, + "loss": 1.5977, + "step": 540 + }, + { + "epoch": 0.09738229250424373, + "grad_norm": 0.7925629019737244, + "learning_rate": 0.00019464285714285715, + "loss": 1.6249, + "step": 545 + }, + { + "epoch": 0.09827570803180559, + "grad_norm": 0.8013836741447449, + "learning_rate": 0.00019642857142857144, + "loss": 1.5448, + "step": 550 + }, + { + "epoch": 0.09916912355936747, + "grad_norm": 0.7135029435157776, + "learning_rate": 0.00019821428571428572, + "loss": 1.5866, + "step": 555 + }, + { + "epoch": 0.10006253908692933, + "grad_norm": 0.7435690760612488, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 560 + }, + { + "epoch": 0.1009559546144912, + "grad_norm": 0.73707115650177, + "learning_rate": 0.00019999951355027364, + "loss": 1.5756, + "step": 565 + }, + { + "epoch": 0.10184937014205307, + "grad_norm": 0.7932726144790649, + "learning_rate": 0.00019999805420582728, + "loss": 1.5724, + "step": 570 + }, + { + "epoch": 0.10274278566961494, + "grad_norm": 0.7718573808670044, + "learning_rate": 0.00019999562198085878, + "loss": 1.5735, + "step": 575 + }, + { + "epoch": 0.10363620119717681, + "grad_norm": 0.7420961856842041, + "learning_rate": 0.00019999221689903133, + "loss": 1.5687, + "step": 580 + }, + { + "epoch": 0.10452961672473868, + "grad_norm": 0.6695738434791565, + "learning_rate": 0.00019998783899347294, + "loss": 1.5685, + "step": 585 + }, + { + "epoch": 0.10542303225230054, + "grad_norm": 0.6004559397697449, + "learning_rate": 0.0001999824883067762, + "loss": 1.625, + "step": 590 + }, + { + "epoch": 0.10631644777986242, + "grad_norm": 0.6196651458740234, + "learning_rate": 0.00019997616489099792, + "loss": 1.5445, + "step": 595 + }, + { + "epoch": 0.10720986330742428, + "grad_norm": 0.6544579267501831, + "learning_rate": 0.00019996886880765854, + "loss": 1.5167, + "step": 600 + }, + { + "epoch": 0.10810327883498615, + "grad_norm": 0.7319378852844238, + "learning_rate": 0.0001999606001277417, + "loss": 1.5423, + "step": 605 + }, + { + "epoch": 0.10899669436254802, + "grad_norm": 0.6340135335922241, + "learning_rate": 0.0001999513589316933, + "loss": 1.5131, + "step": 610 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 0.719735324382782, + "learning_rate": 0.00019994114530942088, + "loss": 1.5784, + "step": 615 + }, + { + "epoch": 0.11078352541767177, + "grad_norm": 0.7871695756912231, + "learning_rate": 0.0001999299593602927, + "loss": 1.5561, + "step": 620 + }, + { + "epoch": 0.11167694094523363, + "grad_norm": 0.7577756643295288, + "learning_rate": 0.00019991780119313682, + "loss": 1.5863, + "step": 625 + }, + { + "epoch": 0.1125703564727955, + "grad_norm": 0.7437707185745239, + "learning_rate": 0.00019990467092623998, + "loss": 1.5244, + "step": 630 + }, + { + "epoch": 0.11346377200035737, + "grad_norm": 0.6263313889503479, + "learning_rate": 0.00019989056868734647, + "loss": 1.5438, + "step": 635 + }, + { + "epoch": 0.11435718752791924, + "grad_norm": 0.6760973930358887, + "learning_rate": 0.0001998754946136569, + "loss": 1.5808, + "step": 640 + }, + { + "epoch": 0.1152506030554811, + "grad_norm": 0.7239100337028503, + "learning_rate": 0.00019985944885182687, + "loss": 1.5129, + "step": 645 + }, + { + "epoch": 0.11614401858304298, + "grad_norm": 0.6674967408180237, + "learning_rate": 0.00019984243155796546, + "loss": 1.5238, + "step": 650 + }, + { + "epoch": 0.11703743411060484, + "grad_norm": 0.6619219779968262, + "learning_rate": 0.00019982444289763388, + "loss": 1.4989, + "step": 655 + }, + { + "epoch": 0.11793084963816672, + "grad_norm": 0.6065383553504944, + "learning_rate": 0.00019980548304584364, + "loss": 1.4831, + "step": 660 + }, + { + "epoch": 0.11882426516572858, + "grad_norm": 0.6437244415283203, + "learning_rate": 0.00019978555218705513, + "loss": 1.5404, + "step": 665 + }, + { + "epoch": 0.11971768069329045, + "grad_norm": 0.6297422647476196, + "learning_rate": 0.00019976465051517548, + "loss": 1.5232, + "step": 670 + }, + { + "epoch": 0.12061109622085232, + "grad_norm": 0.660554051399231, + "learning_rate": 0.00019974277823355698, + "loss": 1.5417, + "step": 675 + }, + { + "epoch": 0.12150451174841419, + "grad_norm": 0.6322402358055115, + "learning_rate": 0.00019971993555499494, + "loss": 1.5161, + "step": 680 + }, + { + "epoch": 0.12239792727597605, + "grad_norm": 0.6935272216796875, + "learning_rate": 0.00019969612270172567, + "loss": 1.5089, + "step": 685 + }, + { + "epoch": 0.12329134280353793, + "grad_norm": 0.7320016026496887, + "learning_rate": 0.00019967133990542423, + "loss": 1.5479, + "step": 690 + }, + { + "epoch": 0.12418475833109979, + "grad_norm": 0.6796984672546387, + "learning_rate": 0.0001996455874072024, + "loss": 1.5242, + "step": 695 + }, + { + "epoch": 0.12507817385866166, + "grad_norm": 0.6992778778076172, + "learning_rate": 0.00019961886545760598, + "loss": 1.5287, + "step": 700 + }, + { + "epoch": 0.12597158938622352, + "grad_norm": 0.6733859777450562, + "learning_rate": 0.00019959117431661273, + "loss": 1.5488, + "step": 705 + }, + { + "epoch": 0.1268650049137854, + "grad_norm": 0.7480871081352234, + "learning_rate": 0.00019956251425362967, + "loss": 1.4335, + "step": 710 + }, + { + "epoch": 0.12775842044134728, + "grad_norm": 0.6139786243438721, + "learning_rate": 0.0001995328855474903, + "loss": 1.5344, + "step": 715 + }, + { + "epoch": 0.12865183596890914, + "grad_norm": 0.6919850707054138, + "learning_rate": 0.00019950228848645218, + "loss": 1.5349, + "step": 720 + }, + { + "epoch": 0.129545251496471, + "grad_norm": 0.6435360908508301, + "learning_rate": 0.00019947072336819397, + "loss": 1.52, + "step": 725 + }, + { + "epoch": 0.13043866702403287, + "grad_norm": 0.7430850267410278, + "learning_rate": 0.00019943819049981248, + "loss": 1.4731, + "step": 730 + }, + { + "epoch": 0.13133208255159476, + "grad_norm": 0.6160067319869995, + "learning_rate": 0.00019940469019781985, + "loss": 1.5028, + "step": 735 + }, + { + "epoch": 0.13222549807915662, + "grad_norm": 0.6640142798423767, + "learning_rate": 0.00019937022278814032, + "loss": 1.5135, + "step": 740 + }, + { + "epoch": 0.1331189136067185, + "grad_norm": 0.680387556552887, + "learning_rate": 0.00019933478860610713, + "loss": 1.4975, + "step": 745 + }, + { + "epoch": 0.13401232913428035, + "grad_norm": 0.5649276971817017, + "learning_rate": 0.00019929838799645925, + "loss": 1.4241, + "step": 750 + }, + { + "epoch": 0.13490574466184221, + "grad_norm": 0.69096440076828, + "learning_rate": 0.00019926102131333803, + "loss": 1.4581, + "step": 755 + }, + { + "epoch": 0.1357991601894041, + "grad_norm": 0.7214106917381287, + "learning_rate": 0.00019922268892028368, + "loss": 1.4663, + "step": 760 + }, + { + "epoch": 0.13669257571696597, + "grad_norm": 0.6388475894927979, + "learning_rate": 0.0001991833911902319, + "loss": 1.4789, + "step": 765 + }, + { + "epoch": 0.13758599124452783, + "grad_norm": 0.653536319732666, + "learning_rate": 0.00019914312850551, + "loss": 1.5142, + "step": 770 + }, + { + "epoch": 0.1384794067720897, + "grad_norm": 0.6236264109611511, + "learning_rate": 0.0001991019012578335, + "loss": 1.4822, + "step": 775 + }, + { + "epoch": 0.13937282229965156, + "grad_norm": 0.6071485280990601, + "learning_rate": 0.00019905970984830204, + "loss": 1.4656, + "step": 780 + }, + { + "epoch": 0.14026623782721342, + "grad_norm": 0.6202497482299805, + "learning_rate": 0.00019901655468739562, + "loss": 1.4984, + "step": 785 + }, + { + "epoch": 0.14115965335477532, + "grad_norm": 0.6519142389297485, + "learning_rate": 0.00019897243619497056, + "loss": 1.473, + "step": 790 + }, + { + "epoch": 0.14205306888233718, + "grad_norm": 0.5959755182266235, + "learning_rate": 0.00019892735480025545, + "loss": 1.4976, + "step": 795 + }, + { + "epoch": 0.14294648440989904, + "grad_norm": 0.6568565368652344, + "learning_rate": 0.0001988813109418469, + "loss": 1.4929, + "step": 800 + }, + { + "epoch": 0.1438398999374609, + "grad_norm": 0.6304877996444702, + "learning_rate": 0.00019883430506770536, + "loss": 1.4768, + "step": 805 + }, + { + "epoch": 0.14473331546502277, + "grad_norm": 0.6260554194450378, + "learning_rate": 0.00019878633763515074, + "loss": 1.4449, + "step": 810 + }, + { + "epoch": 0.14562673099258466, + "grad_norm": 0.6394698619842529, + "learning_rate": 0.00019873740911085792, + "loss": 1.4705, + "step": 815 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.6746828556060791, + "learning_rate": 0.00019868751997085225, + "loss": 1.4527, + "step": 820 + }, + { + "epoch": 0.1474135620477084, + "grad_norm": 0.6241781115531921, + "learning_rate": 0.0001986366707005049, + "loss": 1.4807, + "step": 825 + }, + { + "epoch": 0.14830697757527025, + "grad_norm": 0.6119701266288757, + "learning_rate": 0.00019858486179452812, + "loss": 1.478, + "step": 830 + }, + { + "epoch": 0.14920039310283212, + "grad_norm": 0.6817257404327393, + "learning_rate": 0.0001985320937569705, + "loss": 1.507, + "step": 835 + }, + { + "epoch": 0.150093808630394, + "grad_norm": 0.6211469173431396, + "learning_rate": 0.00019847836710121198, + "loss": 1.4612, + "step": 840 + }, + { + "epoch": 0.15098722415795587, + "grad_norm": 0.6433590054512024, + "learning_rate": 0.0001984236823499589, + "loss": 1.4689, + "step": 845 + }, + { + "epoch": 0.15188063968551774, + "grad_norm": 0.6386223435401917, + "learning_rate": 0.0001983680400352389, + "loss": 1.4681, + "step": 850 + }, + { + "epoch": 0.1527740552130796, + "grad_norm": 0.6272917985916138, + "learning_rate": 0.00019831144069839578, + "loss": 1.4372, + "step": 855 + }, + { + "epoch": 0.15366747074064147, + "grad_norm": 0.5687587857246399, + "learning_rate": 0.00019825388489008415, + "loss": 1.4271, + "step": 860 + }, + { + "epoch": 0.15456088626820333, + "grad_norm": 0.6058815121650696, + "learning_rate": 0.0001981953731702642, + "loss": 1.4664, + "step": 865 + }, + { + "epoch": 0.15545430179576522, + "grad_norm": 0.5973011255264282, + "learning_rate": 0.00019813590610819604, + "loss": 1.4698, + "step": 870 + }, + { + "epoch": 0.15634771732332708, + "grad_norm": 0.6365277767181396, + "learning_rate": 0.00019807548428243447, + "loss": 1.4151, + "step": 875 + }, + { + "epoch": 0.15724113285088895, + "grad_norm": 0.6562153100967407, + "learning_rate": 0.00019801410828082307, + "loss": 1.4769, + "step": 880 + }, + { + "epoch": 0.1581345483784508, + "grad_norm": 0.6231607794761658, + "learning_rate": 0.00019795177870048864, + "loss": 1.4464, + "step": 885 + }, + { + "epoch": 0.15902796390601268, + "grad_norm": 0.6296908855438232, + "learning_rate": 0.00019788849614783534, + "loss": 1.4202, + "step": 890 + }, + { + "epoch": 0.15992137943357457, + "grad_norm": 0.5952068567276001, + "learning_rate": 0.00019782426123853873, + "loss": 1.4916, + "step": 895 + }, + { + "epoch": 0.16081479496113643, + "grad_norm": 0.659608006477356, + "learning_rate": 0.0001977590745975399, + "loss": 1.3604, + "step": 900 + }, + { + "epoch": 0.1617082104886983, + "grad_norm": 0.6710678339004517, + "learning_rate": 0.00019769293685903937, + "loss": 1.426, + "step": 905 + }, + { + "epoch": 0.16260162601626016, + "grad_norm": 0.6199014782905579, + "learning_rate": 0.0001976258486664908, + "loss": 1.4376, + "step": 910 + }, + { + "epoch": 0.16349504154382202, + "grad_norm": 0.5568641424179077, + "learning_rate": 0.00019755781067259487, + "loss": 1.4233, + "step": 915 + }, + { + "epoch": 0.16438845707138391, + "grad_norm": 0.6075164675712585, + "learning_rate": 0.00019748882353929283, + "loss": 1.3773, + "step": 920 + }, + { + "epoch": 0.16528187259894578, + "grad_norm": 0.6208435297012329, + "learning_rate": 0.00019741888793776012, + "loss": 1.4342, + "step": 925 + }, + { + "epoch": 0.16617528812650764, + "grad_norm": 0.5977674126625061, + "learning_rate": 0.00019734800454839985, + "loss": 1.4001, + "step": 930 + }, + { + "epoch": 0.1670687036540695, + "grad_norm": 0.6107041835784912, + "learning_rate": 0.00019727617406083608, + "loss": 1.4483, + "step": 935 + }, + { + "epoch": 0.16796211918163137, + "grad_norm": 0.6070581078529358, + "learning_rate": 0.00019720339717390725, + "loss": 1.434, + "step": 940 + }, + { + "epoch": 0.16885553470919323, + "grad_norm": 0.5824504494667053, + "learning_rate": 0.00019712967459565935, + "loss": 1.4214, + "step": 945 + }, + { + "epoch": 0.16974895023675513, + "grad_norm": 0.6274870038032532, + "learning_rate": 0.00019705500704333888, + "loss": 1.3898, + "step": 950 + }, + { + "epoch": 0.170642365764317, + "grad_norm": 0.6038565039634705, + "learning_rate": 0.00019697939524338605, + "loss": 1.441, + "step": 955 + }, + { + "epoch": 0.17153578129187885, + "grad_norm": 0.5851924419403076, + "learning_rate": 0.00019690283993142768, + "loss": 1.4293, + "step": 960 + }, + { + "epoch": 0.17242919681944072, + "grad_norm": 0.6128916144371033, + "learning_rate": 0.00019682534185226996, + "loss": 1.4526, + "step": 965 + }, + { + "epoch": 0.17332261234700258, + "grad_norm": 0.5964568853378296, + "learning_rate": 0.0001967469017598913, + "loss": 1.4286, + "step": 970 + }, + { + "epoch": 0.17421602787456447, + "grad_norm": 0.6289203763008118, + "learning_rate": 0.00019666752041743485, + "loss": 1.3942, + "step": 975 + }, + { + "epoch": 0.17510944340212634, + "grad_norm": 0.538835883140564, + "learning_rate": 0.00019658719859720137, + "loss": 1.4165, + "step": 980 + }, + { + "epoch": 0.1760028589296882, + "grad_norm": 0.571048378944397, + "learning_rate": 0.00019650593708064133, + "loss": 1.4293, + "step": 985 + }, + { + "epoch": 0.17689627445725006, + "grad_norm": 0.6304472088813782, + "learning_rate": 0.0001964237366583476, + "loss": 1.4069, + "step": 990 + }, + { + "epoch": 0.17778968998481193, + "grad_norm": 0.6265087723731995, + "learning_rate": 0.00019634059813004767, + "loss": 1.3956, + "step": 995 + }, + { + "epoch": 0.17868310551237382, + "grad_norm": 0.5952944159507751, + "learning_rate": 0.00019625652230459577, + "loss": 1.4243, + "step": 1000 + }, + { + "epoch": 0.17957652103993568, + "grad_norm": 0.5611996650695801, + "learning_rate": 0.00019617150999996522, + "loss": 1.3769, + "step": 1005 + }, + { + "epoch": 0.18046993656749755, + "grad_norm": 0.6139466166496277, + "learning_rate": 0.00019608556204324016, + "loss": 1.3937, + "step": 1010 + }, + { + "epoch": 0.1813633520950594, + "grad_norm": 0.59721839427948, + "learning_rate": 0.00019599867927060788, + "loss": 1.4536, + "step": 1015 + }, + { + "epoch": 0.18225676762262127, + "grad_norm": 0.6089359521865845, + "learning_rate": 0.0001959108625273504, + "loss": 1.4217, + "step": 1020 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 0.6171666383743286, + "learning_rate": 0.0001958221126678363, + "loss": 1.3777, + "step": 1025 + }, + { + "epoch": 0.18404359867774503, + "grad_norm": 0.5585091710090637, + "learning_rate": 0.00019573243055551247, + "loss": 1.4085, + "step": 1030 + }, + { + "epoch": 0.1849370142053069, + "grad_norm": 0.5734081268310547, + "learning_rate": 0.0001956418170628957, + "loss": 1.3873, + "step": 1035 + }, + { + "epoch": 0.18583042973286876, + "grad_norm": 0.6407363414764404, + "learning_rate": 0.0001955502730715642, + "loss": 1.4274, + "step": 1040 + }, + { + "epoch": 0.18672384526043062, + "grad_norm": 0.6202085614204407, + "learning_rate": 0.0001954577994721489, + "loss": 1.4569, + "step": 1045 + }, + { + "epoch": 0.18761726078799248, + "grad_norm": 0.5608704090118408, + "learning_rate": 0.00019536439716432496, + "loss": 1.3569, + "step": 1050 + }, + { + "epoch": 0.18851067631555438, + "grad_norm": 0.5941877365112305, + "learning_rate": 0.00019527006705680297, + "loss": 1.3997, + "step": 1055 + }, + { + "epoch": 0.18940409184311624, + "grad_norm": 0.5690048336982727, + "learning_rate": 0.00019517481006731997, + "loss": 1.4205, + "step": 1060 + }, + { + "epoch": 0.1902975073706781, + "grad_norm": 0.5559670925140381, + "learning_rate": 0.0001950786271226307, + "loss": 1.3996, + "step": 1065 + }, + { + "epoch": 0.19119092289823997, + "grad_norm": 0.633906364440918, + "learning_rate": 0.00019498151915849855, + "loss": 1.4224, + "step": 1070 + }, + { + "epoch": 0.19208433842580183, + "grad_norm": 0.6395050287246704, + "learning_rate": 0.00019488348711968633, + "loss": 1.3711, + "step": 1075 + }, + { + "epoch": 0.19297775395336372, + "grad_norm": 0.5923237800598145, + "learning_rate": 0.00019478453195994719, + "loss": 1.4317, + "step": 1080 + }, + { + "epoch": 0.1938711694809256, + "grad_norm": 0.6085279583930969, + "learning_rate": 0.0001946846546420154, + "loss": 1.3991, + "step": 1085 + }, + { + "epoch": 0.19476458500848745, + "grad_norm": 0.5692582726478577, + "learning_rate": 0.0001945838561375968, + "loss": 1.4099, + "step": 1090 + }, + { + "epoch": 0.19565800053604931, + "grad_norm": 0.5419508218765259, + "learning_rate": 0.00019448213742735942, + "loss": 1.3943, + "step": 1095 + }, + { + "epoch": 0.19655141606361118, + "grad_norm": 0.5866132378578186, + "learning_rate": 0.0001943794995009242, + "loss": 1.3711, + "step": 1100 + }, + { + "epoch": 0.19744483159117304, + "grad_norm": 0.5647941827774048, + "learning_rate": 0.00019427594335685478, + "loss": 1.3508, + "step": 1105 + }, + { + "epoch": 0.19833824711873493, + "grad_norm": 0.5306203365325928, + "learning_rate": 0.00019417147000264852, + "loss": 1.3501, + "step": 1110 + }, + { + "epoch": 0.1992316626462968, + "grad_norm": 0.5783630609512329, + "learning_rate": 0.0001940660804547259, + "loss": 1.367, + "step": 1115 + }, + { + "epoch": 0.20012507817385866, + "grad_norm": 0.5690432786941528, + "learning_rate": 0.00019395977573842142, + "loss": 1.3662, + "step": 1120 + }, + { + "epoch": 0.20101849370142053, + "grad_norm": 0.5380032658576965, + "learning_rate": 0.000193852556887973, + "loss": 1.3674, + "step": 1125 + }, + { + "epoch": 0.2019119092289824, + "grad_norm": 0.5310468077659607, + "learning_rate": 0.00019374442494651223, + "loss": 1.3827, + "step": 1130 + }, + { + "epoch": 0.20280532475654428, + "grad_norm": 0.5655437707901001, + "learning_rate": 0.00019363538096605427, + "loss": 1.3837, + "step": 1135 + }, + { + "epoch": 0.20369874028410614, + "grad_norm": 0.5816426873207092, + "learning_rate": 0.00019352542600748734, + "loss": 1.3857, + "step": 1140 + }, + { + "epoch": 0.204592155811668, + "grad_norm": 0.5484604835510254, + "learning_rate": 0.00019341456114056263, + "loss": 1.3561, + "step": 1145 + }, + { + "epoch": 0.20548557133922987, + "grad_norm": 0.5980105996131897, + "learning_rate": 0.00019330278744388385, + "loss": 1.3571, + "step": 1150 + }, + { + "epoch": 0.20637898686679174, + "grad_norm": 0.5934194922447205, + "learning_rate": 0.00019319010600489663, + "loss": 1.3629, + "step": 1155 + }, + { + "epoch": 0.20727240239435363, + "grad_norm": 0.5532252192497253, + "learning_rate": 0.00019307651791987816, + "loss": 1.3442, + "step": 1160 + }, + { + "epoch": 0.2081658179219155, + "grad_norm": 0.5868908762931824, + "learning_rate": 0.00019296202429392622, + "loss": 1.3697, + "step": 1165 + }, + { + "epoch": 0.20905923344947736, + "grad_norm": 0.557306170463562, + "learning_rate": 0.00019284662624094874, + "loss": 1.3245, + "step": 1170 + }, + { + "epoch": 0.20995264897703922, + "grad_norm": 0.5710069537162781, + "learning_rate": 0.00019273032488365267, + "loss": 1.3606, + "step": 1175 + }, + { + "epoch": 0.21084606450460108, + "grad_norm": 0.5752673745155334, + "learning_rate": 0.00019261312135353332, + "loss": 1.3384, + "step": 1180 + }, + { + "epoch": 0.21173948003216295, + "grad_norm": 0.5679814219474792, + "learning_rate": 0.0001924950167908632, + "loss": 1.3272, + "step": 1185 + }, + { + "epoch": 0.21263289555972484, + "grad_norm": 0.5580770373344421, + "learning_rate": 0.00019237601234468096, + "loss": 1.3557, + "step": 1190 + }, + { + "epoch": 0.2135263110872867, + "grad_norm": 0.5821225047111511, + "learning_rate": 0.0001922561091727802, + "loss": 1.326, + "step": 1195 + }, + { + "epoch": 0.21441972661484857, + "grad_norm": 0.593561053276062, + "learning_rate": 0.00019213530844169817, + "loss": 1.3584, + "step": 1200 + }, + { + "epoch": 0.21531314214241043, + "grad_norm": 0.6168695688247681, + "learning_rate": 0.00019201361132670456, + "loss": 1.3307, + "step": 1205 + }, + { + "epoch": 0.2162065576699723, + "grad_norm": 0.5816683173179626, + "learning_rate": 0.00019189101901178997, + "loss": 1.3807, + "step": 1210 + }, + { + "epoch": 0.21709997319753419, + "grad_norm": 0.5499683022499084, + "learning_rate": 0.00019176753268965432, + "loss": 1.3177, + "step": 1215 + }, + { + "epoch": 0.21799338872509605, + "grad_norm": 0.5521628260612488, + "learning_rate": 0.00019164315356169536, + "loss": 1.3386, + "step": 1220 + }, + { + "epoch": 0.2188868042526579, + "grad_norm": 0.6149545907974243, + "learning_rate": 0.00019151788283799698, + "loss": 1.3589, + "step": 1225 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.5420519113540649, + "learning_rate": 0.00019139172173731733, + "loss": 1.3446, + "step": 1230 + }, + { + "epoch": 0.22067363530778164, + "grad_norm": 0.5895097851753235, + "learning_rate": 0.0001912646714870771, + "loss": 1.3714, + "step": 1235 + }, + { + "epoch": 0.22156705083534353, + "grad_norm": 0.5893450975418091, + "learning_rate": 0.0001911367333233474, + "loss": 1.3723, + "step": 1240 + }, + { + "epoch": 0.2224604663629054, + "grad_norm": 0.5816531181335449, + "learning_rate": 0.00019100790849083804, + "loss": 1.3002, + "step": 1245 + }, + { + "epoch": 0.22335388189046726, + "grad_norm": 0.557987630367279, + "learning_rate": 0.00019087819824288504, + "loss": 1.3805, + "step": 1250 + }, + { + "epoch": 0.22424729741802912, + "grad_norm": 0.5602419972419739, + "learning_rate": 0.0001907476038414387, + "loss": 1.3822, + "step": 1255 + }, + { + "epoch": 0.225140712945591, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.00019061612655705128, + "loss": 1.3457, + "step": 1260 + }, + { + "epoch": 0.22603412847315285, + "grad_norm": 0.5846714973449707, + "learning_rate": 0.00019048376766886448, + "loss": 1.3483, + "step": 1265 + }, + { + "epoch": 0.22692754400071474, + "grad_norm": 0.5950532555580139, + "learning_rate": 0.00019035052846459727, + "loss": 1.3358, + "step": 1270 + }, + { + "epoch": 0.2278209595282766, + "grad_norm": 0.5122111439704895, + "learning_rate": 0.00019021641024053308, + "loss": 1.3126, + "step": 1275 + }, + { + "epoch": 0.22871437505583847, + "grad_norm": 0.5635678172111511, + "learning_rate": 0.00019008141430150745, + "loss": 1.3349, + "step": 1280 + }, + { + "epoch": 0.22960779058340033, + "grad_norm": 0.5411239266395569, + "learning_rate": 0.00018994554196089506, + "loss": 1.3382, + "step": 1285 + }, + { + "epoch": 0.2305012061109622, + "grad_norm": 0.5744121670722961, + "learning_rate": 0.0001898087945405972, + "loss": 1.2789, + "step": 1290 + }, + { + "epoch": 0.2313946216385241, + "grad_norm": 0.578059732913971, + "learning_rate": 0.00018967117337102883, + "loss": 1.3034, + "step": 1295 + }, + { + "epoch": 0.23228803716608595, + "grad_norm": 0.5708723664283752, + "learning_rate": 0.00018953267979110545, + "loss": 1.3583, + "step": 1300 + }, + { + "epoch": 0.23318145269364782, + "grad_norm": 0.6082606315612793, + "learning_rate": 0.0001893933151482304, + "loss": 1.3753, + "step": 1305 + }, + { + "epoch": 0.23407486822120968, + "grad_norm": 0.57098788022995, + "learning_rate": 0.00018925308079828152, + "loss": 1.3369, + "step": 1310 + }, + { + "epoch": 0.23496828374877154, + "grad_norm": 0.5236451625823975, + "learning_rate": 0.00018911197810559803, + "loss": 1.3468, + "step": 1315 + }, + { + "epoch": 0.23586169927633344, + "grad_norm": 0.5678128600120544, + "learning_rate": 0.00018897000844296727, + "loss": 1.342, + "step": 1320 + }, + { + "epoch": 0.2367551148038953, + "grad_norm": 0.544394314289093, + "learning_rate": 0.00018882717319161128, + "loss": 1.3434, + "step": 1325 + }, + { + "epoch": 0.23764853033145716, + "grad_norm": 0.5973039865493774, + "learning_rate": 0.00018868347374117344, + "loss": 1.2866, + "step": 1330 + }, + { + "epoch": 0.23854194585901903, + "grad_norm": 0.6183381080627441, + "learning_rate": 0.00018853891148970498, + "loss": 1.3513, + "step": 1335 + }, + { + "epoch": 0.2394353613865809, + "grad_norm": 0.5657194256782532, + "learning_rate": 0.00018839348784365116, + "loss": 1.3293, + "step": 1340 + }, + { + "epoch": 0.24032877691414276, + "grad_norm": 0.5699062347412109, + "learning_rate": 0.0001882472042178379, + "loss": 1.3529, + "step": 1345 + }, + { + "epoch": 0.24122219244170465, + "grad_norm": 0.5497627258300781, + "learning_rate": 0.0001881000620354578, + "loss": 1.3395, + "step": 1350 + }, + { + "epoch": 0.2421156079692665, + "grad_norm": 0.5238006711006165, + "learning_rate": 0.0001879520627280563, + "loss": 1.3195, + "step": 1355 + }, + { + "epoch": 0.24300902349682837, + "grad_norm": 0.5889202356338501, + "learning_rate": 0.0001878032077355179, + "loss": 1.3201, + "step": 1360 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.543110191822052, + "learning_rate": 0.00018765349850605195, + "loss": 1.3083, + "step": 1365 + }, + { + "epoch": 0.2447958545519521, + "grad_norm": 0.580144464969635, + "learning_rate": 0.0001875029364961788, + "loss": 1.2999, + "step": 1370 + }, + { + "epoch": 0.245689270079514, + "grad_norm": 0.5746851563453674, + "learning_rate": 0.00018735152317071534, + "loss": 1.3282, + "step": 1375 + }, + { + "epoch": 0.24658268560707586, + "grad_norm": 0.5605974793434143, + "learning_rate": 0.00018719926000276106, + "loss": 1.3357, + "step": 1380 + }, + { + "epoch": 0.24747610113463772, + "grad_norm": 0.556448757648468, + "learning_rate": 0.0001870461484736834, + "loss": 1.3159, + "step": 1385 + }, + { + "epoch": 0.24836951666219959, + "grad_norm": 0.5685500502586365, + "learning_rate": 0.00018689219007310369, + "loss": 1.3422, + "step": 1390 + }, + { + "epoch": 0.24926293218976145, + "grad_norm": 0.5622739195823669, + "learning_rate": 0.00018673738629888226, + "loss": 1.3162, + "step": 1395 + }, + { + "epoch": 0.2501563477173233, + "grad_norm": 0.5816725492477417, + "learning_rate": 0.0001865817386571043, + "loss": 1.348, + "step": 1400 + }, + { + "epoch": 0.2510497632448852, + "grad_norm": 0.5116124153137207, + "learning_rate": 0.00018642524866206475, + "loss": 1.3196, + "step": 1405 + }, + { + "epoch": 0.25194317877244704, + "grad_norm": 0.5634516477584839, + "learning_rate": 0.000186267917836254, + "loss": 1.3117, + "step": 1410 + }, + { + "epoch": 0.25283659430000893, + "grad_norm": 0.5746950507164001, + "learning_rate": 0.00018610974771034275, + "loss": 1.316, + "step": 1415 + }, + { + "epoch": 0.2537300098275708, + "grad_norm": 0.5387247800827026, + "learning_rate": 0.00018595073982316732, + "loss": 1.3182, + "step": 1420 + }, + { + "epoch": 0.25462342535513266, + "grad_norm": 0.5491217374801636, + "learning_rate": 0.00018579089572171454, + "loss": 1.3214, + "step": 1425 + }, + { + "epoch": 0.25551684088269455, + "grad_norm": 0.5725581049919128, + "learning_rate": 0.00018563021696110682, + "loss": 1.3304, + "step": 1430 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.5422765016555786, + "learning_rate": 0.0001854687051045869, + "loss": 1.2947, + "step": 1435 + }, + { + "epoch": 0.2573036719378183, + "grad_norm": 0.5965689420700073, + "learning_rate": 0.00018530636172350287, + "loss": 1.2988, + "step": 1440 + }, + { + "epoch": 0.25819708746538017, + "grad_norm": 0.5194745659828186, + "learning_rate": 0.00018514318839729242, + "loss": 1.29, + "step": 1445 + }, + { + "epoch": 0.259090502992942, + "grad_norm": 0.5547758936882019, + "learning_rate": 0.00018497918671346808, + "loss": 1.3159, + "step": 1450 + }, + { + "epoch": 0.2599839185205039, + "grad_norm": 0.5594758987426758, + "learning_rate": 0.0001848143582676013, + "loss": 1.2949, + "step": 1455 + }, + { + "epoch": 0.26087733404806573, + "grad_norm": 0.5487962961196899, + "learning_rate": 0.0001846487046633071, + "loss": 1.2851, + "step": 1460 + }, + { + "epoch": 0.2617707495756276, + "grad_norm": 0.5708212852478027, + "learning_rate": 0.0001844822275122285, + "loss": 1.3092, + "step": 1465 + }, + { + "epoch": 0.2626641651031895, + "grad_norm": 0.5744941234588623, + "learning_rate": 0.00018431492843402084, + "loss": 1.32, + "step": 1470 + }, + { + "epoch": 0.26355758063075135, + "grad_norm": 0.5791029930114746, + "learning_rate": 0.00018414680905633586, + "loss": 1.3131, + "step": 1475 + }, + { + "epoch": 0.26445099615831325, + "grad_norm": 0.5608724355697632, + "learning_rate": 0.00018397787101480612, + "loss": 1.2954, + "step": 1480 + }, + { + "epoch": 0.2653444116858751, + "grad_norm": 0.5400808453559875, + "learning_rate": 0.0001838081159530289, + "loss": 1.2775, + "step": 1485 + }, + { + "epoch": 0.266237827213437, + "grad_norm": 0.5288774371147156, + "learning_rate": 0.0001836375455225502, + "loss": 1.3106, + "step": 1490 + }, + { + "epoch": 0.26713124274099886, + "grad_norm": 0.5479869842529297, + "learning_rate": 0.00018346616138284892, + "loss": 1.3533, + "step": 1495 + }, + { + "epoch": 0.2680246582685607, + "grad_norm": 0.5189379453659058, + "learning_rate": 0.0001832939652013203, + "loss": 1.3082, + "step": 1500 + }, + { + "epoch": 0.2689180737961226, + "grad_norm": 0.5478213429450989, + "learning_rate": 0.00018312095865326012, + "loss": 1.273, + "step": 1505 + }, + { + "epoch": 0.26981148932368443, + "grad_norm": 0.5622948408126831, + "learning_rate": 0.0001829471434218481, + "loss": 1.2966, + "step": 1510 + }, + { + "epoch": 0.2707049048512463, + "grad_norm": 0.5554372072219849, + "learning_rate": 0.00018277252119813176, + "loss": 1.274, + "step": 1515 + }, + { + "epoch": 0.2715983203788082, + "grad_norm": 0.5189542770385742, + "learning_rate": 0.00018259709368100962, + "loss": 1.2845, + "step": 1520 + }, + { + "epoch": 0.27249173590637005, + "grad_norm": 0.5948411226272583, + "learning_rate": 0.00018242086257721515, + "loss": 1.2779, + "step": 1525 + }, + { + "epoch": 0.27338515143393194, + "grad_norm": 0.5104305744171143, + "learning_rate": 0.00018224382960129972, + "loss": 1.2815, + "step": 1530 + }, + { + "epoch": 0.2742785669614938, + "grad_norm": 0.5303378701210022, + "learning_rate": 0.00018206599647561627, + "loss": 1.3096, + "step": 1535 + }, + { + "epoch": 0.27517198248905567, + "grad_norm": 0.5319021344184875, + "learning_rate": 0.00018188736493030222, + "loss": 1.2779, + "step": 1540 + }, + { + "epoch": 0.2760653980166175, + "grad_norm": 0.5153783559799194, + "learning_rate": 0.00018170793670326292, + "loss": 1.2889, + "step": 1545 + }, + { + "epoch": 0.2769588135441794, + "grad_norm": 0.5285100936889648, + "learning_rate": 0.0001815277135401546, + "loss": 1.2865, + "step": 1550 + }, + { + "epoch": 0.2778522290717413, + "grad_norm": 0.5145849585533142, + "learning_rate": 0.00018134669719436749, + "loss": 1.2755, + "step": 1555 + }, + { + "epoch": 0.2787456445993031, + "grad_norm": 0.541465163230896, + "learning_rate": 0.00018116488942700857, + "loss": 1.2888, + "step": 1560 + }, + { + "epoch": 0.279639060126865, + "grad_norm": 0.5526989698410034, + "learning_rate": 0.00018098229200688462, + "loss": 1.3123, + "step": 1565 + }, + { + "epoch": 0.28053247565442685, + "grad_norm": 0.5341644287109375, + "learning_rate": 0.00018079890671048493, + "loss": 1.2959, + "step": 1570 + }, + { + "epoch": 0.28142589118198874, + "grad_norm": 0.5949883460998535, + "learning_rate": 0.0001806147353219641, + "loss": 1.2757, + "step": 1575 + }, + { + "epoch": 0.28231930670955063, + "grad_norm": 0.5318828821182251, + "learning_rate": 0.00018042977963312451, + "loss": 1.319, + "step": 1580 + }, + { + "epoch": 0.28321272223711247, + "grad_norm": 0.514118492603302, + "learning_rate": 0.00018024404144339906, + "loss": 1.2924, + "step": 1585 + }, + { + "epoch": 0.28410613776467436, + "grad_norm": 0.5623351335525513, + "learning_rate": 0.00018005752255983355, + "loss": 1.2712, + "step": 1590 + }, + { + "epoch": 0.2849995532922362, + "grad_norm": 0.5316884517669678, + "learning_rate": 0.00017987022479706922, + "loss": 1.2994, + "step": 1595 + }, + { + "epoch": 0.2858929688197981, + "grad_norm": 0.5517479777336121, + "learning_rate": 0.000179682149977325, + "loss": 1.3081, + "step": 1600 + }, + { + "epoch": 0.28678638434736, + "grad_norm": 0.5413886904716492, + "learning_rate": 0.0001794932999303797, + "loss": 1.2911, + "step": 1605 + }, + { + "epoch": 0.2876797998749218, + "grad_norm": 0.5298170447349548, + "learning_rate": 0.00017930367649355447, + "loss": 1.304, + "step": 1610 + }, + { + "epoch": 0.2885732154024837, + "grad_norm": 0.6161289215087891, + "learning_rate": 0.00017911328151169466, + "loss": 1.3018, + "step": 1615 + }, + { + "epoch": 0.28946663093004554, + "grad_norm": 0.5742626190185547, + "learning_rate": 0.000178922116837152, + "loss": 1.2513, + "step": 1620 + }, + { + "epoch": 0.29036004645760743, + "grad_norm": 0.5081664323806763, + "learning_rate": 0.00017873018432976658, + "loss": 1.2688, + "step": 1625 + }, + { + "epoch": 0.2912534619851693, + "grad_norm": 0.550042450428009, + "learning_rate": 0.00017853748585684867, + "loss": 1.2895, + "step": 1630 + }, + { + "epoch": 0.29214687751273116, + "grad_norm": 0.5372338891029358, + "learning_rate": 0.0001783440232931607, + "loss": 1.2525, + "step": 1635 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 0.5075910687446594, + "learning_rate": 0.0001781497985208989, + "loss": 1.2852, + "step": 1640 + }, + { + "epoch": 0.2939337085678549, + "grad_norm": 0.4970715641975403, + "learning_rate": 0.00017795481342967501, + "loss": 1.2175, + "step": 1645 + }, + { + "epoch": 0.2948271240954168, + "grad_norm": 0.510547399520874, + "learning_rate": 0.0001777590699164979, + "loss": 1.3002, + "step": 1650 + }, + { + "epoch": 0.2957205396229787, + "grad_norm": 0.5301088094711304, + "learning_rate": 0.00017756256988575513, + "loss": 1.2885, + "step": 1655 + }, + { + "epoch": 0.2966139551505405, + "grad_norm": 0.5963689684867859, + "learning_rate": 0.00017736531524919445, + "loss": 1.3018, + "step": 1660 + }, + { + "epoch": 0.2975073706781024, + "grad_norm": 0.5285865068435669, + "learning_rate": 0.00017716730792590512, + "loss": 1.3158, + "step": 1665 + }, + { + "epoch": 0.29840078620566424, + "grad_norm": 0.5783317685127258, + "learning_rate": 0.00017696854984229933, + "loss": 1.299, + "step": 1670 + }, + { + "epoch": 0.29929420173322613, + "grad_norm": 0.5652763843536377, + "learning_rate": 0.00017676904293209336, + "loss": 1.303, + "step": 1675 + }, + { + "epoch": 0.300187617260788, + "grad_norm": 0.5288776755332947, + "learning_rate": 0.00017656878913628883, + "loss": 1.2839, + "step": 1680 + }, + { + "epoch": 0.30108103278834986, + "grad_norm": 0.5509206056594849, + "learning_rate": 0.0001763677904031539, + "loss": 1.2739, + "step": 1685 + }, + { + "epoch": 0.30197444831591175, + "grad_norm": 0.5611214637756348, + "learning_rate": 0.00017616604868820406, + "loss": 1.3148, + "step": 1690 + }, + { + "epoch": 0.3028678638434736, + "grad_norm": 0.5442464351654053, + "learning_rate": 0.0001759635659541834, + "loss": 1.2931, + "step": 1695 + }, + { + "epoch": 0.3037612793710355, + "grad_norm": 0.5243226885795593, + "learning_rate": 0.0001757603441710453, + "loss": 1.2174, + "step": 1700 + }, + { + "epoch": 0.3046546948985973, + "grad_norm": 0.5281797647476196, + "learning_rate": 0.0001755563853159334, + "loss": 1.239, + "step": 1705 + }, + { + "epoch": 0.3055481104261592, + "grad_norm": 0.5556020736694336, + "learning_rate": 0.00017535169137316227, + "loss": 1.2909, + "step": 1710 + }, + { + "epoch": 0.3064415259537211, + "grad_norm": 0.5657395124435425, + "learning_rate": 0.0001751462643341982, + "loss": 1.2534, + "step": 1715 + }, + { + "epoch": 0.30733494148128293, + "grad_norm": 0.5573301315307617, + "learning_rate": 0.0001749401061976397, + "loss": 1.2457, + "step": 1720 + }, + { + "epoch": 0.3082283570088448, + "grad_norm": 0.5255555510520935, + "learning_rate": 0.0001747332189691981, + "loss": 1.2453, + "step": 1725 + }, + { + "epoch": 0.30912177253640666, + "grad_norm": 0.5263189673423767, + "learning_rate": 0.00017452560466167818, + "loss": 1.2899, + "step": 1730 + }, + { + "epoch": 0.31001518806396855, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.00017431726529495837, + "loss": 1.2823, + "step": 1735 + }, + { + "epoch": 0.31090860359153044, + "grad_norm": 0.5130211114883423, + "learning_rate": 0.00017410820289597126, + "loss": 1.2815, + "step": 1740 + }, + { + "epoch": 0.3118020191190923, + "grad_norm": 0.4937721788883209, + "learning_rate": 0.00017389841949868378, + "loss": 1.2589, + "step": 1745 + }, + { + "epoch": 0.31269543464665417, + "grad_norm": 0.5342549681663513, + "learning_rate": 0.0001736879171440774, + "loss": 1.2599, + "step": 1750 + }, + { + "epoch": 0.313588850174216, + "grad_norm": 0.5212100148200989, + "learning_rate": 0.00017347669788012846, + "loss": 1.221, + "step": 1755 + }, + { + "epoch": 0.3144822657017779, + "grad_norm": 0.4915301203727722, + "learning_rate": 0.00017326476376178796, + "loss": 1.2569, + "step": 1760 + }, + { + "epoch": 0.3153756812293398, + "grad_norm": 0.5691230297088623, + "learning_rate": 0.00017305211685096178, + "loss": 1.2304, + "step": 1765 + }, + { + "epoch": 0.3162690967569016, + "grad_norm": 0.5791122317314148, + "learning_rate": 0.00017283875921649057, + "loss": 1.284, + "step": 1770 + }, + { + "epoch": 0.3171625122844635, + "grad_norm": 0.5447010397911072, + "learning_rate": 0.0001726246929341296, + "loss": 1.2642, + "step": 1775 + }, + { + "epoch": 0.31805592781202535, + "grad_norm": 0.49899354577064514, + "learning_rate": 0.00017240992008652855, + "loss": 1.2926, + "step": 1780 + }, + { + "epoch": 0.31894934333958724, + "grad_norm": 0.541107714176178, + "learning_rate": 0.00017219444276321127, + "loss": 1.2677, + "step": 1785 + }, + { + "epoch": 0.31984275886714914, + "grad_norm": 0.563939094543457, + "learning_rate": 0.00017197826306055547, + "loss": 1.2592, + "step": 1790 + }, + { + "epoch": 0.32073617439471097, + "grad_norm": 0.538004994392395, + "learning_rate": 0.0001717613830817723, + "loss": 1.2802, + "step": 1795 + }, + { + "epoch": 0.32162958992227286, + "grad_norm": 0.5084322690963745, + "learning_rate": 0.00017154380493688583, + "loss": 1.2718, + "step": 1800 + }, + { + "epoch": 0.3225230054498347, + "grad_norm": 0.5454220175743103, + "learning_rate": 0.00017132553074271272, + "loss": 1.2522, + "step": 1805 + }, + { + "epoch": 0.3234164209773966, + "grad_norm": 0.5236728191375732, + "learning_rate": 0.00017110656262284135, + "loss": 1.2325, + "step": 1810 + }, + { + "epoch": 0.3243098365049585, + "grad_norm": 0.5248743295669556, + "learning_rate": 0.0001708869027076114, + "loss": 1.2443, + "step": 1815 + }, + { + "epoch": 0.3252032520325203, + "grad_norm": 0.5419241786003113, + "learning_rate": 0.00017066655313409295, + "loss": 1.2359, + "step": 1820 + }, + { + "epoch": 0.3260966675600822, + "grad_norm": 0.5280671715736389, + "learning_rate": 0.00017044551604606585, + "loss": 1.2639, + "step": 1825 + }, + { + "epoch": 0.32699008308764405, + "grad_norm": 0.5182398557662964, + "learning_rate": 0.00017022379359399868, + "loss": 1.2303, + "step": 1830 + }, + { + "epoch": 0.32788349861520594, + "grad_norm": 0.5220509171485901, + "learning_rate": 0.00017000138793502796, + "loss": 1.2605, + "step": 1835 + }, + { + "epoch": 0.32877691414276783, + "grad_norm": 0.5644056797027588, + "learning_rate": 0.00016977830123293713, + "loss": 1.2321, + "step": 1840 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 0.5633097887039185, + "learning_rate": 0.00016955453565813548, + "loss": 1.2882, + "step": 1845 + }, + { + "epoch": 0.33056374519789156, + "grad_norm": 0.5087893605232239, + "learning_rate": 0.0001693300933876371, + "loss": 1.2588, + "step": 1850 + }, + { + "epoch": 0.3314571607254534, + "grad_norm": 0.5339887738227844, + "learning_rate": 0.00016910497660503957, + "loss": 1.2356, + "step": 1855 + }, + { + "epoch": 0.3323505762530153, + "grad_norm": 0.5123136043548584, + "learning_rate": 0.00016887918750050292, + "loss": 1.2672, + "step": 1860 + }, + { + "epoch": 0.3332439917805771, + "grad_norm": 0.5174533128738403, + "learning_rate": 0.00016865272827072797, + "loss": 1.2506, + "step": 1865 + }, + { + "epoch": 0.334137407308139, + "grad_norm": 0.4912176728248596, + "learning_rate": 0.00016842560111893543, + "loss": 1.2689, + "step": 1870 + }, + { + "epoch": 0.3350308228357009, + "grad_norm": 0.5270506739616394, + "learning_rate": 0.00016819780825484413, + "loss": 1.2343, + "step": 1875 + }, + { + "epoch": 0.33592423836326274, + "grad_norm": 0.568572461605072, + "learning_rate": 0.00016796935189464956, + "loss": 1.2454, + "step": 1880 + }, + { + "epoch": 0.33681765389082463, + "grad_norm": 0.5541931390762329, + "learning_rate": 0.00016774023426100238, + "loss": 1.2434, + "step": 1885 + }, + { + "epoch": 0.33771106941838647, + "grad_norm": 0.5010518431663513, + "learning_rate": 0.0001675104575829868, + "loss": 1.2402, + "step": 1890 + }, + { + "epoch": 0.33860448494594836, + "grad_norm": 0.5561944246292114, + "learning_rate": 0.0001672800240960989, + "loss": 1.2616, + "step": 1895 + }, + { + "epoch": 0.33949790047351025, + "grad_norm": 0.5340306758880615, + "learning_rate": 0.00016704893604222476, + "loss": 1.2949, + "step": 1900 + }, + { + "epoch": 0.3403913160010721, + "grad_norm": 0.5372459888458252, + "learning_rate": 0.00016681719566961887, + "loss": 1.2721, + "step": 1905 + }, + { + "epoch": 0.341284731528634, + "grad_norm": 0.5558496117591858, + "learning_rate": 0.00016658480523288195, + "loss": 1.2146, + "step": 1910 + }, + { + "epoch": 0.3421781470561958, + "grad_norm": 0.523130476474762, + "learning_rate": 0.00016635176699293934, + "loss": 1.2607, + "step": 1915 + }, + { + "epoch": 0.3430715625837577, + "grad_norm": 0.5020752549171448, + "learning_rate": 0.00016611808321701882, + "loss": 1.2572, + "step": 1920 + }, + { + "epoch": 0.3439649781113196, + "grad_norm": 0.52286297082901, + "learning_rate": 0.00016588375617862858, + "loss": 1.2249, + "step": 1925 + }, + { + "epoch": 0.34485839363888143, + "grad_norm": 0.5461264848709106, + "learning_rate": 0.000165648788157535, + "loss": 1.2896, + "step": 1930 + }, + { + "epoch": 0.3457518091664433, + "grad_norm": 0.5276172161102295, + "learning_rate": 0.00016541318143974075, + "loss": 1.263, + "step": 1935 + }, + { + "epoch": 0.34664522469400516, + "grad_norm": 0.5360847115516663, + "learning_rate": 0.00016517693831746225, + "loss": 1.2618, + "step": 1940 + }, + { + "epoch": 0.34753864022156705, + "grad_norm": 0.5737685561180115, + "learning_rate": 0.00016494006108910757, + "loss": 1.2572, + "step": 1945 + }, + { + "epoch": 0.34843205574912894, + "grad_norm": 0.4999217987060547, + "learning_rate": 0.000164702552059254, + "loss": 1.2388, + "step": 1950 + }, + { + "epoch": 0.3493254712766908, + "grad_norm": 0.5562167763710022, + "learning_rate": 0.00016446441353862556, + "loss": 1.3043, + "step": 1955 + }, + { + "epoch": 0.35021888680425267, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0001642256478440706, + "loss": 1.2583, + "step": 1960 + }, + { + "epoch": 0.3511123023318145, + "grad_norm": 0.5292645692825317, + "learning_rate": 0.00016398625729853924, + "loss": 1.2556, + "step": 1965 + }, + { + "epoch": 0.3520057178593764, + "grad_norm": 0.5102574229240417, + "learning_rate": 0.00016374624423106087, + "loss": 1.2536, + "step": 1970 + }, + { + "epoch": 0.3528991333869383, + "grad_norm": 0.5271995067596436, + "learning_rate": 0.00016350561097672122, + "loss": 1.2147, + "step": 1975 + }, + { + "epoch": 0.3537925489145001, + "grad_norm": 0.5228263735771179, + "learning_rate": 0.00016326435987663995, + "loss": 1.247, + "step": 1980 + }, + { + "epoch": 0.354685964442062, + "grad_norm": 0.5432056188583374, + "learning_rate": 0.0001630224932779477, + "loss": 1.2328, + "step": 1985 + }, + { + "epoch": 0.35557937996962385, + "grad_norm": 0.5798959732055664, + "learning_rate": 0.00016278001353376323, + "loss": 1.2105, + "step": 1990 + }, + { + "epoch": 0.35647279549718575, + "grad_norm": 0.5255203247070312, + "learning_rate": 0.0001625369230031707, + "loss": 1.2412, + "step": 1995 + }, + { + "epoch": 0.35736621102474764, + "grad_norm": 0.5129863619804382, + "learning_rate": 0.00016229322405119655, + "loss": 1.2291, + "step": 2000 + }, + { + "epoch": 0.3582596265523095, + "grad_norm": 0.5471330881118774, + "learning_rate": 0.00016204891904878657, + "loss": 1.2397, + "step": 2005 + }, + { + "epoch": 0.35915304207987137, + "grad_norm": 0.4979896545410156, + "learning_rate": 0.0001618040103727827, + "loss": 1.2312, + "step": 2010 + }, + { + "epoch": 0.3600464576074332, + "grad_norm": 0.5342873930931091, + "learning_rate": 0.00016155850040590016, + "loss": 1.2454, + "step": 2015 + }, + { + "epoch": 0.3609398731349951, + "grad_norm": 0.5229947566986084, + "learning_rate": 0.0001613123915367041, + "loss": 1.2417, + "step": 2020 + }, + { + "epoch": 0.36183328866255693, + "grad_norm": 0.5110452175140381, + "learning_rate": 0.00016106568615958632, + "loss": 1.1992, + "step": 2025 + }, + { + "epoch": 0.3627267041901188, + "grad_norm": 0.5229007601737976, + "learning_rate": 0.00016081838667474213, + "loss": 1.2174, + "step": 2030 + }, + { + "epoch": 0.3636201197176807, + "grad_norm": 0.5256268978118896, + "learning_rate": 0.0001605704954881468, + "loss": 1.2365, + "step": 2035 + }, + { + "epoch": 0.36451353524524255, + "grad_norm": 0.5547985434532166, + "learning_rate": 0.00016032201501153242, + "loss": 1.2282, + "step": 2040 + }, + { + "epoch": 0.36540695077280444, + "grad_norm": 0.5028260946273804, + "learning_rate": 0.00016007294766236406, + "loss": 1.2256, + "step": 2045 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 0.48005393147468567, + "learning_rate": 0.00015982329586381675, + "loss": 1.2313, + "step": 2050 + }, + { + "epoch": 0.36719378182792817, + "grad_norm": 0.48138970136642456, + "learning_rate": 0.00015957306204475132, + "loss": 1.2547, + "step": 2055 + }, + { + "epoch": 0.36808719735549006, + "grad_norm": 0.5655422210693359, + "learning_rate": 0.00015932224863969135, + "loss": 1.2304, + "step": 2060 + }, + { + "epoch": 0.3689806128830519, + "grad_norm": 0.514876127243042, + "learning_rate": 0.000159070858088799, + "loss": 1.1753, + "step": 2065 + }, + { + "epoch": 0.3698740284106138, + "grad_norm": 0.5176100134849548, + "learning_rate": 0.0001588188928378516, + "loss": 1.2343, + "step": 2070 + }, + { + "epoch": 0.3707674439381756, + "grad_norm": 0.5761798024177551, + "learning_rate": 0.00015856635533821774, + "loss": 1.2001, + "step": 2075 + }, + { + "epoch": 0.3716608594657375, + "grad_norm": 0.5333663821220398, + "learning_rate": 0.00015831324804683328, + "loss": 1.2414, + "step": 2080 + }, + { + "epoch": 0.3725542749932994, + "grad_norm": 0.5275282859802246, + "learning_rate": 0.0001580595734261777, + "loss": 1.1956, + "step": 2085 + }, + { + "epoch": 0.37344769052086124, + "grad_norm": 0.5324888825416565, + "learning_rate": 0.00015780533394425006, + "loss": 1.2218, + "step": 2090 + }, + { + "epoch": 0.37434110604842313, + "grad_norm": 0.5113807916641235, + "learning_rate": 0.00015755053207454483, + "loss": 1.2431, + "step": 2095 + }, + { + "epoch": 0.37523452157598497, + "grad_norm": 0.5269024968147278, + "learning_rate": 0.00015729517029602802, + "loss": 1.2333, + "step": 2100 + }, + { + "epoch": 0.37612793710354686, + "grad_norm": 0.538032054901123, + "learning_rate": 0.00015703925109311295, + "loss": 1.2178, + "step": 2105 + }, + { + "epoch": 0.37702135263110875, + "grad_norm": 0.5196306109428406, + "learning_rate": 0.00015678277695563617, + "loss": 1.2357, + "step": 2110 + }, + { + "epoch": 0.3779147681586706, + "grad_norm": 0.543264627456665, + "learning_rate": 0.00015652575037883318, + "loss": 1.1962, + "step": 2115 + }, + { + "epoch": 0.3788081836862325, + "grad_norm": 0.5403855443000793, + "learning_rate": 0.0001562681738633141, + "loss": 1.2599, + "step": 2120 + }, + { + "epoch": 0.3797015992137943, + "grad_norm": 0.5239453911781311, + "learning_rate": 0.00015601004991503946, + "loss": 1.2144, + "step": 2125 + }, + { + "epoch": 0.3805950147413562, + "grad_norm": 0.5659239292144775, + "learning_rate": 0.00015575138104529577, + "loss": 1.201, + "step": 2130 + }, + { + "epoch": 0.3814884302689181, + "grad_norm": 0.4925094246864319, + "learning_rate": 0.00015549216977067099, + "loss": 1.2219, + "step": 2135 + }, + { + "epoch": 0.38238184579647994, + "grad_norm": 0.4957289695739746, + "learning_rate": 0.0001552324186130302, + "loss": 1.2577, + "step": 2140 + }, + { + "epoch": 0.3832752613240418, + "grad_norm": 0.5230002999305725, + "learning_rate": 0.00015497213009949104, + "loss": 1.2017, + "step": 2145 + }, + { + "epoch": 0.38416867685160366, + "grad_norm": 0.5107108354568481, + "learning_rate": 0.000154711306762399, + "loss": 1.2115, + "step": 2150 + }, + { + "epoch": 0.38506209237916555, + "grad_norm": 0.5467280745506287, + "learning_rate": 0.00015444995113930287, + "loss": 1.2443, + "step": 2155 + }, + { + "epoch": 0.38595550790672745, + "grad_norm": 0.5325524806976318, + "learning_rate": 0.00015418806577293013, + "loss": 1.2103, + "step": 2160 + }, + { + "epoch": 0.3868489234342893, + "grad_norm": 0.5066697001457214, + "learning_rate": 0.00015392565321116207, + "loss": 1.2157, + "step": 2165 + }, + { + "epoch": 0.3877423389618512, + "grad_norm": 0.5199021100997925, + "learning_rate": 0.00015366271600700902, + "loss": 1.2284, + "step": 2170 + }, + { + "epoch": 0.388635754489413, + "grad_norm": 0.557767391204834, + "learning_rate": 0.00015339925671858563, + "loss": 1.2144, + "step": 2175 + }, + { + "epoch": 0.3895291700169749, + "grad_norm": 0.5319749116897583, + "learning_rate": 0.0001531352779090859, + "loss": 1.1939, + "step": 2180 + }, + { + "epoch": 0.39042258554453674, + "grad_norm": 0.5209249258041382, + "learning_rate": 0.00015287078214675819, + "loss": 1.2352, + "step": 2185 + }, + { + "epoch": 0.39131600107209863, + "grad_norm": 0.5308728218078613, + "learning_rate": 0.00015260577200488034, + "loss": 1.232, + "step": 2190 + }, + { + "epoch": 0.3922094165996605, + "grad_norm": 0.5475233793258667, + "learning_rate": 0.00015234025006173452, + "loss": 1.2255, + "step": 2195 + }, + { + "epoch": 0.39310283212722236, + "grad_norm": 0.5130513310432434, + "learning_rate": 0.00015207421890058237, + "loss": 1.2169, + "step": 2200 + }, + { + "epoch": 0.39399624765478425, + "grad_norm": 0.5458916425704956, + "learning_rate": 0.0001518076811096395, + "loss": 1.206, + "step": 2205 + }, + { + "epoch": 0.3948896631823461, + "grad_norm": 0.4863106608390808, + "learning_rate": 0.00015154063928205067, + "loss": 1.1518, + "step": 2210 + }, + { + "epoch": 0.395783078709908, + "grad_norm": 0.494357168674469, + "learning_rate": 0.00015127309601586434, + "loss": 1.1735, + "step": 2215 + }, + { + "epoch": 0.39667649423746987, + "grad_norm": 0.5145966410636902, + "learning_rate": 0.0001510050539140075, + "loss": 1.2215, + "step": 2220 + }, + { + "epoch": 0.3975699097650317, + "grad_norm": 0.5166826844215393, + "learning_rate": 0.00015073651558426026, + "loss": 1.2375, + "step": 2225 + }, + { + "epoch": 0.3984633252925936, + "grad_norm": 0.5195355415344238, + "learning_rate": 0.00015046748363923057, + "loss": 1.2458, + "step": 2230 + }, + { + "epoch": 0.39935674082015543, + "grad_norm": 0.5170833468437195, + "learning_rate": 0.00015019796069632878, + "loss": 1.2356, + "step": 2235 + }, + { + "epoch": 0.4002501563477173, + "grad_norm": 0.5438719987869263, + "learning_rate": 0.00014992794937774211, + "loss": 1.2214, + "step": 2240 + }, + { + "epoch": 0.4011435718752792, + "grad_norm": 0.510884165763855, + "learning_rate": 0.00014965745231040919, + "loss": 1.2497, + "step": 2245 + }, + { + "epoch": 0.40203698740284105, + "grad_norm": 0.5298115611076355, + "learning_rate": 0.00014938647212599452, + "loss": 1.2133, + "step": 2250 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 0.5149961709976196, + "learning_rate": 0.00014911501146086281, + "loss": 1.217, + "step": 2255 + }, + { + "epoch": 0.4038238184579648, + "grad_norm": 0.49967697262763977, + "learning_rate": 0.00014884307295605343, + "loss": 1.2235, + "step": 2260 + }, + { + "epoch": 0.40471723398552667, + "grad_norm": 0.5198853015899658, + "learning_rate": 0.00014857065925725452, + "loss": 1.197, + "step": 2265 + }, + { + "epoch": 0.40561064951308856, + "grad_norm": 0.5316833853721619, + "learning_rate": 0.0001482977730147776, + "loss": 1.2051, + "step": 2270 + }, + { + "epoch": 0.4065040650406504, + "grad_norm": 0.5476366877555847, + "learning_rate": 0.00014802441688353127, + "loss": 1.2368, + "step": 2275 + }, + { + "epoch": 0.4073974805682123, + "grad_norm": 0.523506224155426, + "learning_rate": 0.00014775059352299598, + "loss": 1.1973, + "step": 2280 + }, + { + "epoch": 0.4082908960957741, + "grad_norm": 0.5155140161514282, + "learning_rate": 0.00014747630559719762, + "loss": 1.2026, + "step": 2285 + }, + { + "epoch": 0.409184311623336, + "grad_norm": 0.5298252701759338, + "learning_rate": 0.00014720155577468193, + "loss": 1.1871, + "step": 2290 + }, + { + "epoch": 0.4100777271508979, + "grad_norm": 0.5185467004776001, + "learning_rate": 0.00014692634672848847, + "loss": 1.208, + "step": 2295 + }, + { + "epoch": 0.41097114267845974, + "grad_norm": 0.4993501603603363, + "learning_rate": 0.00014665068113612449, + "loss": 1.213, + "step": 2300 + }, + { + "epoch": 0.41186455820602164, + "grad_norm": 0.5336927175521851, + "learning_rate": 0.00014637456167953907, + "loss": 1.2232, + "step": 2305 + }, + { + "epoch": 0.41275797373358347, + "grad_norm": 0.4852610230445862, + "learning_rate": 0.00014609799104509685, + "loss": 1.2055, + "step": 2310 + }, + { + "epoch": 0.41365138926114536, + "grad_norm": 0.5042843222618103, + "learning_rate": 0.00014582097192355207, + "loss": 1.1969, + "step": 2315 + }, + { + "epoch": 0.41454480478870726, + "grad_norm": 0.5343486070632935, + "learning_rate": 0.00014554350701002222, + "loss": 1.1814, + "step": 2320 + }, + { + "epoch": 0.4154382203162691, + "grad_norm": 0.5457481741905212, + "learning_rate": 0.00014526559900396188, + "loss": 1.1891, + "step": 2325 + }, + { + "epoch": 0.416331635843831, + "grad_norm": 0.5025014281272888, + "learning_rate": 0.00014498725060913662, + "loss": 1.2334, + "step": 2330 + }, + { + "epoch": 0.4172250513713928, + "grad_norm": 0.5562976002693176, + "learning_rate": 0.00014470846453359636, + "loss": 1.1992, + "step": 2335 + }, + { + "epoch": 0.4181184668989547, + "grad_norm": 0.5212603807449341, + "learning_rate": 0.00014442924348964938, + "loss": 1.2133, + "step": 2340 + }, + { + "epoch": 0.41901188242651655, + "grad_norm": 0.49604693055152893, + "learning_rate": 0.00014414959019383564, + "loss": 1.2171, + "step": 2345 + }, + { + "epoch": 0.41990529795407844, + "grad_norm": 0.5236498713493347, + "learning_rate": 0.00014386950736690053, + "loss": 1.2163, + "step": 2350 + }, + { + "epoch": 0.42079871348164033, + "grad_norm": 0.5673661828041077, + "learning_rate": 0.00014358899773376832, + "loss": 1.1914, + "step": 2355 + }, + { + "epoch": 0.42169212900920217, + "grad_norm": 0.49621349573135376, + "learning_rate": 0.00014330806402351574, + "loss": 1.188, + "step": 2360 + }, + { + "epoch": 0.42258554453676406, + "grad_norm": 0.5138354897499084, + "learning_rate": 0.00014302670896934532, + "loss": 1.2169, + "step": 2365 + }, + { + "epoch": 0.4234789600643259, + "grad_norm": 0.5443947315216064, + "learning_rate": 0.00014274493530855878, + "loss": 1.2049, + "step": 2370 + }, + { + "epoch": 0.4243723755918878, + "grad_norm": 0.5496159195899963, + "learning_rate": 0.00014246274578253059, + "loss": 1.2084, + "step": 2375 + }, + { + "epoch": 0.4252657911194497, + "grad_norm": 0.5288496017456055, + "learning_rate": 0.00014218014313668105, + "loss": 1.2225, + "step": 2380 + }, + { + "epoch": 0.4261592066470115, + "grad_norm": 0.5264514088630676, + "learning_rate": 0.00014189713012044977, + "loss": 1.2164, + "step": 2385 + }, + { + "epoch": 0.4270526221745734, + "grad_norm": 0.5251997113227844, + "learning_rate": 0.00014161370948726894, + "loss": 1.1923, + "step": 2390 + }, + { + "epoch": 0.42794603770213524, + "grad_norm": 0.5069969296455383, + "learning_rate": 0.00014132988399453617, + "loss": 1.2421, + "step": 2395 + }, + { + "epoch": 0.42883945322969713, + "grad_norm": 0.5222852826118469, + "learning_rate": 0.00014104565640358824, + "loss": 1.1921, + "step": 2400 + }, + { + "epoch": 0.429732868757259, + "grad_norm": 0.5466008186340332, + "learning_rate": 0.0001407610294796738, + "loss": 1.1934, + "step": 2405 + }, + { + "epoch": 0.43062628428482086, + "grad_norm": 0.5061824917793274, + "learning_rate": 0.00014047600599192666, + "loss": 1.1836, + "step": 2410 + }, + { + "epoch": 0.43151969981238275, + "grad_norm": 0.5115388631820679, + "learning_rate": 0.0001401905887133387, + "loss": 1.1785, + "step": 2415 + }, + { + "epoch": 0.4324131153399446, + "grad_norm": 0.5375398397445679, + "learning_rate": 0.00013990478042073313, + "loss": 1.1939, + "step": 2420 + }, + { + "epoch": 0.4333065308675065, + "grad_norm": 0.510578989982605, + "learning_rate": 0.00013961858389473727, + "loss": 1.1948, + "step": 2425 + }, + { + "epoch": 0.43419994639506837, + "grad_norm": 0.526443362236023, + "learning_rate": 0.0001393320019197555, + "loss": 1.1978, + "step": 2430 + }, + { + "epoch": 0.4350933619226302, + "grad_norm": 0.4990314841270447, + "learning_rate": 0.00013904503728394234, + "loss": 1.2009, + "step": 2435 + }, + { + "epoch": 0.4359867774501921, + "grad_norm": 0.5471546053886414, + "learning_rate": 0.00013875769277917513, + "loss": 1.1962, + "step": 2440 + }, + { + "epoch": 0.43688019297775393, + "grad_norm": 0.4955877959728241, + "learning_rate": 0.000138469971201027, + "loss": 1.1761, + "step": 2445 + }, + { + "epoch": 0.4377736085053158, + "grad_norm": 0.5647017359733582, + "learning_rate": 0.00013818187534873954, + "loss": 1.2009, + "step": 2450 + }, + { + "epoch": 0.4386670240328777, + "grad_norm": 0.5101616978645325, + "learning_rate": 0.00013789340802519581, + "loss": 1.2066, + "step": 2455 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 0.5000629425048828, + "learning_rate": 0.0001376045720368928, + "loss": 1.2066, + "step": 2460 + }, + { + "epoch": 0.44045385508800144, + "grad_norm": 0.5091837048530579, + "learning_rate": 0.00013731537019391428, + "loss": 1.152, + "step": 2465 + }, + { + "epoch": 0.4413472706155633, + "grad_norm": 0.5215714573860168, + "learning_rate": 0.00013702580530990335, + "loss": 1.202, + "step": 2470 + }, + { + "epoch": 0.4422406861431252, + "grad_norm": 0.5080455541610718, + "learning_rate": 0.00013673588020203517, + "loss": 1.1798, + "step": 2475 + }, + { + "epoch": 0.44313410167068706, + "grad_norm": 0.5180327892303467, + "learning_rate": 0.0001364455976909896, + "loss": 1.1654, + "step": 2480 + }, + { + "epoch": 0.4440275171982489, + "grad_norm": 0.5066566467285156, + "learning_rate": 0.00013615496060092355, + "loss": 1.1768, + "step": 2485 + }, + { + "epoch": 0.4449209327258108, + "grad_norm": 0.5126291513442993, + "learning_rate": 0.00013586397175944368, + "loss": 1.2028, + "step": 2490 + }, + { + "epoch": 0.4458143482533726, + "grad_norm": 0.5379718542098999, + "learning_rate": 0.0001355726339975788, + "loss": 1.2208, + "step": 2495 + }, + { + "epoch": 0.4467077637809345, + "grad_norm": 0.5048497915267944, + "learning_rate": 0.00013528095014975252, + "loss": 1.1812, + "step": 2500 + }, + { + "epoch": 0.44760117930849636, + "grad_norm": 0.5384891033172607, + "learning_rate": 0.0001349889230537553, + "loss": 1.2319, + "step": 2505 + }, + { + "epoch": 0.44849459483605825, + "grad_norm": 0.541824221611023, + "learning_rate": 0.00013469655555071715, + "loss": 1.1513, + "step": 2510 + }, + { + "epoch": 0.44938801036362014, + "grad_norm": 0.5238659977912903, + "learning_rate": 0.00013440385048507997, + "loss": 1.209, + "step": 2515 + }, + { + "epoch": 0.450281425891182, + "grad_norm": 0.529753565788269, + "learning_rate": 0.0001341108107045697, + "loss": 1.1841, + "step": 2520 + }, + { + "epoch": 0.45117484141874387, + "grad_norm": 0.5439874529838562, + "learning_rate": 0.00013381743906016878, + "loss": 1.1729, + "step": 2525 + }, + { + "epoch": 0.4520682569463057, + "grad_norm": 0.5199108719825745, + "learning_rate": 0.00013352373840608834, + "loss": 1.226, + "step": 2530 + }, + { + "epoch": 0.4529616724738676, + "grad_norm": 0.529483437538147, + "learning_rate": 0.00013322971159974043, + "loss": 1.172, + "step": 2535 + }, + { + "epoch": 0.4538550880014295, + "grad_norm": 0.5473654270172119, + "learning_rate": 0.0001329353615017102, + "loss": 1.1524, + "step": 2540 + }, + { + "epoch": 0.4547485035289913, + "grad_norm": 0.5226883292198181, + "learning_rate": 0.00013264069097572816, + "loss": 1.1901, + "step": 2545 + }, + { + "epoch": 0.4556419190565532, + "grad_norm": 0.49766266345977783, + "learning_rate": 0.00013234570288864228, + "loss": 1.1731, + "step": 2550 + }, + { + "epoch": 0.45653533458411505, + "grad_norm": 0.5254089832305908, + "learning_rate": 0.00013205040011039004, + "loss": 1.1752, + "step": 2555 + }, + { + "epoch": 0.45742875011167694, + "grad_norm": 0.4883306324481964, + "learning_rate": 0.0001317547855139705, + "loss": 1.1997, + "step": 2560 + }, + { + "epoch": 0.45832216563923883, + "grad_norm": 0.48100265860557556, + "learning_rate": 0.00013145886197541651, + "loss": 1.1615, + "step": 2565 + }, + { + "epoch": 0.45921558116680067, + "grad_norm": 0.4742489159107208, + "learning_rate": 0.0001311626323737665, + "loss": 1.2034, + "step": 2570 + }, + { + "epoch": 0.46010899669436256, + "grad_norm": 0.5177445411682129, + "learning_rate": 0.00013086609959103672, + "loss": 1.1795, + "step": 2575 + }, + { + "epoch": 0.4610024122219244, + "grad_norm": 0.548886775970459, + "learning_rate": 0.00013056926651219293, + "loss": 1.2038, + "step": 2580 + }, + { + "epoch": 0.4618958277494863, + "grad_norm": 0.5247954726219177, + "learning_rate": 0.00013027213602512258, + "loss": 1.1983, + "step": 2585 + }, + { + "epoch": 0.4627892432770482, + "grad_norm": 0.513616681098938, + "learning_rate": 0.00012997471102060647, + "loss": 1.2036, + "step": 2590 + }, + { + "epoch": 0.46368265880461, + "grad_norm": 0.5180881023406982, + "learning_rate": 0.00012967699439229093, + "loss": 1.1869, + "step": 2595 + }, + { + "epoch": 0.4645760743321719, + "grad_norm": 0.513308048248291, + "learning_rate": 0.00012937898903665935, + "loss": 1.1945, + "step": 2600 + }, + { + "epoch": 0.46546948985973374, + "grad_norm": 0.5470111966133118, + "learning_rate": 0.0001290806978530042, + "loss": 1.184, + "step": 2605 + }, + { + "epoch": 0.46636290538729563, + "grad_norm": 0.5049375891685486, + "learning_rate": 0.00012878212374339883, + "loss": 1.1989, + "step": 2610 + }, + { + "epoch": 0.4672563209148575, + "grad_norm": 0.5457560420036316, + "learning_rate": 0.0001284832696126691, + "loss": 1.183, + "step": 2615 + }, + { + "epoch": 0.46814973644241936, + "grad_norm": 0.5425926446914673, + "learning_rate": 0.00012818413836836515, + "loss": 1.1818, + "step": 2620 + }, + { + "epoch": 0.46904315196998125, + "grad_norm": 0.5093770027160645, + "learning_rate": 0.00012788473292073328, + "loss": 1.1977, + "step": 2625 + }, + { + "epoch": 0.4699365674975431, + "grad_norm": 0.47960811853408813, + "learning_rate": 0.00012758505618268743, + "loss": 1.1948, + "step": 2630 + }, + { + "epoch": 0.470829983025105, + "grad_norm": 0.5090500116348267, + "learning_rate": 0.000127285111069781, + "loss": 1.1457, + "step": 2635 + }, + { + "epoch": 0.4717233985526669, + "grad_norm": 0.5354019403457642, + "learning_rate": 0.00012698490050017824, + "loss": 1.184, + "step": 2640 + }, + { + "epoch": 0.4726168140802287, + "grad_norm": 0.46892282366752625, + "learning_rate": 0.0001266844273946262, + "loss": 1.1394, + "step": 2645 + }, + { + "epoch": 0.4735102296077906, + "grad_norm": 0.5120964646339417, + "learning_rate": 0.0001263836946764261, + "loss": 1.1829, + "step": 2650 + }, + { + "epoch": 0.47440364513535244, + "grad_norm": 0.5253159403800964, + "learning_rate": 0.0001260827052714049, + "loss": 1.1393, + "step": 2655 + }, + { + "epoch": 0.47529706066291433, + "grad_norm": 0.49876195192337036, + "learning_rate": 0.00012578146210788686, + "loss": 1.1603, + "step": 2660 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.542789876461029, + "learning_rate": 0.0001254799681166651, + "loss": 1.197, + "step": 2665 + }, + { + "epoch": 0.47708389171803806, + "grad_norm": 0.5235465168952942, + "learning_rate": 0.00012517822623097296, + "loss": 1.1878, + "step": 2670 + }, + { + "epoch": 0.47797730724559995, + "grad_norm": 0.4984067678451538, + "learning_rate": 0.0001248762393864556, + "loss": 1.1687, + "step": 2675 + }, + { + "epoch": 0.4788707227731618, + "grad_norm": 0.4617389142513275, + "learning_rate": 0.0001245740105211414, + "loss": 1.1608, + "step": 2680 + }, + { + "epoch": 0.4797641383007237, + "grad_norm": 0.5108579993247986, + "learning_rate": 0.00012427154257541333, + "loss": 1.1825, + "step": 2685 + }, + { + "epoch": 0.4806575538282855, + "grad_norm": 0.535814642906189, + "learning_rate": 0.0001239688384919804, + "loss": 1.1533, + "step": 2690 + }, + { + "epoch": 0.4815509693558474, + "grad_norm": 0.5493077635765076, + "learning_rate": 0.00012366590121584895, + "loss": 1.1494, + "step": 2695 + }, + { + "epoch": 0.4824443848834093, + "grad_norm": 0.5428511500358582, + "learning_rate": 0.0001233627336942941, + "loss": 1.1999, + "step": 2700 + }, + { + "epoch": 0.48333780041097113, + "grad_norm": 0.4960634708404541, + "learning_rate": 0.00012305933887683102, + "loss": 1.1713, + "step": 2705 + }, + { + "epoch": 0.484231215938533, + "grad_norm": 0.49134474992752075, + "learning_rate": 0.00012275571971518616, + "loss": 1.1895, + "step": 2710 + }, + { + "epoch": 0.48512463146609486, + "grad_norm": 0.5415772795677185, + "learning_rate": 0.00012245187916326878, + "loss": 1.1975, + "step": 2715 + }, + { + "epoch": 0.48601804699365675, + "grad_norm": 0.5128939747810364, + "learning_rate": 0.00012214782017714185, + "loss": 1.1426, + "step": 2720 + }, + { + "epoch": 0.48691146252121864, + "grad_norm": 0.5035459399223328, + "learning_rate": 0.00012184354571499365, + "loss": 1.1667, + "step": 2725 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.5528464317321777, + "learning_rate": 0.00012153905873710878, + "loss": 1.1943, + "step": 2730 + }, + { + "epoch": 0.48869829357634237, + "grad_norm": 0.44781333208084106, + "learning_rate": 0.00012123436220583931, + "loss": 1.178, + "step": 2735 + }, + { + "epoch": 0.4895917091039042, + "grad_norm": 0.5722351670265198, + "learning_rate": 0.00012092945908557616, + "loss": 1.1797, + "step": 2740 + }, + { + "epoch": 0.4904851246314661, + "grad_norm": 0.5207979083061218, + "learning_rate": 0.00012062435234272007, + "loss": 1.1919, + "step": 2745 + }, + { + "epoch": 0.491378540159028, + "grad_norm": 0.5260438323020935, + "learning_rate": 0.00012031904494565296, + "loss": 1.1874, + "step": 2750 + }, + { + "epoch": 0.4922719556865898, + "grad_norm": 0.5548182725906372, + "learning_rate": 0.00012001353986470878, + "loss": 1.1995, + "step": 2755 + }, + { + "epoch": 0.4931653712141517, + "grad_norm": 0.5213192701339722, + "learning_rate": 0.00011970784007214477, + "loss": 1.1892, + "step": 2760 + }, + { + "epoch": 0.49405878674171355, + "grad_norm": 0.5301674604415894, + "learning_rate": 0.00011940194854211258, + "loss": 1.1679, + "step": 2765 + }, + { + "epoch": 0.49495220226927544, + "grad_norm": 0.5306346416473389, + "learning_rate": 0.00011909586825062917, + "loss": 1.1775, + "step": 2770 + }, + { + "epoch": 0.49584561779683733, + "grad_norm": 0.5511103272438049, + "learning_rate": 0.00011878960217554809, + "loss": 1.1754, + "step": 2775 + }, + { + "epoch": 0.49673903332439917, + "grad_norm": 0.5091948509216309, + "learning_rate": 0.00011848315329653028, + "loss": 1.1693, + "step": 2780 + }, + { + "epoch": 0.49763244885196106, + "grad_norm": 0.5281689167022705, + "learning_rate": 0.0001181765245950152, + "loss": 1.1817, + "step": 2785 + }, + { + "epoch": 0.4985258643795229, + "grad_norm": 0.48319005966186523, + "learning_rate": 0.00011786971905419179, + "loss": 1.1437, + "step": 2790 + }, + { + "epoch": 0.4994192799070848, + "grad_norm": 0.5014276504516602, + "learning_rate": 0.00011756273965896953, + "loss": 1.1665, + "step": 2795 + }, + { + "epoch": 0.5003126954346466, + "grad_norm": 0.5470197796821594, + "learning_rate": 0.00011725558939594924, + "loss": 1.1852, + "step": 2800 + }, + { + "epoch": 0.5012061109622086, + "grad_norm": 0.5165842771530151, + "learning_rate": 0.00011694827125339418, + "loss": 1.1756, + "step": 2805 + }, + { + "epoch": 0.5020995264897704, + "grad_norm": 0.5374709963798523, + "learning_rate": 0.00011664078822120084, + "loss": 1.1946, + "step": 2810 + }, + { + "epoch": 0.5029929420173322, + "grad_norm": 0.48660171031951904, + "learning_rate": 0.00011633314329086993, + "loss": 1.1606, + "step": 2815 + }, + { + "epoch": 0.5038863575448941, + "grad_norm": 0.541174590587616, + "learning_rate": 0.00011602533945547737, + "loss": 1.1393, + "step": 2820 + }, + { + "epoch": 0.504779773072456, + "grad_norm": 0.5074775815010071, + "learning_rate": 0.00011571737970964496, + "loss": 1.1719, + "step": 2825 + }, + { + "epoch": 0.5056731886000179, + "grad_norm": 0.49670732021331787, + "learning_rate": 0.00011540926704951136, + "loss": 1.1632, + "step": 2830 + }, + { + "epoch": 0.5065666041275797, + "grad_norm": 0.4699111580848694, + "learning_rate": 0.000115101004472703, + "loss": 1.137, + "step": 2835 + }, + { + "epoch": 0.5074600196551416, + "grad_norm": 0.4844904839992523, + "learning_rate": 0.00011479259497830472, + "loss": 1.1636, + "step": 2840 + }, + { + "epoch": 0.5083534351827035, + "grad_norm": 0.5194590091705322, + "learning_rate": 0.00011448404156683088, + "loss": 1.1817, + "step": 2845 + }, + { + "epoch": 0.5092468507102653, + "grad_norm": 0.49438104033470154, + "learning_rate": 0.00011417534724019592, + "loss": 1.1669, + "step": 2850 + }, + { + "epoch": 0.5101402662378273, + "grad_norm": 0.5134628415107727, + "learning_rate": 0.00011386651500168524, + "loss": 1.1938, + "step": 2855 + }, + { + "epoch": 0.5110336817653891, + "grad_norm": 0.5145233869552612, + "learning_rate": 0.00011355754785592596, + "loss": 1.1529, + "step": 2860 + }, + { + "epoch": 0.5119270972929509, + "grad_norm": 0.5291239619255066, + "learning_rate": 0.00011324844880885783, + "loss": 1.1737, + "step": 2865 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.5408050417900085, + "learning_rate": 0.00011293922086770375, + "loss": 1.148, + "step": 2870 + }, + { + "epoch": 0.5137139283480747, + "grad_norm": 0.622309684753418, + "learning_rate": 0.00011262986704094065, + "loss": 1.1984, + "step": 2875 + }, + { + "epoch": 0.5146073438756366, + "grad_norm": 0.5061530470848083, + "learning_rate": 0.00011232039033827025, + "loss": 1.1492, + "step": 2880 + }, + { + "epoch": 0.5155007594031984, + "grad_norm": 0.49988749623298645, + "learning_rate": 0.00011201079377058963, + "loss": 1.1392, + "step": 2885 + }, + { + "epoch": 0.5163941749307603, + "grad_norm": 0.5244172811508179, + "learning_rate": 0.00011170108034996218, + "loss": 1.1682, + "step": 2890 + }, + { + "epoch": 0.5172875904583222, + "grad_norm": 0.5669301748275757, + "learning_rate": 0.00011139125308958804, + "loss": 1.1655, + "step": 2895 + }, + { + "epoch": 0.518181005985884, + "grad_norm": 0.5045320987701416, + "learning_rate": 0.00011108131500377494, + "loss": 1.172, + "step": 2900 + }, + { + "epoch": 0.5190744215134458, + "grad_norm": 0.5101925730705261, + "learning_rate": 0.00011077126910790882, + "loss": 1.1566, + "step": 2905 + }, + { + "epoch": 0.5199678370410078, + "grad_norm": 0.5170836448669434, + "learning_rate": 0.0001104611184184245, + "loss": 1.1494, + "step": 2910 + }, + { + "epoch": 0.5208612525685696, + "grad_norm": 0.49804747104644775, + "learning_rate": 0.00011015086595277633, + "loss": 1.1615, + "step": 2915 + }, + { + "epoch": 0.5217546680961315, + "grad_norm": 0.494862824678421, + "learning_rate": 0.00010984051472940885, + "loss": 1.1393, + "step": 2920 + }, + { + "epoch": 0.5226480836236934, + "grad_norm": 0.5359513759613037, + "learning_rate": 0.00010953006776772747, + "loss": 1.1552, + "step": 2925 + }, + { + "epoch": 0.5235414991512553, + "grad_norm": 0.49183422327041626, + "learning_rate": 0.00010921952808806888, + "loss": 1.1521, + "step": 2930 + }, + { + "epoch": 0.5244349146788171, + "grad_norm": 0.5181066989898682, + "learning_rate": 0.00010890889871167203, + "loss": 1.1544, + "step": 2935 + }, + { + "epoch": 0.525328330206379, + "grad_norm": 0.5055237412452698, + "learning_rate": 0.00010859818266064835, + "loss": 1.1722, + "step": 2940 + }, + { + "epoch": 0.5262217457339409, + "grad_norm": 0.521922767162323, + "learning_rate": 0.00010828738295795262, + "loss": 1.1637, + "step": 2945 + }, + { + "epoch": 0.5271151612615027, + "grad_norm": 0.5190510153770447, + "learning_rate": 0.00010797650262735346, + "loss": 1.1692, + "step": 2950 + }, + { + "epoch": 0.5280085767890645, + "grad_norm": 0.5434553623199463, + "learning_rate": 0.00010766554469340386, + "loss": 1.1801, + "step": 2955 + }, + { + "epoch": 0.5289019923166265, + "grad_norm": 0.502286970615387, + "learning_rate": 0.00010735451218141191, + "loss": 1.1753, + "step": 2960 + }, + { + "epoch": 0.5297954078441883, + "grad_norm": 0.5568491220474243, + "learning_rate": 0.0001070434081174112, + "loss": 1.1603, + "step": 2965 + }, + { + "epoch": 0.5306888233717502, + "grad_norm": 0.470877468585968, + "learning_rate": 0.00010673223552813147, + "loss": 1.1581, + "step": 2970 + }, + { + "epoch": 0.5315822388993121, + "grad_norm": 0.5503152012825012, + "learning_rate": 0.00010642099744096914, + "loss": 1.1617, + "step": 2975 + }, + { + "epoch": 0.532475654426874, + "grad_norm": 0.5499110221862793, + "learning_rate": 0.00010610969688395782, + "loss": 1.1525, + "step": 2980 + }, + { + "epoch": 0.5333690699544358, + "grad_norm": 0.5412406325340271, + "learning_rate": 0.00010579833688573897, + "loss": 1.1664, + "step": 2985 + }, + { + "epoch": 0.5342624854819977, + "grad_norm": 0.4897433817386627, + "learning_rate": 0.00010548692047553227, + "loss": 1.156, + "step": 2990 + }, + { + "epoch": 0.5351559010095596, + "grad_norm": 0.5212523341178894, + "learning_rate": 0.00010517545068310635, + "loss": 1.1502, + "step": 2995 + }, + { + "epoch": 0.5360493165371214, + "grad_norm": 0.48938560485839844, + "learning_rate": 0.00010486393053874902, + "loss": 1.1703, + "step": 3000 + }, + { + "epoch": 0.5369427320646832, + "grad_norm": 0.5584519505500793, + "learning_rate": 0.0001045523630732381, + "loss": 1.1739, + "step": 3005 + }, + { + "epoch": 0.5378361475922452, + "grad_norm": 0.5129048228263855, + "learning_rate": 0.00010424075131781178, + "loss": 1.1852, + "step": 3010 + }, + { + "epoch": 0.538729563119807, + "grad_norm": 0.5111370086669922, + "learning_rate": 0.00010392909830413904, + "loss": 1.1613, + "step": 3015 + }, + { + "epoch": 0.5396229786473689, + "grad_norm": 0.5235916376113892, + "learning_rate": 0.00010361740706429046, + "loss": 1.1644, + "step": 3020 + }, + { + "epoch": 0.5405163941749308, + "grad_norm": 0.523694634437561, + "learning_rate": 0.00010330568063070832, + "loss": 1.1947, + "step": 3025 + }, + { + "epoch": 0.5414098097024926, + "grad_norm": 0.4990216791629791, + "learning_rate": 0.00010299392203617744, + "loss": 1.1552, + "step": 3030 + }, + { + "epoch": 0.5423032252300545, + "grad_norm": 0.5227737426757812, + "learning_rate": 0.00010268213431379543, + "loss": 1.187, + "step": 3035 + }, + { + "epoch": 0.5431966407576164, + "grad_norm": 0.49936816096305847, + "learning_rate": 0.00010237032049694335, + "loss": 1.1584, + "step": 3040 + }, + { + "epoch": 0.5440900562851783, + "grad_norm": 0.5005204081535339, + "learning_rate": 0.00010205848361925618, + "loss": 1.1528, + "step": 3045 + }, + { + "epoch": 0.5449834718127401, + "grad_norm": 0.553865909576416, + "learning_rate": 0.0001017466267145931, + "loss": 1.1696, + "step": 3050 + }, + { + "epoch": 0.5458768873403019, + "grad_norm": 0.5369734168052673, + "learning_rate": 0.0001014347528170083, + "loss": 1.1576, + "step": 3055 + }, + { + "epoch": 0.5467703028678639, + "grad_norm": 0.5343630909919739, + "learning_rate": 0.00010112286496072117, + "loss": 1.1848, + "step": 3060 + }, + { + "epoch": 0.5476637183954257, + "grad_norm": 0.535750687122345, + "learning_rate": 0.00010081096618008699, + "loss": 1.1436, + "step": 3065 + }, + { + "epoch": 0.5485571339229875, + "grad_norm": 0.551857590675354, + "learning_rate": 0.00010049905950956728, + "loss": 1.1477, + "step": 3070 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 0.5209478139877319, + "learning_rate": 0.00010018714798370035, + "loss": 1.1672, + "step": 3075 + }, + { + "epoch": 0.5503439649781113, + "grad_norm": 0.47555601596832275, + "learning_rate": 9.98752346370717e-05, + "loss": 1.1585, + "step": 3080 + }, + { + "epoch": 0.5512373805056732, + "grad_norm": 0.4957531988620758, + "learning_rate": 9.956332250428457e-05, + "loss": 1.1574, + "step": 3085 + }, + { + "epoch": 0.552130796033235, + "grad_norm": 0.4895710051059723, + "learning_rate": 9.925141461993043e-05, + "loss": 1.1262, + "step": 3090 + }, + { + "epoch": 0.553024211560797, + "grad_norm": 0.5204643607139587, + "learning_rate": 9.893951401855932e-05, + "loss": 1.1567, + "step": 3095 + }, + { + "epoch": 0.5539176270883588, + "grad_norm": 0.4921712279319763, + "learning_rate": 9.862762373465055e-05, + "loss": 1.1659, + "step": 3100 + }, + { + "epoch": 0.5548110426159206, + "grad_norm": 0.5866613388061523, + "learning_rate": 9.831574680258297e-05, + "loss": 1.141, + "step": 3105 + }, + { + "epoch": 0.5557044581434826, + "grad_norm": 0.5556049346923828, + "learning_rate": 9.800388625660553e-05, + "loss": 1.1803, + "step": 3110 + }, + { + "epoch": 0.5565978736710444, + "grad_norm": 0.5409796833992004, + "learning_rate": 9.769204513080775e-05, + "loss": 1.1225, + "step": 3115 + }, + { + "epoch": 0.5574912891986062, + "grad_norm": 0.5227869749069214, + "learning_rate": 9.738022645909026e-05, + "loss": 1.2076, + "step": 3120 + }, + { + "epoch": 0.5583847047261682, + "grad_norm": 0.5156333446502686, + "learning_rate": 9.706843327513521e-05, + "loss": 1.1229, + "step": 3125 + }, + { + "epoch": 0.55927812025373, + "grad_norm": 0.5258979201316833, + "learning_rate": 9.675666861237677e-05, + "loss": 1.1495, + "step": 3130 + }, + { + "epoch": 0.5601715357812919, + "grad_norm": 0.5187394022941589, + "learning_rate": 9.644493550397168e-05, + "loss": 1.1477, + "step": 3135 + }, + { + "epoch": 0.5610649513088537, + "grad_norm": 0.4999004304409027, + "learning_rate": 9.61332369827696e-05, + "loss": 1.1449, + "step": 3140 + }, + { + "epoch": 0.5619583668364156, + "grad_norm": 0.5169167518615723, + "learning_rate": 9.582157608128374e-05, + "loss": 1.1284, + "step": 3145 + }, + { + "epoch": 0.5628517823639775, + "grad_norm": 0.5010549426078796, + "learning_rate": 9.550995583166133e-05, + "loss": 1.1526, + "step": 3150 + }, + { + "epoch": 0.5637451978915393, + "grad_norm": 0.4774199426174164, + "learning_rate": 9.519837926565409e-05, + "loss": 1.1382, + "step": 3155 + }, + { + "epoch": 0.5646386134191013, + "grad_norm": 0.5373679995536804, + "learning_rate": 9.488684941458867e-05, + "loss": 1.1759, + "step": 3160 + }, + { + "epoch": 0.5655320289466631, + "grad_norm": 0.514975368976593, + "learning_rate": 9.45753693093373e-05, + "loss": 1.1553, + "step": 3165 + }, + { + "epoch": 0.5664254444742249, + "grad_norm": 0.5256127119064331, + "learning_rate": 9.426394198028823e-05, + "loss": 1.1217, + "step": 3170 + }, + { + "epoch": 0.5673188600017869, + "grad_norm": 0.49551287293434143, + "learning_rate": 9.395257045731627e-05, + "loss": 1.1269, + "step": 3175 + }, + { + "epoch": 0.5682122755293487, + "grad_norm": 0.5360212922096252, + "learning_rate": 9.364125776975318e-05, + "loss": 1.1636, + "step": 3180 + }, + { + "epoch": 0.5691056910569106, + "grad_norm": 0.5216602087020874, + "learning_rate": 9.333000694635849e-05, + "loss": 1.1649, + "step": 3185 + }, + { + "epoch": 0.5699991065844724, + "grad_norm": 0.4999989867210388, + "learning_rate": 9.30188210152897e-05, + "loss": 1.1135, + "step": 3190 + }, + { + "epoch": 0.5708925221120343, + "grad_norm": 0.5036138892173767, + "learning_rate": 9.270770300407305e-05, + "loss": 1.1549, + "step": 3195 + }, + { + "epoch": 0.5717859376395962, + "grad_norm": 0.5282987356185913, + "learning_rate": 9.239665593957398e-05, + "loss": 1.1426, + "step": 3200 + }, + { + "epoch": 0.572679353167158, + "grad_norm": 0.5551292300224304, + "learning_rate": 9.208568284796766e-05, + "loss": 1.147, + "step": 3205 + }, + { + "epoch": 0.57357276869472, + "grad_norm": 0.5044068098068237, + "learning_rate": 9.177478675470956e-05, + "loss": 1.1307, + "step": 3210 + }, + { + "epoch": 0.5744661842222818, + "grad_norm": 0.4707668125629425, + "learning_rate": 9.146397068450612e-05, + "loss": 1.1298, + "step": 3215 + }, + { + "epoch": 0.5753595997498436, + "grad_norm": 0.5296815037727356, + "learning_rate": 9.11532376612852e-05, + "loss": 1.1601, + "step": 3220 + }, + { + "epoch": 0.5762530152774055, + "grad_norm": 0.508043646812439, + "learning_rate": 9.084259070816662e-05, + "loss": 1.1773, + "step": 3225 + }, + { + "epoch": 0.5771464308049674, + "grad_norm": 0.5130497813224792, + "learning_rate": 9.053203284743294e-05, + "loss": 1.1281, + "step": 3230 + }, + { + "epoch": 0.5780398463325293, + "grad_norm": 0.48072364926338196, + "learning_rate": 9.022156710049985e-05, + "loss": 1.1401, + "step": 3235 + }, + { + "epoch": 0.5789332618600911, + "grad_norm": 0.5246201753616333, + "learning_rate": 8.991119648788696e-05, + "loss": 1.1313, + "step": 3240 + }, + { + "epoch": 0.579826677387653, + "grad_norm": 0.5002977252006531, + "learning_rate": 8.960092402918819e-05, + "loss": 1.1225, + "step": 3245 + }, + { + "epoch": 0.5807200929152149, + "grad_norm": 0.5016835927963257, + "learning_rate": 8.929075274304267e-05, + "loss": 1.1257, + "step": 3250 + }, + { + "epoch": 0.5816135084427767, + "grad_norm": 0.5389193296432495, + "learning_rate": 8.898068564710508e-05, + "loss": 1.1445, + "step": 3255 + }, + { + "epoch": 0.5825069239703387, + "grad_norm": 0.5444117188453674, + "learning_rate": 8.86707257580165e-05, + "loss": 1.1449, + "step": 3260 + }, + { + "epoch": 0.5834003394979005, + "grad_norm": 0.501939594745636, + "learning_rate": 8.836087609137502e-05, + "loss": 1.1912, + "step": 3265 + }, + { + "epoch": 0.5842937550254623, + "grad_norm": 0.5322566628456116, + "learning_rate": 8.805113966170635e-05, + "loss": 1.128, + "step": 3270 + }, + { + "epoch": 0.5851871705530242, + "grad_norm": 0.5263496041297913, + "learning_rate": 8.774151948243453e-05, + "loss": 1.1253, + "step": 3275 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 0.5112000703811646, + "learning_rate": 8.743201856585256e-05, + "loss": 1.1595, + "step": 3280 + }, + { + "epoch": 0.5869740016081479, + "grad_norm": 0.5330345630645752, + "learning_rate": 8.712263992309318e-05, + "loss": 1.1191, + "step": 3285 + }, + { + "epoch": 0.5878674171357098, + "grad_norm": 0.49791669845581055, + "learning_rate": 8.681338656409951e-05, + "loss": 1.1125, + "step": 3290 + }, + { + "epoch": 0.5887608326632717, + "grad_norm": 0.5101556181907654, + "learning_rate": 8.650426149759578e-05, + "loss": 1.1345, + "step": 3295 + }, + { + "epoch": 0.5896542481908336, + "grad_norm": 0.521859347820282, + "learning_rate": 8.619526773105813e-05, + "loss": 1.1581, + "step": 3300 + }, + { + "epoch": 0.5905476637183954, + "grad_norm": 0.5242456197738647, + "learning_rate": 8.588640827068512e-05, + "loss": 1.1512, + "step": 3305 + }, + { + "epoch": 0.5914410792459573, + "grad_norm": 0.489236444234848, + "learning_rate": 8.557768612136882e-05, + "loss": 1.099, + "step": 3310 + }, + { + "epoch": 0.5923344947735192, + "grad_norm": 0.520885169506073, + "learning_rate": 8.52691042866653e-05, + "loss": 1.1243, + "step": 3315 + }, + { + "epoch": 0.593227910301081, + "grad_norm": 0.495631605386734, + "learning_rate": 8.496066576876556e-05, + "loss": 1.1235, + "step": 3320 + }, + { + "epoch": 0.5941213258286429, + "grad_norm": 0.5357411503791809, + "learning_rate": 8.465237356846624e-05, + "loss": 1.1647, + "step": 3325 + }, + { + "epoch": 0.5950147413562048, + "grad_norm": 0.543991208076477, + "learning_rate": 8.434423068514048e-05, + "loss": 1.1379, + "step": 3330 + }, + { + "epoch": 0.5959081568837666, + "grad_norm": 0.5474541187286377, + "learning_rate": 8.403624011670871e-05, + "loss": 1.1423, + "step": 3335 + }, + { + "epoch": 0.5968015724113285, + "grad_norm": 0.5549032092094421, + "learning_rate": 8.372840485960947e-05, + "loss": 1.1872, + "step": 3340 + }, + { + "epoch": 0.5976949879388904, + "grad_norm": 0.5237979888916016, + "learning_rate": 8.342072790877033e-05, + "loss": 1.1388, + "step": 3345 + }, + { + "epoch": 0.5985884034664523, + "grad_norm": 0.5107823610305786, + "learning_rate": 8.311321225757858e-05, + "loss": 1.1151, + "step": 3350 + }, + { + "epoch": 0.5994818189940141, + "grad_norm": 0.4889257252216339, + "learning_rate": 8.280586089785236e-05, + "loss": 1.1501, + "step": 3355 + }, + { + "epoch": 0.600375234521576, + "grad_norm": 0.5312927961349487, + "learning_rate": 8.249867681981139e-05, + "loss": 1.1124, + "step": 3360 + }, + { + "epoch": 0.6012686500491379, + "grad_norm": 0.5377475619316101, + "learning_rate": 8.219166301204781e-05, + "loss": 1.1632, + "step": 3365 + }, + { + "epoch": 0.6021620655766997, + "grad_norm": 0.5036457777023315, + "learning_rate": 8.188482246149736e-05, + "loss": 1.1485, + "step": 3370 + }, + { + "epoch": 0.6030554811042615, + "grad_norm": 0.5324269533157349, + "learning_rate": 8.157815815341002e-05, + "loss": 1.1563, + "step": 3375 + }, + { + "epoch": 0.6039488966318235, + "grad_norm": 0.5336183309555054, + "learning_rate": 8.127167307132119e-05, + "loss": 1.1387, + "step": 3380 + }, + { + "epoch": 0.6048423121593853, + "grad_norm": 0.5451260805130005, + "learning_rate": 8.096537019702255e-05, + "loss": 1.1599, + "step": 3385 + }, + { + "epoch": 0.6057357276869472, + "grad_norm": 0.4915177524089813, + "learning_rate": 8.065925251053307e-05, + "loss": 1.1664, + "step": 3390 + }, + { + "epoch": 0.6066291432145091, + "grad_norm": 0.5154266357421875, + "learning_rate": 8.035332299007014e-05, + "loss": 1.1343, + "step": 3395 + }, + { + "epoch": 0.607522558742071, + "grad_norm": 0.48472052812576294, + "learning_rate": 8.004758461202023e-05, + "loss": 1.1482, + "step": 3400 + }, + { + "epoch": 0.6084159742696328, + "grad_norm": 0.48825910687446594, + "learning_rate": 7.974204035091046e-05, + "loss": 1.1233, + "step": 3405 + }, + { + "epoch": 0.6093093897971946, + "grad_norm": 0.5200708508491516, + "learning_rate": 7.943669317937923e-05, + "loss": 1.1334, + "step": 3410 + }, + { + "epoch": 0.6102028053247566, + "grad_norm": 0.5124212503433228, + "learning_rate": 7.913154606814753e-05, + "loss": 1.1336, + "step": 3415 + }, + { + "epoch": 0.6110962208523184, + "grad_norm": 0.5567544102668762, + "learning_rate": 7.882660198598993e-05, + "loss": 1.1524, + "step": 3420 + }, + { + "epoch": 0.6119896363798802, + "grad_norm": 0.5360432267189026, + "learning_rate": 7.852186389970571e-05, + "loss": 1.1671, + "step": 3425 + }, + { + "epoch": 0.6128830519074422, + "grad_norm": 0.5166192650794983, + "learning_rate": 7.821733477409005e-05, + "loss": 1.1043, + "step": 3430 + }, + { + "epoch": 0.613776467435004, + "grad_norm": 0.5590426921844482, + "learning_rate": 7.791301757190516e-05, + "loss": 1.1091, + "step": 3435 + }, + { + "epoch": 0.6146698829625659, + "grad_norm": 0.5402864813804626, + "learning_rate": 7.760891525385146e-05, + "loss": 1.125, + "step": 3440 + }, + { + "epoch": 0.6155632984901278, + "grad_norm": 0.4936152696609497, + "learning_rate": 7.730503077853873e-05, + "loss": 1.1471, + "step": 3445 + }, + { + "epoch": 0.6164567140176896, + "grad_norm": 0.48089155554771423, + "learning_rate": 7.700136710245731e-05, + "loss": 1.0873, + "step": 3450 + }, + { + "epoch": 0.6173501295452515, + "grad_norm": 0.517113447189331, + "learning_rate": 7.669792717994946e-05, + "loss": 1.1164, + "step": 3455 + }, + { + "epoch": 0.6182435450728133, + "grad_norm": 0.5420798063278198, + "learning_rate": 7.639471396318057e-05, + "loss": 1.1514, + "step": 3460 + }, + { + "epoch": 0.6191369606003753, + "grad_norm": 0.5132094025611877, + "learning_rate": 7.609173040211035e-05, + "loss": 1.1526, + "step": 3465 + }, + { + "epoch": 0.6200303761279371, + "grad_norm": 0.5790542364120483, + "learning_rate": 7.578897944446417e-05, + "loss": 1.1685, + "step": 3470 + }, + { + "epoch": 0.6209237916554989, + "grad_norm": 0.5307674407958984, + "learning_rate": 7.548646403570449e-05, + "loss": 1.1245, + "step": 3475 + }, + { + "epoch": 0.6218172071830609, + "grad_norm": 0.5037259459495544, + "learning_rate": 7.518418711900206e-05, + "loss": 1.1549, + "step": 3480 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 0.49981221556663513, + "learning_rate": 7.488215163520729e-05, + "loss": 1.1136, + "step": 3485 + }, + { + "epoch": 0.6236040382381846, + "grad_norm": 0.5257123708724976, + "learning_rate": 7.458036052282185e-05, + "loss": 1.0913, + "step": 3490 + }, + { + "epoch": 0.6244974537657465, + "grad_norm": 0.5649625062942505, + "learning_rate": 7.427881671796973e-05, + "loss": 1.129, + "step": 3495 + }, + { + "epoch": 0.6253908692933083, + "grad_norm": 0.5401619672775269, + "learning_rate": 7.3977523154369e-05, + "loss": 1.1526, + "step": 3500 + }, + { + "epoch": 0.6262842848208702, + "grad_norm": 0.5306874513626099, + "learning_rate": 7.367648276330305e-05, + "loss": 1.1597, + "step": 3505 + }, + { + "epoch": 0.627177700348432, + "grad_norm": 0.5311725735664368, + "learning_rate": 7.337569847359226e-05, + "loss": 1.1308, + "step": 3510 + }, + { + "epoch": 0.628071115875994, + "grad_norm": 0.5326215028762817, + "learning_rate": 7.307517321156528e-05, + "loss": 1.1142, + "step": 3515 + }, + { + "epoch": 0.6289645314035558, + "grad_norm": 0.5113494992256165, + "learning_rate": 7.277490990103079e-05, + "loss": 1.1321, + "step": 3520 + }, + { + "epoch": 0.6298579469311176, + "grad_norm": 0.5225013494491577, + "learning_rate": 7.247491146324887e-05, + "loss": 1.1103, + "step": 3525 + }, + { + "epoch": 0.6307513624586796, + "grad_norm": 0.522226095199585, + "learning_rate": 7.217518081690265e-05, + "loss": 1.1611, + "step": 3530 + }, + { + "epoch": 0.6316447779862414, + "grad_norm": 0.5471552014350891, + "learning_rate": 7.187572087807e-05, + "loss": 1.1654, + "step": 3535 + }, + { + "epoch": 0.6325381935138032, + "grad_norm": 0.5112610459327698, + "learning_rate": 7.157653456019504e-05, + "loss": 1.1273, + "step": 3540 + }, + { + "epoch": 0.6334316090413651, + "grad_norm": 0.5094234347343445, + "learning_rate": 7.127762477405976e-05, + "loss": 1.1483, + "step": 3545 + }, + { + "epoch": 0.634325024568927, + "grad_norm": 0.5493167638778687, + "learning_rate": 7.097899442775584e-05, + "loss": 1.1254, + "step": 3550 + }, + { + "epoch": 0.6352184400964889, + "grad_norm": 0.5173693299293518, + "learning_rate": 7.068064642665631e-05, + "loss": 1.1468, + "step": 3555 + }, + { + "epoch": 0.6361118556240507, + "grad_norm": 0.5034879446029663, + "learning_rate": 7.038258367338723e-05, + "loss": 1.1349, + "step": 3560 + }, + { + "epoch": 0.6370052711516127, + "grad_norm": 0.546248197555542, + "learning_rate": 7.008480906779948e-05, + "loss": 1.1423, + "step": 3565 + }, + { + "epoch": 0.6378986866791745, + "grad_norm": 0.49636945128440857, + "learning_rate": 6.97873255069406e-05, + "loss": 1.1527, + "step": 3570 + }, + { + "epoch": 0.6387921022067363, + "grad_norm": 0.5306327939033508, + "learning_rate": 6.949013588502651e-05, + "loss": 1.1298, + "step": 3575 + }, + { + "epoch": 0.6396855177342983, + "grad_norm": 0.4831233024597168, + "learning_rate": 6.919324309341341e-05, + "loss": 1.1189, + "step": 3580 + }, + { + "epoch": 0.6405789332618601, + "grad_norm": 0.553893506526947, + "learning_rate": 6.889665002056966e-05, + "loss": 1.118, + "step": 3585 + }, + { + "epoch": 0.6414723487894219, + "grad_norm": 0.4722842872142792, + "learning_rate": 6.860035955204767e-05, + "loss": 1.1195, + "step": 3590 + }, + { + "epoch": 0.6423657643169838, + "grad_norm": 0.5145003795623779, + "learning_rate": 6.830437457045568e-05, + "loss": 1.1491, + "step": 3595 + }, + { + "epoch": 0.6432591798445457, + "grad_norm": 0.5275436639785767, + "learning_rate": 6.800869795543007e-05, + "loss": 1.1086, + "step": 3600 + }, + { + "epoch": 0.6441525953721076, + "grad_norm": 0.5202250480651855, + "learning_rate": 6.771333258360694e-05, + "loss": 1.1357, + "step": 3605 + }, + { + "epoch": 0.6450460108996694, + "grad_norm": 0.5156890749931335, + "learning_rate": 6.74182813285944e-05, + "loss": 1.118, + "step": 3610 + }, + { + "epoch": 0.6459394264272313, + "grad_norm": 0.5047606229782104, + "learning_rate": 6.712354706094452e-05, + "loss": 1.1066, + "step": 3615 + }, + { + "epoch": 0.6468328419547932, + "grad_norm": 0.5137758851051331, + "learning_rate": 6.682913264812533e-05, + "loss": 1.1402, + "step": 3620 + }, + { + "epoch": 0.647726257482355, + "grad_norm": 0.5158393383026123, + "learning_rate": 6.653504095449305e-05, + "loss": 1.1016, + "step": 3625 + }, + { + "epoch": 0.648619673009917, + "grad_norm": 0.5306075811386108, + "learning_rate": 6.624127484126421e-05, + "loss": 1.1417, + "step": 3630 + }, + { + "epoch": 0.6495130885374788, + "grad_norm": 0.5093557238578796, + "learning_rate": 6.594783716648769e-05, + "loss": 1.1397, + "step": 3635 + }, + { + "epoch": 0.6504065040650406, + "grad_norm": 0.4910166561603546, + "learning_rate": 6.565473078501698e-05, + "loss": 1.1187, + "step": 3640 + }, + { + "epoch": 0.6512999195926025, + "grad_norm": 0.5265088677406311, + "learning_rate": 6.536195854848248e-05, + "loss": 1.0743, + "step": 3645 + }, + { + "epoch": 0.6521933351201644, + "grad_norm": 0.518035888671875, + "learning_rate": 6.506952330526373e-05, + "loss": 1.1224, + "step": 3650 + }, + { + "epoch": 0.6530867506477263, + "grad_norm": 0.5047338604927063, + "learning_rate": 6.477742790046152e-05, + "loss": 1.1183, + "step": 3655 + }, + { + "epoch": 0.6539801661752881, + "grad_norm": 0.5210195779800415, + "learning_rate": 6.448567517587052e-05, + "loss": 1.1546, + "step": 3660 + }, + { + "epoch": 0.65487358170285, + "grad_norm": 0.5376661419868469, + "learning_rate": 6.419426796995137e-05, + "loss": 1.147, + "step": 3665 + }, + { + "epoch": 0.6557669972304119, + "grad_norm": 0.5363953709602356, + "learning_rate": 6.390320911780317e-05, + "loss": 1.1399, + "step": 3670 + }, + { + "epoch": 0.6566604127579737, + "grad_norm": 0.5107960104942322, + "learning_rate": 6.36125014511359e-05, + "loss": 1.1382, + "step": 3675 + }, + { + "epoch": 0.6575538282855357, + "grad_norm": 0.511021614074707, + "learning_rate": 6.332214779824288e-05, + "loss": 1.1503, + "step": 3680 + }, + { + "epoch": 0.6584472438130975, + "grad_norm": 0.5405541062355042, + "learning_rate": 6.303215098397321e-05, + "loss": 1.1043, + "step": 3685 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 0.542253851890564, + "learning_rate": 6.274251382970427e-05, + "loss": 1.1344, + "step": 3690 + }, + { + "epoch": 0.6602340748682212, + "grad_norm": 0.5661068558692932, + "learning_rate": 6.245323915331439e-05, + "loss": 1.1463, + "step": 3695 + }, + { + "epoch": 0.6611274903957831, + "grad_norm": 0.5561456680297852, + "learning_rate": 6.216432976915527e-05, + "loss": 1.1655, + "step": 3700 + }, + { + "epoch": 0.662020905923345, + "grad_norm": 0.5392094254493713, + "learning_rate": 6.187578848802475e-05, + "loss": 1.1092, + "step": 3705 + }, + { + "epoch": 0.6629143214509068, + "grad_norm": 0.54625403881073, + "learning_rate": 6.15876181171394e-05, + "loss": 1.1417, + "step": 3710 + }, + { + "epoch": 0.6638077369784687, + "grad_norm": 0.5339346528053284, + "learning_rate": 6.129982146010713e-05, + "loss": 1.1493, + "step": 3715 + }, + { + "epoch": 0.6647011525060306, + "grad_norm": 0.4904159605503082, + "learning_rate": 6.101240131690009e-05, + "loss": 1.1063, + "step": 3720 + }, + { + "epoch": 0.6655945680335924, + "grad_norm": 0.5268272161483765, + "learning_rate": 6.072536048382726e-05, + "loss": 1.1261, + "step": 3725 + }, + { + "epoch": 0.6664879835611542, + "grad_norm": 0.5363719463348389, + "learning_rate": 6.043870175350732e-05, + "loss": 1.1581, + "step": 3730 + }, + { + "epoch": 0.6673813990887162, + "grad_norm": 0.5529670715332031, + "learning_rate": 6.0152427914841544e-05, + "loss": 1.112, + "step": 3735 + }, + { + "epoch": 0.668274814616278, + "grad_norm": 0.49713894724845886, + "learning_rate": 5.9866541752986485e-05, + "loss": 1.1057, + "step": 3740 + }, + { + "epoch": 0.6691682301438399, + "grad_norm": 0.5073032975196838, + "learning_rate": 5.958104604932706e-05, + "loss": 1.1555, + "step": 3745 + }, + { + "epoch": 0.6700616456714018, + "grad_norm": 0.5101709961891174, + "learning_rate": 5.9295943581449385e-05, + "loss": 1.1366, + "step": 3750 + }, + { + "epoch": 0.6709550611989636, + "grad_norm": 0.5283429026603699, + "learning_rate": 5.901123712311385e-05, + "loss": 1.1178, + "step": 3755 + }, + { + "epoch": 0.6718484767265255, + "grad_norm": 0.5022781491279602, + "learning_rate": 5.8726929444228016e-05, + "loss": 1.1079, + "step": 3760 + }, + { + "epoch": 0.6727418922540874, + "grad_norm": 0.5120312571525574, + "learning_rate": 5.844302331081972e-05, + "loss": 1.1218, + "step": 3765 + }, + { + "epoch": 0.6736353077816493, + "grad_norm": 0.48626720905303955, + "learning_rate": 5.8159521485010214e-05, + "loss": 1.1053, + "step": 3770 + }, + { + "epoch": 0.6745287233092111, + "grad_norm": 0.5270091891288757, + "learning_rate": 5.787642672498719e-05, + "loss": 1.1096, + "step": 3775 + }, + { + "epoch": 0.6754221388367729, + "grad_norm": 0.5076459050178528, + "learning_rate": 5.759374178497801e-05, + "loss": 1.0824, + "step": 3780 + }, + { + "epoch": 0.6763155543643349, + "grad_norm": 0.5193483829498291, + "learning_rate": 5.731146941522292e-05, + "loss": 1.1143, + "step": 3785 + }, + { + "epoch": 0.6772089698918967, + "grad_norm": 0.5338672399520874, + "learning_rate": 5.702961236194826e-05, + "loss": 1.1068, + "step": 3790 + }, + { + "epoch": 0.6781023854194586, + "grad_norm": 0.5404512286186218, + "learning_rate": 5.674817336733975e-05, + "loss": 1.107, + "step": 3795 + }, + { + "epoch": 0.6789958009470205, + "grad_norm": 0.5492522120475769, + "learning_rate": 5.646715516951584e-05, + "loss": 1.1108, + "step": 3800 + }, + { + "epoch": 0.6798892164745823, + "grad_norm": 0.561931848526001, + "learning_rate": 5.618656050250099e-05, + "loss": 1.1257, + "step": 3805 + }, + { + "epoch": 0.6807826320021442, + "grad_norm": 0.5259913206100464, + "learning_rate": 5.5906392096199255e-05, + "loss": 1.0976, + "step": 3810 + }, + { + "epoch": 0.6816760475297061, + "grad_norm": 0.5238481163978577, + "learning_rate": 5.562665267636751e-05, + "loss": 1.1711, + "step": 3815 + }, + { + "epoch": 0.682569463057268, + "grad_norm": 0.5467326045036316, + "learning_rate": 5.5347344964588996e-05, + "loss": 1.0727, + "step": 3820 + }, + { + "epoch": 0.6834628785848298, + "grad_norm": 0.5232309699058533, + "learning_rate": 5.506847167824696e-05, + "loss": 1.1064, + "step": 3825 + }, + { + "epoch": 0.6843562941123916, + "grad_norm": 0.5298530459403992, + "learning_rate": 5.479003553049806e-05, + "loss": 1.1082, + "step": 3830 + }, + { + "epoch": 0.6852497096399536, + "grad_norm": 0.506592333316803, + "learning_rate": 5.4512039230246035e-05, + "loss": 1.0942, + "step": 3835 + }, + { + "epoch": 0.6861431251675154, + "grad_norm": 0.5119116306304932, + "learning_rate": 5.42344854821154e-05, + "loss": 1.1278, + "step": 3840 + }, + { + "epoch": 0.6870365406950772, + "grad_norm": 0.5258318185806274, + "learning_rate": 5.395737698642503e-05, + "loss": 1.1161, + "step": 3845 + }, + { + "epoch": 0.6879299562226392, + "grad_norm": 0.5019605159759521, + "learning_rate": 5.368071643916194e-05, + "loss": 1.1363, + "step": 3850 + }, + { + "epoch": 0.688823371750201, + "grad_norm": 0.47688642144203186, + "learning_rate": 5.3404506531955146e-05, + "loss": 1.0935, + "step": 3855 + }, + { + "epoch": 0.6897167872777629, + "grad_norm": 0.5404648184776306, + "learning_rate": 5.3128749952049284e-05, + "loss": 1.1134, + "step": 3860 + }, + { + "epoch": 0.6906102028053247, + "grad_norm": 0.5106511116027832, + "learning_rate": 5.2853449382278605e-05, + "loss": 1.1165, + "step": 3865 + }, + { + "epoch": 0.6915036183328866, + "grad_norm": 0.5181242227554321, + "learning_rate": 5.2578607501040863e-05, + "loss": 1.104, + "step": 3870 + }, + { + "epoch": 0.6923970338604485, + "grad_norm": 0.5107550024986267, + "learning_rate": 5.2304226982271174e-05, + "loss": 1.1438, + "step": 3875 + }, + { + "epoch": 0.6932904493880103, + "grad_norm": 0.5244885683059692, + "learning_rate": 5.203031049541621e-05, + "loss": 1.1284, + "step": 3880 + }, + { + "epoch": 0.6941838649155723, + "grad_norm": 0.519130289554596, + "learning_rate": 5.175686070540786e-05, + "loss": 1.1188, + "step": 3885 + }, + { + "epoch": 0.6950772804431341, + "grad_norm": 0.5156662464141846, + "learning_rate": 5.148388027263769e-05, + "loss": 1.1648, + "step": 3890 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 0.5092237591743469, + "learning_rate": 5.1211371852930766e-05, + "loss": 1.0715, + "step": 3895 + }, + { + "epoch": 0.6968641114982579, + "grad_norm": 0.544964075088501, + "learning_rate": 5.0939338097520095e-05, + "loss": 1.114, + "step": 3900 + }, + { + "epoch": 0.6977575270258197, + "grad_norm": 0.5449029207229614, + "learning_rate": 5.0667781653020584e-05, + "loss": 1.0786, + "step": 3905 + }, + { + "epoch": 0.6986509425533816, + "grad_norm": 0.5234874486923218, + "learning_rate": 5.039670516140338e-05, + "loss": 1.1419, + "step": 3910 + }, + { + "epoch": 0.6995443580809434, + "grad_norm": 0.5106183290481567, + "learning_rate": 5.012611125997018e-05, + "loss": 1.0891, + "step": 3915 + }, + { + "epoch": 0.7004377736085053, + "grad_norm": 0.49707773327827454, + "learning_rate": 4.9856002581327565e-05, + "loss": 1.1272, + "step": 3920 + }, + { + "epoch": 0.7013311891360672, + "grad_norm": 0.5646072626113892, + "learning_rate": 4.958638175336137e-05, + "loss": 1.1029, + "step": 3925 + }, + { + "epoch": 0.702224604663629, + "grad_norm": 0.5331147313117981, + "learning_rate": 4.931725139921126e-05, + "loss": 1.1271, + "step": 3930 + }, + { + "epoch": 0.703118020191191, + "grad_norm": 0.521113932132721, + "learning_rate": 4.9048614137244865e-05, + "loss": 1.0927, + "step": 3935 + }, + { + "epoch": 0.7040114357187528, + "grad_norm": 0.5384479761123657, + "learning_rate": 4.878047258103267e-05, + "loss": 1.1262, + "step": 3940 + }, + { + "epoch": 0.7049048512463146, + "grad_norm": 0.5266872644424438, + "learning_rate": 4.8512829339322375e-05, + "loss": 1.1521, + "step": 3945 + }, + { + "epoch": 0.7057982667738766, + "grad_norm": 0.5195939540863037, + "learning_rate": 4.8245687016013696e-05, + "loss": 1.1201, + "step": 3950 + }, + { + "epoch": 0.7066916823014384, + "grad_norm": 0.49455180764198303, + "learning_rate": 4.797904821013278e-05, + "loss": 1.1118, + "step": 3955 + }, + { + "epoch": 0.7075850978290003, + "grad_norm": 0.49879488348960876, + "learning_rate": 4.771291551580712e-05, + "loss": 1.1106, + "step": 3960 + }, + { + "epoch": 0.7084785133565621, + "grad_norm": 0.5107743740081787, + "learning_rate": 4.744729152224024e-05, + "loss": 1.0964, + "step": 3965 + }, + { + "epoch": 0.709371928884124, + "grad_norm": 0.4998965263366699, + "learning_rate": 4.71821788136865e-05, + "loss": 1.0842, + "step": 3970 + }, + { + "epoch": 0.7102653444116859, + "grad_norm": 0.5038910508155823, + "learning_rate": 4.691757996942607e-05, + "loss": 1.1108, + "step": 3975 + }, + { + "epoch": 0.7111587599392477, + "grad_norm": 0.5343025326728821, + "learning_rate": 4.665349756373957e-05, + "loss": 1.1084, + "step": 3980 + }, + { + "epoch": 0.7120521754668097, + "grad_norm": 0.5399016737937927, + "learning_rate": 4.6389934165883306e-05, + "loss": 1.1055, + "step": 3985 + }, + { + "epoch": 0.7129455909943715, + "grad_norm": 0.49205487966537476, + "learning_rate": 4.6126892340064096e-05, + "loss": 1.1041, + "step": 3990 + }, + { + "epoch": 0.7138390065219333, + "grad_norm": 0.5059479475021362, + "learning_rate": 4.586437464541451e-05, + "loss": 1.1028, + "step": 3995 + }, + { + "epoch": 0.7147324220494953, + "grad_norm": 0.520707368850708, + "learning_rate": 4.560238363596771e-05, + "loss": 1.064, + "step": 4000 + }, + { + "epoch": 0.7156258375770571, + "grad_norm": 0.46862202882766724, + "learning_rate": 4.53409218606328e-05, + "loss": 1.116, + "step": 4005 + }, + { + "epoch": 0.716519253104619, + "grad_norm": 0.5046384930610657, + "learning_rate": 4.5079991863169966e-05, + "loss": 1.0911, + "step": 4010 + }, + { + "epoch": 0.7174126686321808, + "grad_norm": 0.5321259498596191, + "learning_rate": 4.481959618216568e-05, + "loss": 1.0903, + "step": 4015 + }, + { + "epoch": 0.7183060841597427, + "grad_norm": 0.5124063491821289, + "learning_rate": 4.455973735100818e-05, + "loss": 1.1261, + "step": 4020 + }, + { + "epoch": 0.7191994996873046, + "grad_norm": 0.5123804211616516, + "learning_rate": 4.43004178978626e-05, + "loss": 1.1248, + "step": 4025 + }, + { + "epoch": 0.7200929152148664, + "grad_norm": 0.5785008668899536, + "learning_rate": 4.404164034564641e-05, + "loss": 1.1447, + "step": 4030 + }, + { + "epoch": 0.7209863307424283, + "grad_norm": 0.537517786026001, + "learning_rate": 4.378340721200501e-05, + "loss": 1.1067, + "step": 4035 + }, + { + "epoch": 0.7218797462699902, + "grad_norm": 0.5579099655151367, + "learning_rate": 4.3525721009287215e-05, + "loss": 1.1124, + "step": 4040 + }, + { + "epoch": 0.722773161797552, + "grad_norm": 0.5302057266235352, + "learning_rate": 4.326858424452063e-05, + "loss": 1.0999, + "step": 4045 + }, + { + "epoch": 0.7236665773251139, + "grad_norm": 0.5399320125579834, + "learning_rate": 4.301199941938744e-05, + "loss": 1.0849, + "step": 4050 + }, + { + "epoch": 0.7245599928526758, + "grad_norm": 0.5739198923110962, + "learning_rate": 4.275596903020001e-05, + "loss": 1.118, + "step": 4055 + }, + { + "epoch": 0.7254534083802376, + "grad_norm": 0.5343251824378967, + "learning_rate": 4.250049556787655e-05, + "loss": 1.1116, + "step": 4060 + }, + { + "epoch": 0.7263468239077995, + "grad_norm": 0.5469746589660645, + "learning_rate": 4.2245581517917065e-05, + "loss": 1.0922, + "step": 4065 + }, + { + "epoch": 0.7272402394353614, + "grad_norm": 0.5611655712127686, + "learning_rate": 4.199122936037889e-05, + "loss": 1.1282, + "step": 4070 + }, + { + "epoch": 0.7281336549629233, + "grad_norm": 0.5936520099639893, + "learning_rate": 4.173744156985283e-05, + "loss": 1.135, + "step": 4075 + }, + { + "epoch": 0.7290270704904851, + "grad_norm": 0.5532947778701782, + "learning_rate": 4.148422061543884e-05, + "loss": 1.123, + "step": 4080 + }, + { + "epoch": 0.729920486018047, + "grad_norm": 0.5437267422676086, + "learning_rate": 4.123156896072219e-05, + "loss": 1.084, + "step": 4085 + }, + { + "epoch": 0.7308139015456089, + "grad_norm": 0.5226598978042603, + "learning_rate": 4.097948906374951e-05, + "loss": 1.1311, + "step": 4090 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.5470917820930481, + "learning_rate": 4.0727983377004716e-05, + "loss": 1.1086, + "step": 4095 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 0.5484129190444946, + "learning_rate": 4.047705434738527e-05, + "loss": 1.0744, + "step": 4100 + }, + { + "epoch": 0.7334941481282945, + "grad_norm": 0.5352585911750793, + "learning_rate": 4.02267044161783e-05, + "loss": 1.1017, + "step": 4105 + }, + { + "epoch": 0.7343875636558563, + "grad_norm": 0.5201607942581177, + "learning_rate": 3.997693601903688e-05, + "loss": 1.1283, + "step": 4110 + }, + { + "epoch": 0.7352809791834182, + "grad_norm": 0.5287265181541443, + "learning_rate": 3.9727751585956477e-05, + "loss": 1.1334, + "step": 4115 + }, + { + "epoch": 0.7361743947109801, + "grad_norm": 0.5448318123817444, + "learning_rate": 3.9479153541251056e-05, + "loss": 1.1403, + "step": 4120 + }, + { + "epoch": 0.737067810238542, + "grad_norm": 0.5017204880714417, + "learning_rate": 3.923114430352958e-05, + "loss": 1.126, + "step": 4125 + }, + { + "epoch": 0.7379612257661038, + "grad_norm": 0.5083391666412354, + "learning_rate": 3.8983726285672536e-05, + "loss": 1.1325, + "step": 4130 + }, + { + "epoch": 0.7388546412936657, + "grad_norm": 0.4848039150238037, + "learning_rate": 3.8736901894808575e-05, + "loss": 1.0555, + "step": 4135 + }, + { + "epoch": 0.7397480568212276, + "grad_norm": 0.5136265158653259, + "learning_rate": 3.849067353229078e-05, + "loss": 1.1228, + "step": 4140 + }, + { + "epoch": 0.7406414723487894, + "grad_norm": 0.5346769690513611, + "learning_rate": 3.824504359367355e-05, + "loss": 1.1229, + "step": 4145 + }, + { + "epoch": 0.7415348878763512, + "grad_norm": 0.5281243324279785, + "learning_rate": 3.80000144686892e-05, + "loss": 1.1327, + "step": 4150 + }, + { + "epoch": 0.7424283034039132, + "grad_norm": 0.5599030256271362, + "learning_rate": 3.775558854122475e-05, + "loss": 1.0789, + "step": 4155 + }, + { + "epoch": 0.743321718931475, + "grad_norm": 0.5306710600852966, + "learning_rate": 3.7511768189298746e-05, + "loss": 1.0885, + "step": 4160 + }, + { + "epoch": 0.7442151344590369, + "grad_norm": 0.5410204529762268, + "learning_rate": 3.726855578503804e-05, + "loss": 1.0959, + "step": 4165 + }, + { + "epoch": 0.7451085499865988, + "grad_norm": 0.5430173873901367, + "learning_rate": 3.70259536946548e-05, + "loss": 1.1033, + "step": 4170 + }, + { + "epoch": 0.7460019655141606, + "grad_norm": 0.5353587865829468, + "learning_rate": 3.678396427842334e-05, + "loss": 1.121, + "step": 4175 + }, + { + "epoch": 0.7468953810417225, + "grad_norm": 0.5129430890083313, + "learning_rate": 3.6542589890657476e-05, + "loss": 1.0958, + "step": 4180 + }, + { + "epoch": 0.7477887965692843, + "grad_norm": 0.5174726843833923, + "learning_rate": 3.630183287968727e-05, + "loss": 1.1203, + "step": 4185 + }, + { + "epoch": 0.7486822120968463, + "grad_norm": 0.5343511700630188, + "learning_rate": 3.606169558783635e-05, + "loss": 1.0792, + "step": 4190 + }, + { + "epoch": 0.7495756276244081, + "grad_norm": 0.5478770136833191, + "learning_rate": 3.5822180351399136e-05, + "loss": 1.0975, + "step": 4195 + }, + { + "epoch": 0.7504690431519699, + "grad_norm": 0.4997379183769226, + "learning_rate": 3.5583289500618e-05, + "loss": 1.0913, + "step": 4200 + }, + { + "epoch": 0.7513624586795319, + "grad_norm": 0.5360389351844788, + "learning_rate": 3.53450253596608e-05, + "loss": 1.081, + "step": 4205 + }, + { + "epoch": 0.7522558742070937, + "grad_norm": 0.5671184062957764, + "learning_rate": 3.510739024659802e-05, + "loss": 1.1355, + "step": 4210 + }, + { + "epoch": 0.7531492897346556, + "grad_norm": 0.5735521912574768, + "learning_rate": 3.487038647338038e-05, + "loss": 1.0998, + "step": 4215 + }, + { + "epoch": 0.7540427052622175, + "grad_norm": 0.5425437092781067, + "learning_rate": 3.463401634581631e-05, + "loss": 1.1192, + "step": 4220 + }, + { + "epoch": 0.7549361207897793, + "grad_norm": 0.5446605086326599, + "learning_rate": 3.4398282163549414e-05, + "loss": 1.1166, + "step": 4225 + }, + { + "epoch": 0.7558295363173412, + "grad_norm": 0.5220611691474915, + "learning_rate": 3.416318622003634e-05, + "loss": 1.0814, + "step": 4230 + }, + { + "epoch": 0.756722951844903, + "grad_norm": 0.513034999370575, + "learning_rate": 3.39287308025242e-05, + "loss": 1.1026, + "step": 4235 + }, + { + "epoch": 0.757616367372465, + "grad_norm": 0.5283447504043579, + "learning_rate": 3.369491819202849e-05, + "loss": 1.1125, + "step": 4240 + }, + { + "epoch": 0.7585097829000268, + "grad_norm": 0.5065436363220215, + "learning_rate": 3.34617506633108e-05, + "loss": 1.1106, + "step": 4245 + }, + { + "epoch": 0.7594031984275886, + "grad_norm": 0.5315711498260498, + "learning_rate": 3.322923048485672e-05, + "loss": 1.0821, + "step": 4250 + }, + { + "epoch": 0.7602966139551506, + "grad_norm": 0.5503782033920288, + "learning_rate": 3.2997359918853845e-05, + "loss": 1.1076, + "step": 4255 + }, + { + "epoch": 0.7611900294827124, + "grad_norm": 0.5607456564903259, + "learning_rate": 3.276614122116962e-05, + "loss": 1.0913, + "step": 4260 + }, + { + "epoch": 0.7620834450102743, + "grad_norm": 0.536376953125, + "learning_rate": 3.2535576641329514e-05, + "loss": 1.0911, + "step": 4265 + }, + { + "epoch": 0.7629768605378362, + "grad_norm": 0.5168653726577759, + "learning_rate": 3.230566842249497e-05, + "loss": 1.0845, + "step": 4270 + }, + { + "epoch": 0.763870276065398, + "grad_norm": 0.5393795371055603, + "learning_rate": 3.2076418801441886e-05, + "loss": 1.098, + "step": 4275 + }, + { + "epoch": 0.7647636915929599, + "grad_norm": 0.538724422454834, + "learning_rate": 3.1847830008538545e-05, + "loss": 1.096, + "step": 4280 + }, + { + "epoch": 0.7656571071205217, + "grad_norm": 0.527229905128479, + "learning_rate": 3.1619904267724065e-05, + "loss": 1.0845, + "step": 4285 + }, + { + "epoch": 0.7665505226480837, + "grad_norm": 0.5256169438362122, + "learning_rate": 3.139264379648671e-05, + "loss": 1.0796, + "step": 4290 + }, + { + "epoch": 0.7674439381756455, + "grad_norm": 0.540058434009552, + "learning_rate": 3.116605080584235e-05, + "loss": 1.0909, + "step": 4295 + }, + { + "epoch": 0.7683373537032073, + "grad_norm": 0.5083367824554443, + "learning_rate": 3.0940127500313e-05, + "loss": 1.1013, + "step": 4300 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.5150957703590393, + "learning_rate": 3.071487607790524e-05, + "loss": 1.1422, + "step": 4305 + }, + { + "epoch": 0.7701241847583311, + "grad_norm": 0.5053666234016418, + "learning_rate": 3.049029873008893e-05, + "loss": 1.0878, + "step": 4310 + }, + { + "epoch": 0.771017600285893, + "grad_norm": 0.5635530352592468, + "learning_rate": 3.0266397641775835e-05, + "loss": 1.106, + "step": 4315 + }, + { + "epoch": 0.7719110158134549, + "grad_norm": 0.5005871653556824, + "learning_rate": 3.004317499129845e-05, + "loss": 1.1155, + "step": 4320 + }, + { + "epoch": 0.7728044313410167, + "grad_norm": 0.5285431742668152, + "learning_rate": 2.9820632950388695e-05, + "loss": 1.0865, + "step": 4325 + }, + { + "epoch": 0.7736978468685786, + "grad_norm": 0.5447906851768494, + "learning_rate": 2.9598773684156878e-05, + "loss": 1.1002, + "step": 4330 + }, + { + "epoch": 0.7745912623961404, + "grad_norm": 0.5144314765930176, + "learning_rate": 2.9377599351070595e-05, + "loss": 1.1234, + "step": 4335 + }, + { + "epoch": 0.7754846779237023, + "grad_norm": 0.5474838614463806, + "learning_rate": 2.915711210293367e-05, + "loss": 1.1577, + "step": 4340 + }, + { + "epoch": 0.7763780934512642, + "grad_norm": 0.5402074456214905, + "learning_rate": 2.8937314084865407e-05, + "loss": 1.119, + "step": 4345 + }, + { + "epoch": 0.777271508978826, + "grad_norm": 0.5311411023139954, + "learning_rate": 2.8718207435279486e-05, + "loss": 1.1003, + "step": 4350 + }, + { + "epoch": 0.778164924506388, + "grad_norm": 0.5577319264411926, + "learning_rate": 2.849979428586331e-05, + "loss": 1.0855, + "step": 4355 + }, + { + "epoch": 0.7790583400339498, + "grad_norm": 0.5216962099075317, + "learning_rate": 2.828207676155722e-05, + "loss": 1.0935, + "step": 4360 + }, + { + "epoch": 0.7799517555615116, + "grad_norm": 0.508258044719696, + "learning_rate": 2.80650569805338e-05, + "loss": 1.0922, + "step": 4365 + }, + { + "epoch": 0.7808451710890735, + "grad_norm": 0.5772315263748169, + "learning_rate": 2.784873705417731e-05, + "loss": 1.0749, + "step": 4370 + }, + { + "epoch": 0.7817385866166354, + "grad_norm": 0.5212002992630005, + "learning_rate": 2.7633119087063152e-05, + "loss": 1.1221, + "step": 4375 + }, + { + "epoch": 0.7826320021441973, + "grad_norm": 0.5161262154579163, + "learning_rate": 2.7418205176937327e-05, + "loss": 1.1418, + "step": 4380 + }, + { + "epoch": 0.7835254176717591, + "grad_norm": 0.5041150450706482, + "learning_rate": 2.7203997414696104e-05, + "loss": 1.1163, + "step": 4385 + }, + { + "epoch": 0.784418833199321, + "grad_norm": 0.5260159373283386, + "learning_rate": 2.6990497884365586e-05, + "loss": 1.0596, + "step": 4390 + }, + { + "epoch": 0.7853122487268829, + "grad_norm": 0.5398173332214355, + "learning_rate": 2.67777086630816e-05, + "loss": 1.1323, + "step": 4395 + }, + { + "epoch": 0.7862056642544447, + "grad_norm": 0.529534101486206, + "learning_rate": 2.6565631821069304e-05, + "loss": 1.0992, + "step": 4400 + }, + { + "epoch": 0.7870990797820067, + "grad_norm": 0.5247671008110046, + "learning_rate": 2.6354269421623112e-05, + "loss": 1.1322, + "step": 4405 + }, + { + "epoch": 0.7879924953095685, + "grad_norm": 0.4856671690940857, + "learning_rate": 2.6143623521086647e-05, + "loss": 1.0655, + "step": 4410 + }, + { + "epoch": 0.7888859108371303, + "grad_norm": 0.5172522664070129, + "learning_rate": 2.5933696168832743e-05, + "loss": 1.0842, + "step": 4415 + }, + { + "epoch": 0.7897793263646922, + "grad_norm": 0.4878483712673187, + "learning_rate": 2.5724489407243447e-05, + "loss": 1.1015, + "step": 4420 + }, + { + "epoch": 0.7906727418922541, + "grad_norm": 0.5132545828819275, + "learning_rate": 2.5516005271690203e-05, + "loss": 1.1243, + "step": 4425 + }, + { + "epoch": 0.791566157419816, + "grad_norm": 0.5201311111450195, + "learning_rate": 2.530824579051403e-05, + "loss": 1.0829, + "step": 4430 + }, + { + "epoch": 0.7924595729473778, + "grad_norm": 0.554908275604248, + "learning_rate": 2.510121298500573e-05, + "loss": 1.1092, + "step": 4435 + }, + { + "epoch": 0.7933529884749397, + "grad_norm": 0.5134171843528748, + "learning_rate": 2.4894908869386424e-05, + "loss": 1.1063, + "step": 4440 + }, + { + "epoch": 0.7942464040025016, + "grad_norm": 0.5175535678863525, + "learning_rate": 2.4689335450787675e-05, + "loss": 1.1117, + "step": 4445 + }, + { + "epoch": 0.7951398195300634, + "grad_norm": 0.543436586856842, + "learning_rate": 2.4484494729232155e-05, + "loss": 1.0985, + "step": 4450 + }, + { + "epoch": 0.7960332350576254, + "grad_norm": 0.5105295181274414, + "learning_rate": 2.428038869761412e-05, + "loss": 1.0921, + "step": 4455 + }, + { + "epoch": 0.7969266505851872, + "grad_norm": 0.524043619632721, + "learning_rate": 2.4077019341680042e-05, + "loss": 1.0728, + "step": 4460 + }, + { + "epoch": 0.797820066112749, + "grad_norm": 0.5553478002548218, + "learning_rate": 2.387438864000926e-05, + "loss": 1.1036, + "step": 4465 + }, + { + "epoch": 0.7987134816403109, + "grad_norm": 0.4928396940231323, + "learning_rate": 2.3672498563994762e-05, + "loss": 1.1025, + "step": 4470 + }, + { + "epoch": 0.7996068971678728, + "grad_norm": 0.5613781213760376, + "learning_rate": 2.3471351077824e-05, + "loss": 1.1173, + "step": 4475 + }, + { + "epoch": 0.8005003126954346, + "grad_norm": 0.5897471904754639, + "learning_rate": 2.3270948138459735e-05, + "loss": 1.1225, + "step": 4480 + }, + { + "epoch": 0.8013937282229965, + "grad_norm": 0.5354331135749817, + "learning_rate": 2.3071291695621135e-05, + "loss": 1.0784, + "step": 4485 + }, + { + "epoch": 0.8022871437505584, + "grad_norm": 0.5370863676071167, + "learning_rate": 2.2872383691764586e-05, + "loss": 1.0601, + "step": 4490 + }, + { + "epoch": 0.8031805592781203, + "grad_norm": 0.5611812472343445, + "learning_rate": 2.2674226062064996e-05, + "loss": 1.0762, + "step": 4495 + }, + { + "epoch": 0.8040739748056821, + "grad_norm": 0.5540429353713989, + "learning_rate": 2.2476820734396843e-05, + "loss": 1.0666, + "step": 4500 + }, + { + "epoch": 0.8049673903332439, + "grad_norm": 0.5697487592697144, + "learning_rate": 2.2280169629315484e-05, + "loss": 1.1338, + "step": 4505 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 0.517309308052063, + "learning_rate": 2.208427466003844e-05, + "loss": 1.0976, + "step": 4510 + }, + { + "epoch": 0.8067542213883677, + "grad_norm": 0.5207148790359497, + "learning_rate": 2.1889137732426802e-05, + "loss": 1.13, + "step": 4515 + }, + { + "epoch": 0.8076476369159296, + "grad_norm": 0.5412813425064087, + "learning_rate": 2.1694760744966668e-05, + "loss": 1.1341, + "step": 4520 + }, + { + "epoch": 0.8085410524434915, + "grad_norm": 0.5917104482650757, + "learning_rate": 2.1501145588750694e-05, + "loss": 1.0628, + "step": 4525 + }, + { + "epoch": 0.8094344679710533, + "grad_norm": 0.5434411764144897, + "learning_rate": 2.1308294147459628e-05, + "loss": 1.1121, + "step": 4530 + }, + { + "epoch": 0.8103278834986152, + "grad_norm": 0.5469930171966553, + "learning_rate": 2.1116208297344155e-05, + "loss": 1.1057, + "step": 4535 + }, + { + "epoch": 0.8112212990261771, + "grad_norm": 0.5234044194221497, + "learning_rate": 2.0924889907206425e-05, + "loss": 1.0711, + "step": 4540 + }, + { + "epoch": 0.812114714553739, + "grad_norm": 0.5449784398078918, + "learning_rate": 2.0734340838382015e-05, + "loss": 1.0914, + "step": 4545 + }, + { + "epoch": 0.8130081300813008, + "grad_norm": 0.5194562077522278, + "learning_rate": 2.0544562944721778e-05, + "loss": 1.0878, + "step": 4550 + }, + { + "epoch": 0.8139015456088626, + "grad_norm": 0.646867573261261, + "learning_rate": 2.0355558072573787e-05, + "loss": 1.121, + "step": 4555 + }, + { + "epoch": 0.8147949611364246, + "grad_norm": 0.530295729637146, + "learning_rate": 2.016732806076542e-05, + "loss": 1.0924, + "step": 4560 + }, + { + "epoch": 0.8156883766639864, + "grad_norm": 0.5485954880714417, + "learning_rate": 1.9979874740585426e-05, + "loss": 1.1192, + "step": 4565 + }, + { + "epoch": 0.8165817921915483, + "grad_norm": 0.549314022064209, + "learning_rate": 1.979319993576614e-05, + "loss": 1.083, + "step": 4570 + }, + { + "epoch": 0.8174752077191102, + "grad_norm": 0.5288789868354797, + "learning_rate": 1.9607305462465686e-05, + "loss": 1.0898, + "step": 4575 + }, + { + "epoch": 0.818368623246672, + "grad_norm": 0.5372732281684875, + "learning_rate": 1.942219312925042e-05, + "loss": 1.1002, + "step": 4580 + }, + { + "epoch": 0.8192620387742339, + "grad_norm": 0.5401791930198669, + "learning_rate": 1.9237864737077204e-05, + "loss": 1.0702, + "step": 4585 + }, + { + "epoch": 0.8201554543017958, + "grad_norm": 0.49033990502357483, + "learning_rate": 1.9054322079275953e-05, + "loss": 1.1151, + "step": 4590 + }, + { + "epoch": 0.8210488698293577, + "grad_norm": 0.5073211193084717, + "learning_rate": 1.8871566941532182e-05, + "loss": 1.0934, + "step": 4595 + }, + { + "epoch": 0.8219422853569195, + "grad_norm": 0.48106008768081665, + "learning_rate": 1.8689601101869604e-05, + "loss": 1.0789, + "step": 4600 + }, + { + "epoch": 0.8228357008844813, + "grad_norm": 0.5558092594146729, + "learning_rate": 1.8508426330632933e-05, + "loss": 1.0921, + "step": 4605 + }, + { + "epoch": 0.8237291164120433, + "grad_norm": 0.5238335728645325, + "learning_rate": 1.8328044390470478e-05, + "loss": 1.0684, + "step": 4610 + }, + { + "epoch": 0.8246225319396051, + "grad_norm": 0.5261583924293518, + "learning_rate": 1.8148457036317157e-05, + "loss": 1.1005, + "step": 4615 + }, + { + "epoch": 0.8255159474671669, + "grad_norm": 0.5787078738212585, + "learning_rate": 1.796966601537734e-05, + "loss": 1.0957, + "step": 4620 + }, + { + "epoch": 0.8264093629947289, + "grad_norm": 0.5445564985275269, + "learning_rate": 1.7791673067107927e-05, + "loss": 1.1059, + "step": 4625 + }, + { + "epoch": 0.8273027785222907, + "grad_norm": 0.4829619228839874, + "learning_rate": 1.7614479923201333e-05, + "loss": 1.0794, + "step": 4630 + }, + { + "epoch": 0.8281961940498526, + "grad_norm": 0.5207480192184448, + "learning_rate": 1.7438088307568667e-05, + "loss": 1.0725, + "step": 4635 + }, + { + "epoch": 0.8290896095774145, + "grad_norm": 0.5238359570503235, + "learning_rate": 1.7262499936322997e-05, + "loss": 1.1395, + "step": 4640 + }, + { + "epoch": 0.8299830251049763, + "grad_norm": 0.5396834015846252, + "learning_rate": 1.708771651776263e-05, + "loss": 1.1148, + "step": 4645 + }, + { + "epoch": 0.8308764406325382, + "grad_norm": 0.5577458143234253, + "learning_rate": 1.6913739752354464e-05, + "loss": 1.104, + "step": 4650 + }, + { + "epoch": 0.8317698561601, + "grad_norm": 0.5378464460372925, + "learning_rate": 1.6740571332717558e-05, + "loss": 1.0964, + "step": 4655 + }, + { + "epoch": 0.832663271687662, + "grad_norm": 0.5314394235610962, + "learning_rate": 1.6568212943606465e-05, + "loss": 1.1034, + "step": 4660 + }, + { + "epoch": 0.8335566872152238, + "grad_norm": 0.519146203994751, + "learning_rate": 1.6396666261895034e-05, + "loss": 1.076, + "step": 4665 + }, + { + "epoch": 0.8344501027427856, + "grad_norm": 0.5421656966209412, + "learning_rate": 1.6225932956559943e-05, + "loss": 1.0792, + "step": 4670 + }, + { + "epoch": 0.8353435182703476, + "grad_norm": 0.5136523842811584, + "learning_rate": 1.6056014688664656e-05, + "loss": 1.1073, + "step": 4675 + }, + { + "epoch": 0.8362369337979094, + "grad_norm": 0.5214201807975769, + "learning_rate": 1.5886913111343037e-05, + "loss": 1.0946, + "step": 4680 + }, + { + "epoch": 0.8371303493254713, + "grad_norm": 0.5285434722900391, + "learning_rate": 1.571862986978342e-05, + "loss": 1.1, + "step": 4685 + }, + { + "epoch": 0.8380237648530331, + "grad_norm": 0.5122743844985962, + "learning_rate": 1.555116660121253e-05, + "loss": 1.1146, + "step": 4690 + }, + { + "epoch": 0.838917180380595, + "grad_norm": 0.5290825963020325, + "learning_rate": 1.538452493487956e-05, + "loss": 1.1022, + "step": 4695 + }, + { + "epoch": 0.8398105959081569, + "grad_norm": 0.5569957494735718, + "learning_rate": 1.5218706492040435e-05, + "loss": 1.068, + "step": 4700 + }, + { + "epoch": 0.8407040114357187, + "grad_norm": 0.5273507833480835, + "learning_rate": 1.5053712885941862e-05, + "loss": 1.0983, + "step": 4705 + }, + { + "epoch": 0.8415974269632807, + "grad_norm": 0.5313451886177063, + "learning_rate": 1.4889545721805687e-05, + "loss": 1.051, + "step": 4710 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 0.5266393423080444, + "learning_rate": 1.4726206596813363e-05, + "loss": 1.1074, + "step": 4715 + }, + { + "epoch": 0.8433842580184043, + "grad_norm": 0.5717965364456177, + "learning_rate": 1.456369710009038e-05, + "loss": 1.1266, + "step": 4720 + }, + { + "epoch": 0.8442776735459663, + "grad_norm": 0.4887787699699402, + "learning_rate": 1.4402018812690721e-05, + "loss": 1.0694, + "step": 4725 + }, + { + "epoch": 0.8451710890735281, + "grad_norm": 0.5646076202392578, + "learning_rate": 1.4241173307581558e-05, + "loss": 1.0688, + "step": 4730 + }, + { + "epoch": 0.84606450460109, + "grad_norm": 0.5532607436180115, + "learning_rate": 1.4081162149627936e-05, + "loss": 1.0364, + "step": 4735 + }, + { + "epoch": 0.8469579201286518, + "grad_norm": 0.5489101409912109, + "learning_rate": 1.39219868955775e-05, + "loss": 1.0935, + "step": 4740 + }, + { + "epoch": 0.8478513356562137, + "grad_norm": 0.48812875151634216, + "learning_rate": 1.3763649094045483e-05, + "loss": 1.0678, + "step": 4745 + }, + { + "epoch": 0.8487447511837756, + "grad_norm": 0.4934766888618469, + "learning_rate": 1.3606150285499475e-05, + "loss": 1.0741, + "step": 4750 + }, + { + "epoch": 0.8496381667113374, + "grad_norm": 0.5471161603927612, + "learning_rate": 1.3449492002244502e-05, + "loss": 1.0799, + "step": 4755 + }, + { + "epoch": 0.8505315822388994, + "grad_norm": 0.520369827747345, + "learning_rate": 1.329367576840812e-05, + "loss": 1.0963, + "step": 4760 + }, + { + "epoch": 0.8514249977664612, + "grad_norm": 0.5638504028320312, + "learning_rate": 1.3138703099925676e-05, + "loss": 1.0745, + "step": 4765 + }, + { + "epoch": 0.852318413294023, + "grad_norm": 0.5431874990463257, + "learning_rate": 1.2984575504525376e-05, + "loss": 1.1035, + "step": 4770 + }, + { + "epoch": 0.853211828821585, + "grad_norm": 0.49290958046913147, + "learning_rate": 1.2831294481713763e-05, + "loss": 1.0742, + "step": 4775 + }, + { + "epoch": 0.8541052443491468, + "grad_norm": 0.5271280407905579, + "learning_rate": 1.2678861522761066e-05, + "loss": 1.0823, + "step": 4780 + }, + { + "epoch": 0.8549986598767086, + "grad_norm": 0.5476307272911072, + "learning_rate": 1.2527278110686712e-05, + "loss": 1.0886, + "step": 4785 + }, + { + "epoch": 0.8558920754042705, + "grad_norm": 0.5493554472923279, + "learning_rate": 1.237654572024487e-05, + "loss": 1.109, + "step": 4790 + }, + { + "epoch": 0.8567854909318324, + "grad_norm": 0.5390406847000122, + "learning_rate": 1.2226665817910166e-05, + "loss": 1.0723, + "step": 4795 + }, + { + "epoch": 0.8576789064593943, + "grad_norm": 0.5032559037208557, + "learning_rate": 1.2077639861863365e-05, + "loss": 1.0737, + "step": 4800 + }, + { + "epoch": 0.8585723219869561, + "grad_norm": 0.5364784598350525, + "learning_rate": 1.1929469301977136e-05, + "loss": 1.0947, + "step": 4805 + }, + { + "epoch": 0.859465737514518, + "grad_norm": 0.519612729549408, + "learning_rate": 1.1782155579802034e-05, + "loss": 1.0627, + "step": 4810 + }, + { + "epoch": 0.8603591530420799, + "grad_norm": 0.5701001286506653, + "learning_rate": 1.1635700128552508e-05, + "loss": 1.1162, + "step": 4815 + }, + { + "epoch": 0.8612525685696417, + "grad_norm": 0.5443189740180969, + "learning_rate": 1.1490104373092825e-05, + "loss": 1.0975, + "step": 4820 + }, + { + "epoch": 0.8621459840972036, + "grad_norm": 0.5358694195747375, + "learning_rate": 1.1345369729923271e-05, + "loss": 1.0922, + "step": 4825 + }, + { + "epoch": 0.8630393996247655, + "grad_norm": 0.5254837274551392, + "learning_rate": 1.1201497607166423e-05, + "loss": 1.106, + "step": 4830 + }, + { + "epoch": 0.8639328151523273, + "grad_norm": 0.5468859672546387, + "learning_rate": 1.105848940455334e-05, + "loss": 1.0725, + "step": 4835 + }, + { + "epoch": 0.8648262306798892, + "grad_norm": 0.5670140385627747, + "learning_rate": 1.0916346513410081e-05, + "loss": 1.0736, + "step": 4840 + }, + { + "epoch": 0.8657196462074511, + "grad_norm": 0.5354899168014526, + "learning_rate": 1.0775070316644042e-05, + "loss": 1.0831, + "step": 4845 + }, + { + "epoch": 0.866613061735013, + "grad_norm": 0.5412014126777649, + "learning_rate": 1.0634662188730604e-05, + "loss": 1.1105, + "step": 4850 + }, + { + "epoch": 0.8675064772625748, + "grad_norm": 0.5652616620063782, + "learning_rate": 1.0495123495699588e-05, + "loss": 1.092, + "step": 4855 + }, + { + "epoch": 0.8683998927901367, + "grad_norm": 0.5190427303314209, + "learning_rate": 1.0356455595122239e-05, + "loss": 1.1216, + "step": 4860 + }, + { + "epoch": 0.8692933083176986, + "grad_norm": 0.5157451629638672, + "learning_rate": 1.02186598360978e-05, + "loss": 1.0765, + "step": 4865 + }, + { + "epoch": 0.8701867238452604, + "grad_norm": 0.532561182975769, + "learning_rate": 1.0081737559240445e-05, + "loss": 1.1269, + "step": 4870 + }, + { + "epoch": 0.8710801393728222, + "grad_norm": 0.5337056517601013, + "learning_rate": 9.945690096666249e-06, + "loss": 1.0813, + "step": 4875 + }, + { + "epoch": 0.8719735549003842, + "grad_norm": 0.5260433554649353, + "learning_rate": 9.810518771980225e-06, + "loss": 1.1078, + "step": 4880 + }, + { + "epoch": 0.872866970427946, + "grad_norm": 0.5416473746299744, + "learning_rate": 9.676224900263497e-06, + "loss": 1.1235, + "step": 4885 + }, + { + "epoch": 0.8737603859555079, + "grad_norm": 0.5500181913375854, + "learning_rate": 9.542809788060358e-06, + "loss": 1.1465, + "step": 4890 + }, + { + "epoch": 0.8746538014830698, + "grad_norm": 0.5110030770301819, + "learning_rate": 9.410274733365753e-06, + "loss": 1.0819, + "step": 4895 + }, + { + "epoch": 0.8755472170106317, + "grad_norm": 0.5659358501434326, + "learning_rate": 9.278621025612434e-06, + "loss": 1.0976, + "step": 4900 + }, + { + "epoch": 0.8764406325381935, + "grad_norm": 0.5617207884788513, + "learning_rate": 9.147849945658648e-06, + "loss": 1.0852, + "step": 4905 + }, + { + "epoch": 0.8773340480657554, + "grad_norm": 0.5818383693695068, + "learning_rate": 9.017962765775523e-06, + "loss": 1.1086, + "step": 4910 + }, + { + "epoch": 0.8782274635933173, + "grad_norm": 0.5794780850410461, + "learning_rate": 8.888960749634712e-06, + "loss": 1.0831, + "step": 4915 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 0.5406253933906555, + "learning_rate": 8.760845152296116e-06, + "loss": 1.0816, + "step": 4920 + }, + { + "epoch": 0.8800142946484409, + "grad_norm": 0.5111543536186218, + "learning_rate": 8.63361722019569e-06, + "loss": 1.0647, + "step": 4925 + }, + { + "epoch": 0.8809077101760029, + "grad_norm": 0.5049743056297302, + "learning_rate": 8.507278191133261e-06, + "loss": 1.0666, + "step": 4930 + }, + { + "epoch": 0.8818011257035647, + "grad_norm": 0.5689327716827393, + "learning_rate": 8.38182929426059e-06, + "loss": 1.1112, + "step": 4935 + }, + { + "epoch": 0.8826945412311266, + "grad_norm": 0.5630576014518738, + "learning_rate": 8.257271750069295e-06, + "loss": 1.1307, + "step": 4940 + }, + { + "epoch": 0.8835879567586885, + "grad_norm": 0.518915593624115, + "learning_rate": 8.133606770379055e-06, + "loss": 1.0815, + "step": 4945 + }, + { + "epoch": 0.8844813722862503, + "grad_norm": 0.4844837188720703, + "learning_rate": 8.010835558325735e-06, + "loss": 1.0847, + "step": 4950 + }, + { + "epoch": 0.8853747878138122, + "grad_norm": 0.5525776147842407, + "learning_rate": 7.88895930834983e-06, + "loss": 1.101, + "step": 4955 + }, + { + "epoch": 0.8862682033413741, + "grad_norm": 0.49364981055259705, + "learning_rate": 7.767979206184694e-06, + "loss": 1.0628, + "step": 4960 + }, + { + "epoch": 0.887161618868936, + "grad_norm": 0.5372298359870911, + "learning_rate": 7.64789642884508e-06, + "loss": 1.0762, + "step": 4965 + }, + { + "epoch": 0.8880550343964978, + "grad_norm": 0.5336244106292725, + "learning_rate": 7.528712144615679e-06, + "loss": 1.051, + "step": 4970 + }, + { + "epoch": 0.8889484499240596, + "grad_norm": 0.510352611541748, + "learning_rate": 7.4104275130397085e-06, + "loss": 1.0813, + "step": 4975 + }, + { + "epoch": 0.8898418654516216, + "grad_norm": 0.5422449707984924, + "learning_rate": 7.29304368490773e-06, + "loss": 1.0806, + "step": 4980 + }, + { + "epoch": 0.8907352809791834, + "grad_norm": 0.5817996263504028, + "learning_rate": 7.176561802246373e-06, + "loss": 1.1025, + "step": 4985 + }, + { + "epoch": 0.8916286965067453, + "grad_norm": 0.5407490730285645, + "learning_rate": 7.06098299830722e-06, + "loss": 1.0885, + "step": 4990 + }, + { + "epoch": 0.8925221120343072, + "grad_norm": 0.5575851202011108, + "learning_rate": 6.946308397555823e-06, + "loss": 1.135, + "step": 4995 + }, + { + "epoch": 0.893415527561869, + "grad_norm": 0.5409137010574341, + "learning_rate": 6.832539115660752e-06, + "loss": 1.0889, + "step": 5000 + }, + { + "epoch": 0.8943089430894309, + "grad_norm": 0.515312910079956, + "learning_rate": 6.719676259482721e-06, + "loss": 1.0601, + "step": 5005 + }, + { + "epoch": 0.8952023586169927, + "grad_norm": 0.5435192584991455, + "learning_rate": 6.607720927063843e-06, + "loss": 1.0978, + "step": 5010 + }, + { + "epoch": 0.8960957741445547, + "grad_norm": 0.5604092478752136, + "learning_rate": 6.496674207616926e-06, + "loss": 1.0699, + "step": 5015 + }, + { + "epoch": 0.8969891896721165, + "grad_norm": 0.5125716924667358, + "learning_rate": 6.386537181514896e-06, + "loss": 1.0807, + "step": 5020 + }, + { + "epoch": 0.8978826051996783, + "grad_norm": 0.5005190372467041, + "learning_rate": 6.277310920280299e-06, + "loss": 1.066, + "step": 5025 + }, + { + "epoch": 0.8987760207272403, + "grad_norm": 0.5269173979759216, + "learning_rate": 6.1689964865748185e-06, + "loss": 1.0719, + "step": 5030 + }, + { + "epoch": 0.8996694362548021, + "grad_norm": 0.5614027380943298, + "learning_rate": 6.061594934188985e-06, + "loss": 1.0712, + "step": 5035 + }, + { + "epoch": 0.900562851782364, + "grad_norm": 0.5723266005516052, + "learning_rate": 5.955107308031915e-06, + "loss": 1.0802, + "step": 5040 + }, + { + "epoch": 0.9014562673099259, + "grad_norm": 0.5639398694038391, + "learning_rate": 5.849534644121146e-06, + "loss": 1.0757, + "step": 5045 + }, + { + "epoch": 0.9023496828374877, + "grad_norm": 0.522843599319458, + "learning_rate": 5.744877969572537e-06, + "loss": 1.0538, + "step": 5050 + }, + { + "epoch": 0.9032430983650496, + "grad_norm": 0.5338366031646729, + "learning_rate": 5.6411383025903205e-06, + "loss": 1.0957, + "step": 5055 + }, + { + "epoch": 0.9041365138926114, + "grad_norm": 0.55523681640625, + "learning_rate": 5.538316652457121e-06, + "loss": 1.0752, + "step": 5060 + }, + { + "epoch": 0.9050299294201734, + "grad_norm": 0.5244052410125732, + "learning_rate": 5.436414019524216e-06, + "loss": 1.1156, + "step": 5065 + }, + { + "epoch": 0.9059233449477352, + "grad_norm": 0.5795427560806274, + "learning_rate": 5.335431395201784e-06, + "loss": 1.0806, + "step": 5070 + }, + { + "epoch": 0.906816760475297, + "grad_norm": 0.5322124361991882, + "learning_rate": 5.235369761949216e-06, + "loss": 1.0903, + "step": 5075 + }, + { + "epoch": 0.907710176002859, + "grad_norm": 0.5396597981452942, + "learning_rate": 5.136230093265593e-06, + "loss": 1.0639, + "step": 5080 + }, + { + "epoch": 0.9086035915304208, + "grad_norm": 0.5385281443595886, + "learning_rate": 5.038013353680204e-06, + "loss": 1.1, + "step": 5085 + }, + { + "epoch": 0.9094970070579826, + "grad_norm": 0.5123850107192993, + "learning_rate": 4.940720498743179e-06, + "loss": 1.1072, + "step": 5090 + }, + { + "epoch": 0.9103904225855446, + "grad_norm": 0.4937289357185364, + "learning_rate": 4.8443524750161676e-06, + "loss": 1.1047, + "step": 5095 + }, + { + "epoch": 0.9112838381131064, + "grad_norm": 0.5448057055473328, + "learning_rate": 4.74891022006313e-06, + "loss": 1.091, + "step": 5100 + }, + { + "epoch": 0.9121772536406683, + "grad_norm": 0.5475412011146545, + "learning_rate": 4.654394662441264e-06, + "loss": 1.0841, + "step": 5105 + }, + { + "epoch": 0.9130706691682301, + "grad_norm": 0.5547966361045837, + "learning_rate": 4.560806721691913e-06, + "loss": 1.089, + "step": 5110 + }, + { + "epoch": 0.913964084695792, + "grad_norm": 0.5370575785636902, + "learning_rate": 4.468147308331605e-06, + "loss": 1.1023, + "step": 5115 + }, + { + "epoch": 0.9148575002233539, + "grad_norm": 0.5097227096557617, + "learning_rate": 4.376417323843318e-06, + "loss": 1.0737, + "step": 5120 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 0.5403570532798767, + "learning_rate": 4.28561766066754e-06, + "loss": 1.1012, + "step": 5125 + }, + { + "epoch": 0.9166443312784777, + "grad_norm": 0.5509471893310547, + "learning_rate": 4.195749202193699e-06, + "loss": 1.091, + "step": 5130 + }, + { + "epoch": 0.9175377468060395, + "grad_norm": 0.5664262175559998, + "learning_rate": 4.106812822751538e-06, + "loss": 1.0778, + "step": 5135 + }, + { + "epoch": 0.9184311623336013, + "grad_norm": 0.5238862633705139, + "learning_rate": 4.018809387602596e-06, + "loss": 1.087, + "step": 5140 + }, + { + "epoch": 0.9193245778611632, + "grad_norm": 0.5326477289199829, + "learning_rate": 3.931739752931829e-06, + "loss": 1.0912, + "step": 5145 + }, + { + "epoch": 0.9202179933887251, + "grad_norm": 0.5700786709785461, + "learning_rate": 3.845604765839228e-06, + "loss": 1.1111, + "step": 5150 + }, + { + "epoch": 0.921111408916287, + "grad_norm": 0.5622351765632629, + "learning_rate": 3.760405264331612e-06, + "loss": 1.0649, + "step": 5155 + }, + { + "epoch": 0.9220048244438488, + "grad_norm": 0.5550172328948975, + "learning_rate": 3.676142077314448e-06, + "loss": 1.0898, + "step": 5160 + }, + { + "epoch": 0.9228982399714107, + "grad_norm": 0.5153648853302002, + "learning_rate": 3.592816024583856e-06, + "loss": 1.0997, + "step": 5165 + }, + { + "epoch": 0.9237916554989726, + "grad_norm": 0.5297582745552063, + "learning_rate": 3.510427916818526e-06, + "loss": 1.0909, + "step": 5170 + }, + { + "epoch": 0.9246850710265344, + "grad_norm": 0.550730288028717, + "learning_rate": 3.4289785555719157e-06, + "loss": 1.1237, + "step": 5175 + }, + { + "epoch": 0.9255784865540964, + "grad_norm": 0.5354406237602234, + "learning_rate": 3.348468733264398e-06, + "loss": 1.058, + "step": 5180 + }, + { + "epoch": 0.9264719020816582, + "grad_norm": 0.5400915145874023, + "learning_rate": 3.268899233175604e-06, + "loss": 1.0929, + "step": 5185 + }, + { + "epoch": 0.92736531760922, + "grad_norm": 0.5201057195663452, + "learning_rate": 3.19027082943677e-06, + "loss": 1.0704, + "step": 5190 + }, + { + "epoch": 0.9282587331367819, + "grad_norm": 0.5121946930885315, + "learning_rate": 3.1125842870232014e-06, + "loss": 1.1248, + "step": 5195 + }, + { + "epoch": 0.9291521486643438, + "grad_norm": 0.5835002064704895, + "learning_rate": 3.0358403617468446e-06, + "loss": 1.14, + "step": 5200 + }, + { + "epoch": 0.9300455641919056, + "grad_norm": 0.5196322798728943, + "learning_rate": 2.960039800248915e-06, + "loss": 1.0889, + "step": 5205 + }, + { + "epoch": 0.9309389797194675, + "grad_norm": 0.4990968108177185, + "learning_rate": 2.885183339992692e-06, + "loss": 1.0883, + "step": 5210 + }, + { + "epoch": 0.9318323952470294, + "grad_norm": 0.5504352450370789, + "learning_rate": 2.8112717092562358e-06, + "loss": 1.0471, + "step": 5215 + }, + { + "epoch": 0.9327258107745913, + "grad_norm": 0.5271458625793457, + "learning_rate": 2.738305627125415e-06, + "loss": 1.0967, + "step": 5220 + }, + { + "epoch": 0.9336192263021531, + "grad_norm": 0.5700637698173523, + "learning_rate": 2.6662858034868454e-06, + "loss": 1.1138, + "step": 5225 + }, + { + "epoch": 0.934512641829715, + "grad_norm": 0.502876341342926, + "learning_rate": 2.5952129390209854e-06, + "loss": 1.0897, + "step": 5230 + }, + { + "epoch": 0.9354060573572769, + "grad_norm": 0.5170863270759583, + "learning_rate": 2.525087725195352e-06, + "loss": 1.0711, + "step": 5235 + }, + { + "epoch": 0.9362994728848387, + "grad_norm": 0.5183836817741394, + "learning_rate": 2.4559108442577585e-06, + "loss": 1.0735, + "step": 5240 + }, + { + "epoch": 0.9371928884124006, + "grad_norm": 0.5455399751663208, + "learning_rate": 2.38768296922971e-06, + "loss": 1.0648, + "step": 5245 + }, + { + "epoch": 0.9380863039399625, + "grad_norm": 0.47233831882476807, + "learning_rate": 2.3204047638998195e-06, + "loss": 1.0621, + "step": 5250 + }, + { + "epoch": 0.9389797194675243, + "grad_norm": 0.5074595808982849, + "learning_rate": 2.2540768828173795e-06, + "loss": 1.0736, + "step": 5255 + }, + { + "epoch": 0.9398731349950862, + "grad_norm": 0.5757531523704529, + "learning_rate": 2.1886999712860014e-06, + "loss": 1.1304, + "step": 5260 + }, + { + "epoch": 0.9407665505226481, + "grad_norm": 0.5505633354187012, + "learning_rate": 2.1242746653572845e-06, + "loss": 1.1001, + "step": 5265 + }, + { + "epoch": 0.94165996605021, + "grad_norm": 0.5664975047111511, + "learning_rate": 2.060801591824668e-06, + "loss": 1.1198, + "step": 5270 + }, + { + "epoch": 0.9425533815777718, + "grad_norm": 0.5190711617469788, + "learning_rate": 1.9982813682173586e-06, + "loss": 1.0804, + "step": 5275 + }, + { + "epoch": 0.9434467971053337, + "grad_norm": 0.5266507863998413, + "learning_rate": 1.936714602794254e-06, + "loss": 1.0819, + "step": 5280 + }, + { + "epoch": 0.9443402126328956, + "grad_norm": 0.5362054705619812, + "learning_rate": 1.8761018945380849e-06, + "loss": 1.0607, + "step": 5285 + }, + { + "epoch": 0.9452336281604574, + "grad_norm": 0.50426185131073, + "learning_rate": 1.8164438331495614e-06, + "loss": 1.1005, + "step": 5290 + }, + { + "epoch": 0.9461270436880193, + "grad_norm": 0.5616719722747803, + "learning_rate": 1.7577409990416237e-06, + "loss": 1.0772, + "step": 5295 + }, + { + "epoch": 0.9470204592155812, + "grad_norm": 0.5646582841873169, + "learning_rate": 1.6999939633338236e-06, + "loss": 1.1123, + "step": 5300 + }, + { + "epoch": 0.947913874743143, + "grad_norm": 0.5808379650115967, + "learning_rate": 1.6432032878467729e-06, + "loss": 1.0763, + "step": 5305 + }, + { + "epoch": 0.9488072902707049, + "grad_norm": 0.51576167345047, + "learning_rate": 1.587369525096627e-06, + "loss": 1.0801, + "step": 5310 + }, + { + "epoch": 0.9497007057982668, + "grad_norm": 0.5430741906166077, + "learning_rate": 1.5324932182897656e-06, + "loss": 1.0978, + "step": 5315 + }, + { + "epoch": 0.9505941213258287, + "grad_norm": 0.5385932922363281, + "learning_rate": 1.4785749013174754e-06, + "loss": 1.0807, + "step": 5320 + }, + { + "epoch": 0.9514875368533905, + "grad_norm": 0.5400928854942322, + "learning_rate": 1.4256150987507544e-06, + "loss": 1.1189, + "step": 5325 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.5451071262359619, + "learning_rate": 1.3736143258352707e-06, + "loss": 1.073, + "step": 5330 + }, + { + "epoch": 0.9532743679085143, + "grad_norm": 0.5353686213493347, + "learning_rate": 1.322573088486212e-06, + "loss": 1.1045, + "step": 5335 + }, + { + "epoch": 0.9541677834360761, + "grad_norm": 0.50026935338974, + "learning_rate": 1.272491883283533e-06, + "loss": 1.1046, + "step": 5340 + }, + { + "epoch": 0.955061198963638, + "grad_norm": 0.5251268148422241, + "learning_rate": 1.2233711974669714e-06, + "loss": 1.0887, + "step": 5345 + }, + { + "epoch": 0.9559546144911999, + "grad_norm": 0.5508390069007874, + "learning_rate": 1.1752115089314398e-06, + "loss": 1.1005, + "step": 5350 + }, + { + "epoch": 0.9568480300187617, + "grad_norm": 0.5888670086860657, + "learning_rate": 1.1280132862222737e-06, + "loss": 1.095, + "step": 5355 + }, + { + "epoch": 0.9577414455463236, + "grad_norm": 0.5448806285858154, + "learning_rate": 1.081776988530725e-06, + "loss": 1.094, + "step": 5360 + }, + { + "epoch": 0.9586348610738855, + "grad_norm": 0.5379595160484314, + "learning_rate": 1.0365030656894759e-06, + "loss": 1.1017, + "step": 5365 + }, + { + "epoch": 0.9595282766014473, + "grad_norm": 0.520027756690979, + "learning_rate": 9.921919581682759e-07, + "loss": 1.0904, + "step": 5370 + }, + { + "epoch": 0.9604216921290092, + "grad_norm": 0.5289562940597534, + "learning_rate": 9.488440970696566e-07, + "loss": 1.1229, + "step": 5375 + }, + { + "epoch": 0.961315107656571, + "grad_norm": 0.5180892944335938, + "learning_rate": 9.064599041247124e-07, + "loss": 1.1399, + "step": 5380 + }, + { + "epoch": 0.962208523184133, + "grad_norm": 0.5140677094459534, + "learning_rate": 8.650397916890263e-07, + "loss": 1.0734, + "step": 5385 + }, + { + "epoch": 0.9631019387116948, + "grad_norm": 0.5384522080421448, + "learning_rate": 8.245841627386397e-07, + "loss": 1.1054, + "step": 5390 + }, + { + "epoch": 0.9639953542392566, + "grad_norm": 0.5593591928482056, + "learning_rate": 7.850934108661556e-07, + "loss": 1.0937, + "step": 5395 + }, + { + "epoch": 0.9648887697668186, + "grad_norm": 0.5436766743659973, + "learning_rate": 7.465679202768749e-07, + "loss": 1.1055, + "step": 5400 + }, + { + "epoch": 0.9657821852943804, + "grad_norm": 0.47669553756713867, + "learning_rate": 7.090080657850884e-07, + "loss": 1.0633, + "step": 5405 + }, + { + "epoch": 0.9666756008219423, + "grad_norm": 0.49469995498657227, + "learning_rate": 6.724142128104239e-07, + "loss": 1.0978, + "step": 5410 + }, + { + "epoch": 0.9675690163495042, + "grad_norm": 0.565022349357605, + "learning_rate": 6.367867173742603e-07, + "loss": 1.0757, + "step": 5415 + }, + { + "epoch": 0.968462431877066, + "grad_norm": 0.5273885726928711, + "learning_rate": 6.021259260963085e-07, + "loss": 1.0793, + "step": 5420 + }, + { + "epoch": 0.9693558474046279, + "grad_norm": 0.47813916206359863, + "learning_rate": 5.684321761912247e-07, + "loss": 1.0439, + "step": 5425 + }, + { + "epoch": 0.9702492629321897, + "grad_norm": 0.4988810122013092, + "learning_rate": 5.357057954653244e-07, + "loss": 1.0706, + "step": 5430 + }, + { + "epoch": 0.9711426784597517, + "grad_norm": 0.5139685869216919, + "learning_rate": 5.039471023133624e-07, + "loss": 1.1125, + "step": 5435 + }, + { + "epoch": 0.9720360939873135, + "grad_norm": 0.5338658690452576, + "learning_rate": 4.7315640571550246e-07, + "loss": 1.1047, + "step": 5440 + }, + { + "epoch": 0.9729295095148753, + "grad_norm": 0.49352288246154785, + "learning_rate": 4.433340052342749e-07, + "loss": 1.0723, + "step": 5445 + }, + { + "epoch": 0.9738229250424373, + "grad_norm": 0.545481264591217, + "learning_rate": 4.1448019101163473e-07, + "loss": 1.0697, + "step": 5450 + }, + { + "epoch": 0.9747163405699991, + "grad_norm": 0.5542700886726379, + "learning_rate": 3.865952437661968e-07, + "loss": 1.1145, + "step": 5455 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.5232776999473572, + "learning_rate": 3.5967943479043867e-07, + "loss": 1.1051, + "step": 5460 + }, + { + "epoch": 0.9765031716251228, + "grad_norm": 0.5434113144874573, + "learning_rate": 3.3373302594814637e-07, + "loss": 1.1001, + "step": 5465 + }, + { + "epoch": 0.9773965871526847, + "grad_norm": 0.5470346212387085, + "learning_rate": 3.0875626967176165e-07, + "loss": 1.1008, + "step": 5470 + }, + { + "epoch": 0.9782900026802466, + "grad_norm": 0.5273741483688354, + "learning_rate": 2.8474940896003887e-07, + "loss": 1.1393, + "step": 5475 + }, + { + "epoch": 0.9791834182078084, + "grad_norm": 0.5102143287658691, + "learning_rate": 2.617126773755696e-07, + "loss": 1.0718, + "step": 5480 + }, + { + "epoch": 0.9800768337353704, + "grad_norm": 0.5177452564239502, + "learning_rate": 2.3964629904259514e-07, + "loss": 1.1154, + "step": 5485 + }, + { + "epoch": 0.9809702492629322, + "grad_norm": 0.5169101357460022, + "learning_rate": 2.1855048864479754e-07, + "loss": 1.0984, + "step": 5490 + }, + { + "epoch": 0.981863664790494, + "grad_norm": 0.5187830924987793, + "learning_rate": 1.984254514232009e-07, + "loss": 1.0837, + "step": 5495 + }, + { + "epoch": 0.982757080318056, + "grad_norm": 0.5481850504875183, + "learning_rate": 1.7927138317417324e-07, + "loss": 1.0616, + "step": 5500 + }, + { + "epoch": 0.9836504958456178, + "grad_norm": 0.5242062211036682, + "learning_rate": 1.6108847024755015e-07, + "loss": 1.1152, + "step": 5505 + }, + { + "epoch": 0.9845439113731796, + "grad_norm": 0.5562185645103455, + "learning_rate": 1.4387688954478063e-07, + "loss": 1.1404, + "step": 5510 + }, + { + "epoch": 0.9854373269007415, + "grad_norm": 0.490465372800827, + "learning_rate": 1.276368085172397e-07, + "loss": 1.0926, + "step": 5515 + }, + { + "epoch": 0.9863307424283034, + "grad_norm": 0.5415327548980713, + "learning_rate": 1.1236838516459625e-07, + "loss": 1.0913, + "step": 5520 + }, + { + "epoch": 0.9872241579558653, + "grad_norm": 0.536961555480957, + "learning_rate": 9.807176803325879e-08, + "loss": 1.1042, + "step": 5525 + }, + { + "epoch": 0.9881175734834271, + "grad_norm": 0.5555692315101624, + "learning_rate": 8.474709621492105e-08, + "loss": 1.106, + "step": 5530 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 0.5068987011909485, + "learning_rate": 7.239449934525189e-08, + "loss": 1.0631, + "step": 5535 + }, + { + "epoch": 0.9899044045385509, + "grad_norm": 0.5715160369873047, + "learning_rate": 6.101409760260746e-08, + "loss": 1.0986, + "step": 5540 + }, + { + "epoch": 0.9907978200661127, + "grad_norm": 0.5271539092063904, + "learning_rate": 5.0606001706843264e-08, + "loss": 1.0638, + "step": 5545 + }, + { + "epoch": 0.9916912355936747, + "grad_norm": 0.580915093421936, + "learning_rate": 4.1170312918259456e-08, + "loss": 1.0884, + "step": 5550 + }, + { + "epoch": 0.9925846511212365, + "grad_norm": 0.5607919692993164, + "learning_rate": 3.2707123036646026e-08, + "loss": 1.0791, + "step": 5555 + }, + { + "epoch": 0.9934780666487983, + "grad_norm": 0.5168312788009644, + "learning_rate": 2.5216514400305813e-08, + "loss": 1.1157, + "step": 5560 + }, + { + "epoch": 0.9943714821763602, + "grad_norm": 0.5568081736564636, + "learning_rate": 1.869855988534397e-08, + "loss": 1.0988, + "step": 5565 + }, + { + "epoch": 0.9952648977039221, + "grad_norm": 0.5483577251434326, + "learning_rate": 1.31533229049019e-08, + "loss": 1.1063, + "step": 5570 + }, + { + "epoch": 0.996158313231484, + "grad_norm": 0.5298967957496643, + "learning_rate": 8.580857408546639e-09, + "loss": 1.0713, + "step": 5575 + }, + { + "epoch": 0.9970517287590458, + "grad_norm": 0.5338754057884216, + "learning_rate": 4.9812078817934596e-09, + "loss": 1.0486, + "step": 5580 + }, + { + "epoch": 0.9979451442866077, + "grad_norm": 0.5403038263320923, + "learning_rate": 2.3544093455951654e-09, + "loss": 1.063, + "step": 5585 + }, + { + "epoch": 0.9988385598141696, + "grad_norm": 0.5284872055053711, + "learning_rate": 7.004873560645387e-10, + "loss": 1.099, + "step": 5590 + }, + { + "epoch": 0.9997319753417314, + "grad_norm": 0.5430343747138977, + "learning_rate": 1.9458004196781787e-11, + "loss": 1.0962, + "step": 5595 + }, + { + "epoch": 0.9999106584472438, + "eval_loss": 1.0571058988571167, + "eval_runtime": 871.0685, + "eval_samples_per_second": 5.121, + "eval_steps_per_second": 0.641, + "step": 5596 + }, + { + "epoch": 0.9999106584472438, + "step": 5596, + "total_flos": 4.151601388859687e+18, + "train_loss": 1.2674157931559251, + "train_runtime": 31909.1388, + "train_samples_per_second": 1.403, + "train_steps_per_second": 0.175 + } + ], + "logging_steps": 5, + "max_steps": 5596, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.151601388859687e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}