{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9943714821763603, "eval_steps": 500, "global_step": 798, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00375234521575985, "grad_norm": 1.1422045632346107, "learning_rate": 2.5e-06, "loss": 1.4765, "step": 1 }, { "epoch": 0.0075046904315197, "grad_norm": 1.10606272348653, "learning_rate": 5e-06, "loss": 1.4679, "step": 2 }, { "epoch": 0.01125703564727955, "grad_norm": 1.124285036588602, "learning_rate": 7.5e-06, "loss": 1.4926, "step": 3 }, { "epoch": 0.0150093808630394, "grad_norm": 1.175650360755083, "learning_rate": 1e-05, "loss": 1.4946, "step": 4 }, { "epoch": 0.01876172607879925, "grad_norm": 1.1176055565878193, "learning_rate": 1.25e-05, "loss": 1.4803, "step": 5 }, { "epoch": 0.0225140712945591, "grad_norm": 1.080965163228283, "learning_rate": 1.5e-05, "loss": 1.4069, "step": 6 }, { "epoch": 0.02626641651031895, "grad_norm": 1.073199125902437, "learning_rate": 1.75e-05, "loss": 1.4175, "step": 7 }, { "epoch": 0.0300187617260788, "grad_norm": 0.9895651667655394, "learning_rate": 2e-05, "loss": 1.3952, "step": 8 }, { "epoch": 0.03377110694183865, "grad_norm": 0.8484409515335725, "learning_rate": 2.25e-05, "loss": 1.3084, "step": 9 }, { "epoch": 0.0375234521575985, "grad_norm": 0.656224580389129, "learning_rate": 2.5e-05, "loss": 1.2224, "step": 10 }, { "epoch": 0.04127579737335835, "grad_norm": 0.6681802871972625, "learning_rate": 2.7500000000000004e-05, "loss": 1.2279, "step": 11 }, { "epoch": 0.0450281425891182, "grad_norm": 0.6445930931164492, "learning_rate": 3e-05, "loss": 1.1869, "step": 12 }, { "epoch": 0.04878048780487805, "grad_norm": 0.6774830464098534, "learning_rate": 3.2500000000000004e-05, "loss": 1.1345, "step": 13 }, { "epoch": 0.0525328330206379, "grad_norm": 0.7129957171173121, "learning_rate": 3.5e-05, "loss": 1.0342, "step": 14 }, { "epoch": 0.05628517823639775, "grad_norm": 0.6988046692034513, "learning_rate": 3.7500000000000003e-05, "loss": 0.9683, "step": 15 }, { "epoch": 0.0600375234521576, "grad_norm": 0.7305746200421179, "learning_rate": 4e-05, "loss": 0.8998, "step": 16 }, { "epoch": 0.06378986866791744, "grad_norm": 0.6303366234907746, "learning_rate": 4.25e-05, "loss": 0.8585, "step": 17 }, { "epoch": 0.0675422138836773, "grad_norm": 0.6262466336688131, "learning_rate": 4.5e-05, "loss": 0.7913, "step": 18 }, { "epoch": 0.07129455909943715, "grad_norm": 0.5934168135285741, "learning_rate": 4.75e-05, "loss": 0.7358, "step": 19 }, { "epoch": 0.075046904315197, "grad_norm": 0.5003901957180881, "learning_rate": 5e-05, "loss": 0.6762, "step": 20 }, { "epoch": 0.07879924953095685, "grad_norm": 0.44247641980699626, "learning_rate": 5.25e-05, "loss": 0.6412, "step": 21 }, { "epoch": 0.0825515947467167, "grad_norm": 0.33108999413889184, "learning_rate": 5.500000000000001e-05, "loss": 0.6021, "step": 22 }, { "epoch": 0.08630393996247655, "grad_norm": 0.30987297699741684, "learning_rate": 5.7499999999999995e-05, "loss": 0.5678, "step": 23 }, { "epoch": 0.0900562851782364, "grad_norm": 0.2879383883871797, "learning_rate": 6e-05, "loss": 0.5653, "step": 24 }, { "epoch": 0.09380863039399624, "grad_norm": 0.4921785771111378, "learning_rate": 6.25e-05, "loss": 0.5397, "step": 25 }, { "epoch": 0.0975609756097561, "grad_norm": 0.23455468567206647, "learning_rate": 6.500000000000001e-05, "loss": 0.5392, "step": 26 }, { "epoch": 0.10131332082551595, "grad_norm": 0.21551936996375468, "learning_rate": 6.750000000000001e-05, "loss": 0.5423, "step": 27 }, { "epoch": 0.1050656660412758, "grad_norm": 0.2138475404490417, "learning_rate": 7e-05, "loss": 0.5072, "step": 28 }, { "epoch": 0.10881801125703565, "grad_norm": 0.1981260579789532, "learning_rate": 7.25e-05, "loss": 0.4927, "step": 29 }, { "epoch": 0.1125703564727955, "grad_norm": 0.19766175304738637, "learning_rate": 7.500000000000001e-05, "loss": 0.4992, "step": 30 }, { "epoch": 0.11632270168855535, "grad_norm": 0.16180823154197033, "learning_rate": 7.75e-05, "loss": 0.5078, "step": 31 }, { "epoch": 0.1200750469043152, "grad_norm": 0.15792678361397225, "learning_rate": 8e-05, "loss": 0.4834, "step": 32 }, { "epoch": 0.12382739212007504, "grad_norm": 0.17901823211719936, "learning_rate": 8.25e-05, "loss": 0.5038, "step": 33 }, { "epoch": 0.1275797373358349, "grad_norm": 0.15291985686600748, "learning_rate": 8.5e-05, "loss": 0.463, "step": 34 }, { "epoch": 0.13133208255159476, "grad_norm": 0.1402347205035838, "learning_rate": 8.75e-05, "loss": 0.4678, "step": 35 }, { "epoch": 0.1350844277673546, "grad_norm": 0.1292157193781673, "learning_rate": 9e-05, "loss": 0.48, "step": 36 }, { "epoch": 0.13883677298311445, "grad_norm": 0.12200374600393228, "learning_rate": 9.250000000000001e-05, "loss": 0.4678, "step": 37 }, { "epoch": 0.1425891181988743, "grad_norm": 0.12645974836123272, "learning_rate": 9.5e-05, "loss": 0.4783, "step": 38 }, { "epoch": 0.14634146341463414, "grad_norm": 0.12515993032794973, "learning_rate": 9.75e-05, "loss": 0.4558, "step": 39 }, { "epoch": 0.150093808630394, "grad_norm": 0.1257915818218713, "learning_rate": 0.0001, "loss": 0.4582, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 0.11519718216680118, "learning_rate": 0.0001025, "loss": 0.4433, "step": 41 }, { "epoch": 0.1575984990619137, "grad_norm": 0.11408287464445384, "learning_rate": 0.000105, "loss": 0.4566, "step": 42 }, { "epoch": 0.16135084427767354, "grad_norm": 0.11355997287120467, "learning_rate": 0.0001075, "loss": 0.4601, "step": 43 }, { "epoch": 0.1651031894934334, "grad_norm": 0.1236061343834286, "learning_rate": 0.00011000000000000002, "loss": 0.4279, "step": 44 }, { "epoch": 0.16885553470919323, "grad_norm": 0.11292335583297317, "learning_rate": 0.00011250000000000001, "loss": 0.4242, "step": 45 }, { "epoch": 0.1726078799249531, "grad_norm": 0.10830414207227934, "learning_rate": 0.00011499999999999999, "loss": 0.4392, "step": 46 }, { "epoch": 0.17636022514071295, "grad_norm": 0.1130446420034329, "learning_rate": 0.00011750000000000001, "loss": 0.4175, "step": 47 }, { "epoch": 0.1801125703564728, "grad_norm": 0.10972733489410498, "learning_rate": 0.00012, "loss": 0.4064, "step": 48 }, { "epoch": 0.18386491557223264, "grad_norm": 0.11723858927517143, "learning_rate": 0.00012250000000000002, "loss": 0.4618, "step": 49 }, { "epoch": 0.18761726078799248, "grad_norm": 0.12979793592348535, "learning_rate": 0.000125, "loss": 0.4413, "step": 50 }, { "epoch": 0.19136960600375236, "grad_norm": 0.12190484063649769, "learning_rate": 0.0001275, "loss": 0.4212, "step": 51 }, { "epoch": 0.1951219512195122, "grad_norm": 0.1200977200253699, "learning_rate": 0.00013000000000000002, "loss": 0.4236, "step": 52 }, { "epoch": 0.19887429643527205, "grad_norm": 0.11576799257930886, "learning_rate": 0.0001325, "loss": 0.4328, "step": 53 }, { "epoch": 0.2026266416510319, "grad_norm": 0.11804398873031127, "learning_rate": 0.00013500000000000003, "loss": 0.3906, "step": 54 }, { "epoch": 0.20637898686679174, "grad_norm": 0.11890529087801377, "learning_rate": 0.0001375, "loss": 0.4092, "step": 55 }, { "epoch": 0.2101313320825516, "grad_norm": 0.11537178670561035, "learning_rate": 0.00014, "loss": 0.4026, "step": 56 }, { "epoch": 0.21388367729831145, "grad_norm": 0.11591938376106178, "learning_rate": 0.00014250000000000002, "loss": 0.3678, "step": 57 }, { "epoch": 0.2176360225140713, "grad_norm": 0.12025566814049414, "learning_rate": 0.000145, "loss": 0.3791, "step": 58 }, { "epoch": 0.22138836772983114, "grad_norm": 0.13088656560108905, "learning_rate": 0.0001475, "loss": 0.3906, "step": 59 }, { "epoch": 0.225140712945591, "grad_norm": 0.12366551138693345, "learning_rate": 0.00015000000000000001, "loss": 0.3769, "step": 60 }, { "epoch": 0.22889305816135083, "grad_norm": 0.12338960635908504, "learning_rate": 0.0001525, "loss": 0.3806, "step": 61 }, { "epoch": 0.2326454033771107, "grad_norm": 0.12957742480845902, "learning_rate": 0.000155, "loss": 0.365, "step": 62 }, { "epoch": 0.23639774859287055, "grad_norm": 0.1282017025800552, "learning_rate": 0.0001575, "loss": 0.3637, "step": 63 }, { "epoch": 0.2401500938086304, "grad_norm": 0.12685377163368308, "learning_rate": 0.00016, "loss": 0.3813, "step": 64 }, { "epoch": 0.24390243902439024, "grad_norm": 0.12515445789228635, "learning_rate": 0.00016250000000000002, "loss": 0.3665, "step": 65 }, { "epoch": 0.24765478424015008, "grad_norm": 0.12780302020094111, "learning_rate": 0.000165, "loss": 0.372, "step": 66 }, { "epoch": 0.25140712945590993, "grad_norm": 0.13509915457231544, "learning_rate": 0.0001675, "loss": 0.3964, "step": 67 }, { "epoch": 0.2551594746716698, "grad_norm": 0.1324450895974203, "learning_rate": 0.00017, "loss": 0.3809, "step": 68 }, { "epoch": 0.2589118198874296, "grad_norm": 0.14039471561000108, "learning_rate": 0.00017250000000000002, "loss": 0.3788, "step": 69 }, { "epoch": 0.2626641651031895, "grad_norm": 0.13748884493823293, "learning_rate": 0.000175, "loss": 0.3477, "step": 70 }, { "epoch": 0.26641651031894936, "grad_norm": 0.12981102084999996, "learning_rate": 0.0001775, "loss": 0.3803, "step": 71 }, { "epoch": 0.2701688555347092, "grad_norm": 0.12375391443012415, "learning_rate": 0.00018, "loss": 0.3557, "step": 72 }, { "epoch": 0.27392120075046905, "grad_norm": 0.12792478465226367, "learning_rate": 0.0001825, "loss": 0.351, "step": 73 }, { "epoch": 0.2776735459662289, "grad_norm": 0.1281934594676182, "learning_rate": 0.00018500000000000002, "loss": 0.3662, "step": 74 }, { "epoch": 0.28142589118198874, "grad_norm": 0.13402822886419208, "learning_rate": 0.0001875, "loss": 0.3491, "step": 75 }, { "epoch": 0.2851782363977486, "grad_norm": 0.1292536897601892, "learning_rate": 0.00019, "loss": 0.3415, "step": 76 }, { "epoch": 0.28893058161350843, "grad_norm": 0.16014546584829106, "learning_rate": 0.00019250000000000002, "loss": 0.3493, "step": 77 }, { "epoch": 0.2926829268292683, "grad_norm": 0.1393384528675237, "learning_rate": 0.000195, "loss": 0.3509, "step": 78 }, { "epoch": 0.2964352720450281, "grad_norm": 0.15248843449290234, "learning_rate": 0.00019750000000000003, "loss": 0.3328, "step": 79 }, { "epoch": 0.300187617260788, "grad_norm": 0.1478683373584156, "learning_rate": 0.0002, "loss": 0.339, "step": 80 }, { "epoch": 0.30393996247654786, "grad_norm": 0.1457678828189889, "learning_rate": 0.00019999904276147618, "loss": 0.3536, "step": 81 }, { "epoch": 0.3076923076923077, "grad_norm": 0.15185000879528737, "learning_rate": 0.00019999617106423082, "loss": 0.3529, "step": 82 }, { "epoch": 0.31144465290806755, "grad_norm": 0.15201189365883755, "learning_rate": 0.0001999913849632419, "loss": 0.3548, "step": 83 }, { "epoch": 0.3151969981238274, "grad_norm": 0.14879326753679958, "learning_rate": 0.00019998468455013823, "loss": 0.3264, "step": 84 }, { "epoch": 0.31894934333958724, "grad_norm": 0.14083771591440533, "learning_rate": 0.00019997606995319768, "loss": 0.3331, "step": 85 }, { "epoch": 0.3227016885553471, "grad_norm": 0.1503929432468549, "learning_rate": 0.00019996554133734474, "loss": 0.3282, "step": 86 }, { "epoch": 0.32645403377110693, "grad_norm": 0.14030605779758232, "learning_rate": 0.00019995309890414732, "loss": 0.3216, "step": 87 }, { "epoch": 0.3302063789868668, "grad_norm": 0.13891895714301467, "learning_rate": 0.000199938742891813, "loss": 0.3049, "step": 88 }, { "epoch": 0.3339587242026266, "grad_norm": 0.13742909920708118, "learning_rate": 0.00019992247357518428, "loss": 0.3252, "step": 89 }, { "epoch": 0.33771106941838647, "grad_norm": 0.14398237502236147, "learning_rate": 0.0001999042912657335, "loss": 0.3226, "step": 90 }, { "epoch": 0.34146341463414637, "grad_norm": 0.14292774523614082, "learning_rate": 0.00019988419631155683, "loss": 0.3323, "step": 91 }, { "epoch": 0.3452157598499062, "grad_norm": 0.14529808441186043, "learning_rate": 0.00019986218909736757, "loss": 0.3621, "step": 92 }, { "epoch": 0.34896810506566606, "grad_norm": 0.14363660567228131, "learning_rate": 0.00019983827004448873, "loss": 0.3325, "step": 93 }, { "epoch": 0.3527204502814259, "grad_norm": 0.14053215950288314, "learning_rate": 0.00019981243961084515, "loss": 0.3317, "step": 94 }, { "epoch": 0.35647279549718575, "grad_norm": 0.12839662363868307, "learning_rate": 0.0001997846982909545, "loss": 0.3017, "step": 95 }, { "epoch": 0.3602251407129456, "grad_norm": 0.1421301998134749, "learning_rate": 0.000199755046615918, "loss": 0.3236, "step": 96 }, { "epoch": 0.36397748592870544, "grad_norm": 0.1475029420066679, "learning_rate": 0.00019972348515341016, "loss": 0.3362, "step": 97 }, { "epoch": 0.3677298311444653, "grad_norm": 0.13378279730516257, "learning_rate": 0.00019969001450766794, "loss": 0.3254, "step": 98 }, { "epoch": 0.3714821763602251, "grad_norm": 0.1497261207938794, "learning_rate": 0.0001996546353194792, "loss": 0.3156, "step": 99 }, { "epoch": 0.37523452157598497, "grad_norm": 0.1356839966194173, "learning_rate": 0.00019961734826617035, "loss": 0.3282, "step": 100 }, { "epoch": 0.3789868667917448, "grad_norm": 0.12386942577985954, "learning_rate": 0.0001995781540615934, "loss": 0.3207, "step": 101 }, { "epoch": 0.3827392120075047, "grad_norm": 0.16584604505517364, "learning_rate": 0.0001995370534561125, "loss": 0.3026, "step": 102 }, { "epoch": 0.38649155722326456, "grad_norm": 0.1277560294599099, "learning_rate": 0.0001994940472365893, "loss": 0.322, "step": 103 }, { "epoch": 0.3902439024390244, "grad_norm": 0.13567813426924816, "learning_rate": 0.00019944913622636795, "loss": 0.3232, "step": 104 }, { "epoch": 0.39399624765478425, "grad_norm": 0.12123496832228846, "learning_rate": 0.0001994023212852595, "loss": 0.2972, "step": 105 }, { "epoch": 0.3977485928705441, "grad_norm": 0.13879373741004694, "learning_rate": 0.00019935360330952518, "loss": 0.3005, "step": 106 }, { "epoch": 0.40150093808630394, "grad_norm": 0.1274679949876301, "learning_rate": 0.00019930298323185945, "loss": 0.3119, "step": 107 }, { "epoch": 0.4052532833020638, "grad_norm": 0.13101222758435194, "learning_rate": 0.00019925046202137216, "loss": 0.2939, "step": 108 }, { "epoch": 0.4090056285178236, "grad_norm": 0.12738472548497895, "learning_rate": 0.00019919604068356978, "loss": 0.3093, "step": 109 }, { "epoch": 0.41275797373358347, "grad_norm": 0.1490015817444115, "learning_rate": 0.00019913972026033632, "loss": 0.2844, "step": 110 }, { "epoch": 0.4165103189493433, "grad_norm": 0.1470790264142207, "learning_rate": 0.00019908150182991339, "loss": 0.2872, "step": 111 }, { "epoch": 0.4202626641651032, "grad_norm": 0.12721396486874495, "learning_rate": 0.00019902138650687942, "loss": 0.3043, "step": 112 }, { "epoch": 0.42401500938086306, "grad_norm": 0.13891744298891914, "learning_rate": 0.00019895937544212858, "loss": 0.3009, "step": 113 }, { "epoch": 0.4277673545966229, "grad_norm": 0.134346074178801, "learning_rate": 0.00019889546982284834, "loss": 0.3013, "step": 114 }, { "epoch": 0.43151969981238275, "grad_norm": 0.1379066741076229, "learning_rate": 0.00019882967087249718, "loss": 0.3052, "step": 115 }, { "epoch": 0.4352720450281426, "grad_norm": 0.12972548899740632, "learning_rate": 0.0001987619798507809, "loss": 0.3124, "step": 116 }, { "epoch": 0.43902439024390244, "grad_norm": 0.12813310196115213, "learning_rate": 0.0001986923980536286, "loss": 0.2893, "step": 117 }, { "epoch": 0.4427767354596623, "grad_norm": 0.13797054317394944, "learning_rate": 0.00019862092681316776, "loss": 0.3016, "step": 118 }, { "epoch": 0.44652908067542213, "grad_norm": 0.13780600670778337, "learning_rate": 0.0001985475674976989, "loss": 0.3158, "step": 119 }, { "epoch": 0.450281425891182, "grad_norm": 0.13926178383999727, "learning_rate": 0.0001984723215116693, "loss": 0.2801, "step": 120 }, { "epoch": 0.4540337711069418, "grad_norm": 0.1369353496922525, "learning_rate": 0.00019839519029564605, "loss": 0.305, "step": 121 }, { "epoch": 0.45778611632270166, "grad_norm": 0.13937382639705567, "learning_rate": 0.00019831617532628862, "loss": 0.3176, "step": 122 }, { "epoch": 0.46153846153846156, "grad_norm": 0.14086276027188518, "learning_rate": 0.00019823527811632042, "loss": 0.2879, "step": 123 }, { "epoch": 0.4652908067542214, "grad_norm": 0.13282215800163436, "learning_rate": 0.00019815250021449997, "loss": 0.2996, "step": 124 }, { "epoch": 0.46904315196998125, "grad_norm": 0.12757163326850707, "learning_rate": 0.00019806784320559127, "loss": 0.3006, "step": 125 }, { "epoch": 0.4727954971857411, "grad_norm": 0.14854709123219104, "learning_rate": 0.00019798130871033322, "loss": 0.301, "step": 126 }, { "epoch": 0.47654784240150094, "grad_norm": 0.13087500973091548, "learning_rate": 0.00019789289838540897, "loss": 0.2902, "step": 127 }, { "epoch": 0.4803001876172608, "grad_norm": 0.1433475392806627, "learning_rate": 0.00019780261392341383, "loss": 0.2926, "step": 128 }, { "epoch": 0.48405253283302063, "grad_norm": 0.1341283559656879, "learning_rate": 0.0001977104570528231, "loss": 0.2602, "step": 129 }, { "epoch": 0.4878048780487805, "grad_norm": 0.1607197394251248, "learning_rate": 0.00019761642953795895, "loss": 0.2984, "step": 130 }, { "epoch": 0.4915572232645403, "grad_norm": 0.11856150621760517, "learning_rate": 0.0001975205331789566, "loss": 0.2988, "step": 131 }, { "epoch": 0.49530956848030017, "grad_norm": 0.14014139613661877, "learning_rate": 0.00019742276981172976, "loss": 0.291, "step": 132 }, { "epoch": 0.49906191369606, "grad_norm": 0.12881861735846314, "learning_rate": 0.00019732314130793568, "loss": 0.2971, "step": 133 }, { "epoch": 0.5028142589118199, "grad_norm": 0.11788683351931176, "learning_rate": 0.00019722164957493922, "loss": 0.2766, "step": 134 }, { "epoch": 0.5065666041275797, "grad_norm": 0.13746078706666037, "learning_rate": 0.0001971182965557763, "loss": 0.2886, "step": 135 }, { "epoch": 0.5103189493433395, "grad_norm": 0.12745519285890888, "learning_rate": 0.00019701308422911672, "loss": 0.2963, "step": 136 }, { "epoch": 0.5140712945590994, "grad_norm": 0.11835270726835292, "learning_rate": 0.0001969060146092264, "loss": 0.2995, "step": 137 }, { "epoch": 0.5178236397748592, "grad_norm": 0.14011034379489426, "learning_rate": 0.0001967970897459286, "loss": 0.2881, "step": 138 }, { "epoch": 0.5215759849906192, "grad_norm": 0.13060776440495228, "learning_rate": 0.0001966863117245648, "loss": 0.2765, "step": 139 }, { "epoch": 0.525328330206379, "grad_norm": 0.14161693580554588, "learning_rate": 0.00019657368266595476, "loss": 0.281, "step": 140 }, { "epoch": 0.5290806754221389, "grad_norm": 0.12125364150709082, "learning_rate": 0.00019645920472635608, "loss": 0.2732, "step": 141 }, { "epoch": 0.5328330206378987, "grad_norm": 0.1334127552945295, "learning_rate": 0.00019634288009742255, "loss": 0.2523, "step": 142 }, { "epoch": 0.5365853658536586, "grad_norm": 0.12113573146827264, "learning_rate": 0.0001962247110061625, "loss": 0.2775, "step": 143 }, { "epoch": 0.5403377110694184, "grad_norm": 0.12331032028922699, "learning_rate": 0.00019610469971489608, "loss": 0.2687, "step": 144 }, { "epoch": 0.5440900562851783, "grad_norm": 0.13237586077608754, "learning_rate": 0.00019598284852121188, "loss": 0.2774, "step": 145 }, { "epoch": 0.5478424015009381, "grad_norm": 0.12199880756983131, "learning_rate": 0.0001958591597579231, "loss": 0.2815, "step": 146 }, { "epoch": 0.551594746716698, "grad_norm": 0.11915746795874955, "learning_rate": 0.00019573363579302266, "loss": 0.2558, "step": 147 }, { "epoch": 0.5553470919324578, "grad_norm": 0.11644382804351376, "learning_rate": 0.00019560627902963807, "loss": 0.2951, "step": 148 }, { "epoch": 0.5590994371482176, "grad_norm": 0.1317161794959933, "learning_rate": 0.00019547709190598534, "loss": 0.2629, "step": 149 }, { "epoch": 0.5628517823639775, "grad_norm": 0.13859313218362884, "learning_rate": 0.00019534607689532233, "loss": 0.2884, "step": 150 }, { "epoch": 0.5666041275797373, "grad_norm": 0.1643061756146766, "learning_rate": 0.00019521323650590133, "loss": 0.2932, "step": 151 }, { "epoch": 0.5703564727954972, "grad_norm": 0.12366306539172685, "learning_rate": 0.00019507857328092108, "loss": 0.2861, "step": 152 }, { "epoch": 0.574108818011257, "grad_norm": 0.12624207186548378, "learning_rate": 0.00019494208979847812, "loss": 0.2796, "step": 153 }, { "epoch": 0.5778611632270169, "grad_norm": 0.12237336350000451, "learning_rate": 0.00019480378867151746, "loss": 0.273, "step": 154 }, { "epoch": 0.5816135084427767, "grad_norm": 0.12323433685041912, "learning_rate": 0.00019466367254778233, "loss": 0.2747, "step": 155 }, { "epoch": 0.5853658536585366, "grad_norm": 0.12577598956544817, "learning_rate": 0.0001945217441097638, "loss": 0.2634, "step": 156 }, { "epoch": 0.5891181988742964, "grad_norm": 0.12244570380339517, "learning_rate": 0.00019437800607464932, "loss": 0.2701, "step": 157 }, { "epoch": 0.5928705440900562, "grad_norm": 0.12004670825182381, "learning_rate": 0.00019423246119427043, "loss": 0.2781, "step": 158 }, { "epoch": 0.5966228893058161, "grad_norm": 0.13091796767694497, "learning_rate": 0.00019408511225505056, "loss": 0.2646, "step": 159 }, { "epoch": 0.600375234521576, "grad_norm": 0.11771920694416416, "learning_rate": 0.00019393596207795136, "loss": 0.2795, "step": 160 }, { "epoch": 0.6041275797373359, "grad_norm": 0.12447218651645564, "learning_rate": 0.00019378501351841865, "loss": 0.2767, "step": 161 }, { "epoch": 0.6078799249530957, "grad_norm": 0.11854916742534294, "learning_rate": 0.000193632269466328, "loss": 0.2595, "step": 162 }, { "epoch": 0.6116322701688556, "grad_norm": 0.11517649062994549, "learning_rate": 0.0001934777328459292, "loss": 0.2611, "step": 163 }, { "epoch": 0.6153846153846154, "grad_norm": 0.12291906434338017, "learning_rate": 0.00019332140661579042, "loss": 0.2569, "step": 164 }, { "epoch": 0.6191369606003753, "grad_norm": 0.12768661337225065, "learning_rate": 0.00019316329376874145, "loss": 0.2802, "step": 165 }, { "epoch": 0.6228893058161351, "grad_norm": 0.12224468589372722, "learning_rate": 0.00019300339733181642, "loss": 0.2742, "step": 166 }, { "epoch": 0.626641651031895, "grad_norm": 0.11873375913983374, "learning_rate": 0.00019284172036619594, "loss": 0.2496, "step": 167 }, { "epoch": 0.6303939962476548, "grad_norm": 0.1094029489278503, "learning_rate": 0.0001926782659671484, "loss": 0.2834, "step": 168 }, { "epoch": 0.6341463414634146, "grad_norm": 0.11667364916992014, "learning_rate": 0.00019251303726397078, "loss": 0.2749, "step": 169 }, { "epoch": 0.6378986866791745, "grad_norm": 0.10721206701910313, "learning_rate": 0.00019234603741992862, "loss": 0.2833, "step": 170 }, { "epoch": 0.6416510318949343, "grad_norm": 0.11114975628124507, "learning_rate": 0.00019217726963219567, "loss": 0.2412, "step": 171 }, { "epoch": 0.6454033771106942, "grad_norm": 0.11052789377191914, "learning_rate": 0.00019200673713179245, "loss": 0.2629, "step": 172 }, { "epoch": 0.649155722326454, "grad_norm": 0.1254877320751365, "learning_rate": 0.00019183444318352457, "loss": 0.2676, "step": 173 }, { "epoch": 0.6529080675422139, "grad_norm": 0.11436464042758997, "learning_rate": 0.0001916603910859201, "loss": 0.2786, "step": 174 }, { "epoch": 0.6566604127579737, "grad_norm": 0.12040982753537727, "learning_rate": 0.00019148458417116645, "loss": 0.255, "step": 175 }, { "epoch": 0.6604127579737336, "grad_norm": 0.1215472428194096, "learning_rate": 0.00019130702580504676, "loss": 0.2933, "step": 176 }, { "epoch": 0.6641651031894934, "grad_norm": 0.11127574852727158, "learning_rate": 0.0001911277193868751, "loss": 0.2638, "step": 177 }, { "epoch": 0.6679174484052532, "grad_norm": 0.11297276732299613, "learning_rate": 0.00019094666834943179, "loss": 0.2553, "step": 178 }, { "epoch": 0.6716697936210131, "grad_norm": 0.11230362581933455, "learning_rate": 0.00019076387615889727, "loss": 0.2656, "step": 179 }, { "epoch": 0.6754221388367729, "grad_norm": 0.11339982024848368, "learning_rate": 0.00019057934631478617, "loss": 0.2608, "step": 180 }, { "epoch": 0.6791744840525328, "grad_norm": 0.1157018708653507, "learning_rate": 0.00019039308234987992, "loss": 0.2661, "step": 181 }, { "epoch": 0.6829268292682927, "grad_norm": 0.12120354653706046, "learning_rate": 0.00019020508783015942, "loss": 0.2655, "step": 182 }, { "epoch": 0.6866791744840526, "grad_norm": 0.11650498536100079, "learning_rate": 0.00019001536635473664, "loss": 0.2617, "step": 183 }, { "epoch": 0.6904315196998124, "grad_norm": 0.11284326019455035, "learning_rate": 0.0001898239215557856, "loss": 0.2604, "step": 184 }, { "epoch": 0.6941838649155723, "grad_norm": 0.11137366023131207, "learning_rate": 0.0001896307570984731, "loss": 0.2695, "step": 185 }, { "epoch": 0.6979362101313321, "grad_norm": 0.10909150712308537, "learning_rate": 0.00018943587668088832, "loss": 0.261, "step": 186 }, { "epoch": 0.701688555347092, "grad_norm": 0.11533104627662898, "learning_rate": 0.00018923928403397208, "loss": 0.2662, "step": 187 }, { "epoch": 0.7054409005628518, "grad_norm": 0.11085301527387796, "learning_rate": 0.00018904098292144554, "loss": 0.26, "step": 188 }, { "epoch": 0.7091932457786116, "grad_norm": 0.1040125545017247, "learning_rate": 0.00018884097713973798, "loss": 0.2641, "step": 189 }, { "epoch": 0.7129455909943715, "grad_norm": 0.10775777270108124, "learning_rate": 0.00018863927051791416, "loss": 0.2553, "step": 190 }, { "epoch": 0.7166979362101313, "grad_norm": 0.11556746781951048, "learning_rate": 0.00018843586691760108, "loss": 0.2817, "step": 191 }, { "epoch": 0.7204502814258912, "grad_norm": 0.11370972134361729, "learning_rate": 0.00018823077023291397, "loss": 0.2715, "step": 192 }, { "epoch": 0.724202626641651, "grad_norm": 0.10785721109445355, "learning_rate": 0.00018802398439038176, "loss": 0.2604, "step": 193 }, { "epoch": 0.7279549718574109, "grad_norm": 0.10825278350141479, "learning_rate": 0.00018781551334887201, "loss": 0.2498, "step": 194 }, { "epoch": 0.7317073170731707, "grad_norm": 0.09965163182891702, "learning_rate": 0.0001876053610995149, "loss": 0.2504, "step": 195 }, { "epoch": 0.7354596622889306, "grad_norm": 0.1026489808604617, "learning_rate": 0.000187393531665627, "loss": 0.2587, "step": 196 }, { "epoch": 0.7392120075046904, "grad_norm": 0.10399821510438714, "learning_rate": 0.00018718002910263426, "loss": 0.273, "step": 197 }, { "epoch": 0.7429643527204502, "grad_norm": 0.10994775687961979, "learning_rate": 0.0001869648574979942, "loss": 0.2659, "step": 198 }, { "epoch": 0.7467166979362101, "grad_norm": 0.10593465784705908, "learning_rate": 0.00018674802097111784, "loss": 0.26, "step": 199 }, { "epoch": 0.7504690431519699, "grad_norm": 0.11280493763136354, "learning_rate": 0.0001865295236732907, "loss": 0.2677, "step": 200 }, { "epoch": 0.7542213883677298, "grad_norm": 0.10536591132251391, "learning_rate": 0.00018630936978759338, "loss": 0.2513, "step": 201 }, { "epoch": 0.7579737335834896, "grad_norm": 0.10796354732338231, "learning_rate": 0.00018608756352882152, "loss": 0.2757, "step": 202 }, { "epoch": 0.7617260787992496, "grad_norm": 0.10552783825603758, "learning_rate": 0.00018586410914340497, "loss": 0.2552, "step": 203 }, { "epoch": 0.7654784240150094, "grad_norm": 0.10937928150050989, "learning_rate": 0.00018563901090932672, "loss": 0.2675, "step": 204 }, { "epoch": 0.7692307692307693, "grad_norm": 0.11537632950908651, "learning_rate": 0.00018541227313604078, "loss": 0.2402, "step": 205 }, { "epoch": 0.7729831144465291, "grad_norm": 0.11524821367403956, "learning_rate": 0.0001851839001643898, "loss": 0.2628, "step": 206 }, { "epoch": 0.776735459662289, "grad_norm": 0.10266098148088061, "learning_rate": 0.00018495389636652185, "loss": 0.2484, "step": 207 }, { "epoch": 0.7804878048780488, "grad_norm": 0.10807777719284456, "learning_rate": 0.0001847222661458069, "loss": 0.2648, "step": 208 }, { "epoch": 0.7842401500938087, "grad_norm": 0.10744597380010515, "learning_rate": 0.00018448901393675233, "loss": 0.2575, "step": 209 }, { "epoch": 0.7879924953095685, "grad_norm": 0.10942201726245399, "learning_rate": 0.00018425414420491815, "loss": 0.266, "step": 210 }, { "epoch": 0.7917448405253283, "grad_norm": 0.10660876081865972, "learning_rate": 0.00018401766144683147, "loss": 0.2438, "step": 211 }, { "epoch": 0.7954971857410882, "grad_norm": 0.11694393967537217, "learning_rate": 0.0001837795701899004, "loss": 0.2787, "step": 212 }, { "epoch": 0.799249530956848, "grad_norm": 0.11981272200535166, "learning_rate": 0.00018353987499232746, "loss": 0.264, "step": 213 }, { "epoch": 0.8030018761726079, "grad_norm": 0.10661350248202765, "learning_rate": 0.00018329858044302213, "loss": 0.2467, "step": 214 }, { "epoch": 0.8067542213883677, "grad_norm": 0.10372037225439175, "learning_rate": 0.0001830556911615132, "loss": 0.2718, "step": 215 }, { "epoch": 0.8105065666041276, "grad_norm": 0.10573394846211595, "learning_rate": 0.00018281121179786024, "loss": 0.2414, "step": 216 }, { "epoch": 0.8142589118198874, "grad_norm": 0.10765219346551154, "learning_rate": 0.0001825651470325645, "loss": 0.2516, "step": 217 }, { "epoch": 0.8180112570356473, "grad_norm": 0.09961054466797757, "learning_rate": 0.0001823175015764795, "loss": 0.2337, "step": 218 }, { "epoch": 0.8217636022514071, "grad_norm": 0.10573680507484315, "learning_rate": 0.00018206828017072057, "loss": 0.2443, "step": 219 }, { "epoch": 0.8255159474671669, "grad_norm": 0.10617911818381037, "learning_rate": 0.00018181748758657438, "loss": 0.2409, "step": 220 }, { "epoch": 0.8292682926829268, "grad_norm": 0.10190011860666479, "learning_rate": 0.0001815651286254074, "loss": 0.2699, "step": 221 }, { "epoch": 0.8330206378986866, "grad_norm": 0.10217498312134918, "learning_rate": 0.000181311208118574, "loss": 0.261, "step": 222 }, { "epoch": 0.8367729831144465, "grad_norm": 0.10290805625127751, "learning_rate": 0.000181055730927324, "loss": 0.2544, "step": 223 }, { "epoch": 0.8405253283302064, "grad_norm": 0.10273441373621256, "learning_rate": 0.00018079870194270958, "loss": 0.2394, "step": 224 }, { "epoch": 0.8442776735459663, "grad_norm": 0.09880435844395785, "learning_rate": 0.00018054012608549166, "loss": 0.263, "step": 225 }, { "epoch": 0.8480300187617261, "grad_norm": 0.10357276059735837, "learning_rate": 0.0001802800083060457, "loss": 0.2853, "step": 226 }, { "epoch": 0.851782363977486, "grad_norm": 0.10804308023574893, "learning_rate": 0.00018001835358426687, "loss": 0.2595, "step": 227 }, { "epoch": 0.8555347091932458, "grad_norm": 0.09776326620940605, "learning_rate": 0.00017975516692947475, "loss": 0.253, "step": 228 }, { "epoch": 0.8592870544090057, "grad_norm": 0.0995125991589646, "learning_rate": 0.00017949045338031745, "loss": 0.2536, "step": 229 }, { "epoch": 0.8630393996247655, "grad_norm": 0.10281461790899643, "learning_rate": 0.00017922421800467512, "loss": 0.2592, "step": 230 }, { "epoch": 0.8667917448405253, "grad_norm": 0.11374858278223317, "learning_rate": 0.0001789564658995629, "loss": 0.2694, "step": 231 }, { "epoch": 0.8705440900562852, "grad_norm": 0.10048956101218906, "learning_rate": 0.00017868720219103344, "loss": 0.2563, "step": 232 }, { "epoch": 0.874296435272045, "grad_norm": 0.11978050473597157, "learning_rate": 0.00017841643203407852, "loss": 0.2671, "step": 233 }, { "epoch": 0.8780487804878049, "grad_norm": 0.1022948197426214, "learning_rate": 0.00017814416061253077, "loss": 0.2442, "step": 234 }, { "epoch": 0.8818011257035647, "grad_norm": 0.10648409702487768, "learning_rate": 0.000177870393138964, "loss": 0.2172, "step": 235 }, { "epoch": 0.8855534709193246, "grad_norm": 0.09682467776295996, "learning_rate": 0.00017759513485459367, "loss": 0.2503, "step": 236 }, { "epoch": 0.8893058161350844, "grad_norm": 0.10093582432576866, "learning_rate": 0.00017731839102917644, "loss": 0.2526, "step": 237 }, { "epoch": 0.8930581613508443, "grad_norm": 0.10283968277186326, "learning_rate": 0.00017704016696090937, "loss": 0.2467, "step": 238 }, { "epoch": 0.8968105065666041, "grad_norm": 0.1016691703162235, "learning_rate": 0.00017676046797632835, "loss": 0.2458, "step": 239 }, { "epoch": 0.900562851782364, "grad_norm": 0.09871178549145665, "learning_rate": 0.00017647929943020625, "loss": 0.2387, "step": 240 }, { "epoch": 0.9043151969981238, "grad_norm": 0.11005062968397657, "learning_rate": 0.00017619666670545033, "loss": 0.2485, "step": 241 }, { "epoch": 0.9080675422138836, "grad_norm": 0.10636010374538316, "learning_rate": 0.00017591257521299932, "loss": 0.2344, "step": 242 }, { "epoch": 0.9118198874296435, "grad_norm": 0.10269265934208162, "learning_rate": 0.00017562703039171955, "loss": 0.2449, "step": 243 }, { "epoch": 0.9155722326454033, "grad_norm": 0.1123496871025115, "learning_rate": 0.0001753400377083011, "loss": 0.2472, "step": 244 }, { "epoch": 0.9193245778611632, "grad_norm": 0.10731321325088286, "learning_rate": 0.00017505160265715304, "loss": 0.2257, "step": 245 }, { "epoch": 0.9230769230769231, "grad_norm": 0.10122280465712044, "learning_rate": 0.0001747617307602982, "loss": 0.2673, "step": 246 }, { "epoch": 0.926829268292683, "grad_norm": 0.10287633377626088, "learning_rate": 0.00017447042756726754, "loss": 0.2623, "step": 247 }, { "epoch": 0.9305816135084428, "grad_norm": 0.11180813962431274, "learning_rate": 0.0001741776986549938, "loss": 0.2588, "step": 248 }, { "epoch": 0.9343339587242027, "grad_norm": 0.10342918680770019, "learning_rate": 0.00017388354962770487, "loss": 0.2365, "step": 249 }, { "epoch": 0.9380863039399625, "grad_norm": 0.10248241650027715, "learning_rate": 0.0001735879861168163, "loss": 0.2453, "step": 250 }, { "epoch": 0.9418386491557224, "grad_norm": 0.11730400265701718, "learning_rate": 0.00017329101378082374, "loss": 0.2486, "step": 251 }, { "epoch": 0.9455909943714822, "grad_norm": 0.09685186553299667, "learning_rate": 0.0001729926383051943, "loss": 0.2572, "step": 252 }, { "epoch": 0.949343339587242, "grad_norm": 0.12090818479499119, "learning_rate": 0.00017269286540225805, "loss": 0.2248, "step": 253 }, { "epoch": 0.9530956848030019, "grad_norm": 0.10260399450357141, "learning_rate": 0.0001723917008110984, "loss": 0.2527, "step": 254 }, { "epoch": 0.9568480300187617, "grad_norm": 0.10114612523395812, "learning_rate": 0.0001720891502974423, "loss": 0.2602, "step": 255 }, { "epoch": 0.9606003752345216, "grad_norm": 0.11613810011247953, "learning_rate": 0.00017178521965354992, "loss": 0.2535, "step": 256 }, { "epoch": 0.9643527204502814, "grad_norm": 0.10548781228478918, "learning_rate": 0.00017147991469810368, "loss": 0.2616, "step": 257 }, { "epoch": 0.9681050656660413, "grad_norm": 0.10337010169414873, "learning_rate": 0.00017117324127609686, "loss": 0.2506, "step": 258 }, { "epoch": 0.9718574108818011, "grad_norm": 0.1022753450493229, "learning_rate": 0.00017086520525872172, "loss": 0.2536, "step": 259 }, { "epoch": 0.975609756097561, "grad_norm": 0.10274802198295474, "learning_rate": 0.00017055581254325715, "loss": 0.2444, "step": 260 }, { "epoch": 0.9793621013133208, "grad_norm": 0.10073944882387982, "learning_rate": 0.00017024506905295565, "loss": 0.2583, "step": 261 }, { "epoch": 0.9831144465290806, "grad_norm": 0.10220040335882648, "learning_rate": 0.00016993298073693003, "loss": 0.2431, "step": 262 }, { "epoch": 0.9868667917448405, "grad_norm": 0.1060948209024435, "learning_rate": 0.00016961955357003947, "loss": 0.262, "step": 263 }, { "epoch": 0.9906191369606003, "grad_norm": 0.10004277645336798, "learning_rate": 0.0001693047935527751, "loss": 0.234, "step": 264 }, { "epoch": 0.9943714821763602, "grad_norm": 0.1000376814502259, "learning_rate": 0.00016898870671114527, "loss": 0.2566, "step": 265 }, { "epoch": 0.99812382739212, "grad_norm": 0.09911659249018077, "learning_rate": 0.00016867129909655998, "loss": 0.2657, "step": 266 }, { "epoch": 0.99812382739212, "eval_loss": 0.25076788663864136, "eval_runtime": 54.8199, "eval_samples_per_second": 32.725, "eval_steps_per_second": 1.04, "step": 266 }, { "epoch": 1.00187617260788, "grad_norm": 0.09735711101314591, "learning_rate": 0.00016835257678571514, "loss": 0.2408, "step": 267 }, { "epoch": 1.0056285178236397, "grad_norm": 0.09557625747953799, "learning_rate": 0.0001680325458804763, "loss": 0.2347, "step": 268 }, { "epoch": 1.0093808630393997, "grad_norm": 0.1097988122941221, "learning_rate": 0.0001677112125077616, "loss": 0.2352, "step": 269 }, { "epoch": 1.0131332082551594, "grad_norm": 0.10618471568318419, "learning_rate": 0.00016738858281942478, "loss": 0.2307, "step": 270 }, { "epoch": 1.0168855534709194, "grad_norm": 0.09853993482758462, "learning_rate": 0.00016706466299213715, "loss": 0.2464, "step": 271 }, { "epoch": 1.020637898686679, "grad_norm": 0.10229116782791361, "learning_rate": 0.00016673945922726944, "loss": 0.225, "step": 272 }, { "epoch": 1.024390243902439, "grad_norm": 0.0965251213134533, "learning_rate": 0.00016641297775077312, "loss": 0.231, "step": 273 }, { "epoch": 1.0281425891181988, "grad_norm": 0.11757269825570726, "learning_rate": 0.00016608522481306108, "loss": 0.2472, "step": 274 }, { "epoch": 1.0318949343339587, "grad_norm": 0.11070560434441643, "learning_rate": 0.00016575620668888812, "loss": 0.2238, "step": 275 }, { "epoch": 1.0356472795497185, "grad_norm": 0.10334777259567575, "learning_rate": 0.00016542592967723065, "loss": 0.2434, "step": 276 }, { "epoch": 1.0393996247654784, "grad_norm": 0.10075083080394591, "learning_rate": 0.00016509440010116632, "loss": 0.2321, "step": 277 }, { "epoch": 1.0431519699812384, "grad_norm": 0.10193183344799751, "learning_rate": 0.00016476162430775277, "loss": 0.232, "step": 278 }, { "epoch": 1.0469043151969981, "grad_norm": 0.09977179459604381, "learning_rate": 0.00016442760866790617, "loss": 0.2416, "step": 279 }, { "epoch": 1.050656660412758, "grad_norm": 0.10418248901787001, "learning_rate": 0.00016409235957627925, "loss": 0.2186, "step": 280 }, { "epoch": 1.0544090056285178, "grad_norm": 0.10314187360454156, "learning_rate": 0.00016375588345113892, "loss": 0.2299, "step": 281 }, { "epoch": 1.0581613508442778, "grad_norm": 0.11605691367689991, "learning_rate": 0.00016341818673424344, "loss": 0.2538, "step": 282 }, { "epoch": 1.0619136960600375, "grad_norm": 0.10111793724282202, "learning_rate": 0.00016307927589071888, "loss": 0.239, "step": 283 }, { "epoch": 1.0656660412757974, "grad_norm": 0.09948744594514197, "learning_rate": 0.00016273915740893554, "loss": 0.2411, "step": 284 }, { "epoch": 1.0694183864915572, "grad_norm": 0.10520769051847559, "learning_rate": 0.00016239783780038373, "loss": 0.2007, "step": 285 }, { "epoch": 1.0731707317073171, "grad_norm": 0.09787585864913016, "learning_rate": 0.00016205532359954902, "loss": 0.2471, "step": 286 }, { "epoch": 1.0769230769230769, "grad_norm": 0.09502262375530048, "learning_rate": 0.00016171162136378715, "loss": 0.218, "step": 287 }, { "epoch": 1.0806754221388368, "grad_norm": 0.09683942311347521, "learning_rate": 0.0001613667376731985, "loss": 0.2439, "step": 288 }, { "epoch": 1.0844277673545966, "grad_norm": 0.10057823230315424, "learning_rate": 0.00016102067913050224, "loss": 0.2419, "step": 289 }, { "epoch": 1.0881801125703565, "grad_norm": 0.10216410131013826, "learning_rate": 0.0001606734523609097, "loss": 0.2245, "step": 290 }, { "epoch": 1.0919324577861163, "grad_norm": 0.10331269995788077, "learning_rate": 0.0001603250640119977, "loss": 0.2449, "step": 291 }, { "epoch": 1.0956848030018762, "grad_norm": 0.10791236009737591, "learning_rate": 0.0001599755207535812, "loss": 0.2335, "step": 292 }, { "epoch": 1.099437148217636, "grad_norm": 0.1020688406387255, "learning_rate": 0.00015962482927758568, "loss": 0.2392, "step": 293 }, { "epoch": 1.103189493433396, "grad_norm": 0.09755455951195353, "learning_rate": 0.0001592729962979189, "loss": 0.234, "step": 294 }, { "epoch": 1.1069418386491556, "grad_norm": 0.09920885945520751, "learning_rate": 0.0001589200285503426, "loss": 0.2474, "step": 295 }, { "epoch": 1.1106941838649156, "grad_norm": 0.10613730355510181, "learning_rate": 0.00015856593279234317, "loss": 0.2505, "step": 296 }, { "epoch": 1.1144465290806753, "grad_norm": 0.09272222867145113, "learning_rate": 0.00015821071580300272, "loss": 0.2288, "step": 297 }, { "epoch": 1.1181988742964353, "grad_norm": 0.10496298965116614, "learning_rate": 0.00015785438438286893, "loss": 0.2428, "step": 298 }, { "epoch": 1.1219512195121952, "grad_norm": 0.10407571400314669, "learning_rate": 0.00015749694535382509, "loss": 0.2394, "step": 299 }, { "epoch": 1.125703564727955, "grad_norm": 0.0979849366221422, "learning_rate": 0.00015713840555895935, "loss": 0.2308, "step": 300 }, { "epoch": 1.1294559099437147, "grad_norm": 0.10853871626166521, "learning_rate": 0.0001567787718624338, "loss": 0.226, "step": 301 }, { "epoch": 1.1332082551594747, "grad_norm": 0.10500635115885587, "learning_rate": 0.00015641805114935297, "loss": 0.2334, "step": 302 }, { "epoch": 1.1369606003752346, "grad_norm": 0.09142321631155889, "learning_rate": 0.00015605625032563217, "loss": 0.212, "step": 303 }, { "epoch": 1.1407129455909943, "grad_norm": 0.09901885829174076, "learning_rate": 0.00015569337631786512, "loss": 0.2461, "step": 304 }, { "epoch": 1.1444652908067543, "grad_norm": 0.10306359273642063, "learning_rate": 0.00015532943607319142, "loss": 0.2367, "step": 305 }, { "epoch": 1.148217636022514, "grad_norm": 0.10120916123549235, "learning_rate": 0.00015496443655916347, "loss": 0.2287, "step": 306 }, { "epoch": 1.151969981238274, "grad_norm": 0.09555462855940407, "learning_rate": 0.00015459838476361324, "loss": 0.2224, "step": 307 }, { "epoch": 1.1557223264540337, "grad_norm": 0.0962585827915037, "learning_rate": 0.0001542312876945183, "loss": 0.2012, "step": 308 }, { "epoch": 1.1594746716697937, "grad_norm": 0.10477995260230735, "learning_rate": 0.00015386315237986783, "loss": 0.2365, "step": 309 }, { "epoch": 1.1632270168855534, "grad_norm": 0.10547585239552275, "learning_rate": 0.00015349398586752793, "loss": 0.2205, "step": 310 }, { "epoch": 1.1669793621013134, "grad_norm": 0.10406096594039889, "learning_rate": 0.00015312379522510668, "loss": 0.2566, "step": 311 }, { "epoch": 1.170731707317073, "grad_norm": 0.09350218881935819, "learning_rate": 0.000152752587539819, "loss": 0.2209, "step": 312 }, { "epoch": 1.174484052532833, "grad_norm": 0.09509675025055157, "learning_rate": 0.00015238036991835086, "loss": 0.2169, "step": 313 }, { "epoch": 1.1782363977485928, "grad_norm": 0.10079938948741775, "learning_rate": 0.0001520071494867231, "loss": 0.2023, "step": 314 }, { "epoch": 1.1819887429643527, "grad_norm": 0.10701945533460545, "learning_rate": 0.00015163293339015533, "loss": 0.2377, "step": 315 }, { "epoch": 1.1857410881801125, "grad_norm": 0.11578971323992811, "learning_rate": 0.00015125772879292878, "loss": 0.227, "step": 316 }, { "epoch": 1.1894934333958724, "grad_norm": 0.09922963411080835, "learning_rate": 0.00015088154287824933, "loss": 0.2077, "step": 317 }, { "epoch": 1.1932457786116322, "grad_norm": 0.09695103147741277, "learning_rate": 0.00015050438284811002, "loss": 0.2176, "step": 318 }, { "epoch": 1.1969981238273921, "grad_norm": 0.10129994676770977, "learning_rate": 0.00015012625592315297, "loss": 0.2013, "step": 319 }, { "epoch": 1.200750469043152, "grad_norm": 0.09642434156774893, "learning_rate": 0.00014974716934253147, "loss": 0.1968, "step": 320 }, { "epoch": 1.2045028142589118, "grad_norm": 0.11374861504077183, "learning_rate": 0.00014936713036377102, "loss": 0.2386, "step": 321 }, { "epoch": 1.2082551594746715, "grad_norm": 0.10727720426497706, "learning_rate": 0.00014898614626263066, "loss": 0.2432, "step": 322 }, { "epoch": 1.2120075046904315, "grad_norm": 0.09788299082038164, "learning_rate": 0.00014860422433296363, "loss": 0.2285, "step": 323 }, { "epoch": 1.2157598499061915, "grad_norm": 0.10206622558818354, "learning_rate": 0.00014822137188657752, "loss": 0.2492, "step": 324 }, { "epoch": 1.2195121951219512, "grad_norm": 0.09498285131481717, "learning_rate": 0.00014783759625309453, "loss": 0.2193, "step": 325 }, { "epoch": 1.2232645403377111, "grad_norm": 0.09730960742903207, "learning_rate": 0.0001474529047798112, "loss": 0.2351, "step": 326 }, { "epoch": 1.2270168855534709, "grad_norm": 0.09578898317356191, "learning_rate": 0.00014706730483155737, "loss": 0.2223, "step": 327 }, { "epoch": 1.2307692307692308, "grad_norm": 0.10544252021979206, "learning_rate": 0.00014668080379055562, "loss": 0.2483, "step": 328 }, { "epoch": 1.2345215759849906, "grad_norm": 0.10470749080815936, "learning_rate": 0.00014629340905627963, "loss": 0.2153, "step": 329 }, { "epoch": 1.2382739212007505, "grad_norm": 0.09976907858686591, "learning_rate": 0.0001459051280453127, "loss": 0.2325, "step": 330 }, { "epoch": 1.2420262664165103, "grad_norm": 0.0991070010763277, "learning_rate": 0.00014551596819120563, "loss": 0.2325, "step": 331 }, { "epoch": 1.2457786116322702, "grad_norm": 0.1020697715133557, "learning_rate": 0.00014512593694433453, "loss": 0.2294, "step": 332 }, { "epoch": 1.24953095684803, "grad_norm": 0.09999275146583982, "learning_rate": 0.0001447350417717581, "loss": 0.2308, "step": 333 }, { "epoch": 1.25328330206379, "grad_norm": 0.10319899972091776, "learning_rate": 0.00014434329015707467, "loss": 0.2531, "step": 334 }, { "epoch": 1.2570356472795496, "grad_norm": 0.09724419173382096, "learning_rate": 0.000143950689600279, "loss": 0.2398, "step": 335 }, { "epoch": 1.2607879924953096, "grad_norm": 0.09772540231829421, "learning_rate": 0.0001435572476176187, "loss": 0.2174, "step": 336 }, { "epoch": 1.2645403377110696, "grad_norm": 0.09901865331808393, "learning_rate": 0.00014316297174145017, "loss": 0.2409, "step": 337 }, { "epoch": 1.2682926829268293, "grad_norm": 0.10275864985344388, "learning_rate": 0.00014276786952009451, "loss": 0.238, "step": 338 }, { "epoch": 1.272045028142589, "grad_norm": 0.10122974341644604, "learning_rate": 0.00014237194851769318, "loss": 0.235, "step": 339 }, { "epoch": 1.275797373358349, "grad_norm": 0.1011683596887568, "learning_rate": 0.0001419752163140628, "loss": 0.2258, "step": 340 }, { "epoch": 1.279549718574109, "grad_norm": 0.09430075982460438, "learning_rate": 0.00014157768050455038, "loss": 0.2158, "step": 341 }, { "epoch": 1.2833020637898687, "grad_norm": 0.0983574701177382, "learning_rate": 0.00014117934869988777, "loss": 0.2395, "step": 342 }, { "epoch": 1.2870544090056284, "grad_norm": 0.09743731183342702, "learning_rate": 0.00014078022852604592, "loss": 0.2161, "step": 343 }, { "epoch": 1.2908067542213884, "grad_norm": 0.10629549685421696, "learning_rate": 0.00014038032762408897, "loss": 0.2387, "step": 344 }, { "epoch": 1.2945590994371483, "grad_norm": 0.10742796776549986, "learning_rate": 0.00013997965365002789, "loss": 0.2357, "step": 345 }, { "epoch": 1.298311444652908, "grad_norm": 0.09600449697205138, "learning_rate": 0.00013957821427467392, "loss": 0.2094, "step": 346 }, { "epoch": 1.302063789868668, "grad_norm": 0.09578874088186679, "learning_rate": 0.00013917601718349182, "loss": 0.2405, "step": 347 }, { "epoch": 1.3058161350844277, "grad_norm": 0.09943095705760366, "learning_rate": 0.00013877307007645256, "loss": 0.2343, "step": 348 }, { "epoch": 1.3095684803001877, "grad_norm": 0.09495368343830073, "learning_rate": 0.000138369380667886, "loss": 0.2337, "step": 349 }, { "epoch": 1.3133208255159474, "grad_norm": 0.09560858450028774, "learning_rate": 0.00013796495668633326, "loss": 0.2292, "step": 350 }, { "epoch": 1.3170731707317074, "grad_norm": 0.09856846251431063, "learning_rate": 0.00013755980587439856, "loss": 0.2247, "step": 351 }, { "epoch": 1.320825515947467, "grad_norm": 0.10476296342316459, "learning_rate": 0.0001371539359886013, "loss": 0.2376, "step": 352 }, { "epoch": 1.324577861163227, "grad_norm": 0.09750986787445932, "learning_rate": 0.0001367473547992272, "loss": 0.2406, "step": 353 }, { "epoch": 1.3283302063789868, "grad_norm": 0.09480706314179475, "learning_rate": 0.00013634007009017985, "loss": 0.2286, "step": 354 }, { "epoch": 1.3320825515947468, "grad_norm": 0.09343923801170138, "learning_rate": 0.00013593208965883156, "loss": 0.24, "step": 355 }, { "epoch": 1.3358348968105065, "grad_norm": 0.09790544769030897, "learning_rate": 0.00013552342131587398, "loss": 0.2266, "step": 356 }, { "epoch": 1.3395872420262664, "grad_norm": 0.09593153635680574, "learning_rate": 0.0001351140728851688, "loss": 0.2165, "step": 357 }, { "epoch": 1.3433395872420264, "grad_norm": 0.09724129539893522, "learning_rate": 0.00013470405220359773, "loss": 0.2268, "step": 358 }, { "epoch": 1.3470919324577861, "grad_norm": 0.11375066003254507, "learning_rate": 0.0001342933671209126, "loss": 0.2538, "step": 359 }, { "epoch": 1.3508442776735459, "grad_norm": 0.09474834489243378, "learning_rate": 0.00013388202549958507, "loss": 0.238, "step": 360 }, { "epoch": 1.3545966228893058, "grad_norm": 0.0930046134081851, "learning_rate": 0.0001334700352146561, "loss": 0.2328, "step": 361 }, { "epoch": 1.3583489681050658, "grad_norm": 0.09451762554855862, "learning_rate": 0.00013305740415358504, "loss": 0.2138, "step": 362 }, { "epoch": 1.3621013133208255, "grad_norm": 0.09703907009293414, "learning_rate": 0.000132644140216099, "loss": 0.2295, "step": 363 }, { "epoch": 1.3658536585365852, "grad_norm": 0.09339693738861075, "learning_rate": 0.00013223025131404106, "loss": 0.2124, "step": 364 }, { "epoch": 1.3696060037523452, "grad_norm": 0.09527748104301437, "learning_rate": 0.00013181574537121933, "loss": 0.2388, "step": 365 }, { "epoch": 1.3733583489681052, "grad_norm": 0.093154160924506, "learning_rate": 0.0001314006303232549, "loss": 0.2451, "step": 366 }, { "epoch": 1.377110694183865, "grad_norm": 0.1011182665719985, "learning_rate": 0.00013098491411743014, "loss": 0.2384, "step": 367 }, { "epoch": 1.3808630393996248, "grad_norm": 0.09967710236675086, "learning_rate": 0.00013056860471253638, "loss": 0.2309, "step": 368 }, { "epoch": 1.3846153846153846, "grad_norm": 0.09894617016616869, "learning_rate": 0.0001301517100787216, "loss": 0.2383, "step": 369 }, { "epoch": 1.3883677298311445, "grad_norm": 0.09445738440316971, "learning_rate": 0.0001297342381973379, "loss": 0.2285, "step": 370 }, { "epoch": 1.3921200750469043, "grad_norm": 0.0988373582071634, "learning_rate": 0.00012931619706078862, "loss": 0.2273, "step": 371 }, { "epoch": 1.3958724202626642, "grad_norm": 0.09523024426497596, "learning_rate": 0.00012889759467237533, "loss": 0.1996, "step": 372 }, { "epoch": 1.399624765478424, "grad_norm": 0.0980770000927041, "learning_rate": 0.00012847843904614475, "loss": 0.2267, "step": 373 }, { "epoch": 1.403377110694184, "grad_norm": 0.1000350881284721, "learning_rate": 0.0001280587382067351, "loss": 0.2342, "step": 374 }, { "epoch": 1.4071294559099436, "grad_norm": 0.09760637716029413, "learning_rate": 0.00012763850018922257, "loss": 0.2307, "step": 375 }, { "epoch": 1.4108818011257036, "grad_norm": 0.10052208501482537, "learning_rate": 0.00012721773303896763, "loss": 0.2503, "step": 376 }, { "epoch": 1.4146341463414633, "grad_norm": 0.09172234629093598, "learning_rate": 0.00012679644481146081, "loss": 0.2202, "step": 377 }, { "epoch": 1.4183864915572233, "grad_norm": 0.09854006806926725, "learning_rate": 0.00012637464357216846, "loss": 0.2302, "step": 378 }, { "epoch": 1.4221388367729833, "grad_norm": 0.09543240233343374, "learning_rate": 0.0001259523373963785, "loss": 0.2494, "step": 379 }, { "epoch": 1.425891181988743, "grad_norm": 0.09463585439380381, "learning_rate": 0.00012552953436904577, "loss": 0.2326, "step": 380 }, { "epoch": 1.4296435272045027, "grad_norm": 0.09378431967427468, "learning_rate": 0.00012510624258463718, "loss": 0.2449, "step": 381 }, { "epoch": 1.4333958724202627, "grad_norm": 0.09070103412454147, "learning_rate": 0.0001246824701469768, "loss": 0.2214, "step": 382 }, { "epoch": 1.4371482176360226, "grad_norm": 0.09051851692336525, "learning_rate": 0.00012425822516909065, "loss": 0.2231, "step": 383 }, { "epoch": 1.4409005628517824, "grad_norm": 0.10103712292921486, "learning_rate": 0.00012383351577305147, "loss": 0.2439, "step": 384 }, { "epoch": 1.444652908067542, "grad_norm": 0.09487712674152077, "learning_rate": 0.00012340835008982313, "loss": 0.222, "step": 385 }, { "epoch": 1.448405253283302, "grad_norm": 0.0948558910866512, "learning_rate": 0.0001229827362591051, "loss": 0.2195, "step": 386 }, { "epoch": 1.452157598499062, "grad_norm": 0.08833690071237923, "learning_rate": 0.0001225566824291765, "loss": 0.2141, "step": 387 }, { "epoch": 1.4559099437148217, "grad_norm": 0.10083948403636385, "learning_rate": 0.00012213019675674008, "loss": 0.2259, "step": 388 }, { "epoch": 1.4596622889305815, "grad_norm": 0.10134097189657629, "learning_rate": 0.00012170328740676613, "loss": 0.2447, "step": 389 }, { "epoch": 1.4634146341463414, "grad_norm": 0.09393280064610553, "learning_rate": 0.00012127596255233622, "loss": 0.2266, "step": 390 }, { "epoch": 1.4671669793621014, "grad_norm": 0.1038113816247064, "learning_rate": 0.00012084823037448654, "loss": 0.2332, "step": 391 }, { "epoch": 1.4709193245778611, "grad_norm": 0.09776855466524487, "learning_rate": 0.00012042009906205152, "loss": 0.2451, "step": 392 }, { "epoch": 1.474671669793621, "grad_norm": 0.09705953983738934, "learning_rate": 0.00011999157681150684, "loss": 0.2385, "step": 393 }, { "epoch": 1.4784240150093808, "grad_norm": 0.09561991491048107, "learning_rate": 0.00011956267182681264, "loss": 0.2436, "step": 394 }, { "epoch": 1.4821763602251408, "grad_norm": 0.09352280072376362, "learning_rate": 0.00011913339231925643, "loss": 0.2036, "step": 395 }, { "epoch": 1.4859287054409005, "grad_norm": 0.09506124468167064, "learning_rate": 0.0001187037465072958, "loss": 0.2341, "step": 396 }, { "epoch": 1.4896810506566605, "grad_norm": 0.09556386697453095, "learning_rate": 0.00011827374261640127, "loss": 0.2252, "step": 397 }, { "epoch": 1.4934333958724202, "grad_norm": 0.09620969600998162, "learning_rate": 0.00011784338887889858, "loss": 0.2212, "step": 398 }, { "epoch": 1.4971857410881801, "grad_norm": 0.09670651255581325, "learning_rate": 0.00011741269353381128, "loss": 0.2318, "step": 399 }, { "epoch": 1.50093808630394, "grad_norm": 0.09008271462901533, "learning_rate": 0.00011698166482670292, "loss": 0.2192, "step": 400 }, { "epoch": 1.5046904315196998, "grad_norm": 0.10308418216879658, "learning_rate": 0.0001165503110095191, "loss": 0.2206, "step": 401 }, { "epoch": 1.5084427767354596, "grad_norm": 0.10031539219103021, "learning_rate": 0.00011611864034042972, "loss": 0.24, "step": 402 }, { "epoch": 1.5121951219512195, "grad_norm": 0.09579006879477674, "learning_rate": 0.00011568666108367065, "loss": 0.2208, "step": 403 }, { "epoch": 1.5159474671669795, "grad_norm": 0.09362346025963265, "learning_rate": 0.00011525438150938554, "loss": 0.2082, "step": 404 }, { "epoch": 1.5196998123827392, "grad_norm": 0.09932354842666284, "learning_rate": 0.00011482180989346771, "loss": 0.2371, "step": 405 }, { "epoch": 1.523452157598499, "grad_norm": 0.09086206966321682, "learning_rate": 0.00011438895451740142, "loss": 0.205, "step": 406 }, { "epoch": 1.527204502814259, "grad_norm": 0.10012726577032707, "learning_rate": 0.00011395582366810346, "loss": 0.2251, "step": 407 }, { "epoch": 1.5309568480300189, "grad_norm": 0.09303157882710397, "learning_rate": 0.0001135224256377646, "loss": 0.2201, "step": 408 }, { "epoch": 1.5347091932457786, "grad_norm": 0.09843049012555664, "learning_rate": 0.0001130887687236906, "loss": 0.2266, "step": 409 }, { "epoch": 1.5384615384615383, "grad_norm": 0.09695566090218002, "learning_rate": 0.00011265486122814359, "loss": 0.2311, "step": 410 }, { "epoch": 1.5422138836772983, "grad_norm": 0.09175484300732825, "learning_rate": 0.00011222071145818294, "loss": 0.2005, "step": 411 }, { "epoch": 1.5459662288930582, "grad_norm": 0.09857760057358032, "learning_rate": 0.00011178632772550635, "loss": 0.2399, "step": 412 }, { "epoch": 1.549718574108818, "grad_norm": 0.09629500388446056, "learning_rate": 0.00011135171834629071, "loss": 0.22, "step": 413 }, { "epoch": 1.5534709193245777, "grad_norm": 0.09725156323526822, "learning_rate": 0.00011091689164103281, "loss": 0.2295, "step": 414 }, { "epoch": 1.5572232645403377, "grad_norm": 0.09771091604822173, "learning_rate": 0.00011048185593439014, "loss": 0.2342, "step": 415 }, { "epoch": 1.5609756097560976, "grad_norm": 0.09928358316062869, "learning_rate": 0.00011004661955502142, "loss": 0.2191, "step": 416 }, { "epoch": 1.5647279549718576, "grad_norm": 0.10070058193524922, "learning_rate": 0.00010961119083542726, "loss": 0.2321, "step": 417 }, { "epoch": 1.5684803001876173, "grad_norm": 0.10163771478776185, "learning_rate": 0.00010917557811179056, "loss": 0.235, "step": 418 }, { "epoch": 1.572232645403377, "grad_norm": 0.0959831873984863, "learning_rate": 0.0001087397897238169, "loss": 0.2345, "step": 419 }, { "epoch": 1.575984990619137, "grad_norm": 0.09912094462045823, "learning_rate": 0.00010830383401457498, "loss": 0.2345, "step": 420 }, { "epoch": 1.579737335834897, "grad_norm": 0.09713827030239745, "learning_rate": 0.00010786771933033677, "loss": 0.2336, "step": 421 }, { "epoch": 1.5834896810506567, "grad_norm": 0.09586573168928784, "learning_rate": 0.00010743145402041781, "loss": 0.2218, "step": 422 }, { "epoch": 1.5872420262664164, "grad_norm": 0.0964267909183374, "learning_rate": 0.00010699504643701732, "loss": 0.2223, "step": 423 }, { "epoch": 1.5909943714821764, "grad_norm": 0.09462727861813398, "learning_rate": 0.00010655850493505834, "loss": 0.2359, "step": 424 }, { "epoch": 1.5947467166979363, "grad_norm": 0.0947470136127899, "learning_rate": 0.00010612183787202767, "loss": 0.224, "step": 425 }, { "epoch": 1.598499061913696, "grad_norm": 0.09913794740261614, "learning_rate": 0.00010568505360781606, "loss": 0.2328, "step": 426 }, { "epoch": 1.6022514071294558, "grad_norm": 0.09943541246432905, "learning_rate": 0.00010524816050455801, "loss": 0.2219, "step": 427 }, { "epoch": 1.6060037523452158, "grad_norm": 0.09790889344857133, "learning_rate": 0.00010481116692647164, "loss": 0.2326, "step": 428 }, { "epoch": 1.6097560975609757, "grad_norm": 0.09837772070524535, "learning_rate": 0.00010437408123969877, "loss": 0.2, "step": 429 }, { "epoch": 1.6135084427767354, "grad_norm": 0.0974160428534803, "learning_rate": 0.0001039369118121445, "loss": 0.2374, "step": 430 }, { "epoch": 1.6172607879924952, "grad_norm": 0.09832648736093924, "learning_rate": 0.00010349966701331721, "loss": 0.2451, "step": 431 }, { "epoch": 1.6210131332082551, "grad_norm": 0.09299437074609294, "learning_rate": 0.0001030623552141682, "loss": 0.2338, "step": 432 }, { "epoch": 1.624765478424015, "grad_norm": 0.09302666370247067, "learning_rate": 0.00010262498478693147, "loss": 0.2091, "step": 433 }, { "epoch": 1.6285178236397748, "grad_norm": 0.09577010560643465, "learning_rate": 0.00010218756410496354, "loss": 0.2482, "step": 434 }, { "epoch": 1.6322701688555346, "grad_norm": 0.08960294424396283, "learning_rate": 0.00010175010154258289, "loss": 0.2222, "step": 435 }, { "epoch": 1.6360225140712945, "grad_norm": 0.09282177110646465, "learning_rate": 0.00010131260547490991, "loss": 0.2251, "step": 436 }, { "epoch": 1.6397748592870545, "grad_norm": 0.09521644447862954, "learning_rate": 0.00010087508427770638, "loss": 0.2247, "step": 437 }, { "epoch": 1.6435272045028144, "grad_norm": 0.0983364489403648, "learning_rate": 0.00010043754632721518, "loss": 0.2321, "step": 438 }, { "epoch": 1.6472795497185742, "grad_norm": 0.09923727566079374, "learning_rate": 0.0001, "loss": 0.2289, "step": 439 }, { "epoch": 1.6510318949343339, "grad_norm": 0.09206598379643983, "learning_rate": 9.956245367278482e-05, "loss": 0.2228, "step": 440 }, { "epoch": 1.6547842401500938, "grad_norm": 0.10070658108496927, "learning_rate": 9.912491572229367e-05, "loss": 0.237, "step": 441 }, { "epoch": 1.6585365853658538, "grad_norm": 0.09161131331913487, "learning_rate": 9.868739452509011e-05, "loss": 0.2163, "step": 442 }, { "epoch": 1.6622889305816135, "grad_norm": 0.10139978192330912, "learning_rate": 9.824989845741713e-05, "loss": 0.2433, "step": 443 }, { "epoch": 1.6660412757973733, "grad_norm": 0.09427696830317533, "learning_rate": 9.781243589503649e-05, "loss": 0.2266, "step": 444 }, { "epoch": 1.6697936210131332, "grad_norm": 0.09155321002598128, "learning_rate": 9.737501521306854e-05, "loss": 0.2175, "step": 445 }, { "epoch": 1.6735459662288932, "grad_norm": 0.0950200704550263, "learning_rate": 9.693764478583185e-05, "loss": 0.2211, "step": 446 }, { "epoch": 1.677298311444653, "grad_norm": 0.0968747486645257, "learning_rate": 9.65003329866828e-05, "loss": 0.2296, "step": 447 }, { "epoch": 1.6810506566604126, "grad_norm": 0.09666111001258508, "learning_rate": 9.606308818785551e-05, "loss": 0.2223, "step": 448 }, { "epoch": 1.6848030018761726, "grad_norm": 0.0921723875506241, "learning_rate": 9.562591876030127e-05, "loss": 0.221, "step": 449 }, { "epoch": 1.6885553470919326, "grad_norm": 0.09670737686580928, "learning_rate": 9.518883307352839e-05, "loss": 0.2181, "step": 450 }, { "epoch": 1.6923076923076923, "grad_norm": 0.09887865182391634, "learning_rate": 9.475183949544204e-05, "loss": 0.2471, "step": 451 }, { "epoch": 1.696060037523452, "grad_norm": 0.09588212622362839, "learning_rate": 9.431494639218397e-05, "loss": 0.2277, "step": 452 }, { "epoch": 1.699812382739212, "grad_norm": 0.10074994984220358, "learning_rate": 9.387816212797233e-05, "loss": 0.2318, "step": 453 }, { "epoch": 1.703564727954972, "grad_norm": 0.09563871978533513, "learning_rate": 9.344149506494168e-05, "loss": 0.2189, "step": 454 }, { "epoch": 1.7073170731707317, "grad_norm": 0.1036934403800542, "learning_rate": 9.300495356298269e-05, "loss": 0.2371, "step": 455 }, { "epoch": 1.7110694183864914, "grad_norm": 0.0999758194982529, "learning_rate": 9.256854597958221e-05, "loss": 0.2208, "step": 456 }, { "epoch": 1.7148217636022514, "grad_norm": 0.09293777471686854, "learning_rate": 9.213228066966327e-05, "loss": 0.2055, "step": 457 }, { "epoch": 1.7185741088180113, "grad_norm": 0.09520329939720786, "learning_rate": 9.169616598542503e-05, "loss": 0.2393, "step": 458 }, { "epoch": 1.7223264540337713, "grad_norm": 0.09652518762466816, "learning_rate": 9.126021027618311e-05, "loss": 0.2235, "step": 459 }, { "epoch": 1.726078799249531, "grad_norm": 0.09360819050527228, "learning_rate": 9.082442188820946e-05, "loss": 0.217, "step": 460 }, { "epoch": 1.7298311444652907, "grad_norm": 0.09721890301094122, "learning_rate": 9.038880916457276e-05, "loss": 0.2413, "step": 461 }, { "epoch": 1.7335834896810507, "grad_norm": 0.093039794672402, "learning_rate": 8.99533804449786e-05, "loss": 0.2079, "step": 462 }, { "epoch": 1.7373358348968106, "grad_norm": 0.09835643069659236, "learning_rate": 8.951814406560987e-05, "loss": 0.227, "step": 463 }, { "epoch": 1.7410881801125704, "grad_norm": 0.10344720588136387, "learning_rate": 8.90831083589672e-05, "loss": 0.244, "step": 464 }, { "epoch": 1.7448405253283301, "grad_norm": 0.10152528010120943, "learning_rate": 8.86482816537093e-05, "loss": 0.2094, "step": 465 }, { "epoch": 1.74859287054409, "grad_norm": 0.09673916894053619, "learning_rate": 8.821367227449367e-05, "loss": 0.2208, "step": 466 }, { "epoch": 1.75234521575985, "grad_norm": 0.09445446641754265, "learning_rate": 8.77792885418171e-05, "loss": 0.2114, "step": 467 }, { "epoch": 1.7560975609756098, "grad_norm": 0.09420865256892867, "learning_rate": 8.734513877185644e-05, "loss": 0.2262, "step": 468 }, { "epoch": 1.7598499061913695, "grad_norm": 0.09958663710649508, "learning_rate": 8.691123127630942e-05, "loss": 0.2232, "step": 469 }, { "epoch": 1.7636022514071295, "grad_norm": 0.0922790068257261, "learning_rate": 8.647757436223543e-05, "loss": 0.2082, "step": 470 }, { "epoch": 1.7673545966228894, "grad_norm": 0.09917925858312189, "learning_rate": 8.604417633189656e-05, "loss": 0.2204, "step": 471 }, { "epoch": 1.7711069418386491, "grad_norm": 0.09568035725103857, "learning_rate": 8.561104548259863e-05, "loss": 0.243, "step": 472 }, { "epoch": 1.7748592870544089, "grad_norm": 0.09571143801780994, "learning_rate": 8.517819010653234e-05, "loss": 0.2181, "step": 473 }, { "epoch": 1.7786116322701688, "grad_norm": 0.09509116264565627, "learning_rate": 8.474561849061445e-05, "loss": 0.2213, "step": 474 }, { "epoch": 1.7823639774859288, "grad_norm": 0.09971411865611762, "learning_rate": 8.431333891632937e-05, "loss": 0.2273, "step": 475 }, { "epoch": 1.7861163227016885, "grad_norm": 0.0928737537182183, "learning_rate": 8.38813596595703e-05, "loss": 0.2073, "step": 476 }, { "epoch": 1.7898686679174483, "grad_norm": 0.09557624867506877, "learning_rate": 8.344968899048093e-05, "loss": 0.2218, "step": 477 }, { "epoch": 1.7936210131332082, "grad_norm": 0.09939509629560786, "learning_rate": 8.301833517329714e-05, "loss": 0.2254, "step": 478 }, { "epoch": 1.7973733583489682, "grad_norm": 0.09345708817873574, "learning_rate": 8.258730646618872e-05, "loss": 0.2021, "step": 479 }, { "epoch": 1.8011257035647281, "grad_norm": 0.09371680566942005, "learning_rate": 8.215661112110143e-05, "loss": 0.2144, "step": 480 }, { "epoch": 1.8048780487804879, "grad_norm": 0.09834806206209937, "learning_rate": 8.172625738359875e-05, "loss": 0.2287, "step": 481 }, { "epoch": 1.8086303939962476, "grad_norm": 0.09957873029024283, "learning_rate": 8.12962534927042e-05, "loss": 0.2111, "step": 482 }, { "epoch": 1.8123827392120075, "grad_norm": 0.09557530525865873, "learning_rate": 8.086660768074358e-05, "loss": 0.2199, "step": 483 }, { "epoch": 1.8161350844277675, "grad_norm": 0.09622175925949066, "learning_rate": 8.043732817318736e-05, "loss": 0.2282, "step": 484 }, { "epoch": 1.8198874296435272, "grad_norm": 0.09465802986234279, "learning_rate": 8.000842318849317e-05, "loss": 0.2317, "step": 485 }, { "epoch": 1.823639774859287, "grad_norm": 0.08916298991186261, "learning_rate": 7.957990093794849e-05, "loss": 0.2115, "step": 486 }, { "epoch": 1.827392120075047, "grad_norm": 0.09238291065333254, "learning_rate": 7.915176962551347e-05, "loss": 0.2186, "step": 487 }, { "epoch": 1.8311444652908069, "grad_norm": 0.09002736286086312, "learning_rate": 7.872403744766383e-05, "loss": 0.2241, "step": 488 }, { "epoch": 1.8348968105065666, "grad_norm": 0.089491467964355, "learning_rate": 7.82967125932339e-05, "loss": 0.1999, "step": 489 }, { "epoch": 1.8386491557223263, "grad_norm": 0.10455266258397437, "learning_rate": 7.786980324325995e-05, "loss": 0.2471, "step": 490 }, { "epoch": 1.8424015009380863, "grad_norm": 0.08855632929683876, "learning_rate": 7.74433175708235e-05, "loss": 0.2149, "step": 491 }, { "epoch": 1.8461538461538463, "grad_norm": 0.09599775971890664, "learning_rate": 7.70172637408949e-05, "loss": 0.2249, "step": 492 }, { "epoch": 1.849906191369606, "grad_norm": 0.0930126035349028, "learning_rate": 7.659164991017689e-05, "loss": 0.2273, "step": 493 }, { "epoch": 1.8536585365853657, "grad_norm": 0.09484471832756645, "learning_rate": 7.616648422694858e-05, "loss": 0.2463, "step": 494 }, { "epoch": 1.8574108818011257, "grad_norm": 0.08958160415408828, "learning_rate": 7.574177483090937e-05, "loss": 0.2092, "step": 495 }, { "epoch": 1.8611632270168856, "grad_norm": 0.09649064421997748, "learning_rate": 7.531752985302323e-05, "loss": 0.2279, "step": 496 }, { "epoch": 1.8649155722326454, "grad_norm": 0.09487279116811025, "learning_rate": 7.489375741536283e-05, "loss": 0.2116, "step": 497 }, { "epoch": 1.868667917448405, "grad_norm": 0.09234503645808133, "learning_rate": 7.447046563095424e-05, "loss": 0.2117, "step": 498 }, { "epoch": 1.872420262664165, "grad_norm": 0.09403769810770585, "learning_rate": 7.404766260362152e-05, "loss": 0.2264, "step": 499 }, { "epoch": 1.876172607879925, "grad_norm": 0.09396847439786832, "learning_rate": 7.362535642783155e-05, "loss": 0.2316, "step": 500 }, { "epoch": 1.8799249530956847, "grad_norm": 0.09371768315048483, "learning_rate": 7.320355518853921e-05, "loss": 0.2057, "step": 501 }, { "epoch": 1.8836772983114447, "grad_norm": 0.09339808934851973, "learning_rate": 7.278226696103239e-05, "loss": 0.229, "step": 502 }, { "epoch": 1.8874296435272044, "grad_norm": 0.0944706527223077, "learning_rate": 7.236149981077745e-05, "loss": 0.2454, "step": 503 }, { "epoch": 1.8911819887429644, "grad_norm": 0.10040670311990112, "learning_rate": 7.194126179326497e-05, "loss": 0.2313, "step": 504 }, { "epoch": 1.8949343339587243, "grad_norm": 0.09634234013092864, "learning_rate": 7.152156095385527e-05, "loss": 0.2429, "step": 505 }, { "epoch": 1.898686679174484, "grad_norm": 0.0969012473081891, "learning_rate": 7.110240532762469e-05, "loss": 0.2367, "step": 506 }, { "epoch": 1.9024390243902438, "grad_norm": 0.10040211148018247, "learning_rate": 7.068380293921142e-05, "loss": 0.2382, "step": 507 }, { "epoch": 1.9061913696060038, "grad_norm": 0.0945017927625979, "learning_rate": 7.026576180266214e-05, "loss": 0.209, "step": 508 }, { "epoch": 1.9099437148217637, "grad_norm": 0.0916031152893614, "learning_rate": 6.984828992127842e-05, "loss": 0.224, "step": 509 }, { "epoch": 1.9136960600375235, "grad_norm": 0.09634578850487348, "learning_rate": 6.943139528746366e-05, "loss": 0.2137, "step": 510 }, { "epoch": 1.9174484052532832, "grad_norm": 0.10032861508214572, "learning_rate": 6.901508588256986e-05, "loss": 0.2311, "step": 511 }, { "epoch": 1.9212007504690432, "grad_norm": 0.10382600019704231, "learning_rate": 6.859936967674509e-05, "loss": 0.2157, "step": 512 }, { "epoch": 1.924953095684803, "grad_norm": 0.09398681809831279, "learning_rate": 6.81842546287807e-05, "loss": 0.2179, "step": 513 }, { "epoch": 1.9287054409005628, "grad_norm": 0.09495655588048224, "learning_rate": 6.776974868595898e-05, "loss": 0.2295, "step": 514 }, { "epoch": 1.9324577861163226, "grad_norm": 0.1037059716662824, "learning_rate": 6.735585978390105e-05, "loss": 0.2247, "step": 515 }, { "epoch": 1.9362101313320825, "grad_norm": 0.1034544726598181, "learning_rate": 6.694259584641496e-05, "loss": 0.2274, "step": 516 }, { "epoch": 1.9399624765478425, "grad_norm": 0.09525440277796053, "learning_rate": 6.652996478534394e-05, "loss": 0.2168, "step": 517 }, { "epoch": 1.9437148217636022, "grad_norm": 0.09617598490242872, "learning_rate": 6.611797450041495e-05, "loss": 0.2309, "step": 518 }, { "epoch": 1.947467166979362, "grad_norm": 0.10076971532967963, "learning_rate": 6.570663287908743e-05, "loss": 0.2161, "step": 519 }, { "epoch": 1.951219512195122, "grad_norm": 0.09591701560370183, "learning_rate": 6.52959477964023e-05, "loss": 0.235, "step": 520 }, { "epoch": 1.9549718574108819, "grad_norm": 0.09529947861083574, "learning_rate": 6.488592711483121e-05, "loss": 0.2113, "step": 521 }, { "epoch": 1.9587242026266416, "grad_norm": 0.09828556061683102, "learning_rate": 6.447657868412602e-05, "loss": 0.2232, "step": 522 }, { "epoch": 1.9624765478424016, "grad_norm": 0.09811690552264642, "learning_rate": 6.406791034116846e-05, "loss": 0.2238, "step": 523 }, { "epoch": 1.9662288930581613, "grad_norm": 0.09251959707691187, "learning_rate": 6.365992990982015e-05, "loss": 0.2125, "step": 524 }, { "epoch": 1.9699812382739212, "grad_norm": 0.09208040753409237, "learning_rate": 6.325264520077284e-05, "loss": 0.2041, "step": 525 }, { "epoch": 1.9737335834896812, "grad_norm": 0.09404759185222095, "learning_rate": 6.284606401139875e-05, "loss": 0.2314, "step": 526 }, { "epoch": 1.977485928705441, "grad_norm": 0.09296590290664984, "learning_rate": 6.244019412560144e-05, "loss": 0.2098, "step": 527 }, { "epoch": 1.9812382739212007, "grad_norm": 0.09162797489683842, "learning_rate": 6.203504331366677e-05, "loss": 0.225, "step": 528 }, { "epoch": 1.9849906191369606, "grad_norm": 0.09459057754753163, "learning_rate": 6.163061933211403e-05, "loss": 0.2314, "step": 529 }, { "epoch": 1.9887429643527206, "grad_norm": 0.09501788828069234, "learning_rate": 6.122692992354748e-05, "loss": 0.2184, "step": 530 }, { "epoch": 1.9924953095684803, "grad_norm": 0.09244115084585679, "learning_rate": 6.082398281650823e-05, "loss": 0.2177, "step": 531 }, { "epoch": 1.99624765478424, "grad_norm": 0.09596139173299358, "learning_rate": 6.042178572532609e-05, "loss": 0.226, "step": 532 }, { "epoch": 2.0, "grad_norm": 0.09172506684302523, "learning_rate": 6.002034634997213e-05, "loss": 0.2142, "step": 533 }, { "epoch": 2.0, "eval_loss": 0.23376528918743134, "eval_runtime": 50.1201, "eval_samples_per_second": 35.794, "eval_steps_per_second": 1.137, "step": 533 }, { "epoch": 2.00375234521576, "grad_norm": 0.09079745434183319, "learning_rate": 5.9619672375911065e-05, "loss": 0.207, "step": 534 }, { "epoch": 2.00750469043152, "grad_norm": 0.09318706902825784, "learning_rate": 5.92197714739541e-05, "loss": 0.2, "step": 535 }, { "epoch": 2.0112570356472794, "grad_norm": 0.0915278433170553, "learning_rate": 5.882065130011226e-05, "loss": 0.2163, "step": 536 }, { "epoch": 2.0150093808630394, "grad_norm": 0.0888806657583464, "learning_rate": 5.842231949544963e-05, "loss": 0.1954, "step": 537 }, { "epoch": 2.0187617260787993, "grad_norm": 0.0923342471487866, "learning_rate": 5.80247836859372e-05, "loss": 0.2076, "step": 538 }, { "epoch": 2.0225140712945593, "grad_norm": 0.09920205691642849, "learning_rate": 5.762805148230688e-05, "loss": 0.2169, "step": 539 }, { "epoch": 2.026266416510319, "grad_norm": 0.10035057872617904, "learning_rate": 5.723213047990552e-05, "loss": 0.2014, "step": 540 }, { "epoch": 2.0300187617260788, "grad_norm": 0.09963126276081327, "learning_rate": 5.68370282585499e-05, "loss": 0.2031, "step": 541 }, { "epoch": 2.0337711069418387, "grad_norm": 0.10447528881697037, "learning_rate": 5.6442752382381304e-05, "loss": 0.2068, "step": 542 }, { "epoch": 2.0375234521575987, "grad_norm": 0.10199582392883727, "learning_rate": 5.604931039972099e-05, "loss": 0.2065, "step": 543 }, { "epoch": 2.041275797373358, "grad_norm": 0.09911798520080613, "learning_rate": 5.5656709842925335e-05, "loss": 0.2098, "step": 544 }, { "epoch": 2.045028142589118, "grad_norm": 0.10227122873222887, "learning_rate": 5.5264958228241924e-05, "loss": 0.1965, "step": 545 }, { "epoch": 2.048780487804878, "grad_norm": 0.09678443893625513, "learning_rate": 5.487406305566549e-05, "loss": 0.1945, "step": 546 }, { "epoch": 2.052532833020638, "grad_norm": 0.09615229278305461, "learning_rate": 5.44840318087944e-05, "loss": 0.2018, "step": 547 }, { "epoch": 2.0562851782363976, "grad_norm": 0.09448791165046015, "learning_rate": 5.40948719546873e-05, "loss": 0.1856, "step": 548 }, { "epoch": 2.0600375234521575, "grad_norm": 0.09990654001751424, "learning_rate": 5.370659094372036e-05, "loss": 0.197, "step": 549 }, { "epoch": 2.0637898686679175, "grad_norm": 0.1017134570942473, "learning_rate": 5.331919620944438e-05, "loss": 0.1937, "step": 550 }, { "epoch": 2.0675422138836774, "grad_norm": 0.09999166353485331, "learning_rate": 5.293269516844263e-05, "loss": 0.189, "step": 551 }, { "epoch": 2.071294559099437, "grad_norm": 0.0960462591244089, "learning_rate": 5.2547095220188813e-05, "loss": 0.1986, "step": 552 }, { "epoch": 2.075046904315197, "grad_norm": 0.09497842140609362, "learning_rate": 5.216240374690546e-05, "loss": 0.1839, "step": 553 }, { "epoch": 2.078799249530957, "grad_norm": 0.1035536072035628, "learning_rate": 5.177862811342253e-05, "loss": 0.2036, "step": 554 }, { "epoch": 2.082551594746717, "grad_norm": 0.10347603420133354, "learning_rate": 5.1395775667036425e-05, "loss": 0.2104, "step": 555 }, { "epoch": 2.0863039399624768, "grad_norm": 0.09669567097477653, "learning_rate": 5.101385373736937e-05, "loss": 0.2018, "step": 556 }, { "epoch": 2.0900562851782363, "grad_norm": 0.10564180076893855, "learning_rate": 5.063286963622903e-05, "loss": 0.2087, "step": 557 }, { "epoch": 2.0938086303939962, "grad_norm": 0.10328313340838309, "learning_rate": 5.0252830657468556e-05, "loss": 0.2134, "step": 558 }, { "epoch": 2.097560975609756, "grad_norm": 0.10036199076264615, "learning_rate": 4.987374407684703e-05, "loss": 0.2111, "step": 559 }, { "epoch": 2.101313320825516, "grad_norm": 0.0976925810982622, "learning_rate": 4.949561715189e-05, "loss": 0.1994, "step": 560 }, { "epoch": 2.1050656660412757, "grad_norm": 0.10018786504971282, "learning_rate": 4.911845712175067e-05, "loss": 0.1955, "step": 561 }, { "epoch": 2.1088180112570356, "grad_norm": 0.09919206716593768, "learning_rate": 4.874227120707122e-05, "loss": 0.1746, "step": 562 }, { "epoch": 2.1125703564727956, "grad_norm": 0.10165212679534245, "learning_rate": 4.836706660984467e-05, "loss": 0.2032, "step": 563 }, { "epoch": 2.1163227016885555, "grad_norm": 0.1023373486908179, "learning_rate": 4.7992850513276856e-05, "loss": 0.1964, "step": 564 }, { "epoch": 2.120075046904315, "grad_norm": 0.09675867467113107, "learning_rate": 4.761963008164918e-05, "loss": 0.1917, "step": 565 }, { "epoch": 2.123827392120075, "grad_norm": 0.10215374436744387, "learning_rate": 4.724741246018103e-05, "loss": 0.1901, "step": 566 }, { "epoch": 2.127579737335835, "grad_norm": 0.09936010256528022, "learning_rate": 4.6876204774893375e-05, "loss": 0.185, "step": 567 }, { "epoch": 2.131332082551595, "grad_norm": 0.10095730965547263, "learning_rate": 4.650601413247214e-05, "loss": 0.2001, "step": 568 }, { "epoch": 2.1350844277673544, "grad_norm": 0.10307428822339372, "learning_rate": 4.613684762013217e-05, "loss": 0.2116, "step": 569 }, { "epoch": 2.1388367729831144, "grad_norm": 0.10151450870892983, "learning_rate": 4.57687123054817e-05, "loss": 0.1899, "step": 570 }, { "epoch": 2.1425891181988743, "grad_norm": 0.10279801915183673, "learning_rate": 4.540161523638679e-05, "loss": 0.2199, "step": 571 }, { "epoch": 2.1463414634146343, "grad_norm": 0.10216901995705475, "learning_rate": 4.503556344083656e-05, "loss": 0.2129, "step": 572 }, { "epoch": 2.150093808630394, "grad_norm": 0.09672154783822277, "learning_rate": 4.467056392680863e-05, "loss": 0.1942, "step": 573 }, { "epoch": 2.1538461538461537, "grad_norm": 0.09811181215399747, "learning_rate": 4.4306623682134873e-05, "loss": 0.2009, "step": 574 }, { "epoch": 2.1575984990619137, "grad_norm": 0.09977068165121625, "learning_rate": 4.394374967436783e-05, "loss": 0.2005, "step": 575 }, { "epoch": 2.1613508442776737, "grad_norm": 0.10334743364527872, "learning_rate": 4.3581948850647035e-05, "loss": 0.2177, "step": 576 }, { "epoch": 2.1651031894934336, "grad_norm": 0.10102317468460993, "learning_rate": 4.322122813756623e-05, "loss": 0.2023, "step": 577 }, { "epoch": 2.168855534709193, "grad_norm": 0.1026213811782461, "learning_rate": 4.286159444104068e-05, "loss": 0.1915, "step": 578 }, { "epoch": 2.172607879924953, "grad_norm": 0.10521270371005095, "learning_rate": 4.250305464617493e-05, "loss": 0.1971, "step": 579 }, { "epoch": 2.176360225140713, "grad_norm": 0.10030946045880718, "learning_rate": 4.2145615617131095e-05, "loss": 0.1867, "step": 580 }, { "epoch": 2.180112570356473, "grad_norm": 0.10277698027056928, "learning_rate": 4.178928419699731e-05, "loss": 0.2059, "step": 581 }, { "epoch": 2.1838649155722325, "grad_norm": 0.09971894923334872, "learning_rate": 4.143406720765687e-05, "loss": 0.1846, "step": 582 }, { "epoch": 2.1876172607879925, "grad_norm": 0.1033037305658252, "learning_rate": 4.1079971449657476e-05, "loss": 0.2211, "step": 583 }, { "epoch": 2.1913696060037524, "grad_norm": 0.10205179424227565, "learning_rate": 4.072700370208115e-05, "loss": 0.2111, "step": 584 }, { "epoch": 2.1951219512195124, "grad_norm": 0.10170970223951699, "learning_rate": 4.037517072241435e-05, "loss": 0.2157, "step": 585 }, { "epoch": 2.198874296435272, "grad_norm": 0.10013077468746785, "learning_rate": 4.0024479246418824e-05, "loss": 0.2042, "step": 586 }, { "epoch": 2.202626641651032, "grad_norm": 0.10101033553996595, "learning_rate": 3.967493598800233e-05, "loss": 0.1947, "step": 587 }, { "epoch": 2.206378986866792, "grad_norm": 0.09739456251257238, "learning_rate": 3.9326547639090315e-05, "loss": 0.1899, "step": 588 }, { "epoch": 2.2101313320825517, "grad_norm": 0.10060772593441947, "learning_rate": 3.897932086949778e-05, "loss": 0.1889, "step": 589 }, { "epoch": 2.2138836772983113, "grad_norm": 0.10071997233912698, "learning_rate": 3.863326232680148e-05, "loss": 0.1776, "step": 590 }, { "epoch": 2.217636022514071, "grad_norm": 0.1022213039350476, "learning_rate": 3.828837863621286e-05, "loss": 0.209, "step": 591 }, { "epoch": 2.221388367729831, "grad_norm": 0.09822580435817398, "learning_rate": 3.794467640045102e-05, "loss": 0.1924, "step": 592 }, { "epoch": 2.225140712945591, "grad_norm": 0.10257081241980225, "learning_rate": 3.76021621996163e-05, "loss": 0.1888, "step": 593 }, { "epoch": 2.2288930581613506, "grad_norm": 0.10310065568513274, "learning_rate": 3.7260842591064506e-05, "loss": 0.1953, "step": 594 }, { "epoch": 2.2326454033771106, "grad_norm": 0.1045482144809441, "learning_rate": 3.692072410928115e-05, "loss": 0.1966, "step": 595 }, { "epoch": 2.2363977485928705, "grad_norm": 0.10273894973169112, "learning_rate": 3.658181326575659e-05, "loss": 0.1947, "step": 596 }, { "epoch": 2.2401500938086305, "grad_norm": 0.10525839669259326, "learning_rate": 3.6244116548861085e-05, "loss": 0.1974, "step": 597 }, { "epoch": 2.2439024390243905, "grad_norm": 0.10169087378990962, "learning_rate": 3.590764042372079e-05, "loss": 0.1881, "step": 598 }, { "epoch": 2.24765478424015, "grad_norm": 0.09999835732356765, "learning_rate": 3.557239133209387e-05, "loss": 0.1966, "step": 599 }, { "epoch": 2.25140712945591, "grad_norm": 0.09733303416392536, "learning_rate": 3.523837569224725e-05, "loss": 0.1819, "step": 600 }, { "epoch": 2.25515947467167, "grad_norm": 0.10100776645474248, "learning_rate": 3.4905599898833664e-05, "loss": 0.1793, "step": 601 }, { "epoch": 2.2589118198874294, "grad_norm": 0.1029672174813885, "learning_rate": 3.457407032276935e-05, "loss": 0.1863, "step": 602 }, { "epoch": 2.2626641651031894, "grad_norm": 0.10654075355135953, "learning_rate": 3.4243793311111915e-05, "loss": 0.2027, "step": 603 }, { "epoch": 2.2664165103189493, "grad_norm": 0.0999884893424256, "learning_rate": 3.391477518693894e-05, "loss": 0.1933, "step": 604 }, { "epoch": 2.2701688555347093, "grad_norm": 0.10004615878876773, "learning_rate": 3.3587022249226904e-05, "loss": 0.1929, "step": 605 }, { "epoch": 2.273921200750469, "grad_norm": 0.10933449083370074, "learning_rate": 3.3260540772730574e-05, "loss": 0.1995, "step": 606 }, { "epoch": 2.2776735459662287, "grad_norm": 0.10584448120093172, "learning_rate": 3.293533700786287e-05, "loss": 0.1954, "step": 607 }, { "epoch": 2.2814258911819887, "grad_norm": 0.10327239804726301, "learning_rate": 3.261141718057523e-05, "loss": 0.2079, "step": 608 }, { "epoch": 2.2851782363977486, "grad_norm": 0.10227791430848202, "learning_rate": 3.228878749223842e-05, "loss": 0.1942, "step": 609 }, { "epoch": 2.2889305816135086, "grad_norm": 0.1049992954664816, "learning_rate": 3.1967454119523744e-05, "loss": 0.2155, "step": 610 }, { "epoch": 2.292682926829268, "grad_norm": 0.09845313981810072, "learning_rate": 3.1647423214284856e-05, "loss": 0.2019, "step": 611 }, { "epoch": 2.296435272045028, "grad_norm": 0.10586868882664938, "learning_rate": 3.1328700903440046e-05, "loss": 0.1966, "step": 612 }, { "epoch": 2.300187617260788, "grad_norm": 0.10419647410764402, "learning_rate": 3.101129328885475e-05, "loss": 0.195, "step": 613 }, { "epoch": 2.303939962476548, "grad_norm": 0.10270534216016665, "learning_rate": 3.069520644722492e-05, "loss": 0.2154, "step": 614 }, { "epoch": 2.3076923076923075, "grad_norm": 0.1049524325311392, "learning_rate": 3.0380446429960575e-05, "loss": 0.192, "step": 615 }, { "epoch": 2.3114446529080674, "grad_norm": 0.10182029532251992, "learning_rate": 3.0067019263069972e-05, "loss": 0.2032, "step": 616 }, { "epoch": 2.3151969981238274, "grad_norm": 0.10026439505776784, "learning_rate": 2.9754930947044357e-05, "loss": 0.2009, "step": 617 }, { "epoch": 2.3189493433395874, "grad_norm": 0.10397934271498302, "learning_rate": 2.9444187456742855e-05, "loss": 0.203, "step": 618 }, { "epoch": 2.3227016885553473, "grad_norm": 0.10426527615751131, "learning_rate": 2.9134794741278313e-05, "loss": 0.1966, "step": 619 }, { "epoch": 2.326454033771107, "grad_norm": 0.1056275256554938, "learning_rate": 2.882675872390319e-05, "loss": 0.2045, "step": 620 }, { "epoch": 2.3302063789868668, "grad_norm": 0.10696559879415825, "learning_rate": 2.852008530189637e-05, "loss": 0.2178, "step": 621 }, { "epoch": 2.3339587242026267, "grad_norm": 0.10669888583330973, "learning_rate": 2.8214780346450087e-05, "loss": 0.2016, "step": 622 }, { "epoch": 2.3377110694183862, "grad_norm": 0.09949119223713555, "learning_rate": 2.7910849702557717e-05, "loss": 0.1879, "step": 623 }, { "epoch": 2.341463414634146, "grad_norm": 0.10026543163402477, "learning_rate": 2.760829918890163e-05, "loss": 0.184, "step": 624 }, { "epoch": 2.345215759849906, "grad_norm": 0.1026868883691737, "learning_rate": 2.730713459774198e-05, "loss": 0.1903, "step": 625 }, { "epoch": 2.348968105065666, "grad_norm": 0.1040530218803013, "learning_rate": 2.7007361694805733e-05, "loss": 0.2024, "step": 626 }, { "epoch": 2.352720450281426, "grad_norm": 0.10398152070295266, "learning_rate": 2.670898621917629e-05, "loss": 0.2072, "step": 627 }, { "epoch": 2.3564727954971856, "grad_norm": 0.10446424033377862, "learning_rate": 2.6412013883183696e-05, "loss": 0.2089, "step": 628 }, { "epoch": 2.3602251407129455, "grad_norm": 0.11251268375275413, "learning_rate": 2.6116450372295144e-05, "loss": 0.1932, "step": 629 }, { "epoch": 2.3639774859287055, "grad_norm": 0.10497575845280084, "learning_rate": 2.5822301345006194e-05, "loss": 0.2004, "step": 630 }, { "epoch": 2.3677298311444654, "grad_norm": 0.10289220968071199, "learning_rate": 2.5529572432732474e-05, "loss": 0.1877, "step": 631 }, { "epoch": 2.371482176360225, "grad_norm": 0.10331506916385837, "learning_rate": 2.5238269239701817e-05, "loss": 0.1923, "step": 632 }, { "epoch": 2.375234521575985, "grad_norm": 0.10590823913614765, "learning_rate": 2.4948397342846985e-05, "loss": 0.1976, "step": 633 }, { "epoch": 2.378986866791745, "grad_norm": 0.096413124948388, "learning_rate": 2.4659962291698933e-05, "loss": 0.1941, "step": 634 }, { "epoch": 2.382739212007505, "grad_norm": 0.10013403813194789, "learning_rate": 2.4372969608280482e-05, "loss": 0.1744, "step": 635 }, { "epoch": 2.3864915572232643, "grad_norm": 0.10528880387979549, "learning_rate": 2.4087424787000712e-05, "loss": 0.2059, "step": 636 }, { "epoch": 2.3902439024390243, "grad_norm": 0.09910581750952785, "learning_rate": 2.3803333294549646e-05, "loss": 0.1857, "step": 637 }, { "epoch": 2.3939962476547842, "grad_norm": 0.10340609977970545, "learning_rate": 2.352070056979375e-05, "loss": 0.2025, "step": 638 }, { "epoch": 2.397748592870544, "grad_norm": 0.09706846137528291, "learning_rate": 2.323953202367166e-05, "loss": 0.1873, "step": 639 }, { "epoch": 2.401500938086304, "grad_norm": 0.10231946255293252, "learning_rate": 2.295983303909065e-05, "loss": 0.1961, "step": 640 }, { "epoch": 2.4052532833020637, "grad_norm": 0.09651492781163441, "learning_rate": 2.2681608970823565e-05, "loss": 0.1824, "step": 641 }, { "epoch": 2.4090056285178236, "grad_norm": 0.10132623115233401, "learning_rate": 2.2404865145406352e-05, "loss": 0.2059, "step": 642 }, { "epoch": 2.4127579737335836, "grad_norm": 0.10112743841625528, "learning_rate": 2.2129606861036e-05, "loss": 0.2028, "step": 643 }, { "epoch": 2.416510318949343, "grad_norm": 0.09964804346224858, "learning_rate": 2.1855839387469233e-05, "loss": 0.2125, "step": 644 }, { "epoch": 2.420262664165103, "grad_norm": 0.10427243112295695, "learning_rate": 2.158356796592147e-05, "loss": 0.1977, "step": 645 }, { "epoch": 2.424015009380863, "grad_norm": 0.09897901758651276, "learning_rate": 2.131279780896662e-05, "loss": 0.2038, "step": 646 }, { "epoch": 2.427767354596623, "grad_norm": 0.10099800442823986, "learning_rate": 2.1043534100437124e-05, "loss": 0.1868, "step": 647 }, { "epoch": 2.431519699812383, "grad_norm": 0.10299893970712365, "learning_rate": 2.0775781995324882e-05, "loss": 0.2165, "step": 648 }, { "epoch": 2.4352720450281424, "grad_norm": 0.10399871122403716, "learning_rate": 2.050954661968255e-05, "loss": 0.216, "step": 649 }, { "epoch": 2.4390243902439024, "grad_norm": 0.10175951535441477, "learning_rate": 2.024483307052526e-05, "loss": 0.1937, "step": 650 }, { "epoch": 2.4427767354596623, "grad_norm": 0.10112284461470993, "learning_rate": 1.9981646415733157e-05, "loss": 0.1911, "step": 651 }, { "epoch": 2.4465290806754223, "grad_norm": 0.10402946070557738, "learning_rate": 1.971999169395432e-05, "loss": 0.1938, "step": 652 }, { "epoch": 2.450281425891182, "grad_norm": 0.10696044873030722, "learning_rate": 1.945987391450833e-05, "loss": 0.1987, "step": 653 }, { "epoch": 2.4540337711069418, "grad_norm": 0.10736029154944617, "learning_rate": 1.920129805729043e-05, "loss": 0.2042, "step": 654 }, { "epoch": 2.4577861163227017, "grad_norm": 0.10645182539440759, "learning_rate": 1.8944269072676012e-05, "loss": 0.2003, "step": 655 }, { "epoch": 2.4615384615384617, "grad_norm": 0.10292326662889295, "learning_rate": 1.8688791881426017e-05, "loss": 0.1845, "step": 656 }, { "epoch": 2.465290806754221, "grad_norm": 0.10575280486572482, "learning_rate": 1.843487137459261e-05, "loss": 0.2066, "step": 657 }, { "epoch": 2.469043151969981, "grad_norm": 0.10744007401993587, "learning_rate": 1.8182512413425625e-05, "loss": 0.2152, "step": 658 }, { "epoch": 2.472795497185741, "grad_norm": 0.10784391160924228, "learning_rate": 1.7931719829279447e-05, "loss": 0.2046, "step": 659 }, { "epoch": 2.476547842401501, "grad_norm": 0.10999520632523263, "learning_rate": 1.7682498423520543e-05, "loss": 0.2213, "step": 660 }, { "epoch": 2.480300187617261, "grad_norm": 0.10222317013218042, "learning_rate": 1.7434852967435523e-05, "loss": 0.1963, "step": 661 }, { "epoch": 2.4840525328330205, "grad_norm": 0.1005127738221163, "learning_rate": 1.7188788202139792e-05, "loss": 0.1867, "step": 662 }, { "epoch": 2.4878048780487805, "grad_norm": 0.09610928647550443, "learning_rate": 1.6944308838486824e-05, "loss": 0.1934, "step": 663 }, { "epoch": 2.4915572232645404, "grad_norm": 0.10526028770285865, "learning_rate": 1.6701419556977883e-05, "loss": 0.195, "step": 664 }, { "epoch": 2.4953095684803, "grad_norm": 0.10236251605106483, "learning_rate": 1.6460125007672557e-05, "loss": 0.1866, "step": 665 }, { "epoch": 2.49906191369606, "grad_norm": 0.10176267199727759, "learning_rate": 1.62204298100996e-05, "loss": 0.1882, "step": 666 }, { "epoch": 2.50281425891182, "grad_norm": 0.1019507914805454, "learning_rate": 1.5982338553168563e-05, "loss": 0.2009, "step": 667 }, { "epoch": 2.50656660412758, "grad_norm": 0.10804707767457539, "learning_rate": 1.5745855795081887e-05, "loss": 0.2053, "step": 668 }, { "epoch": 2.5103189493433398, "grad_norm": 0.1035038747763287, "learning_rate": 1.551098606324768e-05, "loss": 0.194, "step": 669 }, { "epoch": 2.5140712945590993, "grad_norm": 0.1050714841483446, "learning_rate": 1.527773385419311e-05, "loss": 0.2035, "step": 670 }, { "epoch": 2.5178236397748592, "grad_norm": 0.10583943020297447, "learning_rate": 1.5046103633478148e-05, "loss": 0.2051, "step": 671 }, { "epoch": 2.521575984990619, "grad_norm": 0.10561215432586364, "learning_rate": 1.4816099835610209e-05, "loss": 0.1979, "step": 672 }, { "epoch": 2.525328330206379, "grad_norm": 0.09979215156338624, "learning_rate": 1.458772686395924e-05, "loss": 0.1853, "step": 673 }, { "epoch": 2.529080675422139, "grad_norm": 0.10204206438042053, "learning_rate": 1.4360989090673283e-05, "loss": 0.1857, "step": 674 }, { "epoch": 2.5328330206378986, "grad_norm": 0.10045106683180502, "learning_rate": 1.4135890856595047e-05, "loss": 0.195, "step": 675 }, { "epoch": 2.5365853658536586, "grad_norm": 0.10797036446361757, "learning_rate": 1.3912436471178526e-05, "loss": 0.2199, "step": 676 }, { "epoch": 2.5403377110694185, "grad_norm": 0.10081545869313864, "learning_rate": 1.3690630212406652e-05, "loss": 0.1946, "step": 677 }, { "epoch": 2.544090056285178, "grad_norm": 0.10284609365181138, "learning_rate": 1.3470476326709336e-05, "loss": 0.186, "step": 678 }, { "epoch": 2.547842401500938, "grad_norm": 0.10827914953666169, "learning_rate": 1.3251979028882177e-05, "loss": 0.2024, "step": 679 }, { "epoch": 2.551594746716698, "grad_norm": 0.10735743283847107, "learning_rate": 1.3035142502005792e-05, "loss": 0.2031, "step": 680 }, { "epoch": 2.555347091932458, "grad_norm": 0.10503191958506779, "learning_rate": 1.2819970897365741e-05, "loss": 0.1817, "step": 681 }, { "epoch": 2.559099437148218, "grad_norm": 0.10184154040199396, "learning_rate": 1.2606468334373001e-05, "loss": 0.2022, "step": 682 }, { "epoch": 2.5628517823639774, "grad_norm": 0.10749329814350675, "learning_rate": 1.2394638900485123e-05, "loss": 0.2045, "step": 683 }, { "epoch": 2.5666041275797373, "grad_norm": 0.1052016831437128, "learning_rate": 1.2184486651128013e-05, "loss": 0.1979, "step": 684 }, { "epoch": 2.5703564727954973, "grad_norm": 0.10912985456356006, "learning_rate": 1.1976015609618241e-05, "loss": 0.2098, "step": 685 }, { "epoch": 2.574108818011257, "grad_norm": 0.11014749857667003, "learning_rate": 1.1769229767086054e-05, "loss": 0.2054, "step": 686 }, { "epoch": 2.5778611632270167, "grad_norm": 0.10424534741987558, "learning_rate": 1.1564133082398942e-05, "loss": 0.1909, "step": 687 }, { "epoch": 2.5816135084427767, "grad_norm": 0.10812308545934828, "learning_rate": 1.1360729482085853e-05, "loss": 0.2094, "step": 688 }, { "epoch": 2.5853658536585367, "grad_norm": 0.10143413919144614, "learning_rate": 1.1159022860262036e-05, "loss": 0.2044, "step": 689 }, { "epoch": 2.5891181988742966, "grad_norm": 0.10206659259014333, "learning_rate": 1.0959017078554457e-05, "loss": 0.1773, "step": 690 }, { "epoch": 2.592870544090056, "grad_norm": 0.09952346727740458, "learning_rate": 1.0760715966027923e-05, "loss": 0.1899, "step": 691 }, { "epoch": 2.596622889305816, "grad_norm": 0.10427412693597317, "learning_rate": 1.0564123319111706e-05, "loss": 0.2002, "step": 692 }, { "epoch": 2.600375234521576, "grad_norm": 0.10249540064121361, "learning_rate": 1.036924290152691e-05, "loss": 0.2028, "step": 693 }, { "epoch": 2.604127579737336, "grad_norm": 0.10843918197727967, "learning_rate": 1.017607844421441e-05, "loss": 0.2077, "step": 694 }, { "epoch": 2.607879924953096, "grad_norm": 0.10288144453666269, "learning_rate": 9.984633645263387e-06, "loss": 0.2013, "step": 695 }, { "epoch": 2.6116322701688555, "grad_norm": 0.10267806866723643, "learning_rate": 9.794912169840565e-06, "loss": 0.1948, "step": 696 }, { "epoch": 2.6153846153846154, "grad_norm": 0.10194354127629204, "learning_rate": 9.606917650120084e-06, "loss": 0.1922, "step": 697 }, { "epoch": 2.6191369606003754, "grad_norm": 0.1017288709531509, "learning_rate": 9.420653685213855e-06, "loss": 0.2014, "step": 698 }, { "epoch": 2.622889305816135, "grad_norm": 0.10414806414329983, "learning_rate": 9.236123841102762e-06, "loss": 0.1897, "step": 699 }, { "epoch": 2.626641651031895, "grad_norm": 0.10091534748202144, "learning_rate": 9.053331650568265e-06, "loss": 0.1822, "step": 700 }, { "epoch": 2.630393996247655, "grad_norm": 0.10057882616031338, "learning_rate": 8.872280613124895e-06, "loss": 0.201, "step": 701 }, { "epoch": 2.6341463414634148, "grad_norm": 0.1064565679095478, "learning_rate": 8.692974194953263e-06, "loss": 0.1929, "step": 702 }, { "epoch": 2.6378986866791747, "grad_norm": 0.1105132931652779, "learning_rate": 8.515415828833561e-06, "loss": 0.2052, "step": 703 }, { "epoch": 2.641651031894934, "grad_norm": 0.10537303709262331, "learning_rate": 8.339608914079944e-06, "loss": 0.1936, "step": 704 }, { "epoch": 2.645403377110694, "grad_norm": 0.1065727333189368, "learning_rate": 8.165556816475461e-06, "loss": 0.1966, "step": 705 }, { "epoch": 2.649155722326454, "grad_norm": 0.1039109704950518, "learning_rate": 7.993262868207552e-06, "loss": 0.1902, "step": 706 }, { "epoch": 2.6529080675422136, "grad_norm": 0.10745276421338502, "learning_rate": 7.822730367804333e-06, "loss": 0.1932, "step": 707 }, { "epoch": 2.6566604127579736, "grad_norm": 0.10876027022253146, "learning_rate": 7.653962580071384e-06, "loss": 0.1995, "step": 708 }, { "epoch": 2.6604127579737336, "grad_norm": 0.103448141377836, "learning_rate": 7.486962736029246e-06, "loss": 0.193, "step": 709 }, { "epoch": 2.6641651031894935, "grad_norm": 0.10395144677705122, "learning_rate": 7.321734032851612e-06, "loss": 0.1882, "step": 710 }, { "epoch": 2.6679174484052535, "grad_norm": 0.10869764134740856, "learning_rate": 7.158279633804077e-06, "loss": 0.1969, "step": 711 }, { "epoch": 2.671669793621013, "grad_norm": 0.10884753585888339, "learning_rate": 6.996602668183605e-06, "loss": 0.2105, "step": 712 }, { "epoch": 2.675422138836773, "grad_norm": 0.1023700486776284, "learning_rate": 6.836706231258583e-06, "loss": 0.2074, "step": 713 }, { "epoch": 2.679174484052533, "grad_norm": 0.10072431214674206, "learning_rate": 6.678593384209597e-06, "loss": 0.2003, "step": 714 }, { "epoch": 2.682926829268293, "grad_norm": 0.1017603847914724, "learning_rate": 6.522267154070816e-06, "loss": 0.1928, "step": 715 }, { "epoch": 2.686679174484053, "grad_norm": 0.10652616399365392, "learning_rate": 6.367730533672034e-06, "loss": 0.186, "step": 716 }, { "epoch": 2.6904315196998123, "grad_norm": 0.09811981179599741, "learning_rate": 6.214986481581364e-06, "loss": 0.1844, "step": 717 }, { "epoch": 2.6941838649155723, "grad_norm": 0.10555082174853377, "learning_rate": 6.0640379220486595e-06, "loss": 0.1967, "step": 718 }, { "epoch": 2.6979362101313322, "grad_norm": 0.10141779697817346, "learning_rate": 5.914887744949427e-06, "loss": 0.1866, "step": 719 }, { "epoch": 2.7016885553470917, "grad_norm": 0.10459464914706551, "learning_rate": 5.767538805729578e-06, "loss": 0.2051, "step": 720 }, { "epoch": 2.7054409005628517, "grad_norm": 0.10000684976584827, "learning_rate": 5.621993925350721e-06, "loss": 0.2038, "step": 721 }, { "epoch": 2.7091932457786116, "grad_norm": 0.1048421212174767, "learning_rate": 5.478255890236184e-06, "loss": 0.2, "step": 722 }, { "epoch": 2.7129455909943716, "grad_norm": 0.0997399916761435, "learning_rate": 5.336327452217682e-06, "loss": 0.1726, "step": 723 }, { "epoch": 2.7166979362101316, "grad_norm": 0.10649021635632532, "learning_rate": 5.196211328482559e-06, "loss": 0.2024, "step": 724 }, { "epoch": 2.720450281425891, "grad_norm": 0.1103866372245366, "learning_rate": 5.057910201521876e-06, "loss": 0.2089, "step": 725 }, { "epoch": 2.724202626641651, "grad_norm": 0.10624735937513594, "learning_rate": 4.921426719078948e-06, "loss": 0.2076, "step": 726 }, { "epoch": 2.727954971857411, "grad_norm": 0.10950236168024759, "learning_rate": 4.786763494098689e-06, "loss": 0.2043, "step": 727 }, { "epoch": 2.7317073170731705, "grad_norm": 0.11231969126745754, "learning_rate": 4.653923104677671e-06, "loss": 0.2085, "step": 728 }, { "epoch": 2.7354596622889304, "grad_norm": 0.10257452327839935, "learning_rate": 4.522908094014655e-06, "loss": 0.1993, "step": 729 }, { "epoch": 2.7392120075046904, "grad_norm": 0.1085844983559947, "learning_rate": 4.393720970361948e-06, "loss": 0.2095, "step": 730 }, { "epoch": 2.7429643527204504, "grad_norm": 0.10415488073529694, "learning_rate": 4.266364206977369e-06, "loss": 0.1857, "step": 731 }, { "epoch": 2.7467166979362103, "grad_norm": 0.10010777498448906, "learning_rate": 4.140840242076926e-06, "loss": 0.1808, "step": 732 }, { "epoch": 2.75046904315197, "grad_norm": 0.10942684469416578, "learning_rate": 4.017151478788117e-06, "loss": 0.1985, "step": 733 }, { "epoch": 2.75422138836773, "grad_norm": 0.10858203215925548, "learning_rate": 3.895300285103931e-06, "loss": 0.2106, "step": 734 }, { "epoch": 2.7579737335834897, "grad_norm": 0.10316282689896458, "learning_rate": 3.7752889938375113e-06, "loss": 0.213, "step": 735 }, { "epoch": 2.7617260787992497, "grad_norm": 0.10337955225655752, "learning_rate": 3.657119902577466e-06, "loss": 0.1863, "step": 736 }, { "epoch": 2.7654784240150097, "grad_norm": 0.10297876529957423, "learning_rate": 3.5407952736439265e-06, "loss": 0.1696, "step": 737 }, { "epoch": 2.769230769230769, "grad_norm": 0.10288855256639455, "learning_rate": 3.4263173340452257e-06, "loss": 0.196, "step": 738 }, { "epoch": 2.772983114446529, "grad_norm": 0.10727866360197454, "learning_rate": 3.313688275435234e-06, "loss": 0.2108, "step": 739 }, { "epoch": 2.776735459662289, "grad_norm": 0.10695531311777713, "learning_rate": 3.202910254071434e-06, "loss": 0.2051, "step": 740 }, { "epoch": 2.7804878048780486, "grad_norm": 0.10397091074857444, "learning_rate": 3.0939853907736126e-06, "loss": 0.203, "step": 741 }, { "epoch": 2.7842401500938085, "grad_norm": 0.10285534187602755, "learning_rate": 2.986915770883281e-06, "loss": 0.1957, "step": 742 }, { "epoch": 2.7879924953095685, "grad_norm": 0.10412156356713502, "learning_rate": 2.8817034442237155e-06, "loss": 0.2076, "step": 743 }, { "epoch": 2.7917448405253285, "grad_norm": 0.10501195738430412, "learning_rate": 2.778350425060794e-06, "loss": 0.1874, "step": 744 }, { "epoch": 2.7954971857410884, "grad_norm": 0.1019400505982182, "learning_rate": 2.6768586920643324e-06, "loss": 0.1729, "step": 745 }, { "epoch": 2.799249530956848, "grad_norm": 0.10373169999599365, "learning_rate": 2.577230188270263e-06, "loss": 0.2094, "step": 746 }, { "epoch": 2.803001876172608, "grad_norm": 0.10490091881822013, "learning_rate": 2.4794668210434193e-06, "loss": 0.1999, "step": 747 }, { "epoch": 2.806754221388368, "grad_norm": 0.10354211641819952, "learning_rate": 2.383570462041029e-06, "loss": 0.1883, "step": 748 }, { "epoch": 2.8105065666041273, "grad_norm": 0.10952103392855697, "learning_rate": 2.2895429471768926e-06, "loss": 0.2081, "step": 749 }, { "epoch": 2.8142589118198873, "grad_norm": 0.10364189198883765, "learning_rate": 2.197386076586183e-06, "loss": 0.1981, "step": 750 }, { "epoch": 2.8180112570356473, "grad_norm": 0.10114787624687957, "learning_rate": 2.107101614591045e-06, "loss": 0.1998, "step": 751 }, { "epoch": 2.821763602251407, "grad_norm": 0.10253948372114673, "learning_rate": 2.018691289666774e-06, "loss": 0.1834, "step": 752 }, { "epoch": 2.825515947467167, "grad_norm": 0.10297906079858929, "learning_rate": 1.932156794408757e-06, "loss": 0.1902, "step": 753 }, { "epoch": 2.8292682926829267, "grad_norm": 0.10384623713716713, "learning_rate": 1.8474997855000176e-06, "loss": 0.1798, "step": 754 }, { "epoch": 2.8330206378986866, "grad_norm": 0.1047445526477421, "learning_rate": 1.7647218836795875e-06, "loss": 0.1965, "step": 755 }, { "epoch": 2.8367729831144466, "grad_norm": 0.10059428430606297, "learning_rate": 1.6838246737113983e-06, "loss": 0.1895, "step": 756 }, { "epoch": 2.8405253283302065, "grad_norm": 0.10396170539773615, "learning_rate": 1.604809704353949e-06, "loss": 0.1888, "step": 757 }, { "epoch": 2.8442776735459665, "grad_norm": 0.10389353521025217, "learning_rate": 1.5276784883307082e-06, "loss": 0.1842, "step": 758 }, { "epoch": 2.848030018761726, "grad_norm": 0.1056300013582629, "learning_rate": 1.4524325023010931e-06, "loss": 0.1976, "step": 759 }, { "epoch": 2.851782363977486, "grad_norm": 0.10790683746729697, "learning_rate": 1.3790731868322471e-06, "loss": 0.1851, "step": 760 }, { "epoch": 2.855534709193246, "grad_norm": 0.10252924390509223, "learning_rate": 1.3076019463714172e-06, "loss": 0.1782, "step": 761 }, { "epoch": 2.8592870544090054, "grad_norm": 0.10393059447949725, "learning_rate": 1.238020149219099e-06, "loss": 0.1935, "step": 762 }, { "epoch": 2.8630393996247654, "grad_norm": 0.10158319963272996, "learning_rate": 1.1703291275028228e-06, "loss": 0.1761, "step": 763 }, { "epoch": 2.8667917448405253, "grad_norm": 0.10007484289927086, "learning_rate": 1.1045301771516747e-06, "loss": 0.1991, "step": 764 }, { "epoch": 2.8705440900562853, "grad_norm": 0.10896631717021059, "learning_rate": 1.0406245578714612e-06, "loss": 0.2007, "step": 765 }, { "epoch": 2.8742964352720453, "grad_norm": 0.10304958661622615, "learning_rate": 9.786134931205726e-07, "loss": 0.1922, "step": 766 }, { "epoch": 2.8780487804878048, "grad_norm": 0.10650838974325404, "learning_rate": 9.184981700866346e-07, "loss": 0.2001, "step": 767 }, { "epoch": 2.8818011257035647, "grad_norm": 0.10737499476286118, "learning_rate": 8.602797396636942e-07, "loss": 0.1981, "step": 768 }, { "epoch": 2.8855534709193247, "grad_norm": 0.10219446494494909, "learning_rate": 8.039593164302362e-07, "loss": 0.1971, "step": 769 }, { "epoch": 2.889305816135084, "grad_norm": 0.10740869389844176, "learning_rate": 7.495379786278456e-07, "loss": 0.1974, "step": 770 }, { "epoch": 2.893058161350844, "grad_norm": 0.10555604472352258, "learning_rate": 6.97016768140546e-07, "loss": 0.1962, "step": 771 }, { "epoch": 2.896810506566604, "grad_norm": 0.10785151072047686, "learning_rate": 6.463966904748486e-07, "loss": 0.2152, "step": 772 }, { "epoch": 2.900562851782364, "grad_norm": 0.10469816573622616, "learning_rate": 5.97678714740535e-07, "loss": 0.1927, "step": 773 }, { "epoch": 2.904315196998124, "grad_norm": 0.10791670699957659, "learning_rate": 5.508637736320488e-07, "loss": 0.2068, "step": 774 }, { "epoch": 2.9080675422138835, "grad_norm": 0.11113541212866279, "learning_rate": 5.059527634107109e-07, "loss": 0.2174, "step": 775 }, { "epoch": 2.9118198874296435, "grad_norm": 0.10652878686727578, "learning_rate": 4.6294654388748804e-07, "loss": 0.217, "step": 776 }, { "epoch": 2.9155722326454034, "grad_norm": 0.10407054116599615, "learning_rate": 4.2184593840659537e-07, "loss": 0.1986, "step": 777 }, { "epoch": 2.919324577861163, "grad_norm": 0.10704242060898553, "learning_rate": 3.8265173382968644e-07, "loss": 0.1993, "step": 778 }, { "epoch": 2.9230769230769234, "grad_norm": 0.10328465891363892, "learning_rate": 3.45364680520821e-07, "loss": 0.1957, "step": 779 }, { "epoch": 2.926829268292683, "grad_norm": 0.10581130543096962, "learning_rate": 3.0998549233205443e-07, "loss": 0.2037, "step": 780 }, { "epoch": 2.930581613508443, "grad_norm": 0.10489859992730348, "learning_rate": 2.7651484658984816e-07, "loss": 0.2022, "step": 781 }, { "epoch": 2.9343339587242028, "grad_norm": 0.10090834154030347, "learning_rate": 2.4495338408201394e-07, "loss": 0.1823, "step": 782 }, { "epoch": 2.9380863039399623, "grad_norm": 0.1053894032543558, "learning_rate": 2.1530170904551228e-07, "loss": 0.1958, "step": 783 }, { "epoch": 2.9418386491557222, "grad_norm": 0.10515240032700511, "learning_rate": 1.8756038915486163e-07, "loss": 0.2169, "step": 784 }, { "epoch": 2.945590994371482, "grad_norm": 0.10072328781410543, "learning_rate": 1.6172995551125837e-07, "loss": 0.1858, "step": 785 }, { "epoch": 2.949343339587242, "grad_norm": 0.10103316711760477, "learning_rate": 1.3781090263242924e-07, "loss": 0.2127, "step": 786 }, { "epoch": 2.953095684803002, "grad_norm": 0.10985126613219337, "learning_rate": 1.1580368844316125e-07, "loss": 0.2123, "step": 787 }, { "epoch": 2.9568480300187616, "grad_norm": 0.10761272038138127, "learning_rate": 9.570873426649752e-08, "loss": 0.1809, "step": 788 }, { "epoch": 2.9606003752345216, "grad_norm": 0.10492142661860061, "learning_rate": 7.752642481573258e-08, "loss": 0.2002, "step": 789 }, { "epoch": 2.9643527204502815, "grad_norm": 0.10018242564611118, "learning_rate": 6.125710818701835e-08, "loss": 0.209, "step": 790 }, { "epoch": 2.968105065666041, "grad_norm": 0.10682008091523718, "learning_rate": 4.6901095852680544e-08, "loss": 0.2004, "step": 791 }, { "epoch": 2.971857410881801, "grad_norm": 0.10106577269812643, "learning_rate": 3.4458662655267873e-08, "loss": 0.1897, "step": 792 }, { "epoch": 2.975609756097561, "grad_norm": 0.10793878514656147, "learning_rate": 2.3930046802322913e-08, "loss": 0.1921, "step": 793 }, { "epoch": 2.979362101313321, "grad_norm": 0.10080948051676845, "learning_rate": 1.5315449861774688e-08, "loss": 0.1829, "step": 794 }, { "epoch": 2.983114446529081, "grad_norm": 0.10459513366578907, "learning_rate": 8.615036758108374e-09, "loss": 0.1886, "step": 795 }, { "epoch": 2.9868667917448404, "grad_norm": 0.10329003618449013, "learning_rate": 3.828935769190078e-09, "loss": 0.1938, "step": 796 }, { "epoch": 2.9906191369606003, "grad_norm": 0.10558471702291192, "learning_rate": 9.572385238243442e-10, "loss": 0.2038, "step": 797 }, { "epoch": 2.9943714821763603, "grad_norm": 0.11048073093068178, "learning_rate": 0.0, "loss": 0.2057, "step": 798 }, { "epoch": 2.9943714821763603, "eval_loss": 0.23170919716358185, "eval_runtime": 49.215, "eval_samples_per_second": 36.452, "eval_steps_per_second": 1.158, "step": 798 }, { "epoch": 2.9943714821763603, "step": 798, "total_flos": 2.2979867712657818e+17, "train_loss": 0.26831792730063125, "train_runtime": 7564.9733, "train_samples_per_second": 13.512, "train_steps_per_second": 0.105 } ], "logging_steps": 1, "max_steps": 798, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2979867712657818e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }