{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99812382739212, "eval_steps": 500, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00375234521575985, "grad_norm": 1.1422045632346107, "learning_rate": 2.5e-06, "loss": 1.4765, "step": 1 }, { "epoch": 0.0075046904315197, "grad_norm": 1.10606272348653, "learning_rate": 5e-06, "loss": 1.4679, "step": 2 }, { "epoch": 0.01125703564727955, "grad_norm": 1.124285036588602, "learning_rate": 7.5e-06, "loss": 1.4926, "step": 3 }, { "epoch": 0.0150093808630394, "grad_norm": 1.175650360755083, "learning_rate": 1e-05, "loss": 1.4946, "step": 4 }, { "epoch": 0.01876172607879925, "grad_norm": 1.1176055565878193, "learning_rate": 1.25e-05, "loss": 1.4803, "step": 5 }, { "epoch": 0.0225140712945591, "grad_norm": 1.080965163228283, "learning_rate": 1.5e-05, "loss": 1.4069, "step": 6 }, { "epoch": 0.02626641651031895, "grad_norm": 1.073199125902437, "learning_rate": 1.75e-05, "loss": 1.4175, "step": 7 }, { "epoch": 0.0300187617260788, "grad_norm": 0.9895651667655394, "learning_rate": 2e-05, "loss": 1.3952, "step": 8 }, { "epoch": 0.03377110694183865, "grad_norm": 0.8484409515335725, "learning_rate": 2.25e-05, "loss": 1.3084, "step": 9 }, { "epoch": 0.0375234521575985, "grad_norm": 0.656224580389129, "learning_rate": 2.5e-05, "loss": 1.2224, "step": 10 }, { "epoch": 0.04127579737335835, "grad_norm": 0.6681802871972625, "learning_rate": 2.7500000000000004e-05, "loss": 1.2279, "step": 11 }, { "epoch": 0.0450281425891182, "grad_norm": 0.6445930931164492, "learning_rate": 3e-05, "loss": 1.1869, "step": 12 }, { "epoch": 0.04878048780487805, "grad_norm": 0.6774830464098534, "learning_rate": 3.2500000000000004e-05, "loss": 1.1345, "step": 13 }, { "epoch": 0.0525328330206379, "grad_norm": 0.7129957171173121, "learning_rate": 3.5e-05, "loss": 1.0342, "step": 14 }, { "epoch": 0.05628517823639775, "grad_norm": 0.6988046692034513, "learning_rate": 3.7500000000000003e-05, "loss": 0.9683, "step": 15 }, { "epoch": 0.0600375234521576, "grad_norm": 0.7305746200421179, "learning_rate": 4e-05, "loss": 0.8998, "step": 16 }, { "epoch": 0.06378986866791744, "grad_norm": 0.6303366234907746, "learning_rate": 4.25e-05, "loss": 0.8585, "step": 17 }, { "epoch": 0.0675422138836773, "grad_norm": 0.6262466336688131, "learning_rate": 4.5e-05, "loss": 0.7913, "step": 18 }, { "epoch": 0.07129455909943715, "grad_norm": 0.5934168135285741, "learning_rate": 4.75e-05, "loss": 0.7358, "step": 19 }, { "epoch": 0.075046904315197, "grad_norm": 0.5003901957180881, "learning_rate": 5e-05, "loss": 0.6762, "step": 20 }, { "epoch": 0.07879924953095685, "grad_norm": 0.44247641980699626, "learning_rate": 5.25e-05, "loss": 0.6412, "step": 21 }, { "epoch": 0.0825515947467167, "grad_norm": 0.33108999413889184, "learning_rate": 5.500000000000001e-05, "loss": 0.6021, "step": 22 }, { "epoch": 0.08630393996247655, "grad_norm": 0.30987297699741684, "learning_rate": 5.7499999999999995e-05, "loss": 0.5678, "step": 23 }, { "epoch": 0.0900562851782364, "grad_norm": 0.2879383883871797, "learning_rate": 6e-05, "loss": 0.5653, "step": 24 }, { "epoch": 0.09380863039399624, "grad_norm": 0.4921785771111378, "learning_rate": 6.25e-05, "loss": 0.5397, "step": 25 }, { "epoch": 0.0975609756097561, "grad_norm": 0.23455468567206647, "learning_rate": 6.500000000000001e-05, "loss": 0.5392, "step": 26 }, { "epoch": 0.10131332082551595, "grad_norm": 0.21551936996375468, "learning_rate": 6.750000000000001e-05, "loss": 0.5423, "step": 27 }, { "epoch": 0.1050656660412758, "grad_norm": 0.2138475404490417, "learning_rate": 7e-05, "loss": 0.5072, "step": 28 }, { "epoch": 0.10881801125703565, "grad_norm": 0.1981260579789532, "learning_rate": 7.25e-05, "loss": 0.4927, "step": 29 }, { "epoch": 0.1125703564727955, "grad_norm": 0.19766175304738637, "learning_rate": 7.500000000000001e-05, "loss": 0.4992, "step": 30 }, { "epoch": 0.11632270168855535, "grad_norm": 0.16180823154197033, "learning_rate": 7.75e-05, "loss": 0.5078, "step": 31 }, { "epoch": 0.1200750469043152, "grad_norm": 0.15792678361397225, "learning_rate": 8e-05, "loss": 0.4834, "step": 32 }, { "epoch": 0.12382739212007504, "grad_norm": 0.17901823211719936, "learning_rate": 8.25e-05, "loss": 0.5038, "step": 33 }, { "epoch": 0.1275797373358349, "grad_norm": 0.15291985686600748, "learning_rate": 8.5e-05, "loss": 0.463, "step": 34 }, { "epoch": 0.13133208255159476, "grad_norm": 0.1402347205035838, "learning_rate": 8.75e-05, "loss": 0.4678, "step": 35 }, { "epoch": 0.1350844277673546, "grad_norm": 0.1292157193781673, "learning_rate": 9e-05, "loss": 0.48, "step": 36 }, { "epoch": 0.13883677298311445, "grad_norm": 0.12200374600393228, "learning_rate": 9.250000000000001e-05, "loss": 0.4678, "step": 37 }, { "epoch": 0.1425891181988743, "grad_norm": 0.12645974836123272, "learning_rate": 9.5e-05, "loss": 0.4783, "step": 38 }, { "epoch": 0.14634146341463414, "grad_norm": 0.12515993032794973, "learning_rate": 9.75e-05, "loss": 0.4558, "step": 39 }, { "epoch": 0.150093808630394, "grad_norm": 0.1257915818218713, "learning_rate": 0.0001, "loss": 0.4582, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 0.11519718216680118, "learning_rate": 0.0001025, "loss": 0.4433, "step": 41 }, { "epoch": 0.1575984990619137, "grad_norm": 0.11408287464445384, "learning_rate": 0.000105, "loss": 0.4566, "step": 42 }, { "epoch": 0.16135084427767354, "grad_norm": 0.11355997287120467, "learning_rate": 0.0001075, "loss": 0.4601, "step": 43 }, { "epoch": 0.1651031894934334, "grad_norm": 0.1236061343834286, "learning_rate": 0.00011000000000000002, "loss": 0.4279, "step": 44 }, { "epoch": 0.16885553470919323, "grad_norm": 0.11292335583297317, "learning_rate": 0.00011250000000000001, "loss": 0.4242, "step": 45 }, { "epoch": 0.1726078799249531, "grad_norm": 0.10830414207227934, "learning_rate": 0.00011499999999999999, "loss": 0.4392, "step": 46 }, { "epoch": 0.17636022514071295, "grad_norm": 0.1130446420034329, "learning_rate": 0.00011750000000000001, "loss": 0.4175, "step": 47 }, { "epoch": 0.1801125703564728, "grad_norm": 0.10972733489410498, "learning_rate": 0.00012, "loss": 0.4064, "step": 48 }, { "epoch": 0.18386491557223264, "grad_norm": 0.11723858927517143, "learning_rate": 0.00012250000000000002, "loss": 0.4618, "step": 49 }, { "epoch": 0.18761726078799248, "grad_norm": 0.12979793592348535, "learning_rate": 0.000125, "loss": 0.4413, "step": 50 }, { "epoch": 0.19136960600375236, "grad_norm": 0.12190484063649769, "learning_rate": 0.0001275, "loss": 0.4212, "step": 51 }, { "epoch": 0.1951219512195122, "grad_norm": 0.1200977200253699, "learning_rate": 0.00013000000000000002, "loss": 0.4236, "step": 52 }, { "epoch": 0.19887429643527205, "grad_norm": 0.11576799257930886, "learning_rate": 0.0001325, "loss": 0.4328, "step": 53 }, { "epoch": 0.2026266416510319, "grad_norm": 0.11804398873031127, "learning_rate": 0.00013500000000000003, "loss": 0.3906, "step": 54 }, { "epoch": 0.20637898686679174, "grad_norm": 0.11890529087801377, "learning_rate": 0.0001375, "loss": 0.4092, "step": 55 }, { "epoch": 0.2101313320825516, "grad_norm": 0.11537178670561035, "learning_rate": 0.00014, "loss": 0.4026, "step": 56 }, { "epoch": 0.21388367729831145, "grad_norm": 0.11591938376106178, "learning_rate": 0.00014250000000000002, "loss": 0.3678, "step": 57 }, { "epoch": 0.2176360225140713, "grad_norm": 0.12025566814049414, "learning_rate": 0.000145, "loss": 0.3791, "step": 58 }, { "epoch": 0.22138836772983114, "grad_norm": 0.13088656560108905, "learning_rate": 0.0001475, "loss": 0.3906, "step": 59 }, { "epoch": 0.225140712945591, "grad_norm": 0.12366551138693345, "learning_rate": 0.00015000000000000001, "loss": 0.3769, "step": 60 }, { "epoch": 0.22889305816135083, "grad_norm": 0.12338960635908504, "learning_rate": 0.0001525, "loss": 0.3806, "step": 61 }, { "epoch": 0.2326454033771107, "grad_norm": 0.12957742480845902, "learning_rate": 0.000155, "loss": 0.365, "step": 62 }, { "epoch": 0.23639774859287055, "grad_norm": 0.1282017025800552, "learning_rate": 0.0001575, "loss": 0.3637, "step": 63 }, { "epoch": 0.2401500938086304, "grad_norm": 0.12685377163368308, "learning_rate": 0.00016, "loss": 0.3813, "step": 64 }, { "epoch": 0.24390243902439024, "grad_norm": 0.12515445789228635, "learning_rate": 0.00016250000000000002, "loss": 0.3665, "step": 65 }, { "epoch": 0.24765478424015008, "grad_norm": 0.12780302020094111, "learning_rate": 0.000165, "loss": 0.372, "step": 66 }, { "epoch": 0.25140712945590993, "grad_norm": 0.13509915457231544, "learning_rate": 0.0001675, "loss": 0.3964, "step": 67 }, { "epoch": 0.2551594746716698, "grad_norm": 0.1324450895974203, "learning_rate": 0.00017, "loss": 0.3809, "step": 68 }, { "epoch": 0.2589118198874296, "grad_norm": 0.14039471561000108, "learning_rate": 0.00017250000000000002, "loss": 0.3788, "step": 69 }, { "epoch": 0.2626641651031895, "grad_norm": 0.13748884493823293, "learning_rate": 0.000175, "loss": 0.3477, "step": 70 }, { "epoch": 0.26641651031894936, "grad_norm": 0.12981102084999996, "learning_rate": 0.0001775, "loss": 0.3803, "step": 71 }, { "epoch": 0.2701688555347092, "grad_norm": 0.12375391443012415, "learning_rate": 0.00018, "loss": 0.3557, "step": 72 }, { "epoch": 0.27392120075046905, "grad_norm": 0.12792478465226367, "learning_rate": 0.0001825, "loss": 0.351, "step": 73 }, { "epoch": 0.2776735459662289, "grad_norm": 0.1281934594676182, "learning_rate": 0.00018500000000000002, "loss": 0.3662, "step": 74 }, { "epoch": 0.28142589118198874, "grad_norm": 0.13402822886419208, "learning_rate": 0.0001875, "loss": 0.3491, "step": 75 }, { "epoch": 0.2851782363977486, "grad_norm": 0.1292536897601892, "learning_rate": 0.00019, "loss": 0.3415, "step": 76 }, { "epoch": 0.28893058161350843, "grad_norm": 0.16014546584829106, "learning_rate": 0.00019250000000000002, "loss": 0.3493, "step": 77 }, { "epoch": 0.2926829268292683, "grad_norm": 0.1393384528675237, "learning_rate": 0.000195, "loss": 0.3509, "step": 78 }, { "epoch": 0.2964352720450281, "grad_norm": 0.15248843449290234, "learning_rate": 0.00019750000000000003, "loss": 0.3328, "step": 79 }, { "epoch": 0.300187617260788, "grad_norm": 0.1478683373584156, "learning_rate": 0.0002, "loss": 0.339, "step": 80 }, { "epoch": 0.30393996247654786, "grad_norm": 0.1457678828189889, "learning_rate": 0.00019999904276147618, "loss": 0.3536, "step": 81 }, { "epoch": 0.3076923076923077, "grad_norm": 0.15185000879528737, "learning_rate": 0.00019999617106423082, "loss": 0.3529, "step": 82 }, { "epoch": 0.31144465290806755, "grad_norm": 0.15201189365883755, "learning_rate": 0.0001999913849632419, "loss": 0.3548, "step": 83 }, { "epoch": 0.3151969981238274, "grad_norm": 0.14879326753679958, "learning_rate": 0.00019998468455013823, "loss": 0.3264, "step": 84 }, { "epoch": 0.31894934333958724, "grad_norm": 0.14083771591440533, "learning_rate": 0.00019997606995319768, "loss": 0.3331, "step": 85 }, { "epoch": 0.3227016885553471, "grad_norm": 0.1503929432468549, "learning_rate": 0.00019996554133734474, "loss": 0.3282, "step": 86 }, { "epoch": 0.32645403377110693, "grad_norm": 0.14030605779758232, "learning_rate": 0.00019995309890414732, "loss": 0.3216, "step": 87 }, { "epoch": 0.3302063789868668, "grad_norm": 0.13891895714301467, "learning_rate": 0.000199938742891813, "loss": 0.3049, "step": 88 }, { "epoch": 0.3339587242026266, "grad_norm": 0.13742909920708118, "learning_rate": 0.00019992247357518428, "loss": 0.3252, "step": 89 }, { "epoch": 0.33771106941838647, "grad_norm": 0.14398237502236147, "learning_rate": 0.0001999042912657335, "loss": 0.3226, "step": 90 }, { "epoch": 0.34146341463414637, "grad_norm": 0.14292774523614082, "learning_rate": 0.00019988419631155683, "loss": 0.3323, "step": 91 }, { "epoch": 0.3452157598499062, "grad_norm": 0.14529808441186043, "learning_rate": 0.00019986218909736757, "loss": 0.3621, "step": 92 }, { "epoch": 0.34896810506566606, "grad_norm": 0.14363660567228131, "learning_rate": 0.00019983827004448873, "loss": 0.3325, "step": 93 }, { "epoch": 0.3527204502814259, "grad_norm": 0.14053215950288314, "learning_rate": 0.00019981243961084515, "loss": 0.3317, "step": 94 }, { "epoch": 0.35647279549718575, "grad_norm": 0.12839662363868307, "learning_rate": 0.0001997846982909545, "loss": 0.3017, "step": 95 }, { "epoch": 0.3602251407129456, "grad_norm": 0.1421301998134749, "learning_rate": 0.000199755046615918, "loss": 0.3236, "step": 96 }, { "epoch": 0.36397748592870544, "grad_norm": 0.1475029420066679, "learning_rate": 0.00019972348515341016, "loss": 0.3362, "step": 97 }, { "epoch": 0.3677298311444653, "grad_norm": 0.13378279730516257, "learning_rate": 0.00019969001450766794, "loss": 0.3254, "step": 98 }, { "epoch": 0.3714821763602251, "grad_norm": 0.1497261207938794, "learning_rate": 0.0001996546353194792, "loss": 0.3156, "step": 99 }, { "epoch": 0.37523452157598497, "grad_norm": 0.1356839966194173, "learning_rate": 0.00019961734826617035, "loss": 0.3282, "step": 100 }, { "epoch": 0.3789868667917448, "grad_norm": 0.12386942577985954, "learning_rate": 0.0001995781540615934, "loss": 0.3207, "step": 101 }, { "epoch": 0.3827392120075047, "grad_norm": 0.16584604505517364, "learning_rate": 0.0001995370534561125, "loss": 0.3026, "step": 102 }, { "epoch": 0.38649155722326456, "grad_norm": 0.1277560294599099, "learning_rate": 0.0001994940472365893, "loss": 0.322, "step": 103 }, { "epoch": 0.3902439024390244, "grad_norm": 0.13567813426924816, "learning_rate": 0.00019944913622636795, "loss": 0.3232, "step": 104 }, { "epoch": 0.39399624765478425, "grad_norm": 0.12123496832228846, "learning_rate": 0.0001994023212852595, "loss": 0.2972, "step": 105 }, { "epoch": 0.3977485928705441, "grad_norm": 0.13879373741004694, "learning_rate": 0.00019935360330952518, "loss": 0.3005, "step": 106 }, { "epoch": 0.40150093808630394, "grad_norm": 0.1274679949876301, "learning_rate": 0.00019930298323185945, "loss": 0.3119, "step": 107 }, { "epoch": 0.4052532833020638, "grad_norm": 0.13101222758435194, "learning_rate": 0.00019925046202137216, "loss": 0.2939, "step": 108 }, { "epoch": 0.4090056285178236, "grad_norm": 0.12738472548497895, "learning_rate": 0.00019919604068356978, "loss": 0.3093, "step": 109 }, { "epoch": 0.41275797373358347, "grad_norm": 0.1490015817444115, "learning_rate": 0.00019913972026033632, "loss": 0.2844, "step": 110 }, { "epoch": 0.4165103189493433, "grad_norm": 0.1470790264142207, "learning_rate": 0.00019908150182991339, "loss": 0.2872, "step": 111 }, { "epoch": 0.4202626641651032, "grad_norm": 0.12721396486874495, "learning_rate": 0.00019902138650687942, "loss": 0.3043, "step": 112 }, { "epoch": 0.42401500938086306, "grad_norm": 0.13891744298891914, "learning_rate": 0.00019895937544212858, "loss": 0.3009, "step": 113 }, { "epoch": 0.4277673545966229, "grad_norm": 0.134346074178801, "learning_rate": 0.00019889546982284834, "loss": 0.3013, "step": 114 }, { "epoch": 0.43151969981238275, "grad_norm": 0.1379066741076229, "learning_rate": 0.00019882967087249718, "loss": 0.3052, "step": 115 }, { "epoch": 0.4352720450281426, "grad_norm": 0.12972548899740632, "learning_rate": 0.0001987619798507809, "loss": 0.3124, "step": 116 }, { "epoch": 0.43902439024390244, "grad_norm": 0.12813310196115213, "learning_rate": 0.0001986923980536286, "loss": 0.2893, "step": 117 }, { "epoch": 0.4427767354596623, "grad_norm": 0.13797054317394944, "learning_rate": 0.00019862092681316776, "loss": 0.3016, "step": 118 }, { "epoch": 0.44652908067542213, "grad_norm": 0.13780600670778337, "learning_rate": 0.0001985475674976989, "loss": 0.3158, "step": 119 }, { "epoch": 0.450281425891182, "grad_norm": 0.13926178383999727, "learning_rate": 0.0001984723215116693, "loss": 0.2801, "step": 120 }, { "epoch": 0.4540337711069418, "grad_norm": 0.1369353496922525, "learning_rate": 0.00019839519029564605, "loss": 0.305, "step": 121 }, { "epoch": 0.45778611632270166, "grad_norm": 0.13937382639705567, "learning_rate": 0.00019831617532628862, "loss": 0.3176, "step": 122 }, { "epoch": 0.46153846153846156, "grad_norm": 0.14086276027188518, "learning_rate": 0.00019823527811632042, "loss": 0.2879, "step": 123 }, { "epoch": 0.4652908067542214, "grad_norm": 0.13282215800163436, "learning_rate": 0.00019815250021449997, "loss": 0.2996, "step": 124 }, { "epoch": 0.46904315196998125, "grad_norm": 0.12757163326850707, "learning_rate": 0.00019806784320559127, "loss": 0.3006, "step": 125 }, { "epoch": 0.4727954971857411, "grad_norm": 0.14854709123219104, "learning_rate": 0.00019798130871033322, "loss": 0.301, "step": 126 }, { "epoch": 0.47654784240150094, "grad_norm": 0.13087500973091548, "learning_rate": 0.00019789289838540897, "loss": 0.2902, "step": 127 }, { "epoch": 0.4803001876172608, "grad_norm": 0.1433475392806627, "learning_rate": 0.00019780261392341383, "loss": 0.2926, "step": 128 }, { "epoch": 0.48405253283302063, "grad_norm": 0.1341283559656879, "learning_rate": 0.0001977104570528231, "loss": 0.2602, "step": 129 }, { "epoch": 0.4878048780487805, "grad_norm": 0.1607197394251248, "learning_rate": 0.00019761642953795895, "loss": 0.2984, "step": 130 }, { "epoch": 0.4915572232645403, "grad_norm": 0.11856150621760517, "learning_rate": 0.0001975205331789566, "loss": 0.2988, "step": 131 }, { "epoch": 0.49530956848030017, "grad_norm": 0.14014139613661877, "learning_rate": 0.00019742276981172976, "loss": 0.291, "step": 132 }, { "epoch": 0.49906191369606, "grad_norm": 0.12881861735846314, "learning_rate": 0.00019732314130793568, "loss": 0.2971, "step": 133 }, { "epoch": 0.5028142589118199, "grad_norm": 0.11788683351931176, "learning_rate": 0.00019722164957493922, "loss": 0.2766, "step": 134 }, { "epoch": 0.5065666041275797, "grad_norm": 0.13746078706666037, "learning_rate": 0.0001971182965557763, "loss": 0.2886, "step": 135 }, { "epoch": 0.5103189493433395, "grad_norm": 0.12745519285890888, "learning_rate": 0.00019701308422911672, "loss": 0.2963, "step": 136 }, { "epoch": 0.5140712945590994, "grad_norm": 0.11835270726835292, "learning_rate": 0.0001969060146092264, "loss": 0.2995, "step": 137 }, { "epoch": 0.5178236397748592, "grad_norm": 0.14011034379489426, "learning_rate": 0.0001967970897459286, "loss": 0.2881, "step": 138 }, { "epoch": 0.5215759849906192, "grad_norm": 0.13060776440495228, "learning_rate": 0.0001966863117245648, "loss": 0.2765, "step": 139 }, { "epoch": 0.525328330206379, "grad_norm": 0.14161693580554588, "learning_rate": 0.00019657368266595476, "loss": 0.281, "step": 140 }, { "epoch": 0.5290806754221389, "grad_norm": 0.12125364150709082, "learning_rate": 0.00019645920472635608, "loss": 0.2732, "step": 141 }, { "epoch": 0.5328330206378987, "grad_norm": 0.1334127552945295, "learning_rate": 0.00019634288009742255, "loss": 0.2523, "step": 142 }, { "epoch": 0.5365853658536586, "grad_norm": 0.12113573146827264, "learning_rate": 0.0001962247110061625, "loss": 0.2775, "step": 143 }, { "epoch": 0.5403377110694184, "grad_norm": 0.12331032028922699, "learning_rate": 0.00019610469971489608, "loss": 0.2687, "step": 144 }, { "epoch": 0.5440900562851783, "grad_norm": 0.13237586077608754, "learning_rate": 0.00019598284852121188, "loss": 0.2774, "step": 145 }, { "epoch": 0.5478424015009381, "grad_norm": 0.12199880756983131, "learning_rate": 0.0001958591597579231, "loss": 0.2815, "step": 146 }, { "epoch": 0.551594746716698, "grad_norm": 0.11915746795874955, "learning_rate": 0.00019573363579302266, "loss": 0.2558, "step": 147 }, { "epoch": 0.5553470919324578, "grad_norm": 0.11644382804351376, "learning_rate": 0.00019560627902963807, "loss": 0.2951, "step": 148 }, { "epoch": 0.5590994371482176, "grad_norm": 0.1317161794959933, "learning_rate": 0.00019547709190598534, "loss": 0.2629, "step": 149 }, { "epoch": 0.5628517823639775, "grad_norm": 0.13859313218362884, "learning_rate": 0.00019534607689532233, "loss": 0.2884, "step": 150 }, { "epoch": 0.5666041275797373, "grad_norm": 0.1643061756146766, "learning_rate": 0.00019521323650590133, "loss": 0.2932, "step": 151 }, { "epoch": 0.5703564727954972, "grad_norm": 0.12366306539172685, "learning_rate": 0.00019507857328092108, "loss": 0.2861, "step": 152 }, { "epoch": 0.574108818011257, "grad_norm": 0.12624207186548378, "learning_rate": 0.00019494208979847812, "loss": 0.2796, "step": 153 }, { "epoch": 0.5778611632270169, "grad_norm": 0.12237336350000451, "learning_rate": 0.00019480378867151746, "loss": 0.273, "step": 154 }, { "epoch": 0.5816135084427767, "grad_norm": 0.12323433685041912, "learning_rate": 0.00019466367254778233, "loss": 0.2747, "step": 155 }, { "epoch": 0.5853658536585366, "grad_norm": 0.12577598956544817, "learning_rate": 0.0001945217441097638, "loss": 0.2634, "step": 156 }, { "epoch": 0.5891181988742964, "grad_norm": 0.12244570380339517, "learning_rate": 0.00019437800607464932, "loss": 0.2701, "step": 157 }, { "epoch": 0.5928705440900562, "grad_norm": 0.12004670825182381, "learning_rate": 0.00019423246119427043, "loss": 0.2781, "step": 158 }, { "epoch": 0.5966228893058161, "grad_norm": 0.13091796767694497, "learning_rate": 0.00019408511225505056, "loss": 0.2646, "step": 159 }, { "epoch": 0.600375234521576, "grad_norm": 0.11771920694416416, "learning_rate": 0.00019393596207795136, "loss": 0.2795, "step": 160 }, { "epoch": 0.6041275797373359, "grad_norm": 0.12447218651645564, "learning_rate": 0.00019378501351841865, "loss": 0.2767, "step": 161 }, { "epoch": 0.6078799249530957, "grad_norm": 0.11854916742534294, "learning_rate": 0.000193632269466328, "loss": 0.2595, "step": 162 }, { "epoch": 0.6116322701688556, "grad_norm": 0.11517649062994549, "learning_rate": 0.0001934777328459292, "loss": 0.2611, "step": 163 }, { "epoch": 0.6153846153846154, "grad_norm": 0.12291906434338017, "learning_rate": 0.00019332140661579042, "loss": 0.2569, "step": 164 }, { "epoch": 0.6191369606003753, "grad_norm": 0.12768661337225065, "learning_rate": 0.00019316329376874145, "loss": 0.2802, "step": 165 }, { "epoch": 0.6228893058161351, "grad_norm": 0.12224468589372722, "learning_rate": 0.00019300339733181642, "loss": 0.2742, "step": 166 }, { "epoch": 0.626641651031895, "grad_norm": 0.11873375913983374, "learning_rate": 0.00019284172036619594, "loss": 0.2496, "step": 167 }, { "epoch": 0.6303939962476548, "grad_norm": 0.1094029489278503, "learning_rate": 0.0001926782659671484, "loss": 0.2834, "step": 168 }, { "epoch": 0.6341463414634146, "grad_norm": 0.11667364916992014, "learning_rate": 0.00019251303726397078, "loss": 0.2749, "step": 169 }, { "epoch": 0.6378986866791745, "grad_norm": 0.10721206701910313, "learning_rate": 0.00019234603741992862, "loss": 0.2833, "step": 170 }, { "epoch": 0.6416510318949343, "grad_norm": 0.11114975628124507, "learning_rate": 0.00019217726963219567, "loss": 0.2412, "step": 171 }, { "epoch": 0.6454033771106942, "grad_norm": 0.11052789377191914, "learning_rate": 0.00019200673713179245, "loss": 0.2629, "step": 172 }, { "epoch": 0.649155722326454, "grad_norm": 0.1254877320751365, "learning_rate": 0.00019183444318352457, "loss": 0.2676, "step": 173 }, { "epoch": 0.6529080675422139, "grad_norm": 0.11436464042758997, "learning_rate": 0.0001916603910859201, "loss": 0.2786, "step": 174 }, { "epoch": 0.6566604127579737, "grad_norm": 0.12040982753537727, "learning_rate": 0.00019148458417116645, "loss": 0.255, "step": 175 }, { "epoch": 0.6604127579737336, "grad_norm": 0.1215472428194096, "learning_rate": 0.00019130702580504676, "loss": 0.2933, "step": 176 }, { "epoch": 0.6641651031894934, "grad_norm": 0.11127574852727158, "learning_rate": 0.0001911277193868751, "loss": 0.2638, "step": 177 }, { "epoch": 0.6679174484052532, "grad_norm": 0.11297276732299613, "learning_rate": 0.00019094666834943179, "loss": 0.2553, "step": 178 }, { "epoch": 0.6716697936210131, "grad_norm": 0.11230362581933455, "learning_rate": 0.00019076387615889727, "loss": 0.2656, "step": 179 }, { "epoch": 0.6754221388367729, "grad_norm": 0.11339982024848368, "learning_rate": 0.00019057934631478617, "loss": 0.2608, "step": 180 }, { "epoch": 0.6791744840525328, "grad_norm": 0.1157018708653507, "learning_rate": 0.00019039308234987992, "loss": 0.2661, "step": 181 }, { "epoch": 0.6829268292682927, "grad_norm": 0.12120354653706046, "learning_rate": 0.00019020508783015942, "loss": 0.2655, "step": 182 }, { "epoch": 0.6866791744840526, "grad_norm": 0.11650498536100079, "learning_rate": 0.00019001536635473664, "loss": 0.2617, "step": 183 }, { "epoch": 0.6904315196998124, "grad_norm": 0.11284326019455035, "learning_rate": 0.0001898239215557856, "loss": 0.2604, "step": 184 }, { "epoch": 0.6941838649155723, "grad_norm": 0.11137366023131207, "learning_rate": 0.0001896307570984731, "loss": 0.2695, "step": 185 }, { "epoch": 0.6979362101313321, "grad_norm": 0.10909150712308537, "learning_rate": 0.00018943587668088832, "loss": 0.261, "step": 186 }, { "epoch": 0.701688555347092, "grad_norm": 0.11533104627662898, "learning_rate": 0.00018923928403397208, "loss": 0.2662, "step": 187 }, { "epoch": 0.7054409005628518, "grad_norm": 0.11085301527387796, "learning_rate": 0.00018904098292144554, "loss": 0.26, "step": 188 }, { "epoch": 0.7091932457786116, "grad_norm": 0.1040125545017247, "learning_rate": 0.00018884097713973798, "loss": 0.2641, "step": 189 }, { "epoch": 0.7129455909943715, "grad_norm": 0.10775777270108124, "learning_rate": 0.00018863927051791416, "loss": 0.2553, "step": 190 }, { "epoch": 0.7166979362101313, "grad_norm": 0.11556746781951048, "learning_rate": 0.00018843586691760108, "loss": 0.2817, "step": 191 }, { "epoch": 0.7204502814258912, "grad_norm": 0.11370972134361729, "learning_rate": 0.00018823077023291397, "loss": 0.2715, "step": 192 }, { "epoch": 0.724202626641651, "grad_norm": 0.10785721109445355, "learning_rate": 0.00018802398439038176, "loss": 0.2604, "step": 193 }, { "epoch": 0.7279549718574109, "grad_norm": 0.10825278350141479, "learning_rate": 0.00018781551334887201, "loss": 0.2498, "step": 194 }, { "epoch": 0.7317073170731707, "grad_norm": 0.09965163182891702, "learning_rate": 0.0001876053610995149, "loss": 0.2504, "step": 195 }, { "epoch": 0.7354596622889306, "grad_norm": 0.1026489808604617, "learning_rate": 0.000187393531665627, "loss": 0.2587, "step": 196 }, { "epoch": 0.7392120075046904, "grad_norm": 0.10399821510438714, "learning_rate": 0.00018718002910263426, "loss": 0.273, "step": 197 }, { "epoch": 0.7429643527204502, "grad_norm": 0.10994775687961979, "learning_rate": 0.0001869648574979942, "loss": 0.2659, "step": 198 }, { "epoch": 0.7467166979362101, "grad_norm": 0.10593465784705908, "learning_rate": 0.00018674802097111784, "loss": 0.26, "step": 199 }, { "epoch": 0.7504690431519699, "grad_norm": 0.11280493763136354, "learning_rate": 0.0001865295236732907, "loss": 0.2677, "step": 200 }, { "epoch": 0.7542213883677298, "grad_norm": 0.10536591132251391, "learning_rate": 0.00018630936978759338, "loss": 0.2513, "step": 201 }, { "epoch": 0.7579737335834896, "grad_norm": 0.10796354732338231, "learning_rate": 0.00018608756352882152, "loss": 0.2757, "step": 202 }, { "epoch": 0.7617260787992496, "grad_norm": 0.10552783825603758, "learning_rate": 0.00018586410914340497, "loss": 0.2552, "step": 203 }, { "epoch": 0.7654784240150094, "grad_norm": 0.10937928150050989, "learning_rate": 0.00018563901090932672, "loss": 0.2675, "step": 204 }, { "epoch": 0.7692307692307693, "grad_norm": 0.11537632950908651, "learning_rate": 0.00018541227313604078, "loss": 0.2402, "step": 205 }, { "epoch": 0.7729831144465291, "grad_norm": 0.11524821367403956, "learning_rate": 0.0001851839001643898, "loss": 0.2628, "step": 206 }, { "epoch": 0.776735459662289, "grad_norm": 0.10266098148088061, "learning_rate": 0.00018495389636652185, "loss": 0.2484, "step": 207 }, { "epoch": 0.7804878048780488, "grad_norm": 0.10807777719284456, "learning_rate": 0.0001847222661458069, "loss": 0.2648, "step": 208 }, { "epoch": 0.7842401500938087, "grad_norm": 0.10744597380010515, "learning_rate": 0.00018448901393675233, "loss": 0.2575, "step": 209 }, { "epoch": 0.7879924953095685, "grad_norm": 0.10942201726245399, "learning_rate": 0.00018425414420491815, "loss": 0.266, "step": 210 }, { "epoch": 0.7917448405253283, "grad_norm": 0.10660876081865972, "learning_rate": 0.00018401766144683147, "loss": 0.2438, "step": 211 }, { "epoch": 0.7954971857410882, "grad_norm": 0.11694393967537217, "learning_rate": 0.0001837795701899004, "loss": 0.2787, "step": 212 }, { "epoch": 0.799249530956848, "grad_norm": 0.11981272200535166, "learning_rate": 0.00018353987499232746, "loss": 0.264, "step": 213 }, { "epoch": 0.8030018761726079, "grad_norm": 0.10661350248202765, "learning_rate": 0.00018329858044302213, "loss": 0.2467, "step": 214 }, { "epoch": 0.8067542213883677, "grad_norm": 0.10372037225439175, "learning_rate": 0.0001830556911615132, "loss": 0.2718, "step": 215 }, { "epoch": 0.8105065666041276, "grad_norm": 0.10573394846211595, "learning_rate": 0.00018281121179786024, "loss": 0.2414, "step": 216 }, { "epoch": 0.8142589118198874, "grad_norm": 0.10765219346551154, "learning_rate": 0.0001825651470325645, "loss": 0.2516, "step": 217 }, { "epoch": 0.8180112570356473, "grad_norm": 0.09961054466797757, "learning_rate": 0.0001823175015764795, "loss": 0.2337, "step": 218 }, { "epoch": 0.8217636022514071, "grad_norm": 0.10573680507484315, "learning_rate": 0.00018206828017072057, "loss": 0.2443, "step": 219 }, { "epoch": 0.8255159474671669, "grad_norm": 0.10617911818381037, "learning_rate": 0.00018181748758657438, "loss": 0.2409, "step": 220 }, { "epoch": 0.8292682926829268, "grad_norm": 0.10190011860666479, "learning_rate": 0.0001815651286254074, "loss": 0.2699, "step": 221 }, { "epoch": 0.8330206378986866, "grad_norm": 0.10217498312134918, "learning_rate": 0.000181311208118574, "loss": 0.261, "step": 222 }, { "epoch": 0.8367729831144465, "grad_norm": 0.10290805625127751, "learning_rate": 0.000181055730927324, "loss": 0.2544, "step": 223 }, { "epoch": 0.8405253283302064, "grad_norm": 0.10273441373621256, "learning_rate": 0.00018079870194270958, "loss": 0.2394, "step": 224 }, { "epoch": 0.8442776735459663, "grad_norm": 0.09880435844395785, "learning_rate": 0.00018054012608549166, "loss": 0.263, "step": 225 }, { "epoch": 0.8480300187617261, "grad_norm": 0.10357276059735837, "learning_rate": 0.0001802800083060457, "loss": 0.2853, "step": 226 }, { "epoch": 0.851782363977486, "grad_norm": 0.10804308023574893, "learning_rate": 0.00018001835358426687, "loss": 0.2595, "step": 227 }, { "epoch": 0.8555347091932458, "grad_norm": 0.09776326620940605, "learning_rate": 0.00017975516692947475, "loss": 0.253, "step": 228 }, { "epoch": 0.8592870544090057, "grad_norm": 0.0995125991589646, "learning_rate": 0.00017949045338031745, "loss": 0.2536, "step": 229 }, { "epoch": 0.8630393996247655, "grad_norm": 0.10281461790899643, "learning_rate": 0.00017922421800467512, "loss": 0.2592, "step": 230 }, { "epoch": 0.8667917448405253, "grad_norm": 0.11374858278223317, "learning_rate": 0.0001789564658995629, "loss": 0.2694, "step": 231 }, { "epoch": 0.8705440900562852, "grad_norm": 0.10048956101218906, "learning_rate": 0.00017868720219103344, "loss": 0.2563, "step": 232 }, { "epoch": 0.874296435272045, "grad_norm": 0.11978050473597157, "learning_rate": 0.00017841643203407852, "loss": 0.2671, "step": 233 }, { "epoch": 0.8780487804878049, "grad_norm": 0.1022948197426214, "learning_rate": 0.00017814416061253077, "loss": 0.2442, "step": 234 }, { "epoch": 0.8818011257035647, "grad_norm": 0.10648409702487768, "learning_rate": 0.000177870393138964, "loss": 0.2172, "step": 235 }, { "epoch": 0.8855534709193246, "grad_norm": 0.09682467776295996, "learning_rate": 0.00017759513485459367, "loss": 0.2503, "step": 236 }, { "epoch": 0.8893058161350844, "grad_norm": 0.10093582432576866, "learning_rate": 0.00017731839102917644, "loss": 0.2526, "step": 237 }, { "epoch": 0.8930581613508443, "grad_norm": 0.10283968277186326, "learning_rate": 0.00017704016696090937, "loss": 0.2467, "step": 238 }, { "epoch": 0.8968105065666041, "grad_norm": 0.1016691703162235, "learning_rate": 0.00017676046797632835, "loss": 0.2458, "step": 239 }, { "epoch": 0.900562851782364, "grad_norm": 0.09871178549145665, "learning_rate": 0.00017647929943020625, "loss": 0.2387, "step": 240 }, { "epoch": 0.9043151969981238, "grad_norm": 0.11005062968397657, "learning_rate": 0.00017619666670545033, "loss": 0.2485, "step": 241 }, { "epoch": 0.9080675422138836, "grad_norm": 0.10636010374538316, "learning_rate": 0.00017591257521299932, "loss": 0.2344, "step": 242 }, { "epoch": 0.9118198874296435, "grad_norm": 0.10269265934208162, "learning_rate": 0.00017562703039171955, "loss": 0.2449, "step": 243 }, { "epoch": 0.9155722326454033, "grad_norm": 0.1123496871025115, "learning_rate": 0.0001753400377083011, "loss": 0.2472, "step": 244 }, { "epoch": 0.9193245778611632, "grad_norm": 0.10731321325088286, "learning_rate": 0.00017505160265715304, "loss": 0.2257, "step": 245 }, { "epoch": 0.9230769230769231, "grad_norm": 0.10122280465712044, "learning_rate": 0.0001747617307602982, "loss": 0.2673, "step": 246 }, { "epoch": 0.926829268292683, "grad_norm": 0.10287633377626088, "learning_rate": 0.00017447042756726754, "loss": 0.2623, "step": 247 }, { "epoch": 0.9305816135084428, "grad_norm": 0.11180813962431274, "learning_rate": 0.0001741776986549938, "loss": 0.2588, "step": 248 }, { "epoch": 0.9343339587242027, "grad_norm": 0.10342918680770019, "learning_rate": 0.00017388354962770487, "loss": 0.2365, "step": 249 }, { "epoch": 0.9380863039399625, "grad_norm": 0.10248241650027715, "learning_rate": 0.0001735879861168163, "loss": 0.2453, "step": 250 }, { "epoch": 0.9418386491557224, "grad_norm": 0.11730400265701718, "learning_rate": 0.00017329101378082374, "loss": 0.2486, "step": 251 }, { "epoch": 0.9455909943714822, "grad_norm": 0.09685186553299667, "learning_rate": 0.0001729926383051943, "loss": 0.2572, "step": 252 }, { "epoch": 0.949343339587242, "grad_norm": 0.12090818479499119, "learning_rate": 0.00017269286540225805, "loss": 0.2248, "step": 253 }, { "epoch": 0.9530956848030019, "grad_norm": 0.10260399450357141, "learning_rate": 0.0001723917008110984, "loss": 0.2527, "step": 254 }, { "epoch": 0.9568480300187617, "grad_norm": 0.10114612523395812, "learning_rate": 0.0001720891502974423, "loss": 0.2602, "step": 255 }, { "epoch": 0.9606003752345216, "grad_norm": 0.11613810011247953, "learning_rate": 0.00017178521965354992, "loss": 0.2535, "step": 256 }, { "epoch": 0.9643527204502814, "grad_norm": 0.10548781228478918, "learning_rate": 0.00017147991469810368, "loss": 0.2616, "step": 257 }, { "epoch": 0.9681050656660413, "grad_norm": 0.10337010169414873, "learning_rate": 0.00017117324127609686, "loss": 0.2506, "step": 258 }, { "epoch": 0.9718574108818011, "grad_norm": 0.1022753450493229, "learning_rate": 0.00017086520525872172, "loss": 0.2536, "step": 259 }, { "epoch": 0.975609756097561, "grad_norm": 0.10274802198295474, "learning_rate": 0.00017055581254325715, "loss": 0.2444, "step": 260 }, { "epoch": 0.9793621013133208, "grad_norm": 0.10073944882387982, "learning_rate": 0.00017024506905295565, "loss": 0.2583, "step": 261 }, { "epoch": 0.9831144465290806, "grad_norm": 0.10220040335882648, "learning_rate": 0.00016993298073693003, "loss": 0.2431, "step": 262 }, { "epoch": 0.9868667917448405, "grad_norm": 0.1060948209024435, "learning_rate": 0.00016961955357003947, "loss": 0.262, "step": 263 }, { "epoch": 0.9906191369606003, "grad_norm": 0.10004277645336798, "learning_rate": 0.0001693047935527751, "loss": 0.234, "step": 264 }, { "epoch": 0.9943714821763602, "grad_norm": 0.1000376814502259, "learning_rate": 0.00016898870671114527, "loss": 0.2566, "step": 265 }, { "epoch": 0.99812382739212, "grad_norm": 0.09911659249018077, "learning_rate": 0.00016867129909655998, "loss": 0.2657, "step": 266 }, { "epoch": 0.99812382739212, "eval_loss": 0.25076788663864136, "eval_runtime": 54.8199, "eval_samples_per_second": 32.725, "eval_steps_per_second": 1.04, "step": 266 } ], "logging_steps": 1, "max_steps": 798, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.692263947344282e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }