{ "best_metric": 1.1278622150421143, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.39611313981555984, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019805656990777992, "grad_norm": 0.8444206714630127, "learning_rate": 2.9999999999999997e-05, "loss": 1.1022, "step": 1 }, { "epoch": 0.0019805656990777992, "eval_loss": 1.6692832708358765, "eval_runtime": 7.7828, "eval_samples_per_second": 6.424, "eval_steps_per_second": 6.424, "step": 1 }, { "epoch": 0.0039611313981555984, "grad_norm": 0.6467396020889282, "learning_rate": 5.9999999999999995e-05, "loss": 1.4138, "step": 2 }, { "epoch": 0.005941697097233398, "grad_norm": 0.8423255681991577, "learning_rate": 8.999999999999999e-05, "loss": 1.4376, "step": 3 }, { "epoch": 0.007922262796311197, "grad_norm": 0.8432876467704773, "learning_rate": 0.00011999999999999999, "loss": 1.3625, "step": 4 }, { "epoch": 0.009902828495388996, "grad_norm": 0.9727899432182312, "learning_rate": 0.00015, "loss": 1.3676, "step": 5 }, { "epoch": 0.011883394194466795, "grad_norm": 0.7333775758743286, "learning_rate": 0.00017999999999999998, "loss": 1.2198, "step": 6 }, { "epoch": 0.013863959893544593, "grad_norm": 0.8031468987464905, "learning_rate": 0.00020999999999999998, "loss": 1.1534, "step": 7 }, { "epoch": 0.015844525592622394, "grad_norm": 0.9863713979721069, "learning_rate": 0.00023999999999999998, "loss": 1.2578, "step": 8 }, { "epoch": 0.017825091291700193, "grad_norm": 1.7402286529541016, "learning_rate": 0.00027, "loss": 0.9468, "step": 9 }, { "epoch": 0.019805656990777992, "grad_norm": 0.9096835851669312, "learning_rate": 0.0003, "loss": 1.0291, "step": 10 }, { "epoch": 0.02178622268985579, "grad_norm": 0.42851653695106506, "learning_rate": 0.0002999794957488703, "loss": 0.9768, "step": 11 }, { "epoch": 0.02376678838893359, "grad_norm": 0.7323011159896851, "learning_rate": 0.0002999179886011389, "loss": 0.58, "step": 12 }, { "epoch": 0.02574735408801139, "grad_norm": 0.5732091665267944, "learning_rate": 0.0002998154953722457, "loss": 0.7324, "step": 13 }, { "epoch": 0.027727919787089186, "grad_norm": 0.7931118011474609, "learning_rate": 0.00029967204408281613, "loss": 0.803, "step": 14 }, { "epoch": 0.029708485486166985, "grad_norm": 0.4358796775341034, "learning_rate": 0.00029948767395100045, "loss": 0.88, "step": 15 }, { "epoch": 0.03168905118524479, "grad_norm": 0.4290747046470642, "learning_rate": 0.0002992624353817517, "loss": 1.0615, "step": 16 }, { "epoch": 0.03366961688432259, "grad_norm": 0.48837924003601074, "learning_rate": 0.0002989963899530457, "loss": 1.2557, "step": 17 }, { "epoch": 0.035650182583400386, "grad_norm": 0.675270676612854, "learning_rate": 0.00029868961039904624, "loss": 1.3184, "step": 18 }, { "epoch": 0.037630748282478185, "grad_norm": 0.6336246728897095, "learning_rate": 0.00029834218059022024, "loss": 1.2438, "step": 19 }, { "epoch": 0.039611313981555984, "grad_norm": 0.585077166557312, "learning_rate": 0.00029795419551040833, "loss": 1.3877, "step": 20 }, { "epoch": 0.041591879680633784, "grad_norm": 0.5378795862197876, "learning_rate": 0.00029752576123085736, "loss": 1.3058, "step": 21 }, { "epoch": 0.04357244537971158, "grad_norm": 0.5802419185638428, "learning_rate": 0.0002970569948812214, "loss": 1.3264, "step": 22 }, { "epoch": 0.04555301107878938, "grad_norm": 0.6426310539245605, "learning_rate": 0.0002965480246175399, "loss": 1.2176, "step": 23 }, { "epoch": 0.04753357677786718, "grad_norm": 0.49440938234329224, "learning_rate": 0.0002959989895872009, "loss": 1.3859, "step": 24 }, { "epoch": 0.04951414247694498, "grad_norm": 0.6634644865989685, "learning_rate": 0.0002954100398908995, "loss": 1.2996, "step": 25 }, { "epoch": 0.04951414247694498, "eval_loss": 1.2120962142944336, "eval_runtime": 7.9569, "eval_samples_per_second": 6.284, "eval_steps_per_second": 6.284, "step": 25 }, { "epoch": 0.05149470817602278, "grad_norm": 0.5885547995567322, "learning_rate": 0.0002947813365416023, "loss": 1.3487, "step": 26 }, { "epoch": 0.05347527387510058, "grad_norm": 0.6097450256347656, "learning_rate": 0.0002941130514205272, "loss": 1.192, "step": 27 }, { "epoch": 0.05545583957417837, "grad_norm": 0.5981147885322571, "learning_rate": 0.0002934053672301536, "loss": 1.2905, "step": 28 }, { "epoch": 0.05743640527325617, "grad_norm": 0.5432254672050476, "learning_rate": 0.00029265847744427303, "loss": 1.3429, "step": 29 }, { "epoch": 0.05941697097233397, "grad_norm": 0.6555147767066956, "learning_rate": 0.00029187258625509513, "loss": 1.251, "step": 30 }, { "epoch": 0.06139753667141177, "grad_norm": 0.5807340145111084, "learning_rate": 0.00029104790851742417, "loss": 1.2457, "step": 31 }, { "epoch": 0.06337810237048958, "grad_norm": 0.6112722754478455, "learning_rate": 0.0002901846696899191, "loss": 1.0677, "step": 32 }, { "epoch": 0.06535866806956737, "grad_norm": 0.5526084899902344, "learning_rate": 0.00028928310577345606, "loss": 1.2227, "step": 33 }, { "epoch": 0.06733923376864517, "grad_norm": 0.5868051052093506, "learning_rate": 0.0002883434632466077, "loss": 1.1071, "step": 34 }, { "epoch": 0.06931979946772297, "grad_norm": 0.6242677569389343, "learning_rate": 0.00028736599899825856, "loss": 1.2399, "step": 35 }, { "epoch": 0.07130036516680077, "grad_norm": 0.6842761039733887, "learning_rate": 0.00028635098025737434, "loss": 0.9351, "step": 36 }, { "epoch": 0.07328093086587857, "grad_norm": 0.7058606743812561, "learning_rate": 0.00028529868451994384, "loss": 0.9032, "step": 37 }, { "epoch": 0.07526149656495637, "grad_norm": 0.6029658913612366, "learning_rate": 0.0002842093994731145, "loss": 0.9962, "step": 38 }, { "epoch": 0.07724206226403417, "grad_norm": 0.5888481736183167, "learning_rate": 0.00028308342291654174, "loss": 0.7233, "step": 39 }, { "epoch": 0.07922262796311197, "grad_norm": 0.6307267546653748, "learning_rate": 0.00028192106268097334, "loss": 0.8629, "step": 40 }, { "epoch": 0.08120319366218977, "grad_norm": 0.6958957314491272, "learning_rate": 0.00028072263654409154, "loss": 1.1296, "step": 41 }, { "epoch": 0.08318375936126757, "grad_norm": 0.6250704526901245, "learning_rate": 0.0002794884721436361, "loss": 0.9775, "step": 42 }, { "epoch": 0.08516432506034537, "grad_norm": 0.7835971713066101, "learning_rate": 0.00027821890688783083, "loss": 1.0837, "step": 43 }, { "epoch": 0.08714489075942317, "grad_norm": 0.6644023656845093, "learning_rate": 0.0002769142878631403, "loss": 0.8859, "step": 44 }, { "epoch": 0.08912545645850097, "grad_norm": 0.6114141941070557, "learning_rate": 0.00027557497173937923, "loss": 0.8415, "step": 45 }, { "epoch": 0.09110602215757876, "grad_norm": 0.8422795534133911, "learning_rate": 0.000274201324672203, "loss": 1.0906, "step": 46 }, { "epoch": 0.09308658785665656, "grad_norm": 1.0369035005569458, "learning_rate": 0.00027279372220300385, "loss": 1.2738, "step": 47 }, { "epoch": 0.09506715355573436, "grad_norm": 1.2230334281921387, "learning_rate": 0.0002713525491562421, "loss": 1.0714, "step": 48 }, { "epoch": 0.09704771925481216, "grad_norm": 1.0913413763046265, "learning_rate": 0.00026987819953423867, "loss": 0.8399, "step": 49 }, { "epoch": 0.09902828495388996, "grad_norm": 1.3864681720733643, "learning_rate": 0.00026837107640945905, "loss": 0.7678, "step": 50 }, { "epoch": 0.09902828495388996, "eval_loss": 1.5767979621887207, "eval_runtime": 7.9554, "eval_samples_per_second": 6.285, "eval_steps_per_second": 6.285, "step": 50 }, { "epoch": 0.10100885065296776, "grad_norm": 5.140173435211182, "learning_rate": 0.0002668315918143169, "loss": 1.4746, "step": 51 }, { "epoch": 0.10298941635204556, "grad_norm": 2.367671012878418, "learning_rate": 0.00026526016662852886, "loss": 1.3469, "step": 52 }, { "epoch": 0.10496998205112336, "grad_norm": 0.7265090942382812, "learning_rate": 0.00026365723046405023, "loss": 1.1216, "step": 53 }, { "epoch": 0.10695054775020116, "grad_norm": 0.40162143111228943, "learning_rate": 0.0002620232215476231, "loss": 0.9282, "step": 54 }, { "epoch": 0.10893111344927894, "grad_norm": 0.3782496750354767, "learning_rate": 0.0002603585866009697, "loss": 0.9196, "step": 55 }, { "epoch": 0.11091167914835674, "grad_norm": 0.39901453256607056, "learning_rate": 0.00025866378071866334, "loss": 0.9934, "step": 56 }, { "epoch": 0.11289224484743454, "grad_norm": 0.34213370084762573, "learning_rate": 0.00025693926724370956, "loss": 0.7669, "step": 57 }, { "epoch": 0.11487281054651234, "grad_norm": 0.3721526563167572, "learning_rate": 0.00025518551764087326, "loss": 1.0746, "step": 58 }, { "epoch": 0.11685337624559014, "grad_norm": 0.32081490755081177, "learning_rate": 0.00025340301136778483, "loss": 0.8125, "step": 59 }, { "epoch": 0.11883394194466794, "grad_norm": 0.30376437306404114, "learning_rate": 0.00025159223574386114, "loss": 0.7478, "step": 60 }, { "epoch": 0.12081450764374574, "grad_norm": 0.3731345534324646, "learning_rate": 0.0002497536858170772, "loss": 0.8446, "step": 61 }, { "epoch": 0.12279507334282354, "grad_norm": 0.3900434970855713, "learning_rate": 0.00024788786422862526, "loss": 0.7374, "step": 62 }, { "epoch": 0.12477563904190134, "grad_norm": 0.33390289545059204, "learning_rate": 0.00024599528107549745, "loss": 0.8323, "step": 63 }, { "epoch": 0.12675620474097915, "grad_norm": 0.39714372158050537, "learning_rate": 0.00024407645377103054, "loss": 0.7524, "step": 64 }, { "epoch": 0.12873677044005694, "grad_norm": 0.39826449751853943, "learning_rate": 0.00024213190690345018, "loss": 0.8488, "step": 65 }, { "epoch": 0.13071733613913475, "grad_norm": 0.34255996346473694, "learning_rate": 0.00024016217209245374, "loss": 0.7016, "step": 66 }, { "epoch": 0.13269790183821253, "grad_norm": 0.38450971245765686, "learning_rate": 0.00023816778784387094, "loss": 1.0756, "step": 67 }, { "epoch": 0.13467846753729035, "grad_norm": 0.4849868416786194, "learning_rate": 0.0002361492994024415, "loss": 1.2146, "step": 68 }, { "epoch": 0.13665903323636813, "grad_norm": 0.5141264200210571, "learning_rate": 0.0002341072586027509, "loss": 1.2667, "step": 69 }, { "epoch": 0.13863959893544595, "grad_norm": 0.45579981803894043, "learning_rate": 0.00023204222371836405, "loss": 1.3471, "step": 70 }, { "epoch": 0.14062016463452373, "grad_norm": 0.48155584931373596, "learning_rate": 0.00022995475930919905, "loss": 1.2678, "step": 71 }, { "epoch": 0.14260073033360154, "grad_norm": 0.4835640490055084, "learning_rate": 0.00022784543606718227, "loss": 1.305, "step": 72 }, { "epoch": 0.14458129603267933, "grad_norm": 0.4561633765697479, "learning_rate": 0.00022571483066022657, "loss": 1.1703, "step": 73 }, { "epoch": 0.14656186173175714, "grad_norm": 0.4840521216392517, "learning_rate": 0.0002235635255745762, "loss": 1.2423, "step": 74 }, { "epoch": 0.14854242743083493, "grad_norm": 0.4835491478443146, "learning_rate": 0.00022139210895556104, "loss": 1.2603, "step": 75 }, { "epoch": 0.14854242743083493, "eval_loss": 1.1694457530975342, "eval_runtime": 7.9547, "eval_samples_per_second": 6.286, "eval_steps_per_second": 6.286, "step": 75 }, { "epoch": 0.15052299312991274, "grad_norm": 0.5516873598098755, "learning_rate": 0.00021920117444680317, "loss": 1.2946, "step": 76 }, { "epoch": 0.15250355882899053, "grad_norm": 0.5743866562843323, "learning_rate": 0.00021699132102792097, "loss": 1.3838, "step": 77 }, { "epoch": 0.15448412452806834, "grad_norm": 0.52964848279953, "learning_rate": 0.0002147631528507739, "loss": 1.1524, "step": 78 }, { "epoch": 0.15646469022714612, "grad_norm": 0.6194289326667786, "learning_rate": 0.00021251727907429355, "loss": 1.2024, "step": 79 }, { "epoch": 0.15844525592622394, "grad_norm": 0.5605446696281433, "learning_rate": 0.0002102543136979454, "loss": 1.3307, "step": 80 }, { "epoch": 0.16042582162530172, "grad_norm": 0.5482114553451538, "learning_rate": 0.0002079748753938678, "loss": 1.2013, "step": 81 }, { "epoch": 0.16240638732437954, "grad_norm": 0.570341169834137, "learning_rate": 0.0002056795873377331, "loss": 1.1523, "step": 82 }, { "epoch": 0.16438695302345732, "grad_norm": 0.49789685010910034, "learning_rate": 0.00020336907703837748, "loss": 1.0551, "step": 83 }, { "epoch": 0.16636751872253513, "grad_norm": 0.5111387968063354, "learning_rate": 0.00020104397616624645, "loss": 1.1719, "step": 84 }, { "epoch": 0.16834808442161292, "grad_norm": 0.5921433568000793, "learning_rate": 0.00019870492038070252, "loss": 1.1666, "step": 85 }, { "epoch": 0.17032865012069073, "grad_norm": 0.5336898565292358, "learning_rate": 0.0001963525491562421, "loss": 1.1609, "step": 86 }, { "epoch": 0.17230921581976852, "grad_norm": 0.5636164546012878, "learning_rate": 0.0001939875056076697, "loss": 0.9504, "step": 87 }, { "epoch": 0.17428978151884633, "grad_norm": 0.577713668346405, "learning_rate": 0.00019161043631427666, "loss": 1.0856, "step": 88 }, { "epoch": 0.17627034721792412, "grad_norm": 0.6310526728630066, "learning_rate": 0.00018922199114307294, "loss": 1.261, "step": 89 }, { "epoch": 0.17825091291700193, "grad_norm": 0.6531705856323242, "learning_rate": 0.00018682282307111987, "loss": 1.0971, "step": 90 }, { "epoch": 0.18023147861607972, "grad_norm": 0.6384961605072021, "learning_rate": 0.00018441358800701273, "loss": 0.9694, "step": 91 }, { "epoch": 0.18221204431515753, "grad_norm": 0.699545681476593, "learning_rate": 0.00018199494461156203, "loss": 0.9644, "step": 92 }, { "epoch": 0.18419261001423531, "grad_norm": 0.6665005087852478, "learning_rate": 0.000179567554117722, "loss": 0.8765, "step": 93 }, { "epoch": 0.18617317571331313, "grad_norm": 0.5795398950576782, "learning_rate": 0.00017713208014981648, "loss": 0.719, "step": 94 }, { "epoch": 0.1881537414123909, "grad_norm": 0.5112330913543701, "learning_rate": 0.00017468918854211007, "loss": 0.5695, "step": 95 }, { "epoch": 0.19013430711146873, "grad_norm": 1.0074706077575684, "learning_rate": 0.00017223954715677627, "loss": 1.2813, "step": 96 }, { "epoch": 0.1921148728105465, "grad_norm": 0.9614951014518738, "learning_rate": 0.00016978382570131034, "loss": 1.1026, "step": 97 }, { "epoch": 0.19409543850962432, "grad_norm": 1.0307389497756958, "learning_rate": 0.00016732269554543794, "loss": 1.0465, "step": 98 }, { "epoch": 0.1960760042087021, "grad_norm": 1.1638429164886475, "learning_rate": 0.00016485682953756942, "loss": 0.8255, "step": 99 }, { "epoch": 0.19805656990777992, "grad_norm": 1.3206615447998047, "learning_rate": 0.00016238690182084986, "loss": 0.8929, "step": 100 }, { "epoch": 0.19805656990777992, "eval_loss": 1.363052248954773, "eval_runtime": 7.9498, "eval_samples_per_second": 6.29, "eval_steps_per_second": 6.29, "step": 100 }, { "epoch": 0.2000371356068577, "grad_norm": 1.788689136505127, "learning_rate": 0.0001599135876488549, "loss": 1.1028, "step": 101 }, { "epoch": 0.20201770130593552, "grad_norm": 1.1090993881225586, "learning_rate": 0.00015743756320098332, "loss": 1.14, "step": 102 }, { "epoch": 0.2039982670050133, "grad_norm": 0.710551917552948, "learning_rate": 0.0001549595053975962, "loss": 1.0624, "step": 103 }, { "epoch": 0.20597883270409112, "grad_norm": 0.42503175139427185, "learning_rate": 0.00015248009171495378, "loss": 0.9895, "step": 104 }, { "epoch": 0.2079593984031689, "grad_norm": 0.30752989649772644, "learning_rate": 0.00015, "loss": 0.9048, "step": 105 }, { "epoch": 0.20993996410224672, "grad_norm": 0.28837981820106506, "learning_rate": 0.00014751990828504622, "loss": 0.9729, "step": 106 }, { "epoch": 0.2119205298013245, "grad_norm": 0.28646722435951233, "learning_rate": 0.00014504049460240375, "loss": 0.8549, "step": 107 }, { "epoch": 0.21390109550040232, "grad_norm": 0.2909419536590576, "learning_rate": 0.00014256243679901663, "loss": 0.8896, "step": 108 }, { "epoch": 0.2158816611994801, "grad_norm": 0.3322446346282959, "learning_rate": 0.00014008641235114508, "loss": 0.7194, "step": 109 }, { "epoch": 0.2178622268985579, "grad_norm": 0.2969757318496704, "learning_rate": 0.00013761309817915014, "loss": 0.79, "step": 110 }, { "epoch": 0.2198427925976357, "grad_norm": 0.3453933596611023, "learning_rate": 0.00013514317046243058, "loss": 0.8365, "step": 111 }, { "epoch": 0.22182335829671349, "grad_norm": 0.26184096932411194, "learning_rate": 0.00013267730445456208, "loss": 0.581, "step": 112 }, { "epoch": 0.2238039239957913, "grad_norm": 0.33915719389915466, "learning_rate": 0.00013021617429868963, "loss": 0.7551, "step": 113 }, { "epoch": 0.22578448969486908, "grad_norm": 0.3160857856273651, "learning_rate": 0.00012776045284322368, "loss": 0.7751, "step": 114 }, { "epoch": 0.2277650553939469, "grad_norm": 0.32658034563064575, "learning_rate": 0.00012531081145788987, "loss": 0.8707, "step": 115 }, { "epoch": 0.22974562109302468, "grad_norm": 0.3431919813156128, "learning_rate": 0.00012286791985018355, "loss": 0.7935, "step": 116 }, { "epoch": 0.2317261867921025, "grad_norm": 0.3515626788139343, "learning_rate": 0.00012043244588227796, "loss": 0.8467, "step": 117 }, { "epoch": 0.23370675249118028, "grad_norm": 0.387687623500824, "learning_rate": 0.00011800505538843798, "loss": 1.0536, "step": 118 }, { "epoch": 0.2356873181902581, "grad_norm": 0.47919800877571106, "learning_rate": 0.00011558641199298727, "loss": 1.253, "step": 119 }, { "epoch": 0.23766788388933588, "grad_norm": 0.460734099149704, "learning_rate": 0.00011317717692888012, "loss": 1.2217, "step": 120 }, { "epoch": 0.2396484495884137, "grad_norm": 0.418082058429718, "learning_rate": 0.00011077800885692702, "loss": 1.2639, "step": 121 }, { "epoch": 0.24162901528749148, "grad_norm": 0.43898528814315796, "learning_rate": 0.00010838956368572334, "loss": 1.2732, "step": 122 }, { "epoch": 0.2436095809865693, "grad_norm": 0.557872474193573, "learning_rate": 0.0001060124943923303, "loss": 1.2141, "step": 123 }, { "epoch": 0.24559014668564708, "grad_norm": 0.5319792628288269, "learning_rate": 0.0001036474508437579, "loss": 1.2522, "step": 124 }, { "epoch": 0.2475707123847249, "grad_norm": 0.5045695304870605, "learning_rate": 0.00010129507961929748, "loss": 1.2974, "step": 125 }, { "epoch": 0.2475707123847249, "eval_loss": 1.1529182195663452, "eval_runtime": 7.944, "eval_samples_per_second": 6.294, "eval_steps_per_second": 6.294, "step": 125 }, { "epoch": 0.24955127808380267, "grad_norm": 0.5417290925979614, "learning_rate": 9.895602383375353e-05, "loss": 1.212, "step": 126 }, { "epoch": 0.25153184378288046, "grad_norm": 0.4937967360019684, "learning_rate": 9.663092296162251e-05, "loss": 1.2267, "step": 127 }, { "epoch": 0.2535124094819583, "grad_norm": 0.49752992391586304, "learning_rate": 9.432041266226686e-05, "loss": 1.0415, "step": 128 }, { "epoch": 0.2554929751810361, "grad_norm": 0.48785874247550964, "learning_rate": 9.202512460613219e-05, "loss": 1.1829, "step": 129 }, { "epoch": 0.25747354088011387, "grad_norm": 0.4991934895515442, "learning_rate": 8.97456863020546e-05, "loss": 1.2282, "step": 130 }, { "epoch": 0.25945410657919166, "grad_norm": 0.5931552648544312, "learning_rate": 8.748272092570646e-05, "loss": 1.2068, "step": 131 }, { "epoch": 0.2614346722782695, "grad_norm": 0.5376507639884949, "learning_rate": 8.523684714922608e-05, "loss": 1.2745, "step": 132 }, { "epoch": 0.2634152379773473, "grad_norm": 0.5740606188774109, "learning_rate": 8.300867897207903e-05, "loss": 0.9724, "step": 133 }, { "epoch": 0.26539580367642507, "grad_norm": 0.552382230758667, "learning_rate": 8.079882555319684e-05, "loss": 1.2513, "step": 134 }, { "epoch": 0.26737636937550285, "grad_norm": 0.4819226861000061, "learning_rate": 7.860789104443896e-05, "loss": 1.029, "step": 135 }, { "epoch": 0.2693569350745807, "grad_norm": 0.5257924199104309, "learning_rate": 7.643647442542382e-05, "loss": 1.2768, "step": 136 }, { "epoch": 0.2713375007736585, "grad_norm": 0.5025389194488525, "learning_rate": 7.428516933977347e-05, "loss": 1.1005, "step": 137 }, { "epoch": 0.27331806647273627, "grad_norm": 0.5294015407562256, "learning_rate": 7.215456393281776e-05, "loss": 0.9751, "step": 138 }, { "epoch": 0.27529863217181405, "grad_norm": 0.5134867429733276, "learning_rate": 7.004524069080096e-05, "loss": 0.8709, "step": 139 }, { "epoch": 0.2772791978708919, "grad_norm": 0.6456707119941711, "learning_rate": 6.795777628163599e-05, "loss": 0.9567, "step": 140 }, { "epoch": 0.2792597635699697, "grad_norm": 0.5765627026557922, "learning_rate": 6.58927413972491e-05, "loss": 0.6746, "step": 141 }, { "epoch": 0.28124032926904746, "grad_norm": 0.6089555025100708, "learning_rate": 6.385070059755846e-05, "loss": 0.8622, "step": 142 }, { "epoch": 0.28322089496812525, "grad_norm": 0.6544927954673767, "learning_rate": 6.183221215612904e-05, "loss": 0.7769, "step": 143 }, { "epoch": 0.2852014606672031, "grad_norm": 0.5500593781471252, "learning_rate": 5.983782790754623e-05, "loss": 0.5781, "step": 144 }, { "epoch": 0.2871820263662809, "grad_norm": 0.5492486357688904, "learning_rate": 5.786809309654982e-05, "loss": 0.4864, "step": 145 }, { "epoch": 0.28916259206535866, "grad_norm": 0.7028548121452332, "learning_rate": 5.592354622896944e-05, "loss": 0.9125, "step": 146 }, { "epoch": 0.29114315776443644, "grad_norm": 0.8846566081047058, "learning_rate": 5.40047189245025e-05, "loss": 1.2438, "step": 147 }, { "epoch": 0.2931237234635143, "grad_norm": 0.9810320734977722, "learning_rate": 5.211213577137469e-05, "loss": 1.1933, "step": 148 }, { "epoch": 0.29510428916259207, "grad_norm": 1.4606691598892212, "learning_rate": 5.024631418292274e-05, "loss": 0.839, "step": 149 }, { "epoch": 0.29708485486166986, "grad_norm": 2.203315258026123, "learning_rate": 4.840776425613886e-05, "loss": 0.7995, "step": 150 }, { "epoch": 0.29708485486166986, "eval_loss": 1.169845700263977, "eval_runtime": 7.946, "eval_samples_per_second": 6.292, "eval_steps_per_second": 6.292, "step": 150 }, { "epoch": 0.29906542056074764, "grad_norm": 0.5212217569351196, "learning_rate": 4.659698863221513e-05, "loss": 0.9336, "step": 151 }, { "epoch": 0.3010459862598255, "grad_norm": 0.5553271770477295, "learning_rate": 4.481448235912671e-05, "loss": 0.9745, "step": 152 }, { "epoch": 0.30302655195890327, "grad_norm": 0.6053627133369446, "learning_rate": 4.306073275629044e-05, "loss": 0.953, "step": 153 }, { "epoch": 0.30500711765798105, "grad_norm": 0.5293852686882019, "learning_rate": 4.133621928133665e-05, "loss": 0.8579, "step": 154 }, { "epoch": 0.30698768335705884, "grad_norm": 0.5316524505615234, "learning_rate": 3.964141339903026e-05, "loss": 0.8284, "step": 155 }, { "epoch": 0.3089682490561367, "grad_norm": 0.5538764595985413, "learning_rate": 3.797677845237696e-05, "loss": 0.8725, "step": 156 }, { "epoch": 0.31094881475521446, "grad_norm": 0.5353713631629944, "learning_rate": 3.634276953594982e-05, "loss": 0.8798, "step": 157 }, { "epoch": 0.31292938045429225, "grad_norm": 0.45507702231407166, "learning_rate": 3.473983337147118e-05, "loss": 0.7345, "step": 158 }, { "epoch": 0.31490994615337004, "grad_norm": 0.5768627524375916, "learning_rate": 3.316840818568315e-05, "loss": 0.9696, "step": 159 }, { "epoch": 0.3168905118524479, "grad_norm": 0.4070928394794464, "learning_rate": 3.162892359054098e-05, "loss": 0.8171, "step": 160 }, { "epoch": 0.31887107755152566, "grad_norm": 0.28896093368530273, "learning_rate": 3.0121800465761293e-05, "loss": 0.6783, "step": 161 }, { "epoch": 0.32085164325060345, "grad_norm": 0.292948842048645, "learning_rate": 2.8647450843757897e-05, "loss": 0.8252, "step": 162 }, { "epoch": 0.32283220894968123, "grad_norm": 0.373655766248703, "learning_rate": 2.7206277796996144e-05, "loss": 0.8913, "step": 163 }, { "epoch": 0.3248127746487591, "grad_norm": 0.2723099887371063, "learning_rate": 2.5798675327796993e-05, "loss": 0.7866, "step": 164 }, { "epoch": 0.32679334034783686, "grad_norm": 0.31054458022117615, "learning_rate": 2.4425028260620715e-05, "loss": 0.7626, "step": 165 }, { "epoch": 0.32877390604691464, "grad_norm": 0.31001657247543335, "learning_rate": 2.3085712136859668e-05, "loss": 0.837, "step": 166 }, { "epoch": 0.33075447174599243, "grad_norm": 0.3545086085796356, "learning_rate": 2.178109311216913e-05, "loss": 1.0232, "step": 167 }, { "epoch": 0.33273503744507027, "grad_norm": 0.3655906915664673, "learning_rate": 2.0511527856363912e-05, "loss": 0.9356, "step": 168 }, { "epoch": 0.33471560314414806, "grad_norm": 0.42094361782073975, "learning_rate": 1.927736345590839e-05, "loss": 1.122, "step": 169 }, { "epoch": 0.33669616884322584, "grad_norm": 0.4121546745300293, "learning_rate": 1.8078937319026654e-05, "loss": 1.1692, "step": 170 }, { "epoch": 0.3386767345423036, "grad_norm": 0.43121325969696045, "learning_rate": 1.6916577083458228e-05, "loss": 1.1433, "step": 171 }, { "epoch": 0.34065730024138147, "grad_norm": 0.44818028807640076, "learning_rate": 1.579060052688548e-05, "loss": 1.1544, "step": 172 }, { "epoch": 0.34263786594045925, "grad_norm": 0.46644723415374756, "learning_rate": 1.4701315480056164e-05, "loss": 1.1511, "step": 173 }, { "epoch": 0.34461843163953704, "grad_norm": 0.5559659004211426, "learning_rate": 1.3649019742625623e-05, "loss": 1.1325, "step": 174 }, { "epoch": 0.3465989973386148, "grad_norm": 0.486060231924057, "learning_rate": 1.2634001001741373e-05, "loss": 1.0881, "step": 175 }, { "epoch": 0.3465989973386148, "eval_loss": 1.13009774684906, "eval_runtime": 7.9539, "eval_samples_per_second": 6.286, "eval_steps_per_second": 6.286, "step": 175 }, { "epoch": 0.34857956303769266, "grad_norm": 0.4915584921836853, "learning_rate": 1.1656536753392287e-05, "loss": 1.1702, "step": 176 }, { "epoch": 0.35056012873677045, "grad_norm": 0.5216694474220276, "learning_rate": 1.0716894226543953e-05, "loss": 1.2237, "step": 177 }, { "epoch": 0.35254069443584823, "grad_norm": 0.4730156660079956, "learning_rate": 9.815330310080887e-06, "loss": 1.1725, "step": 178 }, { "epoch": 0.354521260134926, "grad_norm": 0.5110475420951843, "learning_rate": 8.952091482575824e-06, "loss": 1.2701, "step": 179 }, { "epoch": 0.35650182583400386, "grad_norm": 0.5410056114196777, "learning_rate": 8.127413744904804e-06, "loss": 1.125, "step": 180 }, { "epoch": 0.35848239153308165, "grad_norm": 0.4916069209575653, "learning_rate": 7.34152255572697e-06, "loss": 1.063, "step": 181 }, { "epoch": 0.36046295723215943, "grad_norm": 0.5280264019966125, "learning_rate": 6.594632769846353e-06, "loss": 1.022, "step": 182 }, { "epoch": 0.3624435229312372, "grad_norm": 0.4919007420539856, "learning_rate": 5.886948579472778e-06, "loss": 1.0365, "step": 183 }, { "epoch": 0.36442408863031506, "grad_norm": 0.5240092873573303, "learning_rate": 5.218663458397715e-06, "loss": 1.1609, "step": 184 }, { "epoch": 0.36640465432939284, "grad_norm": 0.6551080942153931, "learning_rate": 4.589960109100444e-06, "loss": 0.8449, "step": 185 }, { "epoch": 0.36838522002847063, "grad_norm": 0.5417661666870117, "learning_rate": 4.001010412799138e-06, "loss": 1.0913, "step": 186 }, { "epoch": 0.3703657857275484, "grad_norm": 0.5707260370254517, "learning_rate": 3.451975382460109e-06, "loss": 0.8799, "step": 187 }, { "epoch": 0.37234635142662625, "grad_norm": 0.5540571808815002, "learning_rate": 2.9430051187785962e-06, "loss": 0.9926, "step": 188 }, { "epoch": 0.37432691712570404, "grad_norm": 0.5678872466087341, "learning_rate": 2.4742387691426445e-06, "loss": 0.9895, "step": 189 }, { "epoch": 0.3763074828247818, "grad_norm": 0.6278743743896484, "learning_rate": 2.0458044895916513e-06, "loss": 0.9935, "step": 190 }, { "epoch": 0.3782880485238596, "grad_norm": 0.6568822860717773, "learning_rate": 1.6578194097797258e-06, "loss": 0.8738, "step": 191 }, { "epoch": 0.38026861422293745, "grad_norm": 0.6540056467056274, "learning_rate": 1.3103896009537207e-06, "loss": 1.1838, "step": 192 }, { "epoch": 0.38224917992201524, "grad_norm": 0.7000465393066406, "learning_rate": 1.0036100469542786e-06, "loss": 1.0445, "step": 193 }, { "epoch": 0.384229745621093, "grad_norm": 0.5517255663871765, "learning_rate": 7.375646182482875e-07, "loss": 0.6349, "step": 194 }, { "epoch": 0.3862103113201708, "grad_norm": 0.837535560131073, "learning_rate": 5.123260489995229e-07, "loss": 0.7306, "step": 195 }, { "epoch": 0.38819087701924865, "grad_norm": 0.7388361096382141, "learning_rate": 3.2795591718381975e-07, "loss": 0.9283, "step": 196 }, { "epoch": 0.39017144271832643, "grad_norm": 1.2435849905014038, "learning_rate": 1.8450462775428942e-07, "loss": 1.205, "step": 197 }, { "epoch": 0.3921520084174042, "grad_norm": 1.1012904644012451, "learning_rate": 8.201139886109264e-08, "loss": 1.2314, "step": 198 }, { "epoch": 0.394132574116482, "grad_norm": 1.3080377578735352, "learning_rate": 2.0504251129649374e-08, "loss": 0.7755, "step": 199 }, { "epoch": 0.39611313981555984, "grad_norm": 1.854987621307373, "learning_rate": 0.0, "loss": 0.8858, "step": 200 }, { "epoch": 0.39611313981555984, "eval_loss": 1.1278622150421143, "eval_runtime": 7.959, "eval_samples_per_second": 6.282, "eval_steps_per_second": 6.282, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.153022666407936e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }