{ "best_metric": 9.585326194763184, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.007664156655362036, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.664156655362036e-05, "grad_norm": 3.0833306312561035, "learning_rate": 1.5000000000000002e-07, "loss": 10.2038, "step": 1 }, { "epoch": 7.664156655362036e-05, "eval_loss": 10.22679328918457, "eval_runtime": 416.8959, "eval_samples_per_second": 52.713, "eval_steps_per_second": 6.589, "step": 1 }, { "epoch": 0.0001532831331072407, "grad_norm": 2.9026267528533936, "learning_rate": 3.0000000000000004e-07, "loss": 10.2123, "step": 2 }, { "epoch": 0.00022992469966086107, "grad_norm": 2.717977523803711, "learning_rate": 4.5e-07, "loss": 10.2573, "step": 3 }, { "epoch": 0.0003065662662144814, "grad_norm": 2.9277496337890625, "learning_rate": 6.000000000000001e-07, "loss": 10.2601, "step": 4 }, { "epoch": 0.00038320783276810176, "grad_norm": 2.9612338542938232, "learning_rate": 7.5e-07, "loss": 10.1693, "step": 5 }, { "epoch": 0.00045984939932172214, "grad_norm": 2.873253583908081, "learning_rate": 9e-07, "loss": 10.2514, "step": 6 }, { "epoch": 0.0005364909658753425, "grad_norm": 2.8762059211730957, "learning_rate": 1.0500000000000001e-06, "loss": 10.3508, "step": 7 }, { "epoch": 0.0006131325324289629, "grad_norm": 2.795992851257324, "learning_rate": 1.2000000000000002e-06, "loss": 10.2734, "step": 8 }, { "epoch": 0.0006897740989825832, "grad_norm": 2.769205331802368, "learning_rate": 1.35e-06, "loss": 10.0884, "step": 9 }, { "epoch": 0.0007664156655362035, "grad_norm": 2.8631913661956787, "learning_rate": 1.5e-06, "loss": 10.1913, "step": 10 }, { "epoch": 0.000843057232089824, "grad_norm": 2.778069019317627, "learning_rate": 1.65e-06, "loss": 10.3434, "step": 11 }, { "epoch": 0.0009196987986434443, "grad_norm": 2.8693792819976807, "learning_rate": 1.8e-06, "loss": 10.2631, "step": 12 }, { "epoch": 0.0009963403651970647, "grad_norm": 3.109616756439209, "learning_rate": 1.95e-06, "loss": 10.3441, "step": 13 }, { "epoch": 0.001072981931750685, "grad_norm": 3.0356502532958984, "learning_rate": 2.1000000000000002e-06, "loss": 10.1493, "step": 14 }, { "epoch": 0.0011496234983043054, "grad_norm": 2.9254584312438965, "learning_rate": 2.25e-06, "loss": 10.2948, "step": 15 }, { "epoch": 0.0012262650648579257, "grad_norm": 2.876832962036133, "learning_rate": 2.4000000000000003e-06, "loss": 10.1155, "step": 16 }, { "epoch": 0.001302906631411546, "grad_norm": 2.9504146575927734, "learning_rate": 2.55e-06, "loss": 10.1583, "step": 17 }, { "epoch": 0.0013795481979651664, "grad_norm": 2.836592435836792, "learning_rate": 2.7e-06, "loss": 10.1299, "step": 18 }, { "epoch": 0.0014561897645187867, "grad_norm": 3.039888381958008, "learning_rate": 2.8500000000000002e-06, "loss": 10.1638, "step": 19 }, { "epoch": 0.001532831331072407, "grad_norm": 2.7712507247924805, "learning_rate": 3e-06, "loss": 10.2074, "step": 20 }, { "epoch": 0.0016094728976260276, "grad_norm": 2.803617238998413, "learning_rate": 3.15e-06, "loss": 10.2651, "step": 21 }, { "epoch": 0.001686114464179648, "grad_norm": 2.9304254055023193, "learning_rate": 3.3e-06, "loss": 10.2377, "step": 22 }, { "epoch": 0.0017627560307332682, "grad_norm": 2.8531479835510254, "learning_rate": 3.4500000000000004e-06, "loss": 10.2657, "step": 23 }, { "epoch": 0.0018393975972868886, "grad_norm": 2.846740245819092, "learning_rate": 3.6e-06, "loss": 10.2348, "step": 24 }, { "epoch": 0.001916039163840509, "grad_norm": 3.0314481258392334, "learning_rate": 3.75e-06, "loss": 10.2579, "step": 25 }, { "epoch": 0.0019926807303941294, "grad_norm": 3.0108513832092285, "learning_rate": 3.9e-06, "loss": 10.2788, "step": 26 }, { "epoch": 0.0020693222969477496, "grad_norm": 2.8648674488067627, "learning_rate": 4.05e-06, "loss": 10.16, "step": 27 }, { "epoch": 0.00214596386350137, "grad_norm": 2.820892095565796, "learning_rate": 4.2000000000000004e-06, "loss": 10.1158, "step": 28 }, { "epoch": 0.00222260543005499, "grad_norm": 2.9025187492370605, "learning_rate": 4.35e-06, "loss": 10.3069, "step": 29 }, { "epoch": 0.0022992469966086108, "grad_norm": 2.791337490081787, "learning_rate": 4.5e-06, "loss": 10.1772, "step": 30 }, { "epoch": 0.002375888563162231, "grad_norm": 2.826338768005371, "learning_rate": 4.65e-06, "loss": 10.1847, "step": 31 }, { "epoch": 0.0024525301297158514, "grad_norm": 2.5365052223205566, "learning_rate": 4.800000000000001e-06, "loss": 10.2665, "step": 32 }, { "epoch": 0.002529171696269472, "grad_norm": 2.9171478748321533, "learning_rate": 4.95e-06, "loss": 10.2147, "step": 33 }, { "epoch": 0.002605813262823092, "grad_norm": 3.2460930347442627, "learning_rate": 5.1e-06, "loss": 10.0759, "step": 34 }, { "epoch": 0.0026824548293767126, "grad_norm": 2.8777740001678467, "learning_rate": 5.25e-06, "loss": 10.214, "step": 35 }, { "epoch": 0.0027590963959303327, "grad_norm": 2.959735155105591, "learning_rate": 5.4e-06, "loss": 10.266, "step": 36 }, { "epoch": 0.0028357379624839533, "grad_norm": 2.900165319442749, "learning_rate": 5.55e-06, "loss": 10.0336, "step": 37 }, { "epoch": 0.0029123795290375734, "grad_norm": 2.9267468452453613, "learning_rate": 5.7000000000000005e-06, "loss": 10.1848, "step": 38 }, { "epoch": 0.002989021095591194, "grad_norm": 2.95743727684021, "learning_rate": 5.850000000000001e-06, "loss": 10.1497, "step": 39 }, { "epoch": 0.003065662662144814, "grad_norm": 2.9404218196868896, "learning_rate": 6e-06, "loss": 10.1824, "step": 40 }, { "epoch": 0.0031423042286984346, "grad_norm": 3.076639413833618, "learning_rate": 6.1499999999999996e-06, "loss": 10.1815, "step": 41 }, { "epoch": 0.003218945795252055, "grad_norm": 3.1609745025634766, "learning_rate": 6.3e-06, "loss": 10.1391, "step": 42 }, { "epoch": 0.0032955873618056753, "grad_norm": 3.024962902069092, "learning_rate": 6.45e-06, "loss": 10.1044, "step": 43 }, { "epoch": 0.003372228928359296, "grad_norm": 2.7803852558135986, "learning_rate": 6.6e-06, "loss": 10.1082, "step": 44 }, { "epoch": 0.003448870494912916, "grad_norm": 2.9283249378204346, "learning_rate": 6.750000000000001e-06, "loss": 10.0209, "step": 45 }, { "epoch": 0.0035255120614665365, "grad_norm": 2.9342944622039795, "learning_rate": 6.900000000000001e-06, "loss": 10.1783, "step": 46 }, { "epoch": 0.0036021536280201566, "grad_norm": 2.911269187927246, "learning_rate": 7.049999999999999e-06, "loss": 10.1705, "step": 47 }, { "epoch": 0.003678795194573777, "grad_norm": 2.9178407192230225, "learning_rate": 7.2e-06, "loss": 10.185, "step": 48 }, { "epoch": 0.0037554367611273972, "grad_norm": 3.2789461612701416, "learning_rate": 7.35e-06, "loss": 10.1262, "step": 49 }, { "epoch": 0.003832078327681018, "grad_norm": 2.9491851329803467, "learning_rate": 7.5e-06, "loss": 9.9911, "step": 50 }, { "epoch": 0.003832078327681018, "eval_loss": 10.096914291381836, "eval_runtime": 417.99, "eval_samples_per_second": 52.575, "eval_steps_per_second": 6.572, "step": 50 }, { "epoch": 0.003908719894234638, "grad_norm": 2.9691355228424072, "learning_rate": 7.65e-06, "loss": 10.012, "step": 51 }, { "epoch": 0.003985361460788259, "grad_norm": 3.0408084392547607, "learning_rate": 7.8e-06, "loss": 10.1541, "step": 52 }, { "epoch": 0.0040620030273418786, "grad_norm": 2.865037441253662, "learning_rate": 7.95e-06, "loss": 10.1152, "step": 53 }, { "epoch": 0.004138644593895499, "grad_norm": 3.0118324756622314, "learning_rate": 8.1e-06, "loss": 10.056, "step": 54 }, { "epoch": 0.00421528616044912, "grad_norm": 3.0567305088043213, "learning_rate": 8.25e-06, "loss": 10.1187, "step": 55 }, { "epoch": 0.00429192772700274, "grad_norm": 3.090759038925171, "learning_rate": 8.400000000000001e-06, "loss": 9.9549, "step": 56 }, { "epoch": 0.00436856929355636, "grad_norm": 3.094413995742798, "learning_rate": 8.55e-06, "loss": 10.0912, "step": 57 }, { "epoch": 0.00444521086010998, "grad_norm": 2.8803794384002686, "learning_rate": 8.7e-06, "loss": 10.0612, "step": 58 }, { "epoch": 0.004521852426663601, "grad_norm": 3.1622092723846436, "learning_rate": 8.85e-06, "loss": 10.0883, "step": 59 }, { "epoch": 0.0045984939932172215, "grad_norm": 3.1270225048065186, "learning_rate": 9e-06, "loss": 10.0202, "step": 60 }, { "epoch": 0.004675135559770842, "grad_norm": 3.1323904991149902, "learning_rate": 9.15e-06, "loss": 9.9491, "step": 61 }, { "epoch": 0.004751777126324462, "grad_norm": 3.130385637283325, "learning_rate": 9.3e-06, "loss": 9.9419, "step": 62 }, { "epoch": 0.004828418692878082, "grad_norm": 2.9450523853302, "learning_rate": 9.450000000000001e-06, "loss": 9.9549, "step": 63 }, { "epoch": 0.004905060259431703, "grad_norm": 2.9594674110412598, "learning_rate": 9.600000000000001e-06, "loss": 9.9597, "step": 64 }, { "epoch": 0.004981701825985323, "grad_norm": 2.946688413619995, "learning_rate": 9.75e-06, "loss": 10.0721, "step": 65 }, { "epoch": 0.005058343392538944, "grad_norm": 3.045152425765991, "learning_rate": 9.9e-06, "loss": 9.8657, "step": 66 }, { "epoch": 0.005134984959092564, "grad_norm": 3.0098981857299805, "learning_rate": 1.005e-05, "loss": 9.8488, "step": 67 }, { "epoch": 0.005211626525646184, "grad_norm": 2.9197866916656494, "learning_rate": 1.02e-05, "loss": 9.858, "step": 68 }, { "epoch": 0.005288268092199805, "grad_norm": 3.026247501373291, "learning_rate": 1.035e-05, "loss": 9.9475, "step": 69 }, { "epoch": 0.005364909658753425, "grad_norm": 2.9125189781188965, "learning_rate": 1.05e-05, "loss": 9.9352, "step": 70 }, { "epoch": 0.005441551225307045, "grad_norm": 2.881605863571167, "learning_rate": 1.065e-05, "loss": 10.0641, "step": 71 }, { "epoch": 0.0055181927918606655, "grad_norm": 3.004376173019409, "learning_rate": 1.08e-05, "loss": 10.0225, "step": 72 }, { "epoch": 0.005594834358414286, "grad_norm": 3.101276159286499, "learning_rate": 1.095e-05, "loss": 9.9834, "step": 73 }, { "epoch": 0.0056714759249679066, "grad_norm": 2.9609551429748535, "learning_rate": 1.11e-05, "loss": 9.942, "step": 74 }, { "epoch": 0.005748117491521527, "grad_norm": 2.913710594177246, "learning_rate": 1.125e-05, "loss": 9.864, "step": 75 }, { "epoch": 0.005824759058075147, "grad_norm": 2.971576452255249, "learning_rate": 1.1400000000000001e-05, "loss": 9.9609, "step": 76 }, { "epoch": 0.005901400624628767, "grad_norm": 3.054107904434204, "learning_rate": 1.1550000000000001e-05, "loss": 10.0123, "step": 77 }, { "epoch": 0.005978042191182388, "grad_norm": 2.910504102706909, "learning_rate": 1.1700000000000001e-05, "loss": 9.8387, "step": 78 }, { "epoch": 0.006054683757736008, "grad_norm": 2.9695205688476562, "learning_rate": 1.185e-05, "loss": 9.8195, "step": 79 }, { "epoch": 0.006131325324289628, "grad_norm": 2.9118409156799316, "learning_rate": 1.2e-05, "loss": 9.7845, "step": 80 }, { "epoch": 0.006207966890843249, "grad_norm": 3.0323214530944824, "learning_rate": 1.215e-05, "loss": 9.8386, "step": 81 }, { "epoch": 0.006284608457396869, "grad_norm": 2.9126343727111816, "learning_rate": 1.2299999999999999e-05, "loss": 9.8635, "step": 82 }, { "epoch": 0.00636125002395049, "grad_norm": 3.027489185333252, "learning_rate": 1.245e-05, "loss": 9.8263, "step": 83 }, { "epoch": 0.00643789159050411, "grad_norm": 2.968306064605713, "learning_rate": 1.26e-05, "loss": 9.7821, "step": 84 }, { "epoch": 0.00651453315705773, "grad_norm": 3.212526321411133, "learning_rate": 1.275e-05, "loss": 9.8742, "step": 85 }, { "epoch": 0.0065911747236113505, "grad_norm": 3.0750510692596436, "learning_rate": 1.29e-05, "loss": 9.756, "step": 86 }, { "epoch": 0.006667816290164971, "grad_norm": 3.143045425415039, "learning_rate": 1.305e-05, "loss": 9.733, "step": 87 }, { "epoch": 0.006744457856718592, "grad_norm": 3.0339534282684326, "learning_rate": 1.32e-05, "loss": 9.7565, "step": 88 }, { "epoch": 0.006821099423272211, "grad_norm": 2.8683807849884033, "learning_rate": 1.3350000000000001e-05, "loss": 9.673, "step": 89 }, { "epoch": 0.006897740989825832, "grad_norm": 3.0948848724365234, "learning_rate": 1.3500000000000001e-05, "loss": 9.6999, "step": 90 }, { "epoch": 0.006974382556379452, "grad_norm": 3.123711347579956, "learning_rate": 1.3650000000000001e-05, "loss": 9.868, "step": 91 }, { "epoch": 0.007051024122933073, "grad_norm": 3.072702169418335, "learning_rate": 1.3800000000000002e-05, "loss": 9.6111, "step": 92 }, { "epoch": 0.0071276656894866935, "grad_norm": 3.133103609085083, "learning_rate": 1.395e-05, "loss": 9.7074, "step": 93 }, { "epoch": 0.007204307256040313, "grad_norm": 3.1256680488586426, "learning_rate": 1.4099999999999999e-05, "loss": 9.7523, "step": 94 }, { "epoch": 0.007280948822593934, "grad_norm": 3.0179288387298584, "learning_rate": 1.4249999999999999e-05, "loss": 9.6325, "step": 95 }, { "epoch": 0.007357590389147554, "grad_norm": 3.047670841217041, "learning_rate": 1.44e-05, "loss": 9.7347, "step": 96 }, { "epoch": 0.007434231955701175, "grad_norm": 3.1137807369232178, "learning_rate": 1.455e-05, "loss": 9.5914, "step": 97 }, { "epoch": 0.0075108735222547945, "grad_norm": 2.887342929840088, "learning_rate": 1.47e-05, "loss": 9.6417, "step": 98 }, { "epoch": 0.007587515088808415, "grad_norm": 3.202164888381958, "learning_rate": 1.485e-05, "loss": 9.4693, "step": 99 }, { "epoch": 0.007664156655362036, "grad_norm": 3.069445848464966, "learning_rate": 1.5e-05, "loss": 9.5667, "step": 100 }, { "epoch": 0.007664156655362036, "eval_loss": 9.585326194763184, "eval_runtime": 418.1777, "eval_samples_per_second": 52.552, "eval_steps_per_second": 6.569, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 130824536064000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }