{ "best_metric": 0.097813181579113, "best_model_checkpoint": "idbwbase/checkpoint-42381", "epoch": 10.0, "eval_steps": 500, "global_step": 47090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "grad_norm": 3.36433744430542, "learning_rate": 0.0001978764068804417, "loss": 0.4791, "step": 500 }, { "epoch": 0.21, "grad_norm": 3.1407039165496826, "learning_rate": 0.0001957528137608834, "loss": 0.454, "step": 1000 }, { "epoch": 0.32, "grad_norm": 2.483966112136841, "learning_rate": 0.00019362922064132513, "loss": 0.4364, "step": 1500 }, { "epoch": 0.42, "grad_norm": 2.059077501296997, "learning_rate": 0.00019150987470800598, "loss": 0.4149, "step": 2000 }, { "epoch": 0.53, "grad_norm": 2.1167092323303223, "learning_rate": 0.00018938628158844767, "loss": 0.41, "step": 2500 }, { "epoch": 0.64, "grad_norm": 1.9980089664459229, "learning_rate": 0.00018726268846888937, "loss": 0.3953, "step": 3000 }, { "epoch": 0.74, "grad_norm": 1.5476139783859253, "learning_rate": 0.00018513909534933107, "loss": 0.3892, "step": 3500 }, { "epoch": 0.85, "grad_norm": 1.8769443035125732, "learning_rate": 0.0001830197494160119, "loss": 0.3826, "step": 4000 }, { "epoch": 0.96, "grad_norm": 2.0475404262542725, "learning_rate": 0.0001808961562964536, "loss": 0.373, "step": 4500 }, { "epoch": 1.0, "eval_accuracy": 0.870536469791843, "eval_loss": 0.30019834637641907, "eval_runtime": 363.8544, "eval_samples_per_second": 146.16, "eval_steps_per_second": 18.271, "step": 4709 }, { "epoch": 1.06, "grad_norm": 3.492917776107788, "learning_rate": 0.0001787725631768953, "loss": 0.3639, "step": 5000 }, { "epoch": 1.17, "grad_norm": 3.2921090126037598, "learning_rate": 0.00017664897005733704, "loss": 0.3553, "step": 5500 }, { "epoch": 1.27, "grad_norm": 2.0384316444396973, "learning_rate": 0.00017452537693777873, "loss": 0.3499, "step": 6000 }, { "epoch": 1.38, "grad_norm": 2.055048942565918, "learning_rate": 0.00017240178381822043, "loss": 0.3506, "step": 6500 }, { "epoch": 1.49, "grad_norm": 1.965047001838684, "learning_rate": 0.00017027819069866215, "loss": 0.3423, "step": 7000 }, { "epoch": 1.59, "grad_norm": 2.3037126064300537, "learning_rate": 0.00016815459757910385, "loss": 0.3402, "step": 7500 }, { "epoch": 1.7, "grad_norm": 2.647536516189575, "learning_rate": 0.00016603100445954555, "loss": 0.3323, "step": 8000 }, { "epoch": 1.81, "grad_norm": 2.767207384109497, "learning_rate": 0.0001639116585262264, "loss": 0.3294, "step": 8500 }, { "epoch": 1.91, "grad_norm": 2.31170654296875, "learning_rate": 0.00016179231259290722, "loss": 0.3244, "step": 9000 }, { "epoch": 2.0, "eval_accuracy": 0.9044395554803407, "eval_loss": 0.22620604932308197, "eval_runtime": 365.6085, "eval_samples_per_second": 145.459, "eval_steps_per_second": 18.183, "step": 9418 }, { "epoch": 2.02, "grad_norm": 1.986091136932373, "learning_rate": 0.00015966871947334892, "loss": 0.3175, "step": 9500 }, { "epoch": 2.12, "grad_norm": 2.7250773906707764, "learning_rate": 0.0001575451263537906, "loss": 0.3038, "step": 10000 }, { "epoch": 2.23, "grad_norm": 2.0195140838623047, "learning_rate": 0.0001554215332342323, "loss": 0.3037, "step": 10500 }, { "epoch": 2.34, "grad_norm": 2.9471521377563477, "learning_rate": 0.00015329794011467404, "loss": 0.2911, "step": 11000 }, { "epoch": 2.44, "grad_norm": 1.8268228769302368, "learning_rate": 0.00015117434699511573, "loss": 0.2931, "step": 11500 }, { "epoch": 2.55, "grad_norm": 2.0443902015686035, "learning_rate": 0.00014905075387555746, "loss": 0.2903, "step": 12000 }, { "epoch": 2.65, "grad_norm": 2.8994696140289307, "learning_rate": 0.00014692716075599915, "loss": 0.2864, "step": 12500 }, { "epoch": 2.76, "grad_norm": 2.1983399391174316, "learning_rate": 0.00014480781482267998, "loss": 0.2833, "step": 13000 }, { "epoch": 2.87, "grad_norm": 2.4135706424713135, "learning_rate": 0.0001426884688893608, "loss": 0.2788, "step": 13500 }, { "epoch": 2.97, "grad_norm": 1.9159047603607178, "learning_rate": 0.0001405648757698025, "loss": 0.2801, "step": 14000 }, { "epoch": 3.0, "eval_accuracy": 0.9195953442018766, "eval_loss": 0.1986866444349289, "eval_runtime": 365.1106, "eval_samples_per_second": 145.657, "eval_steps_per_second": 18.208, "step": 14127 }, { "epoch": 3.08, "grad_norm": 2.0817108154296875, "learning_rate": 0.00013844128265024422, "loss": 0.2584, "step": 14500 }, { "epoch": 3.19, "grad_norm": 2.8534085750579834, "learning_rate": 0.00013631768953068594, "loss": 0.2559, "step": 15000 }, { "epoch": 3.29, "grad_norm": 3.98779559135437, "learning_rate": 0.00013419409641112764, "loss": 0.2511, "step": 15500 }, { "epoch": 3.4, "grad_norm": 3.104475975036621, "learning_rate": 0.00013207475047780846, "loss": 0.2517, "step": 16000 }, { "epoch": 3.5, "grad_norm": 2.7032172679901123, "learning_rate": 0.00012995115735825016, "loss": 0.2504, "step": 16500 }, { "epoch": 3.61, "grad_norm": 3.3657124042510986, "learning_rate": 0.00012782756423869186, "loss": 0.2461, "step": 17000 }, { "epoch": 3.72, "grad_norm": 2.80549693107605, "learning_rate": 0.00012570397111913358, "loss": 0.2451, "step": 17500 }, { "epoch": 3.82, "grad_norm": 1.6164408922195435, "learning_rate": 0.0001235803779995753, "loss": 0.2421, "step": 18000 }, { "epoch": 3.93, "grad_norm": 2.589157819747925, "learning_rate": 0.00012145678488001699, "loss": 0.2366, "step": 18500 }, { "epoch": 4.0, "eval_accuracy": 0.9344502735939527, "eval_loss": 0.17884854972362518, "eval_runtime": 360.8393, "eval_samples_per_second": 147.381, "eval_steps_per_second": 18.424, "step": 18836 }, { "epoch": 4.03, "grad_norm": 2.659468412399292, "learning_rate": 0.0001193331917604587, "loss": 0.2287, "step": 19000 }, { "epoch": 4.14, "grad_norm": 2.452038526535034, "learning_rate": 0.00011720959864090042, "loss": 0.2205, "step": 19500 }, { "epoch": 4.25, "grad_norm": 3.7109389305114746, "learning_rate": 0.00011509025270758123, "loss": 0.2076, "step": 20000 }, { "epoch": 4.35, "grad_norm": 2.5269851684570312, "learning_rate": 0.00011296665958802296, "loss": 0.2134, "step": 20500 }, { "epoch": 4.46, "grad_norm": 2.7143073081970215, "learning_rate": 0.00011084306646846465, "loss": 0.2144, "step": 21000 }, { "epoch": 4.57, "grad_norm": 3.1681575775146484, "learning_rate": 0.00010871947334890635, "loss": 0.2091, "step": 21500 }, { "epoch": 4.67, "grad_norm": 1.2098946571350098, "learning_rate": 0.00010659588022934806, "loss": 0.2065, "step": 22000 }, { "epoch": 4.78, "grad_norm": 1.6173245906829834, "learning_rate": 0.00010447228710978979, "loss": 0.2107, "step": 22500 }, { "epoch": 4.88, "grad_norm": 4.180131435394287, "learning_rate": 0.00010235294117647058, "loss": 0.2053, "step": 23000 }, { "epoch": 4.99, "grad_norm": 1.9634019136428833, "learning_rate": 0.0001002293480569123, "loss": 0.2051, "step": 23500 }, { "epoch": 5.0, "eval_accuracy": 0.9483650175814671, "eval_loss": 0.1462855190038681, "eval_runtime": 362.8319, "eval_samples_per_second": 146.572, "eval_steps_per_second": 18.323, "step": 23545 }, { "epoch": 5.1, "grad_norm": 2.8337090015411377, "learning_rate": 9.810575493735401e-05, "loss": 0.1896, "step": 24000 }, { "epoch": 5.2, "grad_norm": 1.7976356744766235, "learning_rate": 9.598216181779571e-05, "loss": 0.1872, "step": 24500 }, { "epoch": 5.31, "grad_norm": 2.647800922393799, "learning_rate": 9.385856869823742e-05, "loss": 0.1819, "step": 25000 }, { "epoch": 5.42, "grad_norm": 2.1115944385528564, "learning_rate": 9.173922276491824e-05, "loss": 0.1837, "step": 25500 }, { "epoch": 5.52, "grad_norm": 3.922607183456421, "learning_rate": 8.961562964535995e-05, "loss": 0.1868, "step": 26000 }, { "epoch": 5.63, "grad_norm": 1.7281802892684937, "learning_rate": 8.749203652580165e-05, "loss": 0.1832, "step": 26500 }, { "epoch": 5.73, "grad_norm": 2.4881973266601562, "learning_rate": 8.536844340624338e-05, "loss": 0.1791, "step": 27000 }, { "epoch": 5.84, "grad_norm": 1.2277300357818604, "learning_rate": 8.324909747292418e-05, "loss": 0.1805, "step": 27500 }, { "epoch": 5.95, "grad_norm": 1.151065468788147, "learning_rate": 8.112975153960502e-05, "loss": 0.1764, "step": 28000 }, { "epoch": 6.0, "eval_accuracy": 0.9593087756905662, "eval_loss": 0.1202462688088417, "eval_runtime": 359.7308, "eval_samples_per_second": 147.836, "eval_steps_per_second": 18.48, "step": 28254 }, { "epoch": 6.05, "grad_norm": 2.2972559928894043, "learning_rate": 7.900615842004673e-05, "loss": 0.1694, "step": 28500 }, { "epoch": 6.16, "grad_norm": 1.6081655025482178, "learning_rate": 7.688256530048843e-05, "loss": 0.1656, "step": 29000 }, { "epoch": 6.26, "grad_norm": 4.132796764373779, "learning_rate": 7.475897218093014e-05, "loss": 0.1589, "step": 29500 }, { "epoch": 6.37, "grad_norm": 2.355618476867676, "learning_rate": 7.263537906137185e-05, "loss": 0.1618, "step": 30000 }, { "epoch": 6.48, "grad_norm": 2.159538745880127, "learning_rate": 7.051178594181356e-05, "loss": 0.1572, "step": 30500 }, { "epoch": 6.58, "grad_norm": 3.115480661392212, "learning_rate": 6.838819282225526e-05, "loss": 0.163, "step": 31000 }, { "epoch": 6.69, "grad_norm": 2.2360424995422363, "learning_rate": 6.626459970269697e-05, "loss": 0.1607, "step": 31500 }, { "epoch": 6.8, "grad_norm": 2.970757246017456, "learning_rate": 6.414525376937779e-05, "loss": 0.1599, "step": 32000 }, { "epoch": 6.9, "grad_norm": 6.401217460632324, "learning_rate": 6.20216606498195e-05, "loss": 0.1595, "step": 32500 }, { "epoch": 7.0, "eval_accuracy": 0.9654763919444914, "eval_loss": 0.12429220974445343, "eval_runtime": 360.8363, "eval_samples_per_second": 147.383, "eval_steps_per_second": 18.424, "step": 32963 }, { "epoch": 7.01, "grad_norm": 3.2262797355651855, "learning_rate": 5.9898067530261204e-05, "loss": 0.1528, "step": 33000 }, { "epoch": 7.11, "grad_norm": 3.578483819961548, "learning_rate": 5.7774474410702914e-05, "loss": 0.1473, "step": 33500 }, { "epoch": 7.22, "grad_norm": 3.9829201698303223, "learning_rate": 5.565088129114462e-05, "loss": 0.1441, "step": 34000 }, { "epoch": 7.33, "grad_norm": 3.7269842624664307, "learning_rate": 5.3531535357825446e-05, "loss": 0.1385, "step": 34500 }, { "epoch": 7.43, "grad_norm": 2.6239781379699707, "learning_rate": 5.140794223826715e-05, "loss": 0.1406, "step": 35000 }, { "epoch": 7.54, "grad_norm": 1.6224650144577026, "learning_rate": 4.9284349118708855e-05, "loss": 0.1429, "step": 35500 }, { "epoch": 7.64, "grad_norm": 5.391174793243408, "learning_rate": 4.7160755999150565e-05, "loss": 0.1457, "step": 36000 }, { "epoch": 7.75, "grad_norm": 5.336463928222656, "learning_rate": 4.504141006583139e-05, "loss": 0.1359, "step": 36500 }, { "epoch": 7.86, "grad_norm": 3.005934238433838, "learning_rate": 4.29178169462731e-05, "loss": 0.1417, "step": 37000 }, { "epoch": 7.96, "grad_norm": 3.7323451042175293, "learning_rate": 4.079422382671481e-05, "loss": 0.1359, "step": 37500 }, { "epoch": 8.0, "eval_accuracy": 0.9658712698144074, "eval_loss": 0.11875798553228378, "eval_runtime": 368.3038, "eval_samples_per_second": 144.394, "eval_steps_per_second": 18.05, "step": 37672 }, { "epoch": 8.07, "grad_norm": 2.7839584350585938, "learning_rate": 3.867063070715651e-05, "loss": 0.1335, "step": 38000 }, { "epoch": 8.18, "grad_norm": 4.733059406280518, "learning_rate": 3.6551284773837333e-05, "loss": 0.1313, "step": 38500 }, { "epoch": 8.28, "grad_norm": 2.712228536605835, "learning_rate": 3.4427691654279044e-05, "loss": 0.1325, "step": 39000 }, { "epoch": 8.39, "grad_norm": 4.026779651641846, "learning_rate": 3.230409853472075e-05, "loss": 0.1272, "step": 39500 }, { "epoch": 8.49, "grad_norm": 3.8425519466400146, "learning_rate": 3.018050541516246e-05, "loss": 0.1307, "step": 40000 }, { "epoch": 8.6, "grad_norm": 3.5189483165740967, "learning_rate": 2.806115948184328e-05, "loss": 0.129, "step": 40500 }, { "epoch": 8.71, "grad_norm": 4.5053486824035645, "learning_rate": 2.5937566362284988e-05, "loss": 0.1257, "step": 41000 }, { "epoch": 8.81, "grad_norm": 4.372600078582764, "learning_rate": 2.3818220428965812e-05, "loss": 0.1238, "step": 41500 }, { "epoch": 8.92, "grad_norm": 1.8681597709655762, "learning_rate": 2.169462730940752e-05, "loss": 0.1231, "step": 42000 }, { "epoch": 9.0, "eval_accuracy": 0.9741072939583686, "eval_loss": 0.097813181579113, "eval_runtime": 359.2297, "eval_samples_per_second": 148.042, "eval_steps_per_second": 18.506, "step": 42381 }, { "epoch": 9.03, "grad_norm": 3.127023220062256, "learning_rate": 1.9571034189849227e-05, "loss": 0.1249, "step": 42500 }, { "epoch": 9.13, "grad_norm": 1.650771141052246, "learning_rate": 1.744744107029093e-05, "loss": 0.118, "step": 43000 }, { "epoch": 9.24, "grad_norm": 3.3975789546966553, "learning_rate": 1.532384795073264e-05, "loss": 0.1198, "step": 43500 }, { "epoch": 9.34, "grad_norm": 1.8186984062194824, "learning_rate": 1.3200254831174346e-05, "loss": 0.1167, "step": 44000 }, { "epoch": 9.45, "grad_norm": 7.110922336578369, "learning_rate": 1.1076661711616054e-05, "loss": 0.1198, "step": 44500 }, { "epoch": 9.56, "grad_norm": 6.389682292938232, "learning_rate": 8.953068592057761e-06, "loss": 0.1187, "step": 45000 }, { "epoch": 9.66, "grad_norm": 2.857072353363037, "learning_rate": 6.833722658738587e-06, "loss": 0.1129, "step": 45500 }, { "epoch": 9.77, "grad_norm": 0.7192772626876831, "learning_rate": 4.710129539180293e-06, "loss": 0.1133, "step": 46000 }, { "epoch": 9.87, "grad_norm": 3.4914393424987793, "learning_rate": 2.5865364196220007e-06, "loss": 0.1151, "step": 46500 }, { "epoch": 9.98, "grad_norm": 1.8187050819396973, "learning_rate": 4.671904863028244e-07, "loss": 0.1162, "step": 47000 }, { "epoch": 10.0, "eval_accuracy": 0.9761004870160396, "eval_loss": 0.10011033713817596, "eval_runtime": 359.0659, "eval_samples_per_second": 148.109, "eval_steps_per_second": 18.515, "step": 47090 }, { "epoch": 10.0, "step": 47090, "total_flos": 2.383420595495315e+20, "train_loss": 0.22288281881624433, "train_runtime": 60913.5069, "train_samples_per_second": 49.473, "train_steps_per_second": 0.773 } ], "logging_steps": 500, "max_steps": 47090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.383420595495315e+20, "train_batch_size": 64, "trial_name": null, "trial_params": null }