{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 766, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.5974025974025976e-06, "loss": 2.4814, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.2987012987012986e-05, "loss": 2.2941, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.5974025974025972e-05, "loss": 2.4997, "step": 10 }, { "epoch": 0.02, "learning_rate": 3.8961038961038966e-05, "loss": 2.7226, "step": 15 }, { "epoch": 0.03, "learning_rate": 5.1948051948051944e-05, "loss": 2.8458, "step": 20 }, { "epoch": 0.03, "learning_rate": 6.493506493506494e-05, "loss": 2.6056, "step": 25 }, { "epoch": 0.04, "learning_rate": 7.792207792207793e-05, "loss": 2.6706, "step": 30 }, { "epoch": 0.05, "learning_rate": 9.090909090909092e-05, "loss": 2.5805, "step": 35 }, { "epoch": 0.05, "learning_rate": 0.00010389610389610389, "loss": 2.5799, "step": 40 }, { "epoch": 0.06, "learning_rate": 0.00011688311688311689, "loss": 2.7861, "step": 45 }, { "epoch": 0.07, "learning_rate": 0.00012987012987012987, "loss": 2.4062, "step": 50 }, { "epoch": 0.07, "learning_rate": 0.00014285714285714287, "loss": 2.7231, "step": 55 }, { "epoch": 0.08, "learning_rate": 0.00015584415584415587, "loss": 2.8154, "step": 60 }, { "epoch": 0.08, "learning_rate": 0.00016883116883116884, "loss": 2.785, "step": 65 }, { "epoch": 0.09, "learning_rate": 0.00018181818181818183, "loss": 2.393, "step": 70 }, { "epoch": 0.1, "learning_rate": 0.0001948051948051948, "loss": 2.6932, "step": 75 }, { "epoch": 0.1, "learning_rate": 0.00019999064449912997, "loss": 2.7158, "step": 80 }, { "epoch": 0.11, "learning_rate": 0.00019993347833297808, "loss": 2.4839, "step": 85 }, { "epoch": 0.12, "learning_rate": 0.00019982437317643217, "loss": 2.8402, "step": 90 }, { "epoch": 0.12, "learning_rate": 0.00019966338573533063, "loss": 2.5689, "step": 95 }, { "epoch": 0.13, "learning_rate": 0.0001994505996805844, "loss": 2.4474, "step": 100 }, { "epoch": 0.14, "learning_rate": 0.0001991861256046899, "loss": 2.7085, "step": 105 }, { "epoch": 0.14, "learning_rate": 0.00019887010096425055, "loss": 2.6015, "step": 110 }, { "epoch": 0.15, "learning_rate": 0.0001985026900085352, "loss": 2.6907, "step": 115 }, { "epoch": 0.16, "learning_rate": 0.00019808408369411241, "loss": 2.6831, "step": 120 }, { "epoch": 0.16, "learning_rate": 0.00019761449958560326, "loss": 2.1675, "step": 125 }, { "epoch": 0.17, "learning_rate": 0.0001970941817426052, "loss": 2.8732, "step": 130 }, { "epoch": 0.18, "learning_rate": 0.00019652340059284583, "loss": 2.8546, "step": 135 }, { "epoch": 0.18, "learning_rate": 0.00019590245279163169, "loss": 2.7912, "step": 140 }, { "epoch": 0.19, "learning_rate": 0.00019523166106766616, "loss": 2.7565, "step": 145 }, { "epoch": 0.2, "learning_rate": 0.00019451137405531563, "loss": 2.8615, "step": 150 }, { "epoch": 0.2, "learning_rate": 0.0001937419661134121, "loss": 2.6343, "step": 155 }, { "epoch": 0.21, "learning_rate": 0.00019292383713068518, "loss": 2.446, "step": 160 }, { "epoch": 0.22, "learning_rate": 0.00019205741231792596, "loss": 2.9245, "step": 165 }, { "epoch": 0.22, "learning_rate": 0.00019114314198698967, "loss": 2.5026, "step": 170 }, { "epoch": 0.23, "learning_rate": 0.00019018150131675253, "loss": 2.842, "step": 175 }, { "epoch": 0.23, "learning_rate": 0.0001891729901061445, "loss": 2.5944, "step": 180 }, { "epoch": 0.24, "learning_rate": 0.0001881181325143858, "loss": 2.767, "step": 185 }, { "epoch": 0.25, "learning_rate": 0.00018701747678856286, "loss": 2.4217, "step": 190 }, { "epoch": 0.25, "learning_rate": 0.00018587159497868448, "loss": 2.5336, "step": 195 }, { "epoch": 0.26, "learning_rate": 0.00018468108264036747, "loss": 2.6071, "step": 200 }, { "epoch": 0.27, "learning_rate": 0.00018344655852530468, "loss": 2.6432, "step": 205 }, { "epoch": 0.27, "learning_rate": 0.00018216866425967818, "loss": 2.526, "step": 210 }, { "epoch": 0.28, "learning_rate": 0.00018084806401068285, "loss": 2.6892, "step": 215 }, { "epoch": 0.29, "learning_rate": 0.00017948544414133534, "loss": 2.9056, "step": 220 }, { "epoch": 0.29, "learning_rate": 0.00017808151285374652, "loss": 2.784, "step": 225 }, { "epoch": 0.3, "learning_rate": 0.00017663699982104373, "loss": 2.2989, "step": 230 }, { "epoch": 0.31, "learning_rate": 0.00017515265580813355, "loss": 2.4406, "step": 235 }, { "epoch": 0.31, "learning_rate": 0.0001736292522815027, "loss": 2.3931, "step": 240 }, { "epoch": 0.32, "learning_rate": 0.00017206758100825917, "loss": 2.6808, "step": 245 }, { "epoch": 0.33, "learning_rate": 0.00017046845364462285, "loss": 2.3481, "step": 250 }, { "epoch": 0.33, "learning_rate": 0.00016883270131407864, "loss": 2.704, "step": 255 }, { "epoch": 0.34, "learning_rate": 0.00016716117417541227, "loss": 2.6067, "step": 260 }, { "epoch": 0.35, "learning_rate": 0.0001654547409808521, "loss": 2.4372, "step": 265 }, { "epoch": 0.35, "learning_rate": 0.000163714288624548, "loss": 2.3988, "step": 270 }, { "epoch": 0.36, "learning_rate": 0.00016194072168162048, "loss": 2.43, "step": 275 }, { "epoch": 0.37, "learning_rate": 0.0001601349619380211, "loss": 2.948, "step": 280 }, { "epoch": 0.37, "learning_rate": 0.0001582979479114472, "loss": 2.5301, "step": 285 }, { "epoch": 0.38, "learning_rate": 0.00015643063436356102, "loss": 2.6635, "step": 290 }, { "epoch": 0.39, "learning_rate": 0.0001545339918037658, "loss": 2.4107, "step": 295 }, { "epoch": 0.39, "learning_rate": 0.00015260900598479765, "loss": 2.4512, "step": 300 }, { "epoch": 0.4, "learning_rate": 0.0001506566773903946, "loss": 2.661, "step": 305 }, { "epoch": 0.4, "learning_rate": 0.0001486780207153097, "loss": 2.5907, "step": 310 }, { "epoch": 0.41, "learning_rate": 0.00014667406433793774, "loss": 2.589, "step": 315 }, { "epoch": 0.42, "learning_rate": 0.00014464584978583057, "loss": 2.7578, "step": 320 }, { "epoch": 0.42, "learning_rate": 0.000142594431194378, "loss": 2.4308, "step": 325 }, { "epoch": 0.43, "learning_rate": 0.00014052087475893615, "loss": 2.3724, "step": 330 }, { "epoch": 0.44, "learning_rate": 0.00013842625818068758, "loss": 2.6396, "step": 335 }, { "epoch": 0.44, "learning_rate": 0.00013631167010652178, "loss": 2.6514, "step": 340 }, { "epoch": 0.45, "learning_rate": 0.00013417820956322633, "loss": 2.5516, "step": 345 }, { "epoch": 0.46, "learning_rate": 0.00013202698538628376, "loss": 2.7136, "step": 350 }, { "epoch": 0.46, "learning_rate": 0.0001298591156435701, "loss": 2.7312, "step": 355 }, { "epoch": 0.47, "learning_rate": 0.00012767572705425513, "loss": 2.3132, "step": 360 }, { "epoch": 0.48, "learning_rate": 0.00012547795440320626, "loss": 2.4825, "step": 365 }, { "epoch": 0.48, "learning_rate": 0.00012326693995120064, "loss": 2.6723, "step": 370 }, { "epoch": 0.49, "learning_rate": 0.0001210438328412511, "loss": 2.5739, "step": 375 }, { "epoch": 0.5, "learning_rate": 0.00011880978850135593, "loss": 2.5385, "step": 380 }, { "epoch": 0.5, "learning_rate": 0.00011656596804398158, "loss": 2.5449, "step": 385 }, { "epoch": 0.51, "learning_rate": 0.00011431353766259121, "loss": 2.2918, "step": 390 }, { "epoch": 0.52, "learning_rate": 0.0001120536680255323, "loss": 2.4322, "step": 395 }, { "epoch": 0.52, "learning_rate": 0.00010978753366759877, "loss": 2.6421, "step": 400 }, { "epoch": 0.53, "learning_rate": 0.00010751631237958309, "loss": 2.4996, "step": 405 }, { "epoch": 0.54, "learning_rate": 0.00010524118459613672, "loss": 2.3949, "step": 410 }, { "epoch": 0.54, "learning_rate": 0.00010296333278225599, "loss": 2.384, "step": 415 }, { "epoch": 0.55, "learning_rate": 0.00010068394081871289, "loss": 2.4835, "step": 420 }, { "epoch": 0.55, "learning_rate": 9.840419338675015e-05, "loss": 2.6381, "step": 425 }, { "epoch": 0.56, "learning_rate": 9.612527535236006e-05, "loss": 2.5099, "step": 430 }, { "epoch": 0.57, "learning_rate": 9.384837115046746e-05, "loss": 2.5638, "step": 435 }, { "epoch": 0.57, "learning_rate": 9.157466416933662e-05, "loss": 2.6704, "step": 440 }, { "epoch": 0.58, "learning_rate": 8.930533613552232e-05, "loss": 2.3547, "step": 445 }, { "epoch": 0.59, "learning_rate": 8.704156649968422e-05, "loss": 2.2018, "step": 450 }, { "epoch": 0.59, "learning_rate": 8.478453182358461e-05, "loss": 2.3659, "step": 455 }, { "epoch": 0.6, "learning_rate": 8.253540516858695e-05, "loss": 2.8535, "step": 460 }, { "epoch": 0.61, "learning_rate": 8.029535548597451e-05, "loss": 2.6786, "step": 465 }, { "epoch": 0.61, "learning_rate": 7.806554700940441e-05, "loss": 2.7602, "step": 470 }, { "epoch": 0.62, "learning_rate": 7.584713864981387e-05, "loss": 2.6715, "step": 475 }, { "epoch": 0.63, "learning_rate": 7.364128339309326e-05, "loss": 2.5784, "step": 480 }, { "epoch": 0.63, "learning_rate": 7.14491277008377e-05, "loss": 2.4982, "step": 485 }, { "epoch": 0.64, "learning_rate": 6.927181091449061e-05, "loss": 2.2037, "step": 490 }, { "epoch": 0.65, "learning_rate": 6.711046466318702e-05, "loss": 2.6169, "step": 495 }, { "epoch": 0.65, "learning_rate": 6.496621227560581e-05, "loss": 2.7077, "step": 500 }, { "epoch": 0.66, "learning_rate": 6.28401681961355e-05, "loss": 2.7811, "step": 505 }, { "epoch": 0.67, "learning_rate": 6.073343740565807e-05, "loss": 2.3697, "step": 510 }, { "epoch": 0.67, "learning_rate": 5.86471148472507e-05, "loss": 2.6585, "step": 515 }, { "epoch": 0.68, "learning_rate": 5.658228485710517e-05, "loss": 2.5007, "step": 520 }, { "epoch": 0.69, "learning_rate": 5.45400206009595e-05, "loss": 2.6819, "step": 525 }, { "epoch": 0.69, "learning_rate": 5.2521383516335376e-05, "loss": 2.6446, "step": 530 }, { "epoch": 0.7, "learning_rate": 5.0527422760871435e-05, "loss": 2.5785, "step": 535 }, { "epoch": 0.7, "learning_rate": 4.855917466703853e-05, "loss": 2.5346, "step": 540 }, { "epoch": 0.71, "learning_rate": 4.661766220352097e-05, "loss": 2.6784, "step": 545 }, { "epoch": 0.72, "learning_rate": 4.470389444354286e-05, "loss": 2.6187, "step": 550 }, { "epoch": 0.72, "learning_rate": 4.281886604041729e-05, "loss": 2.3566, "step": 555 }, { "epoch": 0.73, "learning_rate": 4.096355671058918e-05, "loss": 2.5479, "step": 560 }, { "epoch": 0.74, "learning_rate": 3.9138930724441816e-05, "loss": 2.5891, "step": 565 }, { "epoch": 0.74, "learning_rate": 3.73459364051313e-05, "loss": 2.6546, "step": 570 }, { "epoch": 0.75, "learning_rate": 3.558550563570903e-05, "loss": 2.5385, "step": 575 }, { "epoch": 0.76, "learning_rate": 3.385855337478894e-05, "loss": 2.4146, "step": 580 }, { "epoch": 0.76, "learning_rate": 3.2165977181010874e-05, "loss": 2.5971, "step": 585 }, { "epoch": 0.77, "learning_rate": 3.0508656746547505e-05, "loss": 2.5462, "step": 590 }, { "epoch": 0.78, "learning_rate": 2.8887453439896728e-05, "loss": 2.8763, "step": 595 }, { "epoch": 0.78, "learning_rate": 2.730320985819791e-05, "loss": 2.7621, "step": 600 }, { "epoch": 0.79, "learning_rate": 2.575674938930408e-05, "loss": 2.1617, "step": 605 }, { "epoch": 0.8, "learning_rate": 2.4248875783837987e-05, "loss": 2.3758, "step": 610 }, { "epoch": 0.8, "learning_rate": 2.2780372737454124e-05, "loss": 2.4131, "step": 615 }, { "epoch": 0.81, "learning_rate": 2.135200348352423e-05, "loss": 2.5124, "step": 620 }, { "epoch": 0.82, "learning_rate": 1.9964510396457748e-05, "loss": 2.4138, "step": 625 }, { "epoch": 0.82, "learning_rate": 1.8618614605863284e-05, "loss": 2.8193, "step": 630 }, { "epoch": 0.83, "learning_rate": 1.7315015621752006e-05, "loss": 1.9123, "step": 635 }, { "epoch": 0.84, "learning_rate": 1.6054390970977174e-05, "loss": 2.5428, "step": 640 }, { "epoch": 0.84, "learning_rate": 1.4837395845099423e-05, "loss": 2.9045, "step": 645 }, { "epoch": 0.85, "learning_rate": 1.3664662759860258e-05, "loss": 2.62, "step": 650 }, { "epoch": 0.86, "learning_rate": 1.2536801226441208e-05, "loss": 2.2114, "step": 655 }, { "epoch": 0.86, "learning_rate": 1.1454397434679021e-05, "loss": 2.5337, "step": 660 }, { "epoch": 0.87, "learning_rate": 1.0418013948401917e-05, "loss": 2.5415, "step": 665 }, { "epoch": 0.87, "learning_rate": 9.428189413045285e-06, "loss": 2.683, "step": 670 }, { "epoch": 0.88, "learning_rate": 8.485438275698154e-06, "loss": 2.691, "step": 675 }, { "epoch": 0.89, "learning_rate": 7.590250517727016e-06, "loss": 2.4456, "step": 680 }, { "epoch": 0.89, "learning_rate": 6.7430914001148424e-06, "loss": 2.5518, "step": 685 }, { "epoch": 0.9, "learning_rate": 5.944401221648432e-06, "loss": 2.7904, "step": 690 }, { "epoch": 0.91, "learning_rate": 5.194595090079457e-06, "loss": 2.4716, "step": 695 }, { "epoch": 0.91, "learning_rate": 4.494062706378166e-06, "loss": 2.4181, "step": 700 }, { "epoch": 0.92, "learning_rate": 3.843168162191857e-06, "loss": 2.5981, "step": 705 }, { "epoch": 0.93, "learning_rate": 3.242249750613502e-06, "loss": 2.4747, "step": 710 }, { "epoch": 0.93, "learning_rate": 2.6916197903588127e-06, "loss": 2.4181, "step": 715 }, { "epoch": 0.94, "learning_rate": 2.1915644634429854e-06, "loss": 2.7945, "step": 720 }, { "epoch": 0.95, "learning_rate": 1.7423436664417593e-06, "loss": 2.5029, "step": 725 }, { "epoch": 0.95, "learning_rate": 1.3441908754139198e-06, "loss": 2.335, "step": 730 }, { "epoch": 0.96, "learning_rate": 9.973130245554018e-07, "loss": 2.6024, "step": 735 }, { "epoch": 0.97, "learning_rate": 7.018903986483083e-07, "loss": 2.6012, "step": 740 }, { "epoch": 0.97, "learning_rate": 4.58076539360397e-07, "loss": 2.5945, "step": 745 }, { "epoch": 0.98, "learning_rate": 2.659981654440591e-07, "loss": 2.7643, "step": 750 }, { "epoch": 0.99, "learning_rate": 1.2575510687601322e-07, "loss": 2.7358, "step": 755 }, { "epoch": 0.99, "learning_rate": 3.742025297219787e-08, "loss": 2.778, "step": 760 }, { "epoch": 1.0, "learning_rate": 1.039514504497241e-09, "loss": 2.744, "step": 765 }, { "epoch": 1.0, "eval_loss": 2.4906539916992188, "eval_runtime": 232.4011, "eval_samples_per_second": 3.296, "eval_steps_per_second": 3.296, "step": 766 }, { "epoch": 1.0, "step": 766, "total_flos": 5.492443574383411e+16, "train_loss": 2.5803672159309485, "train_runtime": 1034.6215, "train_samples_per_second": 0.74, "train_steps_per_second": 0.74 } ], "logging_steps": 5, "max_steps": 766, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 5.492443574383411e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }