{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7711654268508679, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017711654268508677, "grad_norm": 2066.203369140625, "learning_rate": 7.000000000000001e-07, "loss": 190.4975, "step": 100 }, { "epoch": 0.035423308537017355, "grad_norm": 0.0, "learning_rate": 1.4000000000000001e-06, "loss": 143.7361, "step": 200 }, { "epoch": 0.053134962805526036, "grad_norm": 775.2849731445312, "learning_rate": 2.1e-06, "loss": 112.8461, "step": 300 }, { "epoch": 0.07084661707403471, "grad_norm": 508.002685546875, "learning_rate": 2.8000000000000003e-06, "loss": 153.5849, "step": 400 }, { "epoch": 0.08855827134254339, "grad_norm": 460.08441162109375, "learning_rate": 3.5e-06, "loss": 74.8366, "step": 500 }, { "epoch": 0.10626992561105207, "grad_norm": 367.09698486328125, "learning_rate": 4.2e-06, "loss": 189.2716, "step": 600 }, { "epoch": 0.12398157987956075, "grad_norm": 253.3091583251953, "learning_rate": 4.9e-06, "loss": 120.8286, "step": 700 }, { "epoch": 0.14169323414806942, "grad_norm": 459.9718017578125, "learning_rate": 5.600000000000001e-06, "loss": 187.236, "step": 800 }, { "epoch": 0.1594048884165781, "grad_norm": 569.0575561523438, "learning_rate": 6.3e-06, "loss": 144.0774, "step": 900 }, { "epoch": 0.17711654268508678, "grad_norm": 882.364501953125, "learning_rate": 7e-06, "loss": 117.9007, "step": 1000 }, { "epoch": 0.19482819695359546, "grad_norm": 0.0, "learning_rate": 6.9978678945668355e-06, "loss": 129.7736, "step": 1100 }, { "epoch": 0.21253985122210414, "grad_norm": 1200.3560791015625, "learning_rate": 6.991474175909385e-06, "loss": 120.1585, "step": 1200 }, { "epoch": 0.23025150549061282, "grad_norm": 1630.1290283203125, "learning_rate": 6.980826633788957e-06, "loss": 101.0185, "step": 1300 }, { "epoch": 0.2479631597591215, "grad_norm": 881.6813354492188, "learning_rate": 6.965938240595496e-06, "loss": 118.4921, "step": 1400 }, { "epoch": 0.26567481402763016, "grad_norm": 450.1161804199219, "learning_rate": 6.946827135542729e-06, "loss": 72.2752, "step": 1500 }, { "epoch": 0.28338646829613884, "grad_norm": 0.0, "learning_rate": 6.92351660256832e-06, "loss": 86.3471, "step": 1600 }, { "epoch": 0.3010981225646475, "grad_norm": 629.2029418945312, "learning_rate": 6.896035041965987e-06, "loss": 124.8669, "step": 1700 }, { "epoch": 0.3188097768331562, "grad_norm": 3508.349365234375, "learning_rate": 6.864415935784116e-06, "loss": 63.879, "step": 1800 }, { "epoch": 0.3365214311016649, "grad_norm": 1110.3475341796875, "learning_rate": 6.828697807033037e-06, "loss": 115.0453, "step": 1900 }, { "epoch": 0.35423308537017356, "grad_norm": 2961.45166015625, "learning_rate": 6.788924172750679e-06, "loss": 75.6149, "step": 2000 }, { "epoch": 0.37194473963868224, "grad_norm": 430.79962158203125, "learning_rate": 6.745143490983756e-06, "loss": 180.1111, "step": 2100 }, { "epoch": 0.3896563939071909, "grad_norm": 1095.099365234375, "learning_rate": 6.697409101749103e-06, "loss": 103.6415, "step": 2200 }, { "epoch": 0.4073680481756996, "grad_norm": 520.1317138671875, "learning_rate": 6.645779162047084e-06, "loss": 135.7425, "step": 2300 }, { "epoch": 0.4250797024442083, "grad_norm": 1034.732666015625, "learning_rate": 6.590316575006244e-06, "loss": 67.7314, "step": 2400 }, { "epoch": 0.44279135671271697, "grad_norm": 232.021240234375, "learning_rate": 6.531088913245536e-06, "loss": 72.1622, "step": 2500 }, { "epoch": 0.46050301098122565, "grad_norm": 571.3900146484375, "learning_rate": 6.46816833654749e-06, "loss": 84.1363, "step": 2600 }, { "epoch": 0.4782146652497343, "grad_norm": 146.1368408203125, "learning_rate": 6.4016315039426455e-06, "loss": 167.3046, "step": 2700 }, { "epoch": 0.495926319518243, "grad_norm": 0.0, "learning_rate": 6.331559480312316e-06, "loss": 126.0654, "step": 2800 }, { "epoch": 0.5136379737867517, "grad_norm": 0.0, "learning_rate": 6.2580376376235265e-06, "loss": 65.9502, "step": 2900 }, { "epoch": 0.5313496280552603, "grad_norm": 2008.842041015625, "learning_rate": 6.181155550916423e-06, "loss": 102.3511, "step": 3000 }, { "epoch": 0.549061282323769, "grad_norm": 0.0, "learning_rate": 6.10100688917088e-06, "loss": 114.994, "step": 3100 }, { "epoch": 0.5667729365922777, "grad_norm": 6229.5458984375, "learning_rate": 6.017689301185278e-06, "loss": 66.3055, "step": 3200 }, { "epoch": 0.5844845908607864, "grad_norm": 345.4675598144531, "learning_rate": 5.931304296606491e-06, "loss": 72.0315, "step": 3300 }, { "epoch": 0.602196245129295, "grad_norm": 0.0, "learning_rate": 5.841957122256004e-06, "loss": 70.9246, "step": 3400 }, { "epoch": 0.6199078993978038, "grad_norm": 407.43560791015625, "learning_rate": 5.749756633902887e-06, "loss": 47.9195, "step": 3500 }, { "epoch": 0.6376195536663124, "grad_norm": 673.2072143554688, "learning_rate": 5.6548151636398035e-06, "loss": 82.5247, "step": 3600 }, { "epoch": 0.6553312079348211, "grad_norm": 350.8990173339844, "learning_rate": 5.557248383023656e-06, "loss": 71.3394, "step": 3700 }, { "epoch": 0.6730428622033298, "grad_norm": 589.0938110351562, "learning_rate": 5.457175162147614e-06, "loss": 43.6511, "step": 3800 }, { "epoch": 0.6907545164718385, "grad_norm": 337.3728332519531, "learning_rate": 5.354717424816217e-06, "loss": 59.6694, "step": 3900 }, { "epoch": 0.7084661707403471, "grad_norm": 455.6134033203125, "learning_rate": 5.25e-06, "loss": 97.9412, "step": 4000 }, { "epoch": 0.7261778250088559, "grad_norm": 556.7127685546875, "learning_rate": 5.143150469750618e-06, "loss": 85.2299, "step": 4100 }, { "epoch": 0.7438894792773645, "grad_norm": 348.9285888671875, "learning_rate": 5.0342990137617705e-06, "loss": 112.7235, "step": 4200 }, { "epoch": 0.7616011335458732, "grad_norm": 4914.0361328125, "learning_rate": 4.923578250765301e-06, "loss": 90.57, "step": 4300 }, { "epoch": 0.7793127878143818, "grad_norm": 658.976318359375, "learning_rate": 4.811123076955693e-06, "loss": 144.314, "step": 4400 }, { "epoch": 0.7970244420828906, "grad_norm": 145.98973083496094, "learning_rate": 4.697070501639841e-06, "loss": 101.2893, "step": 4500 }, { "epoch": 0.8147360963513992, "grad_norm": 175.7976531982422, "learning_rate": 4.581559480312316e-06, "loss": 78.9849, "step": 4600 }, { "epoch": 0.832447750619908, "grad_norm": 506.91827392578125, "learning_rate": 4.464730745359497e-06, "loss": 54.5608, "step": 4700 }, { "epoch": 0.8501594048884166, "grad_norm": 3711.033203125, "learning_rate": 4.346726634598836e-06, "loss": 84.2886, "step": 4800 }, { "epoch": 0.8678710591569253, "grad_norm": 271.1780700683594, "learning_rate": 4.227690917862157e-06, "loss": 96.7124, "step": 4900 }, { "epoch": 0.8855827134254339, "grad_norm": 881.6817626953125, "learning_rate": 4.107768621834257e-06, "loss": 82.2586, "step": 5000 }, { "epoch": 0.9032943676939427, "grad_norm": 561.7996826171875, "learning_rate": 3.987105853360229e-06, "loss": 76.9558, "step": 5100 }, { "epoch": 0.9210060219624513, "grad_norm": 1692.2088623046875, "learning_rate": 3.865849621436788e-06, "loss": 64.0634, "step": 5200 }, { "epoch": 0.9387176762309599, "grad_norm": 187.64515686035156, "learning_rate": 3.744147658104438e-06, "loss": 89.3247, "step": 5300 }, { "epoch": 0.9564293304994687, "grad_norm": 301.86456298828125, "learning_rate": 3.6221482384587538e-06, "loss": 64.4865, "step": 5400 }, { "epoch": 0.9741409847679773, "grad_norm": 601.5034790039062, "learning_rate": 3.5e-06, "loss": 55.067, "step": 5500 }, { "epoch": 0.991852639036486, "grad_norm": 844.2435302734375, "learning_rate": 3.3778517615412474e-06, "loss": 83.8524, "step": 5600 }, { "epoch": 1.0, "eval_loss": 215.96673583984375, "eval_runtime": 61.7801, "eval_samples_per_second": 40.628, "eval_steps_per_second": 10.165, "step": 5646 }, { "epoch": 1.0095642933049946, "grad_norm": 247.68511962890625, "learning_rate": 3.2558523418955614e-06, "loss": 38.0757, "step": 5700 }, { "epoch": 1.0272759475735034, "grad_norm": 185.5315704345703, "learning_rate": 3.1341503785632136e-06, "loss": 54.2678, "step": 5800 }, { "epoch": 1.0449876018420121, "grad_norm": 159.9699249267578, "learning_rate": 3.0128941466397717e-06, "loss": 54.5598, "step": 5900 }, { "epoch": 1.0626992561105206, "grad_norm": 1924.46728515625, "learning_rate": 2.8922313781657437e-06, "loss": 50.845, "step": 6000 }, { "epoch": 1.0804109103790294, "grad_norm": 100.4423828125, "learning_rate": 2.772309082137843e-06, "loss": 58.8289, "step": 6100 }, { "epoch": 1.098122564647538, "grad_norm": 69.27734375, "learning_rate": 2.653273365401163e-06, "loss": 73.329, "step": 6200 }, { "epoch": 1.1158342189160468, "grad_norm": 2327.090087890625, "learning_rate": 2.535269254640503e-06, "loss": 67.806, "step": 6300 }, { "epoch": 1.1335458731845554, "grad_norm": 89.2359848022461, "learning_rate": 2.418440519687684e-06, "loss": 58.6152, "step": 6400 }, { "epoch": 1.151257527453064, "grad_norm": 1888.966552734375, "learning_rate": 2.3029294983601598e-06, "loss": 53.2992, "step": 6500 }, { "epoch": 1.1689691817215728, "grad_norm": 910.3232421875, "learning_rate": 2.1888769230443076e-06, "loss": 68.5465, "step": 6600 }, { "epoch": 1.1866808359900816, "grad_norm": 279.23175048828125, "learning_rate": 2.0764217492347e-06, "loss": 89.0601, "step": 6700 }, { "epoch": 1.20439249025859, "grad_norm": 210.86968994140625, "learning_rate": 1.9657009862382285e-06, "loss": 47.2063, "step": 6800 }, { "epoch": 1.2221041445270988, "grad_norm": 347.5172119140625, "learning_rate": 1.856849530249383e-06, "loss": 57.3233, "step": 6900 }, { "epoch": 1.2398157987956075, "grad_norm": 0.0, "learning_rate": 1.7500000000000008e-06, "loss": 74.2723, "step": 7000 }, { "epoch": 1.257527453064116, "grad_norm": 1560.5267333984375, "learning_rate": 1.6452825751837832e-06, "loss": 68.7949, "step": 7100 }, { "epoch": 1.2752391073326248, "grad_norm": 27841.501953125, "learning_rate": 1.5428248378523867e-06, "loss": 66.1273, "step": 7200 }, { "epoch": 1.2929507616011335, "grad_norm": 814.24755859375, "learning_rate": 1.4427516169763444e-06, "loss": 41.6258, "step": 7300 }, { "epoch": 1.3106624158696423, "grad_norm": 115.33793640136719, "learning_rate": 1.345184836360196e-06, "loss": 42.7393, "step": 7400 }, { "epoch": 1.328374070138151, "grad_norm": 429.6169738769531, "learning_rate": 1.2502433660971122e-06, "loss": 56.4888, "step": 7500 }, { "epoch": 1.3460857244066595, "grad_norm": 187.01365661621094, "learning_rate": 1.1580428777439973e-06, "loss": 66.9346, "step": 7600 }, { "epoch": 1.3637973786751683, "grad_norm": 320.4998474121094, "learning_rate": 1.0686957033935093e-06, "loss": 67.0082, "step": 7700 }, { "epoch": 1.381509032943677, "grad_norm": 361.787841796875, "learning_rate": 9.823106988147216e-07, "loss": 48.5887, "step": 7800 }, { "epoch": 1.3992206872121855, "grad_norm": 455.1145324707031, "learning_rate": 8.989931108291198e-07, "loss": 52.0289, "step": 7900 }, { "epoch": 1.4169323414806942, "grad_norm": 0.0, "learning_rate": 8.188444490835774e-07, "loss": 67.7081, "step": 8000 }, { "epoch": 1.434643995749203, "grad_norm": 592.6516723632812, "learning_rate": 7.419623623764733e-07, "loss": 66.9211, "step": 8100 }, { "epoch": 1.4523556500177117, "grad_norm": 659.7842407226562, "learning_rate": 6.684405196876843e-07, "loss": 40.6013, "step": 8200 }, { "epoch": 1.4700673042862205, "grad_norm": 2484.764892578125, "learning_rate": 5.983684960573543e-07, "loss": 73.2992, "step": 8300 }, { "epoch": 1.487778958554729, "grad_norm": 1122.31396484375, "learning_rate": 5.318316634525092e-07, "loss": 56.8008, "step": 8400 }, { "epoch": 1.5054906128232377, "grad_norm": 578.3250732421875, "learning_rate": 4.6891108675446453e-07, "loss": 103.5606, "step": 8500 }, { "epoch": 1.5232022670917464, "grad_norm": 553.962890625, "learning_rate": 4.0968342499375555e-07, "loss": 53.6275, "step": 8600 }, { "epoch": 1.540913921360255, "grad_norm": 537.8104248046875, "learning_rate": 3.5422083795291617e-07, "loss": 57.2888, "step": 8700 }, { "epoch": 1.5586255756287637, "grad_norm": 1050.9219970703125, "learning_rate": 3.025908982508966e-07, "loss": 61.4504, "step": 8800 }, { "epoch": 1.5763372298972724, "grad_norm": 168.5572052001953, "learning_rate": 2.548565090162444e-07, "loss": 63.5764, "step": 8900 }, { "epoch": 1.594048884165781, "grad_norm": 336.8919372558594, "learning_rate": 2.1107582724932088e-07, "loss": 88.5666, "step": 9000 }, { "epoch": 1.61176053843429, "grad_norm": 671.1005249023438, "learning_rate": 1.7130219296696264e-07, "loss": 44.282, "step": 9100 }, { "epoch": 1.6294721927027984, "grad_norm": 136.64625549316406, "learning_rate": 1.3558406421588387e-07, "loss": 36.684, "step": 9200 }, { "epoch": 1.6471838469713072, "grad_norm": 207.37889099121094, "learning_rate": 1.0396495803401234e-07, "loss": 68.6406, "step": 9300 }, { "epoch": 1.664895501239816, "grad_norm": 300.9512939453125, "learning_rate": 7.648339743168009e-08, "loss": 36.0643, "step": 9400 }, { "epoch": 1.6826071555083244, "grad_norm": 451.09735107421875, "learning_rate": 5.3172864457271926e-08, "loss": 76.4972, "step": 9500 }, { "epoch": 1.7003188097768331, "grad_norm": 919.307373046875, "learning_rate": 3.406175940450373e-08, "loss": 44.2545, "step": 9600 }, { "epoch": 1.7180304640453419, "grad_norm": 295.6180725097656, "learning_rate": 1.9173366211043486e-08, "loss": 46.4226, "step": 9700 }, { "epoch": 1.7357421183138504, "grad_norm": 238.06109619140625, "learning_rate": 8.525824090615308e-09, "loss": 68.5876, "step": 9800 }, { "epoch": 1.7534537725823593, "grad_norm": 302.9677429199219, "learning_rate": 2.1321054331648324e-09, "loss": 60.3941, "step": 9900 }, { "epoch": 1.7711654268508679, "grad_norm": 463.6520690917969, "learning_rate": 0.0, "loss": 100.1996, "step": 10000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }