{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 43, "global_step": 127, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007874015748031496, "grad_norm": 118.11203002929688, "learning_rate": 2.0000000000000003e-06, "loss": 4.6099, "step": 1 }, { "epoch": 0.007874015748031496, "eval_loss": 3.1001100540161133, "eval_runtime": 5.3966, "eval_samples_per_second": 30.204, "eval_steps_per_second": 3.891, "step": 1 }, { "epoch": 0.015748031496062992, "grad_norm": 118.4310302734375, "learning_rate": 4.000000000000001e-06, "loss": 4.5857, "step": 2 }, { "epoch": 0.023622047244094488, "grad_norm": 103.37439727783203, "learning_rate": 6e-06, "loss": 4.3069, "step": 3 }, { "epoch": 0.031496062992125984, "grad_norm": 75.05075073242188, "learning_rate": 8.000000000000001e-06, "loss": 3.8754, "step": 4 }, { "epoch": 0.03937007874015748, "grad_norm": 50.459983825683594, "learning_rate": 1e-05, "loss": 3.2841, "step": 5 }, { "epoch": 0.047244094488188976, "grad_norm": 47.4603385925293, "learning_rate": 1.2e-05, "loss": 2.4285, "step": 6 }, { "epoch": 0.05511811023622047, "grad_norm": 32.362667083740234, "learning_rate": 1.4e-05, "loss": 1.8177, "step": 7 }, { "epoch": 0.06299212598425197, "grad_norm": 22.846933364868164, "learning_rate": 1.6000000000000003e-05, "loss": 1.1567, "step": 8 }, { "epoch": 0.07086614173228346, "grad_norm": 17.060213088989258, "learning_rate": 1.8e-05, "loss": 0.8257, "step": 9 }, { "epoch": 0.07874015748031496, "grad_norm": 14.415579795837402, "learning_rate": 2e-05, "loss": 0.4257, "step": 10 }, { "epoch": 0.08661417322834646, "grad_norm": 7.753712177276611, "learning_rate": 1.999964147509006e-05, "loss": 0.2976, "step": 11 }, { "epoch": 0.09448818897637795, "grad_norm": 26.883708953857422, "learning_rate": 1.9998565926068253e-05, "loss": 0.3365, "step": 12 }, { "epoch": 0.10236220472440945, "grad_norm": 10.675631523132324, "learning_rate": 1.9996773430056806e-05, "loss": 0.2161, "step": 13 }, { "epoch": 0.11023622047244094, "grad_norm": 6.670111179351807, "learning_rate": 1.999426411558661e-05, "loss": 0.1816, "step": 14 }, { "epoch": 0.11811023622047244, "grad_norm": 8.878239631652832, "learning_rate": 1.9991038162588018e-05, "loss": 0.1567, "step": 15 }, { "epoch": 0.12598425196850394, "grad_norm": 2.9917383193969727, "learning_rate": 1.9987095802377933e-05, "loss": 0.0813, "step": 16 }, { "epoch": 0.13385826771653545, "grad_norm": 1.0548763275146484, "learning_rate": 1.9982437317643218e-05, "loss": 0.0217, "step": 17 }, { "epoch": 0.14173228346456693, "grad_norm": 2.8778488636016846, "learning_rate": 1.9977063042420438e-05, "loss": 0.0618, "step": 18 }, { "epoch": 0.14960629921259844, "grad_norm": 0.9811734557151794, "learning_rate": 1.99709733620719e-05, "loss": 0.0175, "step": 19 }, { "epoch": 0.15748031496062992, "grad_norm": 0.7218202948570251, "learning_rate": 1.996416871325803e-05, "loss": 0.0302, "step": 20 }, { "epoch": 0.16535433070866143, "grad_norm": 1.2746995687484741, "learning_rate": 1.995664958390604e-05, "loss": 0.0453, "step": 21 }, { "epoch": 0.1732283464566929, "grad_norm": 0.9413469433784485, "learning_rate": 1.9948416513174976e-05, "loss": 0.0175, "step": 22 }, { "epoch": 0.18110236220472442, "grad_norm": 1.4161137342453003, "learning_rate": 1.9939470091417012e-05, "loss": 0.0277, "step": 23 }, { "epoch": 0.1889763779527559, "grad_norm": 2.2721235752105713, "learning_rate": 1.992981096013517e-05, "loss": 0.0589, "step": 24 }, { "epoch": 
0.1968503937007874, "grad_norm": 1.143970251083374, "learning_rate": 1.9919439811937283e-05, "loss": 0.0182, "step": 25 }, { "epoch": 0.2047244094488189, "grad_norm": 0.8054028749465942, "learning_rate": 1.9908357390486342e-05, "loss": 0.0211, "step": 26 }, { "epoch": 0.2125984251968504, "grad_norm": 1.4449081420898438, "learning_rate": 1.989656449044718e-05, "loss": 0.0244, "step": 27 }, { "epoch": 0.2204724409448819, "grad_norm": 0.49216631054878235, "learning_rate": 1.988406195742948e-05, "loss": 0.005, "step": 28 }, { "epoch": 0.2283464566929134, "grad_norm": 0.9945647716522217, "learning_rate": 1.987085068792715e-05, "loss": 0.0373, "step": 29 }, { "epoch": 0.23622047244094488, "grad_norm": 1.1753748655319214, "learning_rate": 1.9856931629254032e-05, "loss": 0.0217, "step": 30 }, { "epoch": 0.2440944881889764, "grad_norm": 0.5960403680801392, "learning_rate": 1.984230577947597e-05, "loss": 0.0157, "step": 31 }, { "epoch": 0.25196850393700787, "grad_norm": 0.3657272160053253, "learning_rate": 1.9826974187339267e-05, "loss": 0.0082, "step": 32 }, { "epoch": 0.25984251968503935, "grad_norm": 1.1290266513824463, "learning_rate": 1.981093795219546e-05, "loss": 0.0236, "step": 33 }, { "epoch": 0.2677165354330709, "grad_norm": 1.673962116241455, "learning_rate": 1.9794198223922496e-05, "loss": 0.0182, "step": 34 }, { "epoch": 0.2755905511811024, "grad_norm": 0.540355384349823, "learning_rate": 1.9776756202842297e-05, "loss": 0.011, "step": 35 }, { "epoch": 0.28346456692913385, "grad_norm": 0.3380790054798126, "learning_rate": 1.9758613139634662e-05, "loss": 0.0048, "step": 36 }, { "epoch": 0.29133858267716534, "grad_norm": 1.886232852935791, "learning_rate": 1.9739770335247616e-05, "loss": 0.0157, "step": 37 }, { "epoch": 0.2992125984251969, "grad_norm": 2.140639305114746, "learning_rate": 1.972022914080411e-05, "loss": 0.0393, "step": 38 }, { "epoch": 0.30708661417322836, "grad_norm": 0.35308870673179626, "learning_rate": 1.9699990957505136e-05, "loss": 0.0074, "step": 39 }, { "epoch": 0.31496062992125984, "grad_norm": 0.3918301463127136, "learning_rate": 1.9679057236529266e-05, "loss": 0.0083, "step": 40 }, { "epoch": 0.3228346456692913, "grad_norm": 0.4406338632106781, "learning_rate": 1.965742947892858e-05, "loss": 0.0152, "step": 41 }, { "epoch": 0.33070866141732286, "grad_norm": 0.6819682121276855, "learning_rate": 1.9635109235521057e-05, "loss": 0.0091, "step": 42 }, { "epoch": 0.33858267716535434, "grad_norm": 0.6794927716255188, "learning_rate": 1.961209810677934e-05, "loss": 0.0071, "step": 43 }, { "epoch": 0.33858267716535434, "eval_loss": 0.3895845115184784, "eval_runtime": 6.5602, "eval_samples_per_second": 24.847, "eval_steps_per_second": 3.201, "step": 43 }, { "epoch": 0.3464566929133858, "grad_norm": 0.3874967694282532, "learning_rate": 1.9588397742716004e-05, "loss": 0.0089, "step": 44 }, { "epoch": 0.3543307086614173, "grad_norm": 0.5577577352523804, "learning_rate": 1.9564009842765225e-05, "loss": 0.0098, "step": 45 }, { "epoch": 0.36220472440944884, "grad_norm": 0.8152347207069397, "learning_rate": 1.9538936155660934e-05, "loss": 0.0118, "step": 46 }, { "epoch": 0.3700787401574803, "grad_norm": 0.2971118688583374, "learning_rate": 1.951317847931141e-05, "loss": 0.0084, "step": 47 }, { "epoch": 0.3779527559055118, "grad_norm": 1.0286651849746704, "learning_rate": 1.9486738660670373e-05, "loss": 0.0123, "step": 48 }, { "epoch": 0.3858267716535433, "grad_norm": 0.5227222442626953, "learning_rate": 1.945961859560454e-05, "loss": 0.0144, "step": 49 }, { "epoch": 
0.3937007874015748, "grad_norm": 0.461935818195343, "learning_rate": 1.943182022875769e-05, "loss": 0.0119, "step": 50 }, { "epoch": 0.4015748031496063, "grad_norm": 1.2550626993179321, "learning_rate": 1.940334555341122e-05, "loss": 0.013, "step": 51 }, { "epoch": 0.4094488188976378, "grad_norm": 0.37549659609794617, "learning_rate": 1.9374196611341212e-05, "loss": 0.0181, "step": 52 }, { "epoch": 0.41732283464566927, "grad_norm": 0.3444191515445709, "learning_rate": 1.9344375492672024e-05, "loss": 0.0111, "step": 53 }, { "epoch": 0.4251968503937008, "grad_norm": 0.3489387333393097, "learning_rate": 1.9313884335726443e-05, "loss": 0.0111, "step": 54 }, { "epoch": 0.4330708661417323, "grad_norm": 0.26080814003944397, "learning_rate": 1.9282725326872324e-05, "loss": 0.0091, "step": 55 }, { "epoch": 0.4409448818897638, "grad_norm": 0.1390451341867447, "learning_rate": 1.9250900700365837e-05, "loss": 0.0033, "step": 56 }, { "epoch": 0.44881889763779526, "grad_norm": 0.20499111711978912, "learning_rate": 1.921841273819125e-05, "loss": 0.0066, "step": 57 }, { "epoch": 0.4566929133858268, "grad_norm": 2.185487747192383, "learning_rate": 1.918526376989731e-05, "loss": 0.0095, "step": 58 }, { "epoch": 0.4645669291338583, "grad_norm": 0.23939816653728485, "learning_rate": 1.9151456172430186e-05, "loss": 0.0048, "step": 59 }, { "epoch": 0.47244094488188976, "grad_norm": 0.41510018706321716, "learning_rate": 1.911699236996305e-05, "loss": 0.0077, "step": 60 }, { "epoch": 0.48031496062992124, "grad_norm": 0.264318585395813, "learning_rate": 1.9081874833722234e-05, "loss": 0.0129, "step": 61 }, { "epoch": 0.4881889763779528, "grad_norm": 1.0443968772888184, "learning_rate": 1.9046106081810047e-05, "loss": 0.0035, "step": 62 }, { "epoch": 0.49606299212598426, "grad_norm": 0.2800132632255554, "learning_rate": 1.900968867902419e-05, "loss": 0.0057, "step": 63 }, { "epoch": 0.5039370078740157, "grad_norm": 1.114960789680481, "learning_rate": 1.8972625236673887e-05, "loss": 0.0123, "step": 64 }, { "epoch": 0.5118110236220472, "grad_norm": 0.5027065873146057, "learning_rate": 1.8934918412392596e-05, "loss": 0.0052, "step": 65 }, { "epoch": 0.5196850393700787, "grad_norm": 0.5564169883728027, "learning_rate": 1.8896570909947477e-05, "loss": 0.0085, "step": 66 }, { "epoch": 0.5275590551181102, "grad_norm": 0.7567198872566223, "learning_rate": 1.8857585479045493e-05, "loss": 0.0054, "step": 67 }, { "epoch": 0.5354330708661418, "grad_norm": 0.13573969900608063, "learning_rate": 1.8817964915136277e-05, "loss": 0.0008, "step": 68 }, { "epoch": 0.5433070866141733, "grad_norm": 0.2704390287399292, "learning_rate": 1.8777712059211643e-05, "loss": 0.0078, "step": 69 }, { "epoch": 0.5511811023622047, "grad_norm": 0.6014392971992493, "learning_rate": 1.8736829797601903e-05, "loss": 0.0059, "step": 70 }, { "epoch": 0.5590551181102362, "grad_norm": 0.5487034916877747, "learning_rate": 1.8695321061768886e-05, "loss": 0.0097, "step": 71 }, { "epoch": 0.5669291338582677, "grad_norm": 0.6670834422111511, "learning_rate": 1.8653188828095754e-05, "loss": 0.011, "step": 72 }, { "epoch": 0.5748031496062992, "grad_norm": 0.1795203685760498, "learning_rate": 1.8610436117673557e-05, "loss": 0.0067, "step": 73 }, { "epoch": 0.5826771653543307, "grad_norm": 1.768436074256897, "learning_rate": 1.8567065996084628e-05, "loss": 0.0096, "step": 74 }, { "epoch": 0.5905511811023622, "grad_norm": 0.26233312487602234, "learning_rate": 1.8523081573182754e-05, "loss": 0.0124, "step": 75 }, { "epoch": 0.5984251968503937, "grad_norm": 
0.3775719404220581, "learning_rate": 1.847848600287019e-05, "loss": 0.0052, "step": 76 }, { "epoch": 0.6062992125984252, "grad_norm": 1.0016565322875977, "learning_rate": 1.8433282482871497e-05, "loss": 0.0058, "step": 77 }, { "epoch": 0.6141732283464567, "grad_norm": 0.20153792202472687, "learning_rate": 1.8387474254504265e-05, "loss": 0.0056, "step": 78 }, { "epoch": 0.6220472440944882, "grad_norm": 0.5119822025299072, "learning_rate": 1.8341064602446686e-05, "loss": 0.0079, "step": 79 }, { "epoch": 0.6299212598425197, "grad_norm": 1.5781004428863525, "learning_rate": 1.829405685450202e-05, "loss": 0.008, "step": 80 }, { "epoch": 0.6377952755905512, "grad_norm": 0.23826757073402405, "learning_rate": 1.824645438135999e-05, "loss": 0.0041, "step": 81 }, { "epoch": 0.6456692913385826, "grad_norm": 0.6386727690696716, "learning_rate": 1.8198260596355077e-05, "loss": 0.0188, "step": 82 }, { "epoch": 0.6535433070866141, "grad_norm": 0.9503199458122253, "learning_rate": 1.814947895522176e-05, "loss": 0.008, "step": 83 }, { "epoch": 0.6614173228346457, "grad_norm": 0.2040701061487198, "learning_rate": 1.8100112955846746e-05, "loss": 0.0038, "step": 84 }, { "epoch": 0.6692913385826772, "grad_norm": 0.3660199046134949, "learning_rate": 1.805016613801813e-05, "loss": 0.0148, "step": 85 }, { "epoch": 0.6771653543307087, "grad_norm": 1.0502821207046509, "learning_rate": 1.7999642083171576e-05, "loss": 0.0098, "step": 86 }, { "epoch": 0.6771653543307087, "eval_loss": 0.3526817262172699, "eval_runtime": 6.6167, "eval_samples_per_second": 24.635, "eval_steps_per_second": 3.174, "step": 86 }, { "epoch": 0.6850393700787402, "grad_norm": 0.13735969364643097, "learning_rate": 1.7948544414133534e-05, "loss": 0.0022, "step": 87 }, { "epoch": 0.6929133858267716, "grad_norm": 0.6425012946128845, "learning_rate": 1.7896876794861443e-05, "loss": 0.0086, "step": 88 }, { "epoch": 0.7007874015748031, "grad_norm": 0.7540380954742432, "learning_rate": 1.7844642930181008e-05, "loss": 0.0062, "step": 89 }, { "epoch": 0.7086614173228346, "grad_norm": 0.6727365255355835, "learning_rate": 1.779184656552056e-05, "loss": 0.0027, "step": 90 }, { "epoch": 0.7165354330708661, "grad_norm": 0.14059337973594666, "learning_rate": 1.773849148664247e-05, "loss": 0.0056, "step": 91 }, { "epoch": 0.7244094488188977, "grad_norm": 0.33292093873023987, "learning_rate": 1.7684581519371714e-05, "loss": 0.0047, "step": 92 }, { "epoch": 0.7322834645669292, "grad_norm": 0.3809877932071686, "learning_rate": 1.7630120529321518e-05, "loss": 0.0139, "step": 93 }, { "epoch": 0.7401574803149606, "grad_norm": 1.729589819908142, "learning_rate": 1.7575112421616203e-05, "loss": 0.0128, "step": 94 }, { "epoch": 0.7480314960629921, "grad_norm": 0.18192608654499054, "learning_rate": 1.751956114061113e-05, "loss": 0.0025, "step": 95 }, { "epoch": 0.7559055118110236, "grad_norm": 1.0333118438720703, "learning_rate": 1.7463470669609907e-05, "loss": 0.006, "step": 96 }, { "epoch": 0.7637795275590551, "grad_norm": 0.7247685194015503, "learning_rate": 1.7406845030578747e-05, "loss": 0.0073, "step": 97 }, { "epoch": 0.7716535433070866, "grad_norm": 0.06979379802942276, "learning_rate": 1.734968828385808e-05, "loss": 0.0005, "step": 98 }, { "epoch": 0.7795275590551181, "grad_norm": 0.5137119293212891, "learning_rate": 1.729200452787139e-05, "loss": 0.0082, "step": 99 }, { "epoch": 0.7874015748031497, "grad_norm": 0.4704137146472931, "learning_rate": 1.7233797898831376e-05, "loss": 0.005, "step": 100 }, { "epoch": 0.7952755905511811, "grad_norm": 
0.28564465045928955, "learning_rate": 1.717507257044331e-05, "loss": 0.0052, "step": 101 }, { "epoch": 0.8031496062992126, "grad_norm": 0.17685537040233612, "learning_rate": 1.711583275360582e-05, "loss": 0.0024, "step": 102 }, { "epoch": 0.8110236220472441, "grad_norm": 0.45714935660362244, "learning_rate": 1.7056082696108896e-05, "loss": 0.0072, "step": 103 }, { "epoch": 0.8188976377952756, "grad_norm": 0.4373086988925934, "learning_rate": 1.699582668232934e-05, "loss": 0.0051, "step": 104 }, { "epoch": 0.8267716535433071, "grad_norm": 0.8478983640670776, "learning_rate": 1.6935069032923525e-05, "loss": 0.022, "step": 105 }, { "epoch": 0.8346456692913385, "grad_norm": 0.16181086003780365, "learning_rate": 1.6873814104517617e-05, "loss": 0.0058, "step": 106 }, { "epoch": 0.84251968503937, "grad_norm": 0.09503592550754547, "learning_rate": 1.6812066289395157e-05, "loss": 0.0009, "step": 107 }, { "epoch": 0.8503937007874016, "grad_norm": 0.7462632060050964, "learning_rate": 1.6749830015182106e-05, "loss": 0.0044, "step": 108 }, { "epoch": 0.8582677165354331, "grad_norm": 0.07221701741218567, "learning_rate": 1.6687109744529394e-05, "loss": 0.0015, "step": 109 }, { "epoch": 0.8661417322834646, "grad_norm": 0.08999036252498627, "learning_rate": 1.6623909974792888e-05, "loss": 0.0023, "step": 110 }, { "epoch": 0.8740157480314961, "grad_norm": 0.42536938190460205, "learning_rate": 1.656023523771095e-05, "loss": 0.005, "step": 111 }, { "epoch": 0.8818897637795275, "grad_norm": 0.7885191440582275, "learning_rate": 1.6496090099079452e-05, "loss": 0.0103, "step": 112 }, { "epoch": 0.889763779527559, "grad_norm": 0.16610018908977509, "learning_rate": 1.64314791584244e-05, "loss": 0.006, "step": 113 }, { "epoch": 0.8976377952755905, "grad_norm": 0.32151034474372864, "learning_rate": 1.6366407048672135e-05, "loss": 0.0086, "step": 114 }, { "epoch": 0.905511811023622, "grad_norm": 0.557732343673706, "learning_rate": 1.6300878435817115e-05, "loss": 0.0064, "step": 115 }, { "epoch": 0.9133858267716536, "grad_norm": 0.2238176167011261, "learning_rate": 1.6234898018587336e-05, "loss": 0.0065, "step": 116 }, { "epoch": 0.9212598425196851, "grad_norm": 0.2980042099952698, "learning_rate": 1.616847052810744e-05, "loss": 0.0095, "step": 117 }, { "epoch": 0.9291338582677166, "grad_norm": 0.1529705822467804, "learning_rate": 1.6101600727559423e-05, "loss": 0.0062, "step": 118 }, { "epoch": 0.937007874015748, "grad_norm": 0.017149658873677254, "learning_rate": 1.603429341184114e-05, "loss": 0.0002, "step": 119 }, { "epoch": 0.9448818897637795, "grad_norm": 0.4514746367931366, "learning_rate": 1.596655340722244e-05, "loss": 0.0067, "step": 120 }, { "epoch": 0.952755905511811, "grad_norm": 0.11766134947538376, "learning_rate": 1.5898385570999146e-05, "loss": 0.0053, "step": 121 }, { "epoch": 0.9606299212598425, "grad_norm": 0.4089784026145935, "learning_rate": 1.5829794791144723e-05, "loss": 0.0085, "step": 122 }, { "epoch": 0.968503937007874, "grad_norm": 0.1353057473897934, "learning_rate": 1.57607859859598e-05, "loss": 0.0013, "step": 123 }, { "epoch": 0.9763779527559056, "grad_norm": 0.6548481583595276, "learning_rate": 1.5691364103719515e-05, "loss": 0.0117, "step": 124 }, { "epoch": 0.984251968503937, "grad_norm": 0.1571267992258072, "learning_rate": 1.5621534122318682e-05, "loss": 0.0049, "step": 125 }, { "epoch": 0.9921259842519685, "grad_norm": 1.2177189588546753, "learning_rate": 1.5551301048914863e-05, "loss": 0.0161, "step": 126 }, { "epoch": 1.0, "grad_norm": 0.414489209651947, "learning_rate": 
1.5480669919569313e-05, "loss": 0.0181, "step": 127 } ], "logging_steps": 1, "max_steps": 381, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 127, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3087271069889331e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }
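The state above is the standard Hugging Face Trainer checkpoint log: each `log_history` entry carries the step, learning rate, gradient norm, and training loss (plus `eval_loss` every 43 steps). A minimal sketch for summarizing it with the standard library only is below; the file name `trainer_state.json` and its location are assumptions, so point the path at wherever the checkpoint was saved.

```python
# Minimal sketch (not part of the checkpoint itself) for inspecting the
# trainer_state.json shown above, using only the Python standard library.
import json

with open("trainer_state.json") as f:  # assumed path to this file
    state = json.load(f)

train_points = []  # (step, training loss)
eval_points = []   # (step, eval loss)

for entry in state["log_history"]:
    if "loss" in entry:
        train_points.append((entry["step"], entry["loss"]))
    if "eval_loss" in entry:
        eval_points.append((entry["step"], entry["eval_loss"]))

print(f"epoch {state['epoch']}: step {state['global_step']} of {state['max_steps']}")
print(f"training loss: {train_points[0][1]} (first) -> {train_points[-1][1]} (last)")
for step, loss in eval_points:
    print(f"eval at step {step}: loss {loss:.4f}")
```

Run against this checkpoint, the script would report the drop in training loss from 4.6099 at step 1 to 0.0181 at step 127, and the eval losses logged at steps 1, 43, and 86.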