{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 43,
  "global_step": 254,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007874015748031496,
      "grad_norm": 118.11203002929688,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 4.6099,
      "step": 1
    },
    {
      "epoch": 0.007874015748031496,
      "eval_loss": 3.1001100540161133,
      "eval_runtime": 5.3966,
      "eval_samples_per_second": 30.204,
      "eval_steps_per_second": 3.891,
      "step": 1
    },
    {
      "epoch": 0.015748031496062992,
      "grad_norm": 118.4310302734375,
      "learning_rate": 4.000000000000001e-06,
      "loss": 4.5857,
      "step": 2
    },
    {
      "epoch": 0.023622047244094488,
      "grad_norm": 103.37439727783203,
      "learning_rate": 6e-06,
      "loss": 4.3069,
      "step": 3
    },
    {
      "epoch": 0.031496062992125984,
      "grad_norm": 75.05075073242188,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.8754,
      "step": 4
    },
    {
      "epoch": 0.03937007874015748,
      "grad_norm": 50.459983825683594,
      "learning_rate": 1e-05,
      "loss": 3.2841,
      "step": 5
    },
    {
      "epoch": 0.047244094488188976,
      "grad_norm": 47.4603385925293,
      "learning_rate": 1.2e-05,
      "loss": 2.4285,
      "step": 6
    },
    {
      "epoch": 0.05511811023622047,
      "grad_norm": 32.362667083740234,
      "learning_rate": 1.4e-05,
      "loss": 1.8177,
      "step": 7
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 22.846933364868164,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.1567,
      "step": 8
    },
    {
      "epoch": 0.07086614173228346,
      "grad_norm": 17.060213088989258,
      "learning_rate": 1.8e-05,
      "loss": 0.8257,
      "step": 9
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 14.415579795837402,
      "learning_rate": 2e-05,
      "loss": 0.4257,
      "step": 10
    },
    {
      "epoch": 0.08661417322834646,
      "grad_norm": 7.753712177276611,
      "learning_rate": 1.999964147509006e-05,
      "loss": 0.2976,
      "step": 11
    },
    {
      "epoch": 0.09448818897637795,
      "grad_norm": 26.883708953857422,
      "learning_rate": 1.9998565926068253e-05,
      "loss": 0.3365,
      "step": 12
    },
    {
      "epoch": 0.10236220472440945,
      "grad_norm": 10.675631523132324,
      "learning_rate": 1.9996773430056806e-05,
      "loss": 0.2161,
      "step": 13
    },
    {
      "epoch": 0.11023622047244094,
      "grad_norm": 6.670111179351807,
      "learning_rate": 1.999426411558661e-05,
      "loss": 0.1816,
      "step": 14
    },
    {
      "epoch": 0.11811023622047244,
      "grad_norm": 8.878239631652832,
      "learning_rate": 1.9991038162588018e-05,
      "loss": 0.1567,
      "step": 15
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 2.9917383193969727,
      "learning_rate": 1.9987095802377933e-05,
      "loss": 0.0813,
      "step": 16
    },
    {
      "epoch": 0.13385826771653545,
      "grad_norm": 1.0548763275146484,
      "learning_rate": 1.9982437317643218e-05,
      "loss": 0.0217,
      "step": 17
    },
    {
      "epoch": 0.14173228346456693,
      "grad_norm": 2.8778488636016846,
      "learning_rate": 1.9977063042420438e-05,
      "loss": 0.0618,
      "step": 18
    },
    {
      "epoch": 0.14960629921259844,
      "grad_norm": 0.9811734557151794,
      "learning_rate": 1.99709733620719e-05,
      "loss": 0.0175,
      "step": 19
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.7218202948570251,
      "learning_rate": 1.996416871325803e-05,
      "loss": 0.0302,
      "step": 20
    },
    {
      "epoch": 0.16535433070866143,
      "grad_norm": 1.2746995687484741,
      "learning_rate": 1.995664958390604e-05,
      "loss": 0.0453,
      "step": 21
    },
    {
      "epoch": 0.1732283464566929,
      "grad_norm": 0.9413469433784485,
      "learning_rate": 1.9948416513174976e-05,
      "loss": 0.0175,
      "step": 22
    },
    {
      "epoch": 0.18110236220472442,
      "grad_norm": 1.4161137342453003,
      "learning_rate": 1.9939470091417012e-05,
      "loss": 0.0277,
      "step": 23
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 2.2721235752105713,
      "learning_rate": 1.992981096013517e-05,
      "loss": 0.0589,
      "step": 24
    },
    {
      "epoch": 0.1968503937007874,
      "grad_norm": 1.143970251083374,
      "learning_rate": 1.9919439811937283e-05,
      "loss": 0.0182,
      "step": 25
    },
    {
      "epoch": 0.2047244094488189,
      "grad_norm": 0.8054028749465942,
      "learning_rate": 1.9908357390486342e-05,
      "loss": 0.0211,
      "step": 26
    },
    {
      "epoch": 0.2125984251968504,
      "grad_norm": 1.4449081420898438,
      "learning_rate": 1.989656449044718e-05,
      "loss": 0.0244,
      "step": 27
    },
    {
      "epoch": 0.2204724409448819,
      "grad_norm": 0.49216631054878235,
      "learning_rate": 1.988406195742948e-05,
      "loss": 0.005,
      "step": 28
    },
    {
      "epoch": 0.2283464566929134,
      "grad_norm": 0.9945647716522217,
      "learning_rate": 1.987085068792715e-05,
      "loss": 0.0373,
      "step": 29
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 1.1753748655319214,
      "learning_rate": 1.9856931629254032e-05,
      "loss": 0.0217,
      "step": 30
    },
    {
      "epoch": 0.2440944881889764,
      "grad_norm": 0.5960403680801392,
      "learning_rate": 1.984230577947597e-05,
      "loss": 0.0157,
      "step": 31
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.3657272160053253,
      "learning_rate": 1.9826974187339267e-05,
      "loss": 0.0082,
      "step": 32
    },
    {
      "epoch": 0.25984251968503935,
      "grad_norm": 1.1290266513824463,
      "learning_rate": 1.981093795219546e-05,
      "loss": 0.0236,
      "step": 33
    },
    {
      "epoch": 0.2677165354330709,
      "grad_norm": 1.673962116241455,
      "learning_rate": 1.9794198223922496e-05,
      "loss": 0.0182,
      "step": 34
    },
    {
      "epoch": 0.2755905511811024,
      "grad_norm": 0.540355384349823,
      "learning_rate": 1.9776756202842297e-05,
      "loss": 0.011,
      "step": 35
    },
    {
      "epoch": 0.28346456692913385,
      "grad_norm": 0.3380790054798126,
      "learning_rate": 1.9758613139634662e-05,
      "loss": 0.0048,
      "step": 36
    },
    {
      "epoch": 0.29133858267716534,
      "grad_norm": 1.886232852935791,
      "learning_rate": 1.9739770335247616e-05,
      "loss": 0.0157,
      "step": 37
    },
    {
      "epoch": 0.2992125984251969,
      "grad_norm": 2.140639305114746,
      "learning_rate": 1.972022914080411e-05,
      "loss": 0.0393,
      "step": 38
    },
    {
      "epoch": 0.30708661417322836,
      "grad_norm": 0.35308870673179626,
      "learning_rate": 1.9699990957505136e-05,
      "loss": 0.0074,
      "step": 39
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.3918301463127136,
      "learning_rate": 1.9679057236529266e-05,
      "loss": 0.0083,
      "step": 40
    },
    {
      "epoch": 0.3228346456692913,
      "grad_norm": 0.4406338632106781,
      "learning_rate": 1.965742947892858e-05,
      "loss": 0.0152,
      "step": 41
    },
    {
      "epoch": 0.33070866141732286,
      "grad_norm": 0.6819682121276855,
      "learning_rate": 1.9635109235521057e-05,
      "loss": 0.0091,
      "step": 42
    },
    {
      "epoch": 0.33858267716535434,
      "grad_norm": 0.6794927716255188,
      "learning_rate": 1.961209810677934e-05,
      "loss": 0.0071,
      "step": 43
    },
    {
      "epoch": 0.33858267716535434,
      "eval_loss": 0.3895845115184784,
      "eval_runtime": 6.5602,
      "eval_samples_per_second": 24.847,
      "eval_steps_per_second": 3.201,
      "step": 43
    },
    {
      "epoch": 0.3464566929133858,
      "grad_norm": 0.3874967694282532,
      "learning_rate": 1.9588397742716004e-05,
      "loss": 0.0089,
      "step": 44
    },
    {
      "epoch": 0.3543307086614173,
      "grad_norm": 0.5577577352523804,
      "learning_rate": 1.9564009842765225e-05,
      "loss": 0.0098,
      "step": 45
    },
    {
      "epoch": 0.36220472440944884,
      "grad_norm": 0.8152347207069397,
      "learning_rate": 1.9538936155660934e-05,
      "loss": 0.0118,
      "step": 46
    },
    {
      "epoch": 0.3700787401574803,
      "grad_norm": 0.2971118688583374,
      "learning_rate": 1.951317847931141e-05,
      "loss": 0.0084,
      "step": 47
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 1.0286651849746704,
      "learning_rate": 1.9486738660670373e-05,
      "loss": 0.0123,
      "step": 48
    },
    {
      "epoch": 0.3858267716535433,
      "grad_norm": 0.5227222442626953,
      "learning_rate": 1.945961859560454e-05,
      "loss": 0.0144,
      "step": 49
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 0.461935818195343,
      "learning_rate": 1.943182022875769e-05,
      "loss": 0.0119,
      "step": 50
    },
    {
      "epoch": 0.4015748031496063,
      "grad_norm": 1.2550626993179321,
      "learning_rate": 1.940334555341122e-05,
      "loss": 0.013,
      "step": 51
    },
    {
      "epoch": 0.4094488188976378,
      "grad_norm": 0.37549659609794617,
      "learning_rate": 1.9374196611341212e-05,
      "loss": 0.0181,
      "step": 52
    },
    {
      "epoch": 0.41732283464566927,
      "grad_norm": 0.3444191515445709,
      "learning_rate": 1.9344375492672024e-05,
      "loss": 0.0111,
      "step": 53
    },
    {
      "epoch": 0.4251968503937008,
      "grad_norm": 0.3489387333393097,
      "learning_rate": 1.9313884335726443e-05,
      "loss": 0.0111,
      "step": 54
    },
    {
      "epoch": 0.4330708661417323,
      "grad_norm": 0.26080814003944397,
      "learning_rate": 1.9282725326872324e-05,
      "loss": 0.0091,
      "step": 55
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 0.1390451341867447,
      "learning_rate": 1.9250900700365837e-05,
      "loss": 0.0033,
      "step": 56
    },
    {
      "epoch": 0.44881889763779526,
      "grad_norm": 0.20499111711978912,
      "learning_rate": 1.921841273819125e-05,
      "loss": 0.0066,
      "step": 57
    },
    {
      "epoch": 0.4566929133858268,
      "grad_norm": 2.185487747192383,
      "learning_rate": 1.918526376989731e-05,
      "loss": 0.0095,
      "step": 58
    },
    {
      "epoch": 0.4645669291338583,
      "grad_norm": 0.23939816653728485,
      "learning_rate": 1.9151456172430186e-05,
      "loss": 0.0048,
      "step": 59
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.41510018706321716,
      "learning_rate": 1.911699236996305e-05,
      "loss": 0.0077,
      "step": 60
    },
    {
      "epoch": 0.48031496062992124,
      "grad_norm": 0.264318585395813,
      "learning_rate": 1.9081874833722234e-05,
      "loss": 0.0129,
      "step": 61
    },
    {
      "epoch": 0.4881889763779528,
      "grad_norm": 1.0443968772888184,
      "learning_rate": 1.9046106081810047e-05,
      "loss": 0.0035,
      "step": 62
    },
    {
      "epoch": 0.49606299212598426,
      "grad_norm": 0.2800132632255554,
      "learning_rate": 1.900968867902419e-05,
      "loss": 0.0057,
      "step": 63
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 1.114960789680481,
      "learning_rate": 1.8972625236673887e-05,
      "loss": 0.0123,
      "step": 64
    },
    {
      "epoch": 0.5118110236220472,
      "grad_norm": 0.5027065873146057,
      "learning_rate": 1.8934918412392596e-05,
      "loss": 0.0052,
      "step": 65
    },
    {
      "epoch": 0.5196850393700787,
      "grad_norm": 0.5564169883728027,
      "learning_rate": 1.8896570909947477e-05,
      "loss": 0.0085,
      "step": 66
    },
    {
      "epoch": 0.5275590551181102,
      "grad_norm": 0.7567198872566223,
      "learning_rate": 1.8857585479045493e-05,
      "loss": 0.0054,
      "step": 67
    },
    {
      "epoch": 0.5354330708661418,
      "grad_norm": 0.13573969900608063,
      "learning_rate": 1.8817964915136277e-05,
      "loss": 0.0008,
      "step": 68
    },
    {
      "epoch": 0.5433070866141733,
      "grad_norm": 0.2704390287399292,
      "learning_rate": 1.8777712059211643e-05,
      "loss": 0.0078,
      "step": 69
    },
    {
      "epoch": 0.5511811023622047,
      "grad_norm": 0.6014392971992493,
      "learning_rate": 1.8736829797601903e-05,
      "loss": 0.0059,
      "step": 70
    },
    {
      "epoch": 0.5590551181102362,
      "grad_norm": 0.5487034916877747,
      "learning_rate": 1.8695321061768886e-05,
      "loss": 0.0097,
      "step": 71
    },
    {
      "epoch": 0.5669291338582677,
      "grad_norm": 0.6670834422111511,
      "learning_rate": 1.8653188828095754e-05,
      "loss": 0.011,
      "step": 72
    },
    {
      "epoch": 0.5748031496062992,
      "grad_norm": 0.1795203685760498,
      "learning_rate": 1.8610436117673557e-05,
      "loss": 0.0067,
      "step": 73
    },
    {
      "epoch": 0.5826771653543307,
      "grad_norm": 1.768436074256897,
      "learning_rate": 1.8567065996084628e-05,
      "loss": 0.0096,
      "step": 74
    },
    {
      "epoch": 0.5905511811023622,
      "grad_norm": 0.26233312487602234,
      "learning_rate": 1.8523081573182754e-05,
      "loss": 0.0124,
      "step": 75
    },
    {
      "epoch": 0.5984251968503937,
      "grad_norm": 0.3775719404220581,
      "learning_rate": 1.847848600287019e-05,
      "loss": 0.0052,
      "step": 76
    },
    {
      "epoch": 0.6062992125984252,
      "grad_norm": 1.0016565322875977,
      "learning_rate": 1.8433282482871497e-05,
      "loss": 0.0058,
      "step": 77
    },
    {
      "epoch": 0.6141732283464567,
      "grad_norm": 0.20153792202472687,
      "learning_rate": 1.8387474254504265e-05,
      "loss": 0.0056,
      "step": 78
    },
    {
      "epoch": 0.6220472440944882,
      "grad_norm": 0.5119822025299072,
      "learning_rate": 1.8341064602446686e-05,
      "loss": 0.0079,
      "step": 79
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 1.5781004428863525,
      "learning_rate": 1.829405685450202e-05,
      "loss": 0.008,
      "step": 80
    },
    {
      "epoch": 0.6377952755905512,
      "grad_norm": 0.23826757073402405,
      "learning_rate": 1.824645438135999e-05,
      "loss": 0.0041,
      "step": 81
    },
    {
      "epoch": 0.6456692913385826,
      "grad_norm": 0.6386727690696716,
      "learning_rate": 1.8198260596355077e-05,
      "loss": 0.0188,
      "step": 82
    },
    {
      "epoch": 0.6535433070866141,
      "grad_norm": 0.9503199458122253,
      "learning_rate": 1.814947895522176e-05,
      "loss": 0.008,
      "step": 83
    },
    {
      "epoch": 0.6614173228346457,
      "grad_norm": 0.2040701061487198,
      "learning_rate": 1.8100112955846746e-05,
      "loss": 0.0038,
      "step": 84
    },
    {
      "epoch": 0.6692913385826772,
      "grad_norm": 0.3660199046134949,
      "learning_rate": 1.805016613801813e-05,
      "loss": 0.0148,
      "step": 85
    },
    {
      "epoch": 0.6771653543307087,
      "grad_norm": 1.0502821207046509,
      "learning_rate": 1.7999642083171576e-05,
      "loss": 0.0098,
      "step": 86
    },
    {
      "epoch": 0.6771653543307087,
      "eval_loss": 0.3526817262172699,
      "eval_runtime": 6.6167,
      "eval_samples_per_second": 24.635,
      "eval_steps_per_second": 3.174,
      "step": 86
    },
    {
      "epoch": 0.6850393700787402,
      "grad_norm": 0.13735969364643097,
      "learning_rate": 1.7948544414133534e-05,
      "loss": 0.0022,
      "step": 87
    },
    {
      "epoch": 0.6929133858267716,
      "grad_norm": 0.6425012946128845,
      "learning_rate": 1.7896876794861443e-05,
      "loss": 0.0086,
      "step": 88
    },
    {
      "epoch": 0.7007874015748031,
      "grad_norm": 0.7540380954742432,
      "learning_rate": 1.7844642930181008e-05,
      "loss": 0.0062,
      "step": 89
    },
    {
      "epoch": 0.7086614173228346,
      "grad_norm": 0.6727365255355835,
      "learning_rate": 1.779184656552056e-05,
      "loss": 0.0027,
      "step": 90
    },
    {
      "epoch": 0.7165354330708661,
      "grad_norm": 0.14059337973594666,
      "learning_rate": 1.773849148664247e-05,
      "loss": 0.0056,
      "step": 91
    },
    {
      "epoch": 0.7244094488188977,
      "grad_norm": 0.33292093873023987,
      "learning_rate": 1.7684581519371714e-05,
      "loss": 0.0047,
      "step": 92
    },
    {
      "epoch": 0.7322834645669292,
      "grad_norm": 0.3809877932071686,
      "learning_rate": 1.7630120529321518e-05,
      "loss": 0.0139,
      "step": 93
    },
    {
      "epoch": 0.7401574803149606,
      "grad_norm": 1.729589819908142,
      "learning_rate": 1.7575112421616203e-05,
      "loss": 0.0128,
      "step": 94
    },
    {
      "epoch": 0.7480314960629921,
      "grad_norm": 0.18192608654499054,
      "learning_rate": 1.751956114061113e-05,
      "loss": 0.0025,
      "step": 95
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 1.0333118438720703,
      "learning_rate": 1.7463470669609907e-05,
      "loss": 0.006,
      "step": 96
    },
    {
      "epoch": 0.7637795275590551,
      "grad_norm": 0.7247685194015503,
      "learning_rate": 1.7406845030578747e-05,
      "loss": 0.0073,
      "step": 97
    },
    {
      "epoch": 0.7716535433070866,
      "grad_norm": 0.06979379802942276,
      "learning_rate": 1.734968828385808e-05,
      "loss": 0.0005,
      "step": 98
    },
    {
      "epoch": 0.7795275590551181,
      "grad_norm": 0.5137119293212891,
      "learning_rate": 1.729200452787139e-05,
      "loss": 0.0082,
      "step": 99
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 0.4704137146472931,
      "learning_rate": 1.7233797898831376e-05,
      "loss": 0.005,
      "step": 100
    },
    {
      "epoch": 0.7952755905511811,
      "grad_norm": 0.28564465045928955,
      "learning_rate": 1.717507257044331e-05,
      "loss": 0.0052,
      "step": 101
    },
    {
      "epoch": 0.8031496062992126,
      "grad_norm": 0.17685537040233612,
      "learning_rate": 1.711583275360582e-05,
      "loss": 0.0024,
      "step": 102
    },
    {
      "epoch": 0.8110236220472441,
      "grad_norm": 0.45714935660362244,
      "learning_rate": 1.7056082696108896e-05,
      "loss": 0.0072,
      "step": 103
    },
    {
      "epoch": 0.8188976377952756,
      "grad_norm": 0.4373086988925934,
      "learning_rate": 1.699582668232934e-05,
      "loss": 0.0051,
      "step": 104
    },
    {
      "epoch": 0.8267716535433071,
      "grad_norm": 0.8478983640670776,
      "learning_rate": 1.6935069032923525e-05,
      "loss": 0.022,
      "step": 105
    },
    {
      "epoch": 0.8346456692913385,
      "grad_norm": 0.16181086003780365,
      "learning_rate": 1.6873814104517617e-05,
      "loss": 0.0058,
      "step": 106
    },
    {
      "epoch": 0.84251968503937,
      "grad_norm": 0.09503592550754547,
      "learning_rate": 1.6812066289395157e-05,
      "loss": 0.0009,
      "step": 107
    },
    {
      "epoch": 0.8503937007874016,
      "grad_norm": 0.7462632060050964,
      "learning_rate": 1.6749830015182106e-05,
      "loss": 0.0044,
      "step": 108
    },
    {
      "epoch": 0.8582677165354331,
      "grad_norm": 0.07221701741218567,
      "learning_rate": 1.6687109744529394e-05,
      "loss": 0.0015,
      "step": 109
    },
    {
      "epoch": 0.8661417322834646,
      "grad_norm": 0.08999036252498627,
      "learning_rate": 1.6623909974792888e-05,
      "loss": 0.0023,
      "step": 110
    },
    {
      "epoch": 0.8740157480314961,
      "grad_norm": 0.42536938190460205,
      "learning_rate": 1.656023523771095e-05,
      "loss": 0.005,
      "step": 111
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.7885191440582275,
      "learning_rate": 1.6496090099079452e-05,
      "loss": 0.0103,
      "step": 112
    },
    {
      "epoch": 0.889763779527559,
      "grad_norm": 0.16610018908977509,
      "learning_rate": 1.64314791584244e-05,
      "loss": 0.006,
      "step": 113
    },
    {
      "epoch": 0.8976377952755905,
      "grad_norm": 0.32151034474372864,
      "learning_rate": 1.6366407048672135e-05,
      "loss": 0.0086,
      "step": 114
    },
    {
      "epoch": 0.905511811023622,
      "grad_norm": 0.557732343673706,
      "learning_rate": 1.6300878435817115e-05,
      "loss": 0.0064,
      "step": 115
    },
    {
      "epoch": 0.9133858267716536,
      "grad_norm": 0.2238176167011261,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 0.0065,
      "step": 116
    },
    {
      "epoch": 0.9212598425196851,
      "grad_norm": 0.2980042099952698,
      "learning_rate": 1.616847052810744e-05,
      "loss": 0.0095,
      "step": 117
    },
    {
      "epoch": 0.9291338582677166,
      "grad_norm": 0.1529705822467804,
      "learning_rate": 1.6101600727559423e-05,
      "loss": 0.0062,
      "step": 118
    },
    {
      "epoch": 0.937007874015748,
      "grad_norm": 0.017149658873677254,
      "learning_rate": 1.603429341184114e-05,
      "loss": 0.0002,
      "step": 119
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.4514746367931366,
      "learning_rate": 1.596655340722244e-05,
      "loss": 0.0067,
      "step": 120
    },
    {
      "epoch": 0.952755905511811,
      "grad_norm": 0.11766134947538376,
      "learning_rate": 1.5898385570999146e-05,
      "loss": 0.0053,
      "step": 121
    },
    {
      "epoch": 0.9606299212598425,
      "grad_norm": 0.4089784026145935,
      "learning_rate": 1.5829794791144723e-05,
      "loss": 0.0085,
      "step": 122
    },
    {
      "epoch": 0.968503937007874,
      "grad_norm": 0.1353057473897934,
      "learning_rate": 1.57607859859598e-05,
      "loss": 0.0013,
      "step": 123
    },
    {
      "epoch": 0.9763779527559056,
      "grad_norm": 0.6548481583595276,
      "learning_rate": 1.5691364103719515e-05,
      "loss": 0.0117,
      "step": 124
    },
    {
      "epoch": 0.984251968503937,
      "grad_norm": 0.1571267992258072,
      "learning_rate": 1.5621534122318682e-05,
      "loss": 0.0049,
      "step": 125
    },
    {
      "epoch": 0.9921259842519685,
      "grad_norm": 1.2177189588546753,
      "learning_rate": 1.5551301048914863e-05,
      "loss": 0.0161,
      "step": 126
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.414489209651947,
      "learning_rate": 1.5480669919569313e-05,
      "loss": 0.0181,
      "step": 127
    },
    {
      "epoch": 1.0078740157480315,
      "grad_norm": 0.10985995829105377,
      "learning_rate": 1.54096457988859e-05,
      "loss": 0.0049,
      "step": 128
    },
    {
      "epoch": 1.015748031496063,
      "grad_norm": 0.12780147790908813,
      "learning_rate": 1.533823377964791e-05,
      "loss": 0.0026,
      "step": 129
    },
    {
      "epoch": 1.015748031496063,
      "eval_loss": 0.33064374327659607,
      "eval_runtime": 6.9286,
      "eval_samples_per_second": 23.526,
      "eval_steps_per_second": 3.031,
      "step": 129
    },
    {
      "epoch": 1.0236220472440944,
      "grad_norm": 0.5142458081245422,
      "learning_rate": 1.52664389824529e-05,
      "loss": 0.0082,
      "step": 130
    },
    {
      "epoch": 1.031496062992126,
      "grad_norm": 0.15617145597934723,
      "learning_rate": 1.5194266555345505e-05,
      "loss": 0.0016,
      "step": 131
    },
    {
      "epoch": 1.0393700787401574,
      "grad_norm": 0.5782387852668762,
      "learning_rate": 1.5121721673448319e-05,
      "loss": 0.0117,
      "step": 132
    },
    {
      "epoch": 1.047244094488189,
      "grad_norm": 0.08414836972951889,
      "learning_rate": 1.5048809538590789e-05,
      "loss": 0.0021,
      "step": 133
    },
    {
      "epoch": 1.0551181102362204,
      "grad_norm": 0.28253939747810364,
      "learning_rate": 1.4975535378936228e-05,
      "loss": 0.0055,
      "step": 134
    },
    {
      "epoch": 1.0629921259842519,
      "grad_norm": 0.47917842864990234,
      "learning_rate": 1.490190444860694e-05,
      "loss": 0.0046,
      "step": 135
    },
    {
      "epoch": 1.0708661417322836,
      "grad_norm": 0.1895662248134613,
      "learning_rate": 1.482792202730745e-05,
      "loss": 0.006,
      "step": 136
    },
    {
      "epoch": 1.078740157480315,
      "grad_norm": 0.13722768425941467,
      "learning_rate": 1.475359341994595e-05,
      "loss": 0.0031,
      "step": 137
    },
    {
      "epoch": 1.0866141732283465,
      "grad_norm": 0.10731153190135956,
      "learning_rate": 1.4678923956253894e-05,
      "loss": 0.0005,
      "step": 138
    },
    {
      "epoch": 1.094488188976378,
      "grad_norm": 0.12261265516281128,
      "learning_rate": 1.460391899040383e-05,
      "loss": 0.0031,
      "step": 139
    },
    {
      "epoch": 1.1023622047244095,
      "grad_norm": 0.0038245893083512783,
      "learning_rate": 1.4528583900625481e-05,
      "loss": 0.0,
      "step": 140
    },
    {
      "epoch": 1.110236220472441,
      "grad_norm": 0.28762558102607727,
      "learning_rate": 1.4452924088820101e-05,
      "loss": 0.004,
      "step": 141
    },
    {
      "epoch": 1.1181102362204725,
      "grad_norm": 0.17267552018165588,
      "learning_rate": 1.4376944980173138e-05,
      "loss": 0.0002,
      "step": 142
    },
    {
      "epoch": 1.125984251968504,
      "grad_norm": 0.12727122008800507,
      "learning_rate": 1.4300652022765207e-05,
      "loss": 0.0029,
      "step": 143
    },
    {
      "epoch": 1.1338582677165354,
      "grad_norm": 0.25049135088920593,
      "learning_rate": 1.4224050687181442e-05,
      "loss": 0.0108,
      "step": 144
    },
    {
      "epoch": 1.141732283464567,
      "grad_norm": 0.16092728078365326,
      "learning_rate": 1.4147146466119235e-05,
      "loss": 0.0024,
      "step": 145
    },
    {
      "epoch": 1.1496062992125984,
      "grad_norm": 0.13642658293247223,
      "learning_rate": 1.406994487399437e-05,
      "loss": 0.0037,
      "step": 146
    },
    {
      "epoch": 1.1574803149606299,
      "grad_norm": 0.9029403328895569,
      "learning_rate": 1.3992451446545624e-05,
      "loss": 0.0034,
      "step": 147
    },
    {
      "epoch": 1.1653543307086613,
      "grad_norm": 0.19518424570560455,
      "learning_rate": 1.3914671740437811e-05,
      "loss": 0.0057,
      "step": 148
    },
    {
      "epoch": 1.1732283464566928,
      "grad_norm": 0.12140502035617828,
      "learning_rate": 1.3836611332863356e-05,
      "loss": 0.0041,
      "step": 149
    },
    {
      "epoch": 1.1811023622047245,
      "grad_norm": 0.5148038864135742,
      "learning_rate": 1.3758275821142382e-05,
      "loss": 0.0026,
      "step": 150
    },
    {
      "epoch": 1.188976377952756,
      "grad_norm": 1.828904390335083,
      "learning_rate": 1.3679670822321347e-05,
      "loss": 0.0024,
      "step": 151
    },
    {
      "epoch": 1.1968503937007875,
      "grad_norm": 0.3571717143058777,
      "learning_rate": 1.3600801972770272e-05,
      "loss": 0.0106,
      "step": 152
    },
    {
      "epoch": 1.204724409448819,
      "grad_norm": 0.051027003675699234,
      "learning_rate": 1.3521674927778594e-05,
      "loss": 0.0003,
      "step": 153
    },
    {
      "epoch": 1.2125984251968505,
      "grad_norm": 0.6490982174873352,
      "learning_rate": 1.3442295361149651e-05,
      "loss": 0.0035,
      "step": 154
    },
    {
      "epoch": 1.220472440944882,
      "grad_norm": 0.08408445864915848,
      "learning_rate": 1.336266896479384e-05,
      "loss": 0.0027,
      "step": 155
    },
    {
      "epoch": 1.2283464566929134,
      "grad_norm": 0.09666562080383301,
      "learning_rate": 1.328280144832047e-05,
      "loss": 0.0019,
      "step": 156
    },
    {
      "epoch": 1.236220472440945,
      "grad_norm": 0.03880690038204193,
      "learning_rate": 1.3202698538628376e-05,
      "loss": 0.0003,
      "step": 157
    },
    {
      "epoch": 1.2440944881889764,
      "grad_norm": 0.11940775066614151,
      "learning_rate": 1.3122365979495259e-05,
      "loss": 0.0024,
      "step": 158
    },
    {
      "epoch": 1.2519685039370079,
      "grad_norm": 0.1442880481481552,
      "learning_rate": 1.3041809531165819e-05,
      "loss": 0.0015,
      "step": 159
    },
    {
      "epoch": 1.2598425196850394,
      "grad_norm": 0.1961939036846161,
      "learning_rate": 1.2961034969938732e-05,
      "loss": 0.0056,
      "step": 160
    },
    {
      "epoch": 1.2677165354330708,
      "grad_norm": 0.26947638392448425,
      "learning_rate": 1.288004808775246e-05,
      "loss": 0.0028,
      "step": 161
    },
    {
      "epoch": 1.2755905511811023,
      "grad_norm": 0.5154056549072266,
      "learning_rate": 1.2798854691769927e-05,
      "loss": 0.0037,
      "step": 162
    },
    {
      "epoch": 1.2834645669291338,
      "grad_norm": 0.4292369782924652,
      "learning_rate": 1.2717460603962132e-05,
      "loss": 0.0029,
      "step": 163
    },
    {
      "epoch": 1.2913385826771653,
      "grad_norm": 0.19139212369918823,
      "learning_rate": 1.2635871660690677e-05,
      "loss": 0.0061,
      "step": 164
    },
    {
      "epoch": 1.2992125984251968,
      "grad_norm": 0.19960306584835052,
      "learning_rate": 1.2554093712289267e-05,
      "loss": 0.005,
      "step": 165
    },
    {
      "epoch": 1.3070866141732282,
      "grad_norm": 0.4523830711841583,
      "learning_rate": 1.2472132622644222e-05,
      "loss": 0.0065,
      "step": 166
    },
    {
      "epoch": 1.3149606299212597,
      "grad_norm": 0.49343299865722656,
      "learning_rate": 1.2389994268773995e-05,
      "loss": 0.0061,
      "step": 167
    },
    {
      "epoch": 1.3228346456692912,
      "grad_norm": 0.01938088797032833,
      "learning_rate": 1.2307684540407775e-05,
      "loss": 0.0001,
      "step": 168
    },
    {
      "epoch": 1.330708661417323,
      "grad_norm": 0.3082112669944763,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 0.0053,
      "step": 169
    },
    {
      "epoch": 1.3385826771653544,
      "grad_norm": 0.01982509344816208,
      "learning_rate": 1.2142574580122903e-05,
      "loss": 0.0001,
      "step": 170
    },
    {
      "epoch": 1.3464566929133859,
      "grad_norm": 0.12388588488101959,
      "learning_rate": 1.2059786187410984e-05,
      "loss": 0.0049,
      "step": 171
    },
    {
      "epoch": 1.3543307086614174,
      "grad_norm": 0.43759095668792725,
      "learning_rate": 1.1976850097767598e-05,
      "loss": 0.0128,
      "step": 172
    },
    {
      "epoch": 1.3543307086614174,
      "eval_loss": 0.3166251480579376,
      "eval_runtime": 6.9515,
      "eval_samples_per_second": 23.448,
      "eval_steps_per_second": 3.021,
      "step": 172
    },
    {
      "epoch": 1.3622047244094488,
      "grad_norm": 0.46561670303344727,
      "learning_rate": 1.1893772258123554e-05,
      "loss": 0.008,
      "step": 173
    },
    {
      "epoch": 1.3700787401574803,
      "grad_norm": 0.16612188518047333,
      "learning_rate": 1.1810558625573856e-05,
      "loss": 0.0024,
      "step": 174
    },
    {
      "epoch": 1.3779527559055118,
      "grad_norm": 0.13628093898296356,
      "learning_rate": 1.1727215166950519e-05,
      "loss": 0.0045,
      "step": 175
    },
    {
      "epoch": 1.3858267716535433,
      "grad_norm": 0.565229058265686,
      "learning_rate": 1.1643747858394743e-05,
      "loss": 0.0103,
      "step": 176
    },
    {
      "epoch": 1.3937007874015748,
      "grad_norm": 0.14550763368606567,
      "learning_rate": 1.156016268492839e-05,
      "loss": 0.0028,
      "step": 177
    },
    {
      "epoch": 1.4015748031496063,
      "grad_norm": 0.12460129708051682,
      "learning_rate": 1.1476465640024814e-05,
      "loss": 0.0031,
      "step": 178
    },
    {
      "epoch": 1.4094488188976377,
      "grad_norm": 0.19089221954345703,
      "learning_rate": 1.1392662725179114e-05,
      "loss": 0.0035,
      "step": 179
    },
    {
      "epoch": 1.4173228346456692,
      "grad_norm": 0.6106573343276978,
      "learning_rate": 1.1308759949477786e-05,
      "loss": 0.0088,
      "step": 180
    },
    {
      "epoch": 1.425196850393701,
      "grad_norm": 0.20053207874298096,
      "learning_rate": 1.1224763329167859e-05,
      "loss": 0.0033,
      "step": 181
    },
    {
      "epoch": 1.4330708661417324,
      "grad_norm": 0.1984691321849823,
      "learning_rate": 1.1140678887225468e-05,
      "loss": 0.0051,
      "step": 182
    },
    {
      "epoch": 1.4409448818897639,
      "grad_norm": 0.19264858961105347,
      "learning_rate": 1.1056512652924014e-05,
      "loss": 0.0046,
      "step": 183
    },
    {
      "epoch": 1.4488188976377954,
      "grad_norm": 0.10979076474905014,
      "learning_rate": 1.0972270661401812e-05,
      "loss": 0.0031,
      "step": 184
    },
    {
      "epoch": 1.4566929133858268,
      "grad_norm": 0.1744084656238556,
      "learning_rate": 1.0887958953229349e-05,
      "loss": 0.0024,
      "step": 185
    },
    {
      "epoch": 1.4645669291338583,
      "grad_norm": 0.20646224915981293,
      "learning_rate": 1.0803583573976137e-05,
      "loss": 0.008,
      "step": 186
    },
    {
      "epoch": 1.4724409448818898,
      "grad_norm": 0.14391584694385529,
      "learning_rate": 1.0719150573777226e-05,
      "loss": 0.004,
      "step": 187
    },
    {
      "epoch": 1.4803149606299213,
      "grad_norm": 0.36887863278388977,
      "learning_rate": 1.0634666006899375e-05,
      "loss": 0.0074,
      "step": 188
    },
    {
      "epoch": 1.4881889763779528,
      "grad_norm": 0.21352627873420715,
      "learning_rate": 1.055013593130693e-05,
      "loss": 0.0082,
      "step": 189
    },
    {
      "epoch": 1.4960629921259843,
      "grad_norm": 0.22443020343780518,
      "learning_rate": 1.046556640822744e-05,
      "loss": 0.0087,
      "step": 190
    },
    {
      "epoch": 1.5039370078740157,
      "grad_norm": 0.4243764281272888,
      "learning_rate": 1.0380963501717034e-05,
      "loss": 0.0068,
      "step": 191
    },
    {
      "epoch": 1.5118110236220472,
      "grad_norm": 0.17558562755584717,
      "learning_rate": 1.0296333278225599e-05,
      "loss": 0.0054,
      "step": 192
    },
    {
      "epoch": 1.5196850393700787,
      "grad_norm": 0.14842620491981506,
      "learning_rate": 1.0211681806161787e-05,
      "loss": 0.0031,
      "step": 193
    },
    {
      "epoch": 1.5275590551181102,
      "grad_norm": 0.09316081553697586,
      "learning_rate": 1.0127015155457875e-05,
      "loss": 0.0013,
      "step": 194
    },
    {
      "epoch": 1.5354330708661417,
      "grad_norm": 0.19795025885105133,
      "learning_rate": 1.0042339397134528e-05,
      "loss": 0.0051,
      "step": 195
    },
    {
      "epoch": 1.5433070866141732,
      "grad_norm": 0.21606990694999695,
      "learning_rate": 9.957660602865477e-06,
      "loss": 0.0041,
      "step": 196
    },
    {
      "epoch": 1.5511811023622046,
      "grad_norm": 0.18036173284053802,
      "learning_rate": 9.872984844542128e-06,
      "loss": 0.0037,
      "step": 197
    },
    {
      "epoch": 1.5590551181102361,
      "grad_norm": 0.18953870236873627,
      "learning_rate": 9.788318193838218e-06,
      "loss": 0.0041,
      "step": 198
    },
    {
      "epoch": 1.5669291338582676,
      "grad_norm": 0.12346503138542175,
      "learning_rate": 9.703666721774403e-06,
      "loss": 0.0035,
      "step": 199
    },
    {
      "epoch": 1.574803149606299,
      "grad_norm": 0.4576225280761719,
      "learning_rate": 9.619036498282968e-06,
      "loss": 0.0041,
      "step": 200
    },
    {
      "epoch": 1.5826771653543306,
      "grad_norm": 0.10333681106567383,
      "learning_rate": 9.534433591772562e-06,
      "loss": 0.0011,
      "step": 201
    },
    {
      "epoch": 1.590551181102362,
      "grad_norm": 0.19167865812778473,
      "learning_rate": 9.449864068693072e-06,
      "loss": 0.0062,
      "step": 202
    },
    {
      "epoch": 1.5984251968503937,
      "grad_norm": 0.2258184254169464,
      "learning_rate": 9.365333993100628e-06,
      "loss": 0.003,
      "step": 203
    },
    {
      "epoch": 1.6062992125984252,
      "grad_norm": 0.07945302873849869,
      "learning_rate": 9.280849426222778e-06,
      "loss": 0.0008,
      "step": 204
    },
    {
      "epoch": 1.6141732283464567,
      "grad_norm": 0.17767398059368134,
      "learning_rate": 9.196416426023868e-06,
      "loss": 0.0053,
      "step": 205
    },
    {
      "epoch": 1.6220472440944882,
      "grad_norm": 0.12704500555992126,
      "learning_rate": 9.112041046770653e-06,
      "loss": 0.0023,
      "step": 206
    },
    {
      "epoch": 1.6299212598425197,
      "grad_norm": 0.4054742753505707,
      "learning_rate": 9.027729338598188e-06,
      "loss": 0.0045,
      "step": 207
    },
    {
      "epoch": 1.6377952755905512,
      "grad_norm": 0.4463757574558258,
      "learning_rate": 8.943487347075988e-06,
      "loss": 0.007,
      "step": 208
    },
    {
      "epoch": 1.6456692913385826,
      "grad_norm": 0.6517045497894287,
      "learning_rate": 8.859321112774535e-06,
      "loss": 0.0052,
      "step": 209
    },
    {
      "epoch": 1.6535433070866141,
      "grad_norm": 0.1542089730501175,
      "learning_rate": 8.775236670832146e-06,
      "loss": 0.0047,
      "step": 210
    },
    {
      "epoch": 1.6614173228346458,
      "grad_norm": 0.14716440439224243,
      "learning_rate": 8.691240050522215e-06,
      "loss": 0.0049,
      "step": 211
    },
    {
      "epoch": 1.6692913385826773,
      "grad_norm": 0.2997347116470337,
      "learning_rate": 8.607337274820888e-06,
      "loss": 0.0076,
      "step": 212
    },
    {
      "epoch": 1.6771653543307088,
      "grad_norm": 0.22548256814479828,
      "learning_rate": 8.52353435997519e-06,
      "loss": 0.0063,
      "step": 213
    },
    {
      "epoch": 1.6850393700787403,
      "grad_norm": 0.7220733165740967,
      "learning_rate": 8.439837315071612e-06,
      "loss": 0.0089,
      "step": 214
    },
    {
      "epoch": 1.6929133858267718,
      "grad_norm": 0.5101618766784668,
      "learning_rate": 8.35625214160526e-06,
      "loss": 0.0042,
      "step": 215
    },
    {
      "epoch": 1.6929133858267718,
      "eval_loss": 0.3484288156032562,
      "eval_runtime": 6.4482,
      "eval_samples_per_second": 25.278,
      "eval_steps_per_second": 3.257,
      "step": 215
    },
    {
      "epoch": 1.7007874015748032,
      "grad_norm": 0.1698393076658249,
      "learning_rate": 8.272784833049485e-06,
      "loss": 0.0028,
      "step": 216
    },
    {
      "epoch": 1.7086614173228347,
      "grad_norm": 0.5772718191146851,
      "learning_rate": 8.18944137442615e-06,
      "loss": 0.0082,
      "step": 217
    },
    {
      "epoch": 1.7165354330708662,
      "grad_norm": 0.09606469422578812,
      "learning_rate": 8.106227741876447e-06,
      "loss": 0.0011,
      "step": 218
    },
    {
      "epoch": 1.7244094488188977,
      "grad_norm": 0.14510361850261688,
      "learning_rate": 8.023149902232404e-06,
      "loss": 0.0015,
      "step": 219
    },
    {
      "epoch": 1.7322834645669292,
      "grad_norm": 0.055804118514060974,
      "learning_rate": 7.940213812589018e-06,
      "loss": 0.0008,
      "step": 220
    },
    {
      "epoch": 1.7401574803149606,
      "grad_norm": 0.13318321108818054,
      "learning_rate": 7.857425419877097e-06,
      "loss": 0.005,
      "step": 221
    },
    {
      "epoch": 1.7480314960629921,
      "grad_norm": 0.23600782454013824,
      "learning_rate": 7.774790660436857e-06,
      "loss": 0.0063,
      "step": 222
    },
    {
      "epoch": 1.7559055118110236,
      "grad_norm": 0.8483791351318359,
      "learning_rate": 7.69231545959223e-06,
      "loss": 0.0027,
      "step": 223
    },
    {
      "epoch": 1.763779527559055,
      "grad_norm": 0.16536197066307068,
      "learning_rate": 7.610005731226009e-06,
      "loss": 0.0039,
      "step": 224
    },
    {
      "epoch": 1.7716535433070866,
      "grad_norm": 0.14446765184402466,
      "learning_rate": 7.52786737735578e-06,
      "loss": 0.0036,
      "step": 225
    },
    {
      "epoch": 1.779527559055118,
      "grad_norm": 0.8880365490913391,
      "learning_rate": 7.445906287710733e-06,
      "loss": 0.0061,
      "step": 226
    },
    {
      "epoch": 1.7874015748031495,
      "grad_norm": 0.151743084192276,
      "learning_rate": 7.364128339309326e-06,
      "loss": 0.0028,
      "step": 227
    },
    {
      "epoch": 1.795275590551181,
      "grad_norm": 0.1224551647901535,
      "learning_rate": 7.282539396037868e-06,
      "loss": 0.002,
      "step": 228
    },
    {
      "epoch": 1.8031496062992125,
      "grad_norm": 0.4868486225605011,
      "learning_rate": 7.201145308230075e-06,
      "loss": 0.0031,
      "step": 229
    },
    {
      "epoch": 1.811023622047244,
      "grad_norm": 0.2875569462776184,
      "learning_rate": 7.119951912247545e-06,
      "loss": 0.0082,
      "step": 230
    },
    {
      "epoch": 1.8188976377952755,
      "grad_norm": 0.43524420261383057,
      "learning_rate": 7.038965030061273e-06,
      "loss": 0.0075,
      "step": 231
    },
    {
      "epoch": 1.826771653543307,
      "grad_norm": 0.39634883403778076,
      "learning_rate": 6.9581904688341854e-06,
      "loss": 0.0032,
      "step": 232
    },
    {
      "epoch": 1.8346456692913384,
      "grad_norm": 0.9809433817863464,
      "learning_rate": 6.8776340205047446e-06,
      "loss": 0.0085,
      "step": 233
    },
    {
      "epoch": 1.84251968503937,
      "grad_norm": 0.20062875747680664,
      "learning_rate": 6.797301461371626e-06,
      "loss": 0.0043,
      "step": 234
    },
    {
      "epoch": 1.8503937007874016,
      "grad_norm": 0.148948073387146,
      "learning_rate": 6.7171985516795315e-06,
      "loss": 0.0036,
      "step": 235
    },
    {
      "epoch": 1.858267716535433,
      "grad_norm": 0.15658679604530334,
      "learning_rate": 6.637331035206166e-06,
      "loss": 0.0046,
      "step": 236
    },
    {
      "epoch": 1.8661417322834646,
      "grad_norm": 0.22365815937519073,
      "learning_rate": 6.557704638850352e-06,
      "loss": 0.0081,
      "step": 237
    },
    {
      "epoch": 1.874015748031496,
      "grad_norm": 0.10596666485071182,
      "learning_rate": 6.4783250722214066e-06,
      "loss": 0.0032,
      "step": 238
    },
    {
      "epoch": 1.8818897637795275,
      "grad_norm": 0.2130754142999649,
      "learning_rate": 6.399198027229732e-06,
      "loss": 0.0056,
      "step": 239
    },
    {
      "epoch": 1.889763779527559,
      "grad_norm": 0.05641167238354683,
      "learning_rate": 6.320329177678656e-06,
      "loss": 0.0008,
      "step": 240
    },
    {
      "epoch": 1.8976377952755905,
      "grad_norm": 0.10349344462156296,
      "learning_rate": 6.241724178857621e-06,
      "loss": 0.0026,
      "step": 241
    },
    {
      "epoch": 1.905511811023622,
      "grad_norm": 0.08451675623655319,
      "learning_rate": 6.163388667136646e-06,
      "loss": 0.0016,
      "step": 242
    },
    {
      "epoch": 1.9133858267716537,
      "grad_norm": 0.13671623170375824,
      "learning_rate": 6.085328259562195e-06,
      "loss": 0.0034,
      "step": 243
    },
    {
      "epoch": 1.9212598425196852,
      "grad_norm": 0.5500523447990417,
      "learning_rate": 6.007548553454379e-06,
      "loss": 0.0028,
      "step": 244
    },
    {
      "epoch": 1.9291338582677167,
      "grad_norm": 0.06702329218387604,
      "learning_rate": 5.93005512600563e-06,
      "loss": 0.0009,
      "step": 245
    },
    {
      "epoch": 1.9370078740157481,
      "grad_norm": 0.15156973898410797,
      "learning_rate": 5.852853533880768e-06,
      "loss": 0.0064,
      "step": 246
    },
    {
      "epoch": 1.9448818897637796,
      "grad_norm": 0.2970314621925354,
      "learning_rate": 5.7759493128185584e-06,
      "loss": 0.0077,
      "step": 247
    },
    {
      "epoch": 1.952755905511811,
      "grad_norm": 0.06406261771917343,
      "learning_rate": 5.699347977234799e-06,
      "loss": 0.0006,
      "step": 248
    },
    {
      "epoch": 1.9606299212598426,
      "grad_norm": 0.2910393178462982,
      "learning_rate": 5.623055019826862e-06,
      "loss": 0.0036,
      "step": 249
    },
    {
      "epoch": 1.968503937007874,
      "grad_norm": 0.6454993486404419,
      "learning_rate": 5.547075911179902e-06,
      "loss": 0.0084,
      "step": 250
    },
    {
      "epoch": 1.9763779527559056,
      "grad_norm": 0.09460143744945526,
      "learning_rate": 5.471416099374525e-06,
      "loss": 0.0021,
      "step": 251
    },
    {
      "epoch": 1.984251968503937,
      "grad_norm": 0.2024363875389099,
      "learning_rate": 5.3960810095961705e-06,
      "loss": 0.0052,
      "step": 252
    },
    {
      "epoch": 1.9921259842519685,
      "grad_norm": 0.09423142671585083,
      "learning_rate": 5.321076043746108e-06,
      "loss": 0.0018,
      "step": 253
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.1085880920290947,
      "learning_rate": 5.246406580054051e-06,
      "loss": 0.0039,
      "step": 254
    }
  ],
  "logging_steps": 1,
  "max_steps": 381,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 127,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.6174542139778662e+17,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}