|
{ |
|
"best_metric": 0.7647964954376221, |
|
"best_model_checkpoint": "outputs/checkpoint-350", |
|
"epoch": 0.21895527056615577, |
|
"eval_steps": 25, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006255864873318737, |
|
"grad_norm": 3.302262783050537, |
|
"learning_rate": 4e-05, |
|
"loss": 1.7639, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0012511729746637473, |
|
"grad_norm": 3.7728819847106934, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3471, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.001876759461995621, |
|
"grad_norm": 3.575211763381958, |
|
"learning_rate": 0.00012, |
|
"loss": 1.274, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0025023459493274947, |
|
"grad_norm": 4.3921918869018555, |
|
"learning_rate": 0.00016, |
|
"loss": 1.8361, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0031279324366593683, |
|
"grad_norm": 3.215696096420288, |
|
"learning_rate": 0.0002, |
|
"loss": 2.8766, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.003753518923991242, |
|
"grad_norm": 4.060017108917236, |
|
"learning_rate": 0.0001996638655462185, |
|
"loss": 1.4329, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004379105411323116, |
|
"grad_norm": 2.7935523986816406, |
|
"learning_rate": 0.00019932773109243698, |
|
"loss": 1.2844, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.005004691898654989, |
|
"grad_norm": 2.312218189239502, |
|
"learning_rate": 0.00019899159663865548, |
|
"loss": 1.8112, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005630278385986863, |
|
"grad_norm": 3.5389914512634277, |
|
"learning_rate": 0.00019865546218487395, |
|
"loss": 2.0504, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.006255864873318737, |
|
"grad_norm": 2.913029432296753, |
|
"learning_rate": 0.00019831932773109245, |
|
"loss": 1.9101, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00688145136065061, |
|
"grad_norm": 3.6916606426239014, |
|
"learning_rate": 0.00019798319327731095, |
|
"loss": 1.9899, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.007507037847982484, |
|
"grad_norm": 3.002810478210449, |
|
"learning_rate": 0.00019764705882352942, |
|
"loss": 1.3224, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008132624335314358, |
|
"grad_norm": 1.657835602760315, |
|
"learning_rate": 0.00019731092436974792, |
|
"loss": 1.2208, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.008758210822646231, |
|
"grad_norm": 2.414161443710327, |
|
"learning_rate": 0.00019697478991596642, |
|
"loss": 1.5375, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.009383797309978105, |
|
"grad_norm": 1.9695100784301758, |
|
"learning_rate": 0.00019663865546218486, |
|
"loss": 0.9218, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010009383797309979, |
|
"grad_norm": 3.9755845069885254, |
|
"learning_rate": 0.00019630252100840336, |
|
"loss": 1.3608, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.010634970284641852, |
|
"grad_norm": 6.843455791473389, |
|
"learning_rate": 0.00019596638655462186, |
|
"loss": 1.2168, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.011260556771973726, |
|
"grad_norm": 3.8736443519592285, |
|
"learning_rate": 0.00019563025210084033, |
|
"loss": 0.7392, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0118861432593056, |
|
"grad_norm": 1.7369539737701416, |
|
"learning_rate": 0.00019529411764705883, |
|
"loss": 1.0495, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.012511729746637473, |
|
"grad_norm": 1.1708225011825562, |
|
"learning_rate": 0.0001949579831932773, |
|
"loss": 1.2266, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013137316233969347, |
|
"grad_norm": 1.4693603515625, |
|
"learning_rate": 0.0001946218487394958, |
|
"loss": 1.1364, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01376290272130122, |
|
"grad_norm": 0.8484959602355957, |
|
"learning_rate": 0.0001942857142857143, |
|
"loss": 0.6253, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.014388489208633094, |
|
"grad_norm": 2.7237887382507324, |
|
"learning_rate": 0.00019394957983193278, |
|
"loss": 1.2932, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.015014075695964968, |
|
"grad_norm": 1.1654947996139526, |
|
"learning_rate": 0.00019361344537815127, |
|
"loss": 0.5659, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01563966218329684, |
|
"grad_norm": 1.7193485498428345, |
|
"learning_rate": 0.00019327731092436975, |
|
"loss": 1.3627, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01563966218329684, |
|
"eval_loss": 1.0974555015563965, |
|
"eval_runtime": 46.8133, |
|
"eval_samples_per_second": 5.469, |
|
"eval_steps_per_second": 2.734, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.016265248670628715, |
|
"grad_norm": 2.883988380432129, |
|
"learning_rate": 0.00019294117647058825, |
|
"loss": 0.6257, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01689083515796059, |
|
"grad_norm": 1.4707483053207397, |
|
"learning_rate": 0.00019260504201680674, |
|
"loss": 0.879, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.017516421645292463, |
|
"grad_norm": 1.3346422910690308, |
|
"learning_rate": 0.00019226890756302522, |
|
"loss": 1.0058, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.018142008132624336, |
|
"grad_norm": 0.5815519094467163, |
|
"learning_rate": 0.00019193277310924372, |
|
"loss": 0.3475, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.01876759461995621, |
|
"grad_norm": 0.8800593018531799, |
|
"learning_rate": 0.00019159663865546221, |
|
"loss": 0.5426, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019393181107288084, |
|
"grad_norm": 8.196944236755371, |
|
"learning_rate": 0.0001912605042016807, |
|
"loss": 1.0088, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.020018767594619957, |
|
"grad_norm": 3.264193296432495, |
|
"learning_rate": 0.00019092436974789919, |
|
"loss": 0.9319, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02064435408195183, |
|
"grad_norm": 1.1047834157943726, |
|
"learning_rate": 0.00019058823529411766, |
|
"loss": 0.9262, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.021269940569283705, |
|
"grad_norm": 1.982783555984497, |
|
"learning_rate": 0.00019025210084033613, |
|
"loss": 1.2904, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.021895527056615578, |
|
"grad_norm": 2.6765289306640625, |
|
"learning_rate": 0.00018991596638655463, |
|
"loss": 1.0785, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.022521113543947452, |
|
"grad_norm": 4.674818992614746, |
|
"learning_rate": 0.0001895798319327731, |
|
"loss": 0.9822, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.023146700031279326, |
|
"grad_norm": 1.6232353448867798, |
|
"learning_rate": 0.0001892436974789916, |
|
"loss": 0.6441, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0237722865186112, |
|
"grad_norm": 2.623237371444702, |
|
"learning_rate": 0.0001889075630252101, |
|
"loss": 0.8874, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.024397873005943073, |
|
"grad_norm": 1.4366761445999146, |
|
"learning_rate": 0.00018857142857142857, |
|
"loss": 0.4596, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.025023459493274947, |
|
"grad_norm": 1.8809682130813599, |
|
"learning_rate": 0.00018823529411764707, |
|
"loss": 0.887, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02564904598060682, |
|
"grad_norm": 1.081438660621643, |
|
"learning_rate": 0.00018789915966386554, |
|
"loss": 0.4735, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.026274632467938694, |
|
"grad_norm": 2.1302649974823, |
|
"learning_rate": 0.00018756302521008404, |
|
"loss": 0.795, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.026900218955270568, |
|
"grad_norm": 2.005425453186035, |
|
"learning_rate": 0.00018722689075630254, |
|
"loss": 0.8891, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.02752580544260244, |
|
"grad_norm": 1.7256505489349365, |
|
"learning_rate": 0.000186890756302521, |
|
"loss": 0.5993, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.028151391929934315, |
|
"grad_norm": 0.927653968334198, |
|
"learning_rate": 0.0001865546218487395, |
|
"loss": 0.6185, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02877697841726619, |
|
"grad_norm": 1.5710850954055786, |
|
"learning_rate": 0.000186218487394958, |
|
"loss": 0.5258, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.029402564904598062, |
|
"grad_norm": 1.8794296979904175, |
|
"learning_rate": 0.00018588235294117648, |
|
"loss": 0.8168, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.030028151391929936, |
|
"grad_norm": 0.9695333242416382, |
|
"learning_rate": 0.00018554621848739498, |
|
"loss": 0.5458, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.03065373787926181, |
|
"grad_norm": 3.7846665382385254, |
|
"learning_rate": 0.00018521008403361345, |
|
"loss": 0.943, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03127932436659368, |
|
"grad_norm": 1.9213052988052368, |
|
"learning_rate": 0.00018487394957983195, |
|
"loss": 0.5069, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03127932436659368, |
|
"eval_loss": 0.9765783548355103, |
|
"eval_runtime": 43.502, |
|
"eval_samples_per_second": 5.885, |
|
"eval_steps_per_second": 2.942, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03190491085392556, |
|
"grad_norm": 2.0580382347106934, |
|
"learning_rate": 0.00018453781512605045, |
|
"loss": 0.9423, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03253049734125743, |
|
"grad_norm": 2.063591957092285, |
|
"learning_rate": 0.0001842016806722689, |
|
"loss": 0.7054, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.033156083828589304, |
|
"grad_norm": 1.2656595706939697, |
|
"learning_rate": 0.0001838655462184874, |
|
"loss": 0.401, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03378167031592118, |
|
"grad_norm": 1.2392399311065674, |
|
"learning_rate": 0.0001835294117647059, |
|
"loss": 0.6077, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03440725680325305, |
|
"grad_norm": 0.99504154920578, |
|
"learning_rate": 0.00018319327731092437, |
|
"loss": 0.6313, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.035032843290584925, |
|
"grad_norm": 2.0478012561798096, |
|
"learning_rate": 0.00018285714285714286, |
|
"loss": 1.2652, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0356584297779168, |
|
"grad_norm": 0.9636131525039673, |
|
"learning_rate": 0.00018252100840336134, |
|
"loss": 0.7561, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03628401626524867, |
|
"grad_norm": 0.874576210975647, |
|
"learning_rate": 0.00018218487394957984, |
|
"loss": 0.7461, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.036909602752580546, |
|
"grad_norm": 1.3745896816253662, |
|
"learning_rate": 0.00018184873949579833, |
|
"loss": 1.2856, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03753518923991242, |
|
"grad_norm": 2.4839162826538086, |
|
"learning_rate": 0.0001815126050420168, |
|
"loss": 1.0574, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.038160775727244294, |
|
"grad_norm": 1.2671383619308472, |
|
"learning_rate": 0.0001811764705882353, |
|
"loss": 0.6177, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.03878636221457617, |
|
"grad_norm": 1.1862553358078003, |
|
"learning_rate": 0.0001808403361344538, |
|
"loss": 1.1169, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.03941194870190804, |
|
"grad_norm": 1.1347297430038452, |
|
"learning_rate": 0.00018050420168067228, |
|
"loss": 1.3303, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.040037535189239915, |
|
"grad_norm": 2.1583523750305176, |
|
"learning_rate": 0.00018016806722689078, |
|
"loss": 0.7941, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04066312167657179, |
|
"grad_norm": 1.2432655096054077, |
|
"learning_rate": 0.00017983193277310925, |
|
"loss": 0.7848, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04128870816390366, |
|
"grad_norm": 1.3345468044281006, |
|
"learning_rate": 0.00017949579831932775, |
|
"loss": 0.8953, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.041914294651235535, |
|
"grad_norm": 0.6861767768859863, |
|
"learning_rate": 0.00017915966386554625, |
|
"loss": 0.4162, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.04253988113856741, |
|
"grad_norm": 0.85309898853302, |
|
"learning_rate": 0.00017882352941176472, |
|
"loss": 0.6606, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.04316546762589928, |
|
"grad_norm": 1.0247780084609985, |
|
"learning_rate": 0.00017848739495798322, |
|
"loss": 0.5271, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.043791054113231156, |
|
"grad_norm": 1.3019441366195679, |
|
"learning_rate": 0.0001781512605042017, |
|
"loss": 0.5605, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04441664060056303, |
|
"grad_norm": 1.1024900674819946, |
|
"learning_rate": 0.00017781512605042016, |
|
"loss": 0.9303, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.045042227087894904, |
|
"grad_norm": 1.079655408859253, |
|
"learning_rate": 0.00017747899159663866, |
|
"loss": 1.0138, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04566781357522678, |
|
"grad_norm": 1.1078468561172485, |
|
"learning_rate": 0.00017714285714285713, |
|
"loss": 0.9861, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04629340006255865, |
|
"grad_norm": 1.8648931980133057, |
|
"learning_rate": 0.00017680672268907563, |
|
"loss": 0.6756, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.046918986549890525, |
|
"grad_norm": 0.8588104248046875, |
|
"learning_rate": 0.00017647058823529413, |
|
"loss": 0.4867, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.046918986549890525, |
|
"eval_loss": 0.9139823913574219, |
|
"eval_runtime": 43.5635, |
|
"eval_samples_per_second": 5.876, |
|
"eval_steps_per_second": 2.938, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0475445730372224, |
|
"grad_norm": 1.6970480680465698, |
|
"learning_rate": 0.0001761344537815126, |
|
"loss": 0.5523, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04817015952455427, |
|
"grad_norm": 0.8562026023864746, |
|
"learning_rate": 0.0001757983193277311, |
|
"loss": 0.4084, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.048795746011886146, |
|
"grad_norm": 0.9487925171852112, |
|
"learning_rate": 0.0001754621848739496, |
|
"loss": 0.6204, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04942133249921802, |
|
"grad_norm": 11.929024696350098, |
|
"learning_rate": 0.00017512605042016807, |
|
"loss": 1.1662, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.05004691898654989, |
|
"grad_norm": 1.3468140363693237, |
|
"learning_rate": 0.00017478991596638657, |
|
"loss": 0.8037, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05067250547388177, |
|
"grad_norm": 0.7379503846168518, |
|
"learning_rate": 0.00017445378151260504, |
|
"loss": 0.6564, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.05129809196121364, |
|
"grad_norm": 1.0315027236938477, |
|
"learning_rate": 0.00017411764705882354, |
|
"loss": 0.6377, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.051923678448545514, |
|
"grad_norm": 0.5900093913078308, |
|
"learning_rate": 0.00017378151260504204, |
|
"loss": 0.5122, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.05254926493587739, |
|
"grad_norm": 1.5138239860534668, |
|
"learning_rate": 0.0001734453781512605, |
|
"loss": 0.4769, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.05317485142320926, |
|
"grad_norm": 1.016790747642517, |
|
"learning_rate": 0.000173109243697479, |
|
"loss": 0.6654, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.053800437910541135, |
|
"grad_norm": 1.1964718103408813, |
|
"learning_rate": 0.00017277310924369748, |
|
"loss": 0.6334, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05442602439787301, |
|
"grad_norm": 1.102842092514038, |
|
"learning_rate": 0.00017243697478991598, |
|
"loss": 0.832, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.05505161088520488, |
|
"grad_norm": 6.609305381774902, |
|
"learning_rate": 0.00017210084033613448, |
|
"loss": 0.6112, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.055677197372536756, |
|
"grad_norm": 2.6627745628356934, |
|
"learning_rate": 0.00017176470588235293, |
|
"loss": 1.032, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05630278385986863, |
|
"grad_norm": 2.114955425262451, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.6116, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0569283703472005, |
|
"grad_norm": 1.7707552909851074, |
|
"learning_rate": 0.00017109243697478992, |
|
"loss": 0.4766, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05755395683453238, |
|
"grad_norm": 0.9983264803886414, |
|
"learning_rate": 0.0001707563025210084, |
|
"loss": 0.5397, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.05817954332186425, |
|
"grad_norm": 8.190524101257324, |
|
"learning_rate": 0.0001704201680672269, |
|
"loss": 0.9531, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.058805129809196124, |
|
"grad_norm": 1.9920661449432373, |
|
"learning_rate": 0.0001700840336134454, |
|
"loss": 1.3801, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.059430716296528, |
|
"grad_norm": 0.8791856169700623, |
|
"learning_rate": 0.00016974789915966387, |
|
"loss": 0.6218, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06005630278385987, |
|
"grad_norm": 1.0745537281036377, |
|
"learning_rate": 0.00016941176470588237, |
|
"loss": 0.5578, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.060681889271191745, |
|
"grad_norm": 1.4266705513000488, |
|
"learning_rate": 0.00016907563025210084, |
|
"loss": 1.5821, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.06130747575852362, |
|
"grad_norm": 1.1001832485198975, |
|
"learning_rate": 0.00016873949579831934, |
|
"loss": 0.5972, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.06193306224585549, |
|
"grad_norm": 1.3168463706970215, |
|
"learning_rate": 0.00016840336134453784, |
|
"loss": 0.5794, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.06255864873318737, |
|
"grad_norm": 1.0342196226119995, |
|
"learning_rate": 0.0001680672268907563, |
|
"loss": 0.6827, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06255864873318737, |
|
"eval_loss": 0.8885337114334106, |
|
"eval_runtime": 43.4886, |
|
"eval_samples_per_second": 5.887, |
|
"eval_steps_per_second": 2.943, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06318423522051923, |
|
"grad_norm": 2.2497031688690186, |
|
"learning_rate": 0.0001677310924369748, |
|
"loss": 0.6468, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.06380982170785111, |
|
"grad_norm": 0.8061516284942627, |
|
"learning_rate": 0.00016739495798319328, |
|
"loss": 0.5388, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.06443540819518298, |
|
"grad_norm": 0.6954531669616699, |
|
"learning_rate": 0.00016705882352941178, |
|
"loss": 0.3191, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.06506099468251486, |
|
"grad_norm": 1.3721911907196045, |
|
"learning_rate": 0.00016672268907563028, |
|
"loss": 0.9, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06568658116984673, |
|
"grad_norm": 1.084492564201355, |
|
"learning_rate": 0.00016638655462184875, |
|
"loss": 0.6144, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06631216765717861, |
|
"grad_norm": 3.317697525024414, |
|
"learning_rate": 0.00016605042016806725, |
|
"loss": 0.634, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06693775414451048, |
|
"grad_norm": 2.5598530769348145, |
|
"learning_rate": 0.00016571428571428575, |
|
"loss": 0.8931, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06756334063184236, |
|
"grad_norm": 3.6414177417755127, |
|
"learning_rate": 0.0001653781512605042, |
|
"loss": 0.7226, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06818892711917422, |
|
"grad_norm": 2.2443768978118896, |
|
"learning_rate": 0.0001650420168067227, |
|
"loss": 0.8862, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0688145136065061, |
|
"grad_norm": 0.6285691857337952, |
|
"learning_rate": 0.0001647058823529412, |
|
"loss": 0.3766, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06944010009383797, |
|
"grad_norm": 0.6171959042549133, |
|
"learning_rate": 0.00016436974789915966, |
|
"loss": 0.2821, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.07006568658116985, |
|
"grad_norm": 1.0057804584503174, |
|
"learning_rate": 0.00016403361344537816, |
|
"loss": 0.6293, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.07069127306850172, |
|
"grad_norm": 1.3190034627914429, |
|
"learning_rate": 0.00016369747899159663, |
|
"loss": 0.5547, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0713168595558336, |
|
"grad_norm": 0.518517017364502, |
|
"learning_rate": 0.00016336134453781513, |
|
"loss": 0.1951, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.07194244604316546, |
|
"grad_norm": 0.848175048828125, |
|
"learning_rate": 0.00016302521008403363, |
|
"loss": 0.5091, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07256803253049735, |
|
"grad_norm": 0.7387409806251526, |
|
"learning_rate": 0.0001626890756302521, |
|
"loss": 0.3872, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.07319361901782921, |
|
"grad_norm": 2.828091859817505, |
|
"learning_rate": 0.0001623529411764706, |
|
"loss": 1.2046, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.07381920550516109, |
|
"grad_norm": 1.7653822898864746, |
|
"learning_rate": 0.00016201680672268907, |
|
"loss": 1.8133, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07444479199249296, |
|
"grad_norm": 3.5097360610961914, |
|
"learning_rate": 0.00016168067226890757, |
|
"loss": 0.6837, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.07507037847982484, |
|
"grad_norm": 1.3884797096252441, |
|
"learning_rate": 0.00016134453781512607, |
|
"loss": 0.8846, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0756959649671567, |
|
"grad_norm": 22.705190658569336, |
|
"learning_rate": 0.00016100840336134454, |
|
"loss": 0.7281, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.07632155145448859, |
|
"grad_norm": 3.1223599910736084, |
|
"learning_rate": 0.00016067226890756304, |
|
"loss": 0.6254, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07694713794182045, |
|
"grad_norm": 0.530583381652832, |
|
"learning_rate": 0.00016033613445378154, |
|
"loss": 0.3292, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07757272442915233, |
|
"grad_norm": 1.4720183610916138, |
|
"learning_rate": 0.00016, |
|
"loss": 0.8192, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0781983109164842, |
|
"grad_norm": 0.6448870301246643, |
|
"learning_rate": 0.0001596638655462185, |
|
"loss": 0.2431, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0781983109164842, |
|
"eval_loss": 0.890012800693512, |
|
"eval_runtime": 43.5059, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07882389740381608, |
|
"grad_norm": 1.803906798362732, |
|
"learning_rate": 0.00015932773109243698, |
|
"loss": 0.8937, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07944948389114795, |
|
"grad_norm": 2.2447054386138916, |
|
"learning_rate": 0.00015899159663865546, |
|
"loss": 0.6993, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.08007507037847983, |
|
"grad_norm": 0.6667381525039673, |
|
"learning_rate": 0.00015865546218487396, |
|
"loss": 0.4266, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0807006568658117, |
|
"grad_norm": 1.1449408531188965, |
|
"learning_rate": 0.00015831932773109243, |
|
"loss": 0.5557, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.08132624335314358, |
|
"grad_norm": 1.399849534034729, |
|
"learning_rate": 0.00015798319327731093, |
|
"loss": 0.6761, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08195182984047544, |
|
"grad_norm": 0.745627760887146, |
|
"learning_rate": 0.00015764705882352943, |
|
"loss": 0.5323, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.08257741632780732, |
|
"grad_norm": 1.162428379058838, |
|
"learning_rate": 0.0001573109243697479, |
|
"loss": 0.8231, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.08320300281513919, |
|
"grad_norm": 1.0329734086990356, |
|
"learning_rate": 0.0001569747899159664, |
|
"loss": 0.6179, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.08382858930247107, |
|
"grad_norm": 0.5739912986755371, |
|
"learning_rate": 0.00015663865546218487, |
|
"loss": 0.2515, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.08445417578980294, |
|
"grad_norm": 1.2065409421920776, |
|
"learning_rate": 0.00015630252100840337, |
|
"loss": 0.6161, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08507976227713482, |
|
"grad_norm": 1.1025582551956177, |
|
"learning_rate": 0.00015596638655462187, |
|
"loss": 0.5926, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08570534876446669, |
|
"grad_norm": 0.78680020570755, |
|
"learning_rate": 0.00015563025210084034, |
|
"loss": 0.9987, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.08633093525179857, |
|
"grad_norm": 0.6232782006263733, |
|
"learning_rate": 0.00015529411764705884, |
|
"loss": 0.4952, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 3.347989559173584, |
|
"learning_rate": 0.00015495798319327734, |
|
"loss": 1.0787, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.08758210822646231, |
|
"grad_norm": 0.9020625352859497, |
|
"learning_rate": 0.0001546218487394958, |
|
"loss": 0.354, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08820769471379418, |
|
"grad_norm": 1.8955539464950562, |
|
"learning_rate": 0.0001542857142857143, |
|
"loss": 0.5515, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08883328120112606, |
|
"grad_norm": 5.194116115570068, |
|
"learning_rate": 0.00015394957983193278, |
|
"loss": 0.6843, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08945886768845793, |
|
"grad_norm": 1.4467953443527222, |
|
"learning_rate": 0.00015361344537815128, |
|
"loss": 0.4236, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.09008445417578981, |
|
"grad_norm": 0.523921012878418, |
|
"learning_rate": 0.00015327731092436978, |
|
"loss": 0.2165, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.09071004066312167, |
|
"grad_norm": 1.653648018836975, |
|
"learning_rate": 0.00015294117647058822, |
|
"loss": 1.0643, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.09133562715045355, |
|
"grad_norm": 0.6991509199142456, |
|
"learning_rate": 0.00015260504201680672, |
|
"loss": 0.4398, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.09196121363778542, |
|
"grad_norm": 1.3986660242080688, |
|
"learning_rate": 0.00015226890756302522, |
|
"loss": 0.8488, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.0925868001251173, |
|
"grad_norm": 1.2424954175949097, |
|
"learning_rate": 0.0001519327731092437, |
|
"loss": 0.9516, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.09321238661244917, |
|
"grad_norm": 0.8900560140609741, |
|
"learning_rate": 0.0001515966386554622, |
|
"loss": 0.767, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.09383797309978105, |
|
"grad_norm": 40.042503356933594, |
|
"learning_rate": 0.00015126050420168066, |
|
"loss": 0.9691, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09383797309978105, |
|
"eval_loss": 0.8660734295845032, |
|
"eval_runtime": 43.5102, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09446355958711292, |
|
"grad_norm": 2.816359519958496, |
|
"learning_rate": 0.00015092436974789916, |
|
"loss": 1.4959, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0950891460744448, |
|
"grad_norm": 1.9332157373428345, |
|
"learning_rate": 0.00015058823529411766, |
|
"loss": 0.6786, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09571473256177666, |
|
"grad_norm": 1.2608965635299683, |
|
"learning_rate": 0.00015025210084033613, |
|
"loss": 1.1282, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.09634031904910854, |
|
"grad_norm": 1.0167793035507202, |
|
"learning_rate": 0.00014991596638655463, |
|
"loss": 0.4932, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.09696590553644041, |
|
"grad_norm": 1.6121408939361572, |
|
"learning_rate": 0.00014957983193277313, |
|
"loss": 0.7193, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09759149202377229, |
|
"grad_norm": 2.4104394912719727, |
|
"learning_rate": 0.0001492436974789916, |
|
"loss": 0.4472, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.09821707851110416, |
|
"grad_norm": 1.1095707416534424, |
|
"learning_rate": 0.0001489075630252101, |
|
"loss": 0.7595, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.09884266499843604, |
|
"grad_norm": 1.686458945274353, |
|
"learning_rate": 0.00014857142857142857, |
|
"loss": 0.5686, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.0994682514857679, |
|
"grad_norm": 3.2238378524780273, |
|
"learning_rate": 0.00014823529411764707, |
|
"loss": 0.4236, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.10009383797309979, |
|
"grad_norm": 1.800552248954773, |
|
"learning_rate": 0.00014789915966386557, |
|
"loss": 0.9519, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.10071942446043165, |
|
"grad_norm": 0.6441445350646973, |
|
"learning_rate": 0.00014756302521008404, |
|
"loss": 0.4119, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.10134501094776353, |
|
"grad_norm": 0.5892903804779053, |
|
"learning_rate": 0.00014722689075630254, |
|
"loss": 0.2956, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.1019705974350954, |
|
"grad_norm": 0.8733301758766174, |
|
"learning_rate": 0.00014689075630252101, |
|
"loss": 0.5749, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.10259618392242728, |
|
"grad_norm": 1.0460662841796875, |
|
"learning_rate": 0.0001465546218487395, |
|
"loss": 0.8167, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.10322177040975915, |
|
"grad_norm": 0.8178017735481262, |
|
"learning_rate": 0.00014621848739495799, |
|
"loss": 0.9027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.10384735689709103, |
|
"grad_norm": 0.5698068737983704, |
|
"learning_rate": 0.00014588235294117646, |
|
"loss": 0.1829, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.1044729433844229, |
|
"grad_norm": 1.0011018514633179, |
|
"learning_rate": 0.00014554621848739496, |
|
"loss": 0.8985, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.10509852987175478, |
|
"grad_norm": 1.189772367477417, |
|
"learning_rate": 0.00014521008403361346, |
|
"loss": 0.5547, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.10572411635908664, |
|
"grad_norm": 0.7990069389343262, |
|
"learning_rate": 0.00014487394957983193, |
|
"loss": 0.6222, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.10634970284641852, |
|
"grad_norm": 0.6419771313667297, |
|
"learning_rate": 0.00014453781512605043, |
|
"loss": 0.3225, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10697528933375039, |
|
"grad_norm": 0.8978354930877686, |
|
"learning_rate": 0.00014420168067226893, |
|
"loss": 0.4567, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.10760087582108227, |
|
"grad_norm": 0.7193794250488281, |
|
"learning_rate": 0.0001438655462184874, |
|
"loss": 0.4793, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.10822646230841414, |
|
"grad_norm": 0.9533759355545044, |
|
"learning_rate": 0.0001435294117647059, |
|
"loss": 1.4397, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.10885204879574602, |
|
"grad_norm": 0.48348739743232727, |
|
"learning_rate": 0.00014319327731092437, |
|
"loss": 0.3398, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10947763528307788, |
|
"grad_norm": 0.7699019312858582, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.7491, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.10947763528307788, |
|
"eval_loss": 0.8425782322883606, |
|
"eval_runtime": 43.5013, |
|
"eval_samples_per_second": 5.885, |
|
"eval_steps_per_second": 2.942, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.11010322177040976, |
|
"grad_norm": 0.9201186895370483, |
|
"learning_rate": 0.00014252100840336137, |
|
"loss": 0.6919, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.11072880825774163, |
|
"grad_norm": 0.8190593123435974, |
|
"learning_rate": 0.00014218487394957984, |
|
"loss": 0.6262, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.11135439474507351, |
|
"grad_norm": 0.9715782403945923, |
|
"learning_rate": 0.00014184873949579834, |
|
"loss": 0.8364, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.11197998123240538, |
|
"grad_norm": 0.6699782609939575, |
|
"learning_rate": 0.0001415126050420168, |
|
"loss": 0.4898, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.11260556771973726, |
|
"grad_norm": 1.8386518955230713, |
|
"learning_rate": 0.0001411764705882353, |
|
"loss": 0.7812, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11323115420706913, |
|
"grad_norm": 0.7240263819694519, |
|
"learning_rate": 0.0001408403361344538, |
|
"loss": 0.5508, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.113856740694401, |
|
"grad_norm": 0.6068630814552307, |
|
"learning_rate": 0.00014050420168067225, |
|
"loss": 0.5151, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.11448232718173287, |
|
"grad_norm": 1.6705517768859863, |
|
"learning_rate": 0.00014016806722689075, |
|
"loss": 1.2281, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.11510791366906475, |
|
"grad_norm": 1.6179956197738647, |
|
"learning_rate": 0.00013983193277310925, |
|
"loss": 0.7365, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.11573350015639662, |
|
"grad_norm": 1.5741758346557617, |
|
"learning_rate": 0.00013949579831932772, |
|
"loss": 1.0039, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1163590866437285, |
|
"grad_norm": 0.9270511865615845, |
|
"learning_rate": 0.00013915966386554622, |
|
"loss": 0.5768, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.11698467313106037, |
|
"grad_norm": 1.3651914596557617, |
|
"learning_rate": 0.00013882352941176472, |
|
"loss": 0.7715, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.11761025961839225, |
|
"grad_norm": 1.4330601692199707, |
|
"learning_rate": 0.0001384873949579832, |
|
"loss": 0.4462, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11823584610572412, |
|
"grad_norm": 0.9181672930717468, |
|
"learning_rate": 0.0001381512605042017, |
|
"loss": 0.3901, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.118861432593056, |
|
"grad_norm": 0.5304622650146484, |
|
"learning_rate": 0.00013781512605042016, |
|
"loss": 0.1718, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11948701908038786, |
|
"grad_norm": 0.7475191354751587, |
|
"learning_rate": 0.00013747899159663866, |
|
"loss": 0.3602, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.12011260556771974, |
|
"grad_norm": 1.2558002471923828, |
|
"learning_rate": 0.00013714285714285716, |
|
"loss": 0.8558, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.12073819205505161, |
|
"grad_norm": 0.9859037399291992, |
|
"learning_rate": 0.00013680672268907563, |
|
"loss": 0.7155, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.12136377854238349, |
|
"grad_norm": 0.6028466820716858, |
|
"learning_rate": 0.00013647058823529413, |
|
"loss": 0.9596, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.12198936502971536, |
|
"grad_norm": 0.5713469386100769, |
|
"learning_rate": 0.0001361344537815126, |
|
"loss": 0.3442, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.12261495151704724, |
|
"grad_norm": 1.0781211853027344, |
|
"learning_rate": 0.0001357983193277311, |
|
"loss": 0.5569, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.1232405380043791, |
|
"grad_norm": 0.7850176095962524, |
|
"learning_rate": 0.0001354621848739496, |
|
"loss": 0.5853, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.12386612449171099, |
|
"grad_norm": 0.8100555539131165, |
|
"learning_rate": 0.00013512605042016807, |
|
"loss": 0.8285, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.12449171097904285, |
|
"grad_norm": 1.106834888458252, |
|
"learning_rate": 0.00013478991596638657, |
|
"loss": 0.9521, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.12511729746637473, |
|
"grad_norm": 1.4412230253219604, |
|
"learning_rate": 0.00013445378151260507, |
|
"loss": 0.6478, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12511729746637473, |
|
"eval_loss": 0.8300326466560364, |
|
"eval_runtime": 43.5102, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1257428839537066, |
|
"grad_norm": 1.7852795124053955, |
|
"learning_rate": 0.00013411764705882352, |
|
"loss": 0.5687, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.12636847044103847, |
|
"grad_norm": 2.423583745956421, |
|
"learning_rate": 0.00013378151260504202, |
|
"loss": 0.9082, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.12699405692837035, |
|
"grad_norm": 1.538001298904419, |
|
"learning_rate": 0.00013344537815126052, |
|
"loss": 0.7143, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.12761964341570223, |
|
"grad_norm": 1.7380592823028564, |
|
"learning_rate": 0.000133109243697479, |
|
"loss": 0.8296, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.1282452299030341, |
|
"grad_norm": 0.8279218673706055, |
|
"learning_rate": 0.0001327731092436975, |
|
"loss": 0.6719, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.12887081639036596, |
|
"grad_norm": 0.7059926986694336, |
|
"learning_rate": 0.00013243697478991596, |
|
"loss": 0.4785, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.12949640287769784, |
|
"grad_norm": 0.6946935653686523, |
|
"learning_rate": 0.00013210084033613446, |
|
"loss": 0.4578, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.13012198936502972, |
|
"grad_norm": 0.9800712466239929, |
|
"learning_rate": 0.00013176470588235296, |
|
"loss": 1.4369, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.1307475758523616, |
|
"grad_norm": 0.708831787109375, |
|
"learning_rate": 0.00013142857142857143, |
|
"loss": 0.5071, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.13137316233969346, |
|
"grad_norm": 1.0098780393600464, |
|
"learning_rate": 0.00013109243697478993, |
|
"loss": 0.9155, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.13199874882702534, |
|
"grad_norm": 1.1598243713378906, |
|
"learning_rate": 0.0001307563025210084, |
|
"loss": 0.3757, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.13262433531435722, |
|
"grad_norm": 0.7583935260772705, |
|
"learning_rate": 0.0001304201680672269, |
|
"loss": 0.3365, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.1332499218016891, |
|
"grad_norm": 1.0866564512252808, |
|
"learning_rate": 0.0001300840336134454, |
|
"loss": 0.6398, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.13387550828902095, |
|
"grad_norm": 1.4322006702423096, |
|
"learning_rate": 0.00012974789915966387, |
|
"loss": 0.6427, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.13450109477635283, |
|
"grad_norm": 1.600325345993042, |
|
"learning_rate": 0.00012941176470588237, |
|
"loss": 0.6884, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1351266812636847, |
|
"grad_norm": 1.0634167194366455, |
|
"learning_rate": 0.00012907563025210087, |
|
"loss": 1.0343, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.13575226775101656, |
|
"grad_norm": 0.9889366626739502, |
|
"learning_rate": 0.00012873949579831934, |
|
"loss": 0.717, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.13637785423834844, |
|
"grad_norm": 2.0635392665863037, |
|
"learning_rate": 0.00012840336134453784, |
|
"loss": 0.5965, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.13700344072568033, |
|
"grad_norm": 0.8937773704528809, |
|
"learning_rate": 0.0001280672268907563, |
|
"loss": 0.7281, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.1376290272130122, |
|
"grad_norm": 0.9768427014350891, |
|
"learning_rate": 0.00012773109243697478, |
|
"loss": 0.5687, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13825461370034406, |
|
"grad_norm": 1.3913767337799072, |
|
"learning_rate": 0.00012739495798319328, |
|
"loss": 0.3984, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.13888020018767594, |
|
"grad_norm": 1.4933342933654785, |
|
"learning_rate": 0.00012705882352941175, |
|
"loss": 1.2441, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.13950578667500782, |
|
"grad_norm": 1.0846196413040161, |
|
"learning_rate": 0.00012672268907563025, |
|
"loss": 0.9013, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.1401313731623397, |
|
"grad_norm": 0.7788563370704651, |
|
"learning_rate": 0.00012638655462184875, |
|
"loss": 0.4674, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.14075695964967155, |
|
"grad_norm": 0.7341142296791077, |
|
"learning_rate": 0.00012605042016806722, |
|
"loss": 1.3271, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.14075695964967155, |
|
"eval_loss": 0.8179877996444702, |
|
"eval_runtime": 43.5514, |
|
"eval_samples_per_second": 5.878, |
|
"eval_steps_per_second": 2.939, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.14138254613700343, |
|
"grad_norm": 6.473598480224609, |
|
"learning_rate": 0.00012571428571428572, |
|
"loss": 0.6219, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.14200813262433531, |
|
"grad_norm": 0.9846400022506714, |
|
"learning_rate": 0.0001253781512605042, |
|
"loss": 0.4407, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.1426337191116672, |
|
"grad_norm": 0.7880604267120361, |
|
"learning_rate": 0.0001250420168067227, |
|
"loss": 0.3927, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.14325930559899905, |
|
"grad_norm": 1.5999399423599243, |
|
"learning_rate": 0.0001247058823529412, |
|
"loss": 0.6917, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.14388489208633093, |
|
"grad_norm": 0.8072729110717773, |
|
"learning_rate": 0.00012436974789915966, |
|
"loss": 0.4909, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1445104785736628, |
|
"grad_norm": 2.2560601234436035, |
|
"learning_rate": 0.00012403361344537816, |
|
"loss": 0.3355, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1451360650609947, |
|
"grad_norm": 0.9964832663536072, |
|
"learning_rate": 0.00012369747899159666, |
|
"loss": 0.4436, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.14576165154832654, |
|
"grad_norm": 1.1081007719039917, |
|
"learning_rate": 0.00012336134453781513, |
|
"loss": 0.6582, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.14638723803565842, |
|
"grad_norm": 0.9722908735275269, |
|
"learning_rate": 0.00012302521008403363, |
|
"loss": 0.7412, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.1470128245229903, |
|
"grad_norm": 0.7456592917442322, |
|
"learning_rate": 0.0001226890756302521, |
|
"loss": 0.4303, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.14763841101032218, |
|
"grad_norm": 1.0428457260131836, |
|
"learning_rate": 0.0001223529411764706, |
|
"loss": 1.0538, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.14826399749765404, |
|
"grad_norm": 0.9209719896316528, |
|
"learning_rate": 0.00012201680672268909, |
|
"loss": 0.5864, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.14888958398498592, |
|
"grad_norm": 0.990292489528656, |
|
"learning_rate": 0.00012168067226890756, |
|
"loss": 0.5929, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.1495151704723178, |
|
"grad_norm": 0.6086494326591492, |
|
"learning_rate": 0.00012134453781512605, |
|
"loss": 0.4436, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.15014075695964968, |
|
"grad_norm": 1.429149866104126, |
|
"learning_rate": 0.00012100840336134453, |
|
"loss": 0.246, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.15076634344698153, |
|
"grad_norm": 1.8170491456985474, |
|
"learning_rate": 0.00012067226890756302, |
|
"loss": 0.6574, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.1513919299343134, |
|
"grad_norm": 1.1577768325805664, |
|
"learning_rate": 0.00012033613445378152, |
|
"loss": 0.5706, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.1520175164216453, |
|
"grad_norm": 0.7442137598991394, |
|
"learning_rate": 0.00012, |
|
"loss": 0.2772, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.15264310290897717, |
|
"grad_norm": 1.1375997066497803, |
|
"learning_rate": 0.00011966386554621849, |
|
"loss": 0.397, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.15326868939630903, |
|
"grad_norm": 0.8451513648033142, |
|
"learning_rate": 0.00011932773109243697, |
|
"loss": 0.5425, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1538942758836409, |
|
"grad_norm": 0.7176560163497925, |
|
"learning_rate": 0.00011899159663865547, |
|
"loss": 0.4398, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1545198623709728, |
|
"grad_norm": 1.049872875213623, |
|
"learning_rate": 0.00011865546218487396, |
|
"loss": 0.6479, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.15514544885830467, |
|
"grad_norm": 0.6093642115592957, |
|
"learning_rate": 0.00011831932773109244, |
|
"loss": 0.6125, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.15577103534563652, |
|
"grad_norm": 0.9963379502296448, |
|
"learning_rate": 0.00011798319327731093, |
|
"loss": 0.3768, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.1563966218329684, |
|
"grad_norm": 3.4668896198272705, |
|
"learning_rate": 0.00011764705882352942, |
|
"loss": 0.3744, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1563966218329684, |
|
"eval_loss": 0.8456696271896362, |
|
"eval_runtime": 43.5223, |
|
"eval_samples_per_second": 5.882, |
|
"eval_steps_per_second": 2.941, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15702220832030028, |
|
"grad_norm": 0.6826130747795105, |
|
"learning_rate": 0.00011731092436974791, |
|
"loss": 0.4877, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.15764779480763216, |
|
"grad_norm": 1.8045300245285034, |
|
"learning_rate": 0.0001169747899159664, |
|
"loss": 0.9699, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.15827338129496402, |
|
"grad_norm": 0.7311923503875732, |
|
"learning_rate": 0.00011663865546218489, |
|
"loss": 0.4648, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.1588989677822959, |
|
"grad_norm": 1.7481943368911743, |
|
"learning_rate": 0.00011630252100840337, |
|
"loss": 0.8871, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.15952455426962778, |
|
"grad_norm": 2.6331326961517334, |
|
"learning_rate": 0.00011596638655462187, |
|
"loss": 0.8109, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.16015014075695966, |
|
"grad_norm": 0.899364709854126, |
|
"learning_rate": 0.00011563025210084036, |
|
"loss": 0.5021, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.1607757272442915, |
|
"grad_norm": 0.922218918800354, |
|
"learning_rate": 0.00011529411764705881, |
|
"loss": 0.5741, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.1614013137316234, |
|
"grad_norm": 5.335756301879883, |
|
"learning_rate": 0.00011495798319327731, |
|
"loss": 0.842, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.16202690021895527, |
|
"grad_norm": 0.8632665872573853, |
|
"learning_rate": 0.0001146218487394958, |
|
"loss": 0.4208, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.16265248670628715, |
|
"grad_norm": 4.576591968536377, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.8813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.163278073193619, |
|
"grad_norm": 0.907714307308197, |
|
"learning_rate": 0.00011394957983193277, |
|
"loss": 0.7204, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.16390365968095089, |
|
"grad_norm": 0.8328534960746765, |
|
"learning_rate": 0.00011361344537815127, |
|
"loss": 0.7552, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.16452924616828277, |
|
"grad_norm": 1.0882028341293335, |
|
"learning_rate": 0.00011327731092436975, |
|
"loss": 0.9079, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.16515483265561465, |
|
"grad_norm": 1.0093358755111694, |
|
"learning_rate": 0.00011294117647058824, |
|
"loss": 0.6284, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.1657804191429465, |
|
"grad_norm": 0.853907585144043, |
|
"learning_rate": 0.00011260504201680672, |
|
"loss": 0.508, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.16640600563027838, |
|
"grad_norm": 1.0016460418701172, |
|
"learning_rate": 0.00011226890756302521, |
|
"loss": 0.597, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.16703159211761026, |
|
"grad_norm": 1.0138968229293823, |
|
"learning_rate": 0.00011193277310924371, |
|
"loss": 0.9238, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.16765717860494214, |
|
"grad_norm": 1.1728049516677856, |
|
"learning_rate": 0.0001115966386554622, |
|
"loss": 0.9152, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.168282765092274, |
|
"grad_norm": 1.2228264808654785, |
|
"learning_rate": 0.00011126050420168068, |
|
"loss": 0.7483, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.16890835157960588, |
|
"grad_norm": 0.6260212659835815, |
|
"learning_rate": 0.00011092436974789917, |
|
"loss": 0.5566, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16953393806693776, |
|
"grad_norm": 0.7589625716209412, |
|
"learning_rate": 0.00011058823529411766, |
|
"loss": 0.6242, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.17015952455426964, |
|
"grad_norm": 1.1016935110092163, |
|
"learning_rate": 0.00011025210084033615, |
|
"loss": 0.4419, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.1707851110416015, |
|
"grad_norm": 0.8092851042747498, |
|
"learning_rate": 0.00010991596638655464, |
|
"loss": 0.5168, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.17141069752893337, |
|
"grad_norm": 1.012885332107544, |
|
"learning_rate": 0.00010957983193277312, |
|
"loss": 0.4334, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.17203628401626525, |
|
"grad_norm": 2.6073336601257324, |
|
"learning_rate": 0.00010924369747899159, |
|
"loss": 0.5262, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.17203628401626525, |
|
"eval_loss": 0.8115787506103516, |
|
"eval_runtime": 43.4931, |
|
"eval_samples_per_second": 5.886, |
|
"eval_steps_per_second": 2.943, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.17266187050359713, |
|
"grad_norm": 5.577237606048584, |
|
"learning_rate": 0.00010890756302521008, |
|
"loss": 1.0595, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.17328745699092898, |
|
"grad_norm": 1.1434190273284912, |
|
"learning_rate": 0.00010857142857142856, |
|
"loss": 0.4401, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.951992928981781, |
|
"learning_rate": 0.00010823529411764706, |
|
"loss": 0.4393, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.17453862996559275, |
|
"grad_norm": 0.6695138216018677, |
|
"learning_rate": 0.00010789915966386555, |
|
"loss": 0.314, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.17516421645292463, |
|
"grad_norm": 0.40990278124809265, |
|
"learning_rate": 0.00010756302521008403, |
|
"loss": 0.192, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.17578980294025648, |
|
"grad_norm": 0.9555610418319702, |
|
"learning_rate": 0.00010722689075630252, |
|
"loss": 0.3646, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.17641538942758836, |
|
"grad_norm": 0.7370548844337463, |
|
"learning_rate": 0.000106890756302521, |
|
"loss": 0.8997, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.17704097591492024, |
|
"grad_norm": 1.0178982019424438, |
|
"learning_rate": 0.0001065546218487395, |
|
"loss": 0.986, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.17766656240225212, |
|
"grad_norm": 0.41388389468193054, |
|
"learning_rate": 0.00010621848739495799, |
|
"loss": 0.2069, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.17829214888958397, |
|
"grad_norm": 0.7140624523162842, |
|
"learning_rate": 0.00010588235294117647, |
|
"loss": 0.4852, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.17891773537691585, |
|
"grad_norm": 0.7758356332778931, |
|
"learning_rate": 0.00010554621848739496, |
|
"loss": 0.3943, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.17954332186424773, |
|
"grad_norm": 1.4193260669708252, |
|
"learning_rate": 0.00010521008403361346, |
|
"loss": 0.6412, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.18016890835157962, |
|
"grad_norm": 0.7264838814735413, |
|
"learning_rate": 0.00010487394957983194, |
|
"loss": 0.7834, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.18079449483891147, |
|
"grad_norm": 2.4300973415374756, |
|
"learning_rate": 0.00010453781512605043, |
|
"loss": 0.7462, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.18142008132624335, |
|
"grad_norm": 1.033916711807251, |
|
"learning_rate": 0.00010420168067226892, |
|
"loss": 0.5241, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.18204566781357523, |
|
"grad_norm": 0.5583767294883728, |
|
"learning_rate": 0.00010386554621848741, |
|
"loss": 0.7815, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.1826712543009071, |
|
"grad_norm": 0.7440481781959534, |
|
"learning_rate": 0.0001035294117647059, |
|
"loss": 0.4674, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.18329684078823896, |
|
"grad_norm": 4.230656147003174, |
|
"learning_rate": 0.00010319327731092439, |
|
"loss": 0.5219, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.18392242727557084, |
|
"grad_norm": 0.6165269017219543, |
|
"learning_rate": 0.00010285714285714286, |
|
"loss": 0.3274, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.18454801376290272, |
|
"grad_norm": 0.5844498872756958, |
|
"learning_rate": 0.00010252100840336134, |
|
"loss": 0.3719, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.1851736002502346, |
|
"grad_norm": 0.9936206936836243, |
|
"learning_rate": 0.00010218487394957983, |
|
"loss": 1.0453, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.18579918673756646, |
|
"grad_norm": 1.749831199645996, |
|
"learning_rate": 0.00010184873949579831, |
|
"loss": 0.6634, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.18642477322489834, |
|
"grad_norm": 0.4740132689476013, |
|
"learning_rate": 0.0001015126050420168, |
|
"loss": 0.2901, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.18705035971223022, |
|
"grad_norm": 0.664300262928009, |
|
"learning_rate": 0.0001011764705882353, |
|
"loss": 0.5869, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.1876759461995621, |
|
"grad_norm": 0.7400941252708435, |
|
"learning_rate": 0.00010084033613445378, |
|
"loss": 0.7881, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1876759461995621, |
|
"eval_loss": 0.7877693772315979, |
|
"eval_runtime": 43.5162, |
|
"eval_samples_per_second": 5.883, |
|
"eval_steps_per_second": 2.941, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18830153268689395, |
|
"grad_norm": 0.6142858862876892, |
|
"learning_rate": 0.00010050420168067227, |
|
"loss": 0.3808, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.18892711917422583, |
|
"grad_norm": 1.991969347000122, |
|
"learning_rate": 0.00010016806722689076, |
|
"loss": 0.7035, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.1895527056615577, |
|
"grad_norm": 0.6220730543136597, |
|
"learning_rate": 9.983193277310925e-05, |
|
"loss": 0.2548, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.1901782921488896, |
|
"grad_norm": 0.6476833820343018, |
|
"learning_rate": 9.949579831932774e-05, |
|
"loss": 0.3569, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.19080387863622145, |
|
"grad_norm": 0.7133951783180237, |
|
"learning_rate": 9.915966386554623e-05, |
|
"loss": 0.4744, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.19142946512355333, |
|
"grad_norm": 0.6500736474990845, |
|
"learning_rate": 9.882352941176471e-05, |
|
"loss": 0.4653, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.1920550516108852, |
|
"grad_norm": 1.1231927871704102, |
|
"learning_rate": 9.848739495798321e-05, |
|
"loss": 0.818, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.1926806380982171, |
|
"grad_norm": 0.8654798865318298, |
|
"learning_rate": 9.815126050420168e-05, |
|
"loss": 0.7065, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.19330622458554894, |
|
"grad_norm": 0.45660969614982605, |
|
"learning_rate": 9.781512605042017e-05, |
|
"loss": 0.2412, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.19393181107288082, |
|
"grad_norm": 0.9538519978523254, |
|
"learning_rate": 9.747899159663865e-05, |
|
"loss": 1.3428, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1945573975602127, |
|
"grad_norm": 0.596633791923523, |
|
"learning_rate": 9.714285714285715e-05, |
|
"loss": 0.5119, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.19518298404754458, |
|
"grad_norm": 0.5247074365615845, |
|
"learning_rate": 9.680672268907564e-05, |
|
"loss": 0.6413, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.19580857053487644, |
|
"grad_norm": 0.7713050246238708, |
|
"learning_rate": 9.647058823529412e-05, |
|
"loss": 0.49, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.19643415702220832, |
|
"grad_norm": 0.6971513628959656, |
|
"learning_rate": 9.613445378151261e-05, |
|
"loss": 0.6505, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.1970597435095402, |
|
"grad_norm": 0.5454917550086975, |
|
"learning_rate": 9.579831932773111e-05, |
|
"loss": 0.7018, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.19768532999687208, |
|
"grad_norm": 0.8349499702453613, |
|
"learning_rate": 9.546218487394959e-05, |
|
"loss": 0.3179, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.19831091648420393, |
|
"grad_norm": 0.5682560801506042, |
|
"learning_rate": 9.512605042016806e-05, |
|
"loss": 0.4003, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.1989365029715358, |
|
"grad_norm": 0.5094739198684692, |
|
"learning_rate": 9.478991596638655e-05, |
|
"loss": 0.313, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.1995620894588677, |
|
"grad_norm": 1.7074236869812012, |
|
"learning_rate": 9.445378151260505e-05, |
|
"loss": 0.9912, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.20018767594619957, |
|
"grad_norm": 1.1477283239364624, |
|
"learning_rate": 9.411764705882353e-05, |
|
"loss": 0.851, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.20081326243353143, |
|
"grad_norm": 0.6616579294204712, |
|
"learning_rate": 9.378151260504202e-05, |
|
"loss": 0.4844, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.2014388489208633, |
|
"grad_norm": 1.0401920080184937, |
|
"learning_rate": 9.34453781512605e-05, |
|
"loss": 0.5421, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.2020644354081952, |
|
"grad_norm": 0.729664146900177, |
|
"learning_rate": 9.3109243697479e-05, |
|
"loss": 0.6632, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.20269002189552707, |
|
"grad_norm": 0.6752575635910034, |
|
"learning_rate": 9.277310924369749e-05, |
|
"loss": 0.4352, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.20331560838285892, |
|
"grad_norm": 0.7963948249816895, |
|
"learning_rate": 9.243697478991598e-05, |
|
"loss": 0.7614, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.20331560838285892, |
|
"eval_loss": 0.771190881729126, |
|
"eval_runtime": 43.551, |
|
"eval_samples_per_second": 5.878, |
|
"eval_steps_per_second": 2.939, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2039411948701908, |
|
"grad_norm": 0.7778791189193726, |
|
"learning_rate": 9.210084033613445e-05, |
|
"loss": 0.7251, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.20456678135752268, |
|
"grad_norm": 3.0929737091064453, |
|
"learning_rate": 9.176470588235295e-05, |
|
"loss": 0.5375, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.20519236784485456, |
|
"grad_norm": 0.6188391447067261, |
|
"learning_rate": 9.142857142857143e-05, |
|
"loss": 0.4007, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.20581795433218641, |
|
"grad_norm": 0.9423925876617432, |
|
"learning_rate": 9.109243697478992e-05, |
|
"loss": 0.5059, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.2064435408195183, |
|
"grad_norm": 0.506572425365448, |
|
"learning_rate": 9.07563025210084e-05, |
|
"loss": 0.2794, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.20706912730685018, |
|
"grad_norm": 1.7139545679092407, |
|
"learning_rate": 9.04201680672269e-05, |
|
"loss": 0.5984, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.20769471379418206, |
|
"grad_norm": 0.5540574789047241, |
|
"learning_rate": 9.008403361344539e-05, |
|
"loss": 0.323, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.2083203002815139, |
|
"grad_norm": 0.6909454464912415, |
|
"learning_rate": 8.974789915966387e-05, |
|
"loss": 0.5399, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.2089458867688458, |
|
"grad_norm": 0.7409022450447083, |
|
"learning_rate": 8.941176470588236e-05, |
|
"loss": 0.4251, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.20957147325617767, |
|
"grad_norm": 0.6636312007904053, |
|
"learning_rate": 8.907563025210084e-05, |
|
"loss": 0.4021, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.21019705974350955, |
|
"grad_norm": 0.5426271557807922, |
|
"learning_rate": 8.873949579831933e-05, |
|
"loss": 0.2095, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.2108226462308414, |
|
"grad_norm": 0.8870647549629211, |
|
"learning_rate": 8.840336134453782e-05, |
|
"loss": 0.5773, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.21144823271817328, |
|
"grad_norm": 0.5508524179458618, |
|
"learning_rate": 8.80672268907563e-05, |
|
"loss": 0.6744, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.21207381920550517, |
|
"grad_norm": 1.6577738523483276, |
|
"learning_rate": 8.77310924369748e-05, |
|
"loss": 1.1134, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.21269940569283705, |
|
"grad_norm": 3.218395233154297, |
|
"learning_rate": 8.739495798319329e-05, |
|
"loss": 0.5932, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2133249921801689, |
|
"grad_norm": 0.5119672417640686, |
|
"learning_rate": 8.705882352941177e-05, |
|
"loss": 0.1831, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.21395057866750078, |
|
"grad_norm": 0.4874535799026489, |
|
"learning_rate": 8.672268907563026e-05, |
|
"loss": 0.485, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.21457616515483266, |
|
"grad_norm": 0.6597093939781189, |
|
"learning_rate": 8.638655462184874e-05, |
|
"loss": 0.3588, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.21520175164216454, |
|
"grad_norm": 1.1764620542526245, |
|
"learning_rate": 8.605042016806724e-05, |
|
"loss": 1.8765, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.2158273381294964, |
|
"grad_norm": 0.6894935369491577, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.5355, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.21645292461682827, |
|
"grad_norm": 0.5896294116973877, |
|
"learning_rate": 8.53781512605042e-05, |
|
"loss": 0.494, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.21707851110416015, |
|
"grad_norm": 0.6212694048881531, |
|
"learning_rate": 8.50420168067227e-05, |
|
"loss": 0.5721, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.21770409759149204, |
|
"grad_norm": 0.5058571100234985, |
|
"learning_rate": 8.470588235294118e-05, |
|
"loss": 0.5051, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.2183296840788239, |
|
"grad_norm": 0.5089401006698608, |
|
"learning_rate": 8.436974789915967e-05, |
|
"loss": 0.3794, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.21895527056615577, |
|
"grad_norm": 6.416032314300537, |
|
"learning_rate": 8.403361344537815e-05, |
|
"loss": 0.5026, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21895527056615577, |
|
"eval_loss": 0.7647964954376221, |
|
"eval_runtime": 43.4854, |
|
"eval_samples_per_second": 5.887, |
|
"eval_steps_per_second": 2.944, |
|
"step": 350 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.5558583235916595e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|