|
{ |
|
"best_metric": 0.6811222434043884, |
|
"best_model_checkpoint": "outputs/checkpoint-550", |
|
"epoch": 0.3440725680325305, |
|
"eval_steps": 25, |
|
"global_step": 550, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006255864873318737, |
|
"grad_norm": 3.302262783050537, |
|
"learning_rate": 4e-05, |
|
"loss": 1.7639, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0012511729746637473, |
|
"grad_norm": 3.7728819847106934, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3471, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.001876759461995621, |
|
"grad_norm": 3.575211763381958, |
|
"learning_rate": 0.00012, |
|
"loss": 1.274, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0025023459493274947, |
|
"grad_norm": 4.3921918869018555, |
|
"learning_rate": 0.00016, |
|
"loss": 1.8361, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0031279324366593683, |
|
"grad_norm": 3.215696096420288, |
|
"learning_rate": 0.0002, |
|
"loss": 2.8766, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.003753518923991242, |
|
"grad_norm": 4.060017108917236, |
|
"learning_rate": 0.0001996638655462185, |
|
"loss": 1.4329, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004379105411323116, |
|
"grad_norm": 2.7935523986816406, |
|
"learning_rate": 0.00019932773109243698, |
|
"loss": 1.2844, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.005004691898654989, |
|
"grad_norm": 2.312218189239502, |
|
"learning_rate": 0.00019899159663865548, |
|
"loss": 1.8112, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005630278385986863, |
|
"grad_norm": 3.5389914512634277, |
|
"learning_rate": 0.00019865546218487395, |
|
"loss": 2.0504, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.006255864873318737, |
|
"grad_norm": 2.913029432296753, |
|
"learning_rate": 0.00019831932773109245, |
|
"loss": 1.9101, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00688145136065061, |
|
"grad_norm": 3.6916606426239014, |
|
"learning_rate": 0.00019798319327731095, |
|
"loss": 1.9899, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.007507037847982484, |
|
"grad_norm": 3.002810478210449, |
|
"learning_rate": 0.00019764705882352942, |
|
"loss": 1.3224, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008132624335314358, |
|
"grad_norm": 1.657835602760315, |
|
"learning_rate": 0.00019731092436974792, |
|
"loss": 1.2208, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.008758210822646231, |
|
"grad_norm": 2.414161443710327, |
|
"learning_rate": 0.00019697478991596642, |
|
"loss": 1.5375, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.009383797309978105, |
|
"grad_norm": 1.9695100784301758, |
|
"learning_rate": 0.00019663865546218486, |
|
"loss": 0.9218, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010009383797309979, |
|
"grad_norm": 3.9755845069885254, |
|
"learning_rate": 0.00019630252100840336, |
|
"loss": 1.3608, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.010634970284641852, |
|
"grad_norm": 6.843455791473389, |
|
"learning_rate": 0.00019596638655462186, |
|
"loss": 1.2168, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.011260556771973726, |
|
"grad_norm": 3.8736443519592285, |
|
"learning_rate": 0.00019563025210084033, |
|
"loss": 0.7392, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0118861432593056, |
|
"grad_norm": 1.7369539737701416, |
|
"learning_rate": 0.00019529411764705883, |
|
"loss": 1.0495, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.012511729746637473, |
|
"grad_norm": 1.1708225011825562, |
|
"learning_rate": 0.0001949579831932773, |
|
"loss": 1.2266, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013137316233969347, |
|
"grad_norm": 1.4693603515625, |
|
"learning_rate": 0.0001946218487394958, |
|
"loss": 1.1364, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01376290272130122, |
|
"grad_norm": 0.8484959602355957, |
|
"learning_rate": 0.0001942857142857143, |
|
"loss": 0.6253, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.014388489208633094, |
|
"grad_norm": 2.7237887382507324, |
|
"learning_rate": 0.00019394957983193278, |
|
"loss": 1.2932, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.015014075695964968, |
|
"grad_norm": 1.1654947996139526, |
|
"learning_rate": 0.00019361344537815127, |
|
"loss": 0.5659, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01563966218329684, |
|
"grad_norm": 1.7193485498428345, |
|
"learning_rate": 0.00019327731092436975, |
|
"loss": 1.3627, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01563966218329684, |
|
"eval_loss": 1.0974555015563965, |
|
"eval_runtime": 46.8133, |
|
"eval_samples_per_second": 5.469, |
|
"eval_steps_per_second": 2.734, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.016265248670628715, |
|
"grad_norm": 2.883988380432129, |
|
"learning_rate": 0.00019294117647058825, |
|
"loss": 0.6257, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01689083515796059, |
|
"grad_norm": 1.4707483053207397, |
|
"learning_rate": 0.00019260504201680674, |
|
"loss": 0.879, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.017516421645292463, |
|
"grad_norm": 1.3346422910690308, |
|
"learning_rate": 0.00019226890756302522, |
|
"loss": 1.0058, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.018142008132624336, |
|
"grad_norm": 0.5815519094467163, |
|
"learning_rate": 0.00019193277310924372, |
|
"loss": 0.3475, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.01876759461995621, |
|
"grad_norm": 0.8800593018531799, |
|
"learning_rate": 0.00019159663865546221, |
|
"loss": 0.5426, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019393181107288084, |
|
"grad_norm": 8.196944236755371, |
|
"learning_rate": 0.0001912605042016807, |
|
"loss": 1.0088, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.020018767594619957, |
|
"grad_norm": 3.264193296432495, |
|
"learning_rate": 0.00019092436974789919, |
|
"loss": 0.9319, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02064435408195183, |
|
"grad_norm": 1.1047834157943726, |
|
"learning_rate": 0.00019058823529411766, |
|
"loss": 0.9262, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.021269940569283705, |
|
"grad_norm": 1.982783555984497, |
|
"learning_rate": 0.00019025210084033613, |
|
"loss": 1.2904, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.021895527056615578, |
|
"grad_norm": 2.6765289306640625, |
|
"learning_rate": 0.00018991596638655463, |
|
"loss": 1.0785, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.022521113543947452, |
|
"grad_norm": 4.674818992614746, |
|
"learning_rate": 0.0001895798319327731, |
|
"loss": 0.9822, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.023146700031279326, |
|
"grad_norm": 1.6232353448867798, |
|
"learning_rate": 0.0001892436974789916, |
|
"loss": 0.6441, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0237722865186112, |
|
"grad_norm": 2.623237371444702, |
|
"learning_rate": 0.0001889075630252101, |
|
"loss": 0.8874, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.024397873005943073, |
|
"grad_norm": 1.4366761445999146, |
|
"learning_rate": 0.00018857142857142857, |
|
"loss": 0.4596, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.025023459493274947, |
|
"grad_norm": 1.8809682130813599, |
|
"learning_rate": 0.00018823529411764707, |
|
"loss": 0.887, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02564904598060682, |
|
"grad_norm": 1.081438660621643, |
|
"learning_rate": 0.00018789915966386554, |
|
"loss": 0.4735, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.026274632467938694, |
|
"grad_norm": 2.1302649974823, |
|
"learning_rate": 0.00018756302521008404, |
|
"loss": 0.795, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.026900218955270568, |
|
"grad_norm": 2.005425453186035, |
|
"learning_rate": 0.00018722689075630254, |
|
"loss": 0.8891, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.02752580544260244, |
|
"grad_norm": 1.7256505489349365, |
|
"learning_rate": 0.000186890756302521, |
|
"loss": 0.5993, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.028151391929934315, |
|
"grad_norm": 0.927653968334198, |
|
"learning_rate": 0.0001865546218487395, |
|
"loss": 0.6185, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02877697841726619, |
|
"grad_norm": 1.5710850954055786, |
|
"learning_rate": 0.000186218487394958, |
|
"loss": 0.5258, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.029402564904598062, |
|
"grad_norm": 1.8794296979904175, |
|
"learning_rate": 0.00018588235294117648, |
|
"loss": 0.8168, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.030028151391929936, |
|
"grad_norm": 0.9695333242416382, |
|
"learning_rate": 0.00018554621848739498, |
|
"loss": 0.5458, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.03065373787926181, |
|
"grad_norm": 3.7846665382385254, |
|
"learning_rate": 0.00018521008403361345, |
|
"loss": 0.943, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03127932436659368, |
|
"grad_norm": 1.9213052988052368, |
|
"learning_rate": 0.00018487394957983195, |
|
"loss": 0.5069, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03127932436659368, |
|
"eval_loss": 0.9765783548355103, |
|
"eval_runtime": 43.502, |
|
"eval_samples_per_second": 5.885, |
|
"eval_steps_per_second": 2.942, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03190491085392556, |
|
"grad_norm": 2.0580382347106934, |
|
"learning_rate": 0.00018453781512605045, |
|
"loss": 0.9423, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03253049734125743, |
|
"grad_norm": 2.063591957092285, |
|
"learning_rate": 0.0001842016806722689, |
|
"loss": 0.7054, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.033156083828589304, |
|
"grad_norm": 1.2656595706939697, |
|
"learning_rate": 0.0001838655462184874, |
|
"loss": 0.401, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03378167031592118, |
|
"grad_norm": 1.2392399311065674, |
|
"learning_rate": 0.0001835294117647059, |
|
"loss": 0.6077, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03440725680325305, |
|
"grad_norm": 0.99504154920578, |
|
"learning_rate": 0.00018319327731092437, |
|
"loss": 0.6313, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.035032843290584925, |
|
"grad_norm": 2.0478012561798096, |
|
"learning_rate": 0.00018285714285714286, |
|
"loss": 1.2652, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0356584297779168, |
|
"grad_norm": 0.9636131525039673, |
|
"learning_rate": 0.00018252100840336134, |
|
"loss": 0.7561, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03628401626524867, |
|
"grad_norm": 0.874576210975647, |
|
"learning_rate": 0.00018218487394957984, |
|
"loss": 0.7461, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.036909602752580546, |
|
"grad_norm": 1.3745896816253662, |
|
"learning_rate": 0.00018184873949579833, |
|
"loss": 1.2856, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03753518923991242, |
|
"grad_norm": 2.4839162826538086, |
|
"learning_rate": 0.0001815126050420168, |
|
"loss": 1.0574, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.038160775727244294, |
|
"grad_norm": 1.2671383619308472, |
|
"learning_rate": 0.0001811764705882353, |
|
"loss": 0.6177, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.03878636221457617, |
|
"grad_norm": 1.1862553358078003, |
|
"learning_rate": 0.0001808403361344538, |
|
"loss": 1.1169, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.03941194870190804, |
|
"grad_norm": 1.1347297430038452, |
|
"learning_rate": 0.00018050420168067228, |
|
"loss": 1.3303, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.040037535189239915, |
|
"grad_norm": 2.1583523750305176, |
|
"learning_rate": 0.00018016806722689078, |
|
"loss": 0.7941, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04066312167657179, |
|
"grad_norm": 1.2432655096054077, |
|
"learning_rate": 0.00017983193277310925, |
|
"loss": 0.7848, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04128870816390366, |
|
"grad_norm": 1.3345468044281006, |
|
"learning_rate": 0.00017949579831932775, |
|
"loss": 0.8953, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.041914294651235535, |
|
"grad_norm": 0.6861767768859863, |
|
"learning_rate": 0.00017915966386554625, |
|
"loss": 0.4162, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.04253988113856741, |
|
"grad_norm": 0.85309898853302, |
|
"learning_rate": 0.00017882352941176472, |
|
"loss": 0.6606, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.04316546762589928, |
|
"grad_norm": 1.0247780084609985, |
|
"learning_rate": 0.00017848739495798322, |
|
"loss": 0.5271, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.043791054113231156, |
|
"grad_norm": 1.3019441366195679, |
|
"learning_rate": 0.0001781512605042017, |
|
"loss": 0.5605, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04441664060056303, |
|
"grad_norm": 1.1024900674819946, |
|
"learning_rate": 0.00017781512605042016, |
|
"loss": 0.9303, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.045042227087894904, |
|
"grad_norm": 1.079655408859253, |
|
"learning_rate": 0.00017747899159663866, |
|
"loss": 1.0138, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04566781357522678, |
|
"grad_norm": 1.1078468561172485, |
|
"learning_rate": 0.00017714285714285713, |
|
"loss": 0.9861, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04629340006255865, |
|
"grad_norm": 1.8648931980133057, |
|
"learning_rate": 0.00017680672268907563, |
|
"loss": 0.6756, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.046918986549890525, |
|
"grad_norm": 0.8588104248046875, |
|
"learning_rate": 0.00017647058823529413, |
|
"loss": 0.4867, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.046918986549890525, |
|
"eval_loss": 0.9139823913574219, |
|
"eval_runtime": 43.5635, |
|
"eval_samples_per_second": 5.876, |
|
"eval_steps_per_second": 2.938, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0475445730372224, |
|
"grad_norm": 1.6970480680465698, |
|
"learning_rate": 0.0001761344537815126, |
|
"loss": 0.5523, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04817015952455427, |
|
"grad_norm": 0.8562026023864746, |
|
"learning_rate": 0.0001757983193277311, |
|
"loss": 0.4084, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.048795746011886146, |
|
"grad_norm": 0.9487925171852112, |
|
"learning_rate": 0.0001754621848739496, |
|
"loss": 0.6204, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04942133249921802, |
|
"grad_norm": 11.929024696350098, |
|
"learning_rate": 0.00017512605042016807, |
|
"loss": 1.1662, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.05004691898654989, |
|
"grad_norm": 1.3468140363693237, |
|
"learning_rate": 0.00017478991596638657, |
|
"loss": 0.8037, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05067250547388177, |
|
"grad_norm": 0.7379503846168518, |
|
"learning_rate": 0.00017445378151260504, |
|
"loss": 0.6564, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.05129809196121364, |
|
"grad_norm": 1.0315027236938477, |
|
"learning_rate": 0.00017411764705882354, |
|
"loss": 0.6377, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.051923678448545514, |
|
"grad_norm": 0.5900093913078308, |
|
"learning_rate": 0.00017378151260504204, |
|
"loss": 0.5122, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.05254926493587739, |
|
"grad_norm": 1.5138239860534668, |
|
"learning_rate": 0.0001734453781512605, |
|
"loss": 0.4769, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.05317485142320926, |
|
"grad_norm": 1.016790747642517, |
|
"learning_rate": 0.000173109243697479, |
|
"loss": 0.6654, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.053800437910541135, |
|
"grad_norm": 1.1964718103408813, |
|
"learning_rate": 0.00017277310924369748, |
|
"loss": 0.6334, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05442602439787301, |
|
"grad_norm": 1.102842092514038, |
|
"learning_rate": 0.00017243697478991598, |
|
"loss": 0.832, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.05505161088520488, |
|
"grad_norm": 6.609305381774902, |
|
"learning_rate": 0.00017210084033613448, |
|
"loss": 0.6112, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.055677197372536756, |
|
"grad_norm": 2.6627745628356934, |
|
"learning_rate": 0.00017176470588235293, |
|
"loss": 1.032, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05630278385986863, |
|
"grad_norm": 2.114955425262451, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.6116, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0569283703472005, |
|
"grad_norm": 1.7707552909851074, |
|
"learning_rate": 0.00017109243697478992, |
|
"loss": 0.4766, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05755395683453238, |
|
"grad_norm": 0.9983264803886414, |
|
"learning_rate": 0.0001707563025210084, |
|
"loss": 0.5397, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.05817954332186425, |
|
"grad_norm": 8.190524101257324, |
|
"learning_rate": 0.0001704201680672269, |
|
"loss": 0.9531, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.058805129809196124, |
|
"grad_norm": 1.9920661449432373, |
|
"learning_rate": 0.0001700840336134454, |
|
"loss": 1.3801, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.059430716296528, |
|
"grad_norm": 0.8791856169700623, |
|
"learning_rate": 0.00016974789915966387, |
|
"loss": 0.6218, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06005630278385987, |
|
"grad_norm": 1.0745537281036377, |
|
"learning_rate": 0.00016941176470588237, |
|
"loss": 0.5578, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.060681889271191745, |
|
"grad_norm": 1.4266705513000488, |
|
"learning_rate": 0.00016907563025210084, |
|
"loss": 1.5821, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.06130747575852362, |
|
"grad_norm": 1.1001832485198975, |
|
"learning_rate": 0.00016873949579831934, |
|
"loss": 0.5972, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.06193306224585549, |
|
"grad_norm": 1.3168463706970215, |
|
"learning_rate": 0.00016840336134453784, |
|
"loss": 0.5794, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.06255864873318737, |
|
"grad_norm": 1.0342196226119995, |
|
"learning_rate": 0.0001680672268907563, |
|
"loss": 0.6827, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06255864873318737, |
|
"eval_loss": 0.8885337114334106, |
|
"eval_runtime": 43.4886, |
|
"eval_samples_per_second": 5.887, |
|
"eval_steps_per_second": 2.943, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06318423522051923, |
|
"grad_norm": 2.2497031688690186, |
|
"learning_rate": 0.0001677310924369748, |
|
"loss": 0.6468, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.06380982170785111, |
|
"grad_norm": 0.8061516284942627, |
|
"learning_rate": 0.00016739495798319328, |
|
"loss": 0.5388, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.06443540819518298, |
|
"grad_norm": 0.6954531669616699, |
|
"learning_rate": 0.00016705882352941178, |
|
"loss": 0.3191, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.06506099468251486, |
|
"grad_norm": 1.3721911907196045, |
|
"learning_rate": 0.00016672268907563028, |
|
"loss": 0.9, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06568658116984673, |
|
"grad_norm": 1.084492564201355, |
|
"learning_rate": 0.00016638655462184875, |
|
"loss": 0.6144, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06631216765717861, |
|
"grad_norm": 3.317697525024414, |
|
"learning_rate": 0.00016605042016806725, |
|
"loss": 0.634, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06693775414451048, |
|
"grad_norm": 2.5598530769348145, |
|
"learning_rate": 0.00016571428571428575, |
|
"loss": 0.8931, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06756334063184236, |
|
"grad_norm": 3.6414177417755127, |
|
"learning_rate": 0.0001653781512605042, |
|
"loss": 0.7226, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06818892711917422, |
|
"grad_norm": 2.2443768978118896, |
|
"learning_rate": 0.0001650420168067227, |
|
"loss": 0.8862, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0688145136065061, |
|
"grad_norm": 0.6285691857337952, |
|
"learning_rate": 0.0001647058823529412, |
|
"loss": 0.3766, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06944010009383797, |
|
"grad_norm": 0.6171959042549133, |
|
"learning_rate": 0.00016436974789915966, |
|
"loss": 0.2821, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.07006568658116985, |
|
"grad_norm": 1.0057804584503174, |
|
"learning_rate": 0.00016403361344537816, |
|
"loss": 0.6293, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.07069127306850172, |
|
"grad_norm": 1.3190034627914429, |
|
"learning_rate": 0.00016369747899159663, |
|
"loss": 0.5547, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0713168595558336, |
|
"grad_norm": 0.518517017364502, |
|
"learning_rate": 0.00016336134453781513, |
|
"loss": 0.1951, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.07194244604316546, |
|
"grad_norm": 0.848175048828125, |
|
"learning_rate": 0.00016302521008403363, |
|
"loss": 0.5091, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07256803253049735, |
|
"grad_norm": 0.7387409806251526, |
|
"learning_rate": 0.0001626890756302521, |
|
"loss": 0.3872, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.07319361901782921, |
|
"grad_norm": 2.828091859817505, |
|
"learning_rate": 0.0001623529411764706, |
|
"loss": 1.2046, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.07381920550516109, |
|
"grad_norm": 1.7653822898864746, |
|
"learning_rate": 0.00016201680672268907, |
|
"loss": 1.8133, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07444479199249296, |
|
"grad_norm": 3.5097360610961914, |
|
"learning_rate": 0.00016168067226890757, |
|
"loss": 0.6837, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.07507037847982484, |
|
"grad_norm": 1.3884797096252441, |
|
"learning_rate": 0.00016134453781512607, |
|
"loss": 0.8846, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0756959649671567, |
|
"grad_norm": 22.705190658569336, |
|
"learning_rate": 0.00016100840336134454, |
|
"loss": 0.7281, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.07632155145448859, |
|
"grad_norm": 3.1223599910736084, |
|
"learning_rate": 0.00016067226890756304, |
|
"loss": 0.6254, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07694713794182045, |
|
"grad_norm": 0.530583381652832, |
|
"learning_rate": 0.00016033613445378154, |
|
"loss": 0.3292, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07757272442915233, |
|
"grad_norm": 1.4720183610916138, |
|
"learning_rate": 0.00016, |
|
"loss": 0.8192, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0781983109164842, |
|
"grad_norm": 0.6448870301246643, |
|
"learning_rate": 0.0001596638655462185, |
|
"loss": 0.2431, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0781983109164842, |
|
"eval_loss": 0.890012800693512, |
|
"eval_runtime": 43.5059, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07882389740381608, |
|
"grad_norm": 1.803906798362732, |
|
"learning_rate": 0.00015932773109243698, |
|
"loss": 0.8937, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07944948389114795, |
|
"grad_norm": 2.2447054386138916, |
|
"learning_rate": 0.00015899159663865546, |
|
"loss": 0.6993, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.08007507037847983, |
|
"grad_norm": 0.6667381525039673, |
|
"learning_rate": 0.00015865546218487396, |
|
"loss": 0.4266, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0807006568658117, |
|
"grad_norm": 1.1449408531188965, |
|
"learning_rate": 0.00015831932773109243, |
|
"loss": 0.5557, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.08132624335314358, |
|
"grad_norm": 1.399849534034729, |
|
"learning_rate": 0.00015798319327731093, |
|
"loss": 0.6761, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08195182984047544, |
|
"grad_norm": 0.745627760887146, |
|
"learning_rate": 0.00015764705882352943, |
|
"loss": 0.5323, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.08257741632780732, |
|
"grad_norm": 1.162428379058838, |
|
"learning_rate": 0.0001573109243697479, |
|
"loss": 0.8231, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.08320300281513919, |
|
"grad_norm": 1.0329734086990356, |
|
"learning_rate": 0.0001569747899159664, |
|
"loss": 0.6179, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.08382858930247107, |
|
"grad_norm": 0.5739912986755371, |
|
"learning_rate": 0.00015663865546218487, |
|
"loss": 0.2515, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.08445417578980294, |
|
"grad_norm": 1.2065409421920776, |
|
"learning_rate": 0.00015630252100840337, |
|
"loss": 0.6161, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08507976227713482, |
|
"grad_norm": 1.1025582551956177, |
|
"learning_rate": 0.00015596638655462187, |
|
"loss": 0.5926, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08570534876446669, |
|
"grad_norm": 0.78680020570755, |
|
"learning_rate": 0.00015563025210084034, |
|
"loss": 0.9987, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.08633093525179857, |
|
"grad_norm": 0.6232782006263733, |
|
"learning_rate": 0.00015529411764705884, |
|
"loss": 0.4952, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 3.347989559173584, |
|
"learning_rate": 0.00015495798319327734, |
|
"loss": 1.0787, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.08758210822646231, |
|
"grad_norm": 0.9020625352859497, |
|
"learning_rate": 0.0001546218487394958, |
|
"loss": 0.354, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08820769471379418, |
|
"grad_norm": 1.8955539464950562, |
|
"learning_rate": 0.0001542857142857143, |
|
"loss": 0.5515, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08883328120112606, |
|
"grad_norm": 5.194116115570068, |
|
"learning_rate": 0.00015394957983193278, |
|
"loss": 0.6843, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08945886768845793, |
|
"grad_norm": 1.4467953443527222, |
|
"learning_rate": 0.00015361344537815128, |
|
"loss": 0.4236, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.09008445417578981, |
|
"grad_norm": 0.523921012878418, |
|
"learning_rate": 0.00015327731092436978, |
|
"loss": 0.2165, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.09071004066312167, |
|
"grad_norm": 1.653648018836975, |
|
"learning_rate": 0.00015294117647058822, |
|
"loss": 1.0643, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.09133562715045355, |
|
"grad_norm": 0.6991509199142456, |
|
"learning_rate": 0.00015260504201680672, |
|
"loss": 0.4398, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.09196121363778542, |
|
"grad_norm": 1.3986660242080688, |
|
"learning_rate": 0.00015226890756302522, |
|
"loss": 0.8488, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.0925868001251173, |
|
"grad_norm": 1.2424954175949097, |
|
"learning_rate": 0.0001519327731092437, |
|
"loss": 0.9516, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.09321238661244917, |
|
"grad_norm": 0.8900560140609741, |
|
"learning_rate": 0.0001515966386554622, |
|
"loss": 0.767, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.09383797309978105, |
|
"grad_norm": 40.042503356933594, |
|
"learning_rate": 0.00015126050420168066, |
|
"loss": 0.9691, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09383797309978105, |
|
"eval_loss": 0.8660734295845032, |
|
"eval_runtime": 43.5102, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09446355958711292, |
|
"grad_norm": 2.816359519958496, |
|
"learning_rate": 0.00015092436974789916, |
|
"loss": 1.4959, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.0950891460744448, |
|
"grad_norm": 1.9332157373428345, |
|
"learning_rate": 0.00015058823529411766, |
|
"loss": 0.6786, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09571473256177666, |
|
"grad_norm": 1.2608965635299683, |
|
"learning_rate": 0.00015025210084033613, |
|
"loss": 1.1282, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.09634031904910854, |
|
"grad_norm": 1.0167793035507202, |
|
"learning_rate": 0.00014991596638655463, |
|
"loss": 0.4932, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.09696590553644041, |
|
"grad_norm": 1.6121408939361572, |
|
"learning_rate": 0.00014957983193277313, |
|
"loss": 0.7193, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09759149202377229, |
|
"grad_norm": 2.4104394912719727, |
|
"learning_rate": 0.0001492436974789916, |
|
"loss": 0.4472, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.09821707851110416, |
|
"grad_norm": 1.1095707416534424, |
|
"learning_rate": 0.0001489075630252101, |
|
"loss": 0.7595, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.09884266499843604, |
|
"grad_norm": 1.686458945274353, |
|
"learning_rate": 0.00014857142857142857, |
|
"loss": 0.5686, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.0994682514857679, |
|
"grad_norm": 3.2238378524780273, |
|
"learning_rate": 0.00014823529411764707, |
|
"loss": 0.4236, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.10009383797309979, |
|
"grad_norm": 1.800552248954773, |
|
"learning_rate": 0.00014789915966386557, |
|
"loss": 0.9519, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.10071942446043165, |
|
"grad_norm": 0.6441445350646973, |
|
"learning_rate": 0.00014756302521008404, |
|
"loss": 0.4119, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.10134501094776353, |
|
"grad_norm": 0.5892903804779053, |
|
"learning_rate": 0.00014722689075630254, |
|
"loss": 0.2956, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.1019705974350954, |
|
"grad_norm": 0.8733301758766174, |
|
"learning_rate": 0.00014689075630252101, |
|
"loss": 0.5749, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.10259618392242728, |
|
"grad_norm": 1.0460662841796875, |
|
"learning_rate": 0.0001465546218487395, |
|
"loss": 0.8167, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.10322177040975915, |
|
"grad_norm": 0.8178017735481262, |
|
"learning_rate": 0.00014621848739495799, |
|
"loss": 0.9027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.10384735689709103, |
|
"grad_norm": 0.5698068737983704, |
|
"learning_rate": 0.00014588235294117646, |
|
"loss": 0.1829, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.1044729433844229, |
|
"grad_norm": 1.0011018514633179, |
|
"learning_rate": 0.00014554621848739496, |
|
"loss": 0.8985, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.10509852987175478, |
|
"grad_norm": 1.189772367477417, |
|
"learning_rate": 0.00014521008403361346, |
|
"loss": 0.5547, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.10572411635908664, |
|
"grad_norm": 0.7990069389343262, |
|
"learning_rate": 0.00014487394957983193, |
|
"loss": 0.6222, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.10634970284641852, |
|
"grad_norm": 0.6419771313667297, |
|
"learning_rate": 0.00014453781512605043, |
|
"loss": 0.3225, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10697528933375039, |
|
"grad_norm": 0.8978354930877686, |
|
"learning_rate": 0.00014420168067226893, |
|
"loss": 0.4567, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.10760087582108227, |
|
"grad_norm": 0.7193794250488281, |
|
"learning_rate": 0.0001438655462184874, |
|
"loss": 0.4793, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.10822646230841414, |
|
"grad_norm": 0.9533759355545044, |
|
"learning_rate": 0.0001435294117647059, |
|
"loss": 1.4397, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.10885204879574602, |
|
"grad_norm": 0.48348739743232727, |
|
"learning_rate": 0.00014319327731092437, |
|
"loss": 0.3398, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10947763528307788, |
|
"grad_norm": 0.7699019312858582, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.7491, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.10947763528307788, |
|
"eval_loss": 0.8425782322883606, |
|
"eval_runtime": 43.5013, |
|
"eval_samples_per_second": 5.885, |
|
"eval_steps_per_second": 2.942, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.11010322177040976, |
|
"grad_norm": 0.9201186895370483, |
|
"learning_rate": 0.00014252100840336137, |
|
"loss": 0.6919, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.11072880825774163, |
|
"grad_norm": 0.8190593123435974, |
|
"learning_rate": 0.00014218487394957984, |
|
"loss": 0.6262, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.11135439474507351, |
|
"grad_norm": 0.9715782403945923, |
|
"learning_rate": 0.00014184873949579834, |
|
"loss": 0.8364, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.11197998123240538, |
|
"grad_norm": 0.6699782609939575, |
|
"learning_rate": 0.0001415126050420168, |
|
"loss": 0.4898, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.11260556771973726, |
|
"grad_norm": 1.8386518955230713, |
|
"learning_rate": 0.0001411764705882353, |
|
"loss": 0.7812, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11323115420706913, |
|
"grad_norm": 0.7240263819694519, |
|
"learning_rate": 0.0001408403361344538, |
|
"loss": 0.5508, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.113856740694401, |
|
"grad_norm": 0.6068630814552307, |
|
"learning_rate": 0.00014050420168067225, |
|
"loss": 0.5151, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.11448232718173287, |
|
"grad_norm": 1.6705517768859863, |
|
"learning_rate": 0.00014016806722689075, |
|
"loss": 1.2281, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.11510791366906475, |
|
"grad_norm": 1.6179956197738647, |
|
"learning_rate": 0.00013983193277310925, |
|
"loss": 0.7365, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.11573350015639662, |
|
"grad_norm": 1.5741758346557617, |
|
"learning_rate": 0.00013949579831932772, |
|
"loss": 1.0039, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1163590866437285, |
|
"grad_norm": 0.9270511865615845, |
|
"learning_rate": 0.00013915966386554622, |
|
"loss": 0.5768, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.11698467313106037, |
|
"grad_norm": 1.3651914596557617, |
|
"learning_rate": 0.00013882352941176472, |
|
"loss": 0.7715, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.11761025961839225, |
|
"grad_norm": 1.4330601692199707, |
|
"learning_rate": 0.0001384873949579832, |
|
"loss": 0.4462, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11823584610572412, |
|
"grad_norm": 0.9181672930717468, |
|
"learning_rate": 0.0001381512605042017, |
|
"loss": 0.3901, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.118861432593056, |
|
"grad_norm": 0.5304622650146484, |
|
"learning_rate": 0.00013781512605042016, |
|
"loss": 0.1718, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11948701908038786, |
|
"grad_norm": 0.7475191354751587, |
|
"learning_rate": 0.00013747899159663866, |
|
"loss": 0.3602, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.12011260556771974, |
|
"grad_norm": 1.2558002471923828, |
|
"learning_rate": 0.00013714285714285716, |
|
"loss": 0.8558, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.12073819205505161, |
|
"grad_norm": 0.9859037399291992, |
|
"learning_rate": 0.00013680672268907563, |
|
"loss": 0.7155, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.12136377854238349, |
|
"grad_norm": 0.6028466820716858, |
|
"learning_rate": 0.00013647058823529413, |
|
"loss": 0.9596, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.12198936502971536, |
|
"grad_norm": 0.5713469386100769, |
|
"learning_rate": 0.0001361344537815126, |
|
"loss": 0.3442, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.12261495151704724, |
|
"grad_norm": 1.0781211853027344, |
|
"learning_rate": 0.0001357983193277311, |
|
"loss": 0.5569, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.1232405380043791, |
|
"grad_norm": 0.7850176095962524, |
|
"learning_rate": 0.0001354621848739496, |
|
"loss": 0.5853, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.12386612449171099, |
|
"grad_norm": 0.8100555539131165, |
|
"learning_rate": 0.00013512605042016807, |
|
"loss": 0.8285, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.12449171097904285, |
|
"grad_norm": 1.106834888458252, |
|
"learning_rate": 0.00013478991596638657, |
|
"loss": 0.9521, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.12511729746637473, |
|
"grad_norm": 1.4412230253219604, |
|
"learning_rate": 0.00013445378151260507, |
|
"loss": 0.6478, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12511729746637473, |
|
"eval_loss": 0.8300326466560364, |
|
"eval_runtime": 43.5102, |
|
"eval_samples_per_second": 5.884, |
|
"eval_steps_per_second": 2.942, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1257428839537066, |
|
"grad_norm": 1.7852795124053955, |
|
"learning_rate": 0.00013411764705882352, |
|
"loss": 0.5687, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.12636847044103847, |
|
"grad_norm": 2.423583745956421, |
|
"learning_rate": 0.00013378151260504202, |
|
"loss": 0.9082, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.12699405692837035, |
|
"grad_norm": 1.538001298904419, |
|
"learning_rate": 0.00013344537815126052, |
|
"loss": 0.7143, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.12761964341570223, |
|
"grad_norm": 1.7380592823028564, |
|
"learning_rate": 0.000133109243697479, |
|
"loss": 0.8296, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.1282452299030341, |
|
"grad_norm": 0.8279218673706055, |
|
"learning_rate": 0.0001327731092436975, |
|
"loss": 0.6719, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.12887081639036596, |
|
"grad_norm": 0.7059926986694336, |
|
"learning_rate": 0.00013243697478991596, |
|
"loss": 0.4785, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.12949640287769784, |
|
"grad_norm": 0.6946935653686523, |
|
"learning_rate": 0.00013210084033613446, |
|
"loss": 0.4578, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.13012198936502972, |
|
"grad_norm": 0.9800712466239929, |
|
"learning_rate": 0.00013176470588235296, |
|
"loss": 1.4369, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.1307475758523616, |
|
"grad_norm": 0.708831787109375, |
|
"learning_rate": 0.00013142857142857143, |
|
"loss": 0.5071, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.13137316233969346, |
|
"grad_norm": 1.0098780393600464, |
|
"learning_rate": 0.00013109243697478993, |
|
"loss": 0.9155, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.13199874882702534, |
|
"grad_norm": 1.1598243713378906, |
|
"learning_rate": 0.0001307563025210084, |
|
"loss": 0.3757, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.13262433531435722, |
|
"grad_norm": 0.7583935260772705, |
|
"learning_rate": 0.0001304201680672269, |
|
"loss": 0.3365, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.1332499218016891, |
|
"grad_norm": 1.0866564512252808, |
|
"learning_rate": 0.0001300840336134454, |
|
"loss": 0.6398, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.13387550828902095, |
|
"grad_norm": 1.4322006702423096, |
|
"learning_rate": 0.00012974789915966387, |
|
"loss": 0.6427, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.13450109477635283, |
|
"grad_norm": 1.600325345993042, |
|
"learning_rate": 0.00012941176470588237, |
|
"loss": 0.6884, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1351266812636847, |
|
"grad_norm": 1.0634167194366455, |
|
"learning_rate": 0.00012907563025210087, |
|
"loss": 1.0343, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.13575226775101656, |
|
"grad_norm": 0.9889366626739502, |
|
"learning_rate": 0.00012873949579831934, |
|
"loss": 0.717, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.13637785423834844, |
|
"grad_norm": 2.0635392665863037, |
|
"learning_rate": 0.00012840336134453784, |
|
"loss": 0.5965, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.13700344072568033, |
|
"grad_norm": 0.8937773704528809, |
|
"learning_rate": 0.0001280672268907563, |
|
"loss": 0.7281, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.1376290272130122, |
|
"grad_norm": 0.9768427014350891, |
|
"learning_rate": 0.00012773109243697478, |
|
"loss": 0.5687, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13825461370034406, |
|
"grad_norm": 1.3913767337799072, |
|
"learning_rate": 0.00012739495798319328, |
|
"loss": 0.3984, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.13888020018767594, |
|
"grad_norm": 1.4933342933654785, |
|
"learning_rate": 0.00012705882352941175, |
|
"loss": 1.2441, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.13950578667500782, |
|
"grad_norm": 1.0846196413040161, |
|
"learning_rate": 0.00012672268907563025, |
|
"loss": 0.9013, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.1401313731623397, |
|
"grad_norm": 0.7788563370704651, |
|
"learning_rate": 0.00012638655462184875, |
|
"loss": 0.4674, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.14075695964967155, |
|
"grad_norm": 0.7341142296791077, |
|
"learning_rate": 0.00012605042016806722, |
|
"loss": 1.3271, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.14075695964967155, |
|
"eval_loss": 0.8179877996444702, |
|
"eval_runtime": 43.5514, |
|
"eval_samples_per_second": 5.878, |
|
"eval_steps_per_second": 2.939, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.14138254613700343, |
|
"grad_norm": 6.473598480224609, |
|
"learning_rate": 0.00012571428571428572, |
|
"loss": 0.6219, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.14200813262433531, |
|
"grad_norm": 0.9846400022506714, |
|
"learning_rate": 0.0001253781512605042, |
|
"loss": 0.4407, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.1426337191116672, |
|
"grad_norm": 0.7880604267120361, |
|
"learning_rate": 0.0001250420168067227, |
|
"loss": 0.3927, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.14325930559899905, |
|
"grad_norm": 1.5999399423599243, |
|
"learning_rate": 0.0001247058823529412, |
|
"loss": 0.6917, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.14388489208633093, |
|
"grad_norm": 0.8072729110717773, |
|
"learning_rate": 0.00012436974789915966, |
|
"loss": 0.4909, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1445104785736628, |
|
"grad_norm": 2.2560601234436035, |
|
"learning_rate": 0.00012403361344537816, |
|
"loss": 0.3355, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1451360650609947, |
|
"grad_norm": 0.9964832663536072, |
|
"learning_rate": 0.00012369747899159666, |
|
"loss": 0.4436, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.14576165154832654, |
|
"grad_norm": 1.1081007719039917, |
|
"learning_rate": 0.00012336134453781513, |
|
"loss": 0.6582, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.14638723803565842, |
|
"grad_norm": 0.9722908735275269, |
|
"learning_rate": 0.00012302521008403363, |
|
"loss": 0.7412, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.1470128245229903, |
|
"grad_norm": 0.7456592917442322, |
|
"learning_rate": 0.0001226890756302521, |
|
"loss": 0.4303, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.14763841101032218, |
|
"grad_norm": 1.0428457260131836, |
|
"learning_rate": 0.0001223529411764706, |
|
"loss": 1.0538, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.14826399749765404, |
|
"grad_norm": 0.9209719896316528, |
|
"learning_rate": 0.00012201680672268909, |
|
"loss": 0.5864, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.14888958398498592, |
|
"grad_norm": 0.990292489528656, |
|
"learning_rate": 0.00012168067226890756, |
|
"loss": 0.5929, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.1495151704723178, |
|
"grad_norm": 0.6086494326591492, |
|
"learning_rate": 0.00012134453781512605, |
|
"loss": 0.4436, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.15014075695964968, |
|
"grad_norm": 1.429149866104126, |
|
"learning_rate": 0.00012100840336134453, |
|
"loss": 0.246, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.15076634344698153, |
|
"grad_norm": 1.8170491456985474, |
|
"learning_rate": 0.00012067226890756302, |
|
"loss": 0.6574, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.1513919299343134, |
|
"grad_norm": 1.1577768325805664, |
|
"learning_rate": 0.00012033613445378152, |
|
"loss": 0.5706, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.1520175164216453, |
|
"grad_norm": 0.7442137598991394, |
|
"learning_rate": 0.00012, |
|
"loss": 0.2772, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.15264310290897717, |
|
"grad_norm": 1.1375997066497803, |
|
"learning_rate": 0.00011966386554621849, |
|
"loss": 0.397, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.15326868939630903, |
|
"grad_norm": 0.8451513648033142, |
|
"learning_rate": 0.00011932773109243697, |
|
"loss": 0.5425, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1538942758836409, |
|
"grad_norm": 0.7176560163497925, |
|
"learning_rate": 0.00011899159663865547, |
|
"loss": 0.4398, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1545198623709728, |
|
"grad_norm": 1.049872875213623, |
|
"learning_rate": 0.00011865546218487396, |
|
"loss": 0.6479, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.15514544885830467, |
|
"grad_norm": 0.6093642115592957, |
|
"learning_rate": 0.00011831932773109244, |
|
"loss": 0.6125, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.15577103534563652, |
|
"grad_norm": 0.9963379502296448, |
|
"learning_rate": 0.00011798319327731093, |
|
"loss": 0.3768, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.1563966218329684, |
|
"grad_norm": 3.4668896198272705, |
|
"learning_rate": 0.00011764705882352942, |
|
"loss": 0.3744, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1563966218329684, |
|
"eval_loss": 0.8456696271896362, |
|
"eval_runtime": 43.5223, |
|
"eval_samples_per_second": 5.882, |
|
"eval_steps_per_second": 2.941, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15702220832030028, |
|
"grad_norm": 0.6826130747795105, |
|
"learning_rate": 0.00011731092436974791, |
|
"loss": 0.4877, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.15764779480763216, |
|
"grad_norm": 1.8045300245285034, |
|
"learning_rate": 0.0001169747899159664, |
|
"loss": 0.9699, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.15827338129496402, |
|
"grad_norm": 0.7311923503875732, |
|
"learning_rate": 0.00011663865546218489, |
|
"loss": 0.4648, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.1588989677822959, |
|
"grad_norm": 1.7481943368911743, |
|
"learning_rate": 0.00011630252100840337, |
|
"loss": 0.8871, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.15952455426962778, |
|
"grad_norm": 2.6331326961517334, |
|
"learning_rate": 0.00011596638655462187, |
|
"loss": 0.8109, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.16015014075695966, |
|
"grad_norm": 0.899364709854126, |
|
"learning_rate": 0.00011563025210084036, |
|
"loss": 0.5021, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.1607757272442915, |
|
"grad_norm": 0.922218918800354, |
|
"learning_rate": 0.00011529411764705881, |
|
"loss": 0.5741, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.1614013137316234, |
|
"grad_norm": 5.335756301879883, |
|
"learning_rate": 0.00011495798319327731, |
|
"loss": 0.842, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.16202690021895527, |
|
"grad_norm": 0.8632665872573853, |
|
"learning_rate": 0.0001146218487394958, |
|
"loss": 0.4208, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.16265248670628715, |
|
"grad_norm": 4.576591968536377, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.8813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.163278073193619, |
|
"grad_norm": 0.907714307308197, |
|
"learning_rate": 0.00011394957983193277, |
|
"loss": 0.7204, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.16390365968095089, |
|
"grad_norm": 0.8328534960746765, |
|
"learning_rate": 0.00011361344537815127, |
|
"loss": 0.7552, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.16452924616828277, |
|
"grad_norm": 1.0882028341293335, |
|
"learning_rate": 0.00011327731092436975, |
|
"loss": 0.9079, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.16515483265561465, |
|
"grad_norm": 1.0093358755111694, |
|
"learning_rate": 0.00011294117647058824, |
|
"loss": 0.6284, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.1657804191429465, |
|
"grad_norm": 0.853907585144043, |
|
"learning_rate": 0.00011260504201680672, |
|
"loss": 0.508, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.16640600563027838, |
|
"grad_norm": 1.0016460418701172, |
|
"learning_rate": 0.00011226890756302521, |
|
"loss": 0.597, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.16703159211761026, |
|
"grad_norm": 1.0138968229293823, |
|
"learning_rate": 0.00011193277310924371, |
|
"loss": 0.9238, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.16765717860494214, |
|
"grad_norm": 1.1728049516677856, |
|
"learning_rate": 0.0001115966386554622, |
|
"loss": 0.9152, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.168282765092274, |
|
"grad_norm": 1.2228264808654785, |
|
"learning_rate": 0.00011126050420168068, |
|
"loss": 0.7483, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.16890835157960588, |
|
"grad_norm": 0.6260212659835815, |
|
"learning_rate": 0.00011092436974789917, |
|
"loss": 0.5566, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16953393806693776, |
|
"grad_norm": 0.7589625716209412, |
|
"learning_rate": 0.00011058823529411766, |
|
"loss": 0.6242, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.17015952455426964, |
|
"grad_norm": 1.1016935110092163, |
|
"learning_rate": 0.00011025210084033615, |
|
"loss": 0.4419, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.1707851110416015, |
|
"grad_norm": 0.8092851042747498, |
|
"learning_rate": 0.00010991596638655464, |
|
"loss": 0.5168, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.17141069752893337, |
|
"grad_norm": 1.012885332107544, |
|
"learning_rate": 0.00010957983193277312, |
|
"loss": 0.4334, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.17203628401626525, |
|
"grad_norm": 2.6073336601257324, |
|
"learning_rate": 0.00010924369747899159, |
|
"loss": 0.5262, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.17203628401626525, |
|
"eval_loss": 0.8115787506103516, |
|
"eval_runtime": 43.4931, |
|
"eval_samples_per_second": 5.886, |
|
"eval_steps_per_second": 2.943, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.17266187050359713, |
|
"grad_norm": 5.577237606048584, |
|
"learning_rate": 0.00010890756302521008, |
|
"loss": 1.0595, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.17328745699092898, |
|
"grad_norm": 1.1434190273284912, |
|
"learning_rate": 0.00010857142857142856, |
|
"loss": 0.4401, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.951992928981781, |
|
"learning_rate": 0.00010823529411764706, |
|
"loss": 0.4393, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.17453862996559275, |
|
"grad_norm": 0.6695138216018677, |
|
"learning_rate": 0.00010789915966386555, |
|
"loss": 0.314, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.17516421645292463, |
|
"grad_norm": 0.40990278124809265, |
|
"learning_rate": 0.00010756302521008403, |
|
"loss": 0.192, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.17578980294025648, |
|
"grad_norm": 0.9555610418319702, |
|
"learning_rate": 0.00010722689075630252, |
|
"loss": 0.3646, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.17641538942758836, |
|
"grad_norm": 0.7370548844337463, |
|
"learning_rate": 0.000106890756302521, |
|
"loss": 0.8997, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.17704097591492024, |
|
"grad_norm": 1.0178982019424438, |
|
"learning_rate": 0.0001065546218487395, |
|
"loss": 0.986, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.17766656240225212, |
|
"grad_norm": 0.41388389468193054, |
|
"learning_rate": 0.00010621848739495799, |
|
"loss": 0.2069, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.17829214888958397, |
|
"grad_norm": 0.7140624523162842, |
|
"learning_rate": 0.00010588235294117647, |
|
"loss": 0.4852, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.17891773537691585, |
|
"grad_norm": 0.7758356332778931, |
|
"learning_rate": 0.00010554621848739496, |
|
"loss": 0.3943, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.17954332186424773, |
|
"grad_norm": 1.4193260669708252, |
|
"learning_rate": 0.00010521008403361346, |
|
"loss": 0.6412, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.18016890835157962, |
|
"grad_norm": 0.7264838814735413, |
|
"learning_rate": 0.00010487394957983194, |
|
"loss": 0.7834, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.18079449483891147, |
|
"grad_norm": 2.4300973415374756, |
|
"learning_rate": 0.00010453781512605043, |
|
"loss": 0.7462, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.18142008132624335, |
|
"grad_norm": 1.033916711807251, |
|
"learning_rate": 0.00010420168067226892, |
|
"loss": 0.5241, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.18204566781357523, |
|
"grad_norm": 0.5583767294883728, |
|
"learning_rate": 0.00010386554621848741, |
|
"loss": 0.7815, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.1826712543009071, |
|
"grad_norm": 0.7440481781959534, |
|
"learning_rate": 0.0001035294117647059, |
|
"loss": 0.4674, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.18329684078823896, |
|
"grad_norm": 4.230656147003174, |
|
"learning_rate": 0.00010319327731092439, |
|
"loss": 0.5219, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.18392242727557084, |
|
"grad_norm": 0.6165269017219543, |
|
"learning_rate": 0.00010285714285714286, |
|
"loss": 0.3274, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.18454801376290272, |
|
"grad_norm": 0.5844498872756958, |
|
"learning_rate": 0.00010252100840336134, |
|
"loss": 0.3719, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.1851736002502346, |
|
"grad_norm": 0.9936206936836243, |
|
"learning_rate": 0.00010218487394957983, |
|
"loss": 1.0453, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.18579918673756646, |
|
"grad_norm": 1.749831199645996, |
|
"learning_rate": 0.00010184873949579831, |
|
"loss": 0.6634, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.18642477322489834, |
|
"grad_norm": 0.4740132689476013, |
|
"learning_rate": 0.0001015126050420168, |
|
"loss": 0.2901, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.18705035971223022, |
|
"grad_norm": 0.664300262928009, |
|
"learning_rate": 0.0001011764705882353, |
|
"loss": 0.5869, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.1876759461995621, |
|
"grad_norm": 0.7400941252708435, |
|
"learning_rate": 0.00010084033613445378, |
|
"loss": 0.7881, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1876759461995621, |
|
"eval_loss": 0.7877693772315979, |
|
"eval_runtime": 43.5162, |
|
"eval_samples_per_second": 5.883, |
|
"eval_steps_per_second": 2.941, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18830153268689395, |
|
"grad_norm": 0.6142858862876892, |
|
"learning_rate": 0.00010050420168067227, |
|
"loss": 0.3808, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.18892711917422583, |
|
"grad_norm": 1.991969347000122, |
|
"learning_rate": 0.00010016806722689076, |
|
"loss": 0.7035, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.1895527056615577, |
|
"grad_norm": 0.6220730543136597, |
|
"learning_rate": 9.983193277310925e-05, |
|
"loss": 0.2548, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.1901782921488896, |
|
"grad_norm": 0.6476833820343018, |
|
"learning_rate": 9.949579831932774e-05, |
|
"loss": 0.3569, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.19080387863622145, |
|
"grad_norm": 0.7133951783180237, |
|
"learning_rate": 9.915966386554623e-05, |
|
"loss": 0.4744, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.19142946512355333, |
|
"grad_norm": 0.6500736474990845, |
|
"learning_rate": 9.882352941176471e-05, |
|
"loss": 0.4653, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.1920550516108852, |
|
"grad_norm": 1.1231927871704102, |
|
"learning_rate": 9.848739495798321e-05, |
|
"loss": 0.818, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.1926806380982171, |
|
"grad_norm": 0.8654798865318298, |
|
"learning_rate": 9.815126050420168e-05, |
|
"loss": 0.7065, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.19330622458554894, |
|
"grad_norm": 0.45660969614982605, |
|
"learning_rate": 9.781512605042017e-05, |
|
"loss": 0.2412, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.19393181107288082, |
|
"grad_norm": 0.9538519978523254, |
|
"learning_rate": 9.747899159663865e-05, |
|
"loss": 1.3428, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1945573975602127, |
|
"grad_norm": 0.596633791923523, |
|
"learning_rate": 9.714285714285715e-05, |
|
"loss": 0.5119, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.19518298404754458, |
|
"grad_norm": 0.5247074365615845, |
|
"learning_rate": 9.680672268907564e-05, |
|
"loss": 0.6413, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.19580857053487644, |
|
"grad_norm": 0.7713050246238708, |
|
"learning_rate": 9.647058823529412e-05, |
|
"loss": 0.49, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.19643415702220832, |
|
"grad_norm": 0.6971513628959656, |
|
"learning_rate": 9.613445378151261e-05, |
|
"loss": 0.6505, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.1970597435095402, |
|
"grad_norm": 0.5454917550086975, |
|
"learning_rate": 9.579831932773111e-05, |
|
"loss": 0.7018, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.19768532999687208, |
|
"grad_norm": 0.8349499702453613, |
|
"learning_rate": 9.546218487394959e-05, |
|
"loss": 0.3179, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.19831091648420393, |
|
"grad_norm": 0.5682560801506042, |
|
"learning_rate": 9.512605042016806e-05, |
|
"loss": 0.4003, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.1989365029715358, |
|
"grad_norm": 0.5094739198684692, |
|
"learning_rate": 9.478991596638655e-05, |
|
"loss": 0.313, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.1995620894588677, |
|
"grad_norm": 1.7074236869812012, |
|
"learning_rate": 9.445378151260505e-05, |
|
"loss": 0.9912, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.20018767594619957, |
|
"grad_norm": 1.1477283239364624, |
|
"learning_rate": 9.411764705882353e-05, |
|
"loss": 0.851, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.20081326243353143, |
|
"grad_norm": 0.6616579294204712, |
|
"learning_rate": 9.378151260504202e-05, |
|
"loss": 0.4844, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.2014388489208633, |
|
"grad_norm": 1.0401920080184937, |
|
"learning_rate": 9.34453781512605e-05, |
|
"loss": 0.5421, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.2020644354081952, |
|
"grad_norm": 0.729664146900177, |
|
"learning_rate": 9.3109243697479e-05, |
|
"loss": 0.6632, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.20269002189552707, |
|
"grad_norm": 0.6752575635910034, |
|
"learning_rate": 9.277310924369749e-05, |
|
"loss": 0.4352, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.20331560838285892, |
|
"grad_norm": 0.7963948249816895, |
|
"learning_rate": 9.243697478991598e-05, |
|
"loss": 0.7614, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.20331560838285892, |
|
"eval_loss": 0.771190881729126, |
|
"eval_runtime": 43.551, |
|
"eval_samples_per_second": 5.878, |
|
"eval_steps_per_second": 2.939, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2039411948701908, |
|
"grad_norm": 0.7778791189193726, |
|
"learning_rate": 9.210084033613445e-05, |
|
"loss": 0.7251, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.20456678135752268, |
|
"grad_norm": 3.0929737091064453, |
|
"learning_rate": 9.176470588235295e-05, |
|
"loss": 0.5375, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.20519236784485456, |
|
"grad_norm": 0.6188391447067261, |
|
"learning_rate": 9.142857142857143e-05, |
|
"loss": 0.4007, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.20581795433218641, |
|
"grad_norm": 0.9423925876617432, |
|
"learning_rate": 9.109243697478992e-05, |
|
"loss": 0.5059, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.2064435408195183, |
|
"grad_norm": 0.506572425365448, |
|
"learning_rate": 9.07563025210084e-05, |
|
"loss": 0.2794, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.20706912730685018, |
|
"grad_norm": 1.7139545679092407, |
|
"learning_rate": 9.04201680672269e-05, |
|
"loss": 0.5984, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.20769471379418206, |
|
"grad_norm": 0.5540574789047241, |
|
"learning_rate": 9.008403361344539e-05, |
|
"loss": 0.323, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.2083203002815139, |
|
"grad_norm": 0.6909454464912415, |
|
"learning_rate": 8.974789915966387e-05, |
|
"loss": 0.5399, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.2089458867688458, |
|
"grad_norm": 0.7409022450447083, |
|
"learning_rate": 8.941176470588236e-05, |
|
"loss": 0.4251, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.20957147325617767, |
|
"grad_norm": 0.6636312007904053, |
|
"learning_rate": 8.907563025210084e-05, |
|
"loss": 0.4021, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.21019705974350955, |
|
"grad_norm": 0.5426271557807922, |
|
"learning_rate": 8.873949579831933e-05, |
|
"loss": 0.2095, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.2108226462308414, |
|
"grad_norm": 0.8870647549629211, |
|
"learning_rate": 8.840336134453782e-05, |
|
"loss": 0.5773, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.21144823271817328, |
|
"grad_norm": 0.5508524179458618, |
|
"learning_rate": 8.80672268907563e-05, |
|
"loss": 0.6744, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.21207381920550517, |
|
"grad_norm": 1.6577738523483276, |
|
"learning_rate": 8.77310924369748e-05, |
|
"loss": 1.1134, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.21269940569283705, |
|
"grad_norm": 3.218395233154297, |
|
"learning_rate": 8.739495798319329e-05, |
|
"loss": 0.5932, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2133249921801689, |
|
"grad_norm": 0.5119672417640686, |
|
"learning_rate": 8.705882352941177e-05, |
|
"loss": 0.1831, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.21395057866750078, |
|
"grad_norm": 0.4874535799026489, |
|
"learning_rate": 8.672268907563026e-05, |
|
"loss": 0.485, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.21457616515483266, |
|
"grad_norm": 0.6597093939781189, |
|
"learning_rate": 8.638655462184874e-05, |
|
"loss": 0.3588, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.21520175164216454, |
|
"grad_norm": 1.1764620542526245, |
|
"learning_rate": 8.605042016806724e-05, |
|
"loss": 1.8765, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.2158273381294964, |
|
"grad_norm": 0.6894935369491577, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.5355, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.21645292461682827, |
|
"grad_norm": 0.5896294116973877, |
|
"learning_rate": 8.53781512605042e-05, |
|
"loss": 0.494, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.21707851110416015, |
|
"grad_norm": 0.6212694048881531, |
|
"learning_rate": 8.50420168067227e-05, |
|
"loss": 0.5721, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.21770409759149204, |
|
"grad_norm": 0.5058571100234985, |
|
"learning_rate": 8.470588235294118e-05, |
|
"loss": 0.5051, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.2183296840788239, |
|
"grad_norm": 0.5089401006698608, |
|
"learning_rate": 8.436974789915967e-05, |
|
"loss": 0.3794, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.21895527056615577, |
|
"grad_norm": 6.416032314300537, |
|
"learning_rate": 8.403361344537815e-05, |
|
"loss": 0.5026, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21895527056615577, |
|
"eval_loss": 0.7647964954376221, |
|
"eval_runtime": 43.4854, |
|
"eval_samples_per_second": 5.887, |
|
"eval_steps_per_second": 2.944, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21958085705348765, |
|
"grad_norm": 0.8862031698226929, |
|
"learning_rate": 8.369747899159664e-05, |
|
"loss": 0.5003, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.22020644354081953, |
|
"grad_norm": 1.3196977376937866, |
|
"learning_rate": 8.336134453781514e-05, |
|
"loss": 0.4573, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.22083203002815138, |
|
"grad_norm": 0.4763634204864502, |
|
"learning_rate": 8.302521008403362e-05, |
|
"loss": 0.3987, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.22145761651548326, |
|
"grad_norm": 0.45634883642196655, |
|
"learning_rate": 8.26890756302521e-05, |
|
"loss": 0.2064, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.22208320300281514, |
|
"grad_norm": 0.443393737077713, |
|
"learning_rate": 8.23529411764706e-05, |
|
"loss": 0.2308, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.22270878949014702, |
|
"grad_norm": 1.135941505432129, |
|
"learning_rate": 8.201680672268908e-05, |
|
"loss": 0.8731, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.22333437597747888, |
|
"grad_norm": 0.6853610873222351, |
|
"learning_rate": 8.168067226890757e-05, |
|
"loss": 0.5563, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.22395996246481076, |
|
"grad_norm": 0.6356902718544006, |
|
"learning_rate": 8.134453781512605e-05, |
|
"loss": 0.4265, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.22458554895214264, |
|
"grad_norm": 0.6331340074539185, |
|
"learning_rate": 8.100840336134454e-05, |
|
"loss": 0.3293, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.22521113543947452, |
|
"grad_norm": 0.8068905472755432, |
|
"learning_rate": 8.067226890756304e-05, |
|
"loss": 1.2136, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.22583672192680637, |
|
"grad_norm": 0.6827020049095154, |
|
"learning_rate": 8.033613445378152e-05, |
|
"loss": 0.8519, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.22646230841413825, |
|
"grad_norm": 0.829730749130249, |
|
"learning_rate": 8e-05, |
|
"loss": 0.6237, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.22708789490147013, |
|
"grad_norm": 0.5221096873283386, |
|
"learning_rate": 7.966386554621849e-05, |
|
"loss": 0.3099, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.227713481388802, |
|
"grad_norm": 0.6234191060066223, |
|
"learning_rate": 7.932773109243698e-05, |
|
"loss": 0.9356, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.22833906787613387, |
|
"grad_norm": 0.5766564607620239, |
|
"learning_rate": 7.899159663865546e-05, |
|
"loss": 0.2529, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.22896465436346575, |
|
"grad_norm": 0.758171558380127, |
|
"learning_rate": 7.865546218487395e-05, |
|
"loss": 0.7577, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.22959024085079763, |
|
"grad_norm": 0.6313957571983337, |
|
"learning_rate": 7.831932773109243e-05, |
|
"loss": 0.6219, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.2302158273381295, |
|
"grad_norm": 0.7843011617660522, |
|
"learning_rate": 7.798319327731093e-05, |
|
"loss": 0.6617, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.23084141382546136, |
|
"grad_norm": 0.9671229124069214, |
|
"learning_rate": 7.764705882352942e-05, |
|
"loss": 0.7156, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.23146700031279324, |
|
"grad_norm": 0.663546085357666, |
|
"learning_rate": 7.73109243697479e-05, |
|
"loss": 0.4462, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.23209258680012512, |
|
"grad_norm": 0.6233255863189697, |
|
"learning_rate": 7.697478991596639e-05, |
|
"loss": 0.5534, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.232718173287457, |
|
"grad_norm": 2.0895440578460693, |
|
"learning_rate": 7.663865546218489e-05, |
|
"loss": 0.7289, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.23334375977478886, |
|
"grad_norm": 0.6122156381607056, |
|
"learning_rate": 7.630252100840336e-05, |
|
"loss": 0.3891, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.23396934626212074, |
|
"grad_norm": 0.5940058827400208, |
|
"learning_rate": 7.596638655462185e-05, |
|
"loss": 0.3038, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.23459493274945262, |
|
"grad_norm": 0.35755977034568787, |
|
"learning_rate": 7.563025210084033e-05, |
|
"loss": 0.2848, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.23459493274945262, |
|
"eval_loss": 0.7521110773086548, |
|
"eval_runtime": 43.5299, |
|
"eval_samples_per_second": 5.881, |
|
"eval_steps_per_second": 2.941, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2352205192367845, |
|
"grad_norm": 0.8450719118118286, |
|
"learning_rate": 7.529411764705883e-05, |
|
"loss": 0.5348, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.23584610572411635, |
|
"grad_norm": 0.9100202918052673, |
|
"learning_rate": 7.495798319327732e-05, |
|
"loss": 0.8178, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.23647169221144823, |
|
"grad_norm": 0.5748711228370667, |
|
"learning_rate": 7.46218487394958e-05, |
|
"loss": 0.778, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2370972786987801, |
|
"grad_norm": 0.5675060153007507, |
|
"learning_rate": 7.428571428571429e-05, |
|
"loss": 0.8042, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.237722865186112, |
|
"grad_norm": 6.2747392654418945, |
|
"learning_rate": 7.394957983193279e-05, |
|
"loss": 1.4173, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.23834845167344385, |
|
"grad_norm": 0.6252509355545044, |
|
"learning_rate": 7.361344537815127e-05, |
|
"loss": 0.5095, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.23897403816077573, |
|
"grad_norm": 1.0525410175323486, |
|
"learning_rate": 7.327731092436974e-05, |
|
"loss": 1.1774, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.2395996246481076, |
|
"grad_norm": 0.505670428276062, |
|
"learning_rate": 7.294117647058823e-05, |
|
"loss": 0.7603, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.2402252111354395, |
|
"grad_norm": 0.5476568341255188, |
|
"learning_rate": 7.260504201680673e-05, |
|
"loss": 0.3354, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.24085079762277134, |
|
"grad_norm": 0.687854528427124, |
|
"learning_rate": 7.226890756302521e-05, |
|
"loss": 0.6306, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.24147638411010322, |
|
"grad_norm": 1.3373991250991821, |
|
"learning_rate": 7.19327731092437e-05, |
|
"loss": 0.5736, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.2421019705974351, |
|
"grad_norm": 0.5465985536575317, |
|
"learning_rate": 7.159663865546218e-05, |
|
"loss": 0.2859, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.24272755708476698, |
|
"grad_norm": 0.6637946963310242, |
|
"learning_rate": 7.126050420168068e-05, |
|
"loss": 0.5948, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.24335314357209883, |
|
"grad_norm": 0.637915313243866, |
|
"learning_rate": 7.092436974789917e-05, |
|
"loss": 0.3947, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.24397873005943072, |
|
"grad_norm": 0.8073198795318604, |
|
"learning_rate": 7.058823529411765e-05, |
|
"loss": 0.5006, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2446043165467626, |
|
"grad_norm": 0.7423315644264221, |
|
"learning_rate": 7.025210084033613e-05, |
|
"loss": 0.5209, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.24522990303409448, |
|
"grad_norm": 0.6418082118034363, |
|
"learning_rate": 6.991596638655463e-05, |
|
"loss": 0.3793, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.24585548952142633, |
|
"grad_norm": 1.072240948677063, |
|
"learning_rate": 6.957983193277311e-05, |
|
"loss": 0.3656, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.2464810760087582, |
|
"grad_norm": 1.3167545795440674, |
|
"learning_rate": 6.92436974789916e-05, |
|
"loss": 0.9145, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2471066624960901, |
|
"grad_norm": 0.6734040379524231, |
|
"learning_rate": 6.890756302521008e-05, |
|
"loss": 0.4063, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.24773224898342197, |
|
"grad_norm": 0.48195910453796387, |
|
"learning_rate": 6.857142857142858e-05, |
|
"loss": 0.4146, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.24835783547075382, |
|
"grad_norm": 1.2620956897735596, |
|
"learning_rate": 6.823529411764707e-05, |
|
"loss": 0.4289, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.2489834219580857, |
|
"grad_norm": 0.6438835859298706, |
|
"learning_rate": 6.789915966386555e-05, |
|
"loss": 0.6204, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.24960900844541759, |
|
"grad_norm": 1.6006457805633545, |
|
"learning_rate": 6.756302521008404e-05, |
|
"loss": 0.8828, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.25023459493274947, |
|
"grad_norm": 3.7350921630859375, |
|
"learning_rate": 6.722689075630254e-05, |
|
"loss": 0.5608, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.25023459493274947, |
|
"eval_loss": 0.7404398322105408, |
|
"eval_runtime": 43.5267, |
|
"eval_samples_per_second": 5.881, |
|
"eval_steps_per_second": 2.941, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2508601814200813, |
|
"grad_norm": 0.776977002620697, |
|
"learning_rate": 6.689075630252101e-05, |
|
"loss": 0.709, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.2514857679074132, |
|
"grad_norm": 0.547192394733429, |
|
"learning_rate": 6.65546218487395e-05, |
|
"loss": 0.2832, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.2521113543947451, |
|
"grad_norm": 1.2148370742797852, |
|
"learning_rate": 6.621848739495798e-05, |
|
"loss": 0.9248, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.25273694088207693, |
|
"grad_norm": 0.5215961337089539, |
|
"learning_rate": 6.588235294117648e-05, |
|
"loss": 0.5407, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.25336252736940884, |
|
"grad_norm": 0.32982224225997925, |
|
"learning_rate": 6.554621848739496e-05, |
|
"loss": 0.1891, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2539881138567407, |
|
"grad_norm": 0.707619309425354, |
|
"learning_rate": 6.521008403361345e-05, |
|
"loss": 0.6929, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.25461370034407255, |
|
"grad_norm": 1.87132728099823, |
|
"learning_rate": 6.487394957983193e-05, |
|
"loss": 0.6022, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.25523928683140445, |
|
"grad_norm": 0.5033402442932129, |
|
"learning_rate": 6.453781512605043e-05, |
|
"loss": 0.2833, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.2558648733187363, |
|
"grad_norm": 1.0010263919830322, |
|
"learning_rate": 6.420168067226892e-05, |
|
"loss": 0.2809, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.2564904598060682, |
|
"grad_norm": 0.9624127745628357, |
|
"learning_rate": 6.386554621848739e-05, |
|
"loss": 0.5303, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.25711604629340007, |
|
"grad_norm": 1.2495983839035034, |
|
"learning_rate": 6.352941176470588e-05, |
|
"loss": 0.5994, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.2577416327807319, |
|
"grad_norm": 0.7493329048156738, |
|
"learning_rate": 6.319327731092438e-05, |
|
"loss": 0.4795, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.25836721926806383, |
|
"grad_norm": 1.0783026218414307, |
|
"learning_rate": 6.285714285714286e-05, |
|
"loss": 1.13, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.2589928057553957, |
|
"grad_norm": 0.6462905406951904, |
|
"learning_rate": 6.252100840336135e-05, |
|
"loss": 0.4111, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.25961839224272754, |
|
"grad_norm": 0.4357486665248871, |
|
"learning_rate": 6.218487394957983e-05, |
|
"loss": 0.2122, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.26024397873005944, |
|
"grad_norm": 0.42553481459617615, |
|
"learning_rate": 6.184873949579833e-05, |
|
"loss": 0.2509, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 0.8176494836807251, |
|
"learning_rate": 6.151260504201682e-05, |
|
"loss": 1.157, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.2614951517047232, |
|
"grad_norm": 0.527748703956604, |
|
"learning_rate": 6.11764705882353e-05, |
|
"loss": 0.3804, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.26212073819205506, |
|
"grad_norm": 0.9033327102661133, |
|
"learning_rate": 6.084033613445378e-05, |
|
"loss": 0.6555, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.2627463246793869, |
|
"grad_norm": 0.7106732130050659, |
|
"learning_rate": 6.0504201680672267e-05, |
|
"loss": 0.4358, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2633719111667188, |
|
"grad_norm": 1.1655712127685547, |
|
"learning_rate": 6.016806722689076e-05, |
|
"loss": 0.5233, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.2639974976540507, |
|
"grad_norm": 0.7053611874580383, |
|
"learning_rate": 5.9831932773109244e-05, |
|
"loss": 0.428, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.2646230841413825, |
|
"grad_norm": 0.7588666677474976, |
|
"learning_rate": 5.9495798319327737e-05, |
|
"loss": 0.5904, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.26524867062871443, |
|
"grad_norm": 0.6778993010520935, |
|
"learning_rate": 5.915966386554622e-05, |
|
"loss": 0.7834, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.2658742571160463, |
|
"grad_norm": 0.5685262084007263, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 0.7958, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2658742571160463, |
|
"eval_loss": 0.7321073412895203, |
|
"eval_runtime": 43.5595, |
|
"eval_samples_per_second": 5.877, |
|
"eval_steps_per_second": 2.939, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2664998436033782, |
|
"grad_norm": 0.41137516498565674, |
|
"learning_rate": 5.84873949579832e-05, |
|
"loss": 0.255, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.26712543009071005, |
|
"grad_norm": 0.48806631565093994, |
|
"learning_rate": 5.8151260504201685e-05, |
|
"loss": 0.2586, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.2677510165780419, |
|
"grad_norm": 0.877154529094696, |
|
"learning_rate": 5.781512605042018e-05, |
|
"loss": 0.5605, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.2683766030653738, |
|
"grad_norm": 1.1426063776016235, |
|
"learning_rate": 5.7478991596638656e-05, |
|
"loss": 0.6767, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.26900218955270566, |
|
"grad_norm": 0.7325838208198547, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.4384, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2696277760400375, |
|
"grad_norm": 0.815000593662262, |
|
"learning_rate": 5.6806722689075634e-05, |
|
"loss": 0.5198, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.2702533625273694, |
|
"grad_norm": 0.582699716091156, |
|
"learning_rate": 5.647058823529412e-05, |
|
"loss": 0.5345, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.2708789490147013, |
|
"grad_norm": 0.6257805228233337, |
|
"learning_rate": 5.6134453781512605e-05, |
|
"loss": 0.4172, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.27150453550203313, |
|
"grad_norm": 0.8166823983192444, |
|
"learning_rate": 5.57983193277311e-05, |
|
"loss": 0.7274, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.27213012198936504, |
|
"grad_norm": 0.6732988953590393, |
|
"learning_rate": 5.546218487394958e-05, |
|
"loss": 0.781, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2727557084766969, |
|
"grad_norm": 0.6230109930038452, |
|
"learning_rate": 5.5126050420168075e-05, |
|
"loss": 0.6356, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.2733812949640288, |
|
"grad_norm": 0.6590014696121216, |
|
"learning_rate": 5.478991596638656e-05, |
|
"loss": 0.9665, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.27400688145136065, |
|
"grad_norm": 0.3651019036769867, |
|
"learning_rate": 5.445378151260504e-05, |
|
"loss": 0.3858, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.2746324679386925, |
|
"grad_norm": 0.6834749579429626, |
|
"learning_rate": 5.411764705882353e-05, |
|
"loss": 0.5685, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.2752580544260244, |
|
"grad_norm": 0.46671655774116516, |
|
"learning_rate": 5.378151260504202e-05, |
|
"loss": 0.4953, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.27588364091335627, |
|
"grad_norm": 0.6245185732841492, |
|
"learning_rate": 5.34453781512605e-05, |
|
"loss": 0.7836, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.2765092274006881, |
|
"grad_norm": 0.5942935943603516, |
|
"learning_rate": 5.3109243697478995e-05, |
|
"loss": 0.3441, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.27713481388802, |
|
"grad_norm": 0.7539409399032593, |
|
"learning_rate": 5.277310924369748e-05, |
|
"loss": 0.7533, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.2777604003753519, |
|
"grad_norm": 0.40587514638900757, |
|
"learning_rate": 5.243697478991597e-05, |
|
"loss": 0.3298, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2783859868626838, |
|
"grad_norm": 0.5237724184989929, |
|
"learning_rate": 5.210084033613446e-05, |
|
"loss": 0.65, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.27901157335001564, |
|
"grad_norm": 0.6571043133735657, |
|
"learning_rate": 5.176470588235295e-05, |
|
"loss": 0.5741, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.2796371598373475, |
|
"grad_norm": 0.4717683792114258, |
|
"learning_rate": 5.142857142857143e-05, |
|
"loss": 0.3073, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.2802627463246794, |
|
"grad_norm": 0.4331720173358917, |
|
"learning_rate": 5.1092436974789914e-05, |
|
"loss": 0.401, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.28088833281201125, |
|
"grad_norm": 0.6984372138977051, |
|
"learning_rate": 5.07563025210084e-05, |
|
"loss": 0.5573, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.2815139192993431, |
|
"grad_norm": 0.556936502456665, |
|
"learning_rate": 5.042016806722689e-05, |
|
"loss": 0.4267, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2815139192993431, |
|
"eval_loss": 0.7238953709602356, |
|
"eval_runtime": 43.5441, |
|
"eval_samples_per_second": 5.879, |
|
"eval_steps_per_second": 2.94, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.282139505786675, |
|
"grad_norm": 0.5527558922767639, |
|
"learning_rate": 5.008403361344538e-05, |
|
"loss": 0.3417, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.28276509227400687, |
|
"grad_norm": 0.6538864374160767, |
|
"learning_rate": 4.974789915966387e-05, |
|
"loss": 0.2658, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.2833906787613388, |
|
"grad_norm": 0.8087690472602844, |
|
"learning_rate": 4.9411764705882355e-05, |
|
"loss": 0.6919, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.28401626524867063, |
|
"grad_norm": 0.8165414333343506, |
|
"learning_rate": 4.907563025210084e-05, |
|
"loss": 0.575, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.2846418517360025, |
|
"grad_norm": 0.8950768113136292, |
|
"learning_rate": 4.8739495798319326e-05, |
|
"loss": 1.0045, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2852674382233344, |
|
"grad_norm": 0.6484183073043823, |
|
"learning_rate": 4.840336134453782e-05, |
|
"loss": 0.5661, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.28589302471066624, |
|
"grad_norm": 2.4603426456451416, |
|
"learning_rate": 4.8067226890756304e-05, |
|
"loss": 0.5769, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.2865186111979981, |
|
"grad_norm": 0.28350579738616943, |
|
"learning_rate": 4.7731092436974796e-05, |
|
"loss": 0.1358, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.28714419768533, |
|
"grad_norm": 0.5952170491218567, |
|
"learning_rate": 4.7394957983193275e-05, |
|
"loss": 0.7661, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.28776978417266186, |
|
"grad_norm": 0.6130964756011963, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 0.6188, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.28839537065999377, |
|
"grad_norm": 2.7420737743377686, |
|
"learning_rate": 4.672268907563025e-05, |
|
"loss": 0.5851, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.2890209571473256, |
|
"grad_norm": 0.6672143340110779, |
|
"learning_rate": 4.6386554621848745e-05, |
|
"loss": 0.6167, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.28964654363465747, |
|
"grad_norm": 0.5279991030693054, |
|
"learning_rate": 4.6050420168067224e-05, |
|
"loss": 0.5681, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.2902721301219894, |
|
"grad_norm": 1.2923184633255005, |
|
"learning_rate": 4.5714285714285716e-05, |
|
"loss": 0.6798, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.29089771660932123, |
|
"grad_norm": 0.718950629234314, |
|
"learning_rate": 4.53781512605042e-05, |
|
"loss": 0.7855, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.2915233030966531, |
|
"grad_norm": 0.6185110807418823, |
|
"learning_rate": 4.5042016806722694e-05, |
|
"loss": 0.3669, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.292148889583985, |
|
"grad_norm": 0.7038689255714417, |
|
"learning_rate": 4.470588235294118e-05, |
|
"loss": 0.534, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.29277447607131685, |
|
"grad_norm": 0.9318879246711731, |
|
"learning_rate": 4.4369747899159665e-05, |
|
"loss": 0.8676, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.29340006255864876, |
|
"grad_norm": 0.5326322317123413, |
|
"learning_rate": 4.403361344537815e-05, |
|
"loss": 0.511, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.2940256490459806, |
|
"grad_norm": 0.6265137195587158, |
|
"learning_rate": 4.369747899159664e-05, |
|
"loss": 0.3251, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.29465123553331246, |
|
"grad_norm": 0.4834919273853302, |
|
"learning_rate": 4.336134453781513e-05, |
|
"loss": 0.3329, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.29527682202064437, |
|
"grad_norm": 0.5931064486503601, |
|
"learning_rate": 4.302521008403362e-05, |
|
"loss": 0.3942, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.2959024085079762, |
|
"grad_norm": 0.23674030601978302, |
|
"learning_rate": 4.26890756302521e-05, |
|
"loss": 0.2589, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.2965279949953081, |
|
"grad_norm": 0.5993025302886963, |
|
"learning_rate": 4.235294117647059e-05, |
|
"loss": 0.4007, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.29715358148264, |
|
"grad_norm": 0.944987416267395, |
|
"learning_rate": 4.201680672268908e-05, |
|
"loss": 0.6389, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.29715358148264, |
|
"eval_loss": 0.7186967134475708, |
|
"eval_runtime": 43.5187, |
|
"eval_samples_per_second": 5.883, |
|
"eval_steps_per_second": 2.941, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.29777916796997184, |
|
"grad_norm": 0.5033174157142639, |
|
"learning_rate": 4.168067226890757e-05, |
|
"loss": 0.3001, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.29840475445730374, |
|
"grad_norm": 0.55152827501297, |
|
"learning_rate": 4.134453781512605e-05, |
|
"loss": 0.3029, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.2990303409446356, |
|
"grad_norm": 0.3445724546909332, |
|
"learning_rate": 4.100840336134454e-05, |
|
"loss": 0.2868, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.29965592743196745, |
|
"grad_norm": 0.7697100639343262, |
|
"learning_rate": 4.0672268907563026e-05, |
|
"loss": 0.4365, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.30028151391929936, |
|
"grad_norm": 0.8685904145240784, |
|
"learning_rate": 4.033613445378152e-05, |
|
"loss": 0.5848, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3009071004066312, |
|
"grad_norm": 0.9457945227622986, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8947, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.30153268689396306, |
|
"grad_norm": 0.6347827911376953, |
|
"learning_rate": 3.966386554621849e-05, |
|
"loss": 0.4328, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.302158273381295, |
|
"grad_norm": 0.47420793771743774, |
|
"learning_rate": 3.9327731092436974e-05, |
|
"loss": 0.3031, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.3027838598686268, |
|
"grad_norm": 0.6944661736488342, |
|
"learning_rate": 3.8991596638655467e-05, |
|
"loss": 0.3967, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.30340944635595873, |
|
"grad_norm": 0.5811148881912231, |
|
"learning_rate": 3.865546218487395e-05, |
|
"loss": 0.4092, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3040350328432906, |
|
"grad_norm": 0.8307628631591797, |
|
"learning_rate": 3.8319327731092444e-05, |
|
"loss": 0.4091, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.30466061933062244, |
|
"grad_norm": 0.53610759973526, |
|
"learning_rate": 3.798319327731092e-05, |
|
"loss": 0.2789, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.30528620581795435, |
|
"grad_norm": 0.7043578028678894, |
|
"learning_rate": 3.7647058823529415e-05, |
|
"loss": 0.4689, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.3059117923052862, |
|
"grad_norm": 0.6058409810066223, |
|
"learning_rate": 3.73109243697479e-05, |
|
"loss": 0.4195, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.30653737879261805, |
|
"grad_norm": 0.6879023313522339, |
|
"learning_rate": 3.697478991596639e-05, |
|
"loss": 0.8699, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.30716296527994996, |
|
"grad_norm": 0.9076145887374878, |
|
"learning_rate": 3.663865546218487e-05, |
|
"loss": 0.4701, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.3077885517672818, |
|
"grad_norm": 0.7418045401573181, |
|
"learning_rate": 3.6302521008403364e-05, |
|
"loss": 0.618, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.3084141382546137, |
|
"grad_norm": 0.7602983713150024, |
|
"learning_rate": 3.596638655462185e-05, |
|
"loss": 0.4609, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.3090397247419456, |
|
"grad_norm": 0.776549220085144, |
|
"learning_rate": 3.563025210084034e-05, |
|
"loss": 0.422, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.30966531122927743, |
|
"grad_norm": 0.6375955939292908, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 0.6409, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.31029089771660934, |
|
"grad_norm": 1.038455843925476, |
|
"learning_rate": 3.495798319327731e-05, |
|
"loss": 0.6265, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.3109164842039412, |
|
"grad_norm": 0.7373848557472229, |
|
"learning_rate": 3.46218487394958e-05, |
|
"loss": 0.5249, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.31154207069127304, |
|
"grad_norm": 0.5265942215919495, |
|
"learning_rate": 3.428571428571429e-05, |
|
"loss": 0.2448, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.31216765717860495, |
|
"grad_norm": 0.6590535640716553, |
|
"learning_rate": 3.3949579831932776e-05, |
|
"loss": 0.6337, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.3127932436659368, |
|
"grad_norm": 1.0600816011428833, |
|
"learning_rate": 3.361344537815127e-05, |
|
"loss": 0.6247, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3127932436659368, |
|
"eval_loss": 0.6991736888885498, |
|
"eval_runtime": 43.4877, |
|
"eval_samples_per_second": 5.887, |
|
"eval_steps_per_second": 2.943, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3134188301532687, |
|
"grad_norm": 0.6913841366767883, |
|
"learning_rate": 3.327731092436975e-05, |
|
"loss": 0.4161, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.31404441664060057, |
|
"grad_norm": 0.627515971660614, |
|
"learning_rate": 3.294117647058824e-05, |
|
"loss": 0.418, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.3146700031279324, |
|
"grad_norm": 0.6809556484222412, |
|
"learning_rate": 3.2605042016806725e-05, |
|
"loss": 0.4784, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.3152955896152643, |
|
"grad_norm": 0.6119003891944885, |
|
"learning_rate": 3.226890756302522e-05, |
|
"loss": 0.4283, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.3159211761025962, |
|
"grad_norm": 0.44398972392082214, |
|
"learning_rate": 3.1932773109243696e-05, |
|
"loss": 0.2422, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.31654676258992803, |
|
"grad_norm": 0.607389509677887, |
|
"learning_rate": 3.159663865546219e-05, |
|
"loss": 0.4103, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.31717234907725994, |
|
"grad_norm": 0.5077054500579834, |
|
"learning_rate": 3.1260504201680673e-05, |
|
"loss": 0.444, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.3177979355645918, |
|
"grad_norm": 0.6008110642433167, |
|
"learning_rate": 3.0924369747899166e-05, |
|
"loss": 0.29, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.3184235220519237, |
|
"grad_norm": 0.5967888832092285, |
|
"learning_rate": 3.058823529411765e-05, |
|
"loss": 0.3271, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.31904910853925555, |
|
"grad_norm": 0.5713754296302795, |
|
"learning_rate": 3.0252100840336133e-05, |
|
"loss": 0.7984, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3196746950265874, |
|
"grad_norm": 0.520448625087738, |
|
"learning_rate": 2.9915966386554622e-05, |
|
"loss": 0.6564, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.3203002815139193, |
|
"grad_norm": 0.7720608115196228, |
|
"learning_rate": 2.957983193277311e-05, |
|
"loss": 0.7506, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.32092586800125117, |
|
"grad_norm": 0.6118083000183105, |
|
"learning_rate": 2.92436974789916e-05, |
|
"loss": 0.5653, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.321551454488583, |
|
"grad_norm": 0.8836561441421509, |
|
"learning_rate": 2.890756302521009e-05, |
|
"loss": 0.5534, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.32217704097591493, |
|
"grad_norm": 0.7553030848503113, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.7212, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3228026274632468, |
|
"grad_norm": 0.6662014722824097, |
|
"learning_rate": 2.823529411764706e-05, |
|
"loss": 0.8303, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.3234282139505787, |
|
"grad_norm": 0.7432876825332642, |
|
"learning_rate": 2.789915966386555e-05, |
|
"loss": 0.7016, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.32405380043791054, |
|
"grad_norm": 0.6568440198898315, |
|
"learning_rate": 2.7563025210084037e-05, |
|
"loss": 0.3467, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.3246793869252424, |
|
"grad_norm": 0.5214548110961914, |
|
"learning_rate": 2.722689075630252e-05, |
|
"loss": 0.2693, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.3253049734125743, |
|
"grad_norm": 0.6171154379844666, |
|
"learning_rate": 2.689075630252101e-05, |
|
"loss": 0.8636, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.32593055989990616, |
|
"grad_norm": 0.528217613697052, |
|
"learning_rate": 2.6554621848739497e-05, |
|
"loss": 0.3577, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.326556146387238, |
|
"grad_norm": 0.4560914635658264, |
|
"learning_rate": 2.6218487394957986e-05, |
|
"loss": 0.2316, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3271817328745699, |
|
"grad_norm": 0.4513761103153229, |
|
"learning_rate": 2.5882352941176475e-05, |
|
"loss": 0.2614, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.32780731936190177, |
|
"grad_norm": 0.7243900299072266, |
|
"learning_rate": 2.5546218487394957e-05, |
|
"loss": 0.6181, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.3284329058492337, |
|
"grad_norm": 1.2595303058624268, |
|
"learning_rate": 2.5210084033613446e-05, |
|
"loss": 0.6232, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3284329058492337, |
|
"eval_loss": 0.6902230381965637, |
|
"eval_runtime": 43.5369, |
|
"eval_samples_per_second": 5.88, |
|
"eval_steps_per_second": 2.94, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.32905849233656553, |
|
"grad_norm": 0.8221277594566345, |
|
"learning_rate": 2.4873949579831935e-05, |
|
"loss": 0.5717, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.3296840788238974, |
|
"grad_norm": 0.79567551612854, |
|
"learning_rate": 2.453781512605042e-05, |
|
"loss": 0.9611, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.3303096653112293, |
|
"grad_norm": 0.7971245050430298, |
|
"learning_rate": 2.420168067226891e-05, |
|
"loss": 0.3147, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.33093525179856115, |
|
"grad_norm": 0.5613353848457336, |
|
"learning_rate": 2.3865546218487398e-05, |
|
"loss": 0.2808, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.331560838285893, |
|
"grad_norm": 0.5482297539710999, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 0.3648, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3321864247732249, |
|
"grad_norm": 0.8833631873130798, |
|
"learning_rate": 2.3193277310924373e-05, |
|
"loss": 0.5692, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.33281201126055676, |
|
"grad_norm": 0.9152381420135498, |
|
"learning_rate": 2.2857142857142858e-05, |
|
"loss": 0.657, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.33343759774788867, |
|
"grad_norm": 4.917063236236572, |
|
"learning_rate": 2.2521008403361347e-05, |
|
"loss": 0.6422, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.3340631842352205, |
|
"grad_norm": 0.466827392578125, |
|
"learning_rate": 2.2184873949579832e-05, |
|
"loss": 0.3138, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.3346887707225524, |
|
"grad_norm": 0.4326834976673126, |
|
"learning_rate": 2.184873949579832e-05, |
|
"loss": 0.2392, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3353143572098843, |
|
"grad_norm": 0.9225325584411621, |
|
"learning_rate": 2.151260504201681e-05, |
|
"loss": 0.9698, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.33593994369721614, |
|
"grad_norm": 51.696407318115234, |
|
"learning_rate": 2.1176470588235296e-05, |
|
"loss": 1.3961, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.336565530184548, |
|
"grad_norm": 0.7156023383140564, |
|
"learning_rate": 2.0840336134453785e-05, |
|
"loss": 0.6503, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.3371911166718799, |
|
"grad_norm": 0.46554067730903625, |
|
"learning_rate": 2.050420168067227e-05, |
|
"loss": 0.3789, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.33781670315921175, |
|
"grad_norm": 1.5252093076705933, |
|
"learning_rate": 2.016806722689076e-05, |
|
"loss": 1.3196, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.33844228964654366, |
|
"grad_norm": 0.836153507232666, |
|
"learning_rate": 1.9831932773109244e-05, |
|
"loss": 0.4832, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.3390678761338755, |
|
"grad_norm": 0.6036570072174072, |
|
"learning_rate": 1.9495798319327733e-05, |
|
"loss": 0.4105, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.33969346262120736, |
|
"grad_norm": 0.8396046161651611, |
|
"learning_rate": 1.9159663865546222e-05, |
|
"loss": 0.6624, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.3403190491085393, |
|
"grad_norm": 1.1757038831710815, |
|
"learning_rate": 1.8823529411764708e-05, |
|
"loss": 0.4706, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.3409446355958711, |
|
"grad_norm": 0.3689747154712677, |
|
"learning_rate": 1.8487394957983196e-05, |
|
"loss": 0.3568, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.341570222083203, |
|
"grad_norm": 0.5986055135726929, |
|
"learning_rate": 1.8151260504201682e-05, |
|
"loss": 0.6308, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.3421958085705349, |
|
"grad_norm": 0.4873492419719696, |
|
"learning_rate": 1.781512605042017e-05, |
|
"loss": 0.3524, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.34282139505786674, |
|
"grad_norm": 0.7307869791984558, |
|
"learning_rate": 1.7478991596638656e-05, |
|
"loss": 1.0777, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.34344698154519865, |
|
"grad_norm": 0.6755880117416382, |
|
"learning_rate": 1.7142857142857145e-05, |
|
"loss": 0.3718, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.3440725680325305, |
|
"grad_norm": 0.8727423548698425, |
|
"learning_rate": 1.6806722689075634e-05, |
|
"loss": 0.6502, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3440725680325305, |
|
"eval_loss": 0.6811222434043884, |
|
"eval_runtime": 43.5707, |
|
"eval_samples_per_second": 5.876, |
|
"eval_steps_per_second": 2.938, |
|
"step": 550 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.137904338467881e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|