|
{ |
|
"best_metric": 0.9047689437866211, |
|
"best_model_checkpoint": "data/Llama-31-8B_task-1_180-samples_config-4_full/checkpoint-391", |
|
"epoch": 52.94117647058823, |
|
"eval_steps": 500, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 1.835170865058899, |
|
"learning_rate": 8.333333333333334e-08, |
|
"loss": 2.4537, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 1.8771226406097412, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 2.3789, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 2.0154590606689453, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 2.4445, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 2.0653038024902344, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 2.4809, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 1.925260305404663, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 2.4745, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"eval_loss": 2.43351149559021, |
|
"eval_runtime": 14.4922, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 1.789984107017517, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 2.4155, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 1.7350174188613892, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 2.4448, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 1.7722196578979492, |
|
"learning_rate": 1.1666666666666668e-06, |
|
"loss": 2.4888, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 1.6953096389770508, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 2.4286, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.411425828933716, |
|
"eval_runtime": 14.4913, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 1.5511802434921265, |
|
"learning_rate": 1.5e-06, |
|
"loss": 2.4245, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 1.547842025756836, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 2.4254, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 1.5457754135131836, |
|
"learning_rate": 1.8333333333333333e-06, |
|
"loss": 2.4147, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 1.5168875455856323, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.419, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"eval_loss": 2.381402015686035, |
|
"eval_runtime": 14.4968, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.0588235294117645, |
|
"grad_norm": 1.573783278465271, |
|
"learning_rate": 2.166666666666667e-06, |
|
"loss": 2.375, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.2941176470588234, |
|
"grad_norm": 1.4476714134216309, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 2.3949, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 1.569410800933838, |
|
"learning_rate": 2.5e-06, |
|
"loss": 2.3609, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.764705882352941, |
|
"grad_norm": 1.604489803314209, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 2.3778, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.7640857696533203, |
|
"learning_rate": 2.8333333333333335e-06, |
|
"loss": 2.3475, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.326162099838257, |
|
"eval_runtime": 14.494, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 1.7499988079071045, |
|
"learning_rate": 3e-06, |
|
"loss": 2.3113, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 1.8631538152694702, |
|
"learning_rate": 3.1666666666666667e-06, |
|
"loss": 2.3479, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 1.75290846824646, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 2.2702, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"grad_norm": 1.6694003343582153, |
|
"learning_rate": 3.5e-06, |
|
"loss": 2.3147, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"eval_loss": 2.254105567932129, |
|
"eval_runtime": 14.4974, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 5.176470588235294, |
|
"grad_norm": 1.4519027471542358, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 2.2606, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 5.411764705882353, |
|
"grad_norm": 1.507379174232483, |
|
"learning_rate": 3.833333333333334e-06, |
|
"loss": 2.1865, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 5.647058823529412, |
|
"grad_norm": 1.6610608100891113, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.2445, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 1.8665122985839844, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 2.2214, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.1715917587280273, |
|
"eval_runtime": 14.4909, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 6.117647058823529, |
|
"grad_norm": 2.0021536350250244, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 2.1689, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 6.352941176470588, |
|
"grad_norm": 1.3459073305130005, |
|
"learning_rate": 4.5e-06, |
|
"loss": 2.1659, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 6.588235294117647, |
|
"grad_norm": 1.1446928977966309, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 2.1603, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 6.823529411764706, |
|
"grad_norm": 1.0907998085021973, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 2.1097, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 6.9411764705882355, |
|
"eval_loss": 2.074471950531006, |
|
"eval_runtime": 14.4917, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 1.0119956731796265, |
|
"learning_rate": 5e-06, |
|
"loss": 2.1078, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.294117647058823, |
|
"grad_norm": 1.069096326828003, |
|
"learning_rate": 5.1666666666666675e-06, |
|
"loss": 2.0802, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 7.529411764705882, |
|
"grad_norm": 1.0303503274917603, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 2.0106, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 7.764705882352941, |
|
"grad_norm": 1.0196495056152344, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 2.0143, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.940999448299408, |
|
"learning_rate": 5.666666666666667e-06, |
|
"loss": 1.9617, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.9479138851165771, |
|
"eval_runtime": 14.5012, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.9313414692878723, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 1.9513, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.470588235294118, |
|
"grad_norm": 0.8379154801368713, |
|
"learning_rate": 6e-06, |
|
"loss": 1.9401, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 8.705882352941176, |
|
"grad_norm": 0.8604364395141602, |
|
"learning_rate": 6.166666666666667e-06, |
|
"loss": 1.8323, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"grad_norm": 0.895465612411499, |
|
"learning_rate": 6.333333333333333e-06, |
|
"loss": 1.908, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"eval_loss": 1.8374704122543335, |
|
"eval_runtime": 14.5083, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 9.176470588235293, |
|
"grad_norm": 0.7938605546951294, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 1.811, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.8125358819961548, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.7889, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 9.647058823529411, |
|
"grad_norm": 0.8654666543006897, |
|
"learning_rate": 6.833333333333334e-06, |
|
"loss": 1.756, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 9.882352941176471, |
|
"grad_norm": 0.9033392071723938, |
|
"learning_rate": 7e-06, |
|
"loss": 1.7669, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.6952563524246216, |
|
"eval_runtime": 14.4915, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 10.117647058823529, |
|
"grad_norm": 0.8959240317344666, |
|
"learning_rate": 7.166666666666667e-06, |
|
"loss": 1.6923, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 10.352941176470589, |
|
"grad_norm": 0.9546213150024414, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 1.6253, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 1.0652508735656738, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.6249, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 10.823529411764707, |
|
"grad_norm": 1.0107028484344482, |
|
"learning_rate": 7.666666666666667e-06, |
|
"loss": 1.6325, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 10.941176470588236, |
|
"eval_loss": 1.5460655689239502, |
|
"eval_runtime": 14.5077, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 11.058823529411764, |
|
"grad_norm": 0.8419122099876404, |
|
"learning_rate": 7.833333333333333e-06, |
|
"loss": 1.5955, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 11.294117647058824, |
|
"grad_norm": 0.8873438835144043, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.5361, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 11.529411764705882, |
|
"grad_norm": 0.8289064168930054, |
|
"learning_rate": 8.166666666666668e-06, |
|
"loss": 1.4957, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 0.890578031539917, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.4318, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.8572049140930176, |
|
"learning_rate": 8.5e-06, |
|
"loss": 1.3201, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.3738850355148315, |
|
"eval_runtime": 14.5098, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 12.235294117647058, |
|
"grad_norm": 0.9629900455474854, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 1.3875, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 12.470588235294118, |
|
"grad_norm": 0.845135509967804, |
|
"learning_rate": 8.833333333333334e-06, |
|
"loss": 1.2982, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 12.705882352941176, |
|
"grad_norm": 0.8272332549095154, |
|
"learning_rate": 9e-06, |
|
"loss": 1.2623, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"grad_norm": 0.777883768081665, |
|
"learning_rate": 9.166666666666666e-06, |
|
"loss": 1.2477, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"eval_loss": 1.233130931854248, |
|
"eval_runtime": 14.5015, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 13.176470588235293, |
|
"grad_norm": 0.6998254656791687, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.209, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 13.411764705882353, |
|
"grad_norm": 0.6446437835693359, |
|
"learning_rate": 9.5e-06, |
|
"loss": 1.165, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 13.647058823529411, |
|
"grad_norm": 0.6399714946746826, |
|
"learning_rate": 9.666666666666667e-06, |
|
"loss": 1.1563, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 13.882352941176471, |
|
"grad_norm": 0.6025314331054688, |
|
"learning_rate": 9.833333333333333e-06, |
|
"loss": 1.163, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 1.1329914331436157, |
|
"eval_runtime": 14.4965, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 14.117647058823529, |
|
"grad_norm": 0.5692251920700073, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1526, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 14.352941176470589, |
|
"grad_norm": 0.5997884273529053, |
|
"learning_rate": 9.999915384288723e-06, |
|
"loss": 1.1087, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 14.588235294117647, |
|
"grad_norm": 0.575445294380188, |
|
"learning_rate": 9.999661540018812e-06, |
|
"loss": 1.1228, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 14.823529411764707, |
|
"grad_norm": 0.5134881734848022, |
|
"learning_rate": 9.999238475781957e-06, |
|
"loss": 1.0579, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 14.941176470588236, |
|
"eval_loss": 1.086132526397705, |
|
"eval_runtime": 14.5111, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 15.058823529411764, |
|
"grad_norm": 0.49025943875312805, |
|
"learning_rate": 9.99864620589731e-06, |
|
"loss": 1.0658, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 15.294117647058824, |
|
"grad_norm": 0.41751477122306824, |
|
"learning_rate": 9.997884750411004e-06, |
|
"loss": 1.0543, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 15.529411764705882, |
|
"grad_norm": 0.4086260199546814, |
|
"learning_rate": 9.99695413509548e-06, |
|
"loss": 1.0696, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 15.764705882352942, |
|
"grad_norm": 0.38625603914260864, |
|
"learning_rate": 9.995854391448607e-06, |
|
"loss": 1.0581, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.3801828622817993, |
|
"learning_rate": 9.994585556692624e-06, |
|
"loss": 1.0655, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 1.0610604286193848, |
|
"eval_runtime": 14.4857, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 16.235294117647058, |
|
"grad_norm": 0.3603919744491577, |
|
"learning_rate": 9.993147673772869e-06, |
|
"loss": 1.0141, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 16.470588235294116, |
|
"grad_norm": 0.3525612950325012, |
|
"learning_rate": 9.991540791356342e-06, |
|
"loss": 1.0913, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 16.705882352941178, |
|
"grad_norm": 0.34835246205329895, |
|
"learning_rate": 9.989764963830038e-06, |
|
"loss": 1.0581, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 16.941176470588236, |
|
"grad_norm": 0.3293631076812744, |
|
"learning_rate": 9.987820251299121e-06, |
|
"loss": 0.9976, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 16.941176470588236, |
|
"eval_loss": 1.0454792976379395, |
|
"eval_runtime": 14.4924, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 17.176470588235293, |
|
"grad_norm": 0.3257598578929901, |
|
"learning_rate": 9.985706719584888e-06, |
|
"loss": 1.0318, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 17.41176470588235, |
|
"grad_norm": 0.351471483707428, |
|
"learning_rate": 9.98342444022253e-06, |
|
"loss": 0.9863, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 17.647058823529413, |
|
"grad_norm": 0.35920023918151855, |
|
"learning_rate": 9.980973490458728e-06, |
|
"loss": 1.0559, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 17.88235294117647, |
|
"grad_norm": 0.3804563283920288, |
|
"learning_rate": 9.978353953249023e-06, |
|
"loss": 1.0285, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 1.0318020582199097, |
|
"eval_runtime": 14.4883, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 18.11764705882353, |
|
"grad_norm": 0.3365037739276886, |
|
"learning_rate": 9.975565917255017e-06, |
|
"loss": 0.9899, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 18.352941176470587, |
|
"grad_norm": 0.33106836676597595, |
|
"learning_rate": 9.972609476841368e-06, |
|
"loss": 1.0316, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 18.58823529411765, |
|
"grad_norm": 0.34729650616645813, |
|
"learning_rate": 9.9694847320726e-06, |
|
"loss": 1.0052, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 18.823529411764707, |
|
"grad_norm": 0.3627675473690033, |
|
"learning_rate": 9.966191788709716e-06, |
|
"loss": 0.998, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 18.941176470588236, |
|
"eval_loss": 1.0204837322235107, |
|
"eval_runtime": 14.4913, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 19.058823529411764, |
|
"grad_norm": 0.37889716029167175, |
|
"learning_rate": 9.962730758206612e-06, |
|
"loss": 0.9902, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 19.294117647058822, |
|
"grad_norm": 0.36187246441841125, |
|
"learning_rate": 9.959101757706308e-06, |
|
"loss": 1.0316, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 19.529411764705884, |
|
"grad_norm": 0.4003053903579712, |
|
"learning_rate": 9.955304910036993e-06, |
|
"loss": 0.9643, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 19.764705882352942, |
|
"grad_norm": 0.351055771112442, |
|
"learning_rate": 9.951340343707852e-06, |
|
"loss": 0.9944, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.3519394099712372, |
|
"learning_rate": 9.947208192904722e-06, |
|
"loss": 1.0038, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.01017427444458, |
|
"eval_runtime": 14.4899, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 20.235294117647058, |
|
"grad_norm": 0.3706846237182617, |
|
"learning_rate": 9.942908597485558e-06, |
|
"loss": 0.9619, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 20.470588235294116, |
|
"grad_norm": 0.39562636613845825, |
|
"learning_rate": 9.938441702975689e-06, |
|
"loss": 1.0219, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 20.705882352941178, |
|
"grad_norm": 0.48325368762016296, |
|
"learning_rate": 9.933807660562898e-06, |
|
"loss": 0.9589, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 20.941176470588236, |
|
"grad_norm": 0.4090341627597809, |
|
"learning_rate": 9.929006627092298e-06, |
|
"loss": 0.9907, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 20.941176470588236, |
|
"eval_loss": 1.0019959211349487, |
|
"eval_runtime": 14.4938, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 21.176470588235293, |
|
"grad_norm": 0.3393973708152771, |
|
"learning_rate": 9.924038765061042e-06, |
|
"loss": 0.9841, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 21.41176470588235, |
|
"grad_norm": 0.47041693329811096, |
|
"learning_rate": 9.918904242612794e-06, |
|
"loss": 1.0023, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 21.647058823529413, |
|
"grad_norm": 0.4426097273826599, |
|
"learning_rate": 9.913603233532067e-06, |
|
"loss": 0.9654, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 21.88235294117647, |
|
"grad_norm": 0.4066382646560669, |
|
"learning_rate": 9.908135917238321e-06, |
|
"loss": 0.9673, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.9929290413856506, |
|
"eval_runtime": 14.5036, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 22.11764705882353, |
|
"grad_norm": 0.4103560447692871, |
|
"learning_rate": 9.902502478779897e-06, |
|
"loss": 0.9763, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 22.352941176470587, |
|
"grad_norm": 0.3923400938510895, |
|
"learning_rate": 9.896703108827758e-06, |
|
"loss": 0.9583, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 22.58823529411765, |
|
"grad_norm": 0.43239229917526245, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 0.9429, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 22.823529411764707, |
|
"grad_norm": 0.45320191979408264, |
|
"learning_rate": 9.884607365200355e-06, |
|
"loss": 0.95, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 22.941176470588236, |
|
"eval_loss": 0.9870390295982361, |
|
"eval_runtime": 14.4888, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 23.058823529411764, |
|
"grad_norm": 0.48725754022598267, |
|
"learning_rate": 9.878311400921072e-06, |
|
"loss": 0.9369, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 23.294117647058822, |
|
"grad_norm": 0.4515678286552429, |
|
"learning_rate": 9.871850323926178e-06, |
|
"loss": 0.9978, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 0.44924628734588623, |
|
"learning_rate": 9.86522435289912e-06, |
|
"loss": 0.9126, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 23.764705882352942, |
|
"grad_norm": 0.4258614182472229, |
|
"learning_rate": 9.858433712104403e-06, |
|
"loss": 0.9716, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.46707141399383545, |
|
"learning_rate": 9.851478631379982e-06, |
|
"loss": 0.9467, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.980078399181366, |
|
"eval_runtime": 14.4952, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 24.235294117647058, |
|
"grad_norm": 0.5438560843467712, |
|
"learning_rate": 9.844359346129504e-06, |
|
"loss": 0.9756, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 24.470588235294116, |
|
"grad_norm": 0.47229835391044617, |
|
"learning_rate": 9.83707609731432e-06, |
|
"loss": 0.9285, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 24.705882352941178, |
|
"grad_norm": 0.4578736126422882, |
|
"learning_rate": 9.829629131445342e-06, |
|
"loss": 0.9288, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 24.941176470588236, |
|
"grad_norm": 0.45513561367988586, |
|
"learning_rate": 9.822018700574696e-06, |
|
"loss": 0.9423, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 24.941176470588236, |
|
"eval_loss": 0.9737484455108643, |
|
"eval_runtime": 14.4981, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 25.176470588235293, |
|
"grad_norm": 0.4210945963859558, |
|
"learning_rate": 9.81424506228719e-06, |
|
"loss": 0.9093, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 25.41176470588235, |
|
"grad_norm": 0.5261120796203613, |
|
"learning_rate": 9.806308479691595e-06, |
|
"loss": 0.9351, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 25.647058823529413, |
|
"grad_norm": 0.5155240893363953, |
|
"learning_rate": 9.798209221411748e-06, |
|
"loss": 0.9548, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 25.88235294117647, |
|
"grad_norm": 0.5193476676940918, |
|
"learning_rate": 9.789947561577445e-06, |
|
"loss": 0.937, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.9675381183624268, |
|
"eval_runtime": 14.5154, |
|
"eval_samples_per_second": 2.48, |
|
"eval_steps_per_second": 2.48, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 26.11764705882353, |
|
"grad_norm": 0.4255479574203491, |
|
"learning_rate": 9.781523779815178e-06, |
|
"loss": 0.9061, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 26.352941176470587, |
|
"grad_norm": 0.5112140774726868, |
|
"learning_rate": 9.77293816123866e-06, |
|
"loss": 0.9409, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 26.58823529411765, |
|
"grad_norm": 0.5464062094688416, |
|
"learning_rate": 9.764190996439181e-06, |
|
"loss": 0.9274, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 26.823529411764707, |
|
"grad_norm": 0.473880797624588, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 0.9035, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 26.941176470588236, |
|
"eval_loss": 0.9625819325447083, |
|
"eval_runtime": 14.5037, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 27.058823529411764, |
|
"grad_norm": 0.5459398627281189, |
|
"learning_rate": 9.74621321786517e-06, |
|
"loss": 0.9274, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 27.294117647058822, |
|
"grad_norm": 0.5077070593833923, |
|
"learning_rate": 9.736983212571646e-06, |
|
"loss": 0.911, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 27.529411764705884, |
|
"grad_norm": 0.510081946849823, |
|
"learning_rate": 9.727592877996585e-06, |
|
"loss": 0.8977, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 27.764705882352942, |
|
"grad_norm": 0.5523853898048401, |
|
"learning_rate": 9.718042531967918e-06, |
|
"loss": 0.9165, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.587631344795227, |
|
"learning_rate": 9.708332497729378e-06, |
|
"loss": 0.9074, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.9581733345985413, |
|
"eval_runtime": 14.489, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 28.235294117647058, |
|
"grad_norm": 0.5389757752418518, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.8634, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 28.470588235294116, |
|
"grad_norm": 0.5718576908111572, |
|
"learning_rate": 9.688434684610725e-06, |
|
"loss": 0.941, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 28.705882352941178, |
|
"grad_norm": 0.5238634943962097, |
|
"learning_rate": 9.678247579197658e-06, |
|
"loss": 0.9235, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 28.941176470588236, |
|
"grad_norm": 0.5517392754554749, |
|
"learning_rate": 9.667902132486009e-06, |
|
"loss": 0.8944, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 28.941176470588236, |
|
"eval_loss": 0.9534149169921875, |
|
"eval_runtime": 14.4915, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 29.176470588235293, |
|
"grad_norm": 0.6139256954193115, |
|
"learning_rate": 9.657398694630713e-06, |
|
"loss": 0.8854, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 29.41176470588235, |
|
"grad_norm": 0.5577236413955688, |
|
"learning_rate": 9.646737621134112e-06, |
|
"loss": 0.9253, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 29.647058823529413, |
|
"grad_norm": 0.638213038444519, |
|
"learning_rate": 9.635919272833938e-06, |
|
"loss": 0.9007, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 29.88235294117647, |
|
"grad_norm": 0.5903518795967102, |
|
"learning_rate": 9.62494401589108e-06, |
|
"loss": 0.8785, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.9493101835250854, |
|
"eval_runtime": 14.4864, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 30.11764705882353, |
|
"grad_norm": 0.6490582823753357, |
|
"learning_rate": 9.613812221777212e-06, |
|
"loss": 0.8782, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 30.352941176470587, |
|
"grad_norm": 0.5926429033279419, |
|
"learning_rate": 9.602524267262202e-06, |
|
"loss": 0.8326, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 30.58823529411765, |
|
"grad_norm": 0.6460428237915039, |
|
"learning_rate": 9.591080534401371e-06, |
|
"loss": 0.9207, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 30.823529411764707, |
|
"grad_norm": 0.59434574842453, |
|
"learning_rate": 9.579481410522556e-06, |
|
"loss": 0.8797, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 30.941176470588236, |
|
"eval_loss": 0.9451322555541992, |
|
"eval_runtime": 14.513, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 31.058823529411764, |
|
"grad_norm": 0.5878000855445862, |
|
"learning_rate": 9.567727288213005e-06, |
|
"loss": 0.864, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 31.294117647058822, |
|
"grad_norm": 0.5548307299613953, |
|
"learning_rate": 9.555818565306086e-06, |
|
"loss": 0.8766, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 31.529411764705884, |
|
"grad_norm": 0.6302839517593384, |
|
"learning_rate": 9.543755644867823e-06, |
|
"loss": 0.8702, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 31.764705882352942, |
|
"grad_norm": 0.6953015923500061, |
|
"learning_rate": 9.531538935183252e-06, |
|
"loss": 0.8888, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.8207558989524841, |
|
"learning_rate": 9.519168849742603e-06, |
|
"loss": 0.8764, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.9421913623809814, |
|
"eval_runtime": 14.4916, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 32.23529411764706, |
|
"grad_norm": 0.699301540851593, |
|
"learning_rate": 9.506645807227311e-06, |
|
"loss": 0.8338, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 32.470588235294116, |
|
"grad_norm": 0.7847335934638977, |
|
"learning_rate": 9.493970231495836e-06, |
|
"loss": 0.9099, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 32.705882352941174, |
|
"grad_norm": 0.7237752079963684, |
|
"learning_rate": 9.481142551569318e-06, |
|
"loss": 0.8442, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"grad_norm": 0.7358724474906921, |
|
"learning_rate": 9.468163201617063e-06, |
|
"loss": 0.8903, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"eval_loss": 0.9388971328735352, |
|
"eval_runtime": 14.4936, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 33.1764705882353, |
|
"grad_norm": 0.7364740371704102, |
|
"learning_rate": 9.45503262094184e-06, |
|
"loss": 0.8319, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 33.411764705882355, |
|
"grad_norm": 0.6712108254432678, |
|
"learning_rate": 9.441751253965022e-06, |
|
"loss": 0.8394, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 33.64705882352941, |
|
"grad_norm": 0.9382649660110474, |
|
"learning_rate": 9.428319550211531e-06, |
|
"loss": 0.8431, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 33.88235294117647, |
|
"grad_norm": 0.6663911938667297, |
|
"learning_rate": 9.414737964294636e-06, |
|
"loss": 0.8835, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 0.9377151131629944, |
|
"eval_runtime": 14.4983, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 34.11764705882353, |
|
"grad_norm": 0.9108101725578308, |
|
"learning_rate": 9.401006955900555e-06, |
|
"loss": 0.8886, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 34.35294117647059, |
|
"grad_norm": 0.6387782096862793, |
|
"learning_rate": 9.38712698977291e-06, |
|
"loss": 0.8418, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 34.588235294117645, |
|
"grad_norm": 0.7615544199943542, |
|
"learning_rate": 9.37309853569698e-06, |
|
"loss": 0.833, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 34.8235294117647, |
|
"grad_norm": 0.7540785074234009, |
|
"learning_rate": 9.358922068483813e-06, |
|
"loss": 0.8452, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 34.94117647058823, |
|
"eval_loss": 0.9331609010696411, |
|
"eval_runtime": 14.4883, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 35.05882352941177, |
|
"grad_norm": 0.9316972494125366, |
|
"learning_rate": 9.344598067954151e-06, |
|
"loss": 0.8427, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 35.294117647058826, |
|
"grad_norm": 0.8066386580467224, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.8404, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 35.529411764705884, |
|
"grad_norm": 0.7952571511268616, |
|
"learning_rate": 9.315509411179182e-06, |
|
"loss": 0.7953, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 35.76470588235294, |
|
"grad_norm": 0.7538934946060181, |
|
"learning_rate": 9.30074573947683e-06, |
|
"loss": 0.8435, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.7602026462554932, |
|
"learning_rate": 9.285836503510562e-06, |
|
"loss": 0.8777, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 0.9272398352622986, |
|
"eval_runtime": 14.4824, |
|
"eval_samples_per_second": 2.486, |
|
"eval_steps_per_second": 2.486, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 36.23529411764706, |
|
"grad_norm": 0.7411664128303528, |
|
"learning_rate": 9.27078220790263e-06, |
|
"loss": 0.8382, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 36.470588235294116, |
|
"grad_norm": 0.7392826676368713, |
|
"learning_rate": 9.255583362184998e-06, |
|
"loss": 0.8206, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 36.705882352941174, |
|
"grad_norm": 0.8133856058120728, |
|
"learning_rate": 9.24024048078213e-06, |
|
"loss": 0.8384, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 36.94117647058823, |
|
"grad_norm": 0.8298240900039673, |
|
"learning_rate": 9.224754082993553e-06, |
|
"loss": 0.8101, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 36.94117647058823, |
|
"eval_loss": 0.9257263541221619, |
|
"eval_runtime": 14.5049, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 37.1764705882353, |
|
"grad_norm": 0.756746768951416, |
|
"learning_rate": 9.209124692976287e-06, |
|
"loss": 0.8068, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 37.411764705882355, |
|
"grad_norm": 0.7719102501869202, |
|
"learning_rate": 9.193352839727122e-06, |
|
"loss": 0.8067, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 37.64705882352941, |
|
"grad_norm": 0.7538597583770752, |
|
"learning_rate": 9.177439057064684e-06, |
|
"loss": 0.821, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 37.88235294117647, |
|
"grad_norm": 0.8648045063018799, |
|
"learning_rate": 9.16138388361139e-06, |
|
"loss": 0.8526, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 0.9229225516319275, |
|
"eval_runtime": 14.498, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 38.11764705882353, |
|
"grad_norm": 0.7806123495101929, |
|
"learning_rate": 9.145187862775208e-06, |
|
"loss": 0.8019, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 38.35294117647059, |
|
"grad_norm": 0.8803067803382874, |
|
"learning_rate": 9.128851542731271e-06, |
|
"loss": 0.7752, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 38.588235294117645, |
|
"grad_norm": 0.9083964824676514, |
|
"learning_rate": 9.112375476403313e-06, |
|
"loss": 0.8176, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 38.8235294117647, |
|
"grad_norm": 0.8712770938873291, |
|
"learning_rate": 9.09576022144496e-06, |
|
"loss": 0.8228, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 38.94117647058823, |
|
"eval_loss": 0.9196635484695435, |
|
"eval_runtime": 14.5068, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 39.05882352941177, |
|
"grad_norm": 0.8734506964683533, |
|
"learning_rate": 9.079006340220862e-06, |
|
"loss": 0.8059, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 39.294117647058826, |
|
"grad_norm": 0.8067296147346497, |
|
"learning_rate": 9.062114399787648e-06, |
|
"loss": 0.7936, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 39.529411764705884, |
|
"grad_norm": 0.825115978717804, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.837, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 39.76470588235294, |
|
"grad_norm": 0.9862772226333618, |
|
"learning_rate": 9.027918632864998e-06, |
|
"loss": 0.756, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.9583683609962463, |
|
"learning_rate": 9.01061596377522e-06, |
|
"loss": 0.8066, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 0.9175994396209717, |
|
"eval_runtime": 14.4912, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 40.23529411764706, |
|
"grad_norm": 0.9128953814506531, |
|
"learning_rate": 8.993177550236464e-06, |
|
"loss": 0.8042, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 40.470588235294116, |
|
"grad_norm": 0.9705334901809692, |
|
"learning_rate": 8.97560398247424e-06, |
|
"loss": 0.8072, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 40.705882352941174, |
|
"grad_norm": 0.9346606731414795, |
|
"learning_rate": 8.957895855288517e-06, |
|
"loss": 0.7423, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 40.94117647058823, |
|
"grad_norm": 1.2344577312469482, |
|
"learning_rate": 8.94005376803361e-06, |
|
"loss": 0.7701, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 40.94117647058823, |
|
"eval_loss": 0.9198606014251709, |
|
"eval_runtime": 14.4977, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 41.1764705882353, |
|
"grad_norm": 0.9365864992141724, |
|
"learning_rate": 8.92207832459788e-06, |
|
"loss": 0.7861, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 41.411764705882355, |
|
"grad_norm": 0.8833426833152771, |
|
"learning_rate": 8.903970133383297e-06, |
|
"loss": 0.7686, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 41.64705882352941, |
|
"grad_norm": 0.9196489453315735, |
|
"learning_rate": 8.885729807284855e-06, |
|
"loss": 0.773, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 41.88235294117647, |
|
"grad_norm": 1.0096192359924316, |
|
"learning_rate": 8.867357963669821e-06, |
|
"loss": 0.8132, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_loss": 0.916193962097168, |
|
"eval_runtime": 14.4918, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 42.11764705882353, |
|
"grad_norm": 0.9623170495033264, |
|
"learning_rate": 8.84885522435684e-06, |
|
"loss": 0.7446, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 42.35294117647059, |
|
"grad_norm": 0.9639574885368347, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.8012, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 42.588235294117645, |
|
"grad_norm": 1.0695879459381104, |
|
"learning_rate": 8.811459568042092e-06, |
|
"loss": 0.7134, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 42.8235294117647, |
|
"grad_norm": 1.0046355724334717, |
|
"learning_rate": 8.792567916744346e-06, |
|
"loss": 0.7804, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 42.94117647058823, |
|
"eval_loss": 0.9103953838348389, |
|
"eval_runtime": 14.5212, |
|
"eval_samples_per_second": 2.479, |
|
"eval_steps_per_second": 2.479, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 43.05882352941177, |
|
"grad_norm": 0.9536579251289368, |
|
"learning_rate": 8.773547901113862e-06, |
|
"loss": 0.7675, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 43.294117647058826, |
|
"grad_norm": 0.9319316148757935, |
|
"learning_rate": 8.754400164907496e-06, |
|
"loss": 0.7311, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 43.529411764705884, |
|
"grad_norm": 1.153613805770874, |
|
"learning_rate": 8.735125356204982e-06, |
|
"loss": 0.7777, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 43.76470588235294, |
|
"grad_norm": 1.0393705368041992, |
|
"learning_rate": 8.715724127386971e-06, |
|
"loss": 0.7322, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.9913296103477478, |
|
"learning_rate": 8.69619713511298e-06, |
|
"loss": 0.7508, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_loss": 0.908278226852417, |
|
"eval_runtime": 14.4883, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 44.23529411764706, |
|
"grad_norm": 0.9832679033279419, |
|
"learning_rate": 8.676545040299145e-06, |
|
"loss": 0.7355, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 44.470588235294116, |
|
"grad_norm": 1.2460023164749146, |
|
"learning_rate": 8.656768508095853e-06, |
|
"loss": 0.7238, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 44.705882352941174, |
|
"grad_norm": 0.964077889919281, |
|
"learning_rate": 8.636868207865244e-06, |
|
"loss": 0.7564, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 44.94117647058823, |
|
"grad_norm": 0.9776307344436646, |
|
"learning_rate": 8.61684481315854e-06, |
|
"loss": 0.7192, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 44.94117647058823, |
|
"eval_loss": 0.9051859378814697, |
|
"eval_runtime": 14.5008, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 45.1764705882353, |
|
"grad_norm": 0.9675242304801941, |
|
"learning_rate": 8.596699001693257e-06, |
|
"loss": 0.754, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 45.411764705882355, |
|
"grad_norm": 1.1183017492294312, |
|
"learning_rate": 8.576431455330258e-06, |
|
"loss": 0.6805, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 45.64705882352941, |
|
"grad_norm": 0.9894379377365112, |
|
"learning_rate": 8.556042860050686e-06, |
|
"loss": 0.7691, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 45.88235294117647, |
|
"grad_norm": 0.8682206869125366, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.7633, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_loss": 0.9047689437866211, |
|
"eval_runtime": 14.5123, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 46.11764705882353, |
|
"grad_norm": 0.9697660803794861, |
|
"learning_rate": 8.51490528712831e-06, |
|
"loss": 0.7131, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 46.35294117647059, |
|
"grad_norm": 0.9910651445388794, |
|
"learning_rate": 8.4941577018395e-06, |
|
"loss": 0.7183, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 46.588235294117645, |
|
"grad_norm": 1.670358419418335, |
|
"learning_rate": 8.473291852294986e-06, |
|
"loss": 0.6982, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 46.8235294117647, |
|
"grad_norm": 1.1389248371124268, |
|
"learning_rate": 8.452308444726249e-06, |
|
"loss": 0.7534, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 46.94117647058823, |
|
"eval_loss": 0.9052470326423645, |
|
"eval_runtime": 14.5068, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 47.05882352941177, |
|
"grad_norm": 0.963392972946167, |
|
"learning_rate": 8.43120818934367e-06, |
|
"loss": 0.7282, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 47.294117647058826, |
|
"grad_norm": 1.0950130224227905, |
|
"learning_rate": 8.409991800312493e-06, |
|
"loss": 0.7273, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 47.529411764705884, |
|
"grad_norm": 1.0003873109817505, |
|
"learning_rate": 8.388659995728662e-06, |
|
"loss": 0.7113, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 47.76470588235294, |
|
"grad_norm": 1.1168714761734009, |
|
"learning_rate": 8.367213497594501e-06, |
|
"loss": 0.7019, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 1.1041070222854614, |
|
"learning_rate": 8.345653031794292e-06, |
|
"loss": 0.666, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 0.915124237537384, |
|
"eval_runtime": 14.4863, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 48.23529411764706, |
|
"grad_norm": 0.8982572555541992, |
|
"learning_rate": 8.323979328069689e-06, |
|
"loss": 0.6762, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 48.470588235294116, |
|
"grad_norm": 1.0614545345306396, |
|
"learning_rate": 8.302193119995038e-06, |
|
"loss": 0.6779, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 48.705882352941174, |
|
"grad_norm": 0.9689596891403198, |
|
"learning_rate": 8.280295144952537e-06, |
|
"loss": 0.7313, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 48.94117647058823, |
|
"grad_norm": 0.9231712818145752, |
|
"learning_rate": 8.258286144107277e-06, |
|
"loss": 0.7298, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 48.94117647058823, |
|
"eval_loss": 0.9143383502960205, |
|
"eval_runtime": 14.4951, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 49.1764705882353, |
|
"grad_norm": 1.089706301689148, |
|
"learning_rate": 8.236166862382163e-06, |
|
"loss": 0.6328, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 49.411764705882355, |
|
"grad_norm": 1.1155279874801636, |
|
"learning_rate": 8.213938048432697e-06, |
|
"loss": 0.6914, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 49.64705882352941, |
|
"grad_norm": 0.9705954194068909, |
|
"learning_rate": 8.191600454621642e-06, |
|
"loss": 0.7003, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 49.88235294117647, |
|
"grad_norm": 1.0692474842071533, |
|
"learning_rate": 8.16915483699355e-06, |
|
"loss": 0.6815, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 0.9157248735427856, |
|
"eval_runtime": 14.4868, |
|
"eval_samples_per_second": 2.485, |
|
"eval_steps_per_second": 2.485, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 50.11764705882353, |
|
"grad_norm": 1.0241312980651855, |
|
"learning_rate": 8.146601955249187e-06, |
|
"loss": 0.6999, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 50.35294117647059, |
|
"grad_norm": 1.097902774810791, |
|
"learning_rate": 8.123942572719801e-06, |
|
"loss": 0.6549, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 50.588235294117645, |
|
"grad_norm": 1.1850600242614746, |
|
"learning_rate": 8.101177456341301e-06, |
|
"loss": 0.6675, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 50.8235294117647, |
|
"grad_norm": 1.080406904220581, |
|
"learning_rate": 8.078307376628292e-06, |
|
"loss": 0.6845, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 50.94117647058823, |
|
"eval_loss": 0.9170003533363342, |
|
"eval_runtime": 14.5105, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 51.05882352941177, |
|
"grad_norm": 1.128921389579773, |
|
"learning_rate": 8.055333107648e-06, |
|
"loss": 0.7022, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 51.294117647058826, |
|
"grad_norm": 1.2979844808578491, |
|
"learning_rate": 8.032255426994069e-06, |
|
"loss": 0.6501, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 51.529411764705884, |
|
"grad_norm": 1.0280219316482544, |
|
"learning_rate": 8.009075115760243e-06, |
|
"loss": 0.6824, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 51.76470588235294, |
|
"grad_norm": 1.2425734996795654, |
|
"learning_rate": 7.985792958513932e-06, |
|
"loss": 0.6787, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 1.114044427871704, |
|
"learning_rate": 7.962409743269654e-06, |
|
"loss": 0.6524, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_loss": 0.9215981960296631, |
|
"eval_runtime": 14.5021, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 52.23529411764706, |
|
"grad_norm": 1.1072558164596558, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 0.6602, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 52.470588235294116, |
|
"grad_norm": 1.1433347463607788, |
|
"learning_rate": 7.915343307920674e-06, |
|
"loss": 0.6649, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 52.705882352941174, |
|
"grad_norm": 1.1459075212478638, |
|
"learning_rate": 7.891661680839932e-06, |
|
"loss": 0.6407, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 52.94117647058823, |
|
"grad_norm": 1.146519422531128, |
|
"learning_rate": 7.86788218175523e-06, |
|
"loss": 0.6397, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 52.94117647058823, |
|
"eval_loss": 0.9228353500366211, |
|
"eval_runtime": 14.5275, |
|
"eval_samples_per_second": 2.478, |
|
"eval_steps_per_second": 2.478, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 52.94117647058823, |
|
"step": 450, |
|
"total_flos": 1.1473460706410496e+17, |
|
"train_loss": 1.1512780372301739, |
|
"train_runtime": 8506.7232, |
|
"train_samples_per_second": 2.398, |
|
"train_steps_per_second": 0.141 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1473460706410496e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|