|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 112.94117647058823, |
|
"eval_steps": 500, |
|
"global_step": 1440, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 3.280456066131592, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 2.7006, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"eval_accuracy": 0.11666666666666667, |
|
"eval_loss": 2.678151845932007, |
|
"eval_runtime": 4.6451, |
|
"eval_samples_per_second": 38.751, |
|
"eval_steps_per_second": 1.292, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.5686274509803921, |
|
"grad_norm": 7.857480049133301, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 2.6863, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.9607843137254903, |
|
"eval_accuracy": 0.16111111111111112, |
|
"eval_loss": 2.627171039581299, |
|
"eval_runtime": 3.8353, |
|
"eval_samples_per_second": 46.933, |
|
"eval_steps_per_second": 1.564, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 3.7707808017730713, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 2.6437, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.980392156862745, |
|
"eval_accuracy": 0.28888888888888886, |
|
"eval_loss": 2.5389277935028076, |
|
"eval_runtime": 3.8826, |
|
"eval_samples_per_second": 46.36, |
|
"eval_steps_per_second": 1.545, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 3.1372549019607843, |
|
"grad_norm": 5.107277870178223, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 2.5839, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 3.9215686274509802, |
|
"grad_norm": 6.703848361968994, |
|
"learning_rate": 1.736111111111111e-05, |
|
"loss": 2.4851, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.4111111111111111, |
|
"eval_loss": 2.411587953567505, |
|
"eval_runtime": 3.8828, |
|
"eval_samples_per_second": 46.359, |
|
"eval_steps_per_second": 1.545, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 11.087530136108398, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 2.3732, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"eval_accuracy": 0.4888888888888889, |
|
"eval_loss": 2.270714521408081, |
|
"eval_runtime": 3.8466, |
|
"eval_samples_per_second": 46.794, |
|
"eval_steps_per_second": 1.56, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 5.490196078431373, |
|
"grad_norm": 7.360437393188477, |
|
"learning_rate": 2.4305555555555558e-05, |
|
"loss": 2.2546, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 5.96078431372549, |
|
"eval_accuracy": 0.5722222222222222, |
|
"eval_loss": 2.0710320472717285, |
|
"eval_runtime": 3.9713, |
|
"eval_samples_per_second": 45.326, |
|
"eval_steps_per_second": 1.511, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 6.2745098039215685, |
|
"grad_norm": 6.191979885101318, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 2.1023, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 6.980392156862745, |
|
"eval_accuracy": 0.6166666666666667, |
|
"eval_loss": 1.8370894193649292, |
|
"eval_runtime": 3.8628, |
|
"eval_samples_per_second": 46.599, |
|
"eval_steps_per_second": 1.553, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 7.690328598022461, |
|
"learning_rate": 3.125e-05, |
|
"loss": 1.9156, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 7.8431372549019605, |
|
"grad_norm": 7.919386863708496, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 1.7115, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6111111111111112, |
|
"eval_loss": 1.6161085367202759, |
|
"eval_runtime": 3.8886, |
|
"eval_samples_per_second": 46.29, |
|
"eval_steps_per_second": 1.543, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 8.627450980392156, |
|
"grad_norm": 12.924628257751465, |
|
"learning_rate": 3.8194444444444444e-05, |
|
"loss": 1.5295, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"eval_accuracy": 0.6277777777777778, |
|
"eval_loss": 1.4381340742111206, |
|
"eval_runtime": 3.8538, |
|
"eval_samples_per_second": 46.708, |
|
"eval_steps_per_second": 1.557, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 14.41945743560791, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.3366, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 9.96078431372549, |
|
"eval_accuracy": 0.65, |
|
"eval_loss": 1.2539671659469604, |
|
"eval_runtime": 3.9661, |
|
"eval_samples_per_second": 45.385, |
|
"eval_steps_per_second": 1.513, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 10.196078431372548, |
|
"grad_norm": 17.627479553222656, |
|
"learning_rate": 4.5138888888888894e-05, |
|
"loss": 1.2377, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 10.980392156862745, |
|
"grad_norm": 13.988055229187012, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 1.0556, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 10.980392156862745, |
|
"eval_accuracy": 0.6611111111111111, |
|
"eval_loss": 1.1632429361343384, |
|
"eval_runtime": 3.9305, |
|
"eval_samples_per_second": 45.796, |
|
"eval_steps_per_second": 1.527, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 12.996641159057617, |
|
"learning_rate": 4.976851851851852e-05, |
|
"loss": 0.9657, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7, |
|
"eval_loss": 1.0600230693817139, |
|
"eval_runtime": 3.9116, |
|
"eval_samples_per_second": 46.016, |
|
"eval_steps_per_second": 1.534, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 12.549019607843137, |
|
"grad_norm": 26.70894432067871, |
|
"learning_rate": 4.938271604938271e-05, |
|
"loss": 0.8703, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"eval_accuracy": 0.7222222222222222, |
|
"eval_loss": 0.9983330368995667, |
|
"eval_runtime": 3.823, |
|
"eval_samples_per_second": 47.084, |
|
"eval_steps_per_second": 1.569, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 24.295700073242188, |
|
"learning_rate": 4.899691358024692e-05, |
|
"loss": 0.8007, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 13.96078431372549, |
|
"eval_accuracy": 0.7277777777777777, |
|
"eval_loss": 0.9474301934242249, |
|
"eval_runtime": 4.0708, |
|
"eval_samples_per_second": 44.218, |
|
"eval_steps_per_second": 1.474, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 14.117647058823529, |
|
"grad_norm": 19.43092918395996, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 0.7257, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 14.901960784313726, |
|
"grad_norm": 22.165098190307617, |
|
"learning_rate": 4.8225308641975306e-05, |
|
"loss": 0.6398, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 14.980392156862745, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 0.8633670210838318, |
|
"eval_runtime": 4.1208, |
|
"eval_samples_per_second": 43.681, |
|
"eval_steps_per_second": 1.456, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 15.686274509803921, |
|
"grad_norm": 24.815359115600586, |
|
"learning_rate": 4.783950617283951e-05, |
|
"loss": 0.6023, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7277777777777777, |
|
"eval_loss": 0.8527319431304932, |
|
"eval_runtime": 3.869, |
|
"eval_samples_per_second": 46.524, |
|
"eval_steps_per_second": 1.551, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 16.470588235294116, |
|
"grad_norm": 28.00403594970703, |
|
"learning_rate": 4.745370370370371e-05, |
|
"loss": 0.583, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 16.941176470588236, |
|
"eval_accuracy": 0.7666666666666667, |
|
"eval_loss": 0.7927896976470947, |
|
"eval_runtime": 3.8557, |
|
"eval_samples_per_second": 46.684, |
|
"eval_steps_per_second": 1.556, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 17.254901960784313, |
|
"grad_norm": 30.892383575439453, |
|
"learning_rate": 4.70679012345679e-05, |
|
"loss": 0.5279, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 17.96078431372549, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7896744012832642, |
|
"eval_runtime": 3.7921, |
|
"eval_samples_per_second": 47.468, |
|
"eval_steps_per_second": 1.582, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 18.03921568627451, |
|
"grad_norm": 19.16618537902832, |
|
"learning_rate": 4.66820987654321e-05, |
|
"loss": 0.5084, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 18.823529411764707, |
|
"grad_norm": 13.686722755432129, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.4643, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 18.980392156862745, |
|
"eval_accuracy": 0.7666666666666667, |
|
"eval_loss": 0.7885976433753967, |
|
"eval_runtime": 3.8507, |
|
"eval_samples_per_second": 46.745, |
|
"eval_steps_per_second": 1.558, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 19.607843137254903, |
|
"grad_norm": 17.4645938873291, |
|
"learning_rate": 4.591049382716049e-05, |
|
"loss": 0.4296, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7328829169273376, |
|
"eval_runtime": 3.8093, |
|
"eval_samples_per_second": 47.252, |
|
"eval_steps_per_second": 1.575, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 20.392156862745097, |
|
"grad_norm": 16.678590774536133, |
|
"learning_rate": 4.5524691358024696e-05, |
|
"loss": 0.41, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 20.941176470588236, |
|
"eval_accuracy": 0.7611111111111111, |
|
"eval_loss": 0.7316663861274719, |
|
"eval_runtime": 3.8668, |
|
"eval_samples_per_second": 46.55, |
|
"eval_steps_per_second": 1.552, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 21.176470588235293, |
|
"grad_norm": 12.328489303588867, |
|
"learning_rate": 4.5138888888888894e-05, |
|
"loss": 0.3663, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 21.96078431372549, |
|
"grad_norm": 23.868070602416992, |
|
"learning_rate": 4.4753086419753084e-05, |
|
"loss": 0.3674, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 21.96078431372549, |
|
"eval_accuracy": 0.7666666666666667, |
|
"eval_loss": 0.7170845866203308, |
|
"eval_runtime": 3.956, |
|
"eval_samples_per_second": 45.5, |
|
"eval_steps_per_second": 1.517, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 22.745098039215687, |
|
"grad_norm": 21.286258697509766, |
|
"learning_rate": 4.436728395061729e-05, |
|
"loss": 0.3285, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 22.980392156862745, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7005434036254883, |
|
"eval_runtime": 3.9134, |
|
"eval_samples_per_second": 45.996, |
|
"eval_steps_per_second": 1.533, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 12.573646545410156, |
|
"learning_rate": 4.3981481481481486e-05, |
|
"loss": 0.2978, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7888888888888889, |
|
"eval_loss": 0.6576042771339417, |
|
"eval_runtime": 3.8597, |
|
"eval_samples_per_second": 46.636, |
|
"eval_steps_per_second": 1.555, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 24.313725490196077, |
|
"grad_norm": 18.386383056640625, |
|
"learning_rate": 4.359567901234568e-05, |
|
"loss": 0.293, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 24.941176470588236, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.644997239112854, |
|
"eval_runtime": 4.0291, |
|
"eval_samples_per_second": 44.675, |
|
"eval_steps_per_second": 1.489, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 25.098039215686274, |
|
"grad_norm": 18.57107925415039, |
|
"learning_rate": 4.3209876543209875e-05, |
|
"loss": 0.2665, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 25.88235294117647, |
|
"grad_norm": 16.507802963256836, |
|
"learning_rate": 4.282407407407408e-05, |
|
"loss": 0.2724, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 25.96078431372549, |
|
"eval_accuracy": 0.7888888888888889, |
|
"eval_loss": 0.6764713525772095, |
|
"eval_runtime": 3.8073, |
|
"eval_samples_per_second": 47.278, |
|
"eval_steps_per_second": 1.576, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 18.386646270751953, |
|
"learning_rate": 4.243827160493827e-05, |
|
"loss": 0.2494, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 26.980392156862745, |
|
"eval_accuracy": 0.8055555555555556, |
|
"eval_loss": 0.6826486587524414, |
|
"eval_runtime": 3.9305, |
|
"eval_samples_per_second": 45.796, |
|
"eval_steps_per_second": 1.527, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 27.45098039215686, |
|
"grad_norm": 16.895566940307617, |
|
"learning_rate": 4.205246913580247e-05, |
|
"loss": 0.2504, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8055555555555556, |
|
"eval_loss": 0.6710352301597595, |
|
"eval_runtime": 3.8264, |
|
"eval_samples_per_second": 47.041, |
|
"eval_steps_per_second": 1.568, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 28.235294117647058, |
|
"grad_norm": 14.318737983703613, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.2332, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 28.941176470588236, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 0.666705846786499, |
|
"eval_runtime": 3.9608, |
|
"eval_samples_per_second": 45.445, |
|
"eval_steps_per_second": 1.515, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 29.019607843137255, |
|
"grad_norm": 19.979778289794922, |
|
"learning_rate": 4.128086419753087e-05, |
|
"loss": 0.2071, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 29.80392156862745, |
|
"grad_norm": 14.415696144104004, |
|
"learning_rate": 4.089506172839506e-05, |
|
"loss": 0.2012, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 29.96078431372549, |
|
"eval_accuracy": 0.7944444444444444, |
|
"eval_loss": 0.7399319410324097, |
|
"eval_runtime": 3.8411, |
|
"eval_samples_per_second": 46.861, |
|
"eval_steps_per_second": 1.562, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 30.58823529411765, |
|
"grad_norm": 10.09865665435791, |
|
"learning_rate": 4.0509259259259265e-05, |
|
"loss": 0.1866, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 30.980392156862745, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7311467528343201, |
|
"eval_runtime": 4.2025, |
|
"eval_samples_per_second": 42.831, |
|
"eval_steps_per_second": 1.428, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 31.372549019607842, |
|
"grad_norm": 31.518268585205078, |
|
"learning_rate": 4.012345679012346e-05, |
|
"loss": 0.2031, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7944444444444444, |
|
"eval_loss": 0.7076573967933655, |
|
"eval_runtime": 3.8553, |
|
"eval_samples_per_second": 46.689, |
|
"eval_steps_per_second": 1.556, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 32.15686274509804, |
|
"grad_norm": 18.065155029296875, |
|
"learning_rate": 3.973765432098765e-05, |
|
"loss": 0.1969, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"grad_norm": 19.082073211669922, |
|
"learning_rate": 3.935185185185186e-05, |
|
"loss": 0.1969, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"eval_accuracy": 0.7666666666666667, |
|
"eval_loss": 0.7769466638565063, |
|
"eval_runtime": 3.9404, |
|
"eval_samples_per_second": 45.681, |
|
"eval_steps_per_second": 1.523, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 33.72549019607843, |
|
"grad_norm": 23.67195701599121, |
|
"learning_rate": 3.8966049382716055e-05, |
|
"loss": 0.1968, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 33.96078431372549, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7666174173355103, |
|
"eval_runtime": 3.7732, |
|
"eval_samples_per_second": 47.705, |
|
"eval_steps_per_second": 1.59, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 34.509803921568626, |
|
"grad_norm": 22.488903045654297, |
|
"learning_rate": 3.8580246913580246e-05, |
|
"loss": 0.1712, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 34.98039215686274, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.6795583367347717, |
|
"eval_runtime": 3.7971, |
|
"eval_samples_per_second": 47.404, |
|
"eval_steps_per_second": 1.58, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 35.294117647058826, |
|
"grad_norm": 20.480487823486328, |
|
"learning_rate": 3.8194444444444444e-05, |
|
"loss": 0.1813, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6653857827186584, |
|
"eval_runtime": 3.8151, |
|
"eval_samples_per_second": 47.181, |
|
"eval_steps_per_second": 1.573, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 36.07843137254902, |
|
"grad_norm": 15.032731056213379, |
|
"learning_rate": 3.780864197530865e-05, |
|
"loss": 0.1625, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 36.86274509803921, |
|
"grad_norm": 12.346388816833496, |
|
"learning_rate": 3.742283950617284e-05, |
|
"loss": 0.1678, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 36.94117647058823, |
|
"eval_accuracy": 0.7888888888888889, |
|
"eval_loss": 0.6851311326026917, |
|
"eval_runtime": 3.8058, |
|
"eval_samples_per_second": 47.296, |
|
"eval_steps_per_second": 1.577, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 37.64705882352941, |
|
"grad_norm": 16.994136810302734, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.1461, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 37.96078431372549, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.7054334878921509, |
|
"eval_runtime": 3.847, |
|
"eval_samples_per_second": 46.79, |
|
"eval_steps_per_second": 1.56, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 38.431372549019606, |
|
"grad_norm": 9.07701587677002, |
|
"learning_rate": 3.665123456790124e-05, |
|
"loss": 0.1244, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 38.98039215686274, |
|
"eval_accuracy": 0.8055555555555556, |
|
"eval_loss": 0.7013460993766785, |
|
"eval_runtime": 3.8047, |
|
"eval_samples_per_second": 47.31, |
|
"eval_steps_per_second": 1.577, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 39.21568627450981, |
|
"grad_norm": 9.217803955078125, |
|
"learning_rate": 3.626543209876543e-05, |
|
"loss": 0.1385, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 18.938865661621094, |
|
"learning_rate": 3.587962962962963e-05, |
|
"loss": 0.1329, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.6785274147987366, |
|
"eval_runtime": 3.86, |
|
"eval_samples_per_second": 46.632, |
|
"eval_steps_per_second": 1.554, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 40.78431372549019, |
|
"grad_norm": 12.516762733459473, |
|
"learning_rate": 3.5493827160493834e-05, |
|
"loss": 0.1186, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 40.94117647058823, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 0.7499803900718689, |
|
"eval_runtime": 3.8115, |
|
"eval_samples_per_second": 47.225, |
|
"eval_steps_per_second": 1.574, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 41.568627450980394, |
|
"grad_norm": 9.269219398498535, |
|
"learning_rate": 3.5108024691358025e-05, |
|
"loss": 0.1397, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 41.96078431372549, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.6819199919700623, |
|
"eval_runtime": 3.8462, |
|
"eval_samples_per_second": 46.799, |
|
"eval_steps_per_second": 1.56, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 42.35294117647059, |
|
"grad_norm": 23.982585906982422, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 0.1324, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 42.98039215686274, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6256746649742126, |
|
"eval_runtime": 3.8195, |
|
"eval_samples_per_second": 47.126, |
|
"eval_steps_per_second": 1.571, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 43.13725490196079, |
|
"grad_norm": 11.348409652709961, |
|
"learning_rate": 3.4336419753086427e-05, |
|
"loss": 0.1461, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 43.92156862745098, |
|
"grad_norm": 14.504171371459961, |
|
"learning_rate": 3.395061728395062e-05, |
|
"loss": 0.111, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.5938891172409058, |
|
"eval_runtime": 3.8255, |
|
"eval_samples_per_second": 47.053, |
|
"eval_steps_per_second": 1.568, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 44.705882352941174, |
|
"grad_norm": 15.88039493560791, |
|
"learning_rate": 3.3564814814814815e-05, |
|
"loss": 0.1228, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 44.94117647058823, |
|
"eval_accuracy": 0.8222222222222222, |
|
"eval_loss": 0.6379250288009644, |
|
"eval_runtime": 3.8691, |
|
"eval_samples_per_second": 46.522, |
|
"eval_steps_per_second": 1.551, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 45.490196078431374, |
|
"grad_norm": 14.967761993408203, |
|
"learning_rate": 3.317901234567901e-05, |
|
"loss": 0.1085, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 45.96078431372549, |
|
"eval_accuracy": 0.8222222222222222, |
|
"eval_loss": 0.6788524389266968, |
|
"eval_runtime": 3.8236, |
|
"eval_samples_per_second": 47.077, |
|
"eval_steps_per_second": 1.569, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 46.27450980392157, |
|
"grad_norm": 7.978495121002197, |
|
"learning_rate": 3.279320987654321e-05, |
|
"loss": 0.1234, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 46.98039215686274, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.624097466468811, |
|
"eval_runtime": 3.7905, |
|
"eval_samples_per_second": 47.487, |
|
"eval_steps_per_second": 1.583, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 47.05882352941177, |
|
"grad_norm": 21.228994369506836, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 0.1007, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 47.84313725490196, |
|
"grad_norm": 16.632568359375, |
|
"learning_rate": 3.2021604938271605e-05, |
|
"loss": 0.1129, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7888888888888889, |
|
"eval_loss": 0.750299334526062, |
|
"eval_runtime": 3.8266, |
|
"eval_samples_per_second": 47.039, |
|
"eval_steps_per_second": 1.568, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 48.627450980392155, |
|
"grad_norm": 8.629143714904785, |
|
"learning_rate": 3.16358024691358e-05, |
|
"loss": 0.1197, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 48.94117647058823, |
|
"eval_accuracy": 0.7944444444444444, |
|
"eval_loss": 0.6861774325370789, |
|
"eval_runtime": 3.8119, |
|
"eval_samples_per_second": 47.22, |
|
"eval_steps_per_second": 1.574, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 49.411764705882355, |
|
"grad_norm": 7.733061790466309, |
|
"learning_rate": 3.125e-05, |
|
"loss": 0.0898, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 49.96078431372549, |
|
"eval_accuracy": 0.7888888888888889, |
|
"eval_loss": 0.6763875484466553, |
|
"eval_runtime": 3.818, |
|
"eval_samples_per_second": 47.145, |
|
"eval_steps_per_second": 1.571, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 50.19607843137255, |
|
"grad_norm": 4.7213454246521, |
|
"learning_rate": 3.08641975308642e-05, |
|
"loss": 0.1021, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 50.98039215686274, |
|
"grad_norm": 7.109160423278809, |
|
"learning_rate": 3.04783950617284e-05, |
|
"loss": 0.1057, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 50.98039215686274, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.6338934898376465, |
|
"eval_runtime": 3.8669, |
|
"eval_samples_per_second": 46.549, |
|
"eval_steps_per_second": 1.552, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 51.76470588235294, |
|
"grad_norm": 16.793262481689453, |
|
"learning_rate": 3.0092592592592593e-05, |
|
"loss": 0.0893, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.85, |
|
"eval_loss": 0.5828067064285278, |
|
"eval_runtime": 3.8135, |
|
"eval_samples_per_second": 47.2, |
|
"eval_steps_per_second": 1.573, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 52.549019607843135, |
|
"grad_norm": 15.305388450622559, |
|
"learning_rate": 2.970679012345679e-05, |
|
"loss": 0.0736, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 52.94117647058823, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6572611331939697, |
|
"eval_runtime": 3.8338, |
|
"eval_samples_per_second": 46.951, |
|
"eval_steps_per_second": 1.565, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 13.537842750549316, |
|
"learning_rate": 2.9320987654320992e-05, |
|
"loss": 0.0752, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 53.96078431372549, |
|
"eval_accuracy": 0.7944444444444444, |
|
"eval_loss": 0.6806420087814331, |
|
"eval_runtime": 3.821, |
|
"eval_samples_per_second": 47.108, |
|
"eval_steps_per_second": 1.57, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 54.11764705882353, |
|
"grad_norm": 21.195058822631836, |
|
"learning_rate": 2.8935185185185186e-05, |
|
"loss": 0.081, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 54.90196078431372, |
|
"grad_norm": 4.959319591522217, |
|
"learning_rate": 2.8549382716049384e-05, |
|
"loss": 0.1127, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 54.98039215686274, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6222459673881531, |
|
"eval_runtime": 3.8138, |
|
"eval_samples_per_second": 47.197, |
|
"eval_steps_per_second": 1.573, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 55.68627450980392, |
|
"grad_norm": 15.941388130187988, |
|
"learning_rate": 2.8163580246913578e-05, |
|
"loss": 0.1126, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.6305037140846252, |
|
"eval_runtime": 3.7889, |
|
"eval_samples_per_second": 47.507, |
|
"eval_steps_per_second": 1.584, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 56.470588235294116, |
|
"grad_norm": 12.527265548706055, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.0874, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 56.94117647058823, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6593422293663025, |
|
"eval_runtime": 3.7695, |
|
"eval_samples_per_second": 47.751, |
|
"eval_steps_per_second": 1.592, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 57.254901960784316, |
|
"grad_norm": 7.021444797515869, |
|
"learning_rate": 2.7391975308641977e-05, |
|
"loss": 0.0806, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 57.96078431372549, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.7005773782730103, |
|
"eval_runtime": 3.7654, |
|
"eval_samples_per_second": 47.804, |
|
"eval_steps_per_second": 1.593, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 58.03921568627451, |
|
"grad_norm": 13.148822784423828, |
|
"learning_rate": 2.700617283950617e-05, |
|
"loss": 0.0862, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 58.8235294117647, |
|
"grad_norm": 21.20357322692871, |
|
"learning_rate": 2.6620370370370372e-05, |
|
"loss": 0.0978, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 58.98039215686274, |
|
"eval_accuracy": 0.8055555555555556, |
|
"eval_loss": 0.6680053472518921, |
|
"eval_runtime": 3.8444, |
|
"eval_samples_per_second": 46.821, |
|
"eval_steps_per_second": 1.561, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 59.6078431372549, |
|
"grad_norm": 14.901837348937988, |
|
"learning_rate": 2.623456790123457e-05, |
|
"loss": 0.0875, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.67389976978302, |
|
"eval_runtime": 3.8169, |
|
"eval_samples_per_second": 47.158, |
|
"eval_steps_per_second": 1.572, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 60.3921568627451, |
|
"grad_norm": 7.75632905960083, |
|
"learning_rate": 2.5848765432098764e-05, |
|
"loss": 0.0722, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 60.94117647058823, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6340806484222412, |
|
"eval_runtime": 3.804, |
|
"eval_samples_per_second": 47.318, |
|
"eval_steps_per_second": 1.577, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 61.1764705882353, |
|
"grad_norm": 15.415738105773926, |
|
"learning_rate": 2.5462962962962965e-05, |
|
"loss": 0.0901, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 61.96078431372549, |
|
"grad_norm": 4.349020004272461, |
|
"learning_rate": 2.5077160493827162e-05, |
|
"loss": 0.0942, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 61.96078431372549, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.6428362727165222, |
|
"eval_runtime": 3.7936, |
|
"eval_samples_per_second": 47.448, |
|
"eval_steps_per_second": 1.582, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 62.745098039215684, |
|
"grad_norm": 9.049278259277344, |
|
"learning_rate": 2.4691358024691357e-05, |
|
"loss": 0.0957, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 62.98039215686274, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.6757560968399048, |
|
"eval_runtime": 3.8348, |
|
"eval_samples_per_second": 46.938, |
|
"eval_steps_per_second": 1.565, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 63.529411764705884, |
|
"grad_norm": 11.159144401550293, |
|
"learning_rate": 2.4305555555555558e-05, |
|
"loss": 0.0814, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.6104480028152466, |
|
"eval_runtime": 3.8591, |
|
"eval_samples_per_second": 46.643, |
|
"eval_steps_per_second": 1.555, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 64.31372549019608, |
|
"grad_norm": 11.834258079528809, |
|
"learning_rate": 2.3919753086419755e-05, |
|
"loss": 0.077, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 64.94117647058823, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6226403713226318, |
|
"eval_runtime": 3.8932, |
|
"eval_samples_per_second": 46.235, |
|
"eval_steps_per_second": 1.541, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 65.09803921568627, |
|
"grad_norm": 11.226044654846191, |
|
"learning_rate": 2.353395061728395e-05, |
|
"loss": 0.0862, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 65.88235294117646, |
|
"grad_norm": 14.373990058898926, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 0.1004, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 65.96078431372548, |
|
"eval_accuracy": 0.8055555555555556, |
|
"eval_loss": 0.6898564696311951, |
|
"eval_runtime": 3.8229, |
|
"eval_samples_per_second": 47.085, |
|
"eval_steps_per_second": 1.569, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 8.56983757019043, |
|
"learning_rate": 2.2762345679012348e-05, |
|
"loss": 0.0697, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 66.98039215686275, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.7104570865631104, |
|
"eval_runtime": 3.7843, |
|
"eval_samples_per_second": 47.565, |
|
"eval_steps_per_second": 1.585, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 67.45098039215686, |
|
"grad_norm": 7.092602729797363, |
|
"learning_rate": 2.2376543209876542e-05, |
|
"loss": 0.0754, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.675083339214325, |
|
"eval_runtime": 3.8377, |
|
"eval_samples_per_second": 46.903, |
|
"eval_steps_per_second": 1.563, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 68.23529411764706, |
|
"grad_norm": 21.711984634399414, |
|
"learning_rate": 2.1990740740740743e-05, |
|
"loss": 0.0842, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 68.94117647058823, |
|
"eval_accuracy": 0.7833333333333333, |
|
"eval_loss": 0.6912497878074646, |
|
"eval_runtime": 4.1438, |
|
"eval_samples_per_second": 43.438, |
|
"eval_steps_per_second": 1.448, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 69.01960784313725, |
|
"grad_norm": 17.59021759033203, |
|
"learning_rate": 2.1604938271604937e-05, |
|
"loss": 0.0815, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 69.80392156862744, |
|
"grad_norm": 6.925079822540283, |
|
"learning_rate": 2.1219135802469135e-05, |
|
"loss": 0.0684, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 69.96078431372548, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.7235284447669983, |
|
"eval_runtime": 3.8887, |
|
"eval_samples_per_second": 46.288, |
|
"eval_steps_per_second": 1.543, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 70.58823529411765, |
|
"grad_norm": 10.211894989013672, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.0684, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 70.98039215686275, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.5839894413948059, |
|
"eval_runtime": 3.8698, |
|
"eval_samples_per_second": 46.514, |
|
"eval_steps_per_second": 1.55, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 71.37254901960785, |
|
"grad_norm": 7.931591987609863, |
|
"learning_rate": 2.044753086419753e-05, |
|
"loss": 0.0705, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.8222222222222222, |
|
"eval_loss": 0.6635811924934387, |
|
"eval_runtime": 3.899, |
|
"eval_samples_per_second": 46.165, |
|
"eval_steps_per_second": 1.539, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 72.15686274509804, |
|
"grad_norm": 8.439017295837402, |
|
"learning_rate": 2.006172839506173e-05, |
|
"loss": 0.0532, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 72.94117647058823, |
|
"grad_norm": 20.3321475982666, |
|
"learning_rate": 1.967592592592593e-05, |
|
"loss": 0.0681, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 72.94117647058823, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.678679883480072, |
|
"eval_runtime": 3.8456, |
|
"eval_samples_per_second": 46.807, |
|
"eval_steps_per_second": 1.56, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 73.72549019607843, |
|
"grad_norm": 16.518983840942383, |
|
"learning_rate": 1.9290123456790123e-05, |
|
"loss": 0.0906, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 73.96078431372548, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6242751479148865, |
|
"eval_runtime": 3.8813, |
|
"eval_samples_per_second": 46.376, |
|
"eval_steps_per_second": 1.546, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 74.50980392156863, |
|
"grad_norm": 10.190762519836426, |
|
"learning_rate": 1.8904320987654324e-05, |
|
"loss": 0.0453, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 74.98039215686275, |
|
"eval_accuracy": 0.8222222222222222, |
|
"eval_loss": 0.6786649823188782, |
|
"eval_runtime": 3.8468, |
|
"eval_samples_per_second": 46.793, |
|
"eval_steps_per_second": 1.56, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 75.29411764705883, |
|
"grad_norm": 12.627336502075195, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.0874, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6259381771087646, |
|
"eval_runtime": 3.9491, |
|
"eval_samples_per_second": 45.58, |
|
"eval_steps_per_second": 1.519, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 76.07843137254902, |
|
"grad_norm": 15.190587997436523, |
|
"learning_rate": 1.8132716049382716e-05, |
|
"loss": 0.0668, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 76.86274509803921, |
|
"grad_norm": 4.534496307373047, |
|
"learning_rate": 1.7746913580246917e-05, |
|
"loss": 0.051, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 76.94117647058823, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6590437293052673, |
|
"eval_runtime": 3.8024, |
|
"eval_samples_per_second": 47.339, |
|
"eval_steps_per_second": 1.578, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 77.6470588235294, |
|
"grad_norm": 12.094006538391113, |
|
"learning_rate": 1.736111111111111e-05, |
|
"loss": 0.0858, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 77.96078431372548, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6306740045547485, |
|
"eval_runtime": 4.0063, |
|
"eval_samples_per_second": 44.929, |
|
"eval_steps_per_second": 1.498, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 78.43137254901961, |
|
"grad_norm": 12.669979095458984, |
|
"learning_rate": 1.697530864197531e-05, |
|
"loss": 0.0601, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 78.98039215686275, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6041626334190369, |
|
"eval_runtime": 3.9261, |
|
"eval_samples_per_second": 45.847, |
|
"eval_steps_per_second": 1.528, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 79.2156862745098, |
|
"grad_norm": 13.32419490814209, |
|
"learning_rate": 1.6589506172839506e-05, |
|
"loss": 0.0596, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 17.491554260253906, |
|
"learning_rate": 1.6203703703703704e-05, |
|
"loss": 0.0601, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.5874945521354675, |
|
"eval_runtime": 4.0068, |
|
"eval_samples_per_second": 44.923, |
|
"eval_steps_per_second": 1.497, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 80.7843137254902, |
|
"grad_norm": 7.148036956787109, |
|
"learning_rate": 1.58179012345679e-05, |
|
"loss": 0.067, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 80.94117647058823, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6078370809555054, |
|
"eval_runtime": 3.7634, |
|
"eval_samples_per_second": 47.829, |
|
"eval_steps_per_second": 1.594, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 81.56862745098039, |
|
"grad_norm": 6.120352745056152, |
|
"learning_rate": 1.54320987654321e-05, |
|
"loss": 0.0556, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 81.96078431372548, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6006819605827332, |
|
"eval_runtime": 3.8921, |
|
"eval_samples_per_second": 46.248, |
|
"eval_steps_per_second": 1.542, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 82.3529411764706, |
|
"grad_norm": 9.7392578125, |
|
"learning_rate": 1.5046296296296297e-05, |
|
"loss": 0.0661, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 82.98039215686275, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6062378287315369, |
|
"eval_runtime": 3.7924, |
|
"eval_samples_per_second": 47.464, |
|
"eval_steps_per_second": 1.582, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 83.13725490196079, |
|
"grad_norm": 7.788672924041748, |
|
"learning_rate": 1.4660493827160496e-05, |
|
"loss": 0.0594, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 83.92156862745098, |
|
"grad_norm": 3.5243284702301025, |
|
"learning_rate": 1.4274691358024692e-05, |
|
"loss": 0.0651, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.8111111111111111, |
|
"eval_loss": 0.6387273669242859, |
|
"eval_runtime": 3.9274, |
|
"eval_samples_per_second": 45.832, |
|
"eval_steps_per_second": 1.528, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 84.70588235294117, |
|
"grad_norm": 10.037181854248047, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.0546, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 84.94117647058823, |
|
"eval_accuracy": 0.8166666666666667, |
|
"eval_loss": 0.6861324906349182, |
|
"eval_runtime": 3.8238, |
|
"eval_samples_per_second": 47.074, |
|
"eval_steps_per_second": 1.569, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 85.49019607843137, |
|
"grad_norm": 10.713313102722168, |
|
"learning_rate": 1.3503086419753085e-05, |
|
"loss": 0.0827, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 85.96078431372548, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6072664260864258, |
|
"eval_runtime": 4.2373, |
|
"eval_samples_per_second": 42.479, |
|
"eval_steps_per_second": 1.416, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 86.27450980392157, |
|
"grad_norm": 7.2301201820373535, |
|
"learning_rate": 1.3117283950617285e-05, |
|
"loss": 0.052, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 86.98039215686275, |
|
"eval_accuracy": 0.85, |
|
"eval_loss": 0.593485951423645, |
|
"eval_runtime": 3.8302, |
|
"eval_samples_per_second": 46.995, |
|
"eval_steps_per_second": 1.566, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 87.05882352941177, |
|
"grad_norm": 4.73368501663208, |
|
"learning_rate": 1.2731481481481482e-05, |
|
"loss": 0.0442, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 87.84313725490196, |
|
"grad_norm": 18.29523277282715, |
|
"learning_rate": 1.2345679012345678e-05, |
|
"loss": 0.0524, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.5899335145950317, |
|
"eval_runtime": 3.8551, |
|
"eval_samples_per_second": 46.692, |
|
"eval_steps_per_second": 1.556, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 88.62745098039215, |
|
"grad_norm": 5.7875494956970215, |
|
"learning_rate": 1.1959876543209878e-05, |
|
"loss": 0.066, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 88.94117647058823, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.5954256057739258, |
|
"eval_runtime": 3.8582, |
|
"eval_samples_per_second": 46.654, |
|
"eval_steps_per_second": 1.555, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 89.41176470588235, |
|
"grad_norm": 7.960011005401611, |
|
"learning_rate": 1.1574074074074075e-05, |
|
"loss": 0.0617, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 89.96078431372548, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6145300269126892, |
|
"eval_runtime": 3.8594, |
|
"eval_samples_per_second": 46.639, |
|
"eval_steps_per_second": 1.555, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 90.19607843137256, |
|
"grad_norm": 5.9040350914001465, |
|
"learning_rate": 1.1188271604938271e-05, |
|
"loss": 0.0373, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 90.98039215686275, |
|
"grad_norm": 7.361179828643799, |
|
"learning_rate": 1.0802469135802469e-05, |
|
"loss": 0.0572, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 90.98039215686275, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6176372766494751, |
|
"eval_runtime": 3.825, |
|
"eval_samples_per_second": 47.059, |
|
"eval_steps_per_second": 1.569, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 91.76470588235294, |
|
"grad_norm": 14.031915664672852, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 0.0719, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6406115889549255, |
|
"eval_runtime": 3.7626, |
|
"eval_samples_per_second": 47.839, |
|
"eval_steps_per_second": 1.595, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 92.54901960784314, |
|
"grad_norm": 12.152432441711426, |
|
"learning_rate": 1.0030864197530866e-05, |
|
"loss": 0.0734, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 92.94117647058823, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6484689712524414, |
|
"eval_runtime": 3.8146, |
|
"eval_samples_per_second": 47.187, |
|
"eval_steps_per_second": 1.573, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 93.33333333333333, |
|
"grad_norm": 7.38240385055542, |
|
"learning_rate": 9.645061728395062e-06, |
|
"loss": 0.0616, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 93.96078431372548, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.619816243648529, |
|
"eval_runtime": 3.8069, |
|
"eval_samples_per_second": 47.282, |
|
"eval_steps_per_second": 1.576, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 94.11764705882354, |
|
"grad_norm": 15.435483932495117, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.047, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 94.90196078431373, |
|
"grad_norm": 3.8711276054382324, |
|
"learning_rate": 8.873456790123458e-06, |
|
"loss": 0.0557, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 94.98039215686275, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6167161464691162, |
|
"eval_runtime": 3.7712, |
|
"eval_samples_per_second": 47.73, |
|
"eval_steps_per_second": 1.591, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 95.68627450980392, |
|
"grad_norm": 6.233323574066162, |
|
"learning_rate": 8.487654320987654e-06, |
|
"loss": 0.0494, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6479634642601013, |
|
"eval_runtime": 3.8139, |
|
"eval_samples_per_second": 47.196, |
|
"eval_steps_per_second": 1.573, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 96.47058823529412, |
|
"grad_norm": 2.76275897026062, |
|
"learning_rate": 8.101851851851852e-06, |
|
"loss": 0.0587, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 96.94117647058823, |
|
"eval_accuracy": 0.85, |
|
"eval_loss": 0.6075512170791626, |
|
"eval_runtime": 3.8781, |
|
"eval_samples_per_second": 46.414, |
|
"eval_steps_per_second": 1.547, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 97.25490196078431, |
|
"grad_norm": 6.258754730224609, |
|
"learning_rate": 7.71604938271605e-06, |
|
"loss": 0.052, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 97.96078431372548, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6511959433555603, |
|
"eval_runtime": 3.7767, |
|
"eval_samples_per_second": 47.661, |
|
"eval_steps_per_second": 1.589, |
|
"step": 1249 |
|
}, |
|
{ |
|
"epoch": 98.03921568627452, |
|
"grad_norm": 2.6569933891296387, |
|
"learning_rate": 7.330246913580248e-06, |
|
"loss": 0.0511, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 98.82352941176471, |
|
"grad_norm": 6.5559306144714355, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 0.0383, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 98.98039215686275, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6781744360923767, |
|
"eval_runtime": 3.7667, |
|
"eval_samples_per_second": 47.787, |
|
"eval_steps_per_second": 1.593, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 99.6078431372549, |
|
"grad_norm": 2.813370704650879, |
|
"learning_rate": 6.558641975308642e-06, |
|
"loss": 0.0499, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6542291641235352, |
|
"eval_runtime": 3.8026, |
|
"eval_samples_per_second": 47.337, |
|
"eval_steps_per_second": 1.578, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 100.3921568627451, |
|
"grad_norm": 16.95922088623047, |
|
"learning_rate": 6.172839506172839e-06, |
|
"loss": 0.0511, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 100.94117647058823, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6794776320457458, |
|
"eval_runtime": 3.8468, |
|
"eval_samples_per_second": 46.793, |
|
"eval_steps_per_second": 1.56, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 101.17647058823529, |
|
"grad_norm": 15.969926834106445, |
|
"learning_rate": 5.787037037037038e-06, |
|
"loss": 0.0527, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 101.96078431372548, |
|
"grad_norm": 15.459400177001953, |
|
"learning_rate": 5.401234567901234e-06, |
|
"loss": 0.0452, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 101.96078431372548, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6739789247512817, |
|
"eval_runtime": 3.7395, |
|
"eval_samples_per_second": 48.135, |
|
"eval_steps_per_second": 1.605, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 102.74509803921569, |
|
"grad_norm": 5.437152862548828, |
|
"learning_rate": 5.015432098765433e-06, |
|
"loss": 0.0475, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 102.98039215686275, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6615740656852722, |
|
"eval_runtime": 3.7883, |
|
"eval_samples_per_second": 47.515, |
|
"eval_steps_per_second": 1.584, |
|
"step": 1313 |
|
}, |
|
{ |
|
"epoch": 103.52941176470588, |
|
"grad_norm": 12.730355262756348, |
|
"learning_rate": 4.6296296296296296e-06, |
|
"loss": 0.0455, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6490476131439209, |
|
"eval_runtime": 3.8157, |
|
"eval_samples_per_second": 47.173, |
|
"eval_steps_per_second": 1.572, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 104.31372549019608, |
|
"grad_norm": 10.348357200622559, |
|
"learning_rate": 4.243827160493827e-06, |
|
"loss": 0.0486, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 104.94117647058823, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6331196427345276, |
|
"eval_runtime": 3.7919, |
|
"eval_samples_per_second": 47.469, |
|
"eval_steps_per_second": 1.582, |
|
"step": 1338 |
|
}, |
|
{ |
|
"epoch": 105.09803921568627, |
|
"grad_norm": 16.444259643554688, |
|
"learning_rate": 3.858024691358025e-06, |
|
"loss": 0.0442, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 105.88235294117646, |
|
"grad_norm": 7.596277236938477, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 0.0585, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 105.96078431372548, |
|
"eval_accuracy": 0.8333333333333334, |
|
"eval_loss": 0.6298839449882507, |
|
"eval_runtime": 3.7758, |
|
"eval_samples_per_second": 47.672, |
|
"eval_steps_per_second": 1.589, |
|
"step": 1351 |
|
}, |
|
{ |
|
"epoch": 106.66666666666667, |
|
"grad_norm": 13.853099822998047, |
|
"learning_rate": 3.0864197530864196e-06, |
|
"loss": 0.0549, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 106.98039215686275, |
|
"eval_accuracy": 0.8277777777777777, |
|
"eval_loss": 0.6397578120231628, |
|
"eval_runtime": 3.7676, |
|
"eval_samples_per_second": 47.776, |
|
"eval_steps_per_second": 1.593, |
|
"step": 1364 |
|
}, |
|
{ |
|
"epoch": 107.45098039215686, |
|
"grad_norm": 10.11983871459961, |
|
"learning_rate": 2.700617283950617e-06, |
|
"loss": 0.0436, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6338447332382202, |
|
"eval_runtime": 3.8768, |
|
"eval_samples_per_second": 46.43, |
|
"eval_steps_per_second": 1.548, |
|
"step": 1377 |
|
}, |
|
{ |
|
"epoch": 108.23529411764706, |
|
"grad_norm": 7.749964237213135, |
|
"learning_rate": 2.3148148148148148e-06, |
|
"loss": 0.0429, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 108.94117647058823, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6458715796470642, |
|
"eval_runtime": 3.7687, |
|
"eval_samples_per_second": 47.762, |
|
"eval_steps_per_second": 1.592, |
|
"step": 1389 |
|
}, |
|
{ |
|
"epoch": 109.01960784313725, |
|
"grad_norm": 5.384810447692871, |
|
"learning_rate": 1.9290123456790124e-06, |
|
"loss": 0.047, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 109.80392156862744, |
|
"grad_norm": 14.666231155395508, |
|
"learning_rate": 1.5432098765432098e-06, |
|
"loss": 0.0449, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 109.96078431372548, |
|
"eval_accuracy": 0.8444444444444444, |
|
"eval_loss": 0.6469634175300598, |
|
"eval_runtime": 3.785, |
|
"eval_samples_per_second": 47.556, |
|
"eval_steps_per_second": 1.585, |
|
"step": 1402 |
|
}, |
|
{ |
|
"epoch": 110.58823529411765, |
|
"grad_norm": 11.94315242767334, |
|
"learning_rate": 1.1574074074074074e-06, |
|
"loss": 0.0559, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 110.98039215686275, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.646262526512146, |
|
"eval_runtime": 3.8809, |
|
"eval_samples_per_second": 46.381, |
|
"eval_steps_per_second": 1.546, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 111.37254901960785, |
|
"grad_norm": 8.830354690551758, |
|
"learning_rate": 7.716049382716049e-07, |
|
"loss": 0.0378, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6480041742324829, |
|
"eval_runtime": 3.8373, |
|
"eval_samples_per_second": 46.908, |
|
"eval_steps_per_second": 1.564, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 112.15686274509804, |
|
"grad_norm": 16.867820739746094, |
|
"learning_rate": 3.8580246913580245e-07, |
|
"loss": 0.0509, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 112.94117647058823, |
|
"grad_norm": 13.072942733764648, |
|
"learning_rate": 0.0, |
|
"loss": 0.0476, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 112.94117647058823, |
|
"eval_accuracy": 0.8388888888888889, |
|
"eval_loss": 0.6477780342102051, |
|
"eval_runtime": 3.8103, |
|
"eval_samples_per_second": 47.24, |
|
"eval_steps_per_second": 1.575, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 112.94117647058823, |
|
"step": 1440, |
|
"total_flos": 4.607069541812011e+18, |
|
"train_loss": 0.33674598841203585, |
|
"train_runtime": 5395.5485, |
|
"train_samples_per_second": 36.03, |
|
"train_steps_per_second": 0.267 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 120, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.607069541812011e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|