|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7831325301204819, |
|
"eval_steps": 500, |
|
"global_step": 130, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 4.9995000000000005e-05, |
|
"loss": 1.7049, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 4.999e-05, |
|
"loss": 2.9872, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 4.9985e-05, |
|
"loss": 2.1065, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 4.9980000000000006e-05, |
|
"loss": 2.2636, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 4.9975e-05, |
|
"loss": 1.8175, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 4.997e-05, |
|
"loss": 1.5265, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 4.9965e-05, |
|
"loss": 1.3999, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 4.996e-05, |
|
"loss": 1.3867, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 4.9955e-05, |
|
"loss": 1.3329, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 4.995e-05, |
|
"loss": 1.3145, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 4.9945000000000004e-05, |
|
"loss": 1.2775, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 4.9940000000000006e-05, |
|
"loss": 1.2551, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 4.9935e-05, |
|
"loss": 1.2279, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 4.9930000000000005e-05, |
|
"loss": 1.1668, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 4.992500000000001e-05, |
|
"loss": 1.1261, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 4.992e-05, |
|
"loss": 1.1738, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 4.9915e-05, |
|
"loss": 1.1098, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 4.991e-05, |
|
"loss": 1.1268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 4.9905000000000004e-05, |
|
"loss": 1.1261, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 4.99e-05, |
|
"loss": 1.1152, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 4.9895e-05, |
|
"loss": 1.1345, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 4.9890000000000005e-05, |
|
"loss": 1.1341, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 4.9885e-05, |
|
"loss": 1.1288, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 4.9880000000000004e-05, |
|
"loss": 1.1264, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 4.9875000000000006e-05, |
|
"loss": 1.0576, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 4.987e-05, |
|
"loss": 1.0905, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 4.9865e-05, |
|
"loss": 1.0967, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 4.986e-05, |
|
"loss": 1.0937, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 4.9855e-05, |
|
"loss": 1.0597, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 4.9850000000000006e-05, |
|
"loss": 1.0706, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 4.9845e-05, |
|
"loss": 1.0608, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 4.9840000000000004e-05, |
|
"loss": 1.0817, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 4.9835000000000007e-05, |
|
"loss": 1.0739, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 4.983e-05, |
|
"loss": 1.0434, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 4.9825000000000005e-05, |
|
"loss": 1.0168, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 4.982e-05, |
|
"loss": 1.0351, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 4.9815e-05, |
|
"loss": 1.0435, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 4.981e-05, |
|
"loss": 1.0247, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 4.9805e-05, |
|
"loss": 1.0065, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 4.9800000000000004e-05, |
|
"loss": 1.0036, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 4.9795e-05, |
|
"loss": 1.0483, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 4.979e-05, |
|
"loss": 1.0252, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 4.9785000000000005e-05, |
|
"loss": 1.0344, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 4.978e-05, |
|
"loss": 1.0353, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 4.9775000000000004e-05, |
|
"loss": 1.0381, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 4.977e-05, |
|
"loss": 0.9899, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 4.9765e-05, |
|
"loss": 0.9786, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 4.976e-05, |
|
"loss": 1.0434, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 4.9755e-05, |
|
"loss": 1.0069, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 4.975e-05, |
|
"loss": 0.9665, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 4.9745000000000006e-05, |
|
"loss": 0.9754, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 4.974e-05, |
|
"loss": 0.9718, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 4.9735000000000004e-05, |
|
"loss": 0.9828, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 4.973000000000001e-05, |
|
"loss": 0.996, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 4.9725e-05, |
|
"loss": 1.026, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 4.972e-05, |
|
"loss": 0.9832, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 4.9715e-05, |
|
"loss": 0.9985, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 4.9710000000000003e-05, |
|
"loss": 1.0145, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 4.9705e-05, |
|
"loss": 0.9708, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 4.97e-05, |
|
"loss": 0.9411, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 4.9695000000000004e-05, |
|
"loss": 0.9774, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 4.969e-05, |
|
"loss": 1.0044, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 4.9685e-05, |
|
"loss": 0.9581, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 4.9680000000000005e-05, |
|
"loss": 1.0007, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 4.967500000000001e-05, |
|
"loss": 0.9589, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 4.967e-05, |
|
"loss": 0.9704, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 4.9665e-05, |
|
"loss": 0.978, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 4.966e-05, |
|
"loss": 0.9554, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 4.9655000000000005e-05, |
|
"loss": 0.9433, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 4.965e-05, |
|
"loss": 0.9905, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 4.9645e-05, |
|
"loss": 0.9578, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 4.9640000000000006e-05, |
|
"loss": 0.9069, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 4.9635e-05, |
|
"loss": 0.9647, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 4.9630000000000004e-05, |
|
"loss": 0.9377, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 4.962500000000001e-05, |
|
"loss": 0.9343, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 4.962e-05, |
|
"loss": 0.9644, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 4.9615e-05, |
|
"loss": 0.9737, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 4.961e-05, |
|
"loss": 0.9552, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 4.9605000000000004e-05, |
|
"loss": 0.9158, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 4.96e-05, |
|
"loss": 0.9278, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 4.9595e-05, |
|
"loss": 0.9464, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 4.9590000000000005e-05, |
|
"loss": 0.9672, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 4.9585e-05, |
|
"loss": 0.9487, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 4.958e-05, |
|
"loss": 0.9248, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 4.9575000000000006e-05, |
|
"loss": 0.9443, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 4.957e-05, |
|
"loss": 0.9565, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 4.9565e-05, |
|
"loss": 0.934, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 4.956e-05, |
|
"loss": 0.9198, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 4.9555e-05, |
|
"loss": 0.8948, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 4.9550000000000005e-05, |
|
"loss": 0.9336, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 4.9545e-05, |
|
"loss": 0.9117, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 4.9540000000000003e-05, |
|
"loss": 0.876, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 4.9535000000000006e-05, |
|
"loss": 0.9297, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 4.953e-05, |
|
"loss": 0.9241, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 4.9525000000000004e-05, |
|
"loss": 0.9305, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 4.952e-05, |
|
"loss": 0.9395, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 4.9515e-05, |
|
"loss": 0.9208, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 4.951e-05, |
|
"loss": 0.9112, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 4.9505e-05, |
|
"loss": 0.9159, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.902, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 4.9495e-05, |
|
"loss": 0.9412, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 4.949e-05, |
|
"loss": 0.8916, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 4.9485000000000005e-05, |
|
"loss": 0.8846, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 4.948000000000001e-05, |
|
"loss": 0.8974, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 4.9475e-05, |
|
"loss": 0.9332, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 4.947e-05, |
|
"loss": 0.9, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 4.9465e-05, |
|
"loss": 0.92, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 4.946e-05, |
|
"loss": 0.8802, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 4.9455e-05, |
|
"loss": 0.9199, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 4.945e-05, |
|
"loss": 0.8872, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 4.9445000000000005e-05, |
|
"loss": 0.8689, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 4.944e-05, |
|
"loss": 0.906, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 4.9435000000000004e-05, |
|
"loss": 0.8725, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.9430000000000006e-05, |
|
"loss": 0.8708, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.9425e-05, |
|
"loss": 0.8726, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.942e-05, |
|
"loss": 0.8584, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 4.9415e-05, |
|
"loss": 0.9218, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 4.941e-05, |
|
"loss": 0.8279, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 4.9405e-05, |
|
"loss": 0.9098, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 4.94e-05, |
|
"loss": 0.8924, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 4.9395000000000004e-05, |
|
"loss": 0.8897, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 4.939e-05, |
|
"loss": 0.876, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 4.9385e-05, |
|
"loss": 0.912, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 4.9380000000000005e-05, |
|
"loss": 0.878, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 4.937500000000001e-05, |
|
"loss": 0.8596, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 4.937e-05, |
|
"loss": 0.8849, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 4.9365e-05, |
|
"loss": 0.8471, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 4.936e-05, |
|
"loss": 0.894, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 4.9355000000000004e-05, |
|
"loss": 0.9051, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 4.935e-05, |
|
"loss": 0.8714, |
|
"step": 130 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 61, |
|
"save_steps": 10, |
|
"total_flos": 1.6591004557295616e+16, |
|
"train_batch_size": 3584, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|