|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0015, |
|
"eval_steps": 500, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1e-05, |
|
"grad_norm": 8.875, |
|
"learning_rate": 9.999999997532599e-06, |
|
"loss": 1.6459, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 2e-05, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.999999990130395e-06, |
|
"loss": 1.6742, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 3e-05, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 9.99999997779339e-06, |
|
"loss": 1.6223, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 4e-05, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 9.999999960521582e-06, |
|
"loss": 1.5398, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 5e-05, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9.999999938314972e-06, |
|
"loss": 1.5666, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 6e-05, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.999999911173561e-06, |
|
"loss": 1.5981, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 7e-05, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 9.999999879097347e-06, |
|
"loss": 1.644, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 8e-05, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.999999842086332e-06, |
|
"loss": 1.6331, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 9e-05, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.999999800140514e-06, |
|
"loss": 1.626, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0001, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 9.999999753259893e-06, |
|
"loss": 1.5778, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00011, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.99999970144447e-06, |
|
"loss": 1.6286, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00012, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.999999644694247e-06, |
|
"loss": 1.5614, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00013, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.999999583009221e-06, |
|
"loss": 1.6447, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.00014, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.999999516389394e-06, |
|
"loss": 1.5258, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00015, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.999999444834763e-06, |
|
"loss": 1.6336, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00016, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 9.999999368345333e-06, |
|
"loss": 1.6073, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00017, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999999286921101e-06, |
|
"loss": 1.5919, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.00018, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 9.999999200562065e-06, |
|
"loss": 1.543, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00019, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.99999910926823e-06, |
|
"loss": 1.6101, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0002, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.999999013039593e-06, |
|
"loss": 1.5796, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00021, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.999998911876154e-06, |
|
"loss": 1.5748, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00022, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.999998805777915e-06, |
|
"loss": 1.5479, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.00023, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.999998694744875e-06, |
|
"loss": 1.5318, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.00024, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.999998578777036e-06, |
|
"loss": 1.6259, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00025, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.999998457874392e-06, |
|
"loss": 1.5525, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.00026, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.99999833203695e-06, |
|
"loss": 1.5576, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.00027, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.999998201264707e-06, |
|
"loss": 1.3934, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00028, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.999998065557664e-06, |
|
"loss": 1.5423, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.00029, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 9.999997924915818e-06, |
|
"loss": 1.5679, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0003, |
|
"grad_norm": 1.625, |
|
"learning_rate": 9.999997779339175e-06, |
|
"loss": 1.5329, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00031, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.999997628827732e-06, |
|
"loss": 1.4603, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.00032, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.999997473381487e-06, |
|
"loss": 1.5774, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.00033, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.999997313000444e-06, |
|
"loss": 1.5522, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00034, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 9.9999971476846e-06, |
|
"loss": 1.5964, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.00035, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.999996977433957e-06, |
|
"loss": 1.6129, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.00036, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 9.999996802248514e-06, |
|
"loss": 1.548, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.00037, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.999996622128274e-06, |
|
"loss": 1.5662, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.00038, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.999996437073236e-06, |
|
"loss": 1.6197, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.00039, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.999996247083397e-06, |
|
"loss": 1.5308, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0004, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.99999605215876e-06, |
|
"loss": 1.5846, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00041, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999995852299324e-06, |
|
"loss": 1.4274, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.00042, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 9.999995647505092e-06, |
|
"loss": 1.4986, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.00043, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.99999543777606e-06, |
|
"loss": 1.5135, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.00044, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.999995223112231e-06, |
|
"loss": 1.5472, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.00045, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 9.999995003513605e-06, |
|
"loss": 1.5635, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00046, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.999994778980182e-06, |
|
"loss": 1.5506, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.00047, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 9.99999454951196e-06, |
|
"loss": 1.5071, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.00048, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.999994315108943e-06, |
|
"loss": 1.5532, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.00049, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 9.999994075771128e-06, |
|
"loss": 1.6061, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0005, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.999993831498517e-06, |
|
"loss": 1.5629, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00051, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.999993582291112e-06, |
|
"loss": 1.5536, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.00052, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.999993328148909e-06, |
|
"loss": 1.5013, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.00053, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.999993069071912e-06, |
|
"loss": 1.4943, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.00054, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 9.999992805060117e-06, |
|
"loss": 1.5057, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.00055, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.99999253611353e-06, |
|
"loss": 1.6486, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.00056, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 9.999992262232145e-06, |
|
"loss": 1.4421, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.00057, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.999991983415968e-06, |
|
"loss": 1.5346, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.00058, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.999991699664996e-06, |
|
"loss": 1.5433, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.00059, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 9.99999141097923e-06, |
|
"loss": 1.5306, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0006, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.99999111735867e-06, |
|
"loss": 1.4453, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00061, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.999990818803316e-06, |
|
"loss": 1.6219, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.00062, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.99999051531317e-06, |
|
"loss": 1.5296, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.00063, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.999990206888231e-06, |
|
"loss": 1.5165, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.00064, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.999989893528499e-06, |
|
"loss": 1.5697, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.00065, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.999989575233975e-06, |
|
"loss": 1.5121, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00066, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 9.999989252004657e-06, |
|
"loss": 1.4815, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.00067, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.999988923840551e-06, |
|
"loss": 1.4624, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.00068, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.999988590741651e-06, |
|
"loss": 1.5089, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.00069, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.999988252707961e-06, |
|
"loss": 1.4864, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0007, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 9.999987909739481e-06, |
|
"loss": 1.5284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.00071, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.99998756183621e-06, |
|
"loss": 1.4759, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.00072, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.999987208998151e-06, |
|
"loss": 1.5659, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.00073, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.9999868512253e-06, |
|
"loss": 1.6112, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.00074, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.999986488517661e-06, |
|
"loss": 1.503, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.00075, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.999986120875233e-06, |
|
"loss": 1.4688, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.00076, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.999985748298016e-06, |
|
"loss": 1.543, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.00077, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.999985370786011e-06, |
|
"loss": 1.5166, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.00078, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.999984988339219e-06, |
|
"loss": 1.5451, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.00079, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 9.999984600957639e-06, |
|
"loss": 1.5026, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0008, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.999984208641271e-06, |
|
"loss": 1.4629, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.00081, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.999983811390117e-06, |
|
"loss": 1.5866, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.00082, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 9.999983409204178e-06, |
|
"loss": 1.4361, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.00083, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.999983002083451e-06, |
|
"loss": 1.5498, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.00084, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.999982590027942e-06, |
|
"loss": 1.5787, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.00085, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 9.999982173037645e-06, |
|
"loss": 1.5674, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.00086, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.999981751112563e-06, |
|
"loss": 1.5676, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.00087, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.999981324252698e-06, |
|
"loss": 1.5031, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.00088, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.99998089245805e-06, |
|
"loss": 1.5127, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.00089, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 9.999980455728618e-06, |
|
"loss": 1.5867, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0009, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.999980014064404e-06, |
|
"loss": 1.4413, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.00091, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 9.999979567465405e-06, |
|
"loss": 1.3676, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.00092, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.999979115931626e-06, |
|
"loss": 1.5553, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.00093, |
|
"grad_norm": 0.75, |
|
"learning_rate": 9.999978659463065e-06, |
|
"loss": 1.4763, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.00094, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 9.999978198059722e-06, |
|
"loss": 1.4483, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.00095, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.9999777317216e-06, |
|
"loss": 1.5449, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.00096, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.999977260448697e-06, |
|
"loss": 1.4965, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.00097, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.999976784241014e-06, |
|
"loss": 1.596, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.00098, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.999976303098552e-06, |
|
"loss": 1.5585, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.00099, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.99997581702131e-06, |
|
"loss": 1.501, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.001, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 9.999975326009292e-06, |
|
"loss": 1.4553, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00101, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.999974830062494e-06, |
|
"loss": 1.5695, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.00102, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.99997432918092e-06, |
|
"loss": 1.4606, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.00103, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.999973823364568e-06, |
|
"loss": 1.5428, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.00104, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.99997331261344e-06, |
|
"loss": 1.5081, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.00105, |
|
"grad_norm": 1.875, |
|
"learning_rate": 9.999972796927537e-06, |
|
"loss": 1.5, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.00106, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 9.999972276306858e-06, |
|
"loss": 1.5322, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.00107, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.999971750751405e-06, |
|
"loss": 1.5786, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.00108, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.999971220261177e-06, |
|
"loss": 1.4891, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.00109, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999970684836174e-06, |
|
"loss": 1.4762, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0011, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.9999701444764e-06, |
|
"loss": 1.5361, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.00111, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.999969599181852e-06, |
|
"loss": 1.4876, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.00112, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.999969048952532e-06, |
|
"loss": 1.4864, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.00113, |
|
"grad_norm": 0.875, |
|
"learning_rate": 9.99996849378844e-06, |
|
"loss": 1.5043, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.00114, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 9.999967933689577e-06, |
|
"loss": 1.4646, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.00115, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.999967368655942e-06, |
|
"loss": 1.5111, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.00116, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.99996679868754e-06, |
|
"loss": 1.4959, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.00117, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.999966223784368e-06, |
|
"loss": 1.5595, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.00118, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 9.999965643946425e-06, |
|
"loss": 1.1576, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.00119, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.999965059173715e-06, |
|
"loss": 1.5184, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0012, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.999964469466236e-06, |
|
"loss": 1.4559, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00121, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.999963874823993e-06, |
|
"loss": 1.5143, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.00122, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999963275246983e-06, |
|
"loss": 1.5526, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.00123, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999962670735205e-06, |
|
"loss": 1.5328, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.00124, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.999962061288662e-06, |
|
"loss": 1.5333, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.00125, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.999961446907354e-06, |
|
"loss": 1.5423, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.00126, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.999960827591283e-06, |
|
"loss": 1.5687, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.00127, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 9.999960203340447e-06, |
|
"loss": 1.5186, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.00128, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.99995957415485e-06, |
|
"loss": 1.5319, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.00129, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.99995894003449e-06, |
|
"loss": 1.4797, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.0013, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.999958300979367e-06, |
|
"loss": 1.5232, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00131, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.999957656989482e-06, |
|
"loss": 1.4463, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.00132, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.99995700806484e-06, |
|
"loss": 1.4935, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.00133, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999956354205437e-06, |
|
"loss": 1.6034, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.00134, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999955695411274e-06, |
|
"loss": 1.5081, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.00135, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999955031682354e-06, |
|
"loss": 1.4503, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.00136, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999954363018675e-06, |
|
"loss": 1.3321, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.00137, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.999953689420238e-06, |
|
"loss": 1.5462, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.00138, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.999953010887047e-06, |
|
"loss": 1.5052, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.00139, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999952327419098e-06, |
|
"loss": 1.3766, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.0014, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.999951639016396e-06, |
|
"loss": 1.6074, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.00141, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.999950945678939e-06, |
|
"loss": 1.5381, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.00142, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999950247406725e-06, |
|
"loss": 1.5463, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.00143, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.99994954419976e-06, |
|
"loss": 1.5013, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.00144, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999948836058045e-06, |
|
"loss": 1.4789, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.00145, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999948122981576e-06, |
|
"loss": 1.4872, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.00146, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999947404970356e-06, |
|
"loss": 1.5426, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.00147, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.999946682024386e-06, |
|
"loss": 1.5073, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.00148, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999945954143665e-06, |
|
"loss": 1.5537, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.00149, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.999945221328198e-06, |
|
"loss": 1.5759, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0015, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999944483577982e-06, |
|
"loss": 1.5019, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.4582692888576e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|