|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.005, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1e-05, |
|
"grad_norm": 8.875, |
|
"learning_rate": 9.999999997532599e-06, |
|
"loss": 1.6459, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 2e-05, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.999999990130395e-06, |
|
"loss": 1.6742, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 3e-05, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 9.99999997779339e-06, |
|
"loss": 1.6223, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 4e-05, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 9.999999960521582e-06, |
|
"loss": 1.5398, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 5e-05, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9.999999938314972e-06, |
|
"loss": 1.5666, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 6e-05, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.999999911173561e-06, |
|
"loss": 1.5981, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 7e-05, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 9.999999879097347e-06, |
|
"loss": 1.644, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 8e-05, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.999999842086332e-06, |
|
"loss": 1.6331, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 9e-05, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.999999800140514e-06, |
|
"loss": 1.626, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0001, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 9.999999753259893e-06, |
|
"loss": 1.5778, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00011, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.99999970144447e-06, |
|
"loss": 1.6286, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00012, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.999999644694247e-06, |
|
"loss": 1.5614, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00013, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.999999583009221e-06, |
|
"loss": 1.6447, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.00014, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.999999516389394e-06, |
|
"loss": 1.5258, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00015, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.999999444834763e-06, |
|
"loss": 1.6336, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00016, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 9.999999368345333e-06, |
|
"loss": 1.6073, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00017, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999999286921101e-06, |
|
"loss": 1.5919, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.00018, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 9.999999200562065e-06, |
|
"loss": 1.543, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00019, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.99999910926823e-06, |
|
"loss": 1.6101, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0002, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.999999013039593e-06, |
|
"loss": 1.5796, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00021, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.999998911876154e-06, |
|
"loss": 1.5748, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00022, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 9.999998805777915e-06, |
|
"loss": 1.5479, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.00023, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.999998694744875e-06, |
|
"loss": 1.5318, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.00024, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.999998578777036e-06, |
|
"loss": 1.6259, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00025, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.999998457874392e-06, |
|
"loss": 1.5525, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.00026, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.99999833203695e-06, |
|
"loss": 1.5576, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.00027, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.999998201264707e-06, |
|
"loss": 1.3934, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00028, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.999998065557664e-06, |
|
"loss": 1.5423, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.00029, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 9.999997924915818e-06, |
|
"loss": 1.5679, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0003, |
|
"grad_norm": 1.625, |
|
"learning_rate": 9.999997779339175e-06, |
|
"loss": 1.5329, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00031, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.999997628827732e-06, |
|
"loss": 1.4603, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.00032, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.999997473381487e-06, |
|
"loss": 1.5774, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.00033, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.999997313000444e-06, |
|
"loss": 1.5522, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00034, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 9.9999971476846e-06, |
|
"loss": 1.5964, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.00035, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 9.999996977433957e-06, |
|
"loss": 1.6129, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.00036, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 9.999996802248514e-06, |
|
"loss": 1.548, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.00037, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.999996622128274e-06, |
|
"loss": 1.5662, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.00038, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.999996437073236e-06, |
|
"loss": 1.6197, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.00039, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.999996247083397e-06, |
|
"loss": 1.5308, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0004, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.99999605215876e-06, |
|
"loss": 1.5846, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00041, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999995852299324e-06, |
|
"loss": 1.4274, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.00042, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 9.999995647505092e-06, |
|
"loss": 1.4986, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.00043, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 9.99999543777606e-06, |
|
"loss": 1.5135, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.00044, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.999995223112231e-06, |
|
"loss": 1.5472, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.00045, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 9.999995003513605e-06, |
|
"loss": 1.5635, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00046, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.999994778980182e-06, |
|
"loss": 1.5506, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.00047, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 9.99999454951196e-06, |
|
"loss": 1.5071, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.00048, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 9.999994315108943e-06, |
|
"loss": 1.5532, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.00049, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 9.999994075771128e-06, |
|
"loss": 1.6061, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0005, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.999993831498517e-06, |
|
"loss": 1.5629, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00051, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.999993582291112e-06, |
|
"loss": 1.5536, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.00052, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.999993328148909e-06, |
|
"loss": 1.5013, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.00053, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.999993069071912e-06, |
|
"loss": 1.4943, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.00054, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 9.999992805060117e-06, |
|
"loss": 1.5057, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.00055, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.99999253611353e-06, |
|
"loss": 1.6486, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.00056, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 9.999992262232145e-06, |
|
"loss": 1.4421, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.00057, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.999991983415968e-06, |
|
"loss": 1.5346, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.00058, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 9.999991699664996e-06, |
|
"loss": 1.5433, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.00059, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 9.99999141097923e-06, |
|
"loss": 1.5306, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0006, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.99999111735867e-06, |
|
"loss": 1.4453, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00061, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 9.999990818803316e-06, |
|
"loss": 1.6219, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.00062, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.99999051531317e-06, |
|
"loss": 1.5296, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.00063, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.999990206888231e-06, |
|
"loss": 1.5165, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.00064, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.999989893528499e-06, |
|
"loss": 1.5697, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.00065, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.999989575233975e-06, |
|
"loss": 1.5121, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00066, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 9.999989252004657e-06, |
|
"loss": 1.4815, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.00067, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.999988923840551e-06, |
|
"loss": 1.4624, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.00068, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.999988590741651e-06, |
|
"loss": 1.5089, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.00069, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.999988252707961e-06, |
|
"loss": 1.4864, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0007, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 9.999987909739481e-06, |
|
"loss": 1.5284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.00071, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.99998756183621e-06, |
|
"loss": 1.4759, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.00072, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.999987208998151e-06, |
|
"loss": 1.5659, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.00073, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.9999868512253e-06, |
|
"loss": 1.6112, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.00074, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.999986488517661e-06, |
|
"loss": 1.503, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.00075, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.999986120875233e-06, |
|
"loss": 1.4688, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.00076, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 9.999985748298016e-06, |
|
"loss": 1.543, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.00077, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.999985370786011e-06, |
|
"loss": 1.5166, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.00078, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 9.999984988339219e-06, |
|
"loss": 1.5451, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.00079, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 9.999984600957639e-06, |
|
"loss": 1.5026, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0008, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 9.999984208641271e-06, |
|
"loss": 1.4629, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.00081, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.999983811390117e-06, |
|
"loss": 1.5866, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.00082, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 9.999983409204178e-06, |
|
"loss": 1.4361, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.00083, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 9.999983002083451e-06, |
|
"loss": 1.5498, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.00084, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.999982590027942e-06, |
|
"loss": 1.5787, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.00085, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 9.999982173037645e-06, |
|
"loss": 1.5674, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.00086, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.999981751112563e-06, |
|
"loss": 1.5676, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.00087, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.999981324252698e-06, |
|
"loss": 1.5031, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.00088, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.99998089245805e-06, |
|
"loss": 1.5127, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.00089, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 9.999980455728618e-06, |
|
"loss": 1.5867, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0009, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.999980014064404e-06, |
|
"loss": 1.4413, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.00091, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 9.999979567465405e-06, |
|
"loss": 1.3676, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.00092, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.999979115931626e-06, |
|
"loss": 1.5553, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.00093, |
|
"grad_norm": 0.75, |
|
"learning_rate": 9.999978659463065e-06, |
|
"loss": 1.4763, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.00094, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 9.999978198059722e-06, |
|
"loss": 1.4483, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.00095, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.9999777317216e-06, |
|
"loss": 1.5449, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.00096, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.999977260448697e-06, |
|
"loss": 1.4965, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.00097, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.999976784241014e-06, |
|
"loss": 1.596, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.00098, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.999976303098552e-06, |
|
"loss": 1.5585, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.00099, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.99997581702131e-06, |
|
"loss": 1.501, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.001, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 9.999975326009292e-06, |
|
"loss": 1.4553, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00101, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.999974830062494e-06, |
|
"loss": 1.5695, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.00102, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.99997432918092e-06, |
|
"loss": 1.4606, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.00103, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.999973823364568e-06, |
|
"loss": 1.5428, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.00104, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.99997331261344e-06, |
|
"loss": 1.5081, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.00105, |
|
"grad_norm": 1.875, |
|
"learning_rate": 9.999972796927537e-06, |
|
"loss": 1.5, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.00106, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 9.999972276306858e-06, |
|
"loss": 1.5322, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.00107, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.999971750751405e-06, |
|
"loss": 1.5786, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.00108, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.999971220261177e-06, |
|
"loss": 1.4891, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.00109, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.999970684836174e-06, |
|
"loss": 1.4762, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0011, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.9999701444764e-06, |
|
"loss": 1.5361, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.00111, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.999969599181852e-06, |
|
"loss": 1.4876, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.00112, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.999969048952532e-06, |
|
"loss": 1.4864, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.00113, |
|
"grad_norm": 0.875, |
|
"learning_rate": 9.99996849378844e-06, |
|
"loss": 1.5043, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.00114, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 9.999967933689577e-06, |
|
"loss": 1.4646, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.00115, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.999967368655942e-06, |
|
"loss": 1.5111, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.00116, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.99996679868754e-06, |
|
"loss": 1.4959, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.00117, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.999966223784368e-06, |
|
"loss": 1.5595, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.00118, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 9.999965643946425e-06, |
|
"loss": 1.1576, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.00119, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.999965059173715e-06, |
|
"loss": 1.5184, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0012, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.999964469466236e-06, |
|
"loss": 1.4559, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00121, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.999963874823993e-06, |
|
"loss": 1.5143, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.00122, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999963275246983e-06, |
|
"loss": 1.5526, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.00123, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999962670735205e-06, |
|
"loss": 1.5328, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.00124, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.999962061288662e-06, |
|
"loss": 1.5333, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.00125, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.999961446907354e-06, |
|
"loss": 1.5423, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.00126, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.999960827591283e-06, |
|
"loss": 1.5687, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.00127, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 9.999960203340447e-06, |
|
"loss": 1.5186, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.00128, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.99995957415485e-06, |
|
"loss": 1.5319, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.00129, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.99995894003449e-06, |
|
"loss": 1.4797, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.0013, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.999958300979367e-06, |
|
"loss": 1.5232, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00131, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.999957656989482e-06, |
|
"loss": 1.4463, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.00132, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.99995700806484e-06, |
|
"loss": 1.4935, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.00133, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999956354205437e-06, |
|
"loss": 1.6034, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.00134, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999955695411274e-06, |
|
"loss": 1.5081, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.00135, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.999955031682354e-06, |
|
"loss": 1.4503, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.00136, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999954363018675e-06, |
|
"loss": 1.3321, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.00137, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.999953689420238e-06, |
|
"loss": 1.5462, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.00138, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.999953010887047e-06, |
|
"loss": 1.5052, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.00139, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999952327419098e-06, |
|
"loss": 1.3766, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.0014, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.999951639016396e-06, |
|
"loss": 1.6074, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.00141, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.999950945678939e-06, |
|
"loss": 1.5381, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.00142, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999950247406725e-06, |
|
"loss": 1.5463, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.00143, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.99994954419976e-06, |
|
"loss": 1.5013, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.00144, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999948836058045e-06, |
|
"loss": 1.4789, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.00145, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999948122981576e-06, |
|
"loss": 1.4872, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.00146, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999947404970356e-06, |
|
"loss": 1.5426, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.00147, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.999946682024386e-06, |
|
"loss": 1.5073, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.00148, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999945954143665e-06, |
|
"loss": 1.5537, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.00149, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 9.999945221328198e-06, |
|
"loss": 1.5759, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0015, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999944483577982e-06, |
|
"loss": 1.5019, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.00151, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999943740893017e-06, |
|
"loss": 1.4706, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.00152, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 9.999942993273306e-06, |
|
"loss": 1.5303, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.00153, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.99994224071885e-06, |
|
"loss": 1.5028, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.00154, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999941483229646e-06, |
|
"loss": 1.5878, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.00155, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.9999407208057e-06, |
|
"loss": 1.4271, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.00156, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999939953447012e-06, |
|
"loss": 1.4115, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.00157, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.999939181153578e-06, |
|
"loss": 1.5243, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.00158, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999938403925404e-06, |
|
"loss": 1.4749, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.00159, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999937621762487e-06, |
|
"loss": 1.5109, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.0016, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.99993683466483e-06, |
|
"loss": 1.5235, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.00161, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999936042632433e-06, |
|
"loss": 1.5407, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.00162, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.999935245665296e-06, |
|
"loss": 1.4664, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.00163, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999934443763421e-06, |
|
"loss": 1.6013, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.00164, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99993363692681e-06, |
|
"loss": 1.5297, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.00165, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999932825155462e-06, |
|
"loss": 1.5636, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.00166, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999932008449377e-06, |
|
"loss": 1.4603, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.00167, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999931186808558e-06, |
|
"loss": 1.5884, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.00168, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999930360233003e-06, |
|
"loss": 1.4181, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.00169, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 9.999929528722716e-06, |
|
"loss": 1.4743, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.0017, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.999928692277696e-06, |
|
"loss": 1.6229, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.00171, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999927850897944e-06, |
|
"loss": 1.5356, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.00172, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999927004583462e-06, |
|
"loss": 1.5382, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.00173, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 9.99992615333425e-06, |
|
"loss": 1.5174, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.00174, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999925297150307e-06, |
|
"loss": 1.4997, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.00175, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999924436031636e-06, |
|
"loss": 1.4769, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.00176, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999923569978238e-06, |
|
"loss": 1.517, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.00177, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999922698990112e-06, |
|
"loss": 1.5596, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.00178, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999921823067263e-06, |
|
"loss": 1.5257, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.00179, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 9.999920942209686e-06, |
|
"loss": 1.4666, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.0018, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999920056417385e-06, |
|
"loss": 1.493, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00181, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.999919165690362e-06, |
|
"loss": 1.5201, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.00182, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.999918270028616e-06, |
|
"loss": 1.5152, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.00183, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999917369432148e-06, |
|
"loss": 1.5456, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.00184, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999916463900961e-06, |
|
"loss": 1.5257, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.00185, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999915553435053e-06, |
|
"loss": 1.3887, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.00186, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.999914638034426e-06, |
|
"loss": 1.5393, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.00187, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999913717699081e-06, |
|
"loss": 1.5807, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.00188, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.99991279242902e-06, |
|
"loss": 1.5917, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.00189, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999911862224242e-06, |
|
"loss": 1.4811, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.0019, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999910927084748e-06, |
|
"loss": 1.5421, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.00191, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999909987010541e-06, |
|
"loss": 1.5756, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.00192, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999909042001621e-06, |
|
"loss": 1.4599, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.00193, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999908092057988e-06, |
|
"loss": 1.4314, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.00194, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999907137179644e-06, |
|
"loss": 1.5985, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.00195, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 9.999906177366587e-06, |
|
"loss": 1.5923, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.00196, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.999905212618822e-06, |
|
"loss": 1.5388, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.00197, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 9.999904242936349e-06, |
|
"loss": 1.313, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.00198, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999903268319167e-06, |
|
"loss": 1.5182, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.00199, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99990228876728e-06, |
|
"loss": 1.4396, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.002, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.999901304280686e-06, |
|
"loss": 1.489, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00201, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999900314859388e-06, |
|
"loss": 1.4867, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.00202, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999899320503386e-06, |
|
"loss": 1.5109, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.00203, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999898321212681e-06, |
|
"loss": 1.5289, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.00204, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999897316987273e-06, |
|
"loss": 1.4972, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.00205, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999896307827167e-06, |
|
"loss": 1.4926, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.00206, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999895293732358e-06, |
|
"loss": 1.5496, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.00207, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999894274702853e-06, |
|
"loss": 1.4316, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.00208, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.999893250738649e-06, |
|
"loss": 1.5131, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.00209, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999892221839747e-06, |
|
"loss": 1.4639, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.0021, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.99989118800615e-06, |
|
"loss": 1.5029, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.00211, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999890149237857e-06, |
|
"loss": 1.5104, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.00212, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999889105534873e-06, |
|
"loss": 1.5475, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.00213, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999888056897193e-06, |
|
"loss": 1.4491, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.00214, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999887003324825e-06, |
|
"loss": 1.5361, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.00215, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999885944817762e-06, |
|
"loss": 1.5162, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.00216, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999884881376012e-06, |
|
"loss": 1.5086, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.00217, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999883812999574e-06, |
|
"loss": 1.4937, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.00218, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999882739688446e-06, |
|
"loss": 1.5327, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.00219, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999881661442635e-06, |
|
"loss": 1.4829, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.0022, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999880578262135e-06, |
|
"loss": 1.4377, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.00221, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999879490146953e-06, |
|
"loss": 1.4425, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.00222, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999878397097086e-06, |
|
"loss": 1.5688, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.00223, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 9.999877299112539e-06, |
|
"loss": 1.4533, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.00224, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.99987619619331e-06, |
|
"loss": 1.5631, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.00225, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999875088339401e-06, |
|
"loss": 1.451, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.00226, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999873975550812e-06, |
|
"loss": 1.4725, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.00227, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999872857827546e-06, |
|
"loss": 1.5389, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.00228, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.999871735169603e-06, |
|
"loss": 1.5243, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.00229, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999870607576984e-06, |
|
"loss": 1.5612, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.0023, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999869475049693e-06, |
|
"loss": 1.5469, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.00231, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999868337587725e-06, |
|
"loss": 1.3031, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.00232, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.99986719519109e-06, |
|
"loss": 1.4403, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.00233, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999866047859778e-06, |
|
"loss": 1.438, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.00234, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999864895593799e-06, |
|
"loss": 1.4921, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.00235, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.99986373839315e-06, |
|
"loss": 1.456, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.00236, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999862576257836e-06, |
|
"loss": 1.4828, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.00237, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999861409187851e-06, |
|
"loss": 1.3898, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.00238, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999860237183202e-06, |
|
"loss": 1.5977, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.00239, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.99985906024389e-06, |
|
"loss": 1.4885, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.0024, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999857878369917e-06, |
|
"loss": 1.4311, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.00241, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999856691561278e-06, |
|
"loss": 1.4091, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.00242, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.99985549981798e-06, |
|
"loss": 1.6187, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.00243, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999854303140022e-06, |
|
"loss": 1.5306, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.00244, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999853101527406e-06, |
|
"loss": 1.5455, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.00245, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 9.999851894980133e-06, |
|
"loss": 1.5096, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.00246, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999850683498204e-06, |
|
"loss": 1.4124, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.00247, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.999849467081619e-06, |
|
"loss": 1.5054, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.00248, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.999848245730382e-06, |
|
"loss": 1.5263, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.00249, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.99984701944449e-06, |
|
"loss": 1.4857, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.99984578822395e-06, |
|
"loss": 1.451, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.00251, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999844552068759e-06, |
|
"loss": 1.5749, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.00252, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999843310978919e-06, |
|
"loss": 1.5781, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.00253, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99984206495443e-06, |
|
"loss": 1.5356, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.00254, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.999840813995296e-06, |
|
"loss": 1.532, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.00255, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999839558101517e-06, |
|
"loss": 1.513, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.00256, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999838297273093e-06, |
|
"loss": 1.5514, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.00257, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999837031510027e-06, |
|
"loss": 1.529, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.00258, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.99983576081232e-06, |
|
"loss": 1.4506, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.00259, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999834485179974e-06, |
|
"loss": 1.4989, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.0026, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.999833204612988e-06, |
|
"loss": 1.4701, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.00261, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999831919111363e-06, |
|
"loss": 1.4888, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.00262, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999830628675105e-06, |
|
"loss": 1.4784, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.00263, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.99982933330421e-06, |
|
"loss": 1.5459, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.00264, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.99982803299868e-06, |
|
"loss": 1.525, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.00265, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.99982672775852e-06, |
|
"loss": 1.4649, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.00266, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999825417583727e-06, |
|
"loss": 1.5024, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.00267, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999824102474304e-06, |
|
"loss": 1.5069, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.00268, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 9.999822782430253e-06, |
|
"loss": 1.3715, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.00269, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999821457451576e-06, |
|
"loss": 1.5258, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.0027, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999820127538271e-06, |
|
"loss": 1.5928, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.00271, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999818792690344e-06, |
|
"loss": 1.4709, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.00272, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.99981745290779e-06, |
|
"loss": 1.5465, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.00273, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.999816108190616e-06, |
|
"loss": 1.4847, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.00274, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.999814758538821e-06, |
|
"loss": 1.5317, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.00275, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999813403952407e-06, |
|
"loss": 1.5641, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.00276, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999812044431374e-06, |
|
"loss": 1.5378, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.00277, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.999810679975725e-06, |
|
"loss": 1.4964, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.00278, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.999809310585462e-06, |
|
"loss": 1.3405, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.00279, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999807936260583e-06, |
|
"loss": 1.4228, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.0028, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999806557001092e-06, |
|
"loss": 1.4399, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.00281, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.99980517280699e-06, |
|
"loss": 1.4913, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.00282, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 9.999803783678276e-06, |
|
"loss": 1.4801, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.00283, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999802389614957e-06, |
|
"loss": 1.5795, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.00284, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.99980099061703e-06, |
|
"loss": 1.5647, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.00285, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999799586684495e-06, |
|
"loss": 1.4144, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.00286, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999798177817357e-06, |
|
"loss": 1.5315, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.00287, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999796764015617e-06, |
|
"loss": 1.5143, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.00288, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999795345279273e-06, |
|
"loss": 1.4468, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.00289, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 9.999793921608331e-06, |
|
"loss": 1.4191, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.0029, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99979249300279e-06, |
|
"loss": 1.5005, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.00291, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999791059462649e-06, |
|
"loss": 1.5497, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.00292, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999789620987914e-06, |
|
"loss": 1.4732, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.00293, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999788177578585e-06, |
|
"loss": 1.5058, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.00294, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999786729234661e-06, |
|
"loss": 1.5243, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.00295, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.999785275956147e-06, |
|
"loss": 1.5668, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.00296, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999783817743043e-06, |
|
"loss": 1.4874, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.00297, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999782354595349e-06, |
|
"loss": 1.5466, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.00298, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999780886513069e-06, |
|
"loss": 1.5114, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.00299, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.9997794134962e-06, |
|
"loss": 1.4656, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.003, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.99977793554475e-06, |
|
"loss": 1.4512, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.00301, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999776452658716e-06, |
|
"loss": 1.4278, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.00302, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999774964838099e-06, |
|
"loss": 1.4672, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.00303, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999773472082903e-06, |
|
"loss": 1.5091, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.00304, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999771974393129e-06, |
|
"loss": 1.4843, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.00305, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999770471768778e-06, |
|
"loss": 1.424, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.00306, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999768964209848e-06, |
|
"loss": 1.3624, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.00307, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999767451716347e-06, |
|
"loss": 1.5685, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.00308, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999765934288272e-06, |
|
"loss": 1.563, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.00309, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999764411925628e-06, |
|
"loss": 1.5428, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.0031, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999762884628413e-06, |
|
"loss": 1.5119, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.00311, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999761352396628e-06, |
|
"loss": 1.5343, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.00312, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999759815230277e-06, |
|
"loss": 1.5505, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.00313, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999758273129362e-06, |
|
"loss": 1.4687, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.00314, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999756726093883e-06, |
|
"loss": 1.4969, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.00315, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999755174123842e-06, |
|
"loss": 1.5959, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.00316, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 9.999753617219241e-06, |
|
"loss": 1.2334, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.00317, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.99975205538008e-06, |
|
"loss": 1.4491, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.00318, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999750488606362e-06, |
|
"loss": 1.468, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.00319, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999748916898088e-06, |
|
"loss": 1.4278, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99974734025526e-06, |
|
"loss": 1.5056, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.00321, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999745758677878e-06, |
|
"loss": 1.4188, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.00322, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999744172165945e-06, |
|
"loss": 1.3937, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.00323, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999742580719463e-06, |
|
"loss": 1.5149, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.00324, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999740984338433e-06, |
|
"loss": 1.4844, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.00325, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999739383022856e-06, |
|
"loss": 1.4974, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.00326, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999737776772735e-06, |
|
"loss": 1.4215, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.00327, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.99973616558807e-06, |
|
"loss": 1.578, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.00328, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999734549468863e-06, |
|
"loss": 1.5505, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.00329, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.999732928415115e-06, |
|
"loss": 1.3222, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.0033, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999731302426829e-06, |
|
"loss": 1.5815, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.00331, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999729671504007e-06, |
|
"loss": 1.4017, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.00332, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999728035646648e-06, |
|
"loss": 1.507, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.00333, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999726394854755e-06, |
|
"loss": 1.441, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.00334, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.99972474912833e-06, |
|
"loss": 1.5628, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.00335, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999723098467375e-06, |
|
"loss": 1.464, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.00336, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999721442871892e-06, |
|
"loss": 1.4567, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.00337, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99971978234188e-06, |
|
"loss": 1.5087, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.00338, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999718116877344e-06, |
|
"loss": 1.5264, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.00339, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999716446478284e-06, |
|
"loss": 1.4069, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.0034, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.9997147711447e-06, |
|
"loss": 1.4323, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.00341, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999713090876597e-06, |
|
"loss": 1.5584, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.00342, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.999711405673974e-06, |
|
"loss": 1.4787, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.00343, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999709715536833e-06, |
|
"loss": 1.5732, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.00344, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999708020465177e-06, |
|
"loss": 1.3899, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.00345, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 9.999706320459008e-06, |
|
"loss": 1.3948, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.00346, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 9.999704615518327e-06, |
|
"loss": 1.5258, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.00347, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999702905643133e-06, |
|
"loss": 1.4421, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.00348, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999701190833431e-06, |
|
"loss": 1.5647, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.00349, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.999699471089223e-06, |
|
"loss": 1.6083, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.0035, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.999697746410509e-06, |
|
"loss": 1.5287, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.00351, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.99969601679729e-06, |
|
"loss": 1.4517, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.00352, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 9.999694282249568e-06, |
|
"loss": 1.5041, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.00353, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999692542767347e-06, |
|
"loss": 1.4867, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.00354, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999690798350628e-06, |
|
"loss": 1.5221, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.00355, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999689048999411e-06, |
|
"loss": 1.5112, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.00356, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999687294713697e-06, |
|
"loss": 1.609, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.00357, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 9.999685535493491e-06, |
|
"loss": 1.4672, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.00358, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999683771338794e-06, |
|
"loss": 1.5205, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.00359, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999682002249607e-06, |
|
"loss": 1.5086, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.0036, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 9.99968022822593e-06, |
|
"loss": 1.5032, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.00361, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999678449267766e-06, |
|
"loss": 1.4734, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.00362, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.99967666537512e-06, |
|
"loss": 1.5343, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.00363, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999674876547989e-06, |
|
"loss": 1.4186, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.00364, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999673082786376e-06, |
|
"loss": 1.386, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.00365, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999671284090286e-06, |
|
"loss": 1.5629, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.00366, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999669480459716e-06, |
|
"loss": 1.5062, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.00367, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.99966767189467e-06, |
|
"loss": 1.4843, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.00368, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999665858395152e-06, |
|
"loss": 1.5003, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.00369, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.99966403996116e-06, |
|
"loss": 1.4893, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.0037, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999662216592696e-06, |
|
"loss": 1.4829, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.00371, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.999660388289765e-06, |
|
"loss": 1.4288, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.00372, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999658555052368e-06, |
|
"loss": 1.4853, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.00373, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999656716880504e-06, |
|
"loss": 1.4939, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.00374, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.999654873774177e-06, |
|
"loss": 1.5349, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.00375, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999653025733386e-06, |
|
"loss": 1.5811, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.00376, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999651172758139e-06, |
|
"loss": 1.5546, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.00377, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999649314848432e-06, |
|
"loss": 1.5265, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.00378, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.99964745200427e-06, |
|
"loss": 1.5653, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.00379, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.999645584225654e-06, |
|
"loss": 1.3955, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.0038, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999643711512586e-06, |
|
"loss": 1.5375, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.00381, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999641833865065e-06, |
|
"loss": 1.5196, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.00382, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999639951283098e-06, |
|
"loss": 1.4494, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.00383, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.99963806376668e-06, |
|
"loss": 1.5654, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.00384, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999636171315822e-06, |
|
"loss": 1.5794, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.00385, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999634273930519e-06, |
|
"loss": 1.5123, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.00386, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999632371610775e-06, |
|
"loss": 1.5589, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.00387, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.99963046435659e-06, |
|
"loss": 1.5204, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.00388, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999628552167968e-06, |
|
"loss": 1.5047, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.00389, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999626635044911e-06, |
|
"loss": 1.5133, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.0039, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.99962471298742e-06, |
|
"loss": 1.5079, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.00391, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999622785995499e-06, |
|
"loss": 1.525, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.00392, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999620854069148e-06, |
|
"loss": 1.5243, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.00393, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.999618917208367e-06, |
|
"loss": 1.5368, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.00394, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.99961697541316e-06, |
|
"loss": 1.5637, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.00395, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.99961502868353e-06, |
|
"loss": 1.5719, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.00396, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999613077019477e-06, |
|
"loss": 1.5431, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.00397, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999611120421004e-06, |
|
"loss": 1.562, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.00398, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999609158888112e-06, |
|
"loss": 1.5388, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.00399, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999607192420804e-06, |
|
"loss": 1.4451, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.004, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 9.999605221019082e-06, |
|
"loss": 1.488, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.00401, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999603244682946e-06, |
|
"loss": 1.498, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.00402, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.9996012634124e-06, |
|
"loss": 1.4949, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.00403, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 9.999599277207445e-06, |
|
"loss": 1.4222, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.00404, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999597286068085e-06, |
|
"loss": 1.4637, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.00405, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.99959528999432e-06, |
|
"loss": 1.4693, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.00406, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999593288986151e-06, |
|
"loss": 1.6096, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.00407, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999591283043581e-06, |
|
"loss": 1.4465, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.00408, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999589272166614e-06, |
|
"loss": 1.514, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.00409, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999587256355248e-06, |
|
"loss": 1.4868, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.0041, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999585235609487e-06, |
|
"loss": 1.4936, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00411, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999583209929335e-06, |
|
"loss": 1.5555, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.00412, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999581179314793e-06, |
|
"loss": 1.4915, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.00413, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999579143765861e-06, |
|
"loss": 1.5244, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.00414, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999577103282541e-06, |
|
"loss": 1.558, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.00415, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.999575057864837e-06, |
|
"loss": 1.5333, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.00416, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999573007512751e-06, |
|
"loss": 1.5035, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.00417, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999570952226284e-06, |
|
"loss": 1.5245, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.00418, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999568892005436e-06, |
|
"loss": 1.4968, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.00419, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 9.999566826850214e-06, |
|
"loss": 1.4418, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.0042, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999564756760616e-06, |
|
"loss": 1.6185, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.00421, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999562681736645e-06, |
|
"loss": 1.4845, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.00422, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 9.999560601778303e-06, |
|
"loss": 1.3121, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.00423, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999558516885595e-06, |
|
"loss": 1.5696, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.00424, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.99955642705852e-06, |
|
"loss": 1.4734, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.00425, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999554332297079e-06, |
|
"loss": 1.5252, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.00426, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999552232601275e-06, |
|
"loss": 1.5028, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.00427, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999550127971113e-06, |
|
"loss": 1.428, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.00428, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999548018406591e-06, |
|
"loss": 1.5138, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.00429, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999545903907714e-06, |
|
"loss": 1.5135, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.0043, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999543784474484e-06, |
|
"loss": 1.5209, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.00431, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 9.9995416601069e-06, |
|
"loss": 1.5048, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.00432, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999539530804967e-06, |
|
"loss": 1.5185, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.00433, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999537396568685e-06, |
|
"loss": 1.5269, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.00434, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999535257398058e-06, |
|
"loss": 1.562, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.00435, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.99953311329309e-06, |
|
"loss": 1.4944, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.00436, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.99953096425378e-06, |
|
"loss": 1.487, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.00437, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999528810280128e-06, |
|
"loss": 1.5033, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.00438, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.99952665137214e-06, |
|
"loss": 1.5232, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.00439, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999524487529817e-06, |
|
"loss": 1.5764, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.0044, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.99952231875316e-06, |
|
"loss": 1.5068, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.00441, |
|
"grad_norm": 0.625, |
|
"learning_rate": 9.999520145042174e-06, |
|
"loss": 1.5465, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.00442, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 9.999517966396859e-06, |
|
"loss": 1.5235, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.00443, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999515782817217e-06, |
|
"loss": 1.5467, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.00444, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999513594303251e-06, |
|
"loss": 1.4685, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.00445, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 9.999511400854964e-06, |
|
"loss": 1.5144, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.00446, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999509202472356e-06, |
|
"loss": 1.5127, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.00447, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 9.99950699915543e-06, |
|
"loss": 1.4352, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.00448, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.99950479090419e-06, |
|
"loss": 1.568, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.00449, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.999502577718634e-06, |
|
"loss": 1.4572, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.0045, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999500359598769e-06, |
|
"loss": 1.4915, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.00451, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999498136544594e-06, |
|
"loss": 1.5833, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.00452, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999495908556112e-06, |
|
"loss": 1.5356, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.00453, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.999493675633325e-06, |
|
"loss": 1.4961, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.00454, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999491437776236e-06, |
|
"loss": 1.4887, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.00455, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999489194984846e-06, |
|
"loss": 1.5002, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.00456, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999486947259159e-06, |
|
"loss": 1.5023, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.00457, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999484694599174e-06, |
|
"loss": 1.5351, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.00458, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999482437004898e-06, |
|
"loss": 1.45, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.00459, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.99948017447633e-06, |
|
"loss": 1.496, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0046, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999477907013473e-06, |
|
"loss": 1.6283, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.00461, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999475634616328e-06, |
|
"loss": 1.419, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.00462, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 9.999473357284898e-06, |
|
"loss": 1.6006, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.00463, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.999471075019188e-06, |
|
"loss": 1.5436, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.00464, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999468787819195e-06, |
|
"loss": 1.5771, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.00465, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999466495684926e-06, |
|
"loss": 1.4976, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.00466, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.999464198616381e-06, |
|
"loss": 1.4614, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.00467, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999461896613563e-06, |
|
"loss": 1.573, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.00468, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999459589676473e-06, |
|
"loss": 1.5366, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.00469, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999457277805113e-06, |
|
"loss": 1.4971, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.0047, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.999454960999488e-06, |
|
"loss": 1.5383, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.00471, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999452639259599e-06, |
|
"loss": 1.5175, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.00472, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.999450312585449e-06, |
|
"loss": 1.481, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.00473, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999447980977037e-06, |
|
"loss": 1.501, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.00474, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 9.99944564443437e-06, |
|
"loss": 1.5477, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.00475, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999443302957446e-06, |
|
"loss": 1.5133, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.00476, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 9.99944095654627e-06, |
|
"loss": 1.5842, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.00477, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999438605200841e-06, |
|
"loss": 1.4929, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.00478, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999436248921167e-06, |
|
"loss": 1.4826, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.00479, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999433887707246e-06, |
|
"loss": 1.5088, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.0048, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999431521559081e-06, |
|
"loss": 1.5755, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.00481, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.999429150476678e-06, |
|
"loss": 1.5071, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.00482, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 9.999426774460032e-06, |
|
"loss": 1.6016, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.00483, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999424393509153e-06, |
|
"loss": 1.5384, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.00484, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 9.999422007624038e-06, |
|
"loss": 1.4805, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.00485, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999419616804693e-06, |
|
"loss": 1.4774, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.00486, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999417221051116e-06, |
|
"loss": 1.4592, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.00487, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.999414820363313e-06, |
|
"loss": 1.5434, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.00488, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 9.999412414741286e-06, |
|
"loss": 1.5352, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.00489, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.999410004185038e-06, |
|
"loss": 1.4044, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.0049, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999407588694568e-06, |
|
"loss": 1.4273, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.00491, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.99940516826988e-06, |
|
"loss": 1.5409, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.00492, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 9.999402742910977e-06, |
|
"loss": 1.5455, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.00493, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 9.999400312617863e-06, |
|
"loss": 1.461, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.00494, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999397877390538e-06, |
|
"loss": 1.446, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.00495, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999395437229004e-06, |
|
"loss": 1.5338, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.00496, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.999392992133265e-06, |
|
"loss": 1.5947, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.00497, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 9.999390542103325e-06, |
|
"loss": 1.5632, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.00498, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.999388087139182e-06, |
|
"loss": 1.441, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.00499, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.99938562724084e-06, |
|
"loss": 1.5702, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.999383162408303e-06, |
|
"loss": 1.5134, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1527564296192e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|