{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.979631425800194,
  "eval_steps": 5000,
  "global_step": 384,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015518913676042677,
      "grad_norm": 0.416015625,
      "learning_rate": 0.0001,
      "loss": 0.831,
      "num_input_tokens_seen": 524288,
      "step": 2
    },
    {
      "epoch": 0.031037827352085354,
      "grad_norm": 0.49609375,
      "learning_rate": 9.999323662872997e-05,
      "loss": 0.7398,
      "num_input_tokens_seen": 1048576,
      "step": 4
    },
    {
      "epoch": 0.04655674102812803,
      "grad_norm": 0.333984375,
      "learning_rate": 9.99729483446475e-05,
      "loss": 0.6438,
      "num_input_tokens_seen": 1572864,
      "step": 6
    },
    {
      "epoch": 0.06207565470417071,
      "grad_norm": 1.0703125,
      "learning_rate": 9.993914063644052e-05,
      "loss": 0.6032,
      "num_input_tokens_seen": 2097152,
      "step": 8
    },
    {
      "epoch": 0.07759456838021339,
      "grad_norm": 0.314453125,
      "learning_rate": 9.989182265027232e-05,
      "loss": 0.5433,
      "num_input_tokens_seen": 2621440,
      "step": 10
    },
    {
      "epoch": 0.09311348205625607,
      "grad_norm": 0.2236328125,
      "learning_rate": 9.98310071873072e-05,
      "loss": 0.5228,
      "num_input_tokens_seen": 3145728,
      "step": 12
    },
    {
      "epoch": 0.10863239573229874,
      "grad_norm": 0.3359375,
      "learning_rate": 9.97567107002474e-05,
      "loss": 0.4702,
      "num_input_tokens_seen": 3670016,
      "step": 14
    },
    {
      "epoch": 0.12415130940834142,
      "grad_norm": 0.2109375,
      "learning_rate": 9.966895328888194e-05,
      "loss": 0.4574,
      "num_input_tokens_seen": 4194304,
      "step": 16
    },
    {
      "epoch": 0.1396702230843841,
      "grad_norm": 0.224609375,
      "learning_rate": 9.956775869464901e-05,
      "loss": 0.5093,
      "num_input_tokens_seen": 4718592,
      "step": 18
    },
    {
      "epoch": 0.15518913676042678,
      "grad_norm": 0.25390625,
      "learning_rate": 9.945315429421306e-05,
      "loss": 0.4771,
      "num_input_tokens_seen": 5242880,
      "step": 20
    },
    {
      "epoch": 0.17070805043646944,
      "grad_norm": 0.201171875,
      "learning_rate": 9.932517109205849e-05,
      "loss": 0.4343,
      "num_input_tokens_seen": 5767168,
      "step": 22
    },
    {
      "epoch": 0.18622696411251213,
      "grad_norm": 0.1650390625,
      "learning_rate": 9.918384371210176e-05,
      "loss": 0.4455,
      "num_input_tokens_seen": 6291456,
      "step": 24
    },
    {
      "epoch": 0.2017458777885548,
      "grad_norm": 3.265625,
      "learning_rate": 9.902921038832455e-05,
      "loss": 0.4669,
      "num_input_tokens_seen": 6815744,
      "step": 26
    },
    {
      "epoch": 0.21726479146459748,
      "grad_norm": 2.421875,
      "learning_rate": 9.886131295443003e-05,
      "loss": 0.4723,
      "num_input_tokens_seen": 7340032,
      "step": 28
    },
    {
      "epoch": 0.23278370514064015,
      "grad_norm": 0.2001953125,
      "learning_rate": 9.868019683252543e-05,
      "loss": 0.4364,
      "num_input_tokens_seen": 7864320,
      "step": 30
    },
    {
      "epoch": 0.24830261881668284,
      "grad_norm": 2.0625,
      "learning_rate": 9.848591102083375e-05,
      "loss": 0.4013,
      "num_input_tokens_seen": 8388608,
      "step": 32
    },
    {
      "epoch": 0.2638215324927255,
      "grad_norm": 0.1435546875,
      "learning_rate": 9.82785080804381e-05,
      "loss": 0.3875,
      "num_input_tokens_seen": 8912896,
      "step": 34
    },
    {
      "epoch": 0.2793404461687682,
      "grad_norm": 0.125,
      "learning_rate": 9.805804412106198e-05,
      "loss": 0.4187,
      "num_input_tokens_seen": 9437184,
      "step": 36
    },
    {
      "epoch": 0.2948593598448109,
      "grad_norm": 0.12255859375,
      "learning_rate": 9.782457878588977e-05,
      "loss": 0.3981,
      "num_input_tokens_seen": 9961472,
      "step": 38
    },
    {
      "epoch": 0.31037827352085356,
      "grad_norm": 0.10302734375,
      "learning_rate": 9.757817523543109e-05,
      "loss": 0.4121,
      "num_input_tokens_seen": 10485760,
      "step": 40
    },
    {
      "epoch": 0.3258971871968962,
      "grad_norm": 0.1318359375,
      "learning_rate": 9.731890013043368e-05,
      "loss": 0.392,
      "num_input_tokens_seen": 11010048,
      "step": 42
    },
    {
      "epoch": 0.3414161008729389,
      "grad_norm": 0.10546875,
      "learning_rate": 9.704682361384941e-05,
      "loss": 0.3845,
      "num_input_tokens_seen": 11534336,
      "step": 44
    },
    {
      "epoch": 0.3569350145489816,
      "grad_norm": 0.0859375,
      "learning_rate": 9.676201929185809e-05,
      "loss": 0.397,
      "num_input_tokens_seen": 12058624,
      "step": 46
    },
    {
      "epoch": 0.37245392822502427,
      "grad_norm": 0.083984375,
      "learning_rate": 9.646456421395446e-05,
      "loss": 0.3753,
      "num_input_tokens_seen": 12582912,
      "step": 48
    },
    {
      "epoch": 0.3879728419010669,
      "grad_norm": 0.0888671875,
      "learning_rate": 9.615453885210369e-05,
      "loss": 0.387,
      "num_input_tokens_seen": 13107200,
      "step": 50
    },
    {
      "epoch": 0.4034917555771096,
      "grad_norm": 0.0703125,
      "learning_rate": 9.583202707897074e-05,
      "loss": 0.3724,
      "num_input_tokens_seen": 13631488,
      "step": 52
    },
    {
      "epoch": 0.4190106692531523,
      "grad_norm": 0.07958984375,
      "learning_rate": 9.549711614523007e-05,
      "loss": 0.4394,
      "num_input_tokens_seen": 14155776,
      "step": 54
    },
    {
      "epoch": 0.43452958292919497,
      "grad_norm": 0.0751953125,
      "learning_rate": 9.514989665596114e-05,
      "loss": 0.4177,
      "num_input_tokens_seen": 14680064,
      "step": 56
    },
    {
      "epoch": 0.45004849660523766,
      "grad_norm": 0.078125,
      "learning_rate": 9.479046254613673e-05,
      "loss": 0.3939,
      "num_input_tokens_seen": 15204352,
      "step": 58
    },
    {
      "epoch": 0.4655674102812803,
      "grad_norm": 0.076171875,
      "learning_rate": 9.441891105521006e-05,
      "loss": 0.4207,
      "num_input_tokens_seen": 15728640,
      "step": 60
    },
    {
      "epoch": 0.481086323957323,
      "grad_norm": 0.07177734375,
      "learning_rate": 9.403534270080829e-05,
      "loss": 0.3653,
      "num_input_tokens_seen": 16252928,
      "step": 62
    },
    {
      "epoch": 0.49660523763336567,
      "grad_norm": 0.0966796875,
      "learning_rate": 9.3639861251539e-05,
      "loss": 0.3925,
      "num_input_tokens_seen": 16777216,
      "step": 64
    },
    {
      "epoch": 0.5121241513094084,
      "grad_norm": 0.0859375,
      "learning_rate": 9.323257369891703e-05,
      "loss": 0.3982,
      "num_input_tokens_seen": 17301504,
      "step": 66
    },
    {
      "epoch": 0.527643064985451,
      "grad_norm": 0.07080078125,
      "learning_rate": 9.281359022841965e-05,
      "loss": 0.3709,
      "num_input_tokens_seen": 17825792,
      "step": 68
    },
    {
      "epoch": 0.5431619786614937,
      "grad_norm": 0.068359375,
      "learning_rate": 9.238302418967756e-05,
      "loss": 0.3744,
      "num_input_tokens_seen": 18350080,
      "step": 70
    },
    {
      "epoch": 0.5586808923375364,
      "grad_norm": 0.07666015625,
      "learning_rate": 9.194099206580982e-05,
      "loss": 0.3929,
      "num_input_tokens_seen": 18874368,
      "step": 72
    },
    {
      "epoch": 0.574199806013579,
      "grad_norm": 0.0771484375,
      "learning_rate": 9.148761344191109e-05,
      "loss": 0.3716,
      "num_input_tokens_seen": 19398656,
      "step": 74
    },
    {
      "epoch": 0.5897187196896218,
      "grad_norm": 0.0751953125,
      "learning_rate": 9.102301097269974e-05,
      "loss": 0.3959,
      "num_input_tokens_seen": 19922944,
      "step": 76
    },
    {
      "epoch": 0.6052376333656644,
      "grad_norm": 0.07666015625,
      "learning_rate": 9.054731034933549e-05,
      "loss": 0.3514,
      "num_input_tokens_seen": 20447232,
      "step": 78
    },
    {
      "epoch": 0.6207565470417071,
      "grad_norm": 0.0751953125,
      "learning_rate": 9.006064026541548e-05,
      "loss": 0.3767,
      "num_input_tokens_seen": 20971520,
      "step": 80
    },
    {
      "epoch": 0.6362754607177498,
      "grad_norm": 0.1376953125,
      "learning_rate": 8.956313238215824e-05,
      "loss": 0.371,
      "num_input_tokens_seen": 21495808,
      "step": 82
    },
    {
      "epoch": 0.6517943743937924,
      "grad_norm": 0.1171875,
      "learning_rate": 8.905492129278478e-05,
      "loss": 0.3529,
      "num_input_tokens_seen": 22020096,
      "step": 84
    },
    {
      "epoch": 0.6673132880698351,
      "grad_norm": 0.06640625,
      "learning_rate": 8.853614448610631e-05,
      "loss": 0.3044,
      "num_input_tokens_seen": 22544384,
      "step": 86
    },
    {
      "epoch": 0.6828322017458778,
      "grad_norm": 0.072265625,
      "learning_rate": 8.800694230932884e-05,
      "loss": 0.3532,
      "num_input_tokens_seen": 23068672,
      "step": 88
    },
    {
      "epoch": 0.6983511154219205,
      "grad_norm": 0.06787109375,
      "learning_rate": 8.74674579300843e-05,
      "loss": 0.3461,
      "num_input_tokens_seen": 23592960,
      "step": 90
    },
    {
      "epoch": 0.7138700290979632,
      "grad_norm": 0.0693359375,
      "learning_rate": 8.691783729769874e-05,
      "loss": 0.3513,
      "num_input_tokens_seen": 24117248,
      "step": 92
    },
    {
      "epoch": 0.7293889427740058,
      "grad_norm": 0.06689453125,
      "learning_rate": 8.635822910370792e-05,
      "loss": 0.3842,
      "num_input_tokens_seen": 24641536,
      "step": 94
    },
    {
      "epoch": 0.7449078564500485,
      "grad_norm": 0.11083984375,
      "learning_rate": 8.578878474163115e-05,
      "loss": 0.363,
      "num_input_tokens_seen": 25165824,
      "step": 96
    },
    {
      "epoch": 0.7604267701260912,
      "grad_norm": 0.06787109375,
      "learning_rate": 8.520965826601394e-05,
      "loss": 0.3079,
      "num_input_tokens_seen": 25690112,
      "step": 98
    },
    {
      "epoch": 0.7759456838021338,
      "grad_norm": 0.08203125,
      "learning_rate": 8.462100635075097e-05,
      "loss": 0.3769,
      "num_input_tokens_seen": 26214400,
      "step": 100
    },
    {
      "epoch": 0.7914645974781765,
      "grad_norm": 0.07470703125,
      "learning_rate": 8.40229882467003e-05,
      "loss": 0.3907,
      "num_input_tokens_seen": 26738688,
      "step": 102
    },
    {
      "epoch": 0.8069835111542192,
      "grad_norm": 0.07080078125,
      "learning_rate": 8.341576573860048e-05,
      "loss": 0.3457,
      "num_input_tokens_seen": 27262976,
      "step": 104
    },
    {
      "epoch": 0.8225024248302619,
      "grad_norm": 0.07666015625,
      "learning_rate": 8.279950310130217e-05,
      "loss": 0.3889,
      "num_input_tokens_seen": 27787264,
      "step": 106
    },
    {
      "epoch": 0.8380213385063046,
      "grad_norm": 0.06494140625,
      "learning_rate": 8.2174367055326e-05,
      "loss": 0.3142,
      "num_input_tokens_seen": 28311552,
      "step": 108
    },
    {
      "epoch": 0.8535402521823472,
      "grad_norm": 0.07275390625,
      "learning_rate": 8.154052672175887e-05,
      "loss": 0.3299,
      "num_input_tokens_seen": 28835840,
      "step": 110
    },
    {
      "epoch": 0.8690591658583899,
      "grad_norm": 0.0712890625,
      "learning_rate": 8.089815357650089e-05,
      "loss": 0.3425,
      "num_input_tokens_seen": 29360128,
      "step": 112
    },
    {
      "epoch": 0.8845780795344326,
      "grad_norm": 0.0712890625,
      "learning_rate": 8.024742140387506e-05,
      "loss": 0.3363,
      "num_input_tokens_seen": 29884416,
      "step": 114
    },
    {
      "epoch": 0.9000969932104753,
      "grad_norm": 0.083984375,
      "learning_rate": 7.95885062496126e-05,
      "loss": 0.3725,
      "num_input_tokens_seen": 30408704,
      "step": 116
    },
    {
      "epoch": 0.915615906886518,
      "grad_norm": 0.07666015625,
      "learning_rate": 7.892158637322646e-05,
      "loss": 0.3397,
      "num_input_tokens_seen": 30932992,
      "step": 118
    },
    {
      "epoch": 0.9311348205625606,
      "grad_norm": 0.0751953125,
      "learning_rate": 7.824684219978591e-05,
      "loss": 0.2812,
      "num_input_tokens_seen": 31457280,
      "step": 120
    },
    {
      "epoch": 0.9466537342386033,
      "grad_norm": 0.1015625,
      "learning_rate": 7.756445627110523e-05,
      "loss": 0.3555,
      "num_input_tokens_seen": 31981568,
      "step": 122
    },
    {
      "epoch": 0.962172647914646,
      "grad_norm": 0.072265625,
      "learning_rate": 7.687461319635981e-05,
      "loss": 0.3362,
      "num_input_tokens_seen": 32505856,
      "step": 124
    },
    {
      "epoch": 0.9776915615906887,
      "grad_norm": 0.07177734375,
      "learning_rate": 7.6177499602143e-05,
      "loss": 0.3133,
      "num_input_tokens_seen": 33030144,
      "step": 126
    },
    {
      "epoch": 0.9932104752667313,
      "grad_norm": 0.06982421875,
      "learning_rate": 7.547330408197695e-05,
      "loss": 0.3119,
      "num_input_tokens_seen": 33554432,
      "step": 128
    },
    {
      "epoch": 1.008729388942774,
      "grad_norm": 0.07666015625,
      "learning_rate": 7.476221714529167e-05,
      "loss": 0.3117,
      "num_input_tokens_seen": 34078720,
      "step": 130
    },
    {
      "epoch": 1.0242483026188167,
      "grad_norm": 0.07958984375,
      "learning_rate": 7.404443116588548e-05,
      "loss": 0.329,
      "num_input_tokens_seen": 34603008,
      "step": 132
    },
    {
      "epoch": 1.0397672162948595,
      "grad_norm": 0.078125,
      "learning_rate": 7.332014032988123e-05,
      "loss": 0.279,
      "num_input_tokens_seen": 35127296,
      "step": 134
    },
    {
      "epoch": 1.055286129970902,
      "grad_norm": 0.0703125,
      "learning_rate": 7.258954058319216e-05,
      "loss": 0.2682,
      "num_input_tokens_seen": 35651584,
      "step": 136
    },
    {
      "epoch": 1.0708050436469447,
      "grad_norm": 0.0732421875,
      "learning_rate": 7.185282957851175e-05,
      "loss": 0.293,
      "num_input_tokens_seen": 36175872,
      "step": 138
    },
    {
      "epoch": 1.0863239573229875,
      "grad_norm": 0.08154296875,
      "learning_rate": 7.111020662184174e-05,
      "loss": 0.315,
      "num_input_tokens_seen": 36700160,
      "step": 140
    },
    {
      "epoch": 1.10184287099903,
      "grad_norm": 0.072265625,
      "learning_rate": 7.036187261857289e-05,
      "loss": 0.289,
      "num_input_tokens_seen": 37224448,
      "step": 142
    },
    {
      "epoch": 1.1173617846750727,
      "grad_norm": 0.07763671875,
      "learning_rate": 6.960803001913314e-05,
      "loss": 0.2808,
      "num_input_tokens_seen": 37748736,
      "step": 144
    },
    {
      "epoch": 1.1328806983511155,
      "grad_norm": 0.07958984375,
      "learning_rate": 6.884888276421766e-05,
      "loss": 0.318,
      "num_input_tokens_seen": 38273024,
      "step": 146
    },
    {
      "epoch": 1.148399612027158,
      "grad_norm": 0.08251953125,
      "learning_rate": 6.808463622961578e-05,
      "loss": 0.2685,
      "num_input_tokens_seen": 38797312,
      "step": 148
    },
    {
      "epoch": 1.1639185257032008,
      "grad_norm": 0.0791015625,
      "learning_rate": 6.731549717064974e-05,
      "loss": 0.3121,
      "num_input_tokens_seen": 39321600,
      "step": 150
    },
    {
      "epoch": 1.1794374393792435,
      "grad_norm": 0.0830078125,
      "learning_rate": 6.654167366624009e-05,
      "loss": 0.2835,
      "num_input_tokens_seen": 39845888,
      "step": 152
    },
    {
      "epoch": 1.1949563530552862,
      "grad_norm": 0.0830078125,
      "learning_rate": 6.576337506261314e-05,
      "loss": 0.2905,
      "num_input_tokens_seen": 40370176,
      "step": 154
    },
    {
      "epoch": 1.2104752667313288,
      "grad_norm": 0.08984375,
      "learning_rate": 6.498081191666548e-05,
      "loss": 0.3277,
      "num_input_tokens_seen": 40894464,
      "step": 156
    },
    {
      "epoch": 1.2259941804073715,
      "grad_norm": 0.0859375,
      "learning_rate": 6.419419593900108e-05,
      "loss": 0.2788,
      "num_input_tokens_seen": 41418752,
      "step": 158
    },
    {
      "epoch": 1.2415130940834143,
      "grad_norm": 0.0791015625,
      "learning_rate": 6.340373993665607e-05,
      "loss": 0.2971,
      "num_input_tokens_seen": 41943040,
      "step": 160
    },
    {
      "epoch": 1.2570320077594568,
      "grad_norm": 0.091796875,
      "learning_rate": 6.260965775552712e-05,
      "loss": 0.287,
      "num_input_tokens_seen": 42467328,
      "step": 162
    },
    {
      "epoch": 1.2725509214354995,
      "grad_norm": 0.0849609375,
      "learning_rate": 6.181216422251862e-05,
      "loss": 0.3196,
      "num_input_tokens_seen": 42991616,
      "step": 164
    },
    {
      "epoch": 1.2880698351115423,
      "grad_norm": 0.083984375,
      "learning_rate": 6.101147508742455e-05,
      "loss": 0.3021,
      "num_input_tokens_seen": 43515904,
      "step": 166
    },
    {
      "epoch": 1.3035887487875848,
      "grad_norm": 0.0810546875,
      "learning_rate": 6.0207806964560584e-05,
      "loss": 0.2329,
      "num_input_tokens_seen": 44040192,
      "step": 168
    },
    {
      "epoch": 1.3191076624636275,
      "grad_norm": 0.08984375,
      "learning_rate": 5.940137727416246e-05,
      "loss": 0.2803,
      "num_input_tokens_seen": 44564480,
      "step": 170
    },
    {
      "epoch": 1.3346265761396703,
      "grad_norm": 0.0869140625,
      "learning_rate": 5.8592404183566144e-05,
      "loss": 0.2744,
      "num_input_tokens_seen": 45088768,
      "step": 172
    },
    {
      "epoch": 1.3501454898157128,
      "grad_norm": 0.08544921875,
      "learning_rate": 5.778110654818601e-05,
      "loss": 0.3332,
      "num_input_tokens_seen": 45613056,
      "step": 174
    },
    {
      "epoch": 1.3656644034917556,
      "grad_norm": 0.09814453125,
      "learning_rate": 5.6967703852306786e-05,
      "loss": 0.3223,
      "num_input_tokens_seen": 46137344,
      "step": 176
    },
    {
      "epoch": 1.3811833171677983,
      "grad_norm": 0.083984375,
      "learning_rate": 5.6152416149705455e-05,
      "loss": 0.3127,
      "num_input_tokens_seen": 46661632,
      "step": 178
    },
    {
      "epoch": 1.3967022308438408,
      "grad_norm": 0.09326171875,
      "learning_rate": 5.5335464004118986e-05,
      "loss": 0.2908,
      "num_input_tokens_seen": 47185920,
      "step": 180
    },
    {
      "epoch": 1.4122211445198836,
      "grad_norm": 0.08984375,
      "learning_rate": 5.4517068429574215e-05,
      "loss": 0.2918,
      "num_input_tokens_seen": 47710208,
      "step": 182
    },
    {
      "epoch": 1.4277400581959263,
      "grad_norm": 0.10400390625,
      "learning_rate": 5.3697450830595774e-05,
      "loss": 0.268,
      "num_input_tokens_seen": 48234496,
      "step": 184
    },
    {
      "epoch": 1.4432589718719688,
      "grad_norm": 0.0830078125,
      "learning_rate": 5.287683294230855e-05,
      "loss": 0.2862,
      "num_input_tokens_seen": 48758784,
      "step": 186
    },
    {
      "epoch": 1.4587778855480116,
      "grad_norm": 0.0966796875,
      "learning_rate": 5.205543677045049e-05,
      "loss": 0.3054,
      "num_input_tokens_seen": 49283072,
      "step": 188
    },
    {
      "epoch": 1.4742967992240543,
      "grad_norm": 0.087890625,
      "learning_rate": 5.1233484531312414e-05,
      "loss": 0.2814,
      "num_input_tokens_seen": 49807360,
      "step": 190
    },
    {
      "epoch": 1.489815712900097,
      "grad_norm": 0.1123046875,
      "learning_rate": 5.0411198591620676e-05,
      "loss": 0.2703,
      "num_input_tokens_seen": 50331648,
      "step": 192
    },
    {
      "epoch": 1.5053346265761398,
      "grad_norm": 0.11181640625,
      "learning_rate": 4.958880140837933e-05,
      "loss": 0.2689,
      "num_input_tokens_seen": 50855936,
      "step": 194
    },
    {
      "epoch": 1.5208535402521823,
      "grad_norm": 0.07861328125,
      "learning_rate": 4.876651546868759e-05,
      "loss": 0.3013,
      "num_input_tokens_seen": 51380224,
      "step": 196
    },
    {
      "epoch": 1.536372453928225,
      "grad_norm": 0.0849609375,
      "learning_rate": 4.794456322954952e-05,
      "loss": 0.2751,
      "num_input_tokens_seen": 51904512,
      "step": 198
    },
    {
      "epoch": 1.5518913676042678,
      "grad_norm": 0.2119140625,
      "learning_rate": 4.712316705769145e-05,
      "loss": 0.3178,
      "num_input_tokens_seen": 52428800,
      "step": 200
    },
    {
      "epoch": 1.5674102812803103,
      "grad_norm": 0.0947265625,
      "learning_rate": 4.630254916940424e-05,
      "loss": 0.2742,
      "num_input_tokens_seen": 52953088,
      "step": 202
    },
    {
      "epoch": 1.582929194956353,
      "grad_norm": 0.0888671875,
      "learning_rate": 4.548293157042581e-05,
      "loss": 0.2751,
      "num_input_tokens_seen": 53477376,
      "step": 204
    },
    {
      "epoch": 1.5984481086323958,
      "grad_norm": 0.09619140625,
      "learning_rate": 4.466453599588103e-05,
      "loss": 0.3256,
      "num_input_tokens_seen": 54001664,
      "step": 206
    },
    {
      "epoch": 1.6139670223084384,
      "grad_norm": 0.09423828125,
      "learning_rate": 4.384758385029457e-05,
      "loss": 0.2603,
      "num_input_tokens_seen": 54525952,
      "step": 208
    },
    {
      "epoch": 1.629485935984481,
      "grad_norm": 0.08740234375,
      "learning_rate": 4.3032296147693225e-05,
      "loss": 0.2598,
      "num_input_tokens_seen": 55050240,
      "step": 210
    },
    {
      "epoch": 1.6450048496605238,
      "grad_norm": 0.0908203125,
      "learning_rate": 4.2218893451814005e-05,
      "loss": 0.2811,
      "num_input_tokens_seen": 55574528,
      "step": 212
    },
    {
      "epoch": 1.6605237633365664,
      "grad_norm": 0.08544921875,
      "learning_rate": 4.140759581643386e-05,
      "loss": 0.2386,
      "num_input_tokens_seen": 56098816,
      "step": 214
    },
    {
      "epoch": 1.6760426770126091,
      "grad_norm": 0.09326171875,
      "learning_rate": 4.059862272583755e-05,
      "loss": 0.2999,
      "num_input_tokens_seen": 56623104,
      "step": 216
    },
    {
      "epoch": 1.6915615906886519,
      "grad_norm": 0.08935546875,
      "learning_rate": 3.979219303543942e-05,
      "loss": 0.2857,
      "num_input_tokens_seen": 57147392,
      "step": 218
    },
    {
      "epoch": 1.7070805043646944,
      "grad_norm": 0.09228515625,
      "learning_rate": 3.898852491257546e-05,
      "loss": 0.2533,
      "num_input_tokens_seen": 57671680,
      "step": 220
    },
    {
      "epoch": 1.7225994180407371,
      "grad_norm": 0.10009765625,
      "learning_rate": 3.818783577748138e-05,
      "loss": 0.306,
      "num_input_tokens_seen": 58195968,
      "step": 222
    },
    {
      "epoch": 1.7381183317167799,
      "grad_norm": 0.0927734375,
      "learning_rate": 3.739034224447289e-05,
      "loss": 0.2594,
      "num_input_tokens_seen": 58720256,
      "step": 224
    },
    {
      "epoch": 1.7536372453928224,
      "grad_norm": 0.09716796875,
      "learning_rate": 3.659626006334395e-05,
      "loss": 0.284,
      "num_input_tokens_seen": 59244544,
      "step": 226
    },
    {
      "epoch": 1.7691561590688651,
      "grad_norm": 0.10546875,
      "learning_rate": 3.580580406099893e-05,
      "loss": 0.33,
      "num_input_tokens_seen": 59768832,
      "step": 228
    },
    {
      "epoch": 1.7846750727449079,
      "grad_norm": 0.10009765625,
      "learning_rate": 3.501918808333453e-05,
      "loss": 0.2968,
      "num_input_tokens_seen": 60293120,
      "step": 230
    },
    {
      "epoch": 1.8001939864209504,
      "grad_norm": 0.0869140625,
      "learning_rate": 3.4236624937386876e-05,
      "loss": 0.2836,
      "num_input_tokens_seen": 60817408,
      "step": 232
    },
    {
      "epoch": 1.8157129000969934,
      "grad_norm": 0.09716796875,
      "learning_rate": 3.3458326333759925e-05,
      "loss": 0.2452,
      "num_input_tokens_seen": 61341696,
      "step": 234
    },
    {
      "epoch": 1.831231813773036,
      "grad_norm": 0.09326171875,
      "learning_rate": 3.268450282935026e-05,
      "loss": 0.2526,
      "num_input_tokens_seen": 61865984,
      "step": 236
    },
    {
      "epoch": 1.8467507274490784,
      "grad_norm": 0.0927734375,
      "learning_rate": 3.191536377038422e-05,
      "loss": 0.2578,
      "num_input_tokens_seen": 62390272,
      "step": 238
    },
    {
      "epoch": 1.8622696411251214,
      "grad_norm": 0.09375,
      "learning_rate": 3.115111723578235e-05,
      "loss": 0.2895,
      "num_input_tokens_seen": 62914560,
      "step": 240
    },
    {
      "epoch": 1.877788554801164,
      "grad_norm": 0.134765625,
      "learning_rate": 3.0391969980866875e-05,
      "loss": 0.3047,
      "num_input_tokens_seen": 63438848,
      "step": 242
    },
    {
      "epoch": 1.8933074684772064,
      "grad_norm": 0.09521484375,
      "learning_rate": 2.963812738142713e-05,
      "loss": 0.2958,
      "num_input_tokens_seen": 63963136,
      "step": 244
    },
    {
      "epoch": 1.9088263821532494,
      "grad_norm": 0.1015625,
      "learning_rate": 2.888979337815828e-05,
      "loss": 0.2598,
      "num_input_tokens_seen": 64487424,
      "step": 246
    },
    {
      "epoch": 1.924345295829292,
      "grad_norm": 0.09375,
      "learning_rate": 2.8147170421488272e-05,
      "loss": 0.2699,
      "num_input_tokens_seen": 65011712,
      "step": 248
    },
    {
      "epoch": 1.9398642095053347,
      "grad_norm": 0.09130859375,
      "learning_rate": 2.7410459416807853e-05,
      "loss": 0.2827,
      "num_input_tokens_seen": 65536000,
      "step": 250
    },
    {
      "epoch": 1.9553831231813774,
      "grad_norm": 0.1953125,
      "learning_rate": 2.6679859670118783e-05,
      "loss": 0.3119,
      "num_input_tokens_seen": 66060288,
      "step": 252
    },
    {
      "epoch": 1.97090203685742,
      "grad_norm": 0.09716796875,
      "learning_rate": 2.5955568834114524e-05,
      "loss": 0.2837,
      "num_input_tokens_seen": 66584576,
      "step": 254
    },
    {
      "epoch": 1.9864209505334627,
      "grad_norm": 0.08935546875,
      "learning_rate": 2.5237782854708348e-05,
      "loss": 0.2511,
      "num_input_tokens_seen": 67108864,
      "step": 256
    },
    {
      "epoch": 2.0019398642095054,
      "grad_norm": 0.0966796875,
      "learning_rate": 2.452669591802307e-05,
      "loss": 0.2501,
      "num_input_tokens_seen": 67633152,
      "step": 258
    },
    {
      "epoch": 2.017458777885548,
      "grad_norm": 0.09619140625,
      "learning_rate": 2.3822500397857018e-05,
      "loss": 0.2296,
      "num_input_tokens_seen": 68157440,
      "step": 260
    },
    {
      "epoch": 2.0329776915615905,
      "grad_norm": 0.0908203125,
      "learning_rate": 2.3125386803640187e-05,
      "loss": 0.2333,
      "num_input_tokens_seen": 68681728,
      "step": 262
    },
    {
      "epoch": 2.0484966052376334,
      "grad_norm": 0.09716796875,
      "learning_rate": 2.2435543728894792e-05,
      "loss": 0.2119,
      "num_input_tokens_seen": 69206016,
      "step": 264
    },
    {
      "epoch": 2.064015518913676,
      "grad_norm": 0.099609375,
      "learning_rate": 2.175315780021411e-05,
      "loss": 0.2676,
      "num_input_tokens_seen": 69730304,
      "step": 266
    },
    {
      "epoch": 2.079534432589719,
      "grad_norm": 0.08837890625,
      "learning_rate": 2.1078413626773546e-05,
      "loss": 0.2285,
      "num_input_tokens_seen": 70254592,
      "step": 268
    },
    {
      "epoch": 2.0950533462657615,
      "grad_norm": 0.10546875,
      "learning_rate": 2.0411493750387423e-05,
      "loss": 0.2281,
      "num_input_tokens_seen": 70778880,
      "step": 270
    },
    {
      "epoch": 2.110572259941804,
      "grad_norm": 0.1025390625,
      "learning_rate": 1.9752578596124954e-05,
      "loss": 0.2701,
      "num_input_tokens_seen": 71303168,
      "step": 272
    },
    {
      "epoch": 2.126091173617847,
      "grad_norm": 0.08984375,
      "learning_rate": 1.9101846423499116e-05,
      "loss": 0.2033,
      "num_input_tokens_seen": 71827456,
      "step": 274
    },
    {
      "epoch": 2.1416100872938895,
      "grad_norm": 0.10546875,
      "learning_rate": 1.8459473278241126e-05,
      "loss": 0.2489,
      "num_input_tokens_seen": 72351744,
      "step": 276
    },
    {
      "epoch": 2.157129000969932,
      "grad_norm": 0.1015625,
      "learning_rate": 1.7825632944674015e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 72876032,
      "step": 278
    },
    {
      "epoch": 2.172647914645975,
      "grad_norm": 0.10302734375,
      "learning_rate": 1.7200496898697832e-05,
      "loss": 0.2452,
      "num_input_tokens_seen": 73400320,
      "step": 280
    },
    {
      "epoch": 2.1881668283220175,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.6584234261399534e-05,
      "loss": 0.242,
      "num_input_tokens_seen": 73924608,
      "step": 282
    },
    {
      "epoch": 2.20368574199806,
      "grad_norm": 0.10888671875,
      "learning_rate": 1.5977011753299725e-05,
      "loss": 0.2894,
      "num_input_tokens_seen": 74448896,
      "step": 284
    },
    {
      "epoch": 2.219204655674103,
      "grad_norm": 0.0947265625,
      "learning_rate": 1.537899364924905e-05,
      "loss": 0.231,
      "num_input_tokens_seen": 74973184,
      "step": 286
    },
    {
      "epoch": 2.2347235693501455,
      "grad_norm": 0.11279296875,
      "learning_rate": 1.4790341733986085e-05,
      "loss": 0.2412,
      "num_input_tokens_seen": 75497472,
      "step": 288
    },
    {
      "epoch": 2.250242483026188,
      "grad_norm": 0.10595703125,
      "learning_rate": 1.4211215258368866e-05,
      "loss": 0.2464,
      "num_input_tokens_seen": 76021760,
      "step": 290
    },
    {
      "epoch": 2.265761396702231,
      "grad_norm": 0.10693359375,
      "learning_rate": 1.3641770896292084e-05,
      "loss": 0.2231,
      "num_input_tokens_seen": 76546048,
      "step": 292
    },
    {
      "epoch": 2.2812803103782735,
      "grad_norm": 0.0986328125,
      "learning_rate": 1.3082162702301276e-05,
      "loss": 0.2432,
      "num_input_tokens_seen": 77070336,
      "step": 294
    },
    {
      "epoch": 2.296799224054316,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.253254206991572e-05,
      "loss": 0.2147,
      "num_input_tokens_seen": 77594624,
      "step": 296
    },
    {
      "epoch": 2.312318137730359,
      "grad_norm": 0.09521484375,
      "learning_rate": 1.1993057690671173e-05,
      "loss": 0.249,
      "num_input_tokens_seen": 78118912,
      "step": 298
    },
    {
      "epoch": 2.3278370514064015,
      "grad_norm": 0.0927734375,
      "learning_rate": 1.1463855513893695e-05,
      "loss": 0.2362,
      "num_input_tokens_seen": 78643200,
      "step": 300
    },
    {
      "epoch": 2.343355965082444,
      "grad_norm": 0.10009765625,
      "learning_rate": 1.0945078707215222e-05,
      "loss": 0.2232,
      "num_input_tokens_seen": 79167488,
      "step": 302
    },
    {
      "epoch": 2.358874878758487,
      "grad_norm": 0.10595703125,
      "learning_rate": 1.0436867617841768e-05,
      "loss": 0.2569,
      "num_input_tokens_seen": 79691776,
      "step": 304
    },
    {
      "epoch": 2.3743937924345295,
      "grad_norm": 0.10546875,
      "learning_rate": 9.939359734584553e-06,
      "loss": 0.214,
      "num_input_tokens_seen": 80216064,
      "step": 306
    },
    {
      "epoch": 2.3899127061105725,
      "grad_norm": 0.09765625,
      "learning_rate": 9.452689650664515e-06,
      "loss": 0.2451,
      "num_input_tokens_seen": 80740352,
      "step": 308
    },
    {
      "epoch": 2.405431619786615,
      "grad_norm": 0.09375,
      "learning_rate": 8.976989027300264e-06,
      "loss": 0.2288,
      "num_input_tokens_seen": 81264640,
      "step": 310
    },
    {
      "epoch": 2.4209505334626575,
      "grad_norm": 0.09228515625,
      "learning_rate": 8.51238655808892e-06,
      "loss": 0.2332,
      "num_input_tokens_seen": 81788928,
      "step": 312
    },
    {
      "epoch": 2.4364694471387,
      "grad_norm": 0.095703125,
      "learning_rate": 8.059007934190194e-06,
      "loss": 0.202,
      "num_input_tokens_seen": 82313216,
      "step": 314
    },
    {
      "epoch": 2.451988360814743,
      "grad_norm": 0.0908203125,
      "learning_rate": 7.61697581032243e-06,
      "loss": 0.227,
      "num_input_tokens_seen": 82837504,
      "step": 316
    },
    {
      "epoch": 2.4675072744907856,
      "grad_norm": 0.1025390625,
      "learning_rate": 7.186409771580354e-06,
      "loss": 0.2429,
      "num_input_tokens_seen": 83361792,
      "step": 318
    },
    {
      "epoch": 2.4830261881668285,
      "grad_norm": 0.09375,
      "learning_rate": 6.76742630108298e-06,
      "loss": 0.2147,
      "num_input_tokens_seen": 83886080,
      "step": 320
    },
    {
      "epoch": 2.498545101842871,
      "grad_norm": 0.1025390625,
      "learning_rate": 6.3601387484610145e-06,
      "loss": 0.2423,
      "num_input_tokens_seen": 84410368,
      "step": 322
    },
    {
      "epoch": 2.5140640155189136,
      "grad_norm": 0.09326171875,
      "learning_rate": 5.9646572991917116e-06,
      "loss": 0.2828,
      "num_input_tokens_seen": 84934656,
      "step": 324
    },
    {
      "epoch": 2.529582929194956,
      "grad_norm": 0.09716796875,
      "learning_rate": 5.581088944789953e-06,
      "loss": 0.2461,
      "num_input_tokens_seen": 85458944,
      "step": 326
    },
    {
      "epoch": 2.545101842870999,
      "grad_norm": 0.10400390625,
      "learning_rate": 5.209537453863289e-06,
      "loss": 0.296,
      "num_input_tokens_seen": 85983232,
      "step": 328
    },
    {
      "epoch": 2.5606207565470416,
      "grad_norm": 0.08984375,
      "learning_rate": 4.850103344038853e-06,
      "loss": 0.2061,
      "num_input_tokens_seen": 86507520,
      "step": 330
    },
    {
      "epoch": 2.5761396702230845,
      "grad_norm": 0.0966796875,
      "learning_rate": 4.502883854769935e-06,
      "loss": 0.2323,
      "num_input_tokens_seen": 87031808,
      "step": 332
    },
    {
      "epoch": 2.591658583899127,
      "grad_norm": 0.0966796875,
      "learning_rate": 4.167972921029262e-06,
      "loss": 0.2156,
      "num_input_tokens_seen": 87556096,
      "step": 334
    },
    {
      "epoch": 2.6071774975751696,
      "grad_norm": 0.09228515625,
      "learning_rate": 3.845461147896323e-06,
      "loss": 0.2393,
      "num_input_tokens_seen": 88080384,
      "step": 336
    },
    {
      "epoch": 2.6226964112512126,
      "grad_norm": 0.09130859375,
      "learning_rate": 3.535435786045538e-06,
      "loss": 0.2165,
      "num_input_tokens_seen": 88604672,
      "step": 338
    },
    {
      "epoch": 2.638215324927255,
      "grad_norm": 0.0986328125,
      "learning_rate": 3.2379807081419187e-06,
      "loss": 0.2313,
      "num_input_tokens_seen": 89128960,
      "step": 340
    },
    {
      "epoch": 2.653734238603298,
      "grad_norm": 0.099609375,
      "learning_rate": 2.9531763861505966e-06,
      "loss": 0.2336,
      "num_input_tokens_seen": 89653248,
      "step": 342
    },
    {
      "epoch": 2.6692531522793406,
      "grad_norm": 0.1005859375,
      "learning_rate": 2.6810998695663282e-06,
      "loss": 0.2311,
      "num_input_tokens_seen": 90177536,
      "step": 344
    },
    {
      "epoch": 2.684772065955383,
      "grad_norm": 0.09716796875,
      "learning_rate": 2.4218247645689307e-06,
      "loss": 0.213,
      "num_input_tokens_seen": 90701824,
      "step": 346
    },
    {
      "epoch": 2.7002909796314256,
      "grad_norm": 0.09912109375,
      "learning_rate": 2.1754212141102346e-06,
      "loss": 0.2364,
      "num_input_tokens_seen": 91226112,
      "step": 348
    },
    {
      "epoch": 2.7158098933074686,
      "grad_norm": 0.0947265625,
      "learning_rate": 1.941955878938029e-06,
      "loss": 0.2147,
      "num_input_tokens_seen": 91750400,
      "step": 350
    },
    {
      "epoch": 2.731328806983511,
      "grad_norm": 0.10205078125,
      "learning_rate": 1.7214919195619127e-06,
      "loss": 0.2316,
      "num_input_tokens_seen": 92274688,
      "step": 352
    },
    {
      "epoch": 2.746847720659554,
      "grad_norm": 0.10791015625,
      "learning_rate": 1.514088979166256e-06,
      "loss": 0.2263,
      "num_input_tokens_seen": 92798976,
      "step": 354
    },
    {
      "epoch": 2.7623666343355966,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.3198031674745813e-06,
      "loss": 0.2323,
      "num_input_tokens_seen": 93323264,
      "step": 356
    },
    {
      "epoch": 2.777885548011639,
      "grad_norm": 0.09912109375,
      "learning_rate": 1.138687045569975e-06,
      "loss": 0.2246,
      "num_input_tokens_seen": 93847552,
      "step": 358
    },
    {
      "epoch": 2.7934044616876816,
      "grad_norm": 0.10107421875,
      "learning_rate": 9.707896116754488e-07,
      "loss": 0.2287,
      "num_input_tokens_seen": 94371840,
      "step": 360
    },
    {
      "epoch": 2.8089233753637246,
      "grad_norm": 0.09521484375,
      "learning_rate": 8.161562878982398e-07,
      "loss": 0.2081,
      "num_input_tokens_seen": 94896128,
      "step": 362
    },
    {
      "epoch": 2.824442289039767,
      "grad_norm": 0.10009765625,
      "learning_rate": 6.74828907941516e-07,
      "loss": 0.226,
      "num_input_tokens_seen": 95420416,
      "step": 364
    },
    {
      "epoch": 2.83996120271581,
      "grad_norm": 0.10498046875,
      "learning_rate": 5.468457057869358e-07,
      "loss": 0.273,
      "num_input_tokens_seen": 95944704,
      "step": 366
    },
    {
      "epoch": 2.8554801163918526,
      "grad_norm": 0.107421875,
      "learning_rate": 4.322413053509944e-07,
      "loss": 0.2634,
      "num_input_tokens_seen": 96468992,
      "step": 368
    },
    {
      "epoch": 2.870999030067895,
      "grad_norm": 0.10498046875,
      "learning_rate": 3.3104671111806593e-07,
      "loss": 0.2592,
      "num_input_tokens_seen": 96993280,
      "step": 370
    },
    {
      "epoch": 2.8865179437439377,
      "grad_norm": 0.10595703125,
      "learning_rate": 2.432892997526026e-07,
      "loss": 0.2566,
      "num_input_tokens_seen": 97517568,
      "step": 372
    },
    {
      "epoch": 2.9020368574199806,
      "grad_norm": 0.099609375,
      "learning_rate": 1.6899281269279755e-07,
      "loss": 0.2575,
      "num_input_tokens_seen": 98041856,
      "step": 374
    },
    {
      "epoch": 2.917555771096023,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.0817734972768944e-07,
      "loss": 0.2482,
      "num_input_tokens_seen": 98566144,
      "step": 376
    },
    {
      "epoch": 2.933074684772066,
      "grad_norm": 0.09814453125,
      "learning_rate": 6.085936355947897e-08,
      "loss": 0.2483,
      "num_input_tokens_seen": 99090432,
      "step": 378
    },
    {
      "epoch": 2.9485935984481086,
      "grad_norm": 0.10205078125,
      "learning_rate": 2.7051655352494652e-08,
      "loss": 0.2359,
      "num_input_tokens_seen": 99614720,
      "step": 380
    },
    {
      "epoch": 2.964112512124151,
      "grad_norm": 0.10595703125,
      "learning_rate": 6.763371270035457e-09,
      "loss": 0.2434,
      "num_input_tokens_seen": 100139008,
      "step": 382
    },
    {
      "epoch": 2.979631425800194,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.0,
      "loss": 0.2042,
      "num_input_tokens_seen": 100663296,
      "step": 384
    },
    {
      "epoch": 2.979631425800194,
      "num_input_tokens_seen": 100663296,
      "step": 384,
      "total_flos": 4.2827022437921587e+18,
      "train_loss": 0.3106601850595325,
      "train_runtime": 8133.8849,
      "train_samples_per_second": 12.157,
      "train_steps_per_second": 0.047
    }
  ],
  "logging_steps": 2,
  "max_steps": 384,
  "num_input_tokens_seen": 100663296,
  "num_train_epochs": 3,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.2827022437921587e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}