|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.024896678783050342, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 102.515625, |
|
"epoch": 4.979335756610068e-05, |
|
"grad_norm": 3.439708301992933, |
|
"kl": 0.0, |
|
"learning_rate": 9.999751033212169e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.4075993597507477, |
|
"rewards/format_reward": 0.1171875, |
|
"rewards/iou_reward": 0.0703125, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 99.21875, |
|
"epoch": 9.958671513220136e-05, |
|
"grad_norm": 5.223373032311897, |
|
"kl": 0.0010242462158203125, |
|
"learning_rate": 9.99950206642434e-07, |
|
"loss": 0.0, |
|
"reward": 0.1328125, |
|
"reward_std": 0.3100738227367401, |
|
"rewards/format_reward": 0.0859375, |
|
"rewards/iou_reward": 0.046875, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 100.6015625, |
|
"epoch": 0.00014938007269830206, |
|
"grad_norm": 2.787815970639101, |
|
"kl": 0.001255035400390625, |
|
"learning_rate": 9.999253099636507e-07, |
|
"loss": 0.0001, |
|
"reward": 0.140625, |
|
"reward_std": 0.3062904253602028, |
|
"rewards/format_reward": 0.09375, |
|
"rewards/iou_reward": 0.046875, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 109.7890625, |
|
"epoch": 0.00019917343026440272, |
|
"grad_norm": 3.8836805256336344, |
|
"kl": 0.0029449462890625, |
|
"learning_rate": 9.999004132848678e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3125, |
|
"reward_std": 0.5315210521221161, |
|
"rewards/format_reward": 0.1953125, |
|
"rewards/iou_reward": 0.1171875, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 104.8359375, |
|
"epoch": 0.0002489667878305034, |
|
"grad_norm": 5.140671303940714, |
|
"kl": 0.0057830810546875, |
|
"learning_rate": 9.998755166060848e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4375, |
|
"reward_std": 0.6855984330177307, |
|
"rewards/format_reward": 0.2734375, |
|
"rewards/iou_reward": 0.1640625, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 111.203125, |
|
"epoch": 0.0002987601453966041, |
|
"grad_norm": 23.256398147936178, |
|
"kl": 0.01739501953125, |
|
"learning_rate": 9.998506199273017e-07, |
|
"loss": 0.0007, |
|
"reward": 0.6875, |
|
"reward_std": 0.8091083467006683, |
|
"rewards/format_reward": 0.3828125, |
|
"rewards/iou_reward": 0.3046875, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 105.3671875, |
|
"epoch": 0.0003485535029627048, |
|
"grad_norm": 2.9658525695350466, |
|
"kl": 0.030517578125, |
|
"learning_rate": 9.998257232485186e-07, |
|
"loss": 0.0012, |
|
"reward": 0.6953125, |
|
"reward_std": 0.7262816429138184, |
|
"rewards/format_reward": 0.3984375, |
|
"rewards/iou_reward": 0.296875, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 105.921875, |
|
"epoch": 0.00039834686052880544, |
|
"grad_norm": 4.098196384522638, |
|
"kl": 0.03277587890625, |
|
"learning_rate": 9.998008265697355e-07, |
|
"loss": 0.0013, |
|
"reward": 1.09375, |
|
"reward_std": 0.836446076631546, |
|
"rewards/format_reward": 0.609375, |
|
"rewards/iou_reward": 0.484375, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 100.7265625, |
|
"epoch": 0.00044814021809490615, |
|
"grad_norm": 8.043342546467072, |
|
"kl": 0.0408935546875, |
|
"learning_rate": 9.997759298909524e-07, |
|
"loss": 0.0016, |
|
"reward": 1.1015625, |
|
"reward_std": 0.7649284303188324, |
|
"rewards/format_reward": 0.6953125, |
|
"rewards/iou_reward": 0.40625, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 108.1796875, |
|
"epoch": 0.0004979335756610068, |
|
"grad_norm": 2.8305496250278512, |
|
"kl": 0.0579833984375, |
|
"learning_rate": 9.997510332121696e-07, |
|
"loss": 0.0023, |
|
"reward": 1.21875, |
|
"reward_std": 0.6406852006912231, |
|
"rewards/format_reward": 0.7421875, |
|
"rewards/iou_reward": 0.4765625, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 113.3671875, |
|
"epoch": 0.0005477269332271075, |
|
"grad_norm": 4.679645845027607, |
|
"kl": 0.06982421875, |
|
"learning_rate": 9.997261365333865e-07, |
|
"loss": 0.0028, |
|
"reward": 1.328125, |
|
"reward_std": 0.7090373337268829, |
|
"rewards/format_reward": 0.8046875, |
|
"rewards/iou_reward": 0.5234375, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 112.203125, |
|
"epoch": 0.0005975202907932082, |
|
"grad_norm": 9.1144963917017, |
|
"kl": 0.06103515625, |
|
"learning_rate": 9.997012398546034e-07, |
|
"loss": 0.0024, |
|
"reward": 1.375, |
|
"reward_std": 0.6350045800209045, |
|
"rewards/format_reward": 0.828125, |
|
"rewards/iou_reward": 0.546875, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 110.375, |
|
"epoch": 0.0006473136483593089, |
|
"grad_norm": 2.853826837549953, |
|
"kl": 0.078857421875, |
|
"learning_rate": 9.996763431758203e-07, |
|
"loss": 0.0032, |
|
"reward": 1.4765625, |
|
"reward_std": 0.5687777698040009, |
|
"rewards/format_reward": 0.875, |
|
"rewards/iou_reward": 0.6015625, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 104.4453125, |
|
"epoch": 0.0006971070059254096, |
|
"grad_norm": 2.614377297487722, |
|
"kl": 0.07763671875, |
|
"learning_rate": 9.996514464970372e-07, |
|
"loss": 0.0031, |
|
"reward": 1.53125, |
|
"reward_std": 0.5344685912132263, |
|
"rewards/format_reward": 0.9296875, |
|
"rewards/iou_reward": 0.6015625, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 105.359375, |
|
"epoch": 0.0007469003634915102, |
|
"grad_norm": 2.720864400824515, |
|
"kl": 0.066162109375, |
|
"learning_rate": 9.996265498182541e-07, |
|
"loss": 0.0027, |
|
"reward": 1.5546875, |
|
"reward_std": 0.48394446074962616, |
|
"rewards/format_reward": 0.9375, |
|
"rewards/iou_reward": 0.6171875, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 106.984375, |
|
"epoch": 0.0007966937210576109, |
|
"grad_norm": 3.5635470584533757, |
|
"kl": 0.073974609375, |
|
"learning_rate": 9.996016531394713e-07, |
|
"loss": 0.003, |
|
"reward": 1.4140625, |
|
"reward_std": 0.5414000898599625, |
|
"rewards/format_reward": 0.890625, |
|
"rewards/iou_reward": 0.5234375, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 102.484375, |
|
"epoch": 0.0008464870786237116, |
|
"grad_norm": 3.2130024856121113, |
|
"kl": 0.0836181640625, |
|
"learning_rate": 9.99576756460688e-07, |
|
"loss": 0.0033, |
|
"reward": 1.625, |
|
"reward_std": 0.43743596971035004, |
|
"rewards/format_reward": 0.9375, |
|
"rewards/iou_reward": 0.6875, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 96.4453125, |
|
"epoch": 0.0008962804361898123, |
|
"grad_norm": 5.3036477946017495, |
|
"kl": 0.092529296875, |
|
"learning_rate": 9.995518597819051e-07, |
|
"loss": 0.0037, |
|
"reward": 1.546875, |
|
"reward_std": 0.4155466854572296, |
|
"rewards/format_reward": 0.9453125, |
|
"rewards/iou_reward": 0.6015625, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 92.7265625, |
|
"epoch": 0.000946073793755913, |
|
"grad_norm": 1.6404768518677588, |
|
"kl": 0.0986328125, |
|
"learning_rate": 9.99526963103122e-07, |
|
"loss": 0.0039, |
|
"reward": 1.8515625, |
|
"reward_std": 0.21249166131019592, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 95.3359375, |
|
"epoch": 0.0009958671513220136, |
|
"grad_norm": 8.387492877691109, |
|
"kl": 0.091552734375, |
|
"learning_rate": 9.99502066424339e-07, |
|
"loss": 0.0037, |
|
"reward": 1.5234375, |
|
"reward_std": 0.41780228912830353, |
|
"rewards/format_reward": 0.9453125, |
|
"rewards/iou_reward": 0.578125, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 91.8671875, |
|
"epoch": 0.0010456605088881143, |
|
"grad_norm": 7.902830417365247, |
|
"kl": 0.09033203125, |
|
"learning_rate": 9.994771697455559e-07, |
|
"loss": 0.0036, |
|
"reward": 1.7890625, |
|
"reward_std": 0.29395797848701477, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 84.7109375, |
|
"epoch": 0.001095453866454215, |
|
"grad_norm": 5.778709448370351, |
|
"kl": 0.103271484375, |
|
"learning_rate": 9.994522730667728e-07, |
|
"loss": 0.0041, |
|
"reward": 1.75, |
|
"reward_std": 0.21865329891443253, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 84.734375, |
|
"epoch": 0.0011452472240203156, |
|
"grad_norm": 2.162213886702805, |
|
"kl": 0.105712890625, |
|
"learning_rate": 9.994273763879897e-07, |
|
"loss": 0.0042, |
|
"reward": 1.7578125, |
|
"reward_std": 0.2409384548664093, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 85.265625, |
|
"epoch": 0.0011950405815864165, |
|
"grad_norm": 2.02003344314062, |
|
"kl": 0.098876953125, |
|
"learning_rate": 9.994024797092068e-07, |
|
"loss": 0.004, |
|
"reward": 1.8203125, |
|
"reward_std": 0.22890795022249222, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 86.25, |
|
"epoch": 0.0012448339391525171, |
|
"grad_norm": 285.94463291873876, |
|
"kl": 5.951171875, |
|
"learning_rate": 9.993775830304237e-07, |
|
"loss": 0.2391, |
|
"reward": 1.7578125, |
|
"reward_std": 0.3533627539873123, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 84.546875, |
|
"epoch": 0.0012946272967186178, |
|
"grad_norm": 5.188819559309853, |
|
"kl": 0.102294921875, |
|
"learning_rate": 9.993526863516407e-07, |
|
"loss": 0.0041, |
|
"reward": 1.765625, |
|
"reward_std": 0.3224920183420181, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 91.0859375, |
|
"epoch": 0.0013444206542847185, |
|
"grad_norm": 2.159559641995413, |
|
"kl": 0.097900390625, |
|
"learning_rate": 9.993277896728576e-07, |
|
"loss": 0.0039, |
|
"reward": 1.578125, |
|
"reward_std": 0.35679991543293, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.59375, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 87.90625, |
|
"epoch": 0.0013942140118508191, |
|
"grad_norm": 2.941556720282863, |
|
"kl": 0.099365234375, |
|
"learning_rate": 9.993028929940745e-07, |
|
"loss": 0.004, |
|
"reward": 1.7578125, |
|
"reward_std": 0.27292172610759735, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 83.796875, |
|
"epoch": 0.0014440073694169198, |
|
"grad_norm": 4.988223268300561, |
|
"kl": 0.105712890625, |
|
"learning_rate": 9.992779963152916e-07, |
|
"loss": 0.0042, |
|
"reward": 1.90625, |
|
"reward_std": 0.16097761690616608, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 92.6171875, |
|
"epoch": 0.0014938007269830204, |
|
"grad_norm": 3.0411104156344626, |
|
"kl": 0.088134765625, |
|
"learning_rate": 9.992530996365083e-07, |
|
"loss": 0.0035, |
|
"reward": 1.6640625, |
|
"reward_std": 0.34205709397792816, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.6796875, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 84.1875, |
|
"epoch": 0.001543594084549121, |
|
"grad_norm": 4.7841273877784145, |
|
"kl": 0.093994140625, |
|
"learning_rate": 9.992282029577255e-07, |
|
"loss": 0.0038, |
|
"reward": 1.703125, |
|
"reward_std": 0.25855977088212967, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 83.421875, |
|
"epoch": 0.0015933874421152217, |
|
"grad_norm": 2.3920531831879464, |
|
"kl": 0.099609375, |
|
"learning_rate": 9.992033062789424e-07, |
|
"loss": 0.004, |
|
"reward": 1.765625, |
|
"reward_std": 0.3056272119283676, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 86.203125, |
|
"epoch": 0.0016431807996813224, |
|
"grad_norm": 1.559907759469652, |
|
"kl": 0.0849609375, |
|
"learning_rate": 9.991784096001593e-07, |
|
"loss": 0.0034, |
|
"reward": 1.8359375, |
|
"reward_std": 0.16439256072044373, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 78.859375, |
|
"epoch": 0.0016929741572474233, |
|
"grad_norm": 1.8611319118029848, |
|
"kl": 0.101806640625, |
|
"learning_rate": 9.991535129213762e-07, |
|
"loss": 0.0041, |
|
"reward": 1.828125, |
|
"reward_std": 0.21143082529306412, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 88.9609375, |
|
"epoch": 0.001742767514813524, |
|
"grad_norm": 1.8569665035563752, |
|
"kl": 0.09228515625, |
|
"learning_rate": 9.991286162425931e-07, |
|
"loss": 0.0037, |
|
"reward": 1.828125, |
|
"reward_std": 0.195631742477417, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 81.40625, |
|
"epoch": 0.0017925608723796246, |
|
"grad_norm": 3.9821079372769645, |
|
"kl": 0.108642578125, |
|
"learning_rate": 9.9910371956381e-07, |
|
"loss": 0.0044, |
|
"reward": 1.78125, |
|
"reward_std": 0.29159896075725555, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 77.796875, |
|
"epoch": 0.0018423542299457253, |
|
"grad_norm": 1.2534831070514643, |
|
"kl": 0.126708984375, |
|
"learning_rate": 9.990788228850272e-07, |
|
"loss": 0.0051, |
|
"reward": 1.8359375, |
|
"reward_std": 0.16439255699515343, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 84.2734375, |
|
"epoch": 0.001892147587511826, |
|
"grad_norm": 1.8218011681763233, |
|
"kl": 0.119384765625, |
|
"learning_rate": 9.99053926206244e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8984375, |
|
"reward_std": 0.1893337331712246, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 84.15625, |
|
"epoch": 0.0019419409450779266, |
|
"grad_norm": 2.2362299880015586, |
|
"kl": 0.11328125, |
|
"learning_rate": 9.99029029527461e-07, |
|
"loss": 0.0045, |
|
"reward": 1.8203125, |
|
"reward_std": 0.20175683498382568, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 84.6640625, |
|
"epoch": 0.0019917343026440272, |
|
"grad_norm": 1.5450507919601535, |
|
"kl": 0.127685546875, |
|
"learning_rate": 9.99004132848678e-07, |
|
"loss": 0.0051, |
|
"reward": 1.765625, |
|
"reward_std": 0.22119548916816711, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 79.734375, |
|
"epoch": 0.002041527660210128, |
|
"grad_norm": 1.7437426477171256, |
|
"kl": 0.121337890625, |
|
"learning_rate": 9.989792361698948e-07, |
|
"loss": 0.0049, |
|
"reward": 1.8203125, |
|
"reward_std": 0.14389308914542198, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 82.6171875, |
|
"epoch": 0.0020913210177762286, |
|
"grad_norm": 1.6293101072649048, |
|
"kl": 0.12939453125, |
|
"learning_rate": 9.989543394911118e-07, |
|
"loss": 0.0052, |
|
"reward": 1.8359375, |
|
"reward_std": 0.20357418060302734, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 80.265625, |
|
"epoch": 0.0021411143753423292, |
|
"grad_norm": 2.008038783692788, |
|
"kl": 0.12158203125, |
|
"learning_rate": 9.98929442812329e-07, |
|
"loss": 0.0049, |
|
"reward": 1.875, |
|
"reward_std": 0.18394745513796806, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 81.828125, |
|
"epoch": 0.00219090773290843, |
|
"grad_norm": 1.5541490564060598, |
|
"kl": 0.1201171875, |
|
"learning_rate": 9.989045461335456e-07, |
|
"loss": 0.0048, |
|
"reward": 1.796875, |
|
"reward_std": 0.1934976577758789, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 86.765625, |
|
"epoch": 0.0022407010904745305, |
|
"grad_norm": 2.702185067000183, |
|
"kl": 0.1201171875, |
|
"learning_rate": 9.988796494547627e-07, |
|
"loss": 0.0048, |
|
"reward": 1.84375, |
|
"reward_std": 0.17623991519212723, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 87.875, |
|
"epoch": 0.002290494448040631, |
|
"grad_norm": 3.192226779118565, |
|
"kl": 0.112060546875, |
|
"learning_rate": 9.988547527759796e-07, |
|
"loss": 0.0045, |
|
"reward": 1.8359375, |
|
"reward_std": 0.2296229898929596, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 79.7265625, |
|
"epoch": 0.002340287805606732, |
|
"grad_norm": 3.072141289856114, |
|
"kl": 0.124755859375, |
|
"learning_rate": 9.988298560971966e-07, |
|
"loss": 0.005, |
|
"reward": 1.796875, |
|
"reward_std": 0.18543372303247452, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 83.6171875, |
|
"epoch": 0.002390081163172833, |
|
"grad_norm": 2.4271148066972534, |
|
"kl": 0.110107421875, |
|
"learning_rate": 9.988049594184135e-07, |
|
"loss": 0.0044, |
|
"reward": 1.8046875, |
|
"reward_std": 0.17587634921073914, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 88.25, |
|
"epoch": 0.0024398745207389336, |
|
"grad_norm": 1.506472972956932, |
|
"kl": 0.107666015625, |
|
"learning_rate": 9.987800627396304e-07, |
|
"loss": 0.0043, |
|
"reward": 1.6953125, |
|
"reward_std": 0.12415502034127712, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.6953125, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 86.8515625, |
|
"epoch": 0.0024896678783050343, |
|
"grad_norm": 1.4729827765169945, |
|
"kl": 0.10888671875, |
|
"learning_rate": 9.987551660608473e-07, |
|
"loss": 0.0044, |
|
"reward": 1.8125, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 82.3359375, |
|
"epoch": 0.002539461235871135, |
|
"grad_norm": 2.3251601765981733, |
|
"kl": 0.1103515625, |
|
"learning_rate": 9.987302693820644e-07, |
|
"loss": 0.0044, |
|
"reward": 1.8671875, |
|
"reward_std": 0.17859892547130585, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 90.46875, |
|
"epoch": 0.0025892545934372356, |
|
"grad_norm": 2.57968011148121, |
|
"kl": 0.10009765625, |
|
"learning_rate": 9.987053727032814e-07, |
|
"loss": 0.004, |
|
"reward": 1.828125, |
|
"reward_std": 0.28353502601385117, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 89.3984375, |
|
"epoch": 0.0026390479510033362, |
|
"grad_norm": 1.7225559317809422, |
|
"kl": 0.102783203125, |
|
"learning_rate": 9.986804760244983e-07, |
|
"loss": 0.0041, |
|
"reward": 1.8828125, |
|
"reward_std": 0.13941731676459312, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 91.421875, |
|
"epoch": 0.002688841308569437, |
|
"grad_norm": 6.075719013971686, |
|
"kl": 0.107177734375, |
|
"learning_rate": 9.986555793457152e-07, |
|
"loss": 0.0043, |
|
"reward": 1.890625, |
|
"reward_std": 0.1751839816570282, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 90.6015625, |
|
"epoch": 0.0027386346661355376, |
|
"grad_norm": 2.506752602259074, |
|
"kl": 0.097412109375, |
|
"learning_rate": 9.986306826669321e-07, |
|
"loss": 0.0039, |
|
"reward": 1.8046875, |
|
"reward_std": 0.27040712535381317, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 97.9609375, |
|
"epoch": 0.0027884280237016382, |
|
"grad_norm": 3.5989816591744974, |
|
"kl": 0.114990234375, |
|
"learning_rate": 9.986057859881492e-07, |
|
"loss": 0.0046, |
|
"reward": 1.78125, |
|
"reward_std": 0.20251334458589554, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 93.71875, |
|
"epoch": 0.002838221381267739, |
|
"grad_norm": 1.6289613450606333, |
|
"kl": 0.107421875, |
|
"learning_rate": 9.985808893093662e-07, |
|
"loss": 0.0043, |
|
"reward": 1.71875, |
|
"reward_std": 0.16097760945558548, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.71875, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 90.6796875, |
|
"epoch": 0.0028880147388338395, |
|
"grad_norm": 2.425051848826219, |
|
"kl": 0.110595703125, |
|
"learning_rate": 9.98555992630583e-07, |
|
"loss": 0.0044, |
|
"reward": 1.859375, |
|
"reward_std": 0.1751839891076088, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 98.578125, |
|
"epoch": 0.00293780809639994, |
|
"grad_norm": 1.5387697826714153, |
|
"kl": 0.099365234375, |
|
"learning_rate": 9.985310959518e-07, |
|
"loss": 0.004, |
|
"reward": 1.7890625, |
|
"reward_std": 0.22673208266496658, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 102.0625, |
|
"epoch": 0.002987601453966041, |
|
"grad_norm": 1.604092614395233, |
|
"kl": 0.091552734375, |
|
"learning_rate": 9.98506199273017e-07, |
|
"loss": 0.0037, |
|
"reward": 1.7890625, |
|
"reward_std": 0.14807433634996414, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 101.4921875, |
|
"epoch": 0.0030373948115321415, |
|
"grad_norm": 1.8625216560575186, |
|
"kl": 0.1123046875, |
|
"learning_rate": 9.984813025942338e-07, |
|
"loss": 0.0045, |
|
"reward": 1.8046875, |
|
"reward_std": 0.21778053790330887, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 97.0625, |
|
"epoch": 0.003087188169098242, |
|
"grad_norm": 1.415839549921855, |
|
"kl": 0.116455078125, |
|
"learning_rate": 9.98456405915451e-07, |
|
"loss": 0.0047, |
|
"reward": 1.6875, |
|
"reward_std": 0.1751839928328991, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.6875, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 100.453125, |
|
"epoch": 0.003136981526664343, |
|
"grad_norm": 2.136173002374526, |
|
"kl": 0.101806640625, |
|
"learning_rate": 9.984315092366677e-07, |
|
"loss": 0.0041, |
|
"reward": 1.6953125, |
|
"reward_std": 0.21778054535388947, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.703125, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 97.71875, |
|
"epoch": 0.0031867748842304435, |
|
"grad_norm": 2.78958535574904, |
|
"kl": 0.109130859375, |
|
"learning_rate": 9.984066125578848e-07, |
|
"loss": 0.0044, |
|
"reward": 1.7890625, |
|
"reward_std": 0.22095812857151031, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 94.2890625, |
|
"epoch": 0.003236568241796544, |
|
"grad_norm": 2.9146595458163973, |
|
"kl": 0.12744140625, |
|
"learning_rate": 9.983817158791017e-07, |
|
"loss": 0.0051, |
|
"reward": 1.875, |
|
"reward_std": 0.18543372303247452, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 95.8671875, |
|
"epoch": 0.003286361599362645, |
|
"grad_norm": 1.8449697292358263, |
|
"kl": 0.1171875, |
|
"learning_rate": 9.983568192003186e-07, |
|
"loss": 0.0047, |
|
"reward": 1.7109375, |
|
"reward_std": 0.13888052850961685, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 104.0234375, |
|
"epoch": 0.003336154956928746, |
|
"grad_norm": 1.118366554180186, |
|
"kl": 0.09228515625, |
|
"learning_rate": 9.983319225215356e-07, |
|
"loss": 0.0037, |
|
"reward": 1.828125, |
|
"reward_std": 0.17700131237506866, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 96.875, |
|
"epoch": 0.0033859483144948466, |
|
"grad_norm": 1.9844796774181017, |
|
"kl": 0.106689453125, |
|
"learning_rate": 9.983070258427525e-07, |
|
"loss": 0.0043, |
|
"reward": 1.734375, |
|
"reward_std": 0.30776649713516235, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7421875, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 94.5234375, |
|
"epoch": 0.0034357416720609472, |
|
"grad_norm": 1.4626727156374901, |
|
"kl": 0.10107421875, |
|
"learning_rate": 9.982821291639694e-07, |
|
"loss": 0.004, |
|
"reward": 1.828125, |
|
"reward_std": 0.19616854190826416, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 97.234375, |
|
"epoch": 0.003485535029627048, |
|
"grad_norm": 2.350243000229927, |
|
"kl": 0.091064453125, |
|
"learning_rate": 9.982572324851865e-07, |
|
"loss": 0.0036, |
|
"reward": 1.765625, |
|
"reward_std": 0.1462521106004715, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 95.828125, |
|
"epoch": 0.0035353283871931485, |
|
"grad_norm": 1.887912161676318, |
|
"kl": 0.10498046875, |
|
"learning_rate": 9.982323358064034e-07, |
|
"loss": 0.0042, |
|
"reward": 1.890625, |
|
"reward_std": 0.11678344383835793, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 99.9453125, |
|
"epoch": 0.003585121744759249, |
|
"grad_norm": 2.2226345404585333, |
|
"kl": 0.09912109375, |
|
"learning_rate": 9.982074391276204e-07, |
|
"loss": 0.004, |
|
"reward": 1.78125, |
|
"reward_std": 0.20463500916957855, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 93.7578125, |
|
"epoch": 0.00363491510232535, |
|
"grad_norm": 2.1991430911777416, |
|
"kl": 0.098876953125, |
|
"learning_rate": 9.981825424488373e-07, |
|
"loss": 0.004, |
|
"reward": 1.953125, |
|
"reward_std": 0.1020579319447279, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 95.2265625, |
|
"epoch": 0.0036847084598914505, |
|
"grad_norm": 58.59358696174032, |
|
"kl": 0.091552734375, |
|
"learning_rate": 9.981576457700542e-07, |
|
"loss": 0.0037, |
|
"reward": 1.84375, |
|
"reward_std": 0.2398776337504387, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 92.453125, |
|
"epoch": 0.003734501817457551, |
|
"grad_norm": 2.64891702613998, |
|
"kl": 0.105712890625, |
|
"learning_rate": 9.981327490912711e-07, |
|
"loss": 0.0042, |
|
"reward": 1.875, |
|
"reward_std": 0.15539421141147614, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 93.71875, |
|
"epoch": 0.003784295175023652, |
|
"grad_norm": 3.310727657848531, |
|
"kl": 0.126953125, |
|
"learning_rate": 9.981078524124882e-07, |
|
"loss": 0.0051, |
|
"reward": 1.7890625, |
|
"reward_std": 0.20623262226581573, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 94.046875, |
|
"epoch": 0.0038340885325897525, |
|
"grad_norm": 1.5179070792699012, |
|
"kl": 0.08935546875, |
|
"learning_rate": 9.98082955733705e-07, |
|
"loss": 0.0036, |
|
"reward": 1.8125, |
|
"reward_std": 0.18885356932878494, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 96.609375, |
|
"epoch": 0.003883881890155853, |
|
"grad_norm": 7.881110055836989, |
|
"kl": 0.103759765625, |
|
"learning_rate": 9.98058059054922e-07, |
|
"loss": 0.0041, |
|
"reward": 1.875, |
|
"reward_std": 0.13204573653638363, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 96.640625, |
|
"epoch": 0.003933675247721954, |
|
"grad_norm": 2.5041303523542635, |
|
"kl": 0.113525390625, |
|
"learning_rate": 9.98033162376139e-07, |
|
"loss": 0.0045, |
|
"reward": 1.828125, |
|
"reward_std": 0.25855977833271027, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 94.65625, |
|
"epoch": 0.0039834686052880545, |
|
"grad_norm": 2.4069902820249514, |
|
"kl": 0.105712890625, |
|
"learning_rate": 9.98008265697356e-07, |
|
"loss": 0.0042, |
|
"reward": 1.6953125, |
|
"reward_std": 0.30616889894008636, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 97.03125, |
|
"epoch": 0.004033261962854156, |
|
"grad_norm": 1.8743529173857354, |
|
"kl": 0.11767578125, |
|
"learning_rate": 9.979833690185728e-07, |
|
"loss": 0.0047, |
|
"reward": 1.7890625, |
|
"reward_std": 0.16296816617250443, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 90.4296875, |
|
"epoch": 0.004083055320420256, |
|
"grad_norm": 2.345438840725503, |
|
"kl": 0.116455078125, |
|
"learning_rate": 9.979584723397897e-07, |
|
"loss": 0.0046, |
|
"reward": 1.8359375, |
|
"reward_std": 0.13488983362913132, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 90.84375, |
|
"epoch": 0.004132848677986357, |
|
"grad_norm": 2.051561279848721, |
|
"kl": 0.115478515625, |
|
"learning_rate": 9.979335756610069e-07, |
|
"loss": 0.0046, |
|
"reward": 1.84375, |
|
"reward_std": 0.11230766773223877, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 92.2734375, |
|
"epoch": 0.004182642035552457, |
|
"grad_norm": 14.342711299567778, |
|
"kl": 0.106201171875, |
|
"learning_rate": 9.979086789822238e-07, |
|
"loss": 0.0042, |
|
"reward": 1.8828125, |
|
"reward_std": 0.12863079272210598, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 97.3046875, |
|
"epoch": 0.004232435393118558, |
|
"grad_norm": 1.6606982273715714, |
|
"kl": 0.103271484375, |
|
"learning_rate": 9.978837823034407e-07, |
|
"loss": 0.0041, |
|
"reward": 1.7421875, |
|
"reward_std": 0.18361148983240128, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 99.71875, |
|
"epoch": 0.0042822287506846584, |
|
"grad_norm": 1.7887536485013715, |
|
"kl": 0.097900390625, |
|
"learning_rate": 9.978588856246576e-07, |
|
"loss": 0.0039, |
|
"reward": 1.6875, |
|
"reward_std": 0.2567247971892357, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.703125, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 97.5625, |
|
"epoch": 0.0043320221082507595, |
|
"grad_norm": 2.4675857175453983, |
|
"kl": 0.110595703125, |
|
"learning_rate": 9.978339889458745e-07, |
|
"loss": 0.0044, |
|
"reward": 1.828125, |
|
"reward_std": 0.19910329580307007, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 96.6796875, |
|
"epoch": 0.00438181546581686, |
|
"grad_norm": 1.9032435655638502, |
|
"kl": 0.10791015625, |
|
"learning_rate": 9.978090922670915e-07, |
|
"loss": 0.0043, |
|
"reward": 1.7265625, |
|
"reward_std": 0.21778053790330887, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 94.921875, |
|
"epoch": 0.004431608823382961, |
|
"grad_norm": 1.2887238907565408, |
|
"kl": 0.10009765625, |
|
"learning_rate": 9.977841955883086e-07, |
|
"loss": 0.004, |
|
"reward": 1.84375, |
|
"reward_std": 0.1643974632024765, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 94.234375, |
|
"epoch": 0.004481402180949061, |
|
"grad_norm": 0.9629324305950829, |
|
"kl": 0.09765625, |
|
"learning_rate": 9.977592989095253e-07, |
|
"loss": 0.0039, |
|
"reward": 1.9375, |
|
"reward_std": 0.07312605530023575, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 92.953125, |
|
"epoch": 0.004531195538515162, |
|
"grad_norm": 1.758439472489501, |
|
"kl": 0.12109375, |
|
"learning_rate": 9.977344022307424e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8828125, |
|
"reward_std": 0.172288179397583, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 93.6484375, |
|
"epoch": 0.004580988896081262, |
|
"grad_norm": 2.550333683622402, |
|
"kl": 0.119873046875, |
|
"learning_rate": 9.977095055519593e-07, |
|
"loss": 0.0048, |
|
"reward": 1.7734375, |
|
"reward_std": 0.21325305849313736, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 98.3203125, |
|
"epoch": 0.0046307822536473635, |
|
"grad_norm": 1.6373377896127548, |
|
"kl": 0.108154296875, |
|
"learning_rate": 9.976846088731763e-07, |
|
"loss": 0.0043, |
|
"reward": 1.6796875, |
|
"reward_std": 0.26409636437892914, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.6953125, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 91.9375, |
|
"epoch": 0.004680575611213464, |
|
"grad_norm": 0.9835936524801608, |
|
"kl": 0.1201171875, |
|
"learning_rate": 9.976597121943932e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8515625, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 90.109375, |
|
"epoch": 0.004730368968779565, |
|
"grad_norm": 0.8306050897153783, |
|
"kl": 0.12158203125, |
|
"learning_rate": 9.9763481551561e-07, |
|
"loss": 0.0049, |
|
"reward": 1.8359375, |
|
"reward_std": 0.07996084541082382, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 93.78125, |
|
"epoch": 0.004780162326345666, |
|
"grad_norm": 1.7603034433514313, |
|
"kl": 0.124267578125, |
|
"learning_rate": 9.97609918836827e-07, |
|
"loss": 0.005, |
|
"reward": 1.828125, |
|
"reward_std": 0.12756996601819992, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 90.0859375, |
|
"epoch": 0.004829955683911766, |
|
"grad_norm": 6.562410625855334, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.975850221580441e-07, |
|
"loss": 0.0058, |
|
"reward": 1.75, |
|
"reward_std": 0.1820138692855835, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.75, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 94.7109375, |
|
"epoch": 0.004879749041477867, |
|
"grad_norm": 7.167072839398765, |
|
"kl": 0.124267578125, |
|
"learning_rate": 9.97560125479261e-07, |
|
"loss": 0.005, |
|
"reward": 1.78125, |
|
"reward_std": 0.1173202209174633, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 97.984375, |
|
"epoch": 0.0049295423990439674, |
|
"grad_norm": 1.599951444379713, |
|
"kl": 0.1123046875, |
|
"learning_rate": 9.97535228800478e-07, |
|
"loss": 0.0045, |
|
"reward": 1.671875, |
|
"reward_std": 0.24093355238437653, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.6875, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 92.96875, |
|
"epoch": 0.0049793357566100685, |
|
"grad_norm": 1.8255456686826015, |
|
"kl": 0.130126953125, |
|
"learning_rate": 9.97510332121695e-07, |
|
"loss": 0.0052, |
|
"reward": 1.859375, |
|
"reward_std": 0.09863808751106262, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 90.921875, |
|
"epoch": 0.005029129114176169, |
|
"grad_norm": 9.631285085503505, |
|
"kl": 0.12353515625, |
|
"learning_rate": 9.974854354429118e-07, |
|
"loss": 0.0049, |
|
"reward": 1.609375, |
|
"reward_std": 0.1820138692855835, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.6171875, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 95.09375, |
|
"epoch": 0.00507892247174227, |
|
"grad_norm": 1.3504518610587621, |
|
"kl": 0.14453125, |
|
"learning_rate": 9.974605387641287e-07, |
|
"loss": 0.0058, |
|
"reward": 1.7734375, |
|
"reward_std": 0.22395770996809006, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 87.3671875, |
|
"epoch": 0.00512871582930837, |
|
"grad_norm": 0.9084816270882866, |
|
"kl": 0.138671875, |
|
"learning_rate": 9.974356420853459e-07, |
|
"loss": 0.0056, |
|
"reward": 1.875, |
|
"reward_std": 0.09127141162753105, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 90.1875, |
|
"epoch": 0.005178509186874471, |
|
"grad_norm": 1.1325737984752882, |
|
"kl": 0.14599609375, |
|
"learning_rate": 9.974107454065626e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8203125, |
|
"reward_std": 0.1678124126046896, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 91.46875, |
|
"epoch": 0.005228302544440571, |
|
"grad_norm": 1.3951011507540294, |
|
"kl": 0.1318359375, |
|
"learning_rate": 9.973858487277797e-07, |
|
"loss": 0.0053, |
|
"reward": 1.8046875, |
|
"reward_std": 0.22766181081533432, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 88.359375, |
|
"epoch": 0.0052780959020066725, |
|
"grad_norm": 9.281549781773146, |
|
"kl": 0.11328125, |
|
"learning_rate": 9.973609520489966e-07, |
|
"loss": 0.0045, |
|
"reward": 1.7890625, |
|
"reward_std": 0.11048543080687523, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 92.609375, |
|
"epoch": 0.005327889259572773, |
|
"grad_norm": 1.342575462098463, |
|
"kl": 0.123779296875, |
|
"learning_rate": 9.973360553702135e-07, |
|
"loss": 0.0049, |
|
"reward": 1.8671875, |
|
"reward_std": 0.18542881309986115, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 94.0703125, |
|
"epoch": 0.005377682617138874, |
|
"grad_norm": 0.783556311317755, |
|
"kl": 0.12109375, |
|
"learning_rate": 9.973111586914307e-07, |
|
"loss": 0.0048, |
|
"reward": 1.890625, |
|
"reward_std": 0.0731260534375906, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 96.125, |
|
"epoch": 0.005427475974704974, |
|
"grad_norm": 2.3538443559933584, |
|
"kl": 0.111572265625, |
|
"learning_rate": 9.972862620126474e-07, |
|
"loss": 0.0045, |
|
"reward": 1.890625, |
|
"reward_std": 0.1841355413198471, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 96.703125, |
|
"epoch": 0.005477269332271075, |
|
"grad_norm": 12.82190276009885, |
|
"kl": 0.121337890625, |
|
"learning_rate": 9.972613653338645e-07, |
|
"loss": 0.0048, |
|
"reward": 1.765625, |
|
"reward_std": 0.2874399423599243, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 94.515625, |
|
"epoch": 0.005527062689837175, |
|
"grad_norm": 2.1222628429676145, |
|
"kl": 0.0986328125, |
|
"learning_rate": 9.972364686550814e-07, |
|
"loss": 0.0039, |
|
"reward": 1.890625, |
|
"reward_std": 0.12125921249389648, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 107.25, |
|
"epoch": 0.0055768560474032764, |
|
"grad_norm": 1.2991554775023286, |
|
"kl": 0.104736328125, |
|
"learning_rate": 9.972115719762983e-07, |
|
"loss": 0.0042, |
|
"reward": 1.78125, |
|
"reward_std": 0.21436560153961182, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 102.453125, |
|
"epoch": 0.005626649404969377, |
|
"grad_norm": 1.2996222270625057, |
|
"kl": 0.11865234375, |
|
"learning_rate": 9.971866752975152e-07, |
|
"loss": 0.0047, |
|
"reward": 1.859375, |
|
"reward_std": 0.18990950286388397, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 99.8046875, |
|
"epoch": 0.005676442762535478, |
|
"grad_norm": 1.7202541424079283, |
|
"kl": 0.1220703125, |
|
"learning_rate": 9.971617786187322e-07, |
|
"loss": 0.0049, |
|
"reward": 1.703125, |
|
"reward_std": 0.22043409198522568, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 95.6640625, |
|
"epoch": 0.005726236120101579, |
|
"grad_norm": 3.756046638923596, |
|
"kl": 0.114501953125, |
|
"learning_rate": 9.97136881939949e-07, |
|
"loss": 0.0046, |
|
"reward": 1.8125, |
|
"reward_std": 0.2919157147407532, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 102.2265625, |
|
"epoch": 0.005776029477667679, |
|
"grad_norm": 1.639183217811092, |
|
"kl": 0.114990234375, |
|
"learning_rate": 9.971119852611662e-07, |
|
"loss": 0.0046, |
|
"reward": 1.7265625, |
|
"reward_std": 0.22597318142652512, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 95.421875, |
|
"epoch": 0.00582582283523378, |
|
"grad_norm": 1.7798573964496212, |
|
"kl": 0.123046875, |
|
"learning_rate": 9.970870885823831e-07, |
|
"loss": 0.0049, |
|
"reward": 1.84375, |
|
"reward_std": 0.17176414281129837, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 99.1328125, |
|
"epoch": 0.00587561619279988, |
|
"grad_norm": 1.0418154482289879, |
|
"kl": 0.10400390625, |
|
"learning_rate": 9.970621919036e-07, |
|
"loss": 0.0042, |
|
"reward": 1.9140625, |
|
"reward_std": 0.10994865000247955, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 100.0390625, |
|
"epoch": 0.0059254095503659815, |
|
"grad_norm": 1.2568897269852273, |
|
"kl": 0.1318359375, |
|
"learning_rate": 9.97037295224817e-07, |
|
"loss": 0.0053, |
|
"reward": 1.8984375, |
|
"reward_std": 0.17517907544970512, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 100.859375, |
|
"epoch": 0.005975202907932082, |
|
"grad_norm": 1.329896372039074, |
|
"kl": 0.110595703125, |
|
"learning_rate": 9.970123985460339e-07, |
|
"loss": 0.0044, |
|
"reward": 1.765625, |
|
"reward_std": 0.1462521068751812, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 98.7890625, |
|
"epoch": 0.006024996265498183, |
|
"grad_norm": 1.2361240382386223, |
|
"kl": 0.120849609375, |
|
"learning_rate": 9.969875018672508e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8125, |
|
"reward_std": 0.14806943386793137, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 96.4921875, |
|
"epoch": 0.006074789623064283, |
|
"grad_norm": 1.454148082773978, |
|
"kl": 0.119140625, |
|
"learning_rate": 9.96962605188468e-07, |
|
"loss": 0.0048, |
|
"reward": 1.765625, |
|
"reward_std": 0.159869983792305, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 95.4375, |
|
"epoch": 0.006124582980630384, |
|
"grad_norm": 2.086270562267407, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.969377085096846e-07, |
|
"loss": 0.0055, |
|
"reward": 1.7734375, |
|
"reward_std": 0.20411096513271332, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 100.9609375, |
|
"epoch": 0.006174376338196484, |
|
"grad_norm": 6.607701923097873, |
|
"kl": 0.3427734375, |
|
"learning_rate": 9.969128118309018e-07, |
|
"loss": 0.0137, |
|
"reward": 1.8984375, |
|
"reward_std": 0.11048543080687523, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 101.609375, |
|
"epoch": 0.0062241696957625854, |
|
"grad_norm": 0.7479370583088916, |
|
"kl": 0.124755859375, |
|
"learning_rate": 9.968879151521187e-07, |
|
"loss": 0.005, |
|
"reward": 1.984375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.984375, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 97.234375, |
|
"epoch": 0.006273963053328686, |
|
"grad_norm": 1.2449395272627752, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.968630184733356e-07, |
|
"loss": 0.0055, |
|
"reward": 1.9609375, |
|
"reward_std": 0.07996084541082382, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 93.8203125, |
|
"epoch": 0.006323756410894787, |
|
"grad_norm": 1.2108942877518432, |
|
"kl": 0.14501953125, |
|
"learning_rate": 9.968381217945525e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8984375, |
|
"reward_std": 0.11572261154651642, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 101.78125, |
|
"epoch": 0.006373549768460887, |
|
"grad_norm": 1.483964368827148, |
|
"kl": 0.126708984375, |
|
"learning_rate": 9.968132251157694e-07, |
|
"loss": 0.0051, |
|
"reward": 1.796875, |
|
"reward_std": 0.22189275175333023, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 101.2578125, |
|
"epoch": 0.006423343126026988, |
|
"grad_norm": 1.9021174092739832, |
|
"kl": 0.1279296875, |
|
"learning_rate": 9.967883284369866e-07, |
|
"loss": 0.0051, |
|
"reward": 1.96875, |
|
"reward_std": 0.0731260534375906, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 97.703125, |
|
"epoch": 0.006473136483593088, |
|
"grad_norm": 1.4956511194697275, |
|
"kl": 0.14697265625, |
|
"learning_rate": 9.967634317582035e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8984375, |
|
"reward_std": 0.15991678833961487, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 94.125, |
|
"epoch": 0.006522929841159189, |
|
"grad_norm": 1.188534152591436, |
|
"kl": 0.134765625, |
|
"learning_rate": 9.967385350794204e-07, |
|
"loss": 0.0054, |
|
"reward": 1.875, |
|
"reward_std": 0.1417246237397194, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 103.7109375, |
|
"epoch": 0.00657272319872529, |
|
"grad_norm": 2.403396926934965, |
|
"kl": 0.130615234375, |
|
"learning_rate": 9.967136384006373e-07, |
|
"loss": 0.0052, |
|
"reward": 1.8671875, |
|
"reward_std": 0.19728107005357742, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 100.3515625, |
|
"epoch": 0.006622516556291391, |
|
"grad_norm": 1.6511816180599905, |
|
"kl": 0.15673828125, |
|
"learning_rate": 9.966887417218542e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8984375, |
|
"reward_std": 0.0946863554418087, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 102.1015625, |
|
"epoch": 0.006672309913857492, |
|
"grad_norm": 7.371838017323048, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.966638450430712e-07, |
|
"loss": 0.006, |
|
"reward": 1.7578125, |
|
"reward_std": 0.2227931022644043, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 99.046875, |
|
"epoch": 0.006722103271423592, |
|
"grad_norm": 1.5837922767625858, |
|
"kl": 0.14501953125, |
|
"learning_rate": 9.966389483642883e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8125, |
|
"reward_std": 0.18538201414048672, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 100.078125, |
|
"epoch": 0.006771896628989693, |
|
"grad_norm": 1.4734436213779043, |
|
"kl": 0.14892578125, |
|
"learning_rate": 9.966140516855052e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8828125, |
|
"reward_std": 0.12415501847863197, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 102.828125, |
|
"epoch": 0.006821689986555793, |
|
"grad_norm": 2.958171346551285, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.965891550067221e-07, |
|
"loss": 0.0057, |
|
"reward": 1.71875, |
|
"reward_std": 0.27345359325408936, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 96.75, |
|
"epoch": 0.0068714833441218945, |
|
"grad_norm": 1.6926754738207872, |
|
"kl": 0.1689453125, |
|
"learning_rate": 9.96564258327939e-07, |
|
"loss": 0.0068, |
|
"reward": 1.9140625, |
|
"reward_std": 0.14966704696416855, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 98.9140625, |
|
"epoch": 0.006921276701687995, |
|
"grad_norm": 1.8589271672389032, |
|
"kl": 0.15869140625, |
|
"learning_rate": 9.96539361649156e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8671875, |
|
"reward_std": 0.11572261154651642, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 100.59375, |
|
"epoch": 0.006971070059254096, |
|
"grad_norm": 2.1552075487725175, |
|
"kl": 0.14794921875, |
|
"learning_rate": 9.965144649703729e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8046875, |
|
"reward_std": 0.20680594444274902, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 97.1484375, |
|
"epoch": 0.007020863416820196, |
|
"grad_norm": 1.3342964417174048, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.964895682915898e-07, |
|
"loss": 0.0055, |
|
"reward": 1.921875, |
|
"reward_std": 0.1417246311903, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 97.796875, |
|
"epoch": 0.007070656774386297, |
|
"grad_norm": 1.712613688076723, |
|
"kl": 0.15673828125, |
|
"learning_rate": 9.964646716128067e-07, |
|
"loss": 0.0063, |
|
"reward": 1.9140625, |
|
"reward_std": 0.051028965041041374, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 93.484375, |
|
"epoch": 0.007120450131952397, |
|
"grad_norm": 1.258501993566246, |
|
"kl": 0.1484375, |
|
"learning_rate": 9.964397749340238e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8984375, |
|
"reward_std": 0.13098490610718727, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 98.2421875, |
|
"epoch": 0.007170243489518498, |
|
"grad_norm": 1.0202602046475222, |
|
"kl": 0.15185546875, |
|
"learning_rate": 9.964148782552408e-07, |
|
"loss": 0.0061, |
|
"reward": 1.9140625, |
|
"reward_std": 0.051028965041041374, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 94.5625, |
|
"epoch": 0.007220036847084599, |
|
"grad_norm": 2.4309087080560525, |
|
"kl": 0.1708984375, |
|
"learning_rate": 9.963899815764577e-07, |
|
"loss": 0.0068, |
|
"reward": 1.8359375, |
|
"reward_std": 0.17407145351171494, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 97.4609375, |
|
"epoch": 0.0072698302046507, |
|
"grad_norm": 1.068439968727649, |
|
"kl": 0.142578125, |
|
"learning_rate": 9.963650848976746e-07, |
|
"loss": 0.0057, |
|
"reward": 1.828125, |
|
"reward_std": 0.16675157099962234, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 98.7421875, |
|
"epoch": 0.0073196235622168, |
|
"grad_norm": 12.97483651914656, |
|
"kl": 0.14599609375, |
|
"learning_rate": 9.963401882188915e-07, |
|
"loss": 0.0058, |
|
"reward": 1.890625, |
|
"reward_std": 0.12703317403793335, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 96.28125, |
|
"epoch": 0.007369416919782901, |
|
"grad_norm": 0.6736983035983228, |
|
"kl": 0.14990234375, |
|
"learning_rate": 9.963152915401084e-07, |
|
"loss": 0.006, |
|
"reward": 1.921875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 95.890625, |
|
"epoch": 0.007419210277349001, |
|
"grad_norm": 1.5893146844454131, |
|
"kl": 0.18505859375, |
|
"learning_rate": 9.962903948613256e-07, |
|
"loss": 0.0074, |
|
"reward": 1.8046875, |
|
"reward_std": 0.23876510187983513, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 94.3515625, |
|
"epoch": 0.007469003634915102, |
|
"grad_norm": 1.0758569475379494, |
|
"kl": 0.17431640625, |
|
"learning_rate": 9.962654981825423e-07, |
|
"loss": 0.007, |
|
"reward": 1.8671875, |
|
"reward_std": 0.11572261154651642, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 96.1640625, |
|
"epoch": 0.007518796992481203, |
|
"grad_norm": 2.1547459675611833, |
|
"kl": 0.18505859375, |
|
"learning_rate": 9.962406015037594e-07, |
|
"loss": 0.0074, |
|
"reward": 1.8515625, |
|
"reward_std": 0.1054728738963604, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 97.2109375, |
|
"epoch": 0.007568590350047304, |
|
"grad_norm": 4.203584254718333, |
|
"kl": 0.1708984375, |
|
"learning_rate": 9.962157048249763e-07, |
|
"loss": 0.0068, |
|
"reward": 1.8828125, |
|
"reward_std": 0.14966704696416855, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 95.21875, |
|
"epoch": 0.007618383707613405, |
|
"grad_norm": 1.5296983467832583, |
|
"kl": 0.15673828125, |
|
"learning_rate": 9.961908081461932e-07, |
|
"loss": 0.0063, |
|
"reward": 1.78125, |
|
"reward_std": 0.2001592367887497, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 96.6796875, |
|
"epoch": 0.007668177065179505, |
|
"grad_norm": 1.475962686477158, |
|
"kl": 0.16015625, |
|
"learning_rate": 9.961659114674104e-07, |
|
"loss": 0.0064, |
|
"reward": 1.828125, |
|
"reward_std": 0.0776018276810646, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 96.703125, |
|
"epoch": 0.007717970422745606, |
|
"grad_norm": 1.8439068537788146, |
|
"kl": 0.14111328125, |
|
"learning_rate": 9.96141014788627e-07, |
|
"loss": 0.0056, |
|
"reward": 1.765625, |
|
"reward_std": 0.17570313066244125, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 96.1875, |
|
"epoch": 0.007767763780311706, |
|
"grad_norm": 2.371095979651294, |
|
"kl": 0.15869140625, |
|
"learning_rate": 9.961161181098442e-07, |
|
"loss": 0.0064, |
|
"reward": 1.875, |
|
"reward_std": 0.2109457477927208, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 95.265625, |
|
"epoch": 0.007817557137877807, |
|
"grad_norm": 1.5264382161135959, |
|
"kl": 0.16259765625, |
|
"learning_rate": 9.96091221431061e-07, |
|
"loss": 0.0065, |
|
"reward": 1.734375, |
|
"reward_std": 0.17065651342272758, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.75, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 98.875, |
|
"epoch": 0.007867350495443909, |
|
"grad_norm": 1.2951938163240482, |
|
"kl": 0.15625, |
|
"learning_rate": 9.96066324752278e-07, |
|
"loss": 0.0062, |
|
"reward": 1.84375, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 96.1484375, |
|
"epoch": 0.007917143853010008, |
|
"grad_norm": 1.067814817752617, |
|
"kl": 0.13818359375, |
|
"learning_rate": 9.96041428073495e-07, |
|
"loss": 0.0055, |
|
"reward": 1.9609375, |
|
"reward_std": 0.0765409953892231, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 95.5703125, |
|
"epoch": 0.007966937210576109, |
|
"grad_norm": 1.199265548990744, |
|
"kl": 0.15478515625, |
|
"learning_rate": 9.960165313947119e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8359375, |
|
"reward_std": 0.09522314183413982, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 100.359375, |
|
"epoch": 0.00801673056814221, |
|
"grad_norm": 0.9864466757054574, |
|
"kl": 0.1728515625, |
|
"learning_rate": 9.959916347159288e-07, |
|
"loss": 0.0069, |
|
"reward": 1.828125, |
|
"reward_std": 0.16887324303388596, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 98.6484375, |
|
"epoch": 0.008066523925708311, |
|
"grad_norm": 10.030481330685754, |
|
"kl": 0.15478515625, |
|
"learning_rate": 9.95966738037146e-07, |
|
"loss": 0.0062, |
|
"reward": 1.859375, |
|
"reward_std": 0.21040896326303482, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 93.9609375, |
|
"epoch": 0.00811631728327441, |
|
"grad_norm": 2.3231292253872424, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.959418413583628e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8515625, |
|
"reward_std": 0.13168217800557613, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 98.3515625, |
|
"epoch": 0.008166110640840512, |
|
"grad_norm": 1.3208417963591144, |
|
"kl": 0.15234375, |
|
"learning_rate": 9.959169446795797e-07, |
|
"loss": 0.0061, |
|
"reward": 1.734375, |
|
"reward_std": 0.2346404492855072, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 91.4921875, |
|
"epoch": 0.008215903998406613, |
|
"grad_norm": 2.2829174080922487, |
|
"kl": 0.17626953125, |
|
"learning_rate": 9.958920480007967e-07, |
|
"loss": 0.0071, |
|
"reward": 1.8515625, |
|
"reward_std": 0.1054728776216507, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 93.5625, |
|
"epoch": 0.008265697355972714, |
|
"grad_norm": 2.806598044164708, |
|
"kl": 0.15234375, |
|
"learning_rate": 9.958671513220136e-07, |
|
"loss": 0.0061, |
|
"reward": 1.84375, |
|
"reward_std": 0.1552036516368389, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 95.78125, |
|
"epoch": 0.008315490713538813, |
|
"grad_norm": 1.702058197746131, |
|
"kl": 0.1923828125, |
|
"learning_rate": 9.958422546432305e-07, |
|
"loss": 0.0077, |
|
"reward": 1.90625, |
|
"reward_std": 0.07312605157494545, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 97.390625, |
|
"epoch": 0.008365284071104914, |
|
"grad_norm": 1.562676602612297, |
|
"kl": 0.16015625, |
|
"learning_rate": 9.958173579644476e-07, |
|
"loss": 0.0064, |
|
"reward": 1.84375, |
|
"reward_std": 0.1809062361717224, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 93.875, |
|
"epoch": 0.008415077428671015, |
|
"grad_norm": 1.8547618925969274, |
|
"kl": 0.158203125, |
|
"learning_rate": 9.957924612856643e-07, |
|
"loss": 0.0063, |
|
"reward": 1.71875, |
|
"reward_std": 0.2290911078453064, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.71875, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 89.8515625, |
|
"epoch": 0.008464870786237116, |
|
"grad_norm": 1.6663052268480494, |
|
"kl": 0.16064453125, |
|
"learning_rate": 9.957675646068815e-07, |
|
"loss": 0.0064, |
|
"reward": 1.828125, |
|
"reward_std": 0.20064431428909302, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 96.0078125, |
|
"epoch": 0.008514664143803218, |
|
"grad_norm": 3.0612964718925326, |
|
"kl": 0.1416015625, |
|
"learning_rate": 9.957426679280984e-07, |
|
"loss": 0.0057, |
|
"reward": 1.984375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9921875, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 91.8828125, |
|
"epoch": 0.008564457501369317, |
|
"grad_norm": 13.811099626214578, |
|
"kl": 0.16162109375, |
|
"learning_rate": 9.957177712493153e-07, |
|
"loss": 0.0065, |
|
"reward": 1.9140625, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 100.078125, |
|
"epoch": 0.008614250858935418, |
|
"grad_norm": 2.5009260961499913, |
|
"kl": 0.129638671875, |
|
"learning_rate": 9.956928745705322e-07, |
|
"loss": 0.0052, |
|
"reward": 1.8359375, |
|
"reward_std": 0.1541428193449974, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 97.4296875, |
|
"epoch": 0.008664044216501519, |
|
"grad_norm": 1.2330462789967154, |
|
"kl": 0.132568359375, |
|
"learning_rate": 9.956679778917491e-07, |
|
"loss": 0.0053, |
|
"reward": 1.75, |
|
"reward_std": 0.12646234035491943, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 97.7578125, |
|
"epoch": 0.00871383757406762, |
|
"grad_norm": 4.24805360210085, |
|
"kl": 0.1357421875, |
|
"learning_rate": 9.95643081212966e-07, |
|
"loss": 0.0054, |
|
"reward": 1.828125, |
|
"reward_std": 0.2057085745036602, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 95.140625, |
|
"epoch": 0.00876363093163372, |
|
"grad_norm": 2.6724427079399424, |
|
"kl": 0.140625, |
|
"learning_rate": 9.956181845341832e-07, |
|
"loss": 0.0056, |
|
"reward": 1.828125, |
|
"reward_std": 0.07760182581841946, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 100.53125, |
|
"epoch": 0.00881342428919982, |
|
"grad_norm": 2.3813750405645413, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.955932878554e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8671875, |
|
"reward_std": 0.14389308914542198, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 95.2265625, |
|
"epoch": 0.008863217646765922, |
|
"grad_norm": 2.394371423017808, |
|
"kl": 0.1513671875, |
|
"learning_rate": 9.95568391176617e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8125, |
|
"reward_std": 0.2488291710615158, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 99.46875, |
|
"epoch": 0.008913011004332023, |
|
"grad_norm": 1.3538381055779696, |
|
"kl": 0.13525390625, |
|
"learning_rate": 9.95543494497834e-07, |
|
"loss": 0.0054, |
|
"reward": 1.6015625, |
|
"reward_std": 0.09069566242396832, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.609375, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 101.0625, |
|
"epoch": 0.008962804361898122, |
|
"grad_norm": 4.252291324840375, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.955185978190508e-07, |
|
"loss": 0.006, |
|
"reward": 1.609375, |
|
"reward_std": 0.22935794293880463, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.640625, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 99.8671875, |
|
"epoch": 0.009012597719464223, |
|
"grad_norm": 1.2334952187097605, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.95493701140268e-07, |
|
"loss": 0.0054, |
|
"reward": 1.7109375, |
|
"reward_std": 0.14966705441474915, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 101.1015625, |
|
"epoch": 0.009062391077030324, |
|
"grad_norm": 1.3428638629964709, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.954688044614849e-07, |
|
"loss": 0.0057, |
|
"reward": 1.828125, |
|
"reward_std": 0.18235475197434425, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 102.3828125, |
|
"epoch": 0.009112184434596425, |
|
"grad_norm": 5.9840330860465025, |
|
"kl": 0.11865234375, |
|
"learning_rate": 9.954439077827018e-07, |
|
"loss": 0.0047, |
|
"reward": 1.6484375, |
|
"reward_std": 0.2364109754562378, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.65625, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 95.4453125, |
|
"epoch": 0.009161977792162525, |
|
"grad_norm": 1.048321546061037, |
|
"kl": 0.13671875, |
|
"learning_rate": 9.954190111039187e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8046875, |
|
"reward_std": 0.15308690071105957, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 97.7578125, |
|
"epoch": 0.009211771149728626, |
|
"grad_norm": 0.7784114741222256, |
|
"kl": 0.124267578125, |
|
"learning_rate": 9.953941144251356e-07, |
|
"loss": 0.005, |
|
"reward": 1.984375, |
|
"reward_std": 0.0289318785071373, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.984375, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 98.296875, |
|
"epoch": 0.009261564507294727, |
|
"grad_norm": 0.9982607935203828, |
|
"kl": 0.1474609375, |
|
"learning_rate": 9.953692177463526e-07, |
|
"loss": 0.0059, |
|
"reward": 1.90625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 102.65625, |
|
"epoch": 0.009311357864860828, |
|
"grad_norm": 1.3093659372357407, |
|
"kl": 0.123046875, |
|
"learning_rate": 9.953443210675695e-07, |
|
"loss": 0.0049, |
|
"reward": 1.65625, |
|
"reward_std": 0.1354655846953392, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.65625, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 97.6875, |
|
"epoch": 0.009361151222426927, |
|
"grad_norm": 1.5230553744664186, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.953194243887864e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8125, |
|
"reward_std": 0.19616853445768356, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 96.09375, |
|
"epoch": 0.009410944579993029, |
|
"grad_norm": 2.005065330677421, |
|
"kl": 0.1513671875, |
|
"learning_rate": 9.952945277100035e-07, |
|
"loss": 0.0061, |
|
"reward": 1.90625, |
|
"reward_std": 0.18702642619609833, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 101.015625, |
|
"epoch": 0.00946073793755913, |
|
"grad_norm": 1.7969604490467637, |
|
"kl": 0.154296875, |
|
"learning_rate": 9.952696310312204e-07, |
|
"loss": 0.0062, |
|
"reward": 1.890625, |
|
"reward_std": 0.22043409198522568, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 97.7890625, |
|
"epoch": 0.00951053129512523, |
|
"grad_norm": 1.3267510112496272, |
|
"kl": 0.1806640625, |
|
"learning_rate": 9.952447343524374e-07, |
|
"loss": 0.0072, |
|
"reward": 1.9453125, |
|
"reward_std": 0.09969891421496868, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9453125, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 94.03125, |
|
"epoch": 0.009560324652691332, |
|
"grad_norm": 1.7976714842104902, |
|
"kl": 0.1552734375, |
|
"learning_rate": 9.952198376736543e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8984375, |
|
"reward_std": 0.09969891235232353, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 90.765625, |
|
"epoch": 0.009610118010257431, |
|
"grad_norm": 0.8628733113398043, |
|
"kl": 0.158203125, |
|
"learning_rate": 9.951949409948712e-07, |
|
"loss": 0.0063, |
|
"reward": 1.859375, |
|
"reward_std": 0.08337578736245632, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 90.078125, |
|
"epoch": 0.009659911367823532, |
|
"grad_norm": 1.4528992760387773, |
|
"kl": 0.16162109375, |
|
"learning_rate": 9.951700443160881e-07, |
|
"loss": 0.0065, |
|
"reward": 1.9140625, |
|
"reward_std": 0.08679073303937912, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 95.3203125, |
|
"epoch": 0.009709704725389633, |
|
"grad_norm": 1.7031346758116503, |
|
"kl": 0.135986328125, |
|
"learning_rate": 9.951451476373052e-07, |
|
"loss": 0.0054, |
|
"reward": 1.90625, |
|
"reward_std": 0.1462521031498909, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 90.5703125, |
|
"epoch": 0.009759498082955734, |
|
"grad_norm": 1.5615501470235476, |
|
"kl": 0.14306640625, |
|
"learning_rate": 9.95120250958522e-07, |
|
"loss": 0.0057, |
|
"reward": 1.9609375, |
|
"reward_std": 0.0765409991145134, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 88.5546875, |
|
"epoch": 0.009809291440521834, |
|
"grad_norm": 2.9388617161604813, |
|
"kl": 0.18359375, |
|
"learning_rate": 9.95095354279739e-07, |
|
"loss": 0.0074, |
|
"reward": 1.875, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 91.7265625, |
|
"epoch": 0.009859084798087935, |
|
"grad_norm": 1.603082915799036, |
|
"kl": 0.17724609375, |
|
"learning_rate": 9.95070457600956e-07, |
|
"loss": 0.0071, |
|
"reward": 1.9296875, |
|
"reward_std": 0.12073517218232155, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 87.90625, |
|
"epoch": 0.009908878155654036, |
|
"grad_norm": 1.909745305417159, |
|
"kl": 0.17431640625, |
|
"learning_rate": 9.95045560922173e-07, |
|
"loss": 0.007, |
|
"reward": 1.78125, |
|
"reward_std": 0.13781969994306564, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 91.6796875, |
|
"epoch": 0.009958671513220137, |
|
"grad_norm": 0.6142806144484387, |
|
"kl": 0.15625, |
|
"learning_rate": 9.950206642433898e-07, |
|
"loss": 0.0063, |
|
"reward": 1.921875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 94.2890625, |
|
"epoch": 0.010008464870786236, |
|
"grad_norm": 1.1532891444934046, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.949957675646068e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8828125, |
|
"reward_std": 0.09969891421496868, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 89.9609375, |
|
"epoch": 0.010058258228352338, |
|
"grad_norm": 2.8714540278296954, |
|
"kl": 0.1708984375, |
|
"learning_rate": 9.949708708858237e-07, |
|
"loss": 0.0068, |
|
"reward": 1.84375, |
|
"reward_std": 0.2198973074555397, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 86.0546875, |
|
"epoch": 0.010108051585918439, |
|
"grad_norm": 2.035555595645388, |
|
"kl": 0.17333984375, |
|
"learning_rate": 9.949459742070408e-07, |
|
"loss": 0.0069, |
|
"reward": 1.890625, |
|
"reward_std": 0.12179600074887276, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 85.6953125, |
|
"epoch": 0.01015784494348454, |
|
"grad_norm": 1.5550466780010161, |
|
"kl": 0.1591796875, |
|
"learning_rate": 9.949210775282577e-07, |
|
"loss": 0.0064, |
|
"reward": 1.8203125, |
|
"reward_std": 0.11336849629878998, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 86.46875, |
|
"epoch": 0.010207638301050639, |
|
"grad_norm": 1.2467441275476379, |
|
"kl": 0.15478515625, |
|
"learning_rate": 9.948961808494746e-07, |
|
"loss": 0.0062, |
|
"reward": 1.859375, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 87.9921875, |
|
"epoch": 0.01025743165861674, |
|
"grad_norm": 1.6598367907110883, |
|
"kl": 0.14306640625, |
|
"learning_rate": 9.948712841706916e-07, |
|
"loss": 0.0057, |
|
"reward": 1.8671875, |
|
"reward_std": 0.08679073303937912, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 88.875, |
|
"epoch": 0.010307225016182841, |
|
"grad_norm": 2.5085407336865093, |
|
"kl": 0.1318359375, |
|
"learning_rate": 9.948463874919085e-07, |
|
"loss": 0.0053, |
|
"reward": 1.828125, |
|
"reward_std": 0.12756996601819992, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 91.859375, |
|
"epoch": 0.010357018373748942, |
|
"grad_norm": 1.321603593809594, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.948214908131256e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8515625, |
|
"reward_std": 0.13888053596019745, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 87.671875, |
|
"epoch": 0.010406811731315043, |
|
"grad_norm": 1.3506953926454046, |
|
"kl": 0.13134765625, |
|
"learning_rate": 9.947965941343425e-07, |
|
"loss": 0.0052, |
|
"reward": 1.859375, |
|
"reward_std": 0.1462521031498909, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 94.703125, |
|
"epoch": 0.010456605088881143, |
|
"grad_norm": 1.3399050917299098, |
|
"kl": 0.1474609375, |
|
"learning_rate": 9.947716974555594e-07, |
|
"loss": 0.0059, |
|
"reward": 1.859375, |
|
"reward_std": 0.1530819907784462, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 87.046875, |
|
"epoch": 0.010506398446447244, |
|
"grad_norm": 1.5005372147075555, |
|
"kl": 0.150390625, |
|
"learning_rate": 9.947468007767764e-07, |
|
"loss": 0.006, |
|
"reward": 1.90625, |
|
"reward_std": 0.15650184452533722, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 91.4921875, |
|
"epoch": 0.010556191804013345, |
|
"grad_norm": 1.6169292368037649, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.947219040979933e-07, |
|
"loss": 0.0055, |
|
"reward": 1.75, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.75, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 91.1328125, |
|
"epoch": 0.010605985161579446, |
|
"grad_norm": 1.0644189696613937, |
|
"kl": 0.16162109375, |
|
"learning_rate": 9.946970074192102e-07, |
|
"loss": 0.0065, |
|
"reward": 1.859375, |
|
"reward_std": 0.07312605157494545, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 93.890625, |
|
"epoch": 0.010655778519145545, |
|
"grad_norm": 0.9605698536438, |
|
"kl": 0.15673828125, |
|
"learning_rate": 9.946721107404273e-07, |
|
"loss": 0.0063, |
|
"reward": 1.9296875, |
|
"reward_std": 0.061278700828552246, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 91.984375, |
|
"epoch": 0.010705571876711647, |
|
"grad_norm": 0.34875054369651237, |
|
"kl": 0.1591796875, |
|
"learning_rate": 9.94647214061644e-07, |
|
"loss": 0.0064, |
|
"reward": 1.859375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 100.8828125, |
|
"epoch": 0.010755365234277748, |
|
"grad_norm": 6.314357417292242, |
|
"kl": 0.16552734375, |
|
"learning_rate": 9.946223173828612e-07, |
|
"loss": 0.0066, |
|
"reward": 1.765625, |
|
"reward_std": 0.2574521452188492, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 95.0, |
|
"epoch": 0.010805158591843849, |
|
"grad_norm": 0.7883325479540553, |
|
"kl": 0.15478515625, |
|
"learning_rate": 9.94597420704078e-07, |
|
"loss": 0.0062, |
|
"reward": 1.984375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9921875, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 95.0859375, |
|
"epoch": 0.010854951949409948, |
|
"grad_norm": 0.8701061022223152, |
|
"kl": 0.16845703125, |
|
"learning_rate": 9.94572524025295e-07, |
|
"loss": 0.0067, |
|
"reward": 1.8828125, |
|
"reward_std": 0.08443661965429783, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 101.5546875, |
|
"epoch": 0.01090474530697605, |
|
"grad_norm": 2.168648370456946, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.94547627346512e-07, |
|
"loss": 0.0057, |
|
"reward": 1.84375, |
|
"reward_std": 0.22119548916816711, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 105.4921875, |
|
"epoch": 0.01095453866454215, |
|
"grad_norm": 1.136817729966043, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.945227306677288e-07, |
|
"loss": 0.0058, |
|
"reward": 1.7109375, |
|
"reward_std": 0.0946863554418087, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 97.5234375, |
|
"epoch": 0.011004332022108251, |
|
"grad_norm": 0.7968996025234165, |
|
"kl": 0.15771484375, |
|
"learning_rate": 9.944978339889457e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8671875, |
|
"reward_std": 0.09522313438355923, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.875, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 100.3828125, |
|
"epoch": 0.01105412537967435, |
|
"grad_norm": 2.416431736093127, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.944729373101629e-07, |
|
"loss": 0.006, |
|
"reward": 1.9609375, |
|
"reward_std": 0.09522313624620438, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9765625, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 93.78125, |
|
"epoch": 0.011103918737240452, |
|
"grad_norm": 1.3887511020726446, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.944480406313798e-07, |
|
"loss": 0.0056, |
|
"reward": 1.875, |
|
"reward_std": 0.12756995856761932, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 102.3828125, |
|
"epoch": 0.011153712094806553, |
|
"grad_norm": 0.9029435249555274, |
|
"kl": 0.148681640625, |
|
"learning_rate": 9.944231439525967e-07, |
|
"loss": 0.006, |
|
"reward": 1.8828125, |
|
"reward_std": 0.0657544732093811, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 99.6796875, |
|
"epoch": 0.011203505452372654, |
|
"grad_norm": 0.7658349778128279, |
|
"kl": 0.1357421875, |
|
"learning_rate": 9.943982472738136e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8203125, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 98.421875, |
|
"epoch": 0.011253298809938753, |
|
"grad_norm": 1.0827382684524358, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.943733505950305e-07, |
|
"loss": 0.0058, |
|
"reward": 1.9140625, |
|
"reward_std": 0.051028965041041374, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 98.6875, |
|
"epoch": 0.011303092167504854, |
|
"grad_norm": 1.2846414905204715, |
|
"kl": 0.154296875, |
|
"learning_rate": 9.943484539162475e-07, |
|
"loss": 0.0062, |
|
"reward": 1.75, |
|
"reward_std": 0.15197435580193996, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 105.1171875, |
|
"epoch": 0.011352885525070956, |
|
"grad_norm": 0.9038892300324233, |
|
"kl": 0.13427734375, |
|
"learning_rate": 9.943235572374646e-07, |
|
"loss": 0.0054, |
|
"reward": 1.875, |
|
"reward_std": 0.07312605157494545, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 101.7421875, |
|
"epoch": 0.011402678882637057, |
|
"grad_norm": 1.4330643498290019, |
|
"kl": 0.15380859375, |
|
"learning_rate": 9.942986605586813e-07, |
|
"loss": 0.0062, |
|
"reward": 1.7265625, |
|
"reward_std": 0.15756267309188843, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7265625, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 102.0, |
|
"epoch": 0.011452472240203158, |
|
"grad_norm": 1.198891908394454, |
|
"kl": 0.14111328125, |
|
"learning_rate": 9.942737638798984e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8828125, |
|
"reward_std": 0.1054728776216507, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 98.6328125, |
|
"epoch": 0.011502265597769257, |
|
"grad_norm": 1.0157049801935816, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.942488672011153e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8515625, |
|
"reward_std": 0.08443661965429783, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 100.1796875, |
|
"epoch": 0.011552058955335358, |
|
"grad_norm": 0.9138543630603742, |
|
"kl": 0.15283203125, |
|
"learning_rate": 9.942239705223323e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8203125, |
|
"reward_std": 0.09969891421496868, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 97.6796875, |
|
"epoch": 0.01160185231290146, |
|
"grad_norm": 1.3505547841265122, |
|
"kl": 0.16796875, |
|
"learning_rate": 9.941990738435492e-07, |
|
"loss": 0.0067, |
|
"reward": 1.859375, |
|
"reward_std": 0.1763787642121315, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.875, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 105.6171875, |
|
"epoch": 0.01165164567046756, |
|
"grad_norm": 1.1868677385109885, |
|
"kl": 0.1640625, |
|
"learning_rate": 9.94174177164766e-07, |
|
"loss": 0.0066, |
|
"reward": 1.859375, |
|
"reward_std": 0.13204574212431908, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 100.7578125, |
|
"epoch": 0.01170143902803366, |
|
"grad_norm": 1.344014510355161, |
|
"kl": 0.14306640625, |
|
"learning_rate": 9.941492804859832e-07, |
|
"loss": 0.0057, |
|
"reward": 1.796875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 96.875, |
|
"epoch": 0.01175123238559976, |
|
"grad_norm": 2.656115422672397, |
|
"kl": 0.140625, |
|
"learning_rate": 9.941243838072001e-07, |
|
"loss": 0.0056, |
|
"reward": 1.859375, |
|
"reward_std": 0.15596505254507065, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 101.4375, |
|
"epoch": 0.011801025743165862, |
|
"grad_norm": 1.4880179640695324, |
|
"kl": 0.162109375, |
|
"learning_rate": 9.94099487128417e-07, |
|
"loss": 0.0065, |
|
"reward": 1.734375, |
|
"reward_std": 0.19438526779413223, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7421875, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 103.7265625, |
|
"epoch": 0.011850819100731963, |
|
"grad_norm": 1.3579385193758495, |
|
"kl": 0.1455078125, |
|
"learning_rate": 9.94074590449634e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8828125, |
|
"reward_std": 0.15991678088903427, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 101.8203125, |
|
"epoch": 0.011900612458298062, |
|
"grad_norm": 5.237599308987956, |
|
"kl": 0.14111328125, |
|
"learning_rate": 9.940496937708509e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8671875, |
|
"reward_std": 0.17551995813846588, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.875, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 100.046875, |
|
"epoch": 0.011950405815864163, |
|
"grad_norm": 1.5880696893655704, |
|
"kl": 0.15087890625, |
|
"learning_rate": 9.940247970920678e-07, |
|
"loss": 0.006, |
|
"reward": 1.765625, |
|
"reward_std": 0.22507023811340332, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 96.0234375, |
|
"epoch": 0.012000199173430265, |
|
"grad_norm": 1.2242789460306562, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.93999900413285e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8828125, |
|
"reward_std": 0.08443661779165268, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 98.328125, |
|
"epoch": 0.012049992530996366, |
|
"grad_norm": 5.272027410895548, |
|
"kl": 0.14599609375, |
|
"learning_rate": 9.939750037345019e-07, |
|
"loss": 0.0058, |
|
"reward": 1.90625, |
|
"reward_std": 0.1462521031498909, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 106.5, |
|
"epoch": 0.012099785888562465, |
|
"grad_norm": 1.651733896715607, |
|
"kl": 0.12451171875, |
|
"learning_rate": 9.939501070557188e-07, |
|
"loss": 0.005, |
|
"reward": 1.859375, |
|
"reward_std": 0.19616854190826416, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 96.1953125, |
|
"epoch": 0.012149579246128566, |
|
"grad_norm": 1.7062418902803764, |
|
"kl": 0.14697265625, |
|
"learning_rate": 9.939252103769357e-07, |
|
"loss": 0.0059, |
|
"reward": 1.765625, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 98.328125, |
|
"epoch": 0.012199372603694667, |
|
"grad_norm": 1.9553665963930726, |
|
"kl": 0.1357421875, |
|
"learning_rate": 9.939003136981526e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8046875, |
|
"reward_std": 0.14465449005365372, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 99.453125, |
|
"epoch": 0.012249165961260768, |
|
"grad_norm": 0.876278707942381, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.938754170193695e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8984375, |
|
"reward_std": 0.0765409953892231, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 94.8203125, |
|
"epoch": 0.01229895931882687, |
|
"grad_norm": 1.2319674361274036, |
|
"kl": 0.140625, |
|
"learning_rate": 9.938505203405864e-07, |
|
"loss": 0.0056, |
|
"reward": 1.7578125, |
|
"reward_std": 0.09969891235232353, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 95.109375, |
|
"epoch": 0.012348752676392969, |
|
"grad_norm": 1.2173331856384102, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.938256236618034e-07, |
|
"loss": 0.0062, |
|
"reward": 1.875, |
|
"reward_std": 0.07312605157494545, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 99.0859375, |
|
"epoch": 0.01239854603395907, |
|
"grad_norm": 1.571527754558131, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.938007269830205e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8515625, |
|
"reward_std": 0.13435305655002594, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 93.1484375, |
|
"epoch": 0.012448339391525171, |
|
"grad_norm": 5.278731633230044, |
|
"kl": 0.271484375, |
|
"learning_rate": 9.937758303042374e-07, |
|
"loss": 0.0109, |
|
"reward": 1.859375, |
|
"reward_std": 0.13781969994306564, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 98.1953125, |
|
"epoch": 0.012498132749091272, |
|
"grad_norm": 1.8826337994095337, |
|
"kl": 0.15625, |
|
"learning_rate": 9.937509336254543e-07, |
|
"loss": 0.0062, |
|
"reward": 1.796875, |
|
"reward_std": 0.14123954623937607, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 100.21875, |
|
"epoch": 0.012547926106657371, |
|
"grad_norm": 1.2105855334088336, |
|
"kl": 0.14453125, |
|
"learning_rate": 9.937260369466712e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8515625, |
|
"reward_std": 0.08729992806911469, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 96.6328125, |
|
"epoch": 0.012597719464223472, |
|
"grad_norm": 6.827721020597877, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.937011402678882e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8984375, |
|
"reward_std": 0.12863079458475113, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 97.3984375, |
|
"epoch": 0.012647512821789574, |
|
"grad_norm": 10.921323790834576, |
|
"kl": 0.140625, |
|
"learning_rate": 9.93676243589105e-07, |
|
"loss": 0.0056, |
|
"reward": 1.890625, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 97.0, |
|
"epoch": 0.012697306179355675, |
|
"grad_norm": 1.3961482693399616, |
|
"kl": 0.128173828125, |
|
"learning_rate": 9.936513469103222e-07, |
|
"loss": 0.0051, |
|
"reward": 1.875, |
|
"reward_std": 0.08838834427297115, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 103.15625, |
|
"epoch": 0.012747099536921774, |
|
"grad_norm": 1.4192749877150084, |
|
"kl": 0.13671875, |
|
"learning_rate": 9.93626450231539e-07, |
|
"loss": 0.0055, |
|
"reward": 1.734375, |
|
"reward_std": 0.18591880798339844, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.75, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 99.9140625, |
|
"epoch": 0.012796892894487875, |
|
"grad_norm": 3.1629432643690496, |
|
"kl": 0.19482421875, |
|
"learning_rate": 9.93601553552756e-07, |
|
"loss": 0.0078, |
|
"reward": 1.7890625, |
|
"reward_std": 0.05550474114716053, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 100.1796875, |
|
"epoch": 0.012846686252053976, |
|
"grad_norm": 0.6125894427123689, |
|
"kl": 0.13330078125, |
|
"learning_rate": 9.93576656873973e-07, |
|
"loss": 0.0053, |
|
"reward": 1.8515625, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 99.125, |
|
"epoch": 0.012896479609620077, |
|
"grad_norm": 1.4583437192070734, |
|
"kl": 0.120849609375, |
|
"learning_rate": 9.935517601951899e-07, |
|
"loss": 0.0048, |
|
"reward": 1.734375, |
|
"reward_std": 0.1462521068751812, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7421875, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 100.34375, |
|
"epoch": 0.012946272967186177, |
|
"grad_norm": 2.0065005745649707, |
|
"kl": 0.11572265625, |
|
"learning_rate": 9.93526863516407e-07, |
|
"loss": 0.0046, |
|
"reward": 1.84375, |
|
"reward_std": 0.21937815845012665, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.875, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 106.84375, |
|
"epoch": 0.012996066324752278, |
|
"grad_norm": 1.874002169739414, |
|
"kl": 0.11865234375, |
|
"learning_rate": 9.935019668376237e-07, |
|
"loss": 0.0047, |
|
"reward": 1.7109375, |
|
"reward_std": 0.290145181119442, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/iou_reward": 0.75, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 97.828125, |
|
"epoch": 0.013045859682318379, |
|
"grad_norm": 1.9663851693795746, |
|
"kl": 0.130859375, |
|
"learning_rate": 9.934770701588408e-07, |
|
"loss": 0.0052, |
|
"reward": 1.921875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 106.71875, |
|
"epoch": 0.01309565303988448, |
|
"grad_norm": 3.4962475907013717, |
|
"kl": 0.1142578125, |
|
"learning_rate": 9.934521734800578e-07, |
|
"loss": 0.0046, |
|
"reward": 1.921875, |
|
"reward_std": 0.17176412604749203, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 112.15625, |
|
"epoch": 0.01314544639745058, |
|
"grad_norm": 4.645400687233048, |
|
"kl": 0.19677734375, |
|
"learning_rate": 9.934272768012747e-07, |
|
"loss": 0.0079, |
|
"reward": 1.7421875, |
|
"reward_std": 0.16834919154644012, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.75, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 104.6796875, |
|
"epoch": 0.01319523975501668, |
|
"grad_norm": 1.4745268154075024, |
|
"kl": 0.13916015625, |
|
"learning_rate": 9.934023801224916e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8828125, |
|
"reward_std": 0.1649293377995491, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 109.5078125, |
|
"epoch": 0.013245033112582781, |
|
"grad_norm": 1.3368669114055916, |
|
"kl": 0.11669921875, |
|
"learning_rate": 9.933774834437085e-07, |
|
"loss": 0.0047, |
|
"reward": 1.8515625, |
|
"reward_std": 0.18808725476264954, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.875, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 107.4140625, |
|
"epoch": 0.013294826470148883, |
|
"grad_norm": 1.864568126929696, |
|
"kl": 0.150390625, |
|
"learning_rate": 9.933525867649254e-07, |
|
"loss": 0.006, |
|
"reward": 1.8046875, |
|
"reward_std": 0.1649293377995491, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 105.078125, |
|
"epoch": 0.013344619827714984, |
|
"grad_norm": 1.5167463575044362, |
|
"kl": 0.119140625, |
|
"learning_rate": 9.933276900861426e-07, |
|
"loss": 0.0048, |
|
"reward": 1.84375, |
|
"reward_std": 0.21485067531466484, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 109.2578125, |
|
"epoch": 0.013394413185281083, |
|
"grad_norm": 2.338259055622313, |
|
"kl": 0.109619140625, |
|
"learning_rate": 9.933027934073595e-07, |
|
"loss": 0.0044, |
|
"reward": 1.7890625, |
|
"reward_std": 0.29640422761440277, |
|
"rewards/format_reward": 0.953125, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 111.9296875, |
|
"epoch": 0.013444206542847184, |
|
"grad_norm": 1.984850355663328, |
|
"kl": 0.127197265625, |
|
"learning_rate": 9.932778967285764e-07, |
|
"loss": 0.0051, |
|
"reward": 1.703125, |
|
"reward_std": 0.25403229147195816, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7265625, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 109.484375, |
|
"epoch": 0.013493999900413285, |
|
"grad_norm": 2.217655615113654, |
|
"kl": 0.114013671875, |
|
"learning_rate": 9.932530000497933e-07, |
|
"loss": 0.0046, |
|
"reward": 1.8046875, |
|
"reward_std": 0.193861223757267, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 108.421875, |
|
"epoch": 0.013543793257979386, |
|
"grad_norm": 1.587560053476885, |
|
"kl": 0.11767578125, |
|
"learning_rate": 9.932281033710102e-07, |
|
"loss": 0.0047, |
|
"reward": 1.8125, |
|
"reward_std": 0.13781969994306564, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 110.03125, |
|
"epoch": 0.013593586615545486, |
|
"grad_norm": 1.9838624824571407, |
|
"kl": 0.13623046875, |
|
"learning_rate": 9.932032066922272e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8046875, |
|
"reward_std": 0.15414283238351345, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 112.7265625, |
|
"epoch": 0.013643379973111587, |
|
"grad_norm": 0.9152592616923091, |
|
"kl": 0.139404296875, |
|
"learning_rate": 9.931783100134443e-07, |
|
"loss": 0.0056, |
|
"reward": 1.9296875, |
|
"reward_std": 0.1649293377995491, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 107.1640625, |
|
"epoch": 0.013693173330677688, |
|
"grad_norm": 1.8553634182255607, |
|
"kl": 0.1943359375, |
|
"learning_rate": 9.93153413334661e-07, |
|
"loss": 0.0078, |
|
"reward": 1.8203125, |
|
"reward_std": 0.1938612163066864, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 103.734375, |
|
"epoch": 0.013742966688243789, |
|
"grad_norm": 1.2659236936766336, |
|
"kl": 0.14013671875, |
|
"learning_rate": 9.931285166558781e-07, |
|
"loss": 0.0056, |
|
"reward": 1.90625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 104.953125, |
|
"epoch": 0.013792760045809888, |
|
"grad_norm": 1.4341220624848123, |
|
"kl": 0.1767578125, |
|
"learning_rate": 9.93103619977095e-07, |
|
"loss": 0.0071, |
|
"reward": 1.8515625, |
|
"reward_std": 0.18361148983240128, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 103.328125, |
|
"epoch": 0.01384255340337599, |
|
"grad_norm": 2.003205055364934, |
|
"kl": 0.16259765625, |
|
"learning_rate": 9.93078723298312e-07, |
|
"loss": 0.0065, |
|
"reward": 1.765625, |
|
"reward_std": 0.26168815419077873, |
|
"rewards/format_reward": 0.953125, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 106.8984375, |
|
"epoch": 0.01389234676094209, |
|
"grad_norm": 2.404203671929336, |
|
"kl": 0.15380859375, |
|
"learning_rate": 9.930538266195289e-07, |
|
"loss": 0.0062, |
|
"reward": 1.703125, |
|
"reward_std": 0.4052969515323639, |
|
"rewards/format_reward": 0.8984375, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 105.9765625, |
|
"epoch": 0.013942140118508192, |
|
"grad_norm": 3.9220476065368675, |
|
"kl": 0.150390625, |
|
"learning_rate": 9.930289299407458e-07, |
|
"loss": 0.006, |
|
"reward": 1.7578125, |
|
"reward_std": 0.2777220755815506, |
|
"rewards/format_reward": 0.9453125, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 110.15625, |
|
"epoch": 0.013991933476074291, |
|
"grad_norm": 2.002885664335074, |
|
"kl": 0.16650390625, |
|
"learning_rate": 9.930040332619627e-07, |
|
"loss": 0.0067, |
|
"reward": 1.7421875, |
|
"reward_std": 0.3387659341096878, |
|
"rewards/format_reward": 0.9453125, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 102.8046875, |
|
"epoch": 0.014041726833640392, |
|
"grad_norm": 1.4822912654724572, |
|
"kl": 0.19482421875, |
|
"learning_rate": 9.929791365831798e-07, |
|
"loss": 0.0078, |
|
"reward": 1.796875, |
|
"reward_std": 0.21436559408903122, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 110.21875, |
|
"epoch": 0.014091520191206493, |
|
"grad_norm": 2.554867145679922, |
|
"kl": 0.19091796875, |
|
"learning_rate": 9.929542399043968e-07, |
|
"loss": 0.0076, |
|
"reward": 1.765625, |
|
"reward_std": 0.24489018321037292, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 109.2890625, |
|
"epoch": 0.014141313548772594, |
|
"grad_norm": 1.1463949069156814, |
|
"kl": 0.18603515625, |
|
"learning_rate": 9.929293432256137e-07, |
|
"loss": 0.0074, |
|
"reward": 1.8359375, |
|
"reward_std": 0.07996084541082382, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 105.8828125, |
|
"epoch": 0.014191106906338695, |
|
"grad_norm": 2.4280649493654143, |
|
"kl": 0.20703125, |
|
"learning_rate": 9.929044465468306e-07, |
|
"loss": 0.0083, |
|
"reward": 1.875, |
|
"reward_std": 0.1751839816570282, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 100.71875, |
|
"epoch": 0.014240900263904795, |
|
"grad_norm": 1.3735812053453058, |
|
"kl": 0.20068359375, |
|
"learning_rate": 9.928795498680475e-07, |
|
"loss": 0.008, |
|
"reward": 1.765625, |
|
"reward_std": 0.11276257783174515, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 108.3203125, |
|
"epoch": 0.014290693621470896, |
|
"grad_norm": 0.3186652851170607, |
|
"kl": 0.1787109375, |
|
"learning_rate": 9.928546531892646e-07, |
|
"loss": 0.0072, |
|
"reward": 2.0, |
|
"reward_std": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 1.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 105.9765625, |
|
"epoch": 0.014340486979036997, |
|
"grad_norm": 35.394499678811044, |
|
"kl": 0.1748046875, |
|
"learning_rate": 9.928297565104816e-07, |
|
"loss": 0.007, |
|
"reward": 1.75, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.75, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 102.859375, |
|
"epoch": 0.014390280336603098, |
|
"grad_norm": 3.4336721047005136, |
|
"kl": 0.14013671875, |
|
"learning_rate": 9.928048598316985e-07, |
|
"loss": 0.0056, |
|
"reward": 1.890625, |
|
"reward_std": 0.10205793008208275, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 99.484375, |
|
"epoch": 0.014440073694169197, |
|
"grad_norm": 1.4643334075482506, |
|
"kl": 0.16162109375, |
|
"learning_rate": 9.927799631529154e-07, |
|
"loss": 0.0065, |
|
"reward": 1.703125, |
|
"reward_std": 0.08785156533122063, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.703125, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 103.2109375, |
|
"epoch": 0.014489867051735298, |
|
"grad_norm": 1.361006756453886, |
|
"kl": 0.13427734375, |
|
"learning_rate": 9.927550664741323e-07, |
|
"loss": 0.0054, |
|
"reward": 1.875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 102.1796875, |
|
"epoch": 0.0145396604093014, |
|
"grad_norm": 1.5533041726608692, |
|
"kl": 0.15087890625, |
|
"learning_rate": 9.927301697953492e-07, |
|
"loss": 0.006, |
|
"reward": 1.7421875, |
|
"reward_std": 0.1690588891506195, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.75, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 99.4140625, |
|
"epoch": 0.0145894537668675, |
|
"grad_norm": 1.439767424175536, |
|
"kl": 0.15234375, |
|
"learning_rate": 9.927052731165661e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8984375, |
|
"reward_std": 0.1054728776216507, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 96.3671875, |
|
"epoch": 0.0146392471244336, |
|
"grad_norm": 4.413297482947589, |
|
"kl": 0.149169921875, |
|
"learning_rate": 9.92680376437783e-07, |
|
"loss": 0.006, |
|
"reward": 1.84375, |
|
"reward_std": 0.09127141535282135, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 101.9765625, |
|
"epoch": 0.014689040481999701, |
|
"grad_norm": 1.626605726091794, |
|
"kl": 0.140625, |
|
"learning_rate": 9.926554797590002e-07, |
|
"loss": 0.0056, |
|
"reward": 1.90625, |
|
"reward_std": 0.08838834427297115, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 103.78125, |
|
"epoch": 0.014738833839565802, |
|
"grad_norm": 2.2599267724991567, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.92630583080217e-07, |
|
"loss": 0.0063, |
|
"reward": 1.921875, |
|
"reward_std": 0.18702643364667892, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 102.46875, |
|
"epoch": 0.014788627197131903, |
|
"grad_norm": 1.1231542937944743, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.92605686401434e-07, |
|
"loss": 0.0055, |
|
"reward": 1.84375, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 99.3984375, |
|
"epoch": 0.014838420554698003, |
|
"grad_norm": 2.0540913905238107, |
|
"kl": 0.15478515625, |
|
"learning_rate": 9.92580789722651e-07, |
|
"loss": 0.0062, |
|
"reward": 1.9140625, |
|
"reward_std": 0.17407145351171494, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 101.2578125, |
|
"epoch": 0.014888213912264104, |
|
"grad_norm": 1.5934406783336377, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.925558930438679e-07, |
|
"loss": 0.006, |
|
"reward": 1.890625, |
|
"reward_std": 0.0776018276810646, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 98.34375, |
|
"epoch": 0.014938007269830205, |
|
"grad_norm": 1.181052660812371, |
|
"kl": 0.1650390625, |
|
"learning_rate": 9.925309963650848e-07, |
|
"loss": 0.0066, |
|
"reward": 1.84375, |
|
"reward_std": 0.12304247915744781, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 99.4453125, |
|
"epoch": 0.014987800627396306, |
|
"grad_norm": 0.9290622957204577, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.92506099686302e-07, |
|
"loss": 0.0063, |
|
"reward": 1.9609375, |
|
"reward_std": 0.07996084541082382, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 101.6953125, |
|
"epoch": 0.015037593984962405, |
|
"grad_norm": 1.9370889477802344, |
|
"kl": 0.15185546875, |
|
"learning_rate": 9.924812030075186e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8359375, |
|
"reward_std": 0.23945221304893494, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 96.1953125, |
|
"epoch": 0.015087387342528506, |
|
"grad_norm": 0.7342116250551401, |
|
"kl": 0.1630859375, |
|
"learning_rate": 9.924563063287357e-07, |
|
"loss": 0.0065, |
|
"reward": 1.953125, |
|
"reward_std": 0.09753045253455639, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9765625, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 99.3515625, |
|
"epoch": 0.015137180700094607, |
|
"grad_norm": 2.850502548856703, |
|
"kl": 0.1640625, |
|
"learning_rate": 9.924314096499527e-07, |
|
"loss": 0.0066, |
|
"reward": 1.9375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 100.109375, |
|
"epoch": 0.015186974057660708, |
|
"grad_norm": 20.248774371990162, |
|
"kl": 0.77685546875, |
|
"learning_rate": 9.924065129711696e-07, |
|
"loss": 0.0311, |
|
"reward": 1.921875, |
|
"reward_std": 0.0289318785071373, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 99.65625, |
|
"epoch": 0.01523676741522681, |
|
"grad_norm": 0.7379060057008064, |
|
"kl": 0.17724609375, |
|
"learning_rate": 9.923816162923865e-07, |
|
"loss": 0.0071, |
|
"reward": 1.859375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 100.140625, |
|
"epoch": 0.015286560772792909, |
|
"grad_norm": 1.2082202542597547, |
|
"kl": 0.1767578125, |
|
"learning_rate": 9.923567196136034e-07, |
|
"loss": 0.0071, |
|
"reward": 1.9453125, |
|
"reward_std": 0.09969891235232353, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9453125, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 102.1328125, |
|
"epoch": 0.01533635413035901, |
|
"grad_norm": 1.2070345306161352, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.923318229348203e-07, |
|
"loss": 0.0062, |
|
"reward": 1.921875, |
|
"reward_std": 0.0731260534375906, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 107.78125, |
|
"epoch": 0.015386147487925111, |
|
"grad_norm": 2.0329940364880965, |
|
"kl": 0.15380859375, |
|
"learning_rate": 9.923069262560375e-07, |
|
"loss": 0.0061, |
|
"reward": 1.9140625, |
|
"reward_std": 0.13310656696558, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 100.2578125, |
|
"epoch": 0.015435940845491212, |
|
"grad_norm": 2.3073351557356787, |
|
"kl": 0.1796875, |
|
"learning_rate": 9.922820295772544e-07, |
|
"loss": 0.0072, |
|
"reward": 1.921875, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 102.4375, |
|
"epoch": 0.015485734203057312, |
|
"grad_norm": 1.5457036905286086, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.922571328984713e-07, |
|
"loss": 0.0055, |
|
"reward": 1.953125, |
|
"reward_std": 0.11732023023068905, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 109.5625, |
|
"epoch": 0.015535527560623413, |
|
"grad_norm": 0.9431371624891495, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.922322362196882e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8984375, |
|
"reward_std": 0.11048543266952038, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 100.2265625, |
|
"epoch": 0.015585320918189514, |
|
"grad_norm": 1.3143549974970972, |
|
"kl": 0.1328125, |
|
"learning_rate": 9.922073395409051e-07, |
|
"loss": 0.0053, |
|
"reward": 1.6640625, |
|
"reward_std": 0.13098490796983242, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.671875, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 102.7578125, |
|
"epoch": 0.015635114275755615, |
|
"grad_norm": 2.349338557558, |
|
"kl": 0.13623046875, |
|
"learning_rate": 9.921824428621223e-07, |
|
"loss": 0.0054, |
|
"reward": 1.8984375, |
|
"reward_std": 0.11048543080687523, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 101.8203125, |
|
"epoch": 0.015684907633321716, |
|
"grad_norm": 1.5655788209499324, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.921575461833392e-07, |
|
"loss": 0.0053, |
|
"reward": 1.890625, |
|
"reward_std": 0.1378196980804205, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 101.78125, |
|
"epoch": 0.015734700990887817, |
|
"grad_norm": 1.711007951147447, |
|
"kl": 0.125244140625, |
|
"learning_rate": 9.92132649504556e-07, |
|
"loss": 0.005, |
|
"reward": 1.875, |
|
"reward_std": 0.14283225685358047, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 101.8671875, |
|
"epoch": 0.015784494348453915, |
|
"grad_norm": 1.8267745898970256, |
|
"kl": 0.1328125, |
|
"learning_rate": 9.92107752825773e-07, |
|
"loss": 0.0053, |
|
"reward": 1.875, |
|
"reward_std": 0.20498371869325638, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 103.5, |
|
"epoch": 0.015834287706020016, |
|
"grad_norm": 0.29516051752988187, |
|
"kl": 0.1328125, |
|
"learning_rate": 9.9208285614699e-07, |
|
"loss": 0.0053, |
|
"reward": 1.875, |
|
"reward_std": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 97.3046875, |
|
"epoch": 0.015884081063586117, |
|
"grad_norm": 1.4004042506582008, |
|
"kl": 0.13525390625, |
|
"learning_rate": 9.920579594682068e-07, |
|
"loss": 0.0054, |
|
"reward": 1.828125, |
|
"reward_std": 0.20517178624868393, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 104.0078125, |
|
"epoch": 0.015933874421152218, |
|
"grad_norm": 1.136958742068706, |
|
"kl": 0.1142578125, |
|
"learning_rate": 9.92033062789424e-07, |
|
"loss": 0.0046, |
|
"reward": 1.859375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 103.375, |
|
"epoch": 0.01598366777871832, |
|
"grad_norm": 4.0385058650510635, |
|
"kl": 0.1162109375, |
|
"learning_rate": 9.920081661106407e-07, |
|
"loss": 0.0047, |
|
"reward": 1.734375, |
|
"reward_std": 0.15650184452533722, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 99.7421875, |
|
"epoch": 0.01603346113628442, |
|
"grad_norm": 0.741816138566885, |
|
"kl": 0.10498046875, |
|
"learning_rate": 9.919832694318578e-07, |
|
"loss": 0.0042, |
|
"reward": 1.796875, |
|
"reward_std": 0.0761774368584156, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 100.96875, |
|
"epoch": 0.01608325449385052, |
|
"grad_norm": 0.7118059651725485, |
|
"kl": 0.11962890625, |
|
"learning_rate": 9.919583727530747e-07, |
|
"loss": 0.0048, |
|
"reward": 1.890625, |
|
"reward_std": 0.05444390885531902, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 103.15625, |
|
"epoch": 0.016133047851416622, |
|
"grad_norm": 0.87277981517813, |
|
"kl": 0.1220703125, |
|
"learning_rate": 9.919334760742916e-07, |
|
"loss": 0.0049, |
|
"reward": 1.765625, |
|
"reward_std": 0.05444391071796417, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 100.5390625, |
|
"epoch": 0.016182841208982723, |
|
"grad_norm": 1.4153720221971624, |
|
"kl": 0.13525390625, |
|
"learning_rate": 9.919085793955086e-07, |
|
"loss": 0.0054, |
|
"reward": 1.875, |
|
"reward_std": 0.08337578922510147, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 106.8046875, |
|
"epoch": 0.01623263456654882, |
|
"grad_norm": 0.828020409872588, |
|
"kl": 0.119384765625, |
|
"learning_rate": 9.918836827167255e-07, |
|
"loss": 0.0048, |
|
"reward": 1.9140625, |
|
"reward_std": 0.051028965041041374, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 111.6953125, |
|
"epoch": 0.016282427924114922, |
|
"grad_norm": 1.2356589850701865, |
|
"kl": 0.1103515625, |
|
"learning_rate": 9.918587860379424e-07, |
|
"loss": 0.0044, |
|
"reward": 1.875, |
|
"reward_std": 0.18538201600313187, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 107.2265625, |
|
"epoch": 0.016332221281681023, |
|
"grad_norm": 0.7036411251654024, |
|
"kl": 0.11474609375, |
|
"learning_rate": 9.918338893591595e-07, |
|
"loss": 0.0046, |
|
"reward": 1.9375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 103.3515625, |
|
"epoch": 0.016382014639247124, |
|
"grad_norm": 2.5281054133643868, |
|
"kl": 0.118408203125, |
|
"learning_rate": 9.918089926803764e-07, |
|
"loss": 0.0047, |
|
"reward": 1.8046875, |
|
"reward_std": 0.1344047524034977, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 106.625, |
|
"epoch": 0.016431807996813225, |
|
"grad_norm": 3.4762620443154297, |
|
"kl": 0.1103515625, |
|
"learning_rate": 9.917840960015934e-07, |
|
"loss": 0.0044, |
|
"reward": 1.7890625, |
|
"reward_std": 0.21937325596809387, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 111.0625, |
|
"epoch": 0.016481601354379326, |
|
"grad_norm": 1.1834140618116331, |
|
"kl": 0.116455078125, |
|
"learning_rate": 9.917591993228103e-07, |
|
"loss": 0.0047, |
|
"reward": 1.765625, |
|
"reward_std": 0.19007781147956848, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 106.9765625, |
|
"epoch": 0.016531394711945428, |
|
"grad_norm": 1.7993865705235723, |
|
"kl": 0.113037109375, |
|
"learning_rate": 9.917343026440272e-07, |
|
"loss": 0.0045, |
|
"reward": 1.8359375, |
|
"reward_std": 0.13941731303930283, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 101.828125, |
|
"epoch": 0.01658118806951153, |
|
"grad_norm": 1.031000629146479, |
|
"kl": 0.1279296875, |
|
"learning_rate": 9.917094059652441e-07, |
|
"loss": 0.0051, |
|
"reward": 1.9140625, |
|
"reward_std": 0.11048543080687523, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 103.5234375, |
|
"epoch": 0.016630981427077626, |
|
"grad_norm": 2.9527876174571674, |
|
"kl": 0.21630859375, |
|
"learning_rate": 9.916845092864612e-07, |
|
"loss": 0.0087, |
|
"reward": 1.921875, |
|
"reward_std": 0.1659901738166809, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9453125, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 108.53125, |
|
"epoch": 0.016680774784643727, |
|
"grad_norm": 1.718680351922868, |
|
"kl": 0.1171875, |
|
"learning_rate": 9.91659612607678e-07, |
|
"loss": 0.0047, |
|
"reward": 1.765625, |
|
"reward_std": 0.2301129810512066, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 97.5078125, |
|
"epoch": 0.01673056814220983, |
|
"grad_norm": 2.5277046029364554, |
|
"kl": 0.126220703125, |
|
"learning_rate": 9.91634715928895e-07, |
|
"loss": 0.0051, |
|
"reward": 1.953125, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 109.6328125, |
|
"epoch": 0.01678036149977593, |
|
"grad_norm": 1.3519973913823835, |
|
"kl": 0.13134765625, |
|
"learning_rate": 9.91609819250112e-07, |
|
"loss": 0.0052, |
|
"reward": 1.828125, |
|
"reward_std": 0.1173202246427536, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 99.59375, |
|
"epoch": 0.01683015485734203, |
|
"grad_norm": 1.5389166357549786, |
|
"kl": 0.11572265625, |
|
"learning_rate": 9.91584922571329e-07, |
|
"loss": 0.0046, |
|
"reward": 1.8984375, |
|
"reward_std": 0.13941731676459312, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 101.0625, |
|
"epoch": 0.016879948214908132, |
|
"grad_norm": 2.136670042647075, |
|
"kl": 0.148193359375, |
|
"learning_rate": 9.915600258925458e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8515625, |
|
"reward_std": 0.1386924535036087, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 101.7578125, |
|
"epoch": 0.016929741572474233, |
|
"grad_norm": 0.7924944562927205, |
|
"kl": 0.1220703125, |
|
"learning_rate": 9.915351292137627e-07, |
|
"loss": 0.0049, |
|
"reward": 1.8671875, |
|
"reward_std": 0.022097086533904076, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 97.671875, |
|
"epoch": 0.016979534930040334, |
|
"grad_norm": 1.079899794826965, |
|
"kl": 0.120849609375, |
|
"learning_rate": 9.915102325349799e-07, |
|
"loss": 0.0048, |
|
"reward": 1.890625, |
|
"reward_std": 0.05444390885531902, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 98.7421875, |
|
"epoch": 0.017029328287606435, |
|
"grad_norm": 1.3834803071490325, |
|
"kl": 0.15283203125, |
|
"learning_rate": 9.914853358561968e-07, |
|
"loss": 0.0061, |
|
"reward": 1.90625, |
|
"reward_std": 0.10766242444515228, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 105.0859375, |
|
"epoch": 0.017079121645172533, |
|
"grad_norm": 1.5014868203711145, |
|
"kl": 0.113525390625, |
|
"learning_rate": 9.914604391774137e-07, |
|
"loss": 0.0045, |
|
"reward": 1.8984375, |
|
"reward_std": 0.12019838765263557, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 101.859375, |
|
"epoch": 0.017128915002738634, |
|
"grad_norm": 3.4513551059647622, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.914355424986306e-07, |
|
"loss": 0.0054, |
|
"reward": 1.796875, |
|
"reward_std": 0.1354655846953392, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 99.03125, |
|
"epoch": 0.017178708360304735, |
|
"grad_norm": 1.4812624190590156, |
|
"kl": 0.125244140625, |
|
"learning_rate": 9.914106458198475e-07, |
|
"loss": 0.005, |
|
"reward": 1.9140625, |
|
"reward_std": 0.18808726221323013, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 98.4453125, |
|
"epoch": 0.017228501717870836, |
|
"grad_norm": 1.7372892629439445, |
|
"kl": 0.138671875, |
|
"learning_rate": 9.913857491410645e-07, |
|
"loss": 0.0056, |
|
"reward": 1.859375, |
|
"reward_std": 0.19568345695734024, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 99.9296875, |
|
"epoch": 0.017278295075436937, |
|
"grad_norm": 1.4367003043638524, |
|
"kl": 0.13232421875, |
|
"learning_rate": 9.913608524622816e-07, |
|
"loss": 0.0053, |
|
"reward": 1.859375, |
|
"reward_std": 0.13204573094844818, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 103.3828125, |
|
"epoch": 0.017328088433003038, |
|
"grad_norm": 2.525388205793568, |
|
"kl": 0.14599609375, |
|
"learning_rate": 9.913359557834985e-07, |
|
"loss": 0.0058, |
|
"reward": 1.71875, |
|
"reward_std": 0.30270224809646606, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 96.328125, |
|
"epoch": 0.01737788179056914, |
|
"grad_norm": 1.918935513690073, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.913110591047154e-07, |
|
"loss": 0.0057, |
|
"reward": 1.796875, |
|
"reward_std": 0.10205793008208275, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 101.734375, |
|
"epoch": 0.01742767514813524, |
|
"grad_norm": 1.4147877112816027, |
|
"kl": 0.16015625, |
|
"learning_rate": 9.912861624259323e-07, |
|
"loss": 0.0064, |
|
"reward": 1.9140625, |
|
"reward_std": 0.19728106260299683, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 102.7890625, |
|
"epoch": 0.017477468505701338, |
|
"grad_norm": 1.329338383954974, |
|
"kl": 0.14599609375, |
|
"learning_rate": 9.912612657471493e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8671875, |
|
"reward_std": 0.17210011184215546, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 101.578125, |
|
"epoch": 0.01752726186326744, |
|
"grad_norm": 1.6414064333926215, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.912363690683662e-07, |
|
"loss": 0.006, |
|
"reward": 1.8515625, |
|
"reward_std": 0.10994864627718925, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 104.6328125, |
|
"epoch": 0.01757705522083354, |
|
"grad_norm": 1.1258099816149707, |
|
"kl": 0.15966796875, |
|
"learning_rate": 9.91211472389583e-07, |
|
"loss": 0.0064, |
|
"reward": 1.921875, |
|
"reward_std": 0.05444391071796417, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 97.9609375, |
|
"epoch": 0.01762684857839964, |
|
"grad_norm": 2.2520232071507573, |
|
"kl": 0.1748046875, |
|
"learning_rate": 9.911865757108e-07, |
|
"loss": 0.007, |
|
"reward": 1.8359375, |
|
"reward_std": 0.11048543453216553, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 99.7421875, |
|
"epoch": 0.017676641935965742, |
|
"grad_norm": 3.855079834939123, |
|
"kl": 0.17041015625, |
|
"learning_rate": 9.911616790320171e-07, |
|
"loss": 0.0068, |
|
"reward": 1.7578125, |
|
"reward_std": 0.04005437344312668, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7734375, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 95.1875, |
|
"epoch": 0.017726435293531843, |
|
"grad_norm": 1.4814969183314357, |
|
"kl": 0.181640625, |
|
"learning_rate": 9.91136782353234e-07, |
|
"loss": 0.0073, |
|
"reward": 1.8671875, |
|
"reward_std": 0.134404756128788, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.875, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 94.7265625, |
|
"epoch": 0.017776228651097944, |
|
"grad_norm": 1.2393532430120184, |
|
"kl": 0.16552734375, |
|
"learning_rate": 9.91111885674451e-07, |
|
"loss": 0.0066, |
|
"reward": 1.7734375, |
|
"reward_std": 0.11048543266952038, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 99.9375, |
|
"epoch": 0.017826022008664046, |
|
"grad_norm": 0.41930698245651776, |
|
"kl": 0.15185546875, |
|
"learning_rate": 9.91086988995668e-07, |
|
"loss": 0.0061, |
|
"reward": 1.921875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 100.328125, |
|
"epoch": 0.017875815366230143, |
|
"grad_norm": 1.744032725917679, |
|
"kl": 0.15625, |
|
"learning_rate": 9.910620923168848e-07, |
|
"loss": 0.0062, |
|
"reward": 1.71875, |
|
"reward_std": 0.19616853445768356, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7421875, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 97.7109375, |
|
"epoch": 0.017925608723796244, |
|
"grad_norm": 1.0228978308270482, |
|
"kl": 0.15380859375, |
|
"learning_rate": 9.910371956381017e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8515625, |
|
"reward_std": 0.12415501289069653, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 94.4375, |
|
"epoch": 0.017975402081362345, |
|
"grad_norm": 1.369756375741506, |
|
"kl": 0.1689453125, |
|
"learning_rate": 9.910122989593189e-07, |
|
"loss": 0.0068, |
|
"reward": 1.8828125, |
|
"reward_std": 0.09827452525496483, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 100.4609375, |
|
"epoch": 0.018025195438928446, |
|
"grad_norm": 1.1950759441928243, |
|
"kl": 0.13818359375, |
|
"learning_rate": 9.909874022805356e-07, |
|
"loss": 0.0055, |
|
"reward": 1.7890625, |
|
"reward_std": 0.13098490610718727, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 96.640625, |
|
"epoch": 0.018074988796494548, |
|
"grad_norm": 1.57707588469271, |
|
"kl": 0.13916015625, |
|
"learning_rate": 9.909625056017527e-07, |
|
"loss": 0.0056, |
|
"reward": 1.90625, |
|
"reward_std": 0.13527751341462135, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 96.421875, |
|
"epoch": 0.01812478215406065, |
|
"grad_norm": 4.918152718075018, |
|
"kl": 0.2607421875, |
|
"learning_rate": 9.909376089229696e-07, |
|
"loss": 0.0104, |
|
"reward": 1.8828125, |
|
"reward_std": 0.15773098915815353, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 100.421875, |
|
"epoch": 0.01817457551162675, |
|
"grad_norm": 2.170305261344841, |
|
"kl": 0.1162109375, |
|
"learning_rate": 9.909127122441865e-07, |
|
"loss": 0.0047, |
|
"reward": 1.7109375, |
|
"reward_std": 0.20175684243440628, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7265625, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 100.4765625, |
|
"epoch": 0.01822436886919285, |
|
"grad_norm": 1.7937145211228676, |
|
"kl": 0.109130859375, |
|
"learning_rate": 9.908878155654037e-07, |
|
"loss": 0.0044, |
|
"reward": 1.796875, |
|
"reward_std": 0.1659901700913906, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 97.421875, |
|
"epoch": 0.018274162226758952, |
|
"grad_norm": 1.7316724169513968, |
|
"kl": 0.12744140625, |
|
"learning_rate": 9.908629188866204e-07, |
|
"loss": 0.0051, |
|
"reward": 1.796875, |
|
"reward_std": 0.19226360321044922, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 94.8359375, |
|
"epoch": 0.01832395558432505, |
|
"grad_norm": 1.499286282687084, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.908380222078375e-07, |
|
"loss": 0.0056, |
|
"reward": 1.7734375, |
|
"reward_std": 0.1857697032392025, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 92.125, |
|
"epoch": 0.01837374894189115, |
|
"grad_norm": 2.0995638315484375, |
|
"kl": 0.13232421875, |
|
"learning_rate": 9.908131255290544e-07, |
|
"loss": 0.0053, |
|
"reward": 1.8046875, |
|
"reward_std": 0.1927535980939865, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 97.6640625, |
|
"epoch": 0.018423542299457252, |
|
"grad_norm": 1.1517851044526053, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.907882288502713e-07, |
|
"loss": 0.0055, |
|
"reward": 1.796875, |
|
"reward_std": 0.17623990774154663, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 101.921875, |
|
"epoch": 0.018473335657023353, |
|
"grad_norm": 1.8011882086471984, |
|
"kl": 0.13525390625, |
|
"learning_rate": 9.907633321714883e-07, |
|
"loss": 0.0054, |
|
"reward": 1.7421875, |
|
"reward_std": 0.23876509815454483, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 98.7578125, |
|
"epoch": 0.018523129014589454, |
|
"grad_norm": 3.1813018595247597, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.907384354927052e-07, |
|
"loss": 0.0055, |
|
"reward": 1.828125, |
|
"reward_std": 0.19727616012096405, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 97.578125, |
|
"epoch": 0.018572922372155555, |
|
"grad_norm": 2.2131678528637653, |
|
"kl": 0.123291015625, |
|
"learning_rate": 9.90713538813922e-07, |
|
"loss": 0.0049, |
|
"reward": 1.859375, |
|
"reward_std": 0.1677432656288147, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 99.6875, |
|
"epoch": 0.018622715729721656, |
|
"grad_norm": 1.3699739483339763, |
|
"kl": 0.140625, |
|
"learning_rate": 9.906886421351392e-07, |
|
"loss": 0.0056, |
|
"reward": 1.84375, |
|
"reward_std": 0.14620039984583855, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 99.0546875, |
|
"epoch": 0.018672509087287757, |
|
"grad_norm": 0.9069774166590702, |
|
"kl": 0.1123046875, |
|
"learning_rate": 9.906637454563561e-07, |
|
"loss": 0.0045, |
|
"reward": 1.96875, |
|
"reward_std": 0.07312605530023575, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 98.671875, |
|
"epoch": 0.018722302444853855, |
|
"grad_norm": 1.1953320492810002, |
|
"kl": 0.118408203125, |
|
"learning_rate": 9.90638848777573e-07, |
|
"loss": 0.0047, |
|
"reward": 1.8984375, |
|
"reward_std": 0.09522313624620438, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 105.3671875, |
|
"epoch": 0.018772095802419956, |
|
"grad_norm": 3.2370628282700844, |
|
"kl": 0.125244140625, |
|
"learning_rate": 9.9061395209879e-07, |
|
"loss": 0.005, |
|
"reward": 1.765625, |
|
"reward_std": 0.10653370432555676, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 100.0, |
|
"epoch": 0.018821889159986057, |
|
"grad_norm": 1.2697251767374456, |
|
"kl": 0.110107421875, |
|
"learning_rate": 9.905890554200069e-07, |
|
"loss": 0.0044, |
|
"reward": 1.8359375, |
|
"reward_std": 0.1054728701710701, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 110.7890625, |
|
"epoch": 0.018871682517552158, |
|
"grad_norm": 2.2971174682186533, |
|
"kl": 0.12841796875, |
|
"learning_rate": 9.905641587412238e-07, |
|
"loss": 0.0051, |
|
"reward": 1.734375, |
|
"reward_std": 0.16151439398527145, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 98.828125, |
|
"epoch": 0.01892147587511826, |
|
"grad_norm": 1.184205164506072, |
|
"kl": 0.118896484375, |
|
"learning_rate": 9.90539262062441e-07, |
|
"loss": 0.0048, |
|
"reward": 1.7578125, |
|
"reward_std": 0.1054728776216507, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 99.5234375, |
|
"epoch": 0.01897126923268436, |
|
"grad_norm": 2.1275123190764305, |
|
"kl": 0.146484375, |
|
"learning_rate": 9.905143653836576e-07, |
|
"loss": 0.0059, |
|
"reward": 1.828125, |
|
"reward_std": 0.19007781893014908, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 95.375, |
|
"epoch": 0.01902106259025046, |
|
"grad_norm": 4.8216422170101545, |
|
"kl": 0.150390625, |
|
"learning_rate": 9.904894687048748e-07, |
|
"loss": 0.006, |
|
"reward": 1.8828125, |
|
"reward_std": 0.10547287575900555, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 104.2265625, |
|
"epoch": 0.019070855947816562, |
|
"grad_norm": 5.0443235142603875, |
|
"kl": 0.1142578125, |
|
"learning_rate": 9.904645720260917e-07, |
|
"loss": 0.0046, |
|
"reward": 1.953125, |
|
"reward_std": 0.13258251547813416, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 96.28125, |
|
"epoch": 0.019120649305382664, |
|
"grad_norm": 0.6707868851123492, |
|
"kl": 0.12646484375, |
|
"learning_rate": 9.904396753473086e-07, |
|
"loss": 0.0051, |
|
"reward": 1.9140625, |
|
"reward_std": 0.03234682232141495, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 104.2734375, |
|
"epoch": 0.01917044266294876, |
|
"grad_norm": 2.45741205443543, |
|
"kl": 0.11376953125, |
|
"learning_rate": 9.904147786685255e-07, |
|
"loss": 0.0046, |
|
"reward": 1.890625, |
|
"reward_std": 0.1519743576645851, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 106.3984375, |
|
"epoch": 0.019220236020514862, |
|
"grad_norm": 1.0495748496704251, |
|
"kl": 0.103271484375, |
|
"learning_rate": 9.903898819897424e-07, |
|
"loss": 0.0041, |
|
"reward": 1.9765625, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9765625, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 104.09375, |
|
"epoch": 0.019270029378080963, |
|
"grad_norm": 0.8125729547717747, |
|
"kl": 0.107421875, |
|
"learning_rate": 9.903649853109596e-07, |
|
"loss": 0.0043, |
|
"reward": 1.8984375, |
|
"reward_std": 0.03234682232141495, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 98.28125, |
|
"epoch": 0.019319822735647064, |
|
"grad_norm": 2.689506622548591, |
|
"kl": 0.11572265625, |
|
"learning_rate": 9.903400886321765e-07, |
|
"loss": 0.0046, |
|
"reward": 1.875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 105.2421875, |
|
"epoch": 0.019369616093213166, |
|
"grad_norm": 1.8319766333651575, |
|
"kl": 0.142333984375, |
|
"learning_rate": 9.903151919533934e-07, |
|
"loss": 0.0057, |
|
"reward": 1.890625, |
|
"reward_std": 0.1462521031498909, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 101.2890625, |
|
"epoch": 0.019419409450779267, |
|
"grad_norm": 1.1697800221914707, |
|
"kl": 0.12109375, |
|
"learning_rate": 9.902902952746103e-07, |
|
"loss": 0.0048, |
|
"reward": 1.9609375, |
|
"reward_std": 0.09522313438355923, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 102.28125, |
|
"epoch": 0.019469202808345368, |
|
"grad_norm": 0.9996355166487594, |
|
"kl": 0.1533203125, |
|
"learning_rate": 9.902653985958272e-07, |
|
"loss": 0.0061, |
|
"reward": 1.921875, |
|
"reward_std": 0.06233953312039375, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 99.109375, |
|
"epoch": 0.01951899616591147, |
|
"grad_norm": 1.6706965485925356, |
|
"kl": 0.124267578125, |
|
"learning_rate": 9.902405019170442e-07, |
|
"loss": 0.005, |
|
"reward": 1.7421875, |
|
"reward_std": 0.1207351740449667, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7421875, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 109.375, |
|
"epoch": 0.019568789523477566, |
|
"grad_norm": 3.581644525089151, |
|
"kl": 0.12744140625, |
|
"learning_rate": 9.902156052382613e-07, |
|
"loss": 0.0051, |
|
"reward": 1.9453125, |
|
"reward_std": 0.10094539821147919, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 99.953125, |
|
"epoch": 0.019618582881043668, |
|
"grad_norm": 2.599868565195125, |
|
"kl": 0.12255859375, |
|
"learning_rate": 9.901907085594782e-07, |
|
"loss": 0.0049, |
|
"reward": 1.8046875, |
|
"reward_std": 0.11914245784282684, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8046875, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 106.6484375, |
|
"epoch": 0.01966837623860977, |
|
"grad_norm": 1.1968477451913047, |
|
"kl": 0.13330078125, |
|
"learning_rate": 9.901658118806951e-07, |
|
"loss": 0.0053, |
|
"reward": 1.8125, |
|
"reward_std": 0.14283225685358047, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 108.703125, |
|
"epoch": 0.01971816959617587, |
|
"grad_norm": 1.2790337556954594, |
|
"kl": 0.12744140625, |
|
"learning_rate": 9.90140915201912e-07, |
|
"loss": 0.0051, |
|
"reward": 1.859375, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 99.1953125, |
|
"epoch": 0.01976796295374197, |
|
"grad_norm": 1.1002164035568978, |
|
"kl": 0.138671875, |
|
"learning_rate": 9.90116018523129e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8203125, |
|
"reward_std": 0.11353681795299053, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 98.7578125, |
|
"epoch": 0.019817756311308072, |
|
"grad_norm": 12.114373736401388, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.900911218443459e-07, |
|
"loss": 0.0056, |
|
"reward": 1.890625, |
|
"reward_std": 0.16151439398527145, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 103.8203125, |
|
"epoch": 0.019867549668874173, |
|
"grad_norm": 1.386622375590888, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.900662251655628e-07, |
|
"loss": 0.0057, |
|
"reward": 1.875, |
|
"reward_std": 0.1065337099134922, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 104.8671875, |
|
"epoch": 0.019917343026440274, |
|
"grad_norm": 2.69974158968377, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.900413284867797e-07, |
|
"loss": 0.006, |
|
"reward": 1.6171875, |
|
"reward_std": 0.17282496765255928, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.6328125, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 103.640625, |
|
"epoch": 0.019967136384006375, |
|
"grad_norm": 1.7197971306683453, |
|
"kl": 0.14453125, |
|
"learning_rate": 9.900164318079968e-07, |
|
"loss": 0.0058, |
|
"reward": 1.78125, |
|
"reward_std": 0.1820138692855835, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 101.1328125, |
|
"epoch": 0.020016929741572473, |
|
"grad_norm": 1.2522993480139728, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.899915351292138e-07, |
|
"loss": 0.0055, |
|
"reward": 1.921875, |
|
"reward_std": 0.10634563490748405, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 101.7421875, |
|
"epoch": 0.020066723099138574, |
|
"grad_norm": 2.466207725150392, |
|
"kl": 0.1591796875, |
|
"learning_rate": 9.899666384504307e-07, |
|
"loss": 0.0064, |
|
"reward": 1.7734375, |
|
"reward_std": 0.21537472307682037, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 100.265625, |
|
"epoch": 0.020116516456704675, |
|
"grad_norm": 8.104464261305123, |
|
"kl": 0.13427734375, |
|
"learning_rate": 9.899417417716476e-07, |
|
"loss": 0.0054, |
|
"reward": 1.875, |
|
"reward_std": 0.15072787553071976, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 101.7890625, |
|
"epoch": 0.020166309814270776, |
|
"grad_norm": 1.3422086840090581, |
|
"kl": 0.169921875, |
|
"learning_rate": 9.899168450928645e-07, |
|
"loss": 0.0068, |
|
"reward": 1.96875, |
|
"reward_std": 0.0731260534375906, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9765625, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 107.9140625, |
|
"epoch": 0.020216103171836877, |
|
"grad_norm": 2.520847147919029, |
|
"kl": 0.145263671875, |
|
"learning_rate": 9.898919484140814e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8046875, |
|
"reward_std": 0.25673753023147583, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 105.4921875, |
|
"epoch": 0.02026589652940298, |
|
"grad_norm": 2.9518749678536884, |
|
"kl": 0.12548828125, |
|
"learning_rate": 9.898670517352986e-07, |
|
"loss": 0.005, |
|
"reward": 1.90625, |
|
"reward_std": 0.07760182581841946, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 407 |
|
}, |
|
{ |
|
"completion_length": 107.2109375, |
|
"epoch": 0.02031568988696908, |
|
"grad_norm": 3.104097773131322, |
|
"kl": 0.12939453125, |
|
"learning_rate": 9.898421550565155e-07, |
|
"loss": 0.0052, |
|
"reward": 1.859375, |
|
"reward_std": 0.16222409904003143, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.875, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 100.46875, |
|
"epoch": 0.02036548324453518, |
|
"grad_norm": 2.660618449177582, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.898172583777324e-07, |
|
"loss": 0.0057, |
|
"reward": 1.8203125, |
|
"reward_std": 0.1649293452501297, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 409 |
|
}, |
|
{ |
|
"completion_length": 102.140625, |
|
"epoch": 0.020415276602101278, |
|
"grad_norm": 1.0792718753711772, |
|
"kl": 0.14501953125, |
|
"learning_rate": 9.897923616989493e-07, |
|
"loss": 0.0058, |
|
"reward": 1.7265625, |
|
"reward_std": 0.13888052850961685, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 103.8515625, |
|
"epoch": 0.02046506995966738, |
|
"grad_norm": 0.7979162652767577, |
|
"kl": 0.129638671875, |
|
"learning_rate": 9.897674650201662e-07, |
|
"loss": 0.0052, |
|
"reward": 1.875, |
|
"reward_std": 0.13563390634953976, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 106.6015625, |
|
"epoch": 0.02051486331723348, |
|
"grad_norm": 3.1592524401487014, |
|
"kl": 0.116455078125, |
|
"learning_rate": 9.897425683413834e-07, |
|
"loss": 0.0047, |
|
"reward": 1.84375, |
|
"reward_std": 0.14172462187707424, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 104.3046875, |
|
"epoch": 0.02056465667479958, |
|
"grad_norm": 2.172706697436291, |
|
"kl": 0.13330078125, |
|
"learning_rate": 9.897176716626e-07, |
|
"loss": 0.0053, |
|
"reward": 1.734375, |
|
"reward_std": 0.18147709220647812, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.734375, |
|
"step": 413 |
|
}, |
|
{ |
|
"completion_length": 101.90625, |
|
"epoch": 0.020614450032365682, |
|
"grad_norm": 1.7805805389907776, |
|
"kl": 0.13134765625, |
|
"learning_rate": 9.896927749838172e-07, |
|
"loss": 0.0052, |
|
"reward": 1.6640625, |
|
"reward_std": 0.17676395922899246, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.6796875, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 101.5703125, |
|
"epoch": 0.020664243389931784, |
|
"grad_norm": 0.6603270714713876, |
|
"kl": 0.1416015625, |
|
"learning_rate": 9.896678783050341e-07, |
|
"loss": 0.0057, |
|
"reward": 1.8046875, |
|
"reward_std": 0.06629125960171223, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 107.3046875, |
|
"epoch": 0.020714036747497885, |
|
"grad_norm": 1.2448176474313857, |
|
"kl": 0.119873046875, |
|
"learning_rate": 9.89642981626251e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8671875, |
|
"reward_std": 0.16203844547271729, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.875, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 100.65625, |
|
"epoch": 0.020763830105063986, |
|
"grad_norm": 0.8395144531177806, |
|
"kl": 0.1396484375, |
|
"learning_rate": 9.89618084947468e-07, |
|
"loss": 0.0056, |
|
"reward": 1.7890625, |
|
"reward_std": 0.0765409991145134, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 106.890625, |
|
"epoch": 0.020813623462630087, |
|
"grad_norm": 1.1286508071956538, |
|
"kl": 0.118896484375, |
|
"learning_rate": 9.895931882686849e-07, |
|
"loss": 0.0048, |
|
"reward": 1.953125, |
|
"reward_std": 0.1325825210660696, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 107.0625, |
|
"epoch": 0.020863416820196184, |
|
"grad_norm": 0.8583986202699341, |
|
"kl": 0.119873046875, |
|
"learning_rate": 9.895682915899018e-07, |
|
"loss": 0.0048, |
|
"reward": 1.9296875, |
|
"reward_std": 0.10994865000247955, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 419 |
|
}, |
|
{ |
|
"completion_length": 110.5234375, |
|
"epoch": 0.020913210177762286, |
|
"grad_norm": 5.59520071206407, |
|
"kl": 0.130859375, |
|
"learning_rate": 9.89543394911119e-07, |
|
"loss": 0.0052, |
|
"reward": 1.7421875, |
|
"reward_std": 0.11962754279375076, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.7578125, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 99.4921875, |
|
"epoch": 0.020963003535328387, |
|
"grad_norm": 1.8446850482639763, |
|
"kl": 0.1376953125, |
|
"learning_rate": 9.895184982323358e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8515625, |
|
"reward_std": 0.24306794255971909, |
|
"rewards/format_reward": 0.953125, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 421 |
|
}, |
|
{ |
|
"completion_length": 105.7265625, |
|
"epoch": 0.021012796892894488, |
|
"grad_norm": 1.239169481355673, |
|
"kl": 0.1318359375, |
|
"learning_rate": 9.894936015535527e-07, |
|
"loss": 0.0053, |
|
"reward": 1.9453125, |
|
"reward_std": 0.12415501475334167, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 106.109375, |
|
"epoch": 0.02106259025046059, |
|
"grad_norm": 1.3106805901225493, |
|
"kl": 0.1162109375, |
|
"learning_rate": 9.894687048747697e-07, |
|
"loss": 0.0046, |
|
"reward": 1.84375, |
|
"reward_std": 0.1643974706530571, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 106.6171875, |
|
"epoch": 0.02111238360802669, |
|
"grad_norm": 2.432770103483429, |
|
"kl": 0.1474609375, |
|
"learning_rate": 9.894438081959866e-07, |
|
"loss": 0.0059, |
|
"reward": 1.65625, |
|
"reward_std": 0.2623106837272644, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.6875, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 103.5859375, |
|
"epoch": 0.02116217696559279, |
|
"grad_norm": 1.0794467870648388, |
|
"kl": 0.134033203125, |
|
"learning_rate": 9.894189115172035e-07, |
|
"loss": 0.0054, |
|
"reward": 1.921875, |
|
"reward_std": 0.09753045439720154, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 102.796875, |
|
"epoch": 0.021211970323158892, |
|
"grad_norm": 1.1422490781487054, |
|
"kl": 0.1435546875, |
|
"learning_rate": 9.893940148384206e-07, |
|
"loss": 0.0057, |
|
"reward": 1.890625, |
|
"reward_std": 0.07935492694377899, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 108.53125, |
|
"epoch": 0.02126176368072499, |
|
"grad_norm": 0.3848385553711625, |
|
"kl": 0.12353515625, |
|
"learning_rate": 9.893691181596373e-07, |
|
"loss": 0.0049, |
|
"reward": 1.921875, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 427 |
|
}, |
|
{ |
|
"completion_length": 106.5, |
|
"epoch": 0.02131155703829109, |
|
"grad_norm": 2.0603763523362764, |
|
"kl": 0.1337890625, |
|
"learning_rate": 9.893442214808545e-07, |
|
"loss": 0.0054, |
|
"reward": 1.828125, |
|
"reward_std": 0.24936595559120178, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 110.234375, |
|
"epoch": 0.021361350395857192, |
|
"grad_norm": 4.284844193367983, |
|
"kl": 0.1201171875, |
|
"learning_rate": 9.893193248020714e-07, |
|
"loss": 0.0048, |
|
"reward": 1.8671875, |
|
"reward_std": 0.06629125773906708, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 108.7578125, |
|
"epoch": 0.021411143753423293, |
|
"grad_norm": 3.7805170953465925, |
|
"kl": 0.133544921875, |
|
"learning_rate": 9.892944281232883e-07, |
|
"loss": 0.0053, |
|
"reward": 1.90625, |
|
"reward_std": 0.17965975403785706, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 109.0234375, |
|
"epoch": 0.021460937110989394, |
|
"grad_norm": 25.46747324216285, |
|
"kl": 0.130859375, |
|
"learning_rate": 9.892695314445052e-07, |
|
"loss": 0.0052, |
|
"reward": 1.84375, |
|
"reward_std": 0.09863808006048203, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 431 |
|
}, |
|
{ |
|
"completion_length": 109.1484375, |
|
"epoch": 0.021510730468555495, |
|
"grad_norm": 1.3885752931019502, |
|
"kl": 0.1240234375, |
|
"learning_rate": 9.892446347657221e-07, |
|
"loss": 0.005, |
|
"reward": 1.90625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 105.28125, |
|
"epoch": 0.021560523826121596, |
|
"grad_norm": 2.9208100258993395, |
|
"kl": 0.20849609375, |
|
"learning_rate": 9.89219738086939e-07, |
|
"loss": 0.0083, |
|
"reward": 1.8046875, |
|
"reward_std": 0.11048543080687523, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 433 |
|
}, |
|
{ |
|
"completion_length": 106.6015625, |
|
"epoch": 0.021610317183687697, |
|
"grad_norm": 1.0572970251706457, |
|
"kl": 0.130126953125, |
|
"learning_rate": 9.891948414081562e-07, |
|
"loss": 0.0052, |
|
"reward": 1.984375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.984375, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 107.625, |
|
"epoch": 0.021660110541253795, |
|
"grad_norm": 4.640354600838917, |
|
"kl": 0.12451171875, |
|
"learning_rate": 9.89169944729373e-07, |
|
"loss": 0.005, |
|
"reward": 1.8125, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8125, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 101.84375, |
|
"epoch": 0.021709903898819896, |
|
"grad_norm": 3.25264871697286, |
|
"kl": 0.134765625, |
|
"learning_rate": 9.8914504805059e-07, |
|
"loss": 0.0054, |
|
"reward": 1.9765625, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9765625, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 103.171875, |
|
"epoch": 0.021759697256385997, |
|
"grad_norm": 1.6726799535676136, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.89120151371807e-07, |
|
"loss": 0.0057, |
|
"reward": 1.921875, |
|
"reward_std": 0.16723665595054626, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 437 |
|
}, |
|
{ |
|
"completion_length": 106.9140625, |
|
"epoch": 0.0218094906139521, |
|
"grad_norm": 2.2171035860724038, |
|
"kl": 0.13720703125, |
|
"learning_rate": 9.890952546930239e-07, |
|
"loss": 0.0055, |
|
"reward": 1.703125, |
|
"reward_std": 0.15650184452533722, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.703125, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 107.28125, |
|
"epoch": 0.0218592839715182, |
|
"grad_norm": 1.2185841409268514, |
|
"kl": 0.14013671875, |
|
"learning_rate": 9.89070358014241e-07, |
|
"loss": 0.0056, |
|
"reward": 1.8125, |
|
"reward_std": 0.2306838184595108, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 439 |
|
}, |
|
{ |
|
"completion_length": 103.375, |
|
"epoch": 0.0219090773290843, |
|
"grad_norm": 1.211666275673379, |
|
"kl": 0.14208984375, |
|
"learning_rate": 9.89045461335458e-07, |
|
"loss": 0.0057, |
|
"reward": 1.90625, |
|
"reward_std": 0.11279274895787239, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 105.703125, |
|
"epoch": 0.0219588706866504, |
|
"grad_norm": 1.060456626560434, |
|
"kl": 0.1416015625, |
|
"learning_rate": 9.890205646566748e-07, |
|
"loss": 0.0057, |
|
"reward": 1.90625, |
|
"reward_std": 0.12756995856761932, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 100.53125, |
|
"epoch": 0.022008664044216503, |
|
"grad_norm": 2.9632335060925987, |
|
"kl": 0.13818359375, |
|
"learning_rate": 9.889956679778917e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8671875, |
|
"reward_std": 0.1649293452501297, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 100.3515625, |
|
"epoch": 0.022058457401782604, |
|
"grad_norm": 1.4074621078233351, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.889707712991087e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8671875, |
|
"reward_std": 0.15106385201215744, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 443 |
|
}, |
|
{ |
|
"completion_length": 103.0234375, |
|
"epoch": 0.0221082507593487, |
|
"grad_norm": 2.056323559284852, |
|
"kl": 0.126953125, |
|
"learning_rate": 9.889458746203256e-07, |
|
"loss": 0.0051, |
|
"reward": 1.9453125, |
|
"reward_std": 0.1054728738963604, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 100.671875, |
|
"epoch": 0.022158044116914802, |
|
"grad_norm": 1.5520548620576045, |
|
"kl": 0.1591796875, |
|
"learning_rate": 9.889209779415425e-07, |
|
"loss": 0.0064, |
|
"reward": 1.828125, |
|
"reward_std": 0.21143083274364471, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 100.4921875, |
|
"epoch": 0.022207837474480904, |
|
"grad_norm": 0.919045163804287, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.888960812627594e-07, |
|
"loss": 0.006, |
|
"reward": 1.890625, |
|
"reward_std": 0.15769661217927933, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 103.78125, |
|
"epoch": 0.022257630832047005, |
|
"grad_norm": 1.477275393573811, |
|
"kl": 0.1494140625, |
|
"learning_rate": 9.888711845839765e-07, |
|
"loss": 0.006, |
|
"reward": 1.921875, |
|
"reward_std": 0.1751839816570282, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.9453125, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 104.8125, |
|
"epoch": 0.022307424189613106, |
|
"grad_norm": 3.3800076484718904, |
|
"kl": 0.14892578125, |
|
"learning_rate": 9.888462879051935e-07, |
|
"loss": 0.006, |
|
"reward": 1.8203125, |
|
"reward_std": 0.13941730558872223, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 104.9765625, |
|
"epoch": 0.022357217547179207, |
|
"grad_norm": 1.526273039167759, |
|
"kl": 0.1513671875, |
|
"learning_rate": 9.888213912264104e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8203125, |
|
"reward_std": 0.1207351740449667, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 449 |
|
}, |
|
{ |
|
"completion_length": 106.3359375, |
|
"epoch": 0.022407010904745308, |
|
"grad_norm": 2.8208080775190596, |
|
"kl": 0.13671875, |
|
"learning_rate": 9.887964945476273e-07, |
|
"loss": 0.0055, |
|
"reward": 1.78125, |
|
"reward_std": 0.08838834427297115, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.7890625, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 104.9296875, |
|
"epoch": 0.02245680426231141, |
|
"grad_norm": 1.7731793114382703, |
|
"kl": 0.1484375, |
|
"learning_rate": 9.887715978688442e-07, |
|
"loss": 0.0059, |
|
"reward": 1.75, |
|
"reward_std": 0.33486589789390564, |
|
"rewards/format_reward": 0.953125, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 451 |
|
}, |
|
{ |
|
"completion_length": 104.296875, |
|
"epoch": 0.022506597619877507, |
|
"grad_norm": 1.7084659505677322, |
|
"kl": 0.14453125, |
|
"learning_rate": 9.887467011900611e-07, |
|
"loss": 0.0058, |
|
"reward": 1.84375, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 108.3828125, |
|
"epoch": 0.022556390977443608, |
|
"grad_norm": 2.186224807486325, |
|
"kl": 0.14306640625, |
|
"learning_rate": 9.887218045112783e-07, |
|
"loss": 0.0057, |
|
"reward": 1.796875, |
|
"reward_std": 0.1751839779317379, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 106.0546875, |
|
"epoch": 0.02260618433500971, |
|
"grad_norm": 1.9998022286226829, |
|
"kl": 0.15625, |
|
"learning_rate": 9.886969078324952e-07, |
|
"loss": 0.0063, |
|
"reward": 1.9140625, |
|
"reward_std": 0.0765409991145134, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9140625, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 101.890625, |
|
"epoch": 0.02265597769257581, |
|
"grad_norm": 3.274181955559814, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.88672011153712e-07, |
|
"loss": 0.0063, |
|
"reward": 1.859375, |
|
"reward_std": 0.1354655846953392, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 103.1015625, |
|
"epoch": 0.02270577105014191, |
|
"grad_norm": 1.1019710890616639, |
|
"kl": 0.1513671875, |
|
"learning_rate": 9.88647114474929e-07, |
|
"loss": 0.0061, |
|
"reward": 1.859375, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 101.859375, |
|
"epoch": 0.022755564407708012, |
|
"grad_norm": 2.034628546119833, |
|
"kl": 0.16552734375, |
|
"learning_rate": 9.88622217796146e-07, |
|
"loss": 0.0066, |
|
"reward": 1.8359375, |
|
"reward_std": 0.09522314369678497, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8515625, |
|
"step": 457 |
|
}, |
|
{ |
|
"completion_length": 100.9296875, |
|
"epoch": 0.022805357765274113, |
|
"grad_norm": 6.010880307814223, |
|
"kl": 0.15087890625, |
|
"learning_rate": 9.885973211173628e-07, |
|
"loss": 0.006, |
|
"reward": 1.8828125, |
|
"reward_std": 0.21937324851751328, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 104.640625, |
|
"epoch": 0.022855151122840214, |
|
"grad_norm": 4.489741661737741, |
|
"kl": 0.18212890625, |
|
"learning_rate": 9.885724244385798e-07, |
|
"loss": 0.0073, |
|
"reward": 1.7109375, |
|
"reward_std": 0.27168767154216766, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 105.6328125, |
|
"epoch": 0.022904944480406315, |
|
"grad_norm": 0.9733419328869521, |
|
"kl": 0.1533203125, |
|
"learning_rate": 9.885475277597967e-07, |
|
"loss": 0.0061, |
|
"reward": 1.828125, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 103.484375, |
|
"epoch": 0.022954737837972413, |
|
"grad_norm": 2.7677154473436314, |
|
"kl": 0.15380859375, |
|
"learning_rate": 9.885226310810138e-07, |
|
"loss": 0.0062, |
|
"reward": 1.9296875, |
|
"reward_std": 0.11336849629878998, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 461 |
|
}, |
|
{ |
|
"completion_length": 106.265625, |
|
"epoch": 0.023004531195538514, |
|
"grad_norm": 1.451172692034042, |
|
"kl": 0.146484375, |
|
"learning_rate": 9.884977344022307e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8203125, |
|
"reward_std": 0.14389308542013168, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 106.1328125, |
|
"epoch": 0.023054324553104615, |
|
"grad_norm": 1.0025729288333158, |
|
"kl": 0.14013671875, |
|
"learning_rate": 9.884728377234476e-07, |
|
"loss": 0.0056, |
|
"reward": 1.9609375, |
|
"reward_std": 0.09522314183413982, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9609375, |
|
"step": 463 |
|
}, |
|
{ |
|
"completion_length": 108.5546875, |
|
"epoch": 0.023104117910670716, |
|
"grad_norm": 2.073188082861282, |
|
"kl": 0.150390625, |
|
"learning_rate": 9.884479410446646e-07, |
|
"loss": 0.006, |
|
"reward": 1.8046875, |
|
"reward_std": 0.20033245533704758, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 111.265625, |
|
"epoch": 0.023153911268236817, |
|
"grad_norm": 1.8859982820608152, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.884230443658815e-07, |
|
"loss": 0.0057, |
|
"reward": 1.671875, |
|
"reward_std": 0.12756995856761932, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.6796875, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 108.421875, |
|
"epoch": 0.02320370462580292, |
|
"grad_norm": 1.36239297718611, |
|
"kl": 0.1552734375, |
|
"learning_rate": 9.883981476870986e-07, |
|
"loss": 0.0062, |
|
"reward": 1.859375, |
|
"reward_std": 0.1530819907784462, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 107.3515625, |
|
"epoch": 0.02325349798336902, |
|
"grad_norm": 1.537929387248317, |
|
"kl": 0.142578125, |
|
"learning_rate": 9.883732510083155e-07, |
|
"loss": 0.0057, |
|
"reward": 1.8984375, |
|
"reward_std": 0.09522313997149467, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 467 |
|
}, |
|
{ |
|
"completion_length": 104.6484375, |
|
"epoch": 0.02330329134093512, |
|
"grad_norm": 1.4377231285009622, |
|
"kl": 0.14453125, |
|
"learning_rate": 9.883483543295324e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8671875, |
|
"reward_std": 0.12019838392734528, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 105.5625, |
|
"epoch": 0.02335308469850122, |
|
"grad_norm": 0.9226007760964924, |
|
"kl": 0.131591796875, |
|
"learning_rate": 9.883234576507494e-07, |
|
"loss": 0.0053, |
|
"reward": 1.9453125, |
|
"reward_std": 0.10094539821147919, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 469 |
|
}, |
|
{ |
|
"completion_length": 103.8828125, |
|
"epoch": 0.02340287805606732, |
|
"grad_norm": 1.0439864254701836, |
|
"kl": 0.16748046875, |
|
"learning_rate": 9.882985609719663e-07, |
|
"loss": 0.0067, |
|
"reward": 1.875, |
|
"reward_std": 0.11678344011306763, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 102.7890625, |
|
"epoch": 0.02345267141363342, |
|
"grad_norm": 0.8519347229747519, |
|
"kl": 0.1484375, |
|
"learning_rate": 9.882736642931832e-07, |
|
"loss": 0.0059, |
|
"reward": 1.984375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.984375, |
|
"step": 471 |
|
}, |
|
{ |
|
"completion_length": 104.03125, |
|
"epoch": 0.02350246477119952, |
|
"grad_norm": 3.9532178592979537, |
|
"kl": 0.13037109375, |
|
"learning_rate": 9.882487676144003e-07, |
|
"loss": 0.0052, |
|
"reward": 1.875, |
|
"reward_std": 0.14806943386793137, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.875, |
|
"step": 472 |
|
}, |
|
{ |
|
"completion_length": 103.90625, |
|
"epoch": 0.023552258128765623, |
|
"grad_norm": 1.4602642353976163, |
|
"kl": 0.1591796875, |
|
"learning_rate": 9.88223870935617e-07, |
|
"loss": 0.0064, |
|
"reward": 1.890625, |
|
"reward_std": 0.09863808378577232, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 473 |
|
}, |
|
{ |
|
"completion_length": 106.125, |
|
"epoch": 0.023602051486331724, |
|
"grad_norm": 1.5608561778767236, |
|
"kl": 0.15576171875, |
|
"learning_rate": 9.881989742568342e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8828125, |
|
"reward_std": 0.14389308542013168, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 103.765625, |
|
"epoch": 0.023651844843897825, |
|
"grad_norm": 1.3727273258923491, |
|
"kl": 0.138671875, |
|
"learning_rate": 9.88174077578051e-07, |
|
"loss": 0.0055, |
|
"reward": 1.8359375, |
|
"reward_std": 0.12019838392734528, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8359375, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 111.890625, |
|
"epoch": 0.023701638201463926, |
|
"grad_norm": 6.728290652103424, |
|
"kl": 0.14306640625, |
|
"learning_rate": 9.88149180899268e-07, |
|
"loss": 0.0057, |
|
"reward": 1.6640625, |
|
"reward_std": 0.19674427807331085, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.6640625, |
|
"step": 476 |
|
}, |
|
{ |
|
"completion_length": 101.2109375, |
|
"epoch": 0.023751431559030027, |
|
"grad_norm": 1.9283080384935798, |
|
"kl": 0.15869140625, |
|
"learning_rate": 9.88124284220485e-07, |
|
"loss": 0.0063, |
|
"reward": 1.828125, |
|
"reward_std": 0.1530819945037365, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.828125, |
|
"step": 477 |
|
}, |
|
{ |
|
"completion_length": 109.3046875, |
|
"epoch": 0.023801224916596125, |
|
"grad_norm": 1.5920743917951639, |
|
"kl": 0.1572265625, |
|
"learning_rate": 9.880993875417018e-07, |
|
"loss": 0.0063, |
|
"reward": 1.703125, |
|
"reward_std": 0.23752351105213165, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.7265625, |
|
"step": 478 |
|
}, |
|
{ |
|
"completion_length": 103.078125, |
|
"epoch": 0.023851018274162226, |
|
"grad_norm": 1.4479296644833646, |
|
"kl": 0.14013671875, |
|
"learning_rate": 9.880744908629187e-07, |
|
"loss": 0.0056, |
|
"reward": 1.84375, |
|
"reward_std": 0.1020579319447279, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.84375, |
|
"step": 479 |
|
}, |
|
{ |
|
"completion_length": 103.4140625, |
|
"epoch": 0.023900811631728327, |
|
"grad_norm": 1.7626997175565988, |
|
"kl": 0.14404296875, |
|
"learning_rate": 9.880495941841359e-07, |
|
"loss": 0.0058, |
|
"reward": 1.8984375, |
|
"reward_std": 0.09069566056132317, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.90625, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 102.84375, |
|
"epoch": 0.023950604989294428, |
|
"grad_norm": 1.3488570567034721, |
|
"kl": 0.15185546875, |
|
"learning_rate": 9.880246975053528e-07, |
|
"loss": 0.0061, |
|
"reward": 1.8671875, |
|
"reward_std": 0.14513956755399704, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.890625, |
|
"step": 481 |
|
}, |
|
{ |
|
"completion_length": 105.921875, |
|
"epoch": 0.02400039834686053, |
|
"grad_norm": 1.5192301089699045, |
|
"kl": 0.154296875, |
|
"learning_rate": 9.879998008265697e-07, |
|
"loss": 0.0062, |
|
"reward": 1.8828125, |
|
"reward_std": 0.19704129546880722, |
|
"rewards/format_reward": 0.9609375, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 482 |
|
}, |
|
{ |
|
"completion_length": 104.140625, |
|
"epoch": 0.02405019170442663, |
|
"grad_norm": 2.0745024198809747, |
|
"kl": 0.16357421875, |
|
"learning_rate": 9.879749041477866e-07, |
|
"loss": 0.0065, |
|
"reward": 1.75, |
|
"reward_std": 0.19886352866888046, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.765625, |
|
"step": 483 |
|
}, |
|
{ |
|
"completion_length": 105.7265625, |
|
"epoch": 0.02409998506199273, |
|
"grad_norm": 1.2406324281185725, |
|
"kl": 0.1435546875, |
|
"learning_rate": 9.879500074690035e-07, |
|
"loss": 0.0058, |
|
"reward": 1.6796875, |
|
"reward_std": 0.14855941385030746, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.7109375, |
|
"step": 484 |
|
}, |
|
{ |
|
"completion_length": 104.3359375, |
|
"epoch": 0.024149778419558832, |
|
"grad_norm": 1.5231890908193364, |
|
"kl": 0.15087890625, |
|
"learning_rate": 9.879251107902205e-07, |
|
"loss": 0.006, |
|
"reward": 1.7578125, |
|
"reward_std": 0.193861223757267, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 103.765625, |
|
"epoch": 0.02419957177712493, |
|
"grad_norm": 1.1293010761774125, |
|
"kl": 0.1796875, |
|
"learning_rate": 9.879002141114376e-07, |
|
"loss": 0.0072, |
|
"reward": 1.8828125, |
|
"reward_std": 0.13941731303930283, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8984375, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 102.71875, |
|
"epoch": 0.02424936513469103, |
|
"grad_norm": 1.7761059615407961, |
|
"kl": 0.15234375, |
|
"learning_rate": 9.878753174326543e-07, |
|
"loss": 0.0061, |
|
"reward": 1.921875, |
|
"reward_std": 0.15072788298130035, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9296875, |
|
"step": 487 |
|
}, |
|
{ |
|
"completion_length": 93.34375, |
|
"epoch": 0.024299158492257132, |
|
"grad_norm": 1.964291414192396, |
|
"kl": 0.18505859375, |
|
"learning_rate": 9.878504207538714e-07, |
|
"loss": 0.0074, |
|
"reward": 1.765625, |
|
"reward_std": 0.14795350283384323, |
|
"rewards/format_reward": 0.96875, |
|
"rewards/iou_reward": 0.796875, |
|
"step": 488 |
|
}, |
|
{ |
|
"completion_length": 100.8203125, |
|
"epoch": 0.024348951849823233, |
|
"grad_norm": 0.7277617054482011, |
|
"kl": 0.16748046875, |
|
"learning_rate": 9.878255240750883e-07, |
|
"loss": 0.0067, |
|
"reward": 1.96875, |
|
"reward_std": 0.05444390885531902, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.96875, |
|
"step": 489 |
|
}, |
|
{ |
|
"completion_length": 99.40625, |
|
"epoch": 0.024398745207389334, |
|
"grad_norm": 1.1702776487906323, |
|
"kl": 0.16748046875, |
|
"learning_rate": 9.878006273963053e-07, |
|
"loss": 0.0067, |
|
"reward": 1.8671875, |
|
"reward_std": 0.12863079085946083, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 97.0859375, |
|
"epoch": 0.024448538564955435, |
|
"grad_norm": 2.714386342878631, |
|
"kl": 0.16064453125, |
|
"learning_rate": 9.877757307175224e-07, |
|
"loss": 0.0064, |
|
"reward": 1.8515625, |
|
"reward_std": 0.2074790969491005, |
|
"rewards/format_reward": 0.984375, |
|
"rewards/iou_reward": 0.8671875, |
|
"step": 491 |
|
}, |
|
{ |
|
"completion_length": 101.8359375, |
|
"epoch": 0.024498331922521537, |
|
"grad_norm": 1.0387645335043558, |
|
"kl": 0.14697265625, |
|
"learning_rate": 9.87750834038739e-07, |
|
"loss": 0.0059, |
|
"reward": 1.8203125, |
|
"reward_std": 0.09522313438355923, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 99.53125, |
|
"epoch": 0.024548125280087638, |
|
"grad_norm": 0.7040826833051839, |
|
"kl": 0.15771484375, |
|
"learning_rate": 9.877259373599562e-07, |
|
"loss": 0.0063, |
|
"reward": 1.8828125, |
|
"reward_std": 0.05102896690368652, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.8828125, |
|
"step": 493 |
|
}, |
|
{ |
|
"completion_length": 104.375, |
|
"epoch": 0.02459791863765374, |
|
"grad_norm": 1.622053629854092, |
|
"kl": 0.16845703125, |
|
"learning_rate": 9.877010406811731e-07, |
|
"loss": 0.0067, |
|
"reward": 1.7578125, |
|
"reward_std": 0.14966704696416855, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.78125, |
|
"step": 494 |
|
}, |
|
{ |
|
"completion_length": 101.390625, |
|
"epoch": 0.024647711995219836, |
|
"grad_norm": 1969.1433146636798, |
|
"kl": 22.71142578125, |
|
"learning_rate": 9.8767614400239e-07, |
|
"loss": 0.9075, |
|
"reward": 1.796875, |
|
"reward_std": 0.20498371869325638, |
|
"rewards/format_reward": 0.9765625, |
|
"rewards/iou_reward": 0.8203125, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 97.296875, |
|
"epoch": 0.024697505352785937, |
|
"grad_norm": 1.1663268157684155, |
|
"kl": 0.181640625, |
|
"learning_rate": 9.87651247323607e-07, |
|
"loss": 0.0073, |
|
"reward": 1.9453125, |
|
"reward_std": 0.09969891421496868, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.953125, |
|
"step": 496 |
|
}, |
|
{ |
|
"completion_length": 99.7890625, |
|
"epoch": 0.02474729871035204, |
|
"grad_norm": 0.12937812351177783, |
|
"kl": 0.17822265625, |
|
"learning_rate": 9.87626350644824e-07, |
|
"loss": 0.0071, |
|
"reward": 1.9375, |
|
"reward_std": 0.0, |
|
"rewards/format_reward": 1.0, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 497 |
|
}, |
|
{ |
|
"completion_length": 97.7578125, |
|
"epoch": 0.02479709206791814, |
|
"grad_norm": 1.8140947072137117, |
|
"kl": 0.16943359375, |
|
"learning_rate": 9.876014539660408e-07, |
|
"loss": 0.0068, |
|
"reward": 1.9140625, |
|
"reward_std": 0.13098490238189697, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.921875, |
|
"step": 498 |
|
}, |
|
{ |
|
"completion_length": 104.140625, |
|
"epoch": 0.02484688542548424, |
|
"grad_norm": 1.1820055832409997, |
|
"kl": 0.16455078125, |
|
"learning_rate": 9.87576557287258e-07, |
|
"loss": 0.0066, |
|
"reward": 1.8515625, |
|
"reward_std": 0.08891239389777184, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.859375, |
|
"step": 499 |
|
}, |
|
{ |
|
"completion_length": 103.265625, |
|
"epoch": 0.024896678783050342, |
|
"grad_norm": 1.8872195133113516, |
|
"kl": 0.23388671875, |
|
"learning_rate": 9.875516606084749e-07, |
|
"loss": 0.0093, |
|
"reward": 1.9296875, |
|
"reward_std": 0.14389308542013168, |
|
"rewards/format_reward": 0.9921875, |
|
"rewards/iou_reward": 0.9375, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 40166, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|