|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.997867803837953, |
|
"eval_steps": 100, |
|
"global_step": 234, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 649.4174499511719, |
|
"epoch": 0.008528784648187633, |
|
"grad_norm": 0.31805452704429626, |
|
"kl": 0.0, |
|
"learning_rate": 1.25e-07, |
|
"loss": 0.0024, |
|
"reward": 0.6517857387661934, |
|
"reward_std": 0.34545752592384815, |
|
"rewards/accuracy_reward": 0.6517857387661934, |
|
"rewards/format_reward": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 590.9721260070801, |
|
"epoch": 0.042643923240938165, |
|
"grad_norm": 0.45416054129600525, |
|
"kl": 0.0001373589038848877, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.0229, |
|
"reward": 0.6350446697324514, |
|
"reward_std": 0.36835399456322193, |
|
"rewards/accuracy_reward": 0.6350446697324514, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 605.218334197998, |
|
"epoch": 0.08528784648187633, |
|
"grad_norm": 0.3594004809856415, |
|
"kl": 0.00028357505798339846, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.021, |
|
"reward": 0.5959821727126837, |
|
"reward_std": 0.3600444387644529, |
|
"rewards/accuracy_reward": 0.5950893145054579, |
|
"rewards/format_reward": 0.0008928571827709675, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 595.6455612182617, |
|
"epoch": 0.1279317697228145, |
|
"grad_norm": 0.7764827609062195, |
|
"kl": 0.0025279045104980467, |
|
"learning_rate": 1.875e-06, |
|
"loss": 0.0361, |
|
"reward": 0.670089316368103, |
|
"reward_std": 0.3190986420959234, |
|
"rewards/accuracy_reward": 0.670089316368103, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 627.0107414245606, |
|
"epoch": 0.17057569296375266, |
|
"grad_norm": 0.21981796622276306, |
|
"kl": 0.00797119140625, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0652, |
|
"reward": 0.7066964596509934, |
|
"reward_std": 0.28181498125195503, |
|
"rewards/accuracy_reward": 0.7066964596509934, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.1241317749024, |
|
"epoch": 0.21321961620469082, |
|
"grad_norm": 0.21138541400432587, |
|
"kl": 0.017241477966308594, |
|
"learning_rate": 2.99983215271541e-06, |
|
"loss": 0.0725, |
|
"reward": 0.7758928939700127, |
|
"reward_std": 0.20822736751288176, |
|
"rewards/accuracy_reward": 0.7758928939700127, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.8991271972657, |
|
"epoch": 0.255863539445629, |
|
"grad_norm": 0.2828144133090973, |
|
"kl": 0.021068000793457033, |
|
"learning_rate": 2.993961440992859e-06, |
|
"loss": 0.04, |
|
"reward": 0.758482176065445, |
|
"reward_std": 0.20791386468335987, |
|
"rewards/accuracy_reward": 0.758482176065445, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.6861869812012, |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.40165308117866516, |
|
"kl": 0.004427337646484375, |
|
"learning_rate": 2.979735890885282e-06, |
|
"loss": 0.0572, |
|
"reward": 0.7812500312924385, |
|
"reward_std": 0.1955214325338602, |
|
"rewards/accuracy_reward": 0.7812500312924385, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.3607414245605, |
|
"epoch": 0.3411513859275053, |
|
"grad_norm": 2.153895378112793, |
|
"kl": 0.004603385925292969, |
|
"learning_rate": 2.957235057439301e-06, |
|
"loss": 0.0418, |
|
"reward": 0.783928607404232, |
|
"reward_std": 0.190056839492172, |
|
"rewards/accuracy_reward": 0.783928607404232, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 587.9826141357422, |
|
"epoch": 0.3837953091684435, |
|
"grad_norm": 1.4613006114959717, |
|
"kl": 0.037926101684570314, |
|
"learning_rate": 2.9265847744427307e-06, |
|
"loss": 0.0367, |
|
"reward": 0.7933036029338837, |
|
"reward_std": 0.16121128750965, |
|
"rewards/accuracy_reward": 0.7933036029338837, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.0799369812012, |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 0.4102199077606201, |
|
"kl": 0.004796600341796875, |
|
"learning_rate": 2.887956450710995e-06, |
|
"loss": 0.0454, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.1911674102768302, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 571.1562767028809, |
|
"epoch": 0.4690831556503198, |
|
"grad_norm": 0.2943607568740845, |
|
"kl": 0.008086776733398438, |
|
"learning_rate": 2.8415661114995055e-06, |
|
"loss": 0.0303, |
|
"reward": 0.7580357536673545, |
|
"reward_std": 0.17367877559736372, |
|
"rewards/accuracy_reward": 0.7580357536673545, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.6495803833008, |
|
"epoch": 0.511727078891258, |
|
"grad_norm": 0.2648622989654541, |
|
"kl": 0.006841278076171875, |
|
"learning_rate": 2.7876731904027993e-06, |
|
"loss": 0.033, |
|
"reward": 0.759375037252903, |
|
"reward_std": 0.16417703656479715, |
|
"rewards/accuracy_reward": 0.759375037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.1022575378418, |
|
"epoch": 0.5543710021321961, |
|
"grad_norm": 0.17413394153118134, |
|
"kl": 0.0077056884765625, |
|
"learning_rate": 2.726579078496647e-06, |
|
"loss": 0.0209, |
|
"reward": 0.7508928917348385, |
|
"reward_std": 0.17149335239082575, |
|
"rewards/accuracy_reward": 0.7508928917348385, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.3223472595215, |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.19076332449913025, |
|
"kl": 0.006637954711914062, |
|
"learning_rate": 2.6586254388368995e-06, |
|
"loss": 0.0295, |
|
"reward": 0.7477678909897805, |
|
"reward_std": 0.17248529344797134, |
|
"rewards/accuracy_reward": 0.7477678909897805, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.729940032959, |
|
"epoch": 0.6396588486140725, |
|
"grad_norm": 0.4007582366466522, |
|
"kl": 0.013779067993164062, |
|
"learning_rate": 2.584192295741087e-06, |
|
"loss": 0.0353, |
|
"reward": 0.7924107506871223, |
|
"reward_std": 0.16211480675265194, |
|
"rewards/accuracy_reward": 0.7924107506871223, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.0049346923828, |
|
"epoch": 0.6823027718550106, |
|
"grad_norm": 0.1895846575498581, |
|
"kl": 0.006413650512695312, |
|
"learning_rate": 2.5036959095382875e-06, |
|
"loss": 0.0316, |
|
"reward": 0.7776786044239998, |
|
"reward_std": 0.1660858705639839, |
|
"rewards/accuracy_reward": 0.7776786044239998, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.461636352539, |
|
"epoch": 0.7249466950959488, |
|
"grad_norm": 0.2569662928581238, |
|
"kl": 0.00648956298828125, |
|
"learning_rate": 2.4175864486725093e-06, |
|
"loss": 0.0311, |
|
"reward": 0.7754464603960514, |
|
"reward_std": 0.17775620855391025, |
|
"rewards/accuracy_reward": 0.7754464603960514, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.7705581665039, |
|
"epoch": 0.767590618336887, |
|
"grad_norm": 0.5861203074455261, |
|
"kl": 0.00692901611328125, |
|
"learning_rate": 2.3263454721781537e-06, |
|
"loss": 0.0299, |
|
"reward": 0.7517857506871224, |
|
"reward_std": 0.1826934120617807, |
|
"rewards/accuracy_reward": 0.7517857506871224, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 569.6250282287598, |
|
"epoch": 0.8102345415778252, |
|
"grad_norm": 0.1765875220298767, |
|
"kl": 0.006940460205078125, |
|
"learning_rate": 2.230483236606551e-06, |
|
"loss": 0.0238, |
|
"reward": 0.7611607514321804, |
|
"reward_std": 0.18477695686742662, |
|
"rewards/accuracy_reward": 0.7611607514321804, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 0.2575342655181885, |
|
"learning_rate": 2.1305358424643485e-06, |
|
"loss": 0.0241, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 546.6882248535156, |
|
"eval_kl": 0.0092373046875, |
|
"eval_loss": 0.010418593883514404, |
|
"eval_reward": 0.6825428889751435, |
|
"eval_reward_std": 0.2040955721795559, |
|
"eval_rewards/accuracy_reward": 0.6825428889751435, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 6332.2052, |
|
"eval_samples_per_second": 0.79, |
|
"eval_steps_per_second": 0.014, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.6328327178956, |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.27169159054756165, |
|
"kl": 0.007660293579101562, |
|
"learning_rate": 2.027062236122014e-06, |
|
"loss": 0.022, |
|
"reward": 0.775000037997961, |
|
"reward_std": 0.18100181790068745, |
|
"rewards/accuracy_reward": 0.775000037997961, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.2844017028808, |
|
"epoch": 0.9381663113006397, |
|
"grad_norm": 0.1813412606716156, |
|
"kl": 0.00843048095703125, |
|
"learning_rate": 1.9206410839590043e-06, |
|
"loss": 0.0284, |
|
"reward": 0.7910714611411095, |
|
"reward_std": 0.191034213360399, |
|
"rewards/accuracy_reward": 0.7910714611411095, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 551.362525177002, |
|
"epoch": 0.9808102345415778, |
|
"grad_norm": 0.2454252392053604, |
|
"kl": 0.010602569580078125, |
|
"learning_rate": 1.8118675362266389e-06, |
|
"loss": 0.0384, |
|
"reward": 0.7803571820259094, |
|
"reward_std": 0.18656156454235315, |
|
"rewards/accuracy_reward": 0.7803571820259094, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.3027687072754, |
|
"epoch": 1.0255863539445629, |
|
"grad_norm": 0.338925838470459, |
|
"kl": 0.012747955322265626, |
|
"learning_rate": 1.7013498987264833e-06, |
|
"loss": 0.0206, |
|
"reward": 0.7812500402331353, |
|
"reward_std": 0.17740353057160974, |
|
"rewards/accuracy_reward": 0.7812500402331353, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.627702331543, |
|
"epoch": 1.068230277185501, |
|
"grad_norm": 0.7126057147979736, |
|
"kl": 0.0207763671875, |
|
"learning_rate": 1.5897062309175513e-06, |
|
"loss": 0.0426, |
|
"reward": 0.7910714641213417, |
|
"reward_std": 0.1805942740291357, |
|
"rewards/accuracy_reward": 0.7910714641213417, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.1620819091797, |
|
"epoch": 1.1108742004264391, |
|
"grad_norm": 32.12421798706055, |
|
"kl": 0.04654083251953125, |
|
"learning_rate": 1.4775608894771048e-06, |
|
"loss": 0.0453, |
|
"reward": 0.745089316368103, |
|
"reward_std": 0.20257350029423832, |
|
"rewards/accuracy_reward": 0.745089316368103, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.5571685791016, |
|
"epoch": 1.1535181236673775, |
|
"grad_norm": 9.155479431152344, |
|
"kl": 0.06140899658203125, |
|
"learning_rate": 1.3655410366448499e-06, |
|
"loss": 0.0713, |
|
"reward": 0.7616071730852128, |
|
"reward_std": 0.2343007681891322, |
|
"rewards/accuracy_reward": 0.7616071730852128, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 588.0808280944824, |
|
"epoch": 1.1961620469083156, |
|
"grad_norm": 183.4071502685547, |
|
"kl": 0.445599365234375, |
|
"learning_rate": 1.2542731328772936e-06, |
|
"loss": 0.1391, |
|
"reward": 0.701339316368103, |
|
"reward_std": 0.2731596459634602, |
|
"rewards/accuracy_reward": 0.701339316368103, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.2902099609375, |
|
"epoch": 1.2388059701492538, |
|
"grad_norm": 8.56387710571289, |
|
"kl": 0.11466064453125, |
|
"learning_rate": 1.1443794334267539e-06, |
|
"loss": 0.11, |
|
"reward": 0.6888393096625804, |
|
"reward_std": 0.2936958262696862, |
|
"rewards/accuracy_reward": 0.6888393096625804, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 601.5696647644043, |
|
"epoch": 1.2814498933901919, |
|
"grad_norm": 7617.19140625, |
|
"kl": 5.51258544921875, |
|
"learning_rate": 1.036474508437579e-06, |
|
"loss": 0.3675, |
|
"reward": 0.7062500342726707, |
|
"reward_std": 0.2892017766833305, |
|
"rewards/accuracy_reward": 0.7062500342726707, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 602.9071655273438, |
|
"epoch": 1.32409381663113, |
|
"grad_norm": 52.287174224853516, |
|
"kl": 9337.884985351562, |
|
"learning_rate": 9.311618060206075e-07, |
|
"loss": 641.8632, |
|
"reward": 0.6732143133878707, |
|
"reward_std": 0.28922309204936025, |
|
"rewards/accuracy_reward": 0.6732143133878707, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 624.1178833007813, |
|
"epoch": 1.3667377398720681, |
|
"grad_norm": 30.293315887451172, |
|
"kl": 0.25384521484375, |
|
"learning_rate": 8.290302775265509e-07, |
|
"loss": 0.1763, |
|
"reward": 0.6522321723401546, |
|
"reward_std": 0.31476256959140303, |
|
"rewards/accuracy_reward": 0.6522321723401546, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.4107437133789, |
|
"epoch": 1.4093816631130065, |
|
"grad_norm": 140.5424346923828, |
|
"kl": 0.729931640625, |
|
"learning_rate": 7.30651083891141e-07, |
|
"loss": 0.17, |
|
"reward": 0.6995536044239998, |
|
"reward_std": 0.27449417784810065, |
|
"rewards/accuracy_reward": 0.6995536044239998, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.0844047546386, |
|
"epoch": 1.4520255863539446, |
|
"grad_norm": 308.79248046875, |
|
"kl": 1.08154296875, |
|
"learning_rate": 6.3657440147149e-07, |
|
"loss": 0.1747, |
|
"reward": 0.666517885029316, |
|
"reward_std": 0.2974686399102211, |
|
"rewards/accuracy_reward": 0.666517885029316, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 625.4259185791016, |
|
"epoch": 1.4946695095948828, |
|
"grad_norm": 27.743221282958984, |
|
"kl": 0.8737548828125, |
|
"learning_rate": 5.473263452367318e-07, |
|
"loss": 0.1675, |
|
"reward": 0.6281250268220901, |
|
"reward_std": 0.2969447080045938, |
|
"rewards/accuracy_reward": 0.6281250268220901, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.4665451049805, |
|
"epoch": 1.537313432835821, |
|
"grad_norm": 42.16392517089844, |
|
"kl": 0.6285614013671875, |
|
"learning_rate": 4.63406026519703e-07, |
|
"loss": 0.1549, |
|
"reward": 0.6785714589059353, |
|
"reward_std": 0.28817852344363926, |
|
"rewards/accuracy_reward": 0.6785714589059353, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 597.4826126098633, |
|
"epoch": 1.579957356076759, |
|
"grad_norm": 9.374706268310547, |
|
"kl": 0.40172119140625, |
|
"learning_rate": 3.852827617839085e-07, |
|
"loss": 0.1409, |
|
"reward": 0.672321455180645, |
|
"reward_std": 0.28770273169502614, |
|
"rewards/accuracy_reward": 0.672321455180645, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.756723022461, |
|
"epoch": 1.6226012793176974, |
|
"grad_norm": 4.605694770812988, |
|
"kl": 0.297802734375, |
|
"learning_rate": 3.133934480154885e-07, |
|
"loss": 0.1275, |
|
"reward": 0.6464285954833031, |
|
"reward_std": 0.32245727106928823, |
|
"rewards/accuracy_reward": 0.6464285954833031, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.2477890014649, |
|
"epoch": 1.6652452025586353, |
|
"grad_norm": 41.83647155761719, |
|
"kl": 0.3505615234375, |
|
"learning_rate": 2.48140119418046e-07, |
|
"loss": 0.1389, |
|
"reward": 0.6745536029338837, |
|
"reward_std": 0.29486791118979455, |
|
"rewards/accuracy_reward": 0.6745536029338837, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.7078891257995736, |
|
"grad_norm": 6.253437519073486, |
|
"learning_rate": 1.8988769907430552e-07, |
|
"loss": 0.5188, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.7078891257995736, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 590.7401977539063, |
|
"eval_kl": 1.22388359375, |
|
"eval_loss": 0.1446426957845688, |
|
"eval_reward": 0.5810285974502564, |
|
"eval_reward_std": 0.2900823284029961, |
|
"eval_rewards/accuracy_reward": 0.5810285974502564, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 6537.0371, |
|
"eval_samples_per_second": 0.765, |
|
"eval_steps_per_second": 0.014, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 590.7547119140625, |
|
"epoch": 1.7505330490405118, |
|
"grad_norm": 13380.6484375, |
|
"kl": 5.470074462890625, |
|
"learning_rate": 1.3896195814820269e-07, |
|
"loss": 0.232, |
|
"reward": 0.6745536033064127, |
|
"reward_std": 0.28764786571264267, |
|
"rewards/accuracy_reward": 0.6745536033064127, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 611.0937705993653, |
|
"epoch": 1.79317697228145, |
|
"grad_norm": 107.79976654052734, |
|
"kl": 0.5142333984375, |
|
"learning_rate": 9.564769404039419e-08, |
|
"loss": 0.138, |
|
"reward": 0.6406250268220901, |
|
"reward_std": 0.30025416277348993, |
|
"rewards/accuracy_reward": 0.6406250268220901, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.3156517028808, |
|
"epoch": 1.835820895522388, |
|
"grad_norm": 45.014381408691406, |
|
"kl": 0.490106201171875, |
|
"learning_rate": 6.018713768566658e-08, |
|
"loss": 0.1346, |
|
"reward": 0.6852678880095482, |
|
"reward_std": 0.26793576404452324, |
|
"rewards/accuracy_reward": 0.6852678880095482, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 590.7098457336426, |
|
"epoch": 1.8784648187633262, |
|
"grad_norm": 12.46617317199707, |
|
"kl": 0.482330322265625, |
|
"learning_rate": 3.277859889929147e-08, |
|
"loss": 0.1325, |
|
"reward": 0.6723214596509933, |
|
"reward_std": 0.3076107632368803, |
|
"rewards/accuracy_reward": 0.6723214596509933, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 591.4446685791015, |
|
"epoch": 1.9211087420042645, |
|
"grad_norm": 12.500801086425781, |
|
"kl": 0.86319580078125, |
|
"learning_rate": 1.357535734809795e-08, |
|
"loss": 0.1614, |
|
"reward": 0.6620536029338837, |
|
"reward_std": 0.28253707848489285, |
|
"rewards/accuracy_reward": 0.6620536029338837, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 595.5790466308594, |
|
"epoch": 1.9637526652452024, |
|
"grad_norm": 6.495975494384766, |
|
"kl": 0.371343994140625, |
|
"learning_rate": 2.684805348397268e-09, |
|
"loss": 0.1161, |
|
"reward": 0.663839316368103, |
|
"reward_std": 0.2793597897514701, |
|
"rewards/accuracy_reward": 0.663839316368103, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.5189571380615, |
|
"epoch": 1.997867803837953, |
|
"kl": 0.5056266784667969, |
|
"reward": 0.655133955180645, |
|
"reward_std": 0.2893115577753633, |
|
"rewards/accuracy_reward": 0.655133955180645, |
|
"rewards/format_reward": 0.0, |
|
"step": 234, |
|
"total_flos": 0.0, |
|
"train_loss": 13.809995387143527, |
|
"train_runtime": 33837.9498, |
|
"train_samples_per_second": 0.443, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 234, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|