|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 537, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0186219739292365, |
|
"grad_norm": 1.3874553442001343, |
|
"learning_rate": 9.98134328358209e-05, |
|
"loss": 1.8326, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.037243947858473, |
|
"grad_norm": 0.7816306352615356, |
|
"learning_rate": 9.94402985074627e-05, |
|
"loss": 1.314, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.055865921787709494, |
|
"grad_norm": 0.748516857624054, |
|
"learning_rate": 9.906716417910448e-05, |
|
"loss": 1.2913, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.074487895716946, |
|
"grad_norm": 0.7102469205856323, |
|
"learning_rate": 9.869402985074628e-05, |
|
"loss": 1.2041, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0931098696461825, |
|
"grad_norm": 0.7098786234855652, |
|
"learning_rate": 9.832089552238806e-05, |
|
"loss": 1.1642, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11173184357541899, |
|
"grad_norm": 0.6812120676040649, |
|
"learning_rate": 9.794776119402985e-05, |
|
"loss": 1.1717, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1303538175046555, |
|
"grad_norm": 0.6887907981872559, |
|
"learning_rate": 9.757462686567165e-05, |
|
"loss": 1.169, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.148975791433892, |
|
"grad_norm": 0.685250461101532, |
|
"learning_rate": 9.720149253731343e-05, |
|
"loss": 1.0856, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16759776536312848, |
|
"grad_norm": 0.7008183002471924, |
|
"learning_rate": 9.682835820895523e-05, |
|
"loss": 1.0955, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.186219739292365, |
|
"grad_norm": 0.6370360255241394, |
|
"learning_rate": 9.645522388059703e-05, |
|
"loss": 1.0581, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2048417132216015, |
|
"grad_norm": 0.7338399887084961, |
|
"learning_rate": 9.608208955223881e-05, |
|
"loss": 1.0287, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.22346368715083798, |
|
"grad_norm": 0.6552614569664001, |
|
"learning_rate": 9.57089552238806e-05, |
|
"loss": 1.0256, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24208566108007448, |
|
"grad_norm": 0.6847032308578491, |
|
"learning_rate": 9.533582089552238e-05, |
|
"loss": 0.9911, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.260707635009311, |
|
"grad_norm": 0.7163254618644714, |
|
"learning_rate": 9.496268656716418e-05, |
|
"loss": 1.0316, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27932960893854747, |
|
"grad_norm": 0.6374409794807434, |
|
"learning_rate": 9.458955223880598e-05, |
|
"loss": 1.0137, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.297951582867784, |
|
"grad_norm": 1.027390718460083, |
|
"learning_rate": 9.421641791044776e-05, |
|
"loss": 0.9583, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3165735567970205, |
|
"grad_norm": 0.6466365456581116, |
|
"learning_rate": 9.384328358208956e-05, |
|
"loss": 0.9578, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.33519553072625696, |
|
"grad_norm": 0.6674121022224426, |
|
"learning_rate": 9.347014925373135e-05, |
|
"loss": 0.9662, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3538175046554935, |
|
"grad_norm": 0.6532049179077148, |
|
"learning_rate": 9.309701492537313e-05, |
|
"loss": 0.9258, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.37243947858473, |
|
"grad_norm": 0.6357593536376953, |
|
"learning_rate": 9.272388059701493e-05, |
|
"loss": 0.8891, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.39106145251396646, |
|
"grad_norm": 0.6210165619850159, |
|
"learning_rate": 9.235074626865672e-05, |
|
"loss": 0.8979, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.409683426443203, |
|
"grad_norm": 0.6325780749320984, |
|
"learning_rate": 9.197761194029851e-05, |
|
"loss": 0.9028, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.42830540037243947, |
|
"grad_norm": 0.6363133788108826, |
|
"learning_rate": 9.16044776119403e-05, |
|
"loss": 0.9283, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.44692737430167595, |
|
"grad_norm": 0.5902472138404846, |
|
"learning_rate": 9.12313432835821e-05, |
|
"loss": 0.8592, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4655493482309125, |
|
"grad_norm": 0.6462955474853516, |
|
"learning_rate": 9.08582089552239e-05, |
|
"loss": 0.8595, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.48417132216014896, |
|
"grad_norm": 0.6341489553451538, |
|
"learning_rate": 9.048507462686568e-05, |
|
"loss": 0.8372, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5027932960893855, |
|
"grad_norm": 0.6504695415496826, |
|
"learning_rate": 9.011194029850746e-05, |
|
"loss": 0.8458, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.521415270018622, |
|
"grad_norm": 0.6562509536743164, |
|
"learning_rate": 8.973880597014925e-05, |
|
"loss": 0.8211, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5400372439478585, |
|
"grad_norm": 0.622184693813324, |
|
"learning_rate": 8.936567164179105e-05, |
|
"loss": 0.8278, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5586592178770949, |
|
"grad_norm": 0.6273349523544312, |
|
"learning_rate": 8.899253731343285e-05, |
|
"loss": 0.8188, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5772811918063314, |
|
"grad_norm": 0.6405043601989746, |
|
"learning_rate": 8.861940298507463e-05, |
|
"loss": 0.8646, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.595903165735568, |
|
"grad_norm": 0.6616275310516357, |
|
"learning_rate": 8.824626865671643e-05, |
|
"loss": 0.8606, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6145251396648045, |
|
"grad_norm": 0.6546701192855835, |
|
"learning_rate": 8.787313432835821e-05, |
|
"loss": 0.8271, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.633147113594041, |
|
"grad_norm": 0.6446258425712585, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.805, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6517690875232774, |
|
"grad_norm": 0.6694822311401367, |
|
"learning_rate": 8.71268656716418e-05, |
|
"loss": 0.8237, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6703910614525139, |
|
"grad_norm": 0.5971983075141907, |
|
"learning_rate": 8.675373134328358e-05, |
|
"loss": 0.7784, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6890130353817505, |
|
"grad_norm": 0.5651050209999084, |
|
"learning_rate": 8.638059701492538e-05, |
|
"loss": 0.7345, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.707635009310987, |
|
"grad_norm": 0.6789554357528687, |
|
"learning_rate": 8.600746268656717e-05, |
|
"loss": 0.8024, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7262569832402235, |
|
"grad_norm": 0.6168780326843262, |
|
"learning_rate": 8.563432835820896e-05, |
|
"loss": 0.7387, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.74487895716946, |
|
"grad_norm": 0.6000941395759583, |
|
"learning_rate": 8.526119402985075e-05, |
|
"loss": 0.7403, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7635009310986964, |
|
"grad_norm": 0.6785574555397034, |
|
"learning_rate": 8.488805970149253e-05, |
|
"loss": 0.7653, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7821229050279329, |
|
"grad_norm": 0.6262904405593872, |
|
"learning_rate": 8.451492537313433e-05, |
|
"loss": 0.7421, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8007448789571695, |
|
"grad_norm": 0.6499343514442444, |
|
"learning_rate": 8.414179104477612e-05, |
|
"loss": 0.6831, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.819366852886406, |
|
"grad_norm": 0.5795383453369141, |
|
"learning_rate": 8.376865671641791e-05, |
|
"loss": 0.7189, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8379888268156425, |
|
"grad_norm": 0.6257463097572327, |
|
"learning_rate": 8.339552238805971e-05, |
|
"loss": 0.7052, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8566108007448789, |
|
"grad_norm": 0.6415144801139832, |
|
"learning_rate": 8.30223880597015e-05, |
|
"loss": 0.703, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8752327746741154, |
|
"grad_norm": 0.6050721406936646, |
|
"learning_rate": 8.26492537313433e-05, |
|
"loss": 0.7093, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8938547486033519, |
|
"grad_norm": 0.63200443983078, |
|
"learning_rate": 8.227611940298508e-05, |
|
"loss": 0.711, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9124767225325885, |
|
"grad_norm": 0.7535350918769836, |
|
"learning_rate": 8.190298507462687e-05, |
|
"loss": 0.7101, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.931098696461825, |
|
"grad_norm": 0.5603737831115723, |
|
"learning_rate": 8.152985074626866e-05, |
|
"loss": 0.6888, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9497206703910615, |
|
"grad_norm": 0.6010568141937256, |
|
"learning_rate": 8.115671641791045e-05, |
|
"loss": 0.702, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9683426443202979, |
|
"grad_norm": 0.6261228322982788, |
|
"learning_rate": 8.078358208955225e-05, |
|
"loss": 0.6634, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9869646182495344, |
|
"grad_norm": 0.5894924998283386, |
|
"learning_rate": 8.041044776119403e-05, |
|
"loss": 0.6782, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8664104342460632, |
|
"eval_runtime": 40.1805, |
|
"eval_samples_per_second": 10.005, |
|
"eval_steps_per_second": 1.269, |
|
"step": 537 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2685, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 8.800341942707159e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|