{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9980781550288276, "eval_steps": 500, "global_step": 1170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025624599615631006, "grad_norm": 23.87626578586006, "learning_rate": 5e-06, "loss": 0.9021, "step": 10 }, { "epoch": 0.05124919923126201, "grad_norm": 12.894025041742566, "learning_rate": 5e-06, "loss": 0.8278, "step": 20 }, { "epoch": 0.07687379884689302, "grad_norm": 1.9039308459782107, "learning_rate": 5e-06, "loss": 0.7942, "step": 30 }, { "epoch": 0.10249839846252402, "grad_norm": 0.8246425522140604, "learning_rate": 5e-06, "loss": 0.7682, "step": 40 }, { "epoch": 0.12812299807815503, "grad_norm": 0.7903898281300727, "learning_rate": 5e-06, "loss": 0.7443, "step": 50 }, { "epoch": 0.15374759769378604, "grad_norm": 0.7618274610091537, "learning_rate": 5e-06, "loss": 0.733, "step": 60 }, { "epoch": 0.17937219730941703, "grad_norm": 0.6631690211716542, "learning_rate": 5e-06, "loss": 0.7251, "step": 70 }, { "epoch": 0.20499679692504805, "grad_norm": 0.7752376118116371, "learning_rate": 5e-06, "loss": 0.7144, "step": 80 }, { "epoch": 0.23062139654067906, "grad_norm": 0.751359072795012, "learning_rate": 5e-06, "loss": 0.7106, "step": 90 }, { "epoch": 0.25624599615631005, "grad_norm": 0.5794322814005122, "learning_rate": 5e-06, "loss": 0.7063, "step": 100 }, { "epoch": 0.28187059577194107, "grad_norm": 0.8734788690919498, "learning_rate": 5e-06, "loss": 0.6994, "step": 110 }, { "epoch": 0.3074951953875721, "grad_norm": 0.5062452804420097, "learning_rate": 5e-06, "loss": 0.7057, "step": 120 }, { "epoch": 0.3331197950032031, "grad_norm": 0.6542917427814133, "learning_rate": 5e-06, "loss": 0.6997, "step": 130 }, { "epoch": 0.35874439461883406, "grad_norm": 0.592400656203172, "learning_rate": 5e-06, "loss": 0.7055, "step": 140 }, { "epoch": 0.3843689942344651, "grad_norm": 0.5861404729061817, "learning_rate": 5e-06, "loss": 0.7002, "step": 150 }, { "epoch": 0.4099935938500961, "grad_norm": 0.5129403767949405, "learning_rate": 5e-06, "loss": 0.6977, "step": 160 }, { "epoch": 0.4356181934657271, "grad_norm": 0.5869968660626531, "learning_rate": 5e-06, "loss": 0.6961, "step": 170 }, { "epoch": 0.4612427930813581, "grad_norm": 0.6257829680932973, "learning_rate": 5e-06, "loss": 0.6928, "step": 180 }, { "epoch": 0.4868673926969891, "grad_norm": 0.5886221575021535, "learning_rate": 5e-06, "loss": 0.6876, "step": 190 }, { "epoch": 0.5124919923126201, "grad_norm": 0.6006186676094072, "learning_rate": 5e-06, "loss": 0.6983, "step": 200 }, { "epoch": 0.5381165919282511, "grad_norm": 0.5015888297145041, "learning_rate": 5e-06, "loss": 0.6931, "step": 210 }, { "epoch": 0.5637411915438821, "grad_norm": 0.5307259707540836, "learning_rate": 5e-06, "loss": 0.6881, "step": 220 }, { "epoch": 0.5893657911595132, "grad_norm": 0.6093913911692712, "learning_rate": 5e-06, "loss": 0.6797, "step": 230 }, { "epoch": 0.6149903907751442, "grad_norm": 0.6093826734084604, "learning_rate": 5e-06, "loss": 0.6838, "step": 240 }, { "epoch": 0.6406149903907752, "grad_norm": 0.607941001127991, "learning_rate": 5e-06, "loss": 0.6789, "step": 250 }, { "epoch": 0.6662395900064062, "grad_norm": 0.4897426114233287, "learning_rate": 5e-06, "loss": 0.684, "step": 260 }, { "epoch": 0.6918641896220371, "grad_norm": 0.4483386511091874, "learning_rate": 5e-06, "loss": 0.6769, "step": 270 }, { "epoch": 0.7174887892376681, "grad_norm": 0.5533227188891904, "learning_rate": 5e-06, "loss": 0.6785, "step": 280 }, { "epoch": 0.7431133888532991, "grad_norm": 0.5333807928895044, "learning_rate": 5e-06, "loss": 0.6866, "step": 290 }, { "epoch": 0.7687379884689302, "grad_norm": 0.5209254483197653, "learning_rate": 5e-06, "loss": 0.6762, "step": 300 }, { "epoch": 0.7943625880845612, "grad_norm": 0.45530525150524676, "learning_rate": 5e-06, "loss": 0.6773, "step": 310 }, { "epoch": 0.8199871877001922, "grad_norm": 0.6616235203126, "learning_rate": 5e-06, "loss": 0.679, "step": 320 }, { "epoch": 0.8456117873158232, "grad_norm": 0.5707652588164589, "learning_rate": 5e-06, "loss": 0.671, "step": 330 }, { "epoch": 0.8712363869314542, "grad_norm": 0.6197974812556873, "learning_rate": 5e-06, "loss": 0.6701, "step": 340 }, { "epoch": 0.8968609865470852, "grad_norm": 0.5705968336277791, "learning_rate": 5e-06, "loss": 0.674, "step": 350 }, { "epoch": 0.9224855861627163, "grad_norm": 0.46125889577293033, "learning_rate": 5e-06, "loss": 0.6675, "step": 360 }, { "epoch": 0.9481101857783472, "grad_norm": 0.5263500841671853, "learning_rate": 5e-06, "loss": 0.6816, "step": 370 }, { "epoch": 0.9737347853939782, "grad_norm": 0.6701180427430501, "learning_rate": 5e-06, "loss": 0.6753, "step": 380 }, { "epoch": 0.9993593850096092, "grad_norm": 0.6408384613157992, "learning_rate": 5e-06, "loss": 0.6786, "step": 390 }, { "epoch": 0.9993593850096092, "eval_loss": 0.669628381729126, "eval_runtime": 210.2529, "eval_samples_per_second": 50.011, "eval_steps_per_second": 0.395, "step": 390 }, { "epoch": 1.0249839846252402, "grad_norm": 0.6401466301705097, "learning_rate": 5e-06, "loss": 0.6342, "step": 400 }, { "epoch": 1.0506085842408712, "grad_norm": 0.561321939868088, "learning_rate": 5e-06, "loss": 0.6286, "step": 410 }, { "epoch": 1.0762331838565022, "grad_norm": 0.5873132600959697, "learning_rate": 5e-06, "loss": 0.634, "step": 420 }, { "epoch": 1.1018577834721333, "grad_norm": 0.4982492625003527, "learning_rate": 5e-06, "loss": 0.6297, "step": 430 }, { "epoch": 1.1274823830877643, "grad_norm": 0.5451894371727711, "learning_rate": 5e-06, "loss": 0.6307, "step": 440 }, { "epoch": 1.1531069827033953, "grad_norm": 0.505854015646168, "learning_rate": 5e-06, "loss": 0.6283, "step": 450 }, { "epoch": 1.1787315823190263, "grad_norm": 0.47382835639154014, "learning_rate": 5e-06, "loss": 0.6296, "step": 460 }, { "epoch": 1.2043561819346573, "grad_norm": 0.49475730559885767, "learning_rate": 5e-06, "loss": 0.6347, "step": 470 }, { "epoch": 1.2299807815502883, "grad_norm": 0.4985567194890561, "learning_rate": 5e-06, "loss": 0.6315, "step": 480 }, { "epoch": 1.2556053811659194, "grad_norm": 0.5560313955644473, "learning_rate": 5e-06, "loss": 0.6275, "step": 490 }, { "epoch": 1.2812299807815504, "grad_norm": 0.49255815849421947, "learning_rate": 5e-06, "loss": 0.6134, "step": 500 }, { "epoch": 1.3068545803971814, "grad_norm": 0.5136446413395007, "learning_rate": 5e-06, "loss": 0.6294, "step": 510 }, { "epoch": 1.3324791800128124, "grad_norm": 0.715579880532086, "learning_rate": 5e-06, "loss": 0.634, "step": 520 }, { "epoch": 1.3581037796284434, "grad_norm": 0.539683933602384, "learning_rate": 5e-06, "loss": 0.6273, "step": 530 }, { "epoch": 1.3837283792440744, "grad_norm": 0.47743309373776915, "learning_rate": 5e-06, "loss": 0.6312, "step": 540 }, { "epoch": 1.4093529788597055, "grad_norm": 0.5510090178389563, "learning_rate": 5e-06, "loss": 0.6291, "step": 550 }, { "epoch": 1.4349775784753362, "grad_norm": 0.4885531066853449, "learning_rate": 5e-06, "loss": 0.63, "step": 560 }, { "epoch": 1.4606021780909673, "grad_norm": 0.4550390667985221, "learning_rate": 5e-06, "loss": 0.63, "step": 570 }, { "epoch": 1.4862267777065983, "grad_norm": 0.5094454871437174, "learning_rate": 5e-06, "loss": 0.6301, "step": 580 }, { "epoch": 1.5118513773222293, "grad_norm": 0.5378797481591068, "learning_rate": 5e-06, "loss": 0.6362, "step": 590 }, { "epoch": 1.5374759769378603, "grad_norm": 0.4964570729916681, "learning_rate": 5e-06, "loss": 0.6275, "step": 600 }, { "epoch": 1.5631005765534913, "grad_norm": 0.5150915164078523, "learning_rate": 5e-06, "loss": 0.6336, "step": 610 }, { "epoch": 1.5887251761691223, "grad_norm": 0.45827307808132584, "learning_rate": 5e-06, "loss": 0.6305, "step": 620 }, { "epoch": 1.6143497757847534, "grad_norm": 0.670050138542801, "learning_rate": 5e-06, "loss": 0.629, "step": 630 }, { "epoch": 1.6399743754003844, "grad_norm": 0.5390177749556742, "learning_rate": 5e-06, "loss": 0.6189, "step": 640 }, { "epoch": 1.6655989750160154, "grad_norm": 0.47514950562866276, "learning_rate": 5e-06, "loss": 0.6335, "step": 650 }, { "epoch": 1.6912235746316464, "grad_norm": 0.45626516699726205, "learning_rate": 5e-06, "loss": 0.63, "step": 660 }, { "epoch": 1.7168481742472774, "grad_norm": 0.5259666631069414, "learning_rate": 5e-06, "loss": 0.629, "step": 670 }, { "epoch": 1.7424727738629084, "grad_norm": 0.45958090467852913, "learning_rate": 5e-06, "loss": 0.6304, "step": 680 }, { "epoch": 1.7680973734785392, "grad_norm": 0.537603671552474, "learning_rate": 5e-06, "loss": 0.6201, "step": 690 }, { "epoch": 1.7937219730941703, "grad_norm": 0.4944510355408107, "learning_rate": 5e-06, "loss": 0.6227, "step": 700 }, { "epoch": 1.8193465727098013, "grad_norm": 0.4459500383872668, "learning_rate": 5e-06, "loss": 0.6293, "step": 710 }, { "epoch": 1.8449711723254323, "grad_norm": 0.41107024989867014, "learning_rate": 5e-06, "loss": 0.6325, "step": 720 }, { "epoch": 1.8705957719410633, "grad_norm": 0.5205263432281845, "learning_rate": 5e-06, "loss": 0.626, "step": 730 }, { "epoch": 1.8962203715566943, "grad_norm": 0.5178259439056772, "learning_rate": 5e-06, "loss": 0.6253, "step": 740 }, { "epoch": 1.9218449711723253, "grad_norm": 0.5186310082942625, "learning_rate": 5e-06, "loss": 0.6254, "step": 750 }, { "epoch": 1.9474695707879563, "grad_norm": 0.48818671836894234, "learning_rate": 5e-06, "loss": 0.6244, "step": 760 }, { "epoch": 1.9730941704035874, "grad_norm": 0.4816352801983277, "learning_rate": 5e-06, "loss": 0.6301, "step": 770 }, { "epoch": 1.9987187700192184, "grad_norm": 0.5573576817650256, "learning_rate": 5e-06, "loss": 0.6284, "step": 780 }, { "epoch": 1.9987187700192184, "eval_loss": 0.6584250926971436, "eval_runtime": 210.7445, "eval_samples_per_second": 49.895, "eval_steps_per_second": 0.394, "step": 780 }, { "epoch": 2.0243433696348494, "grad_norm": 0.6596303880050641, "learning_rate": 5e-06, "loss": 0.582, "step": 790 }, { "epoch": 2.0499679692504804, "grad_norm": 0.5374575961558432, "learning_rate": 5e-06, "loss": 0.5856, "step": 800 }, { "epoch": 2.0755925688661114, "grad_norm": 0.565805764328584, "learning_rate": 5e-06, "loss": 0.5799, "step": 810 }, { "epoch": 2.1012171684817424, "grad_norm": 0.45938077865068583, "learning_rate": 5e-06, "loss": 0.574, "step": 820 }, { "epoch": 2.1268417680973735, "grad_norm": 0.5086155425168429, "learning_rate": 5e-06, "loss": 0.5846, "step": 830 }, { "epoch": 2.1524663677130045, "grad_norm": 0.5951424838463266, "learning_rate": 5e-06, "loss": 0.5804, "step": 840 }, { "epoch": 2.1780909673286355, "grad_norm": 0.5389501938241722, "learning_rate": 5e-06, "loss": 0.5733, "step": 850 }, { "epoch": 2.2037155669442665, "grad_norm": 0.49715965924754885, "learning_rate": 5e-06, "loss": 0.5901, "step": 860 }, { "epoch": 2.2293401665598975, "grad_norm": 0.6416154288767167, "learning_rate": 5e-06, "loss": 0.5814, "step": 870 }, { "epoch": 2.2549647661755285, "grad_norm": 0.5840426113039779, "learning_rate": 5e-06, "loss": 0.5833, "step": 880 }, { "epoch": 2.2805893657911596, "grad_norm": 0.5363577230603178, "learning_rate": 5e-06, "loss": 0.5913, "step": 890 }, { "epoch": 2.3062139654067906, "grad_norm": 0.4999642649696085, "learning_rate": 5e-06, "loss": 0.5853, "step": 900 }, { "epoch": 2.3318385650224216, "grad_norm": 0.6712704299303816, "learning_rate": 5e-06, "loss": 0.5824, "step": 910 }, { "epoch": 2.3574631646380526, "grad_norm": 0.4993129343338316, "learning_rate": 5e-06, "loss": 0.5882, "step": 920 }, { "epoch": 2.3830877642536836, "grad_norm": 0.6482429574217665, "learning_rate": 5e-06, "loss": 0.5852, "step": 930 }, { "epoch": 2.4087123638693146, "grad_norm": 0.5858234169687565, "learning_rate": 5e-06, "loss": 0.5891, "step": 940 }, { "epoch": 2.4343369634849457, "grad_norm": 0.4606498702171825, "learning_rate": 5e-06, "loss": 0.5882, "step": 950 }, { "epoch": 2.4599615631005767, "grad_norm": 0.4665245302124569, "learning_rate": 5e-06, "loss": 0.5882, "step": 960 }, { "epoch": 2.4855861627162077, "grad_norm": 0.4787609348846855, "learning_rate": 5e-06, "loss": 0.5904, "step": 970 }, { "epoch": 2.5112107623318387, "grad_norm": 0.5112448288708001, "learning_rate": 5e-06, "loss": 0.5915, "step": 980 }, { "epoch": 2.5368353619474697, "grad_norm": 0.5584692413737392, "learning_rate": 5e-06, "loss": 0.5855, "step": 990 }, { "epoch": 2.5624599615631007, "grad_norm": 0.5151021464825077, "learning_rate": 5e-06, "loss": 0.5895, "step": 1000 }, { "epoch": 2.5880845611787313, "grad_norm": 0.47435391792664705, "learning_rate": 5e-06, "loss": 0.5818, "step": 1010 }, { "epoch": 2.6137091607943628, "grad_norm": 0.47062007111208676, "learning_rate": 5e-06, "loss": 0.5869, "step": 1020 }, { "epoch": 2.6393337604099933, "grad_norm": 0.5030724465847995, "learning_rate": 5e-06, "loss": 0.5817, "step": 1030 }, { "epoch": 2.664958360025625, "grad_norm": 0.5696236917490695, "learning_rate": 5e-06, "loss": 0.5846, "step": 1040 }, { "epoch": 2.6905829596412554, "grad_norm": 0.5097571703440519, "learning_rate": 5e-06, "loss": 0.5838, "step": 1050 }, { "epoch": 2.716207559256887, "grad_norm": 0.46906709023993814, "learning_rate": 5e-06, "loss": 0.5849, "step": 1060 }, { "epoch": 2.7418321588725174, "grad_norm": 0.6048108768327223, "learning_rate": 5e-06, "loss": 0.5861, "step": 1070 }, { "epoch": 2.767456758488149, "grad_norm": 0.5763148469790699, "learning_rate": 5e-06, "loss": 0.59, "step": 1080 }, { "epoch": 2.7930813581037794, "grad_norm": 0.5325120981655176, "learning_rate": 5e-06, "loss": 0.5922, "step": 1090 }, { "epoch": 2.818705957719411, "grad_norm": 0.47425803790021404, "learning_rate": 5e-06, "loss": 0.5919, "step": 1100 }, { "epoch": 2.8443305573350415, "grad_norm": 0.5025999987290302, "learning_rate": 5e-06, "loss": 0.5851, "step": 1110 }, { "epoch": 2.8699551569506725, "grad_norm": 0.5898623437912127, "learning_rate": 5e-06, "loss": 0.584, "step": 1120 }, { "epoch": 2.8955797565663035, "grad_norm": 0.49890831764955235, "learning_rate": 5e-06, "loss": 0.5847, "step": 1130 }, { "epoch": 2.9212043561819345, "grad_norm": 0.4911458303208662, "learning_rate": 5e-06, "loss": 0.5868, "step": 1140 }, { "epoch": 2.9468289557975655, "grad_norm": 0.4862045432861365, "learning_rate": 5e-06, "loss": 0.5896, "step": 1150 }, { "epoch": 2.9724535554131966, "grad_norm": 0.574968732277202, "learning_rate": 5e-06, "loss": 0.5915, "step": 1160 }, { "epoch": 2.9980781550288276, "grad_norm": 0.536851160723009, "learning_rate": 5e-06, "loss": 0.5918, "step": 1170 }, { "epoch": 2.9980781550288276, "eval_loss": 0.6599797010421753, "eval_runtime": 210.167, "eval_samples_per_second": 50.032, "eval_steps_per_second": 0.395, "step": 1170 }, { "epoch": 2.9980781550288276, "step": 1170, "total_flos": 1959374817853440.0, "train_loss": 0.6400190381922274, "train_runtime": 35097.8295, "train_samples_per_second": 17.076, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1170, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1959374817853440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }