{ "best_metric": 0.8481370944449793, "best_model_checkpoint": "./results/checkpoint-144", "epoch": 47.0, "eval_steps": 500, "global_step": 1128, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 6.823522090911865, "learning_rate": 9.8e-05, "loss": 1.0529, "step": 24 }, { "epoch": 1.0, "eval_loss": 0.7948330044746399, "eval_precision": 0.6056436074893574, "eval_runtime": 0.1118, "eval_samples_per_second": 1950.689, "eval_steps_per_second": 35.792, "step": 24 }, { "epoch": 2.0, "grad_norm": 6.746466636657715, "learning_rate": 9.6e-05, "loss": 0.628, "step": 48 }, { "epoch": 2.0, "eval_loss": 0.5195804834365845, "eval_precision": 0.7895640074211502, "eval_runtime": 0.1027, "eval_samples_per_second": 2122.54, "eval_steps_per_second": 38.946, "step": 48 }, { "epoch": 3.0, "grad_norm": 12.689231872558594, "learning_rate": 9.4e-05, "loss": 0.3634, "step": 72 }, { "epoch": 3.0, "eval_loss": 0.49251171946525574, "eval_precision": 0.8362987458423723, "eval_runtime": 0.0971, "eval_samples_per_second": 2244.959, "eval_steps_per_second": 41.192, "step": 72 }, { "epoch": 4.0, "grad_norm": 4.118255615234375, "learning_rate": 9.200000000000001e-05, "loss": 0.2013, "step": 96 }, { "epoch": 4.0, "eval_loss": 0.532037079334259, "eval_precision": 0.811715061018966, "eval_runtime": 0.0947, "eval_samples_per_second": 2301.367, "eval_steps_per_second": 42.227, "step": 96 }, { "epoch": 5.0, "grad_norm": 8.898234367370605, "learning_rate": 9e-05, "loss": 0.1305, "step": 120 }, { "epoch": 5.0, "eval_loss": 0.6465156674385071, "eval_precision": 0.8354440853264382, "eval_runtime": 0.0962, "eval_samples_per_second": 2265.085, "eval_steps_per_second": 41.561, "step": 120 }, { "epoch": 6.0, "grad_norm": 1.2157541513442993, "learning_rate": 8.800000000000001e-05, "loss": 0.1097, "step": 144 }, { "epoch": 6.0, "eval_loss": 0.6573904156684875, "eval_precision": 0.8481370944449793, "eval_runtime": 0.0949, "eval_samples_per_second": 2297.567, "eval_steps_per_second": 42.157, "step": 144 }, { "epoch": 7.0, "grad_norm": 7.853116512298584, "learning_rate": 8.6e-05, "loss": 0.0892, "step": 168 }, { "epoch": 7.0, "eval_loss": 0.6417277455329895, "eval_precision": 0.827079642667878, "eval_runtime": 0.0965, "eval_samples_per_second": 2260.158, "eval_steps_per_second": 41.471, "step": 168 }, { "epoch": 8.0, "grad_norm": 5.781342506408691, "learning_rate": 8.4e-05, "loss": 0.0613, "step": 192 }, { "epoch": 8.0, "eval_loss": 0.7849622964859009, "eval_precision": 0.8241884831740076, "eval_runtime": 0.0972, "eval_samples_per_second": 2243.929, "eval_steps_per_second": 41.173, "step": 192 }, { "epoch": 9.0, "grad_norm": 1.440025806427002, "learning_rate": 8.2e-05, "loss": 0.0641, "step": 216 }, { "epoch": 9.0, "eval_loss": 0.8728927373886108, "eval_precision": 0.8153104500381388, "eval_runtime": 0.0977, "eval_samples_per_second": 2232.042, "eval_steps_per_second": 40.955, "step": 216 }, { "epoch": 10.0, "grad_norm": 0.08336609601974487, "learning_rate": 8e-05, "loss": 0.0442, "step": 240 }, { "epoch": 10.0, "eval_loss": 0.8833715915679932, "eval_precision": 0.8203376565295171, "eval_runtime": 0.0972, "eval_samples_per_second": 2243.367, "eval_steps_per_second": 41.163, "step": 240 }, { "epoch": 11.0, "grad_norm": 6.659815788269043, "learning_rate": 7.800000000000001e-05, "loss": 0.0413, "step": 264 }, { "epoch": 11.0, "eval_loss": 0.964482307434082, "eval_precision": 0.7970081571089974, "eval_runtime": 0.1009, "eval_samples_per_second": 2160.837, "eval_steps_per_second": 39.648, "step": 264 }, { "epoch": 12.0, "grad_norm": 3.413681983947754, "learning_rate": 7.6e-05, "loss": 0.0319, "step": 288 }, { "epoch": 12.0, "eval_loss": 0.9364728927612305, "eval_precision": 0.8252503052503053, "eval_runtime": 0.0956, "eval_samples_per_second": 2281.225, "eval_steps_per_second": 41.857, "step": 288 }, { "epoch": 13.0, "grad_norm": 0.598942220211029, "learning_rate": 7.4e-05, "loss": 0.0355, "step": 312 }, { "epoch": 13.0, "eval_loss": 1.0806515216827393, "eval_precision": 0.8052417546769706, "eval_runtime": 0.0944, "eval_samples_per_second": 2308.478, "eval_steps_per_second": 42.357, "step": 312 }, { "epoch": 14.0, "grad_norm": 6.365128517150879, "learning_rate": 7.2e-05, "loss": 0.0367, "step": 336 }, { "epoch": 14.0, "eval_loss": 1.02200448513031, "eval_precision": 0.7803898076132464, "eval_runtime": 0.0936, "eval_samples_per_second": 2330.282, "eval_steps_per_second": 42.757, "step": 336 }, { "epoch": 15.0, "grad_norm": 17.668140411376953, "learning_rate": 7e-05, "loss": 0.0372, "step": 360 }, { "epoch": 15.0, "eval_loss": 1.4776127338409424, "eval_precision": 0.7776844669818972, "eval_runtime": 0.0943, "eval_samples_per_second": 2312.25, "eval_steps_per_second": 42.427, "step": 360 }, { "epoch": 16.0, "grad_norm": 0.7209205031394958, "learning_rate": 6.800000000000001e-05, "loss": 0.0391, "step": 384 }, { "epoch": 16.0, "eval_loss": 1.0191988945007324, "eval_precision": 0.8054774089256848, "eval_runtime": 0.0952, "eval_samples_per_second": 2288.923, "eval_steps_per_second": 41.999, "step": 384 }, { "epoch": 17.0, "grad_norm": 20.729230880737305, "learning_rate": 6.6e-05, "loss": 0.0182, "step": 408 }, { "epoch": 17.0, "eval_loss": 1.0303993225097656, "eval_precision": 0.7919336219336219, "eval_runtime": 0.0937, "eval_samples_per_second": 2326.132, "eval_steps_per_second": 42.681, "step": 408 }, { "epoch": 18.0, "grad_norm": 8.232949256896973, "learning_rate": 6.400000000000001e-05, "loss": 0.0049, "step": 432 }, { "epoch": 18.0, "eval_loss": 1.0636919736862183, "eval_precision": 0.8149147286821705, "eval_runtime": 0.0958, "eval_samples_per_second": 2274.483, "eval_steps_per_second": 41.734, "step": 432 }, { "epoch": 19.0, "grad_norm": 0.012149515561759472, "learning_rate": 6.2e-05, "loss": 0.0076, "step": 456 }, { "epoch": 19.0, "eval_loss": 1.2673149108886719, "eval_precision": 0.8303134327276478, "eval_runtime": 0.1127, "eval_samples_per_second": 1935.056, "eval_steps_per_second": 35.506, "step": 456 }, { "epoch": 20.0, "grad_norm": 0.1371331363916397, "learning_rate": 6e-05, "loss": 0.0142, "step": 480 }, { "epoch": 20.0, "eval_loss": 1.092652440071106, "eval_precision": 0.8361016889514425, "eval_runtime": 0.0946, "eval_samples_per_second": 2303.344, "eval_steps_per_second": 42.263, "step": 480 }, { "epoch": 21.0, "grad_norm": 8.097236633300781, "learning_rate": 5.8e-05, "loss": 0.0069, "step": 504 }, { "epoch": 21.0, "eval_loss": 1.0935724973678589, "eval_precision": 0.8423427172287354, "eval_runtime": 0.0937, "eval_samples_per_second": 2326.523, "eval_steps_per_second": 42.688, "step": 504 }, { "epoch": 22.0, "grad_norm": Infinity, "learning_rate": 5.608333333333333e-05, "loss": 0.0058, "step": 528 }, { "epoch": 22.0, "eval_loss": 1.1121009588241577, "eval_precision": 0.8411591093421643, "eval_runtime": 0.096, "eval_samples_per_second": 2270.761, "eval_steps_per_second": 41.665, "step": 528 }, { "epoch": 23.0, "grad_norm": 0.004105714615434408, "learning_rate": 5.4083333333333345e-05, "loss": 0.0005, "step": 552 }, { "epoch": 23.0, "eval_loss": 1.0889389514923096, "eval_precision": 0.8351062801932368, "eval_runtime": 0.097, "eval_samples_per_second": 2247.702, "eval_steps_per_second": 41.242, "step": 552 }, { "epoch": 24.0, "grad_norm": 0.009472750127315521, "learning_rate": 5.208333333333334e-05, "loss": 0.0004, "step": 576 }, { "epoch": 24.0, "eval_loss": 1.1021697521209717, "eval_precision": 0.8396207338776763, "eval_runtime": 0.0965, "eval_samples_per_second": 2258.054, "eval_steps_per_second": 41.432, "step": 576 }, { "epoch": 25.0, "grad_norm": 0.005111050326377153, "learning_rate": 5.0083333333333335e-05, "loss": 0.0014, "step": 600 }, { "epoch": 25.0, "eval_loss": 1.1365474462509155, "eval_precision": 0.8312554219698844, "eval_runtime": 0.0953, "eval_samples_per_second": 2287.171, "eval_steps_per_second": 41.966, "step": 600 }, { "epoch": 26.0, "grad_norm": 0.00251354300417006, "learning_rate": 4.8083333333333334e-05, "loss": 0.0003, "step": 624 }, { "epoch": 26.0, "eval_loss": 1.0676130056381226, "eval_precision": 0.8335021266718321, "eval_runtime": 0.0946, "eval_samples_per_second": 2305.411, "eval_steps_per_second": 42.301, "step": 624 }, { "epoch": 27.0, "grad_norm": 0.0027927160263061523, "learning_rate": 4.608333333333333e-05, "loss": 0.0002, "step": 648 }, { "epoch": 27.0, "eval_loss": 1.072741985321045, "eval_precision": 0.844591186815656, "eval_runtime": 0.0939, "eval_samples_per_second": 2321.225, "eval_steps_per_second": 42.591, "step": 648 }, { "epoch": 28.0, "grad_norm": 0.005234727635979652, "learning_rate": 4.408333333333334e-05, "loss": 0.0002, "step": 672 }, { "epoch": 28.0, "eval_loss": 1.077904462814331, "eval_precision": 0.8439559596053445, "eval_runtime": 0.0928, "eval_samples_per_second": 2348.513, "eval_steps_per_second": 43.092, "step": 672 }, { "epoch": 29.0, "grad_norm": 0.002270384691655636, "learning_rate": 4.208333333333334e-05, "loss": 0.0002, "step": 696 }, { "epoch": 29.0, "eval_loss": 1.0991228818893433, "eval_precision": 0.8379138895347336, "eval_runtime": 0.0946, "eval_samples_per_second": 2304.261, "eval_steps_per_second": 42.28, "step": 696 }, { "epoch": 30.0, "grad_norm": 0.001969350501894951, "learning_rate": 4.0083333333333336e-05, "loss": 0.0003, "step": 720 }, { "epoch": 30.0, "eval_loss": 1.1093668937683105, "eval_precision": 0.8355436037933597, "eval_runtime": 0.0943, "eval_samples_per_second": 2311.893, "eval_steps_per_second": 42.42, "step": 720 }, { "epoch": 31.0, "grad_norm": 0.0020132025238126516, "learning_rate": 3.8083333333333335e-05, "loss": 0.0005, "step": 744 }, { "epoch": 31.0, "eval_loss": 1.1845322847366333, "eval_precision": 0.8258347143415434, "eval_runtime": 0.0971, "eval_samples_per_second": 2244.683, "eval_steps_per_second": 41.187, "step": 744 }, { "epoch": 32.0, "grad_norm": 0.003815974574536085, "learning_rate": 3.6083333333333334e-05, "loss": 0.0003, "step": 768 }, { "epoch": 32.0, "eval_loss": 1.2152763605117798, "eval_precision": 0.8261568668553261, "eval_runtime": 0.0957, "eval_samples_per_second": 2277.719, "eval_steps_per_second": 41.793, "step": 768 }, { "epoch": 33.0, "grad_norm": 0.0018600963521748781, "learning_rate": 3.408333333333333e-05, "loss": 0.0014, "step": 792 }, { "epoch": 33.0, "eval_loss": 1.3052575588226318, "eval_precision": 0.8209558890904759, "eval_runtime": 0.0948, "eval_samples_per_second": 2298.567, "eval_steps_per_second": 42.176, "step": 792 }, { "epoch": 34.0, "grad_norm": 0.002450386295095086, "learning_rate": 3.208333333333334e-05, "loss": 0.0002, "step": 816 }, { "epoch": 34.0, "eval_loss": 1.3422256708145142, "eval_precision": 0.826399558034305, "eval_runtime": 0.094, "eval_samples_per_second": 2320.135, "eval_steps_per_second": 42.571, "step": 816 }, { "epoch": 35.0, "grad_norm": 0.0035078648943454027, "learning_rate": 3.0083333333333337e-05, "loss": 0.0005, "step": 840 }, { "epoch": 35.0, "eval_loss": 1.1371830701828003, "eval_precision": 0.8434633760480936, "eval_runtime": 0.1007, "eval_samples_per_second": 2165.171, "eval_steps_per_second": 39.728, "step": 840 }, { "epoch": 36.0, "grad_norm": 0.0018654108280315995, "learning_rate": 2.8083333333333333e-05, "loss": 0.0002, "step": 864 }, { "epoch": 36.0, "eval_loss": 1.3277233839035034, "eval_precision": 0.8193836477987422, "eval_runtime": 0.0946, "eval_samples_per_second": 2303.274, "eval_steps_per_second": 42.262, "step": 864 }, { "epoch": 37.0, "grad_norm": 0.0041082086972892284, "learning_rate": 2.608333333333333e-05, "loss": 0.0002, "step": 888 }, { "epoch": 37.0, "eval_loss": 1.2366803884506226, "eval_precision": 0.8155238095238095, "eval_runtime": 0.0989, "eval_samples_per_second": 2204.245, "eval_steps_per_second": 40.445, "step": 888 }, { "epoch": 38.0, "grad_norm": 0.0014091773191466928, "learning_rate": 2.4083333333333337e-05, "loss": 0.0002, "step": 912 }, { "epoch": 38.0, "eval_loss": 1.1948307752609253, "eval_precision": 0.8354469023991407, "eval_runtime": 0.0945, "eval_samples_per_second": 2306.219, "eval_steps_per_second": 42.316, "step": 912 }, { "epoch": 39.0, "grad_norm": 0.001613400294445455, "learning_rate": 2.2083333333333333e-05, "loss": 0.0002, "step": 936 }, { "epoch": 39.0, "eval_loss": 1.2023606300354004, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0968, "eval_samples_per_second": 2251.315, "eval_steps_per_second": 41.309, "step": 936 }, { "epoch": 40.0, "grad_norm": 0.0020071123726665974, "learning_rate": 2.0083333333333335e-05, "loss": 0.0001, "step": 960 }, { "epoch": 40.0, "eval_loss": 1.207884430885315, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0951, "eval_samples_per_second": 2292.665, "eval_steps_per_second": 42.067, "step": 960 }, { "epoch": 41.0, "grad_norm": 0.0017131684580817819, "learning_rate": 1.8083333333333337e-05, "loss": 0.0001, "step": 984 }, { "epoch": 41.0, "eval_loss": 1.2119500637054443, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0955, "eval_samples_per_second": 2283.082, "eval_steps_per_second": 41.891, "step": 984 }, { "epoch": 42.0, "grad_norm": 0.002312659751623869, "learning_rate": 1.6083333333333332e-05, "loss": 0.0001, "step": 1008 }, { "epoch": 42.0, "eval_loss": 1.2135387659072876, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0966, "eval_samples_per_second": 2257.875, "eval_steps_per_second": 41.429, "step": 1008 }, { "epoch": 43.0, "grad_norm": 0.0015511205419898033, "learning_rate": 1.4083333333333335e-05, "loss": 0.0001, "step": 1032 }, { "epoch": 43.0, "eval_loss": 1.211680293083191, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0942, "eval_samples_per_second": 2313.191, "eval_steps_per_second": 42.444, "step": 1032 }, { "epoch": 44.0, "grad_norm": 0.0013854044955223799, "learning_rate": 1.2083333333333333e-05, "loss": 0.0001, "step": 1056 }, { "epoch": 44.0, "eval_loss": 1.2127768993377686, "eval_precision": 0.8334932604517856, "eval_runtime": 0.095, "eval_samples_per_second": 2295.796, "eval_steps_per_second": 42.125, "step": 1056 }, { "epoch": 45.0, "grad_norm": 0.0025542026851326227, "learning_rate": 1.0083333333333334e-05, "loss": 0.0001, "step": 1080 }, { "epoch": 45.0, "eval_loss": 1.2139766216278076, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0993, "eval_samples_per_second": 2194.379, "eval_steps_per_second": 40.264, "step": 1080 }, { "epoch": 46.0, "grad_norm": 0.0012550665996968746, "learning_rate": 8.083333333333333e-06, "loss": 0.0001, "step": 1104 }, { "epoch": 46.0, "eval_loss": 1.214843511581421, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0971, "eval_samples_per_second": 2244.209, "eval_steps_per_second": 41.178, "step": 1104 }, { "epoch": 47.0, "grad_norm": 0.0018021941650658846, "learning_rate": 6.083333333333334e-06, "loss": 0.0001, "step": 1128 }, { "epoch": 47.0, "eval_loss": 1.2158193588256836, "eval_precision": 0.8334932604517856, "eval_runtime": 0.0948, "eval_samples_per_second": 2299.347, "eval_steps_per_second": 42.19, "step": 1128 } ], "logging_steps": 500, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1739124586046544.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }