{ "best_metric": 0.2019248753786087, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 3.0247349823321557, "eval_steps": 25, "global_step": 107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028268551236749116, "grad_norm": 0.1496291607618332, "learning_rate": 2.5e-05, "loss": 0.2719, "step": 1 }, { "epoch": 0.028268551236749116, "eval_loss": 0.33758026361465454, "eval_runtime": 2.2677, "eval_samples_per_second": 22.049, "eval_steps_per_second": 5.733, "step": 1 }, { "epoch": 0.05653710247349823, "grad_norm": 0.19203834235668182, "learning_rate": 5e-05, "loss": 0.3043, "step": 2 }, { "epoch": 0.08480565371024736, "grad_norm": 0.1976221799850464, "learning_rate": 7.500000000000001e-05, "loss": 0.3131, "step": 3 }, { "epoch": 0.11307420494699646, "grad_norm": 0.18888598680496216, "learning_rate": 0.0001, "loss": 0.3167, "step": 4 }, { "epoch": 0.1413427561837456, "grad_norm": 0.16196908056735992, "learning_rate": 9.997906976305083e-05, "loss": 0.3673, "step": 5 }, { "epoch": 0.1696113074204947, "grad_norm": 0.13969330489635468, "learning_rate": 9.991629852219523e-05, "loss": 0.3187, "step": 6 }, { "epoch": 0.1978798586572438, "grad_norm": 0.16544975340366364, "learning_rate": 9.981174466929743e-05, "loss": 0.3186, "step": 7 }, { "epoch": 0.22614840989399293, "grad_norm": 0.1924961507320404, "learning_rate": 9.966550546377587e-05, "loss": 0.3365, "step": 8 }, { "epoch": 0.254416961130742, "grad_norm": 0.12072353810071945, "learning_rate": 9.947771694212933e-05, "loss": 0.2508, "step": 9 }, { "epoch": 0.2826855123674912, "grad_norm": 0.12432608753442764, "learning_rate": 9.924855379139136e-05, "loss": 0.2776, "step": 10 }, { "epoch": 0.31095406360424027, "grad_norm": 0.10128191113471985, "learning_rate": 9.897822918663062e-05, "loss": 0.2447, "step": 11 }, { "epoch": 0.3392226148409894, "grad_norm": 0.08782458305358887, "learning_rate": 9.866699459264848e-05, "loss": 0.2468, "step": 12 }, { "epoch": 0.3674911660777385, "grad_norm": 0.07989340275526047, "learning_rate": 9.831513953005823e-05, "loss": 0.254, "step": 13 }, { "epoch": 0.3957597173144876, "grad_norm": 0.07268672436475754, "learning_rate": 9.792299130596348e-05, "loss": 0.2531, "step": 14 }, { "epoch": 0.42402826855123676, "grad_norm": 0.0746292918920517, "learning_rate": 9.749091470948645e-05, "loss": 0.2708, "step": 15 }, { "epoch": 0.45229681978798586, "grad_norm": 0.07829014211893082, "learning_rate": 9.70193116724291e-05, "loss": 0.2828, "step": 16 }, { "epoch": 0.48056537102473496, "grad_norm": 0.09487626701593399, "learning_rate": 9.650862089538307e-05, "loss": 0.3013, "step": 17 }, { "epoch": 0.508833922261484, "grad_norm": 0.06753429770469666, "learning_rate": 9.595931743963597e-05, "loss": 0.2171, "step": 18 }, { "epoch": 0.5371024734982333, "grad_norm": 0.07974890619516373, "learning_rate": 9.537191228525384e-05, "loss": 0.2098, "step": 19 }, { "epoch": 0.5653710247349824, "grad_norm": 0.07332316040992737, "learning_rate": 9.474695185575073e-05, "loss": 0.2234, "step": 20 }, { "epoch": 0.5936395759717314, "grad_norm": 0.06308283656835556, "learning_rate": 9.408501750978769e-05, "loss": 0.2219, "step": 21 }, { "epoch": 0.6219081272084805, "grad_norm": 0.07921251654624939, "learning_rate": 9.338672500037388e-05, "loss": 0.237, "step": 22 }, { "epoch": 0.6501766784452296, "grad_norm": 0.0856022983789444, "learning_rate": 9.26527239020729e-05, "loss": 0.2579, "step": 23 }, { "epoch": 0.6784452296819788, "grad_norm": 0.0858432948589325, "learning_rate": 9.188369700674736e-05, "loss": 0.2633, "step": 24 }, { "epoch": 0.7067137809187279, "grad_norm": 0.08091321587562561, "learning_rate": 9.108035968840348e-05, "loss": 0.2571, "step": 25 }, { "epoch": 0.7067137809187279, "eval_loss": 0.22624744474887848, "eval_runtime": 2.3489, "eval_samples_per_second": 21.286, "eval_steps_per_second": 5.534, "step": 25 }, { "epoch": 0.734982332155477, "grad_norm": 0.09223099052906036, "learning_rate": 9.024345923772673e-05, "loss": 0.2829, "step": 26 }, { "epoch": 0.7632508833922261, "grad_norm": 0.06381526589393616, "learning_rate": 8.937377416692753e-05, "loss": 0.1954, "step": 27 }, { "epoch": 0.7915194346289752, "grad_norm": 0.0784890204668045, "learning_rate": 8.847211348554383e-05, "loss": 0.2135, "step": 28 }, { "epoch": 0.8197879858657244, "grad_norm": 0.08194228261709213, "learning_rate": 8.753931594787381e-05, "loss": 0.2163, "step": 29 }, { "epoch": 0.8480565371024735, "grad_norm": 0.07212778180837631, "learning_rate": 8.65762492727392e-05, "loss": 0.2276, "step": 30 }, { "epoch": 0.8763250883392226, "grad_norm": 0.07101049274206161, "learning_rate": 8.558380933630481e-05, "loss": 0.2567, "step": 31 }, { "epoch": 0.9045936395759717, "grad_norm": 0.06946881860494614, "learning_rate": 8.456291933870523e-05, "loss": 0.2489, "step": 32 }, { "epoch": 0.9328621908127208, "grad_norm": 0.08646480739116669, "learning_rate": 8.351452894525369e-05, "loss": 0.2449, "step": 33 }, { "epoch": 0.9611307420494699, "grad_norm": 0.09382244199514389, "learning_rate": 8.243961340303246e-05, "loss": 0.2713, "step": 34 }, { "epoch": 0.9893992932862191, "grad_norm": 0.10727506130933762, "learning_rate": 8.13391726336859e-05, "loss": 0.2837, "step": 35 }, { "epoch": 1.017667844522968, "grad_norm": 0.11025042831897736, "learning_rate": 8.021423030326076e-05, "loss": 0.3322, "step": 36 }, { "epoch": 1.0459363957597174, "grad_norm": 0.05836005136370659, "learning_rate": 7.906583286995835e-05, "loss": 0.1968, "step": 37 }, { "epoch": 1.0742049469964665, "grad_norm": 0.06407664716243744, "learning_rate": 7.789504861068493e-05, "loss": 0.1979, "step": 38 }, { "epoch": 1.1024734982332156, "grad_norm": 0.07625511288642883, "learning_rate": 7.670296662730553e-05, "loss": 0.2166, "step": 39 }, { "epoch": 1.1307420494699647, "grad_norm": 0.0838479995727539, "learning_rate": 7.54906958335257e-05, "loss": 0.2416, "step": 40 }, { "epoch": 1.1590106007067138, "grad_norm": 0.08144625276327133, "learning_rate": 7.425936392334369e-05, "loss": 0.2472, "step": 41 }, { "epoch": 1.187279151943463, "grad_norm": 0.06474898755550385, "learning_rate": 7.301011632203251e-05, "loss": 0.214, "step": 42 }, { "epoch": 1.215547703180212, "grad_norm": 0.0715680941939354, "learning_rate": 7.17441151206279e-05, "loss": 0.2439, "step": 43 }, { "epoch": 1.243816254416961, "grad_norm": 0.0650620236992836, "learning_rate": 7.046253799491311e-05, "loss": 0.1902, "step": 44 }, { "epoch": 1.2720848056537102, "grad_norm": 0.12278559058904648, "learning_rate": 6.916657710990633e-05, "loss": 0.2531, "step": 45 }, { "epoch": 1.3003533568904593, "grad_norm": 0.07810261845588684, "learning_rate": 6.785743801086981e-05, "loss": 0.2229, "step": 46 }, { "epoch": 1.3286219081272086, "grad_norm": 0.06839544326066971, "learning_rate": 6.653633850187212e-05, "loss": 0.2014, "step": 47 }, { "epoch": 1.3568904593639575, "grad_norm": 0.07193570584058762, "learning_rate": 6.520450751294685e-05, "loss": 0.2103, "step": 48 }, { "epoch": 1.3851590106007068, "grad_norm": 0.09183719009160995, "learning_rate": 6.386318395690179e-05, "loss": 0.2542, "step": 49 }, { "epoch": 1.4134275618374559, "grad_norm": 0.09221762418746948, "learning_rate": 6.25136155768415e-05, "loss": 0.2299, "step": 50 }, { "epoch": 1.4134275618374559, "eval_loss": 0.21029320359230042, "eval_runtime": 2.4406, "eval_samples_per_second": 20.487, "eval_steps_per_second": 5.327, "step": 50 }, { "epoch": 1.441696113074205, "grad_norm": 0.10233300924301147, "learning_rate": 6.115705778547597e-05, "loss": 0.238, "step": 51 }, { "epoch": 1.469964664310954, "grad_norm": 0.08884930610656738, "learning_rate": 5.979477249729443e-05, "loss": 0.2443, "step": 52 }, { "epoch": 1.4982332155477032, "grad_norm": 0.06135065108537674, "learning_rate": 5.842802695469132e-05, "loss": 0.1793, "step": 53 }, { "epoch": 1.5265017667844523, "grad_norm": 0.0772595927119255, "learning_rate": 5.705809254913577e-05, "loss": 0.2317, "step": 54 }, { "epoch": 1.5547703180212014, "grad_norm": 0.07231206446886063, "learning_rate": 5.568624363848167e-05, "loss": 0.1857, "step": 55 }, { "epoch": 1.5830388692579507, "grad_norm": 0.07848239690065384, "learning_rate": 5.431375636151834e-05, "loss": 0.2049, "step": 56 }, { "epoch": 1.6113074204946995, "grad_norm": 0.07845490425825119, "learning_rate": 5.294190745086426e-05, "loss": 0.2081, "step": 57 }, { "epoch": 1.6395759717314489, "grad_norm": 0.0769682452082634, "learning_rate": 5.15719730453087e-05, "loss": 0.2242, "step": 58 }, { "epoch": 1.6678445229681977, "grad_norm": 0.07719448208808899, "learning_rate": 5.020522750270559e-05, "loss": 0.2138, "step": 59 }, { "epoch": 1.696113074204947, "grad_norm": 0.08276604861021042, "learning_rate": 4.884294221452406e-05, "loss": 0.2339, "step": 60 }, { "epoch": 1.7243816254416962, "grad_norm": 0.09187748283147812, "learning_rate": 4.7486384423158514e-05, "loss": 0.2585, "step": 61 }, { "epoch": 1.7526501766784452, "grad_norm": 0.06316478550434113, "learning_rate": 4.613681604309824e-05, "loss": 0.1936, "step": 62 }, { "epoch": 1.7809187279151943, "grad_norm": 0.06568682938814163, "learning_rate": 4.479549248705316e-05, "loss": 0.1895, "step": 63 }, { "epoch": 1.8091872791519434, "grad_norm": 0.07526635378599167, "learning_rate": 4.346366149812791e-05, "loss": 0.1954, "step": 64 }, { "epoch": 1.8374558303886925, "grad_norm": 0.07216266542673111, "learning_rate": 4.2142561989130204e-05, "loss": 0.1972, "step": 65 }, { "epoch": 1.8657243816254416, "grad_norm": 0.07032765448093414, "learning_rate": 4.0833422890093684e-05, "loss": 0.1875, "step": 66 }, { "epoch": 1.893992932862191, "grad_norm": 0.0753110945224762, "learning_rate": 3.9537462005086936e-05, "loss": 0.2326, "step": 67 }, { "epoch": 1.9222614840989398, "grad_norm": 0.07524023950099945, "learning_rate": 3.825588487937212e-05, "loss": 0.235, "step": 68 }, { "epoch": 1.9505300353356891, "grad_norm": 0.0842115730047226, "learning_rate": 3.6989883677967485e-05, "loss": 0.2449, "step": 69 }, { "epoch": 1.978798586572438, "grad_norm": 0.10185158252716064, "learning_rate": 3.574063607665633e-05, "loss": 0.2608, "step": 70 }, { "epoch": 2.0070671378091873, "grad_norm": 0.14404942095279694, "learning_rate": 3.450930416647429e-05, "loss": 0.3676, "step": 71 }, { "epoch": 2.035335689045936, "grad_norm": 0.060068968683481216, "learning_rate": 3.3297033372694477e-05, "loss": 0.1747, "step": 72 }, { "epoch": 2.0636042402826855, "grad_norm": 0.06130439415574074, "learning_rate": 3.2104951389315077e-05, "loss": 0.2236, "step": 73 }, { "epoch": 2.091872791519435, "grad_norm": 0.06732094287872314, "learning_rate": 3.093416713004167e-05, "loss": 0.1928, "step": 74 }, { "epoch": 2.1201413427561837, "grad_norm": 0.07202989608049393, "learning_rate": 2.9785769696739264e-05, "loss": 0.2016, "step": 75 }, { "epoch": 2.1201413427561837, "eval_loss": 0.20411866903305054, "eval_runtime": 2.3425, "eval_samples_per_second": 21.345, "eval_steps_per_second": 5.55, "step": 75 }, { "epoch": 2.148409893992933, "grad_norm": 0.08088778704404831, "learning_rate": 2.86608273663141e-05, "loss": 0.2475, "step": 76 }, { "epoch": 2.176678445229682, "grad_norm": 0.07822652906179428, "learning_rate": 2.7560386596967557e-05, "loss": 0.2136, "step": 77 }, { "epoch": 2.204946996466431, "grad_norm": 0.080837681889534, "learning_rate": 2.6485471054746318e-05, "loss": 0.2256, "step": 78 }, { "epoch": 2.23321554770318, "grad_norm": 0.07773551344871521, "learning_rate": 2.5437080661294786e-05, "loss": 0.2158, "step": 79 }, { "epoch": 2.2614840989399294, "grad_norm": 0.07530491799116135, "learning_rate": 2.4416190663695194e-05, "loss": 0.206, "step": 80 }, { "epoch": 2.2897526501766783, "grad_norm": 0.07282233983278275, "learning_rate": 2.3423750727260816e-05, "loss": 0.1726, "step": 81 }, { "epoch": 2.3180212014134276, "grad_norm": 0.0700765922665596, "learning_rate": 2.2460684052126197e-05, "loss": 0.1814, "step": 82 }, { "epoch": 2.3462897526501765, "grad_norm": 0.06933876872062683, "learning_rate": 2.152788651445618e-05, "loss": 0.1945, "step": 83 }, { "epoch": 2.374558303886926, "grad_norm": 0.06883740425109863, "learning_rate": 2.0626225833072487e-05, "loss": 0.2101, "step": 84 }, { "epoch": 2.402826855123675, "grad_norm": 0.06986892968416214, "learning_rate": 1.97565407622733e-05, "loss": 0.1994, "step": 85 }, { "epoch": 2.431095406360424, "grad_norm": 0.07742994278669357, "learning_rate": 1.891964031159653e-05, "loss": 0.2147, "step": 86 }, { "epoch": 2.4593639575971733, "grad_norm": 0.08387456834316254, "learning_rate": 1.8116302993252637e-05, "loss": 0.2197, "step": 87 }, { "epoch": 2.487632508833922, "grad_norm": 0.08557935804128647, "learning_rate": 1.7347276097927105e-05, "loss": 0.2097, "step": 88 }, { "epoch": 2.5159010600706715, "grad_norm": 0.06514272093772888, "learning_rate": 1.6613274999626137e-05, "loss": 0.2136, "step": 89 }, { "epoch": 2.5441696113074204, "grad_norm": 0.05643463507294655, "learning_rate": 1.5914982490212312e-05, "loss": 0.1907, "step": 90 }, { "epoch": 2.5724381625441697, "grad_norm": 0.06008900701999664, "learning_rate": 1.5253048144249275e-05, "loss": 0.1835, "step": 91 }, { "epoch": 2.6007067137809186, "grad_norm": 0.06434128433465958, "learning_rate": 1.4628087714746172e-05, "loss": 0.189, "step": 92 }, { "epoch": 2.628975265017668, "grad_norm": 0.07678020000457764, "learning_rate": 1.4040682560364033e-05, "loss": 0.2275, "step": 93 }, { "epoch": 2.657243816254417, "grad_norm": 0.07552898675203323, "learning_rate": 1.3491379104616938e-05, "loss": 0.2218, "step": 94 }, { "epoch": 2.685512367491166, "grad_norm": 0.0758337527513504, "learning_rate": 1.2980688327570905e-05, "loss": 0.2201, "step": 95 }, { "epoch": 2.713780918727915, "grad_norm": 0.08446727693080902, "learning_rate": 1.2509085290513564e-05, "loss": 0.2429, "step": 96 }, { "epoch": 2.7420494699646643, "grad_norm": 0.08306104689836502, "learning_rate": 1.2077008694036528e-05, "loss": 0.2056, "step": 97 }, { "epoch": 2.7703180212014136, "grad_norm": 0.0664680078625679, "learning_rate": 1.1684860469941786e-05, "loss": 0.1927, "step": 98 }, { "epoch": 2.7985865724381624, "grad_norm": 0.06700006872415543, "learning_rate": 1.1333005407351517e-05, "loss": 0.1953, "step": 99 }, { "epoch": 2.8268551236749118, "grad_norm": 0.06202859431505203, "learning_rate": 1.1021770813369377e-05, "loss": 0.1882, "step": 100 }, { "epoch": 2.8268551236749118, "eval_loss": 0.2019248753786087, "eval_runtime": 2.346, "eval_samples_per_second": 21.313, "eval_steps_per_second": 5.541, "step": 100 }, { "epoch": 2.8551236749116606, "grad_norm": 0.06783071905374527, "learning_rate": 1.0751446208608642e-05, "loss": 0.2021, "step": 101 }, { "epoch": 2.88339222614841, "grad_norm": 0.06956081092357635, "learning_rate": 1.0522283057870676e-05, "loss": 0.2095, "step": 102 }, { "epoch": 2.9116607773851593, "grad_norm": 0.07598631829023361, "learning_rate": 1.0334494536224147e-05, "loss": 0.2283, "step": 103 }, { "epoch": 2.939929328621908, "grad_norm": 0.07840535044670105, "learning_rate": 1.0188255330702585e-05, "loss": 0.2234, "step": 104 }, { "epoch": 2.968197879858657, "grad_norm": 0.08693535625934601, "learning_rate": 1.008370147780478e-05, "loss": 0.2282, "step": 105 }, { "epoch": 2.9964664310954063, "grad_norm": 0.12355560064315796, "learning_rate": 1.0020930236949183e-05, "loss": 0.3288, "step": 106 }, { "epoch": 3.0247349823321557, "grad_norm": 0.06719771027565002, "learning_rate": 1e-05, "loss": 0.2029, "step": 107 } ], "logging_steps": 1, "max_steps": 107, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 30, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4028515042797814e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }