{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9581589958158996, "eval_steps": 59, "global_step": 118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 0.3126992881298065, "learning_rate": 2e-05, "loss": 2.4388, "step": 1 }, { "epoch": 0.016736401673640166, "eval_loss": 2.2736361026763916, "eval_runtime": 44.9772, "eval_samples_per_second": 37.419, "eval_steps_per_second": 0.8, "step": 1 }, { "epoch": 0.03347280334728033, "grad_norm": 0.31262004375457764, "learning_rate": 4e-05, "loss": 2.3915, "step": 2 }, { "epoch": 0.0502092050209205, "grad_norm": 0.3161173462867737, "learning_rate": 6e-05, "loss": 2.4118, "step": 3 }, { "epoch": 0.06694560669456066, "grad_norm": 0.32605665922164917, "learning_rate": 8e-05, "loss": 2.3872, "step": 4 }, { "epoch": 0.08368200836820083, "grad_norm": 0.32243478298187256, "learning_rate": 0.0001, "loss": 2.3368, "step": 5 }, { "epoch": 0.100418410041841, "grad_norm": 0.3141494393348694, "learning_rate": 0.00012, "loss": 2.2966, "step": 6 }, { "epoch": 0.11715481171548117, "grad_norm": 0.330599308013916, "learning_rate": 0.00014, "loss": 2.2379, "step": 7 }, { "epoch": 0.13389121338912133, "grad_norm": 0.33452969789505005, "learning_rate": 0.00016, "loss": 2.1949, "step": 8 }, { "epoch": 0.1506276150627615, "grad_norm": 0.2835342288017273, "learning_rate": 0.00018, "loss": 2.156, "step": 9 }, { "epoch": 0.16736401673640167, "grad_norm": 0.23091256618499756, "learning_rate": 0.0002, "loss": 2.1109, "step": 10 }, { "epoch": 0.18410041841004185, "grad_norm": 0.2284015268087387, "learning_rate": 0.00019999033847063811, "loss": 2.0782, "step": 11 }, { "epoch": 0.200836820083682, "grad_norm": 0.23981927335262299, "learning_rate": 0.00019996135574945544, "loss": 2.053, "step": 12 }, { "epoch": 0.2175732217573222, "grad_norm": 0.23807936906814575, "learning_rate": 0.00019991305743680013, "loss": 2.0294, "step": 13 }, { "epoch": 0.23430962343096234, "grad_norm": 0.20766477286815643, "learning_rate": 0.0001998454528653836, "loss": 1.9947, "step": 14 }, { "epoch": 0.2510460251046025, "grad_norm": 0.1892593652009964, "learning_rate": 0.00019975855509847686, "loss": 1.9643, "step": 15 }, { "epoch": 0.26778242677824265, "grad_norm": 0.19022946059703827, "learning_rate": 0.00019965238092738643, "loss": 1.9275, "step": 16 }, { "epoch": 0.28451882845188287, "grad_norm": 0.18370254337787628, "learning_rate": 0.00019952695086820975, "loss": 1.9512, "step": 17 }, { "epoch": 0.301255230125523, "grad_norm": 0.16774943470954895, "learning_rate": 0.0001993822891578708, "loss": 1.8883, "step": 18 }, { "epoch": 0.3179916317991632, "grad_norm": 0.17572854459285736, "learning_rate": 0.0001992184237494368, "loss": 1.938, "step": 19 }, { "epoch": 0.33472803347280333, "grad_norm": 0.21952244639396667, "learning_rate": 0.0001990353863067169, "loss": 1.8937, "step": 20 }, { "epoch": 0.3514644351464435, "grad_norm": 0.2579434812068939, "learning_rate": 0.0001988332121981436, "loss": 1.8797, "step": 21 }, { "epoch": 0.3682008368200837, "grad_norm": 0.29942768812179565, "learning_rate": 0.00019861194048993863, "loss": 1.8766, "step": 22 }, { "epoch": 0.38493723849372385, "grad_norm": 0.326593279838562, "learning_rate": 0.0001983716139385641, "loss": 1.8567, "step": 23 }, { "epoch": 0.401673640167364, "grad_norm": 0.2769831418991089, "learning_rate": 0.0001981122789824607, "loss": 1.839, "step": 24 }, { "epoch": 0.41841004184100417, "grad_norm": 0.2902805507183075, "learning_rate": 0.00019783398573307428, "loss": 1.8622, "step": 25 }, { "epoch": 0.4351464435146444, "grad_norm": 0.2152428776025772, "learning_rate": 0.00019753678796517282, "loss": 1.8393, "step": 26 }, { "epoch": 0.45188284518828453, "grad_norm": 0.1541067361831665, "learning_rate": 0.00019722074310645553, "loss": 1.8326, "step": 27 }, { "epoch": 0.4686192468619247, "grad_norm": 0.12413746118545532, "learning_rate": 0.00019688591222645607, "loss": 1.8079, "step": 28 }, { "epoch": 0.48535564853556484, "grad_norm": 0.11909659206867218, "learning_rate": 0.000196532360024742, "loss": 1.8044, "step": 29 }, { "epoch": 0.502092050209205, "grad_norm": 0.16260313987731934, "learning_rate": 0.0001961601548184129, "loss": 1.7888, "step": 30 }, { "epoch": 0.5188284518828452, "grad_norm": 0.17894335091114044, "learning_rate": 0.00019576936852889936, "loss": 1.8263, "step": 31 }, { "epoch": 0.5355648535564853, "grad_norm": 0.21706554293632507, "learning_rate": 0.00019536007666806556, "loss": 1.8151, "step": 32 }, { "epoch": 0.5523012552301255, "grad_norm": 0.17296762764453888, "learning_rate": 0.0001949323583236181, "loss": 1.7901, "step": 33 }, { "epoch": 0.5690376569037657, "grad_norm": 0.1761002093553543, "learning_rate": 0.0001944862961438239, "loss": 1.8126, "step": 34 }, { "epoch": 0.5857740585774058, "grad_norm": 0.14914190769195557, "learning_rate": 0.00019402197632153992, "loss": 1.7797, "step": 35 }, { "epoch": 0.602510460251046, "grad_norm": 0.13111546635627747, "learning_rate": 0.00019353948857755803, "loss": 1.7916, "step": 36 }, { "epoch": 0.6192468619246861, "grad_norm": 0.12080994248390198, "learning_rate": 0.00019303892614326836, "loss": 1.7834, "step": 37 }, { "epoch": 0.6359832635983264, "grad_norm": 0.15622606873512268, "learning_rate": 0.00019252038574264405, "loss": 1.794, "step": 38 }, { "epoch": 0.6527196652719666, "grad_norm": 0.1345275640487671, "learning_rate": 0.00019198396757355118, "loss": 1.7554, "step": 39 }, { "epoch": 0.6694560669456067, "grad_norm": 0.18387249112129211, "learning_rate": 0.00019142977528838762, "loss": 1.7675, "step": 40 }, { "epoch": 0.6861924686192469, "grad_norm": 0.165897399187088, "learning_rate": 0.00019085791597405404, "loss": 1.7631, "step": 41 }, { "epoch": 0.702928870292887, "grad_norm": 0.13675889372825623, "learning_rate": 0.00019026850013126157, "loss": 1.754, "step": 42 }, { "epoch": 0.7196652719665272, "grad_norm": 0.16969387233257294, "learning_rate": 0.00018966164165317966, "loss": 1.745, "step": 43 }, { "epoch": 0.7364016736401674, "grad_norm": 0.16098380088806152, "learning_rate": 0.00018903745780342839, "loss": 1.7609, "step": 44 }, { "epoch": 0.7531380753138075, "grad_norm": 0.12338632345199585, "learning_rate": 0.0001883960691934196, "loss": 1.7584, "step": 45 }, { "epoch": 0.7698744769874477, "grad_norm": 0.1335158348083496, "learning_rate": 0.00018773759975905098, "loss": 1.7302, "step": 46 }, { "epoch": 0.7866108786610879, "grad_norm": 0.11967575550079346, "learning_rate": 0.00018706217673675811, "loss": 1.7323, "step": 47 }, { "epoch": 0.803347280334728, "grad_norm": 0.12446475774049759, "learning_rate": 0.0001863699306389282, "loss": 1.7217, "step": 48 }, { "epoch": 0.8200836820083682, "grad_norm": 0.13070392608642578, "learning_rate": 0.00018566099522868119, "loss": 1.7192, "step": 49 }, { "epoch": 0.8368200836820083, "grad_norm": 0.15041972696781158, "learning_rate": 0.00018493550749402278, "loss": 1.7386, "step": 50 }, { "epoch": 0.8535564853556485, "grad_norm": 0.12314204126596451, "learning_rate": 0.00018419360762137395, "loss": 1.7465, "step": 51 }, { "epoch": 0.8702928870292888, "grad_norm": 0.15009309351444244, "learning_rate": 0.00018343543896848273, "loss": 1.7216, "step": 52 }, { "epoch": 0.8870292887029289, "grad_norm": 0.12467087060213089, "learning_rate": 0.00018266114803672318, "loss": 1.7043, "step": 53 }, { "epoch": 0.9037656903765691, "grad_norm": 0.1297266036272049, "learning_rate": 0.00018187088444278674, "loss": 1.7069, "step": 54 }, { "epoch": 0.9205020920502092, "grad_norm": 0.1195509061217308, "learning_rate": 0.00018106480088977172, "loss": 1.7305, "step": 55 }, { "epoch": 0.9372384937238494, "grad_norm": 0.14450602233409882, "learning_rate": 0.00018024305313767646, "loss": 1.7171, "step": 56 }, { "epoch": 0.9539748953974896, "grad_norm": 0.16594989597797394, "learning_rate": 0.00017940579997330165, "loss": 1.7239, "step": 57 }, { "epoch": 0.9707112970711297, "grad_norm": 0.19210928678512573, "learning_rate": 0.00017855320317956784, "loss": 1.7081, "step": 58 }, { "epoch": 0.9874476987447699, "grad_norm": 0.16463987529277802, "learning_rate": 0.00017768542750425426, "loss": 1.7178, "step": 59 }, { "epoch": 0.9874476987447699, "eval_loss": 1.7365927696228027, "eval_runtime": 44.9816, "eval_samples_per_second": 37.415, "eval_steps_per_second": 0.8, "step": 59 }, { "epoch": 1.00418410041841, "grad_norm": 0.13954801857471466, "learning_rate": 0.0001768026406281642, "loss": 1.6905, "step": 60 }, { "epoch": 1.00418410041841, "grad_norm": 0.17747963964939117, "learning_rate": 0.00017590501313272415, "loss": 1.6864, "step": 61 }, { "epoch": 1.0209205020920502, "grad_norm": 0.1328994333744049, "learning_rate": 0.00017499271846702213, "loss": 1.6388, "step": 62 }, { "epoch": 1.0376569037656904, "grad_norm": 0.12811322510242462, "learning_rate": 0.00017406593291429217, "loss": 1.6821, "step": 63 }, { "epoch": 1.0543933054393306, "grad_norm": 0.1379721611738205, "learning_rate": 0.00017312483555785086, "loss": 1.6364, "step": 64 }, { "epoch": 1.0711297071129706, "grad_norm": 0.14301170408725739, "learning_rate": 0.00017216960824649303, "loss": 1.646, "step": 65 }, { "epoch": 1.0878661087866108, "grad_norm": 0.15547165274620056, "learning_rate": 0.00017120043555935298, "loss": 1.6625, "step": 66 }, { "epoch": 1.104602510460251, "grad_norm": 0.14291325211524963, "learning_rate": 0.0001702175047702382, "loss": 1.6392, "step": 67 }, { "epoch": 1.1213389121338913, "grad_norm": 0.14325812458992004, "learning_rate": 0.00016922100581144228, "loss": 1.6677, "step": 68 }, { "epoch": 1.1380753138075315, "grad_norm": 0.14328014850616455, "learning_rate": 0.00016821113123704424, "loss": 1.6182, "step": 69 }, { "epoch": 1.1548117154811715, "grad_norm": 0.16151900589466095, "learning_rate": 0.00016718807618570106, "loss": 1.6492, "step": 70 }, { "epoch": 1.1715481171548117, "grad_norm": 0.15649794042110443, "learning_rate": 0.00016615203834294119, "loss": 1.6334, "step": 71 }, { "epoch": 1.1882845188284519, "grad_norm": 0.15963214635849, "learning_rate": 0.00016510321790296525, "loss": 1.6203, "step": 72 }, { "epoch": 1.205020920502092, "grad_norm": 0.15893195569515228, "learning_rate": 0.00016404181752996289, "loss": 1.6408, "step": 73 }, { "epoch": 1.2217573221757323, "grad_norm": 0.16694918274879456, "learning_rate": 0.00016296804231895142, "loss": 1.633, "step": 74 }, { "epoch": 1.2384937238493725, "grad_norm": 0.15804418921470642, "learning_rate": 0.00016188209975614542, "loss": 1.6548, "step": 75 }, { "epoch": 1.2552301255230125, "grad_norm": 0.14725787937641144, "learning_rate": 0.00016078419967886402, "loss": 1.6158, "step": 76 }, { "epoch": 1.2719665271966527, "grad_norm": 0.15310946106910706, "learning_rate": 0.00015967455423498387, "loss": 1.664, "step": 77 }, { "epoch": 1.288702928870293, "grad_norm": 0.14741982519626617, "learning_rate": 0.00015855337784194577, "loss": 1.6494, "step": 78 }, { "epoch": 1.3054393305439331, "grad_norm": 0.1478704810142517, "learning_rate": 0.00015742088714532247, "loss": 1.6254, "step": 79 }, { "epoch": 1.3221757322175733, "grad_norm": 0.1895337849855423, "learning_rate": 0.00015627730097695638, "loss": 1.6018, "step": 80 }, { "epoch": 1.3389121338912133, "grad_norm": 0.19194994866847992, "learning_rate": 0.00015512284031267437, "loss": 1.6543, "step": 81 }, { "epoch": 1.3556485355648535, "grad_norm": 0.1553422063589096, "learning_rate": 0.00015395772822958845, "loss": 1.6429, "step": 82 }, { "epoch": 1.3723849372384938, "grad_norm": 0.19711142778396606, "learning_rate": 0.00015278218986299074, "loss": 1.624, "step": 83 }, { "epoch": 1.389121338912134, "grad_norm": 0.16961851716041565, "learning_rate": 0.0001515964523628501, "loss": 1.6097, "step": 84 }, { "epoch": 1.4058577405857742, "grad_norm": 0.20063763856887817, "learning_rate": 0.00015040074484992, "loss": 1.6171, "step": 85 }, { "epoch": 1.4225941422594142, "grad_norm": 0.16175565123558044, "learning_rate": 0.00014919529837146528, "loss": 1.6119, "step": 86 }, { "epoch": 1.4393305439330544, "grad_norm": 0.20445561408996582, "learning_rate": 0.00014798034585661695, "loss": 1.6089, "step": 87 }, { "epoch": 1.4560669456066946, "grad_norm": 0.17204363644123077, "learning_rate": 0.0001467561220713628, "loss": 1.6178, "step": 88 }, { "epoch": 1.4728033472803348, "grad_norm": 0.1780082732439041, "learning_rate": 0.0001455228635731839, "loss": 1.6172, "step": 89 }, { "epoch": 1.489539748953975, "grad_norm": 0.17321935296058655, "learning_rate": 0.00014428080866534396, "loss": 1.574, "step": 90 }, { "epoch": 1.506276150627615, "grad_norm": 0.1722816675901413, "learning_rate": 0.00014303019735084226, "loss": 1.6096, "step": 91 }, { "epoch": 1.5230125523012552, "grad_norm": 0.1621088981628418, "learning_rate": 0.00014177127128603745, "loss": 1.603, "step": 92 }, { "epoch": 1.5397489539748954, "grad_norm": 0.17485831677913666, "learning_rate": 0.0001405042737339524, "loss": 1.6105, "step": 93 }, { "epoch": 1.5564853556485354, "grad_norm": 0.16599352657794952, "learning_rate": 0.0001392294495172681, "loss": 1.632, "step": 94 }, { "epoch": 1.5732217573221758, "grad_norm": 0.1569545418024063, "learning_rate": 0.00013794704497101655, "loss": 1.608, "step": 95 }, { "epoch": 1.5899581589958158, "grad_norm": 0.15951111912727356, "learning_rate": 0.0001366573078949813, "loss": 1.5742, "step": 96 }, { "epoch": 1.606694560669456, "grad_norm": 0.1951216161251068, "learning_rate": 0.00013536048750581494, "loss": 1.5908, "step": 97 }, { "epoch": 1.6234309623430963, "grad_norm": 0.20315046608448029, "learning_rate": 0.00013405683438888282, "loss": 1.6081, "step": 98 }, { "epoch": 1.6401673640167362, "grad_norm": 0.16759882867336273, "learning_rate": 0.00013274660044984224, "loss": 1.6041, "step": 99 }, { "epoch": 1.6569037656903767, "grad_norm": 0.18749655783176422, "learning_rate": 0.00013143003886596669, "loss": 1.6002, "step": 100 }, { "epoch": 1.6736401673640167, "grad_norm": 0.1809973418712616, "learning_rate": 0.0001301074040372242, "loss": 1.6328, "step": 101 }, { "epoch": 1.6903765690376569, "grad_norm": 0.20419345796108246, "learning_rate": 0.00012877895153711935, "loss": 1.6017, "step": 102 }, { "epoch": 1.707112970711297, "grad_norm": 0.1926768571138382, "learning_rate": 0.0001274449380633089, "loss": 1.5793, "step": 103 }, { "epoch": 1.723849372384937, "grad_norm": 0.22891242802143097, "learning_rate": 0.00012610562138799978, "loss": 1.6306, "step": 104 }, { "epoch": 1.7405857740585775, "grad_norm": 0.17105843126773834, "learning_rate": 0.00012476126030813963, "loss": 1.5999, "step": 105 }, { "epoch": 1.7573221757322175, "grad_norm": 0.17857873439788818, "learning_rate": 0.0001234121145954094, "loss": 1.5867, "step": 106 }, { "epoch": 1.7740585774058577, "grad_norm": 0.1942574381828308, "learning_rate": 0.0001220584449460274, "loss": 1.5719, "step": 107 }, { "epoch": 1.790794979079498, "grad_norm": 0.18035678565502167, "learning_rate": 0.00012070051293037492, "loss": 1.5783, "step": 108 }, { "epoch": 1.8075313807531381, "grad_norm": 0.18351756036281586, "learning_rate": 0.00011933858094245281, "loss": 1.6121, "step": 109 }, { "epoch": 1.8242677824267783, "grad_norm": 0.17191624641418457, "learning_rate": 0.00011797291214917881, "loss": 1.5801, "step": 110 }, { "epoch": 1.8410041841004183, "grad_norm": 0.19317130744457245, "learning_rate": 0.00011660377043953588, "loss": 1.5928, "step": 111 }, { "epoch": 1.8577405857740585, "grad_norm": 0.1735094040632248, "learning_rate": 0.0001152314203735805, "loss": 1.5782, "step": 112 }, { "epoch": 1.8744769874476988, "grad_norm": 0.1757810264825821, "learning_rate": 0.0001138561271313219, "loss": 1.605, "step": 113 }, { "epoch": 1.891213389121339, "grad_norm": 0.20154766738414764, "learning_rate": 0.00011247815646148087, "loss": 1.5782, "step": 114 }, { "epoch": 1.9079497907949792, "grad_norm": 0.18075552582740784, "learning_rate": 0.00011109777463013915, "loss": 1.5905, "step": 115 }, { "epoch": 1.9246861924686192, "grad_norm": 0.1821986436843872, "learning_rate": 0.0001097152483692886, "loss": 1.5661, "step": 116 }, { "epoch": 1.9414225941422594, "grad_norm": 0.1858905404806137, "learning_rate": 0.00010833084482529048, "loss": 1.6245, "step": 117 }, { "epoch": 1.9581589958158996, "grad_norm": 0.19977439939975739, "learning_rate": 0.00010694483150725458, "loss": 1.5568, "step": 118 }, { "epoch": 1.9581589958158996, "eval_loss": 1.7031370401382446, "eval_runtime": 45.0796, "eval_samples_per_second": 37.334, "eval_steps_per_second": 0.799, "step": 118 } ], "logging_steps": 1, "max_steps": 236, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 59, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.814736962945024e+17, "train_batch_size": 48, "trial_name": null, "trial_params": null }