{ "best_metric": 1.3831464052200317, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.4305415405314497, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021527077026572486, "grad_norm": 27.975635528564453, "learning_rate": 4.9999999999999996e-05, "loss": 52.1874, "step": 1 }, { "epoch": 0.0021527077026572486, "eval_loss": 1.9092251062393188, "eval_runtime": 4.6352, "eval_samples_per_second": 10.787, "eval_steps_per_second": 10.787, "step": 1 }, { "epoch": 0.004305415405314497, "grad_norm": 28.772228240966797, "learning_rate": 9.999999999999999e-05, "loss": 50.8343, "step": 2 }, { "epoch": 0.006458123107971746, "grad_norm": 26.468538284301758, "learning_rate": 0.00015, "loss": 57.1093, "step": 3 }, { "epoch": 0.008610830810628994, "grad_norm": 30.467575073242188, "learning_rate": 0.00019999999999999998, "loss": 53.5437, "step": 4 }, { "epoch": 0.010763538513286243, "grad_norm": 22.272993087768555, "learning_rate": 0.00025, "loss": 52.2886, "step": 5 }, { "epoch": 0.012916246215943492, "grad_norm": 22.904247283935547, "learning_rate": 0.0003, "loss": 58.8652, "step": 6 }, { "epoch": 0.01506895391860074, "grad_norm": 19.842830657958984, "learning_rate": 0.00029998229929486034, "loss": 55.835, "step": 7 }, { "epoch": 0.01722166162125799, "grad_norm": 28.269920349121094, "learning_rate": 0.0002999292018211445, "loss": 52.4249, "step": 8 }, { "epoch": 0.019374369323915237, "grad_norm": 25.87576675415039, "learning_rate": 0.0002998407215027447, "loss": 49.1273, "step": 9 }, { "epoch": 0.021527077026572486, "grad_norm": 27.49983787536621, "learning_rate": 0.00029971688154209106, "loss": 47.7184, "step": 10 }, { "epoch": 0.023679784729229734, "grad_norm": 28.130020141601562, "learning_rate": 0.00029955771441406685, "loss": 49.5591, "step": 11 }, { "epoch": 0.025832492431886983, "grad_norm": 20.761058807373047, "learning_rate": 0.00029936326185749286, "loss": 62.6977, "step": 12 }, { "epoch": 0.02798520013454423, "grad_norm": 24.422517776489258, "learning_rate": 0.00029913357486418196, "loss": 49.0729, "step": 13 }, { "epoch": 0.03013790783720148, "grad_norm": 17.220544815063477, "learning_rate": 0.0002988687136655674, "loss": 58.0586, "step": 14 }, { "epoch": 0.03229061553985873, "grad_norm": 16.4974365234375, "learning_rate": 0.00029856874771690806, "loss": 67.3599, "step": 15 }, { "epoch": 0.03444332324251598, "grad_norm": 17.164011001586914, "learning_rate": 0.0002982337556790752, "loss": 52.5874, "step": 16 }, { "epoch": 0.036596030945173226, "grad_norm": 15.168697357177734, "learning_rate": 0.000297863825397925, "loss": 67.4725, "step": 17 }, { "epoch": 0.038748738647830475, "grad_norm": 15.330493927001953, "learning_rate": 0.0002974590538812622, "loss": 67.5717, "step": 18 }, { "epoch": 0.04090144635048772, "grad_norm": 18.652278900146484, "learning_rate": 0.00029701954727340204, "loss": 51.9793, "step": 19 }, { "epoch": 0.04305415405314497, "grad_norm": 16.22951316833496, "learning_rate": 0.000296545420827335, "loss": 47.1872, "step": 20 }, { "epoch": 0.04520686175580222, "grad_norm": 15.609740257263184, "learning_rate": 0.0002960367988745045, "loss": 67.1906, "step": 21 }, { "epoch": 0.04735956945845947, "grad_norm": 16.1520938873291, "learning_rate": 0.0002954938147922025, "loss": 64.4007, "step": 22 }, { "epoch": 0.04951227716111672, "grad_norm": 17.867395401000977, "learning_rate": 0.00029491661096859407, "loss": 64.1611, "step": 23 }, { "epoch": 0.051664984863773966, "grad_norm": 21.238767623901367, "learning_rate": 0.00029430533876537824, "loss": 53.9628, "step": 24 }, { "epoch": 0.053817692566431215, "grad_norm": 17.106630325317383, "learning_rate": 0.0002936601584780962, "loss": 65.3057, "step": 25 }, { "epoch": 0.053817692566431215, "eval_loss": 1.4372963905334473, "eval_runtime": 4.6502, "eval_samples_per_second": 10.752, "eval_steps_per_second": 10.752, "step": 25 }, { "epoch": 0.05597040026908846, "grad_norm": 16.96392822265625, "learning_rate": 0.00029298123929409647, "loss": 56.2454, "step": 26 }, { "epoch": 0.05812310797174571, "grad_norm": 17.571765899658203, "learning_rate": 0.0002922687592481686, "loss": 55.3028, "step": 27 }, { "epoch": 0.06027581567440296, "grad_norm": 19.826202392578125, "learning_rate": 0.00029152290517585637, "loss": 59.4945, "step": 28 }, { "epoch": 0.06242852337706021, "grad_norm": 18.554855346679688, "learning_rate": 0.0002907438726644637, "loss": 59.2052, "step": 29 }, { "epoch": 0.06458123107971746, "grad_norm": 19.96543312072754, "learning_rate": 0.00028993186600176504, "loss": 56.7843, "step": 30 }, { "epoch": 0.0667339387823747, "grad_norm": 18.83479881286621, "learning_rate": 0.00028908709812243465, "loss": 60.5815, "step": 31 }, { "epoch": 0.06888664648503195, "grad_norm": 19.240285873413086, "learning_rate": 0.0002882097905522079, "loss": 65.3217, "step": 32 }, { "epoch": 0.0710393541876892, "grad_norm": 20.48154640197754, "learning_rate": 0.0002873001733497903, "loss": 50.349, "step": 33 }, { "epoch": 0.07319206189034645, "grad_norm": 17.85178565979004, "learning_rate": 0.00028635848504652834, "loss": 55.1921, "step": 34 }, { "epoch": 0.0753447695930037, "grad_norm": 18.247879028320312, "learning_rate": 0.00028538497258385895, "loss": 51.2774, "step": 35 }, { "epoch": 0.07749747729566095, "grad_norm": 21.495031356811523, "learning_rate": 0.00028437989124855317, "loss": 53.9253, "step": 36 }, { "epoch": 0.07965018499831819, "grad_norm": 19.129505157470703, "learning_rate": 0.0002833435046057719, "loss": 49.977, "step": 37 }, { "epoch": 0.08180289270097545, "grad_norm": 20.080968856811523, "learning_rate": 0.00028227608442995037, "loss": 45.9058, "step": 38 }, { "epoch": 0.08395560040363269, "grad_norm": 25.35235023498535, "learning_rate": 0.00028117791063352987, "loss": 53.9856, "step": 39 }, { "epoch": 0.08610830810628994, "grad_norm": 33.66371536254883, "learning_rate": 0.0002800492711935558, "loss": 45.4571, "step": 40 }, { "epoch": 0.08826101580894719, "grad_norm": 27.388399124145508, "learning_rate": 0.00027889046207616055, "loss": 45.3808, "step": 41 }, { "epoch": 0.09041372351160444, "grad_norm": 32.35976791381836, "learning_rate": 0.00027770178715895156, "loss": 37.1048, "step": 42 }, { "epoch": 0.09256643121426168, "grad_norm": 35.10908508300781, "learning_rate": 0.0002764835581513246, "loss": 39.1569, "step": 43 }, { "epoch": 0.09471913891691894, "grad_norm": 31.484935760498047, "learning_rate": 0.00027523609451272343, "loss": 26.3453, "step": 44 }, { "epoch": 0.09687184661957618, "grad_norm": 30.296733856201172, "learning_rate": 0.0002739597233688672, "loss": 31.8288, "step": 45 }, { "epoch": 0.09902455432223344, "grad_norm": 26.935611724853516, "learning_rate": 0.0002726547794259673, "loss": 30.4528, "step": 46 }, { "epoch": 0.10117726202489068, "grad_norm": 22.19170379638672, "learning_rate": 0.0002713216048829563, "loss": 31.2617, "step": 47 }, { "epoch": 0.10332996972754793, "grad_norm": 30.748441696166992, "learning_rate": 0.00026996054934175267, "loss": 31.4245, "step": 48 }, { "epoch": 0.10548267743020517, "grad_norm": 24.570531845092773, "learning_rate": 0.00026857196971558306, "loss": 24.5434, "step": 49 }, { "epoch": 0.10763538513286243, "grad_norm": 21.509544372558594, "learning_rate": 0.00026715623013538883, "loss": 16.7066, "step": 50 }, { "epoch": 0.10763538513286243, "eval_loss": 1.8110151290893555, "eval_runtime": 4.6693, "eval_samples_per_second": 10.708, "eval_steps_per_second": 10.708, "step": 50 }, { "epoch": 0.10978809283551967, "grad_norm": 251.77491760253906, "learning_rate": 0.0002657137018543382, "loss": 81.1481, "step": 51 }, { "epoch": 0.11194080053817693, "grad_norm": 151.48643493652344, "learning_rate": 0.00026424476315047203, "loss": 56.8618, "step": 52 }, { "epoch": 0.11409350824083417, "grad_norm": 83.96969604492188, "learning_rate": 0.0002627497992275069, "loss": 59.0097, "step": 53 }, { "epoch": 0.11624621594349142, "grad_norm": 54.963260650634766, "learning_rate": 0.0002612292021138219, "loss": 69.7693, "step": 54 }, { "epoch": 0.11839892364614867, "grad_norm": 29.370437622070312, "learning_rate": 0.0002596833705596564, "loss": 50.5148, "step": 55 }, { "epoch": 0.12055163134880592, "grad_norm": 22.113611221313477, "learning_rate": 0.0002581127099325441, "loss": 48.963, "step": 56 }, { "epoch": 0.12270433905146316, "grad_norm": 22.532588958740234, "learning_rate": 0.0002565176321110129, "loss": 45.6119, "step": 57 }, { "epoch": 0.12485704675412042, "grad_norm": 18.061492919921875, "learning_rate": 0.0002548985553765769, "loss": 47.1696, "step": 58 }, { "epoch": 0.12700975445677767, "grad_norm": 19.217716217041016, "learning_rate": 0.0002532559043040491, "loss": 48.7373, "step": 59 }, { "epoch": 0.12916246215943492, "grad_norm": 17.86422348022461, "learning_rate": 0.00025159010965020384, "loss": 49.57, "step": 60 }, { "epoch": 0.13131516986209216, "grad_norm": 17.24818992614746, "learning_rate": 0.000249901608240819, "loss": 43.3737, "step": 61 }, { "epoch": 0.1334678775647494, "grad_norm": 15.537205696105957, "learning_rate": 0.0002481908428561252, "loss": 48.9256, "step": 62 }, { "epoch": 0.13562058526740667, "grad_norm": 15.320372581481934, "learning_rate": 0.0002464582621146948, "loss": 58.0217, "step": 63 }, { "epoch": 0.1377732929700639, "grad_norm": 15.557882308959961, "learning_rate": 0.00024470432035579955, "loss": 52.9572, "step": 64 }, { "epoch": 0.13992600067272115, "grad_norm": 18.153133392333984, "learning_rate": 0.00024292947752026762, "loss": 54.717, "step": 65 }, { "epoch": 0.1420787083753784, "grad_norm": 17.778865814208984, "learning_rate": 0.00024113419902987233, "loss": 53.135, "step": 66 }, { "epoch": 0.14423141607803566, "grad_norm": 20.06256866455078, "learning_rate": 0.00023931895566528346, "loss": 58.12, "step": 67 }, { "epoch": 0.1463841237806929, "grad_norm": 16.77607536315918, "learning_rate": 0.00023748422344261282, "loss": 53.9254, "step": 68 }, { "epoch": 0.14853683148335015, "grad_norm": 18.06422996520996, "learning_rate": 0.00023563048348858754, "loss": 54.2261, "step": 69 }, { "epoch": 0.1506895391860074, "grad_norm": 15.940298080444336, "learning_rate": 0.000233758221914383, "loss": 66.3633, "step": 70 }, { "epoch": 0.15284224688866466, "grad_norm": 18.226343154907227, "learning_rate": 0.00023186792968814835, "loss": 59.0898, "step": 71 }, { "epoch": 0.1549949545913219, "grad_norm": 18.315324783325195, "learning_rate": 0.0002299601025062587, "loss": 55.1847, "step": 72 }, { "epoch": 0.15714766229397914, "grad_norm": 18.426488876342773, "learning_rate": 0.00022803524066332745, "loss": 61.6653, "step": 73 }, { "epoch": 0.15930036999663638, "grad_norm": 15.84361743927002, "learning_rate": 0.00022609384892101274, "loss": 53.0315, "step": 74 }, { "epoch": 0.16145307769929365, "grad_norm": 16.615859985351562, "learning_rate": 0.0002241364363756521, "loss": 58.2848, "step": 75 }, { "epoch": 0.16145307769929365, "eval_loss": 1.397356390953064, "eval_runtime": 4.6614, "eval_samples_per_second": 10.726, "eval_steps_per_second": 10.726, "step": 75 }, { "epoch": 0.1636057854019509, "grad_norm": 18.040613174438477, "learning_rate": 0.0002221635163247612, "loss": 49.2038, "step": 76 }, { "epoch": 0.16575849310460813, "grad_norm": 17.213279724121094, "learning_rate": 0.00022017560613243008, "loss": 61.9849, "step": 77 }, { "epoch": 0.16791120080726538, "grad_norm": 16.561328887939453, "learning_rate": 0.00021817322709365372, "loss": 54.9861, "step": 78 }, { "epoch": 0.17006390850992265, "grad_norm": 18.200952529907227, "learning_rate": 0.00021615690429763141, "loss": 57.5604, "step": 79 }, { "epoch": 0.1722166162125799, "grad_norm": 18.224294662475586, "learning_rate": 0.00021412716649007083, "loss": 54.4244, "step": 80 }, { "epoch": 0.17436932391523713, "grad_norm": 20.06167984008789, "learning_rate": 0.00021208454593453407, "loss": 48.1195, "step": 81 }, { "epoch": 0.17652203161789437, "grad_norm": 18.243457794189453, "learning_rate": 0.00021002957827286078, "loss": 61.208, "step": 82 }, { "epoch": 0.17867473932055164, "grad_norm": 19.6166934967041, "learning_rate": 0.00020796280238470492, "loss": 51.9622, "step": 83 }, { "epoch": 0.18082744702320888, "grad_norm": 17.353927612304688, "learning_rate": 0.00020588476024622332, "loss": 52.8324, "step": 84 }, { "epoch": 0.18298015472586612, "grad_norm": 19.890092849731445, "learning_rate": 0.0002037959967879518, "loss": 49.001, "step": 85 }, { "epoch": 0.18513286242852336, "grad_norm": 22.065095901489258, "learning_rate": 0.00020169705975190628, "loss": 56.8814, "step": 86 }, { "epoch": 0.18728557013118063, "grad_norm": 20.77701187133789, "learning_rate": 0.0001995884995479472, "loss": 49.7699, "step": 87 }, { "epoch": 0.18943827783383788, "grad_norm": 20.845335006713867, "learning_rate": 0.00019747086910944423, "loss": 49.7895, "step": 88 }, { "epoch": 0.19159098553649512, "grad_norm": 24.944059371948242, "learning_rate": 0.00019534472374827845, "loss": 54.0157, "step": 89 }, { "epoch": 0.19374369323915236, "grad_norm": 19.914514541625977, "learning_rate": 0.00019321062100922213, "loss": 45.4326, "step": 90 }, { "epoch": 0.19589640094180963, "grad_norm": 24.652219772338867, "learning_rate": 0.00019106912052373187, "loss": 47.0036, "step": 91 }, { "epoch": 0.19804910864446687, "grad_norm": 21.81653594970703, "learning_rate": 0.00018892078386319508, "loss": 36.5694, "step": 92 }, { "epoch": 0.2002018163471241, "grad_norm": 21.040407180786133, "learning_rate": 0.00018676617439166755, "loss": 34.3789, "step": 93 }, { "epoch": 0.20235452404978135, "grad_norm": 21.030424118041992, "learning_rate": 0.0001846058571181412, "loss": 24.1345, "step": 94 }, { "epoch": 0.20450723175243862, "grad_norm": 19.839706420898438, "learning_rate": 0.00018244039854837984, "loss": 29.7828, "step": 95 }, { "epoch": 0.20665993945509586, "grad_norm": 22.62742042541504, "learning_rate": 0.0001802703665363634, "loss": 33.3202, "step": 96 }, { "epoch": 0.2088126471577531, "grad_norm": 22.82349395751953, "learning_rate": 0.0001780963301353775, "loss": 25.4279, "step": 97 }, { "epoch": 0.21096535486041035, "grad_norm": 23.84822654724121, "learning_rate": 0.0001759188594487896, "loss": 27.3422, "step": 98 }, { "epoch": 0.21311806256306762, "grad_norm": 22.19009780883789, "learning_rate": 0.00017373852548054883, "loss": 18.5194, "step": 99 }, { "epoch": 0.21527077026572486, "grad_norm": 20.305334091186523, "learning_rate": 0.0001715558999854505, "loss": 16.2787, "step": 100 }, { "epoch": 0.21527077026572486, "eval_loss": 1.5999956130981445, "eval_runtime": 4.6458, "eval_samples_per_second": 10.762, "eval_steps_per_second": 10.762, "step": 100 }, { "epoch": 0.2174234779683821, "grad_norm": 82.87445831298828, "learning_rate": 0.00016937155531920306, "loss": 61.8069, "step": 101 }, { "epoch": 0.21957618567103934, "grad_norm": 65.13322448730469, "learning_rate": 0.00016718606428833772, "loss": 57.1572, "step": 102 }, { "epoch": 0.2217288933736966, "grad_norm": 54.594844818115234, "learning_rate": 0.000165, "loss": 51.6823, "step": 103 }, { "epoch": 0.22388160107635385, "grad_norm": 37.8702278137207, "learning_rate": 0.00016281393571166228, "loss": 49.1134, "step": 104 }, { "epoch": 0.2260343087790111, "grad_norm": 33.802913665771484, "learning_rate": 0.000160628444680797, "loss": 42.163, "step": 105 }, { "epoch": 0.22818701648166834, "grad_norm": 20.086145401000977, "learning_rate": 0.00015844410001454953, "loss": 48.1086, "step": 106 }, { "epoch": 0.2303397241843256, "grad_norm": 18.237884521484375, "learning_rate": 0.00015626147451945117, "loss": 50.6928, "step": 107 }, { "epoch": 0.23249243188698285, "grad_norm": 15.673614501953125, "learning_rate": 0.00015408114055121046, "loss": 54.4374, "step": 108 }, { "epoch": 0.2346451395896401, "grad_norm": 17.010723114013672, "learning_rate": 0.0001519036698646225, "loss": 36.3256, "step": 109 }, { "epoch": 0.23679784729229733, "grad_norm": 16.029193878173828, "learning_rate": 0.0001497296334636366, "loss": 47.0858, "step": 110 }, { "epoch": 0.2389505549949546, "grad_norm": 16.307323455810547, "learning_rate": 0.00014755960145162016, "loss": 57.0106, "step": 111 }, { "epoch": 0.24110326269761184, "grad_norm": 17.687231063842773, "learning_rate": 0.00014539414288185882, "loss": 52.8393, "step": 112 }, { "epoch": 0.24325597040026908, "grad_norm": 15.435744285583496, "learning_rate": 0.00014323382560833242, "loss": 50.8995, "step": 113 }, { "epoch": 0.24540867810292633, "grad_norm": 34.477108001708984, "learning_rate": 0.00014107921613680491, "loss": 45.3053, "step": 114 }, { "epoch": 0.2475613858055836, "grad_norm": 17.266403198242188, "learning_rate": 0.00013893087947626812, "loss": 56.6521, "step": 115 }, { "epoch": 0.24971409350824084, "grad_norm": 14.08521842956543, "learning_rate": 0.00013678937899077787, "loss": 56.4354, "step": 116 }, { "epoch": 0.2518668012108981, "grad_norm": 13.650261878967285, "learning_rate": 0.00013465527625172158, "loss": 58.5963, "step": 117 }, { "epoch": 0.25401950891355535, "grad_norm": 15.609672546386719, "learning_rate": 0.0001325291308905558, "loss": 61.3632, "step": 118 }, { "epoch": 0.2561722166162126, "grad_norm": 17.89152717590332, "learning_rate": 0.00013041150045205272, "loss": 49.1726, "step": 119 }, { "epoch": 0.25832492431886983, "grad_norm": 15.168315887451172, "learning_rate": 0.00012830294024809372, "loss": 55.65, "step": 120 }, { "epoch": 0.2604776320215271, "grad_norm": 16.242740631103516, "learning_rate": 0.0001262040032120482, "loss": 62.057, "step": 121 }, { "epoch": 0.2626303397241843, "grad_norm": 14.286775588989258, "learning_rate": 0.00012411523975377667, "loss": 58.6996, "step": 122 }, { "epoch": 0.26478304742684156, "grad_norm": 13.97586727142334, "learning_rate": 0.00012203719761529511, "loss": 62.2048, "step": 123 }, { "epoch": 0.2669357551294988, "grad_norm": 13.744102478027344, "learning_rate": 0.00011997042172713925, "loss": 59.8054, "step": 124 }, { "epoch": 0.2690884628321561, "grad_norm": 16.709117889404297, "learning_rate": 0.00011791545406546589, "loss": 57.2232, "step": 125 }, { "epoch": 0.2690884628321561, "eval_loss": 1.3868330717086792, "eval_runtime": 4.6549, "eval_samples_per_second": 10.741, "eval_steps_per_second": 10.741, "step": 125 }, { "epoch": 0.27124117053481334, "grad_norm": 15.43789005279541, "learning_rate": 0.00011587283350992917, "loss": 57.2502, "step": 126 }, { "epoch": 0.2733938782374706, "grad_norm": 14.495203018188477, "learning_rate": 0.0001138430957023686, "loss": 66.2669, "step": 127 }, { "epoch": 0.2755465859401278, "grad_norm": 15.335055351257324, "learning_rate": 0.00011182677290634626, "loss": 59.3284, "step": 128 }, { "epoch": 0.27769929364278506, "grad_norm": 15.876263618469238, "learning_rate": 0.00010982439386756993, "loss": 48.8434, "step": 129 }, { "epoch": 0.2798520013454423, "grad_norm": 15.106823921203613, "learning_rate": 0.00010783648367523887, "loss": 55.1709, "step": 130 }, { "epoch": 0.28200470904809954, "grad_norm": 16.672956466674805, "learning_rate": 0.00010586356362434786, "loss": 59.7799, "step": 131 }, { "epoch": 0.2841574167507568, "grad_norm": 15.906288146972656, "learning_rate": 0.00010390615107898727, "loss": 53.3234, "step": 132 }, { "epoch": 0.2863101244534141, "grad_norm": 16.802085876464844, "learning_rate": 0.00010196475933667252, "loss": 59.1581, "step": 133 }, { "epoch": 0.2884628321560713, "grad_norm": 17.283231735229492, "learning_rate": 0.00010003989749374132, "loss": 50.9385, "step": 134 }, { "epoch": 0.29061553985872857, "grad_norm": 17.643165588378906, "learning_rate": 9.813207031185173e-05, "loss": 59.0425, "step": 135 }, { "epoch": 0.2927682475613858, "grad_norm": 17.9544620513916, "learning_rate": 9.624177808561703e-05, "loss": 56.6361, "step": 136 }, { "epoch": 0.29492095526404305, "grad_norm": 17.375778198242188, "learning_rate": 9.436951651141242e-05, "loss": 54.7815, "step": 137 }, { "epoch": 0.2970736629667003, "grad_norm": 17.666385650634766, "learning_rate": 9.251577655738719e-05, "loss": 42.6534, "step": 138 }, { "epoch": 0.29922637066935753, "grad_norm": 18.56308364868164, "learning_rate": 9.068104433471652e-05, "loss": 51.0905, "step": 139 }, { "epoch": 0.3013790783720148, "grad_norm": 19.243757247924805, "learning_rate": 8.886580097012762e-05, "loss": 47.3786, "step": 140 }, { "epoch": 0.30353178607467207, "grad_norm": 19.949800491333008, "learning_rate": 8.70705224797324e-05, "loss": 53.0778, "step": 141 }, { "epoch": 0.3056844937773293, "grad_norm": 19.51043701171875, "learning_rate": 8.529567964420047e-05, "loss": 43.7085, "step": 142 }, { "epoch": 0.30783720147998656, "grad_norm": 18.10249900817871, "learning_rate": 8.354173788530516e-05, "loss": 38.5526, "step": 143 }, { "epoch": 0.3099899091826438, "grad_norm": 17.946630477905273, "learning_rate": 8.180915714387479e-05, "loss": 27.1243, "step": 144 }, { "epoch": 0.31214261688530104, "grad_norm": 18.7962703704834, "learning_rate": 8.009839175918098e-05, "loss": 23.8417, "step": 145 }, { "epoch": 0.3142953245879583, "grad_norm": 20.63129997253418, "learning_rate": 7.840989034979613e-05, "loss": 32.8246, "step": 146 }, { "epoch": 0.3164480322906155, "grad_norm": 20.96570587158203, "learning_rate": 7.674409569595094e-05, "loss": 24.125, "step": 147 }, { "epoch": 0.31860073999327276, "grad_norm": 20.575000762939453, "learning_rate": 7.510144462342307e-05, "loss": 25.7693, "step": 148 }, { "epoch": 0.32075344769593006, "grad_norm": 18.93328094482422, "learning_rate": 7.348236788898705e-05, "loss": 18.3569, "step": 149 }, { "epoch": 0.3229061553985873, "grad_norm": 28.199974060058594, "learning_rate": 7.188729006745592e-05, "loss": 16.7151, "step": 150 }, { "epoch": 0.3229061553985873, "eval_loss": 1.426528811454773, "eval_runtime": 4.6498, "eval_samples_per_second": 10.753, "eval_steps_per_second": 10.753, "step": 150 }, { "epoch": 0.32505886310124454, "grad_norm": 41.9819221496582, "learning_rate": 7.03166294403436e-05, "loss": 46.1149, "step": 151 }, { "epoch": 0.3272115708039018, "grad_norm": 30.605613708496094, "learning_rate": 6.877079788617809e-05, "loss": 55.5686, "step": 152 }, { "epoch": 0.329364278506559, "grad_norm": 27.620370864868164, "learning_rate": 6.725020077249312e-05, "loss": 43.7873, "step": 153 }, { "epoch": 0.33151698620921627, "grad_norm": 30.644044876098633, "learning_rate": 6.575523684952798e-05, "loss": 56.4732, "step": 154 }, { "epoch": 0.3336696939118735, "grad_norm": 26.266923904418945, "learning_rate": 6.428629814566178e-05, "loss": 42.3263, "step": 155 }, { "epoch": 0.33582240161453075, "grad_norm": 23.177692413330078, "learning_rate": 6.284376986461113e-05, "loss": 46.6876, "step": 156 }, { "epoch": 0.33797510931718805, "grad_norm": 23.30403709411621, "learning_rate": 6.142803028441687e-05, "loss": 38.3969, "step": 157 }, { "epoch": 0.3401278170198453, "grad_norm": 22.989364624023438, "learning_rate": 6.003945065824737e-05, "loss": 53.7869, "step": 158 }, { "epoch": 0.34228052472250253, "grad_norm": 23.292339324951172, "learning_rate": 5.867839511704368e-05, "loss": 50.1899, "step": 159 }, { "epoch": 0.3444332324251598, "grad_norm": 16.233699798583984, "learning_rate": 5.734522057403271e-05, "loss": 59.9854, "step": 160 }, { "epoch": 0.346585940127817, "grad_norm": 17.827598571777344, "learning_rate": 5.604027663113273e-05, "loss": 45.3715, "step": 161 }, { "epoch": 0.34873864783047426, "grad_norm": 16.554851531982422, "learning_rate": 5.4763905487276506e-05, "loss": 48.1336, "step": 162 }, { "epoch": 0.3508913555331315, "grad_norm": 17.867149353027344, "learning_rate": 5.3516441848675385e-05, "loss": 56.3543, "step": 163 }, { "epoch": 0.35304406323578874, "grad_norm": 14.600744247436523, "learning_rate": 5.229821284104842e-05, "loss": 52.4116, "step": 164 }, { "epoch": 0.35519677093844604, "grad_norm": 14.218669891357422, "learning_rate": 5.110953792383941e-05, "loss": 51.6633, "step": 165 }, { "epoch": 0.3573494786411033, "grad_norm": 14.407130241394043, "learning_rate": 4.995072880644416e-05, "loss": 56.2798, "step": 166 }, { "epoch": 0.3595021863437605, "grad_norm": 15.17442512512207, "learning_rate": 4.882208936647008e-05, "loss": 46.4648, "step": 167 }, { "epoch": 0.36165489404641776, "grad_norm": 14.578434944152832, "learning_rate": 4.7723915570049596e-05, "loss": 57.4643, "step": 168 }, { "epoch": 0.363807601749075, "grad_norm": 15.142487525939941, "learning_rate": 4.6656495394228076e-05, "loss": 52.9338, "step": 169 }, { "epoch": 0.36596030945173225, "grad_norm": 13.773836135864258, "learning_rate": 4.562010875144683e-05, "loss": 65.5862, "step": 170 }, { "epoch": 0.3681130171543895, "grad_norm": 14.126533508300781, "learning_rate": 4.461502741614107e-05, "loss": 61.5793, "step": 171 }, { "epoch": 0.37026572485704673, "grad_norm": 14.658692359924316, "learning_rate": 4.364151495347164e-05, "loss": 65.1335, "step": 172 }, { "epoch": 0.372418432559704, "grad_norm": 14.365325927734375, "learning_rate": 4.269982665020967e-05, "loss": 60.9303, "step": 173 }, { "epoch": 0.37457114026236127, "grad_norm": 14.562641143798828, "learning_rate": 4.179020944779209e-05, "loss": 62.4841, "step": 174 }, { "epoch": 0.3767238479650185, "grad_norm": 15.046587944030762, "learning_rate": 4.091290187756536e-05, "loss": 55.8978, "step": 175 }, { "epoch": 0.3767238479650185, "eval_loss": 1.4019769430160522, "eval_runtime": 4.6566, "eval_samples_per_second": 10.737, "eval_steps_per_second": 10.737, "step": 175 }, { "epoch": 0.37887655566767575, "grad_norm": 15.49817943572998, "learning_rate": 4.006813399823494e-05, "loss": 65.303, "step": 176 }, { "epoch": 0.381029263370333, "grad_norm": 14.744792938232422, "learning_rate": 3.925612733553629e-05, "loss": 63.0347, "step": 177 }, { "epoch": 0.38318197107299024, "grad_norm": 16.15069580078125, "learning_rate": 3.8477094824143625e-05, "loss": 62.3457, "step": 178 }, { "epoch": 0.3853346787756475, "grad_norm": 14.346293449401855, "learning_rate": 3.7731240751831375e-05, "loss": 55.1305, "step": 179 }, { "epoch": 0.3874873864783047, "grad_norm": 15.92353630065918, "learning_rate": 3.701876070590349e-05, "loss": 53.608, "step": 180 }, { "epoch": 0.389640094180962, "grad_norm": 14.467766761779785, "learning_rate": 3.633984152190378e-05, "loss": 54.3097, "step": 181 }, { "epoch": 0.39179280188361926, "grad_norm": 15.454816818237305, "learning_rate": 3.5694661234621745e-05, "loss": 58.8295, "step": 182 }, { "epoch": 0.3939455095862765, "grad_norm": 18.29946517944336, "learning_rate": 3.508338903140592e-05, "loss": 46.8614, "step": 183 }, { "epoch": 0.39609821728893374, "grad_norm": 15.767854690551758, "learning_rate": 3.4506185207797495e-05, "loss": 53.0518, "step": 184 }, { "epoch": 0.398250924991591, "grad_norm": 18.729963302612305, "learning_rate": 3.396320112549551e-05, "loss": 46.7372, "step": 185 }, { "epoch": 0.4004036326942482, "grad_norm": 17.034263610839844, "learning_rate": 3.345457917266499e-05, "loss": 54.1875, "step": 186 }, { "epoch": 0.40255634039690547, "grad_norm": 18.100072860717773, "learning_rate": 3.298045272659797e-05, "loss": 49.5733, "step": 187 }, { "epoch": 0.4047090480995627, "grad_norm": 17.38837242126465, "learning_rate": 3.254094611873773e-05, "loss": 47.9609, "step": 188 }, { "epoch": 0.40686175580222, "grad_norm": 18.574222564697266, "learning_rate": 3.213617460207498e-05, "loss": 49.3699, "step": 189 }, { "epoch": 0.40901446350487725, "grad_norm": 19.958696365356445, "learning_rate": 3.176624432092475e-05, "loss": 46.7215, "step": 190 }, { "epoch": 0.4111671712075345, "grad_norm": 21.526926040649414, "learning_rate": 3.143125228309194e-05, "loss": 46.696, "step": 191 }, { "epoch": 0.41331987891019173, "grad_norm": 19.809968948364258, "learning_rate": 3.113128633443261e-05, "loss": 37.0424, "step": 192 }, { "epoch": 0.41547258661284897, "grad_norm": 20.5278377532959, "learning_rate": 3.086642513581802e-05, "loss": 31.7745, "step": 193 }, { "epoch": 0.4176252943155062, "grad_norm": 14.962890625, "learning_rate": 3.0636738142507115e-05, "loss": 17.8696, "step": 194 }, { "epoch": 0.41977800201816345, "grad_norm": 18.451675415039062, "learning_rate": 3.044228558593313e-05, "loss": 23.2517, "step": 195 }, { "epoch": 0.4219307097208207, "grad_norm": 26.63257598876953, "learning_rate": 3.028311845790893e-05, "loss": 33.0328, "step": 196 }, { "epoch": 0.424083417423478, "grad_norm": 26.404699325561523, "learning_rate": 3.0159278497255256e-05, "loss": 30.4732, "step": 197 }, { "epoch": 0.42623612512613523, "grad_norm": 26.336477279663086, "learning_rate": 3.0070798178855484e-05, "loss": 24.6996, "step": 198 }, { "epoch": 0.4283888328287925, "grad_norm": 26.34403419494629, "learning_rate": 3.001770070513965e-05, "loss": 16.7861, "step": 199 }, { "epoch": 0.4305415405314497, "grad_norm": 41.23857879638672, "learning_rate": 2.9999999999999997e-05, "loss": 16.6288, "step": 200 }, { "epoch": 0.4305415405314497, "eval_loss": 1.3831464052200317, "eval_runtime": 4.6565, "eval_samples_per_second": 10.738, "eval_steps_per_second": 10.738, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.430760044724224e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }