{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9974825174825175, "eval_steps": 250, "global_step": 2679, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011188811188811189, "grad_norm": 12.223331775621379, "learning_rate": 2.0000000000000002e-07, "loss": 1.2427, "num_input_tokens_seen": 1048576, "step": 1 }, { "epoch": 0.0022377622377622378, "grad_norm": 13.69194525869009, "learning_rate": 4.0000000000000003e-07, "loss": 1.3185, "num_input_tokens_seen": 2097152, "step": 2 }, { "epoch": 0.0033566433566433566, "grad_norm": 11.853665431536184, "learning_rate": 6.000000000000001e-07, "loss": 1.2261, "num_input_tokens_seen": 3145728, "step": 3 }, { "epoch": 0.0044755244755244755, "grad_norm": 12.744514855322654, "learning_rate": 8.000000000000001e-07, "loss": 1.3815, "num_input_tokens_seen": 4194304, "step": 4 }, { "epoch": 0.005594405594405594, "grad_norm": 13.538588732198958, "learning_rate": 1.0000000000000002e-06, "loss": 1.3404, "num_input_tokens_seen": 5242880, "step": 5 }, { "epoch": 0.006713286713286713, "grad_norm": 9.023781497869667, "learning_rate": 1.2000000000000002e-06, "loss": 1.236, "num_input_tokens_seen": 6291456, "step": 6 }, { "epoch": 0.007832167832167832, "grad_norm": 9.252447545200786, "learning_rate": 1.4000000000000001e-06, "loss": 1.2324, "num_input_tokens_seen": 7340032, "step": 7 }, { "epoch": 0.008951048951048951, "grad_norm": 7.885019948304179, "learning_rate": 1.6000000000000001e-06, "loss": 1.1904, "num_input_tokens_seen": 8388608, "step": 8 }, { "epoch": 0.01006993006993007, "grad_norm": 12.533230525168268, "learning_rate": 1.8000000000000001e-06, "loss": 1.2405, "num_input_tokens_seen": 9437184, "step": 9 }, { "epoch": 0.011188811188811189, "grad_norm": 7.1958450958493305, "learning_rate": 2.0000000000000003e-06, "loss": 1.0979, "num_input_tokens_seen": 10485760, "step": 10 }, { "epoch": 0.012307692307692308, "grad_norm": 11.187551927349928, "learning_rate": 2.2e-06, "loss": 1.004, "num_input_tokens_seen": 11534336, "step": 11 }, { "epoch": 0.013426573426573427, "grad_norm": 6.310943218638618, "learning_rate": 2.4000000000000003e-06, "loss": 0.9888, "num_input_tokens_seen": 12582912, "step": 12 }, { "epoch": 0.014545454545454545, "grad_norm": 3.2240547082626185, "learning_rate": 2.6e-06, "loss": 1.0584, "num_input_tokens_seen": 13631488, "step": 13 }, { "epoch": 0.015664335664335664, "grad_norm": 1.8778382241076728, "learning_rate": 2.8000000000000003e-06, "loss": 0.9863, "num_input_tokens_seen": 14680064, "step": 14 }, { "epoch": 0.016783216783216783, "grad_norm": 2.0112325439279832, "learning_rate": 3e-06, "loss": 0.9604, "num_input_tokens_seen": 15728640, "step": 15 }, { "epoch": 0.017902097902097902, "grad_norm": 2.817010063392967, "learning_rate": 3.2000000000000003e-06, "loss": 0.9025, "num_input_tokens_seen": 16777216, "step": 16 }, { "epoch": 0.01902097902097902, "grad_norm": 0.9889601226489826, "learning_rate": 3.4000000000000005e-06, "loss": 1.004, "num_input_tokens_seen": 17825792, "step": 17 }, { "epoch": 0.02013986013986014, "grad_norm": 1.07300612980992, "learning_rate": 3.6000000000000003e-06, "loss": 0.973, "num_input_tokens_seen": 18874368, "step": 18 }, { "epoch": 0.02125874125874126, "grad_norm": 0.8137920116522358, "learning_rate": 3.8000000000000005e-06, "loss": 0.9635, "num_input_tokens_seen": 19922944, "step": 19 }, { "epoch": 0.022377622377622378, "grad_norm": 0.8508042463158337, "learning_rate": 4.000000000000001e-06, "loss": 0.8557, "num_input_tokens_seen": 20971520, "step": 20 }, { "epoch": 0.023496503496503496, "grad_norm": 1.2874521188642698, "learning_rate": 4.2000000000000004e-06, "loss": 1.0787, "num_input_tokens_seen": 22020096, "step": 21 }, { "epoch": 0.024615384615384615, "grad_norm": 0.8854716356034362, "learning_rate": 4.4e-06, "loss": 0.8179, "num_input_tokens_seen": 23068672, "step": 22 }, { "epoch": 0.025734265734265734, "grad_norm": 0.8311542295563416, "learning_rate": 4.600000000000001e-06, "loss": 0.9493, "num_input_tokens_seen": 24117248, "step": 23 }, { "epoch": 0.026853146853146853, "grad_norm": 1.7168692849700664, "learning_rate": 4.800000000000001e-06, "loss": 0.8281, "num_input_tokens_seen": 25165824, "step": 24 }, { "epoch": 0.027972027972027972, "grad_norm": 0.8818295278585502, "learning_rate": 5e-06, "loss": 1.0281, "num_input_tokens_seen": 26214400, "step": 25 }, { "epoch": 0.02909090909090909, "grad_norm": 0.6749518055317714, "learning_rate": 4.999998248509159e-06, "loss": 0.8749, "num_input_tokens_seen": 27262976, "step": 26 }, { "epoch": 0.03020979020979021, "grad_norm": 0.7356914121152679, "learning_rate": 4.99999299403909e-06, "loss": 0.8868, "num_input_tokens_seen": 28311552, "step": 27 }, { "epoch": 0.03132867132867133, "grad_norm": 0.5753914115541998, "learning_rate": 4.999984236597156e-06, "loss": 0.8974, "num_input_tokens_seen": 29360128, "step": 28 }, { "epoch": 0.03244755244755245, "grad_norm": 0.6941445282621481, "learning_rate": 4.999971976195628e-06, "loss": 0.9162, "num_input_tokens_seen": 30408704, "step": 29 }, { "epoch": 0.033566433566433566, "grad_norm": 0.5935662011636139, "learning_rate": 4.9999562128516835e-06, "loss": 0.9994, "num_input_tokens_seen": 31457280, "step": 30 }, { "epoch": 0.03468531468531469, "grad_norm": 0.6074014830907029, "learning_rate": 4.999936946587412e-06, "loss": 1.0164, "num_input_tokens_seen": 32505856, "step": 31 }, { "epoch": 0.035804195804195804, "grad_norm": 0.5670726664618633, "learning_rate": 4.999914177429808e-06, "loss": 0.9654, "num_input_tokens_seen": 33554432, "step": 32 }, { "epoch": 0.036923076923076927, "grad_norm": 0.9242149781939829, "learning_rate": 4.999887905410775e-06, "loss": 0.9447, "num_input_tokens_seen": 34603008, "step": 33 }, { "epoch": 0.03804195804195804, "grad_norm": 0.6714982193980112, "learning_rate": 4.999858130567127e-06, "loss": 0.7911, "num_input_tokens_seen": 35651584, "step": 34 }, { "epoch": 0.039160839160839164, "grad_norm": 0.5298220500057619, "learning_rate": 4.999824852940583e-06, "loss": 0.8502, "num_input_tokens_seen": 36700160, "step": 35 }, { "epoch": 0.04027972027972028, "grad_norm": 0.5489907450252375, "learning_rate": 4.999788072577773e-06, "loss": 0.8437, "num_input_tokens_seen": 37748736, "step": 36 }, { "epoch": 0.0413986013986014, "grad_norm": 0.6095867825906436, "learning_rate": 4.99974778953023e-06, "loss": 0.7833, "num_input_tokens_seen": 38797312, "step": 37 }, { "epoch": 0.04251748251748252, "grad_norm": 0.6065159760290083, "learning_rate": 4.999704003854402e-06, "loss": 0.9155, "num_input_tokens_seen": 39845888, "step": 38 }, { "epoch": 0.04363636363636364, "grad_norm": 0.4517277899491505, "learning_rate": 4.9996567156116395e-06, "loss": 0.9428, "num_input_tokens_seen": 40894464, "step": 39 }, { "epoch": 0.044755244755244755, "grad_norm": 0.4835328140340531, "learning_rate": 4.999605924868201e-06, "loss": 0.8479, "num_input_tokens_seen": 41943040, "step": 40 }, { "epoch": 0.04587412587412588, "grad_norm": 0.6238490968818873, "learning_rate": 4.999551631695257e-06, "loss": 0.8798, "num_input_tokens_seen": 42991616, "step": 41 }, { "epoch": 0.04699300699300699, "grad_norm": 0.4524053733970489, "learning_rate": 4.999493836168882e-06, "loss": 0.7788, "num_input_tokens_seen": 44040192, "step": 42 }, { "epoch": 0.048111888111888115, "grad_norm": 0.4471846539394142, "learning_rate": 4.999432538370057e-06, "loss": 0.8416, "num_input_tokens_seen": 45088768, "step": 43 }, { "epoch": 0.04923076923076923, "grad_norm": 0.48883052276201744, "learning_rate": 4.999367738384673e-06, "loss": 1.1177, "num_input_tokens_seen": 46137344, "step": 44 }, { "epoch": 0.05034965034965035, "grad_norm": 0.43158877395011797, "learning_rate": 4.999299436303527e-06, "loss": 1.0214, "num_input_tokens_seen": 47185920, "step": 45 }, { "epoch": 0.05146853146853147, "grad_norm": 0.4120977459925011, "learning_rate": 4.999227632222324e-06, "loss": 0.9225, "num_input_tokens_seen": 48234496, "step": 46 }, { "epoch": 0.05258741258741259, "grad_norm": 0.4455918868862268, "learning_rate": 4.999152326241675e-06, "loss": 0.8194, "num_input_tokens_seen": 49283072, "step": 47 }, { "epoch": 0.053706293706293706, "grad_norm": 0.4234718637004096, "learning_rate": 4.999073518467098e-06, "loss": 0.8172, "num_input_tokens_seen": 50331648, "step": 48 }, { "epoch": 0.05482517482517483, "grad_norm": 0.43191187391323305, "learning_rate": 4.998991209009019e-06, "loss": 0.8096, "num_input_tokens_seen": 51380224, "step": 49 }, { "epoch": 0.055944055944055944, "grad_norm": 0.4612755062279119, "learning_rate": 4.998905397982767e-06, "loss": 0.9269, "num_input_tokens_seen": 52428800, "step": 50 }, { "epoch": 0.057062937062937066, "grad_norm": 0.4429523149978322, "learning_rate": 4.998816085508582e-06, "loss": 0.8298, "num_input_tokens_seen": 53477376, "step": 51 }, { "epoch": 0.05818181818181818, "grad_norm": 0.41770483221793064, "learning_rate": 4.998723271711607e-06, "loss": 0.7906, "num_input_tokens_seen": 54525952, "step": 52 }, { "epoch": 0.059300699300699304, "grad_norm": 0.4231190609667863, "learning_rate": 4.998626956721894e-06, "loss": 0.7859, "num_input_tokens_seen": 55574528, "step": 53 }, { "epoch": 0.06041958041958042, "grad_norm": 0.5209234045259732, "learning_rate": 4.998527140674395e-06, "loss": 0.8241, "num_input_tokens_seen": 56623104, "step": 54 }, { "epoch": 0.06153846153846154, "grad_norm": 0.402593624058825, "learning_rate": 4.998423823708974e-06, "loss": 0.8721, "num_input_tokens_seen": 57671680, "step": 55 }, { "epoch": 0.06265734265734266, "grad_norm": 0.5822122992144914, "learning_rate": 4.998317005970398e-06, "loss": 0.8404, "num_input_tokens_seen": 58720256, "step": 56 }, { "epoch": 0.06377622377622377, "grad_norm": 0.4395783477215438, "learning_rate": 4.998206687608339e-06, "loss": 0.7641, "num_input_tokens_seen": 59768832, "step": 57 }, { "epoch": 0.0648951048951049, "grad_norm": 0.504796134918118, "learning_rate": 4.998092868777374e-06, "loss": 0.9132, "num_input_tokens_seen": 60817408, "step": 58 }, { "epoch": 0.06601398601398602, "grad_norm": 0.41376034762584496, "learning_rate": 4.997975549636985e-06, "loss": 0.9944, "num_input_tokens_seen": 61865984, "step": 59 }, { "epoch": 0.06713286713286713, "grad_norm": 0.4730772543900096, "learning_rate": 4.997854730351559e-06, "loss": 0.8668, "num_input_tokens_seen": 62914560, "step": 60 }, { "epoch": 0.06825174825174825, "grad_norm": 0.5006468370293393, "learning_rate": 4.997730411090387e-06, "loss": 0.7948, "num_input_tokens_seen": 63963136, "step": 61 }, { "epoch": 0.06937062937062938, "grad_norm": 0.4824994740879561, "learning_rate": 4.997602592027664e-06, "loss": 0.7899, "num_input_tokens_seen": 65011712, "step": 62 }, { "epoch": 0.07048951048951049, "grad_norm": 0.6050679900401704, "learning_rate": 4.9974712733424905e-06, "loss": 0.7772, "num_input_tokens_seen": 66060288, "step": 63 }, { "epoch": 0.07160839160839161, "grad_norm": 0.39957570031903733, "learning_rate": 4.997336455218868e-06, "loss": 0.9086, "num_input_tokens_seen": 67108864, "step": 64 }, { "epoch": 0.07272727272727272, "grad_norm": 0.5237808861917288, "learning_rate": 4.997198137845702e-06, "loss": 0.8423, "num_input_tokens_seen": 68157440, "step": 65 }, { "epoch": 0.07384615384615385, "grad_norm": 0.38512331176249215, "learning_rate": 4.997056321416803e-06, "loss": 0.9755, "num_input_tokens_seen": 69206016, "step": 66 }, { "epoch": 0.07496503496503497, "grad_norm": 0.482202161696032, "learning_rate": 4.9969110061308826e-06, "loss": 0.833, "num_input_tokens_seen": 70254592, "step": 67 }, { "epoch": 0.07608391608391608, "grad_norm": 0.38084430004870246, "learning_rate": 4.996762192191556e-06, "loss": 0.8858, "num_input_tokens_seen": 71303168, "step": 68 }, { "epoch": 0.0772027972027972, "grad_norm": 0.4443786681440813, "learning_rate": 4.996609879807341e-06, "loss": 0.7604, "num_input_tokens_seen": 72351744, "step": 69 }, { "epoch": 0.07832167832167833, "grad_norm": 0.41353809908085776, "learning_rate": 4.996454069191653e-06, "loss": 0.9583, "num_input_tokens_seen": 73400320, "step": 70 }, { "epoch": 0.07944055944055944, "grad_norm": 0.384716440292039, "learning_rate": 4.996294760562817e-06, "loss": 0.809, "num_input_tokens_seen": 74448896, "step": 71 }, { "epoch": 0.08055944055944056, "grad_norm": 0.4895625632646858, "learning_rate": 4.996131954144053e-06, "loss": 0.8756, "num_input_tokens_seen": 75497472, "step": 72 }, { "epoch": 0.08167832167832167, "grad_norm": 0.4184809282065292, "learning_rate": 4.995965650163485e-06, "loss": 0.7074, "num_input_tokens_seen": 76546048, "step": 73 }, { "epoch": 0.0827972027972028, "grad_norm": 0.41937398794197217, "learning_rate": 4.995795848854134e-06, "loss": 0.8109, "num_input_tokens_seen": 77594624, "step": 74 }, { "epoch": 0.08391608391608392, "grad_norm": 0.360907187414614, "learning_rate": 4.995622550453929e-06, "loss": 0.8878, "num_input_tokens_seen": 78643200, "step": 75 }, { "epoch": 0.08503496503496503, "grad_norm": 0.3951371929861385, "learning_rate": 4.995445755205692e-06, "loss": 0.7673, "num_input_tokens_seen": 79691776, "step": 76 }, { "epoch": 0.08615384615384615, "grad_norm": 0.7171640111640437, "learning_rate": 4.995265463357147e-06, "loss": 0.8226, "num_input_tokens_seen": 80740352, "step": 77 }, { "epoch": 0.08727272727272728, "grad_norm": 0.44426635592447294, "learning_rate": 4.995081675160918e-06, "loss": 0.7972, "num_input_tokens_seen": 81788928, "step": 78 }, { "epoch": 0.0883916083916084, "grad_norm": 0.40224317907460433, "learning_rate": 4.994894390874527e-06, "loss": 0.8637, "num_input_tokens_seen": 82837504, "step": 79 }, { "epoch": 0.08951048951048951, "grad_norm": 0.4352041510542713, "learning_rate": 4.9947036107603975e-06, "loss": 0.8085, "num_input_tokens_seen": 83886080, "step": 80 }, { "epoch": 0.09062937062937063, "grad_norm": 0.40105230460092406, "learning_rate": 4.994509335085847e-06, "loss": 0.8668, "num_input_tokens_seen": 84934656, "step": 81 }, { "epoch": 0.09174825174825176, "grad_norm": 0.4030936224287719, "learning_rate": 4.994311564123093e-06, "loss": 0.8147, "num_input_tokens_seen": 85983232, "step": 82 }, { "epoch": 0.09286713286713287, "grad_norm": 0.5475687560027213, "learning_rate": 4.994110298149253e-06, "loss": 0.9393, "num_input_tokens_seen": 87031808, "step": 83 }, { "epoch": 0.09398601398601399, "grad_norm": 0.3783060297682423, "learning_rate": 4.993905537446337e-06, "loss": 0.8956, "num_input_tokens_seen": 88080384, "step": 84 }, { "epoch": 0.0951048951048951, "grad_norm": 0.5074584166205796, "learning_rate": 4.993697282301256e-06, "loss": 0.8612, "num_input_tokens_seen": 89128960, "step": 85 }, { "epoch": 0.09622377622377623, "grad_norm": 0.444609313515968, "learning_rate": 4.9934855330058145e-06, "loss": 0.8561, "num_input_tokens_seen": 90177536, "step": 86 }, { "epoch": 0.09734265734265735, "grad_norm": 0.43687241777367936, "learning_rate": 4.993270289856714e-06, "loss": 0.8474, "num_input_tokens_seen": 91226112, "step": 87 }, { "epoch": 0.09846153846153846, "grad_norm": 1.5138790691265183, "learning_rate": 4.993051553155552e-06, "loss": 0.9317, "num_input_tokens_seen": 92274688, "step": 88 }, { "epoch": 0.09958041958041958, "grad_norm": 0.546335608319255, "learning_rate": 4.992829323208822e-06, "loss": 0.7798, "num_input_tokens_seen": 93323264, "step": 89 }, { "epoch": 0.1006993006993007, "grad_norm": 0.43288676436142937, "learning_rate": 4.992603600327909e-06, "loss": 0.83, "num_input_tokens_seen": 94371840, "step": 90 }, { "epoch": 0.10181818181818182, "grad_norm": 0.46680757037457654, "learning_rate": 4.992374384829094e-06, "loss": 0.806, "num_input_tokens_seen": 95420416, "step": 91 }, { "epoch": 0.10293706293706294, "grad_norm": 0.4751934254586172, "learning_rate": 4.992141677033554e-06, "loss": 0.8123, "num_input_tokens_seen": 96468992, "step": 92 }, { "epoch": 0.10405594405594405, "grad_norm": 0.7212549388937934, "learning_rate": 4.991905477267356e-06, "loss": 0.8025, "num_input_tokens_seen": 97517568, "step": 93 }, { "epoch": 0.10517482517482518, "grad_norm": 0.47331432181584193, "learning_rate": 4.991665785861463e-06, "loss": 0.7708, "num_input_tokens_seen": 98566144, "step": 94 }, { "epoch": 0.1062937062937063, "grad_norm": 0.42231243545073155, "learning_rate": 4.991422603151727e-06, "loss": 0.6959, "num_input_tokens_seen": 99614720, "step": 95 }, { "epoch": 0.10741258741258741, "grad_norm": 0.6831469800558394, "learning_rate": 4.991175929478894e-06, "loss": 0.8078, "num_input_tokens_seen": 100663296, "step": 96 }, { "epoch": 0.10853146853146853, "grad_norm": 0.40201262307966823, "learning_rate": 4.990925765188602e-06, "loss": 0.7607, "num_input_tokens_seen": 101711872, "step": 97 }, { "epoch": 0.10965034965034966, "grad_norm": 0.48692770468458546, "learning_rate": 4.990672110631379e-06, "loss": 0.8431, "num_input_tokens_seen": 102760448, "step": 98 }, { "epoch": 0.11076923076923077, "grad_norm": 0.423136817532724, "learning_rate": 4.9904149661626456e-06, "loss": 0.9402, "num_input_tokens_seen": 103809024, "step": 99 }, { "epoch": 0.11188811188811189, "grad_norm": 0.46330417197237517, "learning_rate": 4.990154332142708e-06, "loss": 0.793, "num_input_tokens_seen": 104857600, "step": 100 }, { "epoch": 0.113006993006993, "grad_norm": 0.4566154200639198, "learning_rate": 4.989890208936767e-06, "loss": 0.8518, "num_input_tokens_seen": 105906176, "step": 101 }, { "epoch": 0.11412587412587413, "grad_norm": 0.6518609235499214, "learning_rate": 4.989622596914908e-06, "loss": 0.9362, "num_input_tokens_seen": 106954752, "step": 102 }, { "epoch": 0.11524475524475525, "grad_norm": 0.4512494356433435, "learning_rate": 4.989351496452109e-06, "loss": 0.8008, "num_input_tokens_seen": 108003328, "step": 103 }, { "epoch": 0.11636363636363636, "grad_norm": 0.5225362125465312, "learning_rate": 4.989076907928233e-06, "loss": 0.7556, "num_input_tokens_seen": 109051904, "step": 104 }, { "epoch": 0.11748251748251748, "grad_norm": 0.4432962351297985, "learning_rate": 4.988798831728031e-06, "loss": 0.8277, "num_input_tokens_seen": 110100480, "step": 105 }, { "epoch": 0.11860139860139861, "grad_norm": 0.4673884950648465, "learning_rate": 4.988517268241142e-06, "loss": 0.95, "num_input_tokens_seen": 111149056, "step": 106 }, { "epoch": 0.11972027972027972, "grad_norm": 0.5060460426645152, "learning_rate": 4.988232217862091e-06, "loss": 0.8442, "num_input_tokens_seen": 112197632, "step": 107 }, { "epoch": 0.12083916083916084, "grad_norm": 0.4233795815046769, "learning_rate": 4.987943680990288e-06, "loss": 1.0015, "num_input_tokens_seen": 113246208, "step": 108 }, { "epoch": 0.12195804195804195, "grad_norm": 0.42949109054058604, "learning_rate": 4.9876516580300285e-06, "loss": 0.8104, "num_input_tokens_seen": 114294784, "step": 109 }, { "epoch": 0.12307692307692308, "grad_norm": 0.40613989787839777, "learning_rate": 4.987356149390493e-06, "loss": 1.0469, "num_input_tokens_seen": 115343360, "step": 110 }, { "epoch": 0.1241958041958042, "grad_norm": 0.5142165493685615, "learning_rate": 4.987057155485746e-06, "loss": 0.7177, "num_input_tokens_seen": 116391936, "step": 111 }, { "epoch": 0.12531468531468531, "grad_norm": 0.40026142261424, "learning_rate": 4.986754676734737e-06, "loss": 0.8348, "num_input_tokens_seen": 117440512, "step": 112 }, { "epoch": 0.12643356643356643, "grad_norm": 0.44774924111962405, "learning_rate": 4.986448713561295e-06, "loss": 0.7816, "num_input_tokens_seen": 118489088, "step": 113 }, { "epoch": 0.12755244755244755, "grad_norm": 0.3983308626677301, "learning_rate": 4.986139266394134e-06, "loss": 0.9002, "num_input_tokens_seen": 119537664, "step": 114 }, { "epoch": 0.12867132867132866, "grad_norm": 0.4465472780783531, "learning_rate": 4.9858263356668505e-06, "loss": 0.8056, "num_input_tokens_seen": 120586240, "step": 115 }, { "epoch": 0.1297902097902098, "grad_norm": 0.3838254603601786, "learning_rate": 4.9855099218179186e-06, "loss": 1.0116, "num_input_tokens_seen": 121634816, "step": 116 }, { "epoch": 0.13090909090909092, "grad_norm": 0.38722451107881567, "learning_rate": 4.985190025290696e-06, "loss": 0.8787, "num_input_tokens_seen": 122683392, "step": 117 }, { "epoch": 0.13202797202797203, "grad_norm": 0.4143314575191455, "learning_rate": 4.98486664653342e-06, "loss": 0.8072, "num_input_tokens_seen": 123731968, "step": 118 }, { "epoch": 0.13314685314685315, "grad_norm": 0.5625928131332723, "learning_rate": 4.984539785999205e-06, "loss": 0.8445, "num_input_tokens_seen": 124780544, "step": 119 }, { "epoch": 0.13426573426573427, "grad_norm": 0.4724639998743113, "learning_rate": 4.9842094441460476e-06, "loss": 0.9251, "num_input_tokens_seen": 125829120, "step": 120 }, { "epoch": 0.13538461538461538, "grad_norm": 0.4290455094721757, "learning_rate": 4.9838756214368185e-06, "loss": 0.8878, "num_input_tokens_seen": 126877696, "step": 121 }, { "epoch": 0.1365034965034965, "grad_norm": 0.43270552698582443, "learning_rate": 4.983538318339268e-06, "loss": 0.779, "num_input_tokens_seen": 127926272, "step": 122 }, { "epoch": 0.1376223776223776, "grad_norm": 0.3989028963501919, "learning_rate": 4.983197535326024e-06, "loss": 0.7563, "num_input_tokens_seen": 128974848, "step": 123 }, { "epoch": 0.13874125874125876, "grad_norm": 0.39828587702469376, "learning_rate": 4.982853272874589e-06, "loss": 0.7584, "num_input_tokens_seen": 130023424, "step": 124 }, { "epoch": 0.13986013986013987, "grad_norm": 0.48331907292674264, "learning_rate": 4.982505531467339e-06, "loss": 0.7233, "num_input_tokens_seen": 131072000, "step": 125 }, { "epoch": 0.14097902097902099, "grad_norm": 0.4049095833667207, "learning_rate": 4.982154311591529e-06, "loss": 0.865, "num_input_tokens_seen": 132120576, "step": 126 }, { "epoch": 0.1420979020979021, "grad_norm": 0.39289161038940934, "learning_rate": 4.981799613739284e-06, "loss": 0.7565, "num_input_tokens_seen": 133169152, "step": 127 }, { "epoch": 0.14321678321678322, "grad_norm": 0.39886411820570505, "learning_rate": 4.981441438407605e-06, "loss": 0.7785, "num_input_tokens_seen": 134217728, "step": 128 }, { "epoch": 0.14433566433566433, "grad_norm": 0.44735631697740225, "learning_rate": 4.981079786098365e-06, "loss": 0.8922, "num_input_tokens_seen": 135266304, "step": 129 }, { "epoch": 0.14545454545454545, "grad_norm": 0.4634531235807853, "learning_rate": 4.980714657318307e-06, "loss": 0.8254, "num_input_tokens_seen": 136314880, "step": 130 }, { "epoch": 0.14657342657342656, "grad_norm": 0.39461039693271, "learning_rate": 4.980346052579049e-06, "loss": 0.8365, "num_input_tokens_seen": 137363456, "step": 131 }, { "epoch": 0.1476923076923077, "grad_norm": 0.4295885791529507, "learning_rate": 4.979973972397075e-06, "loss": 0.8213, "num_input_tokens_seen": 138412032, "step": 132 }, { "epoch": 0.14881118881118882, "grad_norm": 0.3989982526001331, "learning_rate": 4.979598417293743e-06, "loss": 0.8138, "num_input_tokens_seen": 139460608, "step": 133 }, { "epoch": 0.14993006993006994, "grad_norm": 0.4027849806960307, "learning_rate": 4.9792193877952765e-06, "loss": 0.8871, "num_input_tokens_seen": 140509184, "step": 134 }, { "epoch": 0.15104895104895105, "grad_norm": 0.4193459178600057, "learning_rate": 4.97883688443277e-06, "loss": 0.7556, "num_input_tokens_seen": 141557760, "step": 135 }, { "epoch": 0.15216783216783217, "grad_norm": 0.4374451578106669, "learning_rate": 4.9784509077421836e-06, "loss": 0.831, "num_input_tokens_seen": 142606336, "step": 136 }, { "epoch": 0.15328671328671328, "grad_norm": 0.4534956209731893, "learning_rate": 4.978061458264346e-06, "loss": 0.8148, "num_input_tokens_seen": 143654912, "step": 137 }, { "epoch": 0.1544055944055944, "grad_norm": 0.4476767435986571, "learning_rate": 4.97766853654495e-06, "loss": 0.8116, "num_input_tokens_seen": 144703488, "step": 138 }, { "epoch": 0.15552447552447551, "grad_norm": 0.38264935412655576, "learning_rate": 4.977272143134554e-06, "loss": 0.8231, "num_input_tokens_seen": 145752064, "step": 139 }, { "epoch": 0.15664335664335666, "grad_norm": 0.41701177911525683, "learning_rate": 4.976872278588582e-06, "loss": 0.9111, "num_input_tokens_seen": 146800640, "step": 140 }, { "epoch": 0.15776223776223777, "grad_norm": 0.4243716873570224, "learning_rate": 4.976468943467323e-06, "loss": 0.8487, "num_input_tokens_seen": 147849216, "step": 141 }, { "epoch": 0.1588811188811189, "grad_norm": 0.43896864873420016, "learning_rate": 4.976062138335926e-06, "loss": 0.76, "num_input_tokens_seen": 148897792, "step": 142 }, { "epoch": 0.16, "grad_norm": 0.9328177488300727, "learning_rate": 4.975651863764403e-06, "loss": 0.7741, "num_input_tokens_seen": 149946368, "step": 143 }, { "epoch": 0.16111888111888112, "grad_norm": 0.40793311152545136, "learning_rate": 4.975238120327628e-06, "loss": 0.8565, "num_input_tokens_seen": 150994944, "step": 144 }, { "epoch": 0.16223776223776223, "grad_norm": 0.475119260990423, "learning_rate": 4.974820908605336e-06, "loss": 0.8921, "num_input_tokens_seen": 152043520, "step": 145 }, { "epoch": 0.16335664335664335, "grad_norm": 0.5006736281067129, "learning_rate": 4.974400229182119e-06, "loss": 0.9724, "num_input_tokens_seen": 153092096, "step": 146 }, { "epoch": 0.16447552447552446, "grad_norm": 0.4478011243221582, "learning_rate": 4.973976082647432e-06, "loss": 0.7849, "num_input_tokens_seen": 154140672, "step": 147 }, { "epoch": 0.1655944055944056, "grad_norm": 0.4594701430101693, "learning_rate": 4.973548469595585e-06, "loss": 0.774, "num_input_tokens_seen": 155189248, "step": 148 }, { "epoch": 0.16671328671328672, "grad_norm": 0.4695379376726495, "learning_rate": 4.973117390625746e-06, "loss": 0.7858, "num_input_tokens_seen": 156237824, "step": 149 }, { "epoch": 0.16783216783216784, "grad_norm": 0.4202301394288576, "learning_rate": 4.972682846341941e-06, "loss": 0.8224, "num_input_tokens_seen": 157286400, "step": 150 }, { "epoch": 0.16895104895104895, "grad_norm": 0.5287861727776014, "learning_rate": 4.97224483735305e-06, "loss": 0.8042, "num_input_tokens_seen": 158334976, "step": 151 }, { "epoch": 0.17006993006993007, "grad_norm": 0.4401097240532196, "learning_rate": 4.971803364272806e-06, "loss": 0.867, "num_input_tokens_seen": 159383552, "step": 152 }, { "epoch": 0.17118881118881119, "grad_norm": 0.4171566923905569, "learning_rate": 4.9713584277198e-06, "loss": 0.8237, "num_input_tokens_seen": 160432128, "step": 153 }, { "epoch": 0.1723076923076923, "grad_norm": 0.36948722090578073, "learning_rate": 4.9709100283174735e-06, "loss": 0.7817, "num_input_tokens_seen": 161480704, "step": 154 }, { "epoch": 0.17342657342657342, "grad_norm": 0.43263527009068387, "learning_rate": 4.97045816669412e-06, "loss": 0.8944, "num_input_tokens_seen": 162529280, "step": 155 }, { "epoch": 0.17454545454545456, "grad_norm": 0.3988433089368691, "learning_rate": 4.970002843482885e-06, "loss": 0.75, "num_input_tokens_seen": 163577856, "step": 156 }, { "epoch": 0.17566433566433567, "grad_norm": 0.45339986900011503, "learning_rate": 4.9695440593217635e-06, "loss": 0.9354, "num_input_tokens_seen": 164626432, "step": 157 }, { "epoch": 0.1767832167832168, "grad_norm": 0.4211991604608452, "learning_rate": 4.969081814853601e-06, "loss": 0.7858, "num_input_tokens_seen": 165675008, "step": 158 }, { "epoch": 0.1779020979020979, "grad_norm": 0.37418609994913754, "learning_rate": 4.9686161107260906e-06, "loss": 0.8166, "num_input_tokens_seen": 166723584, "step": 159 }, { "epoch": 0.17902097902097902, "grad_norm": 0.377800362263991, "learning_rate": 4.9681469475917746e-06, "loss": 0.7839, "num_input_tokens_seen": 167772160, "step": 160 }, { "epoch": 0.18013986013986014, "grad_norm": 0.8856941625448042, "learning_rate": 4.967674326108039e-06, "loss": 0.694, "num_input_tokens_seen": 168820736, "step": 161 }, { "epoch": 0.18125874125874125, "grad_norm": 0.46788344308563995, "learning_rate": 4.967198246937119e-06, "loss": 0.7829, "num_input_tokens_seen": 169869312, "step": 162 }, { "epoch": 0.18237762237762237, "grad_norm": 0.4065392285627104, "learning_rate": 4.9667187107460934e-06, "loss": 0.9115, "num_input_tokens_seen": 170917888, "step": 163 }, { "epoch": 0.1834965034965035, "grad_norm": 0.44989616201000493, "learning_rate": 4.966235718206885e-06, "loss": 0.7645, "num_input_tokens_seen": 171966464, "step": 164 }, { "epoch": 0.18461538461538463, "grad_norm": 0.44208013683389097, "learning_rate": 4.965749269996258e-06, "loss": 0.8021, "num_input_tokens_seen": 173015040, "step": 165 }, { "epoch": 0.18573426573426574, "grad_norm": 0.42605381158362515, "learning_rate": 4.965259366795821e-06, "loss": 0.9263, "num_input_tokens_seen": 174063616, "step": 166 }, { "epoch": 0.18685314685314686, "grad_norm": 0.44906438738123644, "learning_rate": 4.964766009292022e-06, "loss": 0.7471, "num_input_tokens_seen": 175112192, "step": 167 }, { "epoch": 0.18797202797202797, "grad_norm": 0.4538207754933081, "learning_rate": 4.964269198176152e-06, "loss": 0.7309, "num_input_tokens_seen": 176160768, "step": 168 }, { "epoch": 0.1890909090909091, "grad_norm": 0.42995201077965706, "learning_rate": 4.963768934144336e-06, "loss": 0.7829, "num_input_tokens_seen": 177209344, "step": 169 }, { "epoch": 0.1902097902097902, "grad_norm": 0.4888137353825207, "learning_rate": 4.963265217897543e-06, "loss": 0.7963, "num_input_tokens_seen": 178257920, "step": 170 }, { "epoch": 0.19132867132867132, "grad_norm": 0.45905309466805105, "learning_rate": 4.962758050141576e-06, "loss": 0.7968, "num_input_tokens_seen": 179306496, "step": 171 }, { "epoch": 0.19244755244755246, "grad_norm": 0.4078113198062552, "learning_rate": 4.962247431587073e-06, "loss": 0.7048, "num_input_tokens_seen": 180355072, "step": 172 }, { "epoch": 0.19356643356643358, "grad_norm": 0.5116435936619046, "learning_rate": 4.96173336294951e-06, "loss": 0.8132, "num_input_tokens_seen": 181403648, "step": 173 }, { "epoch": 0.1946853146853147, "grad_norm": 0.48253559642797955, "learning_rate": 4.961215844949197e-06, "loss": 0.7951, "num_input_tokens_seen": 182452224, "step": 174 }, { "epoch": 0.1958041958041958, "grad_norm": 0.4321408916292664, "learning_rate": 4.960694878311276e-06, "loss": 0.865, "num_input_tokens_seen": 183500800, "step": 175 }, { "epoch": 0.19692307692307692, "grad_norm": 0.5178358008578322, "learning_rate": 4.9601704637657225e-06, "loss": 0.7015, "num_input_tokens_seen": 184549376, "step": 176 }, { "epoch": 0.19804195804195804, "grad_norm": 0.3945026845377132, "learning_rate": 4.959642602047339e-06, "loss": 0.7767, "num_input_tokens_seen": 185597952, "step": 177 }, { "epoch": 0.19916083916083915, "grad_norm": 0.433836446042042, "learning_rate": 4.959111293895765e-06, "loss": 0.8576, "num_input_tokens_seen": 186646528, "step": 178 }, { "epoch": 0.20027972027972027, "grad_norm": 0.3977107707409774, "learning_rate": 4.958576540055464e-06, "loss": 0.8765, "num_input_tokens_seen": 187695104, "step": 179 }, { "epoch": 0.2013986013986014, "grad_norm": 0.44253150231673893, "learning_rate": 4.95803834127573e-06, "loss": 0.9279, "num_input_tokens_seen": 188743680, "step": 180 }, { "epoch": 0.20251748251748253, "grad_norm": 0.41462785773815153, "learning_rate": 4.9574966983106824e-06, "loss": 0.7802, "num_input_tokens_seen": 189792256, "step": 181 }, { "epoch": 0.20363636363636364, "grad_norm": 0.4565381254500923, "learning_rate": 4.956951611919267e-06, "loss": 0.717, "num_input_tokens_seen": 190840832, "step": 182 }, { "epoch": 0.20475524475524476, "grad_norm": 0.44694110534362946, "learning_rate": 4.9564030828652565e-06, "loss": 0.7341, "num_input_tokens_seen": 191889408, "step": 183 }, { "epoch": 0.20587412587412587, "grad_norm": 0.41166784994343303, "learning_rate": 4.955851111917245e-06, "loss": 0.6868, "num_input_tokens_seen": 192937984, "step": 184 }, { "epoch": 0.206993006993007, "grad_norm": 0.4348937777415701, "learning_rate": 4.955295699848649e-06, "loss": 0.8103, "num_input_tokens_seen": 193986560, "step": 185 }, { "epoch": 0.2081118881118881, "grad_norm": 0.3821135760408287, "learning_rate": 4.954736847437709e-06, "loss": 0.7368, "num_input_tokens_seen": 195035136, "step": 186 }, { "epoch": 0.20923076923076922, "grad_norm": 0.47058686594138116, "learning_rate": 4.954174555467484e-06, "loss": 0.9349, "num_input_tokens_seen": 196083712, "step": 187 }, { "epoch": 0.21034965034965036, "grad_norm": 0.4002484384001902, "learning_rate": 4.953608824725855e-06, "loss": 0.8829, "num_input_tokens_seen": 197132288, "step": 188 }, { "epoch": 0.21146853146853148, "grad_norm": 0.4556836893720014, "learning_rate": 4.953039656005519e-06, "loss": 1.0609, "num_input_tokens_seen": 198180864, "step": 189 }, { "epoch": 0.2125874125874126, "grad_norm": 0.43181189095101985, "learning_rate": 4.95246705010399e-06, "loss": 0.7973, "num_input_tokens_seen": 199229440, "step": 190 }, { "epoch": 0.2137062937062937, "grad_norm": 0.40578711899024605, "learning_rate": 4.951891007823601e-06, "loss": 0.7003, "num_input_tokens_seen": 200278016, "step": 191 }, { "epoch": 0.21482517482517482, "grad_norm": 0.6396750817418133, "learning_rate": 4.951311529971496e-06, "loss": 0.8175, "num_input_tokens_seen": 201326592, "step": 192 }, { "epoch": 0.21594405594405594, "grad_norm": 0.46439444718152895, "learning_rate": 4.950728617359637e-06, "loss": 0.7979, "num_input_tokens_seen": 202375168, "step": 193 }, { "epoch": 0.21706293706293706, "grad_norm": 0.405028495828316, "learning_rate": 4.950142270804797e-06, "loss": 0.7616, "num_input_tokens_seen": 203423744, "step": 194 }, { "epoch": 0.21818181818181817, "grad_norm": 0.40111406632451735, "learning_rate": 4.949552491128559e-06, "loss": 0.6991, "num_input_tokens_seen": 204472320, "step": 195 }, { "epoch": 0.21930069930069931, "grad_norm": 0.4206905273951015, "learning_rate": 4.948959279157319e-06, "loss": 0.8588, "num_input_tokens_seen": 205520896, "step": 196 }, { "epoch": 0.22041958041958043, "grad_norm": 0.4611302946403387, "learning_rate": 4.948362635722281e-06, "loss": 0.7721, "num_input_tokens_seen": 206569472, "step": 197 }, { "epoch": 0.22153846153846155, "grad_norm": 0.43177497169130546, "learning_rate": 4.947762561659457e-06, "loss": 0.8775, "num_input_tokens_seen": 207618048, "step": 198 }, { "epoch": 0.22265734265734266, "grad_norm": 0.6864956444481425, "learning_rate": 4.947159057809668e-06, "loss": 0.9032, "num_input_tokens_seen": 208666624, "step": 199 }, { "epoch": 0.22377622377622378, "grad_norm": 0.40690777829157665, "learning_rate": 4.9465521250185365e-06, "loss": 0.8398, "num_input_tokens_seen": 209715200, "step": 200 }, { "epoch": 0.2248951048951049, "grad_norm": 0.47861209828375845, "learning_rate": 4.945941764136494e-06, "loss": 0.7901, "num_input_tokens_seen": 210763776, "step": 201 }, { "epoch": 0.226013986013986, "grad_norm": 0.3975909591733088, "learning_rate": 4.945327976018774e-06, "loss": 0.9251, "num_input_tokens_seen": 211812352, "step": 202 }, { "epoch": 0.22713286713286712, "grad_norm": 0.4307744172802356, "learning_rate": 4.944710761525411e-06, "loss": 0.7021, "num_input_tokens_seen": 212860928, "step": 203 }, { "epoch": 0.22825174825174827, "grad_norm": 0.40365877331472794, "learning_rate": 4.944090121521242e-06, "loss": 0.767, "num_input_tokens_seen": 213909504, "step": 204 }, { "epoch": 0.22937062937062938, "grad_norm": 0.4306615257931823, "learning_rate": 4.943466056875903e-06, "loss": 0.7803, "num_input_tokens_seen": 214958080, "step": 205 }, { "epoch": 0.2304895104895105, "grad_norm": 0.463503253858425, "learning_rate": 4.942838568463829e-06, "loss": 0.6921, "num_input_tokens_seen": 216006656, "step": 206 }, { "epoch": 0.2316083916083916, "grad_norm": 0.4256405377091981, "learning_rate": 4.9422076571642516e-06, "loss": 0.8301, "num_input_tokens_seen": 217055232, "step": 207 }, { "epoch": 0.23272727272727273, "grad_norm": 0.5090759343551103, "learning_rate": 4.9415733238612e-06, "loss": 0.7935, "num_input_tokens_seen": 218103808, "step": 208 }, { "epoch": 0.23384615384615384, "grad_norm": 0.4425631862517256, "learning_rate": 4.940935569443496e-06, "loss": 0.7251, "num_input_tokens_seen": 219152384, "step": 209 }, { "epoch": 0.23496503496503496, "grad_norm": 0.4175194662149566, "learning_rate": 4.940294394804757e-06, "loss": 0.8056, "num_input_tokens_seen": 220200960, "step": 210 }, { "epoch": 0.23608391608391607, "grad_norm": 0.5218114296991244, "learning_rate": 4.939649800843394e-06, "loss": 0.709, "num_input_tokens_seen": 221249536, "step": 211 }, { "epoch": 0.23720279720279722, "grad_norm": 0.3788624259359706, "learning_rate": 4.939001788462604e-06, "loss": 0.8409, "num_input_tokens_seen": 222298112, "step": 212 }, { "epoch": 0.23832167832167833, "grad_norm": 0.41562725170234655, "learning_rate": 4.93835035857038e-06, "loss": 0.8815, "num_input_tokens_seen": 223346688, "step": 213 }, { "epoch": 0.23944055944055945, "grad_norm": 0.5234109387187096, "learning_rate": 4.9376955120795e-06, "loss": 0.8565, "num_input_tokens_seen": 224395264, "step": 214 }, { "epoch": 0.24055944055944056, "grad_norm": 0.48534827415949744, "learning_rate": 4.937037249907529e-06, "loss": 0.8288, "num_input_tokens_seen": 225443840, "step": 215 }, { "epoch": 0.24167832167832168, "grad_norm": 0.6143944983631473, "learning_rate": 4.936375572976822e-06, "loss": 0.8191, "num_input_tokens_seen": 226492416, "step": 216 }, { "epoch": 0.2427972027972028, "grad_norm": 0.4102404834787625, "learning_rate": 4.935710482214512e-06, "loss": 0.7696, "num_input_tokens_seen": 227540992, "step": 217 }, { "epoch": 0.2439160839160839, "grad_norm": 0.6206628138160337, "learning_rate": 4.935041978552522e-06, "loss": 0.8006, "num_input_tokens_seen": 228589568, "step": 218 }, { "epoch": 0.24503496503496502, "grad_norm": 0.44243651312603277, "learning_rate": 4.9343700629275525e-06, "loss": 0.8424, "num_input_tokens_seen": 229638144, "step": 219 }, { "epoch": 0.24615384615384617, "grad_norm": 0.5357996045267722, "learning_rate": 4.933694736281089e-06, "loss": 0.8801, "num_input_tokens_seen": 230686720, "step": 220 }, { "epoch": 0.24727272727272728, "grad_norm": 0.47530346822086145, "learning_rate": 4.9330159995593926e-06, "loss": 0.8984, "num_input_tokens_seen": 231735296, "step": 221 }, { "epoch": 0.2483916083916084, "grad_norm": 0.5020961951456309, "learning_rate": 4.932333853713505e-06, "loss": 0.9508, "num_input_tokens_seen": 232783872, "step": 222 }, { "epoch": 0.2495104895104895, "grad_norm": 0.5187698636305198, "learning_rate": 4.931648299699245e-06, "loss": 0.7423, "num_input_tokens_seen": 233832448, "step": 223 }, { "epoch": 0.25062937062937063, "grad_norm": 0.47060346073742854, "learning_rate": 4.930959338477203e-06, "loss": 0.7948, "num_input_tokens_seen": 234881024, "step": 224 }, { "epoch": 0.2517482517482518, "grad_norm": 0.533187039209635, "learning_rate": 4.930266971012748e-06, "loss": 0.7943, "num_input_tokens_seen": 235929600, "step": 225 }, { "epoch": 0.25286713286713286, "grad_norm": 0.4955130908255806, "learning_rate": 4.92957119827602e-06, "loss": 0.7972, "num_input_tokens_seen": 236978176, "step": 226 }, { "epoch": 0.253986013986014, "grad_norm": 0.3828018044920714, "learning_rate": 4.928872021241932e-06, "loss": 0.7005, "num_input_tokens_seen": 238026752, "step": 227 }, { "epoch": 0.2551048951048951, "grad_norm": 0.7728633234228283, "learning_rate": 4.928169440890164e-06, "loss": 0.7865, "num_input_tokens_seen": 239075328, "step": 228 }, { "epoch": 0.25622377622377623, "grad_norm": 0.47505500529340566, "learning_rate": 4.927463458205167e-06, "loss": 0.8001, "num_input_tokens_seen": 240123904, "step": 229 }, { "epoch": 0.2573426573426573, "grad_norm": 0.532109727576613, "learning_rate": 4.926754074176159e-06, "loss": 0.7972, "num_input_tokens_seen": 241172480, "step": 230 }, { "epoch": 0.25846153846153846, "grad_norm": 0.49736970412275383, "learning_rate": 4.9260412897971225e-06, "loss": 0.756, "num_input_tokens_seen": 242221056, "step": 231 }, { "epoch": 0.2595804195804196, "grad_norm": 0.44475582902002786, "learning_rate": 4.925325106066808e-06, "loss": 0.7881, "num_input_tokens_seen": 243269632, "step": 232 }, { "epoch": 0.2606993006993007, "grad_norm": 0.43609783881416647, "learning_rate": 4.9246055239887255e-06, "loss": 0.7635, "num_input_tokens_seen": 244318208, "step": 233 }, { "epoch": 0.26181818181818184, "grad_norm": 1.030523306996848, "learning_rate": 4.923882544571148e-06, "loss": 0.8651, "num_input_tokens_seen": 245366784, "step": 234 }, { "epoch": 0.2629370629370629, "grad_norm": 0.8269731350372227, "learning_rate": 4.923156168827109e-06, "loss": 0.8524, "num_input_tokens_seen": 246415360, "step": 235 }, { "epoch": 0.26405594405594407, "grad_norm": 0.4957827807892276, "learning_rate": 4.922426397774402e-06, "loss": 0.9272, "num_input_tokens_seen": 247463936, "step": 236 }, { "epoch": 0.26517482517482516, "grad_norm": 0.4638255664556491, "learning_rate": 4.9216932324355755e-06, "loss": 0.8336, "num_input_tokens_seen": 248512512, "step": 237 }, { "epoch": 0.2662937062937063, "grad_norm": 0.4642790324241793, "learning_rate": 4.920956673837936e-06, "loss": 0.8021, "num_input_tokens_seen": 249561088, "step": 238 }, { "epoch": 0.2674125874125874, "grad_norm": 0.48301304738300044, "learning_rate": 4.920216723013544e-06, "loss": 0.7572, "num_input_tokens_seen": 250609664, "step": 239 }, { "epoch": 0.26853146853146853, "grad_norm": 0.4290984355400012, "learning_rate": 4.919473380999212e-06, "loss": 0.773, "num_input_tokens_seen": 251658240, "step": 240 }, { "epoch": 0.2696503496503497, "grad_norm": 0.4496379240203434, "learning_rate": 4.918726648836507e-06, "loss": 0.8538, "num_input_tokens_seen": 252706816, "step": 241 }, { "epoch": 0.27076923076923076, "grad_norm": 0.45478793309867555, "learning_rate": 4.917976527571745e-06, "loss": 0.7968, "num_input_tokens_seen": 253755392, "step": 242 }, { "epoch": 0.2718881118881119, "grad_norm": 0.6008993780883729, "learning_rate": 4.917223018255989e-06, "loss": 0.6936, "num_input_tokens_seen": 254803968, "step": 243 }, { "epoch": 0.273006993006993, "grad_norm": 0.5451271544078405, "learning_rate": 4.9164661219450504e-06, "loss": 0.8573, "num_input_tokens_seen": 255852544, "step": 244 }, { "epoch": 0.27412587412587414, "grad_norm": 0.443045393020945, "learning_rate": 4.915705839699488e-06, "loss": 0.7018, "num_input_tokens_seen": 256901120, "step": 245 }, { "epoch": 0.2752447552447552, "grad_norm": 0.5130767845579554, "learning_rate": 4.914942172584605e-06, "loss": 0.9167, "num_input_tokens_seen": 257949696, "step": 246 }, { "epoch": 0.27636363636363637, "grad_norm": 0.4536420148834774, "learning_rate": 4.914175121670443e-06, "loss": 0.7525, "num_input_tokens_seen": 258998272, "step": 247 }, { "epoch": 0.2774825174825175, "grad_norm": 0.4409979862477785, "learning_rate": 4.9134046880317895e-06, "loss": 0.859, "num_input_tokens_seen": 260046848, "step": 248 }, { "epoch": 0.2786013986013986, "grad_norm": 0.46212524826358076, "learning_rate": 4.912630872748171e-06, "loss": 0.7317, "num_input_tokens_seen": 261095424, "step": 249 }, { "epoch": 0.27972027972027974, "grad_norm": 0.3918614212195811, "learning_rate": 4.911853676903851e-06, "loss": 0.7855, "num_input_tokens_seen": 262144000, "step": 250 }, { "epoch": 0.27972027972027974, "eval_loss": 0.7918664813041687, "eval_runtime": 248.3569, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "num_input_tokens_seen": 262144000, "step": 250 }, { "epoch": 0.28083916083916083, "grad_norm": 0.5431527194603042, "learning_rate": 4.911073101587831e-06, "loss": 0.7646, "num_input_tokens_seen": 263192576, "step": 251 }, { "epoch": 0.28195804195804197, "grad_norm": 0.4148029881928778, "learning_rate": 4.9102891478938475e-06, "loss": 0.7942, "num_input_tokens_seen": 264241152, "step": 252 }, { "epoch": 0.28307692307692306, "grad_norm": 0.4297159978555356, "learning_rate": 4.90950181692037e-06, "loss": 0.7701, "num_input_tokens_seen": 265289728, "step": 253 }, { "epoch": 0.2841958041958042, "grad_norm": 0.49403794052294653, "learning_rate": 4.908711109770602e-06, "loss": 0.8457, "num_input_tokens_seen": 266338304, "step": 254 }, { "epoch": 0.2853146853146853, "grad_norm": 0.4056980457678907, "learning_rate": 4.9079170275524765e-06, "loss": 0.7976, "num_input_tokens_seen": 267386880, "step": 255 }, { "epoch": 0.28643356643356643, "grad_norm": 0.40271142022184997, "learning_rate": 4.907119571378655e-06, "loss": 0.8408, "num_input_tokens_seen": 268435456, "step": 256 }, { "epoch": 0.2875524475524476, "grad_norm": 0.44472804752053346, "learning_rate": 4.906318742366527e-06, "loss": 0.8099, "num_input_tokens_seen": 269484032, "step": 257 }, { "epoch": 0.28867132867132866, "grad_norm": 0.44024694779532203, "learning_rate": 4.90551454163821e-06, "loss": 0.8456, "num_input_tokens_seen": 270532608, "step": 258 }, { "epoch": 0.2897902097902098, "grad_norm": 0.4014262596226305, "learning_rate": 4.904706970320542e-06, "loss": 0.7634, "num_input_tokens_seen": 271581184, "step": 259 }, { "epoch": 0.2909090909090909, "grad_norm": 0.48646437135989745, "learning_rate": 4.9038960295450865e-06, "loss": 0.8899, "num_input_tokens_seen": 272629760, "step": 260 }, { "epoch": 0.29202797202797204, "grad_norm": 0.4185096867519385, "learning_rate": 4.903081720448128e-06, "loss": 0.7128, "num_input_tokens_seen": 273678336, "step": 261 }, { "epoch": 0.2931468531468531, "grad_norm": 0.514500237124604, "learning_rate": 4.902264044170671e-06, "loss": 0.7855, "num_input_tokens_seen": 274726912, "step": 262 }, { "epoch": 0.29426573426573427, "grad_norm": 0.48437113564054995, "learning_rate": 4.901443001858438e-06, "loss": 0.788, "num_input_tokens_seen": 275775488, "step": 263 }, { "epoch": 0.2953846153846154, "grad_norm": 0.4512591383562876, "learning_rate": 4.900618594661865e-06, "loss": 0.7231, "num_input_tokens_seen": 276824064, "step": 264 }, { "epoch": 0.2965034965034965, "grad_norm": 0.40119054524287867, "learning_rate": 4.899790823736108e-06, "loss": 0.7654, "num_input_tokens_seen": 277872640, "step": 265 }, { "epoch": 0.29762237762237764, "grad_norm": 0.42759114943072035, "learning_rate": 4.898959690241033e-06, "loss": 0.8424, "num_input_tokens_seen": 278921216, "step": 266 }, { "epoch": 0.29874125874125873, "grad_norm": 0.43364779651603524, "learning_rate": 4.898125195341217e-06, "loss": 0.9365, "num_input_tokens_seen": 279969792, "step": 267 }, { "epoch": 0.2998601398601399, "grad_norm": 0.3918533183296276, "learning_rate": 4.897287340205948e-06, "loss": 0.7048, "num_input_tokens_seen": 281018368, "step": 268 }, { "epoch": 0.30097902097902096, "grad_norm": 0.395933629732131, "learning_rate": 4.896446126009224e-06, "loss": 0.7451, "num_input_tokens_seen": 282066944, "step": 269 }, { "epoch": 0.3020979020979021, "grad_norm": 0.4064258373536968, "learning_rate": 4.895601553929748e-06, "loss": 0.8328, "num_input_tokens_seen": 283115520, "step": 270 }, { "epoch": 0.3032167832167832, "grad_norm": 0.4669557757819441, "learning_rate": 4.894753625150927e-06, "loss": 0.8177, "num_input_tokens_seen": 284164096, "step": 271 }, { "epoch": 0.30433566433566434, "grad_norm": 0.44682668234509043, "learning_rate": 4.893902340860872e-06, "loss": 0.7319, "num_input_tokens_seen": 285212672, "step": 272 }, { "epoch": 0.3054545454545455, "grad_norm": 0.38146264612882813, "learning_rate": 4.893047702252399e-06, "loss": 0.8058, "num_input_tokens_seen": 286261248, "step": 273 }, { "epoch": 0.30657342657342657, "grad_norm": 0.3910843337767334, "learning_rate": 4.89218971052302e-06, "loss": 0.657, "num_input_tokens_seen": 287309824, "step": 274 }, { "epoch": 0.3076923076923077, "grad_norm": 0.4189124587899034, "learning_rate": 4.891328366874946e-06, "loss": 0.68, "num_input_tokens_seen": 288358400, "step": 275 }, { "epoch": 0.3088111888111888, "grad_norm": 0.40986260199755353, "learning_rate": 4.890463672515086e-06, "loss": 0.6954, "num_input_tokens_seen": 289406976, "step": 276 }, { "epoch": 0.30993006993006994, "grad_norm": 0.38217157754635894, "learning_rate": 4.889595628655044e-06, "loss": 0.7821, "num_input_tokens_seen": 290455552, "step": 277 }, { "epoch": 0.31104895104895103, "grad_norm": 0.3800744668245649, "learning_rate": 4.8887242365111155e-06, "loss": 0.8368, "num_input_tokens_seen": 291504128, "step": 278 }, { "epoch": 0.31216783216783217, "grad_norm": 0.39866234514508864, "learning_rate": 4.887849497304289e-06, "loss": 0.8302, "num_input_tokens_seen": 292552704, "step": 279 }, { "epoch": 0.3132867132867133, "grad_norm": 0.4237244664749374, "learning_rate": 4.886971412260244e-06, "loss": 0.7727, "num_input_tokens_seen": 293601280, "step": 280 }, { "epoch": 0.3144055944055944, "grad_norm": 0.43271918237594864, "learning_rate": 4.886089982609345e-06, "loss": 0.7328, "num_input_tokens_seen": 294649856, "step": 281 }, { "epoch": 0.31552447552447555, "grad_norm": 0.4363009000967073, "learning_rate": 4.885205209586647e-06, "loss": 0.7126, "num_input_tokens_seen": 295698432, "step": 282 }, { "epoch": 0.31664335664335663, "grad_norm": 0.4739503328479856, "learning_rate": 4.8843170944318855e-06, "loss": 0.8727, "num_input_tokens_seen": 296747008, "step": 283 }, { "epoch": 0.3177622377622378, "grad_norm": 0.4390079661697419, "learning_rate": 4.883425638389482e-06, "loss": 0.9965, "num_input_tokens_seen": 297795584, "step": 284 }, { "epoch": 0.31888111888111886, "grad_norm": 0.41149527558303306, "learning_rate": 4.882530842708537e-06, "loss": 0.7599, "num_input_tokens_seen": 298844160, "step": 285 }, { "epoch": 0.32, "grad_norm": 0.41674309384859703, "learning_rate": 4.881632708642832e-06, "loss": 0.6943, "num_input_tokens_seen": 299892736, "step": 286 }, { "epoch": 0.3211188811188811, "grad_norm": 0.4275312004682713, "learning_rate": 4.880731237450828e-06, "loss": 0.7503, "num_input_tokens_seen": 300941312, "step": 287 }, { "epoch": 0.32223776223776224, "grad_norm": 0.44992140868787484, "learning_rate": 4.8798264303956565e-06, "loss": 0.7757, "num_input_tokens_seen": 301989888, "step": 288 }, { "epoch": 0.3233566433566434, "grad_norm": 0.4216150387820347, "learning_rate": 4.878918288745128e-06, "loss": 0.757, "num_input_tokens_seen": 303038464, "step": 289 }, { "epoch": 0.32447552447552447, "grad_norm": 0.40284297005190073, "learning_rate": 4.8780068137717255e-06, "loss": 0.7132, "num_input_tokens_seen": 304087040, "step": 290 }, { "epoch": 0.3255944055944056, "grad_norm": 0.40868454246969055, "learning_rate": 4.877092006752599e-06, "loss": 0.7994, "num_input_tokens_seen": 305135616, "step": 291 }, { "epoch": 0.3267132867132867, "grad_norm": 0.46941496964349755, "learning_rate": 4.8761738689695695e-06, "loss": 0.9409, "num_input_tokens_seen": 306184192, "step": 292 }, { "epoch": 0.32783216783216784, "grad_norm": 0.42257175114987533, "learning_rate": 4.875252401709126e-06, "loss": 0.7061, "num_input_tokens_seen": 307232768, "step": 293 }, { "epoch": 0.32895104895104893, "grad_norm": 0.38452544600456867, "learning_rate": 4.8743276062624214e-06, "loss": 0.8946, "num_input_tokens_seen": 308281344, "step": 294 }, { "epoch": 0.3300699300699301, "grad_norm": 0.40199358440613153, "learning_rate": 4.873399483925272e-06, "loss": 0.7836, "num_input_tokens_seen": 309329920, "step": 295 }, { "epoch": 0.3311888111888112, "grad_norm": 0.4701085727016215, "learning_rate": 4.872468035998155e-06, "loss": 0.753, "num_input_tokens_seen": 310378496, "step": 296 }, { "epoch": 0.3323076923076923, "grad_norm": 0.42603733087753504, "learning_rate": 4.87153326378621e-06, "loss": 0.8541, "num_input_tokens_seen": 311427072, "step": 297 }, { "epoch": 0.33342657342657345, "grad_norm": 0.4344592429970221, "learning_rate": 4.8705951685992325e-06, "loss": 0.7654, "num_input_tokens_seen": 312475648, "step": 298 }, { "epoch": 0.33454545454545453, "grad_norm": 0.423017887104777, "learning_rate": 4.8696537517516754e-06, "loss": 0.8081, "num_input_tokens_seen": 313524224, "step": 299 }, { "epoch": 0.3356643356643357, "grad_norm": 0.45192090622114983, "learning_rate": 4.868709014562643e-06, "loss": 0.6635, "num_input_tokens_seen": 314572800, "step": 300 }, { "epoch": 0.33678321678321677, "grad_norm": 0.398578662086278, "learning_rate": 4.8677609583558956e-06, "loss": 0.8295, "num_input_tokens_seen": 315621376, "step": 301 }, { "epoch": 0.3379020979020979, "grad_norm": 0.437637186912362, "learning_rate": 4.866809584459842e-06, "loss": 0.824, "num_input_tokens_seen": 316669952, "step": 302 }, { "epoch": 0.339020979020979, "grad_norm": 0.393268895107929, "learning_rate": 4.865854894207541e-06, "loss": 0.6988, "num_input_tokens_seen": 317718528, "step": 303 }, { "epoch": 0.34013986013986014, "grad_norm": 0.5101376372931259, "learning_rate": 4.864896888936698e-06, "loss": 0.7389, "num_input_tokens_seen": 318767104, "step": 304 }, { "epoch": 0.3412587412587413, "grad_norm": 0.43350607551769993, "learning_rate": 4.863935569989662e-06, "loss": 0.8982, "num_input_tokens_seen": 319815680, "step": 305 }, { "epoch": 0.34237762237762237, "grad_norm": 0.518334840128171, "learning_rate": 4.8629709387134255e-06, "loss": 0.8236, "num_input_tokens_seen": 320864256, "step": 306 }, { "epoch": 0.3434965034965035, "grad_norm": 0.4465505706007649, "learning_rate": 4.8620029964596234e-06, "loss": 0.7898, "num_input_tokens_seen": 321912832, "step": 307 }, { "epoch": 0.3446153846153846, "grad_norm": 0.46890671387936755, "learning_rate": 4.86103174458453e-06, "loss": 0.7371, "num_input_tokens_seen": 322961408, "step": 308 }, { "epoch": 0.34573426573426574, "grad_norm": 0.4834077494786035, "learning_rate": 4.860057184449057e-06, "loss": 0.8417, "num_input_tokens_seen": 324009984, "step": 309 }, { "epoch": 0.34685314685314683, "grad_norm": 0.5240402975328184, "learning_rate": 4.8590793174187486e-06, "loss": 0.7701, "num_input_tokens_seen": 325058560, "step": 310 }, { "epoch": 0.347972027972028, "grad_norm": 0.418925402493061, "learning_rate": 4.858098144863786e-06, "loss": 0.6911, "num_input_tokens_seen": 326107136, "step": 311 }, { "epoch": 0.3490909090909091, "grad_norm": 0.5168240034684315, "learning_rate": 4.85711366815898e-06, "loss": 0.8692, "num_input_tokens_seen": 327155712, "step": 312 }, { "epoch": 0.3502097902097902, "grad_norm": 0.4353466994938481, "learning_rate": 4.856125888683775e-06, "loss": 0.8684, "num_input_tokens_seen": 328204288, "step": 313 }, { "epoch": 0.35132867132867135, "grad_norm": 0.4933854096631406, "learning_rate": 4.855134807822238e-06, "loss": 0.6737, "num_input_tokens_seen": 329252864, "step": 314 }, { "epoch": 0.35244755244755244, "grad_norm": 0.4030326638347577, "learning_rate": 4.854140426963064e-06, "loss": 0.7885, "num_input_tokens_seen": 330301440, "step": 315 }, { "epoch": 0.3535664335664336, "grad_norm": 0.47140228107983906, "learning_rate": 4.853142747499574e-06, "loss": 0.9624, "num_input_tokens_seen": 331350016, "step": 316 }, { "epoch": 0.35468531468531467, "grad_norm": 0.4984806014484147, "learning_rate": 4.852141770829707e-06, "loss": 0.6914, "num_input_tokens_seen": 332398592, "step": 317 }, { "epoch": 0.3558041958041958, "grad_norm": 0.4318660064068759, "learning_rate": 4.851137498356025e-06, "loss": 0.9217, "num_input_tokens_seen": 333447168, "step": 318 }, { "epoch": 0.3569230769230769, "grad_norm": 0.4991057414288555, "learning_rate": 4.850129931485709e-06, "loss": 0.8165, "num_input_tokens_seen": 334495744, "step": 319 }, { "epoch": 0.35804195804195804, "grad_norm": 0.46958180383922893, "learning_rate": 4.849119071630553e-06, "loss": 0.7721, "num_input_tokens_seen": 335544320, "step": 320 }, { "epoch": 0.3591608391608392, "grad_norm": 0.4269731275802518, "learning_rate": 4.848104920206964e-06, "loss": 0.9471, "num_input_tokens_seen": 336592896, "step": 321 }, { "epoch": 0.3602797202797203, "grad_norm": 0.4575307376370584, "learning_rate": 4.847087478635968e-06, "loss": 0.8085, "num_input_tokens_seen": 337641472, "step": 322 }, { "epoch": 0.3613986013986014, "grad_norm": 0.4628148164188177, "learning_rate": 4.846066748343193e-06, "loss": 0.7591, "num_input_tokens_seen": 338690048, "step": 323 }, { "epoch": 0.3625174825174825, "grad_norm": 0.388638215107357, "learning_rate": 4.845042730758881e-06, "loss": 0.7275, "num_input_tokens_seen": 339738624, "step": 324 }, { "epoch": 0.36363636363636365, "grad_norm": 0.42422155989252397, "learning_rate": 4.844015427317878e-06, "loss": 0.8182, "num_input_tokens_seen": 340787200, "step": 325 }, { "epoch": 0.36475524475524473, "grad_norm": 0.41867320537038927, "learning_rate": 4.842984839459631e-06, "loss": 0.7841, "num_input_tokens_seen": 341835776, "step": 326 }, { "epoch": 0.3658741258741259, "grad_norm": 0.4062456689643522, "learning_rate": 4.8419509686281965e-06, "loss": 0.7658, "num_input_tokens_seen": 342884352, "step": 327 }, { "epoch": 0.366993006993007, "grad_norm": 0.37545951149070206, "learning_rate": 4.8409138162722235e-06, "loss": 0.7905, "num_input_tokens_seen": 343932928, "step": 328 }, { "epoch": 0.3681118881118881, "grad_norm": 0.45712730382017264, "learning_rate": 4.839873383844964e-06, "loss": 0.7325, "num_input_tokens_seen": 344981504, "step": 329 }, { "epoch": 0.36923076923076925, "grad_norm": 0.45472846351588203, "learning_rate": 4.838829672804264e-06, "loss": 0.8508, "num_input_tokens_seen": 346030080, "step": 330 }, { "epoch": 0.37034965034965034, "grad_norm": 0.45297774414281383, "learning_rate": 4.837782684612562e-06, "loss": 0.7284, "num_input_tokens_seen": 347078656, "step": 331 }, { "epoch": 0.3714685314685315, "grad_norm": 0.438363062066642, "learning_rate": 4.836732420736893e-06, "loss": 0.7731, "num_input_tokens_seen": 348127232, "step": 332 }, { "epoch": 0.37258741258741257, "grad_norm": 0.4695454593709068, "learning_rate": 4.835678882648878e-06, "loss": 0.862, "num_input_tokens_seen": 349175808, "step": 333 }, { "epoch": 0.3737062937062937, "grad_norm": 0.43855883118979616, "learning_rate": 4.834622071824726e-06, "loss": 0.8556, "num_input_tokens_seen": 350224384, "step": 334 }, { "epoch": 0.3748251748251748, "grad_norm": 0.5593604582328207, "learning_rate": 4.833561989745232e-06, "loss": 0.7893, "num_input_tokens_seen": 351272960, "step": 335 }, { "epoch": 0.37594405594405594, "grad_norm": 0.3999954168335143, "learning_rate": 4.832498637895778e-06, "loss": 0.8358, "num_input_tokens_seen": 352321536, "step": 336 }, { "epoch": 0.3770629370629371, "grad_norm": 0.5316031851311273, "learning_rate": 4.831432017766323e-06, "loss": 0.7147, "num_input_tokens_seen": 353370112, "step": 337 }, { "epoch": 0.3781818181818182, "grad_norm": 0.40682196095807466, "learning_rate": 4.830362130851407e-06, "loss": 0.829, "num_input_tokens_seen": 354418688, "step": 338 }, { "epoch": 0.3793006993006993, "grad_norm": 0.431939124219467, "learning_rate": 4.829288978650149e-06, "loss": 0.7823, "num_input_tokens_seen": 355467264, "step": 339 }, { "epoch": 0.3804195804195804, "grad_norm": 0.43844540994187053, "learning_rate": 4.82821256266624e-06, "loss": 0.722, "num_input_tokens_seen": 356515840, "step": 340 }, { "epoch": 0.38153846153846155, "grad_norm": 0.49375713264057786, "learning_rate": 4.827132884407948e-06, "loss": 0.7389, "num_input_tokens_seen": 357564416, "step": 341 }, { "epoch": 0.38265734265734264, "grad_norm": 0.4667169862983245, "learning_rate": 4.826049945388109e-06, "loss": 0.7596, "num_input_tokens_seen": 358612992, "step": 342 }, { "epoch": 0.3837762237762238, "grad_norm": 0.44377448908614997, "learning_rate": 4.824963747124132e-06, "loss": 0.8015, "num_input_tokens_seen": 359661568, "step": 343 }, { "epoch": 0.3848951048951049, "grad_norm": 2.1904683771233633, "learning_rate": 4.823874291137986e-06, "loss": 0.7084, "num_input_tokens_seen": 360710144, "step": 344 }, { "epoch": 0.386013986013986, "grad_norm": 0.632903936086508, "learning_rate": 4.822781578956212e-06, "loss": 0.7906, "num_input_tokens_seen": 361758720, "step": 345 }, { "epoch": 0.38713286713286715, "grad_norm": 0.39704327494395303, "learning_rate": 4.8216856121099074e-06, "loss": 0.7454, "num_input_tokens_seen": 362807296, "step": 346 }, { "epoch": 0.38825174825174824, "grad_norm": 0.618085655552128, "learning_rate": 4.820586392134735e-06, "loss": 0.6856, "num_input_tokens_seen": 363855872, "step": 347 }, { "epoch": 0.3893706293706294, "grad_norm": 0.41971548591103847, "learning_rate": 4.819483920570914e-06, "loss": 0.7819, "num_input_tokens_seen": 364904448, "step": 348 }, { "epoch": 0.39048951048951047, "grad_norm": 0.6499285601196653, "learning_rate": 4.818378198963218e-06, "loss": 0.8421, "num_input_tokens_seen": 365953024, "step": 349 }, { "epoch": 0.3916083916083916, "grad_norm": 0.5228044785478678, "learning_rate": 4.817269228860978e-06, "loss": 0.8554, "num_input_tokens_seen": 367001600, "step": 350 }, { "epoch": 0.3927272727272727, "grad_norm": 0.49443519776089845, "learning_rate": 4.816157011818073e-06, "loss": 0.8218, "num_input_tokens_seen": 368050176, "step": 351 }, { "epoch": 0.39384615384615385, "grad_norm": 0.4435061282272205, "learning_rate": 4.815041549392934e-06, "loss": 0.8468, "num_input_tokens_seen": 369098752, "step": 352 }, { "epoch": 0.394965034965035, "grad_norm": 0.4633382741397914, "learning_rate": 4.813922843148537e-06, "loss": 0.9159, "num_input_tokens_seen": 370147328, "step": 353 }, { "epoch": 0.3960839160839161, "grad_norm": 0.4256487217082112, "learning_rate": 4.8128008946524085e-06, "loss": 0.7099, "num_input_tokens_seen": 371195904, "step": 354 }, { "epoch": 0.3972027972027972, "grad_norm": 1.1089146630975995, "learning_rate": 4.811675705476613e-06, "loss": 0.9111, "num_input_tokens_seen": 372244480, "step": 355 }, { "epoch": 0.3983216783216783, "grad_norm": 0.589532450346439, "learning_rate": 4.810547277197755e-06, "loss": 0.8061, "num_input_tokens_seen": 373293056, "step": 356 }, { "epoch": 0.39944055944055945, "grad_norm": 0.4112226396789665, "learning_rate": 4.809415611396984e-06, "loss": 0.7518, "num_input_tokens_seen": 374341632, "step": 357 }, { "epoch": 0.40055944055944054, "grad_norm": 0.548884072614938, "learning_rate": 4.80828070965998e-06, "loss": 0.7067, "num_input_tokens_seen": 375390208, "step": 358 }, { "epoch": 0.4016783216783217, "grad_norm": 0.40164099373355727, "learning_rate": 4.807142573576958e-06, "loss": 0.8054, "num_input_tokens_seen": 376438784, "step": 359 }, { "epoch": 0.4027972027972028, "grad_norm": 0.6967454527085344, "learning_rate": 4.8060012047426666e-06, "loss": 0.7113, "num_input_tokens_seen": 377487360, "step": 360 }, { "epoch": 0.4039160839160839, "grad_norm": 0.4292609536416324, "learning_rate": 4.8048566047563835e-06, "loss": 0.7598, "num_input_tokens_seen": 378535936, "step": 361 }, { "epoch": 0.40503496503496506, "grad_norm": 0.5566386873499677, "learning_rate": 4.803708775221914e-06, "loss": 0.821, "num_input_tokens_seen": 379584512, "step": 362 }, { "epoch": 0.40615384615384614, "grad_norm": 0.3954624364425005, "learning_rate": 4.802557717747588e-06, "loss": 0.8442, "num_input_tokens_seen": 380633088, "step": 363 }, { "epoch": 0.4072727272727273, "grad_norm": 0.48970625574667126, "learning_rate": 4.80140343394626e-06, "loss": 0.7244, "num_input_tokens_seen": 381681664, "step": 364 }, { "epoch": 0.4083916083916084, "grad_norm": 0.429925322065755, "learning_rate": 4.800245925435302e-06, "loss": 0.7735, "num_input_tokens_seen": 382730240, "step": 365 }, { "epoch": 0.4095104895104895, "grad_norm": 0.4626319430848944, "learning_rate": 4.799085193836609e-06, "loss": 0.7388, "num_input_tokens_seen": 383778816, "step": 366 }, { "epoch": 0.4106293706293706, "grad_norm": 0.4448343546876498, "learning_rate": 4.797921240776587e-06, "loss": 0.7508, "num_input_tokens_seen": 384827392, "step": 367 }, { "epoch": 0.41174825174825175, "grad_norm": 0.4273592643587522, "learning_rate": 4.79675406788616e-06, "loss": 0.6736, "num_input_tokens_seen": 385875968, "step": 368 }, { "epoch": 0.4128671328671329, "grad_norm": 0.4621149080347458, "learning_rate": 4.795583676800762e-06, "loss": 0.8284, "num_input_tokens_seen": 386924544, "step": 369 }, { "epoch": 0.413986013986014, "grad_norm": 0.40975516650892746, "learning_rate": 4.794410069160337e-06, "loss": 0.7609, "num_input_tokens_seen": 387973120, "step": 370 }, { "epoch": 0.4151048951048951, "grad_norm": 0.44279645913233223, "learning_rate": 4.793233246609333e-06, "loss": 0.7684, "num_input_tokens_seen": 389021696, "step": 371 }, { "epoch": 0.4162237762237762, "grad_norm": 1.055428683702529, "learning_rate": 4.792053210796708e-06, "loss": 0.6941, "num_input_tokens_seen": 390070272, "step": 372 }, { "epoch": 0.41734265734265735, "grad_norm": 0.40587249129251884, "learning_rate": 4.790869963375918e-06, "loss": 0.8273, "num_input_tokens_seen": 391118848, "step": 373 }, { "epoch": 0.41846153846153844, "grad_norm": 0.4401534966917422, "learning_rate": 4.789683506004921e-06, "loss": 0.8277, "num_input_tokens_seen": 392167424, "step": 374 }, { "epoch": 0.4195804195804196, "grad_norm": 0.4596590669499322, "learning_rate": 4.788493840346172e-06, "loss": 0.8717, "num_input_tokens_seen": 393216000, "step": 375 }, { "epoch": 0.4206993006993007, "grad_norm": 0.4391846571990017, "learning_rate": 4.7873009680666225e-06, "loss": 0.8423, "num_input_tokens_seen": 394264576, "step": 376 }, { "epoch": 0.4218181818181818, "grad_norm": 0.38183837770762574, "learning_rate": 4.786104890837715e-06, "loss": 0.6853, "num_input_tokens_seen": 395313152, "step": 377 }, { "epoch": 0.42293706293706296, "grad_norm": 0.4393313779749151, "learning_rate": 4.7849056103353864e-06, "loss": 0.6717, "num_input_tokens_seen": 396361728, "step": 378 }, { "epoch": 0.42405594405594405, "grad_norm": 0.4054009034012445, "learning_rate": 4.783703128240058e-06, "loss": 0.9347, "num_input_tokens_seen": 397410304, "step": 379 }, { "epoch": 0.4251748251748252, "grad_norm": 0.44458338617744475, "learning_rate": 4.782497446236639e-06, "loss": 0.7345, "num_input_tokens_seen": 398458880, "step": 380 }, { "epoch": 0.4262937062937063, "grad_norm": 0.40555814195362383, "learning_rate": 4.781288566014524e-06, "loss": 0.6862, "num_input_tokens_seen": 399507456, "step": 381 }, { "epoch": 0.4274125874125874, "grad_norm": 0.4392795918355731, "learning_rate": 4.7800764892675836e-06, "loss": 0.6942, "num_input_tokens_seen": 400556032, "step": 382 }, { "epoch": 0.4285314685314685, "grad_norm": 0.38462961936558615, "learning_rate": 4.778861217694174e-06, "loss": 0.8405, "num_input_tokens_seen": 401604608, "step": 383 }, { "epoch": 0.42965034965034965, "grad_norm": 0.5737713134641942, "learning_rate": 4.7776427529971245e-06, "loss": 0.7666, "num_input_tokens_seen": 402653184, "step": 384 }, { "epoch": 0.4307692307692308, "grad_norm": 0.4266574955111262, "learning_rate": 4.776421096883737e-06, "loss": 0.7281, "num_input_tokens_seen": 403701760, "step": 385 }, { "epoch": 0.4318881118881119, "grad_norm": 0.49987086665841857, "learning_rate": 4.775196251065789e-06, "loss": 0.8681, "num_input_tokens_seen": 404750336, "step": 386 }, { "epoch": 0.433006993006993, "grad_norm": 0.43377844385508535, "learning_rate": 4.773968217259525e-06, "loss": 0.797, "num_input_tokens_seen": 405798912, "step": 387 }, { "epoch": 0.4341258741258741, "grad_norm": 0.4519113282540047, "learning_rate": 4.772736997185656e-06, "loss": 0.7469, "num_input_tokens_seen": 406847488, "step": 388 }, { "epoch": 0.43524475524475525, "grad_norm": 0.6977455490672427, "learning_rate": 4.7715025925693595e-06, "loss": 0.6814, "num_input_tokens_seen": 407896064, "step": 389 }, { "epoch": 0.43636363636363634, "grad_norm": 0.4950070919924748, "learning_rate": 4.7702650051402745e-06, "loss": 0.6619, "num_input_tokens_seen": 408944640, "step": 390 }, { "epoch": 0.4374825174825175, "grad_norm": 0.48062326581081993, "learning_rate": 4.769024236632498e-06, "loss": 0.6681, "num_input_tokens_seen": 409993216, "step": 391 }, { "epoch": 0.43860139860139863, "grad_norm": 0.39569545131066014, "learning_rate": 4.767780288784588e-06, "loss": 0.72, "num_input_tokens_seen": 411041792, "step": 392 }, { "epoch": 0.4397202797202797, "grad_norm": 0.42074525650885963, "learning_rate": 4.766533163339553e-06, "loss": 0.8121, "num_input_tokens_seen": 412090368, "step": 393 }, { "epoch": 0.44083916083916086, "grad_norm": 0.5156527708993254, "learning_rate": 4.765282862044857e-06, "loss": 0.8041, "num_input_tokens_seen": 413138944, "step": 394 }, { "epoch": 0.44195804195804195, "grad_norm": 0.4981634222523768, "learning_rate": 4.764029386652412e-06, "loss": 0.8315, "num_input_tokens_seen": 414187520, "step": 395 }, { "epoch": 0.4430769230769231, "grad_norm": 0.5574915472453056, "learning_rate": 4.76277273891858e-06, "loss": 0.7477, "num_input_tokens_seen": 415236096, "step": 396 }, { "epoch": 0.4441958041958042, "grad_norm": 0.5823762740899219, "learning_rate": 4.761512920604165e-06, "loss": 0.6838, "num_input_tokens_seen": 416284672, "step": 397 }, { "epoch": 0.4453146853146853, "grad_norm": 0.4754776632229244, "learning_rate": 4.760249933474418e-06, "loss": 0.7911, "num_input_tokens_seen": 417333248, "step": 398 }, { "epoch": 0.4464335664335664, "grad_norm": 0.4029956999100092, "learning_rate": 4.758983779299025e-06, "loss": 0.7992, "num_input_tokens_seen": 418381824, "step": 399 }, { "epoch": 0.44755244755244755, "grad_norm": 0.4074268432677725, "learning_rate": 4.757714459852111e-06, "loss": 0.7182, "num_input_tokens_seen": 419430400, "step": 400 }, { "epoch": 0.4486713286713287, "grad_norm": 0.4647836814387946, "learning_rate": 4.75644197691224e-06, "loss": 0.6905, "num_input_tokens_seen": 420478976, "step": 401 }, { "epoch": 0.4497902097902098, "grad_norm": 0.4384774619371977, "learning_rate": 4.755166332262403e-06, "loss": 0.6494, "num_input_tokens_seen": 421527552, "step": 402 }, { "epoch": 0.4509090909090909, "grad_norm": 0.49801191879471474, "learning_rate": 4.753887527690027e-06, "loss": 0.8166, "num_input_tokens_seen": 422576128, "step": 403 }, { "epoch": 0.452027972027972, "grad_norm": 0.38031518869170783, "learning_rate": 4.7526055649869606e-06, "loss": 0.7885, "num_input_tokens_seen": 423624704, "step": 404 }, { "epoch": 0.45314685314685316, "grad_norm": 0.3993850056482144, "learning_rate": 4.7513204459494825e-06, "loss": 0.9796, "num_input_tokens_seen": 424673280, "step": 405 }, { "epoch": 0.45426573426573424, "grad_norm": 0.45902626826419013, "learning_rate": 4.7500321723782905e-06, "loss": 0.6518, "num_input_tokens_seen": 425721856, "step": 406 }, { "epoch": 0.4553846153846154, "grad_norm": 0.4742971722284821, "learning_rate": 4.748740746078505e-06, "loss": 0.7536, "num_input_tokens_seen": 426770432, "step": 407 }, { "epoch": 0.45650349650349653, "grad_norm": 0.5012875926744111, "learning_rate": 4.747446168859664e-06, "loss": 0.714, "num_input_tokens_seen": 427819008, "step": 408 }, { "epoch": 0.4576223776223776, "grad_norm": 0.43085663412119585, "learning_rate": 4.746148442535717e-06, "loss": 0.9065, "num_input_tokens_seen": 428867584, "step": 409 }, { "epoch": 0.45874125874125876, "grad_norm": 1.0639796318435526, "learning_rate": 4.744847568925032e-06, "loss": 0.7311, "num_input_tokens_seen": 429916160, "step": 410 }, { "epoch": 0.45986013986013985, "grad_norm": 0.6574859440688938, "learning_rate": 4.743543549850381e-06, "loss": 0.8334, "num_input_tokens_seen": 430964736, "step": 411 }, { "epoch": 0.460979020979021, "grad_norm": 0.39521252580494237, "learning_rate": 4.7422363871389465e-06, "loss": 0.6762, "num_input_tokens_seen": 432013312, "step": 412 }, { "epoch": 0.4620979020979021, "grad_norm": 0.5564856498144339, "learning_rate": 4.740926082622316e-06, "loss": 0.8831, "num_input_tokens_seen": 433061888, "step": 413 }, { "epoch": 0.4632167832167832, "grad_norm": 0.4966167061926072, "learning_rate": 4.739612638136478e-06, "loss": 0.8917, "num_input_tokens_seen": 434110464, "step": 414 }, { "epoch": 0.4643356643356643, "grad_norm": 0.4229145964163021, "learning_rate": 4.738296055521821e-06, "loss": 0.721, "num_input_tokens_seen": 435159040, "step": 415 }, { "epoch": 0.46545454545454545, "grad_norm": 0.4405720325772957, "learning_rate": 4.736976336623133e-06, "loss": 0.7055, "num_input_tokens_seen": 436207616, "step": 416 }, { "epoch": 0.4665734265734266, "grad_norm": 0.458990907143575, "learning_rate": 4.735653483289591e-06, "loss": 0.783, "num_input_tokens_seen": 437256192, "step": 417 }, { "epoch": 0.4676923076923077, "grad_norm": 0.547024453997867, "learning_rate": 4.734327497374771e-06, "loss": 0.7741, "num_input_tokens_seen": 438304768, "step": 418 }, { "epoch": 0.46881118881118883, "grad_norm": 0.41178543556743946, "learning_rate": 4.732998380736632e-06, "loss": 0.7771, "num_input_tokens_seen": 439353344, "step": 419 }, { "epoch": 0.4699300699300699, "grad_norm": 0.41762427261207824, "learning_rate": 4.731666135237524e-06, "loss": 0.944, "num_input_tokens_seen": 440401920, "step": 420 }, { "epoch": 0.47104895104895106, "grad_norm": 0.4590068253435067, "learning_rate": 4.730330762744178e-06, "loss": 0.6851, "num_input_tokens_seen": 441450496, "step": 421 }, { "epoch": 0.47216783216783215, "grad_norm": 0.4099707095349753, "learning_rate": 4.72899226512771e-06, "loss": 0.7333, "num_input_tokens_seen": 442499072, "step": 422 }, { "epoch": 0.4732867132867133, "grad_norm": 0.43250503710252, "learning_rate": 4.7276506442636125e-06, "loss": 0.7371, "num_input_tokens_seen": 443547648, "step": 423 }, { "epoch": 0.47440559440559443, "grad_norm": 0.35598173932045307, "learning_rate": 4.726305902031754e-06, "loss": 0.7004, "num_input_tokens_seen": 444596224, "step": 424 }, { "epoch": 0.4755244755244755, "grad_norm": 0.37439128588916126, "learning_rate": 4.7249580403163786e-06, "loss": 0.76, "num_input_tokens_seen": 445644800, "step": 425 }, { "epoch": 0.47664335664335666, "grad_norm": 0.4593531027552188, "learning_rate": 4.7236070610061e-06, "loss": 0.7535, "num_input_tokens_seen": 446693376, "step": 426 }, { "epoch": 0.47776223776223775, "grad_norm": 0.42829037011291643, "learning_rate": 4.7222529659939e-06, "loss": 0.8175, "num_input_tokens_seen": 447741952, "step": 427 }, { "epoch": 0.4788811188811189, "grad_norm": 0.4534719163123595, "learning_rate": 4.720895757177126e-06, "loss": 0.836, "num_input_tokens_seen": 448790528, "step": 428 }, { "epoch": 0.48, "grad_norm": 0.462006391568746, "learning_rate": 4.7195354364574915e-06, "loss": 0.7785, "num_input_tokens_seen": 449839104, "step": 429 }, { "epoch": 0.4811188811188811, "grad_norm": 0.4487803457729824, "learning_rate": 4.718172005741066e-06, "loss": 0.903, "num_input_tokens_seen": 450887680, "step": 430 }, { "epoch": 0.4822377622377622, "grad_norm": 0.4229751039068483, "learning_rate": 4.716805466938278e-06, "loss": 0.8167, "num_input_tokens_seen": 451936256, "step": 431 }, { "epoch": 0.48335664335664336, "grad_norm": 0.4682822758213802, "learning_rate": 4.715435821963913e-06, "loss": 0.7245, "num_input_tokens_seen": 452984832, "step": 432 }, { "epoch": 0.4844755244755245, "grad_norm": 0.42804660393075666, "learning_rate": 4.714063072737108e-06, "loss": 0.6369, "num_input_tokens_seen": 454033408, "step": 433 }, { "epoch": 0.4855944055944056, "grad_norm": 0.48754932209069907, "learning_rate": 4.712687221181348e-06, "loss": 0.6817, "num_input_tokens_seen": 455081984, "step": 434 }, { "epoch": 0.48671328671328673, "grad_norm": 0.4388036036483515, "learning_rate": 4.711308269224466e-06, "loss": 0.7328, "num_input_tokens_seen": 456130560, "step": 435 }, { "epoch": 0.4878321678321678, "grad_norm": 0.4625581894060147, "learning_rate": 4.70992621879864e-06, "loss": 0.6596, "num_input_tokens_seen": 457179136, "step": 436 }, { "epoch": 0.48895104895104896, "grad_norm": 0.4257337352936816, "learning_rate": 4.708541071840388e-06, "loss": 0.684, "num_input_tokens_seen": 458227712, "step": 437 }, { "epoch": 0.49006993006993005, "grad_norm": 0.5002575057632895, "learning_rate": 4.70715283029057e-06, "loss": 0.7575, "num_input_tokens_seen": 459276288, "step": 438 }, { "epoch": 0.4911888111888112, "grad_norm": 0.45588543686505, "learning_rate": 4.705761496094377e-06, "loss": 0.7695, "num_input_tokens_seen": 460324864, "step": 439 }, { "epoch": 0.49230769230769234, "grad_norm": 0.38460440091291237, "learning_rate": 4.704367071201339e-06, "loss": 0.9081, "num_input_tokens_seen": 461373440, "step": 440 }, { "epoch": 0.4934265734265734, "grad_norm": 0.431829609619128, "learning_rate": 4.702969557565312e-06, "loss": 0.7648, "num_input_tokens_seen": 462422016, "step": 441 }, { "epoch": 0.49454545454545457, "grad_norm": 0.4214209687812584, "learning_rate": 4.701568957144483e-06, "loss": 0.7444, "num_input_tokens_seen": 463470592, "step": 442 }, { "epoch": 0.49566433566433565, "grad_norm": 0.3579403292000241, "learning_rate": 4.700165271901361e-06, "loss": 0.7005, "num_input_tokens_seen": 464519168, "step": 443 }, { "epoch": 0.4967832167832168, "grad_norm": 0.4317189751461722, "learning_rate": 4.698758503802782e-06, "loss": 0.7749, "num_input_tokens_seen": 465567744, "step": 444 }, { "epoch": 0.4979020979020979, "grad_norm": 0.3926298708121702, "learning_rate": 4.697348654819898e-06, "loss": 0.7667, "num_input_tokens_seen": 466616320, "step": 445 }, { "epoch": 0.499020979020979, "grad_norm": 0.5181066333641017, "learning_rate": 4.695935726928179e-06, "loss": 0.797, "num_input_tokens_seen": 467664896, "step": 446 }, { "epoch": 0.5001398601398601, "grad_norm": 0.4301035258814286, "learning_rate": 4.6945197221074104e-06, "loss": 0.8127, "num_input_tokens_seen": 468713472, "step": 447 }, { "epoch": 0.5012587412587413, "grad_norm": 0.4700440405321513, "learning_rate": 4.693100642341686e-06, "loss": 0.6833, "num_input_tokens_seen": 469762048, "step": 448 }, { "epoch": 0.5023776223776224, "grad_norm": 0.42325430062804653, "learning_rate": 4.691678489619411e-06, "loss": 0.7943, "num_input_tokens_seen": 470810624, "step": 449 }, { "epoch": 0.5034965034965035, "grad_norm": 0.4254573285849647, "learning_rate": 4.690253265933295e-06, "loss": 0.7282, "num_input_tokens_seen": 471859200, "step": 450 }, { "epoch": 0.5046153846153846, "grad_norm": 0.47221923985388364, "learning_rate": 4.6888249732803516e-06, "loss": 0.8112, "num_input_tokens_seen": 472907776, "step": 451 }, { "epoch": 0.5057342657342657, "grad_norm": 0.45472236088012963, "learning_rate": 4.6873936136618925e-06, "loss": 0.791, "num_input_tokens_seen": 473956352, "step": 452 }, { "epoch": 0.5068531468531469, "grad_norm": 0.40585880935937185, "learning_rate": 4.685959189083531e-06, "loss": 0.7884, "num_input_tokens_seen": 475004928, "step": 453 }, { "epoch": 0.507972027972028, "grad_norm": 0.7928405286585005, "learning_rate": 4.68452170155517e-06, "loss": 0.9747, "num_input_tokens_seen": 476053504, "step": 454 }, { "epoch": 0.509090909090909, "grad_norm": 0.532199747181477, "learning_rate": 4.683081153091006e-06, "loss": 0.7799, "num_input_tokens_seen": 477102080, "step": 455 }, { "epoch": 0.5102097902097902, "grad_norm": 0.43313442212304193, "learning_rate": 4.681637545709527e-06, "loss": 0.7367, "num_input_tokens_seen": 478150656, "step": 456 }, { "epoch": 0.5113286713286713, "grad_norm": 0.45715795453053343, "learning_rate": 4.680190881433504e-06, "loss": 0.7124, "num_input_tokens_seen": 479199232, "step": 457 }, { "epoch": 0.5124475524475525, "grad_norm": 0.39569211356771194, "learning_rate": 4.678741162289993e-06, "loss": 0.8018, "num_input_tokens_seen": 480247808, "step": 458 }, { "epoch": 0.5135664335664336, "grad_norm": 0.44650970441547055, "learning_rate": 4.67728839031033e-06, "loss": 0.7131, "num_input_tokens_seen": 481296384, "step": 459 }, { "epoch": 0.5146853146853146, "grad_norm": 0.5201145323037956, "learning_rate": 4.675832567530126e-06, "loss": 0.6567, "num_input_tokens_seen": 482344960, "step": 460 }, { "epoch": 0.5158041958041958, "grad_norm": 0.4525793205268753, "learning_rate": 4.674373695989272e-06, "loss": 0.8276, "num_input_tokens_seen": 483393536, "step": 461 }, { "epoch": 0.5169230769230769, "grad_norm": 0.43800824013949946, "learning_rate": 4.6729117777319275e-06, "loss": 0.6733, "num_input_tokens_seen": 484442112, "step": 462 }, { "epoch": 0.5180419580419581, "grad_norm": 0.4330270392975316, "learning_rate": 4.6714468148065215e-06, "loss": 0.8719, "num_input_tokens_seen": 485490688, "step": 463 }, { "epoch": 0.5191608391608392, "grad_norm": 0.4503879398075036, "learning_rate": 4.669978809265749e-06, "loss": 0.8106, "num_input_tokens_seen": 486539264, "step": 464 }, { "epoch": 0.5202797202797202, "grad_norm": 0.44364047244866595, "learning_rate": 4.668507763166568e-06, "loss": 0.6892, "num_input_tokens_seen": 487587840, "step": 465 }, { "epoch": 0.5213986013986014, "grad_norm": 0.4205011024708046, "learning_rate": 4.667033678570199e-06, "loss": 0.7149, "num_input_tokens_seen": 488636416, "step": 466 }, { "epoch": 0.5225174825174825, "grad_norm": 0.4406538028689905, "learning_rate": 4.665556557542118e-06, "loss": 0.7681, "num_input_tokens_seen": 489684992, "step": 467 }, { "epoch": 0.5236363636363637, "grad_norm": 0.44401061913648004, "learning_rate": 4.664076402152056e-06, "loss": 0.7663, "num_input_tokens_seen": 490733568, "step": 468 }, { "epoch": 0.5247552447552447, "grad_norm": 0.4192391955321833, "learning_rate": 4.662593214473995e-06, "loss": 0.7281, "num_input_tokens_seen": 491782144, "step": 469 }, { "epoch": 0.5258741258741259, "grad_norm": 0.4835893004252956, "learning_rate": 4.6611069965861685e-06, "loss": 0.7265, "num_input_tokens_seen": 492830720, "step": 470 }, { "epoch": 0.526993006993007, "grad_norm": 0.41394258846922766, "learning_rate": 4.659617750571052e-06, "loss": 0.8549, "num_input_tokens_seen": 493879296, "step": 471 }, { "epoch": 0.5281118881118881, "grad_norm": 0.5880925980450123, "learning_rate": 4.658125478515369e-06, "loss": 0.7059, "num_input_tokens_seen": 494927872, "step": 472 }, { "epoch": 0.5292307692307693, "grad_norm": 0.4538674040425184, "learning_rate": 4.656630182510078e-06, "loss": 0.8532, "num_input_tokens_seen": 495976448, "step": 473 }, { "epoch": 0.5303496503496503, "grad_norm": 0.7106414965997636, "learning_rate": 4.6551318646503765e-06, "loss": 0.6441, "num_input_tokens_seen": 497025024, "step": 474 }, { "epoch": 0.5314685314685315, "grad_norm": 0.5559589375855173, "learning_rate": 4.6536305270356975e-06, "loss": 0.828, "num_input_tokens_seen": 498073600, "step": 475 }, { "epoch": 0.5325874125874126, "grad_norm": 0.4383686370073057, "learning_rate": 4.652126171769705e-06, "loss": 0.6788, "num_input_tokens_seen": 499122176, "step": 476 }, { "epoch": 0.5337062937062937, "grad_norm": 0.4279591997705516, "learning_rate": 4.6506188009602885e-06, "loss": 0.752, "num_input_tokens_seen": 500170752, "step": 477 }, { "epoch": 0.5348251748251748, "grad_norm": 0.4000096231936139, "learning_rate": 4.6491084167195665e-06, "loss": 0.7119, "num_input_tokens_seen": 501219328, "step": 478 }, { "epoch": 0.5359440559440559, "grad_norm": 0.4343718720023054, "learning_rate": 4.647595021163878e-06, "loss": 0.7091, "num_input_tokens_seen": 502267904, "step": 479 }, { "epoch": 0.5370629370629371, "grad_norm": 0.4863187982474019, "learning_rate": 4.646078616413781e-06, "loss": 0.7446, "num_input_tokens_seen": 503316480, "step": 480 }, { "epoch": 0.5381818181818182, "grad_norm": 0.41294535341709815, "learning_rate": 4.6445592045940515e-06, "loss": 0.8345, "num_input_tokens_seen": 504365056, "step": 481 }, { "epoch": 0.5393006993006993, "grad_norm": 0.5248549967704561, "learning_rate": 4.6430367878336795e-06, "loss": 0.8465, "num_input_tokens_seen": 505413632, "step": 482 }, { "epoch": 0.5404195804195804, "grad_norm": 0.4320427007399372, "learning_rate": 4.641511368265861e-06, "loss": 0.7268, "num_input_tokens_seen": 506462208, "step": 483 }, { "epoch": 0.5415384615384615, "grad_norm": 0.4058993308046492, "learning_rate": 4.6399829480280055e-06, "loss": 0.7069, "num_input_tokens_seen": 507510784, "step": 484 }, { "epoch": 0.5426573426573427, "grad_norm": 0.39925329621156685, "learning_rate": 4.6384515292617226e-06, "loss": 0.7307, "num_input_tokens_seen": 508559360, "step": 485 }, { "epoch": 0.5437762237762238, "grad_norm": 0.44591091678676614, "learning_rate": 4.636917114112827e-06, "loss": 0.6262, "num_input_tokens_seen": 509607936, "step": 486 }, { "epoch": 0.5448951048951048, "grad_norm": 0.37486296087855503, "learning_rate": 4.635379704731327e-06, "loss": 0.7048, "num_input_tokens_seen": 510656512, "step": 487 }, { "epoch": 0.546013986013986, "grad_norm": 0.44065071062176, "learning_rate": 4.633839303271432e-06, "loss": 0.7502, "num_input_tokens_seen": 511705088, "step": 488 }, { "epoch": 0.5471328671328671, "grad_norm": 0.3971676014198471, "learning_rate": 4.63229591189154e-06, "loss": 0.802, "num_input_tokens_seen": 512753664, "step": 489 }, { "epoch": 0.5482517482517483, "grad_norm": 0.4213854818563274, "learning_rate": 4.63074953275424e-06, "loss": 0.7881, "num_input_tokens_seen": 513802240, "step": 490 }, { "epoch": 0.5493706293706294, "grad_norm": 0.42142480768491825, "learning_rate": 4.629200168026307e-06, "loss": 0.6901, "num_input_tokens_seen": 514850816, "step": 491 }, { "epoch": 0.5504895104895104, "grad_norm": 0.4018280667816967, "learning_rate": 4.6276478198787004e-06, "loss": 0.7506, "num_input_tokens_seen": 515899392, "step": 492 }, { "epoch": 0.5516083916083916, "grad_norm": 0.45912292526124115, "learning_rate": 4.626092490486557e-06, "loss": 0.8355, "num_input_tokens_seen": 516947968, "step": 493 }, { "epoch": 0.5527272727272727, "grad_norm": 0.5452339471076676, "learning_rate": 4.624534182029195e-06, "loss": 0.8243, "num_input_tokens_seen": 517996544, "step": 494 }, { "epoch": 0.5538461538461539, "grad_norm": 0.3972220882594883, "learning_rate": 4.6229728966901036e-06, "loss": 0.8418, "num_input_tokens_seen": 519045120, "step": 495 }, { "epoch": 0.554965034965035, "grad_norm": 0.4329727782715638, "learning_rate": 4.621408636656944e-06, "loss": 0.8343, "num_input_tokens_seen": 520093696, "step": 496 }, { "epoch": 0.556083916083916, "grad_norm": 0.38856132333782106, "learning_rate": 4.6198414041215484e-06, "loss": 0.6521, "num_input_tokens_seen": 521142272, "step": 497 }, { "epoch": 0.5572027972027972, "grad_norm": 0.3999132190318599, "learning_rate": 4.618271201279908e-06, "loss": 0.7437, "num_input_tokens_seen": 522190848, "step": 498 }, { "epoch": 0.5583216783216783, "grad_norm": 0.38790838694223395, "learning_rate": 4.616698030332183e-06, "loss": 0.7586, "num_input_tokens_seen": 523239424, "step": 499 }, { "epoch": 0.5594405594405595, "grad_norm": 0.3901133935020128, "learning_rate": 4.6151218934826866e-06, "loss": 0.6871, "num_input_tokens_seen": 524288000, "step": 500 }, { "epoch": 0.5594405594405595, "eval_loss": 0.7598036527633667, "eval_runtime": 246.9603, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 524288000, "step": 500 }, { "epoch": 0.5605594405594405, "grad_norm": 0.4051531756159155, "learning_rate": 4.613542792939891e-06, "loss": 0.8157, "num_input_tokens_seen": 525336576, "step": 501 }, { "epoch": 0.5616783216783217, "grad_norm": 0.4101161529369437, "learning_rate": 4.611960730916419e-06, "loss": 0.6763, "num_input_tokens_seen": 526385152, "step": 502 }, { "epoch": 0.5627972027972028, "grad_norm": 0.37131274107669393, "learning_rate": 4.610375709629047e-06, "loss": 0.7936, "num_input_tokens_seen": 527433728, "step": 503 }, { "epoch": 0.5639160839160839, "grad_norm": 0.45267728422360054, "learning_rate": 4.608787731298694e-06, "loss": 0.6723, "num_input_tokens_seen": 528482304, "step": 504 }, { "epoch": 0.5650349650349651, "grad_norm": 0.40724529318352176, "learning_rate": 4.607196798150423e-06, "loss": 0.7054, "num_input_tokens_seen": 529530880, "step": 505 }, { "epoch": 0.5661538461538461, "grad_norm": 0.42201171845699553, "learning_rate": 4.605602912413437e-06, "loss": 0.7401, "num_input_tokens_seen": 530579456, "step": 506 }, { "epoch": 0.5672727272727273, "grad_norm": 0.42650026677236097, "learning_rate": 4.60400607632108e-06, "loss": 0.7786, "num_input_tokens_seen": 531628032, "step": 507 }, { "epoch": 0.5683916083916084, "grad_norm": 0.48113938089720626, "learning_rate": 4.602406292110825e-06, "loss": 0.6818, "num_input_tokens_seen": 532676608, "step": 508 }, { "epoch": 0.5695104895104895, "grad_norm": 0.42055692314601006, "learning_rate": 4.600803562024277e-06, "loss": 0.8581, "num_input_tokens_seen": 533725184, "step": 509 }, { "epoch": 0.5706293706293706, "grad_norm": 0.4051653438945856, "learning_rate": 4.599197888307172e-06, "loss": 0.759, "num_input_tokens_seen": 534773760, "step": 510 }, { "epoch": 0.5717482517482517, "grad_norm": 0.43305230095362424, "learning_rate": 4.597589273209366e-06, "loss": 0.7531, "num_input_tokens_seen": 535822336, "step": 511 }, { "epoch": 0.5728671328671329, "grad_norm": 0.38160535866249856, "learning_rate": 4.5959777189848405e-06, "loss": 0.8437, "num_input_tokens_seen": 536870912, "step": 512 }, { "epoch": 0.573986013986014, "grad_norm": 0.5739624921045537, "learning_rate": 4.594363227891693e-06, "loss": 0.7465, "num_input_tokens_seen": 537919488, "step": 513 }, { "epoch": 0.5751048951048952, "grad_norm": 0.3948190832960597, "learning_rate": 4.592745802192136e-06, "loss": 0.6948, "num_input_tokens_seen": 538968064, "step": 514 }, { "epoch": 0.5762237762237762, "grad_norm": 0.4427243624565206, "learning_rate": 4.591125444152495e-06, "loss": 0.7117, "num_input_tokens_seen": 540016640, "step": 515 }, { "epoch": 0.5773426573426573, "grad_norm": 0.42636946215390814, "learning_rate": 4.589502156043203e-06, "loss": 0.8269, "num_input_tokens_seen": 541065216, "step": 516 }, { "epoch": 0.5784615384615385, "grad_norm": 0.3906507736759229, "learning_rate": 4.587875940138801e-06, "loss": 0.6603, "num_input_tokens_seen": 542113792, "step": 517 }, { "epoch": 0.5795804195804196, "grad_norm": 0.4612595367345824, "learning_rate": 4.58624679871793e-06, "loss": 0.6775, "num_input_tokens_seen": 543162368, "step": 518 }, { "epoch": 0.5806993006993006, "grad_norm": 0.441460381036946, "learning_rate": 4.5846147340633305e-06, "loss": 0.7769, "num_input_tokens_seen": 544210944, "step": 519 }, { "epoch": 0.5818181818181818, "grad_norm": 0.44496856009013347, "learning_rate": 4.58297974846184e-06, "loss": 0.6541, "num_input_tokens_seen": 545259520, "step": 520 }, { "epoch": 0.5829370629370629, "grad_norm": 0.3791534064037372, "learning_rate": 4.5813418442043885e-06, "loss": 0.7022, "num_input_tokens_seen": 546308096, "step": 521 }, { "epoch": 0.5840559440559441, "grad_norm": 0.41884758926358073, "learning_rate": 4.579701023585995e-06, "loss": 0.7934, "num_input_tokens_seen": 547356672, "step": 522 }, { "epoch": 0.5851748251748252, "grad_norm": 0.3640013771705732, "learning_rate": 4.578057288905766e-06, "loss": 0.7407, "num_input_tokens_seen": 548405248, "step": 523 }, { "epoch": 0.5862937062937063, "grad_norm": 0.38638746581261013, "learning_rate": 4.57641064246689e-06, "loss": 0.805, "num_input_tokens_seen": 549453824, "step": 524 }, { "epoch": 0.5874125874125874, "grad_norm": 0.40926323392356273, "learning_rate": 4.574761086576635e-06, "loss": 0.7296, "num_input_tokens_seen": 550502400, "step": 525 }, { "epoch": 0.5885314685314685, "grad_norm": 0.3900920106689787, "learning_rate": 4.573108623546348e-06, "loss": 0.6807, "num_input_tokens_seen": 551550976, "step": 526 }, { "epoch": 0.5896503496503497, "grad_norm": 0.46956195794755756, "learning_rate": 4.571453255691449e-06, "loss": 0.7452, "num_input_tokens_seen": 552599552, "step": 527 }, { "epoch": 0.5907692307692308, "grad_norm": 0.39190547735579456, "learning_rate": 4.569794985331425e-06, "loss": 0.7424, "num_input_tokens_seen": 553648128, "step": 528 }, { "epoch": 0.5918881118881119, "grad_norm": 0.39499361861816856, "learning_rate": 4.568133814789833e-06, "loss": 0.6451, "num_input_tokens_seen": 554696704, "step": 529 }, { "epoch": 0.593006993006993, "grad_norm": 0.4395662744241946, "learning_rate": 4.566469746394294e-06, "loss": 0.7206, "num_input_tokens_seen": 555745280, "step": 530 }, { "epoch": 0.5941258741258741, "grad_norm": 0.3793196939789831, "learning_rate": 4.564802782476487e-06, "loss": 0.8261, "num_input_tokens_seen": 556793856, "step": 531 }, { "epoch": 0.5952447552447553, "grad_norm": 0.39040353645293513, "learning_rate": 4.563132925372151e-06, "loss": 0.8197, "num_input_tokens_seen": 557842432, "step": 532 }, { "epoch": 0.5963636363636363, "grad_norm": 0.45714789296474956, "learning_rate": 4.561460177421078e-06, "loss": 0.7082, "num_input_tokens_seen": 558891008, "step": 533 }, { "epoch": 0.5974825174825175, "grad_norm": 0.3798339877254914, "learning_rate": 4.559784540967107e-06, "loss": 0.6756, "num_input_tokens_seen": 559939584, "step": 534 }, { "epoch": 0.5986013986013986, "grad_norm": 0.42384040635755255, "learning_rate": 4.558106018358131e-06, "loss": 0.6532, "num_input_tokens_seen": 560988160, "step": 535 }, { "epoch": 0.5997202797202797, "grad_norm": 0.3952383847526873, "learning_rate": 4.5564246119460805e-06, "loss": 0.7283, "num_input_tokens_seen": 562036736, "step": 536 }, { "epoch": 0.6008391608391609, "grad_norm": 0.6134024093557748, "learning_rate": 4.5547403240869335e-06, "loss": 0.739, "num_input_tokens_seen": 563085312, "step": 537 }, { "epoch": 0.6019580419580419, "grad_norm": 0.4194929568946521, "learning_rate": 4.553053157140699e-06, "loss": 0.654, "num_input_tokens_seen": 564133888, "step": 538 }, { "epoch": 0.6030769230769231, "grad_norm": 0.39181748012762524, "learning_rate": 4.5513631134714235e-06, "loss": 0.7559, "num_input_tokens_seen": 565182464, "step": 539 }, { "epoch": 0.6041958041958042, "grad_norm": 0.4174123082674562, "learning_rate": 4.5496701954471845e-06, "loss": 0.8768, "num_input_tokens_seen": 566231040, "step": 540 }, { "epoch": 0.6053146853146854, "grad_norm": 0.4896560744359922, "learning_rate": 4.547974405440085e-06, "loss": 0.7983, "num_input_tokens_seen": 567279616, "step": 541 }, { "epoch": 0.6064335664335664, "grad_norm": 0.47998689711551684, "learning_rate": 4.5462757458262565e-06, "loss": 0.7015, "num_input_tokens_seen": 568328192, "step": 542 }, { "epoch": 0.6075524475524475, "grad_norm": 0.41109928836028425, "learning_rate": 4.544574218985845e-06, "loss": 0.949, "num_input_tokens_seen": 569376768, "step": 543 }, { "epoch": 0.6086713286713287, "grad_norm": 0.4271039252169236, "learning_rate": 4.5428698273030185e-06, "loss": 0.6954, "num_input_tokens_seen": 570425344, "step": 544 }, { "epoch": 0.6097902097902098, "grad_norm": 0.4315715615641279, "learning_rate": 4.5411625731659595e-06, "loss": 0.6441, "num_input_tokens_seen": 571473920, "step": 545 }, { "epoch": 0.610909090909091, "grad_norm": 0.6779318239278531, "learning_rate": 4.539452458966857e-06, "loss": 0.8815, "num_input_tokens_seen": 572522496, "step": 546 }, { "epoch": 0.612027972027972, "grad_norm": 0.44137197932927896, "learning_rate": 4.5377394871019145e-06, "loss": 0.8271, "num_input_tokens_seen": 573571072, "step": 547 }, { "epoch": 0.6131468531468531, "grad_norm": 0.46410008531014496, "learning_rate": 4.536023659971332e-06, "loss": 0.6533, "num_input_tokens_seen": 574619648, "step": 548 }, { "epoch": 0.6142657342657343, "grad_norm": 0.4428987636866565, "learning_rate": 4.534304979979317e-06, "loss": 0.7459, "num_input_tokens_seen": 575668224, "step": 549 }, { "epoch": 0.6153846153846154, "grad_norm": 0.4092502440699636, "learning_rate": 4.5325834495340695e-06, "loss": 0.7696, "num_input_tokens_seen": 576716800, "step": 550 }, { "epoch": 0.6165034965034965, "grad_norm": 0.49483321631643656, "learning_rate": 4.530859071047785e-06, "loss": 0.9027, "num_input_tokens_seen": 577765376, "step": 551 }, { "epoch": 0.6176223776223776, "grad_norm": 0.41051981072843585, "learning_rate": 4.529131846936651e-06, "loss": 0.7433, "num_input_tokens_seen": 578813952, "step": 552 }, { "epoch": 0.6187412587412587, "grad_norm": 0.4709519906857253, "learning_rate": 4.52740177962084e-06, "loss": 0.8272, "num_input_tokens_seen": 579862528, "step": 553 }, { "epoch": 0.6198601398601399, "grad_norm": 0.4260290990277087, "learning_rate": 4.525668871524512e-06, "loss": 0.7529, "num_input_tokens_seen": 580911104, "step": 554 }, { "epoch": 0.620979020979021, "grad_norm": 0.4228102549702508, "learning_rate": 4.5239331250758025e-06, "loss": 0.8129, "num_input_tokens_seen": 581959680, "step": 555 }, { "epoch": 0.6220979020979021, "grad_norm": 0.5249314521684558, "learning_rate": 4.522194542706828e-06, "loss": 0.7073, "num_input_tokens_seen": 583008256, "step": 556 }, { "epoch": 0.6232167832167832, "grad_norm": 0.4146003132352531, "learning_rate": 4.520453126853677e-06, "loss": 0.8055, "num_input_tokens_seen": 584056832, "step": 557 }, { "epoch": 0.6243356643356643, "grad_norm": 0.427459707648928, "learning_rate": 4.51870887995641e-06, "loss": 0.6958, "num_input_tokens_seen": 585105408, "step": 558 }, { "epoch": 0.6254545454545455, "grad_norm": 0.3977998023214914, "learning_rate": 4.51696180445905e-06, "loss": 0.8523, "num_input_tokens_seen": 586153984, "step": 559 }, { "epoch": 0.6265734265734266, "grad_norm": 0.37367223712702463, "learning_rate": 4.51521190280959e-06, "loss": 0.782, "num_input_tokens_seen": 587202560, "step": 560 }, { "epoch": 0.6276923076923077, "grad_norm": 0.39445974396529554, "learning_rate": 4.513459177459977e-06, "loss": 0.7491, "num_input_tokens_seen": 588251136, "step": 561 }, { "epoch": 0.6288111888111888, "grad_norm": 0.42398448011304557, "learning_rate": 4.511703630866118e-06, "loss": 0.7499, "num_input_tokens_seen": 589299712, "step": 562 }, { "epoch": 0.62993006993007, "grad_norm": 0.4601645118490057, "learning_rate": 4.509945265487871e-06, "loss": 0.8599, "num_input_tokens_seen": 590348288, "step": 563 }, { "epoch": 0.6310489510489511, "grad_norm": 0.49814637336829276, "learning_rate": 4.5081840837890445e-06, "loss": 0.6938, "num_input_tokens_seen": 591396864, "step": 564 }, { "epoch": 0.6321678321678321, "grad_norm": 0.3788730031835099, "learning_rate": 4.506420088237395e-06, "loss": 0.783, "num_input_tokens_seen": 592445440, "step": 565 }, { "epoch": 0.6332867132867133, "grad_norm": 0.46397606992769574, "learning_rate": 4.504653281304619e-06, "loss": 0.7826, "num_input_tokens_seen": 593494016, "step": 566 }, { "epoch": 0.6344055944055944, "grad_norm": 0.40979687570872103, "learning_rate": 4.5028836654663535e-06, "loss": 0.7171, "num_input_tokens_seen": 594542592, "step": 567 }, { "epoch": 0.6355244755244756, "grad_norm": 0.4075343797036647, "learning_rate": 4.501111243202172e-06, "loss": 0.6844, "num_input_tokens_seen": 595591168, "step": 568 }, { "epoch": 0.6366433566433567, "grad_norm": 0.4510321764815716, "learning_rate": 4.4993360169955784e-06, "loss": 0.7739, "num_input_tokens_seen": 596639744, "step": 569 }, { "epoch": 0.6377622377622377, "grad_norm": 0.4228928552010739, "learning_rate": 4.497557989334008e-06, "loss": 0.7509, "num_input_tokens_seen": 597688320, "step": 570 }, { "epoch": 0.6388811188811189, "grad_norm": 0.45322288768916524, "learning_rate": 4.4957771627088185e-06, "loss": 0.7863, "num_input_tokens_seen": 598736896, "step": 571 }, { "epoch": 0.64, "grad_norm": 0.42326019918542884, "learning_rate": 4.493993539615293e-06, "loss": 0.7671, "num_input_tokens_seen": 599785472, "step": 572 }, { "epoch": 0.6411188811188812, "grad_norm": 0.4939633626364763, "learning_rate": 4.492207122552629e-06, "loss": 0.7251, "num_input_tokens_seen": 600834048, "step": 573 }, { "epoch": 0.6422377622377622, "grad_norm": 0.4131065486051911, "learning_rate": 4.490417914023944e-06, "loss": 0.7206, "num_input_tokens_seen": 601882624, "step": 574 }, { "epoch": 0.6433566433566433, "grad_norm": 0.4413355500866939, "learning_rate": 4.48862591653626e-06, "loss": 0.6965, "num_input_tokens_seen": 602931200, "step": 575 }, { "epoch": 0.6444755244755245, "grad_norm": 0.4033487449963994, "learning_rate": 4.486831132600513e-06, "loss": 0.7772, "num_input_tokens_seen": 603979776, "step": 576 }, { "epoch": 0.6455944055944056, "grad_norm": 0.42668135212686015, "learning_rate": 4.485033564731542e-06, "loss": 0.6837, "num_input_tokens_seen": 605028352, "step": 577 }, { "epoch": 0.6467132867132868, "grad_norm": 0.3846571800286361, "learning_rate": 4.483233215448084e-06, "loss": 0.7357, "num_input_tokens_seen": 606076928, "step": 578 }, { "epoch": 0.6478321678321678, "grad_norm": 0.3754886975091245, "learning_rate": 4.481430087272776e-06, "loss": 0.6863, "num_input_tokens_seen": 607125504, "step": 579 }, { "epoch": 0.6489510489510489, "grad_norm": 0.45366241286099274, "learning_rate": 4.479624182732148e-06, "loss": 0.7333, "num_input_tokens_seen": 608174080, "step": 580 }, { "epoch": 0.6500699300699301, "grad_norm": 0.39819898287849886, "learning_rate": 4.47781550435662e-06, "loss": 0.8355, "num_input_tokens_seen": 609222656, "step": 581 }, { "epoch": 0.6511888111888112, "grad_norm": 0.4345007096495338, "learning_rate": 4.476004054680501e-06, "loss": 0.8488, "num_input_tokens_seen": 610271232, "step": 582 }, { "epoch": 0.6523076923076923, "grad_norm": 0.39539053912133715, "learning_rate": 4.474189836241976e-06, "loss": 0.7343, "num_input_tokens_seen": 611319808, "step": 583 }, { "epoch": 0.6534265734265734, "grad_norm": 0.45275399411061396, "learning_rate": 4.472372851583121e-06, "loss": 0.8272, "num_input_tokens_seen": 612368384, "step": 584 }, { "epoch": 0.6545454545454545, "grad_norm": 0.46594065917258676, "learning_rate": 4.470553103249876e-06, "loss": 0.7532, "num_input_tokens_seen": 613416960, "step": 585 }, { "epoch": 0.6556643356643357, "grad_norm": 0.4135422309227483, "learning_rate": 4.468730593792062e-06, "loss": 0.7231, "num_input_tokens_seen": 614465536, "step": 586 }, { "epoch": 0.6567832167832168, "grad_norm": 0.4252419743092624, "learning_rate": 4.466905325763365e-06, "loss": 0.8676, "num_input_tokens_seen": 615514112, "step": 587 }, { "epoch": 0.6579020979020979, "grad_norm": 0.39647242360578583, "learning_rate": 4.465077301721338e-06, "loss": 0.6988, "num_input_tokens_seen": 616562688, "step": 588 }, { "epoch": 0.659020979020979, "grad_norm": 0.4296123862809154, "learning_rate": 4.463246524227393e-06, "loss": 0.7208, "num_input_tokens_seen": 617611264, "step": 589 }, { "epoch": 0.6601398601398601, "grad_norm": 0.4142911569988168, "learning_rate": 4.461412995846804e-06, "loss": 0.7239, "num_input_tokens_seen": 618659840, "step": 590 }, { "epoch": 0.6612587412587413, "grad_norm": 0.4016338593391835, "learning_rate": 4.459576719148697e-06, "loss": 0.6923, "num_input_tokens_seen": 619708416, "step": 591 }, { "epoch": 0.6623776223776224, "grad_norm": 0.3874036967368266, "learning_rate": 4.4577376967060495e-06, "loss": 0.7647, "num_input_tokens_seen": 620756992, "step": 592 }, { "epoch": 0.6634965034965035, "grad_norm": 0.4051984726755203, "learning_rate": 4.4558959310956865e-06, "loss": 0.844, "num_input_tokens_seen": 621805568, "step": 593 }, { "epoch": 0.6646153846153846, "grad_norm": 0.47505156948241867, "learning_rate": 4.4540514248982744e-06, "loss": 0.7201, "num_input_tokens_seen": 622854144, "step": 594 }, { "epoch": 0.6657342657342658, "grad_norm": 0.3832706796500114, "learning_rate": 4.452204180698325e-06, "loss": 0.7064, "num_input_tokens_seen": 623902720, "step": 595 }, { "epoch": 0.6668531468531469, "grad_norm": 0.38651455583938366, "learning_rate": 4.450354201084181e-06, "loss": 0.7925, "num_input_tokens_seen": 624951296, "step": 596 }, { "epoch": 0.6679720279720279, "grad_norm": 0.4158405781773378, "learning_rate": 4.448501488648021e-06, "loss": 0.7738, "num_input_tokens_seen": 625999872, "step": 597 }, { "epoch": 0.6690909090909091, "grad_norm": 0.43682643127028875, "learning_rate": 4.446646045985852e-06, "loss": 0.7432, "num_input_tokens_seen": 627048448, "step": 598 }, { "epoch": 0.6702097902097902, "grad_norm": 0.4436616021793646, "learning_rate": 4.4447878756975074e-06, "loss": 0.6673, "num_input_tokens_seen": 628097024, "step": 599 }, { "epoch": 0.6713286713286714, "grad_norm": 0.369261613718792, "learning_rate": 4.44292698038664e-06, "loss": 0.7206, "num_input_tokens_seen": 629145600, "step": 600 }, { "epoch": 0.6724475524475525, "grad_norm": 0.4776667316187862, "learning_rate": 4.441063362660726e-06, "loss": 0.7508, "num_input_tokens_seen": 630194176, "step": 601 }, { "epoch": 0.6735664335664335, "grad_norm": 0.4049471555267597, "learning_rate": 4.439197025131049e-06, "loss": 0.7696, "num_input_tokens_seen": 631242752, "step": 602 }, { "epoch": 0.6746853146853147, "grad_norm": 0.400269308270229, "learning_rate": 4.4373279704127095e-06, "loss": 0.6821, "num_input_tokens_seen": 632291328, "step": 603 }, { "epoch": 0.6758041958041958, "grad_norm": 0.4539179428156578, "learning_rate": 4.435456201124613e-06, "loss": 0.7345, "num_input_tokens_seen": 633339904, "step": 604 }, { "epoch": 0.676923076923077, "grad_norm": 0.4250432166806938, "learning_rate": 4.433581719889469e-06, "loss": 0.6975, "num_input_tokens_seen": 634388480, "step": 605 }, { "epoch": 0.678041958041958, "grad_norm": 0.510702570968114, "learning_rate": 4.431704529333787e-06, "loss": 0.7357, "num_input_tokens_seen": 635437056, "step": 606 }, { "epoch": 0.6791608391608391, "grad_norm": 0.4143952784983821, "learning_rate": 4.429824632087873e-06, "loss": 0.8389, "num_input_tokens_seen": 636485632, "step": 607 }, { "epoch": 0.6802797202797203, "grad_norm": 0.3813015666608585, "learning_rate": 4.427942030785824e-06, "loss": 0.738, "num_input_tokens_seen": 637534208, "step": 608 }, { "epoch": 0.6813986013986014, "grad_norm": 0.4949438279728506, "learning_rate": 4.426056728065527e-06, "loss": 0.8876, "num_input_tokens_seen": 638582784, "step": 609 }, { "epoch": 0.6825174825174826, "grad_norm": 0.39768100835429787, "learning_rate": 4.424168726568656e-06, "loss": 0.6831, "num_input_tokens_seen": 639631360, "step": 610 }, { "epoch": 0.6836363636363636, "grad_norm": 0.41349916993959523, "learning_rate": 4.422278028940664e-06, "loss": 0.7425, "num_input_tokens_seen": 640679936, "step": 611 }, { "epoch": 0.6847552447552447, "grad_norm": 0.391140152512324, "learning_rate": 4.420384637830783e-06, "loss": 0.8177, "num_input_tokens_seen": 641728512, "step": 612 }, { "epoch": 0.6858741258741259, "grad_norm": 0.4094689331623298, "learning_rate": 4.418488555892018e-06, "loss": 0.8866, "num_input_tokens_seen": 642777088, "step": 613 }, { "epoch": 0.686993006993007, "grad_norm": 0.39202643182805624, "learning_rate": 4.4165897857811455e-06, "loss": 0.8667, "num_input_tokens_seen": 643825664, "step": 614 }, { "epoch": 0.6881118881118881, "grad_norm": 0.4132220142267629, "learning_rate": 4.414688330158709e-06, "loss": 0.706, "num_input_tokens_seen": 644874240, "step": 615 }, { "epoch": 0.6892307692307692, "grad_norm": 0.4130747036036898, "learning_rate": 4.412784191689013e-06, "loss": 0.8243, "num_input_tokens_seen": 645922816, "step": 616 }, { "epoch": 0.6903496503496503, "grad_norm": 0.41782207622573625, "learning_rate": 4.4108773730401235e-06, "loss": 0.8379, "num_input_tokens_seen": 646971392, "step": 617 }, { "epoch": 0.6914685314685315, "grad_norm": 0.42966565736920653, "learning_rate": 4.40896787688386e-06, "loss": 0.7371, "num_input_tokens_seen": 648019968, "step": 618 }, { "epoch": 0.6925874125874126, "grad_norm": 0.3922504632685414, "learning_rate": 4.407055705895794e-06, "loss": 0.7052, "num_input_tokens_seen": 649068544, "step": 619 }, { "epoch": 0.6937062937062937, "grad_norm": 0.7761550045164338, "learning_rate": 4.405140862755247e-06, "loss": 0.894, "num_input_tokens_seen": 650117120, "step": 620 }, { "epoch": 0.6948251748251748, "grad_norm": 0.42599518763396677, "learning_rate": 4.403223350145283e-06, "loss": 0.7827, "num_input_tokens_seen": 651165696, "step": 621 }, { "epoch": 0.695944055944056, "grad_norm": 0.4465274152619557, "learning_rate": 4.401303170752705e-06, "loss": 0.7743, "num_input_tokens_seen": 652214272, "step": 622 }, { "epoch": 0.6970629370629371, "grad_norm": 0.43638099378594924, "learning_rate": 4.3993803272680555e-06, "loss": 0.7744, "num_input_tokens_seen": 653262848, "step": 623 }, { "epoch": 0.6981818181818182, "grad_norm": 0.39231567118810057, "learning_rate": 4.397454822385608e-06, "loss": 0.7421, "num_input_tokens_seen": 654311424, "step": 624 }, { "epoch": 0.6993006993006993, "grad_norm": 0.40290357468523386, "learning_rate": 4.395526658803367e-06, "loss": 0.7657, "num_input_tokens_seen": 655360000, "step": 625 }, { "epoch": 0.7004195804195804, "grad_norm": 0.3611458350822117, "learning_rate": 4.393595839223061e-06, "loss": 0.7183, "num_input_tokens_seen": 656408576, "step": 626 }, { "epoch": 0.7015384615384616, "grad_norm": 0.40848434823009067, "learning_rate": 4.391662366350139e-06, "loss": 0.8945, "num_input_tokens_seen": 657457152, "step": 627 }, { "epoch": 0.7026573426573427, "grad_norm": 0.3861331054648259, "learning_rate": 4.38972624289377e-06, "loss": 0.7571, "num_input_tokens_seen": 658505728, "step": 628 }, { "epoch": 0.7037762237762237, "grad_norm": 0.392632788864081, "learning_rate": 4.387787471566837e-06, "loss": 0.7475, "num_input_tokens_seen": 659554304, "step": 629 }, { "epoch": 0.7048951048951049, "grad_norm": 0.41703103075470016, "learning_rate": 4.385846055085929e-06, "loss": 0.7916, "num_input_tokens_seen": 660602880, "step": 630 }, { "epoch": 0.706013986013986, "grad_norm": 0.46532577126123575, "learning_rate": 4.383901996171348e-06, "loss": 0.871, "num_input_tokens_seen": 661651456, "step": 631 }, { "epoch": 0.7071328671328672, "grad_norm": 0.44056125773601396, "learning_rate": 4.381955297547093e-06, "loss": 0.6305, "num_input_tokens_seen": 662700032, "step": 632 }, { "epoch": 0.7082517482517483, "grad_norm": 0.40970646495020124, "learning_rate": 4.380005961940864e-06, "loss": 0.8249, "num_input_tokens_seen": 663748608, "step": 633 }, { "epoch": 0.7093706293706293, "grad_norm": 0.511665998705182, "learning_rate": 4.378053992084057e-06, "loss": 0.7488, "num_input_tokens_seen": 664797184, "step": 634 }, { "epoch": 0.7104895104895105, "grad_norm": 0.37523469325565506, "learning_rate": 4.376099390711758e-06, "loss": 0.883, "num_input_tokens_seen": 665845760, "step": 635 }, { "epoch": 0.7116083916083916, "grad_norm": 0.4266181328024029, "learning_rate": 4.374142160562738e-06, "loss": 0.7111, "num_input_tokens_seen": 666894336, "step": 636 }, { "epoch": 0.7127272727272728, "grad_norm": 0.39278069116138203, "learning_rate": 4.372182304379455e-06, "loss": 0.7723, "num_input_tokens_seen": 667942912, "step": 637 }, { "epoch": 0.7138461538461538, "grad_norm": 0.4002402676050192, "learning_rate": 4.370219824908045e-06, "loss": 0.7532, "num_input_tokens_seen": 668991488, "step": 638 }, { "epoch": 0.7149650349650349, "grad_norm": 0.3973279408821485, "learning_rate": 4.368254724898319e-06, "loss": 0.8121, "num_input_tokens_seen": 670040064, "step": 639 }, { "epoch": 0.7160839160839161, "grad_norm": 0.40666324942255155, "learning_rate": 4.366287007103762e-06, "loss": 0.692, "num_input_tokens_seen": 671088640, "step": 640 }, { "epoch": 0.7172027972027972, "grad_norm": 0.3789105804193058, "learning_rate": 4.364316674281526e-06, "loss": 0.7969, "num_input_tokens_seen": 672137216, "step": 641 }, { "epoch": 0.7183216783216784, "grad_norm": 0.4232083353603421, "learning_rate": 4.362343729192425e-06, "loss": 0.6253, "num_input_tokens_seen": 673185792, "step": 642 }, { "epoch": 0.7194405594405594, "grad_norm": 0.37298664312714913, "learning_rate": 4.3603681746009374e-06, "loss": 0.7665, "num_input_tokens_seen": 674234368, "step": 643 }, { "epoch": 0.7205594405594405, "grad_norm": 0.39400723856339687, "learning_rate": 4.358390013275195e-06, "loss": 0.7127, "num_input_tokens_seen": 675282944, "step": 644 }, { "epoch": 0.7216783216783217, "grad_norm": 0.42050928012616007, "learning_rate": 4.356409247986982e-06, "loss": 0.7536, "num_input_tokens_seen": 676331520, "step": 645 }, { "epoch": 0.7227972027972028, "grad_norm": 0.4344980856086079, "learning_rate": 4.354425881511733e-06, "loss": 0.8728, "num_input_tokens_seen": 677380096, "step": 646 }, { "epoch": 0.7239160839160839, "grad_norm": 0.43060727520806313, "learning_rate": 4.352439916628527e-06, "loss": 0.688, "num_input_tokens_seen": 678428672, "step": 647 }, { "epoch": 0.725034965034965, "grad_norm": 0.40209956281893156, "learning_rate": 4.350451356120082e-06, "loss": 0.6268, "num_input_tokens_seen": 679477248, "step": 648 }, { "epoch": 0.7261538461538461, "grad_norm": 0.4606434814106928, "learning_rate": 4.348460202772756e-06, "loss": 0.773, "num_input_tokens_seen": 680525824, "step": 649 }, { "epoch": 0.7272727272727273, "grad_norm": 0.391207916131685, "learning_rate": 4.3464664593765385e-06, "loss": 0.7218, "num_input_tokens_seen": 681574400, "step": 650 }, { "epoch": 0.7283916083916084, "grad_norm": 0.3845685166246231, "learning_rate": 4.344470128725047e-06, "loss": 0.7529, "num_input_tokens_seen": 682622976, "step": 651 }, { "epoch": 0.7295104895104895, "grad_norm": 0.3851474976091053, "learning_rate": 4.342471213615525e-06, "loss": 0.6315, "num_input_tokens_seen": 683671552, "step": 652 }, { "epoch": 0.7306293706293706, "grad_norm": 0.47683588453711717, "learning_rate": 4.34046971684884e-06, "loss": 0.7196, "num_input_tokens_seen": 684720128, "step": 653 }, { "epoch": 0.7317482517482518, "grad_norm": 0.7189935519497389, "learning_rate": 4.3384656412294725e-06, "loss": 1.0307, "num_input_tokens_seen": 685768704, "step": 654 }, { "epoch": 0.7328671328671329, "grad_norm": 0.45241425419389864, "learning_rate": 4.336458989565519e-06, "loss": 0.8165, "num_input_tokens_seen": 686817280, "step": 655 }, { "epoch": 0.733986013986014, "grad_norm": 0.45415951379974506, "learning_rate": 4.334449764668686e-06, "loss": 0.6931, "num_input_tokens_seen": 687865856, "step": 656 }, { "epoch": 0.7351048951048951, "grad_norm": 0.4393375245619633, "learning_rate": 4.332437969354284e-06, "loss": 0.9317, "num_input_tokens_seen": 688914432, "step": 657 }, { "epoch": 0.7362237762237762, "grad_norm": 0.4422083468100868, "learning_rate": 4.330423606441225e-06, "loss": 0.7599, "num_input_tokens_seen": 689963008, "step": 658 }, { "epoch": 0.7373426573426574, "grad_norm": 0.4069911232933982, "learning_rate": 4.328406678752022e-06, "loss": 0.8251, "num_input_tokens_seen": 691011584, "step": 659 }, { "epoch": 0.7384615384615385, "grad_norm": 0.4022379902722273, "learning_rate": 4.326387189112776e-06, "loss": 0.7429, "num_input_tokens_seen": 692060160, "step": 660 }, { "epoch": 0.7395804195804195, "grad_norm": 0.4160031321222711, "learning_rate": 4.324365140353185e-06, "loss": 0.6885, "num_input_tokens_seen": 693108736, "step": 661 }, { "epoch": 0.7406993006993007, "grad_norm": 0.4786857157006666, "learning_rate": 4.322340535306525e-06, "loss": 0.7502, "num_input_tokens_seen": 694157312, "step": 662 }, { "epoch": 0.7418181818181818, "grad_norm": 0.4136060467370817, "learning_rate": 4.32031337680966e-06, "loss": 0.7426, "num_input_tokens_seen": 695205888, "step": 663 }, { "epoch": 0.742937062937063, "grad_norm": 0.44459198605794387, "learning_rate": 4.31828366770303e-06, "loss": 0.6304, "num_input_tokens_seen": 696254464, "step": 664 }, { "epoch": 0.7440559440559441, "grad_norm": 0.4774455200278896, "learning_rate": 4.3162514108306465e-06, "loss": 0.7407, "num_input_tokens_seen": 697303040, "step": 665 }, { "epoch": 0.7451748251748251, "grad_norm": 0.45817121705439867, "learning_rate": 4.314216609040095e-06, "loss": 0.6271, "num_input_tokens_seen": 698351616, "step": 666 }, { "epoch": 0.7462937062937063, "grad_norm": 0.4064381449937991, "learning_rate": 4.312179265182523e-06, "loss": 0.6549, "num_input_tokens_seen": 699400192, "step": 667 }, { "epoch": 0.7474125874125874, "grad_norm": 0.5229710520112711, "learning_rate": 4.310139382112644e-06, "loss": 0.7184, "num_input_tokens_seen": 700448768, "step": 668 }, { "epoch": 0.7485314685314686, "grad_norm": 0.44338406153515453, "learning_rate": 4.308096962688726e-06, "loss": 0.6622, "num_input_tokens_seen": 701497344, "step": 669 }, { "epoch": 0.7496503496503496, "grad_norm": 0.4471737357588951, "learning_rate": 4.3060520097725915e-06, "loss": 0.7929, "num_input_tokens_seen": 702545920, "step": 670 }, { "epoch": 0.7507692307692307, "grad_norm": 0.4025138914614887, "learning_rate": 4.304004526229614e-06, "loss": 0.8827, "num_input_tokens_seen": 703594496, "step": 671 }, { "epoch": 0.7518881118881119, "grad_norm": 0.47576883753239, "learning_rate": 4.301954514928713e-06, "loss": 0.6351, "num_input_tokens_seen": 704643072, "step": 672 }, { "epoch": 0.753006993006993, "grad_norm": 0.4154829122797699, "learning_rate": 4.299901978742349e-06, "loss": 0.7272, "num_input_tokens_seen": 705691648, "step": 673 }, { "epoch": 0.7541258741258742, "grad_norm": 0.49688148110727276, "learning_rate": 4.29784692054652e-06, "loss": 0.7258, "num_input_tokens_seen": 706740224, "step": 674 }, { "epoch": 0.7552447552447552, "grad_norm": 0.3486129542645724, "learning_rate": 4.29578934322076e-06, "loss": 0.7522, "num_input_tokens_seen": 707788800, "step": 675 }, { "epoch": 0.7563636363636363, "grad_norm": 0.5027840114999806, "learning_rate": 4.29372924964813e-06, "loss": 0.7302, "num_input_tokens_seen": 708837376, "step": 676 }, { "epoch": 0.7574825174825175, "grad_norm": 0.44006545128397556, "learning_rate": 4.2916666427152175e-06, "loss": 0.8214, "num_input_tokens_seen": 709885952, "step": 677 }, { "epoch": 0.7586013986013986, "grad_norm": 0.42678801282589, "learning_rate": 4.289601525312134e-06, "loss": 0.7277, "num_input_tokens_seen": 710934528, "step": 678 }, { "epoch": 0.7597202797202797, "grad_norm": 0.4305647439600633, "learning_rate": 4.287533900332506e-06, "loss": 0.755, "num_input_tokens_seen": 711983104, "step": 679 }, { "epoch": 0.7608391608391608, "grad_norm": 0.43069286153935304, "learning_rate": 4.285463770673474e-06, "loss": 0.7632, "num_input_tokens_seen": 713031680, "step": 680 }, { "epoch": 0.761958041958042, "grad_norm": 0.4109329767050543, "learning_rate": 4.283391139235688e-06, "loss": 0.6914, "num_input_tokens_seen": 714080256, "step": 681 }, { "epoch": 0.7630769230769231, "grad_norm": 0.397983888611596, "learning_rate": 4.281316008923306e-06, "loss": 0.6956, "num_input_tokens_seen": 715128832, "step": 682 }, { "epoch": 0.7641958041958042, "grad_norm": 0.3990843140642313, "learning_rate": 4.279238382643985e-06, "loss": 0.8227, "num_input_tokens_seen": 716177408, "step": 683 }, { "epoch": 0.7653146853146853, "grad_norm": 0.392214136150687, "learning_rate": 4.277158263308878e-06, "loss": 0.8398, "num_input_tokens_seen": 717225984, "step": 684 }, { "epoch": 0.7664335664335664, "grad_norm": 0.4281036953525592, "learning_rate": 4.275075653832635e-06, "loss": 0.744, "num_input_tokens_seen": 718274560, "step": 685 }, { "epoch": 0.7675524475524476, "grad_norm": 0.37903060379397585, "learning_rate": 4.272990557133391e-06, "loss": 0.8331, "num_input_tokens_seen": 719323136, "step": 686 }, { "epoch": 0.7686713286713287, "grad_norm": 0.4853313517404678, "learning_rate": 4.27090297613277e-06, "loss": 0.8187, "num_input_tokens_seen": 720371712, "step": 687 }, { "epoch": 0.7697902097902098, "grad_norm": 0.3868887311317549, "learning_rate": 4.268812913755875e-06, "loss": 0.7071, "num_input_tokens_seen": 721420288, "step": 688 }, { "epoch": 0.7709090909090909, "grad_norm": 0.40061279112341563, "learning_rate": 4.266720372931285e-06, "loss": 0.7088, "num_input_tokens_seen": 722468864, "step": 689 }, { "epoch": 0.772027972027972, "grad_norm": 0.4721974485733753, "learning_rate": 4.2646253565910535e-06, "loss": 0.6332, "num_input_tokens_seen": 723517440, "step": 690 }, { "epoch": 0.7731468531468532, "grad_norm": 0.3962049720659885, "learning_rate": 4.262527867670702e-06, "loss": 0.808, "num_input_tokens_seen": 724566016, "step": 691 }, { "epoch": 0.7742657342657343, "grad_norm": 0.36661180658243175, "learning_rate": 4.260427909109216e-06, "loss": 0.7215, "num_input_tokens_seen": 725614592, "step": 692 }, { "epoch": 0.7753846153846153, "grad_norm": 0.40246900956754994, "learning_rate": 4.258325483849044e-06, "loss": 0.8065, "num_input_tokens_seen": 726663168, "step": 693 }, { "epoch": 0.7765034965034965, "grad_norm": 0.3671412932972897, "learning_rate": 4.256220594836087e-06, "loss": 0.7724, "num_input_tokens_seen": 727711744, "step": 694 }, { "epoch": 0.7776223776223776, "grad_norm": 0.43951209156666354, "learning_rate": 4.254113245019701e-06, "loss": 0.726, "num_input_tokens_seen": 728760320, "step": 695 }, { "epoch": 0.7787412587412588, "grad_norm": 0.37639083627421277, "learning_rate": 4.252003437352688e-06, "loss": 0.8208, "num_input_tokens_seen": 729808896, "step": 696 }, { "epoch": 0.7798601398601399, "grad_norm": 0.3971858770503908, "learning_rate": 4.249891174791297e-06, "loss": 0.6953, "num_input_tokens_seen": 730857472, "step": 697 }, { "epoch": 0.7809790209790209, "grad_norm": 0.3840058046301361, "learning_rate": 4.247776460295213e-06, "loss": 0.6712, "num_input_tokens_seen": 731906048, "step": 698 }, { "epoch": 0.7820979020979021, "grad_norm": 0.4241924256100081, "learning_rate": 4.245659296827559e-06, "loss": 0.7831, "num_input_tokens_seen": 732954624, "step": 699 }, { "epoch": 0.7832167832167832, "grad_norm": 0.4119686383512181, "learning_rate": 4.243539687354889e-06, "loss": 0.7666, "num_input_tokens_seen": 734003200, "step": 700 }, { "epoch": 0.7843356643356644, "grad_norm": 0.37199954077490954, "learning_rate": 4.2414176348471845e-06, "loss": 0.7889, "num_input_tokens_seen": 735051776, "step": 701 }, { "epoch": 0.7854545454545454, "grad_norm": 0.4288978521007298, "learning_rate": 4.23929314227785e-06, "loss": 0.6987, "num_input_tokens_seen": 736100352, "step": 702 }, { "epoch": 0.7865734265734265, "grad_norm": 0.4484940127163932, "learning_rate": 4.237166212623708e-06, "loss": 0.7479, "num_input_tokens_seen": 737148928, "step": 703 }, { "epoch": 0.7876923076923077, "grad_norm": 0.3785243882172237, "learning_rate": 4.235036848864998e-06, "loss": 0.727, "num_input_tokens_seen": 738197504, "step": 704 }, { "epoch": 0.7888111888111888, "grad_norm": 0.4390066154345979, "learning_rate": 4.232905053985368e-06, "loss": 0.6466, "num_input_tokens_seen": 739246080, "step": 705 }, { "epoch": 0.78993006993007, "grad_norm": 0.3943569194018216, "learning_rate": 4.230770830971873e-06, "loss": 0.9148, "num_input_tokens_seen": 740294656, "step": 706 }, { "epoch": 0.791048951048951, "grad_norm": 0.5884684027874382, "learning_rate": 4.228634182814972e-06, "loss": 0.7618, "num_input_tokens_seen": 741343232, "step": 707 }, { "epoch": 0.7921678321678322, "grad_norm": 0.4269597342724163, "learning_rate": 4.226495112508521e-06, "loss": 0.7249, "num_input_tokens_seen": 742391808, "step": 708 }, { "epoch": 0.7932867132867133, "grad_norm": 0.4434301958165542, "learning_rate": 4.224353623049767e-06, "loss": 0.7519, "num_input_tokens_seen": 743440384, "step": 709 }, { "epoch": 0.7944055944055944, "grad_norm": 0.4143652197060312, "learning_rate": 4.222209717439351e-06, "loss": 0.7133, "num_input_tokens_seen": 744488960, "step": 710 }, { "epoch": 0.7955244755244755, "grad_norm": 0.36244793092015914, "learning_rate": 4.220063398681299e-06, "loss": 0.7734, "num_input_tokens_seen": 745537536, "step": 711 }, { "epoch": 0.7966433566433566, "grad_norm": 0.4088586263698245, "learning_rate": 4.2179146697830155e-06, "loss": 0.7867, "num_input_tokens_seen": 746586112, "step": 712 }, { "epoch": 0.7977622377622378, "grad_norm": 0.37162626593760373, "learning_rate": 4.215763533755285e-06, "loss": 0.7775, "num_input_tokens_seen": 747634688, "step": 713 }, { "epoch": 0.7988811188811189, "grad_norm": 0.42197689380226366, "learning_rate": 4.213609993612262e-06, "loss": 0.791, "num_input_tokens_seen": 748683264, "step": 714 }, { "epoch": 0.8, "grad_norm": 0.4382978957189326, "learning_rate": 4.211454052371471e-06, "loss": 0.7399, "num_input_tokens_seen": 749731840, "step": 715 }, { "epoch": 0.8011188811188811, "grad_norm": 0.41351177315175053, "learning_rate": 4.209295713053802e-06, "loss": 0.8503, "num_input_tokens_seen": 750780416, "step": 716 }, { "epoch": 0.8022377622377622, "grad_norm": 0.4057354756480584, "learning_rate": 4.207134978683506e-06, "loss": 0.881, "num_input_tokens_seen": 751828992, "step": 717 }, { "epoch": 0.8033566433566434, "grad_norm": 0.35492932637331887, "learning_rate": 4.204971852288185e-06, "loss": 0.7961, "num_input_tokens_seen": 752877568, "step": 718 }, { "epoch": 0.8044755244755245, "grad_norm": 0.4275215006055302, "learning_rate": 4.202806336898798e-06, "loss": 0.7213, "num_input_tokens_seen": 753926144, "step": 719 }, { "epoch": 0.8055944055944056, "grad_norm": 0.3884685807518183, "learning_rate": 4.200638435549648e-06, "loss": 0.7755, "num_input_tokens_seen": 754974720, "step": 720 }, { "epoch": 0.8067132867132867, "grad_norm": 0.4417480048277306, "learning_rate": 4.198468151278382e-06, "loss": 0.7661, "num_input_tokens_seen": 756023296, "step": 721 }, { "epoch": 0.8078321678321678, "grad_norm": 0.36610407280610907, "learning_rate": 4.196295487125989e-06, "loss": 0.642, "num_input_tokens_seen": 757071872, "step": 722 }, { "epoch": 0.808951048951049, "grad_norm": 0.3793483207679627, "learning_rate": 4.194120446136788e-06, "loss": 0.7341, "num_input_tokens_seen": 758120448, "step": 723 }, { "epoch": 0.8100699300699301, "grad_norm": 0.381166379964808, "learning_rate": 4.19194303135843e-06, "loss": 0.8549, "num_input_tokens_seen": 759169024, "step": 724 }, { "epoch": 0.8111888111888111, "grad_norm": 0.4480057230669013, "learning_rate": 4.189763245841895e-06, "loss": 0.709, "num_input_tokens_seen": 760217600, "step": 725 }, { "epoch": 0.8123076923076923, "grad_norm": 0.3705137212623044, "learning_rate": 4.187581092641481e-06, "loss": 0.7409, "num_input_tokens_seen": 761266176, "step": 726 }, { "epoch": 0.8134265734265734, "grad_norm": 0.3649191724594021, "learning_rate": 4.185396574814804e-06, "loss": 0.7986, "num_input_tokens_seen": 762314752, "step": 727 }, { "epoch": 0.8145454545454546, "grad_norm": 0.4013777755732205, "learning_rate": 4.183209695422797e-06, "loss": 0.7582, "num_input_tokens_seen": 763363328, "step": 728 }, { "epoch": 0.8156643356643357, "grad_norm": 0.378709637068626, "learning_rate": 4.1810204575296966e-06, "loss": 0.6475, "num_input_tokens_seen": 764411904, "step": 729 }, { "epoch": 0.8167832167832167, "grad_norm": 0.3932915213598773, "learning_rate": 4.178828864203049e-06, "loss": 0.701, "num_input_tokens_seen": 765460480, "step": 730 }, { "epoch": 0.8179020979020979, "grad_norm": 0.397204837210087, "learning_rate": 4.176634918513698e-06, "loss": 0.6757, "num_input_tokens_seen": 766509056, "step": 731 }, { "epoch": 0.819020979020979, "grad_norm": 0.39879528228352273, "learning_rate": 4.174438623535784e-06, "loss": 0.7554, "num_input_tokens_seen": 767557632, "step": 732 }, { "epoch": 0.8201398601398602, "grad_norm": 0.40106360649342343, "learning_rate": 4.17223998234674e-06, "loss": 0.7702, "num_input_tokens_seen": 768606208, "step": 733 }, { "epoch": 0.8212587412587412, "grad_norm": 0.4020940738325546, "learning_rate": 4.170038998027285e-06, "loss": 0.7197, "num_input_tokens_seen": 769654784, "step": 734 }, { "epoch": 0.8223776223776224, "grad_norm": 0.3793572649813936, "learning_rate": 4.167835673661422e-06, "loss": 0.6775, "num_input_tokens_seen": 770703360, "step": 735 }, { "epoch": 0.8234965034965035, "grad_norm": 0.373331460852642, "learning_rate": 4.165630012336435e-06, "loss": 0.7137, "num_input_tokens_seen": 771751936, "step": 736 }, { "epoch": 0.8246153846153846, "grad_norm": 0.38413077199646195, "learning_rate": 4.163422017142879e-06, "loss": 0.6855, "num_input_tokens_seen": 772800512, "step": 737 }, { "epoch": 0.8257342657342658, "grad_norm": 0.3846037319710703, "learning_rate": 4.1612116911745805e-06, "loss": 0.779, "num_input_tokens_seen": 773849088, "step": 738 }, { "epoch": 0.8268531468531468, "grad_norm": 0.3783727293040752, "learning_rate": 4.158999037528632e-06, "loss": 0.7513, "num_input_tokens_seen": 774897664, "step": 739 }, { "epoch": 0.827972027972028, "grad_norm": 0.38208504619919353, "learning_rate": 4.156784059305388e-06, "loss": 0.7909, "num_input_tokens_seen": 775946240, "step": 740 }, { "epoch": 0.8290909090909091, "grad_norm": 0.46342775749509363, "learning_rate": 4.1545667596084596e-06, "loss": 0.8066, "num_input_tokens_seen": 776994816, "step": 741 }, { "epoch": 0.8302097902097902, "grad_norm": 0.3886343727740628, "learning_rate": 4.152347141544711e-06, "loss": 0.7322, "num_input_tokens_seen": 778043392, "step": 742 }, { "epoch": 0.8313286713286713, "grad_norm": 0.37894567395398865, "learning_rate": 4.150125208224255e-06, "loss": 0.8437, "num_input_tokens_seen": 779091968, "step": 743 }, { "epoch": 0.8324475524475524, "grad_norm": 0.39186751855243934, "learning_rate": 4.147900962760447e-06, "loss": 0.8597, "num_input_tokens_seen": 780140544, "step": 744 }, { "epoch": 0.8335664335664336, "grad_norm": 0.42604379524630565, "learning_rate": 4.145674408269885e-06, "loss": 0.7272, "num_input_tokens_seen": 781189120, "step": 745 }, { "epoch": 0.8346853146853147, "grad_norm": 0.398088675719126, "learning_rate": 4.1434455478724e-06, "loss": 0.6691, "num_input_tokens_seen": 782237696, "step": 746 }, { "epoch": 0.8358041958041958, "grad_norm": 0.49986407004716077, "learning_rate": 4.141214384691056e-06, "loss": 0.7349, "num_input_tokens_seen": 783286272, "step": 747 }, { "epoch": 0.8369230769230769, "grad_norm": 0.40921378937081465, "learning_rate": 4.138980921852141e-06, "loss": 0.8242, "num_input_tokens_seen": 784334848, "step": 748 }, { "epoch": 0.838041958041958, "grad_norm": 0.502295975129897, "learning_rate": 4.136745162485168e-06, "loss": 0.6431, "num_input_tokens_seen": 785383424, "step": 749 }, { "epoch": 0.8391608391608392, "grad_norm": 0.40310645213798424, "learning_rate": 4.134507109722865e-06, "loss": 0.7689, "num_input_tokens_seen": 786432000, "step": 750 }, { "epoch": 0.8391608391608392, "eval_loss": 0.7424774169921875, "eval_runtime": 246.6584, "eval_samples_per_second": 2.368, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 786432000, "step": 750 }, { "epoch": 0.8402797202797203, "grad_norm": 0.5184102988626936, "learning_rate": 4.1322667667011774e-06, "loss": 0.7584, "num_input_tokens_seen": 787480576, "step": 751 }, { "epoch": 0.8413986013986015, "grad_norm": 0.39257782943933106, "learning_rate": 4.130024136559255e-06, "loss": 0.6891, "num_input_tokens_seen": 788529152, "step": 752 }, { "epoch": 0.8425174825174825, "grad_norm": 0.40792765875192344, "learning_rate": 4.127779222439457e-06, "loss": 0.7819, "num_input_tokens_seen": 789577728, "step": 753 }, { "epoch": 0.8436363636363636, "grad_norm": 0.42640389509602417, "learning_rate": 4.125532027487339e-06, "loss": 0.8617, "num_input_tokens_seen": 790626304, "step": 754 }, { "epoch": 0.8447552447552448, "grad_norm": 0.36584368393831823, "learning_rate": 4.123282554851654e-06, "loss": 0.8794, "num_input_tokens_seen": 791674880, "step": 755 }, { "epoch": 0.8458741258741259, "grad_norm": 0.41679608474011764, "learning_rate": 4.121030807684349e-06, "loss": 0.6896, "num_input_tokens_seen": 792723456, "step": 756 }, { "epoch": 0.846993006993007, "grad_norm": 0.7024024221539993, "learning_rate": 4.118776789140551e-06, "loss": 0.6846, "num_input_tokens_seen": 793772032, "step": 757 }, { "epoch": 0.8481118881118881, "grad_norm": 0.3896357208875676, "learning_rate": 4.11652050237858e-06, "loss": 0.715, "num_input_tokens_seen": 794820608, "step": 758 }, { "epoch": 0.8492307692307692, "grad_norm": 0.4504558612479771, "learning_rate": 4.114261950559924e-06, "loss": 0.7033, "num_input_tokens_seen": 795869184, "step": 759 }, { "epoch": 0.8503496503496504, "grad_norm": 0.46136724186403594, "learning_rate": 4.112001136849252e-06, "loss": 0.6531, "num_input_tokens_seen": 796917760, "step": 760 }, { "epoch": 0.8514685314685315, "grad_norm": 0.4128165233408574, "learning_rate": 4.109738064414397e-06, "loss": 0.7427, "num_input_tokens_seen": 797966336, "step": 761 }, { "epoch": 0.8525874125874126, "grad_norm": 0.46323968452029674, "learning_rate": 4.107472736426362e-06, "loss": 0.8499, "num_input_tokens_seen": 799014912, "step": 762 }, { "epoch": 0.8537062937062937, "grad_norm": 0.3994526575500648, "learning_rate": 4.105205156059307e-06, "loss": 0.7645, "num_input_tokens_seen": 800063488, "step": 763 }, { "epoch": 0.8548251748251748, "grad_norm": 0.4367074804282339, "learning_rate": 4.102935326490549e-06, "loss": 0.6868, "num_input_tokens_seen": 801112064, "step": 764 }, { "epoch": 0.855944055944056, "grad_norm": 0.3586727227286502, "learning_rate": 4.100663250900556e-06, "loss": 0.6391, "num_input_tokens_seen": 802160640, "step": 765 }, { "epoch": 0.857062937062937, "grad_norm": 0.38017219603824504, "learning_rate": 4.098388932472944e-06, "loss": 0.7374, "num_input_tokens_seen": 803209216, "step": 766 }, { "epoch": 0.8581818181818182, "grad_norm": 0.3886395993823501, "learning_rate": 4.0961123743944715e-06, "loss": 0.7492, "num_input_tokens_seen": 804257792, "step": 767 }, { "epoch": 0.8593006993006993, "grad_norm": 0.3793298176255215, "learning_rate": 4.093833579855036e-06, "loss": 0.6784, "num_input_tokens_seen": 805306368, "step": 768 }, { "epoch": 0.8604195804195804, "grad_norm": 0.38808040511094233, "learning_rate": 4.0915525520476665e-06, "loss": 0.6226, "num_input_tokens_seen": 806354944, "step": 769 }, { "epoch": 0.8615384615384616, "grad_norm": 0.47810745493009166, "learning_rate": 4.089269294168522e-06, "loss": 0.8488, "num_input_tokens_seen": 807403520, "step": 770 }, { "epoch": 0.8626573426573426, "grad_norm": 0.39976487454038595, "learning_rate": 4.086983809416887e-06, "loss": 0.8694, "num_input_tokens_seen": 808452096, "step": 771 }, { "epoch": 0.8637762237762238, "grad_norm": 0.467902316952956, "learning_rate": 4.084696100995167e-06, "loss": 0.7307, "num_input_tokens_seen": 809500672, "step": 772 }, { "epoch": 0.8648951048951049, "grad_norm": 0.37624712682999584, "learning_rate": 4.082406172108882e-06, "loss": 0.8257, "num_input_tokens_seen": 810549248, "step": 773 }, { "epoch": 0.866013986013986, "grad_norm": 0.4326292024694533, "learning_rate": 4.0801140259666624e-06, "loss": 0.7092, "num_input_tokens_seen": 811597824, "step": 774 }, { "epoch": 0.8671328671328671, "grad_norm": 0.3668643275740706, "learning_rate": 4.0778196657802484e-06, "loss": 0.7616, "num_input_tokens_seen": 812646400, "step": 775 }, { "epoch": 0.8682517482517482, "grad_norm": 0.38073192463633604, "learning_rate": 4.075523094764479e-06, "loss": 0.7918, "num_input_tokens_seen": 813694976, "step": 776 }, { "epoch": 0.8693706293706294, "grad_norm": 0.3999679550237199, "learning_rate": 4.073224316137293e-06, "loss": 0.6629, "num_input_tokens_seen": 814743552, "step": 777 }, { "epoch": 0.8704895104895105, "grad_norm": 0.3881869828369213, "learning_rate": 4.070923333119723e-06, "loss": 0.8073, "num_input_tokens_seen": 815792128, "step": 778 }, { "epoch": 0.8716083916083917, "grad_norm": 0.43003454146228165, "learning_rate": 4.068620148935889e-06, "loss": 0.6477, "num_input_tokens_seen": 816840704, "step": 779 }, { "epoch": 0.8727272727272727, "grad_norm": 0.3960752779876195, "learning_rate": 4.066314766812996e-06, "loss": 0.84, "num_input_tokens_seen": 817889280, "step": 780 }, { "epoch": 0.8738461538461538, "grad_norm": 0.35884541787142765, "learning_rate": 4.0640071899813284e-06, "loss": 0.7025, "num_input_tokens_seen": 818937856, "step": 781 }, { "epoch": 0.874965034965035, "grad_norm": 0.4084222095234087, "learning_rate": 4.061697421674245e-06, "loss": 0.9056, "num_input_tokens_seen": 819986432, "step": 782 }, { "epoch": 0.8760839160839161, "grad_norm": 0.5455408470651838, "learning_rate": 4.059385465128179e-06, "loss": 0.9119, "num_input_tokens_seen": 821035008, "step": 783 }, { "epoch": 0.8772027972027973, "grad_norm": 0.42996956441716816, "learning_rate": 4.057071323582623e-06, "loss": 0.7266, "num_input_tokens_seen": 822083584, "step": 784 }, { "epoch": 0.8783216783216783, "grad_norm": 0.39810287493825014, "learning_rate": 4.054755000280139e-06, "loss": 0.8663, "num_input_tokens_seen": 823132160, "step": 785 }, { "epoch": 0.8794405594405594, "grad_norm": 0.4174855691151875, "learning_rate": 4.05243649846634e-06, "loss": 0.7008, "num_input_tokens_seen": 824180736, "step": 786 }, { "epoch": 0.8805594405594406, "grad_norm": 0.42028580830744167, "learning_rate": 4.050115821389894e-06, "loss": 0.7324, "num_input_tokens_seen": 825229312, "step": 787 }, { "epoch": 0.8816783216783217, "grad_norm": 0.4164606782005147, "learning_rate": 4.047792972302518e-06, "loss": 0.6976, "num_input_tokens_seen": 826277888, "step": 788 }, { "epoch": 0.8827972027972028, "grad_norm": 0.3787140000607857, "learning_rate": 4.045467954458969e-06, "loss": 0.8645, "num_input_tokens_seen": 827326464, "step": 789 }, { "epoch": 0.8839160839160839, "grad_norm": 0.40766330650700583, "learning_rate": 4.0431407711170465e-06, "loss": 0.684, "num_input_tokens_seen": 828375040, "step": 790 }, { "epoch": 0.885034965034965, "grad_norm": 0.4001531515477462, "learning_rate": 4.040811425537583e-06, "loss": 0.8267, "num_input_tokens_seen": 829423616, "step": 791 }, { "epoch": 0.8861538461538462, "grad_norm": 0.38349081788226114, "learning_rate": 4.038479920984439e-06, "loss": 0.7346, "num_input_tokens_seen": 830472192, "step": 792 }, { "epoch": 0.8872727272727273, "grad_norm": 0.46212364749135615, "learning_rate": 4.036146260724503e-06, "loss": 0.783, "num_input_tokens_seen": 831520768, "step": 793 }, { "epoch": 0.8883916083916084, "grad_norm": 0.35949324399859384, "learning_rate": 4.033810448027682e-06, "loss": 0.8189, "num_input_tokens_seen": 832569344, "step": 794 }, { "epoch": 0.8895104895104895, "grad_norm": 0.4889173761418744, "learning_rate": 4.0314724861669e-06, "loss": 0.8026, "num_input_tokens_seen": 833617920, "step": 795 }, { "epoch": 0.8906293706293706, "grad_norm": 0.39459589149746993, "learning_rate": 4.029132378418092e-06, "loss": 0.6769, "num_input_tokens_seen": 834666496, "step": 796 }, { "epoch": 0.8917482517482518, "grad_norm": 0.38240145959745486, "learning_rate": 4.0267901280601985e-06, "loss": 0.8146, "num_input_tokens_seen": 835715072, "step": 797 }, { "epoch": 0.8928671328671328, "grad_norm": 0.6341966748165627, "learning_rate": 4.024445738375164e-06, "loss": 0.7063, "num_input_tokens_seen": 836763648, "step": 798 }, { "epoch": 0.893986013986014, "grad_norm": 0.41747548456132216, "learning_rate": 4.022099212647933e-06, "loss": 0.7581, "num_input_tokens_seen": 837812224, "step": 799 }, { "epoch": 0.8951048951048951, "grad_norm": 0.4036983620442501, "learning_rate": 4.019750554166436e-06, "loss": 0.7643, "num_input_tokens_seen": 838860800, "step": 800 }, { "epoch": 0.8962237762237762, "grad_norm": 0.4209419611846452, "learning_rate": 4.017399766221599e-06, "loss": 0.6969, "num_input_tokens_seen": 839909376, "step": 801 }, { "epoch": 0.8973426573426574, "grad_norm": 0.3867561200891814, "learning_rate": 4.015046852107327e-06, "loss": 0.7633, "num_input_tokens_seen": 840957952, "step": 802 }, { "epoch": 0.8984615384615384, "grad_norm": 0.5004181645697954, "learning_rate": 4.012691815120508e-06, "loss": 0.7878, "num_input_tokens_seen": 842006528, "step": 803 }, { "epoch": 0.8995804195804196, "grad_norm": 0.44199474646111714, "learning_rate": 4.0103346585610015e-06, "loss": 0.7283, "num_input_tokens_seen": 843055104, "step": 804 }, { "epoch": 0.9006993006993007, "grad_norm": 0.40500567525620973, "learning_rate": 4.007975385731637e-06, "loss": 0.6244, "num_input_tokens_seen": 844103680, "step": 805 }, { "epoch": 0.9018181818181819, "grad_norm": 0.3827972061846943, "learning_rate": 4.005613999938211e-06, "loss": 0.7264, "num_input_tokens_seen": 845152256, "step": 806 }, { "epoch": 0.9029370629370629, "grad_norm": 0.4026623322316549, "learning_rate": 4.003250504489481e-06, "loss": 0.7475, "num_input_tokens_seen": 846200832, "step": 807 }, { "epoch": 0.904055944055944, "grad_norm": 1.2283476738186698, "learning_rate": 4.000884902697159e-06, "loss": 0.7093, "num_input_tokens_seen": 847249408, "step": 808 }, { "epoch": 0.9051748251748252, "grad_norm": 0.4015149079265648, "learning_rate": 3.998517197875908e-06, "loss": 0.6377, "num_input_tokens_seen": 848297984, "step": 809 }, { "epoch": 0.9062937062937063, "grad_norm": 0.39923925549726585, "learning_rate": 3.996147393343338e-06, "loss": 0.619, "num_input_tokens_seen": 849346560, "step": 810 }, { "epoch": 0.9074125874125875, "grad_norm": 0.37177506204093586, "learning_rate": 3.993775492420005e-06, "loss": 0.6694, "num_input_tokens_seen": 850395136, "step": 811 }, { "epoch": 0.9085314685314685, "grad_norm": 0.3849336002776656, "learning_rate": 3.9914014984293955e-06, "loss": 0.7513, "num_input_tokens_seen": 851443712, "step": 812 }, { "epoch": 0.9096503496503496, "grad_norm": 0.3598910599076654, "learning_rate": 3.989025414697935e-06, "loss": 0.648, "num_input_tokens_seen": 852492288, "step": 813 }, { "epoch": 0.9107692307692308, "grad_norm": 0.39487266894071166, "learning_rate": 3.986647244554974e-06, "loss": 0.8291, "num_input_tokens_seen": 853540864, "step": 814 }, { "epoch": 0.9118881118881119, "grad_norm": 0.5610570108213332, "learning_rate": 3.984266991332787e-06, "loss": 0.7154, "num_input_tokens_seen": 854589440, "step": 815 }, { "epoch": 0.9130069930069931, "grad_norm": 0.4012668743861729, "learning_rate": 3.981884658366566e-06, "loss": 0.7503, "num_input_tokens_seen": 855638016, "step": 816 }, { "epoch": 0.9141258741258741, "grad_norm": 0.46902366583540417, "learning_rate": 3.9795002489944216e-06, "loss": 0.6959, "num_input_tokens_seen": 856686592, "step": 817 }, { "epoch": 0.9152447552447552, "grad_norm": 0.4602534566059332, "learning_rate": 3.977113766557367e-06, "loss": 0.6735, "num_input_tokens_seen": 857735168, "step": 818 }, { "epoch": 0.9163636363636364, "grad_norm": 0.39395027528672294, "learning_rate": 3.9747252143993265e-06, "loss": 0.7275, "num_input_tokens_seen": 858783744, "step": 819 }, { "epoch": 0.9174825174825175, "grad_norm": 0.40409773929768605, "learning_rate": 3.972334595867122e-06, "loss": 0.8853, "num_input_tokens_seen": 859832320, "step": 820 }, { "epoch": 0.9186013986013986, "grad_norm": 0.4859755337070157, "learning_rate": 3.969941914310469e-06, "loss": 0.6545, "num_input_tokens_seen": 860880896, "step": 821 }, { "epoch": 0.9197202797202797, "grad_norm": 0.419958268537841, "learning_rate": 3.967547173081976e-06, "loss": 0.6411, "num_input_tokens_seen": 861929472, "step": 822 }, { "epoch": 0.9208391608391608, "grad_norm": 0.4529971820627527, "learning_rate": 3.965150375537137e-06, "loss": 0.6289, "num_input_tokens_seen": 862978048, "step": 823 }, { "epoch": 0.921958041958042, "grad_norm": 0.5451558861642671, "learning_rate": 3.9627515250343275e-06, "loss": 0.6772, "num_input_tokens_seen": 864026624, "step": 824 }, { "epoch": 0.9230769230769231, "grad_norm": 0.44643137537323296, "learning_rate": 3.9603506249348e-06, "loss": 0.699, "num_input_tokens_seen": 865075200, "step": 825 }, { "epoch": 0.9241958041958042, "grad_norm": 0.4129605053789471, "learning_rate": 3.957947678602676e-06, "loss": 0.7727, "num_input_tokens_seen": 866123776, "step": 826 }, { "epoch": 0.9253146853146853, "grad_norm": 0.4134252368346388, "learning_rate": 3.955542689404948e-06, "loss": 0.7034, "num_input_tokens_seen": 867172352, "step": 827 }, { "epoch": 0.9264335664335664, "grad_norm": 0.36507972763580043, "learning_rate": 3.953135660711467e-06, "loss": 0.6624, "num_input_tokens_seen": 868220928, "step": 828 }, { "epoch": 0.9275524475524476, "grad_norm": 0.3984801386925763, "learning_rate": 3.950726595894947e-06, "loss": 0.6719, "num_input_tokens_seen": 869269504, "step": 829 }, { "epoch": 0.9286713286713286, "grad_norm": 0.3916415344318043, "learning_rate": 3.94831549833095e-06, "loss": 0.6623, "num_input_tokens_seen": 870318080, "step": 830 }, { "epoch": 0.9297902097902098, "grad_norm": 0.3802835323558295, "learning_rate": 3.9459023713978895e-06, "loss": 0.7096, "num_input_tokens_seen": 871366656, "step": 831 }, { "epoch": 0.9309090909090909, "grad_norm": 0.37642630252887915, "learning_rate": 3.94348721847702e-06, "loss": 0.8014, "num_input_tokens_seen": 872415232, "step": 832 }, { "epoch": 0.932027972027972, "grad_norm": 0.3820169748013418, "learning_rate": 3.941070042952437e-06, "loss": 0.8499, "num_input_tokens_seen": 873463808, "step": 833 }, { "epoch": 0.9331468531468532, "grad_norm": 0.3755385063148412, "learning_rate": 3.938650848211068e-06, "loss": 0.7902, "num_input_tokens_seen": 874512384, "step": 834 }, { "epoch": 0.9342657342657342, "grad_norm": 0.42973932472039017, "learning_rate": 3.936229637642672e-06, "loss": 0.8006, "num_input_tokens_seen": 875560960, "step": 835 }, { "epoch": 0.9353846153846154, "grad_norm": 0.39416064845095833, "learning_rate": 3.933806414639832e-06, "loss": 0.8366, "num_input_tokens_seen": 876609536, "step": 836 }, { "epoch": 0.9365034965034965, "grad_norm": 0.43254833793568165, "learning_rate": 3.931381182597949e-06, "loss": 0.6731, "num_input_tokens_seen": 877658112, "step": 837 }, { "epoch": 0.9376223776223777, "grad_norm": 0.5543980320369337, "learning_rate": 3.928953944915242e-06, "loss": 0.8729, "num_input_tokens_seen": 878706688, "step": 838 }, { "epoch": 0.9387412587412587, "grad_norm": 0.5062710343254719, "learning_rate": 3.926524704992736e-06, "loss": 0.7003, "num_input_tokens_seen": 879755264, "step": 839 }, { "epoch": 0.9398601398601398, "grad_norm": 0.39070241139509204, "learning_rate": 3.9240934662342665e-06, "loss": 0.7721, "num_input_tokens_seen": 880803840, "step": 840 }, { "epoch": 0.940979020979021, "grad_norm": 0.3790869822235597, "learning_rate": 3.9216602320464655e-06, "loss": 0.7873, "num_input_tokens_seen": 881852416, "step": 841 }, { "epoch": 0.9420979020979021, "grad_norm": 0.42017388676890977, "learning_rate": 3.9192250058387656e-06, "loss": 0.7316, "num_input_tokens_seen": 882900992, "step": 842 }, { "epoch": 0.9432167832167833, "grad_norm": 0.3920498770402266, "learning_rate": 3.916787791023386e-06, "loss": 0.6785, "num_input_tokens_seen": 883949568, "step": 843 }, { "epoch": 0.9443356643356643, "grad_norm": 0.5140826012754313, "learning_rate": 3.914348591015335e-06, "loss": 0.7708, "num_input_tokens_seen": 884998144, "step": 844 }, { "epoch": 0.9454545454545454, "grad_norm": 0.3905642680446615, "learning_rate": 3.911907409232402e-06, "loss": 0.7246, "num_input_tokens_seen": 886046720, "step": 845 }, { "epoch": 0.9465734265734266, "grad_norm": 0.3592161741313191, "learning_rate": 3.90946424909515e-06, "loss": 0.7081, "num_input_tokens_seen": 887095296, "step": 846 }, { "epoch": 0.9476923076923077, "grad_norm": 0.399982382253274, "learning_rate": 3.907019114026922e-06, "loss": 0.7986, "num_input_tokens_seen": 888143872, "step": 847 }, { "epoch": 0.9488111888111889, "grad_norm": 0.3976609272176194, "learning_rate": 3.90457200745382e-06, "loss": 0.8037, "num_input_tokens_seen": 889192448, "step": 848 }, { "epoch": 0.9499300699300699, "grad_norm": 0.36695730177858116, "learning_rate": 3.902122932804713e-06, "loss": 0.8904, "num_input_tokens_seen": 890241024, "step": 849 }, { "epoch": 0.951048951048951, "grad_norm": 0.3702357302632216, "learning_rate": 3.899671893511226e-06, "loss": 0.7235, "num_input_tokens_seen": 891289600, "step": 850 }, { "epoch": 0.9521678321678322, "grad_norm": 0.3832570324010306, "learning_rate": 3.897218893007737e-06, "loss": 0.8046, "num_input_tokens_seen": 892338176, "step": 851 }, { "epoch": 0.9532867132867133, "grad_norm": 0.3636562887637132, "learning_rate": 3.894763934731373e-06, "loss": 0.8139, "num_input_tokens_seen": 893386752, "step": 852 }, { "epoch": 0.9544055944055944, "grad_norm": 0.39657045490338155, "learning_rate": 3.8923070221220035e-06, "loss": 0.6991, "num_input_tokens_seen": 894435328, "step": 853 }, { "epoch": 0.9555244755244755, "grad_norm": 0.39518051267542376, "learning_rate": 3.889848158622237e-06, "loss": 0.7715, "num_input_tokens_seen": 895483904, "step": 854 }, { "epoch": 0.9566433566433566, "grad_norm": 0.3872524727655556, "learning_rate": 3.887387347677413e-06, "loss": 0.677, "num_input_tokens_seen": 896532480, "step": 855 }, { "epoch": 0.9577622377622378, "grad_norm": 0.4236923054821848, "learning_rate": 3.884924592735604e-06, "loss": 0.7913, "num_input_tokens_seen": 897581056, "step": 856 }, { "epoch": 0.9588811188811189, "grad_norm": 0.42018281053041734, "learning_rate": 3.882459897247603e-06, "loss": 0.8272, "num_input_tokens_seen": 898629632, "step": 857 }, { "epoch": 0.96, "grad_norm": 0.3932792108791179, "learning_rate": 3.8799932646669235e-06, "loss": 0.6715, "num_input_tokens_seen": 899678208, "step": 858 }, { "epoch": 0.9611188811188811, "grad_norm": 0.3758147772949339, "learning_rate": 3.8775246984497924e-06, "loss": 0.7166, "num_input_tokens_seen": 900726784, "step": 859 }, { "epoch": 0.9622377622377623, "grad_norm": 0.4042268741460496, "learning_rate": 3.875054202055148e-06, "loss": 0.7947, "num_input_tokens_seen": 901775360, "step": 860 }, { "epoch": 0.9633566433566434, "grad_norm": 0.36303495570435407, "learning_rate": 3.87258177894463e-06, "loss": 0.6249, "num_input_tokens_seen": 902823936, "step": 861 }, { "epoch": 0.9644755244755244, "grad_norm": 0.4016781848176148, "learning_rate": 3.870107432582582e-06, "loss": 0.6987, "num_input_tokens_seen": 903872512, "step": 862 }, { "epoch": 0.9655944055944056, "grad_norm": 0.4736578231325623, "learning_rate": 3.867631166436038e-06, "loss": 0.6279, "num_input_tokens_seen": 904921088, "step": 863 }, { "epoch": 0.9667132867132867, "grad_norm": 0.36904317583567653, "learning_rate": 3.865152983974724e-06, "loss": 0.7039, "num_input_tokens_seen": 905969664, "step": 864 }, { "epoch": 0.9678321678321679, "grad_norm": 0.42170628867770477, "learning_rate": 3.862672888671051e-06, "loss": 0.8572, "num_input_tokens_seen": 907018240, "step": 865 }, { "epoch": 0.968951048951049, "grad_norm": 0.37407554173153906, "learning_rate": 3.860190884000112e-06, "loss": 0.7775, "num_input_tokens_seen": 908066816, "step": 866 }, { "epoch": 0.97006993006993, "grad_norm": 1.0180533920782378, "learning_rate": 3.857706973439672e-06, "loss": 0.7948, "num_input_tokens_seen": 909115392, "step": 867 }, { "epoch": 0.9711888111888112, "grad_norm": 0.4441458361223115, "learning_rate": 3.85522116047017e-06, "loss": 0.8362, "num_input_tokens_seen": 910163968, "step": 868 }, { "epoch": 0.9723076923076923, "grad_norm": 0.37162891553898675, "learning_rate": 3.852733448574707e-06, "loss": 0.6686, "num_input_tokens_seen": 911212544, "step": 869 }, { "epoch": 0.9734265734265735, "grad_norm": 0.4369520040423374, "learning_rate": 3.850243841239047e-06, "loss": 0.6899, "num_input_tokens_seen": 912261120, "step": 870 }, { "epoch": 0.9745454545454545, "grad_norm": 0.3729945873796046, "learning_rate": 3.8477523419516115e-06, "loss": 0.6706, "num_input_tokens_seen": 913309696, "step": 871 }, { "epoch": 0.9756643356643356, "grad_norm": 0.3836701038656393, "learning_rate": 3.8452589542034686e-06, "loss": 0.6796, "num_input_tokens_seen": 914358272, "step": 872 }, { "epoch": 0.9767832167832168, "grad_norm": 0.38961601281190456, "learning_rate": 3.842763681488337e-06, "loss": 0.8015, "num_input_tokens_seen": 915406848, "step": 873 }, { "epoch": 0.9779020979020979, "grad_norm": 0.41242260211123066, "learning_rate": 3.8402665273025726e-06, "loss": 0.705, "num_input_tokens_seen": 916455424, "step": 874 }, { "epoch": 0.9790209790209791, "grad_norm": 0.4000706526914174, "learning_rate": 3.837767495145171e-06, "loss": 0.6312, "num_input_tokens_seen": 917504000, "step": 875 }, { "epoch": 0.9801398601398601, "grad_norm": 0.3976904001435826, "learning_rate": 3.835266588517757e-06, "loss": 0.7085, "num_input_tokens_seen": 918552576, "step": 876 }, { "epoch": 0.9812587412587412, "grad_norm": 0.46036030829044383, "learning_rate": 3.832763810924583e-06, "loss": 0.7812, "num_input_tokens_seen": 919601152, "step": 877 }, { "epoch": 0.9823776223776224, "grad_norm": 0.3804839418012878, "learning_rate": 3.830259165872523e-06, "loss": 0.6654, "num_input_tokens_seen": 920649728, "step": 878 }, { "epoch": 0.9834965034965035, "grad_norm": 0.39022485308203997, "learning_rate": 3.827752656871067e-06, "loss": 1.0061, "num_input_tokens_seen": 921698304, "step": 879 }, { "epoch": 0.9846153846153847, "grad_norm": 0.4009793095489505, "learning_rate": 3.825244287432316e-06, "loss": 0.6913, "num_input_tokens_seen": 922746880, "step": 880 }, { "epoch": 0.9857342657342657, "grad_norm": 0.4609359673125681, "learning_rate": 3.822734061070979e-06, "loss": 0.6779, "num_input_tokens_seen": 923795456, "step": 881 }, { "epoch": 0.9868531468531468, "grad_norm": 0.42997289006846684, "learning_rate": 3.82022198130437e-06, "loss": 0.6848, "num_input_tokens_seen": 924844032, "step": 882 }, { "epoch": 0.987972027972028, "grad_norm": 0.37813630443662505, "learning_rate": 3.817708051652392e-06, "loss": 0.809, "num_input_tokens_seen": 925892608, "step": 883 }, { "epoch": 0.9890909090909091, "grad_norm": 0.450749746428077, "learning_rate": 3.8151922756375485e-06, "loss": 0.8708, "num_input_tokens_seen": 926941184, "step": 884 }, { "epoch": 0.9902097902097902, "grad_norm": 0.42317804558688343, "learning_rate": 3.812674656784924e-06, "loss": 0.7192, "num_input_tokens_seen": 927989760, "step": 885 }, { "epoch": 0.9913286713286713, "grad_norm": 0.36987357614886024, "learning_rate": 3.8101551986221896e-06, "loss": 0.6901, "num_input_tokens_seen": 929038336, "step": 886 }, { "epoch": 0.9924475524475525, "grad_norm": 0.4672560045708888, "learning_rate": 3.8076339046795897e-06, "loss": 0.6654, "num_input_tokens_seen": 930086912, "step": 887 }, { "epoch": 0.9935664335664336, "grad_norm": 0.3820482389044477, "learning_rate": 3.8051107784899443e-06, "loss": 0.6568, "num_input_tokens_seen": 931135488, "step": 888 }, { "epoch": 0.9946853146853147, "grad_norm": 0.49570307026827604, "learning_rate": 3.8025858235886394e-06, "loss": 0.6623, "num_input_tokens_seen": 932184064, "step": 889 }, { "epoch": 0.9958041958041958, "grad_norm": 0.42001913596480506, "learning_rate": 3.8000590435136213e-06, "loss": 0.6758, "num_input_tokens_seen": 933232640, "step": 890 }, { "epoch": 0.9969230769230769, "grad_norm": 0.3872030677455255, "learning_rate": 3.7975304418053986e-06, "loss": 0.6512, "num_input_tokens_seen": 934281216, "step": 891 }, { "epoch": 0.998041958041958, "grad_norm": 0.42436842300460104, "learning_rate": 3.795000022007027e-06, "loss": 0.7217, "num_input_tokens_seen": 935329792, "step": 892 }, { "epoch": 0.9991608391608392, "grad_norm": 0.3996860988433165, "learning_rate": 3.7924677876641147e-06, "loss": 0.8638, "num_input_tokens_seen": 936378368, "step": 893 }, { "epoch": 1.0002797202797202, "grad_norm": 0.3481180155714078, "learning_rate": 3.789933742324807e-06, "loss": 0.7973, "num_input_tokens_seen": 937426944, "step": 894 }, { "epoch": 1.0013986013986014, "grad_norm": 0.49162482834728966, "learning_rate": 3.787397889539792e-06, "loss": 0.7068, "num_input_tokens_seen": 938475520, "step": 895 }, { "epoch": 1.0025174825174825, "grad_norm": 0.37337894566503377, "learning_rate": 3.7848602328622864e-06, "loss": 0.6657, "num_input_tokens_seen": 939524096, "step": 896 }, { "epoch": 1.0036363636363637, "grad_norm": 0.4817997580233594, "learning_rate": 3.782320775848038e-06, "loss": 0.6522, "num_input_tokens_seen": 940572672, "step": 897 }, { "epoch": 1.0047552447552448, "grad_norm": 0.44337441255105126, "learning_rate": 3.7797795220553136e-06, "loss": 0.696, "num_input_tokens_seen": 941621248, "step": 898 }, { "epoch": 1.005874125874126, "grad_norm": 0.4428883482981781, "learning_rate": 3.7772364750449002e-06, "loss": 0.655, "num_input_tokens_seen": 942669824, "step": 899 }, { "epoch": 1.006993006993007, "grad_norm": 0.5903801748645382, "learning_rate": 3.774691638380096e-06, "loss": 0.6671, "num_input_tokens_seen": 943718400, "step": 900 }, { "epoch": 1.008111888111888, "grad_norm": 0.4073792942585215, "learning_rate": 3.772145015626709e-06, "loss": 0.7109, "num_input_tokens_seen": 944766976, "step": 901 }, { "epoch": 1.0092307692307692, "grad_norm": 0.4539491677871825, "learning_rate": 3.769596610353047e-06, "loss": 0.6671, "num_input_tokens_seen": 945815552, "step": 902 }, { "epoch": 1.0103496503496503, "grad_norm": 0.36608033367294673, "learning_rate": 3.767046426129917e-06, "loss": 0.7918, "num_input_tokens_seen": 946864128, "step": 903 }, { "epoch": 1.0114685314685314, "grad_norm": 0.4418548967260508, "learning_rate": 3.764494466530618e-06, "loss": 0.7056, "num_input_tokens_seen": 947912704, "step": 904 }, { "epoch": 1.0125874125874126, "grad_norm": 0.46166892050233144, "learning_rate": 3.7619407351309377e-06, "loss": 0.8757, "num_input_tokens_seen": 948961280, "step": 905 }, { "epoch": 1.0137062937062937, "grad_norm": 0.3828047934959702, "learning_rate": 3.7593852355091463e-06, "loss": 0.7706, "num_input_tokens_seen": 950009856, "step": 906 }, { "epoch": 1.0148251748251749, "grad_norm": 0.4787130239345721, "learning_rate": 3.7568279712459908e-06, "loss": 0.676, "num_input_tokens_seen": 951058432, "step": 907 }, { "epoch": 1.015944055944056, "grad_norm": 0.39173558557908167, "learning_rate": 3.7542689459246907e-06, "loss": 0.6838, "num_input_tokens_seen": 952107008, "step": 908 }, { "epoch": 1.0170629370629372, "grad_norm": 0.46987878357928603, "learning_rate": 3.7517081631309336e-06, "loss": 0.6933, "num_input_tokens_seen": 953155584, "step": 909 }, { "epoch": 1.018181818181818, "grad_norm": 0.36794774666757774, "learning_rate": 3.7491456264528703e-06, "loss": 0.762, "num_input_tokens_seen": 954204160, "step": 910 }, { "epoch": 1.0193006993006992, "grad_norm": 0.46592312297287924, "learning_rate": 3.746581339481108e-06, "loss": 0.6466, "num_input_tokens_seen": 955252736, "step": 911 }, { "epoch": 1.0204195804195804, "grad_norm": 0.43283054324421877, "learning_rate": 3.7440153058087064e-06, "loss": 0.6643, "num_input_tokens_seen": 956301312, "step": 912 }, { "epoch": 1.0215384615384615, "grad_norm": 0.40065095256105515, "learning_rate": 3.741447529031173e-06, "loss": 0.6281, "num_input_tokens_seen": 957349888, "step": 913 }, { "epoch": 1.0226573426573426, "grad_norm": 0.408018936450002, "learning_rate": 3.7388780127464586e-06, "loss": 0.6409, "num_input_tokens_seen": 958398464, "step": 914 }, { "epoch": 1.0237762237762238, "grad_norm": 0.3983265591632198, "learning_rate": 3.7363067605549515e-06, "loss": 0.7178, "num_input_tokens_seen": 959447040, "step": 915 }, { "epoch": 1.024895104895105, "grad_norm": 0.42195720330460224, "learning_rate": 3.733733776059468e-06, "loss": 0.661, "num_input_tokens_seen": 960495616, "step": 916 }, { "epoch": 1.026013986013986, "grad_norm": 0.40181657727979697, "learning_rate": 3.7311590628652584e-06, "loss": 0.6455, "num_input_tokens_seen": 961544192, "step": 917 }, { "epoch": 1.0271328671328672, "grad_norm": 0.4313747890591837, "learning_rate": 3.7285826245799904e-06, "loss": 0.6748, "num_input_tokens_seen": 962592768, "step": 918 }, { "epoch": 1.0282517482517481, "grad_norm": 0.45518494422198436, "learning_rate": 3.726004464813752e-06, "loss": 0.6433, "num_input_tokens_seen": 963641344, "step": 919 }, { "epoch": 1.0293706293706293, "grad_norm": 0.3948488591775494, "learning_rate": 3.723424587179039e-06, "loss": 0.7869, "num_input_tokens_seen": 964689920, "step": 920 }, { "epoch": 1.0304895104895104, "grad_norm": 0.4275997766227239, "learning_rate": 3.72084299529076e-06, "loss": 0.8307, "num_input_tokens_seen": 965738496, "step": 921 }, { "epoch": 1.0316083916083916, "grad_norm": 0.3970010277811363, "learning_rate": 3.718259692766221e-06, "loss": 0.5991, "num_input_tokens_seen": 966787072, "step": 922 }, { "epoch": 1.0327272727272727, "grad_norm": 0.4152645070166263, "learning_rate": 3.7156746832251266e-06, "loss": 0.8823, "num_input_tokens_seen": 967835648, "step": 923 }, { "epoch": 1.0338461538461539, "grad_norm": 0.39079764548929397, "learning_rate": 3.7130879702895733e-06, "loss": 0.8461, "num_input_tokens_seen": 968884224, "step": 924 }, { "epoch": 1.034965034965035, "grad_norm": 0.43734979115053313, "learning_rate": 3.710499557584045e-06, "loss": 0.7179, "num_input_tokens_seen": 969932800, "step": 925 }, { "epoch": 1.0360839160839161, "grad_norm": 0.38468104216965604, "learning_rate": 3.7079094487354055e-06, "loss": 0.785, "num_input_tokens_seen": 970981376, "step": 926 }, { "epoch": 1.0372027972027973, "grad_norm": 0.38571665821183504, "learning_rate": 3.705317647372898e-06, "loss": 0.7238, "num_input_tokens_seen": 972029952, "step": 927 }, { "epoch": 1.0383216783216782, "grad_norm": 0.38433058643083334, "learning_rate": 3.702724157128135e-06, "loss": 0.6425, "num_input_tokens_seen": 973078528, "step": 928 }, { "epoch": 1.0394405594405594, "grad_norm": 0.37529205686275935, "learning_rate": 3.700128981635094e-06, "loss": 0.6641, "num_input_tokens_seen": 974127104, "step": 929 }, { "epoch": 1.0405594405594405, "grad_norm": 0.3764985837493403, "learning_rate": 3.6975321245301183e-06, "loss": 0.7375, "num_input_tokens_seen": 975175680, "step": 930 }, { "epoch": 1.0416783216783216, "grad_norm": 0.41806664561514273, "learning_rate": 3.6949335894519033e-06, "loss": 0.7991, "num_input_tokens_seen": 976224256, "step": 931 }, { "epoch": 1.0427972027972028, "grad_norm": 0.5832173852931514, "learning_rate": 3.6923333800414997e-06, "loss": 0.6662, "num_input_tokens_seen": 977272832, "step": 932 }, { "epoch": 1.043916083916084, "grad_norm": 0.38405684639155374, "learning_rate": 3.6897314999423e-06, "loss": 0.6159, "num_input_tokens_seen": 978321408, "step": 933 }, { "epoch": 1.045034965034965, "grad_norm": 0.3714813974501526, "learning_rate": 3.68712795280004e-06, "loss": 0.8542, "num_input_tokens_seen": 979369984, "step": 934 }, { "epoch": 1.0461538461538462, "grad_norm": 0.3749389848171504, "learning_rate": 3.6845227422627904e-06, "loss": 0.6883, "num_input_tokens_seen": 980418560, "step": 935 }, { "epoch": 1.0472727272727274, "grad_norm": 0.3492080817151956, "learning_rate": 3.681915871980954e-06, "loss": 0.7343, "num_input_tokens_seen": 981467136, "step": 936 }, { "epoch": 1.0483916083916085, "grad_norm": 0.9553899799132058, "learning_rate": 3.679307345607257e-06, "loss": 0.7385, "num_input_tokens_seen": 982515712, "step": 937 }, { "epoch": 1.0495104895104894, "grad_norm": 0.4192326983124154, "learning_rate": 3.676697166796749e-06, "loss": 0.6768, "num_input_tokens_seen": 983564288, "step": 938 }, { "epoch": 1.0506293706293706, "grad_norm": 0.37605762521879116, "learning_rate": 3.6740853392067925e-06, "loss": 0.5939, "num_input_tokens_seen": 984612864, "step": 939 }, { "epoch": 1.0517482517482517, "grad_norm": 0.435199724618489, "learning_rate": 3.6714718664970624e-06, "loss": 0.6575, "num_input_tokens_seen": 985661440, "step": 940 }, { "epoch": 1.0528671328671328, "grad_norm": 0.4350655179326981, "learning_rate": 3.6688567523295356e-06, "loss": 0.6733, "num_input_tokens_seen": 986710016, "step": 941 }, { "epoch": 1.053986013986014, "grad_norm": 0.37496527366060106, "learning_rate": 3.6662400003684915e-06, "loss": 0.7445, "num_input_tokens_seen": 987758592, "step": 942 }, { "epoch": 1.0551048951048951, "grad_norm": 0.46931105680518714, "learning_rate": 3.663621614280505e-06, "loss": 0.7437, "num_input_tokens_seen": 988807168, "step": 943 }, { "epoch": 1.0562237762237763, "grad_norm": 0.3672723179654939, "learning_rate": 3.661001597734438e-06, "loss": 0.8643, "num_input_tokens_seen": 989855744, "step": 944 }, { "epoch": 1.0573426573426574, "grad_norm": 0.40135260450396454, "learning_rate": 3.6583799544014397e-06, "loss": 0.6972, "num_input_tokens_seen": 990904320, "step": 945 }, { "epoch": 1.0584615384615386, "grad_norm": 0.5664334729899018, "learning_rate": 3.655756687954937e-06, "loss": 0.7712, "num_input_tokens_seen": 991952896, "step": 946 }, { "epoch": 1.0595804195804195, "grad_norm": 0.3703855822730114, "learning_rate": 3.653131802070631e-06, "loss": 0.7906, "num_input_tokens_seen": 993001472, "step": 947 }, { "epoch": 1.0606993006993006, "grad_norm": 0.3787920439869484, "learning_rate": 3.6505053004264936e-06, "loss": 0.6038, "num_input_tokens_seen": 994050048, "step": 948 }, { "epoch": 1.0618181818181818, "grad_norm": 0.36892292208768235, "learning_rate": 3.6478771867027585e-06, "loss": 0.7082, "num_input_tokens_seen": 995098624, "step": 949 }, { "epoch": 1.062937062937063, "grad_norm": 0.4168049621207178, "learning_rate": 3.64524746458192e-06, "loss": 0.6744, "num_input_tokens_seen": 996147200, "step": 950 }, { "epoch": 1.064055944055944, "grad_norm": 0.4276304949036021, "learning_rate": 3.642616137748727e-06, "loss": 0.7289, "num_input_tokens_seen": 997195776, "step": 951 }, { "epoch": 1.0651748251748252, "grad_norm": 0.3716986325545515, "learning_rate": 3.6399832098901726e-06, "loss": 0.7373, "num_input_tokens_seen": 998244352, "step": 952 }, { "epoch": 1.0662937062937063, "grad_norm": 0.37987449500826104, "learning_rate": 3.637348684695498e-06, "loss": 0.7249, "num_input_tokens_seen": 999292928, "step": 953 }, { "epoch": 1.0674125874125875, "grad_norm": 0.3912594418377999, "learning_rate": 3.63471256585618e-06, "loss": 0.7168, "num_input_tokens_seen": 1000341504, "step": 954 }, { "epoch": 1.0685314685314686, "grad_norm": 0.3767266956409575, "learning_rate": 3.632074857065928e-06, "loss": 0.823, "num_input_tokens_seen": 1001390080, "step": 955 }, { "epoch": 1.0696503496503496, "grad_norm": 0.37878442074424423, "learning_rate": 3.6294355620206824e-06, "loss": 0.757, "num_input_tokens_seen": 1002438656, "step": 956 }, { "epoch": 1.0707692307692307, "grad_norm": 0.3958777172720107, "learning_rate": 3.6267946844186023e-06, "loss": 0.6583, "num_input_tokens_seen": 1003487232, "step": 957 }, { "epoch": 1.0718881118881118, "grad_norm": 0.36501406709930273, "learning_rate": 3.6241522279600674e-06, "loss": 0.7008, "num_input_tokens_seen": 1004535808, "step": 958 }, { "epoch": 1.073006993006993, "grad_norm": 0.37025313465714493, "learning_rate": 3.621508196347667e-06, "loss": 0.6306, "num_input_tokens_seen": 1005584384, "step": 959 }, { "epoch": 1.0741258741258741, "grad_norm": 0.39837207121689333, "learning_rate": 3.618862593286199e-06, "loss": 0.7172, "num_input_tokens_seen": 1006632960, "step": 960 }, { "epoch": 1.0752447552447553, "grad_norm": 0.40019409140709555, "learning_rate": 3.6162154224826627e-06, "loss": 0.7115, "num_input_tokens_seen": 1007681536, "step": 961 }, { "epoch": 1.0763636363636364, "grad_norm": 0.3708254775203409, "learning_rate": 3.6135666876462565e-06, "loss": 0.6473, "num_input_tokens_seen": 1008730112, "step": 962 }, { "epoch": 1.0774825174825176, "grad_norm": 0.40920599938147156, "learning_rate": 3.6109163924883668e-06, "loss": 0.6132, "num_input_tokens_seen": 1009778688, "step": 963 }, { "epoch": 1.0786013986013987, "grad_norm": 0.3657671985203965, "learning_rate": 3.6082645407225673e-06, "loss": 0.6439, "num_input_tokens_seen": 1010827264, "step": 964 }, { "epoch": 1.0797202797202796, "grad_norm": 0.42973995781067953, "learning_rate": 3.6056111360646134e-06, "loss": 0.6203, "num_input_tokens_seen": 1011875840, "step": 965 }, { "epoch": 1.0808391608391608, "grad_norm": 0.3919864923600246, "learning_rate": 3.602956182232438e-06, "loss": 0.7, "num_input_tokens_seen": 1012924416, "step": 966 }, { "epoch": 1.081958041958042, "grad_norm": 0.3697542043157021, "learning_rate": 3.60029968294614e-06, "loss": 0.6136, "num_input_tokens_seen": 1013972992, "step": 967 }, { "epoch": 1.083076923076923, "grad_norm": 0.40963987698093207, "learning_rate": 3.5976416419279892e-06, "loss": 0.8232, "num_input_tokens_seen": 1015021568, "step": 968 }, { "epoch": 1.0841958041958042, "grad_norm": 0.3634170054876749, "learning_rate": 3.594982062902412e-06, "loss": 0.6797, "num_input_tokens_seen": 1016070144, "step": 969 }, { "epoch": 1.0853146853146853, "grad_norm": 0.362195279234427, "learning_rate": 3.5923209495959923e-06, "loss": 0.5934, "num_input_tokens_seen": 1017118720, "step": 970 }, { "epoch": 1.0864335664335665, "grad_norm": 0.43423576381368484, "learning_rate": 3.5896583057374607e-06, "loss": 0.663, "num_input_tokens_seen": 1018167296, "step": 971 }, { "epoch": 1.0875524475524476, "grad_norm": 0.4932039589894863, "learning_rate": 3.5869941350576958e-06, "loss": 0.6741, "num_input_tokens_seen": 1019215872, "step": 972 }, { "epoch": 1.0886713286713288, "grad_norm": 0.40922478739753965, "learning_rate": 3.5843284412897127e-06, "loss": 0.5814, "num_input_tokens_seen": 1020264448, "step": 973 }, { "epoch": 1.08979020979021, "grad_norm": 0.41035590628813307, "learning_rate": 3.5816612281686636e-06, "loss": 0.8484, "num_input_tokens_seen": 1021313024, "step": 974 }, { "epoch": 1.0909090909090908, "grad_norm": 0.4130225086557905, "learning_rate": 3.5789924994318267e-06, "loss": 0.7998, "num_input_tokens_seen": 1022361600, "step": 975 }, { "epoch": 1.092027972027972, "grad_norm": 0.39839822952407405, "learning_rate": 3.5763222588186053e-06, "loss": 0.6548, "num_input_tokens_seen": 1023410176, "step": 976 }, { "epoch": 1.0931468531468531, "grad_norm": 0.3899721344304656, "learning_rate": 3.5736505100705223e-06, "loss": 0.6207, "num_input_tokens_seen": 1024458752, "step": 977 }, { "epoch": 1.0942657342657343, "grad_norm": 0.40282143014839866, "learning_rate": 3.5709772569312097e-06, "loss": 0.703, "num_input_tokens_seen": 1025507328, "step": 978 }, { "epoch": 1.0953846153846154, "grad_norm": 0.38400589730069673, "learning_rate": 3.568302503146413e-06, "loss": 0.7447, "num_input_tokens_seen": 1026555904, "step": 979 }, { "epoch": 1.0965034965034965, "grad_norm": 0.4872521373918003, "learning_rate": 3.565626252463977e-06, "loss": 0.6949, "num_input_tokens_seen": 1027604480, "step": 980 }, { "epoch": 1.0976223776223777, "grad_norm": 0.4212222512816792, "learning_rate": 3.5629485086338432e-06, "loss": 0.8627, "num_input_tokens_seen": 1028653056, "step": 981 }, { "epoch": 1.0987412587412588, "grad_norm": 0.4276923255631712, "learning_rate": 3.560269275408048e-06, "loss": 0.6752, "num_input_tokens_seen": 1029701632, "step": 982 }, { "epoch": 1.09986013986014, "grad_norm": 0.3683102111529947, "learning_rate": 3.557588556540712e-06, "loss": 0.6785, "num_input_tokens_seen": 1030750208, "step": 983 }, { "epoch": 1.100979020979021, "grad_norm": 0.4280022163998382, "learning_rate": 3.554906355788041e-06, "loss": 0.6627, "num_input_tokens_seen": 1031798784, "step": 984 }, { "epoch": 1.102097902097902, "grad_norm": 0.36224336692916786, "learning_rate": 3.552222676908313e-06, "loss": 0.7507, "num_input_tokens_seen": 1032847360, "step": 985 }, { "epoch": 1.1032167832167832, "grad_norm": 0.3898411736782322, "learning_rate": 3.5495375236618795e-06, "loss": 0.819, "num_input_tokens_seen": 1033895936, "step": 986 }, { "epoch": 1.1043356643356643, "grad_norm": 0.39216183125807946, "learning_rate": 3.5468508998111596e-06, "loss": 0.7001, "num_input_tokens_seen": 1034944512, "step": 987 }, { "epoch": 1.1054545454545455, "grad_norm": 0.4768759134463784, "learning_rate": 3.5441628091206276e-06, "loss": 0.6529, "num_input_tokens_seen": 1035993088, "step": 988 }, { "epoch": 1.1065734265734266, "grad_norm": 0.35988437142543966, "learning_rate": 3.5414732553568194e-06, "loss": 0.5991, "num_input_tokens_seen": 1037041664, "step": 989 }, { "epoch": 1.1076923076923078, "grad_norm": 0.40463775499748805, "learning_rate": 3.538782242288316e-06, "loss": 0.6535, "num_input_tokens_seen": 1038090240, "step": 990 }, { "epoch": 1.108811188811189, "grad_norm": 0.4349805636814759, "learning_rate": 3.5360897736857464e-06, "loss": 0.8031, "num_input_tokens_seen": 1039138816, "step": 991 }, { "epoch": 1.10993006993007, "grad_norm": 0.4111574707765291, "learning_rate": 3.533395853321778e-06, "loss": 0.7237, "num_input_tokens_seen": 1040187392, "step": 992 }, { "epoch": 1.111048951048951, "grad_norm": 0.4668219796784991, "learning_rate": 3.5307004849711114e-06, "loss": 0.5778, "num_input_tokens_seen": 1041235968, "step": 993 }, { "epoch": 1.112167832167832, "grad_norm": 0.4265203818499569, "learning_rate": 3.528003672410477e-06, "loss": 0.6749, "num_input_tokens_seen": 1042284544, "step": 994 }, { "epoch": 1.1132867132867132, "grad_norm": 0.3740029880371878, "learning_rate": 3.5253054194186297e-06, "loss": 0.7667, "num_input_tokens_seen": 1043333120, "step": 995 }, { "epoch": 1.1144055944055944, "grad_norm": 0.6038102205639235, "learning_rate": 3.5226057297763407e-06, "loss": 0.6976, "num_input_tokens_seen": 1044381696, "step": 996 }, { "epoch": 1.1155244755244755, "grad_norm": 0.41979725560408115, "learning_rate": 3.5199046072663968e-06, "loss": 0.7189, "num_input_tokens_seen": 1045430272, "step": 997 }, { "epoch": 1.1166433566433567, "grad_norm": 0.4523653927527875, "learning_rate": 3.5172020556735897e-06, "loss": 0.787, "num_input_tokens_seen": 1046478848, "step": 998 }, { "epoch": 1.1177622377622378, "grad_norm": 0.4079335825539162, "learning_rate": 3.5144980787847155e-06, "loss": 0.6373, "num_input_tokens_seen": 1047527424, "step": 999 }, { "epoch": 1.118881118881119, "grad_norm": 0.3778375811057347, "learning_rate": 3.511792680388567e-06, "loss": 0.7507, "num_input_tokens_seen": 1048576000, "step": 1000 }, { "epoch": 1.118881118881119, "eval_loss": 0.7350466847419739, "eval_runtime": 246.8721, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 1048576000, "step": 1000 }, { "epoch": 1.12, "grad_norm": 0.3982167664899806, "learning_rate": 3.5090858642759273e-06, "loss": 0.6723, "num_input_tokens_seen": 1049624576, "step": 1001 }, { "epoch": 1.121118881118881, "grad_norm": 0.44947430147326956, "learning_rate": 3.5063776342395693e-06, "loss": 0.6434, "num_input_tokens_seen": 1050673152, "step": 1002 }, { "epoch": 1.1222377622377622, "grad_norm": 0.3946185785079117, "learning_rate": 3.503667994074244e-06, "loss": 0.758, "num_input_tokens_seen": 1051721728, "step": 1003 }, { "epoch": 1.1233566433566433, "grad_norm": 0.3642452421825928, "learning_rate": 3.5009569475766793e-06, "loss": 0.7988, "num_input_tokens_seen": 1052770304, "step": 1004 }, { "epoch": 1.1244755244755245, "grad_norm": 0.395633426079698, "learning_rate": 3.4982444985455744e-06, "loss": 0.676, "num_input_tokens_seen": 1053818880, "step": 1005 }, { "epoch": 1.1255944055944056, "grad_norm": 0.382872669159488, "learning_rate": 3.495530650781591e-06, "loss": 0.6572, "num_input_tokens_seen": 1054867456, "step": 1006 }, { "epoch": 1.1267132867132867, "grad_norm": 0.4121297648889241, "learning_rate": 3.4928154080873556e-06, "loss": 0.6558, "num_input_tokens_seen": 1055916032, "step": 1007 }, { "epoch": 1.1278321678321679, "grad_norm": 0.3994470472699709, "learning_rate": 3.490098774267444e-06, "loss": 0.8024, "num_input_tokens_seen": 1056964608, "step": 1008 }, { "epoch": 1.128951048951049, "grad_norm": 0.37777901802286, "learning_rate": 3.487380753128385e-06, "loss": 0.7818, "num_input_tokens_seen": 1058013184, "step": 1009 }, { "epoch": 1.1300699300699302, "grad_norm": 0.37762541018564344, "learning_rate": 3.4846613484786497e-06, "loss": 0.8533, "num_input_tokens_seen": 1059061760, "step": 1010 }, { "epoch": 1.131188811188811, "grad_norm": 0.4358640676248289, "learning_rate": 3.4819405641286476e-06, "loss": 0.6416, "num_input_tokens_seen": 1060110336, "step": 1011 }, { "epoch": 1.1323076923076922, "grad_norm": 0.35496020793558963, "learning_rate": 3.4792184038907212e-06, "loss": 0.6771, "num_input_tokens_seen": 1061158912, "step": 1012 }, { "epoch": 1.1334265734265734, "grad_norm": 0.39826379587440786, "learning_rate": 3.4764948715791425e-06, "loss": 0.6969, "num_input_tokens_seen": 1062207488, "step": 1013 }, { "epoch": 1.1345454545454545, "grad_norm": 0.404345326133934, "learning_rate": 3.473769971010105e-06, "loss": 0.8173, "num_input_tokens_seen": 1063256064, "step": 1014 }, { "epoch": 1.1356643356643357, "grad_norm": 0.3464748636959376, "learning_rate": 3.471043706001719e-06, "loss": 0.6521, "num_input_tokens_seen": 1064304640, "step": 1015 }, { "epoch": 1.1367832167832168, "grad_norm": 0.43714677931657137, "learning_rate": 3.468316080374007e-06, "loss": 0.6838, "num_input_tokens_seen": 1065353216, "step": 1016 }, { "epoch": 1.137902097902098, "grad_norm": 0.36238752863683965, "learning_rate": 3.465587097948898e-06, "loss": 0.7206, "num_input_tokens_seen": 1066401792, "step": 1017 }, { "epoch": 1.139020979020979, "grad_norm": 0.37541580577786265, "learning_rate": 3.462856762550223e-06, "loss": 0.6745, "num_input_tokens_seen": 1067450368, "step": 1018 }, { "epoch": 1.1401398601398602, "grad_norm": 0.3838930611999155, "learning_rate": 3.4601250780037064e-06, "loss": 0.8624, "num_input_tokens_seen": 1068498944, "step": 1019 }, { "epoch": 1.1412587412587412, "grad_norm": 0.38464401419830857, "learning_rate": 3.4573920481369666e-06, "loss": 0.6621, "num_input_tokens_seen": 1069547520, "step": 1020 }, { "epoch": 1.1423776223776223, "grad_norm": 0.3992401036384657, "learning_rate": 3.4546576767795036e-06, "loss": 0.7106, "num_input_tokens_seen": 1070596096, "step": 1021 }, { "epoch": 1.1434965034965034, "grad_norm": 0.40746973500385775, "learning_rate": 3.4519219677626986e-06, "loss": 0.6669, "num_input_tokens_seen": 1071644672, "step": 1022 }, { "epoch": 1.1446153846153846, "grad_norm": 0.4669749338911947, "learning_rate": 3.4491849249198074e-06, "loss": 0.7286, "num_input_tokens_seen": 1072693248, "step": 1023 }, { "epoch": 1.1457342657342657, "grad_norm": 0.40481819543611697, "learning_rate": 3.446446552085954e-06, "loss": 0.7738, "num_input_tokens_seen": 1073741824, "step": 1024 }, { "epoch": 1.1468531468531469, "grad_norm": 0.3772784571867631, "learning_rate": 3.4437068530981266e-06, "loss": 0.7004, "num_input_tokens_seen": 1074790400, "step": 1025 }, { "epoch": 1.147972027972028, "grad_norm": 0.3696109555970487, "learning_rate": 3.4409658317951717e-06, "loss": 0.7089, "num_input_tokens_seen": 1075838976, "step": 1026 }, { "epoch": 1.1490909090909092, "grad_norm": 0.4159421989880179, "learning_rate": 3.438223492017787e-06, "loss": 0.6312, "num_input_tokens_seen": 1076887552, "step": 1027 }, { "epoch": 1.1502097902097903, "grad_norm": 0.4007665622364079, "learning_rate": 3.435479837608521e-06, "loss": 0.7368, "num_input_tokens_seen": 1077936128, "step": 1028 }, { "epoch": 1.1513286713286712, "grad_norm": 0.4165485823739524, "learning_rate": 3.432734872411761e-06, "loss": 0.7439, "num_input_tokens_seen": 1078984704, "step": 1029 }, { "epoch": 1.1524475524475524, "grad_norm": 0.40585646047992713, "learning_rate": 3.4299886002737313e-06, "loss": 0.8508, "num_input_tokens_seen": 1080033280, "step": 1030 }, { "epoch": 1.1535664335664335, "grad_norm": 0.38007249997000436, "learning_rate": 3.4272410250424893e-06, "loss": 0.6596, "num_input_tokens_seen": 1081081856, "step": 1031 }, { "epoch": 1.1546853146853147, "grad_norm": 0.37996596261470683, "learning_rate": 3.4244921505679175e-06, "loss": 0.7512, "num_input_tokens_seen": 1082130432, "step": 1032 }, { "epoch": 1.1558041958041958, "grad_norm": 0.35733658585688005, "learning_rate": 3.4217419807017177e-06, "loss": 0.6346, "num_input_tokens_seen": 1083179008, "step": 1033 }, { "epoch": 1.156923076923077, "grad_norm": 0.40060174005712884, "learning_rate": 3.4189905192974087e-06, "loss": 0.8083, "num_input_tokens_seen": 1084227584, "step": 1034 }, { "epoch": 1.158041958041958, "grad_norm": 0.362887386082327, "learning_rate": 3.416237770210317e-06, "loss": 0.6709, "num_input_tokens_seen": 1085276160, "step": 1035 }, { "epoch": 1.1591608391608392, "grad_norm": 0.38962592087168535, "learning_rate": 3.413483737297576e-06, "loss": 0.7477, "num_input_tokens_seen": 1086324736, "step": 1036 }, { "epoch": 1.1602797202797204, "grad_norm": 0.44114622021009525, "learning_rate": 3.4107284244181154e-06, "loss": 0.6706, "num_input_tokens_seen": 1087373312, "step": 1037 }, { "epoch": 1.1613986013986013, "grad_norm": 0.3919463274381358, "learning_rate": 3.4079718354326583e-06, "loss": 0.8134, "num_input_tokens_seen": 1088421888, "step": 1038 }, { "epoch": 1.1625174825174824, "grad_norm": 0.382945181591034, "learning_rate": 3.40521397420372e-06, "loss": 0.5733, "num_input_tokens_seen": 1089470464, "step": 1039 }, { "epoch": 1.1636363636363636, "grad_norm": 0.4414304822999407, "learning_rate": 3.402454844595593e-06, "loss": 0.9083, "num_input_tokens_seen": 1090519040, "step": 1040 }, { "epoch": 1.1647552447552447, "grad_norm": 0.38598977613080476, "learning_rate": 3.39969445047435e-06, "loss": 0.677, "num_input_tokens_seen": 1091567616, "step": 1041 }, { "epoch": 1.1658741258741259, "grad_norm": 0.4146172533987681, "learning_rate": 3.396932795707836e-06, "loss": 0.628, "num_input_tokens_seen": 1092616192, "step": 1042 }, { "epoch": 1.166993006993007, "grad_norm": 0.4157079599704548, "learning_rate": 3.3941698841656594e-06, "loss": 0.6666, "num_input_tokens_seen": 1093664768, "step": 1043 }, { "epoch": 1.1681118881118882, "grad_norm": 0.3855958627869479, "learning_rate": 3.3914057197191936e-06, "loss": 0.7819, "num_input_tokens_seen": 1094713344, "step": 1044 }, { "epoch": 1.1692307692307693, "grad_norm": 0.37576827071971775, "learning_rate": 3.3886403062415653e-06, "loss": 0.6897, "num_input_tokens_seen": 1095761920, "step": 1045 }, { "epoch": 1.1703496503496504, "grad_norm": 0.4688700424637654, "learning_rate": 3.3858736476076503e-06, "loss": 0.6811, "num_input_tokens_seen": 1096810496, "step": 1046 }, { "epoch": 1.1714685314685314, "grad_norm": 0.3723445710649299, "learning_rate": 3.3831057476940716e-06, "loss": 0.6543, "num_input_tokens_seen": 1097859072, "step": 1047 }, { "epoch": 1.1725874125874125, "grad_norm": 0.3771332736780671, "learning_rate": 3.3803366103791892e-06, "loss": 0.616, "num_input_tokens_seen": 1098907648, "step": 1048 }, { "epoch": 1.1737062937062936, "grad_norm": 0.37632440028939584, "learning_rate": 3.3775662395431e-06, "loss": 0.7484, "num_input_tokens_seen": 1099956224, "step": 1049 }, { "epoch": 1.1748251748251748, "grad_norm": 0.3467121787123361, "learning_rate": 3.3747946390676246e-06, "loss": 0.6972, "num_input_tokens_seen": 1101004800, "step": 1050 }, { "epoch": 1.175944055944056, "grad_norm": 0.3996162441109318, "learning_rate": 3.372021812836311e-06, "loss": 0.8078, "num_input_tokens_seen": 1102053376, "step": 1051 }, { "epoch": 1.177062937062937, "grad_norm": 0.37663336122230257, "learning_rate": 3.369247764734424e-06, "loss": 0.6498, "num_input_tokens_seen": 1103101952, "step": 1052 }, { "epoch": 1.1781818181818182, "grad_norm": 0.531199299416399, "learning_rate": 3.3664724986489368e-06, "loss": 0.7215, "num_input_tokens_seen": 1104150528, "step": 1053 }, { "epoch": 1.1793006993006994, "grad_norm": 0.38506703211295074, "learning_rate": 3.363696018468534e-06, "loss": 0.7566, "num_input_tokens_seen": 1105199104, "step": 1054 }, { "epoch": 1.1804195804195805, "grad_norm": 0.36619024879545065, "learning_rate": 3.360918328083598e-06, "loss": 0.7554, "num_input_tokens_seen": 1106247680, "step": 1055 }, { "epoch": 1.1815384615384614, "grad_norm": 0.491355055998027, "learning_rate": 3.3581394313862094e-06, "loss": 0.7479, "num_input_tokens_seen": 1107296256, "step": 1056 }, { "epoch": 1.1826573426573426, "grad_norm": 0.3937026076746143, "learning_rate": 3.3553593322701374e-06, "loss": 0.7084, "num_input_tokens_seen": 1108344832, "step": 1057 }, { "epoch": 1.1837762237762237, "grad_norm": 0.37805536779239934, "learning_rate": 3.3525780346308354e-06, "loss": 0.6758, "num_input_tokens_seen": 1109393408, "step": 1058 }, { "epoch": 1.1848951048951049, "grad_norm": 0.3966751008188965, "learning_rate": 3.3497955423654395e-06, "loss": 0.7811, "num_input_tokens_seen": 1110441984, "step": 1059 }, { "epoch": 1.186013986013986, "grad_norm": 0.5410875148924355, "learning_rate": 3.3470118593727557e-06, "loss": 0.6947, "num_input_tokens_seen": 1111490560, "step": 1060 }, { "epoch": 1.1871328671328671, "grad_norm": 0.36833423832314116, "learning_rate": 3.3442269895532604e-06, "loss": 0.6182, "num_input_tokens_seen": 1112539136, "step": 1061 }, { "epoch": 1.1882517482517483, "grad_norm": 0.3931089568671557, "learning_rate": 3.3414409368090932e-06, "loss": 0.6655, "num_input_tokens_seen": 1113587712, "step": 1062 }, { "epoch": 1.1893706293706294, "grad_norm": 0.39874464237478163, "learning_rate": 3.338653705044051e-06, "loss": 0.6562, "num_input_tokens_seen": 1114636288, "step": 1063 }, { "epoch": 1.1904895104895106, "grad_norm": 0.4007143687922427, "learning_rate": 3.3358652981635826e-06, "loss": 0.838, "num_input_tokens_seen": 1115684864, "step": 1064 }, { "epoch": 1.1916083916083915, "grad_norm": 0.5334473849612689, "learning_rate": 3.3330757200747828e-06, "loss": 0.7933, "num_input_tokens_seen": 1116733440, "step": 1065 }, { "epoch": 1.1927272727272726, "grad_norm": 0.408757459946148, "learning_rate": 3.3302849746863873e-06, "loss": 0.5994, "num_input_tokens_seen": 1117782016, "step": 1066 }, { "epoch": 1.1938461538461538, "grad_norm": 0.9217401803258203, "learning_rate": 3.3274930659087694e-06, "loss": 0.6728, "num_input_tokens_seen": 1118830592, "step": 1067 }, { "epoch": 1.194965034965035, "grad_norm": 0.39529159494203475, "learning_rate": 3.3246999976539315e-06, "loss": 0.7122, "num_input_tokens_seen": 1119879168, "step": 1068 }, { "epoch": 1.196083916083916, "grad_norm": 0.38934017982849317, "learning_rate": 3.321905773835498e-06, "loss": 0.6045, "num_input_tokens_seen": 1120927744, "step": 1069 }, { "epoch": 1.1972027972027972, "grad_norm": 0.4231675850459808, "learning_rate": 3.319110398368718e-06, "loss": 0.564, "num_input_tokens_seen": 1121976320, "step": 1070 }, { "epoch": 1.1983216783216784, "grad_norm": 0.3645011553213066, "learning_rate": 3.316313875170449e-06, "loss": 0.7708, "num_input_tokens_seen": 1123024896, "step": 1071 }, { "epoch": 1.1994405594405595, "grad_norm": 0.3833893289682461, "learning_rate": 3.3135162081591592e-06, "loss": 0.7236, "num_input_tokens_seen": 1124073472, "step": 1072 }, { "epoch": 1.2005594405594406, "grad_norm": 0.4350375464030555, "learning_rate": 3.310717401254919e-06, "loss": 0.8704, "num_input_tokens_seen": 1125122048, "step": 1073 }, { "epoch": 1.2016783216783216, "grad_norm": 0.37676963314633505, "learning_rate": 3.307917458379397e-06, "loss": 0.6604, "num_input_tokens_seen": 1126170624, "step": 1074 }, { "epoch": 1.2027972027972027, "grad_norm": 0.3911805203267056, "learning_rate": 3.305116383455852e-06, "loss": 0.7461, "num_input_tokens_seen": 1127219200, "step": 1075 }, { "epoch": 1.2039160839160838, "grad_norm": 0.39122411237996946, "learning_rate": 3.3023141804091295e-06, "loss": 0.709, "num_input_tokens_seen": 1128267776, "step": 1076 }, { "epoch": 1.205034965034965, "grad_norm": 0.38494633470346945, "learning_rate": 3.2995108531656566e-06, "loss": 0.6079, "num_input_tokens_seen": 1129316352, "step": 1077 }, { "epoch": 1.2061538461538461, "grad_norm": 0.3891217520437993, "learning_rate": 3.2967064056534342e-06, "loss": 0.6995, "num_input_tokens_seen": 1130364928, "step": 1078 }, { "epoch": 1.2072727272727273, "grad_norm": 0.4011475526260584, "learning_rate": 3.2939008418020334e-06, "loss": 0.8537, "num_input_tokens_seen": 1131413504, "step": 1079 }, { "epoch": 1.2083916083916084, "grad_norm": 0.42432768048421776, "learning_rate": 3.2910941655425903e-06, "loss": 0.7067, "num_input_tokens_seen": 1132462080, "step": 1080 }, { "epoch": 1.2095104895104896, "grad_norm": 0.3510823750838042, "learning_rate": 3.2882863808077993e-06, "loss": 0.706, "num_input_tokens_seen": 1133510656, "step": 1081 }, { "epoch": 1.2106293706293707, "grad_norm": 0.41950287313950335, "learning_rate": 3.285477491531908e-06, "loss": 0.7535, "num_input_tokens_seen": 1134559232, "step": 1082 }, { "epoch": 1.2117482517482516, "grad_norm": 0.36034039478042373, "learning_rate": 3.2826675016507094e-06, "loss": 0.6834, "num_input_tokens_seen": 1135607808, "step": 1083 }, { "epoch": 1.212867132867133, "grad_norm": 0.36627896198864374, "learning_rate": 3.279856415101543e-06, "loss": 0.6554, "num_input_tokens_seen": 1136656384, "step": 1084 }, { "epoch": 1.213986013986014, "grad_norm": 0.39480109813941194, "learning_rate": 3.277044235823281e-06, "loss": 0.7024, "num_input_tokens_seen": 1137704960, "step": 1085 }, { "epoch": 1.215104895104895, "grad_norm": 0.3963697036078931, "learning_rate": 3.2742309677563307e-06, "loss": 0.7157, "num_input_tokens_seen": 1138753536, "step": 1086 }, { "epoch": 1.2162237762237762, "grad_norm": 0.37364694630530343, "learning_rate": 3.2714166148426204e-06, "loss": 0.7182, "num_input_tokens_seen": 1139802112, "step": 1087 }, { "epoch": 1.2173426573426573, "grad_norm": 0.38615470074976294, "learning_rate": 3.2686011810256023e-06, "loss": 0.6355, "num_input_tokens_seen": 1140850688, "step": 1088 }, { "epoch": 1.2184615384615385, "grad_norm": 0.3482024144994338, "learning_rate": 3.2657846702502404e-06, "loss": 0.6785, "num_input_tokens_seen": 1141899264, "step": 1089 }, { "epoch": 1.2195804195804196, "grad_norm": 0.39856303402038784, "learning_rate": 3.2629670864630104e-06, "loss": 0.7554, "num_input_tokens_seen": 1142947840, "step": 1090 }, { "epoch": 1.2206993006993008, "grad_norm": 0.8956480433089458, "learning_rate": 3.2601484336118887e-06, "loss": 0.6407, "num_input_tokens_seen": 1143996416, "step": 1091 }, { "epoch": 1.221818181818182, "grad_norm": 0.4030338251195053, "learning_rate": 3.257328715646351e-06, "loss": 0.6531, "num_input_tokens_seen": 1145044992, "step": 1092 }, { "epoch": 1.222937062937063, "grad_norm": 0.42981845353289466, "learning_rate": 3.2545079365173672e-06, "loss": 0.8666, "num_input_tokens_seen": 1146093568, "step": 1093 }, { "epoch": 1.224055944055944, "grad_norm": 0.4987524528640455, "learning_rate": 3.2516861001773904e-06, "loss": 0.6811, "num_input_tokens_seen": 1147142144, "step": 1094 }, { "epoch": 1.2251748251748251, "grad_norm": 0.4084171929018349, "learning_rate": 3.248863210580358e-06, "loss": 0.7283, "num_input_tokens_seen": 1148190720, "step": 1095 }, { "epoch": 1.2262937062937063, "grad_norm": 0.4014176222734445, "learning_rate": 3.2460392716816826e-06, "loss": 0.6472, "num_input_tokens_seen": 1149239296, "step": 1096 }, { "epoch": 1.2274125874125874, "grad_norm": 0.4436256998898884, "learning_rate": 3.2432142874382442e-06, "loss": 0.6829, "num_input_tokens_seen": 1150287872, "step": 1097 }, { "epoch": 1.2285314685314686, "grad_norm": 0.41621506369794103, "learning_rate": 3.240388261808394e-06, "loss": 0.5833, "num_input_tokens_seen": 1151336448, "step": 1098 }, { "epoch": 1.2296503496503497, "grad_norm": 0.40837565599353337, "learning_rate": 3.237561198751935e-06, "loss": 0.6781, "num_input_tokens_seen": 1152385024, "step": 1099 }, { "epoch": 1.2307692307692308, "grad_norm": 0.4410875795840239, "learning_rate": 3.2347331022301293e-06, "loss": 0.6834, "num_input_tokens_seen": 1153433600, "step": 1100 }, { "epoch": 1.231888111888112, "grad_norm": 0.40659722560244227, "learning_rate": 3.231903976205684e-06, "loss": 0.6747, "num_input_tokens_seen": 1154482176, "step": 1101 }, { "epoch": 1.2330069930069931, "grad_norm": 0.40501007684234214, "learning_rate": 3.2290738246427494e-06, "loss": 0.6939, "num_input_tokens_seen": 1155530752, "step": 1102 }, { "epoch": 1.234125874125874, "grad_norm": 0.37093164382436, "learning_rate": 3.2262426515069144e-06, "loss": 0.6245, "num_input_tokens_seen": 1156579328, "step": 1103 }, { "epoch": 1.2352447552447552, "grad_norm": 0.47122695642039725, "learning_rate": 3.223410460765198e-06, "loss": 0.7372, "num_input_tokens_seen": 1157627904, "step": 1104 }, { "epoch": 1.2363636363636363, "grad_norm": 0.4121053157092799, "learning_rate": 3.220577256386043e-06, "loss": 0.66, "num_input_tokens_seen": 1158676480, "step": 1105 }, { "epoch": 1.2374825174825175, "grad_norm": 0.4009988226068995, "learning_rate": 3.217743042339318e-06, "loss": 0.7805, "num_input_tokens_seen": 1159725056, "step": 1106 }, { "epoch": 1.2386013986013986, "grad_norm": 0.3819147507663703, "learning_rate": 3.2149078225963e-06, "loss": 0.7018, "num_input_tokens_seen": 1160773632, "step": 1107 }, { "epoch": 1.2397202797202798, "grad_norm": 0.3894032839699649, "learning_rate": 3.2120716011296794e-06, "loss": 0.6365, "num_input_tokens_seen": 1161822208, "step": 1108 }, { "epoch": 1.240839160839161, "grad_norm": 0.3983760937431603, "learning_rate": 3.2092343819135485e-06, "loss": 0.7791, "num_input_tokens_seen": 1162870784, "step": 1109 }, { "epoch": 1.241958041958042, "grad_norm": 0.4379527290113371, "learning_rate": 3.206396168923398e-06, "loss": 0.7503, "num_input_tokens_seen": 1163919360, "step": 1110 }, { "epoch": 1.2430769230769232, "grad_norm": 0.3916899041537578, "learning_rate": 3.203556966136113e-06, "loss": 0.6576, "num_input_tokens_seen": 1164967936, "step": 1111 }, { "epoch": 1.2441958041958041, "grad_norm": 0.389914285381746, "learning_rate": 3.2007167775299613e-06, "loss": 0.7734, "num_input_tokens_seen": 1166016512, "step": 1112 }, { "epoch": 1.2453146853146853, "grad_norm": 0.38147675073855564, "learning_rate": 3.197875607084595e-06, "loss": 0.7638, "num_input_tokens_seen": 1167065088, "step": 1113 }, { "epoch": 1.2464335664335664, "grad_norm": 0.39815210068197937, "learning_rate": 3.195033458781042e-06, "loss": 0.6899, "num_input_tokens_seen": 1168113664, "step": 1114 }, { "epoch": 1.2475524475524475, "grad_norm": 0.5826200641306831, "learning_rate": 3.192190336601698e-06, "loss": 0.7669, "num_input_tokens_seen": 1169162240, "step": 1115 }, { "epoch": 1.2486713286713287, "grad_norm": 0.38332515964553093, "learning_rate": 3.189346244530327e-06, "loss": 0.6825, "num_input_tokens_seen": 1170210816, "step": 1116 }, { "epoch": 1.2497902097902098, "grad_norm": 0.3923091232261296, "learning_rate": 3.18650118655205e-06, "loss": 0.6848, "num_input_tokens_seen": 1171259392, "step": 1117 }, { "epoch": 1.250909090909091, "grad_norm": 0.3586280712488189, "learning_rate": 3.183655166653339e-06, "loss": 0.5578, "num_input_tokens_seen": 1172307968, "step": 1118 }, { "epoch": 1.252027972027972, "grad_norm": 0.38109069640754506, "learning_rate": 3.180808188822019e-06, "loss": 0.6381, "num_input_tokens_seen": 1173356544, "step": 1119 }, { "epoch": 1.2531468531468533, "grad_norm": 0.3837090392066703, "learning_rate": 3.177960257047252e-06, "loss": 0.631, "num_input_tokens_seen": 1174405120, "step": 1120 }, { "epoch": 1.2542657342657342, "grad_norm": 0.38619975883769725, "learning_rate": 3.175111375319541e-06, "loss": 0.7872, "num_input_tokens_seen": 1175453696, "step": 1121 }, { "epoch": 1.2553846153846153, "grad_norm": 0.39719268540693575, "learning_rate": 3.1722615476307173e-06, "loss": 0.7283, "num_input_tokens_seen": 1176502272, "step": 1122 }, { "epoch": 1.2565034965034965, "grad_norm": 0.38516696665471906, "learning_rate": 3.1694107779739394e-06, "loss": 0.767, "num_input_tokens_seen": 1177550848, "step": 1123 }, { "epoch": 1.2576223776223776, "grad_norm": 0.386145015275318, "learning_rate": 3.1665590703436843e-06, "loss": 0.7658, "num_input_tokens_seen": 1178599424, "step": 1124 }, { "epoch": 1.2587412587412588, "grad_norm": 0.4433908889958929, "learning_rate": 3.1637064287357433e-06, "loss": 0.6358, "num_input_tokens_seen": 1179648000, "step": 1125 }, { "epoch": 1.25986013986014, "grad_norm": 0.38754199824542584, "learning_rate": 3.1608528571472174e-06, "loss": 0.7587, "num_input_tokens_seen": 1180696576, "step": 1126 }, { "epoch": 1.260979020979021, "grad_norm": 0.3932886601445186, "learning_rate": 3.1579983595765107e-06, "loss": 0.6578, "num_input_tokens_seen": 1181745152, "step": 1127 }, { "epoch": 1.2620979020979022, "grad_norm": 0.34770129812295, "learning_rate": 3.1551429400233235e-06, "loss": 0.6376, "num_input_tokens_seen": 1182793728, "step": 1128 }, { "epoch": 1.2632167832167833, "grad_norm": 0.3820975992966151, "learning_rate": 3.1522866024886497e-06, "loss": 0.8811, "num_input_tokens_seen": 1183842304, "step": 1129 }, { "epoch": 1.2643356643356642, "grad_norm": 0.4093774036603509, "learning_rate": 3.149429350974767e-06, "loss": 0.7592, "num_input_tokens_seen": 1184890880, "step": 1130 }, { "epoch": 1.2654545454545454, "grad_norm": 0.3863572983675206, "learning_rate": 3.1465711894852364e-06, "loss": 0.6601, "num_input_tokens_seen": 1185939456, "step": 1131 }, { "epoch": 1.2665734265734265, "grad_norm": 0.41479477928832975, "learning_rate": 3.143712122024893e-06, "loss": 0.686, "num_input_tokens_seen": 1186988032, "step": 1132 }, { "epoch": 1.2676923076923077, "grad_norm": 0.4812416645382523, "learning_rate": 3.1408521525998403e-06, "loss": 0.7381, "num_input_tokens_seen": 1188036608, "step": 1133 }, { "epoch": 1.2688111888111888, "grad_norm": 0.35719358634951914, "learning_rate": 3.1379912852174477e-06, "loss": 0.6579, "num_input_tokens_seen": 1189085184, "step": 1134 }, { "epoch": 1.26993006993007, "grad_norm": 0.37089889418565936, "learning_rate": 3.135129523886341e-06, "loss": 0.7443, "num_input_tokens_seen": 1190133760, "step": 1135 }, { "epoch": 1.271048951048951, "grad_norm": 0.41289836031824484, "learning_rate": 3.1322668726163983e-06, "loss": 0.697, "num_input_tokens_seen": 1191182336, "step": 1136 }, { "epoch": 1.2721678321678322, "grad_norm": 0.41325803099429803, "learning_rate": 3.129403335418747e-06, "loss": 0.8191, "num_input_tokens_seen": 1192230912, "step": 1137 }, { "epoch": 1.2732867132867134, "grad_norm": 0.8137798988700256, "learning_rate": 3.1265389163057537e-06, "loss": 0.8157, "num_input_tokens_seen": 1193279488, "step": 1138 }, { "epoch": 1.2744055944055943, "grad_norm": 0.42222454987854774, "learning_rate": 3.123673619291021e-06, "loss": 0.7923, "num_input_tokens_seen": 1194328064, "step": 1139 }, { "epoch": 1.2755244755244755, "grad_norm": 0.3844422902641973, "learning_rate": 3.1208074483893833e-06, "loss": 0.6936, "num_input_tokens_seen": 1195376640, "step": 1140 }, { "epoch": 1.2766433566433566, "grad_norm": 0.45062931174364823, "learning_rate": 3.1179404076168983e-06, "loss": 0.6132, "num_input_tokens_seen": 1196425216, "step": 1141 }, { "epoch": 1.2777622377622377, "grad_norm": 0.3777741275885734, "learning_rate": 3.115072500990841e-06, "loss": 0.7471, "num_input_tokens_seen": 1197473792, "step": 1142 }, { "epoch": 1.2788811188811189, "grad_norm": 0.3823355299369143, "learning_rate": 3.1122037325297027e-06, "loss": 0.6598, "num_input_tokens_seen": 1198522368, "step": 1143 }, { "epoch": 1.28, "grad_norm": 0.3729813921966533, "learning_rate": 3.1093341062531797e-06, "loss": 0.7787, "num_input_tokens_seen": 1199570944, "step": 1144 }, { "epoch": 1.2811188811188812, "grad_norm": 0.37503079167689574, "learning_rate": 3.1064636261821716e-06, "loss": 0.6321, "num_input_tokens_seen": 1200619520, "step": 1145 }, { "epoch": 1.2822377622377623, "grad_norm": 0.36404767363897766, "learning_rate": 3.103592296338775e-06, "loss": 0.8964, "num_input_tokens_seen": 1201668096, "step": 1146 }, { "epoch": 1.2833566433566435, "grad_norm": 0.35914664756667386, "learning_rate": 3.1007201207462745e-06, "loss": 0.5977, "num_input_tokens_seen": 1202716672, "step": 1147 }, { "epoch": 1.2844755244755244, "grad_norm": 0.4489562584249548, "learning_rate": 3.097847103429143e-06, "loss": 0.771, "num_input_tokens_seen": 1203765248, "step": 1148 }, { "epoch": 1.2855944055944055, "grad_norm": 0.3866224783172401, "learning_rate": 3.09497324841303e-06, "loss": 0.8276, "num_input_tokens_seen": 1204813824, "step": 1149 }, { "epoch": 1.2867132867132867, "grad_norm": 0.40694745997128623, "learning_rate": 3.092098559724761e-06, "loss": 0.8319, "num_input_tokens_seen": 1205862400, "step": 1150 }, { "epoch": 1.2878321678321678, "grad_norm": 0.42120551931707056, "learning_rate": 3.089223041392329e-06, "loss": 0.7226, "num_input_tokens_seen": 1206910976, "step": 1151 }, { "epoch": 1.288951048951049, "grad_norm": 0.3687209670875961, "learning_rate": 3.086346697444888e-06, "loss": 0.7661, "num_input_tokens_seen": 1207959552, "step": 1152 }, { "epoch": 1.29006993006993, "grad_norm": 0.36545593303349583, "learning_rate": 3.0834695319127516e-06, "loss": 0.665, "num_input_tokens_seen": 1209008128, "step": 1153 }, { "epoch": 1.2911888111888112, "grad_norm": 0.38133649061814295, "learning_rate": 3.080591548827382e-06, "loss": 0.8001, "num_input_tokens_seen": 1210056704, "step": 1154 }, { "epoch": 1.2923076923076924, "grad_norm": 0.400693956200376, "learning_rate": 3.077712752221388e-06, "loss": 0.823, "num_input_tokens_seen": 1211105280, "step": 1155 }, { "epoch": 1.2934265734265735, "grad_norm": 0.3815227941977192, "learning_rate": 3.074833146128519e-06, "loss": 0.8342, "num_input_tokens_seen": 1212153856, "step": 1156 }, { "epoch": 1.2945454545454544, "grad_norm": 0.46707688059686137, "learning_rate": 3.0719527345836568e-06, "loss": 0.6794, "num_input_tokens_seen": 1213202432, "step": 1157 }, { "epoch": 1.2956643356643356, "grad_norm": 0.42473556793117495, "learning_rate": 3.0690715216228143e-06, "loss": 0.7232, "num_input_tokens_seen": 1214251008, "step": 1158 }, { "epoch": 1.2967832167832167, "grad_norm": 0.35652447681587524, "learning_rate": 3.066189511283126e-06, "loss": 0.6864, "num_input_tokens_seen": 1215299584, "step": 1159 }, { "epoch": 1.2979020979020979, "grad_norm": 0.5285181186268111, "learning_rate": 3.063306707602842e-06, "loss": 0.6201, "num_input_tokens_seen": 1216348160, "step": 1160 }, { "epoch": 1.299020979020979, "grad_norm": 0.4032512123737385, "learning_rate": 3.0604231146213276e-06, "loss": 0.7225, "num_input_tokens_seen": 1217396736, "step": 1161 }, { "epoch": 1.3001398601398602, "grad_norm": 0.3954142575045444, "learning_rate": 3.0575387363790505e-06, "loss": 0.6241, "num_input_tokens_seen": 1218445312, "step": 1162 }, { "epoch": 1.3012587412587413, "grad_norm": 0.401748708964853, "learning_rate": 3.054653576917581e-06, "loss": 0.611, "num_input_tokens_seen": 1219493888, "step": 1163 }, { "epoch": 1.3023776223776224, "grad_norm": 0.3918151177687357, "learning_rate": 3.051767640279585e-06, "loss": 0.7342, "num_input_tokens_seen": 1220542464, "step": 1164 }, { "epoch": 1.3034965034965036, "grad_norm": 0.3439110664649469, "learning_rate": 3.048880930508813e-06, "loss": 0.5949, "num_input_tokens_seen": 1221591040, "step": 1165 }, { "epoch": 1.3046153846153845, "grad_norm": 0.37677589252359334, "learning_rate": 3.0459934516501035e-06, "loss": 0.6798, "num_input_tokens_seen": 1222639616, "step": 1166 }, { "epoch": 1.3057342657342657, "grad_norm": 0.4920357719562969, "learning_rate": 3.0431052077493693e-06, "loss": 0.6861, "num_input_tokens_seen": 1223688192, "step": 1167 }, { "epoch": 1.3068531468531468, "grad_norm": 0.410605366933133, "learning_rate": 3.0402162028535985e-06, "loss": 0.7506, "num_input_tokens_seen": 1224736768, "step": 1168 }, { "epoch": 1.307972027972028, "grad_norm": 0.41903817819949934, "learning_rate": 3.0373264410108422e-06, "loss": 0.7873, "num_input_tokens_seen": 1225785344, "step": 1169 }, { "epoch": 1.309090909090909, "grad_norm": 0.4542786876160638, "learning_rate": 3.0344359262702135e-06, "loss": 0.8313, "num_input_tokens_seen": 1226833920, "step": 1170 }, { "epoch": 1.3102097902097902, "grad_norm": 0.37313381295532344, "learning_rate": 3.0315446626818816e-06, "loss": 0.7804, "num_input_tokens_seen": 1227882496, "step": 1171 }, { "epoch": 1.3113286713286714, "grad_norm": 0.36868919223370367, "learning_rate": 3.0286526542970624e-06, "loss": 0.6189, "num_input_tokens_seen": 1228931072, "step": 1172 }, { "epoch": 1.3124475524475525, "grad_norm": 0.376934469038679, "learning_rate": 3.0257599051680175e-06, "loss": 0.6606, "num_input_tokens_seen": 1229979648, "step": 1173 }, { "epoch": 1.3135664335664337, "grad_norm": 0.5904702962230491, "learning_rate": 3.022866419348046e-06, "loss": 0.7247, "num_input_tokens_seen": 1231028224, "step": 1174 }, { "epoch": 1.3146853146853146, "grad_norm": 0.3878475718210238, "learning_rate": 3.0199722008914787e-06, "loss": 0.6983, "num_input_tokens_seen": 1232076800, "step": 1175 }, { "epoch": 1.315804195804196, "grad_norm": 0.4145564067236418, "learning_rate": 3.0170772538536735e-06, "loss": 0.6892, "num_input_tokens_seen": 1233125376, "step": 1176 }, { "epoch": 1.3169230769230769, "grad_norm": 0.39937318860386706, "learning_rate": 3.0141815822910094e-06, "loss": 0.636, "num_input_tokens_seen": 1234173952, "step": 1177 }, { "epoch": 1.318041958041958, "grad_norm": 0.39720270940133356, "learning_rate": 3.011285190260879e-06, "loss": 0.8924, "num_input_tokens_seen": 1235222528, "step": 1178 }, { "epoch": 1.3191608391608391, "grad_norm": 0.3920275095209831, "learning_rate": 3.008388081821687e-06, "loss": 0.6605, "num_input_tokens_seen": 1236271104, "step": 1179 }, { "epoch": 1.3202797202797203, "grad_norm": 0.7447746624552618, "learning_rate": 3.005490261032839e-06, "loss": 0.6842, "num_input_tokens_seen": 1237319680, "step": 1180 }, { "epoch": 1.3213986013986014, "grad_norm": 0.3633575618069481, "learning_rate": 3.0025917319547417e-06, "loss": 0.76, "num_input_tokens_seen": 1238368256, "step": 1181 }, { "epoch": 1.3225174825174826, "grad_norm": 0.4034722978062668, "learning_rate": 2.999692498648792e-06, "loss": 0.6304, "num_input_tokens_seen": 1239416832, "step": 1182 }, { "epoch": 1.3236363636363637, "grad_norm": 0.39527056169777314, "learning_rate": 2.9967925651773745e-06, "loss": 0.5649, "num_input_tokens_seen": 1240465408, "step": 1183 }, { "epoch": 1.3247552447552446, "grad_norm": 0.37910830789449684, "learning_rate": 2.9938919356038548e-06, "loss": 0.774, "num_input_tokens_seen": 1241513984, "step": 1184 }, { "epoch": 1.325874125874126, "grad_norm": 0.3842285076835187, "learning_rate": 2.990990613992573e-06, "loss": 0.6848, "num_input_tokens_seen": 1242562560, "step": 1185 }, { "epoch": 1.326993006993007, "grad_norm": 0.42853761153069947, "learning_rate": 2.9880886044088416e-06, "loss": 0.7088, "num_input_tokens_seen": 1243611136, "step": 1186 }, { "epoch": 1.328111888111888, "grad_norm": 0.35853303653314544, "learning_rate": 2.9851859109189335e-06, "loss": 0.6251, "num_input_tokens_seen": 1244659712, "step": 1187 }, { "epoch": 1.3292307692307692, "grad_norm": 0.3814598207630288, "learning_rate": 2.9822825375900816e-06, "loss": 0.6438, "num_input_tokens_seen": 1245708288, "step": 1188 }, { "epoch": 1.3303496503496504, "grad_norm": 0.36528417160523563, "learning_rate": 2.9793784884904733e-06, "loss": 0.77, "num_input_tokens_seen": 1246756864, "step": 1189 }, { "epoch": 1.3314685314685315, "grad_norm": 0.4211828510002538, "learning_rate": 2.9764737676892375e-06, "loss": 0.7754, "num_input_tokens_seen": 1247805440, "step": 1190 }, { "epoch": 1.3325874125874126, "grad_norm": 0.5004676580857434, "learning_rate": 2.9735683792564506e-06, "loss": 0.6628, "num_input_tokens_seen": 1248854016, "step": 1191 }, { "epoch": 1.3337062937062938, "grad_norm": 0.3623797796236322, "learning_rate": 2.9706623272631206e-06, "loss": 0.7496, "num_input_tokens_seen": 1249902592, "step": 1192 }, { "epoch": 1.3348251748251747, "grad_norm": 0.5346758530741487, "learning_rate": 2.967755615781186e-06, "loss": 0.6997, "num_input_tokens_seen": 1250951168, "step": 1193 }, { "epoch": 1.335944055944056, "grad_norm": 0.39817978582104097, "learning_rate": 2.96484824888351e-06, "loss": 0.6166, "num_input_tokens_seen": 1251999744, "step": 1194 }, { "epoch": 1.337062937062937, "grad_norm": 0.4034227046528643, "learning_rate": 2.9619402306438738e-06, "loss": 0.6393, "num_input_tokens_seen": 1253048320, "step": 1195 }, { "epoch": 1.3381818181818181, "grad_norm": 0.3790733814178904, "learning_rate": 2.959031565136971e-06, "loss": 0.6455, "num_input_tokens_seen": 1254096896, "step": 1196 }, { "epoch": 1.3393006993006993, "grad_norm": 0.3985810310270806, "learning_rate": 2.956122256438403e-06, "loss": 0.7384, "num_input_tokens_seen": 1255145472, "step": 1197 }, { "epoch": 1.3404195804195804, "grad_norm": 0.3636612832767238, "learning_rate": 2.9532123086246704e-06, "loss": 0.6676, "num_input_tokens_seen": 1256194048, "step": 1198 }, { "epoch": 1.3415384615384616, "grad_norm": 0.3601883231833679, "learning_rate": 2.9503017257731727e-06, "loss": 0.7254, "num_input_tokens_seen": 1257242624, "step": 1199 }, { "epoch": 1.3426573426573427, "grad_norm": 0.36246399336792506, "learning_rate": 2.947390511962196e-06, "loss": 0.7849, "num_input_tokens_seen": 1258291200, "step": 1200 }, { "epoch": 1.3437762237762239, "grad_norm": 0.9329133515887662, "learning_rate": 2.9444786712709122e-06, "loss": 0.7132, "num_input_tokens_seen": 1259339776, "step": 1201 }, { "epoch": 1.3448951048951048, "grad_norm": 0.4040862278271341, "learning_rate": 2.9415662077793705e-06, "loss": 0.7177, "num_input_tokens_seen": 1260388352, "step": 1202 }, { "epoch": 1.3460139860139861, "grad_norm": 0.36884439044865164, "learning_rate": 2.9386531255684942e-06, "loss": 0.6699, "num_input_tokens_seen": 1261436928, "step": 1203 }, { "epoch": 1.347132867132867, "grad_norm": 0.7144993769712487, "learning_rate": 2.9357394287200724e-06, "loss": 0.6733, "num_input_tokens_seen": 1262485504, "step": 1204 }, { "epoch": 1.3482517482517482, "grad_norm": 0.4116656605384311, "learning_rate": 2.9328251213167557e-06, "loss": 0.7537, "num_input_tokens_seen": 1263534080, "step": 1205 }, { "epoch": 1.3493706293706293, "grad_norm": 0.5544556754514866, "learning_rate": 2.9299102074420504e-06, "loss": 0.8106, "num_input_tokens_seen": 1264582656, "step": 1206 }, { "epoch": 1.3504895104895105, "grad_norm": 0.40859813034930065, "learning_rate": 2.9269946911803134e-06, "loss": 0.7536, "num_input_tokens_seen": 1265631232, "step": 1207 }, { "epoch": 1.3516083916083916, "grad_norm": 0.406488249706227, "learning_rate": 2.9240785766167428e-06, "loss": 0.6991, "num_input_tokens_seen": 1266679808, "step": 1208 }, { "epoch": 1.3527272727272728, "grad_norm": 0.44236212362869837, "learning_rate": 2.9211618678373775e-06, "loss": 0.7489, "num_input_tokens_seen": 1267728384, "step": 1209 }, { "epoch": 1.353846153846154, "grad_norm": 0.4101602705479307, "learning_rate": 2.9182445689290894e-06, "loss": 0.6784, "num_input_tokens_seen": 1268776960, "step": 1210 }, { "epoch": 1.3549650349650348, "grad_norm": 0.3649812633082753, "learning_rate": 2.9153266839795756e-06, "loss": 0.6933, "num_input_tokens_seen": 1269825536, "step": 1211 }, { "epoch": 1.3560839160839162, "grad_norm": 0.3936425261091729, "learning_rate": 2.912408217077356e-06, "loss": 0.6613, "num_input_tokens_seen": 1270874112, "step": 1212 }, { "epoch": 1.3572027972027971, "grad_norm": 0.6449008146832149, "learning_rate": 2.909489172311765e-06, "loss": 0.7275, "num_input_tokens_seen": 1271922688, "step": 1213 }, { "epoch": 1.3583216783216783, "grad_norm": 0.39040239408420446, "learning_rate": 2.906569553772945e-06, "loss": 0.6383, "num_input_tokens_seen": 1272971264, "step": 1214 }, { "epoch": 1.3594405594405594, "grad_norm": 0.39706015352738006, "learning_rate": 2.9036493655518456e-06, "loss": 0.715, "num_input_tokens_seen": 1274019840, "step": 1215 }, { "epoch": 1.3605594405594406, "grad_norm": 0.4151779198327144, "learning_rate": 2.900728611740213e-06, "loss": 0.6316, "num_input_tokens_seen": 1275068416, "step": 1216 }, { "epoch": 1.3616783216783217, "grad_norm": 0.42593221578259555, "learning_rate": 2.8978072964305848e-06, "loss": 0.787, "num_input_tokens_seen": 1276116992, "step": 1217 }, { "epoch": 1.3627972027972028, "grad_norm": 0.4103860718586457, "learning_rate": 2.894885423716289e-06, "loss": 0.799, "num_input_tokens_seen": 1277165568, "step": 1218 }, { "epoch": 1.363916083916084, "grad_norm": 0.3691246447907775, "learning_rate": 2.89196299769143e-06, "loss": 0.7139, "num_input_tokens_seen": 1278214144, "step": 1219 }, { "epoch": 1.365034965034965, "grad_norm": 0.5050362648102538, "learning_rate": 2.8890400224508912e-06, "loss": 0.7172, "num_input_tokens_seen": 1279262720, "step": 1220 }, { "epoch": 1.3661538461538463, "grad_norm": 0.35892588314204177, "learning_rate": 2.8861165020903235e-06, "loss": 0.7018, "num_input_tokens_seen": 1280311296, "step": 1221 }, { "epoch": 1.3672727272727272, "grad_norm": 0.36079207044462236, "learning_rate": 2.883192440706141e-06, "loss": 0.6721, "num_input_tokens_seen": 1281359872, "step": 1222 }, { "epoch": 1.3683916083916083, "grad_norm": 0.4180443657704634, "learning_rate": 2.88026784239552e-06, "loss": 0.7519, "num_input_tokens_seen": 1282408448, "step": 1223 }, { "epoch": 1.3695104895104895, "grad_norm": 0.41656830228072295, "learning_rate": 2.8773427112563833e-06, "loss": 0.8246, "num_input_tokens_seen": 1283457024, "step": 1224 }, { "epoch": 1.3706293706293706, "grad_norm": 0.4135047497982733, "learning_rate": 2.8744170513874054e-06, "loss": 0.7777, "num_input_tokens_seen": 1284505600, "step": 1225 }, { "epoch": 1.3717482517482518, "grad_norm": 0.44216209891984176, "learning_rate": 2.871490866887998e-06, "loss": 0.7001, "num_input_tokens_seen": 1285554176, "step": 1226 }, { "epoch": 1.372867132867133, "grad_norm": 0.3831848936870926, "learning_rate": 2.8685641618583098e-06, "loss": 0.6791, "num_input_tokens_seen": 1286602752, "step": 1227 }, { "epoch": 1.373986013986014, "grad_norm": 0.36883830026356323, "learning_rate": 2.8656369403992192e-06, "loss": 0.7116, "num_input_tokens_seen": 1287651328, "step": 1228 }, { "epoch": 1.375104895104895, "grad_norm": 0.37181897365493316, "learning_rate": 2.8627092066123263e-06, "loss": 0.6922, "num_input_tokens_seen": 1288699904, "step": 1229 }, { "epoch": 1.3762237762237763, "grad_norm": 0.40789395450941157, "learning_rate": 2.85978096459995e-06, "loss": 0.7006, "num_input_tokens_seen": 1289748480, "step": 1230 }, { "epoch": 1.3773426573426573, "grad_norm": 0.4003132055014799, "learning_rate": 2.856852218465124e-06, "loss": 0.5692, "num_input_tokens_seen": 1290797056, "step": 1231 }, { "epoch": 1.3784615384615384, "grad_norm": 0.3731720161019701, "learning_rate": 2.8539229723115826e-06, "loss": 0.6211, "num_input_tokens_seen": 1291845632, "step": 1232 }, { "epoch": 1.3795804195804195, "grad_norm": 0.40027189422631654, "learning_rate": 2.8509932302437665e-06, "loss": 0.772, "num_input_tokens_seen": 1292894208, "step": 1233 }, { "epoch": 1.3806993006993007, "grad_norm": 0.39302470361207653, "learning_rate": 2.8480629963668075e-06, "loss": 0.6958, "num_input_tokens_seen": 1293942784, "step": 1234 }, { "epoch": 1.3818181818181818, "grad_norm": 0.35019494278260094, "learning_rate": 2.8451322747865286e-06, "loss": 0.6951, "num_input_tokens_seen": 1294991360, "step": 1235 }, { "epoch": 1.382937062937063, "grad_norm": 0.3608348825510404, "learning_rate": 2.8422010696094356e-06, "loss": 0.7116, "num_input_tokens_seen": 1296039936, "step": 1236 }, { "epoch": 1.3840559440559441, "grad_norm": 0.3580596935967802, "learning_rate": 2.83926938494271e-06, "loss": 0.7113, "num_input_tokens_seen": 1297088512, "step": 1237 }, { "epoch": 1.385174825174825, "grad_norm": 0.34790388714616416, "learning_rate": 2.836337224894209e-06, "loss": 0.6888, "num_input_tokens_seen": 1298137088, "step": 1238 }, { "epoch": 1.3862937062937064, "grad_norm": 0.3552961140339484, "learning_rate": 2.833404593572453e-06, "loss": 0.682, "num_input_tokens_seen": 1299185664, "step": 1239 }, { "epoch": 1.3874125874125873, "grad_norm": 0.3798078119911368, "learning_rate": 2.8304714950866225e-06, "loss": 0.8484, "num_input_tokens_seen": 1300234240, "step": 1240 }, { "epoch": 1.3885314685314685, "grad_norm": 0.3935985980247797, "learning_rate": 2.827537933546555e-06, "loss": 0.6114, "num_input_tokens_seen": 1301282816, "step": 1241 }, { "epoch": 1.3896503496503496, "grad_norm": 0.3585358656195484, "learning_rate": 2.8246039130627356e-06, "loss": 0.7367, "num_input_tokens_seen": 1302331392, "step": 1242 }, { "epoch": 1.3907692307692308, "grad_norm": 0.3599457996795072, "learning_rate": 2.821669437746291e-06, "loss": 0.7446, "num_input_tokens_seen": 1303379968, "step": 1243 }, { "epoch": 1.391888111888112, "grad_norm": 0.37541233869384794, "learning_rate": 2.818734511708987e-06, "loss": 0.575, "num_input_tokens_seen": 1304428544, "step": 1244 }, { "epoch": 1.393006993006993, "grad_norm": 0.5153488708470014, "learning_rate": 2.8157991390632206e-06, "loss": 0.719, "num_input_tokens_seen": 1305477120, "step": 1245 }, { "epoch": 1.3941258741258742, "grad_norm": 0.37064999395258014, "learning_rate": 2.812863323922015e-06, "loss": 0.5899, "num_input_tokens_seen": 1306525696, "step": 1246 }, { "epoch": 1.395244755244755, "grad_norm": 0.37909769545923, "learning_rate": 2.8099270703990124e-06, "loss": 0.783, "num_input_tokens_seen": 1307574272, "step": 1247 }, { "epoch": 1.3963636363636365, "grad_norm": 0.40574945883486085, "learning_rate": 2.8069903826084695e-06, "loss": 0.8601, "num_input_tokens_seen": 1308622848, "step": 1248 }, { "epoch": 1.3974825174825174, "grad_norm": 0.37927382632650025, "learning_rate": 2.8040532646652515e-06, "loss": 0.6974, "num_input_tokens_seen": 1309671424, "step": 1249 }, { "epoch": 1.3986013986013985, "grad_norm": 0.3591857678850769, "learning_rate": 2.8011157206848266e-06, "loss": 0.7827, "num_input_tokens_seen": 1310720000, "step": 1250 }, { "epoch": 1.3986013986013985, "eval_loss": 0.7286360263824463, "eval_runtime": 246.6002, "eval_samples_per_second": 2.368, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 1310720000, "step": 1250 }, { "epoch": 1.3997202797202797, "grad_norm": 0.6806907406240487, "learning_rate": 2.7981777547832604e-06, "loss": 0.6823, "num_input_tokens_seen": 1311768576, "step": 1251 }, { "epoch": 1.4008391608391608, "grad_norm": 0.544471830631099, "learning_rate": 2.7952393710772097e-06, "loss": 0.5896, "num_input_tokens_seen": 1312817152, "step": 1252 }, { "epoch": 1.401958041958042, "grad_norm": 0.3845824333756411, "learning_rate": 2.792300573683915e-06, "loss": 0.7552, "num_input_tokens_seen": 1313865728, "step": 1253 }, { "epoch": 1.403076923076923, "grad_norm": 0.4217104776030561, "learning_rate": 2.7893613667211983e-06, "loss": 0.6488, "num_input_tokens_seen": 1314914304, "step": 1254 }, { "epoch": 1.4041958041958043, "grad_norm": 0.3905579969373465, "learning_rate": 2.7864217543074544e-06, "loss": 0.6017, "num_input_tokens_seen": 1315962880, "step": 1255 }, { "epoch": 1.4053146853146854, "grad_norm": 0.41717563210838055, "learning_rate": 2.7834817405616476e-06, "loss": 0.6918, "num_input_tokens_seen": 1317011456, "step": 1256 }, { "epoch": 1.4064335664335665, "grad_norm": 0.4199533783959727, "learning_rate": 2.780541329603303e-06, "loss": 0.6059, "num_input_tokens_seen": 1318060032, "step": 1257 }, { "epoch": 1.4075524475524475, "grad_norm": 0.32897056219774895, "learning_rate": 2.7776005255525022e-06, "loss": 0.6015, "num_input_tokens_seen": 1319108608, "step": 1258 }, { "epoch": 1.4086713286713286, "grad_norm": 0.36654457642766575, "learning_rate": 2.77465933252988e-06, "loss": 0.8008, "num_input_tokens_seen": 1320157184, "step": 1259 }, { "epoch": 1.4097902097902097, "grad_norm": 0.400560094632117, "learning_rate": 2.7717177546566126e-06, "loss": 0.8286, "num_input_tokens_seen": 1321205760, "step": 1260 }, { "epoch": 1.410909090909091, "grad_norm": 0.40694374959351154, "learning_rate": 2.7687757960544193e-06, "loss": 0.763, "num_input_tokens_seen": 1322254336, "step": 1261 }, { "epoch": 1.412027972027972, "grad_norm": 0.402586312282581, "learning_rate": 2.7658334608455495e-06, "loss": 0.7339, "num_input_tokens_seen": 1323302912, "step": 1262 }, { "epoch": 1.4131468531468532, "grad_norm": 0.3849269072287987, "learning_rate": 2.7628907531527815e-06, "loss": 0.7166, "num_input_tokens_seen": 1324351488, "step": 1263 }, { "epoch": 1.4142657342657343, "grad_norm": 0.9497931722649486, "learning_rate": 2.759947677099417e-06, "loss": 0.6082, "num_input_tokens_seen": 1325400064, "step": 1264 }, { "epoch": 1.4153846153846155, "grad_norm": 0.3399736277077713, "learning_rate": 2.7570042368092724e-06, "loss": 0.7741, "num_input_tokens_seen": 1326448640, "step": 1265 }, { "epoch": 1.4165034965034966, "grad_norm": 0.41578635993061264, "learning_rate": 2.754060436406674e-06, "loss": 0.7008, "num_input_tokens_seen": 1327497216, "step": 1266 }, { "epoch": 1.4176223776223775, "grad_norm": 0.37514328887625675, "learning_rate": 2.7511162800164536e-06, "loss": 0.7292, "num_input_tokens_seen": 1328545792, "step": 1267 }, { "epoch": 1.4187412587412587, "grad_norm": 0.3686250503770648, "learning_rate": 2.748171771763941e-06, "loss": 0.7581, "num_input_tokens_seen": 1329594368, "step": 1268 }, { "epoch": 1.4198601398601398, "grad_norm": 0.43316999453392113, "learning_rate": 2.7452269157749614e-06, "loss": 0.7157, "num_input_tokens_seen": 1330642944, "step": 1269 }, { "epoch": 1.420979020979021, "grad_norm": 0.4236191031717472, "learning_rate": 2.7422817161758234e-06, "loss": 0.6362, "num_input_tokens_seen": 1331691520, "step": 1270 }, { "epoch": 1.422097902097902, "grad_norm": 0.4054651439513534, "learning_rate": 2.7393361770933198e-06, "loss": 0.7119, "num_input_tokens_seen": 1332740096, "step": 1271 }, { "epoch": 1.4232167832167832, "grad_norm": 0.43671389925264287, "learning_rate": 2.7363903026547196e-06, "loss": 0.8825, "num_input_tokens_seen": 1333788672, "step": 1272 }, { "epoch": 1.4243356643356644, "grad_norm": 0.3789830247152755, "learning_rate": 2.7334440969877584e-06, "loss": 0.6226, "num_input_tokens_seen": 1334837248, "step": 1273 }, { "epoch": 1.4254545454545455, "grad_norm": 0.3877105797783583, "learning_rate": 2.7304975642206394e-06, "loss": 0.7138, "num_input_tokens_seen": 1335885824, "step": 1274 }, { "epoch": 1.4265734265734267, "grad_norm": 0.3944175516752716, "learning_rate": 2.7275507084820226e-06, "loss": 0.7926, "num_input_tokens_seen": 1336934400, "step": 1275 }, { "epoch": 1.4276923076923076, "grad_norm": 0.3678333219846232, "learning_rate": 2.724603533901019e-06, "loss": 0.7555, "num_input_tokens_seen": 1337982976, "step": 1276 }, { "epoch": 1.4288111888111887, "grad_norm": 0.36681237254459914, "learning_rate": 2.7216560446071904e-06, "loss": 0.7921, "num_input_tokens_seen": 1339031552, "step": 1277 }, { "epoch": 1.4299300699300699, "grad_norm": 0.4578736833691613, "learning_rate": 2.718708244730537e-06, "loss": 0.6401, "num_input_tokens_seen": 1340080128, "step": 1278 }, { "epoch": 1.431048951048951, "grad_norm": 0.4216860022674988, "learning_rate": 2.7157601384014927e-06, "loss": 0.6946, "num_input_tokens_seen": 1341128704, "step": 1279 }, { "epoch": 1.4321678321678322, "grad_norm": 0.36985231826989556, "learning_rate": 2.7128117297509233e-06, "loss": 0.7291, "num_input_tokens_seen": 1342177280, "step": 1280 }, { "epoch": 1.4332867132867133, "grad_norm": 0.5550478086086422, "learning_rate": 2.7098630229101174e-06, "loss": 0.6842, "num_input_tokens_seen": 1343225856, "step": 1281 }, { "epoch": 1.4344055944055945, "grad_norm": 0.4239083261418276, "learning_rate": 2.706914022010782e-06, "loss": 0.6862, "num_input_tokens_seen": 1344274432, "step": 1282 }, { "epoch": 1.4355244755244756, "grad_norm": 0.41875375638378076, "learning_rate": 2.7039647311850347e-06, "loss": 0.6372, "num_input_tokens_seen": 1345323008, "step": 1283 }, { "epoch": 1.4366433566433567, "grad_norm": 0.44827329941788396, "learning_rate": 2.7010151545654006e-06, "loss": 0.6159, "num_input_tokens_seen": 1346371584, "step": 1284 }, { "epoch": 1.4377622377622377, "grad_norm": 0.398831842716796, "learning_rate": 2.6980652962848055e-06, "loss": 0.7926, "num_input_tokens_seen": 1347420160, "step": 1285 }, { "epoch": 1.4388811188811188, "grad_norm": 0.3771295213090851, "learning_rate": 2.6951151604765668e-06, "loss": 0.8506, "num_input_tokens_seen": 1348468736, "step": 1286 }, { "epoch": 1.44, "grad_norm": 0.4808408447883052, "learning_rate": 2.6921647512743963e-06, "loss": 0.8527, "num_input_tokens_seen": 1349517312, "step": 1287 }, { "epoch": 1.441118881118881, "grad_norm": 0.42899003609852376, "learning_rate": 2.689214072812384e-06, "loss": 0.6399, "num_input_tokens_seen": 1350565888, "step": 1288 }, { "epoch": 1.4422377622377622, "grad_norm": 0.3878992905663293, "learning_rate": 2.686263129224999e-06, "loss": 0.6748, "num_input_tokens_seen": 1351614464, "step": 1289 }, { "epoch": 1.4433566433566434, "grad_norm": 0.4277380466916509, "learning_rate": 2.683311924647083e-06, "loss": 0.6529, "num_input_tokens_seen": 1352663040, "step": 1290 }, { "epoch": 1.4444755244755245, "grad_norm": 0.43706385640602297, "learning_rate": 2.6803604632138403e-06, "loss": 0.7679, "num_input_tokens_seen": 1353711616, "step": 1291 }, { "epoch": 1.4455944055944057, "grad_norm": 0.4202278084540102, "learning_rate": 2.6774087490608384e-06, "loss": 0.7352, "num_input_tokens_seen": 1354760192, "step": 1292 }, { "epoch": 1.4467132867132868, "grad_norm": 0.3891301643771567, "learning_rate": 2.674456786323998e-06, "loss": 0.7613, "num_input_tokens_seen": 1355808768, "step": 1293 }, { "epoch": 1.4478321678321677, "grad_norm": 0.374186923249243, "learning_rate": 2.6715045791395855e-06, "loss": 0.717, "num_input_tokens_seen": 1356857344, "step": 1294 }, { "epoch": 1.4489510489510489, "grad_norm": 0.41814619827493504, "learning_rate": 2.668552131644214e-06, "loss": 0.7243, "num_input_tokens_seen": 1357905920, "step": 1295 }, { "epoch": 1.45006993006993, "grad_norm": 0.7080579218817563, "learning_rate": 2.6655994479748313e-06, "loss": 0.7768, "num_input_tokens_seen": 1358954496, "step": 1296 }, { "epoch": 1.4511888111888112, "grad_norm": 0.3633948800745216, "learning_rate": 2.6626465322687144e-06, "loss": 0.7053, "num_input_tokens_seen": 1360003072, "step": 1297 }, { "epoch": 1.4523076923076923, "grad_norm": 0.40254345223630156, "learning_rate": 2.659693388663469e-06, "loss": 0.6195, "num_input_tokens_seen": 1361051648, "step": 1298 }, { "epoch": 1.4534265734265734, "grad_norm": 0.4135506747205181, "learning_rate": 2.656740021297017e-06, "loss": 0.6968, "num_input_tokens_seen": 1362100224, "step": 1299 }, { "epoch": 1.4545454545454546, "grad_norm": 0.40152770383917613, "learning_rate": 2.653786434307596e-06, "loss": 0.7104, "num_input_tokens_seen": 1363148800, "step": 1300 }, { "epoch": 1.4556643356643357, "grad_norm": 0.35762529537837845, "learning_rate": 2.6508326318337498e-06, "loss": 0.7466, "num_input_tokens_seen": 1364197376, "step": 1301 }, { "epoch": 1.4567832167832169, "grad_norm": 0.35323941263948094, "learning_rate": 2.6478786180143253e-06, "loss": 0.7815, "num_input_tokens_seen": 1365245952, "step": 1302 }, { "epoch": 1.4579020979020978, "grad_norm": 0.3690604448018549, "learning_rate": 2.644924396988465e-06, "loss": 0.6731, "num_input_tokens_seen": 1366294528, "step": 1303 }, { "epoch": 1.4590209790209792, "grad_norm": 0.35974877203147315, "learning_rate": 2.641969972895601e-06, "loss": 0.6903, "num_input_tokens_seen": 1367343104, "step": 1304 }, { "epoch": 1.46013986013986, "grad_norm": 0.3836023509647574, "learning_rate": 2.6390153498754506e-06, "loss": 0.7819, "num_input_tokens_seen": 1368391680, "step": 1305 }, { "epoch": 1.4612587412587412, "grad_norm": 0.36459033907272403, "learning_rate": 2.6360605320680117e-06, "loss": 0.6417, "num_input_tokens_seen": 1369440256, "step": 1306 }, { "epoch": 1.4623776223776224, "grad_norm": 0.4014043579277202, "learning_rate": 2.633105523613551e-06, "loss": 0.64, "num_input_tokens_seen": 1370488832, "step": 1307 }, { "epoch": 1.4634965034965035, "grad_norm": 0.39245081779759416, "learning_rate": 2.6301503286526076e-06, "loss": 0.7603, "num_input_tokens_seen": 1371537408, "step": 1308 }, { "epoch": 1.4646153846153847, "grad_norm": 0.3603063834672613, "learning_rate": 2.6271949513259764e-06, "loss": 0.7417, "num_input_tokens_seen": 1372585984, "step": 1309 }, { "epoch": 1.4657342657342658, "grad_norm": 0.37701684548393616, "learning_rate": 2.6242393957747112e-06, "loss": 0.8494, "num_input_tokens_seen": 1373634560, "step": 1310 }, { "epoch": 1.466853146853147, "grad_norm": 0.40313192342917653, "learning_rate": 2.6212836661401154e-06, "loss": 0.5646, "num_input_tokens_seen": 1374683136, "step": 1311 }, { "epoch": 1.4679720279720279, "grad_norm": 0.3584419362287317, "learning_rate": 2.618327766563735e-06, "loss": 0.6526, "num_input_tokens_seen": 1375731712, "step": 1312 }, { "epoch": 1.4690909090909092, "grad_norm": 0.3679905474485617, "learning_rate": 2.615371701187355e-06, "loss": 0.811, "num_input_tokens_seen": 1376780288, "step": 1313 }, { "epoch": 1.4702097902097901, "grad_norm": 0.34031422656697624, "learning_rate": 2.6124154741529934e-06, "loss": 0.6345, "num_input_tokens_seen": 1377828864, "step": 1314 }, { "epoch": 1.4713286713286713, "grad_norm": 0.3818309292486768, "learning_rate": 2.609459089602892e-06, "loss": 0.8762, "num_input_tokens_seen": 1378877440, "step": 1315 }, { "epoch": 1.4724475524475524, "grad_norm": 0.4104335016382105, "learning_rate": 2.6065025516795165e-06, "loss": 0.6337, "num_input_tokens_seen": 1379926016, "step": 1316 }, { "epoch": 1.4735664335664336, "grad_norm": 0.5009796133155104, "learning_rate": 2.6035458645255467e-06, "loss": 0.6049, "num_input_tokens_seen": 1380974592, "step": 1317 }, { "epoch": 1.4746853146853147, "grad_norm": 0.35577535476674926, "learning_rate": 2.6005890322838697e-06, "loss": 0.6951, "num_input_tokens_seen": 1382023168, "step": 1318 }, { "epoch": 1.4758041958041959, "grad_norm": 0.3867619949733362, "learning_rate": 2.597632059097577e-06, "loss": 0.5612, "num_input_tokens_seen": 1383071744, "step": 1319 }, { "epoch": 1.476923076923077, "grad_norm": 0.4010856772867681, "learning_rate": 2.5946749491099597e-06, "loss": 0.7296, "num_input_tokens_seen": 1384120320, "step": 1320 }, { "epoch": 1.478041958041958, "grad_norm": 0.405871494252086, "learning_rate": 2.5917177064644974e-06, "loss": 0.7324, "num_input_tokens_seen": 1385168896, "step": 1321 }, { "epoch": 1.4791608391608393, "grad_norm": 0.41416723997906857, "learning_rate": 2.5887603353048564e-06, "loss": 0.7148, "num_input_tokens_seen": 1386217472, "step": 1322 }, { "epoch": 1.4802797202797202, "grad_norm": 0.6793412321655151, "learning_rate": 2.585802839774883e-06, "loss": 0.8008, "num_input_tokens_seen": 1387266048, "step": 1323 }, { "epoch": 1.4813986013986014, "grad_norm": 0.4226494257057772, "learning_rate": 2.5828452240186002e-06, "loss": 0.7134, "num_input_tokens_seen": 1388314624, "step": 1324 }, { "epoch": 1.4825174825174825, "grad_norm": 0.392630856050716, "learning_rate": 2.579887492180197e-06, "loss": 0.5837, "num_input_tokens_seen": 1389363200, "step": 1325 }, { "epoch": 1.4836363636363636, "grad_norm": 0.3958098829672122, "learning_rate": 2.576929648404025e-06, "loss": 0.6575, "num_input_tokens_seen": 1390411776, "step": 1326 }, { "epoch": 1.4847552447552448, "grad_norm": 0.41834740690313144, "learning_rate": 2.5739716968345922e-06, "loss": 0.6686, "num_input_tokens_seen": 1391460352, "step": 1327 }, { "epoch": 1.485874125874126, "grad_norm": 0.43463090140151583, "learning_rate": 2.5710136416165602e-06, "loss": 0.5814, "num_input_tokens_seen": 1392508928, "step": 1328 }, { "epoch": 1.486993006993007, "grad_norm": 0.3981604571471748, "learning_rate": 2.5680554868947346e-06, "loss": 0.6269, "num_input_tokens_seen": 1393557504, "step": 1329 }, { "epoch": 1.488111888111888, "grad_norm": 0.3975416407586878, "learning_rate": 2.5650972368140587e-06, "loss": 0.5988, "num_input_tokens_seen": 1394606080, "step": 1330 }, { "epoch": 1.4892307692307694, "grad_norm": 0.38302758146234966, "learning_rate": 2.5621388955196113e-06, "loss": 0.6707, "num_input_tokens_seen": 1395654656, "step": 1331 }, { "epoch": 1.4903496503496503, "grad_norm": 0.3881294703329537, "learning_rate": 2.5591804671566003e-06, "loss": 0.627, "num_input_tokens_seen": 1396703232, "step": 1332 }, { "epoch": 1.4914685314685314, "grad_norm": 0.4256147617258772, "learning_rate": 2.5562219558703504e-06, "loss": 0.8331, "num_input_tokens_seen": 1397751808, "step": 1333 }, { "epoch": 1.4925874125874126, "grad_norm": 0.3849104835659771, "learning_rate": 2.5532633658063095e-06, "loss": 0.6928, "num_input_tokens_seen": 1398800384, "step": 1334 }, { "epoch": 1.4937062937062937, "grad_norm": 0.4102193126055427, "learning_rate": 2.55030470111003e-06, "loss": 0.557, "num_input_tokens_seen": 1399848960, "step": 1335 }, { "epoch": 1.4948251748251749, "grad_norm": 0.37155470222046, "learning_rate": 2.5473459659271715e-06, "loss": 0.8429, "num_input_tokens_seen": 1400897536, "step": 1336 }, { "epoch": 1.495944055944056, "grad_norm": 0.3902475212638151, "learning_rate": 2.544387164403493e-06, "loss": 0.7329, "num_input_tokens_seen": 1401946112, "step": 1337 }, { "epoch": 1.4970629370629371, "grad_norm": 0.3973274754157307, "learning_rate": 2.541428300684845e-06, "loss": 0.7075, "num_input_tokens_seen": 1402994688, "step": 1338 }, { "epoch": 1.498181818181818, "grad_norm": 0.41020822841470045, "learning_rate": 2.5384693789171656e-06, "loss": 0.6779, "num_input_tokens_seen": 1404043264, "step": 1339 }, { "epoch": 1.4993006993006994, "grad_norm": 0.3817134334346347, "learning_rate": 2.5355104032464746e-06, "loss": 0.7811, "num_input_tokens_seen": 1405091840, "step": 1340 }, { "epoch": 1.5004195804195803, "grad_norm": 0.4179717302636287, "learning_rate": 2.532551377818866e-06, "loss": 0.7379, "num_input_tokens_seen": 1406140416, "step": 1341 }, { "epoch": 1.5015384615384615, "grad_norm": 0.3955095591752124, "learning_rate": 2.5295923067805054e-06, "loss": 0.9567, "num_input_tokens_seen": 1407188992, "step": 1342 }, { "epoch": 1.5026573426573426, "grad_norm": 0.3712308020098143, "learning_rate": 2.526633194277622e-06, "loss": 0.6989, "num_input_tokens_seen": 1408237568, "step": 1343 }, { "epoch": 1.5037762237762238, "grad_norm": 0.41285137006221134, "learning_rate": 2.5236740444565016e-06, "loss": 0.7396, "num_input_tokens_seen": 1409286144, "step": 1344 }, { "epoch": 1.504895104895105, "grad_norm": 0.45653964238888556, "learning_rate": 2.5207148614634836e-06, "loss": 0.6208, "num_input_tokens_seen": 1410334720, "step": 1345 }, { "epoch": 1.506013986013986, "grad_norm": 0.38836521949442737, "learning_rate": 2.5177556494449534e-06, "loss": 0.6852, "num_input_tokens_seen": 1411383296, "step": 1346 }, { "epoch": 1.5071328671328672, "grad_norm": 0.40943397917902136, "learning_rate": 2.514796412547337e-06, "loss": 0.7523, "num_input_tokens_seen": 1412431872, "step": 1347 }, { "epoch": 1.5082517482517481, "grad_norm": 0.3776893531308707, "learning_rate": 2.5118371549170967e-06, "loss": 0.6517, "num_input_tokens_seen": 1413480448, "step": 1348 }, { "epoch": 1.5093706293706295, "grad_norm": 0.3885268895633927, "learning_rate": 2.5088778807007203e-06, "loss": 0.6122, "num_input_tokens_seen": 1414529024, "step": 1349 }, { "epoch": 1.5104895104895104, "grad_norm": 0.3883652711210257, "learning_rate": 2.505918594044724e-06, "loss": 0.7379, "num_input_tokens_seen": 1415577600, "step": 1350 }, { "epoch": 1.5116083916083916, "grad_norm": 0.369325683333827, "learning_rate": 2.502959299095636e-06, "loss": 0.7122, "num_input_tokens_seen": 1416626176, "step": 1351 }, { "epoch": 1.5127272727272727, "grad_norm": 0.36269814117754895, "learning_rate": 2.5e-06, "loss": 0.588, "num_input_tokens_seen": 1417674752, "step": 1352 }, { "epoch": 1.5138461538461538, "grad_norm": 0.3740830840624538, "learning_rate": 2.4970407009043646e-06, "loss": 0.6763, "num_input_tokens_seen": 1418723328, "step": 1353 }, { "epoch": 1.514965034965035, "grad_norm": 0.37001438590930996, "learning_rate": 2.4940814059552763e-06, "loss": 0.7872, "num_input_tokens_seen": 1419771904, "step": 1354 }, { "epoch": 1.5160839160839161, "grad_norm": 0.37789514392540635, "learning_rate": 2.49112211929928e-06, "loss": 0.7358, "num_input_tokens_seen": 1420820480, "step": 1355 }, { "epoch": 1.5172027972027973, "grad_norm": 0.3702976724275708, "learning_rate": 2.488162845082904e-06, "loss": 0.6192, "num_input_tokens_seen": 1421869056, "step": 1356 }, { "epoch": 1.5183216783216782, "grad_norm": 0.38037269249828276, "learning_rate": 2.4852035874526632e-06, "loss": 0.7241, "num_input_tokens_seen": 1422917632, "step": 1357 }, { "epoch": 1.5194405594405596, "grad_norm": 0.3982028921442716, "learning_rate": 2.4822443505550474e-06, "loss": 0.7823, "num_input_tokens_seen": 1423966208, "step": 1358 }, { "epoch": 1.5205594405594405, "grad_norm": 0.399019073540658, "learning_rate": 2.479285138536517e-06, "loss": 0.6195, "num_input_tokens_seen": 1425014784, "step": 1359 }, { "epoch": 1.5216783216783218, "grad_norm": 0.37699389813308076, "learning_rate": 2.4763259555434997e-06, "loss": 0.551, "num_input_tokens_seen": 1426063360, "step": 1360 }, { "epoch": 1.5227972027972028, "grad_norm": 0.36219167624962967, "learning_rate": 2.473366805722379e-06, "loss": 0.7314, "num_input_tokens_seen": 1427111936, "step": 1361 }, { "epoch": 1.523916083916084, "grad_norm": 0.35041519372295377, "learning_rate": 2.470407693219495e-06, "loss": 0.6199, "num_input_tokens_seen": 1428160512, "step": 1362 }, { "epoch": 1.525034965034965, "grad_norm": 0.39608006700510595, "learning_rate": 2.4674486221811345e-06, "loss": 0.6151, "num_input_tokens_seen": 1429209088, "step": 1363 }, { "epoch": 1.5261538461538462, "grad_norm": 0.3708236664890989, "learning_rate": 2.4644895967535267e-06, "loss": 0.7297, "num_input_tokens_seen": 1430257664, "step": 1364 }, { "epoch": 1.5272727272727273, "grad_norm": 0.3887516251976424, "learning_rate": 2.4615306210828357e-06, "loss": 0.6494, "num_input_tokens_seen": 1431306240, "step": 1365 }, { "epoch": 1.5283916083916083, "grad_norm": 0.38182972518416913, "learning_rate": 2.4585716993151555e-06, "loss": 0.6477, "num_input_tokens_seen": 1432354816, "step": 1366 }, { "epoch": 1.5295104895104896, "grad_norm": 0.33803354756210535, "learning_rate": 2.4556128355965076e-06, "loss": 0.6864, "num_input_tokens_seen": 1433403392, "step": 1367 }, { "epoch": 1.5306293706293705, "grad_norm": 0.36021006321571003, "learning_rate": 2.4526540340728285e-06, "loss": 0.7068, "num_input_tokens_seen": 1434451968, "step": 1368 }, { "epoch": 1.531748251748252, "grad_norm": 0.3652486511039604, "learning_rate": 2.449695298889971e-06, "loss": 0.7852, "num_input_tokens_seen": 1435500544, "step": 1369 }, { "epoch": 1.5328671328671328, "grad_norm": 0.36077373255104245, "learning_rate": 2.4467366341936922e-06, "loss": 0.6291, "num_input_tokens_seen": 1436549120, "step": 1370 }, { "epoch": 1.533986013986014, "grad_norm": 0.41011578566000345, "learning_rate": 2.44377804412965e-06, "loss": 0.5578, "num_input_tokens_seen": 1437597696, "step": 1371 }, { "epoch": 1.5351048951048951, "grad_norm": 0.3828885950417013, "learning_rate": 2.440819532843401e-06, "loss": 0.6644, "num_input_tokens_seen": 1438646272, "step": 1372 }, { "epoch": 1.5362237762237763, "grad_norm": 0.4252060342652802, "learning_rate": 2.4378611044803887e-06, "loss": 0.7209, "num_input_tokens_seen": 1439694848, "step": 1373 }, { "epoch": 1.5373426573426574, "grad_norm": 0.39073657021385316, "learning_rate": 2.434902763185942e-06, "loss": 0.6141, "num_input_tokens_seen": 1440743424, "step": 1374 }, { "epoch": 1.5384615384615383, "grad_norm": 0.39850020562271005, "learning_rate": 2.431944513105266e-06, "loss": 0.6641, "num_input_tokens_seen": 1441792000, "step": 1375 }, { "epoch": 1.5395804195804197, "grad_norm": 0.42655680342996205, "learning_rate": 2.4289863583834406e-06, "loss": 0.6653, "num_input_tokens_seen": 1442840576, "step": 1376 }, { "epoch": 1.5406993006993006, "grad_norm": 0.38790799657215197, "learning_rate": 2.426028303165409e-06, "loss": 0.7575, "num_input_tokens_seen": 1443889152, "step": 1377 }, { "epoch": 1.541818181818182, "grad_norm": 0.407684082886352, "learning_rate": 2.4230703515959765e-06, "loss": 0.7988, "num_input_tokens_seen": 1444937728, "step": 1378 }, { "epoch": 1.542937062937063, "grad_norm": 0.41258631840209675, "learning_rate": 2.420112507819804e-06, "loss": 0.7933, "num_input_tokens_seen": 1445986304, "step": 1379 }, { "epoch": 1.544055944055944, "grad_norm": 0.37830014075156615, "learning_rate": 2.4171547759813998e-06, "loss": 0.6814, "num_input_tokens_seen": 1447034880, "step": 1380 }, { "epoch": 1.5451748251748252, "grad_norm": 0.3770338216561213, "learning_rate": 2.4141971602251176e-06, "loss": 0.6962, "num_input_tokens_seen": 1448083456, "step": 1381 }, { "epoch": 1.5462937062937063, "grad_norm": 0.393475230424633, "learning_rate": 2.411239664695145e-06, "loss": 0.8119, "num_input_tokens_seen": 1449132032, "step": 1382 }, { "epoch": 1.5474125874125875, "grad_norm": 0.41660743123021626, "learning_rate": 2.4082822935355035e-06, "loss": 0.6718, "num_input_tokens_seen": 1450180608, "step": 1383 }, { "epoch": 1.5485314685314684, "grad_norm": 0.3816983606190608, "learning_rate": 2.4053250508900416e-06, "loss": 0.6898, "num_input_tokens_seen": 1451229184, "step": 1384 }, { "epoch": 1.5496503496503498, "grad_norm": 0.4602844932610833, "learning_rate": 2.402367940902423e-06, "loss": 0.7117, "num_input_tokens_seen": 1452277760, "step": 1385 }, { "epoch": 1.5507692307692307, "grad_norm": 0.42480433234241277, "learning_rate": 2.3994109677161316e-06, "loss": 0.7005, "num_input_tokens_seen": 1453326336, "step": 1386 }, { "epoch": 1.551888111888112, "grad_norm": 0.3847456220733697, "learning_rate": 2.396454135474454e-06, "loss": 0.7743, "num_input_tokens_seen": 1454374912, "step": 1387 }, { "epoch": 1.553006993006993, "grad_norm": 0.3871293284167062, "learning_rate": 2.393497448320484e-06, "loss": 0.6175, "num_input_tokens_seen": 1455423488, "step": 1388 }, { "epoch": 1.554125874125874, "grad_norm": 0.4129061774248552, "learning_rate": 2.3905409103971096e-06, "loss": 0.8076, "num_input_tokens_seen": 1456472064, "step": 1389 }, { "epoch": 1.5552447552447553, "grad_norm": 0.41471544298429186, "learning_rate": 2.3875845258470074e-06, "loss": 0.5888, "num_input_tokens_seen": 1457520640, "step": 1390 }, { "epoch": 1.5563636363636364, "grad_norm": 0.3711820327266248, "learning_rate": 2.384628298812646e-06, "loss": 0.6998, "num_input_tokens_seen": 1458569216, "step": 1391 }, { "epoch": 1.5574825174825175, "grad_norm": 0.40414798486775827, "learning_rate": 2.3816722334362656e-06, "loss": 0.6847, "num_input_tokens_seen": 1459617792, "step": 1392 }, { "epoch": 1.5586013986013985, "grad_norm": 0.3868347213030352, "learning_rate": 2.3787163338598854e-06, "loss": 0.6591, "num_input_tokens_seen": 1460666368, "step": 1393 }, { "epoch": 1.5597202797202798, "grad_norm": 0.39670164662553126, "learning_rate": 2.37576060422529e-06, "loss": 0.6874, "num_input_tokens_seen": 1461714944, "step": 1394 }, { "epoch": 1.5608391608391607, "grad_norm": 0.37558294317162283, "learning_rate": 2.3728050486740244e-06, "loss": 0.7028, "num_input_tokens_seen": 1462763520, "step": 1395 }, { "epoch": 1.561958041958042, "grad_norm": 0.36416036982383987, "learning_rate": 2.3698496713473937e-06, "loss": 0.7958, "num_input_tokens_seen": 1463812096, "step": 1396 }, { "epoch": 1.563076923076923, "grad_norm": 0.45923831364745715, "learning_rate": 2.3668944763864486e-06, "loss": 0.7713, "num_input_tokens_seen": 1464860672, "step": 1397 }, { "epoch": 1.5641958041958042, "grad_norm": 0.34922199395870035, "learning_rate": 2.363939467931989e-06, "loss": 0.8503, "num_input_tokens_seen": 1465909248, "step": 1398 }, { "epoch": 1.5653146853146853, "grad_norm": 0.42957906483316227, "learning_rate": 2.3609846501245494e-06, "loss": 0.7456, "num_input_tokens_seen": 1466957824, "step": 1399 }, { "epoch": 1.5664335664335665, "grad_norm": 0.3835882565743541, "learning_rate": 2.3580300271044e-06, "loss": 0.8466, "num_input_tokens_seen": 1468006400, "step": 1400 }, { "epoch": 1.5675524475524476, "grad_norm": 0.4018688954080891, "learning_rate": 2.3550756030115364e-06, "loss": 0.7182, "num_input_tokens_seen": 1469054976, "step": 1401 }, { "epoch": 1.5686713286713285, "grad_norm": 0.4635463283206513, "learning_rate": 2.3521213819856756e-06, "loss": 0.625, "num_input_tokens_seen": 1470103552, "step": 1402 }, { "epoch": 1.56979020979021, "grad_norm": 0.3965489983160831, "learning_rate": 2.349167368166251e-06, "loss": 0.7735, "num_input_tokens_seen": 1471152128, "step": 1403 }, { "epoch": 1.5709090909090908, "grad_norm": 0.4056640124818364, "learning_rate": 2.3462135656924046e-06, "loss": 0.6306, "num_input_tokens_seen": 1472200704, "step": 1404 }, { "epoch": 1.5720279720279722, "grad_norm": 0.386277310607133, "learning_rate": 2.343259978702984e-06, "loss": 0.62, "num_input_tokens_seen": 1473249280, "step": 1405 }, { "epoch": 1.573146853146853, "grad_norm": 0.35706520770801253, "learning_rate": 2.3403066113365323e-06, "loss": 0.6094, "num_input_tokens_seen": 1474297856, "step": 1406 }, { "epoch": 1.5742657342657342, "grad_norm": 0.3546653062307285, "learning_rate": 2.337353467731286e-06, "loss": 0.7058, "num_input_tokens_seen": 1475346432, "step": 1407 }, { "epoch": 1.5753846153846154, "grad_norm": 0.3957378202184631, "learning_rate": 2.33440055202517e-06, "loss": 0.6864, "num_input_tokens_seen": 1476395008, "step": 1408 }, { "epoch": 1.5765034965034965, "grad_norm": 0.4028100616555917, "learning_rate": 2.3314478683557863e-06, "loss": 0.6795, "num_input_tokens_seen": 1477443584, "step": 1409 }, { "epoch": 1.5776223776223777, "grad_norm": 0.39719819175984344, "learning_rate": 2.3284954208604154e-06, "loss": 0.7887, "num_input_tokens_seen": 1478492160, "step": 1410 }, { "epoch": 1.5787412587412586, "grad_norm": 1.4224017793009884, "learning_rate": 2.3255432136760026e-06, "loss": 0.6818, "num_input_tokens_seen": 1479540736, "step": 1411 }, { "epoch": 1.57986013986014, "grad_norm": 0.3992949548872193, "learning_rate": 2.322591250939162e-06, "loss": 0.7175, "num_input_tokens_seen": 1480589312, "step": 1412 }, { "epoch": 1.5809790209790209, "grad_norm": 0.47394986511477105, "learning_rate": 2.3196395367861605e-06, "loss": 0.6623, "num_input_tokens_seen": 1481637888, "step": 1413 }, { "epoch": 1.5820979020979022, "grad_norm": 0.4305743988522782, "learning_rate": 2.316688075352918e-06, "loss": 0.8412, "num_input_tokens_seen": 1482686464, "step": 1414 }, { "epoch": 1.5832167832167832, "grad_norm": 0.3583401349526158, "learning_rate": 2.3137368707750018e-06, "loss": 0.659, "num_input_tokens_seen": 1483735040, "step": 1415 }, { "epoch": 1.5843356643356643, "grad_norm": 0.34503126894946257, "learning_rate": 2.310785927187616e-06, "loss": 0.6678, "num_input_tokens_seen": 1484783616, "step": 1416 }, { "epoch": 1.5854545454545454, "grad_norm": 0.3657931381059671, "learning_rate": 2.3078352487256045e-06, "loss": 0.599, "num_input_tokens_seen": 1485832192, "step": 1417 }, { "epoch": 1.5865734265734266, "grad_norm": 0.35206100666702933, "learning_rate": 2.3048848395234337e-06, "loss": 0.7434, "num_input_tokens_seen": 1486880768, "step": 1418 }, { "epoch": 1.5876923076923077, "grad_norm": 0.41914069586948005, "learning_rate": 2.301934703715196e-06, "loss": 0.6261, "num_input_tokens_seen": 1487929344, "step": 1419 }, { "epoch": 1.5888111888111887, "grad_norm": 0.34431105616490026, "learning_rate": 2.2989848454346007e-06, "loss": 0.5925, "num_input_tokens_seen": 1488977920, "step": 1420 }, { "epoch": 1.58993006993007, "grad_norm": 0.3830931563241908, "learning_rate": 2.2960352688149657e-06, "loss": 0.6348, "num_input_tokens_seen": 1490026496, "step": 1421 }, { "epoch": 1.591048951048951, "grad_norm": 0.3540372546208806, "learning_rate": 2.293085977989219e-06, "loss": 0.6674, "num_input_tokens_seen": 1491075072, "step": 1422 }, { "epoch": 1.5921678321678323, "grad_norm": 0.3639336822429974, "learning_rate": 2.290136977089883e-06, "loss": 0.6147, "num_input_tokens_seen": 1492123648, "step": 1423 }, { "epoch": 1.5932867132867132, "grad_norm": 0.44253242635185924, "learning_rate": 2.287188270249077e-06, "loss": 0.733, "num_input_tokens_seen": 1493172224, "step": 1424 }, { "epoch": 1.5944055944055944, "grad_norm": 0.4037946719255645, "learning_rate": 2.2842398615985086e-06, "loss": 0.7475, "num_input_tokens_seen": 1494220800, "step": 1425 }, { "epoch": 1.5955244755244755, "grad_norm": 0.36323907615192613, "learning_rate": 2.281291755269464e-06, "loss": 0.5872, "num_input_tokens_seen": 1495269376, "step": 1426 }, { "epoch": 1.5966433566433567, "grad_norm": 0.4191625542780069, "learning_rate": 2.27834395539281e-06, "loss": 0.7137, "num_input_tokens_seen": 1496317952, "step": 1427 }, { "epoch": 1.5977622377622378, "grad_norm": 0.368727883638021, "learning_rate": 2.2753964660989813e-06, "loss": 0.6452, "num_input_tokens_seen": 1497366528, "step": 1428 }, { "epoch": 1.5988811188811187, "grad_norm": 0.4236711207836781, "learning_rate": 2.2724492915179787e-06, "loss": 0.6903, "num_input_tokens_seen": 1498415104, "step": 1429 }, { "epoch": 1.6, "grad_norm": 0.3741490340201765, "learning_rate": 2.269502435779362e-06, "loss": 0.7619, "num_input_tokens_seen": 1499463680, "step": 1430 }, { "epoch": 1.601118881118881, "grad_norm": 0.3862464018388154, "learning_rate": 2.2665559030122424e-06, "loss": 0.745, "num_input_tokens_seen": 1500512256, "step": 1431 }, { "epoch": 1.6022377622377624, "grad_norm": 0.3988008717439541, "learning_rate": 2.2636096973452813e-06, "loss": 0.5718, "num_input_tokens_seen": 1501560832, "step": 1432 }, { "epoch": 1.6033566433566433, "grad_norm": 0.34612553642871274, "learning_rate": 2.2606638229066802e-06, "loss": 0.7402, "num_input_tokens_seen": 1502609408, "step": 1433 }, { "epoch": 1.6044755244755244, "grad_norm": 0.3900980363376793, "learning_rate": 2.257718283824177e-06, "loss": 0.6025, "num_input_tokens_seen": 1503657984, "step": 1434 }, { "epoch": 1.6055944055944056, "grad_norm": 0.35872198046774767, "learning_rate": 2.254773084225039e-06, "loss": 0.7073, "num_input_tokens_seen": 1504706560, "step": 1435 }, { "epoch": 1.6067132867132867, "grad_norm": 0.36151012000169064, "learning_rate": 2.2518282282360597e-06, "loss": 0.6828, "num_input_tokens_seen": 1505755136, "step": 1436 }, { "epoch": 1.6078321678321679, "grad_norm": 0.38224445802232665, "learning_rate": 2.2488837199835477e-06, "loss": 0.8252, "num_input_tokens_seen": 1506803712, "step": 1437 }, { "epoch": 1.6089510489510488, "grad_norm": 0.35427002449315836, "learning_rate": 2.2459395635933267e-06, "loss": 0.6887, "num_input_tokens_seen": 1507852288, "step": 1438 }, { "epoch": 1.6100699300699302, "grad_norm": 0.3595896737335375, "learning_rate": 2.2429957631907285e-06, "loss": 0.6742, "num_input_tokens_seen": 1508900864, "step": 1439 }, { "epoch": 1.611188811188811, "grad_norm": 0.36659759590456403, "learning_rate": 2.240052322900583e-06, "loss": 0.6127, "num_input_tokens_seen": 1509949440, "step": 1440 }, { "epoch": 1.6123076923076924, "grad_norm": 0.35209095361748816, "learning_rate": 2.2371092468472193e-06, "loss": 0.6728, "num_input_tokens_seen": 1510998016, "step": 1441 }, { "epoch": 1.6134265734265734, "grad_norm": 0.364005667812455, "learning_rate": 2.2341665391544522e-06, "loss": 0.7416, "num_input_tokens_seen": 1512046592, "step": 1442 }, { "epoch": 1.6145454545454545, "grad_norm": 0.3724891845066251, "learning_rate": 2.2312242039455816e-06, "loss": 0.6883, "num_input_tokens_seen": 1513095168, "step": 1443 }, { "epoch": 1.6156643356643356, "grad_norm": 0.46741850029881393, "learning_rate": 2.2282822453433878e-06, "loss": 0.6225, "num_input_tokens_seen": 1514143744, "step": 1444 }, { "epoch": 1.6167832167832168, "grad_norm": 0.36971103910962194, "learning_rate": 2.2253406674701206e-06, "loss": 0.6233, "num_input_tokens_seen": 1515192320, "step": 1445 }, { "epoch": 1.617902097902098, "grad_norm": 0.36921393157599314, "learning_rate": 2.2223994744474986e-06, "loss": 0.7264, "num_input_tokens_seen": 1516240896, "step": 1446 }, { "epoch": 1.6190209790209789, "grad_norm": 0.37363199334768865, "learning_rate": 2.2194586703966976e-06, "loss": 0.7442, "num_input_tokens_seen": 1517289472, "step": 1447 }, { "epoch": 1.6201398601398602, "grad_norm": 0.3690039730025088, "learning_rate": 2.2165182594383532e-06, "loss": 0.6927, "num_input_tokens_seen": 1518338048, "step": 1448 }, { "epoch": 1.6212587412587411, "grad_norm": 0.43354195444827137, "learning_rate": 2.213578245692546e-06, "loss": 0.8128, "num_input_tokens_seen": 1519386624, "step": 1449 }, { "epoch": 1.6223776223776225, "grad_norm": 0.38010989263443784, "learning_rate": 2.210638633278802e-06, "loss": 0.7315, "num_input_tokens_seen": 1520435200, "step": 1450 }, { "epoch": 1.6234965034965034, "grad_norm": 0.37510811785224757, "learning_rate": 2.2076994263160863e-06, "loss": 0.7408, "num_input_tokens_seen": 1521483776, "step": 1451 }, { "epoch": 1.6246153846153846, "grad_norm": 0.4765756351666927, "learning_rate": 2.204760628922791e-06, "loss": 0.6227, "num_input_tokens_seen": 1522532352, "step": 1452 }, { "epoch": 1.6257342657342657, "grad_norm": 0.4063106895420182, "learning_rate": 2.20182224521674e-06, "loss": 0.7336, "num_input_tokens_seen": 1523580928, "step": 1453 }, { "epoch": 1.6268531468531469, "grad_norm": 0.3658456120940129, "learning_rate": 2.1988842793151743e-06, "loss": 0.6759, "num_input_tokens_seen": 1524629504, "step": 1454 }, { "epoch": 1.627972027972028, "grad_norm": 0.3672109862882697, "learning_rate": 2.1959467353347494e-06, "loss": 0.6754, "num_input_tokens_seen": 1525678080, "step": 1455 }, { "epoch": 1.6290909090909091, "grad_norm": 0.3592585250595939, "learning_rate": 2.193009617391532e-06, "loss": 0.6758, "num_input_tokens_seen": 1526726656, "step": 1456 }, { "epoch": 1.6302097902097903, "grad_norm": 0.3818185063154414, "learning_rate": 2.190072929600989e-06, "loss": 0.6846, "num_input_tokens_seen": 1527775232, "step": 1457 }, { "epoch": 1.6313286713286712, "grad_norm": 0.45978773451862454, "learning_rate": 2.1871366760779857e-06, "loss": 0.7539, "num_input_tokens_seen": 1528823808, "step": 1458 }, { "epoch": 1.6324475524475526, "grad_norm": 0.3367487632889278, "learning_rate": 2.1842008609367794e-06, "loss": 0.6996, "num_input_tokens_seen": 1529872384, "step": 1459 }, { "epoch": 1.6335664335664335, "grad_norm": 0.36803830704522544, "learning_rate": 2.1812654882910134e-06, "loss": 0.6688, "num_input_tokens_seen": 1530920960, "step": 1460 }, { "epoch": 1.6346853146853146, "grad_norm": 0.39816957922984453, "learning_rate": 2.1783305622537106e-06, "loss": 0.642, "num_input_tokens_seen": 1531969536, "step": 1461 }, { "epoch": 1.6358041958041958, "grad_norm": 0.3699826193122252, "learning_rate": 2.175396086937265e-06, "loss": 0.7306, "num_input_tokens_seen": 1533018112, "step": 1462 }, { "epoch": 1.636923076923077, "grad_norm": 0.4451687123554849, "learning_rate": 2.1724620664534453e-06, "loss": 0.7331, "num_input_tokens_seen": 1534066688, "step": 1463 }, { "epoch": 1.638041958041958, "grad_norm": 0.4497924552281637, "learning_rate": 2.169528504913378e-06, "loss": 0.7023, "num_input_tokens_seen": 1535115264, "step": 1464 }, { "epoch": 1.6391608391608392, "grad_norm": 0.3510088757820524, "learning_rate": 2.166595406427548e-06, "loss": 0.6027, "num_input_tokens_seen": 1536163840, "step": 1465 }, { "epoch": 1.6402797202797204, "grad_norm": 0.3963714426716292, "learning_rate": 2.163662775105792e-06, "loss": 0.8737, "num_input_tokens_seen": 1537212416, "step": 1466 }, { "epoch": 1.6413986013986013, "grad_norm": 0.37246439994294844, "learning_rate": 2.1607306150572905e-06, "loss": 0.69, "num_input_tokens_seen": 1538260992, "step": 1467 }, { "epoch": 1.6425174825174826, "grad_norm": 0.3579774296651962, "learning_rate": 2.1577989303905657e-06, "loss": 0.6028, "num_input_tokens_seen": 1539309568, "step": 1468 }, { "epoch": 1.6436363636363636, "grad_norm": 0.3665929683189659, "learning_rate": 2.154867725213472e-06, "loss": 0.6556, "num_input_tokens_seen": 1540358144, "step": 1469 }, { "epoch": 1.6447552447552447, "grad_norm": 0.3550355506997787, "learning_rate": 2.151937003633193e-06, "loss": 0.7768, "num_input_tokens_seen": 1541406720, "step": 1470 }, { "epoch": 1.6458741258741258, "grad_norm": 0.3500447727996359, "learning_rate": 2.149006769756234e-06, "loss": 0.774, "num_input_tokens_seen": 1542455296, "step": 1471 }, { "epoch": 1.646993006993007, "grad_norm": 0.3361636463694964, "learning_rate": 2.146077027688418e-06, "loss": 0.7, "num_input_tokens_seen": 1543503872, "step": 1472 }, { "epoch": 1.6481118881118881, "grad_norm": 0.36960505934898524, "learning_rate": 2.1431477815348775e-06, "loss": 0.6505, "num_input_tokens_seen": 1544552448, "step": 1473 }, { "epoch": 1.6492307692307693, "grad_norm": 0.44557226978962405, "learning_rate": 2.1402190354000502e-06, "loss": 0.7912, "num_input_tokens_seen": 1545601024, "step": 1474 }, { "epoch": 1.6503496503496504, "grad_norm": 0.3573915016479245, "learning_rate": 2.1372907933876745e-06, "loss": 0.6772, "num_input_tokens_seen": 1546649600, "step": 1475 }, { "epoch": 1.6514685314685313, "grad_norm": 0.3661902663222269, "learning_rate": 2.134363059600781e-06, "loss": 0.681, "num_input_tokens_seen": 1547698176, "step": 1476 }, { "epoch": 1.6525874125874127, "grad_norm": 0.3380600362910697, "learning_rate": 2.1314358381416906e-06, "loss": 0.6805, "num_input_tokens_seen": 1548746752, "step": 1477 }, { "epoch": 1.6537062937062936, "grad_norm": 0.3415726128200725, "learning_rate": 2.1285091331120028e-06, "loss": 0.6777, "num_input_tokens_seen": 1549795328, "step": 1478 }, { "epoch": 1.6548251748251748, "grad_norm": 0.3710138717237046, "learning_rate": 2.125582948612595e-06, "loss": 0.7613, "num_input_tokens_seen": 1550843904, "step": 1479 }, { "epoch": 1.655944055944056, "grad_norm": 0.3696138653444523, "learning_rate": 2.1226572887436175e-06, "loss": 0.8748, "num_input_tokens_seen": 1551892480, "step": 1480 }, { "epoch": 1.657062937062937, "grad_norm": 0.4684918982422934, "learning_rate": 2.1197321576044803e-06, "loss": 0.8277, "num_input_tokens_seen": 1552941056, "step": 1481 }, { "epoch": 1.6581818181818182, "grad_norm": 0.3552604013018774, "learning_rate": 2.1168075592938592e-06, "loss": 0.6905, "num_input_tokens_seen": 1553989632, "step": 1482 }, { "epoch": 1.6593006993006993, "grad_norm": 0.3743123389451923, "learning_rate": 2.1138834979096778e-06, "loss": 0.6963, "num_input_tokens_seen": 1555038208, "step": 1483 }, { "epoch": 1.6604195804195805, "grad_norm": 0.36013549095449693, "learning_rate": 2.1109599775491096e-06, "loss": 0.6557, "num_input_tokens_seen": 1556086784, "step": 1484 }, { "epoch": 1.6615384615384614, "grad_norm": 0.341398397400898, "learning_rate": 2.1080370023085713e-06, "loss": 0.683, "num_input_tokens_seen": 1557135360, "step": 1485 }, { "epoch": 1.6626573426573428, "grad_norm": 0.3295345221716557, "learning_rate": 2.1051145762837115e-06, "loss": 0.6941, "num_input_tokens_seen": 1558183936, "step": 1486 }, { "epoch": 1.6637762237762237, "grad_norm": 0.3630658892572225, "learning_rate": 2.102192703569416e-06, "loss": 0.7029, "num_input_tokens_seen": 1559232512, "step": 1487 }, { "epoch": 1.664895104895105, "grad_norm": 1.6994990107676227, "learning_rate": 2.0992713882597883e-06, "loss": 0.5943, "num_input_tokens_seen": 1560281088, "step": 1488 }, { "epoch": 1.666013986013986, "grad_norm": 0.3625234148440345, "learning_rate": 2.0963506344481556e-06, "loss": 0.7091, "num_input_tokens_seen": 1561329664, "step": 1489 }, { "epoch": 1.6671328671328671, "grad_norm": 0.36158769030639054, "learning_rate": 2.0934304462270568e-06, "loss": 0.6906, "num_input_tokens_seen": 1562378240, "step": 1490 }, { "epoch": 1.6682517482517483, "grad_norm": 0.34990496368263135, "learning_rate": 2.0905108276882356e-06, "loss": 0.8045, "num_input_tokens_seen": 1563426816, "step": 1491 }, { "epoch": 1.6693706293706294, "grad_norm": 0.40822922774482007, "learning_rate": 2.087591782922645e-06, "loss": 0.6821, "num_input_tokens_seen": 1564475392, "step": 1492 }, { "epoch": 1.6704895104895106, "grad_norm": 0.3734319286303249, "learning_rate": 2.0846733160204244e-06, "loss": 0.7651, "num_input_tokens_seen": 1565523968, "step": 1493 }, { "epoch": 1.6716083916083915, "grad_norm": 0.42915970376294044, "learning_rate": 2.081755431070911e-06, "loss": 0.6843, "num_input_tokens_seen": 1566572544, "step": 1494 }, { "epoch": 1.6727272727272728, "grad_norm": 0.614891637041917, "learning_rate": 2.0788381321626237e-06, "loss": 0.7936, "num_input_tokens_seen": 1567621120, "step": 1495 }, { "epoch": 1.6738461538461538, "grad_norm": 0.42194029304031444, "learning_rate": 2.075921423383258e-06, "loss": 0.7214, "num_input_tokens_seen": 1568669696, "step": 1496 }, { "epoch": 1.6749650349650351, "grad_norm": 0.4259389298234062, "learning_rate": 2.0730053088196883e-06, "loss": 0.6312, "num_input_tokens_seen": 1569718272, "step": 1497 }, { "epoch": 1.676083916083916, "grad_norm": 0.39828854297370253, "learning_rate": 2.07008979255795e-06, "loss": 0.8513, "num_input_tokens_seen": 1570766848, "step": 1498 }, { "epoch": 1.6772027972027972, "grad_norm": 0.35547525928672413, "learning_rate": 2.0671748786832447e-06, "loss": 0.8586, "num_input_tokens_seen": 1571815424, "step": 1499 }, { "epoch": 1.6783216783216783, "grad_norm": 0.41037725944961945, "learning_rate": 2.064260571279928e-06, "loss": 0.6795, "num_input_tokens_seen": 1572864000, "step": 1500 }, { "epoch": 1.6783216783216783, "eval_loss": 0.7240810394287109, "eval_runtime": 246.8606, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 1572864000, "step": 1500 }, { "epoch": 1.6794405594405595, "grad_norm": 0.3585866868978481, "learning_rate": 2.061346874431507e-06, "loss": 0.7162, "num_input_tokens_seen": 1573912576, "step": 1501 }, { "epoch": 1.6805594405594406, "grad_norm": 0.3597551154404562, "learning_rate": 2.0584337922206303e-06, "loss": 0.6341, "num_input_tokens_seen": 1574961152, "step": 1502 }, { "epoch": 1.6816783216783215, "grad_norm": 0.3855352694514827, "learning_rate": 2.0555213287290886e-06, "loss": 0.6942, "num_input_tokens_seen": 1576009728, "step": 1503 }, { "epoch": 1.682797202797203, "grad_norm": 0.38510503872372237, "learning_rate": 2.052609488037805e-06, "loss": 0.7038, "num_input_tokens_seen": 1577058304, "step": 1504 }, { "epoch": 1.6839160839160838, "grad_norm": 0.37287031114429176, "learning_rate": 2.0496982742268273e-06, "loss": 0.7551, "num_input_tokens_seen": 1578106880, "step": 1505 }, { "epoch": 1.6850349650349652, "grad_norm": 0.3912874395531707, "learning_rate": 2.04678769137533e-06, "loss": 0.6798, "num_input_tokens_seen": 1579155456, "step": 1506 }, { "epoch": 1.6861538461538461, "grad_norm": 0.38082038018879333, "learning_rate": 2.043877743561598e-06, "loss": 0.7259, "num_input_tokens_seen": 1580204032, "step": 1507 }, { "epoch": 1.6872727272727273, "grad_norm": 0.3584211102333867, "learning_rate": 2.0409684348630292e-06, "loss": 0.6198, "num_input_tokens_seen": 1581252608, "step": 1508 }, { "epoch": 1.6883916083916084, "grad_norm": 0.3669765609091533, "learning_rate": 2.038059769356127e-06, "loss": 0.6017, "num_input_tokens_seen": 1582301184, "step": 1509 }, { "epoch": 1.6895104895104895, "grad_norm": 0.39379438125130334, "learning_rate": 2.0351517511164903e-06, "loss": 0.732, "num_input_tokens_seen": 1583349760, "step": 1510 }, { "epoch": 1.6906293706293707, "grad_norm": 0.3620945081872885, "learning_rate": 2.032244384218815e-06, "loss": 0.6047, "num_input_tokens_seen": 1584398336, "step": 1511 }, { "epoch": 1.6917482517482516, "grad_norm": 0.3543208066325359, "learning_rate": 2.0293376727368798e-06, "loss": 0.6204, "num_input_tokens_seen": 1585446912, "step": 1512 }, { "epoch": 1.692867132867133, "grad_norm": 0.3825118860346632, "learning_rate": 2.02643162074355e-06, "loss": 0.5819, "num_input_tokens_seen": 1586495488, "step": 1513 }, { "epoch": 1.693986013986014, "grad_norm": 0.3930833199985328, "learning_rate": 2.0235262323107633e-06, "loss": 0.6841, "num_input_tokens_seen": 1587544064, "step": 1514 }, { "epoch": 1.6951048951048953, "grad_norm": 0.38928060301032075, "learning_rate": 2.020621511509528e-06, "loss": 0.6528, "num_input_tokens_seen": 1588592640, "step": 1515 }, { "epoch": 1.6962237762237762, "grad_norm": 0.34647588339543134, "learning_rate": 2.0177174624099193e-06, "loss": 0.6367, "num_input_tokens_seen": 1589641216, "step": 1516 }, { "epoch": 1.6973426573426573, "grad_norm": 0.34848012418333774, "learning_rate": 2.014814089081067e-06, "loss": 0.763, "num_input_tokens_seen": 1590689792, "step": 1517 }, { "epoch": 1.6984615384615385, "grad_norm": 0.40641697992243897, "learning_rate": 2.0119113955911596e-06, "loss": 0.8083, "num_input_tokens_seen": 1591738368, "step": 1518 }, { "epoch": 1.6995804195804196, "grad_norm": 0.3507101312889027, "learning_rate": 2.0090093860074273e-06, "loss": 0.6938, "num_input_tokens_seen": 1592786944, "step": 1519 }, { "epoch": 1.7006993006993008, "grad_norm": 0.38246890511813814, "learning_rate": 2.006108064396146e-06, "loss": 0.862, "num_input_tokens_seen": 1593835520, "step": 1520 }, { "epoch": 1.7018181818181817, "grad_norm": 0.3967275683315675, "learning_rate": 2.0032074348226268e-06, "loss": 0.7229, "num_input_tokens_seen": 1594884096, "step": 1521 }, { "epoch": 1.702937062937063, "grad_norm": 0.366977644801669, "learning_rate": 2.000307501351209e-06, "loss": 0.7758, "num_input_tokens_seen": 1595932672, "step": 1522 }, { "epoch": 1.704055944055944, "grad_norm": 0.35295783808880327, "learning_rate": 1.997408268045259e-06, "loss": 0.6983, "num_input_tokens_seen": 1596981248, "step": 1523 }, { "epoch": 1.7051748251748253, "grad_norm": 0.37171527297191187, "learning_rate": 1.994509738967161e-06, "loss": 0.8414, "num_input_tokens_seen": 1598029824, "step": 1524 }, { "epoch": 1.7062937062937062, "grad_norm": 0.3461485236990216, "learning_rate": 1.9916119181783135e-06, "loss": 0.6005, "num_input_tokens_seen": 1599078400, "step": 1525 }, { "epoch": 1.7074125874125874, "grad_norm": 0.3656822450094739, "learning_rate": 1.9887148097391217e-06, "loss": 0.6042, "num_input_tokens_seen": 1600126976, "step": 1526 }, { "epoch": 1.7085314685314685, "grad_norm": 0.37135908992213634, "learning_rate": 1.9858184177089915e-06, "loss": 0.6072, "num_input_tokens_seen": 1601175552, "step": 1527 }, { "epoch": 1.7096503496503497, "grad_norm": 0.35731488920392573, "learning_rate": 1.982922746146327e-06, "loss": 0.5928, "num_input_tokens_seen": 1602224128, "step": 1528 }, { "epoch": 1.7107692307692308, "grad_norm": 0.36918179391318146, "learning_rate": 1.9800277991085217e-06, "loss": 0.6829, "num_input_tokens_seen": 1603272704, "step": 1529 }, { "epoch": 1.7118881118881117, "grad_norm": 0.7638337216262671, "learning_rate": 1.9771335806519544e-06, "loss": 0.784, "num_input_tokens_seen": 1604321280, "step": 1530 }, { "epoch": 1.713006993006993, "grad_norm": 0.3772438172226753, "learning_rate": 1.9742400948319838e-06, "loss": 0.6659, "num_input_tokens_seen": 1605369856, "step": 1531 }, { "epoch": 1.714125874125874, "grad_norm": 0.43038702076823443, "learning_rate": 1.9713473457029384e-06, "loss": 0.7278, "num_input_tokens_seen": 1606418432, "step": 1532 }, { "epoch": 1.7152447552447554, "grad_norm": 0.4567655852706425, "learning_rate": 1.9684553373181197e-06, "loss": 0.7213, "num_input_tokens_seen": 1607467008, "step": 1533 }, { "epoch": 1.7163636363636363, "grad_norm": 0.39418633672444, "learning_rate": 1.965564073729787e-06, "loss": 0.6836, "num_input_tokens_seen": 1608515584, "step": 1534 }, { "epoch": 1.7174825174825175, "grad_norm": 0.39167217706047713, "learning_rate": 1.962673558989158e-06, "loss": 0.6943, "num_input_tokens_seen": 1609564160, "step": 1535 }, { "epoch": 1.7186013986013986, "grad_norm": 0.40729982886215554, "learning_rate": 1.959783797146402e-06, "loss": 0.6404, "num_input_tokens_seen": 1610612736, "step": 1536 }, { "epoch": 1.7197202797202797, "grad_norm": 0.36680516154398474, "learning_rate": 1.956894792250631e-06, "loss": 0.6365, "num_input_tokens_seen": 1611661312, "step": 1537 }, { "epoch": 1.7208391608391609, "grad_norm": 0.3609370240196091, "learning_rate": 1.9540065483498978e-06, "loss": 0.6816, "num_input_tokens_seen": 1612709888, "step": 1538 }, { "epoch": 1.7219580419580418, "grad_norm": 0.4310248188046204, "learning_rate": 1.9511190694911875e-06, "loss": 0.6292, "num_input_tokens_seen": 1613758464, "step": 1539 }, { "epoch": 1.7230769230769232, "grad_norm": 0.359576421641524, "learning_rate": 1.948232359720416e-06, "loss": 0.5989, "num_input_tokens_seen": 1614807040, "step": 1540 }, { "epoch": 1.724195804195804, "grad_norm": 0.35971387763335055, "learning_rate": 1.9453464230824186e-06, "loss": 0.6825, "num_input_tokens_seen": 1615855616, "step": 1541 }, { "epoch": 1.7253146853146855, "grad_norm": 0.37328754142344645, "learning_rate": 1.9424612636209503e-06, "loss": 0.7772, "num_input_tokens_seen": 1616904192, "step": 1542 }, { "epoch": 1.7264335664335664, "grad_norm": 0.3726121644784703, "learning_rate": 1.939576885378674e-06, "loss": 0.6889, "num_input_tokens_seen": 1617952768, "step": 1543 }, { "epoch": 1.7275524475524475, "grad_norm": 0.3534001349869352, "learning_rate": 1.9366932923971583e-06, "loss": 0.6516, "num_input_tokens_seen": 1619001344, "step": 1544 }, { "epoch": 1.7286713286713287, "grad_norm": 0.3786462172895107, "learning_rate": 1.9338104887168753e-06, "loss": 0.6828, "num_input_tokens_seen": 1620049920, "step": 1545 }, { "epoch": 1.7297902097902098, "grad_norm": 0.363987217181334, "learning_rate": 1.9309284783771857e-06, "loss": 0.727, "num_input_tokens_seen": 1621098496, "step": 1546 }, { "epoch": 1.730909090909091, "grad_norm": 0.3503153111176502, "learning_rate": 1.9280472654163436e-06, "loss": 0.613, "num_input_tokens_seen": 1622147072, "step": 1547 }, { "epoch": 1.7320279720279719, "grad_norm": 0.3846529047803018, "learning_rate": 1.9251668538714814e-06, "loss": 0.6623, "num_input_tokens_seen": 1623195648, "step": 1548 }, { "epoch": 1.7331468531468532, "grad_norm": 0.4071938576659285, "learning_rate": 1.9222872477786124e-06, "loss": 0.7716, "num_input_tokens_seen": 1624244224, "step": 1549 }, { "epoch": 1.7342657342657342, "grad_norm": 0.3945288808508757, "learning_rate": 1.919408451172619e-06, "loss": 0.7813, "num_input_tokens_seen": 1625292800, "step": 1550 }, { "epoch": 1.7353846153846155, "grad_norm": 0.3455127380131485, "learning_rate": 1.916530468087249e-06, "loss": 0.7538, "num_input_tokens_seen": 1626341376, "step": 1551 }, { "epoch": 1.7365034965034964, "grad_norm": 0.35302266539544525, "learning_rate": 1.9136533025551126e-06, "loss": 0.7096, "num_input_tokens_seen": 1627389952, "step": 1552 }, { "epoch": 1.7376223776223776, "grad_norm": 0.35651711613964776, "learning_rate": 1.9107769586076716e-06, "loss": 0.6097, "num_input_tokens_seen": 1628438528, "step": 1553 }, { "epoch": 1.7387412587412587, "grad_norm": 0.34155626725070953, "learning_rate": 1.9079014402752392e-06, "loss": 0.683, "num_input_tokens_seen": 1629487104, "step": 1554 }, { "epoch": 1.7398601398601399, "grad_norm": 0.34127096380551525, "learning_rate": 1.9050267515869709e-06, "loss": 0.6598, "num_input_tokens_seen": 1630535680, "step": 1555 }, { "epoch": 1.740979020979021, "grad_norm": 0.42290456557948686, "learning_rate": 1.9021528965708576e-06, "loss": 0.633, "num_input_tokens_seen": 1631584256, "step": 1556 }, { "epoch": 1.742097902097902, "grad_norm": 0.4121916926264618, "learning_rate": 1.8992798792537265e-06, "loss": 0.6395, "num_input_tokens_seen": 1632632832, "step": 1557 }, { "epoch": 1.7432167832167833, "grad_norm": 0.39154195468979414, "learning_rate": 1.8964077036612262e-06, "loss": 0.6078, "num_input_tokens_seen": 1633681408, "step": 1558 }, { "epoch": 1.7443356643356642, "grad_norm": 0.36842799454307523, "learning_rate": 1.8935363738178288e-06, "loss": 0.7763, "num_input_tokens_seen": 1634729984, "step": 1559 }, { "epoch": 1.7454545454545456, "grad_norm": 0.397080443328502, "learning_rate": 1.8906658937468205e-06, "loss": 0.6203, "num_input_tokens_seen": 1635778560, "step": 1560 }, { "epoch": 1.7465734265734265, "grad_norm": 0.37562584339032534, "learning_rate": 1.8877962674702977e-06, "loss": 0.7323, "num_input_tokens_seen": 1636827136, "step": 1561 }, { "epoch": 1.7476923076923077, "grad_norm": 0.3465729593554075, "learning_rate": 1.8849274990091599e-06, "loss": 0.738, "num_input_tokens_seen": 1637875712, "step": 1562 }, { "epoch": 1.7488111888111888, "grad_norm": 0.36001853566013076, "learning_rate": 1.8820595923831025e-06, "loss": 0.6101, "num_input_tokens_seen": 1638924288, "step": 1563 }, { "epoch": 1.74993006993007, "grad_norm": 0.36561682357305275, "learning_rate": 1.8791925516106169e-06, "loss": 0.5869, "num_input_tokens_seen": 1639972864, "step": 1564 }, { "epoch": 1.751048951048951, "grad_norm": 0.3774041435821508, "learning_rate": 1.876326380708979e-06, "loss": 0.76, "num_input_tokens_seen": 1641021440, "step": 1565 }, { "epoch": 1.752167832167832, "grad_norm": 0.35639151698741744, "learning_rate": 1.8734610836942467e-06, "loss": 0.607, "num_input_tokens_seen": 1642070016, "step": 1566 }, { "epoch": 1.7532867132867134, "grad_norm": 0.33476784932412224, "learning_rate": 1.8705966645812544e-06, "loss": 0.6074, "num_input_tokens_seen": 1643118592, "step": 1567 }, { "epoch": 1.7544055944055943, "grad_norm": 0.39229365495038426, "learning_rate": 1.8677331273836025e-06, "loss": 0.7481, "num_input_tokens_seen": 1644167168, "step": 1568 }, { "epoch": 1.7555244755244757, "grad_norm": 0.5014908424311935, "learning_rate": 1.8648704761136604e-06, "loss": 0.5741, "num_input_tokens_seen": 1645215744, "step": 1569 }, { "epoch": 1.7566433566433566, "grad_norm": 0.34140432585382163, "learning_rate": 1.8620087147825528e-06, "loss": 0.6466, "num_input_tokens_seen": 1646264320, "step": 1570 }, { "epoch": 1.7577622377622377, "grad_norm": 0.4140342032109525, "learning_rate": 1.8591478474001601e-06, "loss": 0.7783, "num_input_tokens_seen": 1647312896, "step": 1571 }, { "epoch": 1.7588811188811189, "grad_norm": 0.6411170315670462, "learning_rate": 1.8562878779751074e-06, "loss": 0.6752, "num_input_tokens_seen": 1648361472, "step": 1572 }, { "epoch": 1.76, "grad_norm": 0.34464017332714303, "learning_rate": 1.8534288105147644e-06, "loss": 0.741, "num_input_tokens_seen": 1649410048, "step": 1573 }, { "epoch": 1.7611188811188812, "grad_norm": 0.36541436697409846, "learning_rate": 1.850570649025234e-06, "loss": 0.6407, "num_input_tokens_seen": 1650458624, "step": 1574 }, { "epoch": 1.762237762237762, "grad_norm": 0.3704636233105469, "learning_rate": 1.8477133975113516e-06, "loss": 0.5687, "num_input_tokens_seen": 1651507200, "step": 1575 }, { "epoch": 1.7633566433566434, "grad_norm": 0.33369394978611566, "learning_rate": 1.8448570599766772e-06, "loss": 0.7221, "num_input_tokens_seen": 1652555776, "step": 1576 }, { "epoch": 1.7644755244755244, "grad_norm": 0.374105117579175, "learning_rate": 1.8420016404234897e-06, "loss": 0.7117, "num_input_tokens_seen": 1653604352, "step": 1577 }, { "epoch": 1.7655944055944057, "grad_norm": 0.3596505249740274, "learning_rate": 1.8391471428527835e-06, "loss": 0.653, "num_input_tokens_seen": 1654652928, "step": 1578 }, { "epoch": 1.7667132867132866, "grad_norm": 0.3657305512869846, "learning_rate": 1.836293571264258e-06, "loss": 0.6882, "num_input_tokens_seen": 1655701504, "step": 1579 }, { "epoch": 1.7678321678321678, "grad_norm": 0.3653212597563551, "learning_rate": 1.8334409296563165e-06, "loss": 0.624, "num_input_tokens_seen": 1656750080, "step": 1580 }, { "epoch": 1.768951048951049, "grad_norm": 0.37619757106557355, "learning_rate": 1.830589222026062e-06, "loss": 0.8613, "num_input_tokens_seen": 1657798656, "step": 1581 }, { "epoch": 1.77006993006993, "grad_norm": 0.38915486972914975, "learning_rate": 1.8277384523692827e-06, "loss": 0.7239, "num_input_tokens_seen": 1658847232, "step": 1582 }, { "epoch": 1.7711888111888112, "grad_norm": 0.40003422756035456, "learning_rate": 1.8248886246804598e-06, "loss": 0.6033, "num_input_tokens_seen": 1659895808, "step": 1583 }, { "epoch": 1.7723076923076924, "grad_norm": 0.3692611481232594, "learning_rate": 1.8220397429527484e-06, "loss": 0.5767, "num_input_tokens_seen": 1660944384, "step": 1584 }, { "epoch": 1.7734265734265735, "grad_norm": 0.4063639525829798, "learning_rate": 1.819191811177982e-06, "loss": 0.6316, "num_input_tokens_seen": 1661992960, "step": 1585 }, { "epoch": 1.7745454545454544, "grad_norm": 0.43653164660380217, "learning_rate": 1.8163448333466622e-06, "loss": 0.8537, "num_input_tokens_seen": 1663041536, "step": 1586 }, { "epoch": 1.7756643356643358, "grad_norm": 0.39014992569076534, "learning_rate": 1.813498813447951e-06, "loss": 0.7122, "num_input_tokens_seen": 1664090112, "step": 1587 }, { "epoch": 1.7767832167832167, "grad_norm": 0.3407199015686125, "learning_rate": 1.8106537554696736e-06, "loss": 0.649, "num_input_tokens_seen": 1665138688, "step": 1588 }, { "epoch": 1.7779020979020979, "grad_norm": 0.34969999267262986, "learning_rate": 1.8078096633983023e-06, "loss": 0.6713, "num_input_tokens_seen": 1666187264, "step": 1589 }, { "epoch": 1.779020979020979, "grad_norm": 0.41800401748573707, "learning_rate": 1.804966541218959e-06, "loss": 0.7523, "num_input_tokens_seen": 1667235840, "step": 1590 }, { "epoch": 1.7801398601398601, "grad_norm": 0.3488130152340182, "learning_rate": 1.8021243929154063e-06, "loss": 0.769, "num_input_tokens_seen": 1668284416, "step": 1591 }, { "epoch": 1.7812587412587413, "grad_norm": 0.3542828658361714, "learning_rate": 1.7992832224700391e-06, "loss": 0.6585, "num_input_tokens_seen": 1669332992, "step": 1592 }, { "epoch": 1.7823776223776224, "grad_norm": 0.3725655481579693, "learning_rate": 1.7964430338638883e-06, "loss": 0.7053, "num_input_tokens_seen": 1670381568, "step": 1593 }, { "epoch": 1.7834965034965036, "grad_norm": 0.45292083998845595, "learning_rate": 1.793603831076602e-06, "loss": 0.686, "num_input_tokens_seen": 1671430144, "step": 1594 }, { "epoch": 1.7846153846153845, "grad_norm": 0.4015345562898075, "learning_rate": 1.7907656180864519e-06, "loss": 0.6467, "num_input_tokens_seen": 1672478720, "step": 1595 }, { "epoch": 1.7857342657342659, "grad_norm": 0.3514113834476507, "learning_rate": 1.7879283988703223e-06, "loss": 0.6553, "num_input_tokens_seen": 1673527296, "step": 1596 }, { "epoch": 1.7868531468531468, "grad_norm": 0.3662032492823558, "learning_rate": 1.7850921774037012e-06, "loss": 0.5995, "num_input_tokens_seen": 1674575872, "step": 1597 }, { "epoch": 1.787972027972028, "grad_norm": 0.3545946442448354, "learning_rate": 1.7822569576606833e-06, "loss": 0.7491, "num_input_tokens_seen": 1675624448, "step": 1598 }, { "epoch": 1.789090909090909, "grad_norm": 0.3519553286683113, "learning_rate": 1.7794227436139569e-06, "loss": 0.6929, "num_input_tokens_seen": 1676673024, "step": 1599 }, { "epoch": 1.7902097902097902, "grad_norm": 0.3391074386584108, "learning_rate": 1.776589539234803e-06, "loss": 0.6819, "num_input_tokens_seen": 1677721600, "step": 1600 }, { "epoch": 1.7913286713286714, "grad_norm": 0.3577788640908163, "learning_rate": 1.7737573484930853e-06, "loss": 0.7201, "num_input_tokens_seen": 1678770176, "step": 1601 }, { "epoch": 1.7924475524475525, "grad_norm": 0.4278360301881774, "learning_rate": 1.770926175357251e-06, "loss": 0.7498, "num_input_tokens_seen": 1679818752, "step": 1602 }, { "epoch": 1.7935664335664336, "grad_norm": 0.35473065243806573, "learning_rate": 1.7680960237943174e-06, "loss": 0.5936, "num_input_tokens_seen": 1680867328, "step": 1603 }, { "epoch": 1.7946853146853146, "grad_norm": 0.3815754458038622, "learning_rate": 1.7652668977698714e-06, "loss": 1.0174, "num_input_tokens_seen": 1681915904, "step": 1604 }, { "epoch": 1.795804195804196, "grad_norm": 0.369652048346652, "learning_rate": 1.7624388012480656e-06, "loss": 0.7075, "num_input_tokens_seen": 1682964480, "step": 1605 }, { "epoch": 1.7969230769230768, "grad_norm": 0.34832961938443296, "learning_rate": 1.7596117381916068e-06, "loss": 0.7212, "num_input_tokens_seen": 1684013056, "step": 1606 }, { "epoch": 1.798041958041958, "grad_norm": 0.3424363305704864, "learning_rate": 1.756785712561756e-06, "loss": 0.6554, "num_input_tokens_seen": 1685061632, "step": 1607 }, { "epoch": 1.7991608391608391, "grad_norm": 0.4039025309030356, "learning_rate": 1.7539607283183191e-06, "loss": 0.8153, "num_input_tokens_seen": 1686110208, "step": 1608 }, { "epoch": 1.8002797202797203, "grad_norm": 0.3658471492737092, "learning_rate": 1.7511367894196426e-06, "loss": 0.9455, "num_input_tokens_seen": 1687158784, "step": 1609 }, { "epoch": 1.8013986013986014, "grad_norm": 0.3476445112270428, "learning_rate": 1.7483138998226102e-06, "loss": 0.6293, "num_input_tokens_seen": 1688207360, "step": 1610 }, { "epoch": 1.8025174825174826, "grad_norm": 0.3551408624630902, "learning_rate": 1.7454920634826334e-06, "loss": 0.6063, "num_input_tokens_seen": 1689255936, "step": 1611 }, { "epoch": 1.8036363636363637, "grad_norm": 0.35062916479373646, "learning_rate": 1.7426712843536497e-06, "loss": 0.6952, "num_input_tokens_seen": 1690304512, "step": 1612 }, { "epoch": 1.8047552447552446, "grad_norm": 0.36037335445704105, "learning_rate": 1.7398515663881117e-06, "loss": 0.7721, "num_input_tokens_seen": 1691353088, "step": 1613 }, { "epoch": 1.805874125874126, "grad_norm": 0.3390547993985359, "learning_rate": 1.7370329135369906e-06, "loss": 0.5806, "num_input_tokens_seen": 1692401664, "step": 1614 }, { "epoch": 1.806993006993007, "grad_norm": 0.3387026378277829, "learning_rate": 1.73421532974976e-06, "loss": 0.6257, "num_input_tokens_seen": 1693450240, "step": 1615 }, { "epoch": 1.8081118881118883, "grad_norm": 0.37039709103021795, "learning_rate": 1.731398818974398e-06, "loss": 0.6538, "num_input_tokens_seen": 1694498816, "step": 1616 }, { "epoch": 1.8092307692307692, "grad_norm": 0.35537967125115927, "learning_rate": 1.7285833851573802e-06, "loss": 0.7351, "num_input_tokens_seen": 1695547392, "step": 1617 }, { "epoch": 1.8103496503496503, "grad_norm": 0.37896047084677437, "learning_rate": 1.72576903224367e-06, "loss": 0.6641, "num_input_tokens_seen": 1696595968, "step": 1618 }, { "epoch": 1.8114685314685315, "grad_norm": 0.3772574259600305, "learning_rate": 1.7229557641767191e-06, "loss": 0.6571, "num_input_tokens_seen": 1697644544, "step": 1619 }, { "epoch": 1.8125874125874126, "grad_norm": 0.38507772025443404, "learning_rate": 1.7201435848984582e-06, "loss": 0.7547, "num_input_tokens_seen": 1698693120, "step": 1620 }, { "epoch": 1.8137062937062938, "grad_norm": 2.376053067249365, "learning_rate": 1.7173324983492912e-06, "loss": 0.7071, "num_input_tokens_seen": 1699741696, "step": 1621 }, { "epoch": 1.8148251748251747, "grad_norm": 0.36566708226842703, "learning_rate": 1.7145225084680939e-06, "loss": 0.7692, "num_input_tokens_seen": 1700790272, "step": 1622 }, { "epoch": 1.815944055944056, "grad_norm": 0.38193823683239403, "learning_rate": 1.7117136191922013e-06, "loss": 0.7484, "num_input_tokens_seen": 1701838848, "step": 1623 }, { "epoch": 1.817062937062937, "grad_norm": 0.37484911249425135, "learning_rate": 1.70890583445741e-06, "loss": 0.7389, "num_input_tokens_seen": 1702887424, "step": 1624 }, { "epoch": 1.8181818181818183, "grad_norm": 0.3863134571865988, "learning_rate": 1.7060991581979668e-06, "loss": 0.7051, "num_input_tokens_seen": 1703936000, "step": 1625 }, { "epoch": 1.8193006993006993, "grad_norm": 0.34400166073506927, "learning_rate": 1.7032935943465664e-06, "loss": 0.6826, "num_input_tokens_seen": 1704984576, "step": 1626 }, { "epoch": 1.8204195804195804, "grad_norm": 0.6284358167863014, "learning_rate": 1.7004891468343445e-06, "loss": 0.6736, "num_input_tokens_seen": 1706033152, "step": 1627 }, { "epoch": 1.8215384615384616, "grad_norm": 0.36318906695703673, "learning_rate": 1.6976858195908707e-06, "loss": 0.6188, "num_input_tokens_seen": 1707081728, "step": 1628 }, { "epoch": 1.8226573426573427, "grad_norm": 0.36331277862050504, "learning_rate": 1.6948836165441487e-06, "loss": 0.7019, "num_input_tokens_seen": 1708130304, "step": 1629 }, { "epoch": 1.8237762237762238, "grad_norm": 0.3627502052312083, "learning_rate": 1.6920825416206032e-06, "loss": 0.8377, "num_input_tokens_seen": 1709178880, "step": 1630 }, { "epoch": 1.8248951048951048, "grad_norm": 0.3881923464324314, "learning_rate": 1.6892825987450811e-06, "loss": 0.6087, "num_input_tokens_seen": 1710227456, "step": 1631 }, { "epoch": 1.8260139860139861, "grad_norm": 0.3540184068144853, "learning_rate": 1.6864837918408422e-06, "loss": 0.6364, "num_input_tokens_seen": 1711276032, "step": 1632 }, { "epoch": 1.827132867132867, "grad_norm": 0.3640081291814246, "learning_rate": 1.6836861248295522e-06, "loss": 0.6452, "num_input_tokens_seen": 1712324608, "step": 1633 }, { "epoch": 1.8282517482517484, "grad_norm": 0.39470347354128793, "learning_rate": 1.6808896016312832e-06, "loss": 0.7781, "num_input_tokens_seen": 1713373184, "step": 1634 }, { "epoch": 1.8293706293706293, "grad_norm": 0.3664370559323104, "learning_rate": 1.6780942261645022e-06, "loss": 0.7269, "num_input_tokens_seen": 1714421760, "step": 1635 }, { "epoch": 1.8304895104895105, "grad_norm": 0.35889255346765414, "learning_rate": 1.6753000023460698e-06, "loss": 0.7237, "num_input_tokens_seen": 1715470336, "step": 1636 }, { "epoch": 1.8316083916083916, "grad_norm": 0.36297023344415935, "learning_rate": 1.6725069340912306e-06, "loss": 0.7058, "num_input_tokens_seen": 1716518912, "step": 1637 }, { "epoch": 1.8327272727272728, "grad_norm": 0.4601241027286176, "learning_rate": 1.6697150253136136e-06, "loss": 0.7408, "num_input_tokens_seen": 1717567488, "step": 1638 }, { "epoch": 1.833846153846154, "grad_norm": 0.3702607993145882, "learning_rate": 1.666924279925219e-06, "loss": 0.7289, "num_input_tokens_seen": 1718616064, "step": 1639 }, { "epoch": 1.8349650349650348, "grad_norm": 0.36536248486884443, "learning_rate": 1.6641347018364182e-06, "loss": 0.6819, "num_input_tokens_seen": 1719664640, "step": 1640 }, { "epoch": 1.8360839160839162, "grad_norm": 0.3657504485846478, "learning_rate": 1.6613462949559494e-06, "loss": 0.5842, "num_input_tokens_seen": 1720713216, "step": 1641 }, { "epoch": 1.837202797202797, "grad_norm": 0.3611423333483979, "learning_rate": 1.6585590631909072e-06, "loss": 0.5558, "num_input_tokens_seen": 1721761792, "step": 1642 }, { "epoch": 1.8383216783216785, "grad_norm": 0.3582910584353947, "learning_rate": 1.6557730104467407e-06, "loss": 0.6536, "num_input_tokens_seen": 1722810368, "step": 1643 }, { "epoch": 1.8394405594405594, "grad_norm": 0.38887766622128245, "learning_rate": 1.6529881406272457e-06, "loss": 0.7932, "num_input_tokens_seen": 1723858944, "step": 1644 }, { "epoch": 1.8405594405594405, "grad_norm": 0.3646703595273719, "learning_rate": 1.6502044576345614e-06, "loss": 0.8077, "num_input_tokens_seen": 1724907520, "step": 1645 }, { "epoch": 1.8416783216783217, "grad_norm": 0.41477365194507376, "learning_rate": 1.647421965369165e-06, "loss": 0.754, "num_input_tokens_seen": 1725956096, "step": 1646 }, { "epoch": 1.8427972027972028, "grad_norm": 0.3613703971123837, "learning_rate": 1.6446406677298632e-06, "loss": 0.7005, "num_input_tokens_seen": 1727004672, "step": 1647 }, { "epoch": 1.843916083916084, "grad_norm": 0.36120769722366425, "learning_rate": 1.6418605686137914e-06, "loss": 0.6724, "num_input_tokens_seen": 1728053248, "step": 1648 }, { "epoch": 1.845034965034965, "grad_norm": 0.4017111737193555, "learning_rate": 1.6390816719164022e-06, "loss": 0.678, "num_input_tokens_seen": 1729101824, "step": 1649 }, { "epoch": 1.8461538461538463, "grad_norm": 0.41298002505622305, "learning_rate": 1.6363039815314668e-06, "loss": 0.8139, "num_input_tokens_seen": 1730150400, "step": 1650 }, { "epoch": 1.8472727272727272, "grad_norm": 0.3519722486485584, "learning_rate": 1.6335275013510638e-06, "loss": 0.6761, "num_input_tokens_seen": 1731198976, "step": 1651 }, { "epoch": 1.8483916083916085, "grad_norm": 0.34751711390967327, "learning_rate": 1.630752235265577e-06, "loss": 0.719, "num_input_tokens_seen": 1732247552, "step": 1652 }, { "epoch": 1.8495104895104895, "grad_norm": 0.37762960794842365, "learning_rate": 1.6279781871636896e-06, "loss": 0.7217, "num_input_tokens_seen": 1733296128, "step": 1653 }, { "epoch": 1.8506293706293706, "grad_norm": 0.3435618597816076, "learning_rate": 1.6252053609323758e-06, "loss": 0.6829, "num_input_tokens_seen": 1734344704, "step": 1654 }, { "epoch": 1.8517482517482518, "grad_norm": 0.3532548331565441, "learning_rate": 1.6224337604569012e-06, "loss": 0.7143, "num_input_tokens_seen": 1735393280, "step": 1655 }, { "epoch": 1.852867132867133, "grad_norm": 0.3701932804814071, "learning_rate": 1.6196633896208118e-06, "loss": 0.6368, "num_input_tokens_seen": 1736441856, "step": 1656 }, { "epoch": 1.853986013986014, "grad_norm": 0.3671973113017598, "learning_rate": 1.616894252305929e-06, "loss": 0.6704, "num_input_tokens_seen": 1737490432, "step": 1657 }, { "epoch": 1.855104895104895, "grad_norm": 0.9123789284525732, "learning_rate": 1.6141263523923512e-06, "loss": 0.6929, "num_input_tokens_seen": 1738539008, "step": 1658 }, { "epoch": 1.8562237762237763, "grad_norm": 0.38084291777468343, "learning_rate": 1.6113596937584358e-06, "loss": 0.7426, "num_input_tokens_seen": 1739587584, "step": 1659 }, { "epoch": 1.8573426573426572, "grad_norm": 0.3925858032012816, "learning_rate": 1.6085942802808068e-06, "loss": 0.6341, "num_input_tokens_seen": 1740636160, "step": 1660 }, { "epoch": 1.8584615384615386, "grad_norm": 0.3475119490762125, "learning_rate": 1.6058301158343408e-06, "loss": 0.7194, "num_input_tokens_seen": 1741684736, "step": 1661 }, { "epoch": 1.8595804195804195, "grad_norm": 0.3452122055527786, "learning_rate": 1.6030672042921647e-06, "loss": 0.6637, "num_input_tokens_seen": 1742733312, "step": 1662 }, { "epoch": 1.8606993006993007, "grad_norm": 0.36489553801740765, "learning_rate": 1.600305549525651e-06, "loss": 0.7159, "num_input_tokens_seen": 1743781888, "step": 1663 }, { "epoch": 1.8618181818181818, "grad_norm": 0.35961894418931073, "learning_rate": 1.5975451554044074e-06, "loss": 0.6029, "num_input_tokens_seen": 1744830464, "step": 1664 }, { "epoch": 1.862937062937063, "grad_norm": 0.36151976828395266, "learning_rate": 1.5947860257962808e-06, "loss": 0.7696, "num_input_tokens_seen": 1745879040, "step": 1665 }, { "epoch": 1.864055944055944, "grad_norm": 0.38023862108465944, "learning_rate": 1.5920281645673412e-06, "loss": 0.7132, "num_input_tokens_seen": 1746927616, "step": 1666 }, { "epoch": 1.865174825174825, "grad_norm": 0.3385816526373643, "learning_rate": 1.5892715755818855e-06, "loss": 0.6487, "num_input_tokens_seen": 1747976192, "step": 1667 }, { "epoch": 1.8662937062937064, "grad_norm": 0.35516870769886993, "learning_rate": 1.586516262702425e-06, "loss": 0.6523, "num_input_tokens_seen": 1749024768, "step": 1668 }, { "epoch": 1.8674125874125873, "grad_norm": 0.3689841400817224, "learning_rate": 1.5837622297896832e-06, "loss": 0.6681, "num_input_tokens_seen": 1750073344, "step": 1669 }, { "epoch": 1.8685314685314687, "grad_norm": 0.361278852114849, "learning_rate": 1.5810094807025923e-06, "loss": 0.9453, "num_input_tokens_seen": 1751121920, "step": 1670 }, { "epoch": 1.8696503496503496, "grad_norm": 0.35321212129287455, "learning_rate": 1.5782580192982827e-06, "loss": 0.7012, "num_input_tokens_seen": 1752170496, "step": 1671 }, { "epoch": 1.8707692307692307, "grad_norm": 0.35821832837202583, "learning_rate": 1.575507849432083e-06, "loss": 0.7607, "num_input_tokens_seen": 1753219072, "step": 1672 }, { "epoch": 1.8718881118881119, "grad_norm": 0.3547088322826387, "learning_rate": 1.5727589749575107e-06, "loss": 0.6264, "num_input_tokens_seen": 1754267648, "step": 1673 }, { "epoch": 1.873006993006993, "grad_norm": 0.39471364536972475, "learning_rate": 1.5700113997262695e-06, "loss": 0.6274, "num_input_tokens_seen": 1755316224, "step": 1674 }, { "epoch": 1.8741258741258742, "grad_norm": 0.33309242271362466, "learning_rate": 1.56726512758824e-06, "loss": 0.6287, "num_input_tokens_seen": 1756364800, "step": 1675 }, { "epoch": 1.875244755244755, "grad_norm": 0.390543656089801, "learning_rate": 1.564520162391479e-06, "loss": 0.6618, "num_input_tokens_seen": 1757413376, "step": 1676 }, { "epoch": 1.8763636363636365, "grad_norm": 0.38215754937297597, "learning_rate": 1.5617765079822133e-06, "loss": 0.8474, "num_input_tokens_seen": 1758461952, "step": 1677 }, { "epoch": 1.8774825174825174, "grad_norm": 0.3645598915124146, "learning_rate": 1.5590341682048285e-06, "loss": 0.7726, "num_input_tokens_seen": 1759510528, "step": 1678 }, { "epoch": 1.8786013986013987, "grad_norm": 0.35078449510022947, "learning_rate": 1.5562931469018738e-06, "loss": 0.6259, "num_input_tokens_seen": 1760559104, "step": 1679 }, { "epoch": 1.8797202797202797, "grad_norm": 0.42486462512158424, "learning_rate": 1.5535534479140469e-06, "loss": 0.663, "num_input_tokens_seen": 1761607680, "step": 1680 }, { "epoch": 1.8808391608391608, "grad_norm": 0.36057889052796266, "learning_rate": 1.550815075080193e-06, "loss": 0.7949, "num_input_tokens_seen": 1762656256, "step": 1681 }, { "epoch": 1.881958041958042, "grad_norm": 0.38747464774170304, "learning_rate": 1.5480780322373026e-06, "loss": 0.5959, "num_input_tokens_seen": 1763704832, "step": 1682 }, { "epoch": 1.883076923076923, "grad_norm": 0.35880851305609385, "learning_rate": 1.5453423232204968e-06, "loss": 0.6469, "num_input_tokens_seen": 1764753408, "step": 1683 }, { "epoch": 1.8841958041958042, "grad_norm": 0.36235540893989887, "learning_rate": 1.5426079518630345e-06, "loss": 0.5968, "num_input_tokens_seen": 1765801984, "step": 1684 }, { "epoch": 1.8853146853146852, "grad_norm": 0.35207166145366375, "learning_rate": 1.5398749219962935e-06, "loss": 0.6284, "num_input_tokens_seen": 1766850560, "step": 1685 }, { "epoch": 1.8864335664335665, "grad_norm": 0.34782010538999897, "learning_rate": 1.5371432374497778e-06, "loss": 0.758, "num_input_tokens_seen": 1767899136, "step": 1686 }, { "epoch": 1.8875524475524474, "grad_norm": 0.3389696248103579, "learning_rate": 1.5344129020511029e-06, "loss": 0.7631, "num_input_tokens_seen": 1768947712, "step": 1687 }, { "epoch": 1.8886713286713288, "grad_norm": 0.3836023739244322, "learning_rate": 1.5316839196259932e-06, "loss": 0.832, "num_input_tokens_seen": 1769996288, "step": 1688 }, { "epoch": 1.8897902097902097, "grad_norm": 0.34876477691115065, "learning_rate": 1.5289562939982822e-06, "loss": 0.5984, "num_input_tokens_seen": 1771044864, "step": 1689 }, { "epoch": 1.8909090909090909, "grad_norm": 0.3449825007300553, "learning_rate": 1.5262300289898956e-06, "loss": 0.7892, "num_input_tokens_seen": 1772093440, "step": 1690 }, { "epoch": 1.892027972027972, "grad_norm": 0.34438930793259065, "learning_rate": 1.523505128420858e-06, "loss": 0.7893, "num_input_tokens_seen": 1773142016, "step": 1691 }, { "epoch": 1.8931468531468532, "grad_norm": 0.38742866901221534, "learning_rate": 1.5207815961092798e-06, "loss": 0.8584, "num_input_tokens_seen": 1774190592, "step": 1692 }, { "epoch": 1.8942657342657343, "grad_norm": 0.3569879079860235, "learning_rate": 1.518059435871353e-06, "loss": 0.6513, "num_input_tokens_seen": 1775239168, "step": 1693 }, { "epoch": 1.8953846153846152, "grad_norm": 0.3635757239162608, "learning_rate": 1.5153386515213514e-06, "loss": 0.6159, "num_input_tokens_seen": 1776287744, "step": 1694 }, { "epoch": 1.8965034965034966, "grad_norm": 0.37652373746282825, "learning_rate": 1.5126192468716152e-06, "loss": 0.6859, "num_input_tokens_seen": 1777336320, "step": 1695 }, { "epoch": 1.8976223776223775, "grad_norm": 0.3401166541089911, "learning_rate": 1.5099012257325563e-06, "loss": 0.7051, "num_input_tokens_seen": 1778384896, "step": 1696 }, { "epoch": 1.8987412587412589, "grad_norm": 0.3262783893627933, "learning_rate": 1.5071845919126448e-06, "loss": 0.6118, "num_input_tokens_seen": 1779433472, "step": 1697 }, { "epoch": 1.8998601398601398, "grad_norm": 0.3770633082388179, "learning_rate": 1.5044693492184092e-06, "loss": 0.625, "num_input_tokens_seen": 1780482048, "step": 1698 }, { "epoch": 1.900979020979021, "grad_norm": 0.35787087137966467, "learning_rate": 1.5017555014544273e-06, "loss": 0.6267, "num_input_tokens_seen": 1781530624, "step": 1699 }, { "epoch": 1.902097902097902, "grad_norm": 0.3618009707253574, "learning_rate": 1.4990430524233213e-06, "loss": 0.9697, "num_input_tokens_seen": 1782579200, "step": 1700 }, { "epoch": 1.9032167832167832, "grad_norm": 0.35730609834981547, "learning_rate": 1.4963320059257565e-06, "loss": 0.7214, "num_input_tokens_seen": 1783627776, "step": 1701 }, { "epoch": 1.9043356643356644, "grad_norm": 0.34190996971456444, "learning_rate": 1.493622365760431e-06, "loss": 0.6651, "num_input_tokens_seen": 1784676352, "step": 1702 }, { "epoch": 1.9054545454545453, "grad_norm": 0.34730791154163415, "learning_rate": 1.4909141357240731e-06, "loss": 0.638, "num_input_tokens_seen": 1785724928, "step": 1703 }, { "epoch": 1.9065734265734267, "grad_norm": 0.3427970101799572, "learning_rate": 1.4882073196114343e-06, "loss": 0.6439, "num_input_tokens_seen": 1786773504, "step": 1704 }, { "epoch": 1.9076923076923076, "grad_norm": 0.6145833581142299, "learning_rate": 1.4855019212152852e-06, "loss": 0.7888, "num_input_tokens_seen": 1787822080, "step": 1705 }, { "epoch": 1.908811188811189, "grad_norm": 0.3458125785796812, "learning_rate": 1.4827979443264113e-06, "loss": 0.6843, "num_input_tokens_seen": 1788870656, "step": 1706 }, { "epoch": 1.9099300699300699, "grad_norm": 0.3405342864811183, "learning_rate": 1.4800953927336036e-06, "loss": 0.7045, "num_input_tokens_seen": 1789919232, "step": 1707 }, { "epoch": 1.911048951048951, "grad_norm": 0.39385632719325897, "learning_rate": 1.47739427022366e-06, "loss": 0.6767, "num_input_tokens_seen": 1790967808, "step": 1708 }, { "epoch": 1.9121678321678321, "grad_norm": 0.3755803964755116, "learning_rate": 1.4746945805813707e-06, "loss": 0.7113, "num_input_tokens_seen": 1792016384, "step": 1709 }, { "epoch": 1.9132867132867133, "grad_norm": 0.3697460605064327, "learning_rate": 1.4719963275895239e-06, "loss": 0.6605, "num_input_tokens_seen": 1793064960, "step": 1710 }, { "epoch": 1.9144055944055944, "grad_norm": 0.3505133455036502, "learning_rate": 1.4692995150288896e-06, "loss": 0.7026, "num_input_tokens_seen": 1794113536, "step": 1711 }, { "epoch": 1.9155244755244756, "grad_norm": 0.3433554480343946, "learning_rate": 1.4666041466782227e-06, "loss": 0.8619, "num_input_tokens_seen": 1795162112, "step": 1712 }, { "epoch": 1.9166433566433567, "grad_norm": 0.6211245292434256, "learning_rate": 1.4639102263142546e-06, "loss": 0.7531, "num_input_tokens_seen": 1796210688, "step": 1713 }, { "epoch": 1.9177622377622376, "grad_norm": 0.37537774055201434, "learning_rate": 1.4612177577116843e-06, "loss": 0.6497, "num_input_tokens_seen": 1797259264, "step": 1714 }, { "epoch": 1.918881118881119, "grad_norm": 0.3482379212627553, "learning_rate": 1.4585267446431817e-06, "loss": 0.6201, "num_input_tokens_seen": 1798307840, "step": 1715 }, { "epoch": 1.92, "grad_norm": 0.37481504046643926, "learning_rate": 1.4558371908793734e-06, "loss": 0.7372, "num_input_tokens_seen": 1799356416, "step": 1716 }, { "epoch": 1.921118881118881, "grad_norm": 1.5636069081103956, "learning_rate": 1.4531491001888421e-06, "loss": 0.779, "num_input_tokens_seen": 1800404992, "step": 1717 }, { "epoch": 1.9222377622377622, "grad_norm": 0.40130530527215935, "learning_rate": 1.4504624763381207e-06, "loss": 0.6333, "num_input_tokens_seen": 1801453568, "step": 1718 }, { "epoch": 1.9233566433566434, "grad_norm": 0.35124193995400904, "learning_rate": 1.4477773230916872e-06, "loss": 0.7656, "num_input_tokens_seen": 1802502144, "step": 1719 }, { "epoch": 1.9244755244755245, "grad_norm": 0.33577480672622634, "learning_rate": 1.44509364421196e-06, "loss": 0.6512, "num_input_tokens_seen": 1803550720, "step": 1720 }, { "epoch": 1.9255944055944056, "grad_norm": 0.34403351997853115, "learning_rate": 1.442411443459289e-06, "loss": 0.5951, "num_input_tokens_seen": 1804599296, "step": 1721 }, { "epoch": 1.9267132867132868, "grad_norm": 0.34729369202269605, "learning_rate": 1.4397307245919534e-06, "loss": 0.7829, "num_input_tokens_seen": 1805647872, "step": 1722 }, { "epoch": 1.9278321678321677, "grad_norm": 0.40791777991105704, "learning_rate": 1.4370514913661576e-06, "loss": 0.745, "num_input_tokens_seen": 1806696448, "step": 1723 }, { "epoch": 1.928951048951049, "grad_norm": 0.366162033353925, "learning_rate": 1.4343737475360236e-06, "loss": 0.5664, "num_input_tokens_seen": 1807745024, "step": 1724 }, { "epoch": 1.93006993006993, "grad_norm": 0.3443770707096183, "learning_rate": 1.4316974968535873e-06, "loss": 0.653, "num_input_tokens_seen": 1808793600, "step": 1725 }, { "epoch": 1.9311888111888111, "grad_norm": 0.36049896641731133, "learning_rate": 1.4290227430687903e-06, "loss": 0.7096, "num_input_tokens_seen": 1809842176, "step": 1726 }, { "epoch": 1.9323076923076923, "grad_norm": 0.3513293259005713, "learning_rate": 1.4263494899294794e-06, "loss": 0.6817, "num_input_tokens_seen": 1810890752, "step": 1727 }, { "epoch": 1.9334265734265734, "grad_norm": 0.4011781654499557, "learning_rate": 1.4236777411813951e-06, "loss": 0.7689, "num_input_tokens_seen": 1811939328, "step": 1728 }, { "epoch": 1.9345454545454546, "grad_norm": 0.36197873123459984, "learning_rate": 1.4210075005681737e-06, "loss": 0.5943, "num_input_tokens_seen": 1812987904, "step": 1729 }, { "epoch": 1.9356643356643357, "grad_norm": 0.3529815813465272, "learning_rate": 1.4183387718313374e-06, "loss": 0.6229, "num_input_tokens_seen": 1814036480, "step": 1730 }, { "epoch": 1.9367832167832169, "grad_norm": 0.3467515306414731, "learning_rate": 1.4156715587102875e-06, "loss": 0.7617, "num_input_tokens_seen": 1815085056, "step": 1731 }, { "epoch": 1.9379020979020978, "grad_norm": 0.4956459208267009, "learning_rate": 1.4130058649423057e-06, "loss": 0.6782, "num_input_tokens_seen": 1816133632, "step": 1732 }, { "epoch": 1.9390209790209791, "grad_norm": 0.3652323404860807, "learning_rate": 1.4103416942625397e-06, "loss": 0.8375, "num_input_tokens_seen": 1817182208, "step": 1733 }, { "epoch": 1.94013986013986, "grad_norm": 0.4751265442896118, "learning_rate": 1.4076790504040084e-06, "loss": 0.7826, "num_input_tokens_seen": 1818230784, "step": 1734 }, { "epoch": 1.9412587412587412, "grad_norm": 0.3469181492670609, "learning_rate": 1.4050179370975886e-06, "loss": 0.6335, "num_input_tokens_seen": 1819279360, "step": 1735 }, { "epoch": 1.9423776223776223, "grad_norm": 0.35041110964576433, "learning_rate": 1.4023583580720112e-06, "loss": 0.7191, "num_input_tokens_seen": 1820327936, "step": 1736 }, { "epoch": 1.9434965034965035, "grad_norm": 0.3511336608766253, "learning_rate": 1.3997003170538608e-06, "loss": 0.7197, "num_input_tokens_seen": 1821376512, "step": 1737 }, { "epoch": 1.9446153846153846, "grad_norm": 0.3528348259975287, "learning_rate": 1.3970438177675632e-06, "loss": 0.7631, "num_input_tokens_seen": 1822425088, "step": 1738 }, { "epoch": 1.9457342657342658, "grad_norm": 0.3577264775779954, "learning_rate": 1.3943888639353866e-06, "loss": 0.6191, "num_input_tokens_seen": 1823473664, "step": 1739 }, { "epoch": 1.946853146853147, "grad_norm": 0.35696593503747065, "learning_rate": 1.391735459277434e-06, "loss": 0.6907, "num_input_tokens_seen": 1824522240, "step": 1740 }, { "epoch": 1.9479720279720278, "grad_norm": 0.3445151997342317, "learning_rate": 1.3890836075116343e-06, "loss": 0.7179, "num_input_tokens_seen": 1825570816, "step": 1741 }, { "epoch": 1.9490909090909092, "grad_norm": 0.36967836568299617, "learning_rate": 1.3864333123537446e-06, "loss": 0.7048, "num_input_tokens_seen": 1826619392, "step": 1742 }, { "epoch": 1.9502097902097901, "grad_norm": 0.3348213658689775, "learning_rate": 1.3837845775173375e-06, "loss": 0.6714, "num_input_tokens_seen": 1827667968, "step": 1743 }, { "epoch": 1.9513286713286715, "grad_norm": 0.3562026385169248, "learning_rate": 1.3811374067138016e-06, "loss": 0.6153, "num_input_tokens_seen": 1828716544, "step": 1744 }, { "epoch": 1.9524475524475524, "grad_norm": 0.4751241097544352, "learning_rate": 1.3784918036523346e-06, "loss": 0.6374, "num_input_tokens_seen": 1829765120, "step": 1745 }, { "epoch": 1.9535664335664336, "grad_norm": 0.3803963982494086, "learning_rate": 1.3758477720399339e-06, "loss": 0.6838, "num_input_tokens_seen": 1830813696, "step": 1746 }, { "epoch": 1.9546853146853147, "grad_norm": 0.3887307963219591, "learning_rate": 1.3732053155813987e-06, "loss": 0.6767, "num_input_tokens_seen": 1831862272, "step": 1747 }, { "epoch": 1.9558041958041958, "grad_norm": 0.42538977646623255, "learning_rate": 1.3705644379793182e-06, "loss": 0.6895, "num_input_tokens_seen": 1832910848, "step": 1748 }, { "epoch": 1.956923076923077, "grad_norm": 0.35438256041767197, "learning_rate": 1.3679251429340717e-06, "loss": 0.659, "num_input_tokens_seen": 1833959424, "step": 1749 }, { "epoch": 1.958041958041958, "grad_norm": 0.3888735972890291, "learning_rate": 1.3652874341438203e-06, "loss": 0.6489, "num_input_tokens_seen": 1835008000, "step": 1750 }, { "epoch": 1.958041958041958, "eval_loss": 0.7199289202690125, "eval_runtime": 246.7815, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 1835008000, "step": 1750 }, { "epoch": 1.9591608391608393, "grad_norm": 0.3784987374072853, "learning_rate": 1.3626513153045024e-06, "loss": 0.7072, "num_input_tokens_seen": 1836056576, "step": 1751 }, { "epoch": 1.9602797202797202, "grad_norm": 0.36968809658905133, "learning_rate": 1.3600167901098282e-06, "loss": 0.7325, "num_input_tokens_seen": 1837105152, "step": 1752 }, { "epoch": 1.9613986013986016, "grad_norm": 0.3810681136577437, "learning_rate": 1.3573838622512743e-06, "loss": 0.6664, "num_input_tokens_seen": 1838153728, "step": 1753 }, { "epoch": 1.9625174825174825, "grad_norm": 0.3460804362029606, "learning_rate": 1.3547525354180796e-06, "loss": 0.6499, "num_input_tokens_seen": 1839202304, "step": 1754 }, { "epoch": 1.9636363636363636, "grad_norm": 0.3677592870410058, "learning_rate": 1.3521228132972414e-06, "loss": 0.7961, "num_input_tokens_seen": 1840250880, "step": 1755 }, { "epoch": 1.9647552447552448, "grad_norm": 0.3809786756313395, "learning_rate": 1.3494946995735075e-06, "loss": 0.717, "num_input_tokens_seen": 1841299456, "step": 1756 }, { "epoch": 1.965874125874126, "grad_norm": 0.34990293442382814, "learning_rate": 1.3468681979293702e-06, "loss": 0.6987, "num_input_tokens_seen": 1842348032, "step": 1757 }, { "epoch": 1.966993006993007, "grad_norm": 0.41638078924838484, "learning_rate": 1.3442433120450642e-06, "loss": 0.81, "num_input_tokens_seen": 1843396608, "step": 1758 }, { "epoch": 1.968111888111888, "grad_norm": 0.36112974825560545, "learning_rate": 1.3416200455985607e-06, "loss": 0.6939, "num_input_tokens_seen": 1844445184, "step": 1759 }, { "epoch": 1.9692307692307693, "grad_norm": 0.35300244870739933, "learning_rate": 1.3389984022655617e-06, "loss": 0.7493, "num_input_tokens_seen": 1845493760, "step": 1760 }, { "epoch": 1.9703496503496503, "grad_norm": 0.3487546263228181, "learning_rate": 1.3363783857194957e-06, "loss": 0.6629, "num_input_tokens_seen": 1846542336, "step": 1761 }, { "epoch": 1.9714685314685316, "grad_norm": 0.35686672733850644, "learning_rate": 1.3337599996315087e-06, "loss": 0.6622, "num_input_tokens_seen": 1847590912, "step": 1762 }, { "epoch": 1.9725874125874125, "grad_norm": 0.395656874848888, "learning_rate": 1.3311432476704655e-06, "loss": 0.6485, "num_input_tokens_seen": 1848639488, "step": 1763 }, { "epoch": 1.9737062937062937, "grad_norm": 0.356224619979862, "learning_rate": 1.3285281335029387e-06, "loss": 0.7794, "num_input_tokens_seen": 1849688064, "step": 1764 }, { "epoch": 1.9748251748251748, "grad_norm": 0.3814787326118372, "learning_rate": 1.325914660793207e-06, "loss": 0.812, "num_input_tokens_seen": 1850736640, "step": 1765 }, { "epoch": 1.975944055944056, "grad_norm": 0.40812199283388617, "learning_rate": 1.3233028332032516e-06, "loss": 0.6568, "num_input_tokens_seen": 1851785216, "step": 1766 }, { "epoch": 1.9770629370629371, "grad_norm": 0.35460856691676496, "learning_rate": 1.3206926543927435e-06, "loss": 0.6246, "num_input_tokens_seen": 1852833792, "step": 1767 }, { "epoch": 1.978181818181818, "grad_norm": 0.3703981936132845, "learning_rate": 1.3180841280190476e-06, "loss": 0.6539, "num_input_tokens_seen": 1853882368, "step": 1768 }, { "epoch": 1.9793006993006994, "grad_norm": 0.36775732777060144, "learning_rate": 1.3154772577372104e-06, "loss": 0.6982, "num_input_tokens_seen": 1854930944, "step": 1769 }, { "epoch": 1.9804195804195803, "grad_norm": 0.3403020134353418, "learning_rate": 1.3128720471999606e-06, "loss": 0.6398, "num_input_tokens_seen": 1855979520, "step": 1770 }, { "epoch": 1.9815384615384617, "grad_norm": 0.3681130210433176, "learning_rate": 1.310268500057701e-06, "loss": 0.6829, "num_input_tokens_seen": 1857028096, "step": 1771 }, { "epoch": 1.9826573426573426, "grad_norm": 0.3478481365724911, "learning_rate": 1.307666619958501e-06, "loss": 0.6814, "num_input_tokens_seen": 1858076672, "step": 1772 }, { "epoch": 1.9837762237762238, "grad_norm": 0.47697851157419574, "learning_rate": 1.305066410548097e-06, "loss": 0.5909, "num_input_tokens_seen": 1859125248, "step": 1773 }, { "epoch": 1.984895104895105, "grad_norm": 0.3811447304052207, "learning_rate": 1.3024678754698827e-06, "loss": 0.6712, "num_input_tokens_seen": 1860173824, "step": 1774 }, { "epoch": 1.986013986013986, "grad_norm": 0.35886958267621044, "learning_rate": 1.2998710183649066e-06, "loss": 0.5639, "num_input_tokens_seen": 1861222400, "step": 1775 }, { "epoch": 1.9871328671328672, "grad_norm": 0.35396553237565365, "learning_rate": 1.2972758428718668e-06, "loss": 0.6183, "num_input_tokens_seen": 1862270976, "step": 1776 }, { "epoch": 1.988251748251748, "grad_norm": 0.36319700050581316, "learning_rate": 1.2946823526271023e-06, "loss": 0.7198, "num_input_tokens_seen": 1863319552, "step": 1777 }, { "epoch": 1.9893706293706295, "grad_norm": 0.3704737373708342, "learning_rate": 1.292090551264595e-06, "loss": 0.6938, "num_input_tokens_seen": 1864368128, "step": 1778 }, { "epoch": 1.9904895104895104, "grad_norm": 0.3312145305991146, "learning_rate": 1.2895004424159557e-06, "loss": 0.7327, "num_input_tokens_seen": 1865416704, "step": 1779 }, { "epoch": 1.9916083916083918, "grad_norm": 0.3410780657546888, "learning_rate": 1.286912029710427e-06, "loss": 0.5803, "num_input_tokens_seen": 1866465280, "step": 1780 }, { "epoch": 1.9927272727272727, "grad_norm": 0.33713145262128996, "learning_rate": 1.2843253167748745e-06, "loss": 0.7363, "num_input_tokens_seen": 1867513856, "step": 1781 }, { "epoch": 1.9938461538461538, "grad_norm": 0.3734268092154474, "learning_rate": 1.2817403072337798e-06, "loss": 0.7181, "num_input_tokens_seen": 1868562432, "step": 1782 }, { "epoch": 1.994965034965035, "grad_norm": 0.3324391692318491, "learning_rate": 1.2791570047092413e-06, "loss": 0.6783, "num_input_tokens_seen": 1869611008, "step": 1783 }, { "epoch": 1.996083916083916, "grad_norm": 0.34303558537795564, "learning_rate": 1.2765754128209614e-06, "loss": 0.6798, "num_input_tokens_seen": 1870659584, "step": 1784 }, { "epoch": 1.9972027972027973, "grad_norm": 0.35688659236905496, "learning_rate": 1.2739955351862488e-06, "loss": 0.7397, "num_input_tokens_seen": 1871708160, "step": 1785 }, { "epoch": 1.9983216783216782, "grad_norm": 0.3769854870961845, "learning_rate": 1.2714173754200094e-06, "loss": 0.5976, "num_input_tokens_seen": 1872756736, "step": 1786 }, { "epoch": 1.9994405594405595, "grad_norm": 0.32403857951804826, "learning_rate": 1.2688409371347422e-06, "loss": 0.6467, "num_input_tokens_seen": 1873805312, "step": 1787 }, { "epoch": 2.0005594405594405, "grad_norm": 0.5110479276202166, "learning_rate": 1.266266223940533e-06, "loss": 0.5426, "num_input_tokens_seen": 1874853888, "step": 1788 }, { "epoch": 2.001678321678322, "grad_norm": 0.4985984719964178, "learning_rate": 1.2636932394450502e-06, "loss": 0.5883, "num_input_tokens_seen": 1875902464, "step": 1789 }, { "epoch": 2.0027972027972027, "grad_norm": 0.3675207595520399, "learning_rate": 1.2611219872535412e-06, "loss": 0.7158, "num_input_tokens_seen": 1876951040, "step": 1790 }, { "epoch": 2.003916083916084, "grad_norm": 0.3572758957506379, "learning_rate": 1.2585524709688268e-06, "loss": 0.6502, "num_input_tokens_seen": 1877999616, "step": 1791 }, { "epoch": 2.005034965034965, "grad_norm": 0.4370468062061656, "learning_rate": 1.2559846941912942e-06, "loss": 0.7371, "num_input_tokens_seen": 1879048192, "step": 1792 }, { "epoch": 2.006153846153846, "grad_norm": 0.39626582769093777, "learning_rate": 1.2534186605188933e-06, "loss": 0.6541, "num_input_tokens_seen": 1880096768, "step": 1793 }, { "epoch": 2.0072727272727273, "grad_norm": 0.37459871603443906, "learning_rate": 1.2508543735471305e-06, "loss": 0.7293, "num_input_tokens_seen": 1881145344, "step": 1794 }, { "epoch": 2.0083916083916082, "grad_norm": 0.3663012364855239, "learning_rate": 1.2482918368690666e-06, "loss": 0.7359, "num_input_tokens_seen": 1882193920, "step": 1795 }, { "epoch": 2.0095104895104896, "grad_norm": 0.3755743888327766, "learning_rate": 1.2457310540753093e-06, "loss": 0.5873, "num_input_tokens_seen": 1883242496, "step": 1796 }, { "epoch": 2.0106293706293705, "grad_norm": 0.44844269745742915, "learning_rate": 1.2431720287540097e-06, "loss": 0.6087, "num_input_tokens_seen": 1884291072, "step": 1797 }, { "epoch": 2.011748251748252, "grad_norm": 0.38503565477163404, "learning_rate": 1.2406147644908537e-06, "loss": 0.7105, "num_input_tokens_seen": 1885339648, "step": 1798 }, { "epoch": 2.012867132867133, "grad_norm": 0.3699856887550528, "learning_rate": 1.2380592648690629e-06, "loss": 0.611, "num_input_tokens_seen": 1886388224, "step": 1799 }, { "epoch": 2.013986013986014, "grad_norm": 0.4075647297328535, "learning_rate": 1.2355055334693828e-06, "loss": 0.8724, "num_input_tokens_seen": 1887436800, "step": 1800 }, { "epoch": 2.015104895104895, "grad_norm": 0.37035606462258597, "learning_rate": 1.2329535738700838e-06, "loss": 0.7183, "num_input_tokens_seen": 1888485376, "step": 1801 }, { "epoch": 2.016223776223776, "grad_norm": 0.39188702893303806, "learning_rate": 1.2304033896469543e-06, "loss": 0.7514, "num_input_tokens_seen": 1889533952, "step": 1802 }, { "epoch": 2.0173426573426574, "grad_norm": 0.3452943170332209, "learning_rate": 1.2278549843732915e-06, "loss": 0.5897, "num_input_tokens_seen": 1890582528, "step": 1803 }, { "epoch": 2.0184615384615383, "grad_norm": 0.3803454268780761, "learning_rate": 1.2253083616199045e-06, "loss": 0.6102, "num_input_tokens_seen": 1891631104, "step": 1804 }, { "epoch": 2.0195804195804197, "grad_norm": 0.37618797558218475, "learning_rate": 1.2227635249551014e-06, "loss": 0.6717, "num_input_tokens_seen": 1892679680, "step": 1805 }, { "epoch": 2.0206993006993006, "grad_norm": 0.40777778715511104, "learning_rate": 1.2202204779446866e-06, "loss": 0.6703, "num_input_tokens_seen": 1893728256, "step": 1806 }, { "epoch": 2.021818181818182, "grad_norm": 0.34915266776591336, "learning_rate": 1.2176792241519628e-06, "loss": 0.5805, "num_input_tokens_seen": 1894776832, "step": 1807 }, { "epoch": 2.022937062937063, "grad_norm": 0.38803062900842367, "learning_rate": 1.2151397671377135e-06, "loss": 0.7194, "num_input_tokens_seen": 1895825408, "step": 1808 }, { "epoch": 2.0240559440559442, "grad_norm": 0.3827582216492673, "learning_rate": 1.212602110460209e-06, "loss": 0.7313, "num_input_tokens_seen": 1896873984, "step": 1809 }, { "epoch": 2.025174825174825, "grad_norm": 0.3883338316683154, "learning_rate": 1.2100662576751932e-06, "loss": 0.6724, "num_input_tokens_seen": 1897922560, "step": 1810 }, { "epoch": 2.026293706293706, "grad_norm": 0.3265142481891951, "learning_rate": 1.2075322123358857e-06, "loss": 0.6227, "num_input_tokens_seen": 1898971136, "step": 1811 }, { "epoch": 2.0274125874125875, "grad_norm": 0.460662327231326, "learning_rate": 1.2049999779929733e-06, "loss": 0.6343, "num_input_tokens_seen": 1900019712, "step": 1812 }, { "epoch": 2.0285314685314684, "grad_norm": 0.3486517099975311, "learning_rate": 1.2024695581946016e-06, "loss": 0.6051, "num_input_tokens_seen": 1901068288, "step": 1813 }, { "epoch": 2.0296503496503497, "grad_norm": 0.3658912637952016, "learning_rate": 1.1999409564863793e-06, "loss": 0.6979, "num_input_tokens_seen": 1902116864, "step": 1814 }, { "epoch": 2.0307692307692307, "grad_norm": 0.34505728535563246, "learning_rate": 1.1974141764113617e-06, "loss": 0.6427, "num_input_tokens_seen": 1903165440, "step": 1815 }, { "epoch": 2.031888111888112, "grad_norm": 0.36075685238254174, "learning_rate": 1.1948892215100557e-06, "loss": 0.7831, "num_input_tokens_seen": 1904214016, "step": 1816 }, { "epoch": 2.033006993006993, "grad_norm": 0.40557300303172816, "learning_rate": 1.192366095320411e-06, "loss": 0.7386, "num_input_tokens_seen": 1905262592, "step": 1817 }, { "epoch": 2.0341258741258743, "grad_norm": 0.3870102951657016, "learning_rate": 1.189844801377811e-06, "loss": 0.8826, "num_input_tokens_seen": 1906311168, "step": 1818 }, { "epoch": 2.0352447552447552, "grad_norm": 0.3582691414429587, "learning_rate": 1.1873253432150769e-06, "loss": 0.7653, "num_input_tokens_seen": 1907359744, "step": 1819 }, { "epoch": 2.036363636363636, "grad_norm": 0.3417934951869752, "learning_rate": 1.1848077243624525e-06, "loss": 0.6843, "num_input_tokens_seen": 1908408320, "step": 1820 }, { "epoch": 2.0374825174825175, "grad_norm": 0.36042613178742483, "learning_rate": 1.1822919483476089e-06, "loss": 0.6073, "num_input_tokens_seen": 1909456896, "step": 1821 }, { "epoch": 2.0386013986013984, "grad_norm": 0.3370453821749797, "learning_rate": 1.1797780186956307e-06, "loss": 0.6871, "num_input_tokens_seen": 1910505472, "step": 1822 }, { "epoch": 2.03972027972028, "grad_norm": 0.47643053269588653, "learning_rate": 1.177265938929021e-06, "loss": 0.705, "num_input_tokens_seen": 1911554048, "step": 1823 }, { "epoch": 2.0408391608391607, "grad_norm": 0.38434293864385877, "learning_rate": 1.1747557125676853e-06, "loss": 0.7486, "num_input_tokens_seen": 1912602624, "step": 1824 }, { "epoch": 2.041958041958042, "grad_norm": 0.3456919726051499, "learning_rate": 1.1722473431289344e-06, "loss": 0.6472, "num_input_tokens_seen": 1913651200, "step": 1825 }, { "epoch": 2.043076923076923, "grad_norm": 0.40935464683123113, "learning_rate": 1.1697408341274781e-06, "loss": 0.7062, "num_input_tokens_seen": 1914699776, "step": 1826 }, { "epoch": 2.0441958041958044, "grad_norm": 0.3476776678799426, "learning_rate": 1.1672361890754165e-06, "loss": 0.6114, "num_input_tokens_seen": 1915748352, "step": 1827 }, { "epoch": 2.0453146853146853, "grad_norm": 0.36337106259406116, "learning_rate": 1.1647334114822434e-06, "loss": 0.8218, "num_input_tokens_seen": 1916796928, "step": 1828 }, { "epoch": 2.046433566433566, "grad_norm": 0.33990402749830984, "learning_rate": 1.1622325048548303e-06, "loss": 0.6464, "num_input_tokens_seen": 1917845504, "step": 1829 }, { "epoch": 2.0475524475524476, "grad_norm": 0.3515736622984899, "learning_rate": 1.159733472697428e-06, "loss": 0.6844, "num_input_tokens_seen": 1918894080, "step": 1830 }, { "epoch": 2.0486713286713285, "grad_norm": 0.39561667268893796, "learning_rate": 1.1572363185116648e-06, "loss": 0.7415, "num_input_tokens_seen": 1919942656, "step": 1831 }, { "epoch": 2.04979020979021, "grad_norm": 0.34007012117619784, "learning_rate": 1.1547410457965314e-06, "loss": 0.5814, "num_input_tokens_seen": 1920991232, "step": 1832 }, { "epoch": 2.050909090909091, "grad_norm": 0.367561219238635, "learning_rate": 1.1522476580483893e-06, "loss": 0.7473, "num_input_tokens_seen": 1922039808, "step": 1833 }, { "epoch": 2.052027972027972, "grad_norm": 0.35432877215810843, "learning_rate": 1.149756158760953e-06, "loss": 0.7224, "num_input_tokens_seen": 1923088384, "step": 1834 }, { "epoch": 2.053146853146853, "grad_norm": 0.35436799457777524, "learning_rate": 1.1472665514252943e-06, "loss": 0.5895, "num_input_tokens_seen": 1924136960, "step": 1835 }, { "epoch": 2.0542657342657344, "grad_norm": 0.3477005263450572, "learning_rate": 1.1447788395298318e-06, "loss": 0.6445, "num_input_tokens_seen": 1925185536, "step": 1836 }, { "epoch": 2.0553846153846154, "grad_norm": 0.3409961839329795, "learning_rate": 1.142293026560328e-06, "loss": 0.6205, "num_input_tokens_seen": 1926234112, "step": 1837 }, { "epoch": 2.0565034965034963, "grad_norm": 0.38802743337311885, "learning_rate": 1.1398091159998887e-06, "loss": 0.7758, "num_input_tokens_seen": 1927282688, "step": 1838 }, { "epoch": 2.0576223776223777, "grad_norm": 0.3307261473813479, "learning_rate": 1.137327111328949e-06, "loss": 0.5974, "num_input_tokens_seen": 1928331264, "step": 1839 }, { "epoch": 2.0587412587412586, "grad_norm": 0.3595123727474879, "learning_rate": 1.1348470160252772e-06, "loss": 0.6062, "num_input_tokens_seen": 1929379840, "step": 1840 }, { "epoch": 2.05986013986014, "grad_norm": 0.3405956478420985, "learning_rate": 1.1323688335639637e-06, "loss": 0.6752, "num_input_tokens_seen": 1930428416, "step": 1841 }, { "epoch": 2.060979020979021, "grad_norm": 0.38534742830857344, "learning_rate": 1.1298925674174191e-06, "loss": 0.6068, "num_input_tokens_seen": 1931476992, "step": 1842 }, { "epoch": 2.0620979020979022, "grad_norm": 0.4067374386035562, "learning_rate": 1.12741822105537e-06, "loss": 0.7561, "num_input_tokens_seen": 1932525568, "step": 1843 }, { "epoch": 2.063216783216783, "grad_norm": 0.35886374561974577, "learning_rate": 1.1249457979448521e-06, "loss": 0.6795, "num_input_tokens_seen": 1933574144, "step": 1844 }, { "epoch": 2.0643356643356645, "grad_norm": 0.3763427783519024, "learning_rate": 1.122475301550208e-06, "loss": 0.7066, "num_input_tokens_seen": 1934622720, "step": 1845 }, { "epoch": 2.0654545454545454, "grad_norm": 0.40498086459930516, "learning_rate": 1.120006735333078e-06, "loss": 0.5849, "num_input_tokens_seen": 1935671296, "step": 1846 }, { "epoch": 2.0665734265734264, "grad_norm": 0.4297150643586454, "learning_rate": 1.117540102752398e-06, "loss": 0.6686, "num_input_tokens_seen": 1936719872, "step": 1847 }, { "epoch": 2.0676923076923077, "grad_norm": 0.39736259332515267, "learning_rate": 1.1150754072643966e-06, "loss": 0.6698, "num_input_tokens_seen": 1937768448, "step": 1848 }, { "epoch": 2.0688111888111886, "grad_norm": 0.6674589599868125, "learning_rate": 1.1126126523225869e-06, "loss": 0.6424, "num_input_tokens_seen": 1938817024, "step": 1849 }, { "epoch": 2.06993006993007, "grad_norm": 0.38715083295523406, "learning_rate": 1.110151841377764e-06, "loss": 0.6023, "num_input_tokens_seen": 1939865600, "step": 1850 }, { "epoch": 2.071048951048951, "grad_norm": 0.3513536034727624, "learning_rate": 1.1076929778779965e-06, "loss": 0.6791, "num_input_tokens_seen": 1940914176, "step": 1851 }, { "epoch": 2.0721678321678323, "grad_norm": 0.3742373997149547, "learning_rate": 1.1052360652686275e-06, "loss": 0.6703, "num_input_tokens_seen": 1941962752, "step": 1852 }, { "epoch": 2.073286713286713, "grad_norm": 0.3488201719401927, "learning_rate": 1.1027811069922634e-06, "loss": 0.7182, "num_input_tokens_seen": 1943011328, "step": 1853 }, { "epoch": 2.0744055944055946, "grad_norm": 0.3417863472605587, "learning_rate": 1.1003281064887744e-06, "loss": 0.6414, "num_input_tokens_seen": 1944059904, "step": 1854 }, { "epoch": 2.0755244755244755, "grad_norm": 0.37017198187718764, "learning_rate": 1.097877067195288e-06, "loss": 0.679, "num_input_tokens_seen": 1945108480, "step": 1855 }, { "epoch": 2.0766433566433564, "grad_norm": 0.37874080204798755, "learning_rate": 1.0954279925461802e-06, "loss": 0.7014, "num_input_tokens_seen": 1946157056, "step": 1856 }, { "epoch": 2.077762237762238, "grad_norm": 0.35662993657908754, "learning_rate": 1.092980885973079e-06, "loss": 0.7407, "num_input_tokens_seen": 1947205632, "step": 1857 }, { "epoch": 2.0788811188811187, "grad_norm": 0.3508531521283057, "learning_rate": 1.0905357509048498e-06, "loss": 0.6654, "num_input_tokens_seen": 1948254208, "step": 1858 }, { "epoch": 2.08, "grad_norm": 0.3389588527628599, "learning_rate": 1.088092590767599e-06, "loss": 0.6313, "num_input_tokens_seen": 1949302784, "step": 1859 }, { "epoch": 2.081118881118881, "grad_norm": 0.34437211361603354, "learning_rate": 1.0856514089846656e-06, "loss": 0.6359, "num_input_tokens_seen": 1950351360, "step": 1860 }, { "epoch": 2.0822377622377624, "grad_norm": 0.3252394134674311, "learning_rate": 1.0832122089766143e-06, "loss": 0.6465, "num_input_tokens_seen": 1951399936, "step": 1861 }, { "epoch": 2.0833566433566433, "grad_norm": 0.34401183970908583, "learning_rate": 1.080774994161235e-06, "loss": 0.6765, "num_input_tokens_seen": 1952448512, "step": 1862 }, { "epoch": 2.0844755244755246, "grad_norm": 0.3708989557126905, "learning_rate": 1.0783397679535343e-06, "loss": 0.8147, "num_input_tokens_seen": 1953497088, "step": 1863 }, { "epoch": 2.0855944055944056, "grad_norm": 0.3396650970665035, "learning_rate": 1.075906533765734e-06, "loss": 0.7105, "num_input_tokens_seen": 1954545664, "step": 1864 }, { "epoch": 2.0867132867132865, "grad_norm": 0.3927854481854163, "learning_rate": 1.073475295007265e-06, "loss": 0.69, "num_input_tokens_seen": 1955594240, "step": 1865 }, { "epoch": 2.087832167832168, "grad_norm": 0.3386220273235909, "learning_rate": 1.0710460550847593e-06, "loss": 0.5658, "num_input_tokens_seen": 1956642816, "step": 1866 }, { "epoch": 2.0889510489510488, "grad_norm": 0.36092650984517666, "learning_rate": 1.068618817402052e-06, "loss": 0.6637, "num_input_tokens_seen": 1957691392, "step": 1867 }, { "epoch": 2.09006993006993, "grad_norm": 0.35237657452329657, "learning_rate": 1.0661935853601688e-06, "loss": 0.6657, "num_input_tokens_seen": 1958739968, "step": 1868 }, { "epoch": 2.091188811188811, "grad_norm": 0.34239815852933264, "learning_rate": 1.0637703623573278e-06, "loss": 0.6223, "num_input_tokens_seen": 1959788544, "step": 1869 }, { "epoch": 2.0923076923076924, "grad_norm": 0.7037682342442632, "learning_rate": 1.0613491517889326e-06, "loss": 0.6222, "num_input_tokens_seen": 1960837120, "step": 1870 }, { "epoch": 2.0934265734265733, "grad_norm": 0.3638967776662905, "learning_rate": 1.058929957047564e-06, "loss": 0.5786, "num_input_tokens_seen": 1961885696, "step": 1871 }, { "epoch": 2.0945454545454547, "grad_norm": 0.358918438928907, "learning_rate": 1.0565127815229815e-06, "loss": 0.7348, "num_input_tokens_seen": 1962934272, "step": 1872 }, { "epoch": 2.0956643356643356, "grad_norm": 0.3544797188167434, "learning_rate": 1.0540976286021115e-06, "loss": 0.7973, "num_input_tokens_seen": 1963982848, "step": 1873 }, { "epoch": 2.096783216783217, "grad_norm": 0.34041636832831146, "learning_rate": 1.0516845016690502e-06, "loss": 0.7, "num_input_tokens_seen": 1965031424, "step": 1874 }, { "epoch": 2.097902097902098, "grad_norm": 0.39583496046540345, "learning_rate": 1.0492734041050532e-06, "loss": 0.6746, "num_input_tokens_seen": 1966080000, "step": 1875 }, { "epoch": 2.099020979020979, "grad_norm": 0.3433167413788926, "learning_rate": 1.0468643392885335e-06, "loss": 0.663, "num_input_tokens_seen": 1967128576, "step": 1876 }, { "epoch": 2.10013986013986, "grad_norm": 0.39008621299153445, "learning_rate": 1.0444573105950543e-06, "loss": 0.6262, "num_input_tokens_seen": 1968177152, "step": 1877 }, { "epoch": 2.101258741258741, "grad_norm": 0.393982543600159, "learning_rate": 1.0420523213973253e-06, "loss": 0.6432, "num_input_tokens_seen": 1969225728, "step": 1878 }, { "epoch": 2.1023776223776225, "grad_norm": 0.3746407295107264, "learning_rate": 1.0396493750652008e-06, "loss": 0.5529, "num_input_tokens_seen": 1970274304, "step": 1879 }, { "epoch": 2.1034965034965034, "grad_norm": 0.3524443176268899, "learning_rate": 1.0372484749656723e-06, "loss": 0.6106, "num_input_tokens_seen": 1971322880, "step": 1880 }, { "epoch": 2.1046153846153848, "grad_norm": 0.8586449984785981, "learning_rate": 1.0348496244628633e-06, "loss": 0.6082, "num_input_tokens_seen": 1972371456, "step": 1881 }, { "epoch": 2.1057342657342657, "grad_norm": 0.3840157514254585, "learning_rate": 1.0324528269180252e-06, "loss": 0.6217, "num_input_tokens_seen": 1973420032, "step": 1882 }, { "epoch": 2.106853146853147, "grad_norm": 0.5871677257620914, "learning_rate": 1.030058085689532e-06, "loss": 0.6896, "num_input_tokens_seen": 1974468608, "step": 1883 }, { "epoch": 2.107972027972028, "grad_norm": 0.36468347764589165, "learning_rate": 1.0276654041328787e-06, "loss": 0.7581, "num_input_tokens_seen": 1975517184, "step": 1884 }, { "epoch": 2.109090909090909, "grad_norm": 0.35683552665881113, "learning_rate": 1.0252747856006735e-06, "loss": 0.6518, "num_input_tokens_seen": 1976565760, "step": 1885 }, { "epoch": 2.1102097902097903, "grad_norm": 0.4292777942688978, "learning_rate": 1.0228862334426335e-06, "loss": 0.688, "num_input_tokens_seen": 1977614336, "step": 1886 }, { "epoch": 2.111328671328671, "grad_norm": 0.36781798624234446, "learning_rate": 1.0204997510055793e-06, "loss": 0.7086, "num_input_tokens_seen": 1978662912, "step": 1887 }, { "epoch": 2.1124475524475526, "grad_norm": 0.3504562392860338, "learning_rate": 1.0181153416334344e-06, "loss": 0.6979, "num_input_tokens_seen": 1979711488, "step": 1888 }, { "epoch": 2.1135664335664335, "grad_norm": 0.34172621255939717, "learning_rate": 1.015733008667214e-06, "loss": 0.5586, "num_input_tokens_seen": 1980760064, "step": 1889 }, { "epoch": 2.114685314685315, "grad_norm": 0.4047126351380739, "learning_rate": 1.0133527554450262e-06, "loss": 0.5597, "num_input_tokens_seen": 1981808640, "step": 1890 }, { "epoch": 2.1158041958041958, "grad_norm": 0.3973231439655392, "learning_rate": 1.0109745853020655e-06, "loss": 0.6179, "num_input_tokens_seen": 1982857216, "step": 1891 }, { "epoch": 2.116923076923077, "grad_norm": 0.37234505758507175, "learning_rate": 1.0085985015706045e-06, "loss": 0.7409, "num_input_tokens_seen": 1983905792, "step": 1892 }, { "epoch": 2.118041958041958, "grad_norm": 0.4157885233357669, "learning_rate": 1.0062245075799966e-06, "loss": 0.6989, "num_input_tokens_seen": 1984954368, "step": 1893 }, { "epoch": 2.119160839160839, "grad_norm": 0.3909548048765735, "learning_rate": 1.0038526066566624e-06, "loss": 0.6449, "num_input_tokens_seen": 1986002944, "step": 1894 }, { "epoch": 2.1202797202797203, "grad_norm": 0.374967216456381, "learning_rate": 1.0014828021240932e-06, "loss": 0.619, "num_input_tokens_seen": 1987051520, "step": 1895 }, { "epoch": 2.1213986013986013, "grad_norm": 0.3809114717502835, "learning_rate": 9.991150973028428e-07, "loss": 0.6311, "num_input_tokens_seen": 1988100096, "step": 1896 }, { "epoch": 2.1225174825174826, "grad_norm": 0.40346507400847037, "learning_rate": 9.967494955105197e-07, "loss": 0.7862, "num_input_tokens_seen": 1989148672, "step": 1897 }, { "epoch": 2.1236363636363635, "grad_norm": 0.35707304829867637, "learning_rate": 9.9438600006179e-07, "loss": 0.6566, "num_input_tokens_seen": 1990197248, "step": 1898 }, { "epoch": 2.124755244755245, "grad_norm": 0.3949103666304915, "learning_rate": 9.92024614268364e-07, "loss": 0.7726, "num_input_tokens_seen": 1991245824, "step": 1899 }, { "epoch": 2.125874125874126, "grad_norm": 0.38786185705008264, "learning_rate": 9.896653414389996e-07, "loss": 0.5821, "num_input_tokens_seen": 1992294400, "step": 1900 }, { "epoch": 2.126993006993007, "grad_norm": 0.40763570170082863, "learning_rate": 9.873081848794926e-07, "loss": 0.6046, "num_input_tokens_seen": 1993342976, "step": 1901 }, { "epoch": 2.128111888111888, "grad_norm": 0.34201303710236847, "learning_rate": 9.84953147892673e-07, "loss": 0.7002, "num_input_tokens_seen": 1994391552, "step": 1902 }, { "epoch": 2.129230769230769, "grad_norm": 0.3874042326509998, "learning_rate": 9.82600233778402e-07, "loss": 0.6932, "num_input_tokens_seen": 1995440128, "step": 1903 }, { "epoch": 2.1303496503496504, "grad_norm": 0.3729482095954626, "learning_rate": 9.802494458335643e-07, "loss": 0.6934, "num_input_tokens_seen": 1996488704, "step": 1904 }, { "epoch": 2.1314685314685313, "grad_norm": 0.3594190818583719, "learning_rate": 9.77900787352068e-07, "loss": 0.6303, "num_input_tokens_seen": 1997537280, "step": 1905 }, { "epoch": 2.1325874125874127, "grad_norm": 0.3636553070078813, "learning_rate": 9.755542616248361e-07, "loss": 0.6924, "num_input_tokens_seen": 1998585856, "step": 1906 }, { "epoch": 2.1337062937062936, "grad_norm": 0.3433535864089947, "learning_rate": 9.732098719398025e-07, "loss": 0.6302, "num_input_tokens_seen": 1999634432, "step": 1907 }, { "epoch": 2.134825174825175, "grad_norm": 0.33553017050223366, "learning_rate": 9.708676215819098e-07, "loss": 0.6511, "num_input_tokens_seen": 2000683008, "step": 1908 }, { "epoch": 2.135944055944056, "grad_norm": 0.34396635264511355, "learning_rate": 9.68527513833101e-07, "loss": 0.6657, "num_input_tokens_seen": 2001731584, "step": 1909 }, { "epoch": 2.1370629370629373, "grad_norm": 0.3629485828926079, "learning_rate": 9.661895519723183e-07, "loss": 0.658, "num_input_tokens_seen": 2002780160, "step": 1910 }, { "epoch": 2.138181818181818, "grad_norm": 0.3767713083457435, "learning_rate": 9.638537392754968e-07, "loss": 0.7099, "num_input_tokens_seen": 2003828736, "step": 1911 }, { "epoch": 2.139300699300699, "grad_norm": 0.3723263612638146, "learning_rate": 9.615200790155612e-07, "loss": 0.7836, "num_input_tokens_seen": 2004877312, "step": 1912 }, { "epoch": 2.1404195804195805, "grad_norm": 0.3396662534537283, "learning_rate": 9.591885744624183e-07, "loss": 0.7269, "num_input_tokens_seen": 2005925888, "step": 1913 }, { "epoch": 2.1415384615384614, "grad_norm": 0.348811141502699, "learning_rate": 9.56859228882954e-07, "loss": 0.7035, "num_input_tokens_seen": 2006974464, "step": 1914 }, { "epoch": 2.1426573426573428, "grad_norm": 0.5867612549584215, "learning_rate": 9.54532045541031e-07, "loss": 0.6048, "num_input_tokens_seen": 2008023040, "step": 1915 }, { "epoch": 2.1437762237762237, "grad_norm": 0.3825803136591894, "learning_rate": 9.522070276974823e-07, "loss": 0.7171, "num_input_tokens_seen": 2009071616, "step": 1916 }, { "epoch": 2.144895104895105, "grad_norm": 0.3554083419159791, "learning_rate": 9.498841786101065e-07, "loss": 0.7006, "num_input_tokens_seen": 2010120192, "step": 1917 }, { "epoch": 2.146013986013986, "grad_norm": 0.3991950535267171, "learning_rate": 9.475635015336612e-07, "loss": 0.6544, "num_input_tokens_seen": 2011168768, "step": 1918 }, { "epoch": 2.1471328671328673, "grad_norm": 0.36991425400205846, "learning_rate": 9.45244999719862e-07, "loss": 0.8538, "num_input_tokens_seen": 2012217344, "step": 1919 }, { "epoch": 2.1482517482517482, "grad_norm": 0.3529621483177475, "learning_rate": 9.42928676417377e-07, "loss": 0.7319, "num_input_tokens_seen": 2013265920, "step": 1920 }, { "epoch": 2.149370629370629, "grad_norm": 0.6397222697064499, "learning_rate": 9.406145348718218e-07, "loss": 0.7846, "num_input_tokens_seen": 2014314496, "step": 1921 }, { "epoch": 2.1504895104895105, "grad_norm": 0.3922037457169449, "learning_rate": 9.383025783257554e-07, "loss": 0.5742, "num_input_tokens_seen": 2015363072, "step": 1922 }, { "epoch": 2.1516083916083915, "grad_norm": 0.40529844250635877, "learning_rate": 9.359928100186724e-07, "loss": 0.5967, "num_input_tokens_seen": 2016411648, "step": 1923 }, { "epoch": 2.152727272727273, "grad_norm": 0.3715126574232512, "learning_rate": 9.336852331870052e-07, "loss": 0.6586, "num_input_tokens_seen": 2017460224, "step": 1924 }, { "epoch": 2.1538461538461537, "grad_norm": 0.43469491154092155, "learning_rate": 9.313798510641117e-07, "loss": 0.68, "num_input_tokens_seen": 2018508800, "step": 1925 }, { "epoch": 2.154965034965035, "grad_norm": 0.44051803673361034, "learning_rate": 9.290766668802773e-07, "loss": 0.7835, "num_input_tokens_seen": 2019557376, "step": 1926 }, { "epoch": 2.156083916083916, "grad_norm": 0.39637802629260144, "learning_rate": 9.267756838627079e-07, "loss": 0.5909, "num_input_tokens_seen": 2020605952, "step": 1927 }, { "epoch": 2.1572027972027974, "grad_norm": 0.3987267713462657, "learning_rate": 9.244769052355218e-07, "loss": 0.7329, "num_input_tokens_seen": 2021654528, "step": 1928 }, { "epoch": 2.1583216783216783, "grad_norm": 0.4126481393923499, "learning_rate": 9.22180334219753e-07, "loss": 0.7475, "num_input_tokens_seen": 2022703104, "step": 1929 }, { "epoch": 2.1594405594405592, "grad_norm": 0.3402684516388309, "learning_rate": 9.19885974033338e-07, "loss": 0.5997, "num_input_tokens_seen": 2023751680, "step": 1930 }, { "epoch": 2.1605594405594406, "grad_norm": 0.39272589349376275, "learning_rate": 9.175938278911184e-07, "loss": 0.7637, "num_input_tokens_seen": 2024800256, "step": 1931 }, { "epoch": 2.1616783216783215, "grad_norm": 0.4349038652482646, "learning_rate": 9.153038990048335e-07, "loss": 0.8228, "num_input_tokens_seen": 2025848832, "step": 1932 }, { "epoch": 2.162797202797203, "grad_norm": 0.3897790467313609, "learning_rate": 9.130161905831131e-07, "loss": 0.6544, "num_input_tokens_seen": 2026897408, "step": 1933 }, { "epoch": 2.163916083916084, "grad_norm": 0.36349800805981636, "learning_rate": 9.107307058314793e-07, "loss": 0.7327, "num_input_tokens_seen": 2027945984, "step": 1934 }, { "epoch": 2.165034965034965, "grad_norm": 0.37600590447803456, "learning_rate": 9.084474479523347e-07, "loss": 0.6742, "num_input_tokens_seen": 2028994560, "step": 1935 }, { "epoch": 2.166153846153846, "grad_norm": 0.351348302727689, "learning_rate": 9.061664201449643e-07, "loss": 0.6392, "num_input_tokens_seen": 2030043136, "step": 1936 }, { "epoch": 2.1672727272727275, "grad_norm": 0.38137992971005014, "learning_rate": 9.038876256055288e-07, "loss": 0.6315, "num_input_tokens_seen": 2031091712, "step": 1937 }, { "epoch": 2.1683916083916084, "grad_norm": 0.3961481651559409, "learning_rate": 9.016110675270562e-07, "loss": 0.722, "num_input_tokens_seen": 2032140288, "step": 1938 }, { "epoch": 2.1695104895104893, "grad_norm": 0.35719693373207195, "learning_rate": 8.993367490994451e-07, "loss": 0.7279, "num_input_tokens_seen": 2033188864, "step": 1939 }, { "epoch": 2.1706293706293707, "grad_norm": 0.359831843188822, "learning_rate": 8.970646735094521e-07, "loss": 0.7584, "num_input_tokens_seen": 2034237440, "step": 1940 }, { "epoch": 2.1717482517482516, "grad_norm": 0.36981631254320313, "learning_rate": 8.947948439406934e-07, "loss": 0.64, "num_input_tokens_seen": 2035286016, "step": 1941 }, { "epoch": 2.172867132867133, "grad_norm": 0.36952148954214126, "learning_rate": 8.925272635736387e-07, "loss": 0.6578, "num_input_tokens_seen": 2036334592, "step": 1942 }, { "epoch": 2.173986013986014, "grad_norm": 0.3569918784448221, "learning_rate": 8.902619355856032e-07, "loss": 0.649, "num_input_tokens_seen": 2037383168, "step": 1943 }, { "epoch": 2.1751048951048952, "grad_norm": 0.3600871826447918, "learning_rate": 8.879988631507494e-07, "loss": 0.6007, "num_input_tokens_seen": 2038431744, "step": 1944 }, { "epoch": 2.176223776223776, "grad_norm": 0.36879638534764214, "learning_rate": 8.857380494400764e-07, "loss": 0.6682, "num_input_tokens_seen": 2039480320, "step": 1945 }, { "epoch": 2.1773426573426575, "grad_norm": 0.4076436342568645, "learning_rate": 8.834794976214206e-07, "loss": 0.6925, "num_input_tokens_seen": 2040528896, "step": 1946 }, { "epoch": 2.1784615384615384, "grad_norm": 0.33428736982048224, "learning_rate": 8.812232108594482e-07, "loss": 0.5794, "num_input_tokens_seen": 2041577472, "step": 1947 }, { "epoch": 2.17958041958042, "grad_norm": 0.37444889629014944, "learning_rate": 8.789691923156524e-07, "loss": 0.695, "num_input_tokens_seen": 2042626048, "step": 1948 }, { "epoch": 2.1806993006993007, "grad_norm": 0.38685310563597297, "learning_rate": 8.767174451483468e-07, "loss": 0.5919, "num_input_tokens_seen": 2043674624, "step": 1949 }, { "epoch": 2.1818181818181817, "grad_norm": 0.399894274429508, "learning_rate": 8.744679725126621e-07, "loss": 0.6265, "num_input_tokens_seen": 2044723200, "step": 1950 }, { "epoch": 2.182937062937063, "grad_norm": 0.35327980180888685, "learning_rate": 8.722207775605437e-07, "loss": 0.6858, "num_input_tokens_seen": 2045771776, "step": 1951 }, { "epoch": 2.184055944055944, "grad_norm": 0.3338447996853078, "learning_rate": 8.699758634407452e-07, "loss": 0.6281, "num_input_tokens_seen": 2046820352, "step": 1952 }, { "epoch": 2.1851748251748253, "grad_norm": 0.37713554835938334, "learning_rate": 8.677332332988236e-07, "loss": 0.7444, "num_input_tokens_seen": 2047868928, "step": 1953 }, { "epoch": 2.1862937062937062, "grad_norm": 0.356525738107378, "learning_rate": 8.654928902771359e-07, "loss": 0.5745, "num_input_tokens_seen": 2048917504, "step": 1954 }, { "epoch": 2.1874125874125876, "grad_norm": 0.4063122572780938, "learning_rate": 8.632548375148333e-07, "loss": 0.7492, "num_input_tokens_seen": 2049966080, "step": 1955 }, { "epoch": 2.1885314685314685, "grad_norm": 0.3621411100625131, "learning_rate": 8.610190781478595e-07, "loss": 0.6301, "num_input_tokens_seen": 2051014656, "step": 1956 }, { "epoch": 2.18965034965035, "grad_norm": 0.3716436333988751, "learning_rate": 8.587856153089444e-07, "loss": 0.6863, "num_input_tokens_seen": 2052063232, "step": 1957 }, { "epoch": 2.190769230769231, "grad_norm": 0.371780288689286, "learning_rate": 8.565544521276004e-07, "loss": 0.6687, "num_input_tokens_seen": 2053111808, "step": 1958 }, { "epoch": 2.1918881118881117, "grad_norm": 0.392623569154893, "learning_rate": 8.543255917301163e-07, "loss": 0.6111, "num_input_tokens_seen": 2054160384, "step": 1959 }, { "epoch": 2.193006993006993, "grad_norm": 0.38903219624446717, "learning_rate": 8.520990372395541e-07, "loss": 0.6618, "num_input_tokens_seen": 2055208960, "step": 1960 }, { "epoch": 2.194125874125874, "grad_norm": 0.3485583101718587, "learning_rate": 8.498747917757464e-07, "loss": 0.6658, "num_input_tokens_seen": 2056257536, "step": 1961 }, { "epoch": 2.1952447552447554, "grad_norm": 0.341832494235448, "learning_rate": 8.476528584552896e-07, "loss": 0.5486, "num_input_tokens_seen": 2057306112, "step": 1962 }, { "epoch": 2.1963636363636363, "grad_norm": 0.3789876867103509, "learning_rate": 8.454332403915416e-07, "loss": 0.6476, "num_input_tokens_seen": 2058354688, "step": 1963 }, { "epoch": 2.1974825174825177, "grad_norm": 0.35779419716927985, "learning_rate": 8.432159406946128e-07, "loss": 0.675, "num_input_tokens_seen": 2059403264, "step": 1964 }, { "epoch": 2.1986013986013986, "grad_norm": 0.4452010468497492, "learning_rate": 8.410009624713691e-07, "loss": 0.7093, "num_input_tokens_seen": 2060451840, "step": 1965 }, { "epoch": 2.19972027972028, "grad_norm": 0.37033274946261735, "learning_rate": 8.387883088254206e-07, "loss": 0.6202, "num_input_tokens_seen": 2061500416, "step": 1966 }, { "epoch": 2.200839160839161, "grad_norm": 0.37021530208742126, "learning_rate": 8.365779828571214e-07, "loss": 0.8134, "num_input_tokens_seen": 2062548992, "step": 1967 }, { "epoch": 2.201958041958042, "grad_norm": 0.3555186289471138, "learning_rate": 8.343699876635655e-07, "loss": 0.6977, "num_input_tokens_seen": 2063597568, "step": 1968 }, { "epoch": 2.203076923076923, "grad_norm": 0.4123047452365663, "learning_rate": 8.321643263385776e-07, "loss": 0.695, "num_input_tokens_seen": 2064646144, "step": 1969 }, { "epoch": 2.204195804195804, "grad_norm": 0.3507021704389126, "learning_rate": 8.299610019727159e-07, "loss": 0.7216, "num_input_tokens_seen": 2065694720, "step": 1970 }, { "epoch": 2.2053146853146854, "grad_norm": 0.36577221892803613, "learning_rate": 8.277600176532608e-07, "loss": 0.6096, "num_input_tokens_seen": 2066743296, "step": 1971 }, { "epoch": 2.2064335664335664, "grad_norm": 0.7717914773663034, "learning_rate": 8.255613764642161e-07, "loss": 0.6693, "num_input_tokens_seen": 2067791872, "step": 1972 }, { "epoch": 2.2075524475524477, "grad_norm": 0.3691289597495359, "learning_rate": 8.233650814863026e-07, "loss": 0.6091, "num_input_tokens_seen": 2068840448, "step": 1973 }, { "epoch": 2.2086713286713286, "grad_norm": 0.36520182032695897, "learning_rate": 8.211711357969513e-07, "loss": 0.6001, "num_input_tokens_seen": 2069889024, "step": 1974 }, { "epoch": 2.20979020979021, "grad_norm": 0.3542719239752823, "learning_rate": 8.18979542470304e-07, "loss": 0.658, "num_input_tokens_seen": 2070937600, "step": 1975 }, { "epoch": 2.210909090909091, "grad_norm": 0.3571946820203093, "learning_rate": 8.167903045772041e-07, "loss": 0.7175, "num_input_tokens_seen": 2071986176, "step": 1976 }, { "epoch": 2.212027972027972, "grad_norm": 0.39812950907305095, "learning_rate": 8.146034251851959e-07, "loss": 0.6342, "num_input_tokens_seen": 2073034752, "step": 1977 }, { "epoch": 2.213146853146853, "grad_norm": 0.38406828955153705, "learning_rate": 8.124189073585204e-07, "loss": 0.722, "num_input_tokens_seen": 2074083328, "step": 1978 }, { "epoch": 2.214265734265734, "grad_norm": 0.3913529334982864, "learning_rate": 8.102367541581055e-07, "loss": 0.6942, "num_input_tokens_seen": 2075131904, "step": 1979 }, { "epoch": 2.2153846153846155, "grad_norm": 0.7276733150178811, "learning_rate": 8.080569686415704e-07, "loss": 0.5802, "num_input_tokens_seen": 2076180480, "step": 1980 }, { "epoch": 2.2165034965034964, "grad_norm": 0.3721593767457791, "learning_rate": 8.05879553863213e-07, "loss": 0.7118, "num_input_tokens_seen": 2077229056, "step": 1981 }, { "epoch": 2.217622377622378, "grad_norm": 0.38839107773489306, "learning_rate": 8.037045128740115e-07, "loss": 0.5638, "num_input_tokens_seen": 2078277632, "step": 1982 }, { "epoch": 2.2187412587412587, "grad_norm": 0.40385834234761586, "learning_rate": 8.015318487216184e-07, "loss": 0.7202, "num_input_tokens_seen": 2079326208, "step": 1983 }, { "epoch": 2.21986013986014, "grad_norm": 0.5971694407094728, "learning_rate": 7.993615644503531e-07, "loss": 0.6565, "num_input_tokens_seen": 2080374784, "step": 1984 }, { "epoch": 2.220979020979021, "grad_norm": 0.35093580607320085, "learning_rate": 7.971936631012033e-07, "loss": 0.745, "num_input_tokens_seen": 2081423360, "step": 1985 }, { "epoch": 2.222097902097902, "grad_norm": 0.35510124481301414, "learning_rate": 7.950281477118155e-07, "loss": 0.6211, "num_input_tokens_seen": 2082471936, "step": 1986 }, { "epoch": 2.2232167832167833, "grad_norm": 0.33710235906945357, "learning_rate": 7.928650213164945e-07, "loss": 0.5893, "num_input_tokens_seen": 2083520512, "step": 1987 }, { "epoch": 2.224335664335664, "grad_norm": 0.3322550561713402, "learning_rate": 7.90704286946197e-07, "loss": 0.7493, "num_input_tokens_seen": 2084569088, "step": 1988 }, { "epoch": 2.2254545454545456, "grad_norm": 0.36292374360155705, "learning_rate": 7.885459476285292e-07, "loss": 0.6657, "num_input_tokens_seen": 2085617664, "step": 1989 }, { "epoch": 2.2265734265734265, "grad_norm": 0.3487474839668801, "learning_rate": 7.863900063877397e-07, "loss": 0.5458, "num_input_tokens_seen": 2086666240, "step": 1990 }, { "epoch": 2.227692307692308, "grad_norm": 1.1546974963948144, "learning_rate": 7.842364662447161e-07, "loss": 0.6809, "num_input_tokens_seen": 2087714816, "step": 1991 }, { "epoch": 2.228811188811189, "grad_norm": 0.3913856475538096, "learning_rate": 7.820853302169845e-07, "loss": 0.6604, "num_input_tokens_seen": 2088763392, "step": 1992 }, { "epoch": 2.22993006993007, "grad_norm": 0.37234351993586023, "learning_rate": 7.799366013187007e-07, "loss": 0.6575, "num_input_tokens_seen": 2089811968, "step": 1993 }, { "epoch": 2.231048951048951, "grad_norm": 0.3708088232048316, "learning_rate": 7.77790282560649e-07, "loss": 0.5596, "num_input_tokens_seen": 2090860544, "step": 1994 }, { "epoch": 2.232167832167832, "grad_norm": 0.3566860252240544, "learning_rate": 7.75646376950234e-07, "loss": 0.6729, "num_input_tokens_seen": 2091909120, "step": 1995 }, { "epoch": 2.2332867132867134, "grad_norm": 0.37282503272440753, "learning_rate": 7.735048874914805e-07, "loss": 0.6427, "num_input_tokens_seen": 2092957696, "step": 1996 }, { "epoch": 2.2344055944055943, "grad_norm": 0.37263570424432546, "learning_rate": 7.713658171850289e-07, "loss": 0.679, "num_input_tokens_seen": 2094006272, "step": 1997 }, { "epoch": 2.2355244755244756, "grad_norm": 0.40916454424801363, "learning_rate": 7.692291690281267e-07, "loss": 0.6468, "num_input_tokens_seen": 2095054848, "step": 1998 }, { "epoch": 2.2366433566433566, "grad_norm": 0.41029575121494916, "learning_rate": 7.670949460146329e-07, "loss": 0.6207, "num_input_tokens_seen": 2096103424, "step": 1999 }, { "epoch": 2.237762237762238, "grad_norm": 0.39120590520177606, "learning_rate": 7.649631511350025e-07, "loss": 0.6875, "num_input_tokens_seen": 2097152000, "step": 2000 }, { "epoch": 2.237762237762238, "eval_loss": 0.7205791473388672, "eval_runtime": 246.93, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 2097152000, "step": 2000 }, { "epoch": 2.238881118881119, "grad_norm": 0.3688045095118029, "learning_rate": 7.628337873762928e-07, "loss": 0.7097, "num_input_tokens_seen": 2098200576, "step": 2001 }, { "epoch": 2.24, "grad_norm": 0.3665988102450242, "learning_rate": 7.607068577221516e-07, "loss": 0.6993, "num_input_tokens_seen": 2099249152, "step": 2002 }, { "epoch": 2.241118881118881, "grad_norm": 0.34537323560266886, "learning_rate": 7.585823651528157e-07, "loss": 0.6197, "num_input_tokens_seen": 2100297728, "step": 2003 }, { "epoch": 2.242237762237762, "grad_norm": 0.38066025508328544, "learning_rate": 7.564603126451117e-07, "loss": 0.6454, "num_input_tokens_seen": 2101346304, "step": 2004 }, { "epoch": 2.2433566433566434, "grad_norm": 0.3850950961649825, "learning_rate": 7.543407031724415e-07, "loss": 0.6997, "num_input_tokens_seen": 2102394880, "step": 2005 }, { "epoch": 2.2444755244755243, "grad_norm": 0.39524216594389205, "learning_rate": 7.522235397047881e-07, "loss": 0.7814, "num_input_tokens_seen": 2103443456, "step": 2006 }, { "epoch": 2.2455944055944057, "grad_norm": 0.35207175522349904, "learning_rate": 7.501088252087046e-07, "loss": 0.7047, "num_input_tokens_seen": 2104492032, "step": 2007 }, { "epoch": 2.2467132867132866, "grad_norm": 0.3923455462004978, "learning_rate": 7.479965626473118e-07, "loss": 0.5965, "num_input_tokens_seen": 2105540608, "step": 2008 }, { "epoch": 2.247832167832168, "grad_norm": 0.3350148042297089, "learning_rate": 7.458867549802998e-07, "loss": 0.7117, "num_input_tokens_seen": 2106589184, "step": 2009 }, { "epoch": 2.248951048951049, "grad_norm": 0.3603376989571429, "learning_rate": 7.437794051639133e-07, "loss": 0.7982, "num_input_tokens_seen": 2107637760, "step": 2010 }, { "epoch": 2.2500699300699303, "grad_norm": 0.36806353341496967, "learning_rate": 7.41674516150957e-07, "loss": 0.5344, "num_input_tokens_seen": 2108686336, "step": 2011 }, { "epoch": 2.251188811188811, "grad_norm": 0.3481187240740505, "learning_rate": 7.395720908907842e-07, "loss": 0.6165, "num_input_tokens_seen": 2109734912, "step": 2012 }, { "epoch": 2.252307692307692, "grad_norm": 0.3669506863027627, "learning_rate": 7.374721323292985e-07, "loss": 0.6814, "num_input_tokens_seen": 2110783488, "step": 2013 }, { "epoch": 2.2534265734265735, "grad_norm": 0.35999984152434555, "learning_rate": 7.353746434089476e-07, "loss": 0.6214, "num_input_tokens_seen": 2111832064, "step": 2014 }, { "epoch": 2.2545454545454544, "grad_norm": 0.33956427159250585, "learning_rate": 7.332796270687159e-07, "loss": 0.6101, "num_input_tokens_seen": 2112880640, "step": 2015 }, { "epoch": 2.2556643356643358, "grad_norm": 0.5943893774083661, "learning_rate": 7.311870862441265e-07, "loss": 0.7496, "num_input_tokens_seen": 2113929216, "step": 2016 }, { "epoch": 2.2567832167832167, "grad_norm": 0.34741897282443196, "learning_rate": 7.290970238672307e-07, "loss": 0.7339, "num_input_tokens_seen": 2114977792, "step": 2017 }, { "epoch": 2.257902097902098, "grad_norm": 0.3334738242337691, "learning_rate": 7.270094428666099e-07, "loss": 0.5609, "num_input_tokens_seen": 2116026368, "step": 2018 }, { "epoch": 2.259020979020979, "grad_norm": 0.3532864380247249, "learning_rate": 7.24924346167366e-07, "loss": 0.772, "num_input_tokens_seen": 2117074944, "step": 2019 }, { "epoch": 2.2601398601398603, "grad_norm": 0.4035870312028721, "learning_rate": 7.228417366911222e-07, "loss": 0.7736, "num_input_tokens_seen": 2118123520, "step": 2020 }, { "epoch": 2.2612587412587413, "grad_norm": 0.3686001797311558, "learning_rate": 7.207616173560158e-07, "loss": 0.7358, "num_input_tokens_seen": 2119172096, "step": 2021 }, { "epoch": 2.262377622377622, "grad_norm": 0.3508608276856075, "learning_rate": 7.18683991076694e-07, "loss": 0.674, "num_input_tokens_seen": 2120220672, "step": 2022 }, { "epoch": 2.2634965034965036, "grad_norm": 0.4856307647199245, "learning_rate": 7.166088607643123e-07, "loss": 0.618, "num_input_tokens_seen": 2121269248, "step": 2023 }, { "epoch": 2.2646153846153845, "grad_norm": 0.3639929790049854, "learning_rate": 7.145362293265265e-07, "loss": 0.7095, "num_input_tokens_seen": 2122317824, "step": 2024 }, { "epoch": 2.265734265734266, "grad_norm": 0.34692932474534205, "learning_rate": 7.124660996674951e-07, "loss": 0.637, "num_input_tokens_seen": 2123366400, "step": 2025 }, { "epoch": 2.2668531468531468, "grad_norm": 0.3346365072449412, "learning_rate": 7.103984746878673e-07, "loss": 0.5973, "num_input_tokens_seen": 2124414976, "step": 2026 }, { "epoch": 2.267972027972028, "grad_norm": 0.33928574225747127, "learning_rate": 7.083333572847831e-07, "loss": 0.6508, "num_input_tokens_seen": 2125463552, "step": 2027 }, { "epoch": 2.269090909090909, "grad_norm": 0.8125928801722034, "learning_rate": 7.062707503518718e-07, "loss": 0.6465, "num_input_tokens_seen": 2126512128, "step": 2028 }, { "epoch": 2.2702097902097904, "grad_norm": 0.36102343576518403, "learning_rate": 7.042106567792406e-07, "loss": 0.6888, "num_input_tokens_seen": 2127560704, "step": 2029 }, { "epoch": 2.2713286713286713, "grad_norm": 0.48009661121867697, "learning_rate": 7.021530794534803e-07, "loss": 0.5843, "num_input_tokens_seen": 2128609280, "step": 2030 }, { "epoch": 2.2724475524475523, "grad_norm": 0.35482768254890906, "learning_rate": 7.000980212576522e-07, "loss": 0.6222, "num_input_tokens_seen": 2129657856, "step": 2031 }, { "epoch": 2.2735664335664336, "grad_norm": 0.36708413708094656, "learning_rate": 6.980454850712878e-07, "loss": 0.638, "num_input_tokens_seen": 2130706432, "step": 2032 }, { "epoch": 2.2746853146853145, "grad_norm": 0.4618101416204606, "learning_rate": 6.959954737703872e-07, "loss": 0.5787, "num_input_tokens_seen": 2131755008, "step": 2033 }, { "epoch": 2.275804195804196, "grad_norm": 0.32437971821253436, "learning_rate": 6.939479902274088e-07, "loss": 0.6328, "num_input_tokens_seen": 2132803584, "step": 2034 }, { "epoch": 2.276923076923077, "grad_norm": 0.6336743036519067, "learning_rate": 6.919030373112748e-07, "loss": 0.6582, "num_input_tokens_seen": 2133852160, "step": 2035 }, { "epoch": 2.278041958041958, "grad_norm": 0.36082260715490544, "learning_rate": 6.89860617887356e-07, "loss": 0.7161, "num_input_tokens_seen": 2134900736, "step": 2036 }, { "epoch": 2.279160839160839, "grad_norm": 0.3630895787707352, "learning_rate": 6.87820734817477e-07, "loss": 0.5773, "num_input_tokens_seen": 2135949312, "step": 2037 }, { "epoch": 2.2802797202797205, "grad_norm": 0.41238539090302917, "learning_rate": 6.857833909599063e-07, "loss": 0.669, "num_input_tokens_seen": 2136997888, "step": 2038 }, { "epoch": 2.2813986013986014, "grad_norm": 0.38311062639659044, "learning_rate": 6.837485891693541e-07, "loss": 0.7104, "num_input_tokens_seen": 2138046464, "step": 2039 }, { "epoch": 2.2825174825174823, "grad_norm": 0.35981325832964006, "learning_rate": 6.817163322969708e-07, "loss": 0.5769, "num_input_tokens_seen": 2139095040, "step": 2040 }, { "epoch": 2.2836363636363637, "grad_norm": 0.365790341125758, "learning_rate": 6.796866231903402e-07, "loss": 0.6786, "num_input_tokens_seen": 2140143616, "step": 2041 }, { "epoch": 2.2847552447552446, "grad_norm": 0.3619990718451309, "learning_rate": 6.776594646934756e-07, "loss": 0.5712, "num_input_tokens_seen": 2141192192, "step": 2042 }, { "epoch": 2.285874125874126, "grad_norm": 0.40638929211728086, "learning_rate": 6.756348596468168e-07, "loss": 0.7821, "num_input_tokens_seen": 2142240768, "step": 2043 }, { "epoch": 2.286993006993007, "grad_norm": 0.3486111647132872, "learning_rate": 6.736128108872244e-07, "loss": 0.5473, "num_input_tokens_seen": 2143289344, "step": 2044 }, { "epoch": 2.2881118881118883, "grad_norm": 0.33281356152475117, "learning_rate": 6.715933212479791e-07, "loss": 0.6668, "num_input_tokens_seen": 2144337920, "step": 2045 }, { "epoch": 2.289230769230769, "grad_norm": 0.3745261809788843, "learning_rate": 6.695763935587752e-07, "loss": 0.6109, "num_input_tokens_seen": 2145386496, "step": 2046 }, { "epoch": 2.2903496503496505, "grad_norm": 0.3692696965314896, "learning_rate": 6.675620306457172e-07, "loss": 0.7932, "num_input_tokens_seen": 2146435072, "step": 2047 }, { "epoch": 2.2914685314685315, "grad_norm": 0.5018792129589581, "learning_rate": 6.655502353313147e-07, "loss": 0.7036, "num_input_tokens_seen": 2147483648, "step": 2048 }, { "epoch": 2.2925874125874124, "grad_norm": 0.36720636232778764, "learning_rate": 6.635410104344819e-07, "loss": 0.8149, "num_input_tokens_seen": 2148532224, "step": 2049 }, { "epoch": 2.2937062937062938, "grad_norm": 0.4609362930639404, "learning_rate": 6.615343587705284e-07, "loss": 0.7342, "num_input_tokens_seen": 2149580800, "step": 2050 }, { "epoch": 2.2948251748251747, "grad_norm": 0.3588950981485549, "learning_rate": 6.595302831511607e-07, "loss": 0.721, "num_input_tokens_seen": 2150629376, "step": 2051 }, { "epoch": 2.295944055944056, "grad_norm": 0.380067720127937, "learning_rate": 6.575287863844753e-07, "loss": 0.6407, "num_input_tokens_seen": 2151677952, "step": 2052 }, { "epoch": 2.297062937062937, "grad_norm": 0.4010449502164757, "learning_rate": 6.555298712749538e-07, "loss": 0.7026, "num_input_tokens_seen": 2152726528, "step": 2053 }, { "epoch": 2.2981818181818183, "grad_norm": 0.3724979393058588, "learning_rate": 6.535335406234627e-07, "loss": 0.5839, "num_input_tokens_seen": 2153775104, "step": 2054 }, { "epoch": 2.2993006993006992, "grad_norm": 0.3632401255730732, "learning_rate": 6.515397972272444e-07, "loss": 0.6603, "num_input_tokens_seen": 2154823680, "step": 2055 }, { "epoch": 2.3004195804195806, "grad_norm": 0.38004384978636835, "learning_rate": 6.495486438799181e-07, "loss": 0.7736, "num_input_tokens_seen": 2155872256, "step": 2056 }, { "epoch": 2.3015384615384615, "grad_norm": 0.3417002000461984, "learning_rate": 6.475600833714743e-07, "loss": 0.6303, "num_input_tokens_seen": 2156920832, "step": 2057 }, { "epoch": 2.3026573426573425, "grad_norm": 0.3462004242445, "learning_rate": 6.455741184882674e-07, "loss": 0.6659, "num_input_tokens_seen": 2157969408, "step": 2058 }, { "epoch": 2.303776223776224, "grad_norm": 0.462324202736324, "learning_rate": 6.435907520130191e-07, "loss": 0.7286, "num_input_tokens_seen": 2159017984, "step": 2059 }, { "epoch": 2.3048951048951047, "grad_norm": 0.36070784526535443, "learning_rate": 6.41609986724806e-07, "loss": 0.6201, "num_input_tokens_seen": 2160066560, "step": 2060 }, { "epoch": 2.306013986013986, "grad_norm": 0.3650567403498594, "learning_rate": 6.396318253990628e-07, "loss": 0.6331, "num_input_tokens_seen": 2161115136, "step": 2061 }, { "epoch": 2.307132867132867, "grad_norm": 0.3577346848384965, "learning_rate": 6.376562708075753e-07, "loss": 0.5273, "num_input_tokens_seen": 2162163712, "step": 2062 }, { "epoch": 2.3082517482517484, "grad_norm": 0.34909792557997227, "learning_rate": 6.356833257184747e-07, "loss": 0.736, "num_input_tokens_seen": 2163212288, "step": 2063 }, { "epoch": 2.3093706293706293, "grad_norm": 0.3334970256897544, "learning_rate": 6.337129928962385e-07, "loss": 0.5976, "num_input_tokens_seen": 2164260864, "step": 2064 }, { "epoch": 2.3104895104895107, "grad_norm": 0.34278665846161593, "learning_rate": 6.317452751016815e-07, "loss": 0.7811, "num_input_tokens_seen": 2165309440, "step": 2065 }, { "epoch": 2.3116083916083916, "grad_norm": 0.4560658405156493, "learning_rate": 6.297801750919558e-07, "loss": 0.6262, "num_input_tokens_seen": 2166358016, "step": 2066 }, { "epoch": 2.3127272727272725, "grad_norm": 0.38419095000595316, "learning_rate": 6.278176956205462e-07, "loss": 0.7044, "num_input_tokens_seen": 2167406592, "step": 2067 }, { "epoch": 2.313846153846154, "grad_norm": 0.4740185696284329, "learning_rate": 6.25857839437263e-07, "loss": 0.6227, "num_input_tokens_seen": 2168455168, "step": 2068 }, { "epoch": 2.314965034965035, "grad_norm": 0.37918441472396824, "learning_rate": 6.239006092882438e-07, "loss": 0.5727, "num_input_tokens_seen": 2169503744, "step": 2069 }, { "epoch": 2.316083916083916, "grad_norm": 0.36763727735794355, "learning_rate": 6.219460079159434e-07, "loss": 0.5844, "num_input_tokens_seen": 2170552320, "step": 2070 }, { "epoch": 2.317202797202797, "grad_norm": 0.45424102901507196, "learning_rate": 6.19994038059136e-07, "loss": 0.7689, "num_input_tokens_seen": 2171600896, "step": 2071 }, { "epoch": 2.3183216783216785, "grad_norm": 0.3817127200220786, "learning_rate": 6.180447024529074e-07, "loss": 0.775, "num_input_tokens_seen": 2172649472, "step": 2072 }, { "epoch": 2.3194405594405594, "grad_norm": 0.3722563904479236, "learning_rate": 6.160980038286529e-07, "loss": 0.6933, "num_input_tokens_seen": 2173698048, "step": 2073 }, { "epoch": 2.3205594405594407, "grad_norm": 0.5679839020368583, "learning_rate": 6.141539449140718e-07, "loss": 0.6771, "num_input_tokens_seen": 2174746624, "step": 2074 }, { "epoch": 2.3216783216783217, "grad_norm": 0.36052446482422, "learning_rate": 6.122125284331646e-07, "loss": 0.602, "num_input_tokens_seen": 2175795200, "step": 2075 }, { "epoch": 2.3227972027972026, "grad_norm": 0.39960685564698506, "learning_rate": 6.1027375710623e-07, "loss": 0.702, "num_input_tokens_seen": 2176843776, "step": 2076 }, { "epoch": 2.323916083916084, "grad_norm": 0.3323519573292894, "learning_rate": 6.083376336498608e-07, "loss": 0.6474, "num_input_tokens_seen": 2177892352, "step": 2077 }, { "epoch": 2.325034965034965, "grad_norm": 0.40553358709849674, "learning_rate": 6.064041607769397e-07, "loss": 0.645, "num_input_tokens_seen": 2178940928, "step": 2078 }, { "epoch": 2.3261538461538462, "grad_norm": 0.34547847347198457, "learning_rate": 6.044733411966336e-07, "loss": 0.6766, "num_input_tokens_seen": 2179989504, "step": 2079 }, { "epoch": 2.327272727272727, "grad_norm": 0.34691700554140253, "learning_rate": 6.025451776143923e-07, "loss": 0.7105, "num_input_tokens_seen": 2181038080, "step": 2080 }, { "epoch": 2.3283916083916085, "grad_norm": 0.38622090352203975, "learning_rate": 6.006196727319452e-07, "loss": 0.8447, "num_input_tokens_seen": 2182086656, "step": 2081 }, { "epoch": 2.3295104895104894, "grad_norm": 0.3557413122467602, "learning_rate": 5.986968292472955e-07, "loss": 0.8273, "num_input_tokens_seen": 2183135232, "step": 2082 }, { "epoch": 2.330629370629371, "grad_norm": 0.4218834237211476, "learning_rate": 5.967766498547181e-07, "loss": 0.6558, "num_input_tokens_seen": 2184183808, "step": 2083 }, { "epoch": 2.3317482517482517, "grad_norm": 0.3575517039081643, "learning_rate": 5.94859137244754e-07, "loss": 0.6443, "num_input_tokens_seen": 2185232384, "step": 2084 }, { "epoch": 2.3328671328671327, "grad_norm": 0.37663118821245345, "learning_rate": 5.929442941042066e-07, "loss": 0.7713, "num_input_tokens_seen": 2186280960, "step": 2085 }, { "epoch": 2.333986013986014, "grad_norm": 0.3407759102436275, "learning_rate": 5.910321231161409e-07, "loss": 0.6315, "num_input_tokens_seen": 2187329536, "step": 2086 }, { "epoch": 2.335104895104895, "grad_norm": 0.34075068629379757, "learning_rate": 5.891226269598768e-07, "loss": 0.5425, "num_input_tokens_seen": 2188378112, "step": 2087 }, { "epoch": 2.3362237762237763, "grad_norm": 0.3429645537782866, "learning_rate": 5.872158083109874e-07, "loss": 0.6496, "num_input_tokens_seen": 2189426688, "step": 2088 }, { "epoch": 2.3373426573426572, "grad_norm": 0.3510555960388594, "learning_rate": 5.853116698412913e-07, "loss": 0.6263, "num_input_tokens_seen": 2190475264, "step": 2089 }, { "epoch": 2.3384615384615386, "grad_norm": 0.3550280706115816, "learning_rate": 5.83410214218855e-07, "loss": 0.6418, "num_input_tokens_seen": 2191523840, "step": 2090 }, { "epoch": 2.3395804195804195, "grad_norm": 0.3935241024967228, "learning_rate": 5.815114441079825e-07, "loss": 0.6581, "num_input_tokens_seen": 2192572416, "step": 2091 }, { "epoch": 2.340699300699301, "grad_norm": 0.3558506778720097, "learning_rate": 5.796153621692174e-07, "loss": 0.6411, "num_input_tokens_seen": 2193620992, "step": 2092 }, { "epoch": 2.341818181818182, "grad_norm": 0.3677586078871795, "learning_rate": 5.777219710593365e-07, "loss": 0.6982, "num_input_tokens_seen": 2194669568, "step": 2093 }, { "epoch": 2.3429370629370627, "grad_norm": 0.41410723432252894, "learning_rate": 5.758312734313442e-07, "loss": 0.5993, "num_input_tokens_seen": 2195718144, "step": 2094 }, { "epoch": 2.344055944055944, "grad_norm": 0.36897799066747605, "learning_rate": 5.739432719344737e-07, "loss": 0.7793, "num_input_tokens_seen": 2196766720, "step": 2095 }, { "epoch": 2.345174825174825, "grad_norm": 0.35598234511967825, "learning_rate": 5.720579692141773e-07, "loss": 0.7526, "num_input_tokens_seen": 2197815296, "step": 2096 }, { "epoch": 2.3462937062937064, "grad_norm": 0.3517395300781419, "learning_rate": 5.70175367912128e-07, "loss": 0.7495, "num_input_tokens_seen": 2198863872, "step": 2097 }, { "epoch": 2.3474125874125873, "grad_norm": 0.35744540106934414, "learning_rate": 5.682954706662137e-07, "loss": 0.7236, "num_input_tokens_seen": 2199912448, "step": 2098 }, { "epoch": 2.3485314685314687, "grad_norm": 0.3587274779632327, "learning_rate": 5.664182801105314e-07, "loss": 0.701, "num_input_tokens_seen": 2200961024, "step": 2099 }, { "epoch": 2.3496503496503496, "grad_norm": 0.3830829602582527, "learning_rate": 5.645437988753877e-07, "loss": 0.6934, "num_input_tokens_seen": 2202009600, "step": 2100 }, { "epoch": 2.350769230769231, "grad_norm": 0.3497503379954363, "learning_rate": 5.626720295872911e-07, "loss": 0.6752, "num_input_tokens_seen": 2203058176, "step": 2101 }, { "epoch": 2.351888111888112, "grad_norm": 0.3498951623192306, "learning_rate": 5.608029748689514e-07, "loss": 0.6061, "num_input_tokens_seen": 2204106752, "step": 2102 }, { "epoch": 2.353006993006993, "grad_norm": 0.3479578824572703, "learning_rate": 5.589366373392754e-07, "loss": 0.6042, "num_input_tokens_seen": 2205155328, "step": 2103 }, { "epoch": 2.354125874125874, "grad_norm": 0.3850090182693759, "learning_rate": 5.570730196133597e-07, "loss": 0.7077, "num_input_tokens_seen": 2206203904, "step": 2104 }, { "epoch": 2.355244755244755, "grad_norm": 0.34202324573328563, "learning_rate": 5.552121243024935e-07, "loss": 0.5474, "num_input_tokens_seen": 2207252480, "step": 2105 }, { "epoch": 2.3563636363636364, "grad_norm": 0.36182783615400044, "learning_rate": 5.533539540141483e-07, "loss": 0.7622, "num_input_tokens_seen": 2208301056, "step": 2106 }, { "epoch": 2.3574825174825174, "grad_norm": 0.3784605830548415, "learning_rate": 5.514985113519794e-07, "loss": 0.5663, "num_input_tokens_seen": 2209349632, "step": 2107 }, { "epoch": 2.3586013986013987, "grad_norm": 0.3355597653112651, "learning_rate": 5.4964579891582e-07, "loss": 0.6412, "num_input_tokens_seen": 2210398208, "step": 2108 }, { "epoch": 2.3597202797202796, "grad_norm": 0.34624100846593536, "learning_rate": 5.477958193016758e-07, "loss": 0.7015, "num_input_tokens_seen": 2211446784, "step": 2109 }, { "epoch": 2.360839160839161, "grad_norm": 0.3681703697094909, "learning_rate": 5.459485751017263e-07, "loss": 0.712, "num_input_tokens_seen": 2212495360, "step": 2110 }, { "epoch": 2.361958041958042, "grad_norm": 0.3335340668452395, "learning_rate": 5.441040689043148e-07, "loss": 0.6735, "num_input_tokens_seen": 2213543936, "step": 2111 }, { "epoch": 2.363076923076923, "grad_norm": 0.33506575945699735, "learning_rate": 5.422623032939508e-07, "loss": 0.6204, "num_input_tokens_seen": 2214592512, "step": 2112 }, { "epoch": 2.364195804195804, "grad_norm": 0.3826194231070801, "learning_rate": 5.404232808513027e-07, "loss": 0.7015, "num_input_tokens_seen": 2215641088, "step": 2113 }, { "epoch": 2.365314685314685, "grad_norm": 0.3669199290579852, "learning_rate": 5.385870041531963e-07, "loss": 0.6456, "num_input_tokens_seen": 2216689664, "step": 2114 }, { "epoch": 2.3664335664335665, "grad_norm": 0.364006317613497, "learning_rate": 5.367534757726079e-07, "loss": 0.6823, "num_input_tokens_seen": 2217738240, "step": 2115 }, { "epoch": 2.3675524475524474, "grad_norm": 0.33045769253424645, "learning_rate": 5.349226982786632e-07, "loss": 0.6688, "num_input_tokens_seen": 2218786816, "step": 2116 }, { "epoch": 2.368671328671329, "grad_norm": 0.4561948159056627, "learning_rate": 5.330946742366356e-07, "loss": 0.722, "num_input_tokens_seen": 2219835392, "step": 2117 }, { "epoch": 2.3697902097902097, "grad_norm": 0.35410182279373864, "learning_rate": 5.312694062079385e-07, "loss": 0.6293, "num_input_tokens_seen": 2220883968, "step": 2118 }, { "epoch": 2.370909090909091, "grad_norm": 0.35818896582089094, "learning_rate": 5.294468967501248e-07, "loss": 0.7684, "num_input_tokens_seen": 2221932544, "step": 2119 }, { "epoch": 2.372027972027972, "grad_norm": 0.34824197483925745, "learning_rate": 5.276271484168808e-07, "loss": 0.6777, "num_input_tokens_seen": 2222981120, "step": 2120 }, { "epoch": 2.373146853146853, "grad_norm": 0.36767602534806654, "learning_rate": 5.258101637580238e-07, "loss": 0.6338, "num_input_tokens_seen": 2224029696, "step": 2121 }, { "epoch": 2.3742657342657343, "grad_norm": 0.3437434070320051, "learning_rate": 5.239959453195001e-07, "loss": 0.6652, "num_input_tokens_seen": 2225078272, "step": 2122 }, { "epoch": 2.375384615384615, "grad_norm": 0.3590888826844622, "learning_rate": 5.221844956433794e-07, "loss": 0.6101, "num_input_tokens_seen": 2226126848, "step": 2123 }, { "epoch": 2.3765034965034966, "grad_norm": 0.35288108494614323, "learning_rate": 5.203758172678522e-07, "loss": 0.7176, "num_input_tokens_seen": 2227175424, "step": 2124 }, { "epoch": 2.3776223776223775, "grad_norm": 0.3474986201265241, "learning_rate": 5.185699127272243e-07, "loss": 0.6639, "num_input_tokens_seen": 2228224000, "step": 2125 }, { "epoch": 2.378741258741259, "grad_norm": 0.3806667044308288, "learning_rate": 5.167667845519167e-07, "loss": 0.7719, "num_input_tokens_seen": 2229272576, "step": 2126 }, { "epoch": 2.37986013986014, "grad_norm": 0.38054122689991565, "learning_rate": 5.149664352684586e-07, "loss": 0.7191, "num_input_tokens_seen": 2230321152, "step": 2127 }, { "epoch": 2.380979020979021, "grad_norm": 0.40393145697702404, "learning_rate": 5.131688673994867e-07, "loss": 0.6281, "num_input_tokens_seen": 2231369728, "step": 2128 }, { "epoch": 2.382097902097902, "grad_norm": 0.39232507799180044, "learning_rate": 5.113740834637407e-07, "loss": 0.7414, "num_input_tokens_seen": 2232418304, "step": 2129 }, { "epoch": 2.383216783216783, "grad_norm": 0.3408644342145396, "learning_rate": 5.095820859760572e-07, "loss": 0.6552, "num_input_tokens_seen": 2233466880, "step": 2130 }, { "epoch": 2.3843356643356644, "grad_norm": 0.3809191213336915, "learning_rate": 5.077928774473714e-07, "loss": 0.7107, "num_input_tokens_seen": 2234515456, "step": 2131 }, { "epoch": 2.3854545454545453, "grad_norm": 0.4236676264019149, "learning_rate": 5.060064603847079e-07, "loss": 0.6555, "num_input_tokens_seen": 2235564032, "step": 2132 }, { "epoch": 2.3865734265734266, "grad_norm": 0.34989320261602885, "learning_rate": 5.042228372911815e-07, "loss": 0.617, "num_input_tokens_seen": 2236612608, "step": 2133 }, { "epoch": 2.3876923076923076, "grad_norm": 0.3783674888022726, "learning_rate": 5.024420106659928e-07, "loss": 0.6716, "num_input_tokens_seen": 2237661184, "step": 2134 }, { "epoch": 2.388811188811189, "grad_norm": 0.35937273649755996, "learning_rate": 5.006639830044219e-07, "loss": 0.6374, "num_input_tokens_seen": 2238709760, "step": 2135 }, { "epoch": 2.38993006993007, "grad_norm": 0.5671121386397902, "learning_rate": 4.98888756797829e-07, "loss": 0.7336, "num_input_tokens_seen": 2239758336, "step": 2136 }, { "epoch": 2.391048951048951, "grad_norm": 0.3692073670513007, "learning_rate": 4.971163345336469e-07, "loss": 0.627, "num_input_tokens_seen": 2240806912, "step": 2137 }, { "epoch": 2.392167832167832, "grad_norm": 0.3705523657404764, "learning_rate": 4.953467186953814e-07, "loss": 0.6792, "num_input_tokens_seen": 2241855488, "step": 2138 }, { "epoch": 2.393286713286713, "grad_norm": 0.34283775745400724, "learning_rate": 4.935799117626058e-07, "loss": 0.5442, "num_input_tokens_seen": 2242904064, "step": 2139 }, { "epoch": 2.3944055944055944, "grad_norm": 0.3590184958443482, "learning_rate": 4.918159162109559e-07, "loss": 0.7343, "num_input_tokens_seen": 2243952640, "step": 2140 }, { "epoch": 2.3955244755244753, "grad_norm": 0.43918417186258746, "learning_rate": 4.900547345121304e-07, "loss": 0.6794, "num_input_tokens_seen": 2245001216, "step": 2141 }, { "epoch": 2.3966433566433567, "grad_norm": 0.34972683487867307, "learning_rate": 4.882963691338832e-07, "loss": 0.6673, "num_input_tokens_seen": 2246049792, "step": 2142 }, { "epoch": 2.3977622377622376, "grad_norm": 0.3668483549231142, "learning_rate": 4.865408225400234e-07, "loss": 0.7118, "num_input_tokens_seen": 2247098368, "step": 2143 }, { "epoch": 2.398881118881119, "grad_norm": 0.3834194429652244, "learning_rate": 4.847880971904106e-07, "loss": 0.7726, "num_input_tokens_seen": 2248146944, "step": 2144 }, { "epoch": 2.4, "grad_norm": 0.9564295565522206, "learning_rate": 4.830381955409497e-07, "loss": 0.7406, "num_input_tokens_seen": 2249195520, "step": 2145 }, { "epoch": 2.4011188811188813, "grad_norm": 0.41147551797046417, "learning_rate": 4.812911200435913e-07, "loss": 0.7886, "num_input_tokens_seen": 2250244096, "step": 2146 }, { "epoch": 2.402237762237762, "grad_norm": 0.3825769066661367, "learning_rate": 4.795468731463232e-07, "loss": 0.6341, "num_input_tokens_seen": 2251292672, "step": 2147 }, { "epoch": 2.403356643356643, "grad_norm": 0.3541383603126581, "learning_rate": 4.778054572931723e-07, "loss": 0.5739, "num_input_tokens_seen": 2252341248, "step": 2148 }, { "epoch": 2.4044755244755245, "grad_norm": 0.384293026997871, "learning_rate": 4.7606687492419785e-07, "loss": 0.7629, "num_input_tokens_seen": 2253389824, "step": 2149 }, { "epoch": 2.4055944055944054, "grad_norm": 0.3556266545950602, "learning_rate": 4.7433112847548893e-07, "loss": 0.7077, "num_input_tokens_seen": 2254438400, "step": 2150 }, { "epoch": 2.4067132867132868, "grad_norm": 0.41096458618176246, "learning_rate": 4.725982203791607e-07, "loss": 0.7129, "num_input_tokens_seen": 2255486976, "step": 2151 }, { "epoch": 2.4078321678321677, "grad_norm": 0.4044063204307026, "learning_rate": 4.708681530633499e-07, "loss": 0.7221, "num_input_tokens_seen": 2256535552, "step": 2152 }, { "epoch": 2.408951048951049, "grad_norm": 0.36293884242685753, "learning_rate": 4.691409289522156e-07, "loss": 0.6069, "num_input_tokens_seen": 2257584128, "step": 2153 }, { "epoch": 2.41006993006993, "grad_norm": 0.3550973495473056, "learning_rate": 4.6741655046593087e-07, "loss": 0.5686, "num_input_tokens_seen": 2258632704, "step": 2154 }, { "epoch": 2.4111888111888113, "grad_norm": 0.4731257444619733, "learning_rate": 4.6569502002068336e-07, "loss": 0.7829, "num_input_tokens_seen": 2259681280, "step": 2155 }, { "epoch": 2.4123076923076923, "grad_norm": 0.4029013395573785, "learning_rate": 4.639763400286679e-07, "loss": 0.6782, "num_input_tokens_seen": 2260729856, "step": 2156 }, { "epoch": 2.413426573426573, "grad_norm": 0.3769090578229831, "learning_rate": 4.622605128980862e-07, "loss": 0.7621, "num_input_tokens_seen": 2261778432, "step": 2157 }, { "epoch": 2.4145454545454546, "grad_norm": 0.3726662422737028, "learning_rate": 4.6054754103314255e-07, "loss": 0.7352, "num_input_tokens_seen": 2262827008, "step": 2158 }, { "epoch": 2.4156643356643355, "grad_norm": 0.44692183338738384, "learning_rate": 4.588374268340412e-07, "loss": 0.617, "num_input_tokens_seen": 2263875584, "step": 2159 }, { "epoch": 2.416783216783217, "grad_norm": 0.3806491758771308, "learning_rate": 4.5713017269698207e-07, "loss": 0.8254, "num_input_tokens_seen": 2264924160, "step": 2160 }, { "epoch": 2.4179020979020978, "grad_norm": 0.3687675897395281, "learning_rate": 4.5542578101415576e-07, "loss": 0.7123, "num_input_tokens_seen": 2265972736, "step": 2161 }, { "epoch": 2.419020979020979, "grad_norm": 0.3763761147259082, "learning_rate": 4.5372425417374466e-07, "loss": 0.6617, "num_input_tokens_seen": 2267021312, "step": 2162 }, { "epoch": 2.42013986013986, "grad_norm": 0.42334703811069646, "learning_rate": 4.5202559455991473e-07, "loss": 0.6397, "num_input_tokens_seen": 2268069888, "step": 2163 }, { "epoch": 2.4212587412587414, "grad_norm": 0.3514343827808528, "learning_rate": 4.5032980455281596e-07, "loss": 0.5518, "num_input_tokens_seen": 2269118464, "step": 2164 }, { "epoch": 2.4223776223776223, "grad_norm": 0.5333203681244576, "learning_rate": 4.48636886528577e-07, "loss": 0.7122, "num_input_tokens_seen": 2270167040, "step": 2165 }, { "epoch": 2.4234965034965033, "grad_norm": 0.3591377366517055, "learning_rate": 4.469468428593016e-07, "loss": 0.6848, "num_input_tokens_seen": 2271215616, "step": 2166 }, { "epoch": 2.4246153846153846, "grad_norm": 0.32777518580896303, "learning_rate": 4.4525967591306757e-07, "loss": 0.5997, "num_input_tokens_seen": 2272264192, "step": 2167 }, { "epoch": 2.425734265734266, "grad_norm": 0.3326044868460106, "learning_rate": 4.4357538805391953e-07, "loss": 0.7304, "num_input_tokens_seen": 2273312768, "step": 2168 }, { "epoch": 2.426853146853147, "grad_norm": 0.3873436973031661, "learning_rate": 4.418939816418699e-07, "loss": 0.6174, "num_input_tokens_seen": 2274361344, "step": 2169 }, { "epoch": 2.427972027972028, "grad_norm": 0.3481532534450569, "learning_rate": 4.4021545903289357e-07, "loss": 0.6731, "num_input_tokens_seen": 2275409920, "step": 2170 }, { "epoch": 2.429090909090909, "grad_norm": 0.3393223642060605, "learning_rate": 4.3853982257892335e-07, "loss": 0.6229, "num_input_tokens_seen": 2276458496, "step": 2171 }, { "epoch": 2.43020979020979, "grad_norm": 0.3863650340360159, "learning_rate": 4.3686707462784945e-07, "loss": 0.7666, "num_input_tokens_seen": 2277507072, "step": 2172 }, { "epoch": 2.4313286713286715, "grad_norm": 0.36053775523422327, "learning_rate": 4.3519721752351305e-07, "loss": 0.6355, "num_input_tokens_seen": 2278555648, "step": 2173 }, { "epoch": 2.4324475524475524, "grad_norm": 0.33401967798280957, "learning_rate": 4.3353025360570623e-07, "loss": 0.6604, "num_input_tokens_seen": 2279604224, "step": 2174 }, { "epoch": 2.4335664335664333, "grad_norm": 0.37892269932467637, "learning_rate": 4.3186618521016745e-07, "loss": 0.7101, "num_input_tokens_seen": 2280652800, "step": 2175 }, { "epoch": 2.4346853146853147, "grad_norm": 0.3570847542729435, "learning_rate": 4.302050146685757e-07, "loss": 0.6598, "num_input_tokens_seen": 2281701376, "step": 2176 }, { "epoch": 2.435804195804196, "grad_norm": 0.3762644473454635, "learning_rate": 4.2854674430855224e-07, "loss": 0.5535, "num_input_tokens_seen": 2282749952, "step": 2177 }, { "epoch": 2.436923076923077, "grad_norm": 0.36897392344984886, "learning_rate": 4.26891376453652e-07, "loss": 0.575, "num_input_tokens_seen": 2283798528, "step": 2178 }, { "epoch": 2.438041958041958, "grad_norm": 0.3658511918028765, "learning_rate": 4.2523891342336506e-07, "loss": 0.6521, "num_input_tokens_seen": 2284847104, "step": 2179 }, { "epoch": 2.4391608391608393, "grad_norm": 0.40702570701126134, "learning_rate": 4.2358935753311125e-07, "loss": 0.6924, "num_input_tokens_seen": 2285895680, "step": 2180 }, { "epoch": 2.44027972027972, "grad_norm": 0.7710197356526447, "learning_rate": 4.219427110942348e-07, "loss": 0.7124, "num_input_tokens_seen": 2286944256, "step": 2181 }, { "epoch": 2.4413986013986015, "grad_norm": 0.38688534081664205, "learning_rate": 4.2029897641400584e-07, "loss": 0.7705, "num_input_tokens_seen": 2287992832, "step": 2182 }, { "epoch": 2.4425174825174825, "grad_norm": 0.3592488757865, "learning_rate": 4.186581557956124e-07, "loss": 0.6574, "num_input_tokens_seen": 2289041408, "step": 2183 }, { "epoch": 2.443636363636364, "grad_norm": 0.3714763474229317, "learning_rate": 4.170202515381605e-07, "loss": 0.6799, "num_input_tokens_seen": 2290089984, "step": 2184 }, { "epoch": 2.4447552447552447, "grad_norm": 0.3637132107251555, "learning_rate": 4.153852659366697e-07, "loss": 0.6479, "num_input_tokens_seen": 2291138560, "step": 2185 }, { "epoch": 2.445874125874126, "grad_norm": 0.36590202616794304, "learning_rate": 4.137532012820708e-07, "loss": 0.589, "num_input_tokens_seen": 2292187136, "step": 2186 }, { "epoch": 2.446993006993007, "grad_norm": 0.35772663953032774, "learning_rate": 4.1212405986119975e-07, "loss": 0.617, "num_input_tokens_seen": 2293235712, "step": 2187 }, { "epoch": 2.448111888111888, "grad_norm": 0.35525543717391067, "learning_rate": 4.104978439567972e-07, "loss": 0.6502, "num_input_tokens_seen": 2294284288, "step": 2188 }, { "epoch": 2.4492307692307693, "grad_norm": 0.3436707474455801, "learning_rate": 4.0887455584750547e-07, "loss": 0.716, "num_input_tokens_seen": 2295332864, "step": 2189 }, { "epoch": 2.4503496503496502, "grad_norm": 0.3394046559695296, "learning_rate": 4.072541978078642e-07, "loss": 0.5442, "num_input_tokens_seen": 2296381440, "step": 2190 }, { "epoch": 2.4514685314685316, "grad_norm": 0.36243345219393874, "learning_rate": 4.0563677210830763e-07, "loss": 0.6587, "num_input_tokens_seen": 2297430016, "step": 2191 }, { "epoch": 2.4525874125874125, "grad_norm": 0.3435951802088038, "learning_rate": 4.0402228101516036e-07, "loss": 0.6286, "num_input_tokens_seen": 2298478592, "step": 2192 }, { "epoch": 2.453706293706294, "grad_norm": 0.3486473859329218, "learning_rate": 4.0241072679063437e-07, "loss": 0.69, "num_input_tokens_seen": 2299527168, "step": 2193 }, { "epoch": 2.454825174825175, "grad_norm": 0.34202849797381585, "learning_rate": 4.0080211169282956e-07, "loss": 0.7809, "num_input_tokens_seen": 2300575744, "step": 2194 }, { "epoch": 2.455944055944056, "grad_norm": 0.44691826668953877, "learning_rate": 3.991964379757232e-07, "loss": 0.7086, "num_input_tokens_seen": 2301624320, "step": 2195 }, { "epoch": 2.457062937062937, "grad_norm": 0.3264278543134825, "learning_rate": 3.975937078891762e-07, "loss": 0.5642, "num_input_tokens_seen": 2302672896, "step": 2196 }, { "epoch": 2.458181818181818, "grad_norm": 0.3449797292356656, "learning_rate": 3.959939236789212e-07, "loss": 0.5554, "num_input_tokens_seen": 2303721472, "step": 2197 }, { "epoch": 2.4593006993006994, "grad_norm": 0.3311527297587433, "learning_rate": 3.9439708758656323e-07, "loss": 0.6969, "num_input_tokens_seen": 2304770048, "step": 2198 }, { "epoch": 2.4604195804195803, "grad_norm": 0.3463141553616862, "learning_rate": 3.9280320184957864e-07, "loss": 0.7139, "num_input_tokens_seen": 2305818624, "step": 2199 }, { "epoch": 2.4615384615384617, "grad_norm": 0.4032613729379509, "learning_rate": 3.912122687013065e-07, "loss": 0.6424, "num_input_tokens_seen": 2306867200, "step": 2200 }, { "epoch": 2.4626573426573426, "grad_norm": 0.35780194085389555, "learning_rate": 3.896242903709532e-07, "loss": 0.6316, "num_input_tokens_seen": 2307915776, "step": 2201 }, { "epoch": 2.463776223776224, "grad_norm": 0.3363942040672824, "learning_rate": 3.8803926908358047e-07, "loss": 0.75, "num_input_tokens_seen": 2308964352, "step": 2202 }, { "epoch": 2.464895104895105, "grad_norm": 0.38485748842913553, "learning_rate": 3.8645720706010997e-07, "loss": 0.6467, "num_input_tokens_seen": 2310012928, "step": 2203 }, { "epoch": 2.4660139860139862, "grad_norm": 0.33899942851093495, "learning_rate": 3.8487810651731463e-07, "loss": 0.6384, "num_input_tokens_seen": 2311061504, "step": 2204 }, { "epoch": 2.467132867132867, "grad_norm": 0.32945592353434944, "learning_rate": 3.8330196966781723e-07, "loss": 0.6291, "num_input_tokens_seen": 2312110080, "step": 2205 }, { "epoch": 2.468251748251748, "grad_norm": 0.4284996756086014, "learning_rate": 3.817287987200918e-07, "loss": 0.7577, "num_input_tokens_seen": 2313158656, "step": 2206 }, { "epoch": 2.4693706293706295, "grad_norm": 0.3570646226970355, "learning_rate": 3.8015859587845233e-07, "loss": 0.6575, "num_input_tokens_seen": 2314207232, "step": 2207 }, { "epoch": 2.4704895104895104, "grad_norm": 0.39676777250379186, "learning_rate": 3.78591363343056e-07, "loss": 0.6444, "num_input_tokens_seen": 2315255808, "step": 2208 }, { "epoch": 2.4716083916083917, "grad_norm": 0.35599150341465147, "learning_rate": 3.7702710330989765e-07, "loss": 0.6611, "num_input_tokens_seen": 2316304384, "step": 2209 }, { "epoch": 2.4727272727272727, "grad_norm": 0.33771404292297375, "learning_rate": 3.7546581797080567e-07, "loss": 0.6607, "num_input_tokens_seen": 2317352960, "step": 2210 }, { "epoch": 2.473846153846154, "grad_norm": 0.35645271586856414, "learning_rate": 3.739075095134437e-07, "loss": 0.708, "num_input_tokens_seen": 2318401536, "step": 2211 }, { "epoch": 2.474965034965035, "grad_norm": 0.37544737069579426, "learning_rate": 3.7235218012130067e-07, "loss": 0.7428, "num_input_tokens_seen": 2319450112, "step": 2212 }, { "epoch": 2.4760839160839163, "grad_norm": 0.353828883989088, "learning_rate": 3.707998319736936e-07, "loss": 0.6259, "num_input_tokens_seen": 2320498688, "step": 2213 }, { "epoch": 2.4772027972027972, "grad_norm": 0.34662650556499974, "learning_rate": 3.692504672457606e-07, "loss": 0.6235, "num_input_tokens_seen": 2321547264, "step": 2214 }, { "epoch": 2.478321678321678, "grad_norm": 0.36685175503034945, "learning_rate": 3.677040881084609e-07, "loss": 0.6792, "num_input_tokens_seen": 2322595840, "step": 2215 }, { "epoch": 2.4794405594405595, "grad_norm": 0.3514769668384242, "learning_rate": 3.6616069672856885e-07, "loss": 0.6478, "num_input_tokens_seen": 2323644416, "step": 2216 }, { "epoch": 2.4805594405594404, "grad_norm": 0.3641416887476937, "learning_rate": 3.6462029526867335e-07, "loss": 0.6594, "num_input_tokens_seen": 2324692992, "step": 2217 }, { "epoch": 2.481678321678322, "grad_norm": 0.331243675551818, "learning_rate": 3.6308288588717434e-07, "loss": 0.574, "num_input_tokens_seen": 2325741568, "step": 2218 }, { "epoch": 2.4827972027972027, "grad_norm": 0.3611409771745884, "learning_rate": 3.615484707382777e-07, "loss": 0.6648, "num_input_tokens_seen": 2326790144, "step": 2219 }, { "epoch": 2.483916083916084, "grad_norm": 0.33126183277411786, "learning_rate": 3.600170519719956e-07, "loss": 0.601, "num_input_tokens_seen": 2327838720, "step": 2220 }, { "epoch": 2.485034965034965, "grad_norm": 0.3447871817297807, "learning_rate": 3.584886317341396e-07, "loss": 0.6916, "num_input_tokens_seen": 2328887296, "step": 2221 }, { "epoch": 2.4861538461538464, "grad_norm": 0.373477809418186, "learning_rate": 3.5696321216632164e-07, "loss": 0.6992, "num_input_tokens_seen": 2329935872, "step": 2222 }, { "epoch": 2.4872727272727273, "grad_norm": 0.34082842839493865, "learning_rate": 3.5544079540594884e-07, "loss": 0.7376, "num_input_tokens_seen": 2330984448, "step": 2223 }, { "epoch": 2.4883916083916082, "grad_norm": 0.3251279916041272, "learning_rate": 3.5392138358621956e-07, "loss": 0.5345, "num_input_tokens_seen": 2332033024, "step": 2224 }, { "epoch": 2.4895104895104896, "grad_norm": 0.35874680114128205, "learning_rate": 3.5240497883612333e-07, "loss": 0.6042, "num_input_tokens_seen": 2333081600, "step": 2225 }, { "epoch": 2.4906293706293705, "grad_norm": 0.4557440618960138, "learning_rate": 3.5089158328043383e-07, "loss": 0.5901, "num_input_tokens_seen": 2334130176, "step": 2226 }, { "epoch": 2.491748251748252, "grad_norm": 0.3336285530037048, "learning_rate": 3.4938119903971195e-07, "loss": 0.6453, "num_input_tokens_seen": 2335178752, "step": 2227 }, { "epoch": 2.492867132867133, "grad_norm": 0.36864401301569, "learning_rate": 3.4787382823029626e-07, "loss": 0.8318, "num_input_tokens_seen": 2336227328, "step": 2228 }, { "epoch": 2.493986013986014, "grad_norm": 0.35717365545699137, "learning_rate": 3.4636947296430274e-07, "loss": 0.669, "num_input_tokens_seen": 2337275904, "step": 2229 }, { "epoch": 2.495104895104895, "grad_norm": 0.4345086008783318, "learning_rate": 3.4486813534962444e-07, "loss": 0.8892, "num_input_tokens_seen": 2338324480, "step": 2230 }, { "epoch": 2.4962237762237764, "grad_norm": 0.3763655819159716, "learning_rate": 3.43369817489923e-07, "loss": 0.7369, "num_input_tokens_seen": 2339373056, "step": 2231 }, { "epoch": 2.4973426573426574, "grad_norm": 0.3515334081477358, "learning_rate": 3.418745214846314e-07, "loss": 0.7378, "num_input_tokens_seen": 2340421632, "step": 2232 }, { "epoch": 2.4984615384615383, "grad_norm": 0.36415057831655095, "learning_rate": 3.40382249428948e-07, "loss": 0.6418, "num_input_tokens_seen": 2341470208, "step": 2233 }, { "epoch": 2.4995804195804197, "grad_norm": 0.34078651263293874, "learning_rate": 3.388930034138321e-07, "loss": 0.6464, "num_input_tokens_seen": 2342518784, "step": 2234 }, { "epoch": 2.5006993006993006, "grad_norm": 0.34202269969640753, "learning_rate": 3.374067855260055e-07, "loss": 0.5521, "num_input_tokens_seen": 2343567360, "step": 2235 }, { "epoch": 2.501818181818182, "grad_norm": 0.3654628898914463, "learning_rate": 3.3592359784794497e-07, "loss": 0.6952, "num_input_tokens_seen": 2344615936, "step": 2236 }, { "epoch": 2.502937062937063, "grad_norm": 0.33484019561851897, "learning_rate": 3.344434424578824e-07, "loss": 0.6042, "num_input_tokens_seen": 2345664512, "step": 2237 }, { "epoch": 2.504055944055944, "grad_norm": 0.36169270403566617, "learning_rate": 3.3296632142980097e-07, "loss": 0.6093, "num_input_tokens_seen": 2346713088, "step": 2238 }, { "epoch": 2.505174825174825, "grad_norm": 0.34838663627448496, "learning_rate": 3.314922368334322e-07, "loss": 0.5873, "num_input_tokens_seen": 2347761664, "step": 2239 }, { "epoch": 2.5062937062937065, "grad_norm": 0.387398008643468, "learning_rate": 3.300211907342521e-07, "loss": 0.806, "num_input_tokens_seen": 2348810240, "step": 2240 }, { "epoch": 2.5074125874125874, "grad_norm": 0.3598789712495993, "learning_rate": 3.2855318519347924e-07, "loss": 0.652, "num_input_tokens_seen": 2349858816, "step": 2241 }, { "epoch": 2.5085314685314684, "grad_norm": 0.3395494506007554, "learning_rate": 3.270882222680727e-07, "loss": 0.6257, "num_input_tokens_seen": 2350907392, "step": 2242 }, { "epoch": 2.5096503496503497, "grad_norm": 0.54207852138436, "learning_rate": 3.2562630401072796e-07, "loss": 0.5849, "num_input_tokens_seen": 2351955968, "step": 2243 }, { "epoch": 2.5107692307692306, "grad_norm": 0.36966869152899107, "learning_rate": 3.241674324698743e-07, "loss": 0.7776, "num_input_tokens_seen": 2353004544, "step": 2244 }, { "epoch": 2.511888111888112, "grad_norm": 0.3911303108357359, "learning_rate": 3.227116096896718e-07, "loss": 0.646, "num_input_tokens_seen": 2354053120, "step": 2245 }, { "epoch": 2.513006993006993, "grad_norm": 0.43699202467188386, "learning_rate": 3.2125883771000787e-07, "loss": 0.6269, "num_input_tokens_seen": 2355101696, "step": 2246 }, { "epoch": 2.514125874125874, "grad_norm": 0.36037573964966835, "learning_rate": 3.198091185664964e-07, "loss": 0.6682, "num_input_tokens_seen": 2356150272, "step": 2247 }, { "epoch": 2.515244755244755, "grad_norm": 0.3670778738718931, "learning_rate": 3.183624542904734e-07, "loss": 0.642, "num_input_tokens_seen": 2357198848, "step": 2248 }, { "epoch": 2.5163636363636366, "grad_norm": 0.46406903756219386, "learning_rate": 3.169188469089945e-07, "loss": 0.7413, "num_input_tokens_seen": 2358247424, "step": 2249 }, { "epoch": 2.5174825174825175, "grad_norm": 0.3514152445786773, "learning_rate": 3.1547829844483125e-07, "loss": 0.7462, "num_input_tokens_seen": 2359296000, "step": 2250 }, { "epoch": 2.5174825174825175, "eval_loss": 0.7195046544075012, "eval_runtime": 246.8147, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 2359296000, "step": 2250 }, { "epoch": 2.5186013986013984, "grad_norm": 0.36175968390976326, "learning_rate": 3.1404081091647027e-07, "loss": 0.7276, "num_input_tokens_seen": 2360344576, "step": 2251 }, { "epoch": 2.51972027972028, "grad_norm": 0.3506275131618759, "learning_rate": 3.126063863381076e-07, "loss": 0.6048, "num_input_tokens_seen": 2361393152, "step": 2252 }, { "epoch": 2.5208391608391607, "grad_norm": 0.33970901856132574, "learning_rate": 3.111750267196492e-07, "loss": 0.6247, "num_input_tokens_seen": 2362441728, "step": 2253 }, { "epoch": 2.521958041958042, "grad_norm": 0.3660655346179623, "learning_rate": 3.097467340667057e-07, "loss": 0.7673, "num_input_tokens_seen": 2363490304, "step": 2254 }, { "epoch": 2.523076923076923, "grad_norm": 0.42848202434728777, "learning_rate": 3.083215103805895e-07, "loss": 0.7936, "num_input_tokens_seen": 2364538880, "step": 2255 }, { "epoch": 2.5241958041958044, "grad_norm": 1.074236609118112, "learning_rate": 3.068993576583149e-07, "loss": 0.619, "num_input_tokens_seen": 2365587456, "step": 2256 }, { "epoch": 2.5253146853146853, "grad_norm": 0.3398218998779769, "learning_rate": 3.0548027789259057e-07, "loss": 0.6772, "num_input_tokens_seen": 2366636032, "step": 2257 }, { "epoch": 2.5264335664335666, "grad_norm": 0.3325826763847176, "learning_rate": 3.04064273071821e-07, "loss": 0.6342, "num_input_tokens_seen": 2367684608, "step": 2258 }, { "epoch": 2.5275524475524476, "grad_norm": 0.3474045334064583, "learning_rate": 3.0265134518010274e-07, "loss": 0.8086, "num_input_tokens_seen": 2368733184, "step": 2259 }, { "epoch": 2.5286713286713285, "grad_norm": 0.37062489414659533, "learning_rate": 3.012414961972185e-07, "loss": 0.6255, "num_input_tokens_seen": 2369781760, "step": 2260 }, { "epoch": 2.52979020979021, "grad_norm": 0.3691129092976425, "learning_rate": 2.9983472809863996e-07, "loss": 0.6212, "num_input_tokens_seen": 2370830336, "step": 2261 }, { "epoch": 2.5309090909090908, "grad_norm": 0.3463388830094719, "learning_rate": 2.9843104285551844e-07, "loss": 0.6406, "num_input_tokens_seen": 2371878912, "step": 2262 }, { "epoch": 2.532027972027972, "grad_norm": 0.34379155713173076, "learning_rate": 2.970304424346887e-07, "loss": 0.7093, "num_input_tokens_seen": 2372927488, "step": 2263 }, { "epoch": 2.533146853146853, "grad_norm": 0.5511748386699037, "learning_rate": 2.95632928798662e-07, "loss": 0.6845, "num_input_tokens_seen": 2373976064, "step": 2264 }, { "epoch": 2.5342657342657344, "grad_norm": 0.33623095567967537, "learning_rate": 2.942385039056231e-07, "loss": 0.6001, "num_input_tokens_seen": 2375024640, "step": 2265 }, { "epoch": 2.5353846153846153, "grad_norm": 0.3714971249238029, "learning_rate": 2.9284716970943094e-07, "loss": 0.6591, "num_input_tokens_seen": 2376073216, "step": 2266 }, { "epoch": 2.5365034965034967, "grad_norm": 0.3437434677099865, "learning_rate": 2.9145892815961194e-07, "loss": 0.6419, "num_input_tokens_seen": 2377121792, "step": 2267 }, { "epoch": 2.5376223776223776, "grad_norm": 0.47442759213524, "learning_rate": 2.9007378120136044e-07, "loss": 0.7377, "num_input_tokens_seen": 2378170368, "step": 2268 }, { "epoch": 2.5387412587412586, "grad_norm": 0.35182308699779397, "learning_rate": 2.886917307755349e-07, "loss": 0.5976, "num_input_tokens_seen": 2379218944, "step": 2269 }, { "epoch": 2.53986013986014, "grad_norm": 0.40407593072288195, "learning_rate": 2.8731277881865275e-07, "loss": 0.7133, "num_input_tokens_seen": 2380267520, "step": 2270 }, { "epoch": 2.540979020979021, "grad_norm": 0.3437562663576225, "learning_rate": 2.859369272628928e-07, "loss": 0.6143, "num_input_tokens_seen": 2381316096, "step": 2271 }, { "epoch": 2.542097902097902, "grad_norm": 0.39600972019883934, "learning_rate": 2.845641780360872e-07, "loss": 0.5998, "num_input_tokens_seen": 2382364672, "step": 2272 }, { "epoch": 2.543216783216783, "grad_norm": 0.35813142075006815, "learning_rate": 2.8319453306172225e-07, "loss": 0.6149, "num_input_tokens_seen": 2383413248, "step": 2273 }, { "epoch": 2.5443356643356645, "grad_norm": 0.3492724301682286, "learning_rate": 2.818279942589347e-07, "loss": 0.6118, "num_input_tokens_seen": 2384461824, "step": 2274 }, { "epoch": 2.5454545454545454, "grad_norm": 0.34472798487509393, "learning_rate": 2.804645635425091e-07, "loss": 0.7162, "num_input_tokens_seen": 2385510400, "step": 2275 }, { "epoch": 2.546573426573427, "grad_norm": 0.3483850489713316, "learning_rate": 2.7910424282287407e-07, "loss": 0.6296, "num_input_tokens_seen": 2386558976, "step": 2276 }, { "epoch": 2.5476923076923077, "grad_norm": 0.38511573880096006, "learning_rate": 2.7774703400610086e-07, "loss": 0.6946, "num_input_tokens_seen": 2387607552, "step": 2277 }, { "epoch": 2.5488111888111886, "grad_norm": 0.35275212285219554, "learning_rate": 2.763929389939005e-07, "loss": 0.7511, "num_input_tokens_seen": 2388656128, "step": 2278 }, { "epoch": 2.54993006993007, "grad_norm": 0.33719662339904743, "learning_rate": 2.750419596836215e-07, "loss": 0.5908, "num_input_tokens_seen": 2389704704, "step": 2279 }, { "epoch": 2.551048951048951, "grad_norm": 0.33432228135733194, "learning_rate": 2.736940979682462e-07, "loss": 0.675, "num_input_tokens_seen": 2390753280, "step": 2280 }, { "epoch": 2.5521678321678323, "grad_norm": 0.32869140813514053, "learning_rate": 2.723493557363885e-07, "loss": 0.6393, "num_input_tokens_seen": 2391801856, "step": 2281 }, { "epoch": 2.553286713286713, "grad_norm": 0.3448780959571412, "learning_rate": 2.710077348722906e-07, "loss": 0.6421, "num_input_tokens_seen": 2392850432, "step": 2282 }, { "epoch": 2.5544055944055946, "grad_norm": 0.5235669160850345, "learning_rate": 2.696692372558224e-07, "loss": 0.6638, "num_input_tokens_seen": 2393899008, "step": 2283 }, { "epoch": 2.5555244755244755, "grad_norm": 0.3682289886216602, "learning_rate": 2.683338647624767e-07, "loss": 0.673, "num_input_tokens_seen": 2394947584, "step": 2284 }, { "epoch": 2.556643356643357, "grad_norm": 0.3341003577711962, "learning_rate": 2.670016192633687e-07, "loss": 0.6883, "num_input_tokens_seen": 2395996160, "step": 2285 }, { "epoch": 2.5577622377622378, "grad_norm": 0.36361607287993286, "learning_rate": 2.6567250262522937e-07, "loss": 0.6892, "num_input_tokens_seen": 2397044736, "step": 2286 }, { "epoch": 2.5588811188811187, "grad_norm": 0.3647874459698346, "learning_rate": 2.6434651671040894e-07, "loss": 0.7378, "num_input_tokens_seen": 2398093312, "step": 2287 }, { "epoch": 2.56, "grad_norm": 0.37259642692187467, "learning_rate": 2.6302366337686765e-07, "loss": 0.6978, "num_input_tokens_seen": 2399141888, "step": 2288 }, { "epoch": 2.561118881118881, "grad_norm": 0.3423824187573148, "learning_rate": 2.6170394447817824e-07, "loss": 0.707, "num_input_tokens_seen": 2400190464, "step": 2289 }, { "epoch": 2.5622377622377623, "grad_norm": 0.33057658818098445, "learning_rate": 2.603873618635222e-07, "loss": 0.7546, "num_input_tokens_seen": 2401239040, "step": 2290 }, { "epoch": 2.5633566433566433, "grad_norm": 0.34815761573765475, "learning_rate": 2.590739173776841e-07, "loss": 0.6234, "num_input_tokens_seen": 2402287616, "step": 2291 }, { "epoch": 2.5644755244755246, "grad_norm": 0.3588440609765585, "learning_rate": 2.577636128610539e-07, "loss": 0.7245, "num_input_tokens_seen": 2403336192, "step": 2292 }, { "epoch": 2.5655944055944055, "grad_norm": 0.35459035867660443, "learning_rate": 2.5645645014961947e-07, "loss": 0.6064, "num_input_tokens_seen": 2404384768, "step": 2293 }, { "epoch": 2.566713286713287, "grad_norm": 0.49965752388738877, "learning_rate": 2.551524310749684e-07, "loss": 0.7154, "num_input_tokens_seen": 2405433344, "step": 2294 }, { "epoch": 2.567832167832168, "grad_norm": 0.34652858548899573, "learning_rate": 2.53851557464283e-07, "loss": 0.808, "num_input_tokens_seen": 2406481920, "step": 2295 }, { "epoch": 2.5689510489510488, "grad_norm": 0.3768963824201261, "learning_rate": 2.525538311403367e-07, "loss": 0.743, "num_input_tokens_seen": 2407530496, "step": 2296 }, { "epoch": 2.57006993006993, "grad_norm": 0.33775179675150907, "learning_rate": 2.5125925392149533e-07, "loss": 0.5671, "num_input_tokens_seen": 2408579072, "step": 2297 }, { "epoch": 2.571188811188811, "grad_norm": 0.3313578550415155, "learning_rate": 2.499678276217102e-07, "loss": 0.6165, "num_input_tokens_seen": 2409627648, "step": 2298 }, { "epoch": 2.5723076923076924, "grad_norm": 0.3795087940309037, "learning_rate": 2.4867955405051826e-07, "loss": 0.58, "num_input_tokens_seen": 2410676224, "step": 2299 }, { "epoch": 2.5734265734265733, "grad_norm": 0.3553928610096232, "learning_rate": 2.4739443501304e-07, "loss": 0.6418, "num_input_tokens_seen": 2411724800, "step": 2300 }, { "epoch": 2.5745454545454547, "grad_norm": 0.36189974775161904, "learning_rate": 2.4611247230997366e-07, "loss": 0.6953, "num_input_tokens_seen": 2412773376, "step": 2301 }, { "epoch": 2.5756643356643356, "grad_norm": 0.33538624528729394, "learning_rate": 2.4483366773759705e-07, "loss": 0.5972, "num_input_tokens_seen": 2413821952, "step": 2302 }, { "epoch": 2.576783216783217, "grad_norm": 0.34968323023457776, "learning_rate": 2.4355802308776073e-07, "loss": 0.6312, "num_input_tokens_seen": 2414870528, "step": 2303 }, { "epoch": 2.577902097902098, "grad_norm": 0.34795915019304785, "learning_rate": 2.422855401478891e-07, "loss": 0.6304, "num_input_tokens_seen": 2415919104, "step": 2304 }, { "epoch": 2.579020979020979, "grad_norm": 1.365620871739148, "learning_rate": 2.410162207009761e-07, "loss": 0.6455, "num_input_tokens_seen": 2416967680, "step": 2305 }, { "epoch": 2.58013986013986, "grad_norm": 0.34518288084096765, "learning_rate": 2.397500665255825e-07, "loss": 0.6233, "num_input_tokens_seen": 2418016256, "step": 2306 }, { "epoch": 2.581258741258741, "grad_norm": 0.3380178753990226, "learning_rate": 2.384870793958349e-07, "loss": 0.7204, "num_input_tokens_seen": 2419064832, "step": 2307 }, { "epoch": 2.5823776223776225, "grad_norm": 0.3760270951394042, "learning_rate": 2.372272610814208e-07, "loss": 0.6586, "num_input_tokens_seen": 2420113408, "step": 2308 }, { "epoch": 2.5834965034965034, "grad_norm": 0.3641600120930854, "learning_rate": 2.3597061334758864e-07, "loss": 0.7094, "num_input_tokens_seen": 2421161984, "step": 2309 }, { "epoch": 2.5846153846153848, "grad_norm": 0.3386128106838299, "learning_rate": 2.3471713795514412e-07, "loss": 0.6513, "num_input_tokens_seen": 2422210560, "step": 2310 }, { "epoch": 2.5857342657342657, "grad_norm": 0.4045587836174649, "learning_rate": 2.334668366604481e-07, "loss": 0.7583, "num_input_tokens_seen": 2423259136, "step": 2311 }, { "epoch": 2.586853146853147, "grad_norm": 0.3704337885470886, "learning_rate": 2.3221971121541343e-07, "loss": 0.638, "num_input_tokens_seen": 2424307712, "step": 2312 }, { "epoch": 2.587972027972028, "grad_norm": 0.3568985183017979, "learning_rate": 2.3097576336750248e-07, "loss": 0.5978, "num_input_tokens_seen": 2425356288, "step": 2313 }, { "epoch": 2.589090909090909, "grad_norm": 0.5210125931728653, "learning_rate": 2.2973499485972634e-07, "loss": 0.6024, "num_input_tokens_seen": 2426404864, "step": 2314 }, { "epoch": 2.5902097902097903, "grad_norm": 0.3873161886268221, "learning_rate": 2.2849740743064063e-07, "loss": 0.8446, "num_input_tokens_seen": 2427453440, "step": 2315 }, { "epoch": 2.591328671328671, "grad_norm": 0.3997486654583363, "learning_rate": 2.2726300281434467e-07, "loss": 0.8431, "num_input_tokens_seen": 2428502016, "step": 2316 }, { "epoch": 2.5924475524475525, "grad_norm": 0.4205983753104791, "learning_rate": 2.260317827404762e-07, "loss": 0.6557, "num_input_tokens_seen": 2429550592, "step": 2317 }, { "epoch": 2.5935664335664335, "grad_norm": 0.36973839296169125, "learning_rate": 2.2480374893421142e-07, "loss": 0.6948, "num_input_tokens_seen": 2430599168, "step": 2318 }, { "epoch": 2.594685314685315, "grad_norm": 0.4422500818865057, "learning_rate": 2.2357890311626328e-07, "loss": 0.5599, "num_input_tokens_seen": 2431647744, "step": 2319 }, { "epoch": 2.5958041958041957, "grad_norm": 0.35448769823269344, "learning_rate": 2.2235724700287592e-07, "loss": 0.6915, "num_input_tokens_seen": 2432696320, "step": 2320 }, { "epoch": 2.596923076923077, "grad_norm": 0.3572519351371691, "learning_rate": 2.2113878230582615e-07, "loss": 0.8178, "num_input_tokens_seen": 2433744896, "step": 2321 }, { "epoch": 2.598041958041958, "grad_norm": 0.38444699486321265, "learning_rate": 2.1992351073241684e-07, "loss": 0.6875, "num_input_tokens_seen": 2434793472, "step": 2322 }, { "epoch": 2.599160839160839, "grad_norm": 0.35668629548061453, "learning_rate": 2.1871143398547735e-07, "loss": 0.617, "num_input_tokens_seen": 2435842048, "step": 2323 }, { "epoch": 2.6002797202797203, "grad_norm": 0.3756460863814064, "learning_rate": 2.1750255376336126e-07, "loss": 0.6776, "num_input_tokens_seen": 2436890624, "step": 2324 }, { "epoch": 2.6013986013986012, "grad_norm": 0.3473900196489958, "learning_rate": 2.162968717599423e-07, "loss": 0.5371, "num_input_tokens_seen": 2437939200, "step": 2325 }, { "epoch": 2.6025174825174826, "grad_norm": 0.3304513806204311, "learning_rate": 2.1509438966461433e-07, "loss": 0.6982, "num_input_tokens_seen": 2438987776, "step": 2326 }, { "epoch": 2.6036363636363635, "grad_norm": 0.35322382548408043, "learning_rate": 2.1389510916228513e-07, "loss": 0.7472, "num_input_tokens_seen": 2440036352, "step": 2327 }, { "epoch": 2.604755244755245, "grad_norm": 0.33916888231746567, "learning_rate": 2.1269903193337853e-07, "loss": 0.6019, "num_input_tokens_seen": 2441084928, "step": 2328 }, { "epoch": 2.605874125874126, "grad_norm": 0.33136557510249753, "learning_rate": 2.115061596538287e-07, "loss": 0.7517, "num_input_tokens_seen": 2442133504, "step": 2329 }, { "epoch": 2.606993006993007, "grad_norm": 0.3703180238446725, "learning_rate": 2.103164939950797e-07, "loss": 0.7789, "num_input_tokens_seen": 2443182080, "step": 2330 }, { "epoch": 2.608111888111888, "grad_norm": 0.3561937053662923, "learning_rate": 2.0913003662408254e-07, "loss": 0.6756, "num_input_tokens_seen": 2444230656, "step": 2331 }, { "epoch": 2.609230769230769, "grad_norm": 0.38367026968476414, "learning_rate": 2.079467892032924e-07, "loss": 0.6985, "num_input_tokens_seen": 2445279232, "step": 2332 }, { "epoch": 2.6103496503496504, "grad_norm": 0.37324876957129377, "learning_rate": 2.0676675339066726e-07, "loss": 0.7547, "num_input_tokens_seen": 2446327808, "step": 2333 }, { "epoch": 2.6114685314685313, "grad_norm": 0.35056160658969077, "learning_rate": 2.0558993083966388e-07, "loss": 0.5374, "num_input_tokens_seen": 2447376384, "step": 2334 }, { "epoch": 2.6125874125874127, "grad_norm": 0.34885941750533445, "learning_rate": 2.0441632319923798e-07, "loss": 0.7885, "num_input_tokens_seen": 2448424960, "step": 2335 }, { "epoch": 2.6137062937062936, "grad_norm": 0.36936894884642046, "learning_rate": 2.0324593211384026e-07, "loss": 0.5847, "num_input_tokens_seen": 2449473536, "step": 2336 }, { "epoch": 2.614825174825175, "grad_norm": 0.39555411775139354, "learning_rate": 2.020787592234133e-07, "loss": 0.5384, "num_input_tokens_seen": 2450522112, "step": 2337 }, { "epoch": 2.615944055944056, "grad_norm": 0.4511779279396811, "learning_rate": 2.0091480616339197e-07, "loss": 0.5595, "num_input_tokens_seen": 2451570688, "step": 2338 }, { "epoch": 2.6170629370629372, "grad_norm": 0.3512897602508161, "learning_rate": 1.9975407456469808e-07, "loss": 0.6298, "num_input_tokens_seen": 2452619264, "step": 2339 }, { "epoch": 2.618181818181818, "grad_norm": 0.4032977188639809, "learning_rate": 1.9859656605374062e-07, "loss": 0.6429, "num_input_tokens_seen": 2453667840, "step": 2340 }, { "epoch": 2.619300699300699, "grad_norm": 0.37085223299611164, "learning_rate": 1.9744228225241248e-07, "loss": 0.7437, "num_input_tokens_seen": 2454716416, "step": 2341 }, { "epoch": 2.6204195804195805, "grad_norm": 0.36433537967268226, "learning_rate": 1.962912247780868e-07, "loss": 0.797, "num_input_tokens_seen": 2455764992, "step": 2342 }, { "epoch": 2.6215384615384614, "grad_norm": 0.3453147656414811, "learning_rate": 1.9514339524361742e-07, "loss": 0.6242, "num_input_tokens_seen": 2456813568, "step": 2343 }, { "epoch": 2.6226573426573427, "grad_norm": 0.3836028305792592, "learning_rate": 1.939987952573344e-07, "loss": 0.6448, "num_input_tokens_seen": 2457862144, "step": 2344 }, { "epoch": 2.6237762237762237, "grad_norm": 0.4241481535399157, "learning_rate": 1.928574264230429e-07, "loss": 0.7, "num_input_tokens_seen": 2458910720, "step": 2345 }, { "epoch": 2.624895104895105, "grad_norm": 0.3502435388733535, "learning_rate": 1.9171929034002112e-07, "loss": 0.6599, "num_input_tokens_seen": 2459959296, "step": 2346 }, { "epoch": 2.626013986013986, "grad_norm": 0.43526563822222164, "learning_rate": 1.9058438860301621e-07, "loss": 0.8065, "num_input_tokens_seen": 2461007872, "step": 2347 }, { "epoch": 2.6271328671328673, "grad_norm": 0.3618365191800925, "learning_rate": 1.894527228022447e-07, "loss": 0.5611, "num_input_tokens_seen": 2462056448, "step": 2348 }, { "epoch": 2.6282517482517482, "grad_norm": 0.3319334564816168, "learning_rate": 1.883242945233879e-07, "loss": 0.628, "num_input_tokens_seen": 2463105024, "step": 2349 }, { "epoch": 2.629370629370629, "grad_norm": 0.34361497819475445, "learning_rate": 1.871991053475916e-07, "loss": 0.8103, "num_input_tokens_seen": 2464153600, "step": 2350 }, { "epoch": 2.6304895104895105, "grad_norm": 0.35387593443901166, "learning_rate": 1.8607715685146244e-07, "loss": 0.8288, "num_input_tokens_seen": 2465202176, "step": 2351 }, { "epoch": 2.631608391608392, "grad_norm": 0.3864351769561451, "learning_rate": 1.849584506070673e-07, "loss": 0.617, "num_input_tokens_seen": 2466250752, "step": 2352 }, { "epoch": 2.632727272727273, "grad_norm": 0.355671654661448, "learning_rate": 1.8384298818192814e-07, "loss": 0.5503, "num_input_tokens_seen": 2467299328, "step": 2353 }, { "epoch": 2.6338461538461537, "grad_norm": 0.3614313044016002, "learning_rate": 1.8273077113902276e-07, "loss": 0.6904, "num_input_tokens_seen": 2468347904, "step": 2354 }, { "epoch": 2.634965034965035, "grad_norm": 0.3427420288117635, "learning_rate": 1.8162180103678177e-07, "loss": 0.7589, "num_input_tokens_seen": 2469396480, "step": 2355 }, { "epoch": 2.636083916083916, "grad_norm": 0.34106681082369217, "learning_rate": 1.805160794290861e-07, "loss": 0.6092, "num_input_tokens_seen": 2470445056, "step": 2356 }, { "epoch": 2.6372027972027974, "grad_norm": 0.3512852815652738, "learning_rate": 1.79413607865265e-07, "loss": 0.6892, "num_input_tokens_seen": 2471493632, "step": 2357 }, { "epoch": 2.6383216783216783, "grad_norm": 0.3337553584836002, "learning_rate": 1.7831438789009337e-07, "loss": 0.6571, "num_input_tokens_seen": 2472542208, "step": 2358 }, { "epoch": 2.639440559440559, "grad_norm": 0.32998742444433493, "learning_rate": 1.772184210437894e-07, "loss": 0.6386, "num_input_tokens_seen": 2473590784, "step": 2359 }, { "epoch": 2.6405594405594406, "grad_norm": 0.3697491589350903, "learning_rate": 1.7612570886201442e-07, "loss": 0.9497, "num_input_tokens_seen": 2474639360, "step": 2360 }, { "epoch": 2.641678321678322, "grad_norm": 0.406237532416082, "learning_rate": 1.7503625287586896e-07, "loss": 0.7045, "num_input_tokens_seen": 2475687936, "step": 2361 }, { "epoch": 2.642797202797203, "grad_norm": 0.33663134127095096, "learning_rate": 1.739500546118908e-07, "loss": 0.6049, "num_input_tokens_seen": 2476736512, "step": 2362 }, { "epoch": 2.643916083916084, "grad_norm": 0.39191196294618125, "learning_rate": 1.728671155920525e-07, "loss": 0.6196, "num_input_tokens_seen": 2477785088, "step": 2363 }, { "epoch": 2.645034965034965, "grad_norm": 0.3348708592574552, "learning_rate": 1.717874373337608e-07, "loss": 0.5948, "num_input_tokens_seen": 2478833664, "step": 2364 }, { "epoch": 2.646153846153846, "grad_norm": 0.3392524759750838, "learning_rate": 1.7071102134985224e-07, "loss": 0.6138, "num_input_tokens_seen": 2479882240, "step": 2365 }, { "epoch": 2.6472727272727274, "grad_norm": 0.3304472367645497, "learning_rate": 1.6963786914859338e-07, "loss": 0.64, "num_input_tokens_seen": 2480930816, "step": 2366 }, { "epoch": 2.6483916083916084, "grad_norm": 0.39867329259746037, "learning_rate": 1.6856798223367777e-07, "loss": 0.8437, "num_input_tokens_seen": 2481979392, "step": 2367 }, { "epoch": 2.6495104895104893, "grad_norm": 0.3400700048237947, "learning_rate": 1.6750136210422235e-07, "loss": 0.5819, "num_input_tokens_seen": 2483027968, "step": 2368 }, { "epoch": 2.6506293706293707, "grad_norm": 0.33563248689127007, "learning_rate": 1.66438010254768e-07, "loss": 0.6379, "num_input_tokens_seen": 2484076544, "step": 2369 }, { "epoch": 2.651748251748252, "grad_norm": 0.33904064966645775, "learning_rate": 1.6537792817527543e-07, "loss": 0.6556, "num_input_tokens_seen": 2485125120, "step": 2370 }, { "epoch": 2.652867132867133, "grad_norm": 0.3702605192875378, "learning_rate": 1.6432111735112277e-07, "loss": 0.6788, "num_input_tokens_seen": 2486173696, "step": 2371 }, { "epoch": 2.653986013986014, "grad_norm": 0.3212020355301848, "learning_rate": 1.6326757926310748e-07, "loss": 0.6448, "num_input_tokens_seen": 2487222272, "step": 2372 }, { "epoch": 2.6551048951048952, "grad_norm": 0.3596877948676503, "learning_rate": 1.622173153874379e-07, "loss": 0.6232, "num_input_tokens_seen": 2488270848, "step": 2373 }, { "epoch": 2.656223776223776, "grad_norm": 0.32977530651381287, "learning_rate": 1.611703271957371e-07, "loss": 0.5747, "num_input_tokens_seen": 2489319424, "step": 2374 }, { "epoch": 2.6573426573426575, "grad_norm": 0.34650998226960894, "learning_rate": 1.601266161550366e-07, "loss": 0.8071, "num_input_tokens_seen": 2490368000, "step": 2375 }, { "epoch": 2.6584615384615384, "grad_norm": 0.40086404260609804, "learning_rate": 1.5908618372777656e-07, "loss": 0.593, "num_input_tokens_seen": 2491416576, "step": 2376 }, { "epoch": 2.6595804195804194, "grad_norm": 0.4098151289874036, "learning_rate": 1.5804903137180415e-07, "loss": 0.6145, "num_input_tokens_seen": 2492465152, "step": 2377 }, { "epoch": 2.6606993006993007, "grad_norm": 0.33015698011295, "learning_rate": 1.5701516054036886e-07, "loss": 0.6186, "num_input_tokens_seen": 2493513728, "step": 2378 }, { "epoch": 2.661818181818182, "grad_norm": 0.3784415205656482, "learning_rate": 1.5598457268212353e-07, "loss": 0.6529, "num_input_tokens_seen": 2494562304, "step": 2379 }, { "epoch": 2.662937062937063, "grad_norm": 0.3397977411204582, "learning_rate": 1.5495726924111942e-07, "loss": 0.6891, "num_input_tokens_seen": 2495610880, "step": 2380 }, { "epoch": 2.664055944055944, "grad_norm": 0.36237780412821874, "learning_rate": 1.5393325165680707e-07, "loss": 0.7731, "num_input_tokens_seen": 2496659456, "step": 2381 }, { "epoch": 2.6651748251748253, "grad_norm": 0.34297765769481936, "learning_rate": 1.5291252136403284e-07, "loss": 0.6344, "num_input_tokens_seen": 2497708032, "step": 2382 }, { "epoch": 2.666293706293706, "grad_norm": 0.3853312076142228, "learning_rate": 1.5189507979303575e-07, "loss": 0.6363, "num_input_tokens_seen": 2498756608, "step": 2383 }, { "epoch": 2.6674125874125876, "grad_norm": 0.4117490411210741, "learning_rate": 1.5088092836944844e-07, "loss": 0.8332, "num_input_tokens_seen": 2499805184, "step": 2384 }, { "epoch": 2.6685314685314685, "grad_norm": 0.3364387344356949, "learning_rate": 1.4987006851429147e-07, "loss": 0.6673, "num_input_tokens_seen": 2500853760, "step": 2385 }, { "epoch": 2.6696503496503494, "grad_norm": 0.34584228624943214, "learning_rate": 1.4886250164397458e-07, "loss": 0.6535, "num_input_tokens_seen": 2501902336, "step": 2386 }, { "epoch": 2.670769230769231, "grad_norm": 0.3438645967691636, "learning_rate": 1.4785822917029318e-07, "loss": 0.6838, "num_input_tokens_seen": 2502950912, "step": 2387 }, { "epoch": 2.671888111888112, "grad_norm": 0.39185050195481985, "learning_rate": 1.4685725250042692e-07, "loss": 0.6759, "num_input_tokens_seen": 2503999488, "step": 2388 }, { "epoch": 2.673006993006993, "grad_norm": 0.34506986670813106, "learning_rate": 1.4585957303693664e-07, "loss": 0.6384, "num_input_tokens_seen": 2505048064, "step": 2389 }, { "epoch": 2.674125874125874, "grad_norm": 0.33799332932373516, "learning_rate": 1.4486519217776273e-07, "loss": 0.7674, "num_input_tokens_seen": 2506096640, "step": 2390 }, { "epoch": 2.6752447552447554, "grad_norm": 0.3535342996573691, "learning_rate": 1.4387411131622592e-07, "loss": 0.6736, "num_input_tokens_seen": 2507145216, "step": 2391 }, { "epoch": 2.6763636363636363, "grad_norm": 0.33368251885346906, "learning_rate": 1.4288633184101953e-07, "loss": 0.5869, "num_input_tokens_seen": 2508193792, "step": 2392 }, { "epoch": 2.6774825174825176, "grad_norm": 0.3365674228989151, "learning_rate": 1.4190185513621473e-07, "loss": 0.6028, "num_input_tokens_seen": 2509242368, "step": 2393 }, { "epoch": 2.6786013986013986, "grad_norm": 0.3580411644101074, "learning_rate": 1.409206825812523e-07, "loss": 0.6303, "num_input_tokens_seen": 2510290944, "step": 2394 }, { "epoch": 2.6797202797202795, "grad_norm": 0.36101873952845104, "learning_rate": 1.3994281555094386e-07, "loss": 0.655, "num_input_tokens_seen": 2511339520, "step": 2395 }, { "epoch": 2.680839160839161, "grad_norm": 0.3964415383313029, "learning_rate": 1.3896825541547003e-07, "loss": 0.6869, "num_input_tokens_seen": 2512388096, "step": 2396 }, { "epoch": 2.681958041958042, "grad_norm": 0.41641489181326513, "learning_rate": 1.3799700354037605e-07, "loss": 0.696, "num_input_tokens_seen": 2513436672, "step": 2397 }, { "epoch": 2.683076923076923, "grad_norm": 0.3394053302749732, "learning_rate": 1.370290612865749e-07, "loss": 0.6224, "num_input_tokens_seen": 2514485248, "step": 2398 }, { "epoch": 2.684195804195804, "grad_norm": 0.3464480528001753, "learning_rate": 1.3606443001033864e-07, "loss": 0.7083, "num_input_tokens_seen": 2515533824, "step": 2399 }, { "epoch": 2.6853146853146854, "grad_norm": 0.333622598440002, "learning_rate": 1.3510311106330247e-07, "loss": 0.5806, "num_input_tokens_seen": 2516582400, "step": 2400 }, { "epoch": 2.6864335664335663, "grad_norm": 0.34230973297103634, "learning_rate": 1.341451057924592e-07, "loss": 0.6754, "num_input_tokens_seen": 2517630976, "step": 2401 }, { "epoch": 2.6875524475524477, "grad_norm": 0.3824173813328401, "learning_rate": 1.3319041554015782e-07, "loss": 0.6539, "num_input_tokens_seen": 2518679552, "step": 2402 }, { "epoch": 2.6886713286713286, "grad_norm": 0.3595282487126626, "learning_rate": 1.3223904164410494e-07, "loss": 0.6658, "num_input_tokens_seen": 2519728128, "step": 2403 }, { "epoch": 2.6897902097902096, "grad_norm": 0.3551651236649816, "learning_rate": 1.3129098543735758e-07, "loss": 0.8401, "num_input_tokens_seen": 2520776704, "step": 2404 }, { "epoch": 2.690909090909091, "grad_norm": 0.3688943935624153, "learning_rate": 1.303462482483256e-07, "loss": 0.5324, "num_input_tokens_seen": 2521825280, "step": 2405 }, { "epoch": 2.6920279720279723, "grad_norm": 0.38697672804890143, "learning_rate": 1.2940483140076788e-07, "loss": 0.5929, "num_input_tokens_seen": 2522873856, "step": 2406 }, { "epoch": 2.693146853146853, "grad_norm": 0.3316098651085797, "learning_rate": 1.2846673621379035e-07, "loss": 0.5994, "num_input_tokens_seen": 2523922432, "step": 2407 }, { "epoch": 2.694265734265734, "grad_norm": 0.3392963266751376, "learning_rate": 1.2753196400184548e-07, "loss": 0.6117, "num_input_tokens_seen": 2524971008, "step": 2408 }, { "epoch": 2.6953846153846155, "grad_norm": 0.7791649308466841, "learning_rate": 1.2660051607472885e-07, "loss": 0.7487, "num_input_tokens_seen": 2526019584, "step": 2409 }, { "epoch": 2.6965034965034964, "grad_norm": 0.35330810689985154, "learning_rate": 1.2567239373757923e-07, "loss": 0.6408, "num_input_tokens_seen": 2527068160, "step": 2410 }, { "epoch": 2.6976223776223778, "grad_norm": 0.36919993887345115, "learning_rate": 1.2474759829087413e-07, "loss": 0.6753, "num_input_tokens_seen": 2528116736, "step": 2411 }, { "epoch": 2.6987412587412587, "grad_norm": 0.3549830166398009, "learning_rate": 1.2382613103043062e-07, "loss": 0.6925, "num_input_tokens_seen": 2529165312, "step": 2412 }, { "epoch": 2.6998601398601396, "grad_norm": 0.3492874674027553, "learning_rate": 1.2290799324740144e-07, "loss": 0.7279, "num_input_tokens_seen": 2530213888, "step": 2413 }, { "epoch": 2.700979020979021, "grad_norm": 0.3296088841473642, "learning_rate": 1.2199318622827473e-07, "loss": 0.5556, "num_input_tokens_seen": 2531262464, "step": 2414 }, { "epoch": 2.7020979020979023, "grad_norm": 0.36329662067684326, "learning_rate": 1.2108171125487177e-07, "loss": 0.5939, "num_input_tokens_seen": 2532311040, "step": 2415 }, { "epoch": 2.7032167832167833, "grad_norm": 0.3487735429073764, "learning_rate": 1.2017356960434406e-07, "loss": 0.7539, "num_input_tokens_seen": 2533359616, "step": 2416 }, { "epoch": 2.704335664335664, "grad_norm": 0.36501853225170283, "learning_rate": 1.1926876254917314e-07, "loss": 0.6324, "num_input_tokens_seen": 2534408192, "step": 2417 }, { "epoch": 2.7054545454545456, "grad_norm": 0.36350677754471555, "learning_rate": 1.1836729135716818e-07, "loss": 0.5899, "num_input_tokens_seen": 2535456768, "step": 2418 }, { "epoch": 2.7065734265734265, "grad_norm": 0.32501656125015427, "learning_rate": 1.174691572914638e-07, "loss": 0.7363, "num_input_tokens_seen": 2536505344, "step": 2419 }, { "epoch": 2.707692307692308, "grad_norm": 0.34327657199969686, "learning_rate": 1.1657436161051916e-07, "loss": 0.6672, "num_input_tokens_seen": 2537553920, "step": 2420 }, { "epoch": 2.7088111888111888, "grad_norm": 0.3476013891181746, "learning_rate": 1.1568290556811495e-07, "loss": 0.7783, "num_input_tokens_seen": 2538602496, "step": 2421 }, { "epoch": 2.7099300699300697, "grad_norm": 0.3696263766485088, "learning_rate": 1.1479479041335368e-07, "loss": 0.6515, "num_input_tokens_seen": 2539651072, "step": 2422 }, { "epoch": 2.711048951048951, "grad_norm": 0.33922999853443503, "learning_rate": 1.1391001739065432e-07, "loss": 0.745, "num_input_tokens_seen": 2540699648, "step": 2423 }, { "epoch": 2.7121678321678324, "grad_norm": 0.36643297306138983, "learning_rate": 1.1302858773975634e-07, "loss": 0.8179, "num_input_tokens_seen": 2541748224, "step": 2424 }, { "epoch": 2.7132867132867133, "grad_norm": 0.3444767119057606, "learning_rate": 1.121505026957112e-07, "loss": 0.7885, "num_input_tokens_seen": 2542796800, "step": 2425 }, { "epoch": 2.7144055944055943, "grad_norm": 0.322547489139971, "learning_rate": 1.1127576348888502e-07, "loss": 0.6156, "num_input_tokens_seen": 2543845376, "step": 2426 }, { "epoch": 2.7155244755244756, "grad_norm": 0.3335568887443192, "learning_rate": 1.1040437134495708e-07, "loss": 0.6352, "num_input_tokens_seen": 2544893952, "step": 2427 }, { "epoch": 2.7166433566433565, "grad_norm": 0.37904445167693035, "learning_rate": 1.0953632748491455e-07, "loss": 0.8094, "num_input_tokens_seen": 2545942528, "step": 2428 }, { "epoch": 2.717762237762238, "grad_norm": 0.3860044024075062, "learning_rate": 1.0867163312505452e-07, "loss": 0.6211, "num_input_tokens_seen": 2546991104, "step": 2429 }, { "epoch": 2.718881118881119, "grad_norm": 0.4231312530171382, "learning_rate": 1.0781028947698113e-07, "loss": 0.6925, "num_input_tokens_seen": 2548039680, "step": 2430 }, { "epoch": 2.7199999999999998, "grad_norm": 0.3740404414837098, "learning_rate": 1.0695229774760147e-07, "loss": 0.7175, "num_input_tokens_seen": 2549088256, "step": 2431 }, { "epoch": 2.721118881118881, "grad_norm": 0.3297615849179434, "learning_rate": 1.0609765913912828e-07, "loss": 0.5854, "num_input_tokens_seen": 2550136832, "step": 2432 }, { "epoch": 2.7222377622377625, "grad_norm": 0.35505876108843315, "learning_rate": 1.0524637484907424e-07, "loss": 0.6098, "num_input_tokens_seen": 2551185408, "step": 2433 }, { "epoch": 2.7233566433566434, "grad_norm": 0.3523913873815648, "learning_rate": 1.0439844607025324e-07, "loss": 0.8065, "num_input_tokens_seen": 2552233984, "step": 2434 }, { "epoch": 2.7244755244755243, "grad_norm": 0.3359921686155325, "learning_rate": 1.0355387399077627e-07, "loss": 0.6329, "num_input_tokens_seen": 2553282560, "step": 2435 }, { "epoch": 2.7255944055944057, "grad_norm": 0.32116368241616855, "learning_rate": 1.0271265979405254e-07, "loss": 0.5419, "num_input_tokens_seen": 2554331136, "step": 2436 }, { "epoch": 2.7267132867132866, "grad_norm": 0.32693030453282146, "learning_rate": 1.0187480465878418e-07, "loss": 0.5454, "num_input_tokens_seen": 2555379712, "step": 2437 }, { "epoch": 2.727832167832168, "grad_norm": 0.3601896298905073, "learning_rate": 1.0104030975896794e-07, "loss": 0.8302, "num_input_tokens_seen": 2556428288, "step": 2438 }, { "epoch": 2.728951048951049, "grad_norm": 0.3916665917035458, "learning_rate": 1.0020917626389209e-07, "loss": 0.693, "num_input_tokens_seen": 2557476864, "step": 2439 }, { "epoch": 2.73006993006993, "grad_norm": 0.39478237432814545, "learning_rate": 9.938140533813478e-08, "loss": 0.5975, "num_input_tokens_seen": 2558525440, "step": 2440 }, { "epoch": 2.731188811188811, "grad_norm": 0.3653016109665951, "learning_rate": 9.855699814156266e-08, "loss": 0.8575, "num_input_tokens_seen": 2559574016, "step": 2441 }, { "epoch": 2.7323076923076925, "grad_norm": 0.3437089733279662, "learning_rate": 9.773595582932921e-08, "loss": 0.7426, "num_input_tokens_seen": 2560622592, "step": 2442 }, { "epoch": 2.7334265734265735, "grad_norm": 0.3312488116814734, "learning_rate": 9.691827955187222e-08, "loss": 0.6154, "num_input_tokens_seen": 2561671168, "step": 2443 }, { "epoch": 2.7345454545454544, "grad_norm": 0.3483495246524163, "learning_rate": 9.61039704549141e-08, "loss": 0.6935, "num_input_tokens_seen": 2562719744, "step": 2444 }, { "epoch": 2.7356643356643358, "grad_norm": 0.3822463876957252, "learning_rate": 9.52930296794588e-08, "loss": 0.6179, "num_input_tokens_seen": 2563768320, "step": 2445 }, { "epoch": 2.7367832167832167, "grad_norm": 0.4261022767876188, "learning_rate": 9.448545836179102e-08, "loss": 0.7338, "num_input_tokens_seen": 2564816896, "step": 2446 }, { "epoch": 2.737902097902098, "grad_norm": 0.4025340545819182, "learning_rate": 9.368125763347336e-08, "loss": 0.6381, "num_input_tokens_seen": 2565865472, "step": 2447 }, { "epoch": 2.739020979020979, "grad_norm": 0.3496653336698817, "learning_rate": 9.288042862134556e-08, "loss": 0.7011, "num_input_tokens_seen": 2566914048, "step": 2448 }, { "epoch": 2.74013986013986, "grad_norm": 0.3230105679301423, "learning_rate": 9.208297244752362e-08, "loss": 0.6146, "num_input_tokens_seen": 2567962624, "step": 2449 }, { "epoch": 2.7412587412587412, "grad_norm": 0.3468155677815861, "learning_rate": 9.128889022939791e-08, "loss": 0.6778, "num_input_tokens_seen": 2569011200, "step": 2450 }, { "epoch": 2.7423776223776226, "grad_norm": 0.3522716495710804, "learning_rate": 9.049818307963004e-08, "loss": 0.5959, "num_input_tokens_seen": 2570059776, "step": 2451 }, { "epoch": 2.7434965034965035, "grad_norm": 0.3611841101339901, "learning_rate": 8.971085210615321e-08, "loss": 0.6909, "num_input_tokens_seen": 2571108352, "step": 2452 }, { "epoch": 2.7446153846153845, "grad_norm": 0.3316089889053657, "learning_rate": 8.892689841216995e-08, "loss": 0.5937, "num_input_tokens_seen": 2572156928, "step": 2453 }, { "epoch": 2.745734265734266, "grad_norm": 0.4972796871543546, "learning_rate": 8.81463230961499e-08, "loss": 0.7223, "num_input_tokens_seen": 2573205504, "step": 2454 }, { "epoch": 2.7468531468531467, "grad_norm": 0.35547999670818303, "learning_rate": 8.736912725182983e-08, "loss": 0.6198, "num_input_tokens_seen": 2574254080, "step": 2455 }, { "epoch": 2.747972027972028, "grad_norm": 0.40420460001606356, "learning_rate": 8.659531196821142e-08, "loss": 0.5295, "num_input_tokens_seen": 2575302656, "step": 2456 }, { "epoch": 2.749090909090909, "grad_norm": 0.34334965599183004, "learning_rate": 8.582487832955788e-08, "loss": 0.654, "num_input_tokens_seen": 2576351232, "step": 2457 }, { "epoch": 2.75020979020979, "grad_norm": 0.3607497615694666, "learning_rate": 8.505782741539626e-08, "loss": 0.6923, "num_input_tokens_seen": 2577399808, "step": 2458 }, { "epoch": 2.7513286713286713, "grad_norm": 0.359543207417513, "learning_rate": 8.429416030051179e-08, "loss": 0.7479, "num_input_tokens_seen": 2578448384, "step": 2459 }, { "epoch": 2.7524475524475527, "grad_norm": 0.34444687498765153, "learning_rate": 8.353387805494967e-08, "loss": 0.6552, "num_input_tokens_seen": 2579496960, "step": 2460 }, { "epoch": 2.7535664335664336, "grad_norm": 0.37824534450569874, "learning_rate": 8.277698174401189e-08, "loss": 0.6799, "num_input_tokens_seen": 2580545536, "step": 2461 }, { "epoch": 2.7546853146853145, "grad_norm": 0.3337893690969729, "learning_rate": 8.202347242825565e-08, "loss": 0.5874, "num_input_tokens_seen": 2581594112, "step": 2462 }, { "epoch": 2.755804195804196, "grad_norm": 0.5044257301439781, "learning_rate": 8.127335116349305e-08, "loss": 0.5944, "num_input_tokens_seen": 2582642688, "step": 2463 }, { "epoch": 2.756923076923077, "grad_norm": 0.34944512561924346, "learning_rate": 8.052661900078835e-08, "loss": 0.6769, "num_input_tokens_seen": 2583691264, "step": 2464 }, { "epoch": 2.758041958041958, "grad_norm": 0.35520826817402956, "learning_rate": 7.978327698645705e-08, "loss": 0.725, "num_input_tokens_seen": 2584739840, "step": 2465 }, { "epoch": 2.759160839160839, "grad_norm": 0.3364123898600958, "learning_rate": 7.90433261620649e-08, "loss": 0.6167, "num_input_tokens_seen": 2585788416, "step": 2466 }, { "epoch": 2.76027972027972, "grad_norm": 0.6231166473439663, "learning_rate": 7.830676756442529e-08, "loss": 0.5821, "num_input_tokens_seen": 2586836992, "step": 2467 }, { "epoch": 2.7613986013986014, "grad_norm": 0.3350927209014676, "learning_rate": 7.757360222559878e-08, "loss": 0.6192, "num_input_tokens_seen": 2587885568, "step": 2468 }, { "epoch": 2.7625174825174827, "grad_norm": 0.40835765010705544, "learning_rate": 7.684383117289141e-08, "loss": 0.715, "num_input_tokens_seen": 2588934144, "step": 2469 }, { "epoch": 2.7636363636363637, "grad_norm": 0.33894815944120293, "learning_rate": 7.61174554288524e-08, "loss": 0.7061, "num_input_tokens_seen": 2589982720, "step": 2470 }, { "epoch": 2.7647552447552446, "grad_norm": 0.3850873969831372, "learning_rate": 7.539447601127542e-08, "loss": 0.7753, "num_input_tokens_seen": 2591031296, "step": 2471 }, { "epoch": 2.765874125874126, "grad_norm": 0.3815825501513916, "learning_rate": 7.46748939331926e-08, "loss": 0.6054, "num_input_tokens_seen": 2592079872, "step": 2472 }, { "epoch": 2.766993006993007, "grad_norm": 0.3424250363211656, "learning_rate": 7.39587102028777e-08, "loss": 0.7063, "num_input_tokens_seen": 2593128448, "step": 2473 }, { "epoch": 2.7681118881118882, "grad_norm": 0.352490880214663, "learning_rate": 7.324592582384215e-08, "loss": 0.6988, "num_input_tokens_seen": 2594177024, "step": 2474 }, { "epoch": 2.769230769230769, "grad_norm": 0.31964900714057626, "learning_rate": 7.2536541794834e-08, "loss": 0.553, "num_input_tokens_seen": 2595225600, "step": 2475 }, { "epoch": 2.77034965034965, "grad_norm": 0.3646336953550553, "learning_rate": 7.183055910983671e-08, "loss": 0.6731, "num_input_tokens_seen": 2596274176, "step": 2476 }, { "epoch": 2.7714685314685314, "grad_norm": 0.34524093332648753, "learning_rate": 7.112797875806904e-08, "loss": 0.6203, "num_input_tokens_seen": 2597322752, "step": 2477 }, { "epoch": 2.772587412587413, "grad_norm": 0.5208219959807263, "learning_rate": 7.042880172398043e-08, "loss": 0.6989, "num_input_tokens_seen": 2598371328, "step": 2478 }, { "epoch": 2.7737062937062937, "grad_norm": 0.4008773028698697, "learning_rate": 6.973302898725303e-08, "loss": 0.6745, "num_input_tokens_seen": 2599419904, "step": 2479 }, { "epoch": 2.7748251748251747, "grad_norm": 0.33671185107294427, "learning_rate": 6.904066152279815e-08, "loss": 0.7327, "num_input_tokens_seen": 2600468480, "step": 2480 }, { "epoch": 2.775944055944056, "grad_norm": 0.34167066836572096, "learning_rate": 6.835170030075638e-08, "loss": 0.6567, "num_input_tokens_seen": 2601517056, "step": 2481 }, { "epoch": 2.777062937062937, "grad_norm": 0.38328433701592507, "learning_rate": 6.766614628649525e-08, "loss": 0.6064, "num_input_tokens_seen": 2602565632, "step": 2482 }, { "epoch": 2.7781818181818183, "grad_norm": 0.3562537074561045, "learning_rate": 6.698400044060777e-08, "loss": 0.6902, "num_input_tokens_seen": 2603614208, "step": 2483 }, { "epoch": 2.7793006993006992, "grad_norm": 0.3611571471567616, "learning_rate": 6.630526371891155e-08, "loss": 0.7422, "num_input_tokens_seen": 2604662784, "step": 2484 }, { "epoch": 2.78041958041958, "grad_norm": 0.4546897089064987, "learning_rate": 6.56299370724478e-08, "loss": 0.6695, "num_input_tokens_seen": 2605711360, "step": 2485 }, { "epoch": 2.7815384615384615, "grad_norm": 0.3984069359104572, "learning_rate": 6.495802144747904e-08, "loss": 0.7529, "num_input_tokens_seen": 2606759936, "step": 2486 }, { "epoch": 2.782657342657343, "grad_norm": 0.32697296097074835, "learning_rate": 6.428951778548881e-08, "loss": 0.752, "num_input_tokens_seen": 2607808512, "step": 2487 }, { "epoch": 2.783776223776224, "grad_norm": 0.34154284824652376, "learning_rate": 6.362442702317923e-08, "loss": 0.6004, "num_input_tokens_seen": 2608857088, "step": 2488 }, { "epoch": 2.7848951048951047, "grad_norm": 0.38557870790897253, "learning_rate": 6.296275009247121e-08, "loss": 0.6841, "num_input_tokens_seen": 2609905664, "step": 2489 }, { "epoch": 2.786013986013986, "grad_norm": 0.3515926129213973, "learning_rate": 6.230448792050065e-08, "loss": 0.572, "num_input_tokens_seen": 2610954240, "step": 2490 }, { "epoch": 2.787132867132867, "grad_norm": 0.3776297844746458, "learning_rate": 6.164964142962027e-08, "loss": 0.6711, "num_input_tokens_seen": 2612002816, "step": 2491 }, { "epoch": 2.7882517482517484, "grad_norm": 0.34213333091094794, "learning_rate": 6.099821153739637e-08, "loss": 0.6412, "num_input_tokens_seen": 2613051392, "step": 2492 }, { "epoch": 2.7893706293706293, "grad_norm": 0.3534997765905321, "learning_rate": 6.035019915660717e-08, "loss": 0.6598, "num_input_tokens_seen": 2614099968, "step": 2493 }, { "epoch": 2.79048951048951, "grad_norm": 0.36438196961179187, "learning_rate": 5.970560519524327e-08, "loss": 0.6347, "num_input_tokens_seen": 2615148544, "step": 2494 }, { "epoch": 2.7916083916083916, "grad_norm": 0.39527634845130066, "learning_rate": 5.906443055650496e-08, "loss": 0.6806, "num_input_tokens_seen": 2616197120, "step": 2495 }, { "epoch": 2.792727272727273, "grad_norm": 0.40586927361523173, "learning_rate": 5.84266761388011e-08, "loss": 0.6992, "num_input_tokens_seen": 2617245696, "step": 2496 }, { "epoch": 2.793846153846154, "grad_norm": 0.36974015603749033, "learning_rate": 5.779234283574936e-08, "loss": 0.7976, "num_input_tokens_seen": 2618294272, "step": 2497 }, { "epoch": 2.794965034965035, "grad_norm": 0.34283788079893324, "learning_rate": 5.7161431536171816e-08, "loss": 0.6158, "num_input_tokens_seen": 2619342848, "step": 2498 }, { "epoch": 2.796083916083916, "grad_norm": 0.3401612094936671, "learning_rate": 5.653394312409771e-08, "loss": 0.6014, "num_input_tokens_seen": 2620391424, "step": 2499 }, { "epoch": 2.797202797202797, "grad_norm": 0.3919831469050033, "learning_rate": 5.590987847875845e-08, "loss": 0.7546, "num_input_tokens_seen": 2621440000, "step": 2500 }, { "epoch": 2.797202797202797, "eval_loss": 0.7187779545783997, "eval_runtime": 246.8662, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "num_input_tokens_seen": 2621440000, "step": 2500 }, { "epoch": 2.7983216783216784, "grad_norm": 0.3607213162851952, "learning_rate": 5.528923847458928e-08, "loss": 0.5569, "num_input_tokens_seen": 2622488576, "step": 2501 }, { "epoch": 2.7994405594405594, "grad_norm": 0.33826777473155994, "learning_rate": 5.467202398122651e-08, "loss": 0.7357, "num_input_tokens_seen": 2623537152, "step": 2502 }, { "epoch": 2.8005594405594403, "grad_norm": 0.35730571191853794, "learning_rate": 5.4058235863506116e-08, "loss": 0.7066, "num_input_tokens_seen": 2624585728, "step": 2503 }, { "epoch": 2.8016783216783216, "grad_norm": 0.3591541413090711, "learning_rate": 5.3447874981464034e-08, "loss": 0.8059, "num_input_tokens_seen": 2625634304, "step": 2504 }, { "epoch": 2.802797202797203, "grad_norm": 0.39649316816758723, "learning_rate": 5.2840942190333086e-08, "loss": 0.6861, "num_input_tokens_seen": 2626682880, "step": 2505 }, { "epoch": 2.803916083916084, "grad_norm": 0.3705300891563728, "learning_rate": 5.223743834054329e-08, "loss": 0.6513, "num_input_tokens_seen": 2627731456, "step": 2506 }, { "epoch": 2.805034965034965, "grad_norm": 0.4146809598198725, "learning_rate": 5.1637364277719595e-08, "loss": 0.5416, "num_input_tokens_seen": 2628780032, "step": 2507 }, { "epoch": 2.806153846153846, "grad_norm": 0.3607703676095598, "learning_rate": 5.104072084268136e-08, "loss": 0.6172, "num_input_tokens_seen": 2629828608, "step": 2508 }, { "epoch": 2.807272727272727, "grad_norm": 0.35358173536757614, "learning_rate": 5.044750887144151e-08, "loss": 0.5928, "num_input_tokens_seen": 2630877184, "step": 2509 }, { "epoch": 2.8083916083916085, "grad_norm": 0.3414242531025578, "learning_rate": 4.9857729195203486e-08, "loss": 0.6963, "num_input_tokens_seen": 2631925760, "step": 2510 }, { "epoch": 2.8095104895104894, "grad_norm": 0.4390611790465848, "learning_rate": 4.927138264036291e-08, "loss": 0.7047, "num_input_tokens_seen": 2632974336, "step": 2511 }, { "epoch": 2.810629370629371, "grad_norm": 0.3274194957763868, "learning_rate": 4.8688470028503966e-08, "loss": 0.6311, "num_input_tokens_seen": 2634022912, "step": 2512 }, { "epoch": 2.8117482517482517, "grad_norm": 0.34283631997700514, "learning_rate": 4.810899217639997e-08, "loss": 0.6982, "num_input_tokens_seen": 2635071488, "step": 2513 }, { "epoch": 2.812867132867133, "grad_norm": 0.3530660749684633, "learning_rate": 4.753294989601032e-08, "loss": 0.8509, "num_input_tokens_seen": 2636120064, "step": 2514 }, { "epoch": 2.813986013986014, "grad_norm": 0.347532932050533, "learning_rate": 4.696034399448185e-08, "loss": 0.669, "num_input_tokens_seen": 2637168640, "step": 2515 }, { "epoch": 2.815104895104895, "grad_norm": 0.34608973496255885, "learning_rate": 4.639117527414527e-08, "loss": 0.5719, "num_input_tokens_seen": 2638217216, "step": 2516 }, { "epoch": 2.8162237762237763, "grad_norm": 0.35802380309443954, "learning_rate": 4.582544453251597e-08, "loss": 0.6416, "num_input_tokens_seen": 2639265792, "step": 2517 }, { "epoch": 2.817342657342657, "grad_norm": 0.4354275253074081, "learning_rate": 4.52631525622918e-08, "loss": 0.6419, "num_input_tokens_seen": 2640314368, "step": 2518 }, { "epoch": 2.8184615384615386, "grad_norm": 0.3259911554509107, "learning_rate": 4.470430015135197e-08, "loss": 0.7284, "num_input_tokens_seen": 2641362944, "step": 2519 }, { "epoch": 2.8195804195804195, "grad_norm": 0.353362454072338, "learning_rate": 4.414888808275619e-08, "loss": 0.7285, "num_input_tokens_seen": 2642411520, "step": 2520 }, { "epoch": 2.820699300699301, "grad_norm": 0.3672019718443443, "learning_rate": 4.35969171347439e-08, "loss": 0.6511, "num_input_tokens_seen": 2643460096, "step": 2521 }, { "epoch": 2.821818181818182, "grad_norm": 0.36954904870203215, "learning_rate": 4.304838808073281e-08, "loss": 0.7493, "num_input_tokens_seen": 2644508672, "step": 2522 }, { "epoch": 2.822937062937063, "grad_norm": 0.3398839364345148, "learning_rate": 4.25033016893181e-08, "loss": 0.6405, "num_input_tokens_seen": 2645557248, "step": 2523 }, { "epoch": 2.824055944055944, "grad_norm": 0.3471695535772355, "learning_rate": 4.1961658724270496e-08, "loss": 0.5696, "num_input_tokens_seen": 2646605824, "step": 2524 }, { "epoch": 2.825174825174825, "grad_norm": 0.3555088854477462, "learning_rate": 4.1423459944536224e-08, "loss": 0.6327, "num_input_tokens_seen": 2647654400, "step": 2525 }, { "epoch": 2.8262937062937064, "grad_norm": 0.39228130794813976, "learning_rate": 4.088870610423512e-08, "loss": 0.6895, "num_input_tokens_seen": 2648702976, "step": 2526 }, { "epoch": 2.8274125874125873, "grad_norm": 0.3828263344260485, "learning_rate": 4.035739795266086e-08, "loss": 0.8083, "num_input_tokens_seen": 2649751552, "step": 2527 }, { "epoch": 2.8285314685314686, "grad_norm": 0.3362194593959348, "learning_rate": 3.982953623427876e-08, "loss": 0.6886, "num_input_tokens_seen": 2650800128, "step": 2528 }, { "epoch": 2.8296503496503496, "grad_norm": 0.34419837202649, "learning_rate": 3.9305121688723855e-08, "loss": 0.517, "num_input_tokens_seen": 2651848704, "step": 2529 }, { "epoch": 2.830769230769231, "grad_norm": 0.36175009661713414, "learning_rate": 3.87841550508028e-08, "loss": 0.6603, "num_input_tokens_seen": 2652897280, "step": 2530 }, { "epoch": 2.831888111888112, "grad_norm": 0.35388310242551796, "learning_rate": 3.8266637050489716e-08, "loss": 0.6326, "num_input_tokens_seen": 2653945856, "step": 2531 }, { "epoch": 2.833006993006993, "grad_norm": 0.3386899439620571, "learning_rate": 3.7752568412927346e-08, "loss": 0.6368, "num_input_tokens_seen": 2654994432, "step": 2532 }, { "epoch": 2.834125874125874, "grad_norm": 0.365745470155295, "learning_rate": 3.7241949858424777e-08, "loss": 0.599, "num_input_tokens_seen": 2656043008, "step": 2533 }, { "epoch": 2.835244755244755, "grad_norm": 0.3368865586352208, "learning_rate": 3.673478210245718e-08, "loss": 0.6076, "num_input_tokens_seen": 2657091584, "step": 2534 }, { "epoch": 2.8363636363636364, "grad_norm": 0.33113607509757775, "learning_rate": 3.623106585566388e-08, "loss": 0.6969, "num_input_tokens_seen": 2658140160, "step": 2535 }, { "epoch": 2.8374825174825173, "grad_norm": 0.3828123010627376, "learning_rate": 3.573080182384864e-08, "loss": 0.6353, "num_input_tokens_seen": 2659188736, "step": 2536 }, { "epoch": 2.8386013986013987, "grad_norm": 0.3581447931959978, "learning_rate": 3.523399070797795e-08, "loss": 0.7189, "num_input_tokens_seen": 2660237312, "step": 2537 }, { "epoch": 2.8397202797202796, "grad_norm": 0.380653731177563, "learning_rate": 3.47406332041797e-08, "loss": 0.5916, "num_input_tokens_seen": 2661285888, "step": 2538 }, { "epoch": 2.840839160839161, "grad_norm": 0.3535180236821517, "learning_rate": 3.425073000374257e-08, "loss": 0.5792, "num_input_tokens_seen": 2662334464, "step": 2539 }, { "epoch": 2.841958041958042, "grad_norm": 0.34680038129895285, "learning_rate": 3.3764281793115804e-08, "loss": 0.6197, "num_input_tokens_seen": 2663383040, "step": 2540 }, { "epoch": 2.8430769230769233, "grad_norm": 0.355344575366833, "learning_rate": 3.328128925390667e-08, "loss": 0.6129, "num_input_tokens_seen": 2664431616, "step": 2541 }, { "epoch": 2.844195804195804, "grad_norm": 0.49618405351862277, "learning_rate": 3.280175306288103e-08, "loss": 0.6925, "num_input_tokens_seen": 2665480192, "step": 2542 }, { "epoch": 2.845314685314685, "grad_norm": 0.46017958728071273, "learning_rate": 3.2325673891961394e-08, "loss": 0.8072, "num_input_tokens_seen": 2666528768, "step": 2543 }, { "epoch": 2.8464335664335665, "grad_norm": 0.33548488031289847, "learning_rate": 3.1853052408226395e-08, "loss": 0.6871, "num_input_tokens_seen": 2667577344, "step": 2544 }, { "epoch": 2.8475524475524474, "grad_norm": 0.32555264337849354, "learning_rate": 3.138388927391017e-08, "loss": 0.5945, "num_input_tokens_seen": 2668625920, "step": 2545 }, { "epoch": 2.8486713286713288, "grad_norm": 0.44186540474998737, "learning_rate": 3.091818514639994e-08, "loss": 0.558, "num_input_tokens_seen": 2669674496, "step": 2546 }, { "epoch": 2.8497902097902097, "grad_norm": 0.34248244614230255, "learning_rate": 3.045594067823704e-08, "loss": 0.5643, "num_input_tokens_seen": 2670723072, "step": 2547 }, { "epoch": 2.850909090909091, "grad_norm": 0.3918246137307725, "learning_rate": 2.99971565171156e-08, "loss": 0.7352, "num_input_tokens_seen": 2671771648, "step": 2548 }, { "epoch": 2.852027972027972, "grad_norm": 0.37531108673910896, "learning_rate": 2.9541833305880287e-08, "loss": 0.6695, "num_input_tokens_seen": 2672820224, "step": 2549 }, { "epoch": 2.8531468531468533, "grad_norm": 0.36758356498161543, "learning_rate": 2.9089971682526862e-08, "loss": 0.5442, "num_input_tokens_seen": 2673868800, "step": 2550 }, { "epoch": 2.8542657342657343, "grad_norm": 0.36444763501975963, "learning_rate": 2.864157228019998e-08, "loss": 0.692, "num_input_tokens_seen": 2674917376, "step": 2551 }, { "epoch": 2.855384615384615, "grad_norm": 0.337706663559577, "learning_rate": 2.8196635727194276e-08, "loss": 0.7277, "num_input_tokens_seen": 2675965952, "step": 2552 }, { "epoch": 2.8565034965034966, "grad_norm": 0.3385092129443546, "learning_rate": 2.7755162646950773e-08, "loss": 0.7013, "num_input_tokens_seen": 2677014528, "step": 2553 }, { "epoch": 2.8576223776223775, "grad_norm": 0.37564442209407906, "learning_rate": 2.731715365805937e-08, "loss": 0.592, "num_input_tokens_seen": 2678063104, "step": 2554 }, { "epoch": 2.858741258741259, "grad_norm": 0.35659517954422937, "learning_rate": 2.688260937425413e-08, "loss": 0.8021, "num_input_tokens_seen": 2679111680, "step": 2555 }, { "epoch": 2.8598601398601398, "grad_norm": 0.35495142599693846, "learning_rate": 2.645153040441578e-08, "loss": 0.6836, "num_input_tokens_seen": 2680160256, "step": 2556 }, { "epoch": 2.860979020979021, "grad_norm": 0.34984601757783035, "learning_rate": 2.6023917352568652e-08, "loss": 0.618, "num_input_tokens_seen": 2681208832, "step": 2557 }, { "epoch": 2.862097902097902, "grad_norm": 0.3515674370572788, "learning_rate": 2.5599770817881508e-08, "loss": 0.7565, "num_input_tokens_seen": 2682257408, "step": 2558 }, { "epoch": 2.8632167832167834, "grad_norm": 0.38147847249660416, "learning_rate": 2.5179091394665346e-08, "loss": 0.7696, "num_input_tokens_seen": 2683305984, "step": 2559 }, { "epoch": 2.8643356643356643, "grad_norm": 0.3350210298170292, "learning_rate": 2.4761879672372535e-08, "loss": 0.6276, "num_input_tokens_seen": 2684354560, "step": 2560 }, { "epoch": 2.8654545454545453, "grad_norm": 0.35360713681133843, "learning_rate": 2.4348136235597398e-08, "loss": 0.7476, "num_input_tokens_seen": 2685403136, "step": 2561 }, { "epoch": 2.8665734265734266, "grad_norm": 0.3911793342558203, "learning_rate": 2.3937861664074523e-08, "loss": 0.5979, "num_input_tokens_seen": 2686451712, "step": 2562 }, { "epoch": 2.8676923076923075, "grad_norm": 0.3494634915795155, "learning_rate": 2.3531056532677122e-08, "loss": 0.7925, "num_input_tokens_seen": 2687500288, "step": 2563 }, { "epoch": 2.868811188811189, "grad_norm": 0.3624965010746436, "learning_rate": 2.3127721411417836e-08, "loss": 0.6842, "num_input_tokens_seen": 2688548864, "step": 2564 }, { "epoch": 2.86993006993007, "grad_norm": 0.35080356117435335, "learning_rate": 2.272785686544682e-08, "loss": 0.8109, "num_input_tokens_seen": 2689597440, "step": 2565 }, { "epoch": 2.871048951048951, "grad_norm": 0.32898595947851933, "learning_rate": 2.233146345505144e-08, "loss": 0.6439, "num_input_tokens_seen": 2690646016, "step": 2566 }, { "epoch": 2.872167832167832, "grad_norm": 0.35331218121602936, "learning_rate": 2.1938541735655183e-08, "loss": 0.7115, "num_input_tokens_seen": 2691694592, "step": 2567 }, { "epoch": 2.8732867132867135, "grad_norm": 0.3406027963767393, "learning_rate": 2.154909225781654e-08, "loss": 0.7045, "num_input_tokens_seen": 2692743168, "step": 2568 }, { "epoch": 2.8744055944055944, "grad_norm": 0.3875314411431314, "learning_rate": 2.1163115567230386e-08, "loss": 0.601, "num_input_tokens_seen": 2693791744, "step": 2569 }, { "epoch": 2.8755244755244753, "grad_norm": 0.31899081398604767, "learning_rate": 2.078061220472355e-08, "loss": 0.5456, "num_input_tokens_seen": 2694840320, "step": 2570 }, { "epoch": 2.8766433566433567, "grad_norm": 0.3444849070656428, "learning_rate": 2.0401582706257304e-08, "loss": 0.733, "num_input_tokens_seen": 2695888896, "step": 2571 }, { "epoch": 2.8777622377622376, "grad_norm": 0.33899987673349485, "learning_rate": 2.0026027602925158e-08, "loss": 0.7031, "num_input_tokens_seen": 2696937472, "step": 2572 }, { "epoch": 2.878881118881119, "grad_norm": 0.3643792759270612, "learning_rate": 1.9653947420951448e-08, "loss": 0.9375, "num_input_tokens_seen": 2697986048, "step": 2573 }, { "epoch": 2.88, "grad_norm": 0.323180031996962, "learning_rate": 1.928534268169302e-08, "loss": 0.5872, "num_input_tokens_seen": 2699034624, "step": 2574 }, { "epoch": 2.8811188811188813, "grad_norm": 0.3391810102917947, "learning_rate": 1.892021390163562e-08, "loss": 0.719, "num_input_tokens_seen": 2700083200, "step": 2575 }, { "epoch": 2.882237762237762, "grad_norm": 0.32064620241192665, "learning_rate": 1.8558561592395275e-08, "loss": 0.5889, "num_input_tokens_seen": 2701131776, "step": 2576 }, { "epoch": 2.8833566433566435, "grad_norm": 0.3590302364628149, "learning_rate": 1.8200386260716352e-08, "loss": 0.6301, "num_input_tokens_seen": 2702180352, "step": 2577 }, { "epoch": 2.8844755244755245, "grad_norm": 0.34883214590052053, "learning_rate": 1.7845688408471563e-08, "loss": 0.6651, "num_input_tokens_seen": 2703228928, "step": 2578 }, { "epoch": 2.8855944055944054, "grad_norm": 0.3952509169852559, "learning_rate": 1.74944685326614e-08, "loss": 0.6397, "num_input_tokens_seen": 2704277504, "step": 2579 }, { "epoch": 2.8867132867132868, "grad_norm": 0.3936666710823911, "learning_rate": 1.7146727125411655e-08, "loss": 0.5979, "num_input_tokens_seen": 2705326080, "step": 2580 }, { "epoch": 2.8878321678321677, "grad_norm": 0.35700322791597344, "learning_rate": 1.6802464673975893e-08, "loss": 0.7543, "num_input_tokens_seen": 2706374656, "step": 2581 }, { "epoch": 2.888951048951049, "grad_norm": 0.33753202522519493, "learning_rate": 1.6461681660731865e-08, "loss": 0.7046, "num_input_tokens_seen": 2707423232, "step": 2582 }, { "epoch": 2.89006993006993, "grad_norm": 0.3267074176049818, "learning_rate": 1.6124378563182053e-08, "loss": 0.5703, "num_input_tokens_seen": 2708471808, "step": 2583 }, { "epoch": 2.8911888111888113, "grad_norm": 0.3358916249566021, "learning_rate": 1.5790555853953116e-08, "loss": 0.6367, "num_input_tokens_seen": 2709520384, "step": 2584 }, { "epoch": 2.8923076923076922, "grad_norm": 0.34291249700716114, "learning_rate": 1.546021400079506e-08, "loss": 0.6915, "num_input_tokens_seen": 2710568960, "step": 2585 }, { "epoch": 2.8934265734265736, "grad_norm": 0.3582825662149562, "learning_rate": 1.513335346658068e-08, "loss": 0.6201, "num_input_tokens_seen": 2711617536, "step": 2586 }, { "epoch": 2.8945454545454545, "grad_norm": 0.41249118807063384, "learning_rate": 1.4809974709304176e-08, "loss": 0.6935, "num_input_tokens_seen": 2712666112, "step": 2587 }, { "epoch": 2.8956643356643355, "grad_norm": 0.3425262512805467, "learning_rate": 1.4490078182081979e-08, "loss": 0.6633, "num_input_tokens_seen": 2713714688, "step": 2588 }, { "epoch": 2.896783216783217, "grad_norm": 0.35584791844339264, "learning_rate": 1.4173664333149983e-08, "loss": 0.5979, "num_input_tokens_seen": 2714763264, "step": 2589 }, { "epoch": 2.8979020979020977, "grad_norm": 0.34148293049061057, "learning_rate": 1.3860733605865761e-08, "loss": 0.7129, "num_input_tokens_seen": 2715811840, "step": 2590 }, { "epoch": 2.899020979020979, "grad_norm": 0.34798577024450145, "learning_rate": 1.3551286438705513e-08, "loss": 0.5724, "num_input_tokens_seen": 2716860416, "step": 2591 }, { "epoch": 2.90013986013986, "grad_norm": 0.3544486490077694, "learning_rate": 1.324532326526351e-08, "loss": 0.6127, "num_input_tokens_seen": 2717908992, "step": 2592 }, { "epoch": 2.9012587412587414, "grad_norm": 0.45658255862653097, "learning_rate": 1.2942844514254038e-08, "loss": 0.553, "num_input_tokens_seen": 2718957568, "step": 2593 }, { "epoch": 2.9023776223776223, "grad_norm": 0.33550149314708444, "learning_rate": 1.2643850609507512e-08, "loss": 0.8386, "num_input_tokens_seen": 2720006144, "step": 2594 }, { "epoch": 2.9034965034965037, "grad_norm": 0.3185528973690535, "learning_rate": 1.2348341969972143e-08, "loss": 0.5611, "num_input_tokens_seen": 2721054720, "step": 2595 }, { "epoch": 2.9046153846153846, "grad_norm": 0.3574844630683223, "learning_rate": 1.2056319009712824e-08, "loss": 0.71, "num_input_tokens_seen": 2722103296, "step": 2596 }, { "epoch": 2.9057342657342655, "grad_norm": 0.3224194326050326, "learning_rate": 1.1767782137909467e-08, "loss": 0.5938, "num_input_tokens_seen": 2723151872, "step": 2597 }, { "epoch": 2.906853146853147, "grad_norm": 0.3211762845539648, "learning_rate": 1.148273175885839e-08, "loss": 0.5475, "num_input_tokens_seen": 2724200448, "step": 2598 }, { "epoch": 2.907972027972028, "grad_norm": 0.370774224096051, "learning_rate": 1.1201168271969266e-08, "loss": 0.6893, "num_input_tokens_seen": 2725249024, "step": 2599 }, { "epoch": 2.909090909090909, "grad_norm": 0.37930507435818595, "learning_rate": 1.0923092071767615e-08, "loss": 0.7421, "num_input_tokens_seen": 2726297600, "step": 2600 }, { "epoch": 2.91020979020979, "grad_norm": 0.3480430537614192, "learning_rate": 1.0648503547891487e-08, "loss": 0.6537, "num_input_tokens_seen": 2727346176, "step": 2601 }, { "epoch": 2.9113286713286715, "grad_norm": 0.3480941350349133, "learning_rate": 1.0377403085092275e-08, "loss": 0.5967, "num_input_tokens_seen": 2728394752, "step": 2602 }, { "epoch": 2.9124475524475524, "grad_norm": 0.38657891895843005, "learning_rate": 1.0109791063233898e-08, "loss": 0.6689, "num_input_tokens_seen": 2729443328, "step": 2603 }, { "epoch": 2.9135664335664337, "grad_norm": 0.33918360555804467, "learning_rate": 9.84566785729224e-09, "loss": 0.5944, "num_input_tokens_seen": 2730491904, "step": 2604 }, { "epoch": 2.9146853146853147, "grad_norm": 0.3497626927192013, "learning_rate": 9.585033837355151e-09, "loss": 0.7026, "num_input_tokens_seen": 2731540480, "step": 2605 }, { "epoch": 2.9158041958041956, "grad_norm": 0.3262691083356959, "learning_rate": 9.32788936862078e-09, "loss": 0.6758, "num_input_tokens_seen": 2732589056, "step": 2606 }, { "epoch": 2.916923076923077, "grad_norm": 0.5034535794843802, "learning_rate": 9.074234811398408e-09, "loss": 0.7042, "num_input_tokens_seen": 2733637632, "step": 2607 }, { "epoch": 2.9180419580419583, "grad_norm": 0.3394143026529658, "learning_rate": 8.824070521106787e-09, "loss": 0.7203, "num_input_tokens_seen": 2734686208, "step": 2608 }, { "epoch": 2.9191608391608392, "grad_norm": 0.3911946771012842, "learning_rate": 8.577396848274134e-09, "loss": 0.7092, "num_input_tokens_seen": 2735734784, "step": 2609 }, { "epoch": 2.92027972027972, "grad_norm": 0.3663573954920797, "learning_rate": 8.334214138538132e-09, "loss": 0.7815, "num_input_tokens_seen": 2736783360, "step": 2610 }, { "epoch": 2.9213986013986015, "grad_norm": 0.3553769928420004, "learning_rate": 8.094522732644272e-09, "loss": 0.6925, "num_input_tokens_seen": 2737831936, "step": 2611 }, { "epoch": 2.9225174825174824, "grad_norm": 0.3383255549782913, "learning_rate": 7.858322966446397e-09, "loss": 0.7153, "num_input_tokens_seen": 2738880512, "step": 2612 }, { "epoch": 2.923636363636364, "grad_norm": 0.35613436918578156, "learning_rate": 7.625615170906153e-09, "loss": 0.751, "num_input_tokens_seen": 2739929088, "step": 2613 }, { "epoch": 2.9247552447552447, "grad_norm": 0.3356703916084319, "learning_rate": 7.396399672092158e-09, "loss": 0.6437, "num_input_tokens_seen": 2740977664, "step": 2614 }, { "epoch": 2.9258741258741257, "grad_norm": 0.3421617324619456, "learning_rate": 7.17067679117861e-09, "loss": 0.6028, "num_input_tokens_seen": 2742026240, "step": 2615 }, { "epoch": 2.926993006993007, "grad_norm": 0.36077599178059266, "learning_rate": 6.948446844447787e-09, "loss": 0.6755, "num_input_tokens_seen": 2743074816, "step": 2616 }, { "epoch": 2.9281118881118884, "grad_norm": 0.32780828510734894, "learning_rate": 6.729710143286161e-09, "loss": 0.6773, "num_input_tokens_seen": 2744123392, "step": 2617 }, { "epoch": 2.9292307692307693, "grad_norm": 0.3390487366045574, "learning_rate": 6.514466994185786e-09, "loss": 0.7518, "num_input_tokens_seen": 2745171968, "step": 2618 }, { "epoch": 2.9303496503496502, "grad_norm": 0.3358482733621688, "learning_rate": 6.302717698744298e-09, "loss": 0.5816, "num_input_tokens_seen": 2746220544, "step": 2619 }, { "epoch": 2.9314685314685316, "grad_norm": 0.32608160194581975, "learning_rate": 6.094462553662972e-09, "loss": 0.5671, "num_input_tokens_seen": 2747269120, "step": 2620 }, { "epoch": 2.9325874125874125, "grad_norm": 0.3744311999595841, "learning_rate": 5.889701850747276e-09, "loss": 0.6945, "num_input_tokens_seen": 2748317696, "step": 2621 }, { "epoch": 2.933706293706294, "grad_norm": 0.3335718537561884, "learning_rate": 5.688435876906873e-09, "loss": 0.7404, "num_input_tokens_seen": 2749366272, "step": 2622 }, { "epoch": 2.934825174825175, "grad_norm": 0.3344798634549623, "learning_rate": 5.490664914153676e-09, "loss": 0.6878, "num_input_tokens_seen": 2750414848, "step": 2623 }, { "epoch": 2.9359440559440557, "grad_norm": 0.3339593074519587, "learning_rate": 5.296389239603239e-09, "loss": 0.6308, "num_input_tokens_seen": 2751463424, "step": 2624 }, { "epoch": 2.937062937062937, "grad_norm": 0.3860662354648349, "learning_rate": 5.10560912547281e-09, "loss": 0.5787, "num_input_tokens_seen": 2752512000, "step": 2625 }, { "epoch": 2.9381818181818184, "grad_norm": 0.3380050321163438, "learning_rate": 4.918324839082444e-09, "loss": 0.6914, "num_input_tokens_seen": 2753560576, "step": 2626 }, { "epoch": 2.9393006993006994, "grad_norm": 0.35490164977708605, "learning_rate": 4.734536642853338e-09, "loss": 0.7354, "num_input_tokens_seen": 2754609152, "step": 2627 }, { "epoch": 2.9404195804195803, "grad_norm": 0.3735402424103966, "learning_rate": 4.554244794308382e-09, "loss": 0.8268, "num_input_tokens_seen": 2755657728, "step": 2628 }, { "epoch": 2.9415384615384617, "grad_norm": 0.3352649022566988, "learning_rate": 4.377449546071055e-09, "loss": 0.6978, "num_input_tokens_seen": 2756706304, "step": 2629 }, { "epoch": 2.9426573426573426, "grad_norm": 0.3618514254094484, "learning_rate": 4.204151145865421e-09, "loss": 0.682, "num_input_tokens_seen": 2757754880, "step": 2630 }, { "epoch": 2.943776223776224, "grad_norm": 0.34276346272777297, "learning_rate": 4.034349836516127e-09, "loss": 0.6442, "num_input_tokens_seen": 2758803456, "step": 2631 }, { "epoch": 2.944895104895105, "grad_norm": 0.3427341190226994, "learning_rate": 3.8680458559475775e-09, "loss": 0.588, "num_input_tokens_seen": 2759852032, "step": 2632 }, { "epoch": 2.946013986013986, "grad_norm": 0.34130217091549986, "learning_rate": 3.705239437183372e-09, "loss": 0.6812, "num_input_tokens_seen": 2760900608, "step": 2633 }, { "epoch": 2.947132867132867, "grad_norm": 0.3577562276356123, "learning_rate": 3.5459308083471422e-09, "loss": 0.6295, "num_input_tokens_seen": 2761949184, "step": 2634 }, { "epoch": 2.9482517482517485, "grad_norm": 0.3503801312931761, "learning_rate": 3.3901201926606063e-09, "loss": 0.5187, "num_input_tokens_seen": 2762997760, "step": 2635 }, { "epoch": 2.9493706293706294, "grad_norm": 0.33566583115740756, "learning_rate": 3.237807808444404e-09, "loss": 0.6132, "num_input_tokens_seen": 2764046336, "step": 2636 }, { "epoch": 2.9504895104895104, "grad_norm": 0.3654794464833399, "learning_rate": 3.088993869117818e-09, "loss": 0.7231, "num_input_tokens_seen": 2765094912, "step": 2637 }, { "epoch": 2.9516083916083917, "grad_norm": 0.3383543701251067, "learning_rate": 2.943678583197662e-09, "loss": 0.6015, "num_input_tokens_seen": 2766143488, "step": 2638 }, { "epoch": 2.9527272727272726, "grad_norm": 0.3379142549096605, "learning_rate": 2.8018621542988402e-09, "loss": 0.6697, "num_input_tokens_seen": 2767192064, "step": 2639 }, { "epoch": 2.953846153846154, "grad_norm": 0.35755000694855316, "learning_rate": 2.6635447811332315e-09, "loss": 0.6365, "num_input_tokens_seen": 2768240640, "step": 2640 }, { "epoch": 2.954965034965035, "grad_norm": 0.3472527842527593, "learning_rate": 2.52872665751025e-09, "loss": 0.7623, "num_input_tokens_seen": 2769289216, "step": 2641 }, { "epoch": 2.956083916083916, "grad_norm": 0.3880220372602136, "learning_rate": 2.397407972336008e-09, "loss": 0.7513, "num_input_tokens_seen": 2770337792, "step": 2642 }, { "epoch": 2.957202797202797, "grad_norm": 0.35335862355171266, "learning_rate": 2.2695889096133184e-09, "loss": 0.6677, "num_input_tokens_seen": 2771386368, "step": 2643 }, { "epoch": 2.9583216783216786, "grad_norm": 0.376902598954362, "learning_rate": 2.145269648441417e-09, "loss": 0.6647, "num_input_tokens_seen": 2772434944, "step": 2644 }, { "epoch": 2.9594405594405595, "grad_norm": 0.3406821148477218, "learning_rate": 2.0244503630154066e-09, "loss": 0.7776, "num_input_tokens_seen": 2773483520, "step": 2645 }, { "epoch": 2.9605594405594404, "grad_norm": 0.45419725037522635, "learning_rate": 1.907131222626535e-09, "loss": 0.7085, "num_input_tokens_seen": 2774532096, "step": 2646 }, { "epoch": 2.961678321678322, "grad_norm": 0.3704812071725178, "learning_rate": 1.7933123916613614e-09, "loss": 0.6998, "num_input_tokens_seen": 2775580672, "step": 2647 }, { "epoch": 2.9627972027972027, "grad_norm": 0.34610629125284326, "learning_rate": 1.6829940296023139e-09, "loss": 0.7513, "num_input_tokens_seen": 2776629248, "step": 2648 }, { "epoch": 2.963916083916084, "grad_norm": 0.33898811920267274, "learning_rate": 1.5761762910260214e-09, "loss": 0.6613, "num_input_tokens_seen": 2777677824, "step": 2649 }, { "epoch": 2.965034965034965, "grad_norm": 0.4022068927204681, "learning_rate": 1.4728593256055357e-09, "loss": 0.6361, "num_input_tokens_seen": 2778726400, "step": 2650 }, { "epoch": 2.966153846153846, "grad_norm": 0.3901213050941425, "learning_rate": 1.3730432781070002e-09, "loss": 0.8096, "num_input_tokens_seen": 2779774976, "step": 2651 }, { "epoch": 2.9672727272727273, "grad_norm": 0.3489405701422099, "learning_rate": 1.2767282883927035e-09, "loss": 0.6983, "num_input_tokens_seen": 2780823552, "step": 2652 }, { "epoch": 2.9683916083916086, "grad_norm": 0.3528097861719825, "learning_rate": 1.1839144914180256e-09, "loss": 0.6842, "num_input_tokens_seen": 2781872128, "step": 2653 }, { "epoch": 2.9695104895104896, "grad_norm": 0.36892214100949844, "learning_rate": 1.094602017233104e-09, "loss": 0.7869, "num_input_tokens_seen": 2782920704, "step": 2654 }, { "epoch": 2.9706293706293705, "grad_norm": 0.3354259766263392, "learning_rate": 1.0087909909817228e-09, "loss": 0.7337, "num_input_tokens_seen": 2783969280, "step": 2655 }, { "epoch": 2.971748251748252, "grad_norm": 0.37088029764351155, "learning_rate": 9.264815329021459e-10, "loss": 0.6917, "num_input_tokens_seen": 2785017856, "step": 2656 }, { "epoch": 2.9728671328671328, "grad_norm": 0.33353566076812585, "learning_rate": 8.476737583251737e-10, "loss": 0.7033, "num_input_tokens_seen": 2786066432, "step": 2657 }, { "epoch": 2.973986013986014, "grad_norm": 0.3680303531977863, "learning_rate": 7.723677776763639e-10, "loss": 0.7057, "num_input_tokens_seen": 2787115008, "step": 2658 }, { "epoch": 2.975104895104895, "grad_norm": 0.33412491175756137, "learning_rate": 7.005636964732554e-10, "loss": 0.6595, "num_input_tokens_seen": 2788163584, "step": 2659 }, { "epoch": 2.976223776223776, "grad_norm": 0.3473994170804108, "learning_rate": 6.322616153275896e-10, "loss": 0.6379, "num_input_tokens_seen": 2789212160, "step": 2660 }, { "epoch": 2.9773426573426574, "grad_norm": 0.3496080838286373, "learning_rate": 5.674616299436441e-10, "loss": 0.6597, "num_input_tokens_seen": 2790260736, "step": 2661 }, { "epoch": 2.9784615384615387, "grad_norm": 0.4385531896733222, "learning_rate": 5.061638311187889e-10, "loss": 0.6397, "num_input_tokens_seen": 2791309312, "step": 2662 }, { "epoch": 2.9795804195804196, "grad_norm": 0.33087227607465164, "learning_rate": 4.4836830474265235e-10, "loss": 0.5986, "num_input_tokens_seen": 2792357888, "step": 2663 }, { "epoch": 2.9806993006993006, "grad_norm": 0.3287718805687028, "learning_rate": 3.9407513179851034e-10, "loss": 0.6381, "num_input_tokens_seen": 2793406464, "step": 2664 }, { "epoch": 2.981818181818182, "grad_norm": 0.3788388269679192, "learning_rate": 3.432843883610648e-10, "loss": 0.648, "num_input_tokens_seen": 2794455040, "step": 2665 }, { "epoch": 2.982937062937063, "grad_norm": 0.35759701332942706, "learning_rate": 2.9599614559810975e-10, "loss": 0.6502, "num_input_tokens_seen": 2795503616, "step": 2666 }, { "epoch": 2.984055944055944, "grad_norm": 0.35751023681339617, "learning_rate": 2.522104697696981e-10, "loss": 0.768, "num_input_tokens_seen": 2796552192, "step": 2667 }, { "epoch": 2.985174825174825, "grad_norm": 0.35481810589744434, "learning_rate": 2.1192742222786444e-10, "loss": 0.7205, "num_input_tokens_seen": 2797600768, "step": 2668 }, { "epoch": 2.986293706293706, "grad_norm": 0.3570518377092578, "learning_rate": 1.7514705941690247e-10, "loss": 0.7303, "num_input_tokens_seen": 2798649344, "step": 2669 }, { "epoch": 2.9874125874125874, "grad_norm": 0.35581558938797686, "learning_rate": 1.418694328730874e-10, "loss": 0.6629, "num_input_tokens_seen": 2799697920, "step": 2670 }, { "epoch": 2.988531468531469, "grad_norm": 0.40331095053410626, "learning_rate": 1.1209458922495365e-10, "loss": 0.5753, "num_input_tokens_seen": 2800746496, "step": 2671 }, { "epoch": 2.9896503496503497, "grad_norm": 0.3522993923978719, "learning_rate": 8.58225701930171e-11, "loss": 0.6927, "num_input_tokens_seen": 2801795072, "step": 2672 }, { "epoch": 2.9907692307692306, "grad_norm": 0.37657839716443675, "learning_rate": 6.30534125889426e-11, "loss": 0.5823, "num_input_tokens_seen": 2802843648, "step": 2673 }, { "epoch": 2.991888111888112, "grad_norm": 0.34227090209796407, "learning_rate": 4.3787148317209205e-11, "loss": 0.6706, "num_input_tokens_seen": 2803892224, "step": 2674 }, { "epoch": 2.993006993006993, "grad_norm": 0.3443654306833617, "learning_rate": 2.8023804372889762e-11, "loss": 0.7391, "num_input_tokens_seen": 2804940800, "step": 2675 }, { "epoch": 2.9941258741258743, "grad_norm": 0.3551684299089561, "learning_rate": 1.57634028441489e-11, "loss": 0.7509, "num_input_tokens_seen": 2805989376, "step": 2676 }, { "epoch": 2.995244755244755, "grad_norm": 0.37622805286364924, "learning_rate": 7.005960910022591e-12, "loss": 0.634, "num_input_tokens_seen": 2807037952, "step": 2677 }, { "epoch": 2.996363636363636, "grad_norm": 0.33152064613274396, "learning_rate": 1.7514908409732578e-12, "loss": 0.6167, "num_input_tokens_seen": 2808086528, "step": 2678 }, { "epoch": 2.9974825174825175, "grad_norm": 0.5693231184946563, "learning_rate": 0.0, "loss": 0.5345, "num_input_tokens_seen": 2809135104, "step": 2679 }, { "epoch": 2.9974825174825175, "num_input_tokens_seen": 2809135104, "step": 2679, "total_flos": 4487006021222400.0, "train_loss": 0.719823204545487, "train_runtime": 157993.0137, "train_samples_per_second": 0.543, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 2679, "num_input_tokens_seen": 2809135104, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4487006021222400.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }