diff --git "a/long_speech_lora/trainer_state.json" "b/long_speech_lora/trainer_state.json" new file mode 100644--- /dev/null +++ "b/long_speech_lora/trainer_state.json" @@ -0,0 +1,7287 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1035, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002898550724637681, + "grad_norm": 1.7796895708856793, + "learning_rate": 3.125e-06, + "loss": 1.8514, + "step": 1 + }, + { + "epoch": 0.005797101449275362, + "grad_norm": 1.742548277798407, + "learning_rate": 6.25e-06, + "loss": 1.937, + "step": 2 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 1.5905530955603362, + "learning_rate": 9.375000000000001e-06, + "loss": 1.8724, + "step": 3 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 1.6592768688949988, + "learning_rate": 1.25e-05, + "loss": 1.8877, + "step": 4 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 1.4035260613846172, + "learning_rate": 1.5625e-05, + "loss": 1.8086, + "step": 5 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 0.9555449880629443, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.7276, + "step": 6 + }, + { + "epoch": 0.020289855072463767, + "grad_norm": 0.7915967541673472, + "learning_rate": 2.1875e-05, + "loss": 1.771, + "step": 7 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 0.7599954441380682, + "learning_rate": 2.5e-05, + "loss": 1.7122, + "step": 8 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 0.7128173682386719, + "learning_rate": 2.8125000000000003e-05, + "loss": 1.5764, + "step": 9 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 0.6773249478496584, + "learning_rate": 3.125e-05, + "loss": 1.5811, + "step": 10 + }, + { + "epoch": 0.03188405797101449, + "grad_norm": 0.6571598838212039, + "learning_rate": 3.4375e-05, + "loss": 1.6191, + "step": 11 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 0.6261792389264198, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.5684, + "step": 12 + }, + { + "epoch": 0.03768115942028986, + "grad_norm": 0.5143810493601375, + "learning_rate": 4.0625000000000005e-05, + "loss": 1.5375, + "step": 13 + }, + { + "epoch": 0.04057971014492753, + "grad_norm": 0.4855788824689092, + "learning_rate": 4.375e-05, + "loss": 1.5076, + "step": 14 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.473950934451779, + "learning_rate": 4.6875e-05, + "loss": 1.5083, + "step": 15 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 0.48567111749562547, + "learning_rate": 5e-05, + "loss": 1.6137, + "step": 16 + }, + { + "epoch": 0.04927536231884058, + "grad_norm": 0.43610179775052604, + "learning_rate": 5.3125000000000004e-05, + "loss": 1.5325, + "step": 17 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 0.4412188197378122, + "learning_rate": 5.6250000000000005e-05, + "loss": 1.555, + "step": 18 + }, + { + "epoch": 0.05507246376811594, + "grad_norm": 0.43034730708585867, + "learning_rate": 5.9375e-05, + "loss": 1.5453, + "step": 19 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.41694045848699307, + "learning_rate": 6.25e-05, + "loss": 1.5362, + "step": 20 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 0.4093648088428465, + "learning_rate": 6.562500000000001e-05, + "loss": 1.4596, + "step": 21 + }, + { + "epoch": 0.06376811594202898, + "grad_norm": 0.42036605295826535, + "learning_rate": 6.875e-05, + "loss": 1.5156, + "step": 22 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4140215214641256, + "learning_rate": 7.1875e-05, + "loss": 1.5021, + "step": 23 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 0.41797125446436384, + "learning_rate": 7.500000000000001e-05, + "loss": 1.5595, + "step": 24 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 0.40448941023881985, + "learning_rate": 7.8125e-05, + "loss": 1.5284, + "step": 25 + }, + { + "epoch": 0.07536231884057971, + "grad_norm": 0.36201429136045177, + "learning_rate": 8.125000000000001e-05, + "loss": 1.5402, + "step": 26 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 0.38159291388896194, + "learning_rate": 8.4375e-05, + "loss": 1.4545, + "step": 27 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 0.39563825256543766, + "learning_rate": 8.75e-05, + "loss": 1.476, + "step": 28 + }, + { + "epoch": 0.08405797101449275, + "grad_norm": 0.3853757557962818, + "learning_rate": 9.062500000000001e-05, + "loss": 1.5553, + "step": 29 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.3715963100647608, + "learning_rate": 9.375e-05, + "loss": 1.4923, + "step": 30 + }, + { + "epoch": 0.08985507246376812, + "grad_norm": 0.3972739650610925, + "learning_rate": 9.687500000000001e-05, + "loss": 1.4166, + "step": 31 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 0.3709663185634906, + "learning_rate": 0.0001, + "loss": 1.4904, + "step": 32 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 0.37818493311274604, + "learning_rate": 9.999975473389572e-05, + "loss": 1.4303, + "step": 33 + }, + { + "epoch": 0.09855072463768116, + "grad_norm": 0.3727893878233448, + "learning_rate": 9.999901893798909e-05, + "loss": 1.5126, + "step": 34 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 0.3573590861971531, + "learning_rate": 9.999779261949875e-05, + "loss": 1.4088, + "step": 35 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 0.3962649324463349, + "learning_rate": 9.999607579045565e-05, + "loss": 1.4718, + "step": 36 + }, + { + "epoch": 0.1072463768115942, + "grad_norm": 0.3629563065883299, + "learning_rate": 9.999386846770303e-05, + "loss": 1.5376, + "step": 37 + }, + { + "epoch": 0.11014492753623188, + "grad_norm": 0.37698476595481845, + "learning_rate": 9.99911706728961e-05, + "loss": 1.5497, + "step": 38 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 0.36517596222828796, + "learning_rate": 9.9987982432502e-05, + "loss": 1.3701, + "step": 39 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.3754942171540997, + "learning_rate": 9.998430377779942e-05, + "loss": 1.4751, + "step": 40 + }, + { + "epoch": 0.11884057971014493, + "grad_norm": 0.37273876645697823, + "learning_rate": 9.998013474487833e-05, + "loss": 1.4959, + "step": 41 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 0.36526298295975423, + "learning_rate": 9.99754753746396e-05, + "loss": 1.477, + "step": 42 + }, + { + "epoch": 0.1246376811594203, + "grad_norm": 0.4028151666513751, + "learning_rate": 9.99703257127947e-05, + "loss": 1.4273, + "step": 43 + }, + { + "epoch": 0.12753623188405797, + "grad_norm": 0.3669671633234476, + "learning_rate": 9.99646858098651e-05, + "loss": 1.3938, + "step": 44 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.33083829945323007, + "learning_rate": 9.995855572118186e-05, + "loss": 1.4102, + "step": 45 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3478285593739705, + "learning_rate": 9.995193550688517e-05, + "loss": 1.4027, + "step": 46 + }, + { + "epoch": 0.13623188405797101, + "grad_norm": 0.37609834638001705, + "learning_rate": 9.994482523192352e-05, + "loss": 1.4909, + "step": 47 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 0.3544704730906117, + "learning_rate": 9.993722496605333e-05, + "loss": 1.4603, + "step": 48 + }, + { + "epoch": 0.14202898550724638, + "grad_norm": 0.35471120831090747, + "learning_rate": 9.99291347838381e-05, + "loss": 1.4591, + "step": 49 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.3522333621422469, + "learning_rate": 9.992055476464772e-05, + "loss": 1.4661, + "step": 50 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 0.40369049060969037, + "learning_rate": 9.991148499265771e-05, + "loss": 1.3549, + "step": 51 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 0.37654258677829533, + "learning_rate": 9.990192555684837e-05, + "loss": 1.4566, + "step": 52 + }, + { + "epoch": 0.1536231884057971, + "grad_norm": 0.35023666520198726, + "learning_rate": 9.989187655100394e-05, + "loss": 1.4291, + "step": 53 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 0.3713582044260089, + "learning_rate": 9.98813380737116e-05, + "loss": 1.4899, + "step": 54 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 0.3483542245496034, + "learning_rate": 9.987031022836066e-05, + "loss": 1.422, + "step": 55 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 0.3428096360294795, + "learning_rate": 9.985879312314135e-05, + "loss": 1.417, + "step": 56 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 0.3645827259974512, + "learning_rate": 9.984678687104389e-05, + "loss": 1.4285, + "step": 57 + }, + { + "epoch": 0.1681159420289855, + "grad_norm": 0.35685607542080316, + "learning_rate": 9.983429158985736e-05, + "loss": 1.3918, + "step": 58 + }, + { + "epoch": 0.17101449275362318, + "grad_norm": 0.3370796491973602, + "learning_rate": 9.982130740216849e-05, + "loss": 1.4129, + "step": 59 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.3444756598243817, + "learning_rate": 9.980783443536057e-05, + "loss": 1.4355, + "step": 60 + }, + { + "epoch": 0.17681159420289855, + "grad_norm": 0.3436241209978691, + "learning_rate": 9.979387282161206e-05, + "loss": 1.4583, + "step": 61 + }, + { + "epoch": 0.17971014492753623, + "grad_norm": 0.32218525116364366, + "learning_rate": 9.977942269789537e-05, + "loss": 1.4524, + "step": 62 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 0.385973703132524, + "learning_rate": 9.976448420597556e-05, + "loss": 1.4419, + "step": 63 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 1.7247641389853836, + "learning_rate": 9.974905749240882e-05, + "loss": 1.3425, + "step": 64 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 0.3447341772023887, + "learning_rate": 9.973314270854115e-05, + "loss": 1.528, + "step": 65 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 0.35835098628054646, + "learning_rate": 9.971674001050686e-05, + "loss": 1.4713, + "step": 66 + }, + { + "epoch": 0.19420289855072465, + "grad_norm": 0.365150351821878, + "learning_rate": 9.969984955922697e-05, + "loss": 1.4537, + "step": 67 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 0.3866963594083402, + "learning_rate": 9.968247152040768e-05, + "loss": 1.5055, + "step": 68 + }, + { + "epoch": 0.2, + "grad_norm": 0.35045697501626877, + "learning_rate": 9.966460606453875e-05, + "loss": 1.4434, + "step": 69 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.36817264001563493, + "learning_rate": 9.964625336689181e-05, + "loss": 1.4294, + "step": 70 + }, + { + "epoch": 0.20579710144927535, + "grad_norm": 0.3654904538276859, + "learning_rate": 9.962741360751866e-05, + "loss": 1.4308, + "step": 71 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 0.3781497670043016, + "learning_rate": 9.960808697124946e-05, + "loss": 1.4685, + "step": 72 + }, + { + "epoch": 0.21159420289855072, + "grad_norm": 0.36156099913405126, + "learning_rate": 9.958827364769097e-05, + "loss": 1.4062, + "step": 73 + }, + { + "epoch": 0.2144927536231884, + "grad_norm": 0.35552781851256704, + "learning_rate": 9.956797383122463e-05, + "loss": 1.4428, + "step": 74 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.3335062272759448, + "learning_rate": 9.954718772100476e-05, + "loss": 1.4467, + "step": 75 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 0.3427215995763061, + "learning_rate": 9.952591552095646e-05, + "loss": 1.5089, + "step": 76 + }, + { + "epoch": 0.22318840579710145, + "grad_norm": 0.34794374393691757, + "learning_rate": 9.950415743977373e-05, + "loss": 1.4051, + "step": 77 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 0.3404770224687481, + "learning_rate": 9.948191369091735e-05, + "loss": 1.3876, + "step": 78 + }, + { + "epoch": 0.2289855072463768, + "grad_norm": 0.34102132992338396, + "learning_rate": 9.945918449261282e-05, + "loss": 1.4369, + "step": 79 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.33638460547428023, + "learning_rate": 9.943597006784825e-05, + "loss": 1.4164, + "step": 80 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 0.35290031375473546, + "learning_rate": 9.941227064437207e-05, + "loss": 1.3796, + "step": 81 + }, + { + "epoch": 0.23768115942028986, + "grad_norm": 0.3463360857934043, + "learning_rate": 9.93880864546909e-05, + "loss": 1.4276, + "step": 82 + }, + { + "epoch": 0.24057971014492754, + "grad_norm": 0.3566368609252091, + "learning_rate": 9.936341773606723e-05, + "loss": 1.4967, + "step": 83 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.3373773040313267, + "learning_rate": 9.933826473051707e-05, + "loss": 1.4079, + "step": 84 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 0.3393580838287239, + "learning_rate": 9.93126276848076e-05, + "loss": 1.4131, + "step": 85 + }, + { + "epoch": 0.2492753623188406, + "grad_norm": 0.3520135073078003, + "learning_rate": 9.928650685045477e-05, + "loss": 1.4729, + "step": 86 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 0.3526725034511152, + "learning_rate": 9.925990248372076e-05, + "loss": 1.4314, + "step": 87 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 0.3433193515525383, + "learning_rate": 9.92328148456116e-05, + "loss": 1.4505, + "step": 88 + }, + { + "epoch": 0.2579710144927536, + "grad_norm": 0.33837489039921237, + "learning_rate": 9.920524420187443e-05, + "loss": 1.4481, + "step": 89 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.33988682832234424, + "learning_rate": 9.917719082299501e-05, + "loss": 1.4149, + "step": 90 + }, + { + "epoch": 0.263768115942029, + "grad_norm": 0.33940846094652855, + "learning_rate": 9.91486549841951e-05, + "loss": 1.3847, + "step": 91 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.31996832381114065, + "learning_rate": 9.911963696542963e-05, + "loss": 1.3112, + "step": 92 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 0.31493707135599436, + "learning_rate": 9.909013705138406e-05, + "loss": 1.4216, + "step": 93 + }, + { + "epoch": 0.27246376811594203, + "grad_norm": 0.3204454590090509, + "learning_rate": 9.906015553147158e-05, + "loss": 1.3755, + "step": 94 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 0.3408318845906397, + "learning_rate": 9.902969269983018e-05, + "loss": 1.4574, + "step": 95 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.3196195350266631, + "learning_rate": 9.899874885531987e-05, + "loss": 1.4022, + "step": 96 + }, + { + "epoch": 0.2811594202898551, + "grad_norm": 0.33440793327421947, + "learning_rate": 9.89673243015197e-05, + "loss": 1.3766, + "step": 97 + }, + { + "epoch": 0.28405797101449276, + "grad_norm": 0.33693013386726023, + "learning_rate": 9.893541934672479e-05, + "loss": 1.4676, + "step": 98 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 0.3467550636007772, + "learning_rate": 9.890303430394328e-05, + "loss": 1.365, + "step": 99 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.3333645230781809, + "learning_rate": 9.887016949089333e-05, + "loss": 1.3514, + "step": 100 + }, + { + "epoch": 0.2927536231884058, + "grad_norm": 0.34610516226844007, + "learning_rate": 9.883682522999992e-05, + "loss": 1.4499, + "step": 101 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 0.3268443889818303, + "learning_rate": 9.88030018483917e-05, + "loss": 1.4303, + "step": 102 + }, + { + "epoch": 0.2985507246376812, + "grad_norm": 0.33465469810861087, + "learning_rate": 9.876869967789788e-05, + "loss": 1.3757, + "step": 103 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 0.33038430224796766, + "learning_rate": 9.87339190550448e-05, + "loss": 1.3676, + "step": 104 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.3404214439604057, + "learning_rate": 9.86986603210528e-05, + "loss": 1.3974, + "step": 105 + }, + { + "epoch": 0.3072463768115942, + "grad_norm": 0.32959296551839845, + "learning_rate": 9.866292382183278e-05, + "loss": 1.3484, + "step": 106 + }, + { + "epoch": 0.3101449275362319, + "grad_norm": 0.381137959130174, + "learning_rate": 9.86267099079828e-05, + "loss": 1.4149, + "step": 107 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.33114126577828235, + "learning_rate": 9.859001893478468e-05, + "loss": 1.3599, + "step": 108 + }, + { + "epoch": 0.3159420289855073, + "grad_norm": 0.36021993638794775, + "learning_rate": 9.855285126220053e-05, + "loss": 1.413, + "step": 109 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.355739607205717, + "learning_rate": 9.851520725486914e-05, + "loss": 1.4064, + "step": 110 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 0.3263260079885549, + "learning_rate": 9.847708728210246e-05, + "loss": 1.4048, + "step": 111 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 0.3199488973648368, + "learning_rate": 9.8438491717882e-05, + "loss": 1.3944, + "step": 112 + }, + { + "epoch": 0.32753623188405795, + "grad_norm": 0.3336592320156713, + "learning_rate": 9.839942094085511e-05, + "loss": 1.3799, + "step": 113 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 0.32960061743745567, + "learning_rate": 9.835987533433126e-05, + "loss": 1.43, + "step": 114 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.35822567336767946, + "learning_rate": 9.831985528627834e-05, + "loss": 1.4404, + "step": 115 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 0.32466006600725356, + "learning_rate": 9.82793611893188e-05, + "loss": 1.391, + "step": 116 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 0.3452303089687653, + "learning_rate": 9.82383934407258e-05, + "loss": 1.4571, + "step": 117 + }, + { + "epoch": 0.34202898550724636, + "grad_norm": 0.3531330388118067, + "learning_rate": 9.819695244241936e-05, + "loss": 1.4726, + "step": 118 + }, + { + "epoch": 0.34492753623188405, + "grad_norm": 0.3284144929554227, + "learning_rate": 9.815503860096238e-05, + "loss": 1.4636, + "step": 119 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.33589451825622024, + "learning_rate": 9.811265232755662e-05, + "loss": 1.4076, + "step": 120 + }, + { + "epoch": 0.3507246376811594, + "grad_norm": 0.33465490795732467, + "learning_rate": 9.806979403803873e-05, + "loss": 1.3757, + "step": 121 + }, + { + "epoch": 0.3536231884057971, + "grad_norm": 0.35161889623674547, + "learning_rate": 9.802646415287615e-05, + "loss": 1.4065, + "step": 122 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 0.31894482948146224, + "learning_rate": 9.798266309716295e-05, + "loss": 1.4455, + "step": 123 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 0.3263915498362111, + "learning_rate": 9.793839130061573e-05, + "loss": 1.3291, + "step": 124 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 0.3264781414125749, + "learning_rate": 9.78936491975693e-05, + "loss": 1.3977, + "step": 125 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 0.3322110798968971, + "learning_rate": 9.784843722697253e-05, + "loss": 1.4516, + "step": 126 + }, + { + "epoch": 0.3681159420289855, + "grad_norm": 0.33040915159162, + "learning_rate": 9.780275583238397e-05, + "loss": 1.4418, + "step": 127 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 0.32982903923865825, + "learning_rate": 9.775660546196753e-05, + "loss": 1.399, + "step": 128 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 0.3398856478969671, + "learning_rate": 9.770998656848806e-05, + "loss": 1.4917, + "step": 129 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.33812428837562564, + "learning_rate": 9.766289960930697e-05, + "loss": 1.4136, + "step": 130 + }, + { + "epoch": 0.37971014492753624, + "grad_norm": 0.32546513362934915, + "learning_rate": 9.761534504637761e-05, + "loss": 1.4245, + "step": 131 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.3379554295481369, + "learning_rate": 9.756732334624093e-05, + "loss": 1.3917, + "step": 132 + }, + { + "epoch": 0.3855072463768116, + "grad_norm": 0.3196806084479148, + "learning_rate": 9.751883498002071e-05, + "loss": 1.3608, + "step": 133 + }, + { + "epoch": 0.3884057971014493, + "grad_norm": 0.366228317842041, + "learning_rate": 9.746988042341906e-05, + "loss": 1.3728, + "step": 134 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.3769852522598798, + "learning_rate": 9.742046015671174e-05, + "loss": 1.4481, + "step": 135 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 0.34122072082269356, + "learning_rate": 9.737057466474336e-05, + "loss": 1.4195, + "step": 136 + }, + { + "epoch": 0.39710144927536234, + "grad_norm": 0.3322686505315165, + "learning_rate": 9.732022443692276e-05, + "loss": 1.399, + "step": 137 + }, + { + "epoch": 0.4, + "grad_norm": 0.3296309366287408, + "learning_rate": 9.726940996721811e-05, + "loss": 1.421, + "step": 138 + }, + { + "epoch": 0.4028985507246377, + "grad_norm": 0.37435872581479346, + "learning_rate": 9.721813175415208e-05, + "loss": 1.4244, + "step": 139 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.3268496453435604, + "learning_rate": 9.716639030079697e-05, + "loss": 1.4099, + "step": 140 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 0.3554430337628762, + "learning_rate": 9.711418611476977e-05, + "loss": 1.4446, + "step": 141 + }, + { + "epoch": 0.4115942028985507, + "grad_norm": 0.33834590076214077, + "learning_rate": 9.706151970822718e-05, + "loss": 1.3205, + "step": 142 + }, + { + "epoch": 0.4144927536231884, + "grad_norm": 0.3414240635513846, + "learning_rate": 9.700839159786057e-05, + "loss": 1.4534, + "step": 143 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.32930885329942156, + "learning_rate": 9.695480230489093e-05, + "loss": 1.3587, + "step": 144 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 0.3390309331331547, + "learning_rate": 9.690075235506374e-05, + "loss": 1.339, + "step": 145 + }, + { + "epoch": 0.42318840579710143, + "grad_norm": 0.33898351347591354, + "learning_rate": 9.684624227864383e-05, + "loss": 1.3774, + "step": 146 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 0.3229718369377447, + "learning_rate": 9.679127261041015e-05, + "loss": 1.3538, + "step": 147 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 0.3375751395632948, + "learning_rate": 9.673584388965058e-05, + "loss": 1.4375, + "step": 148 + }, + { + "epoch": 0.4318840579710145, + "grad_norm": 0.3267376187700775, + "learning_rate": 9.667995666015654e-05, + "loss": 1.4029, + "step": 149 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.34796705983800497, + "learning_rate": 9.662361147021779e-05, + "loss": 1.4493, + "step": 150 + }, + { + "epoch": 0.43768115942028984, + "grad_norm": 0.3182925069013053, + "learning_rate": 9.656680887261693e-05, + "loss": 1.3708, + "step": 151 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 0.3408199380471595, + "learning_rate": 9.650954942462401e-05, + "loss": 1.4098, + "step": 152 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 0.33412473685571564, + "learning_rate": 9.645183368799113e-05, + "loss": 1.4252, + "step": 153 + }, + { + "epoch": 0.4463768115942029, + "grad_norm": 0.3318159670621602, + "learning_rate": 9.639366222894682e-05, + "loss": 1.4233, + "step": 154 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 0.34440731389898754, + "learning_rate": 9.63350356181906e-05, + "loss": 1.3829, + "step": 155 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.35692903412852806, + "learning_rate": 9.627595443088724e-05, + "loss": 1.357, + "step": 156 + }, + { + "epoch": 0.45507246376811594, + "grad_norm": 0.33466758251653783, + "learning_rate": 9.621641924666127e-05, + "loss": 1.406, + "step": 157 + }, + { + "epoch": 0.4579710144927536, + "grad_norm": 0.3366286518639209, + "learning_rate": 9.615643064959122e-05, + "loss": 1.4249, + "step": 158 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 0.32884355157952677, + "learning_rate": 9.609598922820382e-05, + "loss": 1.4149, + "step": 159 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.3323077335804954, + "learning_rate": 9.60350955754684e-05, + "loss": 1.3898, + "step": 160 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.3284011884136777, + "learning_rate": 9.597375028879088e-05, + "loss": 1.3761, + "step": 161 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 0.33628429126159637, + "learning_rate": 9.591195397000805e-05, + "loss": 1.4473, + "step": 162 + }, + { + "epoch": 0.47246376811594204, + "grad_norm": 0.3479467044598075, + "learning_rate": 9.584970722538162e-05, + "loss": 1.4025, + "step": 163 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 0.34445922830801295, + "learning_rate": 9.578701066559225e-05, + "loss": 1.397, + "step": 164 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.3398702574419618, + "learning_rate": 9.572386490573357e-05, + "loss": 1.3751, + "step": 165 + }, + { + "epoch": 0.4811594202898551, + "grad_norm": 0.31614740777820005, + "learning_rate": 9.566027056530615e-05, + "loss": 1.3098, + "step": 166 + }, + { + "epoch": 0.48405797101449277, + "grad_norm": 0.3444149821598331, + "learning_rate": 9.559622826821145e-05, + "loss": 1.3685, + "step": 167 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.3455185724902944, + "learning_rate": 9.553173864274567e-05, + "loss": 1.4413, + "step": 168 + }, + { + "epoch": 0.48985507246376814, + "grad_norm": 0.32774886376386325, + "learning_rate": 9.546680232159355e-05, + "loss": 1.4031, + "step": 169 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.32560244502643815, + "learning_rate": 9.540141994182225e-05, + "loss": 1.4364, + "step": 170 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 0.34398546887992665, + "learning_rate": 9.533559214487503e-05, + "loss": 1.409, + "step": 171 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 0.39583900001909544, + "learning_rate": 9.526931957656497e-05, + "loss": 1.4527, + "step": 172 + }, + { + "epoch": 0.5014492753623189, + "grad_norm": 0.4626708756395286, + "learning_rate": 9.520260288706867e-05, + "loss": 1.4624, + "step": 173 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 0.3664093495829884, + "learning_rate": 9.513544273091983e-05, + "loss": 1.4639, + "step": 174 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 0.36499531804230495, + "learning_rate": 9.506783976700285e-05, + "loss": 1.4065, + "step": 175 + }, + { + "epoch": 0.5101449275362319, + "grad_norm": 0.33176315803612266, + "learning_rate": 9.499979465854633e-05, + "loss": 1.3712, + "step": 176 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 0.31906615813652695, + "learning_rate": 9.493130807311663e-05, + "loss": 1.4081, + "step": 177 + }, + { + "epoch": 0.5159420289855072, + "grad_norm": 0.34052218389638056, + "learning_rate": 9.486238068261129e-05, + "loss": 1.4268, + "step": 178 + }, + { + "epoch": 0.518840579710145, + "grad_norm": 0.3336134893967437, + "learning_rate": 9.479301316325237e-05, + "loss": 1.4078, + "step": 179 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.3360766687427952, + "learning_rate": 9.472320619557997e-05, + "loss": 1.3766, + "step": 180 + }, + { + "epoch": 0.5246376811594203, + "grad_norm": 0.3221253265397745, + "learning_rate": 9.465296046444538e-05, + "loss": 1.3538, + "step": 181 + }, + { + "epoch": 0.527536231884058, + "grad_norm": 0.33953118483885136, + "learning_rate": 9.458227665900446e-05, + "loss": 1.3964, + "step": 182 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 0.33685849921565403, + "learning_rate": 9.45111554727109e-05, + "loss": 1.4249, + "step": 183 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.35947381917427984, + "learning_rate": 9.443959760330934e-05, + "loss": 1.4087, + "step": 184 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 0.33994296278210917, + "learning_rate": 9.436760375282859e-05, + "loss": 1.3951, + "step": 185 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 0.3470448028628382, + "learning_rate": 9.429517462757467e-05, + "loss": 1.3688, + "step": 186 + }, + { + "epoch": 0.5420289855072464, + "grad_norm": 0.33294443162653775, + "learning_rate": 9.422231093812398e-05, + "loss": 1.3679, + "step": 187 + }, + { + "epoch": 0.5449275362318841, + "grad_norm": 0.31454677711788814, + "learning_rate": 9.414901339931624e-05, + "loss": 1.4419, + "step": 188 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 0.3434839073644547, + "learning_rate": 9.407528273024752e-05, + "loss": 1.3949, + "step": 189 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 0.3351386886311035, + "learning_rate": 9.400111965426319e-05, + "loss": 1.4022, + "step": 190 + }, + { + "epoch": 0.553623188405797, + "grad_norm": 0.3358706804811382, + "learning_rate": 9.39265248989508e-05, + "loss": 1.3474, + "step": 191 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.3572071382586898, + "learning_rate": 9.385149919613292e-05, + "loss": 1.3889, + "step": 192 + }, + { + "epoch": 0.5594202898550724, + "grad_norm": 0.3287944467382312, + "learning_rate": 9.377604328186008e-05, + "loss": 1.3805, + "step": 193 + }, + { + "epoch": 0.5623188405797102, + "grad_norm": 0.36810650453304095, + "learning_rate": 9.370015789640334e-05, + "loss": 1.4075, + "step": 194 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.3868422779658168, + "learning_rate": 9.362384378424726e-05, + "loss": 1.4251, + "step": 195 + }, + { + "epoch": 0.5681159420289855, + "grad_norm": 0.3295019502277694, + "learning_rate": 9.354710169408243e-05, + "loss": 1.4139, + "step": 196 + }, + { + "epoch": 0.5710144927536231, + "grad_norm": 0.3468700259339786, + "learning_rate": 9.346993237879817e-05, + "loss": 1.366, + "step": 197 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 0.3397883227300112, + "learning_rate": 9.339233659547521e-05, + "loss": 1.4216, + "step": 198 + }, + { + "epoch": 0.5768115942028985, + "grad_norm": 0.3430862510854982, + "learning_rate": 9.331431510537816e-05, + "loss": 1.407, + "step": 199 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.3463403087156221, + "learning_rate": 9.323586867394807e-05, + "loss": 1.3894, + "step": 200 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 0.3280253585339611, + "learning_rate": 9.315699807079497e-05, + "loss": 1.3499, + "step": 201 + }, + { + "epoch": 0.5855072463768116, + "grad_norm": 0.3465548223811757, + "learning_rate": 9.30777040696903e-05, + "loss": 1.3635, + "step": 202 + }, + { + "epoch": 0.5884057971014492, + "grad_norm": 0.36685509209544426, + "learning_rate": 9.29979874485593e-05, + "loss": 1.4247, + "step": 203 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.3642879429079575, + "learning_rate": 9.291784898947336e-05, + "loss": 1.4265, + "step": 204 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 0.3369650372143289, + "learning_rate": 9.283728947864237e-05, + "loss": 1.3543, + "step": 205 + }, + { + "epoch": 0.5971014492753624, + "grad_norm": 0.3498733941972242, + "learning_rate": 9.275630970640705e-05, + "loss": 1.3867, + "step": 206 + }, + { + "epoch": 0.6, + "grad_norm": 0.3265518670612826, + "learning_rate": 9.267491046723111e-05, + "loss": 1.404, + "step": 207 + }, + { + "epoch": 0.6028985507246377, + "grad_norm": 0.3318790134308843, + "learning_rate": 9.259309255969354e-05, + "loss": 1.4059, + "step": 208 + }, + { + "epoch": 0.6057971014492753, + "grad_norm": 0.34642031197798473, + "learning_rate": 9.251085678648072e-05, + "loss": 1.4259, + "step": 209 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.3419250092734196, + "learning_rate": 9.242820395437854e-05, + "loss": 1.3711, + "step": 210 + }, + { + "epoch": 0.6115942028985507, + "grad_norm": 0.3461578047587994, + "learning_rate": 9.234513487426453e-05, + "loss": 1.4579, + "step": 211 + }, + { + "epoch": 0.6144927536231884, + "grad_norm": 0.351627952691499, + "learning_rate": 9.226165036109988e-05, + "loss": 1.4399, + "step": 212 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 0.3307586411986757, + "learning_rate": 9.217775123392145e-05, + "loss": 1.3946, + "step": 213 + }, + { + "epoch": 0.6202898550724638, + "grad_norm": 0.3354295846624239, + "learning_rate": 9.209343831583373e-05, + "loss": 1.3682, + "step": 214 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 0.3643294550764089, + "learning_rate": 9.200871243400073e-05, + "loss": 1.4177, + "step": 215 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.34428635756537734, + "learning_rate": 9.192357441963795e-05, + "loss": 1.4487, + "step": 216 + }, + { + "epoch": 0.6289855072463768, + "grad_norm": 0.33609027458329577, + "learning_rate": 9.183802510800415e-05, + "loss": 1.4307, + "step": 217 + }, + { + "epoch": 0.6318840579710145, + "grad_norm": 0.3563038361945473, + "learning_rate": 9.175206533839318e-05, + "loss": 1.4172, + "step": 218 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 0.3288387667207579, + "learning_rate": 9.166569595412575e-05, + "loss": 1.3713, + "step": 219 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.34157440710913767, + "learning_rate": 9.157891780254117e-05, + "loss": 1.3679, + "step": 220 + }, + { + "epoch": 0.6405797101449275, + "grad_norm": 0.3151382251052811, + "learning_rate": 9.1491731734989e-05, + "loss": 1.3795, + "step": 221 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 0.33817165115588743, + "learning_rate": 9.140413860682073e-05, + "loss": 1.3586, + "step": 222 + }, + { + "epoch": 0.6463768115942029, + "grad_norm": 0.3277750425977871, + "learning_rate": 9.131613927738138e-05, + "loss": 1.3885, + "step": 223 + }, + { + "epoch": 0.6492753623188405, + "grad_norm": 0.31658312922359383, + "learning_rate": 9.122773461000103e-05, + "loss": 1.4149, + "step": 224 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.3193871223544036, + "learning_rate": 9.113892547198643e-05, + "loss": 1.322, + "step": 225 + }, + { + "epoch": 0.6550724637681159, + "grad_norm": 0.3302835747056366, + "learning_rate": 9.104971273461243e-05, + "loss": 1.3769, + "step": 226 + }, + { + "epoch": 0.6579710144927536, + "grad_norm": 0.3186189847015454, + "learning_rate": 9.096009727311347e-05, + "loss": 1.3406, + "step": 227 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.3389034868184038, + "learning_rate": 9.087007996667494e-05, + "loss": 1.3658, + "step": 228 + }, + { + "epoch": 0.663768115942029, + "grad_norm": 0.33474986537379237, + "learning_rate": 9.077966169842459e-05, + "loss": 1.3651, + "step": 229 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3556022501007949, + "learning_rate": 9.068884335542389e-05, + "loss": 1.4237, + "step": 230 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 0.3216681338623573, + "learning_rate": 9.05976258286593e-05, + "loss": 1.3785, + "step": 231 + }, + { + "epoch": 0.672463768115942, + "grad_norm": 0.33533701380419384, + "learning_rate": 9.05060100130335e-05, + "loss": 1.4665, + "step": 232 + }, + { + "epoch": 0.6753623188405797, + "grad_norm": 0.3314963078807375, + "learning_rate": 9.041399680735664e-05, + "loss": 1.4036, + "step": 233 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 0.33542193989045377, + "learning_rate": 9.03215871143376e-05, + "loss": 1.4348, + "step": 234 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 0.3547005064725891, + "learning_rate": 9.022878184057492e-05, + "loss": 1.4272, + "step": 235 + }, + { + "epoch": 0.6840579710144927, + "grad_norm": 0.33291554897811426, + "learning_rate": 9.013558189654819e-05, + "loss": 1.4591, + "step": 236 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 0.3379014298685863, + "learning_rate": 9.004198819660885e-05, + "loss": 1.4567, + "step": 237 + }, + { + "epoch": 0.6898550724637681, + "grad_norm": 0.3297563945475019, + "learning_rate": 8.99480016589714e-05, + "loss": 1.3799, + "step": 238 + }, + { + "epoch": 0.6927536231884058, + "grad_norm": 0.34042084947510615, + "learning_rate": 8.985362320570432e-05, + "loss": 1.3697, + "step": 239 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.3374245817202305, + "learning_rate": 8.975885376272102e-05, + "loss": 1.4046, + "step": 240 + }, + { + "epoch": 0.6985507246376812, + "grad_norm": 0.3732847854755435, + "learning_rate": 8.966369425977082e-05, + "loss": 1.3491, + "step": 241 + }, + { + "epoch": 0.7014492753623188, + "grad_norm": 0.35958390600115686, + "learning_rate": 8.956814563042968e-05, + "loss": 1.3671, + "step": 242 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 0.3572722721866322, + "learning_rate": 8.947220881209126e-05, + "loss": 1.4003, + "step": 243 + }, + { + "epoch": 0.7072463768115942, + "grad_norm": 0.34273191632214844, + "learning_rate": 8.937588474595753e-05, + "loss": 1.4104, + "step": 244 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 0.34878139471777386, + "learning_rate": 8.927917437702962e-05, + "loss": 1.3896, + "step": 245 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 0.33111504592475566, + "learning_rate": 8.918207865409856e-05, + "loss": 1.3313, + "step": 246 + }, + { + "epoch": 0.7159420289855073, + "grad_norm": 0.3438939035436239, + "learning_rate": 8.908459852973594e-05, + "loss": 1.3429, + "step": 247 + }, + { + "epoch": 0.7188405797101449, + "grad_norm": 0.3312679125692785, + "learning_rate": 8.898673496028456e-05, + "loss": 1.4395, + "step": 248 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 0.34484942367124294, + "learning_rate": 8.888848890584907e-05, + "loss": 1.3712, + "step": 249 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.340709492347014, + "learning_rate": 8.878986133028657e-05, + "loss": 1.37, + "step": 250 + }, + { + "epoch": 0.7275362318840579, + "grad_norm": 0.33398944764147226, + "learning_rate": 8.86908532011971e-05, + "loss": 1.3892, + "step": 251 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 0.35175222311902715, + "learning_rate": 8.85914654899142e-05, + "loss": 1.4108, + "step": 252 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.3484995200225896, + "learning_rate": 8.849169917149531e-05, + "loss": 1.3833, + "step": 253 + }, + { + "epoch": 0.736231884057971, + "grad_norm": 0.3532075346234238, + "learning_rate": 8.839155522471232e-05, + "loss": 1.313, + "step": 254 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.32136667953567727, + "learning_rate": 8.829103463204182e-05, + "loss": 1.3504, + "step": 255 + }, + { + "epoch": 0.7420289855072464, + "grad_norm": 0.3229081190755409, + "learning_rate": 8.81901383796556e-05, + "loss": 1.3771, + "step": 256 + }, + { + "epoch": 0.744927536231884, + "grad_norm": 0.3440518639418747, + "learning_rate": 8.808886745741089e-05, + "loss": 1.4158, + "step": 257 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 0.3352706545420464, + "learning_rate": 8.798722285884066e-05, + "loss": 1.4394, + "step": 258 + }, + { + "epoch": 0.7507246376811594, + "grad_norm": 0.33559926414830077, + "learning_rate": 8.788520558114391e-05, + "loss": 1.3911, + "step": 259 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.3216071156149776, + "learning_rate": 8.778281662517583e-05, + "loss": 1.429, + "step": 260 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 0.32211563215549827, + "learning_rate": 8.768005699543806e-05, + "loss": 1.3127, + "step": 261 + }, + { + "epoch": 0.7594202898550725, + "grad_norm": 0.34108464165661373, + "learning_rate": 8.757692770006876e-05, + "loss": 1.3773, + "step": 262 + }, + { + "epoch": 0.7623188405797101, + "grad_norm": 0.32535926486459094, + "learning_rate": 8.747342975083272e-05, + "loss": 1.3664, + "step": 263 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.33852048574771015, + "learning_rate": 8.736956416311154e-05, + "loss": 1.3663, + "step": 264 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 0.33710327017540265, + "learning_rate": 8.72653319558935e-05, + "loss": 1.4091, + "step": 265 + }, + { + "epoch": 0.7710144927536232, + "grad_norm": 0.3529196648547696, + "learning_rate": 8.716073415176374e-05, + "loss": 1.442, + "step": 266 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 0.34337677669937877, + "learning_rate": 8.705577177689403e-05, + "loss": 1.3316, + "step": 267 + }, + { + "epoch": 0.7768115942028986, + "grad_norm": 0.3354333510851631, + "learning_rate": 8.695044586103296e-05, + "loss": 1.3616, + "step": 268 + }, + { + "epoch": 0.7797101449275362, + "grad_norm": 0.3479441013536178, + "learning_rate": 8.684475743749556e-05, + "loss": 1.395, + "step": 269 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.37463973489254887, + "learning_rate": 8.673870754315336e-05, + "loss": 1.401, + "step": 270 + }, + { + "epoch": 0.7855072463768116, + "grad_norm": 0.31175117798278007, + "learning_rate": 8.663229721842415e-05, + "loss": 1.3223, + "step": 271 + }, + { + "epoch": 0.7884057971014493, + "grad_norm": 0.38303494453595516, + "learning_rate": 8.652552750726175e-05, + "loss": 1.4301, + "step": 272 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 0.3573014147864106, + "learning_rate": 8.64183994571458e-05, + "loss": 1.4263, + "step": 273 + }, + { + "epoch": 0.7942028985507247, + "grad_norm": 0.3211993716597447, + "learning_rate": 8.631091411907149e-05, + "loss": 1.3578, + "step": 274 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 0.37834773248299663, + "learning_rate": 8.620307254753923e-05, + "loss": 1.3745, + "step": 275 + }, + { + "epoch": 0.8, + "grad_norm": 0.31593418933802786, + "learning_rate": 8.609487580054428e-05, + "loss": 1.3654, + "step": 276 + }, + { + "epoch": 0.8028985507246377, + "grad_norm": 0.31504634745000243, + "learning_rate": 8.598632493956644e-05, + "loss": 1.4, + "step": 277 + }, + { + "epoch": 0.8057971014492754, + "grad_norm": 0.3384470107062998, + "learning_rate": 8.58774210295596e-05, + "loss": 1.3941, + "step": 278 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 0.3260030165566468, + "learning_rate": 8.576816513894125e-05, + "loss": 1.348, + "step": 279 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.3527150892760629, + "learning_rate": 8.565855833958206e-05, + "loss": 1.4058, + "step": 280 + }, + { + "epoch": 0.8144927536231884, + "grad_norm": 0.3861860908831136, + "learning_rate": 8.554860170679534e-05, + "loss": 1.4282, + "step": 281 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 0.3137903423216692, + "learning_rate": 8.543829631932649e-05, + "loss": 1.352, + "step": 282 + }, + { + "epoch": 0.8202898550724638, + "grad_norm": 0.34862718728490294, + "learning_rate": 8.532764325934239e-05, + "loss": 1.4282, + "step": 283 + }, + { + "epoch": 0.8231884057971014, + "grad_norm": 0.3150871399912744, + "learning_rate": 8.521664361242089e-05, + "loss": 1.3802, + "step": 284 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.3107741737666529, + "learning_rate": 8.510529846753998e-05, + "loss": 1.4077, + "step": 285 + }, + { + "epoch": 0.8289855072463768, + "grad_norm": 0.33269493424037233, + "learning_rate": 8.499360891706729e-05, + "loss": 1.3348, + "step": 286 + }, + { + "epoch": 0.8318840579710145, + "grad_norm": 0.31493592697757294, + "learning_rate": 8.488157605674925e-05, + "loss": 1.3418, + "step": 287 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 0.3328720547121984, + "learning_rate": 8.476920098570036e-05, + "loss": 1.3832, + "step": 288 + }, + { + "epoch": 0.8376811594202899, + "grad_norm": 0.3157756166632203, + "learning_rate": 8.465648480639248e-05, + "loss": 1.3274, + "step": 289 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 0.33662897796614577, + "learning_rate": 8.454342862464395e-05, + "loss": 1.3086, + "step": 290 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 0.3272252672648793, + "learning_rate": 8.443003354960872e-05, + "loss": 1.4232, + "step": 291 + }, + { + "epoch": 0.8463768115942029, + "grad_norm": 0.35218283346681617, + "learning_rate": 8.431630069376552e-05, + "loss": 1.4371, + "step": 292 + }, + { + "epoch": 0.8492753623188406, + "grad_norm": 0.3436413205889393, + "learning_rate": 8.420223117290695e-05, + "loss": 1.3696, + "step": 293 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 0.34426616560941314, + "learning_rate": 8.408782610612849e-05, + "loss": 1.4137, + "step": 294 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 0.31419677902933213, + "learning_rate": 8.39730866158175e-05, + "loss": 1.3294, + "step": 295 + }, + { + "epoch": 0.8579710144927536, + "grad_norm": 0.31097415762768543, + "learning_rate": 8.385801382764233e-05, + "loss": 1.3796, + "step": 296 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 0.3351050938384504, + "learning_rate": 8.374260887054116e-05, + "loss": 1.4819, + "step": 297 + }, + { + "epoch": 0.863768115942029, + "grad_norm": 0.3151109176190777, + "learning_rate": 8.362687287671094e-05, + "loss": 1.3711, + "step": 298 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.337074633378245, + "learning_rate": 8.351080698159632e-05, + "loss": 1.3923, + "step": 299 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3371311952402845, + "learning_rate": 8.339441232387853e-05, + "loss": 1.3789, + "step": 300 + }, + { + "epoch": 0.8724637681159421, + "grad_norm": 0.3356424382906388, + "learning_rate": 8.32776900454641e-05, + "loss": 1.4003, + "step": 301 + }, + { + "epoch": 0.8753623188405797, + "grad_norm": 0.33796299079575864, + "learning_rate": 8.31606412914738e-05, + "loss": 1.4341, + "step": 302 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 0.32018941976781934, + "learning_rate": 8.30432672102313e-05, + "loss": 1.4523, + "step": 303 + }, + { + "epoch": 0.881159420289855, + "grad_norm": 0.3368637827820196, + "learning_rate": 8.292556895325194e-05, + "loss": 1.3903, + "step": 304 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 0.31352167875853487, + "learning_rate": 8.280754767523144e-05, + "loss": 1.3581, + "step": 305 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 0.31484573995633375, + "learning_rate": 8.268920453403457e-05, + "loss": 1.3967, + "step": 306 + }, + { + "epoch": 0.8898550724637682, + "grad_norm": 0.31504188464216054, + "learning_rate": 8.257054069068374e-05, + "loss": 1.3985, + "step": 307 + }, + { + "epoch": 0.8927536231884058, + "grad_norm": 0.32015281024694753, + "learning_rate": 8.245155730934777e-05, + "loss": 1.3273, + "step": 308 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 0.3183790437483911, + "learning_rate": 8.233225555733022e-05, + "loss": 1.2672, + "step": 309 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 0.32150150116629717, + "learning_rate": 8.221263660505813e-05, + "loss": 1.3995, + "step": 310 + }, + { + "epoch": 0.9014492753623189, + "grad_norm": 0.3132580361772673, + "learning_rate": 8.20927016260705e-05, + "loss": 1.3899, + "step": 311 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 0.3557171808896923, + "learning_rate": 8.197245179700673e-05, + "loss": 1.3861, + "step": 312 + }, + { + "epoch": 0.9072463768115943, + "grad_norm": 0.32080932799331907, + "learning_rate": 8.185188829759505e-05, + "loss": 1.2657, + "step": 313 + }, + { + "epoch": 0.9101449275362319, + "grad_norm": 0.33323239514109537, + "learning_rate": 8.173101231064113e-05, + "loss": 1.331, + "step": 314 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.33932442141864444, + "learning_rate": 8.160982502201624e-05, + "loss": 1.3583, + "step": 315 + }, + { + "epoch": 0.9159420289855073, + "grad_norm": 0.41517663636078217, + "learning_rate": 8.148832762064573e-05, + "loss": 1.4196, + "step": 316 + }, + { + "epoch": 0.9188405797101449, + "grad_norm": 0.3479488422667109, + "learning_rate": 8.136652129849738e-05, + "loss": 1.3765, + "step": 317 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 0.3250773691234272, + "learning_rate": 8.124440725056969e-05, + "loss": 1.3998, + "step": 318 + }, + { + "epoch": 0.9246376811594202, + "grad_norm": 0.630703005417282, + "learning_rate": 8.112198667488012e-05, + "loss": 1.2986, + "step": 319 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 0.34656213869069796, + "learning_rate": 8.099926077245337e-05, + "loss": 1.4085, + "step": 320 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 0.3595735041645428, + "learning_rate": 8.08762307473096e-05, + "loss": 1.3973, + "step": 321 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.3492788413407257, + "learning_rate": 8.075289780645264e-05, + "loss": 1.3912, + "step": 322 + }, + { + "epoch": 0.936231884057971, + "grad_norm": 0.3576330587050802, + "learning_rate": 8.062926315985803e-05, + "loss": 1.4256, + "step": 323 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 0.3410475477414221, + "learning_rate": 8.050532802046135e-05, + "loss": 1.3586, + "step": 324 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 0.32056313028041444, + "learning_rate": 8.038109360414614e-05, + "loss": 1.3443, + "step": 325 + }, + { + "epoch": 0.9449275362318841, + "grad_norm": 0.32894846650068166, + "learning_rate": 8.025656112973202e-05, + "loss": 1.3798, + "step": 326 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 0.3255639658134978, + "learning_rate": 8.013173181896283e-05, + "loss": 1.3383, + "step": 327 + }, + { + "epoch": 0.9507246376811594, + "grad_norm": 0.31966797580007494, + "learning_rate": 8.000660689649449e-05, + "loss": 1.3544, + "step": 328 + }, + { + "epoch": 0.9536231884057971, + "grad_norm": 0.32692090968009707, + "learning_rate": 7.98811875898831e-05, + "loss": 1.4088, + "step": 329 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.3372144496418016, + "learning_rate": 7.975547512957285e-05, + "loss": 1.4309, + "step": 330 + }, + { + "epoch": 0.9594202898550724, + "grad_norm": 0.3246412166131606, + "learning_rate": 7.962947074888394e-05, + "loss": 1.3916, + "step": 331 + }, + { + "epoch": 0.9623188405797102, + "grad_norm": 0.34634645274643355, + "learning_rate": 7.950317568400054e-05, + "loss": 1.4104, + "step": 332 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 0.3256987549913797, + "learning_rate": 7.937659117395858e-05, + "loss": 1.3544, + "step": 333 + }, + { + "epoch": 0.9681159420289855, + "grad_norm": 0.33356722481281487, + "learning_rate": 7.924971846063365e-05, + "loss": 1.342, + "step": 334 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 0.3260083753687772, + "learning_rate": 7.912255878872878e-05, + "loss": 1.4006, + "step": 335 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 0.3768462741234547, + "learning_rate": 7.899511340576229e-05, + "loss": 1.4014, + "step": 336 + }, + { + "epoch": 0.9768115942028985, + "grad_norm": 0.33594184989494874, + "learning_rate": 7.886738356205546e-05, + "loss": 1.3538, + "step": 337 + }, + { + "epoch": 0.9797101449275363, + "grad_norm": 0.3538141580905989, + "learning_rate": 7.873937051072035e-05, + "loss": 1.4112, + "step": 338 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 0.33768085173175694, + "learning_rate": 7.861107550764744e-05, + "loss": 1.4318, + "step": 339 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 0.3103190809712041, + "learning_rate": 7.848249981149338e-05, + "loss": 1.3934, + "step": 340 + }, + { + "epoch": 0.9884057971014493, + "grad_norm": 0.35049170901785537, + "learning_rate": 7.835364468366856e-05, + "loss": 1.3604, + "step": 341 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 0.32828748932738266, + "learning_rate": 7.822451138832478e-05, + "loss": 1.3985, + "step": 342 + }, + { + "epoch": 0.9942028985507246, + "grad_norm": 0.33349918656348, + "learning_rate": 7.809510119234287e-05, + "loss": 1.4051, + "step": 343 + }, + { + "epoch": 0.9971014492753624, + "grad_norm": 0.31203624586969825, + "learning_rate": 7.796541536532019e-05, + "loss": 1.4114, + "step": 344 + }, + { + "epoch": 1.0, + "grad_norm": 0.3240751813149832, + "learning_rate": 7.783545517955826e-05, + "loss": 1.3441, + "step": 345 + }, + { + "epoch": 1.0028985507246377, + "grad_norm": 0.3039393246768782, + "learning_rate": 7.77052219100502e-05, + "loss": 1.2368, + "step": 346 + }, + { + "epoch": 1.0057971014492753, + "grad_norm": 0.31372425053284514, + "learning_rate": 7.757471683446833e-05, + "loss": 1.1765, + "step": 347 + }, + { + "epoch": 1.008695652173913, + "grad_norm": 0.2985654423691086, + "learning_rate": 7.744394123315146e-05, + "loss": 1.2387, + "step": 348 + }, + { + "epoch": 1.0115942028985507, + "grad_norm": 0.30668006943966447, + "learning_rate": 7.731289638909248e-05, + "loss": 1.2512, + "step": 349 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.3297662794021686, + "learning_rate": 7.718158358792574e-05, + "loss": 1.2466, + "step": 350 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 0.36571397703464864, + "learning_rate": 7.705000411791441e-05, + "loss": 1.2095, + "step": 351 + }, + { + "epoch": 1.0202898550724637, + "grad_norm": 0.36789475981765535, + "learning_rate": 7.691815926993785e-05, + "loss": 1.2127, + "step": 352 + }, + { + "epoch": 1.0231884057971015, + "grad_norm": 0.34691008452093475, + "learning_rate": 7.678605033747894e-05, + "loss": 1.1754, + "step": 353 + }, + { + "epoch": 1.0260869565217392, + "grad_norm": 0.3381901577900874, + "learning_rate": 7.665367861661142e-05, + "loss": 1.2585, + "step": 354 + }, + { + "epoch": 1.0289855072463767, + "grad_norm": 0.3456016883168296, + "learning_rate": 7.652104540598712e-05, + "loss": 1.2565, + "step": 355 + }, + { + "epoch": 1.0318840579710145, + "grad_norm": 0.3340793379287121, + "learning_rate": 7.638815200682331e-05, + "loss": 1.286, + "step": 356 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 0.3329632889293724, + "learning_rate": 7.62549997228898e-05, + "loss": 1.2579, + "step": 357 + }, + { + "epoch": 1.03768115942029, + "grad_norm": 0.32945204903041203, + "learning_rate": 7.612158986049632e-05, + "loss": 1.1978, + "step": 358 + }, + { + "epoch": 1.0405797101449274, + "grad_norm": 0.3240289810339555, + "learning_rate": 7.598792372847952e-05, + "loss": 1.1871, + "step": 359 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.3497054137706393, + "learning_rate": 7.585400263819025e-05, + "loss": 1.2407, + "step": 360 + }, + { + "epoch": 1.046376811594203, + "grad_norm": 0.3334051709529727, + "learning_rate": 7.571982790348071e-05, + "loss": 1.2475, + "step": 361 + }, + { + "epoch": 1.0492753623188407, + "grad_norm": 0.3216924338385901, + "learning_rate": 7.558540084069145e-05, + "loss": 1.2178, + "step": 362 + }, + { + "epoch": 1.0521739130434782, + "grad_norm": 0.3770387844464867, + "learning_rate": 7.545072276863858e-05, + "loss": 1.2979, + "step": 363 + }, + { + "epoch": 1.055072463768116, + "grad_norm": 0.33349794524452664, + "learning_rate": 7.531579500860069e-05, + "loss": 1.2679, + "step": 364 + }, + { + "epoch": 1.0579710144927537, + "grad_norm": 0.3410677559200434, + "learning_rate": 7.518061888430609e-05, + "loss": 1.3029, + "step": 365 + }, + { + "epoch": 1.0608695652173914, + "grad_norm": 0.32421257826543254, + "learning_rate": 7.50451957219196e-05, + "loss": 1.2383, + "step": 366 + }, + { + "epoch": 1.063768115942029, + "grad_norm": 0.33207438928525995, + "learning_rate": 7.490952685002965e-05, + "loss": 1.2317, + "step": 367 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.32506432414586334, + "learning_rate": 7.477361359963533e-05, + "loss": 1.1661, + "step": 368 + }, + { + "epoch": 1.0695652173913044, + "grad_norm": 0.32495557198051783, + "learning_rate": 7.463745730413313e-05, + "loss": 1.2343, + "step": 369 + }, + { + "epoch": 1.0724637681159421, + "grad_norm": 0.33951747813529576, + "learning_rate": 7.450105929930403e-05, + "loss": 1.1765, + "step": 370 + }, + { + "epoch": 1.0753623188405796, + "grad_norm": 0.3960232594734765, + "learning_rate": 7.436442092330033e-05, + "loss": 1.1708, + "step": 371 + }, + { + "epoch": 1.0782608695652174, + "grad_norm": 0.34965839265944354, + "learning_rate": 7.422754351663252e-05, + "loss": 1.1557, + "step": 372 + }, + { + "epoch": 1.0811594202898551, + "grad_norm": 0.3465625398151273, + "learning_rate": 7.409042842215611e-05, + "loss": 1.2163, + "step": 373 + }, + { + "epoch": 1.0840579710144929, + "grad_norm": 0.3441278544713875, + "learning_rate": 7.395307698505851e-05, + "loss": 1.2522, + "step": 374 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.34316475519905354, + "learning_rate": 7.381549055284582e-05, + "loss": 1.2401, + "step": 375 + }, + { + "epoch": 1.0898550724637681, + "grad_norm": 0.3468405311381756, + "learning_rate": 7.367767047532955e-05, + "loss": 1.2297, + "step": 376 + }, + { + "epoch": 1.0927536231884059, + "grad_norm": 0.35424537263860967, + "learning_rate": 7.353961810461343e-05, + "loss": 1.1903, + "step": 377 + }, + { + "epoch": 1.0956521739130434, + "grad_norm": 0.35865745036758906, + "learning_rate": 7.340133479508015e-05, + "loss": 1.2238, + "step": 378 + }, + { + "epoch": 1.098550724637681, + "grad_norm": 0.33961205561899227, + "learning_rate": 7.326282190337807e-05, + "loss": 1.2353, + "step": 379 + }, + { + "epoch": 1.1014492753623188, + "grad_norm": 0.3410877787281011, + "learning_rate": 7.312408078840788e-05, + "loss": 1.1938, + "step": 380 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 0.3261974323058093, + "learning_rate": 7.298511281130928e-05, + "loss": 1.2283, + "step": 381 + }, + { + "epoch": 1.107246376811594, + "grad_norm": 0.3375439427532852, + "learning_rate": 7.284591933544764e-05, + "loss": 1.166, + "step": 382 + }, + { + "epoch": 1.1101449275362318, + "grad_norm": 0.34226748130902523, + "learning_rate": 7.270650172640065e-05, + "loss": 1.2268, + "step": 383 + }, + { + "epoch": 1.1130434782608696, + "grad_norm": 0.34975018354668974, + "learning_rate": 7.256686135194483e-05, + "loss": 1.2753, + "step": 384 + }, + { + "epoch": 1.1159420289855073, + "grad_norm": 0.36870818906061614, + "learning_rate": 7.242699958204225e-05, + "loss": 1.2427, + "step": 385 + }, + { + "epoch": 1.1188405797101448, + "grad_norm": 0.35097638947331306, + "learning_rate": 7.228691778882693e-05, + "loss": 1.2588, + "step": 386 + }, + { + "epoch": 1.1217391304347826, + "grad_norm": 0.35715131379127846, + "learning_rate": 7.21466173465915e-05, + "loss": 1.2349, + "step": 387 + }, + { + "epoch": 1.1246376811594203, + "grad_norm": 0.3554441755613845, + "learning_rate": 7.200609963177367e-05, + "loss": 1.2218, + "step": 388 + }, + { + "epoch": 1.127536231884058, + "grad_norm": 0.35332606995255955, + "learning_rate": 7.186536602294278e-05, + "loss": 1.233, + "step": 389 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.34659479615561295, + "learning_rate": 7.172441790078614e-05, + "loss": 1.2277, + "step": 390 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.3634661952802433, + "learning_rate": 7.158325664809566e-05, + "loss": 1.1815, + "step": 391 + }, + { + "epoch": 1.136231884057971, + "grad_norm": 0.3483946097126382, + "learning_rate": 7.144188364975415e-05, + "loss": 1.2296, + "step": 392 + }, + { + "epoch": 1.1391304347826088, + "grad_norm": 0.3458491663438552, + "learning_rate": 7.130030029272179e-05, + "loss": 1.2762, + "step": 393 + }, + { + "epoch": 1.1420289855072463, + "grad_norm": 0.36175639738964943, + "learning_rate": 7.11585079660225e-05, + "loss": 1.1942, + "step": 394 + }, + { + "epoch": 1.144927536231884, + "grad_norm": 0.3593818284728034, + "learning_rate": 7.101650806073038e-05, + "loss": 1.2068, + "step": 395 + }, + { + "epoch": 1.1478260869565218, + "grad_norm": 0.334166827563346, + "learning_rate": 7.087430196995593e-05, + "loss": 1.1819, + "step": 396 + }, + { + "epoch": 1.1507246376811595, + "grad_norm": 0.3636336066976543, + "learning_rate": 7.073189108883255e-05, + "loss": 1.2438, + "step": 397 + }, + { + "epoch": 1.153623188405797, + "grad_norm": 0.35550038414146484, + "learning_rate": 7.058927681450269e-05, + "loss": 1.2546, + "step": 398 + }, + { + "epoch": 1.1565217391304348, + "grad_norm": 0.3638989954332178, + "learning_rate": 7.044646054610426e-05, + "loss": 1.2817, + "step": 399 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.36528513619908154, + "learning_rate": 7.030344368475684e-05, + "loss": 1.2634, + "step": 400 + }, + { + "epoch": 1.1623188405797102, + "grad_norm": 0.348052355901968, + "learning_rate": 7.016022763354798e-05, + "loss": 1.2002, + "step": 401 + }, + { + "epoch": 1.1652173913043478, + "grad_norm": 0.3595684193169886, + "learning_rate": 7.00168137975194e-05, + "loss": 1.1864, + "step": 402 + }, + { + "epoch": 1.1681159420289855, + "grad_norm": 0.35070589944718533, + "learning_rate": 6.98732035836532e-05, + "loss": 1.1749, + "step": 403 + }, + { + "epoch": 1.1710144927536232, + "grad_norm": 0.3583364136698803, + "learning_rate": 6.972939840085809e-05, + "loss": 1.2362, + "step": 404 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.3411795291050965, + "learning_rate": 6.958539965995558e-05, + "loss": 1.2365, + "step": 405 + }, + { + "epoch": 1.1768115942028985, + "grad_norm": 0.37126831887596484, + "learning_rate": 6.944120877366604e-05, + "loss": 1.2547, + "step": 406 + }, + { + "epoch": 1.1797101449275362, + "grad_norm": 0.3615486523323878, + "learning_rate": 6.929682715659496e-05, + "loss": 1.2008, + "step": 407 + }, + { + "epoch": 1.182608695652174, + "grad_norm": 0.3495522144501781, + "learning_rate": 6.915225622521901e-05, + "loss": 1.2137, + "step": 408 + }, + { + "epoch": 1.1855072463768117, + "grad_norm": 0.34558559090876845, + "learning_rate": 6.900749739787216e-05, + "loss": 1.1948, + "step": 409 + }, + { + "epoch": 1.1884057971014492, + "grad_norm": 0.3534560464350228, + "learning_rate": 6.886255209473174e-05, + "loss": 1.2296, + "step": 410 + }, + { + "epoch": 1.191304347826087, + "grad_norm": 0.38654103329628986, + "learning_rate": 6.871742173780458e-05, + "loss": 1.2375, + "step": 411 + }, + { + "epoch": 1.1942028985507247, + "grad_norm": 0.4990410023234168, + "learning_rate": 6.857210775091292e-05, + "loss": 1.1972, + "step": 412 + }, + { + "epoch": 1.1971014492753622, + "grad_norm": 0.3283618367174733, + "learning_rate": 6.842661155968062e-05, + "loss": 1.2236, + "step": 413 + }, + { + "epoch": 1.2, + "grad_norm": 0.3501614388462517, + "learning_rate": 6.828093459151902e-05, + "loss": 1.2599, + "step": 414 + }, + { + "epoch": 1.2028985507246377, + "grad_norm": 0.3566983584982769, + "learning_rate": 6.813507827561301e-05, + "loss": 1.2592, + "step": 415 + }, + { + "epoch": 1.2057971014492754, + "grad_norm": 0.35438824536081337, + "learning_rate": 6.798904404290703e-05, + "loss": 1.219, + "step": 416 + }, + { + "epoch": 1.208695652173913, + "grad_norm": 0.36738665957897987, + "learning_rate": 6.784283332609096e-05, + "loss": 1.2787, + "step": 417 + }, + { + "epoch": 1.2115942028985507, + "grad_norm": 0.3618484779747058, + "learning_rate": 6.769644755958614e-05, + "loss": 1.2557, + "step": 418 + }, + { + "epoch": 1.2144927536231884, + "grad_norm": 0.3475615543784353, + "learning_rate": 6.754988817953121e-05, + "loss": 1.2519, + "step": 419 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.3498171433494951, + "learning_rate": 6.740315662376808e-05, + "loss": 1.1832, + "step": 420 + }, + { + "epoch": 1.2202898550724637, + "grad_norm": 0.3485237559097342, + "learning_rate": 6.725625433182788e-05, + "loss": 1.1686, + "step": 421 + }, + { + "epoch": 1.2231884057971014, + "grad_norm": 0.3365638116771253, + "learning_rate": 6.710918274491668e-05, + "loss": 1.161, + "step": 422 + }, + { + "epoch": 1.2260869565217392, + "grad_norm": 0.339262847480053, + "learning_rate": 6.696194330590151e-05, + "loss": 1.3032, + "step": 423 + }, + { + "epoch": 1.228985507246377, + "grad_norm": 0.3695849544204241, + "learning_rate": 6.681453745929613e-05, + "loss": 1.2505, + "step": 424 + }, + { + "epoch": 1.2318840579710144, + "grad_norm": 0.3810556641153086, + "learning_rate": 6.666696665124682e-05, + "loss": 1.2176, + "step": 425 + }, + { + "epoch": 1.2347826086956522, + "grad_norm": 0.3794002652671474, + "learning_rate": 6.651923232951829e-05, + "loss": 1.2922, + "step": 426 + }, + { + "epoch": 1.23768115942029, + "grad_norm": 0.37219002176219357, + "learning_rate": 6.637133594347938e-05, + "loss": 1.2919, + "step": 427 + }, + { + "epoch": 1.2405797101449276, + "grad_norm": 0.3748146640073023, + "learning_rate": 6.62232789440889e-05, + "loss": 1.2549, + "step": 428 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 0.3431018972364436, + "learning_rate": 6.607506278388144e-05, + "loss": 1.1907, + "step": 429 + }, + { + "epoch": 1.2463768115942029, + "grad_norm": 0.3685201234625515, + "learning_rate": 6.592668891695298e-05, + "loss": 1.2368, + "step": 430 + }, + { + "epoch": 1.2492753623188406, + "grad_norm": 0.3638027931128809, + "learning_rate": 6.57781587989467e-05, + "loss": 1.2695, + "step": 431 + }, + { + "epoch": 1.2521739130434781, + "grad_norm": 0.3392431416089568, + "learning_rate": 6.562947388703879e-05, + "loss": 1.2651, + "step": 432 + }, + { + "epoch": 1.2550724637681159, + "grad_norm": 0.3523863327979242, + "learning_rate": 6.548063563992397e-05, + "loss": 1.2633, + "step": 433 + }, + { + "epoch": 1.2579710144927536, + "grad_norm": 0.3773185628146933, + "learning_rate": 6.533164551780134e-05, + "loss": 1.2669, + "step": 434 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.37080955852894376, + "learning_rate": 6.518250498235996e-05, + "loss": 1.2055, + "step": 435 + }, + { + "epoch": 1.263768115942029, + "grad_norm": 0.3610115012833989, + "learning_rate": 6.50332154967646e-05, + "loss": 1.2558, + "step": 436 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.36419810462728663, + "learning_rate": 6.488377852564125e-05, + "loss": 1.2273, + "step": 437 + }, + { + "epoch": 1.2695652173913043, + "grad_norm": 0.36955352159431015, + "learning_rate": 6.473419553506285e-05, + "loss": 1.1592, + "step": 438 + }, + { + "epoch": 1.272463768115942, + "grad_norm": 0.4000451451417096, + "learning_rate": 6.45844679925349e-05, + "loss": 1.2585, + "step": 439 + }, + { + "epoch": 1.2753623188405796, + "grad_norm": 0.3674813225161034, + "learning_rate": 6.443459736698105e-05, + "loss": 1.207, + "step": 440 + }, + { + "epoch": 1.2782608695652173, + "grad_norm": 0.36342273693767024, + "learning_rate": 6.428458512872868e-05, + "loss": 1.207, + "step": 441 + }, + { + "epoch": 1.281159420289855, + "grad_norm": 0.3772811021851, + "learning_rate": 6.413443274949446e-05, + "loss": 1.249, + "step": 442 + }, + { + "epoch": 1.2840579710144928, + "grad_norm": 0.3574482885159096, + "learning_rate": 6.398414170237001e-05, + "loss": 1.2111, + "step": 443 + }, + { + "epoch": 1.2869565217391306, + "grad_norm": 0.34461226274334095, + "learning_rate": 6.383371346180725e-05, + "loss": 1.2042, + "step": 444 + }, + { + "epoch": 1.289855072463768, + "grad_norm": 0.35375827819704075, + "learning_rate": 6.368314950360415e-05, + "loss": 1.2183, + "step": 445 + }, + { + "epoch": 1.2927536231884058, + "grad_norm": 0.3494607679069863, + "learning_rate": 6.353245130489012e-05, + "loss": 1.2267, + "step": 446 + }, + { + "epoch": 1.2956521739130435, + "grad_norm": 0.3376350549359254, + "learning_rate": 6.338162034411158e-05, + "loss": 1.2514, + "step": 447 + }, + { + "epoch": 1.298550724637681, + "grad_norm": 0.3514507439505588, + "learning_rate": 6.323065810101741e-05, + "loss": 1.2055, + "step": 448 + }, + { + "epoch": 1.3014492753623188, + "grad_norm": 0.374192088646086, + "learning_rate": 6.307956605664447e-05, + "loss": 1.2149, + "step": 449 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.36836907141990205, + "learning_rate": 6.292834569330301e-05, + "loss": 1.332, + "step": 450 + }, + { + "epoch": 1.3072463768115943, + "grad_norm": 0.35436366268435593, + "learning_rate": 6.277699849456224e-05, + "loss": 1.2918, + "step": 451 + }, + { + "epoch": 1.310144927536232, + "grad_norm": 0.3535565794861321, + "learning_rate": 6.262552594523565e-05, + "loss": 1.2382, + "step": 452 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 0.3923107343675531, + "learning_rate": 6.247392953136655e-05, + "loss": 1.2614, + "step": 453 + }, + { + "epoch": 1.3159420289855073, + "grad_norm": 0.3566047611610826, + "learning_rate": 6.23222107402134e-05, + "loss": 1.2574, + "step": 454 + }, + { + "epoch": 1.318840579710145, + "grad_norm": 0.3444110335156092, + "learning_rate": 6.217037106023527e-05, + "loss": 1.2158, + "step": 455 + }, + { + "epoch": 1.3217391304347825, + "grad_norm": 0.34800059904629854, + "learning_rate": 6.201841198107724e-05, + "loss": 1.2691, + "step": 456 + }, + { + "epoch": 1.3246376811594203, + "grad_norm": 0.3704659760771806, + "learning_rate": 6.186633499355576e-05, + "loss": 1.1669, + "step": 457 + }, + { + "epoch": 1.327536231884058, + "grad_norm": 0.35589030087499396, + "learning_rate": 6.171414158964402e-05, + "loss": 1.2421, + "step": 458 + }, + { + "epoch": 1.3304347826086955, + "grad_norm": 0.41000043026343475, + "learning_rate": 6.156183326245738e-05, + "loss": 1.1528, + "step": 459 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3545298846533197, + "learning_rate": 6.140941150623865e-05, + "loss": 1.3154, + "step": 460 + }, + { + "epoch": 1.336231884057971, + "grad_norm": 0.3632756192190139, + "learning_rate": 6.12568778163434e-05, + "loss": 1.2769, + "step": 461 + }, + { + "epoch": 1.3391304347826087, + "grad_norm": 0.3766419178772542, + "learning_rate": 6.110423368922544e-05, + "loss": 1.215, + "step": 462 + }, + { + "epoch": 1.3420289855072465, + "grad_norm": 0.35769930623122026, + "learning_rate": 6.095148062242196e-05, + "loss": 1.2226, + "step": 463 + }, + { + "epoch": 1.344927536231884, + "grad_norm": 0.3652620834683046, + "learning_rate": 6.079862011453893e-05, + "loss": 1.2217, + "step": 464 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.37380916243000584, + "learning_rate": 6.064565366523641e-05, + "loss": 1.2051, + "step": 465 + }, + { + "epoch": 1.3507246376811595, + "grad_norm": 0.38594446149133127, + "learning_rate": 6.0492582775213825e-05, + "loss": 1.2652, + "step": 466 + }, + { + "epoch": 1.353623188405797, + "grad_norm": 0.3461990145984557, + "learning_rate": 6.0339408946195185e-05, + "loss": 1.2554, + "step": 467 + }, + { + "epoch": 1.3565217391304347, + "grad_norm": 0.3748678338524721, + "learning_rate": 6.0186133680914445e-05, + "loss": 1.191, + "step": 468 + }, + { + "epoch": 1.3594202898550725, + "grad_norm": 0.37370664196717224, + "learning_rate": 6.003275848310067e-05, + "loss": 1.2706, + "step": 469 + }, + { + "epoch": 1.3623188405797102, + "grad_norm": 0.36194306306178214, + "learning_rate": 5.9879284857463356e-05, + "loss": 1.2187, + "step": 470 + }, + { + "epoch": 1.365217391304348, + "grad_norm": 0.36087008057820225, + "learning_rate": 5.972571430967764e-05, + "loss": 1.2456, + "step": 471 + }, + { + "epoch": 1.3681159420289855, + "grad_norm": 0.36273835372082425, + "learning_rate": 5.9572048346369515e-05, + "loss": 1.2277, + "step": 472 + }, + { + "epoch": 1.3710144927536232, + "grad_norm": 0.37085205673967797, + "learning_rate": 5.941828847510108e-05, + "loss": 1.2768, + "step": 473 + }, + { + "epoch": 1.373913043478261, + "grad_norm": 0.3755185129215953, + "learning_rate": 5.9264436204355724e-05, + "loss": 1.2031, + "step": 474 + }, + { + "epoch": 1.3768115942028984, + "grad_norm": 0.37382431917426745, + "learning_rate": 5.911049304352332e-05, + "loss": 1.2843, + "step": 475 + }, + { + "epoch": 1.3797101449275362, + "grad_norm": 0.37855680727333874, + "learning_rate": 5.895646050288543e-05, + "loss": 1.2912, + "step": 476 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 0.3654439184708917, + "learning_rate": 5.8802340093600495e-05, + "loss": 1.2292, + "step": 477 + }, + { + "epoch": 1.3855072463768117, + "grad_norm": 0.3846140132825601, + "learning_rate": 5.8648133327689036e-05, + "loss": 1.2675, + "step": 478 + }, + { + "epoch": 1.3884057971014494, + "grad_norm": 0.3766180728314526, + "learning_rate": 5.849384171801876e-05, + "loss": 1.205, + "step": 479 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.35496774282385274, + "learning_rate": 5.8339466778289745e-05, + "loss": 1.2035, + "step": 480 + }, + { + "epoch": 1.3942028985507247, + "grad_norm": 0.35882380091220856, + "learning_rate": 5.818501002301959e-05, + "loss": 1.2047, + "step": 481 + }, + { + "epoch": 1.3971014492753624, + "grad_norm": 0.36361359874976407, + "learning_rate": 5.803047296752856e-05, + "loss": 1.2068, + "step": 482 + }, + { + "epoch": 1.4, + "grad_norm": 0.35304052394158203, + "learning_rate": 5.7875857127924704e-05, + "loss": 1.2039, + "step": 483 + }, + { + "epoch": 1.4028985507246376, + "grad_norm": 0.3767536613499123, + "learning_rate": 5.772116402108903e-05, + "loss": 1.1734, + "step": 484 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 0.3673108485371312, + "learning_rate": 5.756639516466056e-05, + "loss": 1.2631, + "step": 485 + }, + { + "epoch": 1.4086956521739131, + "grad_norm": 0.37033398981771753, + "learning_rate": 5.741155207702146e-05, + "loss": 1.2284, + "step": 486 + }, + { + "epoch": 1.4115942028985506, + "grad_norm": 0.3803519741849858, + "learning_rate": 5.7256636277282193e-05, + "loss": 1.2512, + "step": 487 + }, + { + "epoch": 1.4144927536231884, + "grad_norm": 0.3822460303571093, + "learning_rate": 5.7101649285266524e-05, + "loss": 1.2285, + "step": 488 + }, + { + "epoch": 1.4173913043478261, + "grad_norm": 0.366694568605544, + "learning_rate": 5.694659262149666e-05, + "loss": 1.2652, + "step": 489 + }, + { + "epoch": 1.4202898550724639, + "grad_norm": 0.3599613129529298, + "learning_rate": 5.679146780717841e-05, + "loss": 1.199, + "step": 490 + }, + { + "epoch": 1.4231884057971014, + "grad_norm": 0.36225487078774454, + "learning_rate": 5.6636276364186105e-05, + "loss": 1.1848, + "step": 491 + }, + { + "epoch": 1.4260869565217391, + "grad_norm": 0.3599718189253672, + "learning_rate": 5.648101981504775e-05, + "loss": 1.2082, + "step": 492 + }, + { + "epoch": 1.4289855072463769, + "grad_norm": 0.37863788166143847, + "learning_rate": 5.6325699682930145e-05, + "loss": 1.2391, + "step": 493 + }, + { + "epoch": 1.4318840579710144, + "grad_norm": 0.3803432660363016, + "learning_rate": 5.617031749162381e-05, + "loss": 1.161, + "step": 494 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.35786784027090707, + "learning_rate": 5.6014874765528124e-05, + "loss": 1.2861, + "step": 495 + }, + { + "epoch": 1.4376811594202898, + "grad_norm": 0.3642405560037894, + "learning_rate": 5.58593730296364e-05, + "loss": 1.2349, + "step": 496 + }, + { + "epoch": 1.4405797101449276, + "grad_norm": 0.369598439136747, + "learning_rate": 5.57038138095208e-05, + "loss": 1.285, + "step": 497 + }, + { + "epoch": 1.4434782608695653, + "grad_norm": 0.3555670502464068, + "learning_rate": 5.5548198631317494e-05, + "loss": 1.2145, + "step": 498 + }, + { + "epoch": 1.4463768115942028, + "grad_norm": 0.376327361594081, + "learning_rate": 5.539252902171164e-05, + "loss": 1.2245, + "step": 499 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.37654715270476347, + "learning_rate": 5.523680650792237e-05, + "loss": 1.2419, + "step": 500 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 0.5779377636764227, + "learning_rate": 5.508103261768783e-05, + "loss": 1.239, + "step": 501 + }, + { + "epoch": 1.4550724637681158, + "grad_norm": 0.37430911277789075, + "learning_rate": 5.492520887925028e-05, + "loss": 1.2577, + "step": 502 + }, + { + "epoch": 1.4579710144927536, + "grad_norm": 0.36147621449440515, + "learning_rate": 5.4769336821340936e-05, + "loss": 1.2851, + "step": 503 + }, + { + "epoch": 1.4608695652173913, + "grad_norm": 0.3731800543772072, + "learning_rate": 5.4613417973165106e-05, + "loss": 1.1851, + "step": 504 + }, + { + "epoch": 1.463768115942029, + "grad_norm": 0.38025435659821, + "learning_rate": 5.445745386438713e-05, + "loss": 1.2853, + "step": 505 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.3806710140744915, + "learning_rate": 5.430144602511539e-05, + "loss": 1.2698, + "step": 506 + }, + { + "epoch": 1.4695652173913043, + "grad_norm": 0.40891604532181375, + "learning_rate": 5.4145395985887246e-05, + "loss": 1.2388, + "step": 507 + }, + { + "epoch": 1.472463768115942, + "grad_norm": 0.3545961610157745, + "learning_rate": 5.3989305277654156e-05, + "loss": 1.19, + "step": 508 + }, + { + "epoch": 1.4753623188405798, + "grad_norm": 0.3648442660384036, + "learning_rate": 5.383317543176649e-05, + "loss": 1.203, + "step": 509 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.3850663135269365, + "learning_rate": 5.367700797995863e-05, + "loss": 1.2297, + "step": 510 + }, + { + "epoch": 1.481159420289855, + "grad_norm": 0.35394244670279573, + "learning_rate": 5.352080445433385e-05, + "loss": 1.2044, + "step": 511 + }, + { + "epoch": 1.4840579710144928, + "grad_norm": 0.3866450435083724, + "learning_rate": 5.336456638734938e-05, + "loss": 1.2203, + "step": 512 + }, + { + "epoch": 1.4869565217391305, + "grad_norm": 0.3800225621052723, + "learning_rate": 5.320829531180128e-05, + "loss": 1.2147, + "step": 513 + }, + { + "epoch": 1.4898550724637682, + "grad_norm": 0.37391354192034965, + "learning_rate": 5.30519927608095e-05, + "loss": 1.2173, + "step": 514 + }, + { + "epoch": 1.4927536231884058, + "grad_norm": 0.3908730346775049, + "learning_rate": 5.2895660267802714e-05, + "loss": 1.179, + "step": 515 + }, + { + "epoch": 1.4956521739130435, + "grad_norm": 0.3797397244263353, + "learning_rate": 5.27392993665034e-05, + "loss": 1.2397, + "step": 516 + }, + { + "epoch": 1.4985507246376812, + "grad_norm": 0.3698351874885442, + "learning_rate": 5.258291159091273e-05, + "loss": 1.292, + "step": 517 + }, + { + "epoch": 1.5014492753623188, + "grad_norm": 0.3680512756549276, + "learning_rate": 5.242649847529551e-05, + "loss": 1.1788, + "step": 518 + }, + { + "epoch": 1.5043478260869565, + "grad_norm": 0.3603216123639398, + "learning_rate": 5.227006155416517e-05, + "loss": 1.1539, + "step": 519 + }, + { + "epoch": 1.5072463768115942, + "grad_norm": 0.3830020055397342, + "learning_rate": 5.2113602362268674e-05, + "loss": 1.1658, + "step": 520 + }, + { + "epoch": 1.5101449275362318, + "grad_norm": 0.37049306835431794, + "learning_rate": 5.1957122434571485e-05, + "loss": 1.2754, + "step": 521 + }, + { + "epoch": 1.5130434782608697, + "grad_norm": 0.36878581085745593, + "learning_rate": 5.180062330624248e-05, + "loss": 1.26, + "step": 522 + }, + { + "epoch": 1.5159420289855072, + "grad_norm": 0.3932729911977662, + "learning_rate": 5.164410651263895e-05, + "loss": 1.2411, + "step": 523 + }, + { + "epoch": 1.518840579710145, + "grad_norm": 0.37380205081558054, + "learning_rate": 5.1487573589291424e-05, + "loss": 1.2778, + "step": 524 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.39041353684960733, + "learning_rate": 5.133102607188874e-05, + "loss": 1.1484, + "step": 525 + }, + { + "epoch": 1.5246376811594202, + "grad_norm": 0.37594098481535654, + "learning_rate": 5.117446549626289e-05, + "loss": 1.2161, + "step": 526 + }, + { + "epoch": 1.527536231884058, + "grad_norm": 0.38365451143587687, + "learning_rate": 5.101789339837396e-05, + "loss": 1.2256, + "step": 527 + }, + { + "epoch": 1.5304347826086957, + "grad_norm": 0.3855037750389005, + "learning_rate": 5.086131131429509e-05, + "loss": 1.2209, + "step": 528 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.3890790766439738, + "learning_rate": 5.07047207801974e-05, + "loss": 1.2338, + "step": 529 + }, + { + "epoch": 1.5362318840579712, + "grad_norm": 0.3700881037410359, + "learning_rate": 5.0548123332334896e-05, + "loss": 1.2475, + "step": 530 + }, + { + "epoch": 1.5391304347826087, + "grad_norm": 0.3743561390377829, + "learning_rate": 5.0391520507029424e-05, + "loss": 1.2239, + "step": 531 + }, + { + "epoch": 1.5420289855072464, + "grad_norm": 0.37802774104497083, + "learning_rate": 5.023491384065555e-05, + "loss": 1.2324, + "step": 532 + }, + { + "epoch": 1.5449275362318842, + "grad_norm": 0.36820878715854055, + "learning_rate": 5.0078304869625595e-05, + "loss": 1.2404, + "step": 533 + }, + { + "epoch": 1.5478260869565217, + "grad_norm": 0.3632460544127689, + "learning_rate": 4.992169513037441e-05, + "loss": 1.177, + "step": 534 + }, + { + "epoch": 1.5507246376811594, + "grad_norm": 0.3683252664871912, + "learning_rate": 4.9765086159344445e-05, + "loss": 1.182, + "step": 535 + }, + { + "epoch": 1.5536231884057972, + "grad_norm": 0.3831233196950789, + "learning_rate": 4.9608479492970594e-05, + "loss": 1.1991, + "step": 536 + }, + { + "epoch": 1.5565217391304347, + "grad_norm": 0.37245646640167623, + "learning_rate": 4.9451876667665116e-05, + "loss": 1.2376, + "step": 537 + }, + { + "epoch": 1.5594202898550724, + "grad_norm": 0.36522555829264214, + "learning_rate": 4.929527921980261e-05, + "loss": 1.2871, + "step": 538 + }, + { + "epoch": 1.5623188405797102, + "grad_norm": 0.35901097232709117, + "learning_rate": 4.9138688685704916e-05, + "loss": 1.2094, + "step": 539 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.3520423753812632, + "learning_rate": 4.898210660162605e-05, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 1.5681159420289856, + "grad_norm": 0.40852366010005403, + "learning_rate": 4.882553450373712e-05, + "loss": 1.2352, + "step": 541 + }, + { + "epoch": 1.5710144927536231, + "grad_norm": 0.3651205273751799, + "learning_rate": 4.866897392811126e-05, + "loss": 1.222, + "step": 542 + }, + { + "epoch": 1.5739130434782609, + "grad_norm": 0.3699594416077427, + "learning_rate": 4.851242641070859e-05, + "loss": 1.2149, + "step": 543 + }, + { + "epoch": 1.5768115942028986, + "grad_norm": 0.38193530242722756, + "learning_rate": 4.8355893487361084e-05, + "loss": 1.2766, + "step": 544 + }, + { + "epoch": 1.5797101449275361, + "grad_norm": 0.38568456101700965, + "learning_rate": 4.8199376693757544e-05, + "loss": 1.2844, + "step": 545 + }, + { + "epoch": 1.5826086956521739, + "grad_norm": 0.36059528632874444, + "learning_rate": 4.804287756542852e-05, + "loss": 1.2726, + "step": 546 + }, + { + "epoch": 1.5855072463768116, + "grad_norm": 0.36513879678761724, + "learning_rate": 4.788639763773133e-05, + "loss": 1.1763, + "step": 547 + }, + { + "epoch": 1.5884057971014491, + "grad_norm": 0.387466168821441, + "learning_rate": 4.772993844583483e-05, + "loss": 1.2544, + "step": 548 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 0.5520887828224808, + "learning_rate": 4.75735015247045e-05, + "loss": 1.2285, + "step": 549 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.389584382030089, + "learning_rate": 4.7417088409087285e-05, + "loss": 1.2463, + "step": 550 + }, + { + "epoch": 1.5971014492753624, + "grad_norm": 0.3963144528047638, + "learning_rate": 4.7260700633496605e-05, + "loss": 1.1914, + "step": 551 + }, + { + "epoch": 1.6, + "grad_norm": 0.36855199490556523, + "learning_rate": 4.71043397321973e-05, + "loss": 1.2395, + "step": 552 + }, + { + "epoch": 1.6028985507246376, + "grad_norm": 0.3887397654253079, + "learning_rate": 4.6948007239190514e-05, + "loss": 1.2639, + "step": 553 + }, + { + "epoch": 1.6057971014492753, + "grad_norm": 0.3697755928376452, + "learning_rate": 4.6791704688198724e-05, + "loss": 1.1648, + "step": 554 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.38405410279449403, + "learning_rate": 4.663543361265064e-05, + "loss": 1.2424, + "step": 555 + }, + { + "epoch": 1.6115942028985506, + "grad_norm": 0.36889274593199667, + "learning_rate": 4.647919554566616e-05, + "loss": 1.2037, + "step": 556 + }, + { + "epoch": 1.6144927536231886, + "grad_norm": 0.38742028194651634, + "learning_rate": 4.63229920200414e-05, + "loss": 1.144, + "step": 557 + }, + { + "epoch": 1.617391304347826, + "grad_norm": 0.3771419221596441, + "learning_rate": 4.61668245682335e-05, + "loss": 1.2386, + "step": 558 + }, + { + "epoch": 1.6202898550724638, + "grad_norm": 0.36745992758167406, + "learning_rate": 4.601069472234584e-05, + "loss": 1.2439, + "step": 559 + }, + { + "epoch": 1.6231884057971016, + "grad_norm": 0.37299246443958567, + "learning_rate": 4.585460401411275e-05, + "loss": 1.1891, + "step": 560 + }, + { + "epoch": 1.626086956521739, + "grad_norm": 0.39436742226379295, + "learning_rate": 4.569855397488462e-05, + "loss": 1.2345, + "step": 561 + }, + { + "epoch": 1.6289855072463768, + "grad_norm": 0.38332200212622664, + "learning_rate": 4.554254613561289e-05, + "loss": 1.221, + "step": 562 + }, + { + "epoch": 1.6318840579710145, + "grad_norm": 0.3668234731737798, + "learning_rate": 4.5386582026834906e-05, + "loss": 1.1407, + "step": 563 + }, + { + "epoch": 1.634782608695652, + "grad_norm": 0.3886901538482464, + "learning_rate": 4.5230663178659075e-05, + "loss": 1.2372, + "step": 564 + }, + { + "epoch": 1.6376811594202898, + "grad_norm": 0.3690709201915018, + "learning_rate": 4.507479112074974e-05, + "loss": 1.2135, + "step": 565 + }, + { + "epoch": 1.6405797101449275, + "grad_norm": 0.36879231080045594, + "learning_rate": 4.491896738231218e-05, + "loss": 1.1641, + "step": 566 + }, + { + "epoch": 1.643478260869565, + "grad_norm": 0.36645636944065885, + "learning_rate": 4.476319349207766e-05, + "loss": 1.1852, + "step": 567 + }, + { + "epoch": 1.646376811594203, + "grad_norm": 0.3431665404786532, + "learning_rate": 4.460747097828838e-05, + "loss": 1.1573, + "step": 568 + }, + { + "epoch": 1.6492753623188405, + "grad_norm": 0.3758095567042996, + "learning_rate": 4.445180136868252e-05, + "loss": 1.2862, + "step": 569 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.3747562731763405, + "learning_rate": 4.4296186190479203e-05, + "loss": 1.2232, + "step": 570 + }, + { + "epoch": 1.655072463768116, + "grad_norm": 0.3680948045233427, + "learning_rate": 4.414062697036361e-05, + "loss": 1.2261, + "step": 571 + }, + { + "epoch": 1.6579710144927535, + "grad_norm": 0.3951307328237191, + "learning_rate": 4.3985125234471874e-05, + "loss": 1.2456, + "step": 572 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 0.39734232299660693, + "learning_rate": 4.3829682508376194e-05, + "loss": 1.1953, + "step": 573 + }, + { + "epoch": 1.663768115942029, + "grad_norm": 0.3784998636514162, + "learning_rate": 4.367430031706987e-05, + "loss": 1.2367, + "step": 574 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.39715845084791845, + "learning_rate": 4.351898018495225e-05, + "loss": 1.2279, + "step": 575 + }, + { + "epoch": 1.6695652173913045, + "grad_norm": 0.378181731966129, + "learning_rate": 4.336372363581391e-05, + "loss": 1.2075, + "step": 576 + }, + { + "epoch": 1.672463768115942, + "grad_norm": 0.3690996052960561, + "learning_rate": 4.32085321928216e-05, + "loss": 1.0945, + "step": 577 + }, + { + "epoch": 1.6753623188405797, + "grad_norm": 0.3661279761386217, + "learning_rate": 4.305340737850334e-05, + "loss": 1.2039, + "step": 578 + }, + { + "epoch": 1.6782608695652175, + "grad_norm": 0.3703501070974622, + "learning_rate": 4.28983507147335e-05, + "loss": 1.1634, + "step": 579 + }, + { + "epoch": 1.681159420289855, + "grad_norm": 0.37705477138544613, + "learning_rate": 4.2743363722717825e-05, + "loss": 1.233, + "step": 580 + }, + { + "epoch": 1.6840579710144927, + "grad_norm": 0.37944231677619733, + "learning_rate": 4.258844792297855e-05, + "loss": 1.2484, + "step": 581 + }, + { + "epoch": 1.6869565217391305, + "grad_norm": 0.36121328853497303, + "learning_rate": 4.2433604835339445e-05, + "loss": 1.2517, + "step": 582 + }, + { + "epoch": 1.689855072463768, + "grad_norm": 0.3658490072297351, + "learning_rate": 4.227883597891098e-05, + "loss": 1.2833, + "step": 583 + }, + { + "epoch": 1.692753623188406, + "grad_norm": 0.3742426427268219, + "learning_rate": 4.21241428720753e-05, + "loss": 1.2188, + "step": 584 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.3833395112583662, + "learning_rate": 4.196952703247145e-05, + "loss": 1.265, + "step": 585 + }, + { + "epoch": 1.6985507246376812, + "grad_norm": 0.36472794357808286, + "learning_rate": 4.181498997698042e-05, + "loss": 1.1679, + "step": 586 + }, + { + "epoch": 1.701449275362319, + "grad_norm": 0.36498141790011873, + "learning_rate": 4.1660533221710266e-05, + "loss": 1.2138, + "step": 587 + }, + { + "epoch": 1.7043478260869565, + "grad_norm": 0.37102421652558093, + "learning_rate": 4.150615828198125e-05, + "loss": 1.2176, + "step": 588 + }, + { + "epoch": 1.7072463768115942, + "grad_norm": 0.36544210520658216, + "learning_rate": 4.135186667231097e-05, + "loss": 1.2098, + "step": 589 + }, + { + "epoch": 1.710144927536232, + "grad_norm": 0.3612434641690313, + "learning_rate": 4.119765990639952e-05, + "loss": 1.1763, + "step": 590 + }, + { + "epoch": 1.7130434782608694, + "grad_norm": 0.3620969506592556, + "learning_rate": 4.1043539497114605e-05, + "loss": 1.1872, + "step": 591 + }, + { + "epoch": 1.7159420289855074, + "grad_norm": 0.39393702299078354, + "learning_rate": 4.088950695647671e-05, + "loss": 1.2687, + "step": 592 + }, + { + "epoch": 1.718840579710145, + "grad_norm": 0.3817467440217286, + "learning_rate": 4.0735563795644294e-05, + "loss": 1.2771, + "step": 593 + }, + { + "epoch": 1.7217391304347827, + "grad_norm": 0.3927298023358771, + "learning_rate": 4.058171152489891e-05, + "loss": 1.2733, + "step": 594 + }, + { + "epoch": 1.7246376811594204, + "grad_norm": 0.3674064366862089, + "learning_rate": 4.042795165363048e-05, + "loss": 1.2438, + "step": 595 + }, + { + "epoch": 1.727536231884058, + "grad_norm": 0.3719771458126402, + "learning_rate": 4.0274285690322366e-05, + "loss": 1.2539, + "step": 596 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 0.37286309136721435, + "learning_rate": 4.012071514253665e-05, + "loss": 1.2219, + "step": 597 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.37200008726902983, + "learning_rate": 3.996724151689934e-05, + "loss": 1.1937, + "step": 598 + }, + { + "epoch": 1.736231884057971, + "grad_norm": 0.3769662425580422, + "learning_rate": 3.981386631908557e-05, + "loss": 1.1795, + "step": 599 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.38896295738805997, + "learning_rate": 3.966059105380483e-05, + "loss": 1.262, + "step": 600 + }, + { + "epoch": 1.7420289855072464, + "grad_norm": 0.38088532712001094, + "learning_rate": 3.9507417224786193e-05, + "loss": 1.2626, + "step": 601 + }, + { + "epoch": 1.744927536231884, + "grad_norm": 0.3906788265447541, + "learning_rate": 3.93543463347636e-05, + "loss": 1.1918, + "step": 602 + }, + { + "epoch": 1.7478260869565219, + "grad_norm": 0.3691860050404467, + "learning_rate": 3.920137988546109e-05, + "loss": 1.1616, + "step": 603 + }, + { + "epoch": 1.7507246376811594, + "grad_norm": 0.3792592507880301, + "learning_rate": 3.9048519377578064e-05, + "loss": 1.1926, + "step": 604 + }, + { + "epoch": 1.7536231884057971, + "grad_norm": 0.37902398772592705, + "learning_rate": 3.8895766310774574e-05, + "loss": 1.3234, + "step": 605 + }, + { + "epoch": 1.7565217391304349, + "grad_norm": 0.3808967277084784, + "learning_rate": 3.87431221836566e-05, + "loss": 1.2678, + "step": 606 + }, + { + "epoch": 1.7594202898550724, + "grad_norm": 0.3768612203952316, + "learning_rate": 3.859058849376136e-05, + "loss": 1.2442, + "step": 607 + }, + { + "epoch": 1.76231884057971, + "grad_norm": 0.3661782288025134, + "learning_rate": 3.843816673754262e-05, + "loss": 1.2757, + "step": 608 + }, + { + "epoch": 1.7652173913043478, + "grad_norm": 0.3746443716611926, + "learning_rate": 3.8285858410355984e-05, + "loss": 1.234, + "step": 609 + }, + { + "epoch": 1.7681159420289854, + "grad_norm": 0.38619920952815956, + "learning_rate": 3.8133665006444255e-05, + "loss": 1.2229, + "step": 610 + }, + { + "epoch": 1.7710144927536233, + "grad_norm": 0.37016562757932, + "learning_rate": 3.798158801892277e-05, + "loss": 1.2112, + "step": 611 + }, + { + "epoch": 1.7739130434782608, + "grad_norm": 0.39144763721074394, + "learning_rate": 3.782962893976475e-05, + "loss": 1.1941, + "step": 612 + }, + { + "epoch": 1.7768115942028986, + "grad_norm": 0.372157745001237, + "learning_rate": 3.7677789259786615e-05, + "loss": 1.1607, + "step": 613 + }, + { + "epoch": 1.7797101449275363, + "grad_norm": 0.38017415387323344, + "learning_rate": 3.7526070468633464e-05, + "loss": 1.2251, + "step": 614 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.3764265620005903, + "learning_rate": 3.737447405476436e-05, + "loss": 1.2389, + "step": 615 + }, + { + "epoch": 1.7855072463768116, + "grad_norm": 0.36301297876352934, + "learning_rate": 3.7223001505437775e-05, + "loss": 1.1647, + "step": 616 + }, + { + "epoch": 1.7884057971014493, + "grad_norm": 0.3589005180459851, + "learning_rate": 3.7071654306697003e-05, + "loss": 1.2044, + "step": 617 + }, + { + "epoch": 1.7913043478260868, + "grad_norm": 0.38118628063662097, + "learning_rate": 3.692043394335556e-05, + "loss": 1.2063, + "step": 618 + }, + { + "epoch": 1.7942028985507248, + "grad_norm": 0.37713318727543105, + "learning_rate": 3.676934189898259e-05, + "loss": 1.3151, + "step": 619 + }, + { + "epoch": 1.7971014492753623, + "grad_norm": 0.38497109120391243, + "learning_rate": 3.661837965588842e-05, + "loss": 1.1582, + "step": 620 + }, + { + "epoch": 1.8, + "grad_norm": 0.3958884224922945, + "learning_rate": 3.646754869510988e-05, + "loss": 1.2598, + "step": 621 + }, + { + "epoch": 1.8028985507246378, + "grad_norm": 0.370532843067504, + "learning_rate": 3.631685049639586e-05, + "loss": 1.2128, + "step": 622 + }, + { + "epoch": 1.8057971014492753, + "grad_norm": 0.40047093677653156, + "learning_rate": 3.616628653819276e-05, + "loss": 1.2316, + "step": 623 + }, + { + "epoch": 1.808695652173913, + "grad_norm": 0.37643906872365784, + "learning_rate": 3.6015858297630004e-05, + "loss": 1.2171, + "step": 624 + }, + { + "epoch": 1.8115942028985508, + "grad_norm": 0.39490427844818465, + "learning_rate": 3.5865567250505536e-05, + "loss": 1.2416, + "step": 625 + }, + { + "epoch": 1.8144927536231883, + "grad_norm": 0.3631993323865769, + "learning_rate": 3.5715414871271336e-05, + "loss": 1.2147, + "step": 626 + }, + { + "epoch": 1.8173913043478263, + "grad_norm": 0.35840772617807537, + "learning_rate": 3.556540263301896e-05, + "loss": 1.2015, + "step": 627 + }, + { + "epoch": 1.8202898550724638, + "grad_norm": 0.3791997912963071, + "learning_rate": 3.541553200746511e-05, + "loss": 1.1583, + "step": 628 + }, + { + "epoch": 1.8231884057971013, + "grad_norm": 0.37805560040982356, + "learning_rate": 3.526580446493717e-05, + "loss": 1.2238, + "step": 629 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.382383828357578, + "learning_rate": 3.511622147435877e-05, + "loss": 1.2201, + "step": 630 + }, + { + "epoch": 1.8289855072463768, + "grad_norm": 0.38874429445479597, + "learning_rate": 3.4966784503235394e-05, + "loss": 1.2319, + "step": 631 + }, + { + "epoch": 1.8318840579710145, + "grad_norm": 0.38625077800174934, + "learning_rate": 3.481749501764002e-05, + "loss": 1.2326, + "step": 632 + }, + { + "epoch": 1.8347826086956522, + "grad_norm": 0.37805590288266955, + "learning_rate": 3.466835448219867e-05, + "loss": 1.2072, + "step": 633 + }, + { + "epoch": 1.8376811594202898, + "grad_norm": 0.3876007771372343, + "learning_rate": 3.4519364360076045e-05, + "loss": 1.2188, + "step": 634 + }, + { + "epoch": 1.8405797101449275, + "grad_norm": 0.36997413690862124, + "learning_rate": 3.437052611296123e-05, + "loss": 1.2974, + "step": 635 + }, + { + "epoch": 1.8434782608695652, + "grad_norm": 0.38893326272743267, + "learning_rate": 3.422184120105331e-05, + "loss": 1.2325, + "step": 636 + }, + { + "epoch": 1.8463768115942027, + "grad_norm": 0.38534863103441785, + "learning_rate": 3.407331108304704e-05, + "loss": 1.2881, + "step": 637 + }, + { + "epoch": 1.8492753623188407, + "grad_norm": 0.35237887662066153, + "learning_rate": 3.392493721611857e-05, + "loss": 1.1636, + "step": 638 + }, + { + "epoch": 1.8521739130434782, + "grad_norm": 0.3522129349688945, + "learning_rate": 3.37767210559111e-05, + "loss": 1.2069, + "step": 639 + }, + { + "epoch": 1.855072463768116, + "grad_norm": 0.3828825108660318, + "learning_rate": 3.3628664056520645e-05, + "loss": 1.1511, + "step": 640 + }, + { + "epoch": 1.8579710144927537, + "grad_norm": 0.38984016931652277, + "learning_rate": 3.348076767048174e-05, + "loss": 1.2204, + "step": 641 + }, + { + "epoch": 1.8608695652173912, + "grad_norm": 0.36523507158461577, + "learning_rate": 3.3333033348753196e-05, + "loss": 1.262, + "step": 642 + }, + { + "epoch": 1.863768115942029, + "grad_norm": 0.37220367890890976, + "learning_rate": 3.3185462540703874e-05, + "loss": 1.2262, + "step": 643 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.3694812470086758, + "learning_rate": 3.303805669409848e-05, + "loss": 1.2474, + "step": 644 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.36698538082460586, + "learning_rate": 3.289081725508333e-05, + "loss": 1.2088, + "step": 645 + }, + { + "epoch": 1.8724637681159422, + "grad_norm": 0.3778477738916828, + "learning_rate": 3.2743745668172135e-05, + "loss": 1.1314, + "step": 646 + }, + { + "epoch": 1.8753623188405797, + "grad_norm": 0.35885473738105417, + "learning_rate": 3.259684337623192e-05, + "loss": 1.1323, + "step": 647 + }, + { + "epoch": 1.8782608695652174, + "grad_norm": 0.3865523562816111, + "learning_rate": 3.245011182046881e-05, + "loss": 1.2147, + "step": 648 + }, + { + "epoch": 1.8811594202898552, + "grad_norm": 0.530703476143991, + "learning_rate": 3.230355244041387e-05, + "loss": 1.294, + "step": 649 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.37902082343553395, + "learning_rate": 3.215716667390905e-05, + "loss": 1.2446, + "step": 650 + }, + { + "epoch": 1.8869565217391304, + "grad_norm": 0.3635449013765209, + "learning_rate": 3.201095595709298e-05, + "loss": 1.1876, + "step": 651 + }, + { + "epoch": 1.8898550724637682, + "grad_norm": 0.38375684981250285, + "learning_rate": 3.1864921724387e-05, + "loss": 1.2511, + "step": 652 + }, + { + "epoch": 1.8927536231884057, + "grad_norm": 0.374887470810997, + "learning_rate": 3.1719065408481005e-05, + "loss": 1.2076, + "step": 653 + }, + { + "epoch": 1.8956521739130436, + "grad_norm": 0.3788733526902221, + "learning_rate": 3.1573388440319404e-05, + "loss": 1.1485, + "step": 654 + }, + { + "epoch": 1.8985507246376812, + "grad_norm": 0.37343821294935253, + "learning_rate": 3.142789224908709e-05, + "loss": 1.2417, + "step": 655 + }, + { + "epoch": 1.901449275362319, + "grad_norm": 0.36972719766904644, + "learning_rate": 3.128257826219544e-05, + "loss": 1.1924, + "step": 656 + }, + { + "epoch": 1.9043478260869566, + "grad_norm": 0.39152027197251665, + "learning_rate": 3.1137447905268264e-05, + "loss": 1.2334, + "step": 657 + }, + { + "epoch": 1.9072463768115941, + "grad_norm": 0.3793593937622258, + "learning_rate": 3.099250260212785e-05, + "loss": 1.2044, + "step": 658 + }, + { + "epoch": 1.9101449275362319, + "grad_norm": 0.37274932277970574, + "learning_rate": 3.0847743774781e-05, + "loss": 1.2396, + "step": 659 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.3917130499161079, + "learning_rate": 3.070317284340505e-05, + "loss": 1.2224, + "step": 660 + }, + { + "epoch": 1.9159420289855071, + "grad_norm": 0.3730432872342999, + "learning_rate": 3.055879122633397e-05, + "loss": 1.1523, + "step": 661 + }, + { + "epoch": 1.9188405797101449, + "grad_norm": 0.38603243505310325, + "learning_rate": 3.041460034004443e-05, + "loss": 1.2139, + "step": 662 + }, + { + "epoch": 1.9217391304347826, + "grad_norm": 0.3705238103870671, + "learning_rate": 3.0270601599141912e-05, + "loss": 1.2359, + "step": 663 + }, + { + "epoch": 1.9246376811594201, + "grad_norm": 0.37597496158367705, + "learning_rate": 3.0126796416346814e-05, + "loss": 1.2185, + "step": 664 + }, + { + "epoch": 1.927536231884058, + "grad_norm": 0.3685212983823541, + "learning_rate": 2.9983186202480623e-05, + "loss": 1.1696, + "step": 665 + }, + { + "epoch": 1.9304347826086956, + "grad_norm": 0.369031802362704, + "learning_rate": 2.9839772366452035e-05, + "loss": 1.1996, + "step": 666 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.37822154642489714, + "learning_rate": 2.969655631524316e-05, + "loss": 1.2732, + "step": 667 + }, + { + "epoch": 1.936231884057971, + "grad_norm": 0.37245983427478613, + "learning_rate": 2.9553539453895755e-05, + "loss": 1.2615, + "step": 668 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 0.3778250952875639, + "learning_rate": 2.9410723185497324e-05, + "loss": 1.2146, + "step": 669 + }, + { + "epoch": 1.9420289855072463, + "grad_norm": 0.3745452473168881, + "learning_rate": 2.9268108911167457e-05, + "loss": 1.2042, + "step": 670 + }, + { + "epoch": 1.944927536231884, + "grad_norm": 0.37312413882240314, + "learning_rate": 2.9125698030044068e-05, + "loss": 1.1911, + "step": 671 + }, + { + "epoch": 1.9478260869565216, + "grad_norm": 0.4061345062579341, + "learning_rate": 2.8983491939269634e-05, + "loss": 1.2611, + "step": 672 + }, + { + "epoch": 1.9507246376811596, + "grad_norm": 0.3849328956575118, + "learning_rate": 2.8841492033977503e-05, + "loss": 1.2108, + "step": 673 + }, + { + "epoch": 1.953623188405797, + "grad_norm": 0.38053458611756497, + "learning_rate": 2.8699699707278223e-05, + "loss": 1.2144, + "step": 674 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.39621473951535024, + "learning_rate": 2.8558116350245854e-05, + "loss": 1.2493, + "step": 675 + }, + { + "epoch": 1.9594202898550726, + "grad_norm": 0.3695671513205437, + "learning_rate": 2.841674335190434e-05, + "loss": 1.2519, + "step": 676 + }, + { + "epoch": 1.96231884057971, + "grad_norm": 0.3830315846006876, + "learning_rate": 2.827558209921386e-05, + "loss": 1.2074, + "step": 677 + }, + { + "epoch": 1.9652173913043478, + "grad_norm": 0.3877343629077828, + "learning_rate": 2.8134633977057235e-05, + "loss": 1.2333, + "step": 678 + }, + { + "epoch": 1.9681159420289855, + "grad_norm": 0.39689935141233373, + "learning_rate": 2.7993900368226333e-05, + "loss": 1.2128, + "step": 679 + }, + { + "epoch": 1.971014492753623, + "grad_norm": 0.37755832002907747, + "learning_rate": 2.785338265340852e-05, + "loss": 1.1728, + "step": 680 + }, + { + "epoch": 1.973913043478261, + "grad_norm": 0.38446867990310063, + "learning_rate": 2.771308221117309e-05, + "loss": 1.1602, + "step": 681 + }, + { + "epoch": 1.9768115942028985, + "grad_norm": 0.3785335064750929, + "learning_rate": 2.757300041795776e-05, + "loss": 1.2085, + "step": 682 + }, + { + "epoch": 1.9797101449275363, + "grad_norm": 0.3879694395220702, + "learning_rate": 2.7433138648055168e-05, + "loss": 1.2096, + "step": 683 + }, + { + "epoch": 1.982608695652174, + "grad_norm": 0.38604305997893856, + "learning_rate": 2.729349827359936e-05, + "loss": 1.2739, + "step": 684 + }, + { + "epoch": 1.9855072463768115, + "grad_norm": 0.3795112440774168, + "learning_rate": 2.715408066455236e-05, + "loss": 1.2666, + "step": 685 + }, + { + "epoch": 1.9884057971014493, + "grad_norm": 0.3625119163490855, + "learning_rate": 2.701488718869073e-05, + "loss": 1.2317, + "step": 686 + }, + { + "epoch": 1.991304347826087, + "grad_norm": 0.3680979908316257, + "learning_rate": 2.6875919211592137e-05, + "loss": 1.2673, + "step": 687 + }, + { + "epoch": 1.9942028985507245, + "grad_norm": 0.39366314079628106, + "learning_rate": 2.673717809662194e-05, + "loss": 1.215, + "step": 688 + }, + { + "epoch": 1.9971014492753625, + "grad_norm": 0.3711217421698582, + "learning_rate": 2.659866520491986e-05, + "loss": 1.2061, + "step": 689 + }, + { + "epoch": 2.0, + "grad_norm": 0.3619509926469052, + "learning_rate": 2.646038189538659e-05, + "loss": 1.0882, + "step": 690 + }, + { + "epoch": 2.0028985507246375, + "grad_norm": 0.36298590926269914, + "learning_rate": 2.632232952467047e-05, + "loss": 1.0538, + "step": 691 + }, + { + "epoch": 2.0057971014492755, + "grad_norm": 0.36532280808197115, + "learning_rate": 2.6184509447154193e-05, + "loss": 1.1357, + "step": 692 + }, + { + "epoch": 2.008695652173913, + "grad_norm": 0.39561521212011347, + "learning_rate": 2.6046923014941494e-05, + "loss": 0.9882, + "step": 693 + }, + { + "epoch": 2.0115942028985505, + "grad_norm": 0.3663184321766037, + "learning_rate": 2.5909571577843905e-05, + "loss": 1.0739, + "step": 694 + }, + { + "epoch": 2.0144927536231885, + "grad_norm": 0.3719396287060232, + "learning_rate": 2.5772456483367497e-05, + "loss": 1.0861, + "step": 695 + }, + { + "epoch": 2.017391304347826, + "grad_norm": 0.39175032329764664, + "learning_rate": 2.563557907669968e-05, + "loss": 1.0997, + "step": 696 + }, + { + "epoch": 2.020289855072464, + "grad_norm": 0.3842127505386081, + "learning_rate": 2.5498940700695978e-05, + "loss": 1.0833, + "step": 697 + }, + { + "epoch": 2.0231884057971015, + "grad_norm": 0.41296235407870646, + "learning_rate": 2.5362542695866885e-05, + "loss": 1.0784, + "step": 698 + }, + { + "epoch": 2.026086956521739, + "grad_norm": 0.40929280219103825, + "learning_rate": 2.5226386400364686e-05, + "loss": 1.0951, + "step": 699 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.39727740475543244, + "learning_rate": 2.5090473149970357e-05, + "loss": 0.9986, + "step": 700 + }, + { + "epoch": 2.0318840579710145, + "grad_norm": 0.39777015075034217, + "learning_rate": 2.4954804278080423e-05, + "loss": 1.0739, + "step": 701 + }, + { + "epoch": 2.034782608695652, + "grad_norm": 0.40515813767942754, + "learning_rate": 2.4819381115693923e-05, + "loss": 1.1273, + "step": 702 + }, + { + "epoch": 2.03768115942029, + "grad_norm": 0.3928754252415712, + "learning_rate": 2.4684204991399312e-05, + "loss": 1.0047, + "step": 703 + }, + { + "epoch": 2.0405797101449274, + "grad_norm": 0.39235743857450184, + "learning_rate": 2.4549277231361438e-05, + "loss": 1.0452, + "step": 704 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.41751282512992466, + "learning_rate": 2.4414599159308553e-05, + "loss": 1.0451, + "step": 705 + }, + { + "epoch": 2.046376811594203, + "grad_norm": 0.40629312672049445, + "learning_rate": 2.4280172096519298e-05, + "loss": 1.1042, + "step": 706 + }, + { + "epoch": 2.0492753623188404, + "grad_norm": 0.4057666557957047, + "learning_rate": 2.4145997361809758e-05, + "loss": 1.0483, + "step": 707 + }, + { + "epoch": 2.0521739130434784, + "grad_norm": 0.4116946242019697, + "learning_rate": 2.4012076271520495e-05, + "loss": 1.1184, + "step": 708 + }, + { + "epoch": 2.055072463768116, + "grad_norm": 0.4127782071588422, + "learning_rate": 2.3878410139503693e-05, + "loss": 1.1238, + "step": 709 + }, + { + "epoch": 2.0579710144927534, + "grad_norm": 0.3964820416953686, + "learning_rate": 2.3745000277110197e-05, + "loss": 1.0499, + "step": 710 + }, + { + "epoch": 2.0608695652173914, + "grad_norm": 0.43556452448044664, + "learning_rate": 2.36118479931767e-05, + "loss": 1.0943, + "step": 711 + }, + { + "epoch": 2.063768115942029, + "grad_norm": 0.3995865010547347, + "learning_rate": 2.347895459401288e-05, + "loss": 1.04, + "step": 712 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.4221661952062326, + "learning_rate": 2.334632138338859e-05, + "loss": 0.9803, + "step": 713 + }, + { + "epoch": 2.0695652173913044, + "grad_norm": 0.41950916776520863, + "learning_rate": 2.3213949662521066e-05, + "loss": 1.0886, + "step": 714 + }, + { + "epoch": 2.072463768115942, + "grad_norm": 0.4173493785071151, + "learning_rate": 2.308184073006216e-05, + "loss": 1.0596, + "step": 715 + }, + { + "epoch": 2.07536231884058, + "grad_norm": 0.39623286465989827, + "learning_rate": 2.2949995882085595e-05, + "loss": 1.0871, + "step": 716 + }, + { + "epoch": 2.0782608695652174, + "grad_norm": 0.39259310137723663, + "learning_rate": 2.2818416412074267e-05, + "loss": 1.0324, + "step": 717 + }, + { + "epoch": 2.081159420289855, + "grad_norm": 0.3822283284054439, + "learning_rate": 2.2687103610907534e-05, + "loss": 1.1117, + "step": 718 + }, + { + "epoch": 2.084057971014493, + "grad_norm": 0.407037401843374, + "learning_rate": 2.255605876684856e-05, + "loss": 1.0225, + "step": 719 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.4184329997154531, + "learning_rate": 2.2425283165531685e-05, + "loss": 1.0084, + "step": 720 + }, + { + "epoch": 2.0898550724637683, + "grad_norm": 0.4131172741343908, + "learning_rate": 2.22947780899498e-05, + "loss": 1.0207, + "step": 721 + }, + { + "epoch": 2.092753623188406, + "grad_norm": 0.4143196275192534, + "learning_rate": 2.216454482044176e-05, + "loss": 1.0337, + "step": 722 + }, + { + "epoch": 2.0956521739130434, + "grad_norm": 0.40754060408579984, + "learning_rate": 2.203458463467983e-05, + "loss": 1.1537, + "step": 723 + }, + { + "epoch": 2.0985507246376813, + "grad_norm": 0.42013725925992734, + "learning_rate": 2.1904898807657152e-05, + "loss": 0.9899, + "step": 724 + }, + { + "epoch": 2.101449275362319, + "grad_norm": 0.41687669776278075, + "learning_rate": 2.1775488611675233e-05, + "loss": 1.0832, + "step": 725 + }, + { + "epoch": 2.1043478260869564, + "grad_norm": 0.4286213604830879, + "learning_rate": 2.1646355316331458e-05, + "loss": 1.0802, + "step": 726 + }, + { + "epoch": 2.1072463768115943, + "grad_norm": 0.4042262579626966, + "learning_rate": 2.151750018850663e-05, + "loss": 1.0538, + "step": 727 + }, + { + "epoch": 2.110144927536232, + "grad_norm": 0.4010423956906586, + "learning_rate": 2.1388924492352565e-05, + "loss": 1.0897, + "step": 728 + }, + { + "epoch": 2.1130434782608694, + "grad_norm": 0.4120035283147293, + "learning_rate": 2.126062948927966e-05, + "loss": 1.1104, + "step": 729 + }, + { + "epoch": 2.1159420289855073, + "grad_norm": 0.4300470148265316, + "learning_rate": 2.1132616437944547e-05, + "loss": 1.0457, + "step": 730 + }, + { + "epoch": 2.118840579710145, + "grad_norm": 0.4153085209481317, + "learning_rate": 2.100488659423772e-05, + "loss": 1.0856, + "step": 731 + }, + { + "epoch": 2.121739130434783, + "grad_norm": 0.4060830438581685, + "learning_rate": 2.087744121127122e-05, + "loss": 1.0801, + "step": 732 + }, + { + "epoch": 2.1246376811594203, + "grad_norm": 0.4267224449360045, + "learning_rate": 2.075028153936636e-05, + "loss": 1.0158, + "step": 733 + }, + { + "epoch": 2.127536231884058, + "grad_norm": 0.4092513929978087, + "learning_rate": 2.062340882604143e-05, + "loss": 1.0211, + "step": 734 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.4297526463869587, + "learning_rate": 2.049682431599947e-05, + "loss": 1.1129, + "step": 735 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.4636326790218994, + "learning_rate": 2.0370529251116067e-05, + "loss": 1.1291, + "step": 736 + }, + { + "epoch": 2.136231884057971, + "grad_norm": 0.3974548122667625, + "learning_rate": 2.0244524870427172e-05, + "loss": 0.9923, + "step": 737 + }, + { + "epoch": 2.139130434782609, + "grad_norm": 0.4038721913341886, + "learning_rate": 2.0118812410116915e-05, + "loss": 1.0817, + "step": 738 + }, + { + "epoch": 2.1420289855072463, + "grad_norm": 0.41807115165201914, + "learning_rate": 1.999339310350551e-05, + "loss": 1.09, + "step": 739 + }, + { + "epoch": 2.1449275362318843, + "grad_norm": 0.40763130794004726, + "learning_rate": 1.9868268181037185e-05, + "loss": 1.0475, + "step": 740 + }, + { + "epoch": 2.1478260869565218, + "grad_norm": 0.4099162086697869, + "learning_rate": 1.9743438870267988e-05, + "loss": 1.0527, + "step": 741 + }, + { + "epoch": 2.1507246376811593, + "grad_norm": 0.4046969215163759, + "learning_rate": 1.961890639585388e-05, + "loss": 1.0224, + "step": 742 + }, + { + "epoch": 2.1536231884057973, + "grad_norm": 0.40495982818104165, + "learning_rate": 1.949467197953866e-05, + "loss": 0.9912, + "step": 743 + }, + { + "epoch": 2.1565217391304348, + "grad_norm": 0.4115616809855344, + "learning_rate": 1.9370736840141978e-05, + "loss": 1.0773, + "step": 744 + }, + { + "epoch": 2.1594202898550723, + "grad_norm": 0.42477438614499907, + "learning_rate": 1.9247102193547384e-05, + "loss": 1.0183, + "step": 745 + }, + { + "epoch": 2.1623188405797102, + "grad_norm": 0.39454596479550186, + "learning_rate": 1.912376925269041e-05, + "loss": 1.0548, + "step": 746 + }, + { + "epoch": 2.1652173913043478, + "grad_norm": 0.4324946159925722, + "learning_rate": 1.900073922754665e-05, + "loss": 1.0532, + "step": 747 + }, + { + "epoch": 2.1681159420289857, + "grad_norm": 0.40496616232865795, + "learning_rate": 1.8878013325119902e-05, + "loss": 1.1552, + "step": 748 + }, + { + "epoch": 2.1710144927536232, + "grad_norm": 0.41915807837518143, + "learning_rate": 1.8755592749430322e-05, + "loss": 1.0243, + "step": 749 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.4186007202451323, + "learning_rate": 1.8633478701502628e-05, + "loss": 1.0744, + "step": 750 + }, + { + "epoch": 2.1768115942028987, + "grad_norm": 0.42045626939886377, + "learning_rate": 1.8511672379354284e-05, + "loss": 1.068, + "step": 751 + }, + { + "epoch": 2.1797101449275362, + "grad_norm": 0.4045186001077355, + "learning_rate": 1.8390174977983778e-05, + "loss": 1.0957, + "step": 752 + }, + { + "epoch": 2.1826086956521737, + "grad_norm": 0.4478832702569865, + "learning_rate": 1.8268987689358874e-05, + "loss": 1.0909, + "step": 753 + }, + { + "epoch": 2.1855072463768117, + "grad_norm": 0.4164615953299648, + "learning_rate": 1.814811170240495e-05, + "loss": 1.0386, + "step": 754 + }, + { + "epoch": 2.1884057971014492, + "grad_norm": 0.41902328103819775, + "learning_rate": 1.80275482029933e-05, + "loss": 1.0344, + "step": 755 + }, + { + "epoch": 2.1913043478260867, + "grad_norm": 0.41670788409755355, + "learning_rate": 1.7907298373929517e-05, + "loss": 0.9878, + "step": 756 + }, + { + "epoch": 2.1942028985507247, + "grad_norm": 0.4294226441948201, + "learning_rate": 1.7787363394941875e-05, + "loss": 1.0175, + "step": 757 + }, + { + "epoch": 2.197101449275362, + "grad_norm": 0.4254645454494433, + "learning_rate": 1.7667744442669793e-05, + "loss": 1.0615, + "step": 758 + }, + { + "epoch": 2.2, + "grad_norm": 0.4099964946904337, + "learning_rate": 1.7548442690652238e-05, + "loss": 0.9919, + "step": 759 + }, + { + "epoch": 2.2028985507246377, + "grad_norm": 0.42880536140401987, + "learning_rate": 1.7429459309316254e-05, + "loss": 1.0661, + "step": 760 + }, + { + "epoch": 2.205797101449275, + "grad_norm": 0.4173497311104388, + "learning_rate": 1.7310795465965452e-05, + "loss": 1.0304, + "step": 761 + }, + { + "epoch": 2.208695652173913, + "grad_norm": 0.4181309528124866, + "learning_rate": 1.7192452324768577e-05, + "loss": 1.1069, + "step": 762 + }, + { + "epoch": 2.2115942028985507, + "grad_norm": 0.4253296723606123, + "learning_rate": 1.7074431046748075e-05, + "loss": 1.1159, + "step": 763 + }, + { + "epoch": 2.214492753623188, + "grad_norm": 0.4140966246574362, + "learning_rate": 1.69567327897687e-05, + "loss": 1.035, + "step": 764 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.4360262256456945, + "learning_rate": 1.683935870852621e-05, + "loss": 1.0341, + "step": 765 + }, + { + "epoch": 2.2202898550724637, + "grad_norm": 0.4129314987978601, + "learning_rate": 1.6722309954535915e-05, + "loss": 1.0361, + "step": 766 + }, + { + "epoch": 2.2231884057971016, + "grad_norm": 0.44728638008426197, + "learning_rate": 1.6605587676121492e-05, + "loss": 0.982, + "step": 767 + }, + { + "epoch": 2.226086956521739, + "grad_norm": 0.4142277894364414, + "learning_rate": 1.6489193018403694e-05, + "loss": 1.0186, + "step": 768 + }, + { + "epoch": 2.2289855072463767, + "grad_norm": 0.42466461089685326, + "learning_rate": 1.6373127123289082e-05, + "loss": 1.0878, + "step": 769 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.4255999017930268, + "learning_rate": 1.6257391129458866e-05, + "loss": 0.9795, + "step": 770 + }, + { + "epoch": 2.234782608695652, + "grad_norm": 0.4214111455741252, + "learning_rate": 1.614198617235768e-05, + "loss": 1.0523, + "step": 771 + }, + { + "epoch": 2.2376811594202897, + "grad_norm": 0.40833801140318804, + "learning_rate": 1.6026913384182513e-05, + "loss": 1.0665, + "step": 772 + }, + { + "epoch": 2.2405797101449276, + "grad_norm": 0.4060043083014689, + "learning_rate": 1.5912173893871534e-05, + "loss": 1.0294, + "step": 773 + }, + { + "epoch": 2.243478260869565, + "grad_norm": 0.441842102392729, + "learning_rate": 1.5797768827093055e-05, + "loss": 1.0781, + "step": 774 + }, + { + "epoch": 2.246376811594203, + "grad_norm": 0.42451158383299736, + "learning_rate": 1.5683699306234483e-05, + "loss": 1.03, + "step": 775 + }, + { + "epoch": 2.2492753623188406, + "grad_norm": 0.43280564540973687, + "learning_rate": 1.5569966450391273e-05, + "loss": 1.0932, + "step": 776 + }, + { + "epoch": 2.252173913043478, + "grad_norm": 0.4260799476878949, + "learning_rate": 1.5456571375356045e-05, + "loss": 0.9906, + "step": 777 + }, + { + "epoch": 2.255072463768116, + "grad_norm": 0.4289868937899867, + "learning_rate": 1.534351519360752e-05, + "loss": 1.1224, + "step": 778 + }, + { + "epoch": 2.2579710144927536, + "grad_norm": 0.4184482349129135, + "learning_rate": 1.5230799014299651e-05, + "loss": 1.0492, + "step": 779 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.4169287607356858, + "learning_rate": 1.5118423943250771e-05, + "loss": 1.0076, + "step": 780 + }, + { + "epoch": 2.263768115942029, + "grad_norm": 0.4437723000239763, + "learning_rate": 1.500639108293272e-05, + "loss": 1.0756, + "step": 781 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 2.438737443529068, + "learning_rate": 1.4894701532460026e-05, + "loss": 1.0372, + "step": 782 + }, + { + "epoch": 2.269565217391304, + "grad_norm": 0.4259694730355945, + "learning_rate": 1.4783356387579123e-05, + "loss": 1.0914, + "step": 783 + }, + { + "epoch": 2.272463768115942, + "grad_norm": 0.42609879566763975, + "learning_rate": 1.4672356740657612e-05, + "loss": 1.1024, + "step": 784 + }, + { + "epoch": 2.2753623188405796, + "grad_norm": 0.41473766458960193, + "learning_rate": 1.4561703680673528e-05, + "loss": 1.0437, + "step": 785 + }, + { + "epoch": 2.2782608695652176, + "grad_norm": 0.41138794322562033, + "learning_rate": 1.4451398293204671e-05, + "loss": 0.9883, + "step": 786 + }, + { + "epoch": 2.281159420289855, + "grad_norm": 0.4345116661977155, + "learning_rate": 1.4341441660417948e-05, + "loss": 1.0405, + "step": 787 + }, + { + "epoch": 2.2840579710144926, + "grad_norm": 0.43156004240612655, + "learning_rate": 1.423183486105874e-05, + "loss": 1.0858, + "step": 788 + }, + { + "epoch": 2.2869565217391306, + "grad_norm": 0.43394375495039533, + "learning_rate": 1.4122578970440392e-05, + "loss": 1.013, + "step": 789 + }, + { + "epoch": 2.289855072463768, + "grad_norm": 0.42318889929148634, + "learning_rate": 1.4013675060433562e-05, + "loss": 1.0667, + "step": 790 + }, + { + "epoch": 2.292753623188406, + "grad_norm": 0.4338786349395585, + "learning_rate": 1.3905124199455733e-05, + "loss": 0.9574, + "step": 791 + }, + { + "epoch": 2.2956521739130435, + "grad_norm": 0.4263774516063788, + "learning_rate": 1.379692745246079e-05, + "loss": 1.0388, + "step": 792 + }, + { + "epoch": 2.298550724637681, + "grad_norm": 0.4578203586741276, + "learning_rate": 1.368908588092852e-05, + "loss": 1.0852, + "step": 793 + }, + { + "epoch": 2.301449275362319, + "grad_norm": 0.4223544444704819, + "learning_rate": 1.3581600542854211e-05, + "loss": 1.0764, + "step": 794 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.42040297195621995, + "learning_rate": 1.3474472492738266e-05, + "loss": 1.0818, + "step": 795 + }, + { + "epoch": 2.307246376811594, + "grad_norm": 0.42233699920038903, + "learning_rate": 1.3367702781575858e-05, + "loss": 1.0144, + "step": 796 + }, + { + "epoch": 2.310144927536232, + "grad_norm": 0.42739886636894053, + "learning_rate": 1.3261292456846647e-05, + "loss": 1.011, + "step": 797 + }, + { + "epoch": 2.3130434782608695, + "grad_norm": 0.4319353955954341, + "learning_rate": 1.315524256250445e-05, + "loss": 0.9984, + "step": 798 + }, + { + "epoch": 2.315942028985507, + "grad_norm": 0.4240304031792234, + "learning_rate": 1.3049554138967051e-05, + "loss": 1.0865, + "step": 799 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.44946527738642017, + "learning_rate": 1.2944228223105953e-05, + "loss": 1.0496, + "step": 800 + }, + { + "epoch": 2.3217391304347825, + "grad_norm": 0.42198617091436585, + "learning_rate": 1.2839265848236271e-05, + "loss": 1.0357, + "step": 801 + }, + { + "epoch": 2.3246376811594205, + "grad_norm": 0.42787604239445254, + "learning_rate": 1.273466804410649e-05, + "loss": 1.0624, + "step": 802 + }, + { + "epoch": 2.327536231884058, + "grad_norm": 0.4259453527555043, + "learning_rate": 1.2630435836888477e-05, + "loss": 1.0371, + "step": 803 + }, + { + "epoch": 2.3304347826086955, + "grad_norm": 0.4405744784698457, + "learning_rate": 1.2526570249167285e-05, + "loss": 1.0722, + "step": 804 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.44433415788871033, + "learning_rate": 1.242307229993126e-05, + "loss": 1.1003, + "step": 805 + }, + { + "epoch": 2.336231884057971, + "grad_norm": 0.44002850613090233, + "learning_rate": 1.2319943004561951e-05, + "loss": 1.0334, + "step": 806 + }, + { + "epoch": 2.3391304347826085, + "grad_norm": 0.4327626792123435, + "learning_rate": 1.2217183374824182e-05, + "loss": 1.0841, + "step": 807 + }, + { + "epoch": 2.3420289855072465, + "grad_norm": 0.44177237553294435, + "learning_rate": 1.2114794418856112e-05, + "loss": 1.1006, + "step": 808 + }, + { + "epoch": 2.344927536231884, + "grad_norm": 0.4252814673055529, + "learning_rate": 1.2012777141159359e-05, + "loss": 1.0902, + "step": 809 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.44481606310880256, + "learning_rate": 1.1911132542589126e-05, + "loss": 1.0663, + "step": 810 + }, + { + "epoch": 2.3507246376811595, + "grad_norm": 0.44531350592923585, + "learning_rate": 1.180986162034441e-05, + "loss": 1.0395, + "step": 811 + }, + { + "epoch": 2.353623188405797, + "grad_norm": 0.4403754842576467, + "learning_rate": 1.1708965367958175e-05, + "loss": 1.0367, + "step": 812 + }, + { + "epoch": 2.356521739130435, + "grad_norm": 0.44504741014172594, + "learning_rate": 1.160844477528768e-05, + "loss": 1.0668, + "step": 813 + }, + { + "epoch": 2.3594202898550725, + "grad_norm": 0.45218366246573805, + "learning_rate": 1.150830082850468e-05, + "loss": 1.0078, + "step": 814 + }, + { + "epoch": 2.36231884057971, + "grad_norm": 0.4400472472708365, + "learning_rate": 1.1408534510085805e-05, + "loss": 1.0535, + "step": 815 + }, + { + "epoch": 2.365217391304348, + "grad_norm": 0.429340428309833, + "learning_rate": 1.130914679880291e-05, + "loss": 1.0736, + "step": 816 + }, + { + "epoch": 2.3681159420289855, + "grad_norm": 0.41976853039844914, + "learning_rate": 1.1210138669713444e-05, + "loss": 0.9793, + "step": 817 + }, + { + "epoch": 2.3710144927536234, + "grad_norm": 0.430344411304319, + "learning_rate": 1.1111511094150945e-05, + "loss": 0.9848, + "step": 818 + }, + { + "epoch": 2.373913043478261, + "grad_norm": 0.431007787368086, + "learning_rate": 1.1013265039715465e-05, + "loss": 0.9797, + "step": 819 + }, + { + "epoch": 2.3768115942028984, + "grad_norm": 0.43768154374858875, + "learning_rate": 1.0915401470264081e-05, + "loss": 1.0339, + "step": 820 + }, + { + "epoch": 2.3797101449275364, + "grad_norm": 0.4153960922316617, + "learning_rate": 1.081792134590145e-05, + "loss": 1.0726, + "step": 821 + }, + { + "epoch": 2.382608695652174, + "grad_norm": 0.4261661560061093, + "learning_rate": 1.0720825622970387e-05, + "loss": 1.0732, + "step": 822 + }, + { + "epoch": 2.3855072463768114, + "grad_norm": 0.46272436711753084, + "learning_rate": 1.0624115254042482e-05, + "loss": 1.0509, + "step": 823 + }, + { + "epoch": 2.3884057971014494, + "grad_norm": 0.4159332663897536, + "learning_rate": 1.0527791187908736e-05, + "loss": 1.0301, + "step": 824 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.41855139337790126, + "learning_rate": 1.0431854369570316e-05, + "loss": 0.98, + "step": 825 + }, + { + "epoch": 2.3942028985507244, + "grad_norm": 0.4407049676844984, + "learning_rate": 1.0336305740229196e-05, + "loss": 1.0198, + "step": 826 + }, + { + "epoch": 2.3971014492753624, + "grad_norm": 0.44469510783381666, + "learning_rate": 1.0241146237278975e-05, + "loss": 1.0142, + "step": 827 + }, + { + "epoch": 2.4, + "grad_norm": 0.4204751833047234, + "learning_rate": 1.0146376794295698e-05, + "loss": 1.0435, + "step": 828 + }, + { + "epoch": 2.402898550724638, + "grad_norm": 0.43076006527935645, + "learning_rate": 1.0051998341028618e-05, + "loss": 1.0329, + "step": 829 + }, + { + "epoch": 2.4057971014492754, + "grad_norm": 0.4212241503239106, + "learning_rate": 9.958011803391166e-06, + "loss": 1.0517, + "step": 830 + }, + { + "epoch": 2.408695652173913, + "grad_norm": 0.43752577070512094, + "learning_rate": 9.864418103451828e-06, + "loss": 1.05, + "step": 831 + }, + { + "epoch": 2.411594202898551, + "grad_norm": 0.4539932456655938, + "learning_rate": 9.771218159425084e-06, + "loss": 1.0501, + "step": 832 + }, + { + "epoch": 2.4144927536231884, + "grad_norm": 0.44298901817857494, + "learning_rate": 9.678412885662418e-06, + "loss": 1.0399, + "step": 833 + }, + { + "epoch": 2.417391304347826, + "grad_norm": 0.44330383234774, + "learning_rate": 9.586003192643362e-06, + "loss": 1.0242, + "step": 834 + }, + { + "epoch": 2.420289855072464, + "grad_norm": 0.42235580319917715, + "learning_rate": 9.493989986966518e-06, + "loss": 1.0961, + "step": 835 + }, + { + "epoch": 2.4231884057971014, + "grad_norm": 0.42412654756876644, + "learning_rate": 9.402374171340705e-06, + "loss": 1.0747, + "step": 836 + }, + { + "epoch": 2.426086956521739, + "grad_norm": 0.4604003701876417, + "learning_rate": 9.311156644576108e-06, + "loss": 0.9956, + "step": 837 + }, + { + "epoch": 2.428985507246377, + "grad_norm": 0.4355065867115315, + "learning_rate": 9.220338301575414e-06, + "loss": 1.0515, + "step": 838 + }, + { + "epoch": 2.4318840579710144, + "grad_norm": 0.41606575435043913, + "learning_rate": 9.129920033325068e-06, + "loss": 1.0834, + "step": 839 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.41400057706555543, + "learning_rate": 9.039902726886535e-06, + "loss": 1.025, + "step": 840 + }, + { + "epoch": 2.43768115942029, + "grad_norm": 0.4212465286811161, + "learning_rate": 8.95028726538758e-06, + "loss": 1.0888, + "step": 841 + }, + { + "epoch": 2.4405797101449274, + "grad_norm": 0.44292414437801153, + "learning_rate": 8.861074528013586e-06, + "loss": 1.1063, + "step": 842 + }, + { + "epoch": 2.4434782608695653, + "grad_norm": 0.4618762426767351, + "learning_rate": 8.77226538999899e-06, + "loss": 1.0861, + "step": 843 + }, + { + "epoch": 2.446376811594203, + "grad_norm": 0.42934378228075604, + "learning_rate": 8.683860722618641e-06, + "loss": 1.0674, + "step": 844 + }, + { + "epoch": 2.449275362318841, + "grad_norm": 0.44137968841741865, + "learning_rate": 8.595861393179277e-06, + "loss": 1.0248, + "step": 845 + }, + { + "epoch": 2.4521739130434783, + "grad_norm": 0.45115385912472034, + "learning_rate": 8.508268265011005e-06, + "loss": 1.0471, + "step": 846 + }, + { + "epoch": 2.455072463768116, + "grad_norm": 0.44160775586291273, + "learning_rate": 8.42108219745884e-06, + "loss": 1.0375, + "step": 847 + }, + { + "epoch": 2.457971014492754, + "grad_norm": 0.44498128589628316, + "learning_rate": 8.334304045874247e-06, + "loss": 1.0928, + "step": 848 + }, + { + "epoch": 2.4608695652173913, + "grad_norm": 0.42944613569509194, + "learning_rate": 8.247934661606826e-06, + "loss": 1.0611, + "step": 849 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4293984310812336, + "learning_rate": 8.161974891995855e-06, + "loss": 1.0425, + "step": 850 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.43223021088950386, + "learning_rate": 8.076425580362052e-06, + "loss": 1.0966, + "step": 851 + }, + { + "epoch": 2.4695652173913043, + "grad_norm": 0.4511615485513439, + "learning_rate": 7.991287565999272e-06, + "loss": 0.9823, + "step": 852 + }, + { + "epoch": 2.472463768115942, + "grad_norm": 0.43175751442143545, + "learning_rate": 7.906561684166275e-06, + "loss": 1.046, + "step": 853 + }, + { + "epoch": 2.47536231884058, + "grad_norm": 0.4398354654162565, + "learning_rate": 7.822248766078555e-06, + "loss": 1.1159, + "step": 854 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.4217658734022817, + "learning_rate": 7.738349638900127e-06, + "loss": 1.0605, + "step": 855 + }, + { + "epoch": 2.4811594202898553, + "grad_norm": 0.4463848438795895, + "learning_rate": 7.654865125735483e-06, + "loss": 0.987, + "step": 856 + }, + { + "epoch": 2.4840579710144928, + "grad_norm": 0.4553067045132744, + "learning_rate": 7.571796045621482e-06, + "loss": 1.049, + "step": 857 + }, + { + "epoch": 2.4869565217391303, + "grad_norm": 0.4470257852745124, + "learning_rate": 7.489143213519301e-06, + "loss": 1.0841, + "step": 858 + }, + { + "epoch": 2.4898550724637682, + "grad_norm": 0.42594930418564064, + "learning_rate": 7.406907440306471e-06, + "loss": 1.0877, + "step": 859 + }, + { + "epoch": 2.4927536231884058, + "grad_norm": 0.4284878480179994, + "learning_rate": 7.325089532768892e-06, + "loss": 1.0765, + "step": 860 + }, + { + "epoch": 2.4956521739130437, + "grad_norm": 0.44182270672000895, + "learning_rate": 7.243690293592959e-06, + "loss": 1.0233, + "step": 861 + }, + { + "epoch": 2.4985507246376812, + "grad_norm": 0.43871383223404364, + "learning_rate": 7.1627105213576355e-06, + "loss": 1.0702, + "step": 862 + }, + { + "epoch": 2.5014492753623188, + "grad_norm": 0.4277793635895529, + "learning_rate": 7.08215101052665e-06, + "loss": 1.0573, + "step": 863 + }, + { + "epoch": 2.5043478260869563, + "grad_norm": 0.4406001751473407, + "learning_rate": 7.002012551440701e-06, + "loss": 1.0316, + "step": 864 + }, + { + "epoch": 2.5072463768115942, + "grad_norm": 0.5413472127354161, + "learning_rate": 6.922295930309691e-06, + "loss": 1.0798, + "step": 865 + }, + { + "epoch": 2.5101449275362318, + "grad_norm": 0.4301282293831735, + "learning_rate": 6.84300192920504e-06, + "loss": 1.0723, + "step": 866 + }, + { + "epoch": 2.5130434782608697, + "grad_norm": 0.43181259980748293, + "learning_rate": 6.764131326051953e-06, + "loss": 1.0395, + "step": 867 + }, + { + "epoch": 2.5159420289855072, + "grad_norm": 0.4357413758485379, + "learning_rate": 6.6856848946218635e-06, + "loss": 1.04, + "step": 868 + }, + { + "epoch": 2.5188405797101447, + "grad_norm": 0.4441512604958444, + "learning_rate": 6.607663404524795e-06, + "loss": 1.02, + "step": 869 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.4403400361786895, + "learning_rate": 6.53006762120183e-06, + "loss": 0.9813, + "step": 870 + }, + { + "epoch": 2.52463768115942, + "grad_norm": 0.4295706766182875, + "learning_rate": 6.452898305917587e-06, + "loss": 1.0977, + "step": 871 + }, + { + "epoch": 2.527536231884058, + "grad_norm": 0.4500164864119338, + "learning_rate": 6.376156215752743e-06, + "loss": 1.046, + "step": 872 + }, + { + "epoch": 2.5304347826086957, + "grad_norm": 0.4295283517592817, + "learning_rate": 6.299842103596665e-06, + "loss": 0.9962, + "step": 873 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.4298591342734868, + "learning_rate": 6.223956718139939e-06, + "loss": 1.0351, + "step": 874 + }, + { + "epoch": 2.536231884057971, + "grad_norm": 0.41916133011716233, + "learning_rate": 6.14850080386708e-06, + "loss": 0.9795, + "step": 875 + }, + { + "epoch": 2.5391304347826087, + "grad_norm": 0.450757056089375, + "learning_rate": 6.073475101049209e-06, + "loss": 1.0287, + "step": 876 + }, + { + "epoch": 2.5420289855072467, + "grad_norm": 0.4428910375540849, + "learning_rate": 5.998880345736812e-06, + "loss": 1.0841, + "step": 877 + }, + { + "epoch": 2.544927536231884, + "grad_norm": 0.4370122339112871, + "learning_rate": 5.924717269752478e-06, + "loss": 1.0355, + "step": 878 + }, + { + "epoch": 2.5478260869565217, + "grad_norm": 0.4328546688643461, + "learning_rate": 5.8509866006837725e-06, + "loss": 1.0458, + "step": 879 + }, + { + "epoch": 2.550724637681159, + "grad_norm": 0.45457918016504273, + "learning_rate": 5.777689061876035e-06, + "loss": 1.0407, + "step": 880 + }, + { + "epoch": 2.553623188405797, + "grad_norm": 0.41666707799866615, + "learning_rate": 5.704825372425343e-06, + "loss": 1.0336, + "step": 881 + }, + { + "epoch": 2.5565217391304347, + "grad_norm": 0.4500898444777061, + "learning_rate": 5.6323962471714286e-06, + "loss": 1.0082, + "step": 882 + }, + { + "epoch": 2.5594202898550726, + "grad_norm": 0.43189682364915644, + "learning_rate": 5.560402396690667e-06, + "loss": 1.0732, + "step": 883 + }, + { + "epoch": 2.56231884057971, + "grad_norm": 0.4517991164758783, + "learning_rate": 5.4888445272891e-06, + "loss": 1.0565, + "step": 884 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.43585727349975845, + "learning_rate": 5.417723340995545e-06, + "loss": 1.0569, + "step": 885 + }, + { + "epoch": 2.5681159420289856, + "grad_norm": 0.4451555207263539, + "learning_rate": 5.347039535554632e-06, + "loss": 1.0934, + "step": 886 + }, + { + "epoch": 2.571014492753623, + "grad_norm": 0.44753595012523295, + "learning_rate": 5.276793804420033e-06, + "loss": 1.0129, + "step": 887 + }, + { + "epoch": 2.573913043478261, + "grad_norm": 0.43340171540500966, + "learning_rate": 5.206986836747624e-06, + "loss": 1.057, + "step": 888 + }, + { + "epoch": 2.5768115942028986, + "grad_norm": 0.41103056048092484, + "learning_rate": 5.13761931738872e-06, + "loss": 1.0629, + "step": 889 + }, + { + "epoch": 2.579710144927536, + "grad_norm": 0.4379217485808061, + "learning_rate": 5.068691926883367e-06, + "loss": 1.1122, + "step": 890 + }, + { + "epoch": 2.5826086956521737, + "grad_norm": 0.4367395495858654, + "learning_rate": 5.000205341453679e-06, + "loss": 1.0641, + "step": 891 + }, + { + "epoch": 2.5855072463768116, + "grad_norm": 0.4346646618624072, + "learning_rate": 4.9321602329971735e-06, + "loss": 1.0247, + "step": 892 + }, + { + "epoch": 2.588405797101449, + "grad_norm": 0.4266332511623276, + "learning_rate": 4.864557269080183e-06, + "loss": 1.1, + "step": 893 + }, + { + "epoch": 2.591304347826087, + "grad_norm": 0.4280568908138626, + "learning_rate": 4.7973971129313455e-06, + "loss": 0.9916, + "step": 894 + }, + { + "epoch": 2.5942028985507246, + "grad_norm": 0.4157220462493493, + "learning_rate": 4.730680423435046e-06, + "loss": 1.0384, + "step": 895 + }, + { + "epoch": 2.597101449275362, + "grad_norm": 0.4657661567334127, + "learning_rate": 4.6644078551249916e-06, + "loss": 1.0206, + "step": 896 + }, + { + "epoch": 2.6, + "grad_norm": 0.4402043390402084, + "learning_rate": 4.59858005817776e-06, + "loss": 1.0051, + "step": 897 + }, + { + "epoch": 2.6028985507246376, + "grad_norm": 0.47342746863944507, + "learning_rate": 4.533197678406459e-06, + "loss": 0.9908, + "step": 898 + }, + { + "epoch": 2.6057971014492756, + "grad_norm": 0.44686945552614565, + "learning_rate": 4.468261357254339e-06, + "loss": 1.0194, + "step": 899 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.45848518372098457, + "learning_rate": 4.403771731788547e-06, + "loss": 1.0751, + "step": 900 + }, + { + "epoch": 2.6115942028985506, + "grad_norm": 0.41833931514497974, + "learning_rate": 4.339729434693851e-06, + "loss": 1.0486, + "step": 901 + }, + { + "epoch": 2.6144927536231886, + "grad_norm": 0.4154891635226541, + "learning_rate": 4.276135094266437e-06, + "loss": 1.0246, + "step": 902 + }, + { + "epoch": 2.617391304347826, + "grad_norm": 0.42902378243746886, + "learning_rate": 4.212989334407752e-06, + "loss": 1.0367, + "step": 903 + }, + { + "epoch": 2.620289855072464, + "grad_norm": 0.4413147059304679, + "learning_rate": 4.150292774618386e-06, + "loss": 1.0377, + "step": 904 + }, + { + "epoch": 2.6231884057971016, + "grad_norm": 0.4326053305994359, + "learning_rate": 4.088046029991954e-06, + "loss": 1.0321, + "step": 905 + }, + { + "epoch": 2.626086956521739, + "grad_norm": 0.43297947767772066, + "learning_rate": 4.026249711209134e-06, + "loss": 1.0814, + "step": 906 + }, + { + "epoch": 2.6289855072463766, + "grad_norm": 0.42391791250689304, + "learning_rate": 3.964904424531623e-06, + "loss": 1.1435, + "step": 907 + }, + { + "epoch": 2.6318840579710145, + "grad_norm": 0.44465042718334696, + "learning_rate": 3.90401077179619e-06, + "loss": 1.0755, + "step": 908 + }, + { + "epoch": 2.634782608695652, + "grad_norm": 0.4379840802629311, + "learning_rate": 3.843569350408799e-06, + "loss": 1.0326, + "step": 909 + }, + { + "epoch": 2.63768115942029, + "grad_norm": 0.4380256503816688, + "learning_rate": 3.7835807533387336e-06, + "loss": 0.9959, + "step": 910 + }, + { + "epoch": 2.6405797101449275, + "grad_norm": 0.4250114900172059, + "learning_rate": 3.724045569112766e-06, + "loss": 1.0413, + "step": 911 + }, + { + "epoch": 2.643478260869565, + "grad_norm": 0.43495634484636064, + "learning_rate": 3.664964381809416e-06, + "loss": 1.0502, + "step": 912 + }, + { + "epoch": 2.646376811594203, + "grad_norm": 0.41338659373447945, + "learning_rate": 3.606337771053181e-06, + "loss": 1.0322, + "step": 913 + }, + { + "epoch": 2.6492753623188405, + "grad_norm": 0.4607899596362807, + "learning_rate": 3.548166312008877e-06, + "loss": 1.062, + "step": 914 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.4456807876825619, + "learning_rate": 3.4904505753759863e-06, + "loss": 1.049, + "step": 915 + }, + { + "epoch": 2.655072463768116, + "grad_norm": 0.45066296980753234, + "learning_rate": 3.4331911273830784e-06, + "loss": 1.1202, + "step": 916 + }, + { + "epoch": 2.6579710144927535, + "grad_norm": 0.42887756180559006, + "learning_rate": 3.376388529782215e-06, + "loss": 1.0579, + "step": 917 + }, + { + "epoch": 2.660869565217391, + "grad_norm": 0.4242946529545818, + "learning_rate": 3.320043339843465e-06, + "loss": 1.0094, + "step": 918 + }, + { + "epoch": 2.663768115942029, + "grad_norm": 0.4509087953831623, + "learning_rate": 3.2641561103494424e-06, + "loss": 1.126, + "step": 919 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4582297576992613, + "learning_rate": 3.2087273895898606e-06, + "loss": 1.0978, + "step": 920 + }, + { + "epoch": 2.6695652173913045, + "grad_norm": 0.41892321793577525, + "learning_rate": 3.153757721356182e-06, + "loss": 1.0188, + "step": 921 + }, + { + "epoch": 2.672463768115942, + "grad_norm": 0.43091493659712077, + "learning_rate": 3.0992476449362653e-06, + "loss": 1.0657, + "step": 922 + }, + { + "epoch": 2.6753623188405795, + "grad_norm": 0.4484469573992589, + "learning_rate": 3.0451976951090757e-06, + "loss": 1.0578, + "step": 923 + }, + { + "epoch": 2.6782608695652175, + "grad_norm": 0.45221935250795153, + "learning_rate": 2.991608402139434e-06, + "loss": 1.0728, + "step": 924 + }, + { + "epoch": 2.681159420289855, + "grad_norm": 0.42748137661848884, + "learning_rate": 2.938480291772827e-06, + "loss": 1.0517, + "step": 925 + }, + { + "epoch": 2.684057971014493, + "grad_norm": 0.4338746720819457, + "learning_rate": 2.8858138852302374e-06, + "loss": 1.0192, + "step": 926 + }, + { + "epoch": 2.6869565217391305, + "grad_norm": 0.44271385780896827, + "learning_rate": 2.833609699203038e-06, + "loss": 1.0409, + "step": 927 + }, + { + "epoch": 2.689855072463768, + "grad_norm": 0.44168360737350637, + "learning_rate": 2.7818682458479294e-06, + "loss": 1.0353, + "step": 928 + }, + { + "epoch": 2.692753623188406, + "grad_norm": 0.44662829054916564, + "learning_rate": 2.7305900327818936e-06, + "loss": 1.0321, + "step": 929 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.4372789501470448, + "learning_rate": 2.679775563077247e-06, + "loss": 1.0469, + "step": 930 + }, + { + "epoch": 2.6985507246376814, + "grad_norm": 0.4170715080873589, + "learning_rate": 2.6294253352566466e-06, + "loss": 1.0717, + "step": 931 + }, + { + "epoch": 2.701449275362319, + "grad_norm": 0.44425061018773043, + "learning_rate": 2.5795398432882756e-06, + "loss": 1.0892, + "step": 932 + }, + { + "epoch": 2.7043478260869565, + "grad_norm": 0.43077942102243316, + "learning_rate": 2.530119576580936e-06, + "loss": 1.0542, + "step": 933 + }, + { + "epoch": 2.707246376811594, + "grad_norm": 0.4370359842657613, + "learning_rate": 2.4811650199792924e-06, + "loss": 1.0096, + "step": 934 + }, + { + "epoch": 2.710144927536232, + "grad_norm": 0.43626145737902144, + "learning_rate": 2.4326766537590693e-06, + "loss": 1.081, + "step": 935 + }, + { + "epoch": 2.7130434782608694, + "grad_norm": 0.47685901764854666, + "learning_rate": 2.384654953622384e-06, + "loss": 1.1176, + "step": 936 + }, + { + "epoch": 2.7159420289855074, + "grad_norm": 0.45228260777925117, + "learning_rate": 2.3371003906930423e-06, + "loss": 1.0481, + "step": 937 + }, + { + "epoch": 2.718840579710145, + "grad_norm": 0.44256756961973887, + "learning_rate": 2.290013431511945e-06, + "loss": 1.0347, + "step": 938 + }, + { + "epoch": 2.7217391304347824, + "grad_norm": 0.4402726419838423, + "learning_rate": 2.243394538032484e-06, + "loss": 1.0369, + "step": 939 + }, + { + "epoch": 2.7246376811594204, + "grad_norm": 0.45365804923951414, + "learning_rate": 2.197244167616047e-06, + "loss": 1.0973, + "step": 940 + }, + { + "epoch": 2.727536231884058, + "grad_norm": 0.4525083377542681, + "learning_rate": 2.1515627730274822e-06, + "loss": 1.0616, + "step": 941 + }, + { + "epoch": 2.730434782608696, + "grad_norm": 0.41867968643258735, + "learning_rate": 2.106350802430718e-06, + "loss": 1.0361, + "step": 942 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.44410487106485796, + "learning_rate": 2.0616086993842876e-06, + "loss": 1.0262, + "step": 943 + }, + { + "epoch": 2.736231884057971, + "grad_norm": 0.42533114796177457, + "learning_rate": 2.0173369028370583e-06, + "loss": 1.0324, + "step": 944 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.41967355790971034, + "learning_rate": 1.9735358471238586e-06, + "loss": 1.0439, + "step": 945 + }, + { + "epoch": 2.7420289855072464, + "grad_norm": 0.4313810499422798, + "learning_rate": 1.9302059619612787e-06, + "loss": 1.0067, + "step": 946 + }, + { + "epoch": 2.744927536231884, + "grad_norm": 0.4457644564670882, + "learning_rate": 1.8873476724433902e-06, + "loss": 1.0433, + "step": 947 + }, + { + "epoch": 2.747826086956522, + "grad_norm": 0.44140575476367844, + "learning_rate": 1.8449613990376313e-06, + "loss": 1.0281, + "step": 948 + }, + { + "epoch": 2.7507246376811594, + "grad_norm": 0.41388990569707274, + "learning_rate": 1.8030475575806394e-06, + "loss": 1.0779, + "step": 949 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.44319022004684594, + "learning_rate": 1.7616065592742038e-06, + "loss": 1.0709, + "step": 950 + }, + { + "epoch": 2.756521739130435, + "grad_norm": 0.42280831552653275, + "learning_rate": 1.7206388106812077e-06, + "loss": 1.0602, + "step": 951 + }, + { + "epoch": 2.7594202898550724, + "grad_norm": 0.41831113949584664, + "learning_rate": 1.6801447137216652e-06, + "loss": 1.0519, + "step": 952 + }, + { + "epoch": 2.7623188405797103, + "grad_norm": 0.42149777436767877, + "learning_rate": 1.6401246656687463e-06, + "loss": 1.0568, + "step": 953 + }, + { + "epoch": 2.765217391304348, + "grad_norm": 0.429110137697547, + "learning_rate": 1.6005790591448966e-06, + "loss": 1.1177, + "step": 954 + }, + { + "epoch": 2.7681159420289854, + "grad_norm": 0.46048857323106746, + "learning_rate": 1.5615082821180071e-06, + "loss": 1.0583, + "step": 955 + }, + { + "epoch": 2.7710144927536233, + "grad_norm": 0.4299763555661624, + "learning_rate": 1.522912717897551e-06, + "loss": 1.1047, + "step": 956 + }, + { + "epoch": 2.773913043478261, + "grad_norm": 0.47595502230009035, + "learning_rate": 1.4847927451308753e-06, + "loss": 1.0598, + "step": 957 + }, + { + "epoch": 2.776811594202899, + "grad_norm": 0.44472688488854684, + "learning_rate": 1.447148737799481e-06, + "loss": 1.0717, + "step": 958 + }, + { + "epoch": 2.7797101449275363, + "grad_norm": 0.446411341344231, + "learning_rate": 1.4099810652153212e-06, + "loss": 1.0873, + "step": 959 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.4395447440323806, + "learning_rate": 1.3732900920172154e-06, + "loss": 1.0097, + "step": 960 + }, + { + "epoch": 2.7855072463768114, + "grad_norm": 0.4374552230480354, + "learning_rate": 1.3370761781672346e-06, + "loss": 1.0025, + "step": 961 + }, + { + "epoch": 2.7884057971014493, + "grad_norm": 0.4585611691245378, + "learning_rate": 1.3013396789472055e-06, + "loss": 0.9921, + "step": 962 + }, + { + "epoch": 2.791304347826087, + "grad_norm": 0.4367319010484946, + "learning_rate": 1.2660809449552058e-06, + "loss": 1.005, + "step": 963 + }, + { + "epoch": 2.794202898550725, + "grad_norm": 0.41818614449882124, + "learning_rate": 1.2313003221021302e-06, + "loss": 1.0392, + "step": 964 + }, + { + "epoch": 2.7971014492753623, + "grad_norm": 0.43712018288101745, + "learning_rate": 1.1969981516082972e-06, + "loss": 1.0703, + "step": 965 + }, + { + "epoch": 2.8, + "grad_norm": 0.4330052924141849, + "learning_rate": 1.163174770000086e-06, + "loss": 1.0149, + "step": 966 + }, + { + "epoch": 2.802898550724638, + "grad_norm": 0.4637514588180937, + "learning_rate": 1.1298305091066664e-06, + "loss": 1.054, + "step": 967 + }, + { + "epoch": 2.8057971014492753, + "grad_norm": 0.4328211094942756, + "learning_rate": 1.0969656960567177e-06, + "loss": 1.1024, + "step": 968 + }, + { + "epoch": 2.8086956521739133, + "grad_norm": 0.49114261638602824, + "learning_rate": 1.0645806532752156e-06, + "loss": 1.0506, + "step": 969 + }, + { + "epoch": 2.8115942028985508, + "grad_norm": 0.43504595478449676, + "learning_rate": 1.0326756984803065e-06, + "loss": 1.0711, + "step": 970 + }, + { + "epoch": 2.8144927536231883, + "grad_norm": 0.4348937962062495, + "learning_rate": 1.0012511446801377e-06, + "loss": 1.1078, + "step": 971 + }, + { + "epoch": 2.8173913043478263, + "grad_norm": 0.44058656927819256, + "learning_rate": 9.70307300169826e-07, + "loss": 1.0991, + "step": 972 + }, + { + "epoch": 2.8202898550724638, + "grad_norm": 0.4295244566694527, + "learning_rate": 9.39844468528428e-07, + "loss": 0.9995, + "step": 973 + }, + { + "epoch": 2.8231884057971013, + "grad_norm": 0.4367203092602682, + "learning_rate": 9.09862948615936e-07, + "loss": 1.0519, + "step": 974 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.4449664564834592, + "learning_rate": 8.803630345703751e-07, + "loss": 1.0474, + "step": 975 + }, + { + "epoch": 2.8289855072463768, + "grad_norm": 0.4297347658970927, + "learning_rate": 8.513450158049108e-07, + "loss": 1.0695, + "step": 976 + }, + { + "epoch": 2.8318840579710143, + "grad_norm": 0.4486135418859604, + "learning_rate": 8.228091770049961e-07, + "loss": 1.0164, + "step": 977 + }, + { + "epoch": 2.8347826086956522, + "grad_norm": 0.43980229550927924, + "learning_rate": 7.947557981255904e-07, + "loss": 1.0317, + "step": 978 + }, + { + "epoch": 2.8376811594202898, + "grad_norm": 0.44553738280573807, + "learning_rate": 7.671851543884112e-07, + "loss": 1.0946, + "step": 979 + }, + { + "epoch": 2.8405797101449277, + "grad_norm": 0.4363004911544926, + "learning_rate": 7.400975162792367e-07, + "loss": 1.003, + "step": 980 + }, + { + "epoch": 2.8434782608695652, + "grad_norm": 0.4413405166653603, + "learning_rate": 7.134931495452413e-07, + "loss": 1.0882, + "step": 981 + }, + { + "epoch": 2.8463768115942027, + "grad_norm": 0.44085985363028396, + "learning_rate": 6.873723151924027e-07, + "loss": 0.9974, + "step": 982 + }, + { + "epoch": 2.8492753623188407, + "grad_norm": 0.44891911764344156, + "learning_rate": 6.617352694829381e-07, + "loss": 0.9997, + "step": 983 + }, + { + "epoch": 2.8521739130434782, + "grad_norm": 0.4297742893819775, + "learning_rate": 6.365822639327723e-07, + "loss": 1.0248, + "step": 984 + }, + { + "epoch": 2.855072463768116, + "grad_norm": 0.44307938049828505, + "learning_rate": 6.119135453090952e-07, + "loss": 1.0523, + "step": 985 + }, + { + "epoch": 2.8579710144927537, + "grad_norm": 0.4219491261370554, + "learning_rate": 5.877293556279306e-07, + "loss": 1.0316, + "step": 986 + }, + { + "epoch": 2.860869565217391, + "grad_norm": 0.4441565730933646, + "learning_rate": 5.64029932151755e-07, + "loss": 1.0601, + "step": 987 + }, + { + "epoch": 2.8637681159420287, + "grad_norm": 0.43904823016047406, + "learning_rate": 5.408155073871768e-07, + "loss": 1.0962, + "step": 988 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.4380193819651974, + "learning_rate": 5.180863090826604e-07, + "loss": 1.0828, + "step": 989 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.46490668660417783, + "learning_rate": 4.95842560226284e-07, + "loss": 0.9954, + "step": 990 + }, + { + "epoch": 2.872463768115942, + "grad_norm": 0.44779443933129964, + "learning_rate": 4.7408447904354614e-07, + "loss": 0.9894, + "step": 991 + }, + { + "epoch": 2.8753623188405797, + "grad_norm": 0.44039118698865287, + "learning_rate": 4.52812278995246e-07, + "loss": 0.9391, + "step": 992 + }, + { + "epoch": 2.878260869565217, + "grad_norm": 0.44888017878839825, + "learning_rate": 4.3202616877536793e-07, + "loss": 1.044, + "step": 993 + }, + { + "epoch": 2.881159420289855, + "grad_norm": 0.4412322695340127, + "learning_rate": 4.117263523090442e-07, + "loss": 1.1098, + "step": 994 + }, + { + "epoch": 2.8840579710144927, + "grad_norm": 0.42595193117492713, + "learning_rate": 3.919130287505457e-07, + "loss": 1.0755, + "step": 995 + }, + { + "epoch": 2.8869565217391306, + "grad_norm": 0.44081324693289337, + "learning_rate": 3.725863924813389e-07, + "loss": 1.0776, + "step": 996 + }, + { + "epoch": 2.889855072463768, + "grad_norm": 0.45676229278822633, + "learning_rate": 3.5374663310818735e-07, + "loss": 1.121, + "step": 997 + }, + { + "epoch": 2.8927536231884057, + "grad_norm": 0.42858508933481326, + "learning_rate": 3.3539393546124784e-07, + "loss": 1.0342, + "step": 998 + }, + { + "epoch": 2.8956521739130436, + "grad_norm": 0.4554639141142107, + "learning_rate": 3.1752847959232167e-07, + "loss": 1.0403, + "step": 999 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4443160110274387, + "learning_rate": 3.0015044077303933e-07, + "loss": 0.9923, + "step": 1000 + }, + { + "epoch": 2.901449275362319, + "grad_norm": 0.45114283690245177, + "learning_rate": 2.8325998949314536e-07, + "loss": 1.0137, + "step": 1001 + }, + { + "epoch": 2.9043478260869566, + "grad_norm": 0.440281019286359, + "learning_rate": 2.668572914588496e-07, + "loss": 1.0009, + "step": 1002 + }, + { + "epoch": 2.907246376811594, + "grad_norm": 0.42131395328506477, + "learning_rate": 2.509425075911953e-07, + "loss": 1.0864, + "step": 1003 + }, + { + "epoch": 2.9101449275362317, + "grad_norm": 0.4431327301308889, + "learning_rate": 2.3551579402445455e-07, + "loss": 1.0369, + "step": 1004 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.437441254967641, + "learning_rate": 2.2057730210462979e-07, + "loss": 1.0946, + "step": 1005 + }, + { + "epoch": 2.915942028985507, + "grad_norm": 0.44460142080563914, + "learning_rate": 2.0612717838794926e-07, + "loss": 1.0682, + "step": 1006 + }, + { + "epoch": 2.918840579710145, + "grad_norm": 0.46357594598759, + "learning_rate": 1.9216556463943492e-07, + "loss": 1.0347, + "step": 1007 + }, + { + "epoch": 2.9217391304347826, + "grad_norm": 0.4280959868112658, + "learning_rate": 1.7869259783150905e-07, + "loss": 1.0446, + "step": 1008 + }, + { + "epoch": 2.92463768115942, + "grad_norm": 0.4391861785357275, + "learning_rate": 1.657084101426565e-07, + "loss": 1.0055, + "step": 1009 + }, + { + "epoch": 2.927536231884058, + "grad_norm": 0.43467829714626893, + "learning_rate": 1.5321312895612007e-07, + "loss": 1.0468, + "step": 1010 + }, + { + "epoch": 2.9304347826086956, + "grad_norm": 0.436157471233564, + "learning_rate": 1.4120687685866274e-07, + "loss": 1.003, + "step": 1011 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.4387651565287021, + "learning_rate": 1.2968977163934638e-07, + "loss": 1.0961, + "step": 1012 + }, + { + "epoch": 2.936231884057971, + "grad_norm": 0.4544754835767558, + "learning_rate": 1.1866192628839368e-07, + "loss": 1.1016, + "step": 1013 + }, + { + "epoch": 2.9391304347826086, + "grad_norm": 0.4558428482092103, + "learning_rate": 1.0812344899607252e-07, + "loss": 1.0319, + "step": 1014 + }, + { + "epoch": 2.942028985507246, + "grad_norm": 0.4298065423481269, + "learning_rate": 9.807444315163006e-08, + "loss": 1.0564, + "step": 1015 + }, + { + "epoch": 2.944927536231884, + "grad_norm": 0.45987333857679424, + "learning_rate": 8.851500734229357e-08, + "loss": 1.0879, + "step": 1016 + }, + { + "epoch": 2.9478260869565216, + "grad_norm": 0.42633685574770663, + "learning_rate": 7.944523535228233e-08, + "loss": 1.02, + "step": 1017 + }, + { + "epoch": 2.9507246376811596, + "grad_norm": 0.42941746517921314, + "learning_rate": 7.086521616190279e-08, + "loss": 1.0368, + "step": 1018 + }, + { + "epoch": 2.953623188405797, + "grad_norm": 0.4500990712597483, + "learning_rate": 6.27750339466715e-08, + "loss": 1.0091, + "step": 1019 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.43281524715248404, + "learning_rate": 5.517476807648248e-08, + "loss": 1.0871, + "step": 1020 + }, + { + "epoch": 2.9594202898550726, + "grad_norm": 0.4404835864216849, + "learning_rate": 4.806449311484107e-08, + "loss": 1.1031, + "step": 1021 + }, + { + "epoch": 2.96231884057971, + "grad_norm": 0.4292383359696952, + "learning_rate": 4.144427881813129e-08, + "loss": 0.9651, + "step": 1022 + }, + { + "epoch": 2.965217391304348, + "grad_norm": 0.43976585710369, + "learning_rate": 3.531419013491632e-08, + "loss": 1.0691, + "step": 1023 + }, + { + "epoch": 2.9681159420289855, + "grad_norm": 0.43252864631461296, + "learning_rate": 2.967428720531129e-08, + "loss": 0.9949, + "step": 1024 + }, + { + "epoch": 2.971014492753623, + "grad_norm": 0.4477919897543057, + "learning_rate": 2.4524625360400345e-08, + "loss": 1.0986, + "step": 1025 + }, + { + "epoch": 2.973913043478261, + "grad_norm": 0.4289179803109601, + "learning_rate": 1.986525512168158e-08, + "loss": 1.0116, + "step": 1026 + }, + { + "epoch": 2.9768115942028985, + "grad_norm": 0.45865303578317895, + "learning_rate": 1.5696222200578535e-08, + "loss": 1.0639, + "step": 1027 + }, + { + "epoch": 2.9797101449275365, + "grad_norm": 0.43468926771375377, + "learning_rate": 1.2017567497996097e-08, + "loss": 0.9828, + "step": 1028 + }, + { + "epoch": 2.982608695652174, + "grad_norm": 0.4353013480109291, + "learning_rate": 8.82932710389861e-09, + "loss": 1.0111, + "step": 1029 + }, + { + "epoch": 2.9855072463768115, + "grad_norm": 0.435625700326904, + "learning_rate": 6.131532296982379e-09, + "loss": 1.0963, + "step": 1030 + }, + { + "epoch": 2.988405797101449, + "grad_norm": 0.4393642554858853, + "learning_rate": 3.9242095443481345e-09, + "loss": 1.1145, + "step": 1031 + }, + { + "epoch": 2.991304347826087, + "grad_norm": 0.43072368766038216, + "learning_rate": 2.207380501262346e-09, + "loss": 1.0647, + "step": 1032 + }, + { + "epoch": 2.9942028985507245, + "grad_norm": 0.45588828392520236, + "learning_rate": 9.810620109129698e-10, + "loss": 1.0432, + "step": 1033 + }, + { + "epoch": 2.9971014492753625, + "grad_norm": 0.4459564292216382, + "learning_rate": 2.452661042817717e-10, + "loss": 1.1399, + "step": 1034 + }, + { + "epoch": 3.0, + "grad_norm": 0.4383463614226025, + "learning_rate": 0.0, + "loss": 0.9416, + "step": 1035 + }, + { + "epoch": 3.0, + "step": 1035, + "total_flos": 238917794807808.0, + "train_loss": 1.2324695289422924, + "train_runtime": 15380.0554, + "train_samples_per_second": 2.148, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1035, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 238917794807808.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}