Lyra_Base_9B / long_speech_lora / trainer_state.json
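Below is the raw Transformers trainer state for this LoRA run: the "log_history" array records epoch, step, loss, learning_rate, and grad_norm at each logged step. As a minimal sketch (not part of the original file, and assuming the file has been downloaded locally as "trainer_state.json"), the log can be loaded and summarized like this:

    import json

    # Load the trainer state written by the Hugging Face Trainer.
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Each training entry in log_history carries step, loss, learning_rate, grad_norm.
    entries = [e for e in state["log_history"] if "loss" in e]
    steps = [e["step"] for e in entries]
    losses = [e["loss"] for e in entries]

    print(f"{len(steps)} logged steps; "
          f"loss {losses[0]:.4f} at step {steps[0]} -> {losses[-1]:.4f} at step {steps[-1]}")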
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1035,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002898550724637681,
"grad_norm": 1.7796895708856793,
"learning_rate": 3.125e-06,
"loss": 1.8514,
"step": 1
},
{
"epoch": 0.005797101449275362,
"grad_norm": 1.742548277798407,
"learning_rate": 6.25e-06,
"loss": 1.937,
"step": 2
},
{
"epoch": 0.008695652173913044,
"grad_norm": 1.5905530955603362,
"learning_rate": 9.375000000000001e-06,
"loss": 1.8724,
"step": 3
},
{
"epoch": 0.011594202898550725,
"grad_norm": 1.6592768688949988,
"learning_rate": 1.25e-05,
"loss": 1.8877,
"step": 4
},
{
"epoch": 0.014492753623188406,
"grad_norm": 1.4035260613846172,
"learning_rate": 1.5625e-05,
"loss": 1.8086,
"step": 5
},
{
"epoch": 0.017391304347826087,
"grad_norm": 0.9555449880629443,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.7276,
"step": 6
},
{
"epoch": 0.020289855072463767,
"grad_norm": 0.7915967541673472,
"learning_rate": 2.1875e-05,
"loss": 1.771,
"step": 7
},
{
"epoch": 0.02318840579710145,
"grad_norm": 0.7599954441380682,
"learning_rate": 2.5e-05,
"loss": 1.7122,
"step": 8
},
{
"epoch": 0.02608695652173913,
"grad_norm": 0.7128173682386719,
"learning_rate": 2.8125000000000003e-05,
"loss": 1.5764,
"step": 9
},
{
"epoch": 0.028985507246376812,
"grad_norm": 0.6773249478496584,
"learning_rate": 3.125e-05,
"loss": 1.5811,
"step": 10
},
{
"epoch": 0.03188405797101449,
"grad_norm": 0.6571598838212039,
"learning_rate": 3.4375e-05,
"loss": 1.6191,
"step": 11
},
{
"epoch": 0.034782608695652174,
"grad_norm": 0.6261792389264198,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.5684,
"step": 12
},
{
"epoch": 0.03768115942028986,
"grad_norm": 0.5143810493601375,
"learning_rate": 4.0625000000000005e-05,
"loss": 1.5375,
"step": 13
},
{
"epoch": 0.04057971014492753,
"grad_norm": 0.4855788824689092,
"learning_rate": 4.375e-05,
"loss": 1.5076,
"step": 14
},
{
"epoch": 0.043478260869565216,
"grad_norm": 0.473950934451779,
"learning_rate": 4.6875e-05,
"loss": 1.5083,
"step": 15
},
{
"epoch": 0.0463768115942029,
"grad_norm": 0.48567111749562547,
"learning_rate": 5e-05,
"loss": 1.6137,
"step": 16
},
{
"epoch": 0.04927536231884058,
"grad_norm": 0.43610179775052604,
"learning_rate": 5.3125000000000004e-05,
"loss": 1.5325,
"step": 17
},
{
"epoch": 0.05217391304347826,
"grad_norm": 0.4412188197378122,
"learning_rate": 5.6250000000000005e-05,
"loss": 1.555,
"step": 18
},
{
"epoch": 0.05507246376811594,
"grad_norm": 0.43034730708585867,
"learning_rate": 5.9375e-05,
"loss": 1.5453,
"step": 19
},
{
"epoch": 0.057971014492753624,
"grad_norm": 0.41694045848699307,
"learning_rate": 6.25e-05,
"loss": 1.5362,
"step": 20
},
{
"epoch": 0.06086956521739131,
"grad_norm": 0.4093648088428465,
"learning_rate": 6.562500000000001e-05,
"loss": 1.4596,
"step": 21
},
{
"epoch": 0.06376811594202898,
"grad_norm": 0.42036605295826535,
"learning_rate": 6.875e-05,
"loss": 1.5156,
"step": 22
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.4140215214641256,
"learning_rate": 7.1875e-05,
"loss": 1.5021,
"step": 23
},
{
"epoch": 0.06956521739130435,
"grad_norm": 0.41797125446436384,
"learning_rate": 7.500000000000001e-05,
"loss": 1.5595,
"step": 24
},
{
"epoch": 0.07246376811594203,
"grad_norm": 0.40448941023881985,
"learning_rate": 7.8125e-05,
"loss": 1.5284,
"step": 25
},
{
"epoch": 0.07536231884057971,
"grad_norm": 0.36201429136045177,
"learning_rate": 8.125000000000001e-05,
"loss": 1.5402,
"step": 26
},
{
"epoch": 0.0782608695652174,
"grad_norm": 0.38159291388896194,
"learning_rate": 8.4375e-05,
"loss": 1.4545,
"step": 27
},
{
"epoch": 0.08115942028985507,
"grad_norm": 0.39563825256543766,
"learning_rate": 8.75e-05,
"loss": 1.476,
"step": 28
},
{
"epoch": 0.08405797101449275,
"grad_norm": 0.3853757557962818,
"learning_rate": 9.062500000000001e-05,
"loss": 1.5553,
"step": 29
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.3715963100647608,
"learning_rate": 9.375e-05,
"loss": 1.4923,
"step": 30
},
{
"epoch": 0.08985507246376812,
"grad_norm": 0.3972739650610925,
"learning_rate": 9.687500000000001e-05,
"loss": 1.4166,
"step": 31
},
{
"epoch": 0.0927536231884058,
"grad_norm": 0.3709663185634906,
"learning_rate": 0.0001,
"loss": 1.4904,
"step": 32
},
{
"epoch": 0.09565217391304348,
"grad_norm": 0.37818493311274604,
"learning_rate": 9.999975473389572e-05,
"loss": 1.4303,
"step": 33
},
{
"epoch": 0.09855072463768116,
"grad_norm": 0.3727893878233448,
"learning_rate": 9.999901893798909e-05,
"loss": 1.5126,
"step": 34
},
{
"epoch": 0.10144927536231885,
"grad_norm": 0.3573590861971531,
"learning_rate": 9.999779261949875e-05,
"loss": 1.4088,
"step": 35
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.3962649324463349,
"learning_rate": 9.999607579045565e-05,
"loss": 1.4718,
"step": 36
},
{
"epoch": 0.1072463768115942,
"grad_norm": 0.3629563065883299,
"learning_rate": 9.999386846770303e-05,
"loss": 1.5376,
"step": 37
},
{
"epoch": 0.11014492753623188,
"grad_norm": 0.37698476595481845,
"learning_rate": 9.99911706728961e-05,
"loss": 1.5497,
"step": 38
},
{
"epoch": 0.11304347826086956,
"grad_norm": 0.36517596222828796,
"learning_rate": 9.9987982432502e-05,
"loss": 1.3701,
"step": 39
},
{
"epoch": 0.11594202898550725,
"grad_norm": 0.3754942171540997,
"learning_rate": 9.998430377779942e-05,
"loss": 1.4751,
"step": 40
},
{
"epoch": 0.11884057971014493,
"grad_norm": 0.37273876645697823,
"learning_rate": 9.998013474487833e-05,
"loss": 1.4959,
"step": 41
},
{
"epoch": 0.12173913043478261,
"grad_norm": 0.36526298295975423,
"learning_rate": 9.99754753746396e-05,
"loss": 1.477,
"step": 42
},
{
"epoch": 0.1246376811594203,
"grad_norm": 0.4028151666513751,
"learning_rate": 9.99703257127947e-05,
"loss": 1.4273,
"step": 43
},
{
"epoch": 0.12753623188405797,
"grad_norm": 0.3669671633234476,
"learning_rate": 9.99646858098651e-05,
"loss": 1.3938,
"step": 44
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.33083829945323007,
"learning_rate": 9.995855572118186e-05,
"loss": 1.4102,
"step": 45
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.3478285593739705,
"learning_rate": 9.995193550688517e-05,
"loss": 1.4027,
"step": 46
},
{
"epoch": 0.13623188405797101,
"grad_norm": 0.37609834638001705,
"learning_rate": 9.994482523192352e-05,
"loss": 1.4909,
"step": 47
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.3544704730906117,
"learning_rate": 9.993722496605333e-05,
"loss": 1.4603,
"step": 48
},
{
"epoch": 0.14202898550724638,
"grad_norm": 0.35471120831090747,
"learning_rate": 9.99291347838381e-05,
"loss": 1.4591,
"step": 49
},
{
"epoch": 0.14492753623188406,
"grad_norm": 0.3522333621422469,
"learning_rate": 9.992055476464772e-05,
"loss": 1.4661,
"step": 50
},
{
"epoch": 0.14782608695652175,
"grad_norm": 0.40369049060969037,
"learning_rate": 9.991148499265771e-05,
"loss": 1.3549,
"step": 51
},
{
"epoch": 0.15072463768115943,
"grad_norm": 0.37654258677829533,
"learning_rate": 9.990192555684837e-05,
"loss": 1.4566,
"step": 52
},
{
"epoch": 0.1536231884057971,
"grad_norm": 0.35023666520198726,
"learning_rate": 9.989187655100394e-05,
"loss": 1.4291,
"step": 53
},
{
"epoch": 0.1565217391304348,
"grad_norm": 0.3713582044260089,
"learning_rate": 9.98813380737116e-05,
"loss": 1.4899,
"step": 54
},
{
"epoch": 0.15942028985507245,
"grad_norm": 0.3483542245496034,
"learning_rate": 9.987031022836066e-05,
"loss": 1.422,
"step": 55
},
{
"epoch": 0.16231884057971013,
"grad_norm": 0.3428096360294795,
"learning_rate": 9.985879312314135e-05,
"loss": 1.417,
"step": 56
},
{
"epoch": 0.16521739130434782,
"grad_norm": 0.3645827259974512,
"learning_rate": 9.984678687104389e-05,
"loss": 1.4285,
"step": 57
},
{
"epoch": 0.1681159420289855,
"grad_norm": 0.35685607542080316,
"learning_rate": 9.983429158985736e-05,
"loss": 1.3918,
"step": 58
},
{
"epoch": 0.17101449275362318,
"grad_norm": 0.3370796491973602,
"learning_rate": 9.982130740216849e-05,
"loss": 1.4129,
"step": 59
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.3444756598243817,
"learning_rate": 9.980783443536057e-05,
"loss": 1.4355,
"step": 60
},
{
"epoch": 0.17681159420289855,
"grad_norm": 0.3436241209978691,
"learning_rate": 9.979387282161206e-05,
"loss": 1.4583,
"step": 61
},
{
"epoch": 0.17971014492753623,
"grad_norm": 0.32218525116364366,
"learning_rate": 9.977942269789537e-05,
"loss": 1.4524,
"step": 62
},
{
"epoch": 0.1826086956521739,
"grad_norm": 0.385973703132524,
"learning_rate": 9.976448420597556e-05,
"loss": 1.4419,
"step": 63
},
{
"epoch": 0.1855072463768116,
"grad_norm": 1.7247641389853836,
"learning_rate": 9.974905749240882e-05,
"loss": 1.3425,
"step": 64
},
{
"epoch": 0.18840579710144928,
"grad_norm": 0.3447341772023887,
"learning_rate": 9.973314270854115e-05,
"loss": 1.528,
"step": 65
},
{
"epoch": 0.19130434782608696,
"grad_norm": 0.35835098628054646,
"learning_rate": 9.971674001050686e-05,
"loss": 1.4713,
"step": 66
},
{
"epoch": 0.19420289855072465,
"grad_norm": 0.365150351821878,
"learning_rate": 9.969984955922697e-05,
"loss": 1.4537,
"step": 67
},
{
"epoch": 0.19710144927536233,
"grad_norm": 0.3866963594083402,
"learning_rate": 9.968247152040768e-05,
"loss": 1.5055,
"step": 68
},
{
"epoch": 0.2,
"grad_norm": 0.35045697501626877,
"learning_rate": 9.966460606453875e-05,
"loss": 1.4434,
"step": 69
},
{
"epoch": 0.2028985507246377,
"grad_norm": 0.36817264001563493,
"learning_rate": 9.964625336689181e-05,
"loss": 1.4294,
"step": 70
},
{
"epoch": 0.20579710144927535,
"grad_norm": 0.3654904538276859,
"learning_rate": 9.962741360751866e-05,
"loss": 1.4308,
"step": 71
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.3781497670043016,
"learning_rate": 9.960808697124946e-05,
"loss": 1.4685,
"step": 72
},
{
"epoch": 0.21159420289855072,
"grad_norm": 0.36156099913405126,
"learning_rate": 9.958827364769097e-05,
"loss": 1.4062,
"step": 73
},
{
"epoch": 0.2144927536231884,
"grad_norm": 0.35552781851256704,
"learning_rate": 9.956797383122463e-05,
"loss": 1.4428,
"step": 74
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.3335062272759448,
"learning_rate": 9.954718772100476e-05,
"loss": 1.4467,
"step": 75
},
{
"epoch": 0.22028985507246376,
"grad_norm": 0.3427215995763061,
"learning_rate": 9.952591552095646e-05,
"loss": 1.5089,
"step": 76
},
{
"epoch": 0.22318840579710145,
"grad_norm": 0.34794374393691757,
"learning_rate": 9.950415743977373e-05,
"loss": 1.4051,
"step": 77
},
{
"epoch": 0.22608695652173913,
"grad_norm": 0.3404770224687481,
"learning_rate": 9.948191369091735e-05,
"loss": 1.3876,
"step": 78
},
{
"epoch": 0.2289855072463768,
"grad_norm": 0.34102132992338396,
"learning_rate": 9.945918449261282e-05,
"loss": 1.4369,
"step": 79
},
{
"epoch": 0.2318840579710145,
"grad_norm": 0.33638460547428023,
"learning_rate": 9.943597006784825e-05,
"loss": 1.4164,
"step": 80
},
{
"epoch": 0.23478260869565218,
"grad_norm": 0.35290031375473546,
"learning_rate": 9.941227064437207e-05,
"loss": 1.3796,
"step": 81
},
{
"epoch": 0.23768115942028986,
"grad_norm": 0.3463360857934043,
"learning_rate": 9.93880864546909e-05,
"loss": 1.4276,
"step": 82
},
{
"epoch": 0.24057971014492754,
"grad_norm": 0.3566368609252091,
"learning_rate": 9.936341773606723e-05,
"loss": 1.4967,
"step": 83
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.3373773040313267,
"learning_rate": 9.933826473051707e-05,
"loss": 1.4079,
"step": 84
},
{
"epoch": 0.2463768115942029,
"grad_norm": 0.3393580838287239,
"learning_rate": 9.93126276848076e-05,
"loss": 1.4131,
"step": 85
},
{
"epoch": 0.2492753623188406,
"grad_norm": 0.3520135073078003,
"learning_rate": 9.928650685045477e-05,
"loss": 1.4729,
"step": 86
},
{
"epoch": 0.25217391304347825,
"grad_norm": 0.3526725034511152,
"learning_rate": 9.925990248372076e-05,
"loss": 1.4314,
"step": 87
},
{
"epoch": 0.25507246376811593,
"grad_norm": 0.3433193515525383,
"learning_rate": 9.92328148456116e-05,
"loss": 1.4505,
"step": 88
},
{
"epoch": 0.2579710144927536,
"grad_norm": 0.33837489039921237,
"learning_rate": 9.920524420187443e-05,
"loss": 1.4481,
"step": 89
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.33988682832234424,
"learning_rate": 9.917719082299501e-05,
"loss": 1.4149,
"step": 90
},
{
"epoch": 0.263768115942029,
"grad_norm": 0.33940846094652855,
"learning_rate": 9.91486549841951e-05,
"loss": 1.3847,
"step": 91
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.31996832381114065,
"learning_rate": 9.911963696542963e-05,
"loss": 1.3112,
"step": 92
},
{
"epoch": 0.26956521739130435,
"grad_norm": 0.31493707135599436,
"learning_rate": 9.909013705138406e-05,
"loss": 1.4216,
"step": 93
},
{
"epoch": 0.27246376811594203,
"grad_norm": 0.3204454590090509,
"learning_rate": 9.906015553147158e-05,
"loss": 1.3755,
"step": 94
},
{
"epoch": 0.2753623188405797,
"grad_norm": 0.3408318845906397,
"learning_rate": 9.902969269983018e-05,
"loss": 1.4574,
"step": 95
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.3196195350266631,
"learning_rate": 9.899874885531987e-05,
"loss": 1.4022,
"step": 96
},
{
"epoch": 0.2811594202898551,
"grad_norm": 0.33440793327421947,
"learning_rate": 9.89673243015197e-05,
"loss": 1.3766,
"step": 97
},
{
"epoch": 0.28405797101449276,
"grad_norm": 0.33693013386726023,
"learning_rate": 9.893541934672479e-05,
"loss": 1.4676,
"step": 98
},
{
"epoch": 0.28695652173913044,
"grad_norm": 0.3467550636007772,
"learning_rate": 9.890303430394328e-05,
"loss": 1.365,
"step": 99
},
{
"epoch": 0.2898550724637681,
"grad_norm": 0.3333645230781809,
"learning_rate": 9.887016949089333e-05,
"loss": 1.3514,
"step": 100
},
{
"epoch": 0.2927536231884058,
"grad_norm": 0.34610516226844007,
"learning_rate": 9.883682522999992e-05,
"loss": 1.4499,
"step": 101
},
{
"epoch": 0.2956521739130435,
"grad_norm": 0.3268443889818303,
"learning_rate": 9.88030018483917e-05,
"loss": 1.4303,
"step": 102
},
{
"epoch": 0.2985507246376812,
"grad_norm": 0.33465469810861087,
"learning_rate": 9.876869967789788e-05,
"loss": 1.3757,
"step": 103
},
{
"epoch": 0.30144927536231886,
"grad_norm": 0.33038430224796766,
"learning_rate": 9.87339190550448e-05,
"loss": 1.3676,
"step": 104
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.3404214439604057,
"learning_rate": 9.86986603210528e-05,
"loss": 1.3974,
"step": 105
},
{
"epoch": 0.3072463768115942,
"grad_norm": 0.32959296551839845,
"learning_rate": 9.866292382183278e-05,
"loss": 1.3484,
"step": 106
},
{
"epoch": 0.3101449275362319,
"grad_norm": 0.381137959130174,
"learning_rate": 9.86267099079828e-05,
"loss": 1.4149,
"step": 107
},
{
"epoch": 0.3130434782608696,
"grad_norm": 0.33114126577828235,
"learning_rate": 9.859001893478468e-05,
"loss": 1.3599,
"step": 108
},
{
"epoch": 0.3159420289855073,
"grad_norm": 0.36021993638794775,
"learning_rate": 9.855285126220053e-05,
"loss": 1.413,
"step": 109
},
{
"epoch": 0.3188405797101449,
"grad_norm": 0.355739607205717,
"learning_rate": 9.851520725486914e-05,
"loss": 1.4064,
"step": 110
},
{
"epoch": 0.3217391304347826,
"grad_norm": 0.3263260079885549,
"learning_rate": 9.847708728210246e-05,
"loss": 1.4048,
"step": 111
},
{
"epoch": 0.32463768115942027,
"grad_norm": 0.3199488973648368,
"learning_rate": 9.8438491717882e-05,
"loss": 1.3944,
"step": 112
},
{
"epoch": 0.32753623188405795,
"grad_norm": 0.3336592320156713,
"learning_rate": 9.839942094085511e-05,
"loss": 1.3799,
"step": 113
},
{
"epoch": 0.33043478260869563,
"grad_norm": 0.32960061743745567,
"learning_rate": 9.835987533433126e-05,
"loss": 1.43,
"step": 114
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.35822567336767946,
"learning_rate": 9.831985528627834e-05,
"loss": 1.4404,
"step": 115
},
{
"epoch": 0.336231884057971,
"grad_norm": 0.32466006600725356,
"learning_rate": 9.82793611893188e-05,
"loss": 1.391,
"step": 116
},
{
"epoch": 0.3391304347826087,
"grad_norm": 0.3452303089687653,
"learning_rate": 9.82383934407258e-05,
"loss": 1.4571,
"step": 117
},
{
"epoch": 0.34202898550724636,
"grad_norm": 0.3531330388118067,
"learning_rate": 9.819695244241936e-05,
"loss": 1.4726,
"step": 118
},
{
"epoch": 0.34492753623188405,
"grad_norm": 0.3284144929554227,
"learning_rate": 9.815503860096238e-05,
"loss": 1.4636,
"step": 119
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.33589451825622024,
"learning_rate": 9.811265232755662e-05,
"loss": 1.4076,
"step": 120
},
{
"epoch": 0.3507246376811594,
"grad_norm": 0.33465490795732467,
"learning_rate": 9.806979403803873e-05,
"loss": 1.3757,
"step": 121
},
{
"epoch": 0.3536231884057971,
"grad_norm": 0.35161889623674547,
"learning_rate": 9.802646415287615e-05,
"loss": 1.4065,
"step": 122
},
{
"epoch": 0.3565217391304348,
"grad_norm": 0.31894482948146224,
"learning_rate": 9.798266309716295e-05,
"loss": 1.4455,
"step": 123
},
{
"epoch": 0.35942028985507246,
"grad_norm": 0.3263915498362111,
"learning_rate": 9.793839130061573e-05,
"loss": 1.3291,
"step": 124
},
{
"epoch": 0.36231884057971014,
"grad_norm": 0.3264781414125749,
"learning_rate": 9.78936491975693e-05,
"loss": 1.3977,
"step": 125
},
{
"epoch": 0.3652173913043478,
"grad_norm": 0.3322110798968971,
"learning_rate": 9.784843722697253e-05,
"loss": 1.4516,
"step": 126
},
{
"epoch": 0.3681159420289855,
"grad_norm": 0.33040915159162,
"learning_rate": 9.780275583238397e-05,
"loss": 1.4418,
"step": 127
},
{
"epoch": 0.3710144927536232,
"grad_norm": 0.32982903923865825,
"learning_rate": 9.775660546196753e-05,
"loss": 1.399,
"step": 128
},
{
"epoch": 0.3739130434782609,
"grad_norm": 0.3398856478969671,
"learning_rate": 9.770998656848806e-05,
"loss": 1.4917,
"step": 129
},
{
"epoch": 0.37681159420289856,
"grad_norm": 0.33812428837562564,
"learning_rate": 9.766289960930697e-05,
"loss": 1.4136,
"step": 130
},
{
"epoch": 0.37971014492753624,
"grad_norm": 0.32546513362934915,
"learning_rate": 9.761534504637761e-05,
"loss": 1.4245,
"step": 131
},
{
"epoch": 0.3826086956521739,
"grad_norm": 0.3379554295481369,
"learning_rate": 9.756732334624093e-05,
"loss": 1.3917,
"step": 132
},
{
"epoch": 0.3855072463768116,
"grad_norm": 0.3196806084479148,
"learning_rate": 9.751883498002071e-05,
"loss": 1.3608,
"step": 133
},
{
"epoch": 0.3884057971014493,
"grad_norm": 0.366228317842041,
"learning_rate": 9.746988042341906e-05,
"loss": 1.3728,
"step": 134
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.3769852522598798,
"learning_rate": 9.742046015671174e-05,
"loss": 1.4481,
"step": 135
},
{
"epoch": 0.39420289855072466,
"grad_norm": 0.34122072082269356,
"learning_rate": 9.737057466474336e-05,
"loss": 1.4195,
"step": 136
},
{
"epoch": 0.39710144927536234,
"grad_norm": 0.3322686505315165,
"learning_rate": 9.732022443692276e-05,
"loss": 1.399,
"step": 137
},
{
"epoch": 0.4,
"grad_norm": 0.3296309366287408,
"learning_rate": 9.726940996721811e-05,
"loss": 1.421,
"step": 138
},
{
"epoch": 0.4028985507246377,
"grad_norm": 0.37435872581479346,
"learning_rate": 9.721813175415208e-05,
"loss": 1.4244,
"step": 139
},
{
"epoch": 0.4057971014492754,
"grad_norm": 0.3268496453435604,
"learning_rate": 9.716639030079697e-05,
"loss": 1.4099,
"step": 140
},
{
"epoch": 0.40869565217391307,
"grad_norm": 0.3554430337628762,
"learning_rate": 9.711418611476977e-05,
"loss": 1.4446,
"step": 141
},
{
"epoch": 0.4115942028985507,
"grad_norm": 0.33834590076214077,
"learning_rate": 9.706151970822718e-05,
"loss": 1.3205,
"step": 142
},
{
"epoch": 0.4144927536231884,
"grad_norm": 0.3414240635513846,
"learning_rate": 9.700839159786057e-05,
"loss": 1.4534,
"step": 143
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.32930885329942156,
"learning_rate": 9.695480230489093e-05,
"loss": 1.3587,
"step": 144
},
{
"epoch": 0.42028985507246375,
"grad_norm": 0.3390309331331547,
"learning_rate": 9.690075235506374e-05,
"loss": 1.339,
"step": 145
},
{
"epoch": 0.42318840579710143,
"grad_norm": 0.33898351347591354,
"learning_rate": 9.684624227864383e-05,
"loss": 1.3774,
"step": 146
},
{
"epoch": 0.4260869565217391,
"grad_norm": 0.3229718369377447,
"learning_rate": 9.679127261041015e-05,
"loss": 1.3538,
"step": 147
},
{
"epoch": 0.4289855072463768,
"grad_norm": 0.3375751395632948,
"learning_rate": 9.673584388965058e-05,
"loss": 1.4375,
"step": 148
},
{
"epoch": 0.4318840579710145,
"grad_norm": 0.3267376187700775,
"learning_rate": 9.667995666015654e-05,
"loss": 1.4029,
"step": 149
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.34796705983800497,
"learning_rate": 9.662361147021779e-05,
"loss": 1.4493,
"step": 150
},
{
"epoch": 0.43768115942028984,
"grad_norm": 0.3182925069013053,
"learning_rate": 9.656680887261693e-05,
"loss": 1.3708,
"step": 151
},
{
"epoch": 0.4405797101449275,
"grad_norm": 0.3408199380471595,
"learning_rate": 9.650954942462401e-05,
"loss": 1.4098,
"step": 152
},
{
"epoch": 0.4434782608695652,
"grad_norm": 0.33412473685571564,
"learning_rate": 9.645183368799113e-05,
"loss": 1.4252,
"step": 153
},
{
"epoch": 0.4463768115942029,
"grad_norm": 0.3318159670621602,
"learning_rate": 9.639366222894682e-05,
"loss": 1.4233,
"step": 154
},
{
"epoch": 0.4492753623188406,
"grad_norm": 0.34440731389898754,
"learning_rate": 9.63350356181906e-05,
"loss": 1.3829,
"step": 155
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.35692903412852806,
"learning_rate": 9.627595443088724e-05,
"loss": 1.357,
"step": 156
},
{
"epoch": 0.45507246376811594,
"grad_norm": 0.33466758251653783,
"learning_rate": 9.621641924666127e-05,
"loss": 1.406,
"step": 157
},
{
"epoch": 0.4579710144927536,
"grad_norm": 0.3366286518639209,
"learning_rate": 9.615643064959122e-05,
"loss": 1.4249,
"step": 158
},
{
"epoch": 0.4608695652173913,
"grad_norm": 0.32884355157952677,
"learning_rate": 9.609598922820382e-05,
"loss": 1.4149,
"step": 159
},
{
"epoch": 0.463768115942029,
"grad_norm": 0.3323077335804954,
"learning_rate": 9.60350955754684e-05,
"loss": 1.3898,
"step": 160
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.3284011884136777,
"learning_rate": 9.597375028879088e-05,
"loss": 1.3761,
"step": 161
},
{
"epoch": 0.46956521739130436,
"grad_norm": 0.33628429126159637,
"learning_rate": 9.591195397000805e-05,
"loss": 1.4473,
"step": 162
},
{
"epoch": 0.47246376811594204,
"grad_norm": 0.3479467044598075,
"learning_rate": 9.584970722538162e-05,
"loss": 1.4025,
"step": 163
},
{
"epoch": 0.4753623188405797,
"grad_norm": 0.34445922830801295,
"learning_rate": 9.578701066559225e-05,
"loss": 1.397,
"step": 164
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.3398702574419618,
"learning_rate": 9.572386490573357e-05,
"loss": 1.3751,
"step": 165
},
{
"epoch": 0.4811594202898551,
"grad_norm": 0.31614740777820005,
"learning_rate": 9.566027056530615e-05,
"loss": 1.3098,
"step": 166
},
{
"epoch": 0.48405797101449277,
"grad_norm": 0.3444149821598331,
"learning_rate": 9.559622826821145e-05,
"loss": 1.3685,
"step": 167
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.3455185724902944,
"learning_rate": 9.553173864274567e-05,
"loss": 1.4413,
"step": 168
},
{
"epoch": 0.48985507246376814,
"grad_norm": 0.32774886376386325,
"learning_rate": 9.546680232159355e-05,
"loss": 1.4031,
"step": 169
},
{
"epoch": 0.4927536231884058,
"grad_norm": 0.32560244502643815,
"learning_rate": 9.540141994182225e-05,
"loss": 1.4364,
"step": 170
},
{
"epoch": 0.4956521739130435,
"grad_norm": 0.34398546887992665,
"learning_rate": 9.533559214487503e-05,
"loss": 1.409,
"step": 171
},
{
"epoch": 0.4985507246376812,
"grad_norm": 0.39583900001909544,
"learning_rate": 9.526931957656497e-05,
"loss": 1.4527,
"step": 172
},
{
"epoch": 0.5014492753623189,
"grad_norm": 0.4626708756395286,
"learning_rate": 9.520260288706867e-05,
"loss": 1.4624,
"step": 173
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.3664093495829884,
"learning_rate": 9.513544273091983e-05,
"loss": 1.4639,
"step": 174
},
{
"epoch": 0.5072463768115942,
"grad_norm": 0.36499531804230495,
"learning_rate": 9.506783976700285e-05,
"loss": 1.4065,
"step": 175
},
{
"epoch": 0.5101449275362319,
"grad_norm": 0.33176315803612266,
"learning_rate": 9.499979465854633e-05,
"loss": 1.3712,
"step": 176
},
{
"epoch": 0.5130434782608696,
"grad_norm": 0.31906615813652695,
"learning_rate": 9.493130807311663e-05,
"loss": 1.4081,
"step": 177
},
{
"epoch": 0.5159420289855072,
"grad_norm": 0.34052218389638056,
"learning_rate": 9.486238068261129e-05,
"loss": 1.4268,
"step": 178
},
{
"epoch": 0.518840579710145,
"grad_norm": 0.3336134893967437,
"learning_rate": 9.479301316325237e-05,
"loss": 1.4078,
"step": 179
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.3360766687427952,
"learning_rate": 9.472320619557997e-05,
"loss": 1.3766,
"step": 180
},
{
"epoch": 0.5246376811594203,
"grad_norm": 0.3221253265397745,
"learning_rate": 9.465296046444538e-05,
"loss": 1.3538,
"step": 181
},
{
"epoch": 0.527536231884058,
"grad_norm": 0.33953118483885136,
"learning_rate": 9.458227665900446e-05,
"loss": 1.3964,
"step": 182
},
{
"epoch": 0.5304347826086957,
"grad_norm": 0.33685849921565403,
"learning_rate": 9.45111554727109e-05,
"loss": 1.4249,
"step": 183
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.35947381917427984,
"learning_rate": 9.443959760330934e-05,
"loss": 1.4087,
"step": 184
},
{
"epoch": 0.5362318840579711,
"grad_norm": 0.33994296278210917,
"learning_rate": 9.436760375282859e-05,
"loss": 1.3951,
"step": 185
},
{
"epoch": 0.5391304347826087,
"grad_norm": 0.3470448028628382,
"learning_rate": 9.429517462757467e-05,
"loss": 1.3688,
"step": 186
},
{
"epoch": 0.5420289855072464,
"grad_norm": 0.33294443162653775,
"learning_rate": 9.422231093812398e-05,
"loss": 1.3679,
"step": 187
},
{
"epoch": 0.5449275362318841,
"grad_norm": 0.31454677711788814,
"learning_rate": 9.414901339931624e-05,
"loss": 1.4419,
"step": 188
},
{
"epoch": 0.5478260869565217,
"grad_norm": 0.3434839073644547,
"learning_rate": 9.407528273024752e-05,
"loss": 1.3949,
"step": 189
},
{
"epoch": 0.5507246376811594,
"grad_norm": 0.3351386886311035,
"learning_rate": 9.400111965426319e-05,
"loss": 1.4022,
"step": 190
},
{
"epoch": 0.553623188405797,
"grad_norm": 0.3358706804811382,
"learning_rate": 9.39265248989508e-05,
"loss": 1.3474,
"step": 191
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.3572071382586898,
"learning_rate": 9.385149919613292e-05,
"loss": 1.3889,
"step": 192
},
{
"epoch": 0.5594202898550724,
"grad_norm": 0.3287944467382312,
"learning_rate": 9.377604328186008e-05,
"loss": 1.3805,
"step": 193
},
{
"epoch": 0.5623188405797102,
"grad_norm": 0.36810650453304095,
"learning_rate": 9.370015789640334e-05,
"loss": 1.4075,
"step": 194
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.3868422779658168,
"learning_rate": 9.362384378424726e-05,
"loss": 1.4251,
"step": 195
},
{
"epoch": 0.5681159420289855,
"grad_norm": 0.3295019502277694,
"learning_rate": 9.354710169408243e-05,
"loss": 1.4139,
"step": 196
},
{
"epoch": 0.5710144927536231,
"grad_norm": 0.3468700259339786,
"learning_rate": 9.346993237879817e-05,
"loss": 1.366,
"step": 197
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.3397883227300112,
"learning_rate": 9.339233659547521e-05,
"loss": 1.4216,
"step": 198
},
{
"epoch": 0.5768115942028985,
"grad_norm": 0.3430862510854982,
"learning_rate": 9.331431510537816e-05,
"loss": 1.407,
"step": 199
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.3463403087156221,
"learning_rate": 9.323586867394807e-05,
"loss": 1.3894,
"step": 200
},
{
"epoch": 0.5826086956521739,
"grad_norm": 0.3280253585339611,
"learning_rate": 9.315699807079497e-05,
"loss": 1.3499,
"step": 201
},
{
"epoch": 0.5855072463768116,
"grad_norm": 0.3465548223811757,
"learning_rate": 9.30777040696903e-05,
"loss": 1.3635,
"step": 202
},
{
"epoch": 0.5884057971014492,
"grad_norm": 0.36685509209544426,
"learning_rate": 9.29979874485593e-05,
"loss": 1.4247,
"step": 203
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.3642879429079575,
"learning_rate": 9.291784898947336e-05,
"loss": 1.4265,
"step": 204
},
{
"epoch": 0.5942028985507246,
"grad_norm": 0.3369650372143289,
"learning_rate": 9.283728947864237e-05,
"loss": 1.3543,
"step": 205
},
{
"epoch": 0.5971014492753624,
"grad_norm": 0.3498733941972242,
"learning_rate": 9.275630970640705e-05,
"loss": 1.3867,
"step": 206
},
{
"epoch": 0.6,
"grad_norm": 0.3265518670612826,
"learning_rate": 9.267491046723111e-05,
"loss": 1.404,
"step": 207
},
{
"epoch": 0.6028985507246377,
"grad_norm": 0.3318790134308843,
"learning_rate": 9.259309255969354e-05,
"loss": 1.4059,
"step": 208
},
{
"epoch": 0.6057971014492753,
"grad_norm": 0.34642031197798473,
"learning_rate": 9.251085678648072e-05,
"loss": 1.4259,
"step": 209
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.3419250092734196,
"learning_rate": 9.242820395437854e-05,
"loss": 1.3711,
"step": 210
},
{
"epoch": 0.6115942028985507,
"grad_norm": 0.3461578047587994,
"learning_rate": 9.234513487426453e-05,
"loss": 1.4579,
"step": 211
},
{
"epoch": 0.6144927536231884,
"grad_norm": 0.351627952691499,
"learning_rate": 9.226165036109988e-05,
"loss": 1.4399,
"step": 212
},
{
"epoch": 0.6173913043478261,
"grad_norm": 0.3307586411986757,
"learning_rate": 9.217775123392145e-05,
"loss": 1.3946,
"step": 213
},
{
"epoch": 0.6202898550724638,
"grad_norm": 0.3354295846624239,
"learning_rate": 9.209343831583373e-05,
"loss": 1.3682,
"step": 214
},
{
"epoch": 0.6231884057971014,
"grad_norm": 0.3643294550764089,
"learning_rate": 9.200871243400073e-05,
"loss": 1.4177,
"step": 215
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.34428635756537734,
"learning_rate": 9.192357441963795e-05,
"loss": 1.4487,
"step": 216
},
{
"epoch": 0.6289855072463768,
"grad_norm": 0.33609027458329577,
"learning_rate": 9.183802510800415e-05,
"loss": 1.4307,
"step": 217
},
{
"epoch": 0.6318840579710145,
"grad_norm": 0.3563038361945473,
"learning_rate": 9.175206533839318e-05,
"loss": 1.4172,
"step": 218
},
{
"epoch": 0.6347826086956522,
"grad_norm": 0.3288387667207579,
"learning_rate": 9.166569595412575e-05,
"loss": 1.3713,
"step": 219
},
{
"epoch": 0.6376811594202898,
"grad_norm": 0.34157440710913767,
"learning_rate": 9.157891780254117e-05,
"loss": 1.3679,
"step": 220
},
{
"epoch": 0.6405797101449275,
"grad_norm": 0.3151382251052811,
"learning_rate": 9.1491731734989e-05,
"loss": 1.3795,
"step": 221
},
{
"epoch": 0.6434782608695652,
"grad_norm": 0.33817165115588743,
"learning_rate": 9.140413860682073e-05,
"loss": 1.3586,
"step": 222
},
{
"epoch": 0.6463768115942029,
"grad_norm": 0.3277750425977871,
"learning_rate": 9.131613927738138e-05,
"loss": 1.3885,
"step": 223
},
{
"epoch": 0.6492753623188405,
"grad_norm": 0.31658312922359383,
"learning_rate": 9.122773461000103e-05,
"loss": 1.4149,
"step": 224
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.3193871223544036,
"learning_rate": 9.113892547198643e-05,
"loss": 1.322,
"step": 225
},
{
"epoch": 0.6550724637681159,
"grad_norm": 0.3302835747056366,
"learning_rate": 9.104971273461243e-05,
"loss": 1.3769,
"step": 226
},
{
"epoch": 0.6579710144927536,
"grad_norm": 0.3186189847015454,
"learning_rate": 9.096009727311347e-05,
"loss": 1.3406,
"step": 227
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.3389034868184038,
"learning_rate": 9.087007996667494e-05,
"loss": 1.3658,
"step": 228
},
{
"epoch": 0.663768115942029,
"grad_norm": 0.33474986537379237,
"learning_rate": 9.077966169842459e-05,
"loss": 1.3651,
"step": 229
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.3556022501007949,
"learning_rate": 9.068884335542389e-05,
"loss": 1.4237,
"step": 230
},
{
"epoch": 0.6695652173913044,
"grad_norm": 0.3216681338623573,
"learning_rate": 9.05976258286593e-05,
"loss": 1.3785,
"step": 231
},
{
"epoch": 0.672463768115942,
"grad_norm": 0.33533701380419384,
"learning_rate": 9.05060100130335e-05,
"loss": 1.4665,
"step": 232
},
{
"epoch": 0.6753623188405797,
"grad_norm": 0.3314963078807375,
"learning_rate": 9.041399680735664e-05,
"loss": 1.4036,
"step": 233
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.33542193989045377,
"learning_rate": 9.03215871143376e-05,
"loss": 1.4348,
"step": 234
},
{
"epoch": 0.6811594202898551,
"grad_norm": 0.3547005064725891,
"learning_rate": 9.022878184057492e-05,
"loss": 1.4272,
"step": 235
},
{
"epoch": 0.6840579710144927,
"grad_norm": 0.33291554897811426,
"learning_rate": 9.013558189654819e-05,
"loss": 1.4591,
"step": 236
},
{
"epoch": 0.6869565217391305,
"grad_norm": 0.3379014298685863,
"learning_rate": 9.004198819660885e-05,
"loss": 1.4567,
"step": 237
},
{
"epoch": 0.6898550724637681,
"grad_norm": 0.3297563945475019,
"learning_rate": 8.99480016589714e-05,
"loss": 1.3799,
"step": 238
},
{
"epoch": 0.6927536231884058,
"grad_norm": 0.34042084947510615,
"learning_rate": 8.985362320570432e-05,
"loss": 1.3697,
"step": 239
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.3374245817202305,
"learning_rate": 8.975885376272102e-05,
"loss": 1.4046,
"step": 240
},
{
"epoch": 0.6985507246376812,
"grad_norm": 0.3732847854755435,
"learning_rate": 8.966369425977082e-05,
"loss": 1.3491,
"step": 241
},
{
"epoch": 0.7014492753623188,
"grad_norm": 0.35958390600115686,
"learning_rate": 8.956814563042968e-05,
"loss": 1.3671,
"step": 242
},
{
"epoch": 0.7043478260869566,
"grad_norm": 0.3572722721866322,
"learning_rate": 8.947220881209126e-05,
"loss": 1.4003,
"step": 243
},
{
"epoch": 0.7072463768115942,
"grad_norm": 0.34273191632214844,
"learning_rate": 8.937588474595753e-05,
"loss": 1.4104,
"step": 244
},
{
"epoch": 0.7101449275362319,
"grad_norm": 0.34878139471777386,
"learning_rate": 8.927917437702962e-05,
"loss": 1.3896,
"step": 245
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.33111504592475566,
"learning_rate": 8.918207865409856e-05,
"loss": 1.3313,
"step": 246
},
{
"epoch": 0.7159420289855073,
"grad_norm": 0.3438939035436239,
"learning_rate": 8.908459852973594e-05,
"loss": 1.3429,
"step": 247
},
{
"epoch": 0.7188405797101449,
"grad_norm": 0.3312679125692785,
"learning_rate": 8.898673496028456e-05,
"loss": 1.4395,
"step": 248
},
{
"epoch": 0.7217391304347827,
"grad_norm": 0.34484942367124294,
"learning_rate": 8.888848890584907e-05,
"loss": 1.3712,
"step": 249
},
{
"epoch": 0.7246376811594203,
"grad_norm": 0.340709492347014,
"learning_rate": 8.878986133028657e-05,
"loss": 1.37,
"step": 250
},
{
"epoch": 0.7275362318840579,
"grad_norm": 0.33398944764147226,
"learning_rate": 8.86908532011971e-05,
"loss": 1.3892,
"step": 251
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.35175222311902715,
"learning_rate": 8.85914654899142e-05,
"loss": 1.4108,
"step": 252
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.3484995200225896,
"learning_rate": 8.849169917149531e-05,
"loss": 1.3833,
"step": 253
},
{
"epoch": 0.736231884057971,
"grad_norm": 0.3532075346234238,
"learning_rate": 8.839155522471232e-05,
"loss": 1.313,
"step": 254
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.32136667953567727,
"learning_rate": 8.829103463204182e-05,
"loss": 1.3504,
"step": 255
},
{
"epoch": 0.7420289855072464,
"grad_norm": 0.3229081190755409,
"learning_rate": 8.81901383796556e-05,
"loss": 1.3771,
"step": 256
},
{
"epoch": 0.744927536231884,
"grad_norm": 0.3440518639418747,
"learning_rate": 8.808886745741089e-05,
"loss": 1.4158,
"step": 257
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.3352706545420464,
"learning_rate": 8.798722285884066e-05,
"loss": 1.4394,
"step": 258
},
{
"epoch": 0.7507246376811594,
"grad_norm": 0.33559926414830077,
"learning_rate": 8.788520558114391e-05,
"loss": 1.3911,
"step": 259
},
{
"epoch": 0.7536231884057971,
"grad_norm": 0.3216071156149776,
"learning_rate": 8.778281662517583e-05,
"loss": 1.429,
"step": 260
},
{
"epoch": 0.7565217391304347,
"grad_norm": 0.32211563215549827,
"learning_rate": 8.768005699543806e-05,
"loss": 1.3127,
"step": 261
},
{
"epoch": 0.7594202898550725,
"grad_norm": 0.34108464165661373,
"learning_rate": 8.757692770006876e-05,
"loss": 1.3773,
"step": 262
},
{
"epoch": 0.7623188405797101,
"grad_norm": 0.32535926486459094,
"learning_rate": 8.747342975083272e-05,
"loss": 1.3664,
"step": 263
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.33852048574771015,
"learning_rate": 8.736956416311154e-05,
"loss": 1.3663,
"step": 264
},
{
"epoch": 0.7681159420289855,
"grad_norm": 0.33710327017540265,
"learning_rate": 8.72653319558935e-05,
"loss": 1.4091,
"step": 265
},
{
"epoch": 0.7710144927536232,
"grad_norm": 0.3529196648547696,
"learning_rate": 8.716073415176374e-05,
"loss": 1.442,
"step": 266
},
{
"epoch": 0.7739130434782608,
"grad_norm": 0.34337677669937877,
"learning_rate": 8.705577177689403e-05,
"loss": 1.3316,
"step": 267
},
{
"epoch": 0.7768115942028986,
"grad_norm": 0.3354333510851631,
"learning_rate": 8.695044586103296e-05,
"loss": 1.3616,
"step": 268
},
{
"epoch": 0.7797101449275362,
"grad_norm": 0.3479441013536178,
"learning_rate": 8.684475743749556e-05,
"loss": 1.395,
"step": 269
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.37463973489254887,
"learning_rate": 8.673870754315336e-05,
"loss": 1.401,
"step": 270
},
{
"epoch": 0.7855072463768116,
"grad_norm": 0.31175117798278007,
"learning_rate": 8.663229721842415e-05,
"loss": 1.3223,
"step": 271
},
{
"epoch": 0.7884057971014493,
"grad_norm": 0.38303494453595516,
"learning_rate": 8.652552750726175e-05,
"loss": 1.4301,
"step": 272
},
{
"epoch": 0.7913043478260869,
"grad_norm": 0.3573014147864106,
"learning_rate": 8.64183994571458e-05,
"loss": 1.4263,
"step": 273
},
{
"epoch": 0.7942028985507247,
"grad_norm": 0.3211993716597447,
"learning_rate": 8.631091411907149e-05,
"loss": 1.3578,
"step": 274
},
{
"epoch": 0.7971014492753623,
"grad_norm": 0.37834773248299663,
"learning_rate": 8.620307254753923e-05,
"loss": 1.3745,
"step": 275
},
{
"epoch": 0.8,
"grad_norm": 0.31593418933802786,
"learning_rate": 8.609487580054428e-05,
"loss": 1.3654,
"step": 276
},
{
"epoch": 0.8028985507246377,
"grad_norm": 0.31504634745000243,
"learning_rate": 8.598632493956644e-05,
"loss": 1.4,
"step": 277
},
{
"epoch": 0.8057971014492754,
"grad_norm": 0.3384470107062998,
"learning_rate": 8.58774210295596e-05,
"loss": 1.3941,
"step": 278
},
{
"epoch": 0.808695652173913,
"grad_norm": 0.3260030165566468,
"learning_rate": 8.576816513894125e-05,
"loss": 1.348,
"step": 279
},
{
"epoch": 0.8115942028985508,
"grad_norm": 0.3527150892760629,
"learning_rate": 8.565855833958206e-05,
"loss": 1.4058,
"step": 280
},
{
"epoch": 0.8144927536231884,
"grad_norm": 0.3861860908831136,
"learning_rate": 8.554860170679534e-05,
"loss": 1.4282,
"step": 281
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.3137903423216692,
"learning_rate": 8.543829631932649e-05,
"loss": 1.352,
"step": 282
},
{
"epoch": 0.8202898550724638,
"grad_norm": 0.34862718728490294,
"learning_rate": 8.532764325934239e-05,
"loss": 1.4282,
"step": 283
},
{
"epoch": 0.8231884057971014,
"grad_norm": 0.3150871399912744,
"learning_rate": 8.521664361242089e-05,
"loss": 1.3802,
"step": 284
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.3107741737666529,
"learning_rate": 8.510529846753998e-05,
"loss": 1.4077,
"step": 285
},
{
"epoch": 0.8289855072463768,
"grad_norm": 0.33269493424037233,
"learning_rate": 8.499360891706729e-05,
"loss": 1.3348,
"step": 286
},
{
"epoch": 0.8318840579710145,
"grad_norm": 0.31493592697757294,
"learning_rate": 8.488157605674925e-05,
"loss": 1.3418,
"step": 287
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.3328720547121984,
"learning_rate": 8.476920098570036e-05,
"loss": 1.3832,
"step": 288
},
{
"epoch": 0.8376811594202899,
"grad_norm": 0.3157756166632203,
"learning_rate": 8.465648480639248e-05,
"loss": 1.3274,
"step": 289
},
{
"epoch": 0.8405797101449275,
"grad_norm": 0.33662897796614577,
"learning_rate": 8.454342862464395e-05,
"loss": 1.3086,
"step": 290
},
{
"epoch": 0.8434782608695652,
"grad_norm": 0.3272252672648793,
"learning_rate": 8.443003354960872e-05,
"loss": 1.4232,
"step": 291
},
{
"epoch": 0.8463768115942029,
"grad_norm": 0.35218283346681617,
"learning_rate": 8.431630069376552e-05,
"loss": 1.4371,
"step": 292
},
{
"epoch": 0.8492753623188406,
"grad_norm": 0.3436413205889393,
"learning_rate": 8.420223117290695e-05,
"loss": 1.3696,
"step": 293
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.34426616560941314,
"learning_rate": 8.408782610612849e-05,
"loss": 1.4137,
"step": 294
},
{
"epoch": 0.855072463768116,
"grad_norm": 0.31419677902933213,
"learning_rate": 8.39730866158175e-05,
"loss": 1.3294,
"step": 295
},
{
"epoch": 0.8579710144927536,
"grad_norm": 0.31097415762768543,
"learning_rate": 8.385801382764233e-05,
"loss": 1.3796,
"step": 296
},
{
"epoch": 0.8608695652173913,
"grad_norm": 0.3351050938384504,
"learning_rate": 8.374260887054116e-05,
"loss": 1.4819,
"step": 297
},
{
"epoch": 0.863768115942029,
"grad_norm": 0.3151109176190777,
"learning_rate": 8.362687287671094e-05,
"loss": 1.3711,
"step": 298
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.337074633378245,
"learning_rate": 8.351080698159632e-05,
"loss": 1.3923,
"step": 299
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.3371311952402845,
"learning_rate": 8.339441232387853e-05,
"loss": 1.3789,
"step": 300
},
{
"epoch": 0.8724637681159421,
"grad_norm": 0.3356424382906388,
"learning_rate": 8.32776900454641e-05,
"loss": 1.4003,
"step": 301
},
{
"epoch": 0.8753623188405797,
"grad_norm": 0.33796299079575864,
"learning_rate": 8.31606412914738e-05,
"loss": 1.4341,
"step": 302
},
{
"epoch": 0.8782608695652174,
"grad_norm": 0.32018941976781934,
"learning_rate": 8.30432672102313e-05,
"loss": 1.4523,
"step": 303
},
{
"epoch": 0.881159420289855,
"grad_norm": 0.3368637827820196,
"learning_rate": 8.292556895325194e-05,
"loss": 1.3903,
"step": 304
},
{
"epoch": 0.8840579710144928,
"grad_norm": 0.31352167875853487,
"learning_rate": 8.280754767523144e-05,
"loss": 1.3581,
"step": 305
},
{
"epoch": 0.8869565217391304,
"grad_norm": 0.31484573995633375,
"learning_rate": 8.268920453403457e-05,
"loss": 1.3967,
"step": 306
},
{
"epoch": 0.8898550724637682,
"grad_norm": 0.31504188464216054,
"learning_rate": 8.257054069068374e-05,
"loss": 1.3985,
"step": 307
},
{
"epoch": 0.8927536231884058,
"grad_norm": 0.32015281024694753,
"learning_rate": 8.245155730934777e-05,
"loss": 1.3273,
"step": 308
},
{
"epoch": 0.8956521739130435,
"grad_norm": 0.3183790437483911,
"learning_rate": 8.233225555733022e-05,
"loss": 1.2672,
"step": 309
},
{
"epoch": 0.8985507246376812,
"grad_norm": 0.32150150116629717,
"learning_rate": 8.221263660505813e-05,
"loss": 1.3995,
"step": 310
},
{
"epoch": 0.9014492753623189,
"grad_norm": 0.3132580361772673,
"learning_rate": 8.20927016260705e-05,
"loss": 1.3899,
"step": 311
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.3557171808896923,
"learning_rate": 8.197245179700673e-05,
"loss": 1.3861,
"step": 312
},
{
"epoch": 0.9072463768115943,
"grad_norm": 0.32080932799331907,
"learning_rate": 8.185188829759505e-05,
"loss": 1.2657,
"step": 313
},
{
"epoch": 0.9101449275362319,
"grad_norm": 0.33323239514109537,
"learning_rate": 8.173101231064113e-05,
"loss": 1.331,
"step": 314
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.33932442141864444,
"learning_rate": 8.160982502201624e-05,
"loss": 1.3583,
"step": 315
},
{
"epoch": 0.9159420289855073,
"grad_norm": 0.41517663636078217,
"learning_rate": 8.148832762064573e-05,
"loss": 1.4196,
"step": 316
},
{
"epoch": 0.9188405797101449,
"grad_norm": 0.3479488422667109,
"learning_rate": 8.136652129849738e-05,
"loss": 1.3765,
"step": 317
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.3250773691234272,
"learning_rate": 8.124440725056969e-05,
"loss": 1.3998,
"step": 318
},
{
"epoch": 0.9246376811594202,
"grad_norm": 0.630703005417282,
"learning_rate": 8.112198667488012e-05,
"loss": 1.2986,
"step": 319
},
{
"epoch": 0.927536231884058,
"grad_norm": 0.34656213869069796,
"learning_rate": 8.099926077245337e-05,
"loss": 1.4085,
"step": 320
},
{
"epoch": 0.9304347826086956,
"grad_norm": 0.3595735041645428,
"learning_rate": 8.08762307473096e-05,
"loss": 1.3973,
"step": 321
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.3492788413407257,
"learning_rate": 8.075289780645264e-05,
"loss": 1.3912,
"step": 322
},
{
"epoch": 0.936231884057971,
"grad_norm": 0.3576330587050802,
"learning_rate": 8.062926315985803e-05,
"loss": 1.4256,
"step": 323
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.3410475477414221,
"learning_rate": 8.050532802046135e-05,
"loss": 1.3586,
"step": 324
},
{
"epoch": 0.9420289855072463,
"grad_norm": 0.32056313028041444,
"learning_rate": 8.038109360414614e-05,
"loss": 1.3443,
"step": 325
},
{
"epoch": 0.9449275362318841,
"grad_norm": 0.32894846650068166,
"learning_rate": 8.025656112973202e-05,
"loss": 1.3798,
"step": 326
},
{
"epoch": 0.9478260869565217,
"grad_norm": 0.3255639658134978,
"learning_rate": 8.013173181896283e-05,
"loss": 1.3383,
"step": 327
},
{
"epoch": 0.9507246376811594,
"grad_norm": 0.31966797580007494,
"learning_rate": 8.000660689649449e-05,
"loss": 1.3544,
"step": 328
},
{
"epoch": 0.9536231884057971,
"grad_norm": 0.32692090968009707,
"learning_rate": 7.98811875898831e-05,
"loss": 1.4088,
"step": 329
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.3372144496418016,
"learning_rate": 7.975547512957285e-05,
"loss": 1.4309,
"step": 330
},
{
"epoch": 0.9594202898550724,
"grad_norm": 0.3246412166131606,
"learning_rate": 7.962947074888394e-05,
"loss": 1.3916,
"step": 331
},
{
"epoch": 0.9623188405797102,
"grad_norm": 0.34634645274643355,
"learning_rate": 7.950317568400054e-05,
"loss": 1.4104,
"step": 332
},
{
"epoch": 0.9652173913043478,
"grad_norm": 0.3256987549913797,
"learning_rate": 7.937659117395858e-05,
"loss": 1.3544,
"step": 333
},
{
"epoch": 0.9681159420289855,
"grad_norm": 0.33356722481281487,
"learning_rate": 7.924971846063365e-05,
"loss": 1.342,
"step": 334
},
{
"epoch": 0.9710144927536232,
"grad_norm": 0.3260083753687772,
"learning_rate": 7.912255878872878e-05,
"loss": 1.4006,
"step": 335
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.3768462741234547,
"learning_rate": 7.899511340576229e-05,
"loss": 1.4014,
"step": 336
},
{
"epoch": 0.9768115942028985,
"grad_norm": 0.33594184989494874,
"learning_rate": 7.886738356205546e-05,
"loss": 1.3538,
"step": 337
},
{
"epoch": 0.9797101449275363,
"grad_norm": 0.3538141580905989,
"learning_rate": 7.873937051072035e-05,
"loss": 1.4112,
"step": 338
},
{
"epoch": 0.9826086956521739,
"grad_norm": 0.33768085173175694,
"learning_rate": 7.861107550764744e-05,
"loss": 1.4318,
"step": 339
},
{
"epoch": 0.9855072463768116,
"grad_norm": 0.3103190809712041,
"learning_rate": 7.848249981149338e-05,
"loss": 1.3934,
"step": 340
},
{
"epoch": 0.9884057971014493,
"grad_norm": 0.35049170901785537,
"learning_rate": 7.835364468366856e-05,
"loss": 1.3604,
"step": 341
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.32828748932738266,
"learning_rate": 7.822451138832478e-05,
"loss": 1.3985,
"step": 342
},
{
"epoch": 0.9942028985507246,
"grad_norm": 0.33349918656348,
"learning_rate": 7.809510119234287e-05,
"loss": 1.4051,
"step": 343
},
{
"epoch": 0.9971014492753624,
"grad_norm": 0.31203624586969825,
"learning_rate": 7.796541536532019e-05,
"loss": 1.4114,
"step": 344
},
{
"epoch": 1.0,
"grad_norm": 0.3240751813149832,
"learning_rate": 7.783545517955826e-05,
"loss": 1.3441,
"step": 345
},
{
"epoch": 1.0028985507246377,
"grad_norm": 0.3039393246768782,
"learning_rate": 7.77052219100502e-05,
"loss": 1.2368,
"step": 346
},
{
"epoch": 1.0057971014492753,
"grad_norm": 0.31372425053284514,
"learning_rate": 7.757471683446833e-05,
"loss": 1.1765,
"step": 347
},
{
"epoch": 1.008695652173913,
"grad_norm": 0.2985654423691086,
"learning_rate": 7.744394123315146e-05,
"loss": 1.2387,
"step": 348
},
{
"epoch": 1.0115942028985507,
"grad_norm": 0.30668006943966447,
"learning_rate": 7.731289638909248e-05,
"loss": 1.2512,
"step": 349
},
{
"epoch": 1.0144927536231885,
"grad_norm": 0.3297662794021686,
"learning_rate": 7.718158358792574e-05,
"loss": 1.2466,
"step": 350
},
{
"epoch": 1.017391304347826,
"grad_norm": 0.36571397703464864,
"learning_rate": 7.705000411791441e-05,
"loss": 1.2095,
"step": 351
},
{
"epoch": 1.0202898550724637,
"grad_norm": 0.36789475981765535,
"learning_rate": 7.691815926993785e-05,
"loss": 1.2127,
"step": 352
},
{
"epoch": 1.0231884057971015,
"grad_norm": 0.34691008452093475,
"learning_rate": 7.678605033747894e-05,
"loss": 1.1754,
"step": 353
},
{
"epoch": 1.0260869565217392,
"grad_norm": 0.3381901577900874,
"learning_rate": 7.665367861661142e-05,
"loss": 1.2585,
"step": 354
},
{
"epoch": 1.0289855072463767,
"grad_norm": 0.3456016883168296,
"learning_rate": 7.652104540598712e-05,
"loss": 1.2565,
"step": 355
},
{
"epoch": 1.0318840579710145,
"grad_norm": 0.3340793379287121,
"learning_rate": 7.638815200682331e-05,
"loss": 1.286,
"step": 356
},
{
"epoch": 1.0347826086956522,
"grad_norm": 0.3329632889293724,
"learning_rate": 7.62549997228898e-05,
"loss": 1.2579,
"step": 357
},
{
"epoch": 1.03768115942029,
"grad_norm": 0.32945204903041203,
"learning_rate": 7.612158986049632e-05,
"loss": 1.1978,
"step": 358
},
{
"epoch": 1.0405797101449274,
"grad_norm": 0.3240289810339555,
"learning_rate": 7.598792372847952e-05,
"loss": 1.1871,
"step": 359
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.3497054137706393,
"learning_rate": 7.585400263819025e-05,
"loss": 1.2407,
"step": 360
},
{
"epoch": 1.046376811594203,
"grad_norm": 0.3334051709529727,
"learning_rate": 7.571982790348071e-05,
"loss": 1.2475,
"step": 361
},
{
"epoch": 1.0492753623188407,
"grad_norm": 0.3216924338385901,
"learning_rate": 7.558540084069145e-05,
"loss": 1.2178,
"step": 362
},
{
"epoch": 1.0521739130434782,
"grad_norm": 0.3770387844464867,
"learning_rate": 7.545072276863858e-05,
"loss": 1.2979,
"step": 363
},
{
"epoch": 1.055072463768116,
"grad_norm": 0.33349794524452664,
"learning_rate": 7.531579500860069e-05,
"loss": 1.2679,
"step": 364
},
{
"epoch": 1.0579710144927537,
"grad_norm": 0.3410677559200434,
"learning_rate": 7.518061888430609e-05,
"loss": 1.3029,
"step": 365
},
{
"epoch": 1.0608695652173914,
"grad_norm": 0.32421257826543254,
"learning_rate": 7.50451957219196e-05,
"loss": 1.2383,
"step": 366
},
{
"epoch": 1.063768115942029,
"grad_norm": 0.33207438928525995,
"learning_rate": 7.490952685002965e-05,
"loss": 1.2317,
"step": 367
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.32506432414586334,
"learning_rate": 7.477361359963533e-05,
"loss": 1.1661,
"step": 368
},
{
"epoch": 1.0695652173913044,
"grad_norm": 0.32495557198051783,
"learning_rate": 7.463745730413313e-05,
"loss": 1.2343,
"step": 369
},
{
"epoch": 1.0724637681159421,
"grad_norm": 0.33951747813529576,
"learning_rate": 7.450105929930403e-05,
"loss": 1.1765,
"step": 370
},
{
"epoch": 1.0753623188405796,
"grad_norm": 0.3960232594734765,
"learning_rate": 7.436442092330033e-05,
"loss": 1.1708,
"step": 371
},
{
"epoch": 1.0782608695652174,
"grad_norm": 0.34965839265944354,
"learning_rate": 7.422754351663252e-05,
"loss": 1.1557,
"step": 372
},
{
"epoch": 1.0811594202898551,
"grad_norm": 0.3465625398151273,
"learning_rate": 7.409042842215611e-05,
"loss": 1.2163,
"step": 373
},
{
"epoch": 1.0840579710144929,
"grad_norm": 0.3441278544713875,
"learning_rate": 7.395307698505851e-05,
"loss": 1.2522,
"step": 374
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.34316475519905354,
"learning_rate": 7.381549055284582e-05,
"loss": 1.2401,
"step": 375
},
{
"epoch": 1.0898550724637681,
"grad_norm": 0.3468405311381756,
"learning_rate": 7.367767047532955e-05,
"loss": 1.2297,
"step": 376
},
{
"epoch": 1.0927536231884059,
"grad_norm": 0.35424537263860967,
"learning_rate": 7.353961810461343e-05,
"loss": 1.1903,
"step": 377
},
{
"epoch": 1.0956521739130434,
"grad_norm": 0.35865745036758906,
"learning_rate": 7.340133479508015e-05,
"loss": 1.2238,
"step": 378
},
{
"epoch": 1.098550724637681,
"grad_norm": 0.33961205561899227,
"learning_rate": 7.326282190337807e-05,
"loss": 1.2353,
"step": 379
},
{
"epoch": 1.1014492753623188,
"grad_norm": 0.3410877787281011,
"learning_rate": 7.312408078840788e-05,
"loss": 1.1938,
"step": 380
},
{
"epoch": 1.1043478260869566,
"grad_norm": 0.3261974323058093,
"learning_rate": 7.298511281130928e-05,
"loss": 1.2283,
"step": 381
},
{
"epoch": 1.107246376811594,
"grad_norm": 0.3375439427532852,
"learning_rate": 7.284591933544764e-05,
"loss": 1.166,
"step": 382
},
{
"epoch": 1.1101449275362318,
"grad_norm": 0.34226748130902523,
"learning_rate": 7.270650172640065e-05,
"loss": 1.2268,
"step": 383
},
{
"epoch": 1.1130434782608696,
"grad_norm": 0.34975018354668974,
"learning_rate": 7.256686135194483e-05,
"loss": 1.2753,
"step": 384
},
{
"epoch": 1.1159420289855073,
"grad_norm": 0.36870818906061614,
"learning_rate": 7.242699958204225e-05,
"loss": 1.2427,
"step": 385
},
{
"epoch": 1.1188405797101448,
"grad_norm": 0.35097638947331306,
"learning_rate": 7.228691778882693e-05,
"loss": 1.2588,
"step": 386
},
{
"epoch": 1.1217391304347826,
"grad_norm": 0.35715131379127846,
"learning_rate": 7.21466173465915e-05,
"loss": 1.2349,
"step": 387
},
{
"epoch": 1.1246376811594203,
"grad_norm": 0.3554441755613845,
"learning_rate": 7.200609963177367e-05,
"loss": 1.2218,
"step": 388
},
{
"epoch": 1.127536231884058,
"grad_norm": 0.35332606995255955,
"learning_rate": 7.186536602294278e-05,
"loss": 1.233,
"step": 389
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.34659479615561295,
"learning_rate": 7.172441790078614e-05,
"loss": 1.2277,
"step": 390
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.3634661952802433,
"learning_rate": 7.158325664809566e-05,
"loss": 1.1815,
"step": 391
},
{
"epoch": 1.136231884057971,
"grad_norm": 0.3483946097126382,
"learning_rate": 7.144188364975415e-05,
"loss": 1.2296,
"step": 392
},
{
"epoch": 1.1391304347826088,
"grad_norm": 0.3458491663438552,
"learning_rate": 7.130030029272179e-05,
"loss": 1.2762,
"step": 393
},
{
"epoch": 1.1420289855072463,
"grad_norm": 0.36175639738964943,
"learning_rate": 7.11585079660225e-05,
"loss": 1.1942,
"step": 394
},
{
"epoch": 1.144927536231884,
"grad_norm": 0.3593818284728034,
"learning_rate": 7.101650806073038e-05,
"loss": 1.2068,
"step": 395
},
{
"epoch": 1.1478260869565218,
"grad_norm": 0.334166827563346,
"learning_rate": 7.087430196995593e-05,
"loss": 1.1819,
"step": 396
},
{
"epoch": 1.1507246376811595,
"grad_norm": 0.3636336066976543,
"learning_rate": 7.073189108883255e-05,
"loss": 1.2438,
"step": 397
},
{
"epoch": 1.153623188405797,
"grad_norm": 0.35550038414146484,
"learning_rate": 7.058927681450269e-05,
"loss": 1.2546,
"step": 398
},
{
"epoch": 1.1565217391304348,
"grad_norm": 0.3638989954332178,
"learning_rate": 7.044646054610426e-05,
"loss": 1.2817,
"step": 399
},
{
"epoch": 1.1594202898550725,
"grad_norm": 0.36528513619908154,
"learning_rate": 7.030344368475684e-05,
"loss": 1.2634,
"step": 400
},
{
"epoch": 1.1623188405797102,
"grad_norm": 0.348052355901968,
"learning_rate": 7.016022763354798e-05,
"loss": 1.2002,
"step": 401
},
{
"epoch": 1.1652173913043478,
"grad_norm": 0.3595684193169886,
"learning_rate": 7.00168137975194e-05,
"loss": 1.1864,
"step": 402
},
{
"epoch": 1.1681159420289855,
"grad_norm": 0.35070589944718533,
"learning_rate": 6.98732035836532e-05,
"loss": 1.1749,
"step": 403
},
{
"epoch": 1.1710144927536232,
"grad_norm": 0.3583364136698803,
"learning_rate": 6.972939840085809e-05,
"loss": 1.2362,
"step": 404
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.3411795291050965,
"learning_rate": 6.958539965995558e-05,
"loss": 1.2365,
"step": 405
},
{
"epoch": 1.1768115942028985,
"grad_norm": 0.37126831887596484,
"learning_rate": 6.944120877366604e-05,
"loss": 1.2547,
"step": 406
},
{
"epoch": 1.1797101449275362,
"grad_norm": 0.3615486523323878,
"learning_rate": 6.929682715659496e-05,
"loss": 1.2008,
"step": 407
},
{
"epoch": 1.182608695652174,
"grad_norm": 0.3495522144501781,
"learning_rate": 6.915225622521901e-05,
"loss": 1.2137,
"step": 408
},
{
"epoch": 1.1855072463768117,
"grad_norm": 0.34558559090876845,
"learning_rate": 6.900749739787216e-05,
"loss": 1.1948,
"step": 409
},
{
"epoch": 1.1884057971014492,
"grad_norm": 0.3534560464350228,
"learning_rate": 6.886255209473174e-05,
"loss": 1.2296,
"step": 410
},
{
"epoch": 1.191304347826087,
"grad_norm": 0.38654103329628986,
"learning_rate": 6.871742173780458e-05,
"loss": 1.2375,
"step": 411
},
{
"epoch": 1.1942028985507247,
"grad_norm": 0.4990410023234168,
"learning_rate": 6.857210775091292e-05,
"loss": 1.1972,
"step": 412
},
{
"epoch": 1.1971014492753622,
"grad_norm": 0.3283618367174733,
"learning_rate": 6.842661155968062e-05,
"loss": 1.2236,
"step": 413
},
{
"epoch": 1.2,
"grad_norm": 0.3501614388462517,
"learning_rate": 6.828093459151902e-05,
"loss": 1.2599,
"step": 414
},
{
"epoch": 1.2028985507246377,
"grad_norm": 0.3566983584982769,
"learning_rate": 6.813507827561301e-05,
"loss": 1.2592,
"step": 415
},
{
"epoch": 1.2057971014492754,
"grad_norm": 0.35438824536081337,
"learning_rate": 6.798904404290703e-05,
"loss": 1.219,
"step": 416
},
{
"epoch": 1.208695652173913,
"grad_norm": 0.36738665957897987,
"learning_rate": 6.784283332609096e-05,
"loss": 1.2787,
"step": 417
},
{
"epoch": 1.2115942028985507,
"grad_norm": 0.3618484779747058,
"learning_rate": 6.769644755958614e-05,
"loss": 1.2557,
"step": 418
},
{
"epoch": 1.2144927536231884,
"grad_norm": 0.3475615543784353,
"learning_rate": 6.754988817953121e-05,
"loss": 1.2519,
"step": 419
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.3498171433494951,
"learning_rate": 6.740315662376808e-05,
"loss": 1.1832,
"step": 420
},
{
"epoch": 1.2202898550724637,
"grad_norm": 0.3485237559097342,
"learning_rate": 6.725625433182788e-05,
"loss": 1.1686,
"step": 421
},
{
"epoch": 1.2231884057971014,
"grad_norm": 0.3365638116771253,
"learning_rate": 6.710918274491668e-05,
"loss": 1.161,
"step": 422
},
{
"epoch": 1.2260869565217392,
"grad_norm": 0.339262847480053,
"learning_rate": 6.696194330590151e-05,
"loss": 1.3032,
"step": 423
},
{
"epoch": 1.228985507246377,
"grad_norm": 0.3695849544204241,
"learning_rate": 6.681453745929613e-05,
"loss": 1.2505,
"step": 424
},
{
"epoch": 1.2318840579710144,
"grad_norm": 0.3810556641153086,
"learning_rate": 6.666696665124682e-05,
"loss": 1.2176,
"step": 425
},
{
"epoch": 1.2347826086956522,
"grad_norm": 0.3794002652671474,
"learning_rate": 6.651923232951829e-05,
"loss": 1.2922,
"step": 426
},
{
"epoch": 1.23768115942029,
"grad_norm": 0.37219002176219357,
"learning_rate": 6.637133594347938e-05,
"loss": 1.2919,
"step": 427
},
{
"epoch": 1.2405797101449276,
"grad_norm": 0.3748146640073023,
"learning_rate": 6.62232789440889e-05,
"loss": 1.2549,
"step": 428
},
{
"epoch": 1.2434782608695651,
"grad_norm": 0.3431018972364436,
"learning_rate": 6.607506278388144e-05,
"loss": 1.1907,
"step": 429
},
{
"epoch": 1.2463768115942029,
"grad_norm": 0.3685201234625515,
"learning_rate": 6.592668891695298e-05,
"loss": 1.2368,
"step": 430
},
{
"epoch": 1.2492753623188406,
"grad_norm": 0.3638027931128809,
"learning_rate": 6.57781587989467e-05,
"loss": 1.2695,
"step": 431
},
{
"epoch": 1.2521739130434781,
"grad_norm": 0.3392431416089568,
"learning_rate": 6.562947388703879e-05,
"loss": 1.2651,
"step": 432
},
{
"epoch": 1.2550724637681159,
"grad_norm": 0.3523863327979242,
"learning_rate": 6.548063563992397e-05,
"loss": 1.2633,
"step": 433
},
{
"epoch": 1.2579710144927536,
"grad_norm": 0.3773185628146933,
"learning_rate": 6.533164551780134e-05,
"loss": 1.2669,
"step": 434
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.37080955852894376,
"learning_rate": 6.518250498235996e-05,
"loss": 1.2055,
"step": 435
},
{
"epoch": 1.263768115942029,
"grad_norm": 0.3610115012833989,
"learning_rate": 6.50332154967646e-05,
"loss": 1.2558,
"step": 436
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.36419810462728663,
"learning_rate": 6.488377852564125e-05,
"loss": 1.2273,
"step": 437
},
{
"epoch": 1.2695652173913043,
"grad_norm": 0.36955352159431015,
"learning_rate": 6.473419553506285e-05,
"loss": 1.1592,
"step": 438
},
{
"epoch": 1.272463768115942,
"grad_norm": 0.4000451451417096,
"learning_rate": 6.45844679925349e-05,
"loss": 1.2585,
"step": 439
},
{
"epoch": 1.2753623188405796,
"grad_norm": 0.3674813225161034,
"learning_rate": 6.443459736698105e-05,
"loss": 1.207,
"step": 440
},
{
"epoch": 1.2782608695652173,
"grad_norm": 0.36342273693767024,
"learning_rate": 6.428458512872868e-05,
"loss": 1.207,
"step": 441
},
{
"epoch": 1.281159420289855,
"grad_norm": 0.3772811021851,
"learning_rate": 6.413443274949446e-05,
"loss": 1.249,
"step": 442
},
{
"epoch": 1.2840579710144928,
"grad_norm": 0.3574482885159096,
"learning_rate": 6.398414170237001e-05,
"loss": 1.2111,
"step": 443
},
{
"epoch": 1.2869565217391306,
"grad_norm": 0.34461226274334095,
"learning_rate": 6.383371346180725e-05,
"loss": 1.2042,
"step": 444
},
{
"epoch": 1.289855072463768,
"grad_norm": 0.35375827819704075,
"learning_rate": 6.368314950360415e-05,
"loss": 1.2183,
"step": 445
},
{
"epoch": 1.2927536231884058,
"grad_norm": 0.3494607679069863,
"learning_rate": 6.353245130489012e-05,
"loss": 1.2267,
"step": 446
},
{
"epoch": 1.2956521739130435,
"grad_norm": 0.3376350549359254,
"learning_rate": 6.338162034411158e-05,
"loss": 1.2514,
"step": 447
},
{
"epoch": 1.298550724637681,
"grad_norm": 0.3514507439505588,
"learning_rate": 6.323065810101741e-05,
"loss": 1.2055,
"step": 448
},
{
"epoch": 1.3014492753623188,
"grad_norm": 0.374192088646086,
"learning_rate": 6.307956605664447e-05,
"loss": 1.2149,
"step": 449
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.36836907141990205,
"learning_rate": 6.292834569330301e-05,
"loss": 1.332,
"step": 450
},
{
"epoch": 1.3072463768115943,
"grad_norm": 0.35436366268435593,
"learning_rate": 6.277699849456224e-05,
"loss": 1.2918,
"step": 451
},
{
"epoch": 1.310144927536232,
"grad_norm": 0.3535565794861321,
"learning_rate": 6.262552594523565e-05,
"loss": 1.2382,
"step": 452
},
{
"epoch": 1.3130434782608695,
"grad_norm": 0.3923107343675531,
"learning_rate": 6.247392953136655e-05,
"loss": 1.2614,
"step": 453
},
{
"epoch": 1.3159420289855073,
"grad_norm": 0.3566047611610826,
"learning_rate": 6.23222107402134e-05,
"loss": 1.2574,
"step": 454
},
{
"epoch": 1.318840579710145,
"grad_norm": 0.3444110335156092,
"learning_rate": 6.217037106023527e-05,
"loss": 1.2158,
"step": 455
},
{
"epoch": 1.3217391304347825,
"grad_norm": 0.34800059904629854,
"learning_rate": 6.201841198107724e-05,
"loss": 1.2691,
"step": 456
},
{
"epoch": 1.3246376811594203,
"grad_norm": 0.3704659760771806,
"learning_rate": 6.186633499355576e-05,
"loss": 1.1669,
"step": 457
},
{
"epoch": 1.327536231884058,
"grad_norm": 0.35589030087499396,
"learning_rate": 6.171414158964402e-05,
"loss": 1.2421,
"step": 458
},
{
"epoch": 1.3304347826086955,
"grad_norm": 0.41000043026343475,
"learning_rate": 6.156183326245738e-05,
"loss": 1.1528,
"step": 459
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.3545298846533197,
"learning_rate": 6.140941150623865e-05,
"loss": 1.3154,
"step": 460
},
{
"epoch": 1.336231884057971,
"grad_norm": 0.3632756192190139,
"learning_rate": 6.12568778163434e-05,
"loss": 1.2769,
"step": 461
},
{
"epoch": 1.3391304347826087,
"grad_norm": 0.3766419178772542,
"learning_rate": 6.110423368922544e-05,
"loss": 1.215,
"step": 462
},
{
"epoch": 1.3420289855072465,
"grad_norm": 0.35769930623122026,
"learning_rate": 6.095148062242196e-05,
"loss": 1.2226,
"step": 463
},
{
"epoch": 1.344927536231884,
"grad_norm": 0.3652620834683046,
"learning_rate": 6.079862011453893e-05,
"loss": 1.2217,
"step": 464
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.37380916243000584,
"learning_rate": 6.064565366523641e-05,
"loss": 1.2051,
"step": 465
},
{
"epoch": 1.3507246376811595,
"grad_norm": 0.38594446149133127,
"learning_rate": 6.0492582775213825e-05,
"loss": 1.2652,
"step": 466
},
{
"epoch": 1.353623188405797,
"grad_norm": 0.3461990145984557,
"learning_rate": 6.0339408946195185e-05,
"loss": 1.2554,
"step": 467
},
{
"epoch": 1.3565217391304347,
"grad_norm": 0.3748678338524721,
"learning_rate": 6.0186133680914445e-05,
"loss": 1.191,
"step": 468
},
{
"epoch": 1.3594202898550725,
"grad_norm": 0.37370664196717224,
"learning_rate": 6.003275848310067e-05,
"loss": 1.2706,
"step": 469
},
{
"epoch": 1.3623188405797102,
"grad_norm": 0.36194306306178214,
"learning_rate": 5.9879284857463356e-05,
"loss": 1.2187,
"step": 470
},
{
"epoch": 1.365217391304348,
"grad_norm": 0.36087008057820225,
"learning_rate": 5.972571430967764e-05,
"loss": 1.2456,
"step": 471
},
{
"epoch": 1.3681159420289855,
"grad_norm": 0.36273835372082425,
"learning_rate": 5.9572048346369515e-05,
"loss": 1.2277,
"step": 472
},
{
"epoch": 1.3710144927536232,
"grad_norm": 0.37085205673967797,
"learning_rate": 5.941828847510108e-05,
"loss": 1.2768,
"step": 473
},
{
"epoch": 1.373913043478261,
"grad_norm": 0.3755185129215953,
"learning_rate": 5.9264436204355724e-05,
"loss": 1.2031,
"step": 474
},
{
"epoch": 1.3768115942028984,
"grad_norm": 0.37382431917426745,
"learning_rate": 5.911049304352332e-05,
"loss": 1.2843,
"step": 475
},
{
"epoch": 1.3797101449275362,
"grad_norm": 0.37855680727333874,
"learning_rate": 5.895646050288543e-05,
"loss": 1.2912,
"step": 476
},
{
"epoch": 1.382608695652174,
"grad_norm": 0.3654439184708917,
"learning_rate": 5.8802340093600495e-05,
"loss": 1.2292,
"step": 477
},
{
"epoch": 1.3855072463768117,
"grad_norm": 0.3846140132825601,
"learning_rate": 5.8648133327689036e-05,
"loss": 1.2675,
"step": 478
},
{
"epoch": 1.3884057971014494,
"grad_norm": 0.3766180728314526,
"learning_rate": 5.849384171801876e-05,
"loss": 1.205,
"step": 479
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.35496774282385274,
"learning_rate": 5.8339466778289745e-05,
"loss": 1.2035,
"step": 480
},
{
"epoch": 1.3942028985507247,
"grad_norm": 0.35882380091220856,
"learning_rate": 5.818501002301959e-05,
"loss": 1.2047,
"step": 481
},
{
"epoch": 1.3971014492753624,
"grad_norm": 0.36361359874976407,
"learning_rate": 5.803047296752856e-05,
"loss": 1.2068,
"step": 482
},
{
"epoch": 1.4,
"grad_norm": 0.35304052394158203,
"learning_rate": 5.7875857127924704e-05,
"loss": 1.2039,
"step": 483
},
{
"epoch": 1.4028985507246376,
"grad_norm": 0.3767536613499123,
"learning_rate": 5.772116402108903e-05,
"loss": 1.1734,
"step": 484
},
{
"epoch": 1.4057971014492754,
"grad_norm": 0.3673108485371312,
"learning_rate": 5.756639516466056e-05,
"loss": 1.2631,
"step": 485
},
{
"epoch": 1.4086956521739131,
"grad_norm": 0.37033398981771753,
"learning_rate": 5.741155207702146e-05,
"loss": 1.2284,
"step": 486
},
{
"epoch": 1.4115942028985506,
"grad_norm": 0.3803519741849858,
"learning_rate": 5.7256636277282193e-05,
"loss": 1.2512,
"step": 487
},
{
"epoch": 1.4144927536231884,
"grad_norm": 0.3822460303571093,
"learning_rate": 5.7101649285266524e-05,
"loss": 1.2285,
"step": 488
},
{
"epoch": 1.4173913043478261,
"grad_norm": 0.366694568605544,
"learning_rate": 5.694659262149666e-05,
"loss": 1.2652,
"step": 489
},
{
"epoch": 1.4202898550724639,
"grad_norm": 0.3599613129529298,
"learning_rate": 5.679146780717841e-05,
"loss": 1.199,
"step": 490
},
{
"epoch": 1.4231884057971014,
"grad_norm": 0.36225487078774454,
"learning_rate": 5.6636276364186105e-05,
"loss": 1.1848,
"step": 491
},
{
"epoch": 1.4260869565217391,
"grad_norm": 0.3599718189253672,
"learning_rate": 5.648101981504775e-05,
"loss": 1.2082,
"step": 492
},
{
"epoch": 1.4289855072463769,
"grad_norm": 0.37863788166143847,
"learning_rate": 5.6325699682930145e-05,
"loss": 1.2391,
"step": 493
},
{
"epoch": 1.4318840579710144,
"grad_norm": 0.3803432660363016,
"learning_rate": 5.617031749162381e-05,
"loss": 1.161,
"step": 494
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.35786784027090707,
"learning_rate": 5.6014874765528124e-05,
"loss": 1.2861,
"step": 495
},
{
"epoch": 1.4376811594202898,
"grad_norm": 0.3642405560037894,
"learning_rate": 5.58593730296364e-05,
"loss": 1.2349,
"step": 496
},
{
"epoch": 1.4405797101449276,
"grad_norm": 0.369598439136747,
"learning_rate": 5.57038138095208e-05,
"loss": 1.285,
"step": 497
},
{
"epoch": 1.4434782608695653,
"grad_norm": 0.3555670502464068,
"learning_rate": 5.5548198631317494e-05,
"loss": 1.2145,
"step": 498
},
{
"epoch": 1.4463768115942028,
"grad_norm": 0.376327361594081,
"learning_rate": 5.539252902171164e-05,
"loss": 1.2245,
"step": 499
},
{
"epoch": 1.4492753623188406,
"grad_norm": 0.37654715270476347,
"learning_rate": 5.523680650792237e-05,
"loss": 1.2419,
"step": 500
},
{
"epoch": 1.4521739130434783,
"grad_norm": 0.5779377636764227,
"learning_rate": 5.508103261768783e-05,
"loss": 1.239,
"step": 501
},
{
"epoch": 1.4550724637681158,
"grad_norm": 0.37430911277789075,
"learning_rate": 5.492520887925028e-05,
"loss": 1.2577,
"step": 502
},
{
"epoch": 1.4579710144927536,
"grad_norm": 0.36147621449440515,
"learning_rate": 5.4769336821340936e-05,
"loss": 1.2851,
"step": 503
},
{
"epoch": 1.4608695652173913,
"grad_norm": 0.3731800543772072,
"learning_rate": 5.4613417973165106e-05,
"loss": 1.1851,
"step": 504
},
{
"epoch": 1.463768115942029,
"grad_norm": 0.38025435659821,
"learning_rate": 5.445745386438713e-05,
"loss": 1.2853,
"step": 505
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.3806710140744915,
"learning_rate": 5.430144602511539e-05,
"loss": 1.2698,
"step": 506
},
{
"epoch": 1.4695652173913043,
"grad_norm": 0.40891604532181375,
"learning_rate": 5.4145395985887246e-05,
"loss": 1.2388,
"step": 507
},
{
"epoch": 1.472463768115942,
"grad_norm": 0.3545961610157745,
"learning_rate": 5.3989305277654156e-05,
"loss": 1.19,
"step": 508
},
{
"epoch": 1.4753623188405798,
"grad_norm": 0.3648442660384036,
"learning_rate": 5.383317543176649e-05,
"loss": 1.203,
"step": 509
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.3850663135269365,
"learning_rate": 5.367700797995863e-05,
"loss": 1.2297,
"step": 510
},
{
"epoch": 1.481159420289855,
"grad_norm": 0.35394244670279573,
"learning_rate": 5.352080445433385e-05,
"loss": 1.2044,
"step": 511
},
{
"epoch": 1.4840579710144928,
"grad_norm": 0.3866450435083724,
"learning_rate": 5.336456638734938e-05,
"loss": 1.2203,
"step": 512
},
{
"epoch": 1.4869565217391305,
"grad_norm": 0.3800225621052723,
"learning_rate": 5.320829531180128e-05,
"loss": 1.2147,
"step": 513
},
{
"epoch": 1.4898550724637682,
"grad_norm": 0.37391354192034965,
"learning_rate": 5.30519927608095e-05,
"loss": 1.2173,
"step": 514
},
{
"epoch": 1.4927536231884058,
"grad_norm": 0.3908730346775049,
"learning_rate": 5.2895660267802714e-05,
"loss": 1.179,
"step": 515
},
{
"epoch": 1.4956521739130435,
"grad_norm": 0.3797397244263353,
"learning_rate": 5.27392993665034e-05,
"loss": 1.2397,
"step": 516
},
{
"epoch": 1.4985507246376812,
"grad_norm": 0.3698351874885442,
"learning_rate": 5.258291159091273e-05,
"loss": 1.292,
"step": 517
},
{
"epoch": 1.5014492753623188,
"grad_norm": 0.3680512756549276,
"learning_rate": 5.242649847529551e-05,
"loss": 1.1788,
"step": 518
},
{
"epoch": 1.5043478260869565,
"grad_norm": 0.3603216123639398,
"learning_rate": 5.227006155416517e-05,
"loss": 1.1539,
"step": 519
},
{
"epoch": 1.5072463768115942,
"grad_norm": 0.3830020055397342,
"learning_rate": 5.2113602362268674e-05,
"loss": 1.1658,
"step": 520
},
{
"epoch": 1.5101449275362318,
"grad_norm": 0.37049306835431794,
"learning_rate": 5.1957122434571485e-05,
"loss": 1.2754,
"step": 521
},
{
"epoch": 1.5130434782608697,
"grad_norm": 0.36878581085745593,
"learning_rate": 5.180062330624248e-05,
"loss": 1.26,
"step": 522
},
{
"epoch": 1.5159420289855072,
"grad_norm": 0.3932729911977662,
"learning_rate": 5.164410651263895e-05,
"loss": 1.2411,
"step": 523
},
{
"epoch": 1.518840579710145,
"grad_norm": 0.37380205081558054,
"learning_rate": 5.1487573589291424e-05,
"loss": 1.2778,
"step": 524
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.39041353684960733,
"learning_rate": 5.133102607188874e-05,
"loss": 1.1484,
"step": 525
},
{
"epoch": 1.5246376811594202,
"grad_norm": 0.37594098481535654,
"learning_rate": 5.117446549626289e-05,
"loss": 1.2161,
"step": 526
},
{
"epoch": 1.527536231884058,
"grad_norm": 0.38365451143587687,
"learning_rate": 5.101789339837396e-05,
"loss": 1.2256,
"step": 527
},
{
"epoch": 1.5304347826086957,
"grad_norm": 0.3855037750389005,
"learning_rate": 5.086131131429509e-05,
"loss": 1.2209,
"step": 528
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.3890790766439738,
"learning_rate": 5.07047207801974e-05,
"loss": 1.2338,
"step": 529
},
{
"epoch": 1.5362318840579712,
"grad_norm": 0.3700881037410359,
"learning_rate": 5.0548123332334896e-05,
"loss": 1.2475,
"step": 530
},
{
"epoch": 1.5391304347826087,
"grad_norm": 0.3743561390377829,
"learning_rate": 5.0391520507029424e-05,
"loss": 1.2239,
"step": 531
},
{
"epoch": 1.5420289855072464,
"grad_norm": 0.37802774104497083,
"learning_rate": 5.023491384065555e-05,
"loss": 1.2324,
"step": 532
},
{
"epoch": 1.5449275362318842,
"grad_norm": 0.36820878715854055,
"learning_rate": 5.0078304869625595e-05,
"loss": 1.2404,
"step": 533
},
{
"epoch": 1.5478260869565217,
"grad_norm": 0.3632460544127689,
"learning_rate": 4.992169513037441e-05,
"loss": 1.177,
"step": 534
},
{
"epoch": 1.5507246376811594,
"grad_norm": 0.3683252664871912,
"learning_rate": 4.9765086159344445e-05,
"loss": 1.182,
"step": 535
},
{
"epoch": 1.5536231884057972,
"grad_norm": 0.3831233196950789,
"learning_rate": 4.9608479492970594e-05,
"loss": 1.1991,
"step": 536
},
{
"epoch": 1.5565217391304347,
"grad_norm": 0.37245646640167623,
"learning_rate": 4.9451876667665116e-05,
"loss": 1.2376,
"step": 537
},
{
"epoch": 1.5594202898550724,
"grad_norm": 0.36522555829264214,
"learning_rate": 4.929527921980261e-05,
"loss": 1.2871,
"step": 538
},
{
"epoch": 1.5623188405797102,
"grad_norm": 0.35901097232709117,
"learning_rate": 4.9138688685704916e-05,
"loss": 1.2094,
"step": 539
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.3520423753812632,
"learning_rate": 4.898210660162605e-05,
"loss": 1.2363,
"step": 540
},
{
"epoch": 1.5681159420289856,
"grad_norm": 0.40852366010005403,
"learning_rate": 4.882553450373712e-05,
"loss": 1.2352,
"step": 541
},
{
"epoch": 1.5710144927536231,
"grad_norm": 0.3651205273751799,
"learning_rate": 4.866897392811126e-05,
"loss": 1.222,
"step": 542
},
{
"epoch": 1.5739130434782609,
"grad_norm": 0.3699594416077427,
"learning_rate": 4.851242641070859e-05,
"loss": 1.2149,
"step": 543
},
{
"epoch": 1.5768115942028986,
"grad_norm": 0.38193530242722756,
"learning_rate": 4.8355893487361084e-05,
"loss": 1.2766,
"step": 544
},
{
"epoch": 1.5797101449275361,
"grad_norm": 0.38568456101700965,
"learning_rate": 4.8199376693757544e-05,
"loss": 1.2844,
"step": 545
},
{
"epoch": 1.5826086956521739,
"grad_norm": 0.36059528632874444,
"learning_rate": 4.804287756542852e-05,
"loss": 1.2726,
"step": 546
},
{
"epoch": 1.5855072463768116,
"grad_norm": 0.36513879678761724,
"learning_rate": 4.788639763773133e-05,
"loss": 1.1763,
"step": 547
},
{
"epoch": 1.5884057971014491,
"grad_norm": 0.387466168821441,
"learning_rate": 4.772993844583483e-05,
"loss": 1.2544,
"step": 548
},
{
"epoch": 1.591304347826087,
"grad_norm": 0.5520887828224808,
"learning_rate": 4.75735015247045e-05,
"loss": 1.2285,
"step": 549
},
{
"epoch": 1.5942028985507246,
"grad_norm": 0.389584382030089,
"learning_rate": 4.7417088409087285e-05,
"loss": 1.2463,
"step": 550
},
{
"epoch": 1.5971014492753624,
"grad_norm": 0.3963144528047638,
"learning_rate": 4.7260700633496605e-05,
"loss": 1.1914,
"step": 551
},
{
"epoch": 1.6,
"grad_norm": 0.36855199490556523,
"learning_rate": 4.71043397321973e-05,
"loss": 1.2395,
"step": 552
},
{
"epoch": 1.6028985507246376,
"grad_norm": 0.3887397654253079,
"learning_rate": 4.6948007239190514e-05,
"loss": 1.2639,
"step": 553
},
{
"epoch": 1.6057971014492753,
"grad_norm": 0.3697755928376452,
"learning_rate": 4.6791704688198724e-05,
"loss": 1.1648,
"step": 554
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.38405410279449403,
"learning_rate": 4.663543361265064e-05,
"loss": 1.2424,
"step": 555
},
{
"epoch": 1.6115942028985506,
"grad_norm": 0.36889274593199667,
"learning_rate": 4.647919554566616e-05,
"loss": 1.2037,
"step": 556
},
{
"epoch": 1.6144927536231886,
"grad_norm": 0.38742028194651634,
"learning_rate": 4.63229920200414e-05,
"loss": 1.144,
"step": 557
},
{
"epoch": 1.617391304347826,
"grad_norm": 0.3771419221596441,
"learning_rate": 4.61668245682335e-05,
"loss": 1.2386,
"step": 558
},
{
"epoch": 1.6202898550724638,
"grad_norm": 0.36745992758167406,
"learning_rate": 4.601069472234584e-05,
"loss": 1.2439,
"step": 559
},
{
"epoch": 1.6231884057971016,
"grad_norm": 0.37299246443958567,
"learning_rate": 4.585460401411275e-05,
"loss": 1.1891,
"step": 560
},
{
"epoch": 1.626086956521739,
"grad_norm": 0.39436742226379295,
"learning_rate": 4.569855397488462e-05,
"loss": 1.2345,
"step": 561
},
{
"epoch": 1.6289855072463768,
"grad_norm": 0.38332200212622664,
"learning_rate": 4.554254613561289e-05,
"loss": 1.221,
"step": 562
},
{
"epoch": 1.6318840579710145,
"grad_norm": 0.3668234731737798,
"learning_rate": 4.5386582026834906e-05,
"loss": 1.1407,
"step": 563
},
{
"epoch": 1.634782608695652,
"grad_norm": 0.3886901538482464,
"learning_rate": 4.5230663178659075e-05,
"loss": 1.2372,
"step": 564
},
{
"epoch": 1.6376811594202898,
"grad_norm": 0.3690709201915018,
"learning_rate": 4.507479112074974e-05,
"loss": 1.2135,
"step": 565
},
{
"epoch": 1.6405797101449275,
"grad_norm": 0.36879231080045594,
"learning_rate": 4.491896738231218e-05,
"loss": 1.1641,
"step": 566
},
{
"epoch": 1.643478260869565,
"grad_norm": 0.36645636944065885,
"learning_rate": 4.476319349207766e-05,
"loss": 1.1852,
"step": 567
},
{
"epoch": 1.646376811594203,
"grad_norm": 0.3431665404786532,
"learning_rate": 4.460747097828838e-05,
"loss": 1.1573,
"step": 568
},
{
"epoch": 1.6492753623188405,
"grad_norm": 0.3758095567042996,
"learning_rate": 4.445180136868252e-05,
"loss": 1.2862,
"step": 569
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.3747562731763405,
"learning_rate": 4.4296186190479203e-05,
"loss": 1.2232,
"step": 570
},
{
"epoch": 1.655072463768116,
"grad_norm": 0.3680948045233427,
"learning_rate": 4.414062697036361e-05,
"loss": 1.2261,
"step": 571
},
{
"epoch": 1.6579710144927535,
"grad_norm": 0.3951307328237191,
"learning_rate": 4.3985125234471874e-05,
"loss": 1.2456,
"step": 572
},
{
"epoch": 1.6608695652173913,
"grad_norm": 0.39734232299660693,
"learning_rate": 4.3829682508376194e-05,
"loss": 1.1953,
"step": 573
},
{
"epoch": 1.663768115942029,
"grad_norm": 0.3784998636514162,
"learning_rate": 4.367430031706987e-05,
"loss": 1.2367,
"step": 574
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.39715845084791845,
"learning_rate": 4.351898018495225e-05,
"loss": 1.2279,
"step": 575
},
{
"epoch": 1.6695652173913045,
"grad_norm": 0.378181731966129,
"learning_rate": 4.336372363581391e-05,
"loss": 1.2075,
"step": 576
},
{
"epoch": 1.672463768115942,
"grad_norm": 0.3690996052960561,
"learning_rate": 4.32085321928216e-05,
"loss": 1.0945,
"step": 577
},
{
"epoch": 1.6753623188405797,
"grad_norm": 0.3661279761386217,
"learning_rate": 4.305340737850334e-05,
"loss": 1.2039,
"step": 578
},
{
"epoch": 1.6782608695652175,
"grad_norm": 0.3703501070974622,
"learning_rate": 4.28983507147335e-05,
"loss": 1.1634,
"step": 579
},
{
"epoch": 1.681159420289855,
"grad_norm": 0.37705477138544613,
"learning_rate": 4.2743363722717825e-05,
"loss": 1.233,
"step": 580
},
{
"epoch": 1.6840579710144927,
"grad_norm": 0.37944231677619733,
"learning_rate": 4.258844792297855e-05,
"loss": 1.2484,
"step": 581
},
{
"epoch": 1.6869565217391305,
"grad_norm": 0.36121328853497303,
"learning_rate": 4.2433604835339445e-05,
"loss": 1.2517,
"step": 582
},
{
"epoch": 1.689855072463768,
"grad_norm": 0.3658490072297351,
"learning_rate": 4.227883597891098e-05,
"loss": 1.2833,
"step": 583
},
{
"epoch": 1.692753623188406,
"grad_norm": 0.3742426427268219,
"learning_rate": 4.21241428720753e-05,
"loss": 1.2188,
"step": 584
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.3833395112583662,
"learning_rate": 4.196952703247145e-05,
"loss": 1.265,
"step": 585
},
{
"epoch": 1.6985507246376812,
"grad_norm": 0.36472794357808286,
"learning_rate": 4.181498997698042e-05,
"loss": 1.1679,
"step": 586
},
{
"epoch": 1.701449275362319,
"grad_norm": 0.36498141790011873,
"learning_rate": 4.1660533221710266e-05,
"loss": 1.2138,
"step": 587
},
{
"epoch": 1.7043478260869565,
"grad_norm": 0.37102421652558093,
"learning_rate": 4.150615828198125e-05,
"loss": 1.2176,
"step": 588
},
{
"epoch": 1.7072463768115942,
"grad_norm": 0.36544210520658216,
"learning_rate": 4.135186667231097e-05,
"loss": 1.2098,
"step": 589
},
{
"epoch": 1.710144927536232,
"grad_norm": 0.3612434641690313,
"learning_rate": 4.119765990639952e-05,
"loss": 1.1763,
"step": 590
},
{
"epoch": 1.7130434782608694,
"grad_norm": 0.3620969506592556,
"learning_rate": 4.1043539497114605e-05,
"loss": 1.1872,
"step": 591
},
{
"epoch": 1.7159420289855074,
"grad_norm": 0.39393702299078354,
"learning_rate": 4.088950695647671e-05,
"loss": 1.2687,
"step": 592
},
{
"epoch": 1.718840579710145,
"grad_norm": 0.3817467440217286,
"learning_rate": 4.0735563795644294e-05,
"loss": 1.2771,
"step": 593
},
{
"epoch": 1.7217391304347827,
"grad_norm": 0.3927298023358771,
"learning_rate": 4.058171152489891e-05,
"loss": 1.2733,
"step": 594
},
{
"epoch": 1.7246376811594204,
"grad_norm": 0.3674064366862089,
"learning_rate": 4.042795165363048e-05,
"loss": 1.2438,
"step": 595
},
{
"epoch": 1.727536231884058,
"grad_norm": 0.3719771458126402,
"learning_rate": 4.0274285690322366e-05,
"loss": 1.2539,
"step": 596
},
{
"epoch": 1.7304347826086957,
"grad_norm": 0.37286309136721435,
"learning_rate": 4.012071514253665e-05,
"loss": 1.2219,
"step": 597
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.37200008726902983,
"learning_rate": 3.996724151689934e-05,
"loss": 1.1937,
"step": 598
},
{
"epoch": 1.736231884057971,
"grad_norm": 0.3769662425580422,
"learning_rate": 3.981386631908557e-05,
"loss": 1.1795,
"step": 599
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.38896295738805997,
"learning_rate": 3.966059105380483e-05,
"loss": 1.262,
"step": 600
},
{
"epoch": 1.7420289855072464,
"grad_norm": 0.38088532712001094,
"learning_rate": 3.9507417224786193e-05,
"loss": 1.2626,
"step": 601
},
{
"epoch": 1.744927536231884,
"grad_norm": 0.3906788265447541,
"learning_rate": 3.93543463347636e-05,
"loss": 1.1918,
"step": 602
},
{
"epoch": 1.7478260869565219,
"grad_norm": 0.3691860050404467,
"learning_rate": 3.920137988546109e-05,
"loss": 1.1616,
"step": 603
},
{
"epoch": 1.7507246376811594,
"grad_norm": 0.3792592507880301,
"learning_rate": 3.9048519377578064e-05,
"loss": 1.1926,
"step": 604
},
{
"epoch": 1.7536231884057971,
"grad_norm": 0.37902398772592705,
"learning_rate": 3.8895766310774574e-05,
"loss": 1.3234,
"step": 605
},
{
"epoch": 1.7565217391304349,
"grad_norm": 0.3808967277084784,
"learning_rate": 3.87431221836566e-05,
"loss": 1.2678,
"step": 606
},
{
"epoch": 1.7594202898550724,
"grad_norm": 0.3768612203952316,
"learning_rate": 3.859058849376136e-05,
"loss": 1.2442,
"step": 607
},
{
"epoch": 1.76231884057971,
"grad_norm": 0.3661782288025134,
"learning_rate": 3.843816673754262e-05,
"loss": 1.2757,
"step": 608
},
{
"epoch": 1.7652173913043478,
"grad_norm": 0.3746443716611926,
"learning_rate": 3.8285858410355984e-05,
"loss": 1.234,
"step": 609
},
{
"epoch": 1.7681159420289854,
"grad_norm": 0.38619920952815956,
"learning_rate": 3.8133665006444255e-05,
"loss": 1.2229,
"step": 610
},
{
"epoch": 1.7710144927536233,
"grad_norm": 0.37016562757932,
"learning_rate": 3.798158801892277e-05,
"loss": 1.2112,
"step": 611
},
{
"epoch": 1.7739130434782608,
"grad_norm": 0.39144763721074394,
"learning_rate": 3.782962893976475e-05,
"loss": 1.1941,
"step": 612
},
{
"epoch": 1.7768115942028986,
"grad_norm": 0.372157745001237,
"learning_rate": 3.7677789259786615e-05,
"loss": 1.1607,
"step": 613
},
{
"epoch": 1.7797101449275363,
"grad_norm": 0.38017415387323344,
"learning_rate": 3.7526070468633464e-05,
"loss": 1.2251,
"step": 614
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.3764265620005903,
"learning_rate": 3.737447405476436e-05,
"loss": 1.2389,
"step": 615
},
{
"epoch": 1.7855072463768116,
"grad_norm": 0.36301297876352934,
"learning_rate": 3.7223001505437775e-05,
"loss": 1.1647,
"step": 616
},
{
"epoch": 1.7884057971014493,
"grad_norm": 0.3589005180459851,
"learning_rate": 3.7071654306697003e-05,
"loss": 1.2044,
"step": 617
},
{
"epoch": 1.7913043478260868,
"grad_norm": 0.38118628063662097,
"learning_rate": 3.692043394335556e-05,
"loss": 1.2063,
"step": 618
},
{
"epoch": 1.7942028985507248,
"grad_norm": 0.37713318727543105,
"learning_rate": 3.676934189898259e-05,
"loss": 1.3151,
"step": 619
},
{
"epoch": 1.7971014492753623,
"grad_norm": 0.38497109120391243,
"learning_rate": 3.661837965588842e-05,
"loss": 1.1582,
"step": 620
},
{
"epoch": 1.8,
"grad_norm": 0.3958884224922945,
"learning_rate": 3.646754869510988e-05,
"loss": 1.2598,
"step": 621
},
{
"epoch": 1.8028985507246378,
"grad_norm": 0.370532843067504,
"learning_rate": 3.631685049639586e-05,
"loss": 1.2128,
"step": 622
},
{
"epoch": 1.8057971014492753,
"grad_norm": 0.40047093677653156,
"learning_rate": 3.616628653819276e-05,
"loss": 1.2316,
"step": 623
},
{
"epoch": 1.808695652173913,
"grad_norm": 0.37643906872365784,
"learning_rate": 3.6015858297630004e-05,
"loss": 1.2171,
"step": 624
},
{
"epoch": 1.8115942028985508,
"grad_norm": 0.39490427844818465,
"learning_rate": 3.5865567250505536e-05,
"loss": 1.2416,
"step": 625
},
{
"epoch": 1.8144927536231883,
"grad_norm": 0.3631993323865769,
"learning_rate": 3.5715414871271336e-05,
"loss": 1.2147,
"step": 626
},
{
"epoch": 1.8173913043478263,
"grad_norm": 0.35840772617807537,
"learning_rate": 3.556540263301896e-05,
"loss": 1.2015,
"step": 627
},
{
"epoch": 1.8202898550724638,
"grad_norm": 0.3791997912963071,
"learning_rate": 3.541553200746511e-05,
"loss": 1.1583,
"step": 628
},
{
"epoch": 1.8231884057971013,
"grad_norm": 0.37805560040982356,
"learning_rate": 3.526580446493717e-05,
"loss": 1.2238,
"step": 629
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.382383828357578,
"learning_rate": 3.511622147435877e-05,
"loss": 1.2201,
"step": 630
},
{
"epoch": 1.8289855072463768,
"grad_norm": 0.38874429445479597,
"learning_rate": 3.4966784503235394e-05,
"loss": 1.2319,
"step": 631
},
{
"epoch": 1.8318840579710145,
"grad_norm": 0.38625077800174934,
"learning_rate": 3.481749501764002e-05,
"loss": 1.2326,
"step": 632
},
{
"epoch": 1.8347826086956522,
"grad_norm": 0.37805590288266955,
"learning_rate": 3.466835448219867e-05,
"loss": 1.2072,
"step": 633
},
{
"epoch": 1.8376811594202898,
"grad_norm": 0.3876007771372343,
"learning_rate": 3.4519364360076045e-05,
"loss": 1.2188,
"step": 634
},
{
"epoch": 1.8405797101449275,
"grad_norm": 0.36997413690862124,
"learning_rate": 3.437052611296123e-05,
"loss": 1.2974,
"step": 635
},
{
"epoch": 1.8434782608695652,
"grad_norm": 0.38893326272743267,
"learning_rate": 3.422184120105331e-05,
"loss": 1.2325,
"step": 636
},
{
"epoch": 1.8463768115942027,
"grad_norm": 0.38534863103441785,
"learning_rate": 3.407331108304704e-05,
"loss": 1.2881,
"step": 637
},
{
"epoch": 1.8492753623188407,
"grad_norm": 0.35237887662066153,
"learning_rate": 3.392493721611857e-05,
"loss": 1.1636,
"step": 638
},
{
"epoch": 1.8521739130434782,
"grad_norm": 0.3522129349688945,
"learning_rate": 3.37767210559111e-05,
"loss": 1.2069,
"step": 639
},
{
"epoch": 1.855072463768116,
"grad_norm": 0.3828825108660318,
"learning_rate": 3.3628664056520645e-05,
"loss": 1.1511,
"step": 640
},
{
"epoch": 1.8579710144927537,
"grad_norm": 0.38984016931652277,
"learning_rate": 3.348076767048174e-05,
"loss": 1.2204,
"step": 641
},
{
"epoch": 1.8608695652173912,
"grad_norm": 0.36523507158461577,
"learning_rate": 3.3333033348753196e-05,
"loss": 1.262,
"step": 642
},
{
"epoch": 1.863768115942029,
"grad_norm": 0.37220367890890976,
"learning_rate": 3.3185462540703874e-05,
"loss": 1.2262,
"step": 643
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.3694812470086758,
"learning_rate": 3.303805669409848e-05,
"loss": 1.2474,
"step": 644
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.36698538082460586,
"learning_rate": 3.289081725508333e-05,
"loss": 1.2088,
"step": 645
},
{
"epoch": 1.8724637681159422,
"grad_norm": 0.3778477738916828,
"learning_rate": 3.2743745668172135e-05,
"loss": 1.1314,
"step": 646
},
{
"epoch": 1.8753623188405797,
"grad_norm": 0.35885473738105417,
"learning_rate": 3.259684337623192e-05,
"loss": 1.1323,
"step": 647
},
{
"epoch": 1.8782608695652174,
"grad_norm": 0.3865523562816111,
"learning_rate": 3.245011182046881e-05,
"loss": 1.2147,
"step": 648
},
{
"epoch": 1.8811594202898552,
"grad_norm": 0.530703476143991,
"learning_rate": 3.230355244041387e-05,
"loss": 1.294,
"step": 649
},
{
"epoch": 1.8840579710144927,
"grad_norm": 0.37902082343553395,
"learning_rate": 3.215716667390905e-05,
"loss": 1.2446,
"step": 650
},
{
"epoch": 1.8869565217391304,
"grad_norm": 0.3635449013765209,
"learning_rate": 3.201095595709298e-05,
"loss": 1.1876,
"step": 651
},
{
"epoch": 1.8898550724637682,
"grad_norm": 0.38375684981250285,
"learning_rate": 3.1864921724387e-05,
"loss": 1.2511,
"step": 652
},
{
"epoch": 1.8927536231884057,
"grad_norm": 0.374887470810997,
"learning_rate": 3.1719065408481005e-05,
"loss": 1.2076,
"step": 653
},
{
"epoch": 1.8956521739130436,
"grad_norm": 0.3788733526902221,
"learning_rate": 3.1573388440319404e-05,
"loss": 1.1485,
"step": 654
},
{
"epoch": 1.8985507246376812,
"grad_norm": 0.37343821294935253,
"learning_rate": 3.142789224908709e-05,
"loss": 1.2417,
"step": 655
},
{
"epoch": 1.901449275362319,
"grad_norm": 0.36972719766904644,
"learning_rate": 3.128257826219544e-05,
"loss": 1.1924,
"step": 656
},
{
"epoch": 1.9043478260869566,
"grad_norm": 0.39152027197251665,
"learning_rate": 3.1137447905268264e-05,
"loss": 1.2334,
"step": 657
},
{
"epoch": 1.9072463768115941,
"grad_norm": 0.3793593937622258,
"learning_rate": 3.099250260212785e-05,
"loss": 1.2044,
"step": 658
},
{
"epoch": 1.9101449275362319,
"grad_norm": 0.37274932277970574,
"learning_rate": 3.0847743774781e-05,
"loss": 1.2396,
"step": 659
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.3917130499161079,
"learning_rate": 3.070317284340505e-05,
"loss": 1.2224,
"step": 660
},
{
"epoch": 1.9159420289855071,
"grad_norm": 0.3730432872342999,
"learning_rate": 3.055879122633397e-05,
"loss": 1.1523,
"step": 661
},
{
"epoch": 1.9188405797101449,
"grad_norm": 0.38603243505310325,
"learning_rate": 3.041460034004443e-05,
"loss": 1.2139,
"step": 662
},
{
"epoch": 1.9217391304347826,
"grad_norm": 0.3705238103870671,
"learning_rate": 3.0270601599141912e-05,
"loss": 1.2359,
"step": 663
},
{
"epoch": 1.9246376811594201,
"grad_norm": 0.37597496158367705,
"learning_rate": 3.0126796416346814e-05,
"loss": 1.2185,
"step": 664
},
{
"epoch": 1.927536231884058,
"grad_norm": 0.3685212983823541,
"learning_rate": 2.9983186202480623e-05,
"loss": 1.1696,
"step": 665
},
{
"epoch": 1.9304347826086956,
"grad_norm": 0.369031802362704,
"learning_rate": 2.9839772366452035e-05,
"loss": 1.1996,
"step": 666
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.37822154642489714,
"learning_rate": 2.969655631524316e-05,
"loss": 1.2732,
"step": 667
},
{
"epoch": 1.936231884057971,
"grad_norm": 0.37245983427478613,
"learning_rate": 2.9553539453895755e-05,
"loss": 1.2615,
"step": 668
},
{
"epoch": 1.9391304347826086,
"grad_norm": 0.3778250952875639,
"learning_rate": 2.9410723185497324e-05,
"loss": 1.2146,
"step": 669
},
{
"epoch": 1.9420289855072463,
"grad_norm": 0.3745452473168881,
"learning_rate": 2.9268108911167457e-05,
"loss": 1.2042,
"step": 670
},
{
"epoch": 1.944927536231884,
"grad_norm": 0.37312413882240314,
"learning_rate": 2.9125698030044068e-05,
"loss": 1.1911,
"step": 671
},
{
"epoch": 1.9478260869565216,
"grad_norm": 0.4061345062579341,
"learning_rate": 2.8983491939269634e-05,
"loss": 1.2611,
"step": 672
},
{
"epoch": 1.9507246376811596,
"grad_norm": 0.3849328956575118,
"learning_rate": 2.8841492033977503e-05,
"loss": 1.2108,
"step": 673
},
{
"epoch": 1.953623188405797,
"grad_norm": 0.38053458611756497,
"learning_rate": 2.8699699707278223e-05,
"loss": 1.2144,
"step": 674
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.39621473951535024,
"learning_rate": 2.8558116350245854e-05,
"loss": 1.2493,
"step": 675
},
{
"epoch": 1.9594202898550726,
"grad_norm": 0.3695671513205437,
"learning_rate": 2.841674335190434e-05,
"loss": 1.2519,
"step": 676
},
{
"epoch": 1.96231884057971,
"grad_norm": 0.3830315846006876,
"learning_rate": 2.827558209921386e-05,
"loss": 1.2074,
"step": 677
},
{
"epoch": 1.9652173913043478,
"grad_norm": 0.3877343629077828,
"learning_rate": 2.8134633977057235e-05,
"loss": 1.2333,
"step": 678
},
{
"epoch": 1.9681159420289855,
"grad_norm": 0.39689935141233373,
"learning_rate": 2.7993900368226333e-05,
"loss": 1.2128,
"step": 679
},
{
"epoch": 1.971014492753623,
"grad_norm": 0.37755832002907747,
"learning_rate": 2.785338265340852e-05,
"loss": 1.1728,
"step": 680
},
{
"epoch": 1.973913043478261,
"grad_norm": 0.38446867990310063,
"learning_rate": 2.771308221117309e-05,
"loss": 1.1602,
"step": 681
},
{
"epoch": 1.9768115942028985,
"grad_norm": 0.3785335064750929,
"learning_rate": 2.757300041795776e-05,
"loss": 1.2085,
"step": 682
},
{
"epoch": 1.9797101449275363,
"grad_norm": 0.3879694395220702,
"learning_rate": 2.7433138648055168e-05,
"loss": 1.2096,
"step": 683
},
{
"epoch": 1.982608695652174,
"grad_norm": 0.38604305997893856,
"learning_rate": 2.729349827359936e-05,
"loss": 1.2739,
"step": 684
},
{
"epoch": 1.9855072463768115,
"grad_norm": 0.3795112440774168,
"learning_rate": 2.715408066455236e-05,
"loss": 1.2666,
"step": 685
},
{
"epoch": 1.9884057971014493,
"grad_norm": 0.3625119163490855,
"learning_rate": 2.701488718869073e-05,
"loss": 1.2317,
"step": 686
},
{
"epoch": 1.991304347826087,
"grad_norm": 0.3680979908316257,
"learning_rate": 2.6875919211592137e-05,
"loss": 1.2673,
"step": 687
},
{
"epoch": 1.9942028985507245,
"grad_norm": 0.39366314079628106,
"learning_rate": 2.673717809662194e-05,
"loss": 1.215,
"step": 688
},
{
"epoch": 1.9971014492753625,
"grad_norm": 0.3711217421698582,
"learning_rate": 2.659866520491986e-05,
"loss": 1.2061,
"step": 689
},
{
"epoch": 2.0,
"grad_norm": 0.3619509926469052,
"learning_rate": 2.646038189538659e-05,
"loss": 1.0882,
"step": 690
},
{
"epoch": 2.0028985507246375,
"grad_norm": 0.36298590926269914,
"learning_rate": 2.632232952467047e-05,
"loss": 1.0538,
"step": 691
},
{
"epoch": 2.0057971014492755,
"grad_norm": 0.36532280808197115,
"learning_rate": 2.6184509447154193e-05,
"loss": 1.1357,
"step": 692
},
{
"epoch": 2.008695652173913,
"grad_norm": 0.39561521212011347,
"learning_rate": 2.6046923014941494e-05,
"loss": 0.9882,
"step": 693
},
{
"epoch": 2.0115942028985505,
"grad_norm": 0.3663184321766037,
"learning_rate": 2.5909571577843905e-05,
"loss": 1.0739,
"step": 694
},
{
"epoch": 2.0144927536231885,
"grad_norm": 0.3719396287060232,
"learning_rate": 2.5772456483367497e-05,
"loss": 1.0861,
"step": 695
},
{
"epoch": 2.017391304347826,
"grad_norm": 0.39175032329764664,
"learning_rate": 2.563557907669968e-05,
"loss": 1.0997,
"step": 696
},
{
"epoch": 2.020289855072464,
"grad_norm": 0.3842127505386081,
"learning_rate": 2.5498940700695978e-05,
"loss": 1.0833,
"step": 697
},
{
"epoch": 2.0231884057971015,
"grad_norm": 0.41296235407870646,
"learning_rate": 2.5362542695866885e-05,
"loss": 1.0784,
"step": 698
},
{
"epoch": 2.026086956521739,
"grad_norm": 0.40929280219103825,
"learning_rate": 2.5226386400364686e-05,
"loss": 1.0951,
"step": 699
},
{
"epoch": 2.028985507246377,
"grad_norm": 0.39727740475543244,
"learning_rate": 2.5090473149970357e-05,
"loss": 0.9986,
"step": 700
},
{
"epoch": 2.0318840579710145,
"grad_norm": 0.39777015075034217,
"learning_rate": 2.4954804278080423e-05,
"loss": 1.0739,
"step": 701
},
{
"epoch": 2.034782608695652,
"grad_norm": 0.40515813767942754,
"learning_rate": 2.4819381115693923e-05,
"loss": 1.1273,
"step": 702
},
{
"epoch": 2.03768115942029,
"grad_norm": 0.3928754252415712,
"learning_rate": 2.4684204991399312e-05,
"loss": 1.0047,
"step": 703
},
{
"epoch": 2.0405797101449274,
"grad_norm": 0.39235743857450184,
"learning_rate": 2.4549277231361438e-05,
"loss": 1.0452,
"step": 704
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.41751282512992466,
"learning_rate": 2.4414599159308553e-05,
"loss": 1.0451,
"step": 705
},
{
"epoch": 2.046376811594203,
"grad_norm": 0.40629312672049445,
"learning_rate": 2.4280172096519298e-05,
"loss": 1.1042,
"step": 706
},
{
"epoch": 2.0492753623188404,
"grad_norm": 0.4057666557957047,
"learning_rate": 2.4145997361809758e-05,
"loss": 1.0483,
"step": 707
},
{
"epoch": 2.0521739130434784,
"grad_norm": 0.4116946242019697,
"learning_rate": 2.4012076271520495e-05,
"loss": 1.1184,
"step": 708
},
{
"epoch": 2.055072463768116,
"grad_norm": 0.4127782071588422,
"learning_rate": 2.3878410139503693e-05,
"loss": 1.1238,
"step": 709
},
{
"epoch": 2.0579710144927534,
"grad_norm": 0.3964820416953686,
"learning_rate": 2.3745000277110197e-05,
"loss": 1.0499,
"step": 710
},
{
"epoch": 2.0608695652173914,
"grad_norm": 0.43556452448044664,
"learning_rate": 2.36118479931767e-05,
"loss": 1.0943,
"step": 711
},
{
"epoch": 2.063768115942029,
"grad_norm": 0.3995865010547347,
"learning_rate": 2.347895459401288e-05,
"loss": 1.04,
"step": 712
},
{
"epoch": 2.066666666666667,
"grad_norm": 0.4221661952062326,
"learning_rate": 2.334632138338859e-05,
"loss": 0.9803,
"step": 713
},
{
"epoch": 2.0695652173913044,
"grad_norm": 0.41950916776520863,
"learning_rate": 2.3213949662521066e-05,
"loss": 1.0886,
"step": 714
},
{
"epoch": 2.072463768115942,
"grad_norm": 0.4173493785071151,
"learning_rate": 2.308184073006216e-05,
"loss": 1.0596,
"step": 715
},
{
"epoch": 2.07536231884058,
"grad_norm": 0.39623286465989827,
"learning_rate": 2.2949995882085595e-05,
"loss": 1.0871,
"step": 716
},
{
"epoch": 2.0782608695652174,
"grad_norm": 0.39259310137723663,
"learning_rate": 2.2818416412074267e-05,
"loss": 1.0324,
"step": 717
},
{
"epoch": 2.081159420289855,
"grad_norm": 0.3822283284054439,
"learning_rate": 2.2687103610907534e-05,
"loss": 1.1117,
"step": 718
},
{
"epoch": 2.084057971014493,
"grad_norm": 0.407037401843374,
"learning_rate": 2.255605876684856e-05,
"loss": 1.0225,
"step": 719
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.4184329997154531,
"learning_rate": 2.2425283165531685e-05,
"loss": 1.0084,
"step": 720
},
{
"epoch": 2.0898550724637683,
"grad_norm": 0.4131172741343908,
"learning_rate": 2.22947780899498e-05,
"loss": 1.0207,
"step": 721
},
{
"epoch": 2.092753623188406,
"grad_norm": 0.4143196275192534,
"learning_rate": 2.216454482044176e-05,
"loss": 1.0337,
"step": 722
},
{
"epoch": 2.0956521739130434,
"grad_norm": 0.40754060408579984,
"learning_rate": 2.203458463467983e-05,
"loss": 1.1537,
"step": 723
},
{
"epoch": 2.0985507246376813,
"grad_norm": 0.42013725925992734,
"learning_rate": 2.1904898807657152e-05,
"loss": 0.9899,
"step": 724
},
{
"epoch": 2.101449275362319,
"grad_norm": 0.41687669776278075,
"learning_rate": 2.1775488611675233e-05,
"loss": 1.0832,
"step": 725
},
{
"epoch": 2.1043478260869564,
"grad_norm": 0.4286213604830879,
"learning_rate": 2.1646355316331458e-05,
"loss": 1.0802,
"step": 726
},
{
"epoch": 2.1072463768115943,
"grad_norm": 0.4042262579626966,
"learning_rate": 2.151750018850663e-05,
"loss": 1.0538,
"step": 727
},
{
"epoch": 2.110144927536232,
"grad_norm": 0.4010423956906586,
"learning_rate": 2.1388924492352565e-05,
"loss": 1.0897,
"step": 728
},
{
"epoch": 2.1130434782608694,
"grad_norm": 0.4120035283147293,
"learning_rate": 2.126062948927966e-05,
"loss": 1.1104,
"step": 729
},
{
"epoch": 2.1159420289855073,
"grad_norm": 0.4300470148265316,
"learning_rate": 2.1132616437944547e-05,
"loss": 1.0457,
"step": 730
},
{
"epoch": 2.118840579710145,
"grad_norm": 0.4153085209481317,
"learning_rate": 2.100488659423772e-05,
"loss": 1.0856,
"step": 731
},
{
"epoch": 2.121739130434783,
"grad_norm": 0.4060830438581685,
"learning_rate": 2.087744121127122e-05,
"loss": 1.0801,
"step": 732
},
{
"epoch": 2.1246376811594203,
"grad_norm": 0.4267224449360045,
"learning_rate": 2.075028153936636e-05,
"loss": 1.0158,
"step": 733
},
{
"epoch": 2.127536231884058,
"grad_norm": 0.4092513929978087,
"learning_rate": 2.062340882604143e-05,
"loss": 1.0211,
"step": 734
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.4297526463869587,
"learning_rate": 2.049682431599947e-05,
"loss": 1.1129,
"step": 735
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.4636326790218994,
"learning_rate": 2.0370529251116067e-05,
"loss": 1.1291,
"step": 736
},
{
"epoch": 2.136231884057971,
"grad_norm": 0.3974548122667625,
"learning_rate": 2.0244524870427172e-05,
"loss": 0.9923,
"step": 737
},
{
"epoch": 2.139130434782609,
"grad_norm": 0.4038721913341886,
"learning_rate": 2.0118812410116915e-05,
"loss": 1.0817,
"step": 738
},
{
"epoch": 2.1420289855072463,
"grad_norm": 0.41807115165201914,
"learning_rate": 1.999339310350551e-05,
"loss": 1.09,
"step": 739
},
{
"epoch": 2.1449275362318843,
"grad_norm": 0.40763130794004726,
"learning_rate": 1.9868268181037185e-05,
"loss": 1.0475,
"step": 740
},
{
"epoch": 2.1478260869565218,
"grad_norm": 0.4099162086697869,
"learning_rate": 1.9743438870267988e-05,
"loss": 1.0527,
"step": 741
},
{
"epoch": 2.1507246376811593,
"grad_norm": 0.4046969215163759,
"learning_rate": 1.961890639585388e-05,
"loss": 1.0224,
"step": 742
},
{
"epoch": 2.1536231884057973,
"grad_norm": 0.40495982818104165,
"learning_rate": 1.949467197953866e-05,
"loss": 0.9912,
"step": 743
},
{
"epoch": 2.1565217391304348,
"grad_norm": 0.4115616809855344,
"learning_rate": 1.9370736840141978e-05,
"loss": 1.0773,
"step": 744
},
{
"epoch": 2.1594202898550723,
"grad_norm": 0.42477438614499907,
"learning_rate": 1.9247102193547384e-05,
"loss": 1.0183,
"step": 745
},
{
"epoch": 2.1623188405797102,
"grad_norm": 0.39454596479550186,
"learning_rate": 1.912376925269041e-05,
"loss": 1.0548,
"step": 746
},
{
"epoch": 2.1652173913043478,
"grad_norm": 0.4324946159925722,
"learning_rate": 1.900073922754665e-05,
"loss": 1.0532,
"step": 747
},
{
"epoch": 2.1681159420289857,
"grad_norm": 0.40496616232865795,
"learning_rate": 1.8878013325119902e-05,
"loss": 1.1552,
"step": 748
},
{
"epoch": 2.1710144927536232,
"grad_norm": 0.41915807837518143,
"learning_rate": 1.8755592749430322e-05,
"loss": 1.0243,
"step": 749
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.4186007202451323,
"learning_rate": 1.8633478701502628e-05,
"loss": 1.0744,
"step": 750
},
{
"epoch": 2.1768115942028987,
"grad_norm": 0.42045626939886377,
"learning_rate": 1.8511672379354284e-05,
"loss": 1.068,
"step": 751
},
{
"epoch": 2.1797101449275362,
"grad_norm": 0.4045186001077355,
"learning_rate": 1.8390174977983778e-05,
"loss": 1.0957,
"step": 752
},
{
"epoch": 2.1826086956521737,
"grad_norm": 0.4478832702569865,
"learning_rate": 1.8268987689358874e-05,
"loss": 1.0909,
"step": 753
},
{
"epoch": 2.1855072463768117,
"grad_norm": 0.4164615953299648,
"learning_rate": 1.814811170240495e-05,
"loss": 1.0386,
"step": 754
},
{
"epoch": 2.1884057971014492,
"grad_norm": 0.41902328103819775,
"learning_rate": 1.80275482029933e-05,
"loss": 1.0344,
"step": 755
},
{
"epoch": 2.1913043478260867,
"grad_norm": 0.41670788409755355,
"learning_rate": 1.7907298373929517e-05,
"loss": 0.9878,
"step": 756
},
{
"epoch": 2.1942028985507247,
"grad_norm": 0.4294226441948201,
"learning_rate": 1.7787363394941875e-05,
"loss": 1.0175,
"step": 757
},
{
"epoch": 2.197101449275362,
"grad_norm": 0.4254645454494433,
"learning_rate": 1.7667744442669793e-05,
"loss": 1.0615,
"step": 758
},
{
"epoch": 2.2,
"grad_norm": 0.4099964946904337,
"learning_rate": 1.7548442690652238e-05,
"loss": 0.9919,
"step": 759
},
{
"epoch": 2.2028985507246377,
"grad_norm": 0.42880536140401987,
"learning_rate": 1.7429459309316254e-05,
"loss": 1.0661,
"step": 760
},
{
"epoch": 2.205797101449275,
"grad_norm": 0.4173497311104388,
"learning_rate": 1.7310795465965452e-05,
"loss": 1.0304,
"step": 761
},
{
"epoch": 2.208695652173913,
"grad_norm": 0.4181309528124866,
"learning_rate": 1.7192452324768577e-05,
"loss": 1.1069,
"step": 762
},
{
"epoch": 2.2115942028985507,
"grad_norm": 0.4253296723606123,
"learning_rate": 1.7074431046748075e-05,
"loss": 1.1159,
"step": 763
},
{
"epoch": 2.214492753623188,
"grad_norm": 0.4140966246574362,
"learning_rate": 1.69567327897687e-05,
"loss": 1.035,
"step": 764
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.4360262256456945,
"learning_rate": 1.683935870852621e-05,
"loss": 1.0341,
"step": 765
},
{
"epoch": 2.2202898550724637,
"grad_norm": 0.4129314987978601,
"learning_rate": 1.6722309954535915e-05,
"loss": 1.0361,
"step": 766
},
{
"epoch": 2.2231884057971016,
"grad_norm": 0.44728638008426197,
"learning_rate": 1.6605587676121492e-05,
"loss": 0.982,
"step": 767
},
{
"epoch": 2.226086956521739,
"grad_norm": 0.4142277894364414,
"learning_rate": 1.6489193018403694e-05,
"loss": 1.0186,
"step": 768
},
{
"epoch": 2.2289855072463767,
"grad_norm": 0.42466461089685326,
"learning_rate": 1.6373127123289082e-05,
"loss": 1.0878,
"step": 769
},
{
"epoch": 2.2318840579710146,
"grad_norm": 0.4255999017930268,
"learning_rate": 1.6257391129458866e-05,
"loss": 0.9795,
"step": 770
},
{
"epoch": 2.234782608695652,
"grad_norm": 0.4214111455741252,
"learning_rate": 1.614198617235768e-05,
"loss": 1.0523,
"step": 771
},
{
"epoch": 2.2376811594202897,
"grad_norm": 0.40833801140318804,
"learning_rate": 1.6026913384182513e-05,
"loss": 1.0665,
"step": 772
},
{
"epoch": 2.2405797101449276,
"grad_norm": 0.4060043083014689,
"learning_rate": 1.5912173893871534e-05,
"loss": 1.0294,
"step": 773
},
{
"epoch": 2.243478260869565,
"grad_norm": 0.441842102392729,
"learning_rate": 1.5797768827093055e-05,
"loss": 1.0781,
"step": 774
},
{
"epoch": 2.246376811594203,
"grad_norm": 0.42451158383299736,
"learning_rate": 1.5683699306234483e-05,
"loss": 1.03,
"step": 775
},
{
"epoch": 2.2492753623188406,
"grad_norm": 0.43280564540973687,
"learning_rate": 1.5569966450391273e-05,
"loss": 1.0932,
"step": 776
},
{
"epoch": 2.252173913043478,
"grad_norm": 0.4260799476878949,
"learning_rate": 1.5456571375356045e-05,
"loss": 0.9906,
"step": 777
},
{
"epoch": 2.255072463768116,
"grad_norm": 0.4289868937899867,
"learning_rate": 1.534351519360752e-05,
"loss": 1.1224,
"step": 778
},
{
"epoch": 2.2579710144927536,
"grad_norm": 0.4184482349129135,
"learning_rate": 1.5230799014299651e-05,
"loss": 1.0492,
"step": 779
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.4169287607356858,
"learning_rate": 1.5118423943250771e-05,
"loss": 1.0076,
"step": 780
},
{
"epoch": 2.263768115942029,
"grad_norm": 0.4437723000239763,
"learning_rate": 1.500639108293272e-05,
"loss": 1.0756,
"step": 781
},
{
"epoch": 2.2666666666666666,
"grad_norm": 2.438737443529068,
"learning_rate": 1.4894701532460026e-05,
"loss": 1.0372,
"step": 782
},
{
"epoch": 2.269565217391304,
"grad_norm": 0.4259694730355945,
"learning_rate": 1.4783356387579123e-05,
"loss": 1.0914,
"step": 783
},
{
"epoch": 2.272463768115942,
"grad_norm": 0.42609879566763975,
"learning_rate": 1.4672356740657612e-05,
"loss": 1.1024,
"step": 784
},
{
"epoch": 2.2753623188405796,
"grad_norm": 0.41473766458960193,
"learning_rate": 1.4561703680673528e-05,
"loss": 1.0437,
"step": 785
},
{
"epoch": 2.2782608695652176,
"grad_norm": 0.41138794322562033,
"learning_rate": 1.4451398293204671e-05,
"loss": 0.9883,
"step": 786
},
{
"epoch": 2.281159420289855,
"grad_norm": 0.4345116661977155,
"learning_rate": 1.4341441660417948e-05,
"loss": 1.0405,
"step": 787
},
{
"epoch": 2.2840579710144926,
"grad_norm": 0.43156004240612655,
"learning_rate": 1.423183486105874e-05,
"loss": 1.0858,
"step": 788
},
{
"epoch": 2.2869565217391306,
"grad_norm": 0.43394375495039533,
"learning_rate": 1.4122578970440392e-05,
"loss": 1.013,
"step": 789
},
{
"epoch": 2.289855072463768,
"grad_norm": 0.42318889929148634,
"learning_rate": 1.4013675060433562e-05,
"loss": 1.0667,
"step": 790
},
{
"epoch": 2.292753623188406,
"grad_norm": 0.4338786349395585,
"learning_rate": 1.3905124199455733e-05,
"loss": 0.9574,
"step": 791
},
{
"epoch": 2.2956521739130435,
"grad_norm": 0.4263774516063788,
"learning_rate": 1.379692745246079e-05,
"loss": 1.0388,
"step": 792
},
{
"epoch": 2.298550724637681,
"grad_norm": 0.4578203586741276,
"learning_rate": 1.368908588092852e-05,
"loss": 1.0852,
"step": 793
},
{
"epoch": 2.301449275362319,
"grad_norm": 0.4223544444704819,
"learning_rate": 1.3581600542854211e-05,
"loss": 1.0764,
"step": 794
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.42040297195621995,
"learning_rate": 1.3474472492738266e-05,
"loss": 1.0818,
"step": 795
},
{
"epoch": 2.307246376811594,
"grad_norm": 0.42233699920038903,
"learning_rate": 1.3367702781575858e-05,
"loss": 1.0144,
"step": 796
},
{
"epoch": 2.310144927536232,
"grad_norm": 0.42739886636894053,
"learning_rate": 1.3261292456846647e-05,
"loss": 1.011,
"step": 797
},
{
"epoch": 2.3130434782608695,
"grad_norm": 0.4319353955954341,
"learning_rate": 1.315524256250445e-05,
"loss": 0.9984,
"step": 798
},
{
"epoch": 2.315942028985507,
"grad_norm": 0.4240304031792234,
"learning_rate": 1.3049554138967051e-05,
"loss": 1.0865,
"step": 799
},
{
"epoch": 2.318840579710145,
"grad_norm": 0.44946527738642017,
"learning_rate": 1.2944228223105953e-05,
"loss": 1.0496,
"step": 800
},
{
"epoch": 2.3217391304347825,
"grad_norm": 0.42198617091436585,
"learning_rate": 1.2839265848236271e-05,
"loss": 1.0357,
"step": 801
},
{
"epoch": 2.3246376811594205,
"grad_norm": 0.42787604239445254,
"learning_rate": 1.273466804410649e-05,
"loss": 1.0624,
"step": 802
},
{
"epoch": 2.327536231884058,
"grad_norm": 0.4259453527555043,
"learning_rate": 1.2630435836888477e-05,
"loss": 1.0371,
"step": 803
},
{
"epoch": 2.3304347826086955,
"grad_norm": 0.4405744784698457,
"learning_rate": 1.2526570249167285e-05,
"loss": 1.0722,
"step": 804
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.44433415788871033,
"learning_rate": 1.242307229993126e-05,
"loss": 1.1003,
"step": 805
},
{
"epoch": 2.336231884057971,
"grad_norm": 0.44002850613090233,
"learning_rate": 1.2319943004561951e-05,
"loss": 1.0334,
"step": 806
},
{
"epoch": 2.3391304347826085,
"grad_norm": 0.4327626792123435,
"learning_rate": 1.2217183374824182e-05,
"loss": 1.0841,
"step": 807
},
{
"epoch": 2.3420289855072465,
"grad_norm": 0.44177237553294435,
"learning_rate": 1.2114794418856112e-05,
"loss": 1.1006,
"step": 808
},
{
"epoch": 2.344927536231884,
"grad_norm": 0.4252814673055529,
"learning_rate": 1.2012777141159359e-05,
"loss": 1.0902,
"step": 809
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.44481606310880256,
"learning_rate": 1.1911132542589126e-05,
"loss": 1.0663,
"step": 810
},
{
"epoch": 2.3507246376811595,
"grad_norm": 0.44531350592923585,
"learning_rate": 1.180986162034441e-05,
"loss": 1.0395,
"step": 811
},
{
"epoch": 2.353623188405797,
"grad_norm": 0.4403754842576467,
"learning_rate": 1.1708965367958175e-05,
"loss": 1.0367,
"step": 812
},
{
"epoch": 2.356521739130435,
"grad_norm": 0.44504741014172594,
"learning_rate": 1.160844477528768e-05,
"loss": 1.0668,
"step": 813
},
{
"epoch": 2.3594202898550725,
"grad_norm": 0.45218366246573805,
"learning_rate": 1.150830082850468e-05,
"loss": 1.0078,
"step": 814
},
{
"epoch": 2.36231884057971,
"grad_norm": 0.4400472472708365,
"learning_rate": 1.1408534510085805e-05,
"loss": 1.0535,
"step": 815
},
{
"epoch": 2.365217391304348,
"grad_norm": 0.429340428309833,
"learning_rate": 1.130914679880291e-05,
"loss": 1.0736,
"step": 816
},
{
"epoch": 2.3681159420289855,
"grad_norm": 0.41976853039844914,
"learning_rate": 1.1210138669713444e-05,
"loss": 0.9793,
"step": 817
},
{
"epoch": 2.3710144927536234,
"grad_norm": 0.430344411304319,
"learning_rate": 1.1111511094150945e-05,
"loss": 0.9848,
"step": 818
},
{
"epoch": 2.373913043478261,
"grad_norm": 0.431007787368086,
"learning_rate": 1.1013265039715465e-05,
"loss": 0.9797,
"step": 819
},
{
"epoch": 2.3768115942028984,
"grad_norm": 0.43768154374858875,
"learning_rate": 1.0915401470264081e-05,
"loss": 1.0339,
"step": 820
},
{
"epoch": 2.3797101449275364,
"grad_norm": 0.4153960922316617,
"learning_rate": 1.081792134590145e-05,
"loss": 1.0726,
"step": 821
},
{
"epoch": 2.382608695652174,
"grad_norm": 0.4261661560061093,
"learning_rate": 1.0720825622970387e-05,
"loss": 1.0732,
"step": 822
},
{
"epoch": 2.3855072463768114,
"grad_norm": 0.46272436711753084,
"learning_rate": 1.0624115254042482e-05,
"loss": 1.0509,
"step": 823
},
{
"epoch": 2.3884057971014494,
"grad_norm": 0.4159332663897536,
"learning_rate": 1.0527791187908736e-05,
"loss": 1.0301,
"step": 824
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.41855139337790126,
"learning_rate": 1.0431854369570316e-05,
"loss": 0.98,
"step": 825
},
{
"epoch": 2.3942028985507244,
"grad_norm": 0.4407049676844984,
"learning_rate": 1.0336305740229196e-05,
"loss": 1.0198,
"step": 826
},
{
"epoch": 2.3971014492753624,
"grad_norm": 0.44469510783381666,
"learning_rate": 1.0241146237278975e-05,
"loss": 1.0142,
"step": 827
},
{
"epoch": 2.4,
"grad_norm": 0.4204751833047234,
"learning_rate": 1.0146376794295698e-05,
"loss": 1.0435,
"step": 828
},
{
"epoch": 2.402898550724638,
"grad_norm": 0.43076006527935645,
"learning_rate": 1.0051998341028618e-05,
"loss": 1.0329,
"step": 829
},
{
"epoch": 2.4057971014492754,
"grad_norm": 0.4212241503239106,
"learning_rate": 9.958011803391166e-06,
"loss": 1.0517,
"step": 830
},
{
"epoch": 2.408695652173913,
"grad_norm": 0.43752577070512094,
"learning_rate": 9.864418103451828e-06,
"loss": 1.05,
"step": 831
},
{
"epoch": 2.411594202898551,
"grad_norm": 0.4539932456655938,
"learning_rate": 9.771218159425084e-06,
"loss": 1.0501,
"step": 832
},
{
"epoch": 2.4144927536231884,
"grad_norm": 0.44298901817857494,
"learning_rate": 9.678412885662418e-06,
"loss": 1.0399,
"step": 833
},
{
"epoch": 2.417391304347826,
"grad_norm": 0.44330383234774,
"learning_rate": 9.586003192643362e-06,
"loss": 1.0242,
"step": 834
},
{
"epoch": 2.420289855072464,
"grad_norm": 0.42235580319917715,
"learning_rate": 9.493989986966518e-06,
"loss": 1.0961,
"step": 835
},
{
"epoch": 2.4231884057971014,
"grad_norm": 0.42412654756876644,
"learning_rate": 9.402374171340705e-06,
"loss": 1.0747,
"step": 836
},
{
"epoch": 2.426086956521739,
"grad_norm": 0.4604003701876417,
"learning_rate": 9.311156644576108e-06,
"loss": 0.9956,
"step": 837
},
{
"epoch": 2.428985507246377,
"grad_norm": 0.4355065867115315,
"learning_rate": 9.220338301575414e-06,
"loss": 1.0515,
"step": 838
},
{
"epoch": 2.4318840579710144,
"grad_norm": 0.41606575435043913,
"learning_rate": 9.129920033325068e-06,
"loss": 1.0834,
"step": 839
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.41400057706555543,
"learning_rate": 9.039902726886535e-06,
"loss": 1.025,
"step": 840
},
{
"epoch": 2.43768115942029,
"grad_norm": 0.4212465286811161,
"learning_rate": 8.95028726538758e-06,
"loss": 1.0888,
"step": 841
},
{
"epoch": 2.4405797101449274,
"grad_norm": 0.44292414437801153,
"learning_rate": 8.861074528013586e-06,
"loss": 1.1063,
"step": 842
},
{
"epoch": 2.4434782608695653,
"grad_norm": 0.4618762426767351,
"learning_rate": 8.77226538999899e-06,
"loss": 1.0861,
"step": 843
},
{
"epoch": 2.446376811594203,
"grad_norm": 0.42934378228075604,
"learning_rate": 8.683860722618641e-06,
"loss": 1.0674,
"step": 844
},
{
"epoch": 2.449275362318841,
"grad_norm": 0.44137968841741865,
"learning_rate": 8.595861393179277e-06,
"loss": 1.0248,
"step": 845
},
{
"epoch": 2.4521739130434783,
"grad_norm": 0.45115385912472034,
"learning_rate": 8.508268265011005e-06,
"loss": 1.0471,
"step": 846
},
{
"epoch": 2.455072463768116,
"grad_norm": 0.44160775586291273,
"learning_rate": 8.42108219745884e-06,
"loss": 1.0375,
"step": 847
},
{
"epoch": 2.457971014492754,
"grad_norm": 0.44498128589628316,
"learning_rate": 8.334304045874247e-06,
"loss": 1.0928,
"step": 848
},
{
"epoch": 2.4608695652173913,
"grad_norm": 0.42944613569509194,
"learning_rate": 8.247934661606826e-06,
"loss": 1.0611,
"step": 849
},
{
"epoch": 2.463768115942029,
"grad_norm": 0.4293984310812336,
"learning_rate": 8.161974891995855e-06,
"loss": 1.0425,
"step": 850
},
{
"epoch": 2.466666666666667,
"grad_norm": 0.43223021088950386,
"learning_rate": 8.076425580362052e-06,
"loss": 1.0966,
"step": 851
},
{
"epoch": 2.4695652173913043,
"grad_norm": 0.4511615485513439,
"learning_rate": 7.991287565999272e-06,
"loss": 0.9823,
"step": 852
},
{
"epoch": 2.472463768115942,
"grad_norm": 0.43175751442143545,
"learning_rate": 7.906561684166275e-06,
"loss": 1.046,
"step": 853
},
{
"epoch": 2.47536231884058,
"grad_norm": 0.4398354654162565,
"learning_rate": 7.822248766078555e-06,
"loss": 1.1159,
"step": 854
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.4217658734022817,
"learning_rate": 7.738349638900127e-06,
"loss": 1.0605,
"step": 855
},
{
"epoch": 2.4811594202898553,
"grad_norm": 0.4463848438795895,
"learning_rate": 7.654865125735483e-06,
"loss": 0.987,
"step": 856
},
{
"epoch": 2.4840579710144928,
"grad_norm": 0.4553067045132744,
"learning_rate": 7.571796045621482e-06,
"loss": 1.049,
"step": 857
},
{
"epoch": 2.4869565217391303,
"grad_norm": 0.4470257852745124,
"learning_rate": 7.489143213519301e-06,
"loss": 1.0841,
"step": 858
},
{
"epoch": 2.4898550724637682,
"grad_norm": 0.42594930418564064,
"learning_rate": 7.406907440306471e-06,
"loss": 1.0877,
"step": 859
},
{
"epoch": 2.4927536231884058,
"grad_norm": 0.4284878480179994,
"learning_rate": 7.325089532768892e-06,
"loss": 1.0765,
"step": 860
},
{
"epoch": 2.4956521739130437,
"grad_norm": 0.44182270672000895,
"learning_rate": 7.243690293592959e-06,
"loss": 1.0233,
"step": 861
},
{
"epoch": 2.4985507246376812,
"grad_norm": 0.43871383223404364,
"learning_rate": 7.1627105213576355e-06,
"loss": 1.0702,
"step": 862
},
{
"epoch": 2.5014492753623188,
"grad_norm": 0.4277793635895529,
"learning_rate": 7.08215101052665e-06,
"loss": 1.0573,
"step": 863
},
{
"epoch": 2.5043478260869563,
"grad_norm": 0.4406001751473407,
"learning_rate": 7.002012551440701e-06,
"loss": 1.0316,
"step": 864
},
{
"epoch": 2.5072463768115942,
"grad_norm": 0.5413472127354161,
"learning_rate": 6.922295930309691e-06,
"loss": 1.0798,
"step": 865
},
{
"epoch": 2.5101449275362318,
"grad_norm": 0.4301282293831735,
"learning_rate": 6.84300192920504e-06,
"loss": 1.0723,
"step": 866
},
{
"epoch": 2.5130434782608697,
"grad_norm": 0.43181259980748293,
"learning_rate": 6.764131326051953e-06,
"loss": 1.0395,
"step": 867
},
{
"epoch": 2.5159420289855072,
"grad_norm": 0.4357413758485379,
"learning_rate": 6.6856848946218635e-06,
"loss": 1.04,
"step": 868
},
{
"epoch": 2.5188405797101447,
"grad_norm": 0.4441512604958444,
"learning_rate": 6.607663404524795e-06,
"loss": 1.02,
"step": 869
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.4403400361786895,
"learning_rate": 6.53006762120183e-06,
"loss": 0.9813,
"step": 870
},
{
"epoch": 2.52463768115942,
"grad_norm": 0.4295706766182875,
"learning_rate": 6.452898305917587e-06,
"loss": 1.0977,
"step": 871
},
{
"epoch": 2.527536231884058,
"grad_norm": 0.4500164864119338,
"learning_rate": 6.376156215752743e-06,
"loss": 1.046,
"step": 872
},
{
"epoch": 2.5304347826086957,
"grad_norm": 0.4295283517592817,
"learning_rate": 6.299842103596665e-06,
"loss": 0.9962,
"step": 873
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.4298591342734868,
"learning_rate": 6.223956718139939e-06,
"loss": 1.0351,
"step": 874
},
{
"epoch": 2.536231884057971,
"grad_norm": 0.41916133011716233,
"learning_rate": 6.14850080386708e-06,
"loss": 0.9795,
"step": 875
},
{
"epoch": 2.5391304347826087,
"grad_norm": 0.450757056089375,
"learning_rate": 6.073475101049209e-06,
"loss": 1.0287,
"step": 876
},
{
"epoch": 2.5420289855072467,
"grad_norm": 0.4428910375540849,
"learning_rate": 5.998880345736812e-06,
"loss": 1.0841,
"step": 877
},
{
"epoch": 2.544927536231884,
"grad_norm": 0.4370122339112871,
"learning_rate": 5.924717269752478e-06,
"loss": 1.0355,
"step": 878
},
{
"epoch": 2.5478260869565217,
"grad_norm": 0.4328546688643461,
"learning_rate": 5.8509866006837725e-06,
"loss": 1.0458,
"step": 879
},
{
"epoch": 2.550724637681159,
"grad_norm": 0.45457918016504273,
"learning_rate": 5.777689061876035e-06,
"loss": 1.0407,
"step": 880
},
{
"epoch": 2.553623188405797,
"grad_norm": 0.41666707799866615,
"learning_rate": 5.704825372425343e-06,
"loss": 1.0336,
"step": 881
},
{
"epoch": 2.5565217391304347,
"grad_norm": 0.4500898444777061,
"learning_rate": 5.6323962471714286e-06,
"loss": 1.0082,
"step": 882
},
{
"epoch": 2.5594202898550726,
"grad_norm": 0.43189682364915644,
"learning_rate": 5.560402396690667e-06,
"loss": 1.0732,
"step": 883
},
{
"epoch": 2.56231884057971,
"grad_norm": 0.4517991164758783,
"learning_rate": 5.4888445272891e-06,
"loss": 1.0565,
"step": 884
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.43585727349975845,
"learning_rate": 5.417723340995545e-06,
"loss": 1.0569,
"step": 885
},
{
"epoch": 2.5681159420289856,
"grad_norm": 0.4451555207263539,
"learning_rate": 5.347039535554632e-06,
"loss": 1.0934,
"step": 886
},
{
"epoch": 2.571014492753623,
"grad_norm": 0.44753595012523295,
"learning_rate": 5.276793804420033e-06,
"loss": 1.0129,
"step": 887
},
{
"epoch": 2.573913043478261,
"grad_norm": 0.43340171540500966,
"learning_rate": 5.206986836747624e-06,
"loss": 1.057,
"step": 888
},
{
"epoch": 2.5768115942028986,
"grad_norm": 0.41103056048092484,
"learning_rate": 5.13761931738872e-06,
"loss": 1.0629,
"step": 889
},
{
"epoch": 2.579710144927536,
"grad_norm": 0.4379217485808061,
"learning_rate": 5.068691926883367e-06,
"loss": 1.1122,
"step": 890
},
{
"epoch": 2.5826086956521737,
"grad_norm": 0.4367395495858654,
"learning_rate": 5.000205341453679e-06,
"loss": 1.0641,
"step": 891
},
{
"epoch": 2.5855072463768116,
"grad_norm": 0.4346646618624072,
"learning_rate": 4.9321602329971735e-06,
"loss": 1.0247,
"step": 892
},
{
"epoch": 2.588405797101449,
"grad_norm": 0.4266332511623276,
"learning_rate": 4.864557269080183e-06,
"loss": 1.1,
"step": 893
},
{
"epoch": 2.591304347826087,
"grad_norm": 0.4280568908138626,
"learning_rate": 4.7973971129313455e-06,
"loss": 0.9916,
"step": 894
},
{
"epoch": 2.5942028985507246,
"grad_norm": 0.4157220462493493,
"learning_rate": 4.730680423435046e-06,
"loss": 1.0384,
"step": 895
},
{
"epoch": 2.597101449275362,
"grad_norm": 0.4657661567334127,
"learning_rate": 4.6644078551249916e-06,
"loss": 1.0206,
"step": 896
},
{
"epoch": 2.6,
"grad_norm": 0.4402043390402084,
"learning_rate": 4.59858005817776e-06,
"loss": 1.0051,
"step": 897
},
{
"epoch": 2.6028985507246376,
"grad_norm": 0.47342746863944507,
"learning_rate": 4.533197678406459e-06,
"loss": 0.9908,
"step": 898
},
{
"epoch": 2.6057971014492756,
"grad_norm": 0.44686945552614565,
"learning_rate": 4.468261357254339e-06,
"loss": 1.0194,
"step": 899
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.45848518372098457,
"learning_rate": 4.403771731788547e-06,
"loss": 1.0751,
"step": 900
},
{
"epoch": 2.6115942028985506,
"grad_norm": 0.41833931514497974,
"learning_rate": 4.339729434693851e-06,
"loss": 1.0486,
"step": 901
},
{
"epoch": 2.6144927536231886,
"grad_norm": 0.4154891635226541,
"learning_rate": 4.276135094266437e-06,
"loss": 1.0246,
"step": 902
},
{
"epoch": 2.617391304347826,
"grad_norm": 0.42902378243746886,
"learning_rate": 4.212989334407752e-06,
"loss": 1.0367,
"step": 903
},
{
"epoch": 2.620289855072464,
"grad_norm": 0.4413147059304679,
"learning_rate": 4.150292774618386e-06,
"loss": 1.0377,
"step": 904
},
{
"epoch": 2.6231884057971016,
"grad_norm": 0.4326053305994359,
"learning_rate": 4.088046029991954e-06,
"loss": 1.0321,
"step": 905
},
{
"epoch": 2.626086956521739,
"grad_norm": 0.43297947767772066,
"learning_rate": 4.026249711209134e-06,
"loss": 1.0814,
"step": 906
},
{
"epoch": 2.6289855072463766,
"grad_norm": 0.42391791250689304,
"learning_rate": 3.964904424531623e-06,
"loss": 1.1435,
"step": 907
},
{
"epoch": 2.6318840579710145,
"grad_norm": 0.44465042718334696,
"learning_rate": 3.90401077179619e-06,
"loss": 1.0755,
"step": 908
},
{
"epoch": 2.634782608695652,
"grad_norm": 0.4379840802629311,
"learning_rate": 3.843569350408799e-06,
"loss": 1.0326,
"step": 909
},
{
"epoch": 2.63768115942029,
"grad_norm": 0.4380256503816688,
"learning_rate": 3.7835807533387336e-06,
"loss": 0.9959,
"step": 910
},
{
"epoch": 2.6405797101449275,
"grad_norm": 0.4250114900172059,
"learning_rate": 3.724045569112766e-06,
"loss": 1.0413,
"step": 911
},
{
"epoch": 2.643478260869565,
"grad_norm": 0.43495634484636064,
"learning_rate": 3.664964381809416e-06,
"loss": 1.0502,
"step": 912
},
{
"epoch": 2.646376811594203,
"grad_norm": 0.41338659373447945,
"learning_rate": 3.606337771053181e-06,
"loss": 1.0322,
"step": 913
},
{
"epoch": 2.6492753623188405,
"grad_norm": 0.4607899596362807,
"learning_rate": 3.548166312008877e-06,
"loss": 1.062,
"step": 914
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.4456807876825619,
"learning_rate": 3.4904505753759863e-06,
"loss": 1.049,
"step": 915
},
{
"epoch": 2.655072463768116,
"grad_norm": 0.45066296980753234,
"learning_rate": 3.4331911273830784e-06,
"loss": 1.1202,
"step": 916
},
{
"epoch": 2.6579710144927535,
"grad_norm": 0.42887756180559006,
"learning_rate": 3.376388529782215e-06,
"loss": 1.0579,
"step": 917
},
{
"epoch": 2.660869565217391,
"grad_norm": 0.4242946529545818,
"learning_rate": 3.320043339843465e-06,
"loss": 1.0094,
"step": 918
},
{
"epoch": 2.663768115942029,
"grad_norm": 0.4509087953831623,
"learning_rate": 3.2641561103494424e-06,
"loss": 1.126,
"step": 919
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.4582297576992613,
"learning_rate": 3.2087273895898606e-06,
"loss": 1.0978,
"step": 920
},
{
"epoch": 2.6695652173913045,
"grad_norm": 0.41892321793577525,
"learning_rate": 3.153757721356182e-06,
"loss": 1.0188,
"step": 921
},
{
"epoch": 2.672463768115942,
"grad_norm": 0.43091493659712077,
"learning_rate": 3.0992476449362653e-06,
"loss": 1.0657,
"step": 922
},
{
"epoch": 2.6753623188405795,
"grad_norm": 0.4484469573992589,
"learning_rate": 3.0451976951090757e-06,
"loss": 1.0578,
"step": 923
},
{
"epoch": 2.6782608695652175,
"grad_norm": 0.45221935250795153,
"learning_rate": 2.991608402139434e-06,
"loss": 1.0728,
"step": 924
},
{
"epoch": 2.681159420289855,
"grad_norm": 0.42748137661848884,
"learning_rate": 2.938480291772827e-06,
"loss": 1.0517,
"step": 925
},
{
"epoch": 2.684057971014493,
"grad_norm": 0.4338746720819457,
"learning_rate": 2.8858138852302374e-06,
"loss": 1.0192,
"step": 926
},
{
"epoch": 2.6869565217391305,
"grad_norm": 0.44271385780896827,
"learning_rate": 2.833609699203038e-06,
"loss": 1.0409,
"step": 927
},
{
"epoch": 2.689855072463768,
"grad_norm": 0.44168360737350637,
"learning_rate": 2.7818682458479294e-06,
"loss": 1.0353,
"step": 928
},
{
"epoch": 2.692753623188406,
"grad_norm": 0.44662829054916564,
"learning_rate": 2.7305900327818936e-06,
"loss": 1.0321,
"step": 929
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.4372789501470448,
"learning_rate": 2.679775563077247e-06,
"loss": 1.0469,
"step": 930
},
{
"epoch": 2.6985507246376814,
"grad_norm": 0.4170715080873589,
"learning_rate": 2.6294253352566466e-06,
"loss": 1.0717,
"step": 931
},
{
"epoch": 2.701449275362319,
"grad_norm": 0.44425061018773043,
"learning_rate": 2.5795398432882756e-06,
"loss": 1.0892,
"step": 932
},
{
"epoch": 2.7043478260869565,
"grad_norm": 0.43077942102243316,
"learning_rate": 2.530119576580936e-06,
"loss": 1.0542,
"step": 933
},
{
"epoch": 2.707246376811594,
"grad_norm": 0.4370359842657613,
"learning_rate": 2.4811650199792924e-06,
"loss": 1.0096,
"step": 934
},
{
"epoch": 2.710144927536232,
"grad_norm": 0.43626145737902144,
"learning_rate": 2.4326766537590693e-06,
"loss": 1.081,
"step": 935
},
{
"epoch": 2.7130434782608694,
"grad_norm": 0.47685901764854666,
"learning_rate": 2.384654953622384e-06,
"loss": 1.1176,
"step": 936
},
{
"epoch": 2.7159420289855074,
"grad_norm": 0.45228260777925117,
"learning_rate": 2.3371003906930423e-06,
"loss": 1.0481,
"step": 937
},
{
"epoch": 2.718840579710145,
"grad_norm": 0.44256756961973887,
"learning_rate": 2.290013431511945e-06,
"loss": 1.0347,
"step": 938
},
{
"epoch": 2.7217391304347824,
"grad_norm": 0.4402726419838423,
"learning_rate": 2.243394538032484e-06,
"loss": 1.0369,
"step": 939
},
{
"epoch": 2.7246376811594204,
"grad_norm": 0.45365804923951414,
"learning_rate": 2.197244167616047e-06,
"loss": 1.0973,
"step": 940
},
{
"epoch": 2.727536231884058,
"grad_norm": 0.4525083377542681,
"learning_rate": 2.1515627730274822e-06,
"loss": 1.0616,
"step": 941
},
{
"epoch": 2.730434782608696,
"grad_norm": 0.41867968643258735,
"learning_rate": 2.106350802430718e-06,
"loss": 1.0361,
"step": 942
},
{
"epoch": 2.7333333333333334,
"grad_norm": 0.44410487106485796,
"learning_rate": 2.0616086993842876e-06,
"loss": 1.0262,
"step": 943
},
{
"epoch": 2.736231884057971,
"grad_norm": 0.42533114796177457,
"learning_rate": 2.0173369028370583e-06,
"loss": 1.0324,
"step": 944
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.41967355790971034,
"learning_rate": 1.9735358471238586e-06,
"loss": 1.0439,
"step": 945
},
{
"epoch": 2.7420289855072464,
"grad_norm": 0.4313810499422798,
"learning_rate": 1.9302059619612787e-06,
"loss": 1.0067,
"step": 946
},
{
"epoch": 2.744927536231884,
"grad_norm": 0.4457644564670882,
"learning_rate": 1.8873476724433902e-06,
"loss": 1.0433,
"step": 947
},
{
"epoch": 2.747826086956522,
"grad_norm": 0.44140575476367844,
"learning_rate": 1.8449613990376313e-06,
"loss": 1.0281,
"step": 948
},
{
"epoch": 2.7507246376811594,
"grad_norm": 0.41388990569707274,
"learning_rate": 1.8030475575806394e-06,
"loss": 1.0779,
"step": 949
},
{
"epoch": 2.753623188405797,
"grad_norm": 0.44319022004684594,
"learning_rate": 1.7616065592742038e-06,
"loss": 1.0709,
"step": 950
},
{
"epoch": 2.756521739130435,
"grad_norm": 0.42280831552653275,
"learning_rate": 1.7206388106812077e-06,
"loss": 1.0602,
"step": 951
},
{
"epoch": 2.7594202898550724,
"grad_norm": 0.41831113949584664,
"learning_rate": 1.6801447137216652e-06,
"loss": 1.0519,
"step": 952
},
{
"epoch": 2.7623188405797103,
"grad_norm": 0.42149777436767877,
"learning_rate": 1.6401246656687463e-06,
"loss": 1.0568,
"step": 953
},
{
"epoch": 2.765217391304348,
"grad_norm": 0.429110137697547,
"learning_rate": 1.6005790591448966e-06,
"loss": 1.1177,
"step": 954
},
{
"epoch": 2.7681159420289854,
"grad_norm": 0.46048857323106746,
"learning_rate": 1.5615082821180071e-06,
"loss": 1.0583,
"step": 955
},
{
"epoch": 2.7710144927536233,
"grad_norm": 0.4299763555661624,
"learning_rate": 1.522912717897551e-06,
"loss": 1.1047,
"step": 956
},
{
"epoch": 2.773913043478261,
"grad_norm": 0.47595502230009035,
"learning_rate": 1.4847927451308753e-06,
"loss": 1.0598,
"step": 957
},
{
"epoch": 2.776811594202899,
"grad_norm": 0.44472688488854684,
"learning_rate": 1.447148737799481e-06,
"loss": 1.0717,
"step": 958
},
{
"epoch": 2.7797101449275363,
"grad_norm": 0.446411341344231,
"learning_rate": 1.4099810652153212e-06,
"loss": 1.0873,
"step": 959
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.4395447440323806,
"learning_rate": 1.3732900920172154e-06,
"loss": 1.0097,
"step": 960
},
{
"epoch": 2.7855072463768114,
"grad_norm": 0.4374552230480354,
"learning_rate": 1.3370761781672346e-06,
"loss": 1.0025,
"step": 961
},
{
"epoch": 2.7884057971014493,
"grad_norm": 0.4585611691245378,
"learning_rate": 1.3013396789472055e-06,
"loss": 0.9921,
"step": 962
},
{
"epoch": 2.791304347826087,
"grad_norm": 0.4367319010484946,
"learning_rate": 1.2660809449552058e-06,
"loss": 1.005,
"step": 963
},
{
"epoch": 2.794202898550725,
"grad_norm": 0.41818614449882124,
"learning_rate": 1.2313003221021302e-06,
"loss": 1.0392,
"step": 964
},
{
"epoch": 2.7971014492753623,
"grad_norm": 0.43712018288101745,
"learning_rate": 1.1969981516082972e-06,
"loss": 1.0703,
"step": 965
},
{
"epoch": 2.8,
"grad_norm": 0.4330052924141849,
"learning_rate": 1.163174770000086e-06,
"loss": 1.0149,
"step": 966
},
{
"epoch": 2.802898550724638,
"grad_norm": 0.4637514588180937,
"learning_rate": 1.1298305091066664e-06,
"loss": 1.054,
"step": 967
},
{
"epoch": 2.8057971014492753,
"grad_norm": 0.4328211094942756,
"learning_rate": 1.0969656960567177e-06,
"loss": 1.1024,
"step": 968
},
{
"epoch": 2.8086956521739133,
"grad_norm": 0.49114261638602824,
"learning_rate": 1.0645806532752156e-06,
"loss": 1.0506,
"step": 969
},
{
"epoch": 2.8115942028985508,
"grad_norm": 0.43504595478449676,
"learning_rate": 1.0326756984803065e-06,
"loss": 1.0711,
"step": 970
},
{
"epoch": 2.8144927536231883,
"grad_norm": 0.4348937962062495,
"learning_rate": 1.0012511446801377e-06,
"loss": 1.1078,
"step": 971
},
{
"epoch": 2.8173913043478263,
"grad_norm": 0.44058656927819256,
"learning_rate": 9.70307300169826e-07,
"loss": 1.0991,
"step": 972
},
{
"epoch": 2.8202898550724638,
"grad_norm": 0.4295244566694527,
"learning_rate": 9.39844468528428e-07,
"loss": 0.9995,
"step": 973
},
{
"epoch": 2.8231884057971013,
"grad_norm": 0.4367203092602682,
"learning_rate": 9.09862948615936e-07,
"loss": 1.0519,
"step": 974
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.4449664564834592,
"learning_rate": 8.803630345703751e-07,
"loss": 1.0474,
"step": 975
},
{
"epoch": 2.8289855072463768,
"grad_norm": 0.4297347658970927,
"learning_rate": 8.513450158049108e-07,
"loss": 1.0695,
"step": 976
},
{
"epoch": 2.8318840579710143,
"grad_norm": 0.4486135418859604,
"learning_rate": 8.228091770049961e-07,
"loss": 1.0164,
"step": 977
},
{
"epoch": 2.8347826086956522,
"grad_norm": 0.43980229550927924,
"learning_rate": 7.947557981255904e-07,
"loss": 1.0317,
"step": 978
},
{
"epoch": 2.8376811594202898,
"grad_norm": 0.44553738280573807,
"learning_rate": 7.671851543884112e-07,
"loss": 1.0946,
"step": 979
},
{
"epoch": 2.8405797101449277,
"grad_norm": 0.4363004911544926,
"learning_rate": 7.400975162792367e-07,
"loss": 1.003,
"step": 980
},
{
"epoch": 2.8434782608695652,
"grad_norm": 0.4413405166653603,
"learning_rate": 7.134931495452413e-07,
"loss": 1.0882,
"step": 981
},
{
"epoch": 2.8463768115942027,
"grad_norm": 0.44085985363028396,
"learning_rate": 6.873723151924027e-07,
"loss": 0.9974,
"step": 982
},
{
"epoch": 2.8492753623188407,
"grad_norm": 0.44891911764344156,
"learning_rate": 6.617352694829381e-07,
"loss": 0.9997,
"step": 983
},
{
"epoch": 2.8521739130434782,
"grad_norm": 0.4297742893819775,
"learning_rate": 6.365822639327723e-07,
"loss": 1.0248,
"step": 984
},
{
"epoch": 2.855072463768116,
"grad_norm": 0.44307938049828505,
"learning_rate": 6.119135453090952e-07,
"loss": 1.0523,
"step": 985
},
{
"epoch": 2.8579710144927537,
"grad_norm": 0.4219491261370554,
"learning_rate": 5.877293556279306e-07,
"loss": 1.0316,
"step": 986
},
{
"epoch": 2.860869565217391,
"grad_norm": 0.4441565730933646,
"learning_rate": 5.64029932151755e-07,
"loss": 1.0601,
"step": 987
},
{
"epoch": 2.8637681159420287,
"grad_norm": 0.43904823016047406,
"learning_rate": 5.408155073871768e-07,
"loss": 1.0962,
"step": 988
},
{
"epoch": 2.8666666666666667,
"grad_norm": 0.4380193819651974,
"learning_rate": 5.180863090826604e-07,
"loss": 1.0828,
"step": 989
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.46490668660417783,
"learning_rate": 4.95842560226284e-07,
"loss": 0.9954,
"step": 990
},
{
"epoch": 2.872463768115942,
"grad_norm": 0.44779443933129964,
"learning_rate": 4.7408447904354614e-07,
"loss": 0.9894,
"step": 991
},
{
"epoch": 2.8753623188405797,
"grad_norm": 0.44039118698865287,
"learning_rate": 4.52812278995246e-07,
"loss": 0.9391,
"step": 992
},
{
"epoch": 2.878260869565217,
"grad_norm": 0.44888017878839825,
"learning_rate": 4.3202616877536793e-07,
"loss": 1.044,
"step": 993
},
{
"epoch": 2.881159420289855,
"grad_norm": 0.4412322695340127,
"learning_rate": 4.117263523090442e-07,
"loss": 1.1098,
"step": 994
},
{
"epoch": 2.8840579710144927,
"grad_norm": 0.42595193117492713,
"learning_rate": 3.919130287505457e-07,
"loss": 1.0755,
"step": 995
},
{
"epoch": 2.8869565217391306,
"grad_norm": 0.44081324693289337,
"learning_rate": 3.725863924813389e-07,
"loss": 1.0776,
"step": 996
},
{
"epoch": 2.889855072463768,
"grad_norm": 0.45676229278822633,
"learning_rate": 3.5374663310818735e-07,
"loss": 1.121,
"step": 997
},
{
"epoch": 2.8927536231884057,
"grad_norm": 0.42858508933481326,
"learning_rate": 3.3539393546124784e-07,
"loss": 1.0342,
"step": 998
},
{
"epoch": 2.8956521739130436,
"grad_norm": 0.4554639141142107,
"learning_rate": 3.1752847959232167e-07,
"loss": 1.0403,
"step": 999
},
{
"epoch": 2.898550724637681,
"grad_norm": 0.4443160110274387,
"learning_rate": 3.0015044077303933e-07,
"loss": 0.9923,
"step": 1000
},
{
"epoch": 2.901449275362319,
"grad_norm": 0.45114283690245177,
"learning_rate": 2.8325998949314536e-07,
"loss": 1.0137,
"step": 1001
},
{
"epoch": 2.9043478260869566,
"grad_norm": 0.440281019286359,
"learning_rate": 2.668572914588496e-07,
"loss": 1.0009,
"step": 1002
},
{
"epoch": 2.907246376811594,
"grad_norm": 0.42131395328506477,
"learning_rate": 2.509425075911953e-07,
"loss": 1.0864,
"step": 1003
},
{
"epoch": 2.9101449275362317,
"grad_norm": 0.4431327301308889,
"learning_rate": 2.3551579402445455e-07,
"loss": 1.0369,
"step": 1004
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.437441254967641,
"learning_rate": 2.2057730210462979e-07,
"loss": 1.0946,
"step": 1005
},
{
"epoch": 2.915942028985507,
"grad_norm": 0.44460142080563914,
"learning_rate": 2.0612717838794926e-07,
"loss": 1.0682,
"step": 1006
},
{
"epoch": 2.918840579710145,
"grad_norm": 0.46357594598759,
"learning_rate": 1.9216556463943492e-07,
"loss": 1.0347,
"step": 1007
},
{
"epoch": 2.9217391304347826,
"grad_norm": 0.4280959868112658,
"learning_rate": 1.7869259783150905e-07,
"loss": 1.0446,
"step": 1008
},
{
"epoch": 2.92463768115942,
"grad_norm": 0.4391861785357275,
"learning_rate": 1.657084101426565e-07,
"loss": 1.0055,
"step": 1009
},
{
"epoch": 2.927536231884058,
"grad_norm": 0.43467829714626893,
"learning_rate": 1.5321312895612007e-07,
"loss": 1.0468,
"step": 1010
},
{
"epoch": 2.9304347826086956,
"grad_norm": 0.436157471233564,
"learning_rate": 1.4120687685866274e-07,
"loss": 1.003,
"step": 1011
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.4387651565287021,
"learning_rate": 1.2968977163934638e-07,
"loss": 1.0961,
"step": 1012
},
{
"epoch": 2.936231884057971,
"grad_norm": 0.4544754835767558,
"learning_rate": 1.1866192628839368e-07,
"loss": 1.1016,
"step": 1013
},
{
"epoch": 2.9391304347826086,
"grad_norm": 0.4558428482092103,
"learning_rate": 1.0812344899607252e-07,
"loss": 1.0319,
"step": 1014
},
{
"epoch": 2.942028985507246,
"grad_norm": 0.4298065423481269,
"learning_rate": 9.807444315163006e-08,
"loss": 1.0564,
"step": 1015
},
{
"epoch": 2.944927536231884,
"grad_norm": 0.45987333857679424,
"learning_rate": 8.851500734229357e-08,
"loss": 1.0879,
"step": 1016
},
{
"epoch": 2.9478260869565216,
"grad_norm": 0.42633685574770663,
"learning_rate": 7.944523535228233e-08,
"loss": 1.02,
"step": 1017
},
{
"epoch": 2.9507246376811596,
"grad_norm": 0.42941746517921314,
"learning_rate": 7.086521616190279e-08,
"loss": 1.0368,
"step": 1018
},
{
"epoch": 2.953623188405797,
"grad_norm": 0.4500990712597483,
"learning_rate": 6.27750339466715e-08,
"loss": 1.0091,
"step": 1019
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.43281524715248404,
"learning_rate": 5.517476807648248e-08,
"loss": 1.0871,
"step": 1020
},
{
"epoch": 2.9594202898550726,
"grad_norm": 0.4404835864216849,
"learning_rate": 4.806449311484107e-08,
"loss": 1.1031,
"step": 1021
},
{
"epoch": 2.96231884057971,
"grad_norm": 0.4292383359696952,
"learning_rate": 4.144427881813129e-08,
"loss": 0.9651,
"step": 1022
},
{
"epoch": 2.965217391304348,
"grad_norm": 0.43976585710369,
"learning_rate": 3.531419013491632e-08,
"loss": 1.0691,
"step": 1023
},
{
"epoch": 2.9681159420289855,
"grad_norm": 0.43252864631461296,
"learning_rate": 2.967428720531129e-08,
"loss": 0.9949,
"step": 1024
},
{
"epoch": 2.971014492753623,
"grad_norm": 0.4477919897543057,
"learning_rate": 2.4524625360400345e-08,
"loss": 1.0986,
"step": 1025
},
{
"epoch": 2.973913043478261,
"grad_norm": 0.4289179803109601,
"learning_rate": 1.986525512168158e-08,
"loss": 1.0116,
"step": 1026
},
{
"epoch": 2.9768115942028985,
"grad_norm": 0.45865303578317895,
"learning_rate": 1.5696222200578535e-08,
"loss": 1.0639,
"step": 1027
},
{
"epoch": 2.9797101449275365,
"grad_norm": 0.43468926771375377,
"learning_rate": 1.2017567497996097e-08,
"loss": 0.9828,
"step": 1028
},
{
"epoch": 2.982608695652174,
"grad_norm": 0.4353013480109291,
"learning_rate": 8.82932710389861e-09,
"loss": 1.0111,
"step": 1029
},
{
"epoch": 2.9855072463768115,
"grad_norm": 0.435625700326904,
"learning_rate": 6.131532296982379e-09,
"loss": 1.0963,
"step": 1030
},
{
"epoch": 2.988405797101449,
"grad_norm": 0.4393642554858853,
"learning_rate": 3.9242095443481345e-09,
"loss": 1.1145,
"step": 1031
},
{
"epoch": 2.991304347826087,
"grad_norm": 0.43072368766038216,
"learning_rate": 2.207380501262346e-09,
"loss": 1.0647,
"step": 1032
},
{
"epoch": 2.9942028985507245,
"grad_norm": 0.45588828392520236,
"learning_rate": 9.810620109129698e-10,
"loss": 1.0432,
"step": 1033
},
{
"epoch": 2.9971014492753625,
"grad_norm": 0.4459564292216382,
"learning_rate": 2.452661042817717e-10,
"loss": 1.1399,
"step": 1034
},
{
"epoch": 3.0,
"grad_norm": 0.4383463614226025,
"learning_rate": 0.0,
"loss": 0.9416,
"step": 1035
},
{
"epoch": 3.0,
"step": 1035,
"total_flos": 238917794807808.0,
"train_loss": 1.2324695289422924,
"train_runtime": 15380.0554,
"train_samples_per_second": 2.148,
"train_steps_per_second": 0.067
}
],
"logging_steps": 1.0,
"max_steps": 1035,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 238917794807808.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}