{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9892735251097027, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005850804485616773, "grad_norm": 2.4042019844055176, "learning_rate": 1.9607843137254904e-07, "loss": 0.4984, "step": 1 }, { "epoch": 0.011701608971233545, "grad_norm": 2.46307110786438, "learning_rate": 3.921568627450981e-07, "loss": 0.5004, "step": 2 }, { "epoch": 0.017552413456850317, "grad_norm": 2.4179155826568604, "learning_rate": 5.882352941176471e-07, "loss": 0.5124, "step": 3 }, { "epoch": 0.02340321794246709, "grad_norm": 2.3512749671936035, "learning_rate": 7.843137254901962e-07, "loss": 0.4679, "step": 4 }, { "epoch": 0.02925402242808386, "grad_norm": 2.3498940467834473, "learning_rate": 9.80392156862745e-07, "loss": 0.513, "step": 5 }, { "epoch": 0.035104826913700635, "grad_norm": 2.35388445854187, "learning_rate": 1.1764705882352942e-06, "loss": 0.506, "step": 6 }, { "epoch": 0.040955631399317405, "grad_norm": 2.130160093307495, "learning_rate": 1.3725490196078434e-06, "loss": 0.481, "step": 7 }, { "epoch": 0.04680643588493418, "grad_norm": 1.7983100414276123, "learning_rate": 1.5686274509803923e-06, "loss": 0.4636, "step": 8 }, { "epoch": 0.05265724037055095, "grad_norm": 1.7637169361114502, "learning_rate": 1.7647058823529414e-06, "loss": 0.4812, "step": 9 }, { "epoch": 0.05850804485616772, "grad_norm": 1.7297167778015137, "learning_rate": 1.96078431372549e-06, "loss": 0.4754, "step": 10 }, { "epoch": 0.0643588493417845, "grad_norm": 1.4557020664215088, "learning_rate": 2.1568627450980393e-06, "loss": 0.4483, "step": 11 }, { "epoch": 0.07020965382740127, "grad_norm": 1.4000929594039917, "learning_rate": 2.3529411764705885e-06, "loss": 0.4637, "step": 12 }, { "epoch": 0.07606045831301804, "grad_norm": 1.2742167711257935, "learning_rate": 2.549019607843137e-06, "loss": 0.4512, "step": 13 }, { "epoch": 0.08191126279863481, "grad_norm": 0.9762641787528992, "learning_rate": 2.7450980392156867e-06, "loss": 0.4386, "step": 14 }, { "epoch": 0.08776206728425158, "grad_norm": 0.8408783078193665, "learning_rate": 2.9411764705882355e-06, "loss": 0.389, "step": 15 }, { "epoch": 0.09361287176986836, "grad_norm": 0.8847479224205017, "learning_rate": 3.1372549019607846e-06, "loss": 0.4037, "step": 16 }, { "epoch": 0.09946367625548513, "grad_norm": 0.8736488223075867, "learning_rate": 3.3333333333333333e-06, "loss": 0.4273, "step": 17 }, { "epoch": 0.1053144807411019, "grad_norm": 0.7419383525848389, "learning_rate": 3.529411764705883e-06, "loss": 0.3898, "step": 18 }, { "epoch": 0.11116528522671867, "grad_norm": 0.7293541431427002, "learning_rate": 3.7254901960784316e-06, "loss": 0.4233, "step": 19 }, { "epoch": 0.11701608971233544, "grad_norm": 1.9727623462677002, "learning_rate": 3.92156862745098e-06, "loss": 0.3701, "step": 20 }, { "epoch": 0.12286689419795221, "grad_norm": 0.915031373500824, "learning_rate": 4.11764705882353e-06, "loss": 0.3727, "step": 21 }, { "epoch": 0.128717698683569, "grad_norm": 0.9051816463470459, "learning_rate": 4.313725490196079e-06, "loss": 0.3683, "step": 22 }, { "epoch": 0.13456850316918575, "grad_norm": 0.7431566715240479, "learning_rate": 4.509803921568628e-06, "loss": 0.3567, "step": 23 }, { "epoch": 0.14041930765480254, "grad_norm": 0.5959689021110535, "learning_rate": 4.705882352941177e-06, "loss": 0.342, "step": 24 }, { "epoch": 0.1462701121404193, "grad_norm": 0.5637457966804504, "learning_rate": 4.901960784313726e-06, "loss": 0.3199, "step": 25 }, { "epoch": 0.15212091662603608, "grad_norm": 0.5946214199066162, "learning_rate": 5.098039215686274e-06, "loss": 0.3299, "step": 26 }, { "epoch": 0.15797172111165286, "grad_norm": 0.4843839108943939, "learning_rate": 5.294117647058824e-06, "loss": 0.3049, "step": 27 }, { "epoch": 0.16382252559726962, "grad_norm": 0.49762770533561707, "learning_rate": 5.4901960784313735e-06, "loss": 0.2883, "step": 28 }, { "epoch": 0.1696733300828864, "grad_norm": 0.5575108528137207, "learning_rate": 5.686274509803922e-06, "loss": 0.302, "step": 29 }, { "epoch": 0.17552413456850316, "grad_norm": 0.5406191945075989, "learning_rate": 5.882352941176471e-06, "loss": 0.2851, "step": 30 }, { "epoch": 0.18137493905411994, "grad_norm": 0.4934878945350647, "learning_rate": 6.07843137254902e-06, "loss": 0.2748, "step": 31 }, { "epoch": 0.18722574353973673, "grad_norm": 0.5784087777137756, "learning_rate": 6.274509803921569e-06, "loss": 0.2993, "step": 32 }, { "epoch": 0.19307654802535348, "grad_norm": 0.5021258592605591, "learning_rate": 6.470588235294119e-06, "loss": 0.276, "step": 33 }, { "epoch": 0.19892735251097027, "grad_norm": 0.44409283995628357, "learning_rate": 6.666666666666667e-06, "loss": 0.2574, "step": 34 }, { "epoch": 0.20477815699658702, "grad_norm": 0.4324759840965271, "learning_rate": 6.862745098039216e-06, "loss": 0.2427, "step": 35 }, { "epoch": 0.2106289614822038, "grad_norm": 0.4134741425514221, "learning_rate": 7.058823529411766e-06, "loss": 0.2693, "step": 36 }, { "epoch": 0.21647976596782056, "grad_norm": 0.3712354600429535, "learning_rate": 7.2549019607843145e-06, "loss": 0.2547, "step": 37 }, { "epoch": 0.22233057045343735, "grad_norm": 0.3564675748348236, "learning_rate": 7.450980392156863e-06, "loss": 0.2189, "step": 38 }, { "epoch": 0.22818137493905413, "grad_norm": 0.3792021572589874, "learning_rate": 7.647058823529411e-06, "loss": 0.2457, "step": 39 }, { "epoch": 0.2340321794246709, "grad_norm": 0.39038991928100586, "learning_rate": 7.84313725490196e-06, "loss": 0.2488, "step": 40 }, { "epoch": 0.23988298391028767, "grad_norm": 0.3869354724884033, "learning_rate": 8.03921568627451e-06, "loss": 0.2234, "step": 41 }, { "epoch": 0.24573378839590443, "grad_norm": 0.44067057967185974, "learning_rate": 8.23529411764706e-06, "loss": 0.25, "step": 42 }, { "epoch": 0.2515845928815212, "grad_norm": 0.34073254466056824, "learning_rate": 8.43137254901961e-06, "loss": 0.2214, "step": 43 }, { "epoch": 0.257435397367138, "grad_norm": 0.3576607406139374, "learning_rate": 8.627450980392157e-06, "loss": 0.222, "step": 44 }, { "epoch": 0.26328620185275475, "grad_norm": 0.3421995937824249, "learning_rate": 8.823529411764707e-06, "loss": 0.2023, "step": 45 }, { "epoch": 0.2691370063383715, "grad_norm": 0.36221471428871155, "learning_rate": 9.019607843137256e-06, "loss": 0.2078, "step": 46 }, { "epoch": 0.2749878108239883, "grad_norm": 0.3617911636829376, "learning_rate": 9.215686274509804e-06, "loss": 0.2067, "step": 47 }, { "epoch": 0.2808386153096051, "grad_norm": 0.37211400270462036, "learning_rate": 9.411764705882354e-06, "loss": 0.1999, "step": 48 }, { "epoch": 0.28668941979522183, "grad_norm": 0.3575108051300049, "learning_rate": 9.607843137254903e-06, "loss": 0.2098, "step": 49 }, { "epoch": 0.2925402242808386, "grad_norm": 0.3736143112182617, "learning_rate": 9.803921568627451e-06, "loss": 0.2121, "step": 50 }, { "epoch": 0.2983910287664554, "grad_norm": 0.3584408164024353, "learning_rate": 1e-05, "loss": 0.1986, "step": 51 }, { "epoch": 0.30424183325207216, "grad_norm": 0.32740485668182373, "learning_rate": 9.999882884955554e-06, "loss": 0.1744, "step": 52 }, { "epoch": 0.3100926377376889, "grad_norm": 0.3438873887062073, "learning_rate": 9.999531545308584e-06, "loss": 0.1955, "step": 53 }, { "epoch": 0.3159434422233057, "grad_norm": 0.3557385206222534, "learning_rate": 9.998945997517957e-06, "loss": 0.1908, "step": 54 }, { "epoch": 0.3217942467089225, "grad_norm": 0.3771747052669525, "learning_rate": 9.998126269014255e-06, "loss": 0.1933, "step": 55 }, { "epoch": 0.32764505119453924, "grad_norm": 0.3500851094722748, "learning_rate": 9.997072398198492e-06, "loss": 0.1892, "step": 56 }, { "epoch": 0.333495855680156, "grad_norm": 0.33903267979621887, "learning_rate": 9.99578443444032e-06, "loss": 0.1747, "step": 57 }, { "epoch": 0.3393466601657728, "grad_norm": 0.3747689425945282, "learning_rate": 9.994262438075713e-06, "loss": 0.2007, "step": 58 }, { "epoch": 0.34519746465138956, "grad_norm": 0.33843183517456055, "learning_rate": 9.992506480404137e-06, "loss": 0.1789, "step": 59 }, { "epoch": 0.3510482691370063, "grad_norm": 0.49277859926223755, "learning_rate": 9.990516643685222e-06, "loss": 0.1799, "step": 60 }, { "epoch": 0.35689907362262313, "grad_norm": 0.32874795794487, "learning_rate": 9.988293021134888e-06, "loss": 0.167, "step": 61 }, { "epoch": 0.3627498781082399, "grad_norm": 0.3928215503692627, "learning_rate": 9.985835716921e-06, "loss": 0.1715, "step": 62 }, { "epoch": 0.36860068259385664, "grad_norm": 0.36052584648132324, "learning_rate": 9.983144846158472e-06, "loss": 0.1785, "step": 63 }, { "epoch": 0.37445148707947346, "grad_norm": 0.33686086535453796, "learning_rate": 9.980220534903889e-06, "loss": 0.1747, "step": 64 }, { "epoch": 0.3803022915650902, "grad_norm": 0.36128610372543335, "learning_rate": 9.977062920149583e-06, "loss": 0.1773, "step": 65 }, { "epoch": 0.38615309605070697, "grad_norm": 0.3438204824924469, "learning_rate": 9.973672149817232e-06, "loss": 0.1595, "step": 66 }, { "epoch": 0.3920039005363237, "grad_norm": 0.3195885419845581, "learning_rate": 9.970048382750925e-06, "loss": 0.1698, "step": 67 }, { "epoch": 0.39785470502194054, "grad_norm": 0.3541395664215088, "learning_rate": 9.966191788709716e-06, "loss": 0.1732, "step": 68 }, { "epoch": 0.4037055095075573, "grad_norm": 0.3797125816345215, "learning_rate": 9.96210254835968e-06, "loss": 0.1855, "step": 69 }, { "epoch": 0.40955631399317405, "grad_norm": 0.4068518280982971, "learning_rate": 9.957780853265441e-06, "loss": 0.1803, "step": 70 }, { "epoch": 0.41540711847879086, "grad_norm": 0.3230303227901459, "learning_rate": 9.953226905881208e-06, "loss": 0.1551, "step": 71 }, { "epoch": 0.4212579229644076, "grad_norm": 0.3526642918586731, "learning_rate": 9.948440919541277e-06, "loss": 0.1659, "step": 72 }, { "epoch": 0.4271087274500244, "grad_norm": 0.36039891839027405, "learning_rate": 9.943423118450051e-06, "loss": 0.1741, "step": 73 }, { "epoch": 0.43295953193564113, "grad_norm": 0.4426978826522827, "learning_rate": 9.938173737671531e-06, "loss": 0.1747, "step": 74 }, { "epoch": 0.43881033642125794, "grad_norm": 0.37956029176712036, "learning_rate": 9.932693023118299e-06, "loss": 0.1766, "step": 75 }, { "epoch": 0.4446611409068747, "grad_norm": 0.3615328371524811, "learning_rate": 9.926981231540007e-06, "loss": 0.1775, "step": 76 }, { "epoch": 0.45051194539249145, "grad_norm": 0.37767109274864197, "learning_rate": 9.921038630511345e-06, "loss": 0.1829, "step": 77 }, { "epoch": 0.45636274987810826, "grad_norm": 0.35032397508621216, "learning_rate": 9.91486549841951e-06, "loss": 0.1714, "step": 78 }, { "epoch": 0.462213554363725, "grad_norm": 0.41910672187805176, "learning_rate": 9.908462124451152e-06, "loss": 0.1716, "step": 79 }, { "epoch": 0.4680643588493418, "grad_norm": 0.3652605414390564, "learning_rate": 9.901828808578846e-06, "loss": 0.1496, "step": 80 }, { "epoch": 0.47391516333495853, "grad_norm": 0.40993812680244446, "learning_rate": 9.894965861547023e-06, "loss": 0.1633, "step": 81 }, { "epoch": 0.47976596782057535, "grad_norm": 0.3730096220970154, "learning_rate": 9.887873604857424e-06, "loss": 0.1661, "step": 82 }, { "epoch": 0.4856167723061921, "grad_norm": 0.36139336228370667, "learning_rate": 9.88055237075403e-06, "loss": 0.1677, "step": 83 }, { "epoch": 0.49146757679180886, "grad_norm": 0.38538867235183716, "learning_rate": 9.873002502207502e-06, "loss": 0.1708, "step": 84 }, { "epoch": 0.49731838127742567, "grad_norm": 0.38366183638572693, "learning_rate": 9.86522435289912e-06, "loss": 0.1742, "step": 85 }, { "epoch": 0.5031691857630424, "grad_norm": 0.3716081380844116, "learning_rate": 9.857218287204204e-06, "loss": 0.1539, "step": 86 }, { "epoch": 0.5090199902486592, "grad_norm": 0.40919578075408936, "learning_rate": 9.848984680175049e-06, "loss": 0.1774, "step": 87 }, { "epoch": 0.514870794734276, "grad_norm": 0.3685464560985565, "learning_rate": 9.840523917523354e-06, "loss": 0.1686, "step": 88 }, { "epoch": 0.5207215992198927, "grad_norm": 0.3441756069660187, "learning_rate": 9.831836395602164e-06, "loss": 0.1497, "step": 89 }, { "epoch": 0.5265724037055095, "grad_norm": 0.3777903914451599, "learning_rate": 9.822922521387277e-06, "loss": 0.1615, "step": 90 }, { "epoch": 0.5324232081911263, "grad_norm": 0.3399229347705841, "learning_rate": 9.813782712458206e-06, "loss": 0.1558, "step": 91 }, { "epoch": 0.538274012676743, "grad_norm": 0.41103455424308777, "learning_rate": 9.804417396978605e-06, "loss": 0.1591, "step": 92 }, { "epoch": 0.5441248171623598, "grad_norm": 0.4678979218006134, "learning_rate": 9.794827013676206e-06, "loss": 0.1793, "step": 93 }, { "epoch": 0.5499756216479766, "grad_norm": 0.3981055021286011, "learning_rate": 9.78501201182228e-06, "loss": 0.1731, "step": 94 }, { "epoch": 0.5558264261335933, "grad_norm": 0.3952052593231201, "learning_rate": 9.774972851210572e-06, "loss": 0.1687, "step": 95 }, { "epoch": 0.5616772306192102, "grad_norm": 0.5214592218399048, "learning_rate": 9.764710002135784e-06, "loss": 0.1497, "step": 96 }, { "epoch": 0.567528035104827, "grad_norm": 0.3616654872894287, "learning_rate": 9.754223945371524e-06, "loss": 0.1617, "step": 97 }, { "epoch": 0.5733788395904437, "grad_norm": 0.37656962871551514, "learning_rate": 9.743515172147793e-06, "loss": 0.1533, "step": 98 }, { "epoch": 0.5792296440760605, "grad_norm": 0.4490816593170166, "learning_rate": 9.732584184127973e-06, "loss": 0.1629, "step": 99 }, { "epoch": 0.5850804485616772, "grad_norm": 0.3636768162250519, "learning_rate": 9.721431493385322e-06, "loss": 0.1493, "step": 100 }, { "epoch": 0.590931253047294, "grad_norm": 0.3367384076118469, "learning_rate": 9.710057622378992e-06, "loss": 0.1582, "step": 101 }, { "epoch": 0.5967820575329108, "grad_norm": 0.5017166137695312, "learning_rate": 9.698463103929542e-06, "loss": 0.1699, "step": 102 }, { "epoch": 0.6026328620185275, "grad_norm": 0.3878183960914612, "learning_rate": 9.686648481193994e-06, "loss": 0.1646, "step": 103 }, { "epoch": 0.6084836665041443, "grad_norm": 0.35837483406066895, "learning_rate": 9.674614307640368e-06, "loss": 0.1637, "step": 104 }, { "epoch": 0.6143344709897611, "grad_norm": 0.41536927223205566, "learning_rate": 9.66236114702178e-06, "loss": 0.1634, "step": 105 }, { "epoch": 0.6201852754753778, "grad_norm": 0.3885203003883362, "learning_rate": 9.649889573350006e-06, "loss": 0.1573, "step": 106 }, { "epoch": 0.6260360799609946, "grad_norm": 0.3583751618862152, "learning_rate": 9.637200170868607e-06, "loss": 0.1372, "step": 107 }, { "epoch": 0.6318868844466115, "grad_norm": 0.4037657678127289, "learning_rate": 9.62429353402556e-06, "loss": 0.1538, "step": 108 }, { "epoch": 0.6377376889322282, "grad_norm": 0.35686999559402466, "learning_rate": 9.611170267445401e-06, "loss": 0.1572, "step": 109 }, { "epoch": 0.643588493417845, "grad_norm": 0.37882205843925476, "learning_rate": 9.597830985900913e-06, "loss": 0.1516, "step": 110 }, { "epoch": 0.6494392979034618, "grad_norm": 0.4156239330768585, "learning_rate": 9.584276314284316e-06, "loss": 0.1633, "step": 111 }, { "epoch": 0.6552901023890785, "grad_norm": 0.41733554005622864, "learning_rate": 9.570506887577994e-06, "loss": 0.1555, "step": 112 }, { "epoch": 0.6611409068746953, "grad_norm": 0.350704163312912, "learning_rate": 9.556523350824759e-06, "loss": 0.1417, "step": 113 }, { "epoch": 0.666991711360312, "grad_norm": 0.4148392975330353, "learning_rate": 9.542326359097619e-06, "loss": 0.158, "step": 114 }, { "epoch": 0.6728425158459288, "grad_norm": 0.40368494391441345, "learning_rate": 9.527916577469104e-06, "loss": 0.1599, "step": 115 }, { "epoch": 0.6786933203315456, "grad_norm": 0.41338789463043213, "learning_rate": 9.5132946809801e-06, "loss": 0.1638, "step": 116 }, { "epoch": 0.6845441248171623, "grad_norm": 0.3733426332473755, "learning_rate": 9.498461354608228e-06, "loss": 0.1471, "step": 117 }, { "epoch": 0.6903949293027791, "grad_norm": 0.3726441264152527, "learning_rate": 9.483417293235759e-06, "loss": 0.1683, "step": 118 }, { "epoch": 0.6962457337883959, "grad_norm": 0.375068724155426, "learning_rate": 9.468163201617063e-06, "loss": 0.1523, "step": 119 }, { "epoch": 0.7020965382740126, "grad_norm": 0.3736395537853241, "learning_rate": 9.452699794345583e-06, "loss": 0.15, "step": 120 }, { "epoch": 0.7079473427596294, "grad_norm": 0.37468886375427246, "learning_rate": 9.437027795820373e-06, "loss": 0.1598, "step": 121 }, { "epoch": 0.7137981472452463, "grad_norm": 0.36749985814094543, "learning_rate": 9.421147940212152e-06, "loss": 0.1605, "step": 122 }, { "epoch": 0.719648951730863, "grad_norm": 0.3597930669784546, "learning_rate": 9.405060971428924e-06, "loss": 0.149, "step": 123 }, { "epoch": 0.7254997562164798, "grad_norm": 0.39379099011421204, "learning_rate": 9.388767643081109e-06, "loss": 0.159, "step": 124 }, { "epoch": 0.7313505607020966, "grad_norm": 0.37788447737693787, "learning_rate": 9.372268718446259e-06, "loss": 0.1691, "step": 125 }, { "epoch": 0.7372013651877133, "grad_norm": 0.3714412748813629, "learning_rate": 9.355564970433288e-06, "loss": 0.1571, "step": 126 }, { "epoch": 0.7430521696733301, "grad_norm": 0.37507206201553345, "learning_rate": 9.338657181546277e-06, "loss": 0.1567, "step": 127 }, { "epoch": 0.7489029741589469, "grad_norm": 0.34279665350914, "learning_rate": 9.321546143847802e-06, "loss": 0.1463, "step": 128 }, { "epoch": 0.7547537786445636, "grad_norm": 0.4015829265117645, "learning_rate": 9.30423265892184e-06, "loss": 0.1604, "step": 129 }, { "epoch": 0.7606045831301804, "grad_norm": 0.36534038186073303, "learning_rate": 9.286717537836211e-06, "loss": 0.1595, "step": 130 }, { "epoch": 0.7664553876157971, "grad_norm": 0.35713326930999756, "learning_rate": 9.269001601104593e-06, "loss": 0.1641, "step": 131 }, { "epoch": 0.7723061921014139, "grad_norm": 0.393284410238266, "learning_rate": 9.251085678648072e-06, "loss": 0.1625, "step": 132 }, { "epoch": 0.7781569965870307, "grad_norm": 0.3619539141654968, "learning_rate": 9.232970609756267e-06, "loss": 0.1481, "step": 133 }, { "epoch": 0.7840078010726474, "grad_norm": 0.35219714045524597, "learning_rate": 9.214657243048021e-06, "loss": 0.1494, "step": 134 }, { "epoch": 0.7898586055582643, "grad_norm": 0.38188058137893677, "learning_rate": 9.196146436431635e-06, "loss": 0.161, "step": 135 }, { "epoch": 0.7957094100438811, "grad_norm": 0.3557851314544678, "learning_rate": 9.177439057064684e-06, "loss": 0.1571, "step": 136 }, { "epoch": 0.8015602145294978, "grad_norm": 0.3425387442111969, "learning_rate": 9.158535981313395e-06, "loss": 0.1429, "step": 137 }, { "epoch": 0.8074110190151146, "grad_norm": 0.35848724842071533, "learning_rate": 9.13943809471159e-06, "loss": 0.1592, "step": 138 }, { "epoch": 0.8132618235007314, "grad_norm": 0.3686351776123047, "learning_rate": 9.120146291919206e-06, "loss": 0.149, "step": 139 }, { "epoch": 0.8191126279863481, "grad_norm": 0.3813227713108063, "learning_rate": 9.100661476680379e-06, "loss": 0.1503, "step": 140 }, { "epoch": 0.8249634324719649, "grad_norm": 0.3804561495780945, "learning_rate": 9.08098456178111e-06, "loss": 0.1547, "step": 141 }, { "epoch": 0.8308142369575817, "grad_norm": 0.3664409816265106, "learning_rate": 9.061116469006504e-06, "loss": 0.1555, "step": 142 }, { "epoch": 0.8366650414431984, "grad_norm": 0.3414985239505768, "learning_rate": 9.041058129097586e-06, "loss": 0.1376, "step": 143 }, { "epoch": 0.8425158459288152, "grad_norm": 0.3685246407985687, "learning_rate": 9.020810481707709e-06, "loss": 0.148, "step": 144 }, { "epoch": 0.8483666504144319, "grad_norm": 0.3861325681209564, "learning_rate": 9.00037447535852e-06, "loss": 0.1649, "step": 145 }, { "epoch": 0.8542174549000487, "grad_norm": 0.38169729709625244, "learning_rate": 8.979751067395534e-06, "loss": 0.1475, "step": 146 }, { "epoch": 0.8600682593856656, "grad_norm": 0.39888978004455566, "learning_rate": 8.958941223943292e-06, "loss": 0.1673, "step": 147 }, { "epoch": 0.8659190638712823, "grad_norm": 0.38037005066871643, "learning_rate": 8.937945919860086e-06, "loss": 0.1577, "step": 148 }, { "epoch": 0.8717698683568991, "grad_norm": 0.37339454889297485, "learning_rate": 8.916766138692303e-06, "loss": 0.1581, "step": 149 }, { "epoch": 0.8776206728425159, "grad_norm": 0.39251771569252014, "learning_rate": 8.895402872628352e-06, "loss": 0.1482, "step": 150 }, { "epoch": 0.8834714773281326, "grad_norm": 0.33527621626853943, "learning_rate": 8.873857122452174e-06, "loss": 0.1361, "step": 151 }, { "epoch": 0.8893222818137494, "grad_norm": 0.380577027797699, "learning_rate": 8.852129897496367e-06, "loss": 0.1495, "step": 152 }, { "epoch": 0.8951730862993662, "grad_norm": 0.3689667284488678, "learning_rate": 8.83022221559489e-06, "loss": 0.1605, "step": 153 }, { "epoch": 0.9010238907849829, "grad_norm": 0.39303916692733765, "learning_rate": 8.808135103035407e-06, "loss": 0.1472, "step": 154 }, { "epoch": 0.9068746952705997, "grad_norm": 0.37502506375312805, "learning_rate": 8.785869594511182e-06, "loss": 0.1621, "step": 155 }, { "epoch": 0.9127254997562165, "grad_norm": 0.3729185461997986, "learning_rate": 8.763426733072624e-06, "loss": 0.1538, "step": 156 }, { "epoch": 0.9185763042418332, "grad_norm": 0.38975661993026733, "learning_rate": 8.740807570078419e-06, "loss": 0.165, "step": 157 }, { "epoch": 0.92442710872745, "grad_norm": 0.3393966555595398, "learning_rate": 8.718013165146275e-06, "loss": 0.1435, "step": 158 }, { "epoch": 0.9302779132130667, "grad_norm": 0.3675788938999176, "learning_rate": 8.695044586103297e-06, "loss": 0.1488, "step": 159 }, { "epoch": 0.9361287176986836, "grad_norm": 0.34999918937683105, "learning_rate": 8.671902908935942e-06, "loss": 0.139, "step": 160 }, { "epoch": 0.9419795221843004, "grad_norm": 0.3778494596481323, "learning_rate": 8.648589217739635e-06, "loss": 0.149, "step": 161 }, { "epoch": 0.9478303266699171, "grad_norm": 0.34992629289627075, "learning_rate": 8.625104604667965e-06, "loss": 0.1582, "step": 162 }, { "epoch": 0.9536811311555339, "grad_norm": 0.344200074672699, "learning_rate": 8.601450169881533e-06, "loss": 0.1531, "step": 163 }, { "epoch": 0.9595319356411507, "grad_norm": 0.3828461170196533, "learning_rate": 8.577627021496413e-06, "loss": 0.1496, "step": 164 }, { "epoch": 0.9653827401267674, "grad_norm": 0.38192272186279297, "learning_rate": 8.553636275532236e-06, "loss": 0.1595, "step": 165 }, { "epoch": 0.9712335446123842, "grad_norm": 0.346049964427948, "learning_rate": 8.529479055859918e-06, "loss": 0.1394, "step": 166 }, { "epoch": 0.977084349098001, "grad_norm": 0.38012537360191345, "learning_rate": 8.505156494148997e-06, "loss": 0.1624, "step": 167 }, { "epoch": 0.9829351535836177, "grad_norm": 0.3455771505832672, "learning_rate": 8.480669729814635e-06, "loss": 0.1536, "step": 168 }, { "epoch": 0.9887859580692345, "grad_norm": 0.3532484769821167, "learning_rate": 8.456019909964224e-06, "loss": 0.1457, "step": 169 }, { "epoch": 0.9946367625548513, "grad_norm": 0.3691612184047699, "learning_rate": 8.43120818934367e-06, "loss": 0.1502, "step": 170 }, { "epoch": 1.000487567040468, "grad_norm": 0.39640477299690247, "learning_rate": 8.40623573028327e-06, "loss": 0.1672, "step": 171 }, { "epoch": 1.0063383715260847, "grad_norm": 0.3387565314769745, "learning_rate": 8.381103702643295e-06, "loss": 0.0924, "step": 172 }, { "epoch": 1.0121891760117017, "grad_norm": 0.3130762577056885, "learning_rate": 8.35581328375915e-06, "loss": 0.0873, "step": 173 }, { "epoch": 1.0180399804973184, "grad_norm": 0.3515136241912842, "learning_rate": 8.330365658386252e-06, "loss": 0.0971, "step": 174 }, { "epoch": 1.023890784982935, "grad_norm": 0.32018211483955383, "learning_rate": 8.30476201864451e-06, "loss": 0.0848, "step": 175 }, { "epoch": 1.029741589468552, "grad_norm": 0.35570186376571655, "learning_rate": 8.27900356396249e-06, "loss": 0.077, "step": 176 }, { "epoch": 1.0355923939541687, "grad_norm": 0.3432004749774933, "learning_rate": 8.25309150102121e-06, "loss": 0.0922, "step": 177 }, { "epoch": 1.0414431984397854, "grad_norm": 0.3650023341178894, "learning_rate": 8.227027043697642e-06, "loss": 0.1081, "step": 178 }, { "epoch": 1.0472940029254023, "grad_norm": 0.3721574544906616, "learning_rate": 8.200811413007808e-06, "loss": 0.0952, "step": 179 }, { "epoch": 1.053144807411019, "grad_norm": 0.3511905074119568, "learning_rate": 8.174445837049614e-06, "loss": 0.093, "step": 180 }, { "epoch": 1.0589956118966357, "grad_norm": 0.3371464014053345, "learning_rate": 8.147931550945301e-06, "loss": 0.0873, "step": 181 }, { "epoch": 1.0648464163822526, "grad_norm": 0.3639083802700043, "learning_rate": 8.121269796783585e-06, "loss": 0.0906, "step": 182 }, { "epoch": 1.0706972208678693, "grad_norm": 0.3532220125198364, "learning_rate": 8.094461823561473e-06, "loss": 0.0851, "step": 183 }, { "epoch": 1.076548025353486, "grad_norm": 0.3342878520488739, "learning_rate": 8.06750888712576e-06, "loss": 0.0953, "step": 184 }, { "epoch": 1.082398829839103, "grad_norm": 0.3148074448108673, "learning_rate": 8.040412250114184e-06, "loss": 0.0837, "step": 185 }, { "epoch": 1.0882496343247197, "grad_norm": 0.32519224286079407, "learning_rate": 8.013173181896283e-06, "loss": 0.0798, "step": 186 }, { "epoch": 1.0941004388103364, "grad_norm": 0.3312150239944458, "learning_rate": 7.985792958513932e-06, "loss": 0.0933, "step": 187 }, { "epoch": 1.0999512432959533, "grad_norm": 0.32089629769325256, "learning_rate": 7.958272862621562e-06, "loss": 0.0922, "step": 188 }, { "epoch": 1.10580204778157, "grad_norm": 0.35258930921554565, "learning_rate": 7.930614183426074e-06, "loss": 0.0909, "step": 189 }, { "epoch": 1.1116528522671867, "grad_norm": 0.33893248438835144, "learning_rate": 7.902818216626446e-06, "loss": 0.0929, "step": 190 }, { "epoch": 1.1175036567528036, "grad_norm": 0.31684961915016174, "learning_rate": 7.874886264353035e-06, "loss": 0.0886, "step": 191 }, { "epoch": 1.1233544612384203, "grad_norm": 0.3478744328022003, "learning_rate": 7.846819635106569e-06, "loss": 0.0957, "step": 192 }, { "epoch": 1.129205265724037, "grad_norm": 0.3251475393772125, "learning_rate": 7.818619643696863e-06, "loss": 0.0922, "step": 193 }, { "epoch": 1.1350560702096537, "grad_norm": 0.29696688055992126, "learning_rate": 7.790287611181217e-06, "loss": 0.0914, "step": 194 }, { "epoch": 1.1409068746952706, "grad_norm": 0.3044435977935791, "learning_rate": 7.76182486480253e-06, "loss": 0.0892, "step": 195 }, { "epoch": 1.1467576791808873, "grad_norm": 0.3044622242450714, "learning_rate": 7.733232737927123e-06, "loss": 0.0857, "step": 196 }, { "epoch": 1.152608483666504, "grad_norm": 0.3341536819934845, "learning_rate": 7.70451256998228e-06, "loss": 0.0906, "step": 197 }, { "epoch": 1.158459288152121, "grad_norm": 0.2735297679901123, "learning_rate": 7.675665706393502e-06, "loss": 0.0732, "step": 198 }, { "epoch": 1.1643100926377377, "grad_norm": 0.39707475900650024, "learning_rate": 7.646693498521472e-06, "loss": 0.0917, "step": 199 }, { "epoch": 1.1701608971233544, "grad_norm": 0.31679078936576843, "learning_rate": 7.617597303598754e-06, "loss": 0.0881, "step": 200 }, { "epoch": 1.1760117016089713, "grad_norm": 0.360331267118454, "learning_rate": 7.588378484666214e-06, "loss": 0.0954, "step": 201 }, { "epoch": 1.181862506094588, "grad_norm": 0.38201650977134705, "learning_rate": 7.559038410509161e-06, "loss": 0.0904, "step": 202 }, { "epoch": 1.1877133105802047, "grad_norm": 0.32335299253463745, "learning_rate": 7.529578455593232e-06, "loss": 0.0945, "step": 203 }, { "epoch": 1.1935641150658216, "grad_norm": 0.4359491169452667, "learning_rate": 7.500000000000001e-06, "loss": 0.096, "step": 204 }, { "epoch": 1.1994149195514383, "grad_norm": 0.3149729073047638, "learning_rate": 7.47030442936232e-06, "loss": 0.09, "step": 205 }, { "epoch": 1.205265724037055, "grad_norm": 0.33973321318626404, "learning_rate": 7.440493134799425e-06, "loss": 0.0913, "step": 206 }, { "epoch": 1.211116528522672, "grad_norm": 0.30180642008781433, "learning_rate": 7.4105675128517456e-06, "loss": 0.0803, "step": 207 }, { "epoch": 1.2169673330082886, "grad_norm": 0.345156192779541, "learning_rate": 7.380528965415501e-06, "loss": 0.1082, "step": 208 }, { "epoch": 1.2228181374939053, "grad_norm": 0.35538923740386963, "learning_rate": 7.35037889967702e-06, "loss": 0.0941, "step": 209 }, { "epoch": 1.2286689419795223, "grad_norm": 0.3383888006210327, "learning_rate": 7.320118728046818e-06, "loss": 0.0977, "step": 210 }, { "epoch": 1.234519746465139, "grad_norm": 0.32048848271369934, "learning_rate": 7.289749868093432e-06, "loss": 0.0997, "step": 211 }, { "epoch": 1.2403705509507557, "grad_norm": 0.30647850036621094, "learning_rate": 7.259273742477017e-06, "loss": 0.0735, "step": 212 }, { "epoch": 1.2462213554363726, "grad_norm": 0.3137127757072449, "learning_rate": 7.2286917788826926e-06, "loss": 0.086, "step": 213 }, { "epoch": 1.2520721599219893, "grad_norm": 0.3300592005252838, "learning_rate": 7.19800540995367e-06, "loss": 0.1001, "step": 214 }, { "epoch": 1.257922964407606, "grad_norm": 0.36561137437820435, "learning_rate": 7.167216073224136e-06, "loss": 0.0939, "step": 215 }, { "epoch": 1.263773768893223, "grad_norm": 0.32352930307388306, "learning_rate": 7.136325211051905e-06, "loss": 0.0842, "step": 216 }, { "epoch": 1.2696245733788396, "grad_norm": 0.3289717733860016, "learning_rate": 7.1053342705508564e-06, "loss": 0.0935, "step": 217 }, { "epoch": 1.2754753778644563, "grad_norm": 0.28205370903015137, "learning_rate": 7.074244703523137e-06, "loss": 0.0778, "step": 218 }, { "epoch": 1.2813261823500732, "grad_norm": 0.38977792859077454, "learning_rate": 7.043057966391158e-06, "loss": 0.0956, "step": 219 }, { "epoch": 1.28717698683569, "grad_norm": 0.3073525130748749, "learning_rate": 7.011775520129363e-06, "loss": 0.0863, "step": 220 }, { "epoch": 1.2930277913213066, "grad_norm": 0.3111942410469055, "learning_rate": 6.980398830195785e-06, "loss": 0.0834, "step": 221 }, { "epoch": 1.2988785958069236, "grad_norm": 0.3141816556453705, "learning_rate": 6.948929366463397e-06, "loss": 0.0843, "step": 222 }, { "epoch": 1.3047294002925403, "grad_norm": 0.3614446222782135, "learning_rate": 6.9173686031512595e-06, "loss": 0.1021, "step": 223 }, { "epoch": 1.310580204778157, "grad_norm": 0.3350818157196045, "learning_rate": 6.885718018755448e-06, "loss": 0.0871, "step": 224 }, { "epoch": 1.3164310092637739, "grad_norm": 0.32823216915130615, "learning_rate": 6.8539790959798045e-06, "loss": 0.0916, "step": 225 }, { "epoch": 1.3222818137493906, "grad_norm": 0.382249116897583, "learning_rate": 6.822153321666469e-06, "loss": 0.0904, "step": 226 }, { "epoch": 1.3281326182350073, "grad_norm": 0.32061290740966797, "learning_rate": 6.790242186726231e-06, "loss": 0.0966, "step": 227 }, { "epoch": 1.3339834227206242, "grad_norm": 0.32373279333114624, "learning_rate": 6.758247186068684e-06, "loss": 0.0891, "step": 228 }, { "epoch": 1.339834227206241, "grad_norm": 0.32146140933036804, "learning_rate": 6.7261698185322e-06, "loss": 0.09, "step": 229 }, { "epoch": 1.3456850316918576, "grad_norm": 0.33953243494033813, "learning_rate": 6.6940115868137065e-06, "loss": 0.0877, "step": 230 }, { "epoch": 1.3515358361774745, "grad_norm": 0.3054323196411133, "learning_rate": 6.6617739973982985e-06, "loss": 0.088, "step": 231 }, { "epoch": 1.3573866406630912, "grad_norm": 0.34463798999786377, "learning_rate": 6.629458560488664e-06, "loss": 0.1081, "step": 232 }, { "epoch": 1.363237445148708, "grad_norm": 0.3094691336154938, "learning_rate": 6.597066789934336e-06, "loss": 0.089, "step": 233 }, { "epoch": 1.3690882496343248, "grad_norm": 0.3243754506111145, "learning_rate": 6.5646002031607726e-06, "loss": 0.1017, "step": 234 }, { "epoch": 1.3749390541199416, "grad_norm": 0.3523649573326111, "learning_rate": 6.5320603210982745e-06, "loss": 0.0976, "step": 235 }, { "epoch": 1.3807898586055583, "grad_norm": 0.32277393341064453, "learning_rate": 6.499448668110735e-06, "loss": 0.0987, "step": 236 }, { "epoch": 1.3866406630911752, "grad_norm": 0.36676648259162903, "learning_rate": 6.466766771924231e-06, "loss": 0.0962, "step": 237 }, { "epoch": 1.3924914675767919, "grad_norm": 0.3382190465927124, "learning_rate": 6.434016163555452e-06, "loss": 0.0997, "step": 238 }, { "epoch": 1.3983422720624086, "grad_norm": 0.3056662082672119, "learning_rate": 6.401198377239979e-06, "loss": 0.0832, "step": 239 }, { "epoch": 1.4041930765480253, "grad_norm": 0.34704071283340454, "learning_rate": 6.368314950360416e-06, "loss": 0.0989, "step": 240 }, { "epoch": 1.4100438810336422, "grad_norm": 0.3333982527256012, "learning_rate": 6.3353674233743585e-06, "loss": 0.1005, "step": 241 }, { "epoch": 1.415894685519259, "grad_norm": 0.3286687135696411, "learning_rate": 6.302357339742245e-06, "loss": 0.089, "step": 242 }, { "epoch": 1.4217454900048756, "grad_norm": 0.33592113852500916, "learning_rate": 6.269286245855039e-06, "loss": 0.0927, "step": 243 }, { "epoch": 1.4275962944904925, "grad_norm": 0.3280174434185028, "learning_rate": 6.236155690961795e-06, "loss": 0.085, "step": 244 }, { "epoch": 1.4334470989761092, "grad_norm": 0.31295254826545715, "learning_rate": 6.202967227097073e-06, "loss": 0.0944, "step": 245 }, { "epoch": 1.439297903461726, "grad_norm": 0.3453296422958374, "learning_rate": 6.169722409008244e-06, "loss": 0.0878, "step": 246 }, { "epoch": 1.4451487079473426, "grad_norm": 0.3456035852432251, "learning_rate": 6.136422794082645e-06, "loss": 0.0943, "step": 247 }, { "epoch": 1.4509995124329595, "grad_norm": 0.31323128938674927, "learning_rate": 6.10306994227463e-06, "loss": 0.0759, "step": 248 }, { "epoch": 1.4568503169185762, "grad_norm": 0.33041155338287354, "learning_rate": 6.0696654160324875e-06, "loss": 0.098, "step": 249 }, { "epoch": 1.462701121404193, "grad_norm": 0.3976047933101654, "learning_rate": 6.0362107802252486e-06, "loss": 0.1005, "step": 250 }, { "epoch": 1.4685519258898099, "grad_norm": 0.3161430358886719, "learning_rate": 6.002707602069377e-06, "loss": 0.0909, "step": 251 }, { "epoch": 1.4744027303754266, "grad_norm": 0.36212068796157837, "learning_rate": 5.9691574510553505e-06, "loss": 0.1077, "step": 252 }, { "epoch": 1.4802535348610433, "grad_norm": 0.3128092586994171, "learning_rate": 5.935561898874142e-06, "loss": 0.0965, "step": 253 }, { "epoch": 1.4861043393466602, "grad_norm": 0.29555660486221313, "learning_rate": 5.901922519343586e-06, "loss": 0.0938, "step": 254 }, { "epoch": 1.491955143832277, "grad_norm": 0.3516538143157959, "learning_rate": 5.8682408883346535e-06, "loss": 0.0948, "step": 255 }, { "epoch": 1.4978059483178936, "grad_norm": 0.33568522334098816, "learning_rate": 5.834518583697628e-06, "loss": 0.1097, "step": 256 }, { "epoch": 1.5036567528035105, "grad_norm": 0.3301846385002136, "learning_rate": 5.800757185188195e-06, "loss": 0.096, "step": 257 }, { "epoch": 1.5095075572891272, "grad_norm": 0.36975666880607605, "learning_rate": 5.766958274393428e-06, "loss": 0.1021, "step": 258 }, { "epoch": 1.515358361774744, "grad_norm": 0.36187541484832764, "learning_rate": 5.733123434657704e-06, "loss": 0.0978, "step": 259 }, { "epoch": 1.5212091662603608, "grad_norm": 0.302735298871994, "learning_rate": 5.699254251008524e-06, "loss": 0.0863, "step": 260 }, { "epoch": 1.5270599707459775, "grad_norm": 0.3669283986091614, "learning_rate": 5.66535231008227e-06, "loss": 0.0993, "step": 261 }, { "epoch": 1.5329107752315942, "grad_norm": 0.30971550941467285, "learning_rate": 5.631419200049867e-06, "loss": 0.094, "step": 262 }, { "epoch": 1.5387615797172112, "grad_norm": 0.31271669268608093, "learning_rate": 5.597456510542395e-06, "loss": 0.0784, "step": 263 }, { "epoch": 1.5446123842028279, "grad_norm": 0.3437775671482086, "learning_rate": 5.5634658325766066e-06, "loss": 0.094, "step": 264 }, { "epoch": 1.5504631886884446, "grad_norm": 0.30059608817100525, "learning_rate": 5.529448758480408e-06, "loss": 0.0837, "step": 265 }, { "epoch": 1.5563139931740615, "grad_norm": 0.3427652418613434, "learning_rate": 5.495406881818256e-06, "loss": 0.0974, "step": 266 }, { "epoch": 1.5621647976596782, "grad_norm": 0.325862318277359, "learning_rate": 5.46134179731651e-06, "loss": 0.0941, "step": 267 }, { "epoch": 1.568015602145295, "grad_norm": 0.2897391617298126, "learning_rate": 5.427255100788726e-06, "loss": 0.0892, "step": 268 }, { "epoch": 1.5738664066309118, "grad_norm": 0.33088555932044983, "learning_rate": 5.393148389060893e-06, "loss": 0.09, "step": 269 }, { "epoch": 1.5797172111165285, "grad_norm": 0.3650970757007599, "learning_rate": 5.359023259896638e-06, "loss": 0.0931, "step": 270 }, { "epoch": 1.5855680156021452, "grad_norm": 0.34107041358947754, "learning_rate": 5.3248813119223665e-06, "loss": 0.0937, "step": 271 }, { "epoch": 1.5914188200877621, "grad_norm": 0.3320484459400177, "learning_rate": 5.290724144552379e-06, "loss": 0.1058, "step": 272 }, { "epoch": 1.5972696245733788, "grad_norm": 0.31170666217803955, "learning_rate": 5.2565533579139484e-06, "loss": 0.0923, "step": 273 }, { "epoch": 1.6031204290589955, "grad_norm": 0.3246054947376251, "learning_rate": 5.222370552772353e-06, "loss": 0.0999, "step": 274 }, { "epoch": 1.6089712335446125, "grad_norm": 0.35101330280303955, "learning_rate": 5.188177330455886e-06, "loss": 0.1082, "step": 275 }, { "epoch": 1.6148220380302292, "grad_norm": 0.29805803298950195, "learning_rate": 5.153975292780852e-06, "loss": 0.0867, "step": 276 }, { "epoch": 1.6206728425158459, "grad_norm": 0.32440170645713806, "learning_rate": 5.119766041976516e-06, "loss": 0.0936, "step": 277 }, { "epoch": 1.6265236470014628, "grad_norm": 0.33313047885894775, "learning_rate": 5.085551180610046e-06, "loss": 0.0929, "step": 278 }, { "epoch": 1.6323744514870795, "grad_norm": 0.3095255494117737, "learning_rate": 5.05133231151145e-06, "loss": 0.0847, "step": 279 }, { "epoch": 1.6382252559726962, "grad_norm": 0.3372449278831482, "learning_rate": 5.017111037698477e-06, "loss": 0.104, "step": 280 }, { "epoch": 1.6494392979034618, "grad_norm": 0.32717257738113403, "learning_rate": 4.9828889623015265e-06, "loss": 0.0863, "step": 281 }, { "epoch": 1.6552901023890785, "grad_norm": 0.3508462607860565, "learning_rate": 4.948667688488552e-06, "loss": 0.0937, "step": 282 }, { "epoch": 1.6611409068746954, "grad_norm": 0.3077023923397064, "learning_rate": 4.9144488193899546e-06, "loss": 0.0933, "step": 283 }, { "epoch": 1.6669917113603119, "grad_norm": 0.3831559419631958, "learning_rate": 4.880233958023486e-06, "loss": 0.1023, "step": 284 }, { "epoch": 1.6728425158459288, "grad_norm": 0.31356075406074524, "learning_rate": 4.846024707219149e-06, "loss": 0.088, "step": 285 }, { "epoch": 1.6786933203315457, "grad_norm": 0.35236048698425293, "learning_rate": 4.811822669544115e-06, "loss": 0.0942, "step": 286 }, { "epoch": 1.6845441248171622, "grad_norm": 0.32862260937690735, "learning_rate": 4.777629447227649e-06, "loss": 0.1004, "step": 287 }, { "epoch": 1.6903949293027791, "grad_norm": 0.29887956380844116, "learning_rate": 4.7434466420860515e-06, "loss": 0.0838, "step": 288 }, { "epoch": 1.696245733788396, "grad_norm": 0.38082054257392883, "learning_rate": 4.7092758554476215e-06, "loss": 0.0955, "step": 289 }, { "epoch": 1.7020965382740125, "grad_norm": 0.3033042848110199, "learning_rate": 4.675118688077634e-06, "loss": 0.0907, "step": 290 }, { "epoch": 1.7079473427596294, "grad_norm": 0.3312883973121643, "learning_rate": 4.640976740103363e-06, "loss": 0.0922, "step": 291 }, { "epoch": 1.7137981472452464, "grad_norm": 0.3242630064487457, "learning_rate": 4.606851610939108e-06, "loss": 0.0918, "step": 292 }, { "epoch": 1.7196489517308629, "grad_norm": 0.3354322612285614, "learning_rate": 4.572744899211275e-06, "loss": 0.0988, "step": 293 }, { "epoch": 1.7254997562164798, "grad_norm": 0.3046918213367462, "learning_rate": 4.53865820268349e-06, "loss": 0.0829, "step": 294 }, { "epoch": 1.7313505607020967, "grad_norm": 0.3330130875110626, "learning_rate": 4.504593118181745e-06, "loss": 0.1022, "step": 295 }, { "epoch": 1.7372013651877132, "grad_norm": 0.31168803572654724, "learning_rate": 4.470551241519594e-06, "loss": 0.0841, "step": 296 }, { "epoch": 1.74305216967333, "grad_norm": 0.3428613841533661, "learning_rate": 4.436534167423395e-06, "loss": 0.0921, "step": 297 }, { "epoch": 1.748902974158947, "grad_norm": 0.33911219239234924, "learning_rate": 4.402543489457607e-06, "loss": 0.0969, "step": 298 }, { "epoch": 1.7547537786445635, "grad_norm": 0.3178160488605499, "learning_rate": 4.368580799950133e-06, "loss": 0.0884, "step": 299 }, { "epoch": 1.7606045831301804, "grad_norm": 0.3277307152748108, "learning_rate": 4.334647689917734e-06, "loss": 0.0974, "step": 300 }, { "epoch": 1.7664553876157971, "grad_norm": 0.3535825312137604, "learning_rate": 4.300745748991478e-06, "loss": 0.0896, "step": 301 }, { "epoch": 1.7723061921014138, "grad_norm": 0.32610517740249634, "learning_rate": 4.266876565342298e-06, "loss": 0.0994, "step": 302 }, { "epoch": 1.7781569965870307, "grad_norm": 0.3436540961265564, "learning_rate": 4.233041725606573e-06, "loss": 0.0968, "step": 303 }, { "epoch": 1.7840078010726474, "grad_norm": 0.35339972376823425, "learning_rate": 4.199242814811807e-06, "loss": 0.0964, "step": 304 }, { "epoch": 1.7898586055582641, "grad_norm": 0.3242701292037964, "learning_rate": 4.1654814163023735e-06, "loss": 0.0817, "step": 305 }, { "epoch": 1.795709410043881, "grad_norm": 0.38222551345825195, "learning_rate": 4.131759111665349e-06, "loss": 0.099, "step": 306 }, { "epoch": 1.8015602145294978, "grad_norm": 0.32425937056541443, "learning_rate": 4.098077480656415e-06, "loss": 0.0873, "step": 307 }, { "epoch": 1.8074110190151145, "grad_norm": 0.36380070447921753, "learning_rate": 4.064438101125859e-06, "loss": 0.0973, "step": 308 }, { "epoch": 1.8132618235007314, "grad_norm": 0.34305286407470703, "learning_rate": 4.03084254894465e-06, "loss": 0.0922, "step": 309 }, { "epoch": 1.819112627986348, "grad_norm": 0.353582501411438, "learning_rate": 3.997292397930624e-06, "loss": 0.0872, "step": 310 }, { "epoch": 1.8249634324719648, "grad_norm": 0.29907920956611633, "learning_rate": 3.963789219774753e-06, "loss": 0.0845, "step": 311 }, { "epoch": 1.8308142369575817, "grad_norm": 0.3310573399066925, "learning_rate": 3.930334583967514e-06, "loss": 0.0941, "step": 312 }, { "epoch": 1.8366650414431984, "grad_norm": 0.3727741241455078, "learning_rate": 3.896930057725372e-06, "loss": 0.0807, "step": 313 }, { "epoch": 1.8425158459288151, "grad_norm": 0.3489404022693634, "learning_rate": 3.863577205917356e-06, "loss": 0.1051, "step": 314 }, { "epoch": 1.848366650414432, "grad_norm": 0.3304194509983063, "learning_rate": 3.8302775909917585e-06, "loss": 0.0937, "step": 315 }, { "epoch": 1.8542174549000487, "grad_norm": 0.31810325384140015, "learning_rate": 3.7970327729029288e-06, "loss": 0.0917, "step": 316 }, { "epoch": 1.8600682593856654, "grad_norm": 0.3284529149532318, "learning_rate": 3.7638443090382067e-06, "loss": 0.0864, "step": 317 }, { "epoch": 1.8659190638712824, "grad_norm": 0.34479424357414246, "learning_rate": 3.730713754144961e-06, "loss": 0.0938, "step": 318 }, { "epoch": 1.871769868356899, "grad_norm": 0.32104262709617615, "learning_rate": 3.6976426602577565e-06, "loss": 0.0866, "step": 319 }, { "epoch": 1.8776206728425158, "grad_norm": 0.32675501704216003, "learning_rate": 3.6646325766256423e-06, "loss": 0.0841, "step": 320 }, { "epoch": 1.8834714773281327, "grad_norm": 0.3269306421279907, "learning_rate": 3.6316850496395863e-06, "loss": 0.0926, "step": 321 }, { "epoch": 1.8893222818137494, "grad_norm": 0.32884150743484497, "learning_rate": 3.598801622760021e-06, "loss": 0.0816, "step": 322 }, { "epoch": 1.895173086299366, "grad_norm": 0.3181195557117462, "learning_rate": 3.5659838364445505e-06, "loss": 0.0887, "step": 323 }, { "epoch": 1.901023890784983, "grad_norm": 0.3327188789844513, "learning_rate": 3.5332332280757706e-06, "loss": 0.0966, "step": 324 }, { "epoch": 1.9068746952705997, "grad_norm": 0.3728068470954895, "learning_rate": 3.5005513318892666e-06, "loss": 0.0991, "step": 325 }, { "epoch": 1.9127254997562164, "grad_norm": 0.37355464696884155, "learning_rate": 3.4679396789017263e-06, "loss": 0.0867, "step": 326 }, { "epoch": 1.9185763042418333, "grad_norm": 0.32492557168006897, "learning_rate": 3.4353997968392295e-06, "loss": 0.0894, "step": 327 }, { "epoch": 1.92442710872745, "grad_norm": 0.322022944688797, "learning_rate": 3.402933210065665e-06, "loss": 0.0894, "step": 328 }, { "epoch": 1.9302779132130667, "grad_norm": 0.31964507699012756, "learning_rate": 3.3705414395113354e-06, "loss": 0.0918, "step": 329 }, { "epoch": 1.9361287176986837, "grad_norm": 0.33390405774116516, "learning_rate": 3.3382260026017027e-06, "loss": 0.0884, "step": 330 }, { "epoch": 1.9419795221843004, "grad_norm": 0.33786076307296753, "learning_rate": 3.305988413186295e-06, "loss": 0.0872, "step": 331 }, { "epoch": 1.947830326669917, "grad_norm": 0.3263327181339264, "learning_rate": 3.2738301814678015e-06, "loss": 0.0922, "step": 332 }, { "epoch": 1.953681131155534, "grad_norm": 0.3341035544872284, "learning_rate": 3.241752813931316e-06, "loss": 0.0855, "step": 333 }, { "epoch": 1.9595319356411507, "grad_norm": 0.38113948702812195, "learning_rate": 3.2097578132737716e-06, "loss": 0.1033, "step": 334 }, { "epoch": 1.9653827401267674, "grad_norm": 0.33053913712501526, "learning_rate": 3.1778466783335328e-06, "loss": 0.0958, "step": 335 }, { "epoch": 1.9712335446123843, "grad_norm": 0.34964510798454285, "learning_rate": 3.1460209040201967e-06, "loss": 0.0886, "step": 336 }, { "epoch": 1.977084349098001, "grad_norm": 0.3344588279724121, "learning_rate": 3.114281981244553e-06, "loss": 0.0925, "step": 337 }, { "epoch": 1.9829351535836177, "grad_norm": 0.32202234864234924, "learning_rate": 3.082631396848743e-06, "loss": 0.0954, "step": 338 }, { "epoch": 1.9887859580692346, "grad_norm": 0.3456474542617798, "learning_rate": 3.0510706335366034e-06, "loss": 0.0856, "step": 339 }, { "epoch": 1.9946367625548513, "grad_norm": 0.3227405250072479, "learning_rate": 3.019601169804216e-06, "loss": 0.0878, "step": 340 }, { "epoch": 2.000487567040468, "grad_norm": 0.31147223711013794, "learning_rate": 2.9882244798706372e-06, "loss": 0.0759, "step": 341 }, { "epoch": 2.006338371526085, "grad_norm": 0.2763703763484955, "learning_rate": 2.956942033608843e-06, "loss": 0.0525, "step": 342 }, { "epoch": 2.0121891760117014, "grad_norm": 0.29425641894340515, "learning_rate": 2.9257552964768644e-06, "loss": 0.0601, "step": 343 }, { "epoch": 2.0180399804973184, "grad_norm": 0.2795559763908386, "learning_rate": 2.8946657294491452e-06, "loss": 0.0573, "step": 344 }, { "epoch": 2.0238907849829353, "grad_norm": 0.34892964363098145, "learning_rate": 2.863674788948097e-06, "loss": 0.0628, "step": 345 }, { "epoch": 2.0297415894685518, "grad_norm": 0.28666457533836365, "learning_rate": 2.832783926775865e-06, "loss": 0.0524, "step": 346 }, { "epoch": 2.0355923939541687, "grad_norm": 0.26126629114151, "learning_rate": 2.8019945900463307e-06, "loss": 0.0578, "step": 347 }, { "epoch": 2.0414431984397856, "grad_norm": 0.2673165500164032, "learning_rate": 2.771308221117309e-06, "loss": 0.0574, "step": 348 }, { "epoch": 2.047294002925402, "grad_norm": 0.2941865921020508, "learning_rate": 2.740726257522987e-06, "loss": 0.0551, "step": 349 }, { "epoch": 2.053144807411019, "grad_norm": 0.2612977623939514, "learning_rate": 2.7102501319065706e-06, "loss": 0.0644, "step": 350 }, { "epoch": 2.058995611896636, "grad_norm": 0.2891154885292053, "learning_rate": 2.6798812719531843e-06, "loss": 0.0563, "step": 351 }, { "epoch": 2.0648464163822524, "grad_norm": 0.2692639231681824, "learning_rate": 2.6496211003229795e-06, "loss": 0.0541, "step": 352 }, { "epoch": 2.0706972208678693, "grad_norm": 0.3014177680015564, "learning_rate": 2.6194710345845e-06, "loss": 0.0502, "step": 353 }, { "epoch": 2.0765480253534863, "grad_norm": 0.28264397382736206, "learning_rate": 2.5894324871482557e-06, "loss": 0.0566, "step": 354 }, { "epoch": 2.0823988298391027, "grad_norm": 0.2884339392185211, "learning_rate": 2.559506865200576e-06, "loss": 0.0527, "step": 355 }, { "epoch": 2.0882496343247197, "grad_norm": 0.26103538274765015, "learning_rate": 2.529695570637679e-06, "loss": 0.0596, "step": 356 }, { "epoch": 2.0941004388103366, "grad_norm": 0.2645220160484314, "learning_rate": 2.5000000000000015e-06, "loss": 0.0564, "step": 357 }, { "epoch": 2.099951243295953, "grad_norm": 0.3041948080062866, "learning_rate": 2.4704215444067684e-06, "loss": 0.0529, "step": 358 }, { "epoch": 2.10580204778157, "grad_norm": 0.26852619647979736, "learning_rate": 2.4409615894908407e-06, "loss": 0.0551, "step": 359 }, { "epoch": 2.111652852267187, "grad_norm": 0.2973073422908783, "learning_rate": 2.411621515333788e-06, "loss": 0.0616, "step": 360 }, { "epoch": 2.1175036567528034, "grad_norm": 0.2823100686073303, "learning_rate": 2.3824026964012487e-06, "loss": 0.0616, "step": 361 }, { "epoch": 2.1233544612384203, "grad_norm": 0.25822895765304565, "learning_rate": 2.35330650147853e-06, "loss": 0.0512, "step": 362 }, { "epoch": 2.1292052657240372, "grad_norm": 0.34585779905319214, "learning_rate": 2.324334293606499e-06, "loss": 0.0495, "step": 363 }, { "epoch": 2.1350560702096537, "grad_norm": 0.2767939567565918, "learning_rate": 2.2954874300177197e-06, "loss": 0.0561, "step": 364 }, { "epoch": 2.1409068746952706, "grad_norm": 0.27340444922447205, "learning_rate": 2.266767262072878e-06, "loss": 0.0562, "step": 365 }, { "epoch": 2.1467576791808876, "grad_norm": 0.273384690284729, "learning_rate": 2.238175135197471e-06, "loss": 0.0593, "step": 366 }, { "epoch": 2.152608483666504, "grad_norm": 0.28088316321372986, "learning_rate": 2.2097123888187825e-06, "loss": 0.0505, "step": 367 }, { "epoch": 2.158459288152121, "grad_norm": 0.2613745331764221, "learning_rate": 2.181380356303139e-06, "loss": 0.0525, "step": 368 }, { "epoch": 2.164310092637738, "grad_norm": 0.2488614320755005, "learning_rate": 2.1531803648934333e-06, "loss": 0.0498, "step": 369 }, { "epoch": 2.1701608971233544, "grad_norm": 0.2990882098674774, "learning_rate": 2.1251137356469677e-06, "loss": 0.0551, "step": 370 }, { "epoch": 2.1760117016089713, "grad_norm": 0.30025023221969604, "learning_rate": 2.0971817833735548e-06, "loss": 0.055, "step": 371 }, { "epoch": 2.181862506094588, "grad_norm": 0.2645871639251709, "learning_rate": 2.069385816573928e-06, "loss": 0.0564, "step": 372 }, { "epoch": 2.1877133105802047, "grad_norm": 0.29934531450271606, "learning_rate": 2.0417271373784403e-06, "loss": 0.0613, "step": 373 }, { "epoch": 2.1935641150658216, "grad_norm": 0.2638642489910126, "learning_rate": 2.0142070414860704e-06, "loss": 0.0539, "step": 374 }, { "epoch": 2.1994149195514385, "grad_norm": 0.2566313147544861, "learning_rate": 1.9868268181037186e-06, "loss": 0.054, "step": 375 }, { "epoch": 2.205265724037055, "grad_norm": 0.274949312210083, "learning_rate": 1.9595877498858175e-06, "loss": 0.0644, "step": 376 }, { "epoch": 2.211116528522672, "grad_norm": 0.21347293257713318, "learning_rate": 1.9324911128742406e-06, "loss": 0.046, "step": 377 }, { "epoch": 2.216967333008289, "grad_norm": 0.296029657125473, "learning_rate": 1.9055381764385272e-06, "loss": 0.0674, "step": 378 }, { "epoch": 2.2228181374939053, "grad_norm": 0.3009437024593353, "learning_rate": 1.8787302032164168e-06, "loss": 0.0635, "step": 379 }, { "epoch": 2.2286689419795223, "grad_norm": 0.2573295831680298, "learning_rate": 1.8520684490547014e-06, "loss": 0.0621, "step": 380 }, { "epoch": 2.234519746465139, "grad_norm": 0.28992000222206116, "learning_rate": 1.8255541629503865e-06, "loss": 0.051, "step": 381 }, { "epoch": 2.2403705509507557, "grad_norm": 0.2807430028915405, "learning_rate": 1.7991885869921928e-06, "loss": 0.056, "step": 382 }, { "epoch": 2.2462213554363726, "grad_norm": 0.2580874562263489, "learning_rate": 1.7729729563023613e-06, "loss": 0.0576, "step": 383 }, { "epoch": 2.2520721599219895, "grad_norm": 0.27799367904663086, "learning_rate": 1.746908498978791e-06, "loss": 0.0491, "step": 384 }, { "epoch": 2.257922964407606, "grad_norm": 0.28631195425987244, "learning_rate": 1.7209964360375137e-06, "loss": 0.0516, "step": 385 }, { "epoch": 2.263773768893223, "grad_norm": 0.2720945477485657, "learning_rate": 1.6952379813554914e-06, "loss": 0.0525, "step": 386 }, { "epoch": 2.26962457337884, "grad_norm": 0.2580566108226776, "learning_rate": 1.6696343416137495e-06, "loss": 0.0599, "step": 387 }, { "epoch": 2.2754753778644563, "grad_norm": 0.2627420723438263, "learning_rate": 1.6441867162408514e-06, "loss": 0.0531, "step": 388 }, { "epoch": 2.2813261823500732, "grad_norm": 0.2853785753250122, "learning_rate": 1.6188962973567068e-06, "loss": 0.0563, "step": 389 }, { "epoch": 2.28717698683569, "grad_norm": 0.26825377345085144, "learning_rate": 1.5937642697167288e-06, "loss": 0.062, "step": 390 }, { "epoch": 2.2930277913213066, "grad_norm": 0.24329300224781036, "learning_rate": 1.5687918106563326e-06, "loss": 0.0476, "step": 391 }, { "epoch": 2.2988785958069236, "grad_norm": 0.23808631300926208, "learning_rate": 1.5439800900357765e-06, "loss": 0.0514, "step": 392 }, { "epoch": 2.30472940029254, "grad_norm": 0.26007384061813354, "learning_rate": 1.5193302701853674e-06, "loss": 0.0478, "step": 393 }, { "epoch": 2.310580204778157, "grad_norm": 0.3034699261188507, "learning_rate": 1.4948435058510036e-06, "loss": 0.0596, "step": 394 }, { "epoch": 2.316431009263774, "grad_norm": 0.2601581811904907, "learning_rate": 1.4705209441400841e-06, "loss": 0.0529, "step": 395 }, { "epoch": 2.3222818137493904, "grad_norm": 0.26665881276130676, "learning_rate": 1.4463637244677648e-06, "loss": 0.0516, "step": 396 }, { "epoch": 2.3281326182350073, "grad_norm": 0.252401202917099, "learning_rate": 1.422372978503589e-06, "loss": 0.0568, "step": 397 }, { "epoch": 2.333983422720624, "grad_norm": 0.2799394726753235, "learning_rate": 1.3985498301184685e-06, "loss": 0.0544, "step": 398 }, { "epoch": 2.3398342272062407, "grad_norm": 0.29879605770111084, "learning_rate": 1.374895395332037e-06, "loss": 0.06, "step": 399 }, { "epoch": 2.3456850316918576, "grad_norm": 0.27037307620048523, "learning_rate": 1.351410782260366e-06, "loss": 0.0601, "step": 400 }, { "epoch": 2.3515358361774745, "grad_norm": 0.26528164744377136, "learning_rate": 1.3280970910640573e-06, "loss": 0.0547, "step": 401 }, { "epoch": 2.357386640663091, "grad_norm": 0.3015322983264923, "learning_rate": 1.3049554138967052e-06, "loss": 0.0586, "step": 402 }, { "epoch": 2.363237445148708, "grad_norm": 0.2491970807313919, "learning_rate": 1.2819868348537263e-06, "loss": 0.0492, "step": 403 }, { "epoch": 2.369088249634325, "grad_norm": 0.262861967086792, "learning_rate": 1.259192429921584e-06, "loss": 0.0547, "step": 404 }, { "epoch": 2.3749390541199413, "grad_norm": 0.2699519991874695, "learning_rate": 1.2365732669273778e-06, "loss": 0.0511, "step": 405 }, { "epoch": 2.3807898586055583, "grad_norm": 0.30865174531936646, "learning_rate": 1.2141304054888204e-06, "loss": 0.0583, "step": 406 }, { "epoch": 2.386640663091175, "grad_norm": 0.28503602743148804, "learning_rate": 1.1918648969645947e-06, "loss": 0.0615, "step": 407 }, { "epoch": 2.3924914675767917, "grad_norm": 0.22828808426856995, "learning_rate": 1.1697777844051105e-06, "loss": 0.0469, "step": 408 }, { "epoch": 2.3983422720624086, "grad_norm": 0.28332284092903137, "learning_rate": 1.1478701025036359e-06, "loss": 0.0581, "step": 409 }, { "epoch": 2.4041930765480255, "grad_norm": 0.2843197286128998, "learning_rate": 1.126142877547826e-06, "loss": 0.0633, "step": 410 }, { "epoch": 2.410043881033642, "grad_norm": 0.2659061849117279, "learning_rate": 1.1045971273716476e-06, "loss": 0.0522, "step": 411 }, { "epoch": 2.415894685519259, "grad_norm": 0.28145578503608704, "learning_rate": 1.083233861307697e-06, "loss": 0.0598, "step": 412 }, { "epoch": 2.421745490004876, "grad_norm": 0.29039686918258667, "learning_rate": 1.062054080139916e-06, "loss": 0.0585, "step": 413 }, { "epoch": 2.4275962944904923, "grad_norm": 0.2717377543449402, "learning_rate": 1.0410587760567104e-06, "loss": 0.0556, "step": 414 }, { "epoch": 2.4334470989761092, "grad_norm": 0.283588171005249, "learning_rate": 1.0202489326044663e-06, "loss": 0.0586, "step": 415 }, { "epoch": 2.439297903461726, "grad_norm": 0.2644156515598297, "learning_rate": 9.99625524641481e-07, "loss": 0.0519, "step": 416 }, { "epoch": 2.4451487079473426, "grad_norm": 0.28385522961616516, "learning_rate": 9.791895182922911e-07, "loss": 0.0584, "step": 417 }, { "epoch": 2.4509995124329595, "grad_norm": 0.2593843340873718, "learning_rate": 9.589418709024146e-07, "loss": 0.0582, "step": 418 }, { "epoch": 2.4568503169185765, "grad_norm": 0.28470900654792786, "learning_rate": 9.388835309934985e-07, "loss": 0.0651, "step": 419 }, { "epoch": 2.462701121404193, "grad_norm": 0.23977982997894287, "learning_rate": 9.190154382188921e-07, "loss": 0.0511, "step": 420 }, { "epoch": 2.46855192588981, "grad_norm": 0.2712235152721405, "learning_rate": 8.993385233196223e-07, "loss": 0.0545, "step": 421 }, { "epoch": 2.474402730375427, "grad_norm": 0.256510466337204, "learning_rate": 8.79853708080795e-07, "loss": 0.0563, "step": 422 }, { "epoch": 2.4802535348610433, "grad_norm": 0.27964967489242554, "learning_rate": 8.605619052884106e-07, "loss": 0.0634, "step": 423 }, { "epoch": 2.48610433934666, "grad_norm": 0.2607182264328003, "learning_rate": 8.414640186866063e-07, "loss": 0.0455, "step": 424 }, { "epoch": 2.491955143832277, "grad_norm": 0.28515201807022095, "learning_rate": 8.225609429353187e-07, "loss": 0.0582, "step": 425 }, { "epoch": 2.4978059483178936, "grad_norm": 0.29618608951568604, "learning_rate": 8.03853563568367e-07, "loss": 0.0615, "step": 426 }, { "epoch": 2.5036567528035105, "grad_norm": 0.30800962448120117, "learning_rate": 7.8534275695198e-07, "loss": 0.0654, "step": 427 }, { "epoch": 2.509507557289127, "grad_norm": 0.29063132405281067, "learning_rate": 7.670293902437331e-07, "loss": 0.0566, "step": 428 }, { "epoch": 2.515358361774744, "grad_norm": 0.28903132677078247, "learning_rate": 7.489143213519301e-07, "loss": 0.0635, "step": 429 }, { "epoch": 2.521209166260361, "grad_norm": 0.2604012191295624, "learning_rate": 7.309983988954078e-07, "loss": 0.0502, "step": 430 }, { "epoch": 2.5270599707459773, "grad_norm": 0.30102649331092834, "learning_rate": 7.132824621637891e-07, "loss": 0.057, "step": 431 }, { "epoch": 2.5329107752315942, "grad_norm": 0.2743903696537018, "learning_rate": 6.957673410781617e-07, "loss": 0.0458, "step": 432 }, { "epoch": 2.538761579717211, "grad_norm": 0.30411839485168457, "learning_rate": 6.784538561521986e-07, "loss": 0.0696, "step": 433 }, { "epoch": 2.5446123842028276, "grad_norm": 0.26881927251815796, "learning_rate": 6.613428184537235e-07, "loss": 0.0585, "step": 434 }, { "epoch": 2.5504631886884446, "grad_norm": 0.2715945541858673, "learning_rate": 6.444350295667112e-07, "loss": 0.0506, "step": 435 }, { "epoch": 2.5563139931740615, "grad_norm": 0.27122291922569275, "learning_rate": 6.277312815537423e-07, "loss": 0.0548, "step": 436 }, { "epoch": 2.562164797659678, "grad_norm": 0.30401352047920227, "learning_rate": 6.112323569188927e-07, "loss": 0.0536, "step": 437 }, { "epoch": 2.568015602145295, "grad_norm": 0.24554386734962463, "learning_rate": 5.949390285710777e-07, "loss": 0.0519, "step": 438 }, { "epoch": 2.573866406630912, "grad_norm": 0.27577313780784607, "learning_rate": 5.788520597878477e-07, "loss": 0.0529, "step": 439 }, { "epoch": 2.5797172111165283, "grad_norm": 0.3495662212371826, "learning_rate": 5.629722041796292e-07, "loss": 0.0569, "step": 440 }, { "epoch": 2.585568015602145, "grad_norm": 0.2668460011482239, "learning_rate": 5.473002056544191e-07, "loss": 0.0453, "step": 441 }, { "epoch": 2.591418820087762, "grad_norm": 0.2764504551887512, "learning_rate": 5.318367983829393e-07, "loss": 0.0573, "step": 442 }, { "epoch": 2.5972696245733786, "grad_norm": 0.2935810983181, "learning_rate": 5.165827067642415e-07, "loss": 0.0495, "step": 443 }, { "epoch": 2.6031204290589955, "grad_norm": 0.2925431728363037, "learning_rate": 5.015386453917742e-07, "loss": 0.0535, "step": 444 }, { "epoch": 2.6089712335446125, "grad_norm": 0.2892134487628937, "learning_rate": 4.867053190199011e-07, "loss": 0.0607, "step": 445 }, { "epoch": 2.614822038030229, "grad_norm": 0.28982144594192505, "learning_rate": 4.720834225308962e-07, "loss": 0.0612, "step": 446 }, { "epoch": 2.620672842515846, "grad_norm": 0.24418623745441437, "learning_rate": 4.576736409023813e-07, "loss": 0.051, "step": 447 }, { "epoch": 2.626523647001463, "grad_norm": 0.26711544394493103, "learning_rate": 4.4347664917524293e-07, "loss": 0.0598, "step": 448 }, { "epoch": 2.6323744514870793, "grad_norm": 0.28195297718048096, "learning_rate": 4.29493112422007e-07, "loss": 0.0546, "step": 449 }, { "epoch": 2.638225255972696, "grad_norm": 0.29547280073165894, "learning_rate": 4.15723685715686e-07, "loss": 0.0535, "step": 450 }, { "epoch": 2.644076060458313, "grad_norm": 0.2511988878250122, "learning_rate": 4.0216901409908695e-07, "loss": 0.0509, "step": 451 }, { "epoch": 2.6499268649439296, "grad_norm": 0.28745976090431213, "learning_rate": 3.8882973255459975e-07, "loss": 0.0502, "step": 452 }, { "epoch": 2.6557776694295465, "grad_norm": 0.2565235495567322, "learning_rate": 3.7570646597444196e-07, "loss": 0.0477, "step": 453 }, { "epoch": 2.6616284739151634, "grad_norm": 0.29118213057518005, "learning_rate": 3.627998291313939e-07, "loss": 0.0669, "step": 454 }, { "epoch": 2.66747927840078, "grad_norm": 0.29614195227622986, "learning_rate": 3.5011042664999663e-07, "loss": 0.0574, "step": 455 }, { "epoch": 2.673330082886397, "grad_norm": 0.25614967942237854, "learning_rate": 3.3763885297822153e-07, "loss": 0.0588, "step": 456 }, { "epoch": 2.6791808873720138, "grad_norm": 0.27203789353370667, "learning_rate": 3.2538569235963216e-07, "loss": 0.0508, "step": 457 }, { "epoch": 2.6850316918576302, "grad_norm": 0.26867544651031494, "learning_rate": 3.133515188060077e-07, "loss": 0.055, "step": 458 }, { "epoch": 2.690882496343247, "grad_norm": 0.28810831904411316, "learning_rate": 3.015368960704584e-07, "loss": 0.0602, "step": 459 }, { "epoch": 2.696733300828864, "grad_norm": 0.28718477487564087, "learning_rate": 2.899423776210092e-07, "loss": 0.0625, "step": 460 }, { "epoch": 2.7025841053144806, "grad_norm": 0.2641659379005432, "learning_rate": 2.785685066146776e-07, "loss": 0.057, "step": 461 }, { "epoch": 2.7084349098000975, "grad_norm": 0.3124268352985382, "learning_rate": 2.6741581587202747e-07, "loss": 0.0567, "step": 462 }, { "epoch": 2.7142857142857144, "grad_norm": 0.26910921931266785, "learning_rate": 2.5648482785220865e-07, "loss": 0.0603, "step": 463 }, { "epoch": 2.720136518771331, "grad_norm": 0.26910898089408875, "learning_rate": 2.4577605462847764e-07, "loss": 0.0575, "step": 464 }, { "epoch": 2.725987323256948, "grad_norm": 0.2963363230228424, "learning_rate": 2.3528999786421758e-07, "loss": 0.0556, "step": 465 }, { "epoch": 2.7318381277425647, "grad_norm": 0.2956644892692566, "learning_rate": 2.25027148789429e-07, "loss": 0.0594, "step": 466 }, { "epoch": 2.737688932228181, "grad_norm": 0.26555967330932617, "learning_rate": 2.1498798817772281e-07, "loss": 0.059, "step": 467 }, { "epoch": 2.743539736713798, "grad_norm": 0.2682870626449585, "learning_rate": 2.0517298632379445e-07, "loss": 0.0531, "step": 468 }, { "epoch": 2.749390541199415, "grad_norm": 0.2852657437324524, "learning_rate": 1.9558260302139642e-07, "loss": 0.0498, "step": 469 }, { "epoch": 2.7552413456850315, "grad_norm": 0.27579066157341003, "learning_rate": 1.8621728754179392e-07, "loss": 0.0621, "step": 470 }, { "epoch": 2.7610921501706485, "grad_norm": 0.29085394740104675, "learning_rate": 1.770774786127244e-07, "loss": 0.0575, "step": 471 }, { "epoch": 2.7669429546562654, "grad_norm": 0.2594444155693054, "learning_rate": 1.6816360439783797e-07, "loss": 0.0522, "step": 472 }, { "epoch": 2.772793759141882, "grad_norm": 0.292959600687027, "learning_rate": 1.5947608247664558e-07, "loss": 0.0563, "step": 473 }, { "epoch": 2.778644563627499, "grad_norm": 0.2607786953449249, "learning_rate": 1.510153198249531e-07, "loss": 0.0505, "step": 474 }, { "epoch": 2.7844953681131157, "grad_norm": 0.285178542137146, "learning_rate": 1.4278171279579757e-07, "loss": 0.0601, "step": 475 }, { "epoch": 2.790346172598732, "grad_norm": 0.28294607996940613, "learning_rate": 1.3477564710088097e-07, "loss": 0.0539, "step": 476 }, { "epoch": 2.796196977084349, "grad_norm": 0.29305973649024963, "learning_rate": 1.2699749779249926e-07, "loss": 0.0648, "step": 477 }, { "epoch": 2.802047781569966, "grad_norm": 0.2848883867263794, "learning_rate": 1.1944762924597286e-07, "loss": 0.0596, "step": 478 }, { "epoch": 2.8078985860555825, "grad_norm": 0.27572354674339294, "learning_rate": 1.1212639514257829e-07, "loss": 0.0545, "step": 479 }, { "epoch": 2.8137493905411994, "grad_norm": 0.2800915539264679, "learning_rate": 1.0503413845297739e-07, "loss": 0.0586, "step": 480 }, { "epoch": 2.8196001950268164, "grad_norm": 0.27929428219795227, "learning_rate": 9.817119142115472e-08, "loss": 0.0535, "step": 481 }, { "epoch": 2.825450999512433, "grad_norm": 0.29202061891555786, "learning_rate": 9.15378755488483e-08, "loss": 0.0677, "step": 482 }, { "epoch": 2.8313018039980498, "grad_norm": 0.2733800411224365, "learning_rate": 8.513450158049109e-08, "loss": 0.057, "step": 483 }, { "epoch": 2.8371526084836667, "grad_norm": 0.27349552512168884, "learning_rate": 7.896136948865429e-08, "loss": 0.0595, "step": 484 }, { "epoch": 2.843003412969283, "grad_norm": 0.3204438388347626, "learning_rate": 7.301876845999368e-08, "loss": 0.0638, "step": 485 }, { "epoch": 2.8488542174549, "grad_norm": 0.2688818573951721, "learning_rate": 6.730697688170251e-08, "loss": 0.0514, "step": 486 }, { "epoch": 2.854705021940517, "grad_norm": 0.27996474504470825, "learning_rate": 6.182626232847044e-08, "loss": 0.06, "step": 487 }, { "epoch": 2.8605558264261335, "grad_norm": 0.28294089436531067, "learning_rate": 5.6576881549949e-08, "loss": 0.0476, "step": 488 }, { "epoch": 2.8664066309117504, "grad_norm": 0.26451075077056885, "learning_rate": 5.155908045872349e-08, "loss": 0.0587, "step": 489 }, { "epoch": 2.8722574353973673, "grad_norm": 0.28928130865097046, "learning_rate": 4.677309411879327e-08, "loss": 0.0518, "step": 490 }, { "epoch": 2.878108239882984, "grad_norm": 0.2694530487060547, "learning_rate": 4.221914673455896e-08, "loss": 0.0541, "step": 491 }, { "epoch": 2.8839590443686007, "grad_norm": 0.3015761077404022, "learning_rate": 3.7897451640321326e-08, "loss": 0.0602, "step": 492 }, { "epoch": 2.8898098488542177, "grad_norm": 0.2916721999645233, "learning_rate": 3.3808211290284886e-08, "loss": 0.0565, "step": 493 }, { "epoch": 2.895660653339834, "grad_norm": 0.2948850989341736, "learning_rate": 2.995161724907658e-08, "loss": 0.0619, "step": 494 }, { "epoch": 2.901511457825451, "grad_norm": 0.310370534658432, "learning_rate": 2.6327850182769065e-08, "loss": 0.0604, "step": 495 }, { "epoch": 2.907362262311068, "grad_norm": 0.2993476092815399, "learning_rate": 2.29370798504186e-08, "loss": 0.0581, "step": 496 }, { "epoch": 2.9132130667966845, "grad_norm": 0.2968423366546631, "learning_rate": 1.9779465096112505e-08, "loss": 0.0571, "step": 497 }, { "epoch": 2.9190638712823014, "grad_norm": 0.2754110097885132, "learning_rate": 1.6855153841527915e-08, "loss": 0.0585, "step": 498 }, { "epoch": 2.9249146757679183, "grad_norm": 0.28342047333717346, "learning_rate": 1.4164283079001196e-08, "loss": 0.0567, "step": 499 }, { "epoch": 2.930765480253535, "grad_norm": 0.2792755961418152, "learning_rate": 1.1706978865113072e-08, "loss": 0.0497, "step": 500 }, { "epoch": 2.9366162847391517, "grad_norm": 0.3103918135166168, "learning_rate": 9.48335631477948e-09, "loss": 0.0637, "step": 501 }, { "epoch": 2.9424670892247686, "grad_norm": 0.2594921588897705, "learning_rate": 7.49351959586253e-09, "loss": 0.0587, "step": 502 }, { "epoch": 2.948317893710385, "grad_norm": 0.2774949371814728, "learning_rate": 5.737561924288315e-09, "loss": 0.0536, "step": 503 }, { "epoch": 2.954168698196002, "grad_norm": 0.3025616407394409, "learning_rate": 4.2155655596809455e-09, "loss": 0.0559, "step": 504 }, { "epoch": 2.960019502681619, "grad_norm": 0.2954673767089844, "learning_rate": 2.9276018015089725e-09, "loss": 0.0669, "step": 505 }, { "epoch": 2.9658703071672354, "grad_norm": 0.2557486295700073, "learning_rate": 1.8737309857463916e-09, "loss": 0.0663, "step": 506 }, { "epoch": 2.9717211116528524, "grad_norm": 0.29354172945022583, "learning_rate": 1.054002482043237e-09, "loss": 0.0623, "step": 507 }, { "epoch": 2.9775719161384693, "grad_norm": 0.3089665174484253, "learning_rate": 4.684546914163201e-10, "loss": 0.054, "step": 508 }, { "epoch": 2.9834227206240858, "grad_norm": 0.3170253038406372, "learning_rate": 1.1711504444733567e-10, "loss": 0.0626, "step": 509 }, { "epoch": 2.9892735251097027, "grad_norm": 0.2732970118522644, "learning_rate": 0.0, "loss": 0.0551, "step": 510 }, { "epoch": 2.9892735251097027, "step": 510, "total_flos": 1494941780148224.0, "train_loss": 0.029626486676872944, "train_runtime": 67167.8945, "train_samples_per_second": 0.733, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1494941780148224.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }