diff --git "a/training_logs.json" "b/training_logs.json" new file mode 100644--- /dev/null +++ "b/training_logs.json" @@ -0,0 +1,6861 @@ +[ + { + "loss": 30.8234, + "grad_norm": 0.5755094885826111, + "learning_rate": 0.0009991248796709547, + "epoch": 0.0 + }, + { + "loss": 24.1229, + "grad_norm": 0.2920963764190674, + "learning_rate": 0.0009982497593419095, + "epoch": 0.01 + }, + { + "loss": 22.7986, + "grad_norm": 0.5106011629104614, + "learning_rate": 0.0009973746390128642, + "epoch": 0.01 + }, + { + "loss": 21.4924, + "grad_norm": 0.9322375059127808, + "learning_rate": 0.000996499518683819, + "epoch": 0.01 + }, + { + "loss": 20.7911, + "grad_norm": 0.8529098629951477, + "learning_rate": 0.0009956243983547737, + "epoch": 0.01 + }, + { + "loss": 19.338, + "grad_norm": 0.776152491569519, + "learning_rate": 0.0009947492780257286, + "epoch": 0.02 + }, + { + "loss": 19.0175, + "grad_norm": 2.11796498298645, + "learning_rate": 0.0009938741576966832, + "epoch": 0.02 + }, + { + "loss": 18.2997, + "grad_norm": 1.3791886568069458, + "learning_rate": 0.0009929990373676381, + "epoch": 0.02 + }, + { + "loss": 17.2791, + "grad_norm": 1.3849037885665894, + "learning_rate": 0.0009921239170385928, + "epoch": 0.02 + }, + { + "loss": 17.3609, + "grad_norm": 1.1861941814422607, + "learning_rate": 0.0009912487967095476, + "epoch": 0.03 + }, + { + "loss": 17.1215, + "grad_norm": 1.494122862815857, + "learning_rate": 0.0009903736763805023, + "epoch": 0.03 + }, + { + "loss": 16.3944, + "grad_norm": 1.5872834920883179, + "learning_rate": 0.0009894985560514572, + "epoch": 0.03 + }, + { + "loss": 16.0054, + "grad_norm": 1.2658979892730713, + "learning_rate": 0.0009886234357224118, + "epoch": 0.03 + }, + { + "loss": 15.5523, + "grad_norm": 0.8640480041503906, + "learning_rate": 0.0009877483153933667, + "epoch": 0.04 + }, + { + "loss": 16.2465, + "grad_norm": 0.8946548700332642, + "learning_rate": 0.0009868731950643213, + "epoch": 0.04 + }, + { + "loss": 15.0235, + "grad_norm": 0.9279372692108154, + "learning_rate": 0.0009859980747352762, + "epoch": 0.04 + }, + { + "loss": 15.7517, + "grad_norm": 0.8807494044303894, + "learning_rate": 0.0009851229544062309, + "epoch": 0.04 + }, + { + "loss": 14.6884, + "grad_norm": 0.683822751045227, + "learning_rate": 0.0009842478340771857, + "epoch": 0.05 + }, + { + "loss": 14.0949, + "grad_norm": 1.1334095001220703, + "learning_rate": 0.0009833727137481404, + "epoch": 0.05 + }, + { + "loss": 14.3378, + "grad_norm": 1.1247657537460327, + "learning_rate": 0.0009824975934190953, + "epoch": 0.05 + }, + { + "loss": 13.7597, + "grad_norm": 0.9332773685455322, + "learning_rate": 0.00098162247309005, + "epoch": 0.06 + }, + { + "loss": 14.5567, + "grad_norm": 0.8742538690567017, + "learning_rate": 0.0009807473527610048, + "epoch": 0.06 + }, + { + "loss": 14.0188, + "grad_norm": 1.5592143535614014, + "learning_rate": 0.0009798722324319594, + "epoch": 0.06 + }, + { + "loss": 13.9401, + "grad_norm": 0.9473065733909607, + "learning_rate": 0.0009789971121029143, + "epoch": 0.06 + }, + { + "loss": 13.5177, + "grad_norm": 0.5469663143157959, + "learning_rate": 0.000978121991773869, + "epoch": 0.07 + }, + { + "loss": 13.513, + "grad_norm": 1.7497597932815552, + "learning_rate": 0.0009772468714448236, + "epoch": 0.07 + }, + { + "loss": 13.6579, + "grad_norm": 0.7552927136421204, + "learning_rate": 0.0009763717511157785, + "epoch": 0.07 + }, + { + "loss": 13.0899, + "grad_norm": 0.5602779984474182, + "learning_rate": 0.0009754966307867332, + "epoch": 0.07 + }, + { + "loss": 13.8637, + "grad_norm": 0.6577705144882202, + "learning_rate": 0.000974621510457688, + "epoch": 0.08 + }, + { + "loss": 14.2909, + "grad_norm": 1.0710817575454712, + "learning_rate": 0.0009737463901286428, + "epoch": 0.08 + }, + { + "loss": 13.3632, + "grad_norm": 0.48803457617759705, + "learning_rate": 0.0009728712697995975, + "epoch": 0.08 + }, + { + "loss": 13.5002, + "grad_norm": 0.9970788359642029, + "learning_rate": 0.0009719961494705523, + "epoch": 0.08 + }, + { + "loss": 13.6276, + "grad_norm": 0.9624769687652588, + "learning_rate": 0.000971121029141507, + "epoch": 0.09 + }, + { + "loss": 13.7281, + "grad_norm": 0.8082631230354309, + "learning_rate": 0.0009702459088124618, + "epoch": 0.09 + }, + { + "loss": 13.0793, + "grad_norm": 0.6732771396636963, + "learning_rate": 0.0009693707884834166, + "epoch": 0.09 + }, + { + "loss": 12.6621, + "grad_norm": 0.8451002240180969, + "learning_rate": 0.0009684956681543713, + "epoch": 0.09 + }, + { + "loss": 13.2374, + "grad_norm": 1.1656385660171509, + "learning_rate": 0.0009676205478253261, + "epoch": 0.1 + }, + { + "loss": 12.7625, + "grad_norm": 0.9667061567306519, + "learning_rate": 0.0009667454274962808, + "epoch": 0.1 + }, + { + "loss": 13.0046, + "grad_norm": 0.9311497807502747, + "learning_rate": 0.0009658703071672355, + "epoch": 0.1 + }, + { + "loss": 12.9037, + "grad_norm": 1.1891040802001953, + "learning_rate": 0.0009649951868381903, + "epoch": 0.1 + }, + { + "loss": 12.6521, + "grad_norm": 1.1127817630767822, + "learning_rate": 0.000964120066509145, + "epoch": 0.11 + }, + { + "loss": 13.2942, + "grad_norm": 0.6665758490562439, + "learning_rate": 0.0009632449461800998, + "epoch": 0.11 + }, + { + "loss": 12.4443, + "grad_norm": 0.8878126740455627, + "learning_rate": 0.0009623698258510546, + "epoch": 0.11 + }, + { + "loss": 13.0001, + "grad_norm": 1.5000464916229248, + "learning_rate": 0.0009614947055220093, + "epoch": 0.12 + }, + { + "loss": 12.2303, + "grad_norm": 1.1078687906265259, + "learning_rate": 0.0009606195851929641, + "epoch": 0.12 + }, + { + "loss": 12.1915, + "grad_norm": 0.8044748306274414, + "learning_rate": 0.0009597444648639187, + "epoch": 0.12 + }, + { + "loss": 12.7246, + "grad_norm": 0.9232500195503235, + "learning_rate": 0.0009588693445348735, + "epoch": 0.12 + }, + { + "loss": 11.9769, + "grad_norm": 0.7413458824157715, + "learning_rate": 0.0009579942242058283, + "epoch": 0.13 + }, + { + "loss": 12.8006, + "grad_norm": 1.1132707595825195, + "learning_rate": 0.000957119103876783, + "epoch": 0.13 + }, + { + "loss": 12.4323, + "grad_norm": 0.7814503312110901, + "learning_rate": 0.0009562439835477378, + "epoch": 0.13 + }, + { + "loss": 12.3482, + "grad_norm": 0.8854762315750122, + "learning_rate": 0.0009553688632186925, + "epoch": 0.13 + }, + { + "loss": 12.5045, + "grad_norm": 0.704131007194519, + "learning_rate": 0.0009544937428896473, + "epoch": 0.14 + }, + { + "loss": 12.1405, + "grad_norm": 0.7020297050476074, + "learning_rate": 0.0009536186225606021, + "epoch": 0.14 + }, + { + "loss": 11.5427, + "grad_norm": 0.398807168006897, + "learning_rate": 0.0009527435022315568, + "epoch": 0.14 + }, + { + "loss": 12.655, + "grad_norm": 1.0002299547195435, + "learning_rate": 0.0009518683819025116, + "epoch": 0.14 + }, + { + "loss": 11.9656, + "grad_norm": 0.7870428562164307, + "learning_rate": 0.0009509932615734664, + "epoch": 0.15 + }, + { + "loss": 12.4639, + "grad_norm": 0.9154604077339172, + "learning_rate": 0.0009501181412444211, + "epoch": 0.15 + }, + { + "loss": 11.6344, + "grad_norm": 1.1896569728851318, + "learning_rate": 0.0009492430209153759, + "epoch": 0.15 + }, + { + "loss": 12.4516, + "grad_norm": 0.8169024586677551, + "learning_rate": 0.0009483679005863306, + "epoch": 0.15 + }, + { + "loss": 12.1848, + "grad_norm": 0.8429264426231384, + "learning_rate": 0.0009474927802572854, + "epoch": 0.16 + }, + { + "loss": 11.2014, + "grad_norm": 0.8499436378479004, + "learning_rate": 0.0009466176599282402, + "epoch": 0.16 + }, + { + "loss": 12.2217, + "grad_norm": 0.8969743251800537, + "learning_rate": 0.0009457425395991948, + "epoch": 0.16 + }, + { + "loss": 11.7729, + "grad_norm": 1.0959218740463257, + "learning_rate": 0.0009448674192701496, + "epoch": 0.17 + }, + { + "loss": 11.6254, + "grad_norm": 1.1692876815795898, + "learning_rate": 0.0009439922989411043, + "epoch": 0.17 + }, + { + "loss": 11.5698, + "grad_norm": 1.9476372003555298, + "learning_rate": 0.0009431171786120591, + "epoch": 0.17 + }, + { + "loss": 11.4321, + "grad_norm": 1.1742662191390991, + "learning_rate": 0.0009422420582830139, + "epoch": 0.17 + }, + { + "loss": 11.3224, + "grad_norm": 0.9839737415313721, + "learning_rate": 0.0009413669379539686, + "epoch": 0.18 + }, + { + "loss": 11.8269, + "grad_norm": 0.9094179272651672, + "learning_rate": 0.0009404918176249234, + "epoch": 0.18 + }, + { + "loss": 11.8652, + "grad_norm": 0.9139958620071411, + "learning_rate": 0.0009396166972958782, + "epoch": 0.18 + }, + { + "loss": 11.5493, + "grad_norm": 0.7938945889472961, + "learning_rate": 0.0009387415769668329, + "epoch": 0.18 + }, + { + "loss": 11.413, + "grad_norm": 0.8102487921714783, + "learning_rate": 0.0009378664566377877, + "epoch": 0.19 + }, + { + "loss": 11.4015, + "grad_norm": 0.5892770290374756, + "learning_rate": 0.0009369913363087424, + "epoch": 0.19 + }, + { + "loss": 10.8455, + "grad_norm": 0.7269143462181091, + "learning_rate": 0.0009361162159796972, + "epoch": 0.19 + }, + { + "loss": 11.5612, + "grad_norm": 0.8169882893562317, + "learning_rate": 0.000935241095650652, + "epoch": 0.19 + }, + { + "loss": 10.545, + "grad_norm": 0.8424365520477295, + "learning_rate": 0.0009343659753216067, + "epoch": 0.2 + }, + { + "loss": 10.8486, + "grad_norm": 0.855518102645874, + "learning_rate": 0.0009334908549925615, + "epoch": 0.2 + }, + { + "loss": 10.3733, + "grad_norm": 1.1463903188705444, + "learning_rate": 0.0009326157346635162, + "epoch": 0.2 + }, + { + "loss": 10.794, + "grad_norm": 0.7493767142295837, + "learning_rate": 0.000931740614334471, + "epoch": 0.2 + }, + { + "loss": 10.5943, + "grad_norm": 0.8767346739768982, + "learning_rate": 0.0009308654940054258, + "epoch": 0.21 + }, + { + "loss": 11.4169, + "grad_norm": 1.0650781393051147, + "learning_rate": 0.0009299903736763805, + "epoch": 0.21 + }, + { + "loss": 10.8176, + "grad_norm": 0.8954362869262695, + "learning_rate": 0.0009291152533473353, + "epoch": 0.21 + }, + { + "loss": 10.9644, + "grad_norm": 0.697245180606842, + "learning_rate": 0.0009282401330182901, + "epoch": 0.22 + }, + { + "loss": 11.0427, + "grad_norm": 1.5471469163894653, + "learning_rate": 0.0009273650126892448, + "epoch": 0.22 + }, + { + "loss": 10.8293, + "grad_norm": 0.7173879146575928, + "learning_rate": 0.0009264898923601996, + "epoch": 0.22 + }, + { + "loss": 10.744, + "grad_norm": 1.1271495819091797, + "learning_rate": 0.0009256147720311543, + "epoch": 0.22 + }, + { + "loss": 10.3733, + "grad_norm": 0.7106486558914185, + "learning_rate": 0.0009247396517021091, + "epoch": 0.23 + }, + { + "loss": 10.9536, + "grad_norm": 1.1200592517852783, + "learning_rate": 0.0009238645313730638, + "epoch": 0.23 + }, + { + "loss": 10.4749, + "grad_norm": 1.0028458833694458, + "learning_rate": 0.0009229894110440185, + "epoch": 0.23 + }, + { + "loss": 11.4667, + "grad_norm": 1.187585711479187, + "learning_rate": 0.0009221142907149733, + "epoch": 0.23 + }, + { + "loss": 10.3349, + "grad_norm": 0.8691514134407043, + "learning_rate": 0.000921239170385928, + "epoch": 0.24 + }, + { + "loss": 10.6188, + "grad_norm": 0.8789599537849426, + "learning_rate": 0.0009203640500568828, + "epoch": 0.24 + }, + { + "loss": 10.454, + "grad_norm": 0.8376362919807434, + "learning_rate": 0.0009194889297278376, + "epoch": 0.24 + }, + { + "loss": 10.2419, + "grad_norm": 1.0760575532913208, + "learning_rate": 0.0009186138093987923, + "epoch": 0.24 + }, + { + "loss": 10.8593, + "grad_norm": 0.709028422832489, + "learning_rate": 0.0009177386890697471, + "epoch": 0.25 + }, + { + "loss": 11.073, + "grad_norm": 1.0934019088745117, + "learning_rate": 0.0009168635687407019, + "epoch": 0.25 + }, + { + "loss": 10.5596, + "grad_norm": 0.7833492159843445, + "learning_rate": 0.0009159884484116566, + "epoch": 0.25 + }, + { + "loss": 11.2079, + "grad_norm": 0.8762934803962708, + "learning_rate": 0.0009151133280826114, + "epoch": 0.25 + }, + { + "loss": 11.2229, + "grad_norm": 0.8059395551681519, + "learning_rate": 0.0009142382077535661, + "epoch": 0.26 + }, + { + "loss": 10.8706, + "grad_norm": 1.0892099142074585, + "learning_rate": 0.0009133630874245209, + "epoch": 0.26 + }, + { + "loss": 10.9983, + "grad_norm": 0.7471132278442383, + "learning_rate": 0.0009124879670954757, + "epoch": 0.26 + }, + { + "loss": 11.4291, + "grad_norm": 0.9766479730606079, + "learning_rate": 0.0009116128467664304, + "epoch": 0.27 + }, + { + "loss": 10.5895, + "grad_norm": 0.7469794154167175, + "learning_rate": 0.0009107377264373852, + "epoch": 0.27 + }, + { + "loss": 9.9826, + "grad_norm": 0.9510082602500916, + "learning_rate": 0.00090986260610834, + "epoch": 0.27 + }, + { + "loss": 10.1785, + "grad_norm": 0.8061089515686035, + "learning_rate": 0.0009089874857792947, + "epoch": 0.27 + }, + { + "loss": 10.5502, + "grad_norm": 0.7467952966690063, + "learning_rate": 0.0009081123654502495, + "epoch": 0.28 + }, + { + "loss": 10.4848, + "grad_norm": 0.9167515635490417, + "learning_rate": 0.0009072372451212042, + "epoch": 0.28 + }, + { + "loss": 10.7841, + "grad_norm": 1.0157630443572998, + "learning_rate": 0.000906362124792159, + "epoch": 0.28 + }, + { + "loss": 10.6985, + "grad_norm": 0.8764671087265015, + "learning_rate": 0.0009054870044631138, + "epoch": 0.28 + }, + { + "loss": 10.4706, + "grad_norm": 0.7716103196144104, + "learning_rate": 0.0009046118841340685, + "epoch": 0.29 + }, + { + "loss": 10.4371, + "grad_norm": 0.83449387550354, + "learning_rate": 0.0009037367638050233, + "epoch": 0.29 + }, + { + "loss": 10.2414, + "grad_norm": 0.785839855670929, + "learning_rate": 0.000902861643475978, + "epoch": 0.29 + }, + { + "loss": 10.0213, + "grad_norm": 0.7405595183372498, + "learning_rate": 0.0009019865231469327, + "epoch": 0.29 + }, + { + "loss": 10.2501, + "grad_norm": 0.929263710975647, + "learning_rate": 0.0009011114028178875, + "epoch": 0.3 + }, + { + "loss": 10.6749, + "grad_norm": 0.9185034036636353, + "learning_rate": 0.0009002362824888422, + "epoch": 0.3 + }, + { + "loss": 10.4313, + "grad_norm": 0.7888991832733154, + "learning_rate": 0.000899361162159797, + "epoch": 0.3 + }, + { + "loss": 10.4389, + "grad_norm": 0.9736090302467346, + "learning_rate": 0.0008984860418307517, + "epoch": 0.3 + }, + { + "loss": 9.9148, + "grad_norm": 0.7677895426750183, + "learning_rate": 0.0008976109215017065, + "epoch": 0.31 + }, + { + "loss": 9.7635, + "grad_norm": 0.9090219736099243, + "learning_rate": 0.0008967358011726613, + "epoch": 0.31 + }, + { + "loss": 10.0211, + "grad_norm": 0.7184523344039917, + "learning_rate": 0.000895860680843616, + "epoch": 0.31 + }, + { + "loss": 9.9932, + "grad_norm": 1.0859735012054443, + "learning_rate": 0.0008949855605145708, + "epoch": 0.31 + }, + { + "loss": 10.2804, + "grad_norm": 1.0252892971038818, + "learning_rate": 0.0008941104401855256, + "epoch": 0.32 + }, + { + "loss": 10.0543, + "grad_norm": 1.1707403659820557, + "learning_rate": 0.0008932353198564803, + "epoch": 0.32 + }, + { + "loss": 10.6658, + "grad_norm": 0.6616178750991821, + "learning_rate": 0.0008923601995274351, + "epoch": 0.32 + }, + { + "loss": 9.8623, + "grad_norm": 1.9947571754455566, + "learning_rate": 0.0008914850791983898, + "epoch": 0.33 + }, + { + "loss": 10.1607, + "grad_norm": 1.3363871574401855, + "learning_rate": 0.0008906099588693446, + "epoch": 0.33 + }, + { + "loss": 10.1063, + "grad_norm": 1.0039112567901611, + "learning_rate": 0.0008897348385402994, + "epoch": 0.33 + }, + { + "loss": 9.7059, + "grad_norm": 1.0225836038589478, + "learning_rate": 0.0008888597182112541, + "epoch": 0.33 + }, + { + "loss": 10.2506, + "grad_norm": 1.1005779504776, + "learning_rate": 0.0008879845978822089, + "epoch": 0.34 + }, + { + "loss": 10.3011, + "grad_norm": 1.1654433012008667, + "learning_rate": 0.0008871094775531636, + "epoch": 0.34 + }, + { + "loss": 10.088, + "grad_norm": 0.9155218601226807, + "learning_rate": 0.0008862343572241184, + "epoch": 0.34 + }, + { + "loss": 9.8835, + "grad_norm": 1.2090198993682861, + "learning_rate": 0.0008853592368950732, + "epoch": 0.34 + }, + { + "loss": 9.6644, + "grad_norm": 1.5198620557785034, + "learning_rate": 0.0008844841165660279, + "epoch": 0.35 + }, + { + "loss": 9.6799, + "grad_norm": 1.0043960809707642, + "learning_rate": 0.0008836089962369827, + "epoch": 0.35 + }, + { + "loss": 10.0658, + "grad_norm": 1.0404608249664307, + "learning_rate": 0.0008827338759079375, + "epoch": 0.35 + }, + { + "loss": 9.9551, + "grad_norm": 1.0412163734436035, + "learning_rate": 0.0008818587555788922, + "epoch": 0.35 + }, + { + "loss": 9.4082, + "grad_norm": 0.9032560586929321, + "learning_rate": 0.000880983635249847, + "epoch": 0.36 + }, + { + "loss": 10.2566, + "grad_norm": 1.2763034105300903, + "learning_rate": 0.0008801085149208016, + "epoch": 0.36 + }, + { + "loss": 9.8585, + "grad_norm": 0.8143719434738159, + "learning_rate": 0.0008792333945917563, + "epoch": 0.36 + }, + { + "loss": 9.5974, + "grad_norm": 1.3916654586791992, + "learning_rate": 0.000878358274262711, + "epoch": 0.36 + }, + { + "loss": 10.611, + "grad_norm": 1.2270894050598145, + "learning_rate": 0.0008774831539336658, + "epoch": 0.37 + }, + { + "loss": 9.4489, + "grad_norm": 1.339573621749878, + "learning_rate": 0.0008766080336046206, + "epoch": 0.37 + }, + { + "loss": 9.769, + "grad_norm": 1.023978352546692, + "learning_rate": 0.0008757329132755753, + "epoch": 0.37 + }, + { + "loss": 9.7854, + "grad_norm": 1.1513617038726807, + "learning_rate": 0.0008748577929465301, + "epoch": 0.38 + }, + { + "loss": 9.4378, + "grad_norm": 0.9918627142906189, + "learning_rate": 0.0008739826726174849, + "epoch": 0.38 + }, + { + "loss": 9.6902, + "grad_norm": 0.9365573525428772, + "learning_rate": 0.0008731075522884396, + "epoch": 0.38 + }, + { + "loss": 9.5533, + "grad_norm": 1.1697934865951538, + "learning_rate": 0.0008722324319593944, + "epoch": 0.38 + }, + { + "loss": 9.5204, + "grad_norm": 1.2257342338562012, + "learning_rate": 0.0008713573116303491, + "epoch": 0.39 + }, + { + "loss": 9.636, + "grad_norm": 1.0158884525299072, + "learning_rate": 0.0008704821913013039, + "epoch": 0.39 + }, + { + "loss": 9.8914, + "grad_norm": 1.4228135347366333, + "learning_rate": 0.0008696070709722587, + "epoch": 0.39 + }, + { + "loss": 9.3714, + "grad_norm": 1.2829135656356812, + "learning_rate": 0.0008687319506432134, + "epoch": 0.39 + }, + { + "loss": 9.7498, + "grad_norm": 1.2624573707580566, + "learning_rate": 0.0008678568303141682, + "epoch": 0.4 + }, + { + "loss": 9.8928, + "grad_norm": 1.3651659488677979, + "learning_rate": 0.000866981709985123, + "epoch": 0.4 + }, + { + "loss": 10.3697, + "grad_norm": 1.1383252143859863, + "learning_rate": 0.0008661065896560777, + "epoch": 0.4 + }, + { + "loss": 10.1876, + "grad_norm": 1.1688463687896729, + "learning_rate": 0.0008652314693270325, + "epoch": 0.4 + }, + { + "loss": 9.7974, + "grad_norm": 1.1377474069595337, + "learning_rate": 0.0008643563489979872, + "epoch": 0.41 + }, + { + "loss": 9.5742, + "grad_norm": 1.0107587575912476, + "learning_rate": 0.000863481228668942, + "epoch": 0.41 + }, + { + "loss": 9.9821, + "grad_norm": 1.3488329648971558, + "learning_rate": 0.0008626061083398968, + "epoch": 0.41 + }, + { + "loss": 9.3107, + "grad_norm": 1.0305010080337524, + "learning_rate": 0.0008617309880108515, + "epoch": 0.41 + }, + { + "loss": 9.3456, + "grad_norm": 0.8658286929130554, + "learning_rate": 0.0008608558676818063, + "epoch": 0.42 + }, + { + "loss": 9.3709, + "grad_norm": 1.1033709049224854, + "learning_rate": 0.000859980747352761, + "epoch": 0.42 + }, + { + "loss": 9.5077, + "grad_norm": 1.1051572561264038, + "learning_rate": 0.0008591056270237157, + "epoch": 0.42 + }, + { + "loss": 9.1458, + "grad_norm": 1.3423538208007812, + "learning_rate": 0.0008582305066946705, + "epoch": 0.43 + }, + { + "loss": 9.657, + "grad_norm": 1.1479153633117676, + "learning_rate": 0.0008573553863656252, + "epoch": 0.43 + }, + { + "loss": 10.5804, + "grad_norm": 1.1615872383117676, + "learning_rate": 0.00085648026603658, + "epoch": 0.43 + }, + { + "loss": 8.2792, + "grad_norm": 1.212221384048462, + "learning_rate": 0.0008556051457075347, + "epoch": 0.43 + }, + { + "loss": 9.3785, + "grad_norm": 1.0849367380142212, + "learning_rate": 0.0008547300253784895, + "epoch": 0.44 + }, + { + "loss": 9.4097, + "grad_norm": 1.119325041770935, + "learning_rate": 0.0008538549050494443, + "epoch": 0.44 + }, + { + "loss": 9.3308, + "grad_norm": 1.3356918096542358, + "learning_rate": 0.000852979784720399, + "epoch": 0.44 + }, + { + "loss": 9.4548, + "grad_norm": 0.9954844117164612, + "learning_rate": 0.0008521046643913538, + "epoch": 0.44 + }, + { + "loss": 8.9297, + "grad_norm": 0.8752724528312683, + "learning_rate": 0.0008512295440623086, + "epoch": 0.45 + }, + { + "loss": 9.1389, + "grad_norm": 1.2811753749847412, + "learning_rate": 0.0008503544237332633, + "epoch": 0.45 + }, + { + "loss": 9.3155, + "grad_norm": 1.253055453300476, + "learning_rate": 0.0008494793034042181, + "epoch": 0.45 + }, + { + "loss": 9.548, + "grad_norm": 1.2081260681152344, + "learning_rate": 0.0008486041830751728, + "epoch": 0.45 + }, + { + "loss": 9.0236, + "grad_norm": 1.3752362728118896, + "learning_rate": 0.0008477290627461276, + "epoch": 0.46 + }, + { + "loss": 9.0533, + "grad_norm": 1.057065725326538, + "learning_rate": 0.0008468539424170824, + "epoch": 0.46 + }, + { + "loss": 9.0675, + "grad_norm": 1.0036309957504272, + "learning_rate": 0.0008459788220880371, + "epoch": 0.46 + }, + { + "loss": 9.5195, + "grad_norm": 1.3881008625030518, + "learning_rate": 0.0008451037017589919, + "epoch": 0.46 + }, + { + "loss": 9.3519, + "grad_norm": 1.4355233907699585, + "learning_rate": 0.0008442285814299467, + "epoch": 0.47 + }, + { + "loss": 9.6383, + "grad_norm": 0.9438649415969849, + "learning_rate": 0.0008433534611009014, + "epoch": 0.47 + }, + { + "loss": 9.2643, + "grad_norm": 0.8599776029586792, + "learning_rate": 0.0008424783407718562, + "epoch": 0.47 + }, + { + "loss": 8.9869, + "grad_norm": 1.1090342998504639, + "learning_rate": 0.0008416032204428109, + "epoch": 0.48 + }, + { + "loss": 9.2475, + "grad_norm": 1.272929310798645, + "learning_rate": 0.0008407281001137657, + "epoch": 0.48 + }, + { + "loss": 9.5772, + "grad_norm": 0.9889743328094482, + "learning_rate": 0.0008398529797847205, + "epoch": 0.48 + }, + { + "loss": 9.9227, + "grad_norm": 1.2748692035675049, + "learning_rate": 0.0008389778594556752, + "epoch": 0.48 + }, + { + "loss": 9.9915, + "grad_norm": 1.4889165163040161, + "learning_rate": 0.00083810273912663, + "epoch": 0.49 + }, + { + "loss": 9.0012, + "grad_norm": 1.2172118425369263, + "learning_rate": 0.0008372276187975846, + "epoch": 0.49 + }, + { + "loss": 9.2968, + "grad_norm": 1.0313849449157715, + "learning_rate": 0.0008363524984685394, + "epoch": 0.49 + }, + { + "loss": 8.9158, + "grad_norm": 1.3325482606887817, + "learning_rate": 0.0008354773781394942, + "epoch": 0.49 + }, + { + "loss": 9.0097, + "grad_norm": 1.5407133102416992, + "learning_rate": 0.0008346022578104489, + "epoch": 0.5 + }, + { + "loss": 9.0166, + "grad_norm": 1.1565685272216797, + "learning_rate": 0.0008337271374814037, + "epoch": 0.5 + }, + { + "loss": 9.1856, + "grad_norm": 1.0405404567718506, + "learning_rate": 0.0008328520171523584, + "epoch": 0.5 + }, + { + "loss": 9.2405, + "grad_norm": 1.465058445930481, + "learning_rate": 0.0008319768968233132, + "epoch": 0.5 + }, + { + "loss": 8.835, + "grad_norm": 0.9321463704109192, + "learning_rate": 0.000831101776494268, + "epoch": 0.51 + }, + { + "loss": 9.4076, + "grad_norm": 1.1780034303665161, + "learning_rate": 0.0008302266561652227, + "epoch": 0.51 + }, + { + "loss": 9.5994, + "grad_norm": 1.488897681236267, + "learning_rate": 0.0008293515358361775, + "epoch": 0.51 + }, + { + "loss": 8.6378, + "grad_norm": 1.0508447885513306, + "learning_rate": 0.0008284764155071323, + "epoch": 0.51 + }, + { + "loss": 8.7946, + "grad_norm": 1.2236040830612183, + "learning_rate": 0.000827601295178087, + "epoch": 0.52 + }, + { + "loss": 9.4619, + "grad_norm": 1.0602221488952637, + "learning_rate": 0.0008267261748490418, + "epoch": 0.52 + }, + { + "loss": 8.927, + "grad_norm": 1.476576328277588, + "learning_rate": 0.0008258510545199965, + "epoch": 0.52 + }, + { + "loss": 8.766, + "grad_norm": 1.2723809480667114, + "learning_rate": 0.0008249759341909513, + "epoch": 0.52 + }, + { + "loss": 9.1577, + "grad_norm": 1.2955093383789062, + "learning_rate": 0.0008241008138619061, + "epoch": 0.53 + }, + { + "loss": 8.8254, + "grad_norm": 1.1421802043914795, + "learning_rate": 0.0008232256935328608, + "epoch": 0.53 + }, + { + "loss": 9.3559, + "grad_norm": 1.2015204429626465, + "learning_rate": 0.0008223505732038156, + "epoch": 0.53 + }, + { + "loss": 8.7055, + "grad_norm": 1.02347993850708, + "learning_rate": 0.0008214754528747703, + "epoch": 0.54 + }, + { + "loss": 9.1773, + "grad_norm": 1.0733789205551147, + "learning_rate": 0.0008206003325457251, + "epoch": 0.54 + }, + { + "loss": 9.4909, + "grad_norm": 1.140329360961914, + "learning_rate": 0.0008197252122166799, + "epoch": 0.54 + }, + { + "loss": 8.4982, + "grad_norm": 0.8933946490287781, + "learning_rate": 0.0008188500918876346, + "epoch": 0.54 + }, + { + "loss": 9.4497, + "grad_norm": 1.3848881721496582, + "learning_rate": 0.0008179749715585894, + "epoch": 0.55 + }, + { + "loss": 9.5758, + "grad_norm": 1.175162672996521, + "learning_rate": 0.0008170998512295442, + "epoch": 0.55 + }, + { + "loss": 9.5138, + "grad_norm": 1.1983882188796997, + "learning_rate": 0.0008162247309004989, + "epoch": 0.55 + }, + { + "loss": 9.0283, + "grad_norm": 0.9055472612380981, + "learning_rate": 0.0008153496105714536, + "epoch": 0.55 + }, + { + "loss": 9.2822, + "grad_norm": 0.8885380029678345, + "learning_rate": 0.0008144744902424083, + "epoch": 0.56 + }, + { + "loss": 8.9084, + "grad_norm": 1.0463942289352417, + "learning_rate": 0.0008135993699133631, + "epoch": 0.56 + }, + { + "loss": 9.0612, + "grad_norm": 1.1517601013183594, + "learning_rate": 0.0008127242495843179, + "epoch": 0.56 + }, + { + "loss": 9.7954, + "grad_norm": 1.6062026023864746, + "learning_rate": 0.0008118491292552726, + "epoch": 0.56 + }, + { + "loss": 8.823, + "grad_norm": 1.079883098602295, + "learning_rate": 0.0008109740089262274, + "epoch": 0.57 + }, + { + "loss": 8.6287, + "grad_norm": 0.8593969345092773, + "learning_rate": 0.0008100988885971821, + "epoch": 0.57 + }, + { + "loss": 9.046, + "grad_norm": 1.5058172941207886, + "learning_rate": 0.0008092237682681369, + "epoch": 0.57 + }, + { + "loss": 8.4422, + "grad_norm": 1.0326484441757202, + "learning_rate": 0.0008083486479390917, + "epoch": 0.57 + }, + { + "loss": 9.5016, + "grad_norm": 0.9177812337875366, + "learning_rate": 0.0008074735276100464, + "epoch": 0.58 + }, + { + "loss": 8.4734, + "grad_norm": 1.1267443895339966, + "learning_rate": 0.0008065984072810012, + "epoch": 0.58 + }, + { + "loss": 8.5878, + "grad_norm": 0.9788813591003418, + "learning_rate": 0.000805723286951956, + "epoch": 0.58 + }, + { + "loss": 9.1188, + "grad_norm": 1.1300309896469116, + "learning_rate": 0.0008048481666229107, + "epoch": 0.59 + }, + { + "loss": 8.7167, + "grad_norm": 0.9951778650283813, + "learning_rate": 0.0008039730462938655, + "epoch": 0.59 + }, + { + "loss": 9.1088, + "grad_norm": 0.9415300488471985, + "learning_rate": 0.0008030979259648202, + "epoch": 0.59 + }, + { + "loss": 8.4083, + "grad_norm": 0.990203320980072, + "learning_rate": 0.000802222805635775, + "epoch": 0.59 + }, + { + "loss": 8.4926, + "grad_norm": 1.0430456399917603, + "learning_rate": 0.0008013476853067298, + "epoch": 0.6 + }, + { + "loss": 9.3307, + "grad_norm": 0.9623116254806519, + "learning_rate": 0.0008004725649776845, + "epoch": 0.6 + }, + { + "loss": 8.8633, + "grad_norm": 1.0354257822036743, + "learning_rate": 0.0007995974446486392, + "epoch": 0.6 + }, + { + "loss": 8.7932, + "grad_norm": 1.1962673664093018, + "learning_rate": 0.0007987223243195939, + "epoch": 0.6 + }, + { + "loss": 8.4265, + "grad_norm": 1.0186195373535156, + "learning_rate": 0.0007978472039905487, + "epoch": 0.61 + }, + { + "loss": 8.4596, + "grad_norm": 1.2448772192001343, + "learning_rate": 0.0007969720836615035, + "epoch": 0.61 + }, + { + "loss": 9.0019, + "grad_norm": 1.11643385887146, + "learning_rate": 0.0007960969633324582, + "epoch": 0.61 + }, + { + "loss": 8.7469, + "grad_norm": 1.9622658491134644, + "learning_rate": 0.000795221843003413, + "epoch": 0.61 + }, + { + "loss": 8.208, + "grad_norm": 0.9547304511070251, + "learning_rate": 0.0007943467226743676, + "epoch": 0.62 + }, + { + "loss": 8.3751, + "grad_norm": 0.8313985466957092, + "learning_rate": 0.0007934716023453224, + "epoch": 0.62 + }, + { + "loss": 8.6238, + "grad_norm": 0.9323874711990356, + "learning_rate": 0.0007925964820162772, + "epoch": 0.62 + }, + { + "loss": 9.0078, + "grad_norm": 1.0662554502487183, + "learning_rate": 0.0007917213616872319, + "epoch": 0.62 + }, + { + "loss": 8.7407, + "grad_norm": 1.197045087814331, + "learning_rate": 0.0007908462413581867, + "epoch": 0.63 + }, + { + "loss": 8.9698, + "grad_norm": 1.0494697093963623, + "learning_rate": 0.0007899711210291415, + "epoch": 0.63 + }, + { + "loss": 8.56, + "grad_norm": 0.9860395789146423, + "learning_rate": 0.0007890960007000962, + "epoch": 0.63 + }, + { + "loss": 8.624, + "grad_norm": 0.8026842474937439, + "learning_rate": 0.000788220880371051, + "epoch": 0.64 + }, + { + "loss": 9.1911, + "grad_norm": 1.0249046087265015, + "learning_rate": 0.0007873457600420057, + "epoch": 0.64 + }, + { + "loss": 8.552, + "grad_norm": 1.3037137985229492, + "learning_rate": 0.0007864706397129605, + "epoch": 0.64 + }, + { + "loss": 8.6872, + "grad_norm": 1.1018158197402954, + "learning_rate": 0.0007855955193839153, + "epoch": 0.64 + }, + { + "loss": 8.5007, + "grad_norm": 0.9974724054336548, + "learning_rate": 0.00078472039905487, + "epoch": 0.65 + }, + { + "loss": 9.3866, + "grad_norm": 1.2537139654159546, + "learning_rate": 0.0007838452787258248, + "epoch": 0.65 + }, + { + "loss": 8.9869, + "grad_norm": 1.2758492231369019, + "learning_rate": 0.0007829701583967795, + "epoch": 0.65 + }, + { + "loss": 8.266, + "grad_norm": 0.9684768915176392, + "learning_rate": 0.0007820950380677343, + "epoch": 0.65 + }, + { + "loss": 9.0718, + "grad_norm": 1.0212547779083252, + "learning_rate": 0.0007812199177386891, + "epoch": 0.66 + }, + { + "loss": 8.1438, + "grad_norm": 1.2493318319320679, + "learning_rate": 0.0007803447974096438, + "epoch": 0.66 + }, + { + "loss": 8.4132, + "grad_norm": 0.8168124556541443, + "learning_rate": 0.0007794696770805986, + "epoch": 0.66 + }, + { + "loss": 8.4466, + "grad_norm": 1.2837003469467163, + "learning_rate": 0.0007785945567515534, + "epoch": 0.66 + }, + { + "loss": 8.6008, + "grad_norm": 1.1589733362197876, + "learning_rate": 0.0007777194364225081, + "epoch": 0.67 + }, + { + "loss": 8.7002, + "grad_norm": 1.036216378211975, + "learning_rate": 0.0007768443160934629, + "epoch": 0.67 + }, + { + "loss": 8.9616, + "grad_norm": 0.9488565921783447, + "learning_rate": 0.0007759691957644176, + "epoch": 0.67 + }, + { + "loss": 8.9011, + "grad_norm": 1.1349655389785767, + "learning_rate": 0.0007750940754353724, + "epoch": 0.67 + }, + { + "loss": 8.7398, + "grad_norm": 1.3466508388519287, + "learning_rate": 0.0007742189551063272, + "epoch": 0.68 + }, + { + "loss": 8.1787, + "grad_norm": 1.1343966722488403, + "learning_rate": 0.0007733438347772819, + "epoch": 0.68 + }, + { + "loss": 8.4513, + "grad_norm": 0.9983484148979187, + "learning_rate": 0.0007724687144482366, + "epoch": 0.68 + }, + { + "loss": 8.6249, + "grad_norm": 1.4816855192184448, + "learning_rate": 0.0007715935941191913, + "epoch": 0.69 + }, + { + "loss": 8.9094, + "grad_norm": 1.0790578126907349, + "learning_rate": 0.0007707184737901461, + "epoch": 0.69 + }, + { + "loss": 8.0177, + "grad_norm": 1.2572119235992432, + "learning_rate": 0.0007698433534611009, + "epoch": 0.69 + }, + { + "loss": 8.5014, + "grad_norm": 1.123079776763916, + "learning_rate": 0.0007689682331320556, + "epoch": 0.69 + }, + { + "loss": 8.2177, + "grad_norm": 0.8789654970169067, + "learning_rate": 0.0007680931128030104, + "epoch": 0.7 + }, + { + "loss": 8.3753, + "grad_norm": 0.9512013792991638, + "learning_rate": 0.0007672179924739651, + "epoch": 0.7 + }, + { + "loss": 8.5434, + "grad_norm": 1.929919719696045, + "learning_rate": 0.0007663428721449199, + "epoch": 0.7 + }, + { + "loss": 8.5505, + "grad_norm": 1.1756147146224976, + "learning_rate": 0.0007654677518158747, + "epoch": 0.7 + }, + { + "loss": 8.8823, + "grad_norm": 1.1833679676055908, + "learning_rate": 0.0007645926314868294, + "epoch": 0.71 + }, + { + "loss": 8.6715, + "grad_norm": 1.4701839685440063, + "learning_rate": 0.0007637175111577842, + "epoch": 0.71 + }, + { + "loss": 8.7559, + "grad_norm": 0.9352959990501404, + "learning_rate": 0.0007629299028616435, + "epoch": 0.71 + }, + { + "loss": 9.5594, + "grad_norm": 1.0391898155212402, + "learning_rate": 0.0007620547825325983, + "epoch": 0.71 + }, + { + "loss": 8.3431, + "grad_norm": 1.0766905546188354, + "learning_rate": 0.000761179662203553, + "epoch": 0.72 + }, + { + "loss": 8.3928, + "grad_norm": 1.10299551486969, + "learning_rate": 0.0007603045418745078, + "epoch": 0.72 + }, + { + "loss": 8.9913, + "grad_norm": 1.1581339836120605, + "learning_rate": 0.0007594294215454624, + "epoch": 0.72 + }, + { + "loss": 8.5142, + "grad_norm": 1.086441993713379, + "learning_rate": 0.0007585543012164172, + "epoch": 0.72 + }, + { + "loss": 8.7005, + "grad_norm": 0.9478667974472046, + "learning_rate": 0.000757679180887372, + "epoch": 0.73 + }, + { + "loss": 8.608, + "grad_norm": 1.0929220914840698, + "learning_rate": 0.0007568040605583267, + "epoch": 0.73 + }, + { + "loss": 8.1125, + "grad_norm": 1.217629313468933, + "learning_rate": 0.0007559289402292815, + "epoch": 0.73 + }, + { + "loss": 8.4331, + "grad_norm": 1.2786823511123657, + "learning_rate": 0.0007550538199002362, + "epoch": 0.73 + }, + { + "loss": 9.1985, + "grad_norm": 1.0184354782104492, + "learning_rate": 0.000754178699571191, + "epoch": 0.74 + }, + { + "loss": 8.6549, + "grad_norm": 0.93660968542099, + "learning_rate": 0.0007533035792421458, + "epoch": 0.74 + }, + { + "loss": 8.7819, + "grad_norm": 1.0092636346817017, + "learning_rate": 0.0007524284589131005, + "epoch": 0.74 + }, + { + "loss": 8.3759, + "grad_norm": 1.2108792066574097, + "learning_rate": 0.0007515533385840553, + "epoch": 0.75 + }, + { + "loss": 8.4973, + "grad_norm": 0.9994498491287231, + "learning_rate": 0.00075067821825501, + "epoch": 0.75 + }, + { + "loss": 8.3731, + "grad_norm": 1.153273344039917, + "learning_rate": 0.0007498030979259648, + "epoch": 0.75 + }, + { + "loss": 8.4148, + "grad_norm": 1.051223874092102, + "learning_rate": 0.0007489279775969196, + "epoch": 0.75 + }, + { + "loss": 8.6672, + "grad_norm": 1.4810237884521484, + "learning_rate": 0.0007480528572678743, + "epoch": 0.76 + }, + { + "loss": 8.6439, + "grad_norm": 1.021606206893921, + "learning_rate": 0.0007471777369388291, + "epoch": 0.76 + }, + { + "loss": 8.7591, + "grad_norm": 0.8680776357650757, + "learning_rate": 0.0007463026166097839, + "epoch": 0.76 + }, + { + "loss": 9.0187, + "grad_norm": 1.0177042484283447, + "learning_rate": 0.0007454274962807386, + "epoch": 0.76 + }, + { + "loss": 8.9481, + "grad_norm": 1.2384392023086548, + "learning_rate": 0.0007445523759516934, + "epoch": 0.77 + }, + { + "loss": 8.6184, + "grad_norm": 1.3748959302902222, + "learning_rate": 0.0007436772556226481, + "epoch": 0.77 + }, + { + "loss": 8.3906, + "grad_norm": 1.042493462562561, + "learning_rate": 0.0007428021352936029, + "epoch": 0.77 + }, + { + "loss": 9.3308, + "grad_norm": 1.0647776126861572, + "learning_rate": 0.0007419270149645576, + "epoch": 0.77 + }, + { + "loss": 8.332, + "grad_norm": 1.2385993003845215, + "learning_rate": 0.0007410518946355123, + "epoch": 0.78 + }, + { + "loss": 8.3127, + "grad_norm": 1.0191227197647095, + "learning_rate": 0.0007401767743064671, + "epoch": 0.78 + }, + { + "loss": 8.3151, + "grad_norm": 0.8735216856002808, + "learning_rate": 0.0007393016539774218, + "epoch": 0.78 + }, + { + "loss": 8.701, + "grad_norm": 1.202993392944336, + "learning_rate": 0.0007384265336483766, + "epoch": 0.78 + }, + { + "loss": 7.8262, + "grad_norm": 0.9682905673980713, + "learning_rate": 0.0007375514133193314, + "epoch": 0.79 + }, + { + "loss": 8.4729, + "grad_norm": 1.2290154695510864, + "learning_rate": 0.0007366762929902861, + "epoch": 0.79 + }, + { + "loss": 8.9253, + "grad_norm": 1.0369175672531128, + "learning_rate": 0.0007358011726612409, + "epoch": 0.79 + }, + { + "loss": 9.2036, + "grad_norm": 1.0748445987701416, + "learning_rate": 0.0007349260523321957, + "epoch": 0.8 + }, + { + "loss": 8.2364, + "grad_norm": 1.147964596748352, + "learning_rate": 0.0007340509320031504, + "epoch": 0.8 + }, + { + "loss": 9.006, + "grad_norm": 1.0363622903823853, + "learning_rate": 0.0007331758116741052, + "epoch": 0.8 + }, + { + "loss": 8.7969, + "grad_norm": 1.2576889991760254, + "learning_rate": 0.0007323006913450599, + "epoch": 0.8 + }, + { + "loss": 8.4052, + "grad_norm": 1.1075588464736938, + "learning_rate": 0.0007314255710160147, + "epoch": 0.81 + }, + { + "loss": 8.5912, + "grad_norm": 1.0697672367095947, + "learning_rate": 0.0007305504506869695, + "epoch": 0.81 + }, + { + "loss": 8.7837, + "grad_norm": 1.0865002870559692, + "learning_rate": 0.0007296753303579242, + "epoch": 0.81 + }, + { + "loss": 8.0798, + "grad_norm": 1.3645957708358765, + "learning_rate": 0.000728800210028879, + "epoch": 0.81 + }, + { + "loss": 8.2649, + "grad_norm": 1.0889688730239868, + "learning_rate": 0.0007279250896998337, + "epoch": 0.82 + }, + { + "loss": 7.902, + "grad_norm": 0.9943633675575256, + "learning_rate": 0.0007270499693707885, + "epoch": 0.82 + }, + { + "loss": 8.493, + "grad_norm": 1.3548861742019653, + "learning_rate": 0.0007261748490417433, + "epoch": 0.82 + }, + { + "loss": 9.2024, + "grad_norm": 1.1603728532791138, + "learning_rate": 0.000725299728712698, + "epoch": 0.82 + }, + { + "loss": 8.7272, + "grad_norm": 1.2872350215911865, + "learning_rate": 0.0007244246083836528, + "epoch": 0.83 + }, + { + "loss": 8.8292, + "grad_norm": 1.0431410074234009, + "learning_rate": 0.0007235494880546076, + "epoch": 0.83 + }, + { + "loss": 8.0473, + "grad_norm": 0.9648978114128113, + "learning_rate": 0.0007226743677255623, + "epoch": 0.83 + }, + { + "loss": 8.134, + "grad_norm": 0.8962783217430115, + "learning_rate": 0.0007217992473965171, + "epoch": 0.83 + }, + { + "loss": 8.2796, + "grad_norm": 0.8879069685935974, + "learning_rate": 0.0007209241270674718, + "epoch": 0.84 + }, + { + "loss": 8.6275, + "grad_norm": 1.0046008825302124, + "learning_rate": 0.0007200490067384265, + "epoch": 0.84 + }, + { + "loss": 8.2847, + "grad_norm": 1.1034067869186401, + "learning_rate": 0.0007191738864093813, + "epoch": 0.84 + }, + { + "loss": 8.723, + "grad_norm": 0.9179050326347351, + "learning_rate": 0.000718298766080336, + "epoch": 0.85 + }, + { + "loss": 8.2843, + "grad_norm": 1.0402296781539917, + "learning_rate": 0.0007174236457512908, + "epoch": 0.85 + }, + { + "loss": 8.2487, + "grad_norm": 1.2751373052597046, + "learning_rate": 0.0007165485254222455, + "epoch": 0.85 + }, + { + "loss": 8.3491, + "grad_norm": 0.8596373200416565, + "learning_rate": 0.0007156734050932003, + "epoch": 0.85 + }, + { + "loss": 8.4695, + "grad_norm": 1.0553058385849, + "learning_rate": 0.0007147982847641551, + "epoch": 0.86 + }, + { + "loss": 8.74, + "grad_norm": 1.0505644083023071, + "learning_rate": 0.0007139231644351098, + "epoch": 0.86 + }, + { + "loss": 8.3704, + "grad_norm": 1.4136569499969482, + "learning_rate": 0.0007130480441060646, + "epoch": 0.86 + }, + { + "loss": 7.9998, + "grad_norm": 0.9397268295288086, + "learning_rate": 0.0007121729237770194, + "epoch": 0.86 + }, + { + "loss": 8.5978, + "grad_norm": 1.1479915380477905, + "learning_rate": 0.0007112978034479741, + "epoch": 0.87 + }, + { + "loss": 8.6225, + "grad_norm": 1.0489866733551025, + "learning_rate": 0.0007104226831189289, + "epoch": 0.87 + }, + { + "loss": 8.3155, + "grad_norm": 0.9371022582054138, + "learning_rate": 0.0007095475627898836, + "epoch": 0.87 + }, + { + "loss": 8.3844, + "grad_norm": 1.1981381177902222, + "learning_rate": 0.0007086724424608384, + "epoch": 0.87 + }, + { + "loss": 8.5061, + "grad_norm": 0.8924277424812317, + "learning_rate": 0.0007077973221317932, + "epoch": 0.88 + }, + { + "loss": 8.1918, + "grad_norm": 1.4077969789505005, + "learning_rate": 0.0007069222018027479, + "epoch": 0.88 + }, + { + "loss": 8.3377, + "grad_norm": 1.1926066875457764, + "learning_rate": 0.0007060470814737027, + "epoch": 0.88 + }, + { + "loss": 8.4682, + "grad_norm": 1.1524171829223633, + "learning_rate": 0.0007051719611446574, + "epoch": 0.88 + }, + { + "loss": 8.5678, + "grad_norm": 1.0660207271575928, + "learning_rate": 0.0007042968408156122, + "epoch": 0.89 + }, + { + "loss": 7.9908, + "grad_norm": 1.1786776781082153, + "learning_rate": 0.000703421720486567, + "epoch": 0.89 + }, + { + "loss": 9.0339, + "grad_norm": 0.9970653057098389, + "learning_rate": 0.0007025466001575217, + "epoch": 0.89 + }, + { + "loss": 8.6511, + "grad_norm": 1.171247124671936, + "learning_rate": 0.0007016714798284765, + "epoch": 0.9 + }, + { + "loss": 8.0249, + "grad_norm": 1.1036537885665894, + "learning_rate": 0.0007007963594994313, + "epoch": 0.9 + }, + { + "loss": 8.2895, + "grad_norm": 1.4363912343978882, + "learning_rate": 0.000699921239170386, + "epoch": 0.9 + }, + { + "loss": 8.4263, + "grad_norm": 1.2977561950683594, + "learning_rate": 0.0006990461188413408, + "epoch": 0.9 + }, + { + "loss": 8.3236, + "grad_norm": 1.2732399702072144, + "learning_rate": 0.0006981709985122954, + "epoch": 0.91 + }, + { + "loss": 8.0876, + "grad_norm": 0.8092446327209473, + "learning_rate": 0.0006972958781832502, + "epoch": 0.91 + }, + { + "loss": 8.3052, + "grad_norm": 1.0607753992080688, + "learning_rate": 0.000696420757854205, + "epoch": 0.91 + }, + { + "loss": 8.2821, + "grad_norm": 1.2833763360977173, + "learning_rate": 0.0006955456375251597, + "epoch": 0.91 + }, + { + "loss": 8.0437, + "grad_norm": 1.2291605472564697, + "learning_rate": 0.0006946705171961145, + "epoch": 0.92 + }, + { + "loss": 7.9172, + "grad_norm": 0.9950680732727051, + "learning_rate": 0.0006937953968670692, + "epoch": 0.92 + }, + { + "loss": 7.8579, + "grad_norm": 1.170876145362854, + "learning_rate": 0.000692920276538024, + "epoch": 0.92 + }, + { + "loss": 8.7343, + "grad_norm": 1.0266340970993042, + "learning_rate": 0.0006920451562089788, + "epoch": 0.92 + }, + { + "loss": 8.3685, + "grad_norm": 1.1194366216659546, + "learning_rate": 0.0006911700358799335, + "epoch": 0.93 + }, + { + "loss": 8.8983, + "grad_norm": 1.130362868309021, + "learning_rate": 0.0006902949155508883, + "epoch": 0.93 + }, + { + "loss": 8.3624, + "grad_norm": 1.2582019567489624, + "learning_rate": 0.000689419795221843, + "epoch": 0.93 + }, + { + "loss": 8.5332, + "grad_norm": 1.0985493659973145, + "learning_rate": 0.0006885446748927978, + "epoch": 0.93 + }, + { + "loss": 8.263, + "grad_norm": 1.0480501651763916, + "learning_rate": 0.0006876695545637526, + "epoch": 0.94 + }, + { + "loss": 8.1911, + "grad_norm": 1.085471510887146, + "learning_rate": 0.0006867944342347073, + "epoch": 0.94 + }, + { + "loss": 8.6767, + "grad_norm": 1.109959602355957, + "learning_rate": 0.0006859193139056621, + "epoch": 0.94 + }, + { + "loss": 8.1904, + "grad_norm": 0.9299295544624329, + "learning_rate": 0.0006850441935766169, + "epoch": 0.94 + }, + { + "loss": 7.9858, + "grad_norm": 1.3819242715835571, + "learning_rate": 0.0006841690732475716, + "epoch": 0.95 + }, + { + "loss": 8.3134, + "grad_norm": 1.499324083328247, + "learning_rate": 0.0006832939529185264, + "epoch": 0.95 + }, + { + "loss": 8.1389, + "grad_norm": 1.0068879127502441, + "learning_rate": 0.0006824188325894811, + "epoch": 0.95 + }, + { + "loss": 8.0979, + "grad_norm": 1.232861876487732, + "learning_rate": 0.0006815437122604359, + "epoch": 0.96 + }, + { + "loss": 8.1456, + "grad_norm": 1.020922064781189, + "learning_rate": 0.0006806685919313907, + "epoch": 0.96 + }, + { + "loss": 8.1438, + "grad_norm": 1.2880629301071167, + "learning_rate": 0.0006797934716023453, + "epoch": 0.96 + }, + { + "loss": 7.8589, + "grad_norm": 1.2720872163772583, + "learning_rate": 0.0006789183512733001, + "epoch": 0.96 + }, + { + "loss": 8.338, + "grad_norm": 1.1569981575012207, + "learning_rate": 0.0006780432309442548, + "epoch": 0.97 + }, + { + "loss": 7.6167, + "grad_norm": 1.0755385160446167, + "learning_rate": 0.0006771681106152095, + "epoch": 0.97 + }, + { + "loss": 9.1889, + "grad_norm": 1.1371173858642578, + "learning_rate": 0.0006762929902861643, + "epoch": 0.97 + }, + { + "loss": 8.1603, + "grad_norm": 1.2543790340423584, + "learning_rate": 0.000675417869957119, + "epoch": 0.97 + }, + { + "loss": 8.1684, + "grad_norm": 1.665987491607666, + "learning_rate": 0.0006745427496280738, + "epoch": 0.98 + }, + { + "loss": 8.4957, + "grad_norm": 1.1479765176773071, + "learning_rate": 0.0006736676292990285, + "epoch": 0.98 + }, + { + "loss": 7.998, + "grad_norm": 1.1416277885437012, + "learning_rate": 0.0006727925089699833, + "epoch": 0.98 + }, + { + "loss": 8.4458, + "grad_norm": 1.2610832452774048, + "learning_rate": 0.0006719173886409381, + "epoch": 0.98 + }, + { + "loss": 8.2715, + "grad_norm": 1.2478748559951782, + "learning_rate": 0.0006710422683118928, + "epoch": 0.99 + }, + { + "loss": 8.0882, + "grad_norm": 0.9021313190460205, + "learning_rate": 0.0006701671479828476, + "epoch": 0.99 + }, + { + "loss": 8.2404, + "grad_norm": 1.0023951530456543, + "learning_rate": 0.0006692920276538024, + "epoch": 0.99 + }, + { + "loss": 8.681, + "grad_norm": 1.3342375755310059, + "learning_rate": 0.0006684169073247571, + "epoch": 0.99 + }, + { + "loss": 8.024, + "grad_norm": 1.0199118852615356, + "learning_rate": 0.0006675417869957119, + "epoch": 1.0 + }, + { + "loss": 8.3688, + "grad_norm": 0.893786609172821, + "learning_rate": 0.0006666666666666666, + "epoch": 1.0 + }, + { + "loss": 8.0561, + "grad_norm": 1.2774296998977661, + "learning_rate": 0.0006657915463376214, + "epoch": 1.0 + }, + { + "loss": 7.8444, + "grad_norm": 1.0824223756790161, + "learning_rate": 0.0006649164260085762, + "epoch": 1.01 + }, + { + "loss": 8.1771, + "grad_norm": 0.869452178478241, + "learning_rate": 0.0006640413056795309, + "epoch": 1.01 + }, + { + "loss": 7.6838, + "grad_norm": 1.1132241487503052, + "learning_rate": 0.0006631661853504857, + "epoch": 1.01 + }, + { + "loss": 7.9475, + "grad_norm": 1.2853749990463257, + "learning_rate": 0.0006622910650214405, + "epoch": 1.01 + }, + { + "loss": 8.8546, + "grad_norm": 1.2339048385620117, + "learning_rate": 0.0006614159446923952, + "epoch": 1.02 + }, + { + "loss": 8.1339, + "grad_norm": 1.2211487293243408, + "learning_rate": 0.00066054082436335, + "epoch": 1.02 + }, + { + "loss": 7.402, + "grad_norm": 1.0966975688934326, + "learning_rate": 0.0006596657040343047, + "epoch": 1.02 + }, + { + "loss": 8.1777, + "grad_norm": 1.0253325700759888, + "learning_rate": 0.0006587905837052595, + "epoch": 1.02 + }, + { + "loss": 8.2748, + "grad_norm": 1.2987836599349976, + "learning_rate": 0.0006579154633762143, + "epoch": 1.03 + }, + { + "loss": 8.3225, + "grad_norm": 0.945371687412262, + "learning_rate": 0.000657040343047169, + "epoch": 1.03 + }, + { + "loss": 8.4416, + "grad_norm": 1.0868079662322998, + "learning_rate": 0.0006561652227181238, + "epoch": 1.03 + }, + { + "loss": 8.1007, + "grad_norm": 1.0190479755401611, + "learning_rate": 0.0006552901023890784, + "epoch": 1.03 + }, + { + "loss": 8.1317, + "grad_norm": 1.0896625518798828, + "learning_rate": 0.0006544149820600332, + "epoch": 1.04 + }, + { + "loss": 7.7364, + "grad_norm": 1.1690502166748047, + "learning_rate": 0.000653539861730988, + "epoch": 1.04 + }, + { + "loss": 7.8173, + "grad_norm": 1.0521645545959473, + "learning_rate": 0.0006526647414019427, + "epoch": 1.04 + }, + { + "loss": 7.6212, + "grad_norm": 1.3057899475097656, + "learning_rate": 0.0006517896210728975, + "epoch": 1.04 + }, + { + "loss": 8.0228, + "grad_norm": 0.968885064125061, + "learning_rate": 0.0006509145007438522, + "epoch": 1.05 + }, + { + "loss": 7.8535, + "grad_norm": 1.1838873624801636, + "learning_rate": 0.000650039380414807, + "epoch": 1.05 + }, + { + "loss": 8.1991, + "grad_norm": 1.0967016220092773, + "learning_rate": 0.0006491642600857618, + "epoch": 1.05 + }, + { + "loss": 8.1515, + "grad_norm": 1.0798629522323608, + "learning_rate": 0.0006482891397567165, + "epoch": 1.06 + }, + { + "loss": 8.291, + "grad_norm": 1.1506596803665161, + "learning_rate": 0.0006474140194276713, + "epoch": 1.06 + }, + { + "loss": 7.956, + "grad_norm": 1.0459505319595337, + "learning_rate": 0.0006465388990986261, + "epoch": 1.06 + }, + { + "loss": 8.4393, + "grad_norm": 1.070776343345642, + "learning_rate": 0.0006456637787695808, + "epoch": 1.06 + }, + { + "loss": 8.5445, + "grad_norm": 1.3064284324645996, + "learning_rate": 0.0006447886584405356, + "epoch": 1.07 + }, + { + "loss": 8.701, + "grad_norm": 1.0707839727401733, + "learning_rate": 0.0006439135381114903, + "epoch": 1.07 + }, + { + "loss": 7.4342, + "grad_norm": 1.123377799987793, + "learning_rate": 0.0006430384177824451, + "epoch": 1.07 + }, + { + "loss": 8.4883, + "grad_norm": 1.7230886220932007, + "learning_rate": 0.0006421632974533999, + "epoch": 1.07 + }, + { + "loss": 8.5288, + "grad_norm": 0.9721227288246155, + "learning_rate": 0.0006412881771243546, + "epoch": 1.08 + }, + { + "loss": 7.8249, + "grad_norm": 1.2729851007461548, + "learning_rate": 0.0006404130567953094, + "epoch": 1.08 + }, + { + "loss": 8.3277, + "grad_norm": 0.9693044424057007, + "learning_rate": 0.0006395379364662642, + "epoch": 1.08 + }, + { + "loss": 7.8798, + "grad_norm": 1.104020118713379, + "learning_rate": 0.0006386628161372189, + "epoch": 1.08 + }, + { + "loss": 7.899, + "grad_norm": 1.0556141138076782, + "learning_rate": 0.0006377876958081737, + "epoch": 1.09 + }, + { + "loss": 8.6403, + "grad_norm": 1.227303147315979, + "learning_rate": 0.0006369125754791284, + "epoch": 1.09 + }, + { + "loss": 8.7407, + "grad_norm": 1.2486103773117065, + "learning_rate": 0.0006360374551500832, + "epoch": 1.09 + }, + { + "loss": 8.226, + "grad_norm": 1.1452488899230957, + "learning_rate": 0.000635162334821038, + "epoch": 1.09 + }, + { + "loss": 8.5083, + "grad_norm": 1.466182827949524, + "learning_rate": 0.0006342872144919927, + "epoch": 1.1 + }, + { + "loss": 7.8041, + "grad_norm": 1.2693302631378174, + "learning_rate": 0.0006334120941629474, + "epoch": 1.1 + }, + { + "loss": 7.918, + "grad_norm": 1.1236190795898438, + "learning_rate": 0.0006325369738339021, + "epoch": 1.1 + }, + { + "loss": 7.8792, + "grad_norm": 0.9166776537895203, + "learning_rate": 0.0006316618535048569, + "epoch": 1.11 + }, + { + "loss": 8.3714, + "grad_norm": 1.2021427154541016, + "learning_rate": 0.0006307867331758117, + "epoch": 1.11 + }, + { + "loss": 8.5282, + "grad_norm": 1.1508140563964844, + "learning_rate": 0.0006299116128467664, + "epoch": 1.11 + }, + { + "loss": 7.7235, + "grad_norm": 1.044027328491211, + "learning_rate": 0.0006290364925177212, + "epoch": 1.11 + }, + { + "loss": 8.0483, + "grad_norm": 1.00051748752594, + "learning_rate": 0.000628161372188676, + "epoch": 1.12 + }, + { + "loss": 8.0003, + "grad_norm": 1.0397716760635376, + "learning_rate": 0.0006272862518596307, + "epoch": 1.12 + }, + { + "loss": 8.274, + "grad_norm": 1.0577192306518555, + "learning_rate": 0.0006264111315305855, + "epoch": 1.12 + }, + { + "loss": 7.8435, + "grad_norm": 1.1829681396484375, + "learning_rate": 0.0006255360112015402, + "epoch": 1.12 + }, + { + "loss": 8.5019, + "grad_norm": 1.9353641271591187, + "learning_rate": 0.000624660890872495, + "epoch": 1.13 + }, + { + "loss": 8.4582, + "grad_norm": 1.237269639968872, + "learning_rate": 0.0006237857705434498, + "epoch": 1.13 + }, + { + "loss": 8.0735, + "grad_norm": 1.1674834489822388, + "learning_rate": 0.0006229106502144045, + "epoch": 1.13 + }, + { + "loss": 8.3781, + "grad_norm": 1.32883620262146, + "learning_rate": 0.0006220355298853593, + "epoch": 1.13 + }, + { + "loss": 8.723, + "grad_norm": 1.3197271823883057, + "learning_rate": 0.000621160409556314, + "epoch": 1.14 + }, + { + "loss": 8.414, + "grad_norm": 1.137764573097229, + "learning_rate": 0.0006202852892272688, + "epoch": 1.14 + }, + { + "loss": 7.9197, + "grad_norm": 1.1574738025665283, + "learning_rate": 0.0006194101688982236, + "epoch": 1.14 + }, + { + "loss": 8.09, + "grad_norm": 1.0444676876068115, + "learning_rate": 0.0006185350485691783, + "epoch": 1.14 + }, + { + "loss": 7.3329, + "grad_norm": 0.8655235767364502, + "learning_rate": 0.0006176599282401331, + "epoch": 1.15 + }, + { + "loss": 8.4163, + "grad_norm": 0.9860300421714783, + "learning_rate": 0.0006167848079110879, + "epoch": 1.15 + }, + { + "loss": 8.2608, + "grad_norm": 1.1680139303207397, + "learning_rate": 0.0006159096875820426, + "epoch": 1.15 + }, + { + "loss": 7.9283, + "grad_norm": 1.545938491821289, + "learning_rate": 0.0006150345672529974, + "epoch": 1.15 + }, + { + "loss": 8.4113, + "grad_norm": 1.2768994569778442, + "learning_rate": 0.0006141594469239521, + "epoch": 1.16 + }, + { + "loss": 8.2389, + "grad_norm": 1.0001721382141113, + "learning_rate": 0.0006132843265949069, + "epoch": 1.16 + }, + { + "loss": 8.397, + "grad_norm": 1.8651808500289917, + "learning_rate": 0.0006124092062658617, + "epoch": 1.16 + }, + { + "loss": 8.003, + "grad_norm": 0.947693407535553, + "learning_rate": 0.0006115340859368163, + "epoch": 1.17 + }, + { + "loss": 7.5861, + "grad_norm": 1.1168384552001953, + "learning_rate": 0.0006106589656077711, + "epoch": 1.17 + }, + { + "loss": 8.7788, + "grad_norm": 1.1341112852096558, + "learning_rate": 0.0006097838452787258, + "epoch": 1.17 + }, + { + "loss": 7.9428, + "grad_norm": 1.2905473709106445, + "learning_rate": 0.0006089087249496806, + "epoch": 1.17 + }, + { + "loss": 8.6196, + "grad_norm": 0.9961435794830322, + "learning_rate": 0.0006080336046206354, + "epoch": 1.18 + }, + { + "loss": 8.224, + "grad_norm": 1.3134316205978394, + "learning_rate": 0.0006071584842915901, + "epoch": 1.18 + }, + { + "loss": 7.9156, + "grad_norm": 1.5898418426513672, + "learning_rate": 0.0006062833639625449, + "epoch": 1.18 + }, + { + "loss": 8.2147, + "grad_norm": 0.99250727891922, + "learning_rate": 0.0006054082436334996, + "epoch": 1.18 + }, + { + "loss": 7.6957, + "grad_norm": 1.2642431259155273, + "learning_rate": 0.0006045331233044544, + "epoch": 1.19 + }, + { + "loss": 7.7926, + "grad_norm": 1.314082384109497, + "learning_rate": 0.0006036580029754092, + "epoch": 1.19 + }, + { + "loss": 7.9682, + "grad_norm": 1.1342573165893555, + "learning_rate": 0.0006027828826463639, + "epoch": 1.19 + }, + { + "loss": 8.0208, + "grad_norm": 1.3015680313110352, + "learning_rate": 0.0006019077623173187, + "epoch": 1.19 + }, + { + "loss": 8.3608, + "grad_norm": 0.9990431666374207, + "learning_rate": 0.0006010326419882735, + "epoch": 1.2 + }, + { + "loss": 8.2009, + "grad_norm": 0.9804344773292542, + "learning_rate": 0.0006001575216592282, + "epoch": 1.2 + }, + { + "loss": 8.0484, + "grad_norm": 1.1591954231262207, + "learning_rate": 0.0005992824013301829, + "epoch": 1.2 + }, + { + "loss": 8.116, + "grad_norm": 1.042474627494812, + "learning_rate": 0.0005984072810011376, + "epoch": 1.2 + }, + { + "loss": 7.9246, + "grad_norm": 1.8579179048538208, + "learning_rate": 0.0005975321606720924, + "epoch": 1.21 + }, + { + "loss": 7.9183, + "grad_norm": 0.8727061748504639, + "learning_rate": 0.0005966570403430472, + "epoch": 1.21 + }, + { + "loss": 7.675, + "grad_norm": 1.0189380645751953, + "learning_rate": 0.0005957819200140019, + "epoch": 1.21 + }, + { + "loss": 7.6222, + "grad_norm": 1.0766206979751587, + "learning_rate": 0.0005949067996849567, + "epoch": 1.22 + }, + { + "loss": 7.6455, + "grad_norm": 1.121745228767395, + "learning_rate": 0.0005940316793559114, + "epoch": 1.22 + }, + { + "loss": 8.1449, + "grad_norm": 1.2497507333755493, + "learning_rate": 0.0005931565590268662, + "epoch": 1.22 + }, + { + "loss": 8.3586, + "grad_norm": 1.301903486251831, + "learning_rate": 0.000592281438697821, + "epoch": 1.22 + }, + { + "loss": 8.163, + "grad_norm": 1.1964079141616821, + "learning_rate": 0.0005914063183687757, + "epoch": 1.23 + }, + { + "loss": 8.2938, + "grad_norm": 1.1423827409744263, + "learning_rate": 0.0005905311980397304, + "epoch": 1.23 + }, + { + "loss": 8.165, + "grad_norm": 1.119884967803955, + "learning_rate": 0.0005896560777106851, + "epoch": 1.23 + }, + { + "loss": 7.7234, + "grad_norm": 1.4375518560409546, + "learning_rate": 0.0005887809573816399, + "epoch": 1.23 + }, + { + "loss": 8.0758, + "grad_norm": 1.1417185068130493, + "learning_rate": 0.0005879058370525947, + "epoch": 1.24 + }, + { + "loss": 7.9137, + "grad_norm": 1.048060417175293, + "learning_rate": 0.0005870307167235494, + "epoch": 1.24 + }, + { + "loss": 8.4029, + "grad_norm": 0.9880658388137817, + "learning_rate": 0.0005861555963945042, + "epoch": 1.24 + }, + { + "loss": 8.4489, + "grad_norm": 1.000611424446106, + "learning_rate": 0.000585280476065459, + "epoch": 1.24 + }, + { + "loss": 8.2688, + "grad_norm": 1.3099920749664307, + "learning_rate": 0.0005844053557364137, + "epoch": 1.25 + }, + { + "loss": 7.7948, + "grad_norm": 0.8548302054405212, + "learning_rate": 0.0005835302354073685, + "epoch": 1.25 + }, + { + "loss": 8.442, + "grad_norm": 1.1732860803604126, + "learning_rate": 0.0005826551150783232, + "epoch": 1.25 + }, + { + "loss": 7.6346, + "grad_norm": 0.803125262260437, + "learning_rate": 0.000581779994749278, + "epoch": 1.25 + }, + { + "loss": 8.0567, + "grad_norm": 1.258419156074524, + "learning_rate": 0.0005809048744202328, + "epoch": 1.26 + }, + { + "loss": 8.1142, + "grad_norm": 1.1331418752670288, + "learning_rate": 0.0005800297540911875, + "epoch": 1.26 + }, + { + "loss": 8.5457, + "grad_norm": 1.5619804859161377, + "learning_rate": 0.0005791546337621423, + "epoch": 1.26 + }, + { + "loss": 7.9416, + "grad_norm": 1.880534052848816, + "learning_rate": 0.000578279513433097, + "epoch": 1.27 + }, + { + "loss": 7.8216, + "grad_norm": 1.2279471158981323, + "learning_rate": 0.0005774043931040518, + "epoch": 1.27 + }, + { + "loss": 7.8216, + "grad_norm": 1.1597974300384521, + "learning_rate": 0.0005765292727750066, + "epoch": 1.27 + }, + { + "loss": 7.9033, + "grad_norm": 1.1710484027862549, + "learning_rate": 0.0005756541524459613, + "epoch": 1.27 + }, + { + "loss": 7.6036, + "grad_norm": 1.0655231475830078, + "learning_rate": 0.0005747790321169161, + "epoch": 1.28 + }, + { + "loss": 7.5982, + "grad_norm": 1.0066710710525513, + "learning_rate": 0.0005739039117878709, + "epoch": 1.28 + }, + { + "loss": 7.738, + "grad_norm": 1.1333460807800293, + "learning_rate": 0.0005730287914588256, + "epoch": 1.28 + }, + { + "loss": 8.0025, + "grad_norm": 1.468841791152954, + "learning_rate": 0.0005721536711297804, + "epoch": 1.28 + }, + { + "loss": 7.4888, + "grad_norm": 1.1363178491592407, + "learning_rate": 0.0005712785508007351, + "epoch": 1.29 + }, + { + "loss": 7.3176, + "grad_norm": 1.1589970588684082, + "learning_rate": 0.0005704034304716899, + "epoch": 1.29 + }, + { + "loss": 7.6323, + "grad_norm": 0.9033693075180054, + "learning_rate": 0.0005695283101426447, + "epoch": 1.29 + }, + { + "loss": 7.8839, + "grad_norm": 1.2384039163589478, + "learning_rate": 0.0005686531898135993, + "epoch": 1.29 + }, + { + "loss": 7.8408, + "grad_norm": 1.3826912641525269, + "learning_rate": 0.0005677780694845541, + "epoch": 1.3 + }, + { + "loss": 7.4433, + "grad_norm": 1.1403487920761108, + "learning_rate": 0.0005669029491555088, + "epoch": 1.3 + }, + { + "loss": 8.5407, + "grad_norm": 1.037423014640808, + "learning_rate": 0.0005660278288264636, + "epoch": 1.3 + }, + { + "loss": 8.0943, + "grad_norm": 1.4421013593673706, + "learning_rate": 0.0005651527084974184, + "epoch": 1.3 + }, + { + "loss": 7.7771, + "grad_norm": 1.2977713346481323, + "learning_rate": 0.0005642775881683731, + "epoch": 1.31 + }, + { + "loss": 7.54, + "grad_norm": 1.049196720123291, + "learning_rate": 0.0005634024678393279, + "epoch": 1.31 + }, + { + "loss": 7.4699, + "grad_norm": 1.0489652156829834, + "learning_rate": 0.0005625273475102827, + "epoch": 1.31 + }, + { + "loss": 7.9441, + "grad_norm": 1.1373968124389648, + "learning_rate": 0.0005616522271812374, + "epoch": 1.32 + }, + { + "loss": 7.2627, + "grad_norm": 1.0570902824401855, + "learning_rate": 0.0005607771068521922, + "epoch": 1.32 + }, + { + "loss": 7.7472, + "grad_norm": 1.0547776222229004, + "learning_rate": 0.0005599019865231469, + "epoch": 1.32 + }, + { + "loss": 7.8815, + "grad_norm": 1.2481534481048584, + "learning_rate": 0.0005590268661941017, + "epoch": 1.32 + }, + { + "loss": 8.2547, + "grad_norm": 1.1728442907333374, + "learning_rate": 0.0005581517458650565, + "epoch": 1.33 + }, + { + "loss": 7.5035, + "grad_norm": 1.0567808151245117, + "learning_rate": 0.0005572766255360112, + "epoch": 1.33 + }, + { + "loss": 7.9982, + "grad_norm": 0.8234537243843079, + "learning_rate": 0.000556401505206966, + "epoch": 1.33 + }, + { + "loss": 7.5333, + "grad_norm": 1.09587824344635, + "learning_rate": 0.0005555263848779207, + "epoch": 1.33 + }, + { + "loss": 7.768, + "grad_norm": 1.3897008895874023, + "learning_rate": 0.0005546512645488755, + "epoch": 1.34 + }, + { + "loss": 7.7645, + "grad_norm": 1.1089082956314087, + "learning_rate": 0.0005537761442198303, + "epoch": 1.34 + }, + { + "loss": 7.7809, + "grad_norm": 1.2678576707839966, + "learning_rate": 0.000552901023890785, + "epoch": 1.34 + }, + { + "loss": 7.7376, + "grad_norm": 1.3946635723114014, + "learning_rate": 0.0005520259035617398, + "epoch": 1.34 + }, + { + "loss": 8.2773, + "grad_norm": 1.3742512464523315, + "learning_rate": 0.0005511507832326946, + "epoch": 1.35 + }, + { + "loss": 7.7902, + "grad_norm": 1.416434645652771, + "learning_rate": 0.0005502756629036493, + "epoch": 1.35 + }, + { + "loss": 7.6157, + "grad_norm": 1.0419012308120728, + "learning_rate": 0.0005494005425746041, + "epoch": 1.35 + }, + { + "loss": 7.5897, + "grad_norm": 1.7180145978927612, + "learning_rate": 0.0005485254222455588, + "epoch": 1.35 + }, + { + "loss": 8.0068, + "grad_norm": 1.6651771068572998, + "learning_rate": 0.0005476503019165136, + "epoch": 1.36 + }, + { + "loss": 7.4023, + "grad_norm": 1.0715596675872803, + "learning_rate": 0.0005467751815874683, + "epoch": 1.36 + }, + { + "loss": 8.0369, + "grad_norm": 1.208898901939392, + "learning_rate": 0.000545900061258423, + "epoch": 1.36 + }, + { + "loss": 7.6188, + "grad_norm": 0.9920070767402649, + "learning_rate": 0.0005450249409293778, + "epoch": 1.36 + }, + { + "loss": 8.6854, + "grad_norm": 1.174086570739746, + "learning_rate": 0.0005441498206003325, + "epoch": 1.37 + }, + { + "loss": 7.5733, + "grad_norm": 1.244912028312683, + "learning_rate": 0.0005432747002712873, + "epoch": 1.37 + }, + { + "loss": 7.389, + "grad_norm": 1.5966273546218872, + "learning_rate": 0.0005423995799422421, + "epoch": 1.37 + }, + { + "loss": 8.1756, + "grad_norm": 1.0320965051651, + "learning_rate": 0.0005415244596131968, + "epoch": 1.38 + }, + { + "loss": 8.897, + "grad_norm": 1.2478450536727905, + "learning_rate": 0.0005406493392841516, + "epoch": 1.38 + }, + { + "loss": 7.6083, + "grad_norm": 1.4347364902496338, + "learning_rate": 0.0005397742189551064, + "epoch": 1.38 + }, + { + "loss": 7.9916, + "grad_norm": 1.1878119707107544, + "learning_rate": 0.0005388990986260611, + "epoch": 1.38 + }, + { + "loss": 8.1032, + "grad_norm": 1.3169543743133545, + "learning_rate": 0.0005380239782970159, + "epoch": 1.39 + }, + { + "loss": 7.3094, + "grad_norm": 1.271192193031311, + "learning_rate": 0.0005371488579679706, + "epoch": 1.39 + }, + { + "loss": 7.2947, + "grad_norm": 1.484824299812317, + "learning_rate": 0.0005362737376389254, + "epoch": 1.39 + }, + { + "loss": 7.7483, + "grad_norm": 1.0237884521484375, + "learning_rate": 0.0005353986173098802, + "epoch": 1.39 + }, + { + "loss": 7.7284, + "grad_norm": 1.141897201538086, + "learning_rate": 0.0005345234969808349, + "epoch": 1.4 + }, + { + "loss": 7.9684, + "grad_norm": 1.2076783180236816, + "learning_rate": 0.0005336483766517897, + "epoch": 1.4 + }, + { + "loss": 7.4731, + "grad_norm": 1.0815685987472534, + "learning_rate": 0.0005327732563227444, + "epoch": 1.4 + }, + { + "loss": 7.6468, + "grad_norm": 1.9115163087844849, + "learning_rate": 0.0005318981359936992, + "epoch": 1.4 + }, + { + "loss": 8.179, + "grad_norm": 1.1872133016586304, + "learning_rate": 0.000531023015664654, + "epoch": 1.41 + }, + { + "loss": 8.1254, + "grad_norm": 1.144726037979126, + "learning_rate": 0.0005301478953356087, + "epoch": 1.41 + }, + { + "loss": 7.7947, + "grad_norm": 1.562495231628418, + "learning_rate": 0.0005292727750065635, + "epoch": 1.41 + }, + { + "loss": 7.2917, + "grad_norm": 1.20420241355896, + "learning_rate": 0.0005283976546775183, + "epoch": 1.41 + }, + { + "loss": 7.9956, + "grad_norm": 1.0302613973617554, + "learning_rate": 0.000527522534348473, + "epoch": 1.42 + }, + { + "loss": 7.8058, + "grad_norm": 1.161452293395996, + "learning_rate": 0.0005266474140194278, + "epoch": 1.42 + }, + { + "loss": 8.2652, + "grad_norm": 1.2876991033554077, + "learning_rate": 0.0005257722936903825, + "epoch": 1.42 + }, + { + "loss": 8.0375, + "grad_norm": 1.1002925634384155, + "learning_rate": 0.0005248971733613372, + "epoch": 1.43 + }, + { + "loss": 7.82, + "grad_norm": 1.0201154947280884, + "learning_rate": 0.000524022053032292, + "epoch": 1.43 + }, + { + "loss": 8.3203, + "grad_norm": 1.1177037954330444, + "learning_rate": 0.0005231469327032467, + "epoch": 1.43 + }, + { + "loss": 7.9789, + "grad_norm": 1.4295682907104492, + "learning_rate": 0.0005222718123742015, + "epoch": 1.43 + }, + { + "loss": 8.0088, + "grad_norm": 1.4420737028121948, + "learning_rate": 0.0005213966920451562, + "epoch": 1.44 + }, + { + "loss": 7.8298, + "grad_norm": 1.1020231246948242, + "learning_rate": 0.000520521571716111, + "epoch": 1.44 + }, + { + "loss": 7.8801, + "grad_norm": 1.4339189529418945, + "learning_rate": 0.0005196464513870657, + "epoch": 1.44 + }, + { + "loss": 7.6756, + "grad_norm": 1.5243607759475708, + "learning_rate": 0.0005187713310580204, + "epoch": 1.44 + }, + { + "loss": 8.1007, + "grad_norm": 0.9880979657173157, + "learning_rate": 0.0005178962107289752, + "epoch": 1.45 + }, + { + "loss": 7.7396, + "grad_norm": 1.1447367668151855, + "learning_rate": 0.0005170210903999299, + "epoch": 1.45 + }, + { + "loss": 7.8537, + "grad_norm": 1.384048342704773, + "learning_rate": 0.0005161459700708847, + "epoch": 1.45 + }, + { + "loss": 7.8855, + "grad_norm": 1.3757721185684204, + "learning_rate": 0.0005152708497418395, + "epoch": 1.45 + }, + { + "loss": 7.8651, + "grad_norm": 1.1160024404525757, + "learning_rate": 0.0005143957294127942, + "epoch": 1.46 + }, + { + "loss": 7.8378, + "grad_norm": 0.9774546027183533, + "learning_rate": 0.000513520609083749, + "epoch": 1.46 + }, + { + "loss": 7.9251, + "grad_norm": 1.5181477069854736, + "learning_rate": 0.0005126454887547038, + "epoch": 1.46 + }, + { + "loss": 8.6781, + "grad_norm": 1.203229308128357, + "learning_rate": 0.0005117703684256585, + "epoch": 1.46 + }, + { + "loss": 7.6571, + "grad_norm": 1.0401496887207031, + "learning_rate": 0.0005108952480966133, + "epoch": 1.47 + }, + { + "loss": 7.3908, + "grad_norm": 1.3228225708007812, + "learning_rate": 0.000510020127767568, + "epoch": 1.47 + }, + { + "loss": 8.1244, + "grad_norm": 1.3072296380996704, + "learning_rate": 0.0005091450074385228, + "epoch": 1.47 + }, + { + "loss": 7.7535, + "grad_norm": 1.9105629920959473, + "learning_rate": 0.0005082698871094776, + "epoch": 1.48 + }, + { + "loss": 8.2387, + "grad_norm": 1.3035160303115845, + "learning_rate": 0.0005073947667804323, + "epoch": 1.48 + }, + { + "loss": 7.998, + "grad_norm": 0.9805745482444763, + "learning_rate": 0.0005065196464513871, + "epoch": 1.48 + }, + { + "loss": 8.0499, + "grad_norm": 1.28218412399292, + "learning_rate": 0.0005056445261223418, + "epoch": 1.48 + }, + { + "loss": 8.0939, + "grad_norm": 1.289697527885437, + "learning_rate": 0.0005047694057932966, + "epoch": 1.49 + }, + { + "loss": 7.8801, + "grad_norm": 1.3982206583023071, + "learning_rate": 0.0005038942854642513, + "epoch": 1.49 + }, + { + "loss": 7.5012, + "grad_norm": 1.1884011030197144, + "learning_rate": 0.000503019165135206, + "epoch": 1.49 + }, + { + "loss": 7.7792, + "grad_norm": 1.2014328241348267, + "learning_rate": 0.0005021440448061608, + "epoch": 1.49 + }, + { + "loss": 8.3151, + "grad_norm": 1.2958098649978638, + "learning_rate": 0.0005012689244771155, + "epoch": 1.5 + }, + { + "loss": 7.3702, + "grad_norm": 1.1195346117019653, + "learning_rate": 0.0005003938041480703, + "epoch": 1.5 + }, + { + "loss": 8.0952, + "grad_norm": 1.2185337543487549, + "learning_rate": 0.0004995186838190251, + "epoch": 1.5 + }, + { + "loss": 7.6605, + "grad_norm": 1.1054099798202515, + "learning_rate": 0.0004986435634899798, + "epoch": 1.5 + }, + { + "loss": 7.8926, + "grad_norm": 1.3183029890060425, + "learning_rate": 0.0004977684431609346, + "epoch": 1.51 + }, + { + "loss": 7.8356, + "grad_norm": 1.3786067962646484, + "learning_rate": 0.0004968933228318894, + "epoch": 1.51 + }, + { + "loss": 7.7605, + "grad_norm": 1.3373888731002808, + "learning_rate": 0.0004960182025028441, + "epoch": 1.51 + }, + { + "loss": 7.9272, + "grad_norm": 1.5524091720581055, + "learning_rate": 0.0004951430821737989, + "epoch": 1.51 + }, + { + "loss": 8.1264, + "grad_norm": 0.927689790725708, + "learning_rate": 0.0004942679618447536, + "epoch": 1.52 + }, + { + "loss": 8.1456, + "grad_norm": 1.4429559707641602, + "learning_rate": 0.0004933928415157084, + "epoch": 1.52 + }, + { + "loss": 8.5349, + "grad_norm": 1.17830228805542, + "learning_rate": 0.0004925177211866632, + "epoch": 1.52 + }, + { + "loss": 8.4138, + "grad_norm": 1.7398778200149536, + "learning_rate": 0.0004916426008576179, + "epoch": 1.53 + }, + { + "loss": 7.6329, + "grad_norm": 1.101945161819458, + "learning_rate": 0.0004907674805285727, + "epoch": 1.53 + }, + { + "loss": 8.2694, + "grad_norm": 1.2424931526184082, + "learning_rate": 0.0004898923601995274, + "epoch": 1.53 + }, + { + "loss": 7.2639, + "grad_norm": 0.8726850748062134, + "learning_rate": 0.0004890172398704822, + "epoch": 1.53 + }, + { + "loss": 7.5542, + "grad_norm": 1.020978331565857, + "learning_rate": 0.0004881421195414369, + "epoch": 1.54 + }, + { + "loss": 7.3334, + "grad_norm": 1.058136224746704, + "learning_rate": 0.0004872669992123917, + "epoch": 1.54 + }, + { + "loss": 7.6285, + "grad_norm": 1.7856310606002808, + "learning_rate": 0.00048639187888334644, + "epoch": 1.54 + }, + { + "loss": 7.8873, + "grad_norm": 1.1540299654006958, + "learning_rate": 0.0004855167585543012, + "epoch": 1.54 + }, + { + "loss": 7.5676, + "grad_norm": 1.4844547510147095, + "learning_rate": 0.00048464163822525597, + "epoch": 1.55 + }, + { + "loss": 8.0284, + "grad_norm": 1.1018364429473877, + "learning_rate": 0.00048376651789621073, + "epoch": 1.55 + }, + { + "loss": 7.8478, + "grad_norm": 1.4421080350875854, + "learning_rate": 0.0004828913975671655, + "epoch": 1.55 + }, + { + "loss": 8.0614, + "grad_norm": 1.322413444519043, + "learning_rate": 0.00048201627723812025, + "epoch": 1.55 + }, + { + "loss": 7.9015, + "grad_norm": 1.1930081844329834, + "learning_rate": 0.000481141156909075, + "epoch": 1.56 + }, + { + "loss": 7.843, + "grad_norm": 1.2846688032150269, + "learning_rate": 0.0004802660365800298, + "epoch": 1.56 + }, + { + "loss": 7.8268, + "grad_norm": 2.0413529872894287, + "learning_rate": 0.00047939091625098454, + "epoch": 1.56 + }, + { + "loss": 7.3241, + "grad_norm": 1.058362364768982, + "learning_rate": 0.0004785157959219393, + "epoch": 1.56 + }, + { + "loss": 7.8329, + "grad_norm": 1.725417971611023, + "learning_rate": 0.00047764067559289406, + "epoch": 1.57 + }, + { + "loss": 7.6295, + "grad_norm": 1.1373404264450073, + "learning_rate": 0.00047676555526384877, + "epoch": 1.57 + }, + { + "loss": 7.4763, + "grad_norm": 1.1107378005981445, + "learning_rate": 0.00047589043493480353, + "epoch": 1.57 + }, + { + "loss": 7.8846, + "grad_norm": 1.2450941801071167, + "learning_rate": 0.0004750153146057583, + "epoch": 1.57 + }, + { + "loss": 8.4109, + "grad_norm": 1.0643541812896729, + "learning_rate": 0.00047414019427671305, + "epoch": 1.58 + }, + { + "loss": 7.9126, + "grad_norm": 1.2940372228622437, + "learning_rate": 0.0004732650739476678, + "epoch": 1.58 + }, + { + "loss": 7.7132, + "grad_norm": 2.6067655086517334, + "learning_rate": 0.0004723899536186226, + "epoch": 1.58 + }, + { + "loss": 7.5708, + "grad_norm": 0.9783037304878235, + "learning_rate": 0.00047151483328957734, + "epoch": 1.59 + }, + { + "loss": 7.2771, + "grad_norm": 1.037582278251648, + "learning_rate": 0.0004706397129605321, + "epoch": 1.59 + }, + { + "loss": 7.7599, + "grad_norm": 1.0178707838058472, + "learning_rate": 0.00046976459263148686, + "epoch": 1.59 + }, + { + "loss": 7.1538, + "grad_norm": 1.558307409286499, + "learning_rate": 0.0004688894723024416, + "epoch": 1.59 + }, + { + "loss": 7.5229, + "grad_norm": 1.1060800552368164, + "learning_rate": 0.0004680143519733964, + "epoch": 1.6 + }, + { + "loss": 7.5813, + "grad_norm": 1.8988709449768066, + "learning_rate": 0.00046713923164435115, + "epoch": 1.6 + }, + { + "loss": 7.8319, + "grad_norm": 1.6066781282424927, + "learning_rate": 0.00046626411131530586, + "epoch": 1.6 + }, + { + "loss": 7.8222, + "grad_norm": 1.4711729288101196, + "learning_rate": 0.0004653889909862606, + "epoch": 1.6 + }, + { + "loss": 7.6115, + "grad_norm": 1.3585811853408813, + "learning_rate": 0.0004645138706572154, + "epoch": 1.61 + }, + { + "loss": 7.7618, + "grad_norm": 1.1487444639205933, + "learning_rate": 0.00046363875032817014, + "epoch": 1.61 + }, + { + "loss": 7.868, + "grad_norm": 1.4386248588562012, + "learning_rate": 0.0004627636299991249, + "epoch": 1.61 + }, + { + "loss": 7.7931, + "grad_norm": 1.0714224576950073, + "learning_rate": 0.00046188850967007967, + "epoch": 1.61 + }, + { + "loss": 8.2688, + "grad_norm": 1.6375863552093506, + "learning_rate": 0.00046101338934103443, + "epoch": 1.62 + }, + { + "loss": 7.621, + "grad_norm": 1.024120807647705, + "learning_rate": 0.0004601382690119892, + "epoch": 1.62 + }, + { + "loss": 8.2226, + "grad_norm": 1.2234493494033813, + "learning_rate": 0.0004592631486829439, + "epoch": 1.62 + }, + { + "loss": 7.596, + "grad_norm": 1.0593066215515137, + "learning_rate": 0.00045838802835389866, + "epoch": 1.62 + }, + { + "loss": 7.7407, + "grad_norm": 1.2529680728912354, + "learning_rate": 0.0004575129080248534, + "epoch": 1.63 + }, + { + "loss": 7.221, + "grad_norm": 1.1312929391860962, + "learning_rate": 0.0004566377876958082, + "epoch": 1.63 + }, + { + "loss": 7.6136, + "grad_norm": 1.4004312753677368, + "learning_rate": 0.00045576266736676294, + "epoch": 1.63 + }, + { + "loss": 7.9718, + "grad_norm": 1.4514151811599731, + "learning_rate": 0.00045488754703771765, + "epoch": 1.64 + }, + { + "loss": 7.3337, + "grad_norm": 1.1595350503921509, + "learning_rate": 0.0004540124267086724, + "epoch": 1.64 + }, + { + "loss": 7.7414, + "grad_norm": 1.1403205394744873, + "learning_rate": 0.0004531373063796272, + "epoch": 1.64 + }, + { + "loss": 7.7323, + "grad_norm": 1.677051305770874, + "learning_rate": 0.00045226218605058194, + "epoch": 1.64 + }, + { + "loss": 8.0048, + "grad_norm": 1.338146686553955, + "learning_rate": 0.0004513870657215367, + "epoch": 1.65 + }, + { + "loss": 7.9544, + "grad_norm": 1.0941588878631592, + "learning_rate": 0.00045051194539249146, + "epoch": 1.65 + }, + { + "loss": 8.1043, + "grad_norm": 1.224746584892273, + "learning_rate": 0.0004496368250634462, + "epoch": 1.65 + }, + { + "loss": 8.0849, + "grad_norm": 1.5772489309310913, + "learning_rate": 0.000448761704734401, + "epoch": 1.65 + }, + { + "loss": 7.3165, + "grad_norm": 1.4434912204742432, + "learning_rate": 0.00044788658440535575, + "epoch": 1.66 + }, + { + "loss": 7.8826, + "grad_norm": 0.9971029162406921, + "learning_rate": 0.0004470114640763105, + "epoch": 1.66 + }, + { + "loss": 7.7822, + "grad_norm": 1.061712384223938, + "learning_rate": 0.00044613634374726527, + "epoch": 1.66 + }, + { + "loss": 7.8387, + "grad_norm": 1.6292518377304077, + "learning_rate": 0.00044526122341822003, + "epoch": 1.66 + }, + { + "loss": 7.3463, + "grad_norm": 1.0507898330688477, + "learning_rate": 0.00044438610308917474, + "epoch": 1.67 + }, + { + "loss": 7.693, + "grad_norm": 1.332474708557129, + "learning_rate": 0.0004435109827601295, + "epoch": 1.67 + }, + { + "loss": 7.4542, + "grad_norm": 1.3393101692199707, + "learning_rate": 0.00044263586243108426, + "epoch": 1.67 + }, + { + "loss": 7.4236, + "grad_norm": 1.4949504137039185, + "learning_rate": 0.000441760742102039, + "epoch": 1.67 + }, + { + "loss": 7.4087, + "grad_norm": 1.3824454545974731, + "learning_rate": 0.0004408856217729938, + "epoch": 1.68 + }, + { + "loss": 7.3, + "grad_norm": 1.3991942405700684, + "learning_rate": 0.00044001050144394855, + "epoch": 1.68 + }, + { + "loss": 7.1648, + "grad_norm": 1.3270092010498047, + "learning_rate": 0.0004391353811149033, + "epoch": 1.68 + }, + { + "loss": 7.753, + "grad_norm": 1.1912864446640015, + "learning_rate": 0.00043826026078585807, + "epoch": 1.69 + }, + { + "loss": 7.6531, + "grad_norm": 1.2112165689468384, + "learning_rate": 0.00043738514045681283, + "epoch": 1.69 + }, + { + "loss": 8.0168, + "grad_norm": 1.0204828977584839, + "learning_rate": 0.0004365100201277676, + "epoch": 1.69 + }, + { + "loss": 7.8334, + "grad_norm": 1.8065035343170166, + "learning_rate": 0.00043563489979872236, + "epoch": 1.69 + }, + { + "loss": 7.9395, + "grad_norm": 1.1826367378234863, + "learning_rate": 0.0004347597794696771, + "epoch": 1.7 + }, + { + "loss": 8.0071, + "grad_norm": 0.9689782857894897, + "learning_rate": 0.00043388465914063183, + "epoch": 1.7 + }, + { + "loss": 7.5284, + "grad_norm": 0.9889323115348816, + "learning_rate": 0.0004330095388115866, + "epoch": 1.7 + }, + { + "loss": 7.7318, + "grad_norm": 1.4257516860961914, + "learning_rate": 0.00043213441848254135, + "epoch": 1.7 + }, + { + "loss": 7.6343, + "grad_norm": 1.623134970664978, + "learning_rate": 0.0004312592981534961, + "epoch": 1.71 + }, + { + "loss": 7.9645, + "grad_norm": 1.2686361074447632, + "learning_rate": 0.0004303841778244509, + "epoch": 1.71 + }, + { + "loss": 7.5339, + "grad_norm": 1.5115247964859009, + "learning_rate": 0.00042950905749540564, + "epoch": 1.71 + }, + { + "loss": 7.7401, + "grad_norm": 1.285506010055542, + "learning_rate": 0.0004286339371663604, + "epoch": 1.71 + }, + { + "loss": 7.6018, + "grad_norm": 1.4150651693344116, + "learning_rate": 0.00042775881683731516, + "epoch": 1.72 + }, + { + "loss": 7.4015, + "grad_norm": 1.485231637954712, + "learning_rate": 0.0004268836965082699, + "epoch": 1.72 + }, + { + "loss": 8.4429, + "grad_norm": 2.1629021167755127, + "learning_rate": 0.0004260085761792247, + "epoch": 1.72 + }, + { + "loss": 7.8298, + "grad_norm": 1.1586624383926392, + "learning_rate": 0.00042513345585017945, + "epoch": 1.72 + }, + { + "loss": 7.8121, + "grad_norm": 1.0134670734405518, + "learning_rate": 0.0004242583355211342, + "epoch": 1.73 + }, + { + "loss": 7.8337, + "grad_norm": 1.257633090019226, + "learning_rate": 0.00042338321519208897, + "epoch": 1.73 + }, + { + "loss": 7.6701, + "grad_norm": 1.212266445159912, + "learning_rate": 0.0004225080948630437, + "epoch": 1.73 + }, + { + "loss": 7.706, + "grad_norm": 1.2191237211227417, + "learning_rate": 0.00042163297453399844, + "epoch": 1.74 + }, + { + "loss": 7.4639, + "grad_norm": 1.476140022277832, + "learning_rate": 0.0004207578542049532, + "epoch": 1.74 + }, + { + "loss": 7.8126, + "grad_norm": 1.0655369758605957, + "learning_rate": 0.0004198827338759079, + "epoch": 1.74 + }, + { + "loss": 7.4091, + "grad_norm": 1.3340696096420288, + "learning_rate": 0.00041900761354686267, + "epoch": 1.74 + }, + { + "loss": 7.5701, + "grad_norm": 1.3290128707885742, + "learning_rate": 0.00041813249321781743, + "epoch": 1.75 + }, + { + "loss": 7.5513, + "grad_norm": 1.1993497610092163, + "learning_rate": 0.0004172573728887722, + "epoch": 1.75 + }, + { + "loss": 7.5115, + "grad_norm": 0.9953559041023254, + "learning_rate": 0.00041638225255972696, + "epoch": 1.75 + }, + { + "loss": 8.0513, + "grad_norm": 1.1929738521575928, + "learning_rate": 0.0004155071322306817, + "epoch": 1.75 + }, + { + "loss": 7.4431, + "grad_norm": 1.0211223363876343, + "learning_rate": 0.0004146320119016365, + "epoch": 1.76 + }, + { + "loss": 7.4024, + "grad_norm": 1.0484708547592163, + "learning_rate": 0.00041375689157259124, + "epoch": 1.76 + }, + { + "loss": 7.4321, + "grad_norm": 1.2012499570846558, + "learning_rate": 0.000412881771243546, + "epoch": 1.76 + }, + { + "loss": 7.2608, + "grad_norm": 0.9850478768348694, + "learning_rate": 0.0004120066509145007, + "epoch": 1.76 + }, + { + "loss": 7.4744, + "grad_norm": 1.1142171621322632, + "learning_rate": 0.00041113153058545547, + "epoch": 1.77 + }, + { + "loss": 7.8258, + "grad_norm": 1.0107368230819702, + "learning_rate": 0.00041025641025641023, + "epoch": 1.77 + }, + { + "loss": 8.0338, + "grad_norm": 1.3827756643295288, + "learning_rate": 0.000409381289927365, + "epoch": 1.77 + }, + { + "loss": 7.2029, + "grad_norm": 1.056078553199768, + "learning_rate": 0.00040850616959831976, + "epoch": 1.77 + }, + { + "loss": 7.9763, + "grad_norm": 1.3796826601028442, + "learning_rate": 0.0004076310492692745, + "epoch": 1.78 + }, + { + "loss": 7.39, + "grad_norm": 1.5586506128311157, + "learning_rate": 0.0004067559289402293, + "epoch": 1.78 + }, + { + "loss": 7.7479, + "grad_norm": 1.3467471599578857, + "learning_rate": 0.00040588080861118404, + "epoch": 1.78 + }, + { + "loss": 7.5281, + "grad_norm": 1.5824648141860962, + "learning_rate": 0.0004050056882821388, + "epoch": 1.78 + }, + { + "loss": 7.4095, + "grad_norm": 1.5600448846817017, + "learning_rate": 0.00040413056795309357, + "epoch": 1.79 + }, + { + "loss": 7.6296, + "grad_norm": 1.4003773927688599, + "learning_rate": 0.00040325544762404833, + "epoch": 1.79 + }, + { + "loss": 7.2299, + "grad_norm": 1.1784484386444092, + "learning_rate": 0.0004023803272950031, + "epoch": 1.79 + }, + { + "loss": 7.2215, + "grad_norm": 1.0865730047225952, + "learning_rate": 0.0004015052069659578, + "epoch": 1.8 + }, + { + "loss": 7.619, + "grad_norm": 1.3708497285842896, + "learning_rate": 0.00040063008663691256, + "epoch": 1.8 + }, + { + "loss": 7.6305, + "grad_norm": 1.3728278875350952, + "learning_rate": 0.0003997549663078673, + "epoch": 1.8 + }, + { + "loss": 7.4218, + "grad_norm": 1.385901689529419, + "learning_rate": 0.0003988798459788221, + "epoch": 1.8 + }, + { + "loss": 7.7959, + "grad_norm": 1.5370672941207886, + "learning_rate": 0.00039800472564977685, + "epoch": 1.81 + }, + { + "loss": 7.8249, + "grad_norm": 1.039469838142395, + "learning_rate": 0.0003971296053207316, + "epoch": 1.81 + }, + { + "loss": 7.4311, + "grad_norm": 1.4947952032089233, + "learning_rate": 0.00039625448499168637, + "epoch": 1.81 + }, + { + "loss": 7.7797, + "grad_norm": 1.2262136936187744, + "learning_rate": 0.00039546687669554567, + "epoch": 1.81 + }, + { + "loss": 7.8844, + "grad_norm": 1.5757509469985962, + "learning_rate": 0.00039459175636650043, + "epoch": 1.82 + }, + { + "loss": 7.8737, + "grad_norm": 1.2183258533477783, + "learning_rate": 0.0003937166360374552, + "epoch": 1.82 + }, + { + "loss": 7.3515, + "grad_norm": 1.3697617053985596, + "learning_rate": 0.00039284151570840995, + "epoch": 1.82 + }, + { + "loss": 7.1169, + "grad_norm": 1.3007692098617554, + "learning_rate": 0.00039196639537936466, + "epoch": 1.82 + }, + { + "loss": 7.2926, + "grad_norm": 1.3538720607757568, + "learning_rate": 0.0003910912750503194, + "epoch": 1.83 + }, + { + "loss": 7.8445, + "grad_norm": 1.4245976209640503, + "learning_rate": 0.0003902161547212742, + "epoch": 1.83 + }, + { + "loss": 7.456, + "grad_norm": 1.323899269104004, + "learning_rate": 0.00038934103439222894, + "epoch": 1.83 + }, + { + "loss": 7.6163, + "grad_norm": 1.2635420560836792, + "learning_rate": 0.0003884659140631837, + "epoch": 1.83 + }, + { + "loss": 7.5885, + "grad_norm": 1.4714936017990112, + "learning_rate": 0.00038759079373413847, + "epoch": 1.84 + }, + { + "loss": 7.8382, + "grad_norm": 1.1696442365646362, + "learning_rate": 0.00038671567340509323, + "epoch": 1.84 + }, + { + "loss": 7.4885, + "grad_norm": 1.3797491788864136, + "learning_rate": 0.000385840553076048, + "epoch": 1.84 + }, + { + "loss": 7.4614, + "grad_norm": 1.0410481691360474, + "learning_rate": 0.00038496543274700275, + "epoch": 1.85 + }, + { + "loss": 6.9584, + "grad_norm": 1.7356559038162231, + "learning_rate": 0.0003840903124179575, + "epoch": 1.85 + }, + { + "loss": 7.8161, + "grad_norm": 1.326489806175232, + "learning_rate": 0.0003832151920889123, + "epoch": 1.85 + }, + { + "loss": 7.6985, + "grad_norm": 1.3822075128555298, + "learning_rate": 0.00038234007175986704, + "epoch": 1.85 + }, + { + "loss": 7.9532, + "grad_norm": 1.2612171173095703, + "learning_rate": 0.00038146495143082175, + "epoch": 1.86 + }, + { + "loss": 7.3309, + "grad_norm": 1.8743207454681396, + "learning_rate": 0.0003805898311017765, + "epoch": 1.86 + }, + { + "loss": 7.8573, + "grad_norm": 1.515641212463379, + "learning_rate": 0.0003797147107727312, + "epoch": 1.86 + }, + { + "loss": 8.0815, + "grad_norm": 1.970818281173706, + "learning_rate": 0.000378839590443686, + "epoch": 1.86 + }, + { + "loss": 7.7197, + "grad_norm": 1.6418136358261108, + "learning_rate": 0.00037796447011464074, + "epoch": 1.87 + }, + { + "loss": 7.6527, + "grad_norm": 1.3693944215774536, + "learning_rate": 0.0003770893497855955, + "epoch": 1.87 + }, + { + "loss": 7.7717, + "grad_norm": 1.311493992805481, + "learning_rate": 0.00037621422945655026, + "epoch": 1.87 + }, + { + "loss": 8.0735, + "grad_norm": 1.593992829322815, + "learning_rate": 0.000375339109127505, + "epoch": 1.87 + }, + { + "loss": 7.4285, + "grad_norm": 1.212729573249817, + "learning_rate": 0.0003744639887984598, + "epoch": 1.88 + }, + { + "loss": 7.7873, + "grad_norm": 1.1326895952224731, + "learning_rate": 0.00037358886846941455, + "epoch": 1.88 + }, + { + "loss": 7.3515, + "grad_norm": 1.3937299251556396, + "learning_rate": 0.0003727137481403693, + "epoch": 1.88 + }, + { + "loss": 7.353, + "grad_norm": 1.5152568817138672, + "learning_rate": 0.00037183862781132407, + "epoch": 1.88 + }, + { + "loss": 7.8015, + "grad_norm": 1.207973599433899, + "learning_rate": 0.0003709635074822788, + "epoch": 1.89 + }, + { + "loss": 7.6713, + "grad_norm": 1.003139615058899, + "learning_rate": 0.00037008838715323354, + "epoch": 1.89 + }, + { + "loss": 7.9247, + "grad_norm": 1.1870025396347046, + "learning_rate": 0.0003692132668241883, + "epoch": 1.89 + }, + { + "loss": 7.4496, + "grad_norm": 1.237275242805481, + "learning_rate": 0.00036833814649514307, + "epoch": 1.9 + }, + { + "loss": 7.2638, + "grad_norm": 1.7287304401397705, + "learning_rate": 0.00036746302616609783, + "epoch": 1.9 + }, + { + "loss": 7.7464, + "grad_norm": 1.5875813961029053, + "learning_rate": 0.0003665879058370526, + "epoch": 1.9 + }, + { + "loss": 7.683, + "grad_norm": 1.7219480276107788, + "learning_rate": 0.00036571278550800735, + "epoch": 1.9 + }, + { + "loss": 7.6059, + "grad_norm": 1.3815206289291382, + "learning_rate": 0.0003648376651789621, + "epoch": 1.91 + }, + { + "loss": 7.3258, + "grad_norm": 1.1902978420257568, + "learning_rate": 0.0003639625448499169, + "epoch": 1.91 + }, + { + "loss": 7.4436, + "grad_norm": 1.6532816886901855, + "learning_rate": 0.00036308742452087164, + "epoch": 1.91 + }, + { + "loss": 7.438, + "grad_norm": 1.1358212232589722, + "learning_rate": 0.0003622123041918264, + "epoch": 1.91 + }, + { + "loss": 7.9777, + "grad_norm": 1.3459230661392212, + "learning_rate": 0.00036133718386278116, + "epoch": 1.92 + }, + { + "loss": 7.9087, + "grad_norm": 1.0352368354797363, + "learning_rate": 0.0003604620635337359, + "epoch": 1.92 + }, + { + "loss": 7.5855, + "grad_norm": 1.2582918405532837, + "learning_rate": 0.00035958694320469063, + "epoch": 1.92 + }, + { + "loss": 7.4576, + "grad_norm": 1.1787996292114258, + "learning_rate": 0.0003587118228756454, + "epoch": 1.92 + }, + { + "loss": 7.2572, + "grad_norm": 1.2917609214782715, + "learning_rate": 0.00035783670254660015, + "epoch": 1.93 + }, + { + "loss": 7.6433, + "grad_norm": 1.1689330339431763, + "learning_rate": 0.0003569615822175549, + "epoch": 1.93 + }, + { + "loss": 7.6579, + "grad_norm": 1.2844352722167969, + "learning_rate": 0.0003560864618885097, + "epoch": 1.93 + }, + { + "loss": 7.5178, + "grad_norm": 1.498838186264038, + "learning_rate": 0.00035521134155946444, + "epoch": 1.93 + }, + { + "loss": 7.0155, + "grad_norm": 1.3718552589416504, + "learning_rate": 0.0003543362212304192, + "epoch": 1.94 + }, + { + "loss": 7.7558, + "grad_norm": 1.2343835830688477, + "learning_rate": 0.00035346110090137396, + "epoch": 1.94 + }, + { + "loss": 7.4386, + "grad_norm": 1.307979702949524, + "learning_rate": 0.0003525859805723287, + "epoch": 1.94 + }, + { + "loss": 7.4287, + "grad_norm": 1.46335768699646, + "learning_rate": 0.0003517108602432835, + "epoch": 1.95 + }, + { + "loss": 7.0541, + "grad_norm": 1.4892301559448242, + "learning_rate": 0.00035083573991423825, + "epoch": 1.95 + }, + { + "loss": 7.6458, + "grad_norm": 1.3297821283340454, + "learning_rate": 0.000349960619585193, + "epoch": 1.95 + }, + { + "loss": 7.3704, + "grad_norm": 1.9190036058425903, + "learning_rate": 0.0003490854992561477, + "epoch": 1.95 + }, + { + "loss": 7.9292, + "grad_norm": 1.1013009548187256, + "learning_rate": 0.0003482103789271025, + "epoch": 1.96 + }, + { + "loss": 7.8039, + "grad_norm": 1.284121036529541, + "learning_rate": 0.00034733525859805724, + "epoch": 1.96 + }, + { + "loss": 7.7188, + "grad_norm": 1.118995189666748, + "learning_rate": 0.000346460138269012, + "epoch": 1.96 + }, + { + "loss": 7.3617, + "grad_norm": 1.5446746349334717, + "learning_rate": 0.00034558501793996676, + "epoch": 1.96 + }, + { + "loss": 7.5614, + "grad_norm": 1.254835844039917, + "learning_rate": 0.0003447098976109215, + "epoch": 1.97 + }, + { + "loss": 7.9923, + "grad_norm": 2.215224266052246, + "learning_rate": 0.0003438347772818763, + "epoch": 1.97 + }, + { + "loss": 7.6609, + "grad_norm": 1.2917975187301636, + "learning_rate": 0.00034295965695283105, + "epoch": 1.97 + }, + { + "loss": 6.9695, + "grad_norm": 1.3251945972442627, + "learning_rate": 0.0003420845366237858, + "epoch": 1.97 + }, + { + "loss": 7.4109, + "grad_norm": 1.5397628545761108, + "learning_rate": 0.0003412094162947406, + "epoch": 1.98 + }, + { + "loss": 7.3063, + "grad_norm": 1.1789202690124512, + "learning_rate": 0.00034033429596569534, + "epoch": 1.98 + }, + { + "loss": 7.8137, + "grad_norm": 1.6068191528320312, + "learning_rate": 0.00033945917563665004, + "epoch": 1.98 + }, + { + "loss": 7.5466, + "grad_norm": 1.2397950887680054, + "learning_rate": 0.00033858405530760475, + "epoch": 1.98 + }, + { + "loss": 7.9522, + "grad_norm": 1.5175119638442993, + "learning_rate": 0.0003377089349785595, + "epoch": 1.99 + }, + { + "loss": 7.6781, + "grad_norm": 1.315258502960205, + "learning_rate": 0.0003368338146495143, + "epoch": 1.99 + }, + { + "loss": 7.7292, + "grad_norm": 2.664515256881714, + "learning_rate": 0.00033595869432046904, + "epoch": 1.99 + }, + { + "loss": 8.1965, + "grad_norm": 1.405129313468933, + "learning_rate": 0.0003350835739914238, + "epoch": 1.99 + }, + { + "loss": 7.4133, + "grad_norm": 1.0774602890014648, + "learning_rate": 0.00033420845366237856, + "epoch": 2.0 + }, + { + "loss": 8.1777, + "grad_norm": 1.75553560256958, + "learning_rate": 0.0003333333333333333, + "epoch": 2.0 + }, + { + "loss": 7.5693, + "grad_norm": 1.857081651687622, + "learning_rate": 0.0003324582130042881, + "epoch": 2.0 + }, + { + "loss": 7.4888, + "grad_norm": 1.0721529722213745, + "learning_rate": 0.00033158309267524285, + "epoch": 2.01 + }, + { + "loss": 7.1311, + "grad_norm": 1.0766797065734863, + "learning_rate": 0.0003307079723461976, + "epoch": 2.01 + }, + { + "loss": 7.5107, + "grad_norm": 1.4615150690078735, + "learning_rate": 0.00032983285201715237, + "epoch": 2.01 + }, + { + "loss": 7.5258, + "grad_norm": 1.4252068996429443, + "learning_rate": 0.00032895773168810713, + "epoch": 2.01 + }, + { + "loss": 7.6049, + "grad_norm": 1.2926585674285889, + "learning_rate": 0.0003280826113590619, + "epoch": 2.02 + }, + { + "loss": 7.2436, + "grad_norm": 1.6630724668502808, + "learning_rate": 0.0003272074910300166, + "epoch": 2.02 + }, + { + "loss": 6.951, + "grad_norm": 1.2705895900726318, + "learning_rate": 0.00032633237070097136, + "epoch": 2.02 + }, + { + "loss": 7.4782, + "grad_norm": 1.6801918745040894, + "learning_rate": 0.0003254572503719261, + "epoch": 2.02 + }, + { + "loss": 7.7247, + "grad_norm": 1.2789455652236938, + "learning_rate": 0.0003245821300428809, + "epoch": 2.03 + }, + { + "loss": 7.65, + "grad_norm": 1.0772324800491333, + "learning_rate": 0.00032370700971383565, + "epoch": 2.03 + }, + { + "loss": 7.3484, + "grad_norm": 1.218855857849121, + "learning_rate": 0.0003228318893847904, + "epoch": 2.03 + }, + { + "loss": 7.7201, + "grad_norm": 1.7484831809997559, + "learning_rate": 0.00032195676905574517, + "epoch": 2.03 + }, + { + "loss": 7.606, + "grad_norm": 1.4081809520721436, + "learning_rate": 0.00032108164872669993, + "epoch": 2.04 + }, + { + "loss": 7.4735, + "grad_norm": 1.2214211225509644, + "learning_rate": 0.0003202065283976547, + "epoch": 2.04 + }, + { + "loss": 7.3052, + "grad_norm": 2.243197441101074, + "learning_rate": 0.00031933140806860946, + "epoch": 2.04 + }, + { + "loss": 7.2611, + "grad_norm": 1.0560696125030518, + "learning_rate": 0.0003184562877395642, + "epoch": 2.04 + }, + { + "loss": 7.3347, + "grad_norm": 1.3903985023498535, + "learning_rate": 0.000317581167410519, + "epoch": 2.05 + }, + { + "loss": 7.4106, + "grad_norm": 1.285888910293579, + "learning_rate": 0.0003167060470814737, + "epoch": 2.05 + }, + { + "loss": 7.3237, + "grad_norm": 1.6455745697021484, + "learning_rate": 0.00031583092675242845, + "epoch": 2.05 + }, + { + "loss": 7.4445, + "grad_norm": 1.3552714586257935, + "learning_rate": 0.0003149558064233832, + "epoch": 2.06 + }, + { + "loss": 7.3175, + "grad_norm": 1.4250375032424927, + "learning_rate": 0.000314080686094338, + "epoch": 2.06 + }, + { + "loss": 7.5334, + "grad_norm": 1.8445017337799072, + "learning_rate": 0.00031320556576529274, + "epoch": 2.06 + }, + { + "loss": 7.7627, + "grad_norm": 1.1116868257522583, + "learning_rate": 0.0003123304454362475, + "epoch": 2.06 + }, + { + "loss": 7.5347, + "grad_norm": 1.1636768579483032, + "learning_rate": 0.00031145532510720226, + "epoch": 2.07 + }, + { + "loss": 7.6581, + "grad_norm": 1.4612860679626465, + "learning_rate": 0.000310580204778157, + "epoch": 2.07 + }, + { + "loss": 7.6164, + "grad_norm": 1.4403191804885864, + "learning_rate": 0.0003097050844491118, + "epoch": 2.07 + }, + { + "loss": 7.3776, + "grad_norm": 1.366955041885376, + "learning_rate": 0.00030882996412006655, + "epoch": 2.07 + }, + { + "loss": 7.556, + "grad_norm": 1.4476971626281738, + "learning_rate": 0.0003079548437910213, + "epoch": 2.08 + }, + { + "loss": 7.6019, + "grad_norm": 1.4753084182739258, + "learning_rate": 0.00030707972346197607, + "epoch": 2.08 + }, + { + "loss": 7.8493, + "grad_norm": 1.2335758209228516, + "learning_rate": 0.00030620460313293083, + "epoch": 2.08 + }, + { + "loss": 7.9252, + "grad_norm": 1.3958989381790161, + "learning_rate": 0.00030532948280388554, + "epoch": 2.08 + }, + { + "loss": 7.2945, + "grad_norm": 1.4621672630310059, + "learning_rate": 0.0003044543624748403, + "epoch": 2.09 + }, + { + "loss": 7.3977, + "grad_norm": 1.428195834159851, + "learning_rate": 0.00030357924214579506, + "epoch": 2.09 + }, + { + "loss": 7.74, + "grad_norm": 1.363600492477417, + "learning_rate": 0.0003027041218167498, + "epoch": 2.09 + }, + { + "loss": 7.4894, + "grad_norm": 1.2117736339569092, + "learning_rate": 0.0003018290014877046, + "epoch": 2.09 + }, + { + "loss": 7.5678, + "grad_norm": 1.9844530820846558, + "learning_rate": 0.00030095388115865935, + "epoch": 2.1 + }, + { + "loss": 7.6681, + "grad_norm": 1.3558523654937744, + "learning_rate": 0.0003000787608296141, + "epoch": 2.1 + }, + { + "loss": 7.9793, + "grad_norm": 1.3802049160003662, + "learning_rate": 0.0002992036405005688, + "epoch": 2.1 + }, + { + "loss": 8.1848, + "grad_norm": 1.845702886581421, + "learning_rate": 0.0002983285201715236, + "epoch": 2.11 + }, + { + "loss": 7.2184, + "grad_norm": 1.4479707479476929, + "learning_rate": 0.00029745339984247834, + "epoch": 2.11 + }, + { + "loss": 7.4373, + "grad_norm": 1.9233028888702393, + "learning_rate": 0.0002965782795134331, + "epoch": 2.11 + }, + { + "loss": 7.2478, + "grad_norm": 1.3621513843536377, + "learning_rate": 0.00029570315918438786, + "epoch": 2.11 + }, + { + "loss": 7.5867, + "grad_norm": 1.449763298034668, + "learning_rate": 0.00029482803885534257, + "epoch": 2.12 + }, + { + "loss": 7.2909, + "grad_norm": 1.543834924697876, + "learning_rate": 0.00029395291852629733, + "epoch": 2.12 + }, + { + "loss": 7.5481, + "grad_norm": 1.2582162618637085, + "learning_rate": 0.0002930777981972521, + "epoch": 2.12 + }, + { + "loss": 7.2092, + "grad_norm": 1.25532865524292, + "learning_rate": 0.00029220267786820686, + "epoch": 2.12 + }, + { + "loss": 7.5117, + "grad_norm": 1.4368300437927246, + "learning_rate": 0.0002913275575391616, + "epoch": 2.13 + }, + { + "loss": 7.8661, + "grad_norm": 1.4054632186889648, + "learning_rate": 0.0002904524372101164, + "epoch": 2.13 + }, + { + "loss": 7.7641, + "grad_norm": 1.4426825046539307, + "learning_rate": 0.00028957731688107114, + "epoch": 2.13 + }, + { + "loss": 6.9808, + "grad_norm": 1.6069836616516113, + "learning_rate": 0.0002887021965520259, + "epoch": 2.13 + }, + { + "loss": 8.0412, + "grad_norm": 1.603289246559143, + "learning_rate": 0.00028782707622298067, + "epoch": 2.14 + }, + { + "loss": 7.7541, + "grad_norm": 1.2069703340530396, + "learning_rate": 0.00028695195589393543, + "epoch": 2.14 + }, + { + "loss": 7.5413, + "grad_norm": 1.2976186275482178, + "learning_rate": 0.0002860768355648902, + "epoch": 2.14 + }, + { + "loss": 7.6833, + "grad_norm": 1.4646226167678833, + "learning_rate": 0.00028520171523584495, + "epoch": 2.14 + }, + { + "loss": 7.3603, + "grad_norm": 1.3783011436462402, + "learning_rate": 0.00028432659490679966, + "epoch": 2.15 + }, + { + "loss": 7.1131, + "grad_norm": 1.1677837371826172, + "learning_rate": 0.0002834514745777544, + "epoch": 2.15 + }, + { + "loss": 7.8353, + "grad_norm": 1.5966696739196777, + "learning_rate": 0.0002825763542487092, + "epoch": 2.15 + }, + { + "loss": 7.651, + "grad_norm": 1.3074275255203247, + "learning_rate": 0.00028170123391966394, + "epoch": 2.16 + }, + { + "loss": 6.8535, + "grad_norm": 1.2238943576812744, + "learning_rate": 0.0002808261135906187, + "epoch": 2.16 + }, + { + "loss": 7.1677, + "grad_norm": 1.2107079029083252, + "learning_rate": 0.00027995099326157347, + "epoch": 2.16 + }, + { + "loss": 7.1232, + "grad_norm": 1.482686996459961, + "learning_rate": 0.00027907587293252823, + "epoch": 2.16 + }, + { + "loss": 7.6958, + "grad_norm": 1.9235337972640991, + "learning_rate": 0.000278200752603483, + "epoch": 2.17 + }, + { + "loss": 7.5763, + "grad_norm": 1.0629470348358154, + "learning_rate": 0.00027732563227443775, + "epoch": 2.17 + }, + { + "loss": 7.417, + "grad_norm": 1.4404977560043335, + "learning_rate": 0.0002764505119453925, + "epoch": 2.17 + }, + { + "loss": 7.4457, + "grad_norm": 1.6266590356826782, + "learning_rate": 0.0002755753916163473, + "epoch": 2.17 + }, + { + "loss": 7.6768, + "grad_norm": 1.4418647289276123, + "learning_rate": 0.00027470027128730204, + "epoch": 2.18 + }, + { + "loss": 7.7301, + "grad_norm": 1.7269823551177979, + "learning_rate": 0.0002738251509582568, + "epoch": 2.18 + }, + { + "loss": 7.1704, + "grad_norm": 1.9527968168258667, + "learning_rate": 0.0002729500306292115, + "epoch": 2.18 + }, + { + "loss": 8.0284, + "grad_norm": 1.1195765733718872, + "learning_rate": 0.00027207491030016627, + "epoch": 2.18 + }, + { + "loss": 7.876, + "grad_norm": 1.381032109260559, + "learning_rate": 0.00027119978997112103, + "epoch": 2.19 + }, + { + "loss": 7.4609, + "grad_norm": 2.2558112144470215, + "learning_rate": 0.0002703246696420758, + "epoch": 2.19 + }, + { + "loss": 7.524, + "grad_norm": 1.0892398357391357, + "learning_rate": 0.00026944954931303056, + "epoch": 2.19 + }, + { + "loss": 7.1756, + "grad_norm": 1.432793140411377, + "learning_rate": 0.0002685744289839853, + "epoch": 2.19 + }, + { + "loss": 7.4677, + "grad_norm": 2.4381473064422607, + "learning_rate": 0.0002676993086549401, + "epoch": 2.2 + }, + { + "loss": 7.2004, + "grad_norm": 1.0947704315185547, + "learning_rate": 0.00026682418832589484, + "epoch": 2.2 + }, + { + "loss": 7.6084, + "grad_norm": 1.1396403312683105, + "learning_rate": 0.0002659490679968496, + "epoch": 2.2 + }, + { + "loss": 7.4592, + "grad_norm": 1.7132469415664673, + "learning_rate": 0.00026507394766780437, + "epoch": 2.2 + }, + { + "loss": 7.6666, + "grad_norm": 1.507416844367981, + "learning_rate": 0.00026419882733875913, + "epoch": 2.21 + }, + { + "loss": 7.9483, + "grad_norm": 1.997502326965332, + "learning_rate": 0.0002633237070097139, + "epoch": 2.21 + }, + { + "loss": 6.8979, + "grad_norm": 1.180274486541748, + "learning_rate": 0.0002624485866806686, + "epoch": 2.21 + }, + { + "loss": 7.5387, + "grad_norm": 1.4130629301071167, + "learning_rate": 0.00026157346635162336, + "epoch": 2.22 + }, + { + "loss": 7.7374, + "grad_norm": 1.9466407299041748, + "learning_rate": 0.0002606983460225781, + "epoch": 2.22 + }, + { + "loss": 7.2489, + "grad_norm": 1.2844946384429932, + "learning_rate": 0.00025982322569353283, + "epoch": 2.22 + }, + { + "loss": 7.2583, + "grad_norm": 1.4728493690490723, + "learning_rate": 0.0002589481053644876, + "epoch": 2.22 + }, + { + "loss": 7.1689, + "grad_norm": 1.505767583847046, + "learning_rate": 0.00025807298503544235, + "epoch": 2.23 + }, + { + "loss": 7.3824, + "grad_norm": 1.164609432220459, + "learning_rate": 0.0002571978647063971, + "epoch": 2.23 + }, + { + "loss": 8.208, + "grad_norm": 1.3337666988372803, + "learning_rate": 0.0002563227443773519, + "epoch": 2.23 + }, + { + "loss": 7.1503, + "grad_norm": 1.2840052843093872, + "learning_rate": 0.00025544762404830664, + "epoch": 2.23 + }, + { + "loss": 7.7838, + "grad_norm": 1.6767994165420532, + "learning_rate": 0.0002545725037192614, + "epoch": 2.24 + }, + { + "loss": 7.4818, + "grad_norm": 1.2790688276290894, + "learning_rate": 0.00025369738339021616, + "epoch": 2.24 + }, + { + "loss": 7.1404, + "grad_norm": 1.9306037425994873, + "learning_rate": 0.0002528222630611709, + "epoch": 2.24 + }, + { + "loss": 6.9151, + "grad_norm": 1.0568101406097412, + "learning_rate": 0.00025194714273212563, + "epoch": 2.24 + }, + { + "loss": 7.5813, + "grad_norm": 1.8494940996170044, + "learning_rate": 0.0002510720224030804, + "epoch": 2.25 + }, + { + "loss": 7.1433, + "grad_norm": 1.2321641445159912, + "learning_rate": 0.00025019690207403515, + "epoch": 2.25 + }, + { + "loss": 7.0211, + "grad_norm": 1.5231260061264038, + "learning_rate": 0.0002493217817449899, + "epoch": 2.25 + }, + { + "loss": 7.5108, + "grad_norm": 1.6787548065185547, + "learning_rate": 0.0002484466614159447, + "epoch": 2.25 + }, + { + "loss": 7.5859, + "grad_norm": 1.8862128257751465, + "learning_rate": 0.00024757154108689944, + "epoch": 2.26 + }, + { + "loss": 7.0871, + "grad_norm": 1.5295615196228027, + "learning_rate": 0.0002466964207578542, + "epoch": 2.26 + }, + { + "loss": 7.2151, + "grad_norm": 1.6439179182052612, + "learning_rate": 0.00024582130042880896, + "epoch": 2.26 + }, + { + "loss": 7.851, + "grad_norm": 1.5902001857757568, + "learning_rate": 0.0002449461800997637, + "epoch": 2.27 + }, + { + "loss": 7.695, + "grad_norm": 1.447240948677063, + "learning_rate": 0.00024407105977071846, + "epoch": 2.27 + }, + { + "loss": 7.218, + "grad_norm": 1.7448298931121826, + "learning_rate": 0.00024319593944167322, + "epoch": 2.27 + }, + { + "loss": 7.4559, + "grad_norm": 1.7815390825271606, + "learning_rate": 0.00024232081911262798, + "epoch": 2.27 + }, + { + "loss": 7.5519, + "grad_norm": 1.746805191040039, + "learning_rate": 0.00024144569878358275, + "epoch": 2.28 + }, + { + "loss": 7.4818, + "grad_norm": 1.771155834197998, + "learning_rate": 0.0002405705784545375, + "epoch": 2.28 + }, + { + "loss": 7.8775, + "grad_norm": 1.2886364459991455, + "learning_rate": 0.00023969545812549227, + "epoch": 2.28 + }, + { + "loss": 7.0862, + "grad_norm": 1.3562748432159424, + "learning_rate": 0.00023882033779644703, + "epoch": 2.28 + }, + { + "loss": 7.4458, + "grad_norm": 1.5549288988113403, + "learning_rate": 0.00023794521746740177, + "epoch": 2.29 + }, + { + "loss": 7.5017, + "grad_norm": 1.3231199979782104, + "learning_rate": 0.00023707009713835653, + "epoch": 2.29 + }, + { + "loss": 6.9317, + "grad_norm": 1.0973995923995972, + "learning_rate": 0.0002361949768093113, + "epoch": 2.29 + }, + { + "loss": 7.2512, + "grad_norm": 1.161665916442871, + "learning_rate": 0.00023531985648026605, + "epoch": 2.29 + }, + { + "loss": 7.3376, + "grad_norm": 1.1249802112579346, + "learning_rate": 0.0002344447361512208, + "epoch": 2.3 + }, + { + "loss": 7.6856, + "grad_norm": 1.4549752473831177, + "learning_rate": 0.00023356961582217557, + "epoch": 2.3 + }, + { + "loss": 7.6518, + "grad_norm": 1.2443310022354126, + "learning_rate": 0.0002326944954931303, + "epoch": 2.3 + }, + { + "loss": 7.9287, + "grad_norm": 1.2414274215698242, + "learning_rate": 0.00023181937516408507, + "epoch": 2.3 + }, + { + "loss": 7.4844, + "grad_norm": 1.250632882118225, + "learning_rate": 0.00023094425483503983, + "epoch": 2.31 + }, + { + "loss": 6.9439, + "grad_norm": 1.5678353309631348, + "learning_rate": 0.0002300691345059946, + "epoch": 2.31 + }, + { + "loss": 7.2214, + "grad_norm": 1.2777363061904907, + "learning_rate": 0.00022919401417694933, + "epoch": 2.31 + }, + { + "loss": 7.6909, + "grad_norm": 1.1702243089675903, + "learning_rate": 0.0002283188938479041, + "epoch": 2.32 + }, + { + "loss": 7.843, + "grad_norm": 1.1647387742996216, + "learning_rate": 0.00022744377351885883, + "epoch": 2.32 + }, + { + "loss": 7.5598, + "grad_norm": 1.5888360738754272, + "learning_rate": 0.0002265686531898136, + "epoch": 2.32 + }, + { + "loss": 7.4084, + "grad_norm": 1.2132010459899902, + "learning_rate": 0.00022569353286076835, + "epoch": 2.32 + }, + { + "loss": 8.0077, + "grad_norm": 1.3676106929779053, + "learning_rate": 0.0002248184125317231, + "epoch": 2.33 + }, + { + "loss": 7.4475, + "grad_norm": 1.4785172939300537, + "learning_rate": 0.00022394329220267787, + "epoch": 2.33 + }, + { + "loss": 7.4934, + "grad_norm": 1.6854803562164307, + "learning_rate": 0.00022306817187363264, + "epoch": 2.33 + }, + { + "loss": 7.5371, + "grad_norm": 1.3336540460586548, + "learning_rate": 0.00022219305154458737, + "epoch": 2.33 + }, + { + "loss": 7.091, + "grad_norm": 1.5374839305877686, + "learning_rate": 0.00022131793121554213, + "epoch": 2.34 + }, + { + "loss": 7.5715, + "grad_norm": 1.259857177734375, + "learning_rate": 0.0002204428108864969, + "epoch": 2.34 + }, + { + "loss": 7.5012, + "grad_norm": 1.435889482498169, + "learning_rate": 0.00021956769055745166, + "epoch": 2.34 + }, + { + "loss": 7.5925, + "grad_norm": 1.6067544221878052, + "learning_rate": 0.00021869257022840642, + "epoch": 2.34 + }, + { + "loss": 7.2756, + "grad_norm": 1.2057377099990845, + "learning_rate": 0.00021781744989936118, + "epoch": 2.35 + }, + { + "loss": 7.0737, + "grad_norm": 1.0249065160751343, + "learning_rate": 0.00021694232957031591, + "epoch": 2.35 + }, + { + "loss": 7.2857, + "grad_norm": 1.1336891651153564, + "learning_rate": 0.00021606720924127068, + "epoch": 2.35 + }, + { + "loss": 7.0709, + "grad_norm": 1.1853156089782715, + "learning_rate": 0.00021519208891222544, + "epoch": 2.35 + }, + { + "loss": 6.9118, + "grad_norm": 1.4682341814041138, + "learning_rate": 0.0002143169685831802, + "epoch": 2.36 + }, + { + "loss": 7.3363, + "grad_norm": 1.3039721250534058, + "learning_rate": 0.00021344184825413496, + "epoch": 2.36 + }, + { + "loss": 7.2827, + "grad_norm": 1.28932785987854, + "learning_rate": 0.00021256672792508972, + "epoch": 2.36 + }, + { + "loss": 7.6069, + "grad_norm": 1.7343271970748901, + "learning_rate": 0.00021169160759604448, + "epoch": 2.37 + }, + { + "loss": 7.3543, + "grad_norm": 1.9730132818222046, + "learning_rate": 0.00021081648726699922, + "epoch": 2.37 + }, + { + "loss": 7.3351, + "grad_norm": 2.070822238922119, + "learning_rate": 0.00020994136693795395, + "epoch": 2.37 + }, + { + "loss": 7.3199, + "grad_norm": 1.1327873468399048, + "learning_rate": 0.00020906624660890872, + "epoch": 2.37 + }, + { + "loss": 7.4058, + "grad_norm": 1.3796617984771729, + "learning_rate": 0.00020819112627986348, + "epoch": 2.38 + }, + { + "loss": 7.3027, + "grad_norm": 1.8397942781448364, + "learning_rate": 0.00020731600595081824, + "epoch": 2.38 + }, + { + "loss": 7.6354, + "grad_norm": 1.4503923654556274, + "learning_rate": 0.0002065283976546775, + "epoch": 2.38 + }, + { + "loss": 7.2284, + "grad_norm": 1.550950527191162, + "learning_rate": 0.00020565327732563227, + "epoch": 2.38 + }, + { + "loss": 7.3061, + "grad_norm": 1.5306216478347778, + "learning_rate": 0.00020477815699658703, + "epoch": 2.39 + }, + { + "loss": 7.3337, + "grad_norm": 1.269167184829712, + "learning_rate": 0.0002039030366675418, + "epoch": 2.39 + }, + { + "loss": 7.7686, + "grad_norm": 1.600019931793213, + "learning_rate": 0.00020302791633849656, + "epoch": 2.39 + }, + { + "loss": 7.35, + "grad_norm": 1.5773662328720093, + "learning_rate": 0.0002021527960094513, + "epoch": 2.39 + }, + { + "loss": 7.3691, + "grad_norm": 1.547160029411316, + "learning_rate": 0.00020127767568040605, + "epoch": 2.4 + }, + { + "loss": 7.4863, + "grad_norm": 1.4968856573104858, + "learning_rate": 0.00020040255535136081, + "epoch": 2.4 + }, + { + "loss": 7.9482, + "grad_norm": 1.2087891101837158, + "learning_rate": 0.00019952743502231558, + "epoch": 2.4 + }, + { + "loss": 7.0255, + "grad_norm": 1.290597677230835, + "learning_rate": 0.00019865231469327034, + "epoch": 2.4 + }, + { + "loss": 7.178, + "grad_norm": 1.5743247270584106, + "learning_rate": 0.0001977771943642251, + "epoch": 2.41 + }, + { + "loss": 7.6474, + "grad_norm": 1.5197412967681885, + "learning_rate": 0.00019690207403517984, + "epoch": 2.41 + }, + { + "loss": 7.3527, + "grad_norm": 1.4716495275497437, + "learning_rate": 0.0001960269537061346, + "epoch": 2.41 + }, + { + "loss": 7.6313, + "grad_norm": 1.9746785163879395, + "learning_rate": 0.00019515183337708936, + "epoch": 2.41 + }, + { + "loss": 7.6972, + "grad_norm": 1.2683417797088623, + "learning_rate": 0.00019427671304804412, + "epoch": 2.42 + }, + { + "loss": 7.1378, + "grad_norm": 1.1373748779296875, + "learning_rate": 0.00019340159271899888, + "epoch": 2.42 + }, + { + "loss": 7.0196, + "grad_norm": 1.4191349744796753, + "learning_rate": 0.00019252647238995364, + "epoch": 2.42 + }, + { + "loss": 6.9102, + "grad_norm": 1.6580002307891846, + "learning_rate": 0.00019165135206090838, + "epoch": 2.43 + }, + { + "loss": 7.5105, + "grad_norm": 1.2877469062805176, + "learning_rate": 0.00019077623173186314, + "epoch": 2.43 + }, + { + "loss": 8.0212, + "grad_norm": 1.2933236360549927, + "learning_rate": 0.00018990111140281788, + "epoch": 2.43 + }, + { + "loss": 7.2108, + "grad_norm": 1.6515684127807617, + "learning_rate": 0.00018902599107377264, + "epoch": 2.43 + }, + { + "loss": 7.2944, + "grad_norm": 1.443547010421753, + "learning_rate": 0.0001881508707447274, + "epoch": 2.44 + }, + { + "loss": 6.9623, + "grad_norm": 1.5022013187408447, + "learning_rate": 0.00018727575041568216, + "epoch": 2.44 + }, + { + "loss": 7.5751, + "grad_norm": 1.639228343963623, + "learning_rate": 0.0001864006300866369, + "epoch": 2.44 + }, + { + "loss": 7.6183, + "grad_norm": 1.3685816526412964, + "learning_rate": 0.00018552550975759166, + "epoch": 2.44 + }, + { + "loss": 7.7862, + "grad_norm": 1.4008909463882446, + "learning_rate": 0.00018465038942854642, + "epoch": 2.45 + }, + { + "loss": 7.3036, + "grad_norm": 1.4068384170532227, + "learning_rate": 0.00018377526909950118, + "epoch": 2.45 + }, + { + "loss": 7.3222, + "grad_norm": 1.4874199628829956, + "learning_rate": 0.00018290014877045594, + "epoch": 2.45 + }, + { + "loss": 7.4538, + "grad_norm": 2.161606788635254, + "learning_rate": 0.0001820250284414107, + "epoch": 2.45 + }, + { + "loss": 7.099, + "grad_norm": 1.4761602878570557, + "learning_rate": 0.00018114990811236544, + "epoch": 2.46 + }, + { + "loss": 7.6725, + "grad_norm": 1.3598577976226807, + "learning_rate": 0.0001802747877833202, + "epoch": 2.46 + }, + { + "loss": 7.4651, + "grad_norm": 1.352389931678772, + "learning_rate": 0.00017939966745427496, + "epoch": 2.46 + }, + { + "loss": 7.0266, + "grad_norm": 1.302270770072937, + "learning_rate": 0.00017852454712522973, + "epoch": 2.46 + }, + { + "loss": 7.4879, + "grad_norm": 1.2166621685028076, + "learning_rate": 0.0001776494267961845, + "epoch": 2.47 + }, + { + "loss": 6.7354, + "grad_norm": 1.4442105293273926, + "learning_rate": 0.00017677430646713925, + "epoch": 2.47 + }, + { + "loss": 7.1184, + "grad_norm": 1.6301904916763306, + "learning_rate": 0.000175899186138094, + "epoch": 2.47 + }, + { + "loss": 7.4326, + "grad_norm": 1.2478090524673462, + "learning_rate": 0.00017502406580904875, + "epoch": 2.48 + }, + { + "loss": 7.6185, + "grad_norm": 1.2676613330841064, + "learning_rate": 0.0001741489454800035, + "epoch": 2.48 + }, + { + "loss": 7.439, + "grad_norm": 1.4324458837509155, + "learning_rate": 0.00017327382515095827, + "epoch": 2.48 + }, + { + "loss": 7.7999, + "grad_norm": 1.634446382522583, + "learning_rate": 0.00017239870482191303, + "epoch": 2.48 + }, + { + "loss": 7.3043, + "grad_norm": 1.2877479791641235, + "learning_rate": 0.0001715235844928678, + "epoch": 2.49 + }, + { + "loss": 7.054, + "grad_norm": 1.7003803253173828, + "learning_rate": 0.00017064846416382255, + "epoch": 2.49 + }, + { + "loss": 7.1568, + "grad_norm": 1.8888310194015503, + "learning_rate": 0.00016977334383477726, + "epoch": 2.49 + }, + { + "loss": 7.3495, + "grad_norm": 1.2593083381652832, + "learning_rate": 0.00016889822350573202, + "epoch": 2.49 + }, + { + "loss": 7.4716, + "grad_norm": 1.4410508871078491, + "learning_rate": 0.00016802310317668679, + "epoch": 2.5 + }, + { + "loss": 7.5133, + "grad_norm": 1.20904541015625, + "learning_rate": 0.00016714798284764155, + "epoch": 2.5 + }, + { + "loss": 7.3222, + "grad_norm": 1.4503611326217651, + "learning_rate": 0.0001662728625185963, + "epoch": 2.5 + }, + { + "loss": 7.6387, + "grad_norm": 1.3705183267593384, + "learning_rate": 0.00016539774218955107, + "epoch": 2.5 + }, + { + "loss": 7.0609, + "grad_norm": 1.2106906175613403, + "learning_rate": 0.0001645226218605058, + "epoch": 2.51 + }, + { + "loss": 7.342, + "grad_norm": 1.5564229488372803, + "learning_rate": 0.00016364750153146057, + "epoch": 2.51 + }, + { + "loss": 7.8121, + "grad_norm": 1.6493812799453735, + "learning_rate": 0.00016277238120241533, + "epoch": 2.51 + }, + { + "loss": 7.3909, + "grad_norm": 1.9025623798370361, + "learning_rate": 0.0001618972608733701, + "epoch": 2.51 + }, + { + "loss": 7.0106, + "grad_norm": 1.2934685945510864, + "learning_rate": 0.00016102214054432485, + "epoch": 2.52 + }, + { + "loss": 7.5199, + "grad_norm": 1.2549662590026855, + "learning_rate": 0.00016014702021527962, + "epoch": 2.52 + }, + { + "loss": 7.3509, + "grad_norm": 1.2111480236053467, + "learning_rate": 0.00015927189988623435, + "epoch": 2.52 + }, + { + "loss": 7.5281, + "grad_norm": 2.2498984336853027, + "learning_rate": 0.0001583967795571891, + "epoch": 2.53 + }, + { + "loss": 7.5218, + "grad_norm": 1.4710973501205444, + "learning_rate": 0.00015752165922814387, + "epoch": 2.53 + }, + { + "loss": 7.1575, + "grad_norm": 1.4040391445159912, + "learning_rate": 0.00015664653889909864, + "epoch": 2.53 + }, + { + "loss": 7.3097, + "grad_norm": 2.3657708168029785, + "learning_rate": 0.0001557714185700534, + "epoch": 2.53 + }, + { + "loss": 7.3235, + "grad_norm": 1.8456711769104004, + "learning_rate": 0.00015489629824100816, + "epoch": 2.54 + }, + { + "loss": 7.1772, + "grad_norm": 1.3032398223876953, + "learning_rate": 0.0001540211779119629, + "epoch": 2.54 + }, + { + "loss": 7.331, + "grad_norm": 1.2472988367080688, + "learning_rate": 0.00015314605758291766, + "epoch": 2.54 + }, + { + "loss": 6.9758, + "grad_norm": 1.1861238479614258, + "learning_rate": 0.00015227093725387242, + "epoch": 2.54 + }, + { + "loss": 7.357, + "grad_norm": 1.2937425374984741, + "learning_rate": 0.00015139581692482718, + "epoch": 2.55 + }, + { + "loss": 7.6132, + "grad_norm": 1.5241109132766724, + "learning_rate": 0.00015052069659578194, + "epoch": 2.55 + }, + { + "loss": 7.1769, + "grad_norm": 1.2426915168762207, + "learning_rate": 0.00014964557626673668, + "epoch": 2.55 + }, + { + "loss": 7.2242, + "grad_norm": 1.5336363315582275, + "learning_rate": 0.0001487704559376914, + "epoch": 2.55 + }, + { + "loss": 8.0839, + "grad_norm": 1.6944379806518555, + "learning_rate": 0.00014789533560864617, + "epoch": 2.56 + }, + { + "loss": 7.2667, + "grad_norm": 1.6602429151535034, + "learning_rate": 0.00014702021527960093, + "epoch": 2.56 + }, + { + "loss": 7.4821, + "grad_norm": 1.331986665725708, + "learning_rate": 0.0001461450949505557, + "epoch": 2.56 + }, + { + "loss": 7.4808, + "grad_norm": 1.4923409223556519, + "learning_rate": 0.00014526997462151046, + "epoch": 2.56 + }, + { + "loss": 7.3579, + "grad_norm": 1.5323739051818848, + "learning_rate": 0.00014439485429246522, + "epoch": 2.57 + }, + { + "loss": 7.1833, + "grad_norm": 1.0281411409378052, + "learning_rate": 0.00014351973396341998, + "epoch": 2.57 + }, + { + "loss": 7.521, + "grad_norm": 1.777385950088501, + "learning_rate": 0.00014264461363437472, + "epoch": 2.57 + }, + { + "loss": 7.5531, + "grad_norm": 1.7528423070907593, + "learning_rate": 0.00014176949330532948, + "epoch": 2.58 + }, + { + "loss": 7.3295, + "grad_norm": 1.665503740310669, + "learning_rate": 0.00014089437297628424, + "epoch": 2.58 + }, + { + "loss": 6.9815, + "grad_norm": 1.4323763847351074, + "learning_rate": 0.000140019252647239, + "epoch": 2.58 + }, + { + "loss": 7.7957, + "grad_norm": 1.2623038291931152, + "learning_rate": 0.00013914413231819376, + "epoch": 2.58 + }, + { + "loss": 7.2667, + "grad_norm": 1.3770829439163208, + "learning_rate": 0.00013826901198914853, + "epoch": 2.59 + }, + { + "loss": 7.2641, + "grad_norm": 1.495597243309021, + "learning_rate": 0.00013739389166010326, + "epoch": 2.59 + }, + { + "loss": 7.6276, + "grad_norm": 1.0396783351898193, + "learning_rate": 0.00013651877133105802, + "epoch": 2.59 + }, + { + "loss": 7.4811, + "grad_norm": 1.5590603351593018, + "learning_rate": 0.00013564365100201278, + "epoch": 2.59 + }, + { + "loss": 6.9941, + "grad_norm": 1.266262173652649, + "learning_rate": 0.00013476853067296755, + "epoch": 2.6 + }, + { + "loss": 7.0138, + "grad_norm": 1.3331608772277832, + "learning_rate": 0.0001338934103439223, + "epoch": 2.6 + }, + { + "loss": 7.6792, + "grad_norm": 1.54330575466156, + "learning_rate": 0.00013301829001487707, + "epoch": 2.6 + }, + { + "loss": 7.5151, + "grad_norm": 1.266360878944397, + "learning_rate": 0.0001321431696858318, + "epoch": 2.6 + }, + { + "loss": 7.6357, + "grad_norm": 1.1992617845535278, + "learning_rate": 0.00013126804935678657, + "epoch": 2.61 + }, + { + "loss": 7.6848, + "grad_norm": 1.6269259452819824, + "learning_rate": 0.00013039292902774133, + "epoch": 2.61 + }, + { + "loss": 7.3941, + "grad_norm": 1.4221471548080444, + "learning_rate": 0.00012951780869869606, + "epoch": 2.61 + }, + { + "loss": 7.5638, + "grad_norm": 1.31778085231781, + "learning_rate": 0.00012864268836965082, + "epoch": 2.61 + }, + { + "loss": 7.3716, + "grad_norm": 1.4217979907989502, + "learning_rate": 0.00012776756804060559, + "epoch": 2.62 + }, + { + "loss": 7.7403, + "grad_norm": 1.549012541770935, + "learning_rate": 0.00012689244771156032, + "epoch": 2.62 + }, + { + "loss": 7.5079, + "grad_norm": 1.7808821201324463, + "learning_rate": 0.00012601732738251508, + "epoch": 2.62 + }, + { + "loss": 7.338, + "grad_norm": 1.6030139923095703, + "learning_rate": 0.00012514220705346984, + "epoch": 2.62 + }, + { + "loss": 7.2113, + "grad_norm": 1.688103437423706, + "learning_rate": 0.0001242670867244246, + "epoch": 2.63 + }, + { + "loss": 7.5297, + "grad_norm": 1.4482861757278442, + "learning_rate": 0.00012339196639537937, + "epoch": 2.63 + }, + { + "loss": 7.6226, + "grad_norm": 1.481149435043335, + "learning_rate": 0.00012251684606633413, + "epoch": 2.63 + }, + { + "loss": 7.1199, + "grad_norm": 1.5914816856384277, + "learning_rate": 0.00012164172573728888, + "epoch": 2.64 + }, + { + "loss": 7.5294, + "grad_norm": 1.6436686515808105, + "learning_rate": 0.00012076660540824364, + "epoch": 2.64 + }, + { + "loss": 7.7319, + "grad_norm": 1.422884225845337, + "learning_rate": 0.00011989148507919839, + "epoch": 2.64 + }, + { + "loss": 7.5878, + "grad_norm": 1.2468681335449219, + "learning_rate": 0.00011901636475015315, + "epoch": 2.64 + }, + { + "loss": 7.4093, + "grad_norm": 1.6080206632614136, + "learning_rate": 0.00011814124442110791, + "epoch": 2.65 + }, + { + "loss": 6.927, + "grad_norm": 1.2568819522857666, + "learning_rate": 0.00011726612409206266, + "epoch": 2.65 + }, + { + "loss": 7.524, + "grad_norm": 1.4558569192886353, + "learning_rate": 0.00011639100376301742, + "epoch": 2.65 + }, + { + "loss": 6.7721, + "grad_norm": 1.3554805517196655, + "learning_rate": 0.00011551588343397218, + "epoch": 2.65 + }, + { + "loss": 7.5129, + "grad_norm": 2.061342239379883, + "learning_rate": 0.00011464076310492692, + "epoch": 2.66 + }, + { + "loss": 7.271, + "grad_norm": 1.7581554651260376, + "learning_rate": 0.00011376564277588168, + "epoch": 2.66 + }, + { + "loss": 7.4605, + "grad_norm": 1.3818498849868774, + "learning_rate": 0.00011289052244683644, + "epoch": 2.66 + }, + { + "loss": 7.2747, + "grad_norm": 1.4640157222747803, + "learning_rate": 0.00011201540211779119, + "epoch": 2.66 + }, + { + "loss": 7.4137, + "grad_norm": 1.628440499305725, + "learning_rate": 0.00011114028178874595, + "epoch": 2.67 + }, + { + "loss": 7.1947, + "grad_norm": 2.1291253566741943, + "learning_rate": 0.00011026516145970071, + "epoch": 2.67 + }, + { + "loss": 7.3972, + "grad_norm": 1.53203284740448, + "learning_rate": 0.00010939004113065546, + "epoch": 2.67 + }, + { + "loss": 7.1343, + "grad_norm": 1.7009447813034058, + "learning_rate": 0.00010851492080161022, + "epoch": 2.67 + }, + { + "loss": 7.4999, + "grad_norm": 1.981833815574646, + "learning_rate": 0.00010763980047256499, + "epoch": 2.68 + }, + { + "loss": 7.0649, + "grad_norm": 1.4151135683059692, + "learning_rate": 0.00010676468014351973, + "epoch": 2.68 + }, + { + "loss": 7.4975, + "grad_norm": 1.8214997053146362, + "learning_rate": 0.0001058895598144745, + "epoch": 2.68 + }, + { + "loss": 7.1928, + "grad_norm": 1.475014328956604, + "learning_rate": 0.00010501443948542926, + "epoch": 2.69 + }, + { + "loss": 6.7309, + "grad_norm": 1.500470757484436, + "learning_rate": 0.00010413931915638399, + "epoch": 2.69 + }, + { + "loss": 7.2154, + "grad_norm": 1.0923032760620117, + "learning_rate": 0.00010326419882733875, + "epoch": 2.69 + }, + { + "loss": 7.4584, + "grad_norm": 1.476189136505127, + "learning_rate": 0.00010238907849829352, + "epoch": 2.69 + }, + { + "loss": 7.5696, + "grad_norm": 1.3299099206924438, + "learning_rate": 0.00010151395816924828, + "epoch": 2.7 + }, + { + "loss": 7.4462, + "grad_norm": 1.248026967048645, + "learning_rate": 0.00010063883784020303, + "epoch": 2.7 + }, + { + "loss": 7.057, + "grad_norm": 1.5154845714569092, + "learning_rate": 9.976371751115779e-05, + "epoch": 2.7 + }, + { + "loss": 7.4942, + "grad_norm": 1.504868745803833, + "learning_rate": 9.888859718211255e-05, + "epoch": 2.7 + }, + { + "loss": 7.7042, + "grad_norm": 1.2087482213974, + "learning_rate": 9.80134768530673e-05, + "epoch": 2.71 + }, + { + "loss": 7.7138, + "grad_norm": 2.066254138946533, + "learning_rate": 9.713835652402206e-05, + "epoch": 2.71 + }, + { + "loss": 7.4746, + "grad_norm": 1.2078548669815063, + "learning_rate": 9.626323619497682e-05, + "epoch": 2.71 + }, + { + "loss": 7.5682, + "grad_norm": 1.2530779838562012, + "learning_rate": 9.538811586593157e-05, + "epoch": 2.71 + }, + { + "loss": 7.4491, + "grad_norm": 1.5170719623565674, + "learning_rate": 9.451299553688632e-05, + "epoch": 2.72 + }, + { + "loss": 7.2938, + "grad_norm": 1.2933870553970337, + "learning_rate": 9.363787520784108e-05, + "epoch": 2.72 + }, + { + "loss": 7.1455, + "grad_norm": 1.212755799293518, + "learning_rate": 9.276275487879583e-05, + "epoch": 2.72 + }, + { + "loss": 7.3702, + "grad_norm": 1.4118942022323608, + "learning_rate": 9.188763454975059e-05, + "epoch": 2.72 + }, + { + "loss": 7.1194, + "grad_norm": 1.575276494026184, + "learning_rate": 9.101251422070535e-05, + "epoch": 2.73 + }, + { + "loss": 7.046, + "grad_norm": 1.3244752883911133, + "learning_rate": 9.01373938916601e-05, + "epoch": 2.73 + }, + { + "loss": 6.875, + "grad_norm": 1.369280219078064, + "learning_rate": 8.926227356261486e-05, + "epoch": 2.73 + }, + { + "loss": 7.4045, + "grad_norm": 1.3210042715072632, + "learning_rate": 8.838715323356962e-05, + "epoch": 2.74 + }, + { + "loss": 7.5159, + "grad_norm": 1.4352552890777588, + "learning_rate": 8.751203290452437e-05, + "epoch": 2.74 + }, + { + "loss": 7.2315, + "grad_norm": 1.4860197305679321, + "learning_rate": 8.663691257547913e-05, + "epoch": 2.74 + }, + { + "loss": 6.8597, + "grad_norm": 1.2331523895263672, + "learning_rate": 8.57617922464339e-05, + "epoch": 2.74 + }, + { + "loss": 7.3485, + "grad_norm": 1.2187525033950806, + "learning_rate": 8.488667191738863e-05, + "epoch": 2.75 + }, + { + "loss": 7.388, + "grad_norm": 1.1800241470336914, + "learning_rate": 8.401155158834339e-05, + "epoch": 2.75 + }, + { + "loss": 6.9186, + "grad_norm": 1.3542723655700684, + "learning_rate": 8.313643125929815e-05, + "epoch": 2.75 + }, + { + "loss": 6.9582, + "grad_norm": 1.3839143514633179, + "learning_rate": 8.22613109302529e-05, + "epoch": 2.75 + }, + { + "loss": 7.4176, + "grad_norm": 1.4546840190887451, + "learning_rate": 8.138619060120766e-05, + "epoch": 2.76 + }, + { + "loss": 7.2731, + "grad_norm": 1.3623560667037964, + "learning_rate": 8.051107027216243e-05, + "epoch": 2.76 + }, + { + "loss": 7.1633, + "grad_norm": 1.9331005811691284, + "learning_rate": 7.963594994311717e-05, + "epoch": 2.76 + }, + { + "loss": 6.8972, + "grad_norm": 1.2791029214859009, + "learning_rate": 7.876082961407194e-05, + "epoch": 2.76 + }, + { + "loss": 7.1043, + "grad_norm": 1.6202424764633179, + "learning_rate": 7.78857092850267e-05, + "epoch": 2.77 + }, + { + "loss": 7.0727, + "grad_norm": 1.0835381746292114, + "learning_rate": 7.701058895598145e-05, + "epoch": 2.77 + }, + { + "loss": 7.0958, + "grad_norm": 1.2778371572494507, + "learning_rate": 7.613546862693621e-05, + "epoch": 2.77 + }, + { + "loss": 7.2219, + "grad_norm": 1.9295389652252197, + "learning_rate": 7.526034829789097e-05, + "epoch": 2.77 + }, + { + "loss": 7.0189, + "grad_norm": 1.9394477605819702, + "learning_rate": 7.43852279688457e-05, + "epoch": 2.78 + }, + { + "loss": 7.0144, + "grad_norm": 1.4238934516906738, + "learning_rate": 7.351010763980047e-05, + "epoch": 2.78 + }, + { + "loss": 7.2353, + "grad_norm": 1.350537657737732, + "learning_rate": 7.263498731075523e-05, + "epoch": 2.78 + }, + { + "loss": 6.7353, + "grad_norm": 1.3214153051376343, + "learning_rate": 7.175986698170999e-05, + "epoch": 2.79 + }, + { + "loss": 7.4143, + "grad_norm": 2.469216823577881, + "learning_rate": 7.088474665266474e-05, + "epoch": 2.79 + }, + { + "loss": 7.4276, + "grad_norm": 1.414184808731079, + "learning_rate": 7.00096263236195e-05, + "epoch": 2.79 + }, + { + "loss": 6.9842, + "grad_norm": 1.4708011150360107, + "learning_rate": 6.913450599457426e-05, + "epoch": 2.79 + }, + { + "loss": 7.572, + "grad_norm": 1.449560284614563, + "learning_rate": 6.825938566552901e-05, + "epoch": 2.8 + }, + { + "loss": 7.3449, + "grad_norm": 1.1261264085769653, + "learning_rate": 6.738426533648377e-05, + "epoch": 2.8 + }, + { + "loss": 7.1776, + "grad_norm": 1.5502110719680786, + "learning_rate": 6.650914500743853e-05, + "epoch": 2.8 + }, + { + "loss": 7.0565, + "grad_norm": 1.3916562795639038, + "learning_rate": 6.563402467839328e-05, + "epoch": 2.8 + }, + { + "loss": 7.0882, + "grad_norm": 1.361229658126831, + "learning_rate": 6.475890434934803e-05, + "epoch": 2.81 + }, + { + "loss": 6.981, + "grad_norm": 1.6100305318832397, + "learning_rate": 6.388378402030279e-05, + "epoch": 2.81 + }, + { + "loss": 7.2502, + "grad_norm": 1.5449306964874268, + "learning_rate": 6.300866369125754e-05, + "epoch": 2.81 + }, + { + "loss": 7.4208, + "grad_norm": 1.3188410997390747, + "learning_rate": 6.21335433622123e-05, + "epoch": 2.81 + }, + { + "loss": 7.2957, + "grad_norm": 1.543289303779602, + "learning_rate": 6.125842303316706e-05, + "epoch": 2.82 + }, + { + "loss": 7.0319, + "grad_norm": 1.1590594053268433, + "learning_rate": 6.038330270412182e-05, + "epoch": 2.82 + }, + { + "loss": 7.23, + "grad_norm": 1.1623939275741577, + "learning_rate": 5.9508182375076575e-05, + "epoch": 2.82 + }, + { + "loss": 7.1254, + "grad_norm": 1.6204333305358887, + "learning_rate": 5.863306204603133e-05, + "epoch": 2.82 + }, + { + "loss": 7.4319, + "grad_norm": 1.5845638513565063, + "learning_rate": 5.775794171698609e-05, + "epoch": 2.83 + }, + { + "loss": 7.4574, + "grad_norm": 1.3281787633895874, + "learning_rate": 5.688282138794084e-05, + "epoch": 2.83 + }, + { + "loss": 6.8629, + "grad_norm": 1.6502999067306519, + "learning_rate": 5.6007701058895595e-05, + "epoch": 2.83 + }, + { + "loss": 7.1493, + "grad_norm": 1.7768168449401855, + "learning_rate": 5.513258072985036e-05, + "epoch": 2.83 + }, + { + "loss": 7.1971, + "grad_norm": 1.1763763427734375, + "learning_rate": 5.425746040080511e-05, + "epoch": 2.84 + }, + { + "loss": 7.4182, + "grad_norm": 1.4033911228179932, + "learning_rate": 5.338234007175987e-05, + "epoch": 2.84 + }, + { + "loss": 6.8175, + "grad_norm": 1.5407586097717285, + "learning_rate": 5.250721974271463e-05, + "epoch": 2.84 + }, + { + "loss": 7.5091, + "grad_norm": 1.5829062461853027, + "learning_rate": 5.163209941366938e-05, + "epoch": 2.85 + }, + { + "loss": 7.0728, + "grad_norm": 1.3185957670211792, + "learning_rate": 5.075697908462414e-05, + "epoch": 2.85 + }, + { + "loss": 7.1931, + "grad_norm": 1.1996837854385376, + "learning_rate": 4.9881858755578894e-05, + "epoch": 2.85 + }, + { + "loss": 7.2327, + "grad_norm": 1.6188883781433105, + "learning_rate": 4.900673842653365e-05, + "epoch": 2.85 + }, + { + "loss": 7.2432, + "grad_norm": 1.7829197645187378, + "learning_rate": 4.813161809748841e-05, + "epoch": 2.86 + }, + { + "loss": 6.8231, + "grad_norm": 1.3998175859451294, + "learning_rate": 4.725649776844316e-05, + "epoch": 2.86 + }, + { + "loss": 7.5838, + "grad_norm": 1.6664845943450928, + "learning_rate": 4.6381377439397914e-05, + "epoch": 2.86 + }, + { + "loss": 7.3804, + "grad_norm": 1.2328096628189087, + "learning_rate": 4.5506257110352676e-05, + "epoch": 2.86 + }, + { + "loss": 7.1497, + "grad_norm": 1.5543657541275024, + "learning_rate": 4.463113678130743e-05, + "epoch": 2.87 + }, + { + "loss": 7.5067, + "grad_norm": 2.0711114406585693, + "learning_rate": 4.3756016452262186e-05, + "epoch": 2.87 + }, + { + "loss": 7.1481, + "grad_norm": 2.340829372406006, + "learning_rate": 4.288089612321695e-05, + "epoch": 2.87 + }, + { + "loss": 7.2767, + "grad_norm": 1.3014119863510132, + "learning_rate": 4.2005775794171696e-05, + "epoch": 2.87 + }, + { + "loss": 7.2583, + "grad_norm": 1.186070442199707, + "learning_rate": 4.113065546512645e-05, + "epoch": 2.88 + }, + { + "loss": 7.7179, + "grad_norm": 1.4286901950836182, + "learning_rate": 4.025553513608121e-05, + "epoch": 2.88 + }, + { + "loss": 6.9271, + "grad_norm": 1.561988115310669, + "learning_rate": 3.938041480703597e-05, + "epoch": 2.88 + }, + { + "loss": 6.9378, + "grad_norm": 1.2756584882736206, + "learning_rate": 3.8505294477990723e-05, + "epoch": 2.88 + }, + { + "loss": 7.8091, + "grad_norm": 1.5452569723129272, + "learning_rate": 3.7630174148945485e-05, + "epoch": 2.89 + }, + { + "loss": 6.7905, + "grad_norm": 1.2616968154907227, + "learning_rate": 3.6755053819900234e-05, + "epoch": 2.89 + }, + { + "loss": 7.3958, + "grad_norm": 1.1684807538986206, + "learning_rate": 3.5879933490854995e-05, + "epoch": 2.89 + }, + { + "loss": 6.9238, + "grad_norm": 1.351366639137268, + "learning_rate": 3.500481316180975e-05, + "epoch": 2.9 + }, + { + "loss": 7.4026, + "grad_norm": 1.2473573684692383, + "learning_rate": 3.4129692832764505e-05, + "epoch": 2.9 + }, + { + "loss": 7.4247, + "grad_norm": 1.5123474597930908, + "learning_rate": 3.325457250371927e-05, + "epoch": 2.9 + }, + { + "loss": 7.0967, + "grad_norm": 1.1452938318252563, + "learning_rate": 3.2379452174674016e-05, + "epoch": 2.9 + }, + { + "loss": 7.0357, + "grad_norm": 1.1505627632141113, + "learning_rate": 3.150433184562877e-05, + "epoch": 2.91 + }, + { + "loss": 7.4973, + "grad_norm": 1.438091516494751, + "learning_rate": 3.062921151658353e-05, + "epoch": 2.91 + }, + { + "loss": 7.4715, + "grad_norm": 1.1489310264587402, + "learning_rate": 2.9754091187538288e-05, + "epoch": 2.91 + }, + { + "loss": 7.0076, + "grad_norm": 1.3423534631729126, + "learning_rate": 2.8878970858493046e-05, + "epoch": 2.91 + }, + { + "loss": 7.0935, + "grad_norm": 1.2484374046325684, + "learning_rate": 2.8003850529447798e-05, + "epoch": 2.92 + }, + { + "loss": 7.1792, + "grad_norm": 1.310231328010559, + "learning_rate": 2.7128730200402556e-05, + "epoch": 2.92 + }, + { + "loss": 7.3469, + "grad_norm": 1.417974591255188, + "learning_rate": 2.6253609871357314e-05, + "epoch": 2.92 + }, + { + "loss": 7.2473, + "grad_norm": 1.3878840208053589, + "learning_rate": 2.537848954231207e-05, + "epoch": 2.92 + }, + { + "loss": 7.1321, + "grad_norm": 1.6403028964996338, + "learning_rate": 2.459088124617135e-05, + "epoch": 2.93 + }, + { + "loss": 7.6076, + "grad_norm": 1.2110294103622437, + "learning_rate": 2.3715760917126104e-05, + "epoch": 2.93 + }, + { + "loss": 7.3466, + "grad_norm": 1.203755497932434, + "learning_rate": 2.2840640588080863e-05, + "epoch": 2.93 + }, + { + "loss": 7.4367, + "grad_norm": 1.2081892490386963, + "learning_rate": 2.1965520259035618e-05, + "epoch": 2.93 + }, + { + "loss": 7.6191, + "grad_norm": 1.2515225410461426, + "learning_rate": 2.1090399929990373e-05, + "epoch": 2.94 + }, + { + "loss": 7.2915, + "grad_norm": 1.2461618185043335, + "learning_rate": 2.021527960094513e-05, + "epoch": 2.94 + }, + { + "loss": 7.0825, + "grad_norm": 1.3424855470657349, + "learning_rate": 1.9340159271899886e-05, + "epoch": 2.94 + }, + { + "loss": 7.6924, + "grad_norm": 1.2109103202819824, + "learning_rate": 1.846503894285464e-05, + "epoch": 2.95 + }, + { + "loss": 7.531, + "grad_norm": 1.2161798477172852, + "learning_rate": 1.75899186138094e-05, + "epoch": 2.95 + }, + { + "loss": 7.1992, + "grad_norm": 1.347778081893921, + "learning_rate": 1.6714798284764158e-05, + "epoch": 2.95 + }, + { + "loss": 7.7785, + "grad_norm": 1.2869161367416382, + "learning_rate": 1.583967795571891e-05, + "epoch": 2.95 + }, + { + "loss": 7.6703, + "grad_norm": 1.1452679634094238, + "learning_rate": 1.4964557626673668e-05, + "epoch": 2.96 + }, + { + "loss": 7.3311, + "grad_norm": 1.7757437229156494, + "learning_rate": 1.4089437297628423e-05, + "epoch": 2.96 + }, + { + "loss": 7.4272, + "grad_norm": 1.2730258703231812, + "learning_rate": 1.3214316968583182e-05, + "epoch": 2.96 + }, + { + "loss": 6.8195, + "grad_norm": 1.0826276540756226, + "learning_rate": 1.2339196639537937e-05, + "epoch": 2.96 + }, + { + "loss": 7.1219, + "grad_norm": 1.3847414255142212, + "learning_rate": 1.1464076310492692e-05, + "epoch": 2.97 + }, + { + "loss": 7.5912, + "grad_norm": 1.4612926244735718, + "learning_rate": 1.0588955981447449e-05, + "epoch": 2.97 + }, + { + "loss": 6.9373, + "grad_norm": 1.5692036151885986, + "learning_rate": 9.713835652402205e-06, + "epoch": 2.97 + }, + { + "loss": 7.7104, + "grad_norm": 1.4740134477615356, + "learning_rate": 8.838715323356962e-06, + "epoch": 2.97 + }, + { + "loss": 7.1918, + "grad_norm": 1.026573657989502, + "learning_rate": 7.963594994311717e-06, + "epoch": 2.98 + }, + { + "loss": 6.8717, + "grad_norm": 1.1959487199783325, + "learning_rate": 7.088474665266474e-06, + "epoch": 2.98 + }, + { + "loss": 7.4154, + "grad_norm": 1.1354584693908691, + "learning_rate": 6.213354336221231e-06, + "epoch": 2.98 + }, + { + "loss": 7.1622, + "grad_norm": 1.3372441530227661, + "learning_rate": 5.338234007175987e-06, + "epoch": 2.98 + }, + { + "loss": 6.9564, + "grad_norm": 1.1713366508483887, + "learning_rate": 4.463113678130743e-06, + "epoch": 2.99 + }, + { + "loss": 7.462, + "grad_norm": 1.8238294124603271, + "learning_rate": 3.587993349085499e-06, + "epoch": 2.99 + }, + { + "loss": 7.5493, + "grad_norm": 1.3313993215560913, + "learning_rate": 2.7128730200402555e-06, + "epoch": 2.99 + }, + { + "loss": 7.2399, + "grad_norm": 1.1780248880386353, + "learning_rate": 1.8377526909950118e-06, + "epoch": 3.0 + }, + { + "loss": 6.879, + "grad_norm": 1.2703826427459717, + "learning_rate": 9.626323619497682e-07, + "epoch": 3.0 + }, + { + "train_runtime": 104781.7564, + "train_samples_per_second": 3.49, + "train_steps_per_second": 0.109, + "train_loss": 8.437174775609405, + "epoch": 3.0 + } +] \ No newline at end of file