diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.5503329847542517, + "epoch": 0.6003632560955474, "eval_steps": 500, - "global_step": 9999, + "global_step": 10908, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -70000,6 +70000,6369 @@ "learning_rate": 8.269088862876066e-06, "loss": 0.6735, "step": 9999 + }, + { + "epoch": 0.5503880235566074, + "grad_norm": 0.8945016264915466, + "learning_rate": 8.268760867188586e-06, + "loss": 0.7575, + "step": 10000 + }, + { + "epoch": 0.5504430623589631, + "grad_norm": 0.7708195447921753, + "learning_rate": 8.268432846933974e-06, + "loss": 0.6988, + "step": 10001 + }, + { + "epoch": 0.5504981011613187, + "grad_norm": 0.7884799838066101, + "learning_rate": 8.268104802114696e-06, + "loss": 0.8085, + "step": 10002 + }, + { + "epoch": 0.5505531399636744, + "grad_norm": 0.7801569104194641, + "learning_rate": 8.267776732733217e-06, + "loss": 0.886, + "step": 10003 + }, + { + "epoch": 0.5506081787660301, + "grad_norm": 0.714645504951477, + "learning_rate": 8.267448638792004e-06, + "loss": 0.7151, + "step": 10004 + }, + { + "epoch": 0.5506632175683858, + "grad_norm": 0.653136134147644, + "learning_rate": 8.267120520293519e-06, + "loss": 0.6347, + "step": 10005 + }, + { + "epoch": 0.5507182563707413, + "grad_norm": 0.8821585774421692, + "learning_rate": 8.266792377240233e-06, + "loss": 0.6457, + "step": 10006 + }, + { + "epoch": 0.550773295173097, + "grad_norm": 0.7056930661201477, + "learning_rate": 8.266464209634608e-06, + "loss": 0.8709, + "step": 10007 + }, + { + "epoch": 0.5508283339754527, + "grad_norm": 0.6505821347236633, + "learning_rate": 8.266136017479113e-06, + "loss": 0.7674, + "step": 10008 + }, + { + "epoch": 0.5508833727778083, + "grad_norm": 0.7947389483451843, + "learning_rate": 8.265807800776216e-06, + "loss": 0.7882, + "step": 10009 + }, + { + "epoch": 0.550938411580164, + "grad_norm": 0.7466071844100952, + "learning_rate": 8.265479559528379e-06, + "loss": 0.7673, + "step": 10010 + }, + { + "epoch": 0.5509934503825197, + "grad_norm": 0.706430971622467, + "learning_rate": 8.265151293738074e-06, + "loss": 0.7796, + "step": 10011 + }, + { + "epoch": 0.5510484891848754, + "grad_norm": 0.7701015472412109, + "learning_rate": 8.264823003407765e-06, + "loss": 0.7631, + "step": 10012 + }, + { + "epoch": 0.551103527987231, + "grad_norm": 0.6923625469207764, + "learning_rate": 8.264494688539922e-06, + "loss": 0.7659, + "step": 10013 + }, + { + "epoch": 0.5511585667895866, + "grad_norm": 0.6585322618484497, + "learning_rate": 8.264166349137008e-06, + "loss": 0.7248, + "step": 10014 + }, + { + "epoch": 0.5512136055919423, + "grad_norm": 0.698451578617096, + "learning_rate": 8.263837985201493e-06, + "loss": 0.7768, + "step": 10015 + }, + { + "epoch": 0.551268644394298, + "grad_norm": 0.7585058808326721, + "learning_rate": 8.263509596735847e-06, + "loss": 0.8535, + "step": 10016 + }, + { + "epoch": 0.5513236831966536, + "grad_norm": 0.6973930597305298, + "learning_rate": 8.263181183742536e-06, + "loss": 0.8253, + "step": 10017 + }, + { + "epoch": 0.5513787219990093, + "grad_norm": 0.6752467751502991, + "learning_rate": 8.26285274622403e-06, + "loss": 0.7402, + "step": 10018 + }, + { + "epoch": 0.551433760801365, + "grad_norm": 0.717555820941925, + "learning_rate": 8.262524284182794e-06, + "loss": 0.8057, + "step": 10019 + }, + { + "epoch": 0.5514887996037207, + "grad_norm": 0.6975438594818115, + "learning_rate": 8.2621957976213e-06, + "loss": 0.803, + "step": 10020 + }, + { + "epoch": 0.5515438384060762, + "grad_norm": 0.667797327041626, + "learning_rate": 8.261867286542016e-06, + "loss": 0.7387, + "step": 10021 + }, + { + "epoch": 0.5515988772084319, + "grad_norm": 0.7330532670021057, + "learning_rate": 8.261538750947411e-06, + "loss": 0.8143, + "step": 10022 + }, + { + "epoch": 0.5516539160107876, + "grad_norm": 0.7034017443656921, + "learning_rate": 8.261210190839952e-06, + "loss": 0.739, + "step": 10023 + }, + { + "epoch": 0.5517089548131433, + "grad_norm": 0.709284245967865, + "learning_rate": 8.260881606222113e-06, + "loss": 0.8021, + "step": 10024 + }, + { + "epoch": 0.5517639936154989, + "grad_norm": 0.7587909698486328, + "learning_rate": 8.260552997096359e-06, + "loss": 0.8346, + "step": 10025 + }, + { + "epoch": 0.5518190324178546, + "grad_norm": 0.7413986325263977, + "learning_rate": 8.26022436346516e-06, + "loss": 0.6777, + "step": 10026 + }, + { + "epoch": 0.5518740712202103, + "grad_norm": 0.7112768292427063, + "learning_rate": 8.25989570533099e-06, + "loss": 0.7017, + "step": 10027 + }, + { + "epoch": 0.551929110022566, + "grad_norm": 0.7097088098526001, + "learning_rate": 8.259567022696315e-06, + "loss": 0.7315, + "step": 10028 + }, + { + "epoch": 0.5519841488249215, + "grad_norm": 0.6544226408004761, + "learning_rate": 8.259238315563606e-06, + "loss": 0.7729, + "step": 10029 + }, + { + "epoch": 0.5520391876272772, + "grad_norm": 0.6892885565757751, + "learning_rate": 8.258909583935335e-06, + "loss": 0.7919, + "step": 10030 + }, + { + "epoch": 0.5520942264296329, + "grad_norm": 0.697424054145813, + "learning_rate": 8.258580827813972e-06, + "loss": 0.7514, + "step": 10031 + }, + { + "epoch": 0.5521492652319886, + "grad_norm": 0.7021437883377075, + "learning_rate": 8.258252047201989e-06, + "loss": 0.747, + "step": 10032 + }, + { + "epoch": 0.5522043040343442, + "grad_norm": 0.6974816918373108, + "learning_rate": 8.257923242101854e-06, + "loss": 0.7245, + "step": 10033 + }, + { + "epoch": 0.5522593428366999, + "grad_norm": 0.6645311117172241, + "learning_rate": 8.25759441251604e-06, + "loss": 0.649, + "step": 10034 + }, + { + "epoch": 0.5523143816390556, + "grad_norm": 0.7223736643791199, + "learning_rate": 8.25726555844702e-06, + "loss": 0.7792, + "step": 10035 + }, + { + "epoch": 0.5523694204414112, + "grad_norm": 0.7253531813621521, + "learning_rate": 8.256936679897262e-06, + "loss": 0.7636, + "step": 10036 + }, + { + "epoch": 0.5524244592437668, + "grad_norm": 0.6979514956474304, + "learning_rate": 8.256607776869241e-06, + "loss": 0.7929, + "step": 10037 + }, + { + "epoch": 0.5524794980461225, + "grad_norm": 0.7442019581794739, + "learning_rate": 8.25627884936543e-06, + "loss": 0.6984, + "step": 10038 + }, + { + "epoch": 0.5525345368484782, + "grad_norm": 0.7519513964653015, + "learning_rate": 8.255949897388294e-06, + "loss": 0.7228, + "step": 10039 + }, + { + "epoch": 0.5525895756508339, + "grad_norm": 0.7302790880203247, + "learning_rate": 8.255620920940313e-06, + "loss": 0.7555, + "step": 10040 + }, + { + "epoch": 0.5526446144531895, + "grad_norm": 0.6521434187889099, + "learning_rate": 8.255291920023956e-06, + "loss": 0.7825, + "step": 10041 + }, + { + "epoch": 0.5526996532555452, + "grad_norm": 0.8270126581192017, + "learning_rate": 8.254962894641695e-06, + "loss": 0.7939, + "step": 10042 + }, + { + "epoch": 0.5527546920579008, + "grad_norm": 0.7209310531616211, + "learning_rate": 8.254633844796007e-06, + "loss": 0.8286, + "step": 10043 + }, + { + "epoch": 0.5528097308602565, + "grad_norm": 0.6506814360618591, + "learning_rate": 8.25430477048936e-06, + "loss": 0.7209, + "step": 10044 + }, + { + "epoch": 0.5528647696626121, + "grad_norm": 0.6914637684822083, + "learning_rate": 8.25397567172423e-06, + "loss": 0.705, + "step": 10045 + }, + { + "epoch": 0.5529198084649678, + "grad_norm": 0.8369725942611694, + "learning_rate": 8.253646548503091e-06, + "loss": 0.8254, + "step": 10046 + }, + { + "epoch": 0.5529748472673235, + "grad_norm": 0.7809324860572815, + "learning_rate": 8.253317400828414e-06, + "loss": 0.8117, + "step": 10047 + }, + { + "epoch": 0.5530298860696792, + "grad_norm": 0.7184550762176514, + "learning_rate": 8.252988228702676e-06, + "loss": 0.738, + "step": 10048 + }, + { + "epoch": 0.5530849248720348, + "grad_norm": 0.7111478447914124, + "learning_rate": 8.252659032128347e-06, + "loss": 0.7143, + "step": 10049 + }, + { + "epoch": 0.5531399636743904, + "grad_norm": 0.7506794333457947, + "learning_rate": 8.252329811107905e-06, + "loss": 0.7721, + "step": 10050 + }, + { + "epoch": 0.5531950024767461, + "grad_norm": 0.7700625658035278, + "learning_rate": 8.252000565643823e-06, + "loss": 0.7993, + "step": 10051 + }, + { + "epoch": 0.5532500412791017, + "grad_norm": 0.6985816955566406, + "learning_rate": 8.251671295738575e-06, + "loss": 0.7461, + "step": 10052 + }, + { + "epoch": 0.5533050800814574, + "grad_norm": 0.6932175755500793, + "learning_rate": 8.251342001394635e-06, + "loss": 0.6804, + "step": 10053 + }, + { + "epoch": 0.5533601188838131, + "grad_norm": 0.8060765266418457, + "learning_rate": 8.25101268261448e-06, + "loss": 0.7137, + "step": 10054 + }, + { + "epoch": 0.5534151576861688, + "grad_norm": 0.6853482127189636, + "learning_rate": 8.250683339400582e-06, + "loss": 0.7229, + "step": 10055 + }, + { + "epoch": 0.5534701964885244, + "grad_norm": 0.7581862211227417, + "learning_rate": 8.25035397175542e-06, + "loss": 0.8091, + "step": 10056 + }, + { + "epoch": 0.55352523529088, + "grad_norm": 0.7375245094299316, + "learning_rate": 8.250024579681466e-06, + "loss": 0.7234, + "step": 10057 + }, + { + "epoch": 0.5535802740932357, + "grad_norm": 0.7904585599899292, + "learning_rate": 8.249695163181198e-06, + "loss": 0.7295, + "step": 10058 + }, + { + "epoch": 0.5536353128955914, + "grad_norm": 0.6593602895736694, + "learning_rate": 8.249365722257092e-06, + "loss": 0.7492, + "step": 10059 + }, + { + "epoch": 0.553690351697947, + "grad_norm": 0.7226922512054443, + "learning_rate": 8.249036256911622e-06, + "loss": 0.8177, + "step": 10060 + }, + { + "epoch": 0.5537453905003027, + "grad_norm": 0.7268722653388977, + "learning_rate": 8.248706767147265e-06, + "loss": 0.8059, + "step": 10061 + }, + { + "epoch": 0.5538004293026584, + "grad_norm": 0.7797269225120544, + "learning_rate": 8.248377252966499e-06, + "loss": 0.8122, + "step": 10062 + }, + { + "epoch": 0.5538554681050141, + "grad_norm": 0.7199145555496216, + "learning_rate": 8.248047714371797e-06, + "loss": 0.7312, + "step": 10063 + }, + { + "epoch": 0.5539105069073696, + "grad_norm": 0.6950703263282776, + "learning_rate": 8.24771815136564e-06, + "loss": 0.757, + "step": 10064 + }, + { + "epoch": 0.5539655457097253, + "grad_norm": 0.6413441896438599, + "learning_rate": 8.247388563950502e-06, + "loss": 0.6955, + "step": 10065 + }, + { + "epoch": 0.554020584512081, + "grad_norm": 0.7650758624076843, + "learning_rate": 8.24705895212886e-06, + "loss": 0.8355, + "step": 10066 + }, + { + "epoch": 0.5540756233144367, + "grad_norm": 0.7067090272903442, + "learning_rate": 8.246729315903192e-06, + "loss": 0.7409, + "step": 10067 + }, + { + "epoch": 0.5541306621167923, + "grad_norm": 0.7763532996177673, + "learning_rate": 8.246399655275976e-06, + "loss": 0.8097, + "step": 10068 + }, + { + "epoch": 0.554185700919148, + "grad_norm": 0.6865057945251465, + "learning_rate": 8.246069970249689e-06, + "loss": 0.7597, + "step": 10069 + }, + { + "epoch": 0.5542407397215037, + "grad_norm": 0.7643107771873474, + "learning_rate": 8.24574026082681e-06, + "loss": 0.7403, + "step": 10070 + }, + { + "epoch": 0.5542957785238594, + "grad_norm": 0.7354087829589844, + "learning_rate": 8.245410527009815e-06, + "loss": 0.8896, + "step": 10071 + }, + { + "epoch": 0.5543508173262149, + "grad_norm": 0.7971135973930359, + "learning_rate": 8.245080768801183e-06, + "loss": 0.7738, + "step": 10072 + }, + { + "epoch": 0.5544058561285706, + "grad_norm": 1.0506731271743774, + "learning_rate": 8.244750986203394e-06, + "loss": 0.7888, + "step": 10073 + }, + { + "epoch": 0.5544608949309263, + "grad_norm": 0.8305885195732117, + "learning_rate": 8.244421179218925e-06, + "loss": 0.8186, + "step": 10074 + }, + { + "epoch": 0.554515933733282, + "grad_norm": 0.9507874250411987, + "learning_rate": 8.244091347850253e-06, + "loss": 0.7975, + "step": 10075 + }, + { + "epoch": 0.5545709725356376, + "grad_norm": 0.7146797776222229, + "learning_rate": 8.243761492099861e-06, + "loss": 0.6895, + "step": 10076 + }, + { + "epoch": 0.5546260113379933, + "grad_norm": 0.734990656375885, + "learning_rate": 8.243431611970225e-06, + "loss": 0.8087, + "step": 10077 + }, + { + "epoch": 0.554681050140349, + "grad_norm": 0.6807795166969299, + "learning_rate": 8.243101707463825e-06, + "loss": 0.7861, + "step": 10078 + }, + { + "epoch": 0.5547360889427047, + "grad_norm": 0.7412874698638916, + "learning_rate": 8.242771778583142e-06, + "loss": 0.7864, + "step": 10079 + }, + { + "epoch": 0.5547911277450602, + "grad_norm": 0.6655074954032898, + "learning_rate": 8.242441825330652e-06, + "loss": 0.6554, + "step": 10080 + }, + { + "epoch": 0.5548461665474159, + "grad_norm": 0.7549700140953064, + "learning_rate": 8.242111847708838e-06, + "loss": 0.8031, + "step": 10081 + }, + { + "epoch": 0.5549012053497716, + "grad_norm": 0.8907766342163086, + "learning_rate": 8.241781845720181e-06, + "loss": 0.8068, + "step": 10082 + }, + { + "epoch": 0.5549562441521273, + "grad_norm": 0.7347774505615234, + "learning_rate": 8.241451819367157e-06, + "loss": 0.7453, + "step": 10083 + }, + { + "epoch": 0.5550112829544829, + "grad_norm": 0.6856632828712463, + "learning_rate": 8.24112176865225e-06, + "loss": 0.6235, + "step": 10084 + }, + { + "epoch": 0.5550663217568386, + "grad_norm": 0.7134507298469543, + "learning_rate": 8.24079169357794e-06, + "loss": 0.7991, + "step": 10085 + }, + { + "epoch": 0.5551213605591943, + "grad_norm": 0.7814854383468628, + "learning_rate": 8.240461594146704e-06, + "loss": 0.7681, + "step": 10086 + }, + { + "epoch": 0.5551763993615499, + "grad_norm": 0.6893261671066284, + "learning_rate": 8.240131470361028e-06, + "loss": 0.7746, + "step": 10087 + }, + { + "epoch": 0.5552314381639055, + "grad_norm": 0.925003170967102, + "learning_rate": 8.239801322223393e-06, + "loss": 0.7621, + "step": 10088 + }, + { + "epoch": 0.5552864769662612, + "grad_norm": 0.6261017918586731, + "learning_rate": 8.239471149736277e-06, + "loss": 0.7673, + "step": 10089 + }, + { + "epoch": 0.5553415157686169, + "grad_norm": 0.7268226146697998, + "learning_rate": 8.239140952902162e-06, + "loss": 0.7375, + "step": 10090 + }, + { + "epoch": 0.5553965545709726, + "grad_norm": 0.8062194585800171, + "learning_rate": 8.238810731723532e-06, + "loss": 0.8002, + "step": 10091 + }, + { + "epoch": 0.5554515933733282, + "grad_norm": 0.892842173576355, + "learning_rate": 8.238480486202867e-06, + "loss": 0.7959, + "step": 10092 + }, + { + "epoch": 0.5555066321756839, + "grad_norm": 0.7530377507209778, + "learning_rate": 8.23815021634265e-06, + "loss": 0.8137, + "step": 10093 + }, + { + "epoch": 0.5555616709780395, + "grad_norm": 0.6994850635528564, + "learning_rate": 8.237819922145364e-06, + "loss": 0.7966, + "step": 10094 + }, + { + "epoch": 0.5556167097803951, + "grad_norm": 0.8502941727638245, + "learning_rate": 8.237489603613488e-06, + "loss": 0.7668, + "step": 10095 + }, + { + "epoch": 0.5556717485827508, + "grad_norm": 0.6583576798439026, + "learning_rate": 8.237159260749507e-06, + "loss": 0.7379, + "step": 10096 + }, + { + "epoch": 0.5557267873851065, + "grad_norm": 0.9539539217948914, + "learning_rate": 8.236828893555904e-06, + "loss": 0.7563, + "step": 10097 + }, + { + "epoch": 0.5557818261874622, + "grad_norm": 0.7446413040161133, + "learning_rate": 8.236498502035162e-06, + "loss": 0.7329, + "step": 10098 + }, + { + "epoch": 0.5558368649898178, + "grad_norm": 0.8950835466384888, + "learning_rate": 8.236168086189761e-06, + "loss": 0.8144, + "step": 10099 + }, + { + "epoch": 0.5558919037921735, + "grad_norm": 0.7255009412765503, + "learning_rate": 8.235837646022191e-06, + "loss": 0.6946, + "step": 10100 + }, + { + "epoch": 0.5559469425945291, + "grad_norm": 0.6983402967453003, + "learning_rate": 8.235507181534929e-06, + "loss": 0.7371, + "step": 10101 + }, + { + "epoch": 0.5560019813968848, + "grad_norm": 1.043593168258667, + "learning_rate": 8.235176692730463e-06, + "loss": 0.6763, + "step": 10102 + }, + { + "epoch": 0.5560570201992404, + "grad_norm": 0.7452800869941711, + "learning_rate": 8.234846179611272e-06, + "loss": 0.8945, + "step": 10103 + }, + { + "epoch": 0.5561120590015961, + "grad_norm": 0.6367164254188538, + "learning_rate": 8.234515642179845e-06, + "loss": 0.6542, + "step": 10104 + }, + { + "epoch": 0.5561670978039518, + "grad_norm": 0.8377598524093628, + "learning_rate": 8.234185080438664e-06, + "loss": 0.787, + "step": 10105 + }, + { + "epoch": 0.5562221366063075, + "grad_norm": 0.7353680729866028, + "learning_rate": 8.233854494390214e-06, + "loss": 0.6391, + "step": 10106 + }, + { + "epoch": 0.5562771754086631, + "grad_norm": 0.7431599497795105, + "learning_rate": 8.233523884036977e-06, + "loss": 0.8221, + "step": 10107 + }, + { + "epoch": 0.5563322142110187, + "grad_norm": 0.7292743921279907, + "learning_rate": 8.233193249381442e-06, + "loss": 0.7791, + "step": 10108 + }, + { + "epoch": 0.5563872530133744, + "grad_norm": 0.7251895666122437, + "learning_rate": 8.232862590426091e-06, + "loss": 0.7993, + "step": 10109 + }, + { + "epoch": 0.5564422918157301, + "grad_norm": 0.7373167276382446, + "learning_rate": 8.23253190717341e-06, + "loss": 0.861, + "step": 10110 + }, + { + "epoch": 0.5564973306180857, + "grad_norm": 0.6689401268959045, + "learning_rate": 8.232201199625887e-06, + "loss": 0.7002, + "step": 10111 + }, + { + "epoch": 0.5565523694204414, + "grad_norm": 0.7405139207839966, + "learning_rate": 8.231870467786003e-06, + "loss": 0.8041, + "step": 10112 + }, + { + "epoch": 0.5566074082227971, + "grad_norm": 0.7561736702919006, + "learning_rate": 8.231539711656246e-06, + "loss": 0.7687, + "step": 10113 + }, + { + "epoch": 0.5566624470251528, + "grad_norm": 0.6857489943504333, + "learning_rate": 8.231208931239103e-06, + "loss": 0.7175, + "step": 10114 + }, + { + "epoch": 0.5567174858275084, + "grad_norm": 0.7410408854484558, + "learning_rate": 8.230878126537057e-06, + "loss": 0.7337, + "step": 10115 + }, + { + "epoch": 0.556772524629864, + "grad_norm": 0.7533249258995056, + "learning_rate": 8.230547297552595e-06, + "loss": 0.7226, + "step": 10116 + }, + { + "epoch": 0.5568275634322197, + "grad_norm": 0.6227561235427856, + "learning_rate": 8.230216444288207e-06, + "loss": 0.711, + "step": 10117 + }, + { + "epoch": 0.5568826022345754, + "grad_norm": 0.6790871024131775, + "learning_rate": 8.229885566746373e-06, + "loss": 0.728, + "step": 10118 + }, + { + "epoch": 0.556937641036931, + "grad_norm": 1.0007857084274292, + "learning_rate": 8.229554664929587e-06, + "loss": 0.9193, + "step": 10119 + }, + { + "epoch": 0.5569926798392867, + "grad_norm": 0.7167220711708069, + "learning_rate": 8.229223738840331e-06, + "loss": 0.8288, + "step": 10120 + }, + { + "epoch": 0.5570477186416424, + "grad_norm": 0.8037107586860657, + "learning_rate": 8.228892788481095e-06, + "loss": 0.8462, + "step": 10121 + }, + { + "epoch": 0.5571027574439981, + "grad_norm": 0.7355597615242004, + "learning_rate": 8.228561813854363e-06, + "loss": 0.7998, + "step": 10122 + }, + { + "epoch": 0.5571577962463536, + "grad_norm": 0.7384124994277954, + "learning_rate": 8.228230814962625e-06, + "loss": 0.7861, + "step": 10123 + }, + { + "epoch": 0.5572128350487093, + "grad_norm": 0.8170364499092102, + "learning_rate": 8.227899791808371e-06, + "loss": 0.8005, + "step": 10124 + }, + { + "epoch": 0.557267873851065, + "grad_norm": 0.678702175617218, + "learning_rate": 8.227568744394084e-06, + "loss": 0.7408, + "step": 10125 + }, + { + "epoch": 0.5573229126534207, + "grad_norm": 0.7212443947792053, + "learning_rate": 8.227237672722255e-06, + "loss": 0.7127, + "step": 10126 + }, + { + "epoch": 0.5573779514557763, + "grad_norm": 0.7035290002822876, + "learning_rate": 8.22690657679537e-06, + "loss": 0.8263, + "step": 10127 + }, + { + "epoch": 0.557432990258132, + "grad_norm": 0.6535285115242004, + "learning_rate": 8.226575456615921e-06, + "loss": 0.6979, + "step": 10128 + }, + { + "epoch": 0.5574880290604877, + "grad_norm": 0.7353794574737549, + "learning_rate": 8.226244312186396e-06, + "loss": 0.6838, + "step": 10129 + }, + { + "epoch": 0.5575430678628434, + "grad_norm": 0.5839618444442749, + "learning_rate": 8.225913143509278e-06, + "loss": 0.5925, + "step": 10130 + }, + { + "epoch": 0.5575981066651989, + "grad_norm": 0.6922228336334229, + "learning_rate": 8.225581950587063e-06, + "loss": 0.6808, + "step": 10131 + }, + { + "epoch": 0.5576531454675546, + "grad_norm": 0.753989040851593, + "learning_rate": 8.225250733422236e-06, + "loss": 0.6567, + "step": 10132 + }, + { + "epoch": 0.5577081842699103, + "grad_norm": 0.7327600717544556, + "learning_rate": 8.22491949201729e-06, + "loss": 0.8311, + "step": 10133 + }, + { + "epoch": 0.557763223072266, + "grad_norm": 0.6435133218765259, + "learning_rate": 8.224588226374712e-06, + "loss": 0.6684, + "step": 10134 + }, + { + "epoch": 0.5578182618746216, + "grad_norm": 0.6402057409286499, + "learning_rate": 8.22425693649699e-06, + "loss": 0.7569, + "step": 10135 + }, + { + "epoch": 0.5578733006769773, + "grad_norm": 0.7454472780227661, + "learning_rate": 8.223925622386617e-06, + "loss": 0.7908, + "step": 10136 + }, + { + "epoch": 0.557928339479333, + "grad_norm": 0.7373154759407043, + "learning_rate": 8.223594284046084e-06, + "loss": 0.8232, + "step": 10137 + }, + { + "epoch": 0.5579833782816885, + "grad_norm": 0.6478374004364014, + "learning_rate": 8.223262921477878e-06, + "loss": 0.7353, + "step": 10138 + }, + { + "epoch": 0.5580384170840442, + "grad_norm": 0.715212881565094, + "learning_rate": 8.222931534684488e-06, + "loss": 0.729, + "step": 10139 + }, + { + "epoch": 0.5580934558863999, + "grad_norm": 0.9226915240287781, + "learning_rate": 8.22260012366841e-06, + "loss": 0.7846, + "step": 10140 + }, + { + "epoch": 0.5581484946887556, + "grad_norm": 0.6481993198394775, + "learning_rate": 8.222268688432132e-06, + "loss": 0.6955, + "step": 10141 + }, + { + "epoch": 0.5582035334911112, + "grad_norm": 0.7240349054336548, + "learning_rate": 8.221937228978145e-06, + "loss": 0.7956, + "step": 10142 + }, + { + "epoch": 0.5582585722934669, + "grad_norm": 0.7089122533798218, + "learning_rate": 8.221605745308939e-06, + "loss": 0.7481, + "step": 10143 + }, + { + "epoch": 0.5583136110958226, + "grad_norm": 0.7292537093162537, + "learning_rate": 8.221274237427009e-06, + "loss": 0.7797, + "step": 10144 + }, + { + "epoch": 0.5583686498981782, + "grad_norm": 0.7104652523994446, + "learning_rate": 8.220942705334841e-06, + "loss": 0.7966, + "step": 10145 + }, + { + "epoch": 0.5584236887005338, + "grad_norm": 0.7656546831130981, + "learning_rate": 8.220611149034931e-06, + "loss": 0.7541, + "step": 10146 + }, + { + "epoch": 0.5584787275028895, + "grad_norm": 0.7618892788887024, + "learning_rate": 8.22027956852977e-06, + "loss": 0.6994, + "step": 10147 + }, + { + "epoch": 0.5585337663052452, + "grad_norm": 0.6445756554603577, + "learning_rate": 8.219947963821851e-06, + "loss": 0.7303, + "step": 10148 + }, + { + "epoch": 0.5585888051076009, + "grad_norm": 0.6529820561408997, + "learning_rate": 8.219616334913663e-06, + "loss": 0.7008, + "step": 10149 + }, + { + "epoch": 0.5586438439099565, + "grad_norm": 0.6890642046928406, + "learning_rate": 8.219284681807703e-06, + "loss": 0.8124, + "step": 10150 + }, + { + "epoch": 0.5586988827123122, + "grad_norm": 0.7273370027542114, + "learning_rate": 8.218953004506458e-06, + "loss": 0.7507, + "step": 10151 + }, + { + "epoch": 0.5587539215146679, + "grad_norm": 0.7239277362823486, + "learning_rate": 8.218621303012425e-06, + "loss": 0.7929, + "step": 10152 + }, + { + "epoch": 0.5588089603170235, + "grad_norm": 0.660275399684906, + "learning_rate": 8.218289577328096e-06, + "loss": 0.7418, + "step": 10153 + }, + { + "epoch": 0.5588639991193791, + "grad_norm": 0.7406648993492126, + "learning_rate": 8.217957827455965e-06, + "loss": 0.8072, + "step": 10154 + }, + { + "epoch": 0.5589190379217348, + "grad_norm": 0.7051703333854675, + "learning_rate": 8.217626053398522e-06, + "loss": 0.6562, + "step": 10155 + }, + { + "epoch": 0.5589740767240905, + "grad_norm": 0.93423992395401, + "learning_rate": 8.217294255158266e-06, + "loss": 0.738, + "step": 10156 + }, + { + "epoch": 0.5590291155264462, + "grad_norm": 0.8362720608711243, + "learning_rate": 8.216962432737685e-06, + "loss": 0.8585, + "step": 10157 + }, + { + "epoch": 0.5590841543288018, + "grad_norm": 0.9195587038993835, + "learning_rate": 8.216630586139277e-06, + "loss": 0.8778, + "step": 10158 + }, + { + "epoch": 0.5591391931311575, + "grad_norm": 0.7181550860404968, + "learning_rate": 8.216298715365534e-06, + "loss": 0.702, + "step": 10159 + }, + { + "epoch": 0.5591942319335131, + "grad_norm": 0.6900259852409363, + "learning_rate": 8.21596682041895e-06, + "loss": 0.7652, + "step": 10160 + }, + { + "epoch": 0.5592492707358688, + "grad_norm": 0.7523833513259888, + "learning_rate": 8.215634901302022e-06, + "loss": 0.7881, + "step": 10161 + }, + { + "epoch": 0.5593043095382244, + "grad_norm": 0.6659645438194275, + "learning_rate": 8.215302958017241e-06, + "loss": 0.694, + "step": 10162 + }, + { + "epoch": 0.5593593483405801, + "grad_norm": 0.8898606300354004, + "learning_rate": 8.214970990567105e-06, + "loss": 0.8534, + "step": 10163 + }, + { + "epoch": 0.5594143871429358, + "grad_norm": 0.6759241819381714, + "learning_rate": 8.214638998954108e-06, + "loss": 0.8241, + "step": 10164 + }, + { + "epoch": 0.5594694259452915, + "grad_norm": 0.7136911749839783, + "learning_rate": 8.214306983180744e-06, + "loss": 0.7846, + "step": 10165 + }, + { + "epoch": 0.559524464747647, + "grad_norm": 0.6781616806983948, + "learning_rate": 8.213974943249509e-06, + "loss": 0.7116, + "step": 10166 + }, + { + "epoch": 0.5595795035500027, + "grad_norm": 0.7134156227111816, + "learning_rate": 8.213642879162898e-06, + "loss": 0.7537, + "step": 10167 + }, + { + "epoch": 0.5596345423523584, + "grad_norm": 1.306710124015808, + "learning_rate": 8.213310790923408e-06, + "loss": 0.8506, + "step": 10168 + }, + { + "epoch": 0.5596895811547141, + "grad_norm": 0.725304901599884, + "learning_rate": 8.212978678533534e-06, + "loss": 0.8115, + "step": 10169 + }, + { + "epoch": 0.5597446199570697, + "grad_norm": 0.7833520174026489, + "learning_rate": 8.212646541995772e-06, + "loss": 0.919, + "step": 10170 + }, + { + "epoch": 0.5597996587594254, + "grad_norm": 0.6938104033470154, + "learning_rate": 8.212314381312621e-06, + "loss": 0.7303, + "step": 10171 + }, + { + "epoch": 0.5598546975617811, + "grad_norm": 0.6860232949256897, + "learning_rate": 8.211982196486573e-06, + "loss": 0.7709, + "step": 10172 + }, + { + "epoch": 0.5599097363641368, + "grad_norm": 0.6611567139625549, + "learning_rate": 8.211649987520126e-06, + "loss": 0.7711, + "step": 10173 + }, + { + "epoch": 0.5599647751664923, + "grad_norm": 0.8603463172912598, + "learning_rate": 8.211317754415778e-06, + "loss": 0.8527, + "step": 10174 + }, + { + "epoch": 0.560019813968848, + "grad_norm": 0.7350558638572693, + "learning_rate": 8.210985497176025e-06, + "loss": 0.8148, + "step": 10175 + }, + { + "epoch": 0.5600748527712037, + "grad_norm": 0.6881470084190369, + "learning_rate": 8.210653215803365e-06, + "loss": 0.7526, + "step": 10176 + }, + { + "epoch": 0.5601298915735594, + "grad_norm": 0.6879626512527466, + "learning_rate": 8.210320910300296e-06, + "loss": 0.7649, + "step": 10177 + }, + { + "epoch": 0.560184930375915, + "grad_norm": 0.6843587160110474, + "learning_rate": 8.209988580669312e-06, + "loss": 0.8131, + "step": 10178 + }, + { + "epoch": 0.5602399691782707, + "grad_norm": 0.6684302687644958, + "learning_rate": 8.209656226912915e-06, + "loss": 0.7256, + "step": 10179 + }, + { + "epoch": 0.5602950079806264, + "grad_norm": 0.7973861694335938, + "learning_rate": 8.209323849033601e-06, + "loss": 0.7924, + "step": 10180 + }, + { + "epoch": 0.560350046782982, + "grad_norm": 0.6850616931915283, + "learning_rate": 8.208991447033867e-06, + "loss": 0.7423, + "step": 10181 + }, + { + "epoch": 0.5604050855853376, + "grad_norm": 0.8284440636634827, + "learning_rate": 8.208659020916213e-06, + "loss": 0.7637, + "step": 10182 + }, + { + "epoch": 0.5604601243876933, + "grad_norm": 0.7671821713447571, + "learning_rate": 8.208326570683136e-06, + "loss": 0.7688, + "step": 10183 + }, + { + "epoch": 0.560515163190049, + "grad_norm": 0.8359144330024719, + "learning_rate": 8.207994096337135e-06, + "loss": 0.8179, + "step": 10184 + }, + { + "epoch": 0.5605702019924046, + "grad_norm": 0.6389699578285217, + "learning_rate": 8.207661597880709e-06, + "loss": 0.6987, + "step": 10185 + }, + { + "epoch": 0.5606252407947603, + "grad_norm": 0.6472755074501038, + "learning_rate": 8.20732907531636e-06, + "loss": 0.6984, + "step": 10186 + }, + { + "epoch": 0.560680279597116, + "grad_norm": 0.8231903314590454, + "learning_rate": 8.20699652864658e-06, + "loss": 0.8212, + "step": 10187 + }, + { + "epoch": 0.5607353183994717, + "grad_norm": 0.7550386190414429, + "learning_rate": 8.206663957873876e-06, + "loss": 0.7446, + "step": 10188 + }, + { + "epoch": 0.5607903572018272, + "grad_norm": 0.6704659461975098, + "learning_rate": 8.206331363000743e-06, + "loss": 0.7035, + "step": 10189 + }, + { + "epoch": 0.5608453960041829, + "grad_norm": 0.7258654236793518, + "learning_rate": 8.20599874402968e-06, + "loss": 0.7032, + "step": 10190 + }, + { + "epoch": 0.5609004348065386, + "grad_norm": 0.674609363079071, + "learning_rate": 8.20566610096319e-06, + "loss": 0.7545, + "step": 10191 + }, + { + "epoch": 0.5609554736088943, + "grad_norm": 0.6978347301483154, + "learning_rate": 8.205333433803773e-06, + "loss": 0.8198, + "step": 10192 + }, + { + "epoch": 0.5610105124112499, + "grad_norm": 0.6252121329307556, + "learning_rate": 8.205000742553925e-06, + "loss": 0.6639, + "step": 10193 + }, + { + "epoch": 0.5610655512136056, + "grad_norm": 0.7288224101066589, + "learning_rate": 8.204668027216152e-06, + "loss": 0.8035, + "step": 10194 + }, + { + "epoch": 0.5611205900159613, + "grad_norm": 0.6591556072235107, + "learning_rate": 8.20433528779295e-06, + "loss": 0.7552, + "step": 10195 + }, + { + "epoch": 0.561175628818317, + "grad_norm": 0.769827127456665, + "learning_rate": 8.204002524286823e-06, + "loss": 0.7279, + "step": 10196 + }, + { + "epoch": 0.5612306676206725, + "grad_norm": 0.74398273229599, + "learning_rate": 8.203669736700271e-06, + "loss": 0.7638, + "step": 10197 + }, + { + "epoch": 0.5612857064230282, + "grad_norm": 0.9343454241752625, + "learning_rate": 8.203336925035795e-06, + "loss": 0.7513, + "step": 10198 + }, + { + "epoch": 0.5613407452253839, + "grad_norm": 0.6667190194129944, + "learning_rate": 8.203004089295894e-06, + "loss": 0.77, + "step": 10199 + }, + { + "epoch": 0.5613957840277396, + "grad_norm": 0.7684557437896729, + "learning_rate": 8.202671229483073e-06, + "loss": 0.803, + "step": 10200 + }, + { + "epoch": 0.5614508228300952, + "grad_norm": 0.6551374793052673, + "learning_rate": 8.202338345599832e-06, + "loss": 0.6914, + "step": 10201 + }, + { + "epoch": 0.5615058616324509, + "grad_norm": 0.717464029788971, + "learning_rate": 8.202005437648674e-06, + "loss": 0.6797, + "step": 10202 + }, + { + "epoch": 0.5615609004348066, + "grad_norm": 0.7053301334381104, + "learning_rate": 8.2016725056321e-06, + "loss": 0.7857, + "step": 10203 + }, + { + "epoch": 0.5616159392371622, + "grad_norm": 0.8392077684402466, + "learning_rate": 8.20133954955261e-06, + "loss": 0.8321, + "step": 10204 + }, + { + "epoch": 0.5616709780395178, + "grad_norm": 0.6630520820617676, + "learning_rate": 8.201006569412711e-06, + "loss": 0.7093, + "step": 10205 + }, + { + "epoch": 0.5617260168418735, + "grad_norm": 0.6835867762565613, + "learning_rate": 8.200673565214905e-06, + "loss": 0.6623, + "step": 10206 + }, + { + "epoch": 0.5617810556442292, + "grad_norm": 0.7635336518287659, + "learning_rate": 8.200340536961691e-06, + "loss": 0.8378, + "step": 10207 + }, + { + "epoch": 0.5618360944465849, + "grad_norm": 0.6500052213668823, + "learning_rate": 8.200007484655575e-06, + "loss": 0.6836, + "step": 10208 + }, + { + "epoch": 0.5618911332489405, + "grad_norm": 0.6549860835075378, + "learning_rate": 8.199674408299058e-06, + "loss": 0.6868, + "step": 10209 + }, + { + "epoch": 0.5619461720512962, + "grad_norm": 0.7995957732200623, + "learning_rate": 8.199341307894647e-06, + "loss": 0.7719, + "step": 10210 + }, + { + "epoch": 0.5620012108536518, + "grad_norm": 0.6869412064552307, + "learning_rate": 8.199008183444843e-06, + "loss": 0.7921, + "step": 10211 + }, + { + "epoch": 0.5620562496560075, + "grad_norm": 0.9125131964683533, + "learning_rate": 8.198675034952149e-06, + "loss": 0.9015, + "step": 10212 + }, + { + "epoch": 0.5621112884583631, + "grad_norm": 0.6851146221160889, + "learning_rate": 8.198341862419068e-06, + "loss": 0.7773, + "step": 10213 + }, + { + "epoch": 0.5621663272607188, + "grad_norm": 0.6808778047561646, + "learning_rate": 8.198008665848108e-06, + "loss": 0.7375, + "step": 10214 + }, + { + "epoch": 0.5622213660630745, + "grad_norm": 0.6419697999954224, + "learning_rate": 8.19767544524177e-06, + "loss": 0.7496, + "step": 10215 + }, + { + "epoch": 0.5622764048654302, + "grad_norm": 0.7325716614723206, + "learning_rate": 8.197342200602559e-06, + "loss": 0.7424, + "step": 10216 + }, + { + "epoch": 0.5623314436677858, + "grad_norm": 0.6165832281112671, + "learning_rate": 8.19700893193298e-06, + "loss": 0.6364, + "step": 10217 + }, + { + "epoch": 0.5623864824701414, + "grad_norm": 0.7632125020027161, + "learning_rate": 8.196675639235539e-06, + "loss": 0.7175, + "step": 10218 + }, + { + "epoch": 0.5624415212724971, + "grad_norm": 0.6789713501930237, + "learning_rate": 8.196342322512738e-06, + "loss": 0.7122, + "step": 10219 + }, + { + "epoch": 0.5624965600748528, + "grad_norm": 0.7341050505638123, + "learning_rate": 8.196008981767084e-06, + "loss": 0.7598, + "step": 10220 + }, + { + "epoch": 0.5625515988772084, + "grad_norm": 0.7318429350852966, + "learning_rate": 8.195675617001083e-06, + "loss": 0.7723, + "step": 10221 + }, + { + "epoch": 0.5626066376795641, + "grad_norm": 0.6940313577651978, + "learning_rate": 8.195342228217238e-06, + "loss": 0.7885, + "step": 10222 + }, + { + "epoch": 0.5626616764819198, + "grad_norm": 0.8792300820350647, + "learning_rate": 8.195008815418058e-06, + "loss": 0.7657, + "step": 10223 + }, + { + "epoch": 0.5627167152842754, + "grad_norm": 0.7234559655189514, + "learning_rate": 8.194675378606044e-06, + "loss": 0.7988, + "step": 10224 + }, + { + "epoch": 0.562771754086631, + "grad_norm": 0.6698254942893982, + "learning_rate": 8.194341917783708e-06, + "loss": 0.6378, + "step": 10225 + }, + { + "epoch": 0.5628267928889867, + "grad_norm": 0.6546483635902405, + "learning_rate": 8.194008432953552e-06, + "loss": 0.7113, + "step": 10226 + }, + { + "epoch": 0.5628818316913424, + "grad_norm": 0.6532583832740784, + "learning_rate": 8.193674924118085e-06, + "loss": 0.6782, + "step": 10227 + }, + { + "epoch": 0.562936870493698, + "grad_norm": 0.770578920841217, + "learning_rate": 8.19334139127981e-06, + "loss": 0.8519, + "step": 10228 + }, + { + "epoch": 0.5629919092960537, + "grad_norm": 0.7255409359931946, + "learning_rate": 8.193007834441235e-06, + "loss": 0.6555, + "step": 10229 + }, + { + "epoch": 0.5630469480984094, + "grad_norm": 0.6659883856773376, + "learning_rate": 8.19267425360487e-06, + "loss": 0.7836, + "step": 10230 + }, + { + "epoch": 0.5631019869007651, + "grad_norm": 0.6596028208732605, + "learning_rate": 8.192340648773221e-06, + "loss": 0.6199, + "step": 10231 + }, + { + "epoch": 0.5631570257031207, + "grad_norm": 0.8226001858711243, + "learning_rate": 8.192007019948793e-06, + "loss": 0.8101, + "step": 10232 + }, + { + "epoch": 0.5632120645054763, + "grad_norm": 0.7465038895606995, + "learning_rate": 8.191673367134094e-06, + "loss": 0.8437, + "step": 10233 + }, + { + "epoch": 0.563267103307832, + "grad_norm": 1.0008004903793335, + "learning_rate": 8.191339690331632e-06, + "loss": 0.8626, + "step": 10234 + }, + { + "epoch": 0.5633221421101877, + "grad_norm": 0.7538222670555115, + "learning_rate": 8.191005989543917e-06, + "loss": 0.7222, + "step": 10235 + }, + { + "epoch": 0.5633771809125433, + "grad_norm": 0.6252872943878174, + "learning_rate": 8.190672264773454e-06, + "loss": 0.8038, + "step": 10236 + }, + { + "epoch": 0.563432219714899, + "grad_norm": 0.7083514928817749, + "learning_rate": 8.190338516022752e-06, + "loss": 0.7863, + "step": 10237 + }, + { + "epoch": 0.5634872585172547, + "grad_norm": 0.6887454390525818, + "learning_rate": 8.19000474329432e-06, + "loss": 0.7034, + "step": 10238 + }, + { + "epoch": 0.5635422973196104, + "grad_norm": 0.7487072348594666, + "learning_rate": 8.189670946590666e-06, + "loss": 0.8618, + "step": 10239 + }, + { + "epoch": 0.5635973361219659, + "grad_norm": 0.6999371647834778, + "learning_rate": 8.189337125914298e-06, + "loss": 0.7613, + "step": 10240 + }, + { + "epoch": 0.5636523749243216, + "grad_norm": 0.8265380263328552, + "learning_rate": 8.18900328126773e-06, + "loss": 0.7576, + "step": 10241 + }, + { + "epoch": 0.5637074137266773, + "grad_norm": 0.6688962578773499, + "learning_rate": 8.188669412653463e-06, + "loss": 0.712, + "step": 10242 + }, + { + "epoch": 0.563762452529033, + "grad_norm": 0.6343923211097717, + "learning_rate": 8.188335520074011e-06, + "loss": 0.7239, + "step": 10243 + }, + { + "epoch": 0.5638174913313886, + "grad_norm": 0.7122388482093811, + "learning_rate": 8.188001603531883e-06, + "loss": 0.7892, + "step": 10244 + }, + { + "epoch": 0.5638725301337443, + "grad_norm": 0.6646286845207214, + "learning_rate": 8.187667663029587e-06, + "loss": 0.7805, + "step": 10245 + }, + { + "epoch": 0.5639275689361, + "grad_norm": 0.742938220500946, + "learning_rate": 8.187333698569638e-06, + "loss": 0.8444, + "step": 10246 + }, + { + "epoch": 0.5639826077384557, + "grad_norm": 0.7260885238647461, + "learning_rate": 8.18699971015454e-06, + "loss": 0.8621, + "step": 10247 + }, + { + "epoch": 0.5640376465408112, + "grad_norm": 0.7920067310333252, + "learning_rate": 8.186665697786804e-06, + "loss": 0.7391, + "step": 10248 + }, + { + "epoch": 0.5640926853431669, + "grad_norm": 0.7472825646400452, + "learning_rate": 8.186331661468943e-06, + "loss": 0.7249, + "step": 10249 + }, + { + "epoch": 0.5641477241455226, + "grad_norm": 0.692643940448761, + "learning_rate": 8.185997601203465e-06, + "loss": 0.7884, + "step": 10250 + }, + { + "epoch": 0.5642027629478783, + "grad_norm": 0.715455174446106, + "learning_rate": 8.185663516992884e-06, + "loss": 0.7369, + "step": 10251 + }, + { + "epoch": 0.5642578017502339, + "grad_norm": 0.7566105723381042, + "learning_rate": 8.185329408839705e-06, + "loss": 0.7378, + "step": 10252 + }, + { + "epoch": 0.5643128405525896, + "grad_norm": 0.8163520693778992, + "learning_rate": 8.184995276746445e-06, + "loss": 0.7326, + "step": 10253 + }, + { + "epoch": 0.5643678793549453, + "grad_norm": 0.6280468106269836, + "learning_rate": 8.184661120715615e-06, + "loss": 0.6858, + "step": 10254 + }, + { + "epoch": 0.564422918157301, + "grad_norm": 0.7246795892715454, + "learning_rate": 8.184326940749723e-06, + "loss": 0.8111, + "step": 10255 + }, + { + "epoch": 0.5644779569596565, + "grad_norm": 0.7429527640342712, + "learning_rate": 8.18399273685128e-06, + "loss": 0.7642, + "step": 10256 + }, + { + "epoch": 0.5645329957620122, + "grad_norm": 0.7308861017227173, + "learning_rate": 8.183658509022802e-06, + "loss": 0.7844, + "step": 10257 + }, + { + "epoch": 0.5645880345643679, + "grad_norm": 0.7549033164978027, + "learning_rate": 8.1833242572668e-06, + "loss": 0.8585, + "step": 10258 + }, + { + "epoch": 0.5646430733667236, + "grad_norm": 0.6779888868331909, + "learning_rate": 8.182989981585782e-06, + "loss": 0.6808, + "step": 10259 + }, + { + "epoch": 0.5646981121690792, + "grad_norm": 0.887113630771637, + "learning_rate": 8.182655681982266e-06, + "loss": 0.8229, + "step": 10260 + }, + { + "epoch": 0.5647531509714349, + "grad_norm": 0.6405711770057678, + "learning_rate": 8.18232135845876e-06, + "loss": 0.6901, + "step": 10261 + }, + { + "epoch": 0.5648081897737905, + "grad_norm": 0.7302486300468445, + "learning_rate": 8.18198701101778e-06, + "loss": 0.6853, + "step": 10262 + }, + { + "epoch": 0.5648632285761462, + "grad_norm": 0.6374662518501282, + "learning_rate": 8.181652639661837e-06, + "loss": 0.7177, + "step": 10263 + }, + { + "epoch": 0.5649182673785018, + "grad_norm": 0.9267570972442627, + "learning_rate": 8.181318244393444e-06, + "loss": 0.7926, + "step": 10264 + }, + { + "epoch": 0.5649733061808575, + "grad_norm": 0.8196623921394348, + "learning_rate": 8.180983825215114e-06, + "loss": 0.7127, + "step": 10265 + }, + { + "epoch": 0.5650283449832132, + "grad_norm": 0.7004575133323669, + "learning_rate": 8.180649382129361e-06, + "loss": 0.7858, + "step": 10266 + }, + { + "epoch": 0.5650833837855688, + "grad_norm": 0.7667824625968933, + "learning_rate": 8.180314915138701e-06, + "loss": 0.7742, + "step": 10267 + }, + { + "epoch": 0.5651384225879245, + "grad_norm": 0.7372623682022095, + "learning_rate": 8.179980424245644e-06, + "loss": 0.7949, + "step": 10268 + }, + { + "epoch": 0.5651934613902801, + "grad_norm": 0.6417940258979797, + "learning_rate": 8.179645909452704e-06, + "loss": 0.6683, + "step": 10269 + }, + { + "epoch": 0.5652485001926358, + "grad_norm": 0.6736140251159668, + "learning_rate": 8.179311370762398e-06, + "loss": 0.6564, + "step": 10270 + }, + { + "epoch": 0.5653035389949914, + "grad_norm": 0.6727200746536255, + "learning_rate": 8.178976808177239e-06, + "loss": 0.8065, + "step": 10271 + }, + { + "epoch": 0.5653585777973471, + "grad_norm": 0.7565415501594543, + "learning_rate": 8.17864222169974e-06, + "loss": 0.9055, + "step": 10272 + }, + { + "epoch": 0.5654136165997028, + "grad_norm": 0.8938627243041992, + "learning_rate": 8.178307611332418e-06, + "loss": 0.8009, + "step": 10273 + }, + { + "epoch": 0.5654686554020585, + "grad_norm": 0.7439131140708923, + "learning_rate": 8.177972977077786e-06, + "loss": 0.7807, + "step": 10274 + }, + { + "epoch": 0.5655236942044141, + "grad_norm": 0.7603998184204102, + "learning_rate": 8.17763831893836e-06, + "loss": 0.818, + "step": 10275 + }, + { + "epoch": 0.5655787330067698, + "grad_norm": 0.7088946104049683, + "learning_rate": 8.177303636916655e-06, + "loss": 0.7741, + "step": 10276 + }, + { + "epoch": 0.5656337718091254, + "grad_norm": 0.6801518201828003, + "learning_rate": 8.176968931015187e-06, + "loss": 0.7633, + "step": 10277 + }, + { + "epoch": 0.5656888106114811, + "grad_norm": 0.6739299297332764, + "learning_rate": 8.17663420123647e-06, + "loss": 0.7772, + "step": 10278 + }, + { + "epoch": 0.5657438494138367, + "grad_norm": 0.7432494759559631, + "learning_rate": 8.176299447583021e-06, + "loss": 0.7368, + "step": 10279 + }, + { + "epoch": 0.5657988882161924, + "grad_norm": 0.7847158908843994, + "learning_rate": 8.175964670057357e-06, + "loss": 0.7824, + "step": 10280 + }, + { + "epoch": 0.5658539270185481, + "grad_norm": 0.8732449412345886, + "learning_rate": 8.17562986866199e-06, + "loss": 0.8035, + "step": 10281 + }, + { + "epoch": 0.5659089658209038, + "grad_norm": 0.7988447546958923, + "learning_rate": 8.17529504339944e-06, + "loss": 0.828, + "step": 10282 + }, + { + "epoch": 0.5659640046232594, + "grad_norm": 0.7063263058662415, + "learning_rate": 8.174960194272224e-06, + "loss": 0.7723, + "step": 10283 + }, + { + "epoch": 0.566019043425615, + "grad_norm": 0.7635022401809692, + "learning_rate": 8.174625321282856e-06, + "loss": 0.7156, + "step": 10284 + }, + { + "epoch": 0.5660740822279707, + "grad_norm": 0.6505927443504333, + "learning_rate": 8.174290424433853e-06, + "loss": 0.7409, + "step": 10285 + }, + { + "epoch": 0.5661291210303264, + "grad_norm": 0.6919816136360168, + "learning_rate": 8.173955503727734e-06, + "loss": 0.7829, + "step": 10286 + }, + { + "epoch": 0.566184159832682, + "grad_norm": 0.7024216651916504, + "learning_rate": 8.173620559167015e-06, + "loss": 0.7378, + "step": 10287 + }, + { + "epoch": 0.5662391986350377, + "grad_norm": 0.7134365439414978, + "learning_rate": 8.173285590754212e-06, + "loss": 0.7737, + "step": 10288 + }, + { + "epoch": 0.5662942374373934, + "grad_norm": 0.6867973804473877, + "learning_rate": 8.172950598491845e-06, + "loss": 0.7169, + "step": 10289 + }, + { + "epoch": 0.5663492762397491, + "grad_norm": 0.6900742650032043, + "learning_rate": 8.172615582382432e-06, + "loss": 0.7888, + "step": 10290 + }, + { + "epoch": 0.5664043150421046, + "grad_norm": 0.7026718854904175, + "learning_rate": 8.172280542428488e-06, + "loss": 0.8179, + "step": 10291 + }, + { + "epoch": 0.5664593538444603, + "grad_norm": 0.6940855979919434, + "learning_rate": 8.171945478632533e-06, + "loss": 0.7686, + "step": 10292 + }, + { + "epoch": 0.566514392646816, + "grad_norm": 0.6717686653137207, + "learning_rate": 8.171610390997085e-06, + "loss": 0.7865, + "step": 10293 + }, + { + "epoch": 0.5665694314491717, + "grad_norm": 0.6947711110115051, + "learning_rate": 8.171275279524661e-06, + "loss": 0.7811, + "step": 10294 + }, + { + "epoch": 0.5666244702515273, + "grad_norm": 0.6907814741134644, + "learning_rate": 8.170940144217782e-06, + "loss": 0.7095, + "step": 10295 + }, + { + "epoch": 0.566679509053883, + "grad_norm": 0.723952054977417, + "learning_rate": 8.170604985078965e-06, + "loss": 0.7814, + "step": 10296 + }, + { + "epoch": 0.5667345478562387, + "grad_norm": 0.7775490880012512, + "learning_rate": 8.17026980211073e-06, + "loss": 0.797, + "step": 10297 + }, + { + "epoch": 0.5667895866585944, + "grad_norm": 0.7557885646820068, + "learning_rate": 8.169934595315597e-06, + "loss": 0.8423, + "step": 10298 + }, + { + "epoch": 0.5668446254609499, + "grad_norm": 0.7838338017463684, + "learning_rate": 8.169599364696083e-06, + "loss": 0.7114, + "step": 10299 + }, + { + "epoch": 0.5668996642633056, + "grad_norm": 0.6632605791091919, + "learning_rate": 8.169264110254707e-06, + "loss": 0.6723, + "step": 10300 + }, + { + "epoch": 0.5669547030656613, + "grad_norm": 0.735756516456604, + "learning_rate": 8.168928831993991e-06, + "loss": 0.7533, + "step": 10301 + }, + { + "epoch": 0.567009741868017, + "grad_norm": 0.6981016993522644, + "learning_rate": 8.168593529916457e-06, + "loss": 0.7882, + "step": 10302 + }, + { + "epoch": 0.5670647806703726, + "grad_norm": 0.6413942575454712, + "learning_rate": 8.168258204024619e-06, + "loss": 0.6593, + "step": 10303 + }, + { + "epoch": 0.5671198194727283, + "grad_norm": 0.7040891051292419, + "learning_rate": 8.167922854321002e-06, + "loss": 0.7295, + "step": 10304 + }, + { + "epoch": 0.567174858275084, + "grad_norm": 0.7132521867752075, + "learning_rate": 8.167587480808126e-06, + "loss": 0.7128, + "step": 10305 + }, + { + "epoch": 0.5672298970774396, + "grad_norm": 0.756529688835144, + "learning_rate": 8.167252083488508e-06, + "loss": 0.7044, + "step": 10306 + }, + { + "epoch": 0.5672849358797952, + "grad_norm": 0.8456888198852539, + "learning_rate": 8.166916662364672e-06, + "loss": 0.8304, + "step": 10307 + }, + { + "epoch": 0.5673399746821509, + "grad_norm": 0.7758522629737854, + "learning_rate": 8.166581217439138e-06, + "loss": 0.7192, + "step": 10308 + }, + { + "epoch": 0.5673950134845066, + "grad_norm": 0.8110343217849731, + "learning_rate": 8.166245748714428e-06, + "loss": 0.8794, + "step": 10309 + }, + { + "epoch": 0.5674500522868622, + "grad_norm": 0.6803586483001709, + "learning_rate": 8.165910256193062e-06, + "loss": 0.7402, + "step": 10310 + }, + { + "epoch": 0.5675050910892179, + "grad_norm": 0.7294176816940308, + "learning_rate": 8.165574739877563e-06, + "loss": 0.7325, + "step": 10311 + }, + { + "epoch": 0.5675601298915736, + "grad_norm": 0.835488498210907, + "learning_rate": 8.165239199770448e-06, + "loss": 0.8317, + "step": 10312 + }, + { + "epoch": 0.5676151686939293, + "grad_norm": 0.6497608423233032, + "learning_rate": 8.164903635874246e-06, + "loss": 0.6902, + "step": 10313 + }, + { + "epoch": 0.5676702074962848, + "grad_norm": 0.6782082915306091, + "learning_rate": 8.164568048191474e-06, + "loss": 0.7941, + "step": 10314 + }, + { + "epoch": 0.5677252462986405, + "grad_norm": 0.6974388957023621, + "learning_rate": 8.164232436724656e-06, + "loss": 0.7899, + "step": 10315 + }, + { + "epoch": 0.5677802851009962, + "grad_norm": 0.7222558259963989, + "learning_rate": 8.163896801476314e-06, + "loss": 0.8034, + "step": 10316 + }, + { + "epoch": 0.5678353239033519, + "grad_norm": 0.6562586426734924, + "learning_rate": 8.16356114244897e-06, + "loss": 0.7864, + "step": 10317 + }, + { + "epoch": 0.5678903627057075, + "grad_norm": 0.6888270378112793, + "learning_rate": 8.16322545964515e-06, + "loss": 0.8455, + "step": 10318 + }, + { + "epoch": 0.5679454015080632, + "grad_norm": 0.642084002494812, + "learning_rate": 8.162889753067372e-06, + "loss": 0.7478, + "step": 10319 + }, + { + "epoch": 0.5680004403104189, + "grad_norm": 0.7077270746231079, + "learning_rate": 8.16255402271816e-06, + "loss": 0.7281, + "step": 10320 + }, + { + "epoch": 0.5680554791127745, + "grad_norm": 0.7202198505401611, + "learning_rate": 8.16221826860004e-06, + "loss": 0.7893, + "step": 10321 + }, + { + "epoch": 0.5681105179151301, + "grad_norm": 0.8950369954109192, + "learning_rate": 8.161882490715534e-06, + "loss": 0.772, + "step": 10322 + }, + { + "epoch": 0.5681655567174858, + "grad_norm": 0.6986666917800903, + "learning_rate": 8.161546689067166e-06, + "loss": 0.7712, + "step": 10323 + }, + { + "epoch": 0.5682205955198415, + "grad_norm": 0.7095959782600403, + "learning_rate": 8.161210863657458e-06, + "loss": 0.8373, + "step": 10324 + }, + { + "epoch": 0.5682756343221972, + "grad_norm": 0.7510485649108887, + "learning_rate": 8.160875014488936e-06, + "loss": 0.9106, + "step": 10325 + }, + { + "epoch": 0.5683306731245528, + "grad_norm": 0.7558283805847168, + "learning_rate": 8.160539141564123e-06, + "loss": 0.8192, + "step": 10326 + }, + { + "epoch": 0.5683857119269085, + "grad_norm": 0.7523400187492371, + "learning_rate": 8.160203244885545e-06, + "loss": 0.8276, + "step": 10327 + }, + { + "epoch": 0.5684407507292641, + "grad_norm": 0.6911195516586304, + "learning_rate": 8.159867324455724e-06, + "loss": 0.6286, + "step": 10328 + }, + { + "epoch": 0.5684957895316198, + "grad_norm": 0.6456325054168701, + "learning_rate": 8.159531380277188e-06, + "loss": 0.7419, + "step": 10329 + }, + { + "epoch": 0.5685508283339754, + "grad_norm": 0.9318492412567139, + "learning_rate": 8.159195412352458e-06, + "loss": 0.8131, + "step": 10330 + }, + { + "epoch": 0.5686058671363311, + "grad_norm": 0.7012938857078552, + "learning_rate": 8.158859420684062e-06, + "loss": 0.7074, + "step": 10331 + }, + { + "epoch": 0.5686609059386868, + "grad_norm": 0.7152053117752075, + "learning_rate": 8.158523405274523e-06, + "loss": 0.7186, + "step": 10332 + }, + { + "epoch": 0.5687159447410425, + "grad_norm": 0.7074982523918152, + "learning_rate": 8.158187366126368e-06, + "loss": 0.8021, + "step": 10333 + }, + { + "epoch": 0.5687709835433981, + "grad_norm": 0.689536452293396, + "learning_rate": 8.157851303242123e-06, + "loss": 0.7493, + "step": 10334 + }, + { + "epoch": 0.5688260223457537, + "grad_norm": 0.7411753535270691, + "learning_rate": 8.157515216624313e-06, + "loss": 0.8012, + "step": 10335 + }, + { + "epoch": 0.5688810611481094, + "grad_norm": 0.6831420063972473, + "learning_rate": 8.157179106275463e-06, + "loss": 0.7114, + "step": 10336 + }, + { + "epoch": 0.5689360999504651, + "grad_norm": 0.6786901950836182, + "learning_rate": 8.1568429721981e-06, + "loss": 0.7638, + "step": 10337 + }, + { + "epoch": 0.5689911387528207, + "grad_norm": 0.7546970844268799, + "learning_rate": 8.15650681439475e-06, + "loss": 0.7711, + "step": 10338 + }, + { + "epoch": 0.5690461775551764, + "grad_norm": 0.8071785569190979, + "learning_rate": 8.156170632867942e-06, + "loss": 0.8105, + "step": 10339 + }, + { + "epoch": 0.5691012163575321, + "grad_norm": 0.7872087359428406, + "learning_rate": 8.155834427620198e-06, + "loss": 0.7657, + "step": 10340 + }, + { + "epoch": 0.5691562551598878, + "grad_norm": 0.724328875541687, + "learning_rate": 8.155498198654047e-06, + "loss": 0.7978, + "step": 10341 + }, + { + "epoch": 0.5692112939622433, + "grad_norm": 0.8559905886650085, + "learning_rate": 8.155161945972016e-06, + "loss": 0.7766, + "step": 10342 + }, + { + "epoch": 0.569266332764599, + "grad_norm": 0.607418417930603, + "learning_rate": 8.154825669576635e-06, + "loss": 0.642, + "step": 10343 + }, + { + "epoch": 0.5693213715669547, + "grad_norm": 0.7403624653816223, + "learning_rate": 8.154489369470426e-06, + "loss": 0.7301, + "step": 10344 + }, + { + "epoch": 0.5693764103693104, + "grad_norm": 0.7388540506362915, + "learning_rate": 8.154153045655922e-06, + "loss": 0.7895, + "step": 10345 + }, + { + "epoch": 0.569431449171666, + "grad_norm": 0.8327579498291016, + "learning_rate": 8.153816698135646e-06, + "loss": 0.7589, + "step": 10346 + }, + { + "epoch": 0.5694864879740217, + "grad_norm": 0.7738710641860962, + "learning_rate": 8.153480326912128e-06, + "loss": 0.7828, + "step": 10347 + }, + { + "epoch": 0.5695415267763774, + "grad_norm": 0.8280724287033081, + "learning_rate": 8.153143931987896e-06, + "loss": 0.8194, + "step": 10348 + }, + { + "epoch": 0.5695965655787331, + "grad_norm": 0.8290724754333496, + "learning_rate": 8.152807513365478e-06, + "loss": 0.5941, + "step": 10349 + }, + { + "epoch": 0.5696516043810886, + "grad_norm": 0.7514322400093079, + "learning_rate": 8.152471071047403e-06, + "loss": 0.676, + "step": 10350 + }, + { + "epoch": 0.5697066431834443, + "grad_norm": 0.6990258693695068, + "learning_rate": 8.1521346050362e-06, + "loss": 0.804, + "step": 10351 + }, + { + "epoch": 0.5697616819858, + "grad_norm": 0.6781288981437683, + "learning_rate": 8.151798115334396e-06, + "loss": 0.7372, + "step": 10352 + }, + { + "epoch": 0.5698167207881556, + "grad_norm": 0.764301061630249, + "learning_rate": 8.151461601944523e-06, + "loss": 0.8242, + "step": 10353 + }, + { + "epoch": 0.5698717595905113, + "grad_norm": 0.7577376961708069, + "learning_rate": 8.151125064869106e-06, + "loss": 0.7354, + "step": 10354 + }, + { + "epoch": 0.569926798392867, + "grad_norm": 0.767764687538147, + "learning_rate": 8.150788504110678e-06, + "loss": 0.7262, + "step": 10355 + }, + { + "epoch": 0.5699818371952227, + "grad_norm": 0.6634765267372131, + "learning_rate": 8.150451919671767e-06, + "loss": 0.7527, + "step": 10356 + }, + { + "epoch": 0.5700368759975782, + "grad_norm": 0.8803308010101318, + "learning_rate": 8.150115311554901e-06, + "loss": 0.8172, + "step": 10357 + }, + { + "epoch": 0.5700919147999339, + "grad_norm": 0.695791482925415, + "learning_rate": 8.149778679762611e-06, + "loss": 0.7538, + "step": 10358 + }, + { + "epoch": 0.5701469536022896, + "grad_norm": 0.7047555446624756, + "learning_rate": 8.149442024297432e-06, + "loss": 0.7533, + "step": 10359 + }, + { + "epoch": 0.5702019924046453, + "grad_norm": 0.7148274183273315, + "learning_rate": 8.149105345161886e-06, + "loss": 0.6736, + "step": 10360 + }, + { + "epoch": 0.5702570312070009, + "grad_norm": 0.673204243183136, + "learning_rate": 8.148768642358508e-06, + "loss": 0.7713, + "step": 10361 + }, + { + "epoch": 0.5703120700093566, + "grad_norm": 0.6258989572525024, + "learning_rate": 8.148431915889827e-06, + "loss": 0.6578, + "step": 10362 + }, + { + "epoch": 0.5703671088117123, + "grad_norm": 0.8411956429481506, + "learning_rate": 8.148095165758377e-06, + "loss": 0.8387, + "step": 10363 + }, + { + "epoch": 0.570422147614068, + "grad_norm": 0.7802130579948425, + "learning_rate": 8.147758391966685e-06, + "loss": 0.8564, + "step": 10364 + }, + { + "epoch": 0.5704771864164235, + "grad_norm": 0.6665176153182983, + "learning_rate": 8.147421594517282e-06, + "loss": 0.688, + "step": 10365 + }, + { + "epoch": 0.5705322252187792, + "grad_norm": 0.7166683673858643, + "learning_rate": 8.147084773412702e-06, + "loss": 0.6704, + "step": 10366 + }, + { + "epoch": 0.5705872640211349, + "grad_norm": 0.6948957443237305, + "learning_rate": 8.146747928655476e-06, + "loss": 0.7116, + "step": 10367 + }, + { + "epoch": 0.5706423028234906, + "grad_norm": 0.588965892791748, + "learning_rate": 8.146411060248134e-06, + "loss": 0.5644, + "step": 10368 + }, + { + "epoch": 0.5706973416258462, + "grad_norm": 0.8020890355110168, + "learning_rate": 8.14607416819321e-06, + "loss": 0.6978, + "step": 10369 + }, + { + "epoch": 0.5707523804282019, + "grad_norm": 0.9900732040405273, + "learning_rate": 8.145737252493234e-06, + "loss": 0.7295, + "step": 10370 + }, + { + "epoch": 0.5708074192305576, + "grad_norm": 0.7236563563346863, + "learning_rate": 8.145400313150737e-06, + "loss": 0.7555, + "step": 10371 + }, + { + "epoch": 0.5708624580329132, + "grad_norm": 0.6784152984619141, + "learning_rate": 8.145063350168257e-06, + "loss": 0.7283, + "step": 10372 + }, + { + "epoch": 0.5709174968352688, + "grad_norm": 0.6255244612693787, + "learning_rate": 8.14472636354832e-06, + "loss": 0.6722, + "step": 10373 + }, + { + "epoch": 0.5709725356376245, + "grad_norm": 0.8250948786735535, + "learning_rate": 8.14438935329346e-06, + "loss": 0.8406, + "step": 10374 + }, + { + "epoch": 0.5710275744399802, + "grad_norm": 0.7308233380317688, + "learning_rate": 8.144052319406215e-06, + "loss": 0.8084, + "step": 10375 + }, + { + "epoch": 0.5710826132423359, + "grad_norm": 0.7850058674812317, + "learning_rate": 8.143715261889112e-06, + "loss": 0.7892, + "step": 10376 + }, + { + "epoch": 0.5711376520446915, + "grad_norm": 0.81241774559021, + "learning_rate": 8.143378180744687e-06, + "loss": 0.7819, + "step": 10377 + }, + { + "epoch": 0.5711926908470472, + "grad_norm": 0.7174570560455322, + "learning_rate": 8.143041075975473e-06, + "loss": 0.7104, + "step": 10378 + }, + { + "epoch": 0.5712477296494028, + "grad_norm": 0.6954129934310913, + "learning_rate": 8.142703947584004e-06, + "loss": 0.7821, + "step": 10379 + }, + { + "epoch": 0.5713027684517585, + "grad_norm": 0.6895242929458618, + "learning_rate": 8.142366795572813e-06, + "loss": 0.7687, + "step": 10380 + }, + { + "epoch": 0.5713578072541141, + "grad_norm": 0.6543757319450378, + "learning_rate": 8.142029619944434e-06, + "loss": 0.7042, + "step": 10381 + }, + { + "epoch": 0.5714128460564698, + "grad_norm": 0.6712427139282227, + "learning_rate": 8.141692420701404e-06, + "loss": 0.6861, + "step": 10382 + }, + { + "epoch": 0.5714678848588255, + "grad_norm": 1.6716055870056152, + "learning_rate": 8.141355197846253e-06, + "loss": 0.8209, + "step": 10383 + }, + { + "epoch": 0.5715229236611812, + "grad_norm": 0.7509854435920715, + "learning_rate": 8.141017951381516e-06, + "loss": 0.8246, + "step": 10384 + }, + { + "epoch": 0.5715779624635368, + "grad_norm": 0.7161786556243896, + "learning_rate": 8.14068068130973e-06, + "loss": 0.835, + "step": 10385 + }, + { + "epoch": 0.5716330012658924, + "grad_norm": 0.7423714995384216, + "learning_rate": 8.140343387633427e-06, + "loss": 0.8004, + "step": 10386 + }, + { + "epoch": 0.5716880400682481, + "grad_norm": 0.6955768465995789, + "learning_rate": 8.140006070355146e-06, + "loss": 0.7299, + "step": 10387 + }, + { + "epoch": 0.5717430788706038, + "grad_norm": 0.6742254495620728, + "learning_rate": 8.13966872947742e-06, + "loss": 0.6549, + "step": 10388 + }, + { + "epoch": 0.5717981176729594, + "grad_norm": 0.7332299947738647, + "learning_rate": 8.139331365002782e-06, + "loss": 0.7945, + "step": 10389 + }, + { + "epoch": 0.5718531564753151, + "grad_norm": 0.6552133560180664, + "learning_rate": 8.138993976933771e-06, + "loss": 0.7193, + "step": 10390 + }, + { + "epoch": 0.5719081952776708, + "grad_norm": 0.6708530187606812, + "learning_rate": 8.138656565272923e-06, + "loss": 0.8053, + "step": 10391 + }, + { + "epoch": 0.5719632340800265, + "grad_norm": 0.7837093472480774, + "learning_rate": 8.138319130022771e-06, + "loss": 0.7752, + "step": 10392 + }, + { + "epoch": 0.572018272882382, + "grad_norm": 0.6910337805747986, + "learning_rate": 8.137981671185853e-06, + "loss": 0.7573, + "step": 10393 + }, + { + "epoch": 0.5720733116847377, + "grad_norm": 0.6758334636688232, + "learning_rate": 8.137644188764704e-06, + "loss": 0.8251, + "step": 10394 + }, + { + "epoch": 0.5721283504870934, + "grad_norm": 0.7513287663459778, + "learning_rate": 8.137306682761862e-06, + "loss": 0.6491, + "step": 10395 + }, + { + "epoch": 0.572183389289449, + "grad_norm": 0.678210973739624, + "learning_rate": 8.136969153179863e-06, + "loss": 0.7761, + "step": 10396 + }, + { + "epoch": 0.5722384280918047, + "grad_norm": 0.8256083726882935, + "learning_rate": 8.13663160002124e-06, + "loss": 0.7813, + "step": 10397 + }, + { + "epoch": 0.5722934668941604, + "grad_norm": 0.8383314609527588, + "learning_rate": 8.136294023288538e-06, + "loss": 0.7669, + "step": 10398 + }, + { + "epoch": 0.5723485056965161, + "grad_norm": 0.7150036692619324, + "learning_rate": 8.135956422984287e-06, + "loss": 0.8322, + "step": 10399 + }, + { + "epoch": 0.5724035444988717, + "grad_norm": 1.3011385202407837, + "learning_rate": 8.13561879911103e-06, + "loss": 0.8044, + "step": 10400 + }, + { + "epoch": 0.5724585833012273, + "grad_norm": 0.6749194860458374, + "learning_rate": 8.135281151671298e-06, + "loss": 0.6426, + "step": 10401 + }, + { + "epoch": 0.572513622103583, + "grad_norm": 0.7370286583900452, + "learning_rate": 8.134943480667635e-06, + "loss": 0.8051, + "step": 10402 + }, + { + "epoch": 0.5725686609059387, + "grad_norm": 0.6827631592750549, + "learning_rate": 8.134605786102574e-06, + "loss": 0.6961, + "step": 10403 + }, + { + "epoch": 0.5726236997082943, + "grad_norm": 0.7593247294425964, + "learning_rate": 8.134268067978655e-06, + "loss": 0.7514, + "step": 10404 + }, + { + "epoch": 0.57267873851065, + "grad_norm": 0.7229800224304199, + "learning_rate": 8.133930326298417e-06, + "loss": 0.8105, + "step": 10405 + }, + { + "epoch": 0.5727337773130057, + "grad_norm": 0.720973551273346, + "learning_rate": 8.133592561064396e-06, + "loss": 0.6866, + "step": 10406 + }, + { + "epoch": 0.5727888161153614, + "grad_norm": 0.7530742883682251, + "learning_rate": 8.133254772279135e-06, + "loss": 0.773, + "step": 10407 + }, + { + "epoch": 0.5728438549177169, + "grad_norm": 0.6897457838058472, + "learning_rate": 8.132916959945167e-06, + "loss": 0.8107, + "step": 10408 + }, + { + "epoch": 0.5728988937200726, + "grad_norm": 0.6659066081047058, + "learning_rate": 8.132579124065034e-06, + "loss": 0.8036, + "step": 10409 + }, + { + "epoch": 0.5729539325224283, + "grad_norm": 0.6925005316734314, + "learning_rate": 8.132241264641276e-06, + "loss": 0.7869, + "step": 10410 + }, + { + "epoch": 0.573008971324784, + "grad_norm": 0.8681634068489075, + "learning_rate": 8.131903381676433e-06, + "loss": 0.7411, + "step": 10411 + }, + { + "epoch": 0.5730640101271396, + "grad_norm": 0.669561505317688, + "learning_rate": 8.13156547517304e-06, + "loss": 0.7398, + "step": 10412 + }, + { + "epoch": 0.5731190489294953, + "grad_norm": 0.6737409234046936, + "learning_rate": 8.131227545133639e-06, + "loss": 0.7319, + "step": 10413 + }, + { + "epoch": 0.573174087731851, + "grad_norm": 0.7111513614654541, + "learning_rate": 8.130889591560772e-06, + "loss": 0.7192, + "step": 10414 + }, + { + "epoch": 0.5732291265342067, + "grad_norm": 0.6618744134902954, + "learning_rate": 8.130551614456974e-06, + "loss": 0.6636, + "step": 10415 + }, + { + "epoch": 0.5732841653365622, + "grad_norm": 0.8150144815444946, + "learning_rate": 8.13021361382479e-06, + "loss": 0.7168, + "step": 10416 + }, + { + "epoch": 0.5733392041389179, + "grad_norm": 0.744898796081543, + "learning_rate": 8.129875589666758e-06, + "loss": 0.8562, + "step": 10417 + }, + { + "epoch": 0.5733942429412736, + "grad_norm": 0.7831705212593079, + "learning_rate": 8.129537541985419e-06, + "loss": 0.8491, + "step": 10418 + }, + { + "epoch": 0.5734492817436293, + "grad_norm": 0.8097667098045349, + "learning_rate": 8.129199470783313e-06, + "loss": 0.7623, + "step": 10419 + }, + { + "epoch": 0.5735043205459849, + "grad_norm": 0.7951840758323669, + "learning_rate": 8.128861376062982e-06, + "loss": 0.8195, + "step": 10420 + }, + { + "epoch": 0.5735593593483406, + "grad_norm": 0.5902833938598633, + "learning_rate": 8.128523257826966e-06, + "loss": 0.6244, + "step": 10421 + }, + { + "epoch": 0.5736143981506963, + "grad_norm": 1.113287329673767, + "learning_rate": 8.128185116077805e-06, + "loss": 0.8382, + "step": 10422 + }, + { + "epoch": 0.573669436953052, + "grad_norm": 0.6899390816688538, + "learning_rate": 8.127846950818046e-06, + "loss": 0.7632, + "step": 10423 + }, + { + "epoch": 0.5737244757554075, + "grad_norm": 0.6905965805053711, + "learning_rate": 8.127508762050225e-06, + "loss": 0.7429, + "step": 10424 + }, + { + "epoch": 0.5737795145577632, + "grad_norm": 0.7036122679710388, + "learning_rate": 8.127170549776882e-06, + "loss": 0.7699, + "step": 10425 + }, + { + "epoch": 0.5738345533601189, + "grad_norm": 0.6599798202514648, + "learning_rate": 8.126832314000566e-06, + "loss": 0.7169, + "step": 10426 + }, + { + "epoch": 0.5738895921624746, + "grad_norm": 0.8682155609130859, + "learning_rate": 8.126494054723815e-06, + "loss": 0.851, + "step": 10427 + }, + { + "epoch": 0.5739446309648302, + "grad_norm": 0.6661516427993774, + "learning_rate": 8.12615577194917e-06, + "loss": 0.7287, + "step": 10428 + }, + { + "epoch": 0.5739996697671859, + "grad_norm": 0.6805256009101868, + "learning_rate": 8.125817465679176e-06, + "loss": 0.7033, + "step": 10429 + }, + { + "epoch": 0.5740547085695415, + "grad_norm": 0.7088646292686462, + "learning_rate": 8.125479135916375e-06, + "loss": 0.7295, + "step": 10430 + }, + { + "epoch": 0.5741097473718972, + "grad_norm": 0.6854971647262573, + "learning_rate": 8.12514078266331e-06, + "loss": 0.8102, + "step": 10431 + }, + { + "epoch": 0.5741647861742528, + "grad_norm": 0.7481474876403809, + "learning_rate": 8.124802405922521e-06, + "loss": 0.7463, + "step": 10432 + }, + { + "epoch": 0.5742198249766085, + "grad_norm": 0.8280898928642273, + "learning_rate": 8.124464005696556e-06, + "loss": 0.8067, + "step": 10433 + }, + { + "epoch": 0.5742748637789642, + "grad_norm": 0.696812629699707, + "learning_rate": 8.124125581987953e-06, + "loss": 0.7041, + "step": 10434 + }, + { + "epoch": 0.5743299025813199, + "grad_norm": 0.791084349155426, + "learning_rate": 8.123787134799262e-06, + "loss": 0.8244, + "step": 10435 + }, + { + "epoch": 0.5743849413836755, + "grad_norm": 0.7422665953636169, + "learning_rate": 8.123448664133022e-06, + "loss": 0.7792, + "step": 10436 + }, + { + "epoch": 0.5744399801860312, + "grad_norm": 0.7302834987640381, + "learning_rate": 8.123110169991777e-06, + "loss": 0.7617, + "step": 10437 + }, + { + "epoch": 0.5744950189883868, + "grad_norm": 0.6640440821647644, + "learning_rate": 8.122771652378071e-06, + "loss": 0.7965, + "step": 10438 + }, + { + "epoch": 0.5745500577907424, + "grad_norm": 0.7704516649246216, + "learning_rate": 8.12243311129445e-06, + "loss": 0.7814, + "step": 10439 + }, + { + "epoch": 0.5746050965930981, + "grad_norm": 0.673254668712616, + "learning_rate": 8.122094546743459e-06, + "loss": 0.7364, + "step": 10440 + }, + { + "epoch": 0.5746601353954538, + "grad_norm": 0.7648451924324036, + "learning_rate": 8.121755958727639e-06, + "loss": 0.8585, + "step": 10441 + }, + { + "epoch": 0.5747151741978095, + "grad_norm": 0.6660173535346985, + "learning_rate": 8.121417347249539e-06, + "loss": 0.6989, + "step": 10442 + }, + { + "epoch": 0.5747702130001651, + "grad_norm": 0.7128653526306152, + "learning_rate": 8.1210787123117e-06, + "loss": 0.8317, + "step": 10443 + }, + { + "epoch": 0.5748252518025208, + "grad_norm": 0.6404966115951538, + "learning_rate": 8.12074005391667e-06, + "loss": 0.6957, + "step": 10444 + }, + { + "epoch": 0.5748802906048764, + "grad_norm": 0.9597657918930054, + "learning_rate": 8.120401372066993e-06, + "loss": 0.9266, + "step": 10445 + }, + { + "epoch": 0.5749353294072321, + "grad_norm": 0.7735045552253723, + "learning_rate": 8.120062666765213e-06, + "loss": 0.8159, + "step": 10446 + }, + { + "epoch": 0.5749903682095877, + "grad_norm": 0.8031814098358154, + "learning_rate": 8.11972393801388e-06, + "loss": 0.7741, + "step": 10447 + }, + { + "epoch": 0.5750454070119434, + "grad_norm": 0.7008558511734009, + "learning_rate": 8.119385185815535e-06, + "loss": 0.6558, + "step": 10448 + }, + { + "epoch": 0.5751004458142991, + "grad_norm": 0.8162875175476074, + "learning_rate": 8.119046410172725e-06, + "loss": 0.7196, + "step": 10449 + }, + { + "epoch": 0.5751554846166548, + "grad_norm": 0.8142701983451843, + "learning_rate": 8.118707611088e-06, + "loss": 0.7709, + "step": 10450 + }, + { + "epoch": 0.5752105234190104, + "grad_norm": 0.7671986818313599, + "learning_rate": 8.118368788563902e-06, + "loss": 0.8725, + "step": 10451 + }, + { + "epoch": 0.575265562221366, + "grad_norm": 0.6604374051094055, + "learning_rate": 8.118029942602979e-06, + "loss": 0.7119, + "step": 10452 + }, + { + "epoch": 0.5753206010237217, + "grad_norm": 0.7119179368019104, + "learning_rate": 8.117691073207776e-06, + "loss": 0.7445, + "step": 10453 + }, + { + "epoch": 0.5753756398260774, + "grad_norm": 0.7572842240333557, + "learning_rate": 8.117352180380843e-06, + "loss": 0.7672, + "step": 10454 + }, + { + "epoch": 0.575430678628433, + "grad_norm": 0.688667356967926, + "learning_rate": 8.117013264124725e-06, + "loss": 0.7733, + "step": 10455 + }, + { + "epoch": 0.5754857174307887, + "grad_norm": 0.6683163046836853, + "learning_rate": 8.116674324441971e-06, + "loss": 0.6381, + "step": 10456 + }, + { + "epoch": 0.5755407562331444, + "grad_norm": 0.7792099714279175, + "learning_rate": 8.116335361335126e-06, + "loss": 0.7781, + "step": 10457 + }, + { + "epoch": 0.5755957950355001, + "grad_norm": 0.702132523059845, + "learning_rate": 8.115996374806738e-06, + "loss": 0.7442, + "step": 10458 + }, + { + "epoch": 0.5756508338378556, + "grad_norm": 0.7021365761756897, + "learning_rate": 8.115657364859356e-06, + "loss": 0.7215, + "step": 10459 + }, + { + "epoch": 0.5757058726402113, + "grad_norm": 0.7032247185707092, + "learning_rate": 8.115318331495527e-06, + "loss": 0.7069, + "step": 10460 + }, + { + "epoch": 0.575760911442567, + "grad_norm": 0.8301237225532532, + "learning_rate": 8.1149792747178e-06, + "loss": 0.789, + "step": 10461 + }, + { + "epoch": 0.5758159502449227, + "grad_norm": 0.7051018476486206, + "learning_rate": 8.11464019452872e-06, + "loss": 0.7511, + "step": 10462 + }, + { + "epoch": 0.5758709890472783, + "grad_norm": 0.8422626256942749, + "learning_rate": 8.114301090930843e-06, + "loss": 0.6507, + "step": 10463 + }, + { + "epoch": 0.575926027849634, + "grad_norm": 0.7751632332801819, + "learning_rate": 8.113961963926708e-06, + "loss": 0.7357, + "step": 10464 + }, + { + "epoch": 0.5759810666519897, + "grad_norm": 0.7158333659172058, + "learning_rate": 8.11362281351887e-06, + "loss": 0.8382, + "step": 10465 + }, + { + "epoch": 0.5760361054543454, + "grad_norm": 0.6926481127738953, + "learning_rate": 8.113283639709878e-06, + "loss": 0.7078, + "step": 10466 + }, + { + "epoch": 0.5760911442567009, + "grad_norm": 0.7091588973999023, + "learning_rate": 8.112944442502277e-06, + "loss": 0.7932, + "step": 10467 + }, + { + "epoch": 0.5761461830590566, + "grad_norm": 0.6979780197143555, + "learning_rate": 8.11260522189862e-06, + "loss": 0.6812, + "step": 10468 + }, + { + "epoch": 0.5762012218614123, + "grad_norm": 0.6735736131668091, + "learning_rate": 8.112265977901455e-06, + "loss": 0.7499, + "step": 10469 + }, + { + "epoch": 0.576256260663768, + "grad_norm": 0.6995692849159241, + "learning_rate": 8.111926710513334e-06, + "loss": 0.7123, + "step": 10470 + }, + { + "epoch": 0.5763112994661236, + "grad_norm": 0.7162681818008423, + "learning_rate": 8.111587419736802e-06, + "loss": 0.7586, + "step": 10471 + }, + { + "epoch": 0.5763663382684793, + "grad_norm": 0.945935070514679, + "learning_rate": 8.111248105574414e-06, + "loss": 0.8474, + "step": 10472 + }, + { + "epoch": 0.576421377070835, + "grad_norm": 0.608730673789978, + "learning_rate": 8.110908768028716e-06, + "loss": 0.6433, + "step": 10473 + }, + { + "epoch": 0.5764764158731907, + "grad_norm": 0.6777853965759277, + "learning_rate": 8.110569407102263e-06, + "loss": 0.7913, + "step": 10474 + }, + { + "epoch": 0.5765314546755462, + "grad_norm": 0.6310930848121643, + "learning_rate": 8.1102300227976e-06, + "loss": 0.719, + "step": 10475 + }, + { + "epoch": 0.5765864934779019, + "grad_norm": 0.7048485279083252, + "learning_rate": 8.109890615117282e-06, + "loss": 0.7341, + "step": 10476 + }, + { + "epoch": 0.5766415322802576, + "grad_norm": 0.672987163066864, + "learning_rate": 8.10955118406386e-06, + "loss": 0.7637, + "step": 10477 + }, + { + "epoch": 0.5766965710826133, + "grad_norm": 0.7018216252326965, + "learning_rate": 8.109211729639882e-06, + "loss": 0.6924, + "step": 10478 + }, + { + "epoch": 0.5767516098849689, + "grad_norm": 0.7183761596679688, + "learning_rate": 8.108872251847901e-06, + "loss": 0.7945, + "step": 10479 + }, + { + "epoch": 0.5768066486873246, + "grad_norm": 0.7332683801651001, + "learning_rate": 8.108532750690469e-06, + "loss": 0.7686, + "step": 10480 + }, + { + "epoch": 0.5768616874896803, + "grad_norm": 0.7118290066719055, + "learning_rate": 8.108193226170139e-06, + "loss": 0.6917, + "step": 10481 + }, + { + "epoch": 0.5769167262920358, + "grad_norm": 0.8242507576942444, + "learning_rate": 8.107853678289456e-06, + "loss": 0.9119, + "step": 10482 + }, + { + "epoch": 0.5769717650943915, + "grad_norm": 0.7138590216636658, + "learning_rate": 8.10751410705098e-06, + "loss": 0.7095, + "step": 10483 + }, + { + "epoch": 0.5770268038967472, + "grad_norm": 0.7541199326515198, + "learning_rate": 8.107174512457259e-06, + "loss": 0.8042, + "step": 10484 + }, + { + "epoch": 0.5770818426991029, + "grad_norm": 0.7776939868927002, + "learning_rate": 8.106834894510846e-06, + "loss": 0.8075, + "step": 10485 + }, + { + "epoch": 0.5771368815014585, + "grad_norm": 0.6466917395591736, + "learning_rate": 8.106495253214293e-06, + "loss": 0.707, + "step": 10486 + }, + { + "epoch": 0.5771919203038142, + "grad_norm": 0.687101423740387, + "learning_rate": 8.106155588570153e-06, + "loss": 0.6945, + "step": 10487 + }, + { + "epoch": 0.5772469591061699, + "grad_norm": 0.8338418006896973, + "learning_rate": 8.10581590058098e-06, + "loss": 0.8044, + "step": 10488 + }, + { + "epoch": 0.5773019979085255, + "grad_norm": 0.7052263617515564, + "learning_rate": 8.105476189249325e-06, + "loss": 0.8216, + "step": 10489 + }, + { + "epoch": 0.5773570367108811, + "grad_norm": 0.7205906510353088, + "learning_rate": 8.105136454577744e-06, + "loss": 0.8853, + "step": 10490 + }, + { + "epoch": 0.5774120755132368, + "grad_norm": 0.7875076532363892, + "learning_rate": 8.10479669656879e-06, + "loss": 0.822, + "step": 10491 + }, + { + "epoch": 0.5774671143155925, + "grad_norm": 0.6858797669410706, + "learning_rate": 8.104456915225012e-06, + "loss": 0.7924, + "step": 10492 + }, + { + "epoch": 0.5775221531179482, + "grad_norm": 0.6991322636604309, + "learning_rate": 8.104117110548968e-06, + "loss": 0.8144, + "step": 10493 + }, + { + "epoch": 0.5775771919203038, + "grad_norm": 0.7768846750259399, + "learning_rate": 8.103777282543209e-06, + "loss": 0.7793, + "step": 10494 + }, + { + "epoch": 0.5776322307226595, + "grad_norm": 0.7055716514587402, + "learning_rate": 8.103437431210293e-06, + "loss": 0.7653, + "step": 10495 + }, + { + "epoch": 0.5776872695250151, + "grad_norm": 1.009839653968811, + "learning_rate": 8.10309755655277e-06, + "loss": 0.7646, + "step": 10496 + }, + { + "epoch": 0.5777423083273708, + "grad_norm": 0.699435293674469, + "learning_rate": 8.102757658573197e-06, + "loss": 0.7806, + "step": 10497 + }, + { + "epoch": 0.5777973471297264, + "grad_norm": 0.8566381931304932, + "learning_rate": 8.102417737274129e-06, + "loss": 0.8302, + "step": 10498 + }, + { + "epoch": 0.5778523859320821, + "grad_norm": 0.745801568031311, + "learning_rate": 8.10207779265812e-06, + "loss": 0.91, + "step": 10499 + }, + { + "epoch": 0.5779074247344378, + "grad_norm": 0.6867349743843079, + "learning_rate": 8.101737824727724e-06, + "loss": 0.771, + "step": 10500 + }, + { + "epoch": 0.5779624635367935, + "grad_norm": 0.6693048477172852, + "learning_rate": 8.101397833485496e-06, + "loss": 0.7967, + "step": 10501 + }, + { + "epoch": 0.5780175023391491, + "grad_norm": 0.7485450506210327, + "learning_rate": 8.101057818933993e-06, + "loss": 0.7132, + "step": 10502 + }, + { + "epoch": 0.5780725411415047, + "grad_norm": 0.7619839906692505, + "learning_rate": 8.100717781075769e-06, + "loss": 0.7379, + "step": 10503 + }, + { + "epoch": 0.5781275799438604, + "grad_norm": 0.7651955485343933, + "learning_rate": 8.100377719913382e-06, + "loss": 0.8437, + "step": 10504 + }, + { + "epoch": 0.5781826187462161, + "grad_norm": 0.692385196685791, + "learning_rate": 8.100037635449384e-06, + "loss": 0.7666, + "step": 10505 + }, + { + "epoch": 0.5782376575485717, + "grad_norm": 0.7332374453544617, + "learning_rate": 8.099697527686334e-06, + "loss": 0.7476, + "step": 10506 + }, + { + "epoch": 0.5782926963509274, + "grad_norm": 0.6934877634048462, + "learning_rate": 8.099357396626786e-06, + "loss": 0.8054, + "step": 10507 + }, + { + "epoch": 0.5783477351532831, + "grad_norm": 0.8393011689186096, + "learning_rate": 8.099017242273298e-06, + "loss": 0.8655, + "step": 10508 + }, + { + "epoch": 0.5784027739556388, + "grad_norm": 0.6850646734237671, + "learning_rate": 8.098677064628425e-06, + "loss": 0.7424, + "step": 10509 + }, + { + "epoch": 0.5784578127579943, + "grad_norm": 0.7302095293998718, + "learning_rate": 8.098336863694728e-06, + "loss": 0.903, + "step": 10510 + }, + { + "epoch": 0.57851285156035, + "grad_norm": 0.7474033236503601, + "learning_rate": 8.097996639474757e-06, + "loss": 0.7509, + "step": 10511 + }, + { + "epoch": 0.5785678903627057, + "grad_norm": 0.6525655388832092, + "learning_rate": 8.097656391971074e-06, + "loss": 0.7097, + "step": 10512 + }, + { + "epoch": 0.5786229291650614, + "grad_norm": 0.8197451829910278, + "learning_rate": 8.097316121186234e-06, + "loss": 0.7401, + "step": 10513 + }, + { + "epoch": 0.578677967967417, + "grad_norm": 0.7048231959342957, + "learning_rate": 8.096975827122795e-06, + "loss": 0.7964, + "step": 10514 + }, + { + "epoch": 0.5787330067697727, + "grad_norm": 0.8417022228240967, + "learning_rate": 8.096635509783315e-06, + "loss": 0.7703, + "step": 10515 + }, + { + "epoch": 0.5787880455721284, + "grad_norm": 0.7313926815986633, + "learning_rate": 8.096295169170352e-06, + "loss": 0.7565, + "step": 10516 + }, + { + "epoch": 0.5788430843744841, + "grad_norm": 0.7156692147254944, + "learning_rate": 8.095954805286464e-06, + "loss": 0.7456, + "step": 10517 + }, + { + "epoch": 0.5788981231768396, + "grad_norm": 0.7366768717765808, + "learning_rate": 8.095614418134205e-06, + "loss": 0.72, + "step": 10518 + }, + { + "epoch": 0.5789531619791953, + "grad_norm": 0.7011533379554749, + "learning_rate": 8.09527400771614e-06, + "loss": 0.7683, + "step": 10519 + }, + { + "epoch": 0.579008200781551, + "grad_norm": 0.6849086284637451, + "learning_rate": 8.094933574034823e-06, + "loss": 0.6938, + "step": 10520 + }, + { + "epoch": 0.5790632395839067, + "grad_norm": 0.7351469397544861, + "learning_rate": 8.094593117092814e-06, + "loss": 0.7364, + "step": 10521 + }, + { + "epoch": 0.5791182783862623, + "grad_norm": 0.7133724689483643, + "learning_rate": 8.09425263689267e-06, + "loss": 0.7328, + "step": 10522 + }, + { + "epoch": 0.579173317188618, + "grad_norm": 0.6713461875915527, + "learning_rate": 8.093912133436954e-06, + "loss": 0.7296, + "step": 10523 + }, + { + "epoch": 0.5792283559909737, + "grad_norm": 0.7057825922966003, + "learning_rate": 8.093571606728222e-06, + "loss": 0.7732, + "step": 10524 + }, + { + "epoch": 0.5792833947933292, + "grad_norm": 0.7378783226013184, + "learning_rate": 8.093231056769033e-06, + "loss": 0.7907, + "step": 10525 + }, + { + "epoch": 0.5793384335956849, + "grad_norm": 0.8796947598457336, + "learning_rate": 8.092890483561947e-06, + "loss": 0.7325, + "step": 10526 + }, + { + "epoch": 0.5793934723980406, + "grad_norm": 0.7326352000236511, + "learning_rate": 8.092549887109525e-06, + "loss": 0.7948, + "step": 10527 + }, + { + "epoch": 0.5794485112003963, + "grad_norm": 0.7131063342094421, + "learning_rate": 8.092209267414325e-06, + "loss": 0.7595, + "step": 10528 + }, + { + "epoch": 0.5795035500027519, + "grad_norm": 0.6993252635002136, + "learning_rate": 8.091868624478908e-06, + "loss": 0.782, + "step": 10529 + }, + { + "epoch": 0.5795585888051076, + "grad_norm": 0.6945857405662537, + "learning_rate": 8.091527958305835e-06, + "loss": 0.7283, + "step": 10530 + }, + { + "epoch": 0.5796136276074633, + "grad_norm": 0.8203904032707214, + "learning_rate": 8.091187268897667e-06, + "loss": 0.7787, + "step": 10531 + }, + { + "epoch": 0.579668666409819, + "grad_norm": 0.6450221538543701, + "learning_rate": 8.09084655625696e-06, + "loss": 0.7092, + "step": 10532 + }, + { + "epoch": 0.5797237052121745, + "grad_norm": 0.6852096915245056, + "learning_rate": 8.090505820386279e-06, + "loss": 0.7916, + "step": 10533 + }, + { + "epoch": 0.5797787440145302, + "grad_norm": 1.0816445350646973, + "learning_rate": 8.090165061288182e-06, + "loss": 0.7545, + "step": 10534 + }, + { + "epoch": 0.5798337828168859, + "grad_norm": 0.7312847375869751, + "learning_rate": 8.089824278965233e-06, + "loss": 0.7395, + "step": 10535 + }, + { + "epoch": 0.5798888216192416, + "grad_norm": 0.7281426191329956, + "learning_rate": 8.089483473419992e-06, + "loss": 0.7677, + "step": 10536 + }, + { + "epoch": 0.5799438604215972, + "grad_norm": 0.7392409443855286, + "learning_rate": 8.08914264465502e-06, + "loss": 0.7674, + "step": 10537 + }, + { + "epoch": 0.5799988992239529, + "grad_norm": 0.7041863799095154, + "learning_rate": 8.088801792672877e-06, + "loss": 0.6156, + "step": 10538 + }, + { + "epoch": 0.5800539380263086, + "grad_norm": 0.7113755345344543, + "learning_rate": 8.088460917476128e-06, + "loss": 0.7677, + "step": 10539 + }, + { + "epoch": 0.5801089768286642, + "grad_norm": 0.673966646194458, + "learning_rate": 8.088120019067334e-06, + "loss": 0.7557, + "step": 10540 + }, + { + "epoch": 0.5801640156310198, + "grad_norm": 0.8165854215621948, + "learning_rate": 8.087779097449055e-06, + "loss": 0.8102, + "step": 10541 + }, + { + "epoch": 0.5802190544333755, + "grad_norm": 0.7010880708694458, + "learning_rate": 8.087438152623857e-06, + "loss": 0.7816, + "step": 10542 + }, + { + "epoch": 0.5802740932357312, + "grad_norm": 0.726177990436554, + "learning_rate": 8.0870971845943e-06, + "loss": 0.7671, + "step": 10543 + }, + { + "epoch": 0.5803291320380869, + "grad_norm": 0.7403919696807861, + "learning_rate": 8.086756193362946e-06, + "loss": 0.8449, + "step": 10544 + }, + { + "epoch": 0.5803841708404425, + "grad_norm": 0.6897104382514954, + "learning_rate": 8.086415178932358e-06, + "loss": 0.7563, + "step": 10545 + }, + { + "epoch": 0.5804392096427982, + "grad_norm": 0.7682604193687439, + "learning_rate": 8.0860741413051e-06, + "loss": 0.8019, + "step": 10546 + }, + { + "epoch": 0.5804942484451538, + "grad_norm": 0.7317522168159485, + "learning_rate": 8.085733080483736e-06, + "loss": 0.7446, + "step": 10547 + }, + { + "epoch": 0.5805492872475095, + "grad_norm": 0.8503430485725403, + "learning_rate": 8.085391996470826e-06, + "loss": 0.7343, + "step": 10548 + }, + { + "epoch": 0.5806043260498651, + "grad_norm": 0.8550657629966736, + "learning_rate": 8.085050889268937e-06, + "loss": 0.9267, + "step": 10549 + }, + { + "epoch": 0.5806593648522208, + "grad_norm": 0.7751224637031555, + "learning_rate": 8.084709758880633e-06, + "loss": 0.7404, + "step": 10550 + }, + { + "epoch": 0.5807144036545765, + "grad_norm": 0.6346186399459839, + "learning_rate": 8.084368605308475e-06, + "loss": 0.66, + "step": 10551 + }, + { + "epoch": 0.5807694424569322, + "grad_norm": 0.7295717597007751, + "learning_rate": 8.084027428555027e-06, + "loss": 0.8313, + "step": 10552 + }, + { + "epoch": 0.5808244812592878, + "grad_norm": 0.6962289810180664, + "learning_rate": 8.083686228622856e-06, + "loss": 0.7871, + "step": 10553 + }, + { + "epoch": 0.5808795200616435, + "grad_norm": 0.6968896389007568, + "learning_rate": 8.083345005514522e-06, + "loss": 0.7261, + "step": 10554 + }, + { + "epoch": 0.5809345588639991, + "grad_norm": 0.8374869227409363, + "learning_rate": 8.083003759232595e-06, + "loss": 0.797, + "step": 10555 + }, + { + "epoch": 0.5809895976663548, + "grad_norm": 0.6511034369468689, + "learning_rate": 8.082662489779637e-06, + "loss": 0.7237, + "step": 10556 + }, + { + "epoch": 0.5810446364687104, + "grad_norm": 0.6644287705421448, + "learning_rate": 8.082321197158212e-06, + "loss": 0.6969, + "step": 10557 + }, + { + "epoch": 0.5810996752710661, + "grad_norm": 0.7681102752685547, + "learning_rate": 8.081979881370884e-06, + "loss": 0.7193, + "step": 10558 + }, + { + "epoch": 0.5811547140734218, + "grad_norm": 0.7930792570114136, + "learning_rate": 8.081638542420224e-06, + "loss": 0.7198, + "step": 10559 + }, + { + "epoch": 0.5812097528757775, + "grad_norm": 0.7227992415428162, + "learning_rate": 8.081297180308791e-06, + "loss": 0.7533, + "step": 10560 + }, + { + "epoch": 0.581264791678133, + "grad_norm": 0.7293071150779724, + "learning_rate": 8.080955795039156e-06, + "loss": 0.6228, + "step": 10561 + }, + { + "epoch": 0.5813198304804887, + "grad_norm": 0.7356483936309814, + "learning_rate": 8.080614386613879e-06, + "loss": 0.7299, + "step": 10562 + }, + { + "epoch": 0.5813748692828444, + "grad_norm": 0.8181473016738892, + "learning_rate": 8.080272955035531e-06, + "loss": 0.6576, + "step": 10563 + }, + { + "epoch": 0.5814299080852001, + "grad_norm": 0.7066958546638489, + "learning_rate": 8.079931500306675e-06, + "loss": 0.7372, + "step": 10564 + }, + { + "epoch": 0.5814849468875557, + "grad_norm": 0.6821097135543823, + "learning_rate": 8.079590022429877e-06, + "loss": 0.7516, + "step": 10565 + }, + { + "epoch": 0.5815399856899114, + "grad_norm": 0.6879069209098816, + "learning_rate": 8.079248521407707e-06, + "loss": 0.7525, + "step": 10566 + }, + { + "epoch": 0.5815950244922671, + "grad_norm": 0.956345796585083, + "learning_rate": 8.078906997242729e-06, + "loss": 0.8175, + "step": 10567 + }, + { + "epoch": 0.5816500632946227, + "grad_norm": 0.6942328214645386, + "learning_rate": 8.078565449937508e-06, + "loss": 0.6264, + "step": 10568 + }, + { + "epoch": 0.5817051020969783, + "grad_norm": 0.7073766589164734, + "learning_rate": 8.078223879494615e-06, + "loss": 0.766, + "step": 10569 + }, + { + "epoch": 0.581760140899334, + "grad_norm": 0.7649571895599365, + "learning_rate": 8.077882285916614e-06, + "loss": 0.8767, + "step": 10570 + }, + { + "epoch": 0.5818151797016897, + "grad_norm": 0.6384355425834656, + "learning_rate": 8.077540669206076e-06, + "loss": 0.7444, + "step": 10571 + }, + { + "epoch": 0.5818702185040453, + "grad_norm": 0.7173928022384644, + "learning_rate": 8.077199029365565e-06, + "loss": 0.8277, + "step": 10572 + }, + { + "epoch": 0.581925257306401, + "grad_norm": 0.7310757637023926, + "learning_rate": 8.076857366397648e-06, + "loss": 0.8425, + "step": 10573 + }, + { + "epoch": 0.5819802961087567, + "grad_norm": 0.6888872385025024, + "learning_rate": 8.076515680304897e-06, + "loss": 0.6961, + "step": 10574 + }, + { + "epoch": 0.5820353349111124, + "grad_norm": 0.7290124297142029, + "learning_rate": 8.076173971089877e-06, + "loss": 0.7865, + "step": 10575 + }, + { + "epoch": 0.582090373713468, + "grad_norm": 0.7402634024620056, + "learning_rate": 8.075832238755156e-06, + "loss": 0.7196, + "step": 10576 + }, + { + "epoch": 0.5821454125158236, + "grad_norm": 0.74916672706604, + "learning_rate": 8.075490483303305e-06, + "loss": 0.8361, + "step": 10577 + }, + { + "epoch": 0.5822004513181793, + "grad_norm": 0.8146494626998901, + "learning_rate": 8.07514870473689e-06, + "loss": 0.7398, + "step": 10578 + }, + { + "epoch": 0.582255490120535, + "grad_norm": 0.6632487177848816, + "learning_rate": 8.07480690305848e-06, + "loss": 0.7239, + "step": 10579 + }, + { + "epoch": 0.5823105289228906, + "grad_norm": 0.6912766695022583, + "learning_rate": 8.074465078270645e-06, + "loss": 0.7488, + "step": 10580 + }, + { + "epoch": 0.5823655677252463, + "grad_norm": 0.7410522699356079, + "learning_rate": 8.074123230375952e-06, + "loss": 0.7413, + "step": 10581 + }, + { + "epoch": 0.582420606527602, + "grad_norm": 0.7932689189910889, + "learning_rate": 8.073781359376972e-06, + "loss": 0.7894, + "step": 10582 + }, + { + "epoch": 0.5824756453299577, + "grad_norm": 0.6710309982299805, + "learning_rate": 8.073439465276277e-06, + "loss": 0.6727, + "step": 10583 + }, + { + "epoch": 0.5825306841323132, + "grad_norm": 0.7457143068313599, + "learning_rate": 8.07309754807643e-06, + "loss": 0.6719, + "step": 10584 + }, + { + "epoch": 0.5825857229346689, + "grad_norm": 0.7340453863143921, + "learning_rate": 8.072755607780008e-06, + "loss": 0.7397, + "step": 10585 + }, + { + "epoch": 0.5826407617370246, + "grad_norm": 0.7532176971435547, + "learning_rate": 8.072413644389574e-06, + "loss": 0.7368, + "step": 10586 + }, + { + "epoch": 0.5826958005393803, + "grad_norm": 0.9317812919616699, + "learning_rate": 8.072071657907703e-06, + "loss": 0.9113, + "step": 10587 + }, + { + "epoch": 0.5827508393417359, + "grad_norm": 0.8535491228103638, + "learning_rate": 8.071729648336963e-06, + "loss": 0.7708, + "step": 10588 + }, + { + "epoch": 0.5828058781440916, + "grad_norm": 0.6720348000526428, + "learning_rate": 8.071387615679926e-06, + "loss": 0.7521, + "step": 10589 + }, + { + "epoch": 0.5828609169464473, + "grad_norm": 0.7113864421844482, + "learning_rate": 8.071045559939162e-06, + "loss": 0.8713, + "step": 10590 + }, + { + "epoch": 0.582915955748803, + "grad_norm": 0.7760024070739746, + "learning_rate": 8.070703481117242e-06, + "loss": 0.7567, + "step": 10591 + }, + { + "epoch": 0.5829709945511585, + "grad_norm": 0.9548617005348206, + "learning_rate": 8.070361379216735e-06, + "loss": 0.7937, + "step": 10592 + }, + { + "epoch": 0.5830260333535142, + "grad_norm": 0.7796840667724609, + "learning_rate": 8.070019254240216e-06, + "loss": 0.7485, + "step": 10593 + }, + { + "epoch": 0.5830810721558699, + "grad_norm": 0.7006514668464661, + "learning_rate": 8.069677106190253e-06, + "loss": 0.7813, + "step": 10594 + }, + { + "epoch": 0.5831361109582256, + "grad_norm": 0.646396279335022, + "learning_rate": 8.069334935069417e-06, + "loss": 0.7437, + "step": 10595 + }, + { + "epoch": 0.5831911497605812, + "grad_norm": 0.8257368206977844, + "learning_rate": 8.068992740880283e-06, + "loss": 0.7351, + "step": 10596 + }, + { + "epoch": 0.5832461885629369, + "grad_norm": 0.6646208763122559, + "learning_rate": 8.068650523625422e-06, + "loss": 0.6554, + "step": 10597 + }, + { + "epoch": 0.5833012273652926, + "grad_norm": 0.8495579957962036, + "learning_rate": 8.068308283307402e-06, + "loss": 0.791, + "step": 10598 + }, + { + "epoch": 0.5833562661676482, + "grad_norm": 0.7283076047897339, + "learning_rate": 8.0679660199288e-06, + "loss": 0.7327, + "step": 10599 + }, + { + "epoch": 0.5834113049700038, + "grad_norm": 0.704572856426239, + "learning_rate": 8.067623733492187e-06, + "loss": 0.6094, + "step": 10600 + }, + { + "epoch": 0.5834663437723595, + "grad_norm": 0.6435144543647766, + "learning_rate": 8.067281424000136e-06, + "loss": 0.6974, + "step": 10601 + }, + { + "epoch": 0.5835213825747152, + "grad_norm": 0.9628346562385559, + "learning_rate": 8.066939091455215e-06, + "loss": 0.8933, + "step": 10602 + }, + { + "epoch": 0.5835764213770709, + "grad_norm": 0.6856930255889893, + "learning_rate": 8.066596735860004e-06, + "loss": 0.7414, + "step": 10603 + }, + { + "epoch": 0.5836314601794265, + "grad_norm": 0.7341175675392151, + "learning_rate": 8.066254357217072e-06, + "loss": 0.7553, + "step": 10604 + }, + { + "epoch": 0.5836864989817822, + "grad_norm": 0.7124871611595154, + "learning_rate": 8.065911955528995e-06, + "loss": 0.663, + "step": 10605 + }, + { + "epoch": 0.5837415377841378, + "grad_norm": 0.816028892993927, + "learning_rate": 8.065569530798341e-06, + "loss": 0.8778, + "step": 10606 + }, + { + "epoch": 0.5837965765864935, + "grad_norm": 0.8735721111297607, + "learning_rate": 8.06522708302769e-06, + "loss": 0.7866, + "step": 10607 + }, + { + "epoch": 0.5838516153888491, + "grad_norm": 0.6780036687850952, + "learning_rate": 8.06488461221961e-06, + "loss": 0.7329, + "step": 10608 + }, + { + "epoch": 0.5839066541912048, + "grad_norm": 0.7624822854995728, + "learning_rate": 8.06454211837668e-06, + "loss": 0.8095, + "step": 10609 + }, + { + "epoch": 0.5839616929935605, + "grad_norm": 0.8269234895706177, + "learning_rate": 8.06419960150147e-06, + "loss": 0.7194, + "step": 10610 + }, + { + "epoch": 0.5840167317959161, + "grad_norm": 0.6748649477958679, + "learning_rate": 8.063857061596558e-06, + "loss": 0.702, + "step": 10611 + }, + { + "epoch": 0.5840717705982718, + "grad_norm": 0.9700273275375366, + "learning_rate": 8.063514498664515e-06, + "loss": 0.7917, + "step": 10612 + }, + { + "epoch": 0.5841268094006274, + "grad_norm": 0.7798827290534973, + "learning_rate": 8.063171912707916e-06, + "loss": 0.798, + "step": 10613 + }, + { + "epoch": 0.5841818482029831, + "grad_norm": 0.6613249778747559, + "learning_rate": 8.06282930372934e-06, + "loss": 0.7216, + "step": 10614 + }, + { + "epoch": 0.5842368870053387, + "grad_norm": 0.727116048336029, + "learning_rate": 8.062486671731357e-06, + "loss": 0.8054, + "step": 10615 + }, + { + "epoch": 0.5842919258076944, + "grad_norm": 0.6704444289207458, + "learning_rate": 8.062144016716543e-06, + "loss": 0.7503, + "step": 10616 + }, + { + "epoch": 0.5843469646100501, + "grad_norm": 0.6867938041687012, + "learning_rate": 8.061801338687477e-06, + "loss": 0.8005, + "step": 10617 + }, + { + "epoch": 0.5844020034124058, + "grad_norm": 0.7097555994987488, + "learning_rate": 8.061458637646729e-06, + "loss": 0.8515, + "step": 10618 + }, + { + "epoch": 0.5844570422147614, + "grad_norm": 0.6624881625175476, + "learning_rate": 8.061115913596878e-06, + "loss": 0.7735, + "step": 10619 + }, + { + "epoch": 0.584512081017117, + "grad_norm": 0.6649004220962524, + "learning_rate": 8.060773166540498e-06, + "loss": 0.7837, + "step": 10620 + }, + { + "epoch": 0.5845671198194727, + "grad_norm": 0.6732968091964722, + "learning_rate": 8.06043039648017e-06, + "loss": 0.7846, + "step": 10621 + }, + { + "epoch": 0.5846221586218284, + "grad_norm": 0.7551947236061096, + "learning_rate": 8.060087603418464e-06, + "loss": 0.6868, + "step": 10622 + }, + { + "epoch": 0.584677197424184, + "grad_norm": 0.7781728506088257, + "learning_rate": 8.059744787357959e-06, + "loss": 0.8088, + "step": 10623 + }, + { + "epoch": 0.5847322362265397, + "grad_norm": 0.6362790465354919, + "learning_rate": 8.05940194830123e-06, + "loss": 0.664, + "step": 10624 + }, + { + "epoch": 0.5847872750288954, + "grad_norm": 0.670386791229248, + "learning_rate": 8.059059086250856e-06, + "loss": 0.6839, + "step": 10625 + }, + { + "epoch": 0.5848423138312511, + "grad_norm": 0.7030045986175537, + "learning_rate": 8.058716201209414e-06, + "loss": 0.7243, + "step": 10626 + }, + { + "epoch": 0.5848973526336066, + "grad_norm": 0.7881805896759033, + "learning_rate": 8.058373293179477e-06, + "loss": 0.7994, + "step": 10627 + }, + { + "epoch": 0.5849523914359623, + "grad_norm": 0.7077344059944153, + "learning_rate": 8.058030362163628e-06, + "loss": 0.822, + "step": 10628 + }, + { + "epoch": 0.585007430238318, + "grad_norm": 0.6787039637565613, + "learning_rate": 8.057687408164439e-06, + "loss": 0.7619, + "step": 10629 + }, + { + "epoch": 0.5850624690406737, + "grad_norm": 1.1377217769622803, + "learning_rate": 8.05734443118449e-06, + "loss": 0.8632, + "step": 10630 + }, + { + "epoch": 0.5851175078430293, + "grad_norm": 0.7002600431442261, + "learning_rate": 8.05700143122636e-06, + "loss": 0.8184, + "step": 10631 + }, + { + "epoch": 0.585172546645385, + "grad_norm": 0.7016324400901794, + "learning_rate": 8.056658408292626e-06, + "loss": 0.658, + "step": 10632 + }, + { + "epoch": 0.5852275854477407, + "grad_norm": 0.6674843430519104, + "learning_rate": 8.056315362385864e-06, + "loss": 0.7281, + "step": 10633 + }, + { + "epoch": 0.5852826242500964, + "grad_norm": 0.6789288520812988, + "learning_rate": 8.055972293508653e-06, + "loss": 0.8192, + "step": 10634 + }, + { + "epoch": 0.5853376630524519, + "grad_norm": 0.6740062236785889, + "learning_rate": 8.055629201663575e-06, + "loss": 0.7343, + "step": 10635 + }, + { + "epoch": 0.5853927018548076, + "grad_norm": 0.7417730689048767, + "learning_rate": 8.055286086853204e-06, + "loss": 0.8161, + "step": 10636 + }, + { + "epoch": 0.5854477406571633, + "grad_norm": 0.6680465340614319, + "learning_rate": 8.054942949080122e-06, + "loss": 0.7589, + "step": 10637 + }, + { + "epoch": 0.585502779459519, + "grad_norm": 0.7205108404159546, + "learning_rate": 8.054599788346904e-06, + "loss": 0.6837, + "step": 10638 + }, + { + "epoch": 0.5855578182618746, + "grad_norm": 0.8694404363632202, + "learning_rate": 8.054256604656134e-06, + "loss": 0.8033, + "step": 10639 + }, + { + "epoch": 0.5856128570642303, + "grad_norm": 0.685471773147583, + "learning_rate": 8.053913398010389e-06, + "loss": 0.7654, + "step": 10640 + }, + { + "epoch": 0.585667895866586, + "grad_norm": 1.3463424444198608, + "learning_rate": 8.053570168412249e-06, + "loss": 0.7743, + "step": 10641 + }, + { + "epoch": 0.5857229346689417, + "grad_norm": 0.9380106329917908, + "learning_rate": 8.05322691586429e-06, + "loss": 0.8984, + "step": 10642 + }, + { + "epoch": 0.5857779734712972, + "grad_norm": 0.7408519387245178, + "learning_rate": 8.052883640369096e-06, + "loss": 0.7716, + "step": 10643 + }, + { + "epoch": 0.5858330122736529, + "grad_norm": 0.7712904214859009, + "learning_rate": 8.052540341929248e-06, + "loss": 0.7767, + "step": 10644 + }, + { + "epoch": 0.5858880510760086, + "grad_norm": 0.8464158177375793, + "learning_rate": 8.052197020547321e-06, + "loss": 0.8333, + "step": 10645 + }, + { + "epoch": 0.5859430898783643, + "grad_norm": 0.6970158219337463, + "learning_rate": 8.0518536762259e-06, + "loss": 0.7354, + "step": 10646 + }, + { + "epoch": 0.5859981286807199, + "grad_norm": 0.7048965096473694, + "learning_rate": 8.051510308967563e-06, + "loss": 0.8333, + "step": 10647 + }, + { + "epoch": 0.5860531674830756, + "grad_norm": 0.6443868279457092, + "learning_rate": 8.05116691877489e-06, + "loss": 0.7386, + "step": 10648 + }, + { + "epoch": 0.5861082062854313, + "grad_norm": 0.6653542518615723, + "learning_rate": 8.050823505650465e-06, + "loss": 0.8116, + "step": 10649 + }, + { + "epoch": 0.5861632450877869, + "grad_norm": 0.7293158769607544, + "learning_rate": 8.050480069596868e-06, + "loss": 0.8231, + "step": 10650 + }, + { + "epoch": 0.5862182838901425, + "grad_norm": 0.6876117587089539, + "learning_rate": 8.050136610616676e-06, + "loss": 0.7856, + "step": 10651 + }, + { + "epoch": 0.5862733226924982, + "grad_norm": 0.6811665296554565, + "learning_rate": 8.049793128712477e-06, + "loss": 0.7667, + "step": 10652 + }, + { + "epoch": 0.5863283614948539, + "grad_norm": 0.701034426689148, + "learning_rate": 8.049449623886849e-06, + "loss": 0.7812, + "step": 10653 + }, + { + "epoch": 0.5863834002972095, + "grad_norm": 0.6872833967208862, + "learning_rate": 8.049106096142372e-06, + "loss": 0.755, + "step": 10654 + }, + { + "epoch": 0.5864384390995652, + "grad_norm": 0.6643580198287964, + "learning_rate": 8.04876254548163e-06, + "loss": 0.7692, + "step": 10655 + }, + { + "epoch": 0.5864934779019209, + "grad_norm": 0.6672106981277466, + "learning_rate": 8.048418971907206e-06, + "loss": 0.7424, + "step": 10656 + }, + { + "epoch": 0.5865485167042765, + "grad_norm": 0.8030515313148499, + "learning_rate": 8.04807537542168e-06, + "loss": 0.8074, + "step": 10657 + }, + { + "epoch": 0.5866035555066321, + "grad_norm": 0.713417112827301, + "learning_rate": 8.047731756027637e-06, + "loss": 0.6974, + "step": 10658 + }, + { + "epoch": 0.5866585943089878, + "grad_norm": 0.7715572118759155, + "learning_rate": 8.047388113727657e-06, + "loss": 0.7353, + "step": 10659 + }, + { + "epoch": 0.5867136331113435, + "grad_norm": 0.7009812593460083, + "learning_rate": 8.047044448524323e-06, + "loss": 0.7992, + "step": 10660 + }, + { + "epoch": 0.5867686719136992, + "grad_norm": 0.6425079107284546, + "learning_rate": 8.046700760420219e-06, + "loss": 0.7394, + "step": 10661 + }, + { + "epoch": 0.5868237107160548, + "grad_norm": 0.7713460922241211, + "learning_rate": 8.046357049417927e-06, + "loss": 0.7759, + "step": 10662 + }, + { + "epoch": 0.5868787495184105, + "grad_norm": 0.7310347557067871, + "learning_rate": 8.046013315520033e-06, + "loss": 0.7278, + "step": 10663 + }, + { + "epoch": 0.5869337883207661, + "grad_norm": 0.7493315935134888, + "learning_rate": 8.045669558729117e-06, + "loss": 0.7808, + "step": 10664 + }, + { + "epoch": 0.5869888271231218, + "grad_norm": 0.7547439336776733, + "learning_rate": 8.045325779047763e-06, + "loss": 0.8245, + "step": 10665 + }, + { + "epoch": 0.5870438659254774, + "grad_norm": 0.7556985020637512, + "learning_rate": 8.044981976478557e-06, + "loss": 0.8, + "step": 10666 + }, + { + "epoch": 0.5870989047278331, + "grad_norm": 0.8330736756324768, + "learning_rate": 8.04463815102408e-06, + "loss": 0.8177, + "step": 10667 + }, + { + "epoch": 0.5871539435301888, + "grad_norm": 0.7823941111564636, + "learning_rate": 8.04429430268692e-06, + "loss": 0.8306, + "step": 10668 + }, + { + "epoch": 0.5872089823325445, + "grad_norm": 0.9141719937324524, + "learning_rate": 8.043950431469657e-06, + "loss": 0.9137, + "step": 10669 + }, + { + "epoch": 0.5872640211349001, + "grad_norm": 0.6967095732688904, + "learning_rate": 8.043606537374878e-06, + "loss": 0.7262, + "step": 10670 + }, + { + "epoch": 0.5873190599372557, + "grad_norm": 0.7909649014472961, + "learning_rate": 8.043262620405166e-06, + "loss": 0.8332, + "step": 10671 + }, + { + "epoch": 0.5873740987396114, + "grad_norm": 0.7967168092727661, + "learning_rate": 8.042918680563107e-06, + "loss": 0.7966, + "step": 10672 + }, + { + "epoch": 0.5874291375419671, + "grad_norm": 0.7637625336647034, + "learning_rate": 8.042574717851287e-06, + "loss": 0.8322, + "step": 10673 + }, + { + "epoch": 0.5874841763443227, + "grad_norm": 0.6968004107475281, + "learning_rate": 8.04223073227229e-06, + "loss": 0.8061, + "step": 10674 + }, + { + "epoch": 0.5875392151466784, + "grad_norm": 0.7325586080551147, + "learning_rate": 8.0418867238287e-06, + "loss": 0.7922, + "step": 10675 + }, + { + "epoch": 0.5875942539490341, + "grad_norm": 0.6784406304359436, + "learning_rate": 8.041542692523103e-06, + "loss": 0.7327, + "step": 10676 + }, + { + "epoch": 0.5876492927513898, + "grad_norm": 0.8297861218452454, + "learning_rate": 8.041198638358088e-06, + "loss": 0.9347, + "step": 10677 + }, + { + "epoch": 0.5877043315537454, + "grad_norm": 0.6227413415908813, + "learning_rate": 8.040854561336236e-06, + "loss": 0.655, + "step": 10678 + }, + { + "epoch": 0.587759370356101, + "grad_norm": 0.752098023891449, + "learning_rate": 8.040510461460134e-06, + "loss": 0.7608, + "step": 10679 + }, + { + "epoch": 0.5878144091584567, + "grad_norm": 0.7008342146873474, + "learning_rate": 8.040166338732372e-06, + "loss": 0.7385, + "step": 10680 + }, + { + "epoch": 0.5878694479608124, + "grad_norm": 0.6768027544021606, + "learning_rate": 8.039822193155532e-06, + "loss": 0.6812, + "step": 10681 + }, + { + "epoch": 0.587924486763168, + "grad_norm": 0.7728545069694519, + "learning_rate": 8.039478024732203e-06, + "loss": 0.7696, + "step": 10682 + }, + { + "epoch": 0.5879795255655237, + "grad_norm": 0.7257505655288696, + "learning_rate": 8.03913383346497e-06, + "loss": 0.6686, + "step": 10683 + }, + { + "epoch": 0.5880345643678794, + "grad_norm": 0.7755837440490723, + "learning_rate": 8.03878961935642e-06, + "loss": 0.8469, + "step": 10684 + }, + { + "epoch": 0.5880896031702351, + "grad_norm": 0.7187668085098267, + "learning_rate": 8.038445382409142e-06, + "loss": 0.8249, + "step": 10685 + }, + { + "epoch": 0.5881446419725906, + "grad_norm": 0.638053834438324, + "learning_rate": 8.038101122625722e-06, + "loss": 0.6876, + "step": 10686 + }, + { + "epoch": 0.5881996807749463, + "grad_norm": 0.7323756217956543, + "learning_rate": 8.037756840008746e-06, + "loss": 0.7489, + "step": 10687 + }, + { + "epoch": 0.588254719577302, + "grad_norm": 0.6795439720153809, + "learning_rate": 8.037412534560804e-06, + "loss": 0.7246, + "step": 10688 + }, + { + "epoch": 0.5883097583796577, + "grad_norm": 0.8136376142501831, + "learning_rate": 8.037068206284482e-06, + "loss": 0.8518, + "step": 10689 + }, + { + "epoch": 0.5883647971820133, + "grad_norm": 0.6484195590019226, + "learning_rate": 8.036723855182367e-06, + "loss": 0.7018, + "step": 10690 + }, + { + "epoch": 0.588419835984369, + "grad_norm": 0.7465028166770935, + "learning_rate": 8.036379481257048e-06, + "loss": 0.8276, + "step": 10691 + }, + { + "epoch": 0.5884748747867247, + "grad_norm": 0.7761173844337463, + "learning_rate": 8.036035084511116e-06, + "loss": 0.6371, + "step": 10692 + }, + { + "epoch": 0.5885299135890804, + "grad_norm": 0.830008864402771, + "learning_rate": 8.035690664947156e-06, + "loss": 0.8199, + "step": 10693 + }, + { + "epoch": 0.5885849523914359, + "grad_norm": 0.6614254117012024, + "learning_rate": 8.03534622256776e-06, + "loss": 0.656, + "step": 10694 + }, + { + "epoch": 0.5886399911937916, + "grad_norm": 0.7229047417640686, + "learning_rate": 8.035001757375509e-06, + "loss": 0.7622, + "step": 10695 + }, + { + "epoch": 0.5886950299961473, + "grad_norm": 0.7044325470924377, + "learning_rate": 8.034657269373001e-06, + "loss": 0.7678, + "step": 10696 + }, + { + "epoch": 0.5887500687985029, + "grad_norm": 0.7109018564224243, + "learning_rate": 8.03431275856282e-06, + "loss": 0.7976, + "step": 10697 + }, + { + "epoch": 0.5888051076008586, + "grad_norm": 0.7812879085540771, + "learning_rate": 8.033968224947557e-06, + "loss": 0.7163, + "step": 10698 + }, + { + "epoch": 0.5888601464032143, + "grad_norm": 0.7408469915390015, + "learning_rate": 8.033623668529802e-06, + "loss": 0.6895, + "step": 10699 + }, + { + "epoch": 0.58891518520557, + "grad_norm": 0.7654302716255188, + "learning_rate": 8.033279089312142e-06, + "loss": 0.8126, + "step": 10700 + }, + { + "epoch": 0.5889702240079255, + "grad_norm": 0.7307846546173096, + "learning_rate": 8.032934487297169e-06, + "loss": 0.7958, + "step": 10701 + }, + { + "epoch": 0.5890252628102812, + "grad_norm": 0.6658591032028198, + "learning_rate": 8.032589862487472e-06, + "loss": 0.717, + "step": 10702 + }, + { + "epoch": 0.5890803016126369, + "grad_norm": 1.4167139530181885, + "learning_rate": 8.03224521488564e-06, + "loss": 0.8599, + "step": 10703 + }, + { + "epoch": 0.5891353404149926, + "grad_norm": 0.6723609566688538, + "learning_rate": 8.031900544494266e-06, + "loss": 0.8167, + "step": 10704 + }, + { + "epoch": 0.5891903792173482, + "grad_norm": 0.6420501470565796, + "learning_rate": 8.03155585131594e-06, + "loss": 0.692, + "step": 10705 + }, + { + "epoch": 0.5892454180197039, + "grad_norm": 0.6973454356193542, + "learning_rate": 8.031211135353251e-06, + "loss": 0.7709, + "step": 10706 + }, + { + "epoch": 0.5893004568220596, + "grad_norm": 0.7752252221107483, + "learning_rate": 8.03086639660879e-06, + "loss": 0.7795, + "step": 10707 + }, + { + "epoch": 0.5893554956244152, + "grad_norm": 0.8193135857582092, + "learning_rate": 8.030521635085149e-06, + "loss": 0.812, + "step": 10708 + }, + { + "epoch": 0.5894105344267708, + "grad_norm": 0.7976878881454468, + "learning_rate": 8.03017685078492e-06, + "loss": 0.8039, + "step": 10709 + }, + { + "epoch": 0.5894655732291265, + "grad_norm": 0.7545839548110962, + "learning_rate": 8.02983204371069e-06, + "loss": 0.8238, + "step": 10710 + }, + { + "epoch": 0.5895206120314822, + "grad_norm": 0.6544732451438904, + "learning_rate": 8.029487213865054e-06, + "loss": 0.7471, + "step": 10711 + }, + { + "epoch": 0.5895756508338379, + "grad_norm": 0.7054508924484253, + "learning_rate": 8.029142361250603e-06, + "loss": 0.8283, + "step": 10712 + }, + { + "epoch": 0.5896306896361935, + "grad_norm": 0.7425236105918884, + "learning_rate": 8.02879748586993e-06, + "loss": 0.8031, + "step": 10713 + }, + { + "epoch": 0.5896857284385492, + "grad_norm": 0.8390052318572998, + "learning_rate": 8.028452587725626e-06, + "loss": 0.7218, + "step": 10714 + }, + { + "epoch": 0.5897407672409049, + "grad_norm": 0.8116903901100159, + "learning_rate": 8.028107666820282e-06, + "loss": 0.8057, + "step": 10715 + }, + { + "epoch": 0.5897958060432605, + "grad_norm": 0.602308452129364, + "learning_rate": 8.027762723156492e-06, + "loss": 0.6428, + "step": 10716 + }, + { + "epoch": 0.5898508448456161, + "grad_norm": 0.7480159401893616, + "learning_rate": 8.027417756736848e-06, + "loss": 0.7566, + "step": 10717 + }, + { + "epoch": 0.5899058836479718, + "grad_norm": 0.6823177933692932, + "learning_rate": 8.027072767563943e-06, + "loss": 0.8337, + "step": 10718 + }, + { + "epoch": 0.5899609224503275, + "grad_norm": 0.6841796040534973, + "learning_rate": 8.026727755640367e-06, + "loss": 0.751, + "step": 10719 + }, + { + "epoch": 0.5900159612526832, + "grad_norm": 0.7257139086723328, + "learning_rate": 8.026382720968718e-06, + "loss": 0.7373, + "step": 10720 + }, + { + "epoch": 0.5900710000550388, + "grad_norm": 0.6318400502204895, + "learning_rate": 8.026037663551584e-06, + "loss": 0.7205, + "step": 10721 + }, + { + "epoch": 0.5901260388573945, + "grad_norm": 0.6612908840179443, + "learning_rate": 8.025692583391564e-06, + "loss": 0.7613, + "step": 10722 + }, + { + "epoch": 0.5901810776597501, + "grad_norm": 0.7555351853370667, + "learning_rate": 8.025347480491246e-06, + "loss": 0.718, + "step": 10723 + }, + { + "epoch": 0.5902361164621058, + "grad_norm": 0.6944366097450256, + "learning_rate": 8.025002354853227e-06, + "loss": 0.7775, + "step": 10724 + }, + { + "epoch": 0.5902911552644614, + "grad_norm": 0.6968230605125427, + "learning_rate": 8.0246572064801e-06, + "loss": 0.7316, + "step": 10725 + }, + { + "epoch": 0.5903461940668171, + "grad_norm": 0.7083567380905151, + "learning_rate": 8.024312035374459e-06, + "loss": 0.7844, + "step": 10726 + }, + { + "epoch": 0.5904012328691728, + "grad_norm": 0.7183080315589905, + "learning_rate": 8.0239668415389e-06, + "loss": 0.8308, + "step": 10727 + }, + { + "epoch": 0.5904562716715285, + "grad_norm": 0.8350495100021362, + "learning_rate": 8.023621624976014e-06, + "loss": 0.9077, + "step": 10728 + }, + { + "epoch": 0.590511310473884, + "grad_norm": 0.6876987218856812, + "learning_rate": 8.023276385688396e-06, + "loss": 0.7483, + "step": 10729 + }, + { + "epoch": 0.5905663492762397, + "grad_norm": 0.8617128133773804, + "learning_rate": 8.022931123678646e-06, + "loss": 0.7058, + "step": 10730 + }, + { + "epoch": 0.5906213880785954, + "grad_norm": 0.6921959519386292, + "learning_rate": 8.02258583894935e-06, + "loss": 0.7542, + "step": 10731 + }, + { + "epoch": 0.5906764268809511, + "grad_norm": 0.7394077181816101, + "learning_rate": 8.02224053150311e-06, + "loss": 0.7761, + "step": 10732 + }, + { + "epoch": 0.5907314656833067, + "grad_norm": 0.6672187447547913, + "learning_rate": 8.02189520134252e-06, + "loss": 0.6904, + "step": 10733 + }, + { + "epoch": 0.5907865044856624, + "grad_norm": 0.7498076558113098, + "learning_rate": 8.021549848470174e-06, + "loss": 0.7994, + "step": 10734 + }, + { + "epoch": 0.5908415432880181, + "grad_norm": 0.699832558631897, + "learning_rate": 8.021204472888669e-06, + "loss": 0.7413, + "step": 10735 + }, + { + "epoch": 0.5908965820903738, + "grad_norm": 0.7628722190856934, + "learning_rate": 8.020859074600598e-06, + "loss": 0.8202, + "step": 10736 + }, + { + "epoch": 0.5909516208927293, + "grad_norm": 0.8023744225502014, + "learning_rate": 8.020513653608558e-06, + "loss": 0.8225, + "step": 10737 + }, + { + "epoch": 0.591006659695085, + "grad_norm": 0.7283689379692078, + "learning_rate": 8.02016820991515e-06, + "loss": 0.6706, + "step": 10738 + }, + { + "epoch": 0.5910616984974407, + "grad_norm": 0.7199996113777161, + "learning_rate": 8.019822743522962e-06, + "loss": 0.8258, + "step": 10739 + }, + { + "epoch": 0.5911167372997963, + "grad_norm": 0.623249888420105, + "learning_rate": 8.019477254434598e-06, + "loss": 0.6188, + "step": 10740 + }, + { + "epoch": 0.591171776102152, + "grad_norm": 0.7331949472427368, + "learning_rate": 8.01913174265265e-06, + "loss": 0.8013, + "step": 10741 + }, + { + "epoch": 0.5912268149045077, + "grad_norm": 0.7003010511398315, + "learning_rate": 8.018786208179716e-06, + "loss": 0.8305, + "step": 10742 + }, + { + "epoch": 0.5912818537068634, + "grad_norm": 0.6879638433456421, + "learning_rate": 8.01844065101839e-06, + "loss": 0.7622, + "step": 10743 + }, + { + "epoch": 0.591336892509219, + "grad_norm": 0.6597324013710022, + "learning_rate": 8.018095071171276e-06, + "loss": 0.7362, + "step": 10744 + }, + { + "epoch": 0.5913919313115746, + "grad_norm": 0.664905846118927, + "learning_rate": 8.017749468640967e-06, + "loss": 0.7629, + "step": 10745 + }, + { + "epoch": 0.5914469701139303, + "grad_norm": 0.7358053922653198, + "learning_rate": 8.017403843430059e-06, + "loss": 0.7798, + "step": 10746 + }, + { + "epoch": 0.591502008916286, + "grad_norm": 0.699603259563446, + "learning_rate": 8.017058195541152e-06, + "loss": 0.6249, + "step": 10747 + }, + { + "epoch": 0.5915570477186416, + "grad_norm": 0.6736140847206116, + "learning_rate": 8.016712524976843e-06, + "loss": 0.6904, + "step": 10748 + }, + { + "epoch": 0.5916120865209973, + "grad_norm": 0.6803401112556458, + "learning_rate": 8.016366831739732e-06, + "loss": 0.6868, + "step": 10749 + }, + { + "epoch": 0.591667125323353, + "grad_norm": 0.7152959704399109, + "learning_rate": 8.016021115832413e-06, + "loss": 0.7747, + "step": 10750 + }, + { + "epoch": 0.5917221641257087, + "grad_norm": 0.6469255685806274, + "learning_rate": 8.015675377257489e-06, + "loss": 0.7309, + "step": 10751 + }, + { + "epoch": 0.5917772029280642, + "grad_norm": 0.7902734875679016, + "learning_rate": 8.015329616017554e-06, + "loss": 0.7575, + "step": 10752 + }, + { + "epoch": 0.5918322417304199, + "grad_norm": 0.7447189688682556, + "learning_rate": 8.014983832115208e-06, + "loss": 0.7759, + "step": 10753 + }, + { + "epoch": 0.5918872805327756, + "grad_norm": 0.6135374903678894, + "learning_rate": 8.014638025553053e-06, + "loss": 0.6681, + "step": 10754 + }, + { + "epoch": 0.5919423193351313, + "grad_norm": 0.8614835739135742, + "learning_rate": 8.014292196333684e-06, + "loss": 0.7203, + "step": 10755 + }, + { + "epoch": 0.5919973581374869, + "grad_norm": 0.7649008631706238, + "learning_rate": 8.013946344459703e-06, + "loss": 0.7966, + "step": 10756 + }, + { + "epoch": 0.5920523969398426, + "grad_norm": 1.0862764120101929, + "learning_rate": 8.013600469933707e-06, + "loss": 0.866, + "step": 10757 + }, + { + "epoch": 0.5921074357421983, + "grad_norm": 0.7304185628890991, + "learning_rate": 8.013254572758296e-06, + "loss": 0.7599, + "step": 10758 + }, + { + "epoch": 0.592162474544554, + "grad_norm": 0.6329634785652161, + "learning_rate": 8.012908652936072e-06, + "loss": 0.6855, + "step": 10759 + }, + { + "epoch": 0.5922175133469095, + "grad_norm": 0.6692202687263489, + "learning_rate": 8.012562710469631e-06, + "loss": 0.817, + "step": 10760 + }, + { + "epoch": 0.5922725521492652, + "grad_norm": 0.6577631235122681, + "learning_rate": 8.012216745361577e-06, + "loss": 0.7813, + "step": 10761 + }, + { + "epoch": 0.5923275909516209, + "grad_norm": 0.6877861022949219, + "learning_rate": 8.011870757614506e-06, + "loss": 0.7142, + "step": 10762 + }, + { + "epoch": 0.5923826297539766, + "grad_norm": 0.7132022380828857, + "learning_rate": 8.011524747231023e-06, + "loss": 0.747, + "step": 10763 + }, + { + "epoch": 0.5924376685563322, + "grad_norm": 0.7841360569000244, + "learning_rate": 8.011178714213726e-06, + "loss": 0.7511, + "step": 10764 + }, + { + "epoch": 0.5924927073586879, + "grad_norm": 0.8572794198989868, + "learning_rate": 8.010832658565215e-06, + "loss": 0.8704, + "step": 10765 + }, + { + "epoch": 0.5925477461610436, + "grad_norm": 0.6825506687164307, + "learning_rate": 8.010486580288092e-06, + "loss": 0.7472, + "step": 10766 + }, + { + "epoch": 0.5926027849633992, + "grad_norm": 0.7484591603279114, + "learning_rate": 8.010140479384957e-06, + "loss": 0.7679, + "step": 10767 + }, + { + "epoch": 0.5926578237657548, + "grad_norm": 0.712602436542511, + "learning_rate": 8.009794355858412e-06, + "loss": 0.7706, + "step": 10768 + }, + { + "epoch": 0.5927128625681105, + "grad_norm": 0.8911493420600891, + "learning_rate": 8.00944820971106e-06, + "loss": 0.8396, + "step": 10769 + }, + { + "epoch": 0.5927679013704662, + "grad_norm": 0.7300251126289368, + "learning_rate": 8.009102040945498e-06, + "loss": 0.7611, + "step": 10770 + }, + { + "epoch": 0.5928229401728219, + "grad_norm": 0.727343738079071, + "learning_rate": 8.008755849564333e-06, + "loss": 0.6785, + "step": 10771 + }, + { + "epoch": 0.5928779789751775, + "grad_norm": 0.8323808908462524, + "learning_rate": 8.008409635570163e-06, + "loss": 0.7429, + "step": 10772 + }, + { + "epoch": 0.5929330177775332, + "grad_norm": 0.6651942133903503, + "learning_rate": 8.00806339896559e-06, + "loss": 0.7683, + "step": 10773 + }, + { + "epoch": 0.5929880565798888, + "grad_norm": 0.7164554595947266, + "learning_rate": 8.007717139753222e-06, + "loss": 0.7742, + "step": 10774 + }, + { + "epoch": 0.5930430953822445, + "grad_norm": 0.6906408667564392, + "learning_rate": 8.007370857935654e-06, + "loss": 0.7322, + "step": 10775 + }, + { + "epoch": 0.5930981341846001, + "grad_norm": 0.6384999752044678, + "learning_rate": 8.007024553515493e-06, + "loss": 0.7011, + "step": 10776 + }, + { + "epoch": 0.5931531729869558, + "grad_norm": 0.6997355222702026, + "learning_rate": 8.006678226495338e-06, + "loss": 0.7303, + "step": 10777 + }, + { + "epoch": 0.5932082117893115, + "grad_norm": 0.6730707287788391, + "learning_rate": 8.006331876877797e-06, + "loss": 0.7461, + "step": 10778 + }, + { + "epoch": 0.5932632505916672, + "grad_norm": 0.7529115080833435, + "learning_rate": 8.00598550466547e-06, + "loss": 0.7487, + "step": 10779 + }, + { + "epoch": 0.5933182893940228, + "grad_norm": 0.7186329960823059, + "learning_rate": 8.00563910986096e-06, + "loss": 0.8025, + "step": 10780 + }, + { + "epoch": 0.5933733281963784, + "grad_norm": 0.7523752450942993, + "learning_rate": 8.005292692466869e-06, + "loss": 0.8291, + "step": 10781 + }, + { + "epoch": 0.5934283669987341, + "grad_norm": 1.182645559310913, + "learning_rate": 8.004946252485806e-06, + "loss": 0.8037, + "step": 10782 + }, + { + "epoch": 0.5934834058010897, + "grad_norm": 0.736570417881012, + "learning_rate": 8.004599789920369e-06, + "loss": 0.8259, + "step": 10783 + }, + { + "epoch": 0.5935384446034454, + "grad_norm": 0.757665753364563, + "learning_rate": 8.004253304773165e-06, + "loss": 0.7773, + "step": 10784 + }, + { + "epoch": 0.5935934834058011, + "grad_norm": 0.6988566517829895, + "learning_rate": 8.003906797046798e-06, + "loss": 0.7895, + "step": 10785 + }, + { + "epoch": 0.5936485222081568, + "grad_norm": 0.6921454071998596, + "learning_rate": 8.00356026674387e-06, + "loss": 0.8068, + "step": 10786 + }, + { + "epoch": 0.5937035610105124, + "grad_norm": 0.7053877115249634, + "learning_rate": 8.003213713866988e-06, + "loss": 0.7632, + "step": 10787 + }, + { + "epoch": 0.593758599812868, + "grad_norm": 0.8193650245666504, + "learning_rate": 8.002867138418757e-06, + "loss": 0.759, + "step": 10788 + }, + { + "epoch": 0.5938136386152237, + "grad_norm": 0.6089804768562317, + "learning_rate": 8.002520540401779e-06, + "loss": 0.7117, + "step": 10789 + }, + { + "epoch": 0.5938686774175794, + "grad_norm": 0.6869456768035889, + "learning_rate": 8.002173919818662e-06, + "loss": 0.7724, + "step": 10790 + }, + { + "epoch": 0.593923716219935, + "grad_norm": 0.7279118895530701, + "learning_rate": 8.001827276672007e-06, + "loss": 0.7578, + "step": 10791 + }, + { + "epoch": 0.5939787550222907, + "grad_norm": 0.6960133910179138, + "learning_rate": 8.00148061096442e-06, + "loss": 0.7887, + "step": 10792 + }, + { + "epoch": 0.5940337938246464, + "grad_norm": 0.6774740815162659, + "learning_rate": 8.001133922698511e-06, + "loss": 0.7146, + "step": 10793 + }, + { + "epoch": 0.5940888326270021, + "grad_norm": 0.6696349382400513, + "learning_rate": 8.000787211876883e-06, + "loss": 0.7829, + "step": 10794 + }, + { + "epoch": 0.5941438714293577, + "grad_norm": 1.5037024021148682, + "learning_rate": 8.000440478502142e-06, + "loss": 0.8198, + "step": 10795 + }, + { + "epoch": 0.5941989102317133, + "grad_norm": 0.7373353838920593, + "learning_rate": 8.000093722576893e-06, + "loss": 0.7864, + "step": 10796 + }, + { + "epoch": 0.594253949034069, + "grad_norm": 0.8120700120925903, + "learning_rate": 7.999746944103743e-06, + "loss": 0.7918, + "step": 10797 + }, + { + "epoch": 0.5943089878364247, + "grad_norm": 0.7669811844825745, + "learning_rate": 7.999400143085296e-06, + "loss": 0.751, + "step": 10798 + }, + { + "epoch": 0.5943640266387803, + "grad_norm": 0.8090860843658447, + "learning_rate": 7.999053319524163e-06, + "loss": 0.8387, + "step": 10799 + }, + { + "epoch": 0.594419065441136, + "grad_norm": 0.6994315385818481, + "learning_rate": 7.998706473422945e-06, + "loss": 0.7084, + "step": 10800 + }, + { + "epoch": 0.5944741042434917, + "grad_norm": 0.7913107872009277, + "learning_rate": 7.998359604784254e-06, + "loss": 0.7454, + "step": 10801 + }, + { + "epoch": 0.5945291430458474, + "grad_norm": 0.6831398010253906, + "learning_rate": 7.998012713610696e-06, + "loss": 0.7422, + "step": 10802 + }, + { + "epoch": 0.5945841818482029, + "grad_norm": 0.7324068546295166, + "learning_rate": 7.997665799904875e-06, + "loss": 0.7622, + "step": 10803 + }, + { + "epoch": 0.5946392206505586, + "grad_norm": 0.8192811012268066, + "learning_rate": 7.997318863669399e-06, + "loss": 0.7783, + "step": 10804 + }, + { + "epoch": 0.5946942594529143, + "grad_norm": 0.8008341789245605, + "learning_rate": 7.996971904906879e-06, + "loss": 0.7673, + "step": 10805 + }, + { + "epoch": 0.59474929825527, + "grad_norm": 0.6899568438529968, + "learning_rate": 7.99662492361992e-06, + "loss": 0.7477, + "step": 10806 + }, + { + "epoch": 0.5948043370576256, + "grad_norm": 0.7322555780410767, + "learning_rate": 7.996277919811132e-06, + "loss": 0.7673, + "step": 10807 + }, + { + "epoch": 0.5948593758599813, + "grad_norm": 1.008300542831421, + "learning_rate": 7.995930893483117e-06, + "loss": 0.7556, + "step": 10808 + }, + { + "epoch": 0.594914414662337, + "grad_norm": 0.7211925387382507, + "learning_rate": 7.99558384463849e-06, + "loss": 0.761, + "step": 10809 + }, + { + "epoch": 0.5949694534646927, + "grad_norm": 0.7143383622169495, + "learning_rate": 7.995236773279855e-06, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.5950244922670482, + "grad_norm": 0.7682802677154541, + "learning_rate": 7.994889679409825e-06, + "loss": 0.8538, + "step": 10811 + }, + { + "epoch": 0.5950795310694039, + "grad_norm": 0.6304698586463928, + "learning_rate": 7.994542563031004e-06, + "loss": 0.7343, + "step": 10812 + }, + { + "epoch": 0.5951345698717596, + "grad_norm": 0.6704440116882324, + "learning_rate": 7.994195424146002e-06, + "loss": 0.6921, + "step": 10813 + }, + { + "epoch": 0.5951896086741153, + "grad_norm": 0.8626209497451782, + "learning_rate": 7.99384826275743e-06, + "loss": 0.7049, + "step": 10814 + }, + { + "epoch": 0.5952446474764709, + "grad_norm": 0.810922384262085, + "learning_rate": 7.993501078867895e-06, + "loss": 0.793, + "step": 10815 + }, + { + "epoch": 0.5952996862788266, + "grad_norm": 0.8495855927467346, + "learning_rate": 7.993153872480009e-06, + "loss": 0.8078, + "step": 10816 + }, + { + "epoch": 0.5953547250811823, + "grad_norm": 0.7430331707000732, + "learning_rate": 7.992806643596378e-06, + "loss": 0.7957, + "step": 10817 + }, + { + "epoch": 0.595409763883538, + "grad_norm": 0.7188051342964172, + "learning_rate": 7.992459392219614e-06, + "loss": 0.725, + "step": 10818 + }, + { + "epoch": 0.5954648026858935, + "grad_norm": 0.7046926021575928, + "learning_rate": 7.992112118352326e-06, + "loss": 0.7438, + "step": 10819 + }, + { + "epoch": 0.5955198414882492, + "grad_norm": 0.7982804775238037, + "learning_rate": 7.991764821997123e-06, + "loss": 0.7046, + "step": 10820 + }, + { + "epoch": 0.5955748802906049, + "grad_norm": 0.6392245292663574, + "learning_rate": 7.991417503156618e-06, + "loss": 0.7413, + "step": 10821 + }, + { + "epoch": 0.5956299190929606, + "grad_norm": 0.7518960237503052, + "learning_rate": 7.99107016183342e-06, + "loss": 0.7661, + "step": 10822 + }, + { + "epoch": 0.5956849578953162, + "grad_norm": 0.7413721680641174, + "learning_rate": 7.99072279803014e-06, + "loss": 0.6538, + "step": 10823 + }, + { + "epoch": 0.5957399966976719, + "grad_norm": 0.7729454636573792, + "learning_rate": 7.990375411749384e-06, + "loss": 0.8056, + "step": 10824 + }, + { + "epoch": 0.5957950355000275, + "grad_norm": 0.8059296607971191, + "learning_rate": 7.99002800299377e-06, + "loss": 0.8699, + "step": 10825 + }, + { + "epoch": 0.5958500743023831, + "grad_norm": 0.5947105288505554, + "learning_rate": 7.989680571765907e-06, + "loss": 0.6481, + "step": 10826 + }, + { + "epoch": 0.5959051131047388, + "grad_norm": 0.7303743362426758, + "learning_rate": 7.989333118068404e-06, + "loss": 0.7401, + "step": 10827 + }, + { + "epoch": 0.5959601519070945, + "grad_norm": 0.7121400237083435, + "learning_rate": 7.988985641903873e-06, + "loss": 0.78, + "step": 10828 + }, + { + "epoch": 0.5960151907094502, + "grad_norm": 0.6921802163124084, + "learning_rate": 7.988638143274926e-06, + "loss": 0.7234, + "step": 10829 + }, + { + "epoch": 0.5960702295118058, + "grad_norm": 0.6715331673622131, + "learning_rate": 7.988290622184174e-06, + "loss": 0.7606, + "step": 10830 + }, + { + "epoch": 0.5961252683141615, + "grad_norm": 0.6315215229988098, + "learning_rate": 7.98794307863423e-06, + "loss": 0.6902, + "step": 10831 + }, + { + "epoch": 0.5961803071165171, + "grad_norm": 0.6884782314300537, + "learning_rate": 7.987595512627707e-06, + "loss": 0.7808, + "step": 10832 + }, + { + "epoch": 0.5962353459188728, + "grad_norm": 0.7050700783729553, + "learning_rate": 7.987247924167215e-06, + "loss": 0.7248, + "step": 10833 + }, + { + "epoch": 0.5962903847212284, + "grad_norm": 0.7232446074485779, + "learning_rate": 7.986900313255367e-06, + "loss": 0.8686, + "step": 10834 + }, + { + "epoch": 0.5963454235235841, + "grad_norm": 0.693631649017334, + "learning_rate": 7.986552679894778e-06, + "loss": 0.7567, + "step": 10835 + }, + { + "epoch": 0.5964004623259398, + "grad_norm": 0.6462356448173523, + "learning_rate": 7.986205024088054e-06, + "loss": 0.7091, + "step": 10836 + }, + { + "epoch": 0.5964555011282955, + "grad_norm": 0.7465559840202332, + "learning_rate": 7.985857345837814e-06, + "loss": 0.8965, + "step": 10837 + }, + { + "epoch": 0.5965105399306511, + "grad_norm": 0.6803271770477295, + "learning_rate": 7.985509645146672e-06, + "loss": 0.7602, + "step": 10838 + }, + { + "epoch": 0.5965655787330068, + "grad_norm": 1.1414798498153687, + "learning_rate": 7.985161922017238e-06, + "loss": 0.7806, + "step": 10839 + }, + { + "epoch": 0.5966206175353624, + "grad_norm": 0.6583230495452881, + "learning_rate": 7.984814176452123e-06, + "loss": 0.6727, + "step": 10840 + }, + { + "epoch": 0.5966756563377181, + "grad_norm": 0.6582550406455994, + "learning_rate": 7.984466408453946e-06, + "loss": 0.6794, + "step": 10841 + }, + { + "epoch": 0.5967306951400737, + "grad_norm": 0.8680793642997742, + "learning_rate": 7.984118618025318e-06, + "loss": 0.7999, + "step": 10842 + }, + { + "epoch": 0.5967857339424294, + "grad_norm": 0.772777795791626, + "learning_rate": 7.983770805168853e-06, + "loss": 0.6278, + "step": 10843 + }, + { + "epoch": 0.5968407727447851, + "grad_norm": 0.8099700808525085, + "learning_rate": 7.983422969887167e-06, + "loss": 0.7631, + "step": 10844 + }, + { + "epoch": 0.5968958115471408, + "grad_norm": 0.660271406173706, + "learning_rate": 7.983075112182871e-06, + "loss": 0.7557, + "step": 10845 + }, + { + "epoch": 0.5969508503494964, + "grad_norm": 0.7205530405044556, + "learning_rate": 7.982727232058582e-06, + "loss": 0.8258, + "step": 10846 + }, + { + "epoch": 0.597005889151852, + "grad_norm": 0.7925810813903809, + "learning_rate": 7.982379329516912e-06, + "loss": 0.7534, + "step": 10847 + }, + { + "epoch": 0.5970609279542077, + "grad_norm": 0.7255545854568481, + "learning_rate": 7.982031404560477e-06, + "loss": 0.8394, + "step": 10848 + }, + { + "epoch": 0.5971159667565634, + "grad_norm": 0.835394561290741, + "learning_rate": 7.981683457191893e-06, + "loss": 0.8384, + "step": 10849 + }, + { + "epoch": 0.597171005558919, + "grad_norm": 0.6781747937202454, + "learning_rate": 7.981335487413775e-06, + "loss": 0.8173, + "step": 10850 + }, + { + "epoch": 0.5972260443612747, + "grad_norm": 0.8602943420410156, + "learning_rate": 7.980987495228737e-06, + "loss": 0.8257, + "step": 10851 + }, + { + "epoch": 0.5972810831636304, + "grad_norm": 0.7157264947891235, + "learning_rate": 7.980639480639394e-06, + "loss": 0.7267, + "step": 10852 + }, + { + "epoch": 0.5973361219659861, + "grad_norm": 0.7695063352584839, + "learning_rate": 7.980291443648364e-06, + "loss": 0.7794, + "step": 10853 + }, + { + "epoch": 0.5973911607683416, + "grad_norm": 0.723971426486969, + "learning_rate": 7.979943384258262e-06, + "loss": 0.7761, + "step": 10854 + }, + { + "epoch": 0.5974461995706973, + "grad_norm": 0.691722571849823, + "learning_rate": 7.979595302471702e-06, + "loss": 0.7276, + "step": 10855 + }, + { + "epoch": 0.597501238373053, + "grad_norm": 0.7019701600074768, + "learning_rate": 7.9792471982913e-06, + "loss": 0.7965, + "step": 10856 + }, + { + "epoch": 0.5975562771754087, + "grad_norm": 0.6626996994018555, + "learning_rate": 7.978899071719675e-06, + "loss": 0.7124, + "step": 10857 + }, + { + "epoch": 0.5976113159777643, + "grad_norm": 0.6871625781059265, + "learning_rate": 7.978550922759443e-06, + "loss": 0.7742, + "step": 10858 + }, + { + "epoch": 0.59766635478012, + "grad_norm": 0.7153579592704773, + "learning_rate": 7.978202751413217e-06, + "loss": 0.7852, + "step": 10859 + }, + { + "epoch": 0.5977213935824757, + "grad_norm": 0.6891841292381287, + "learning_rate": 7.977854557683619e-06, + "loss": 0.7873, + "step": 10860 + }, + { + "epoch": 0.5977764323848314, + "grad_norm": 0.6864004731178284, + "learning_rate": 7.977506341573262e-06, + "loss": 0.7223, + "step": 10861 + }, + { + "epoch": 0.5978314711871869, + "grad_norm": 0.7163059115409851, + "learning_rate": 7.977158103084764e-06, + "loss": 0.679, + "step": 10862 + }, + { + "epoch": 0.5978865099895426, + "grad_norm": 0.6727336049079895, + "learning_rate": 7.976809842220742e-06, + "loss": 0.7148, + "step": 10863 + }, + { + "epoch": 0.5979415487918983, + "grad_norm": 0.672960638999939, + "learning_rate": 7.976461558983814e-06, + "loss": 0.7263, + "step": 10864 + }, + { + "epoch": 0.597996587594254, + "grad_norm": 0.9124444127082825, + "learning_rate": 7.976113253376601e-06, + "loss": 0.6876, + "step": 10865 + }, + { + "epoch": 0.5980516263966096, + "grad_norm": 0.6415041089057922, + "learning_rate": 7.975764925401715e-06, + "loss": 0.6655, + "step": 10866 + }, + { + "epoch": 0.5981066651989653, + "grad_norm": 0.7342595458030701, + "learning_rate": 7.975416575061776e-06, + "loss": 0.7753, + "step": 10867 + }, + { + "epoch": 0.598161704001321, + "grad_norm": 0.7161775231361389, + "learning_rate": 7.975068202359402e-06, + "loss": 0.7525, + "step": 10868 + }, + { + "epoch": 0.5982167428036765, + "grad_norm": 0.7087578773498535, + "learning_rate": 7.974719807297212e-06, + "loss": 0.7196, + "step": 10869 + }, + { + "epoch": 0.5982717816060322, + "grad_norm": 0.6472536325454712, + "learning_rate": 7.974371389877826e-06, + "loss": 0.6837, + "step": 10870 + }, + { + "epoch": 0.5983268204083879, + "grad_norm": 0.6625581383705139, + "learning_rate": 7.97402295010386e-06, + "loss": 0.6379, + "step": 10871 + }, + { + "epoch": 0.5983818592107436, + "grad_norm": 0.7621071934700012, + "learning_rate": 7.973674487977934e-06, + "loss": 0.8291, + "step": 10872 + }, + { + "epoch": 0.5984368980130992, + "grad_norm": 0.693394660949707, + "learning_rate": 7.973326003502666e-06, + "loss": 0.7677, + "step": 10873 + }, + { + "epoch": 0.5984919368154549, + "grad_norm": 0.6393985152244568, + "learning_rate": 7.972977496680674e-06, + "loss": 0.7058, + "step": 10874 + }, + { + "epoch": 0.5985469756178106, + "grad_norm": 0.7101462483406067, + "learning_rate": 7.972628967514582e-06, + "loss": 0.7396, + "step": 10875 + }, + { + "epoch": 0.5986020144201663, + "grad_norm": 0.8131522536277771, + "learning_rate": 7.972280416007003e-06, + "loss": 0.8461, + "step": 10876 + }, + { + "epoch": 0.5986570532225218, + "grad_norm": 0.7186655402183533, + "learning_rate": 7.971931842160564e-06, + "loss": 0.7721, + "step": 10877 + }, + { + "epoch": 0.5987120920248775, + "grad_norm": 0.7520855069160461, + "learning_rate": 7.971583245977877e-06, + "loss": 0.7733, + "step": 10878 + }, + { + "epoch": 0.5987671308272332, + "grad_norm": 0.6548848748207092, + "learning_rate": 7.971234627461569e-06, + "loss": 0.6555, + "step": 10879 + }, + { + "epoch": 0.5988221696295889, + "grad_norm": 0.7341775894165039, + "learning_rate": 7.970885986614254e-06, + "loss": 0.8292, + "step": 10880 + }, + { + "epoch": 0.5988772084319445, + "grad_norm": 0.7126352190971375, + "learning_rate": 7.970537323438556e-06, + "loss": 0.7704, + "step": 10881 + }, + { + "epoch": 0.5989322472343002, + "grad_norm": 0.7291527390480042, + "learning_rate": 7.970188637937097e-06, + "loss": 0.8175, + "step": 10882 + }, + { + "epoch": 0.5989872860366559, + "grad_norm": 0.682767927646637, + "learning_rate": 7.969839930112493e-06, + "loss": 0.8187, + "step": 10883 + }, + { + "epoch": 0.5990423248390115, + "grad_norm": 0.7820014953613281, + "learning_rate": 7.969491199967368e-06, + "loss": 0.7949, + "step": 10884 + }, + { + "epoch": 0.5990973636413671, + "grad_norm": 0.7257336974143982, + "learning_rate": 7.969142447504341e-06, + "loss": 0.8461, + "step": 10885 + }, + { + "epoch": 0.5991524024437228, + "grad_norm": 0.6813532114028931, + "learning_rate": 7.968793672726033e-06, + "loss": 0.7889, + "step": 10886 + }, + { + "epoch": 0.5992074412460785, + "grad_norm": 0.6868439316749573, + "learning_rate": 7.96844487563507e-06, + "loss": 0.7268, + "step": 10887 + }, + { + "epoch": 0.5992624800484342, + "grad_norm": 0.6547278761863708, + "learning_rate": 7.968096056234067e-06, + "loss": 0.7026, + "step": 10888 + }, + { + "epoch": 0.5993175188507898, + "grad_norm": 0.6704558730125427, + "learning_rate": 7.96774721452565e-06, + "loss": 0.6994, + "step": 10889 + }, + { + "epoch": 0.5993725576531455, + "grad_norm": 0.7134065628051758, + "learning_rate": 7.967398350512439e-06, + "loss": 0.7728, + "step": 10890 + }, + { + "epoch": 0.5994275964555011, + "grad_norm": 0.751265823841095, + "learning_rate": 7.967049464197056e-06, + "loss": 0.8421, + "step": 10891 + }, + { + "epoch": 0.5994826352578568, + "grad_norm": 0.8558571934700012, + "learning_rate": 7.966700555582125e-06, + "loss": 0.9144, + "step": 10892 + }, + { + "epoch": 0.5995376740602124, + "grad_norm": 0.8338084816932678, + "learning_rate": 7.966351624670263e-06, + "loss": 0.7502, + "step": 10893 + }, + { + "epoch": 0.5995927128625681, + "grad_norm": 0.7017131447792053, + "learning_rate": 7.9660026714641e-06, + "loss": 0.7778, + "step": 10894 + }, + { + "epoch": 0.5996477516649238, + "grad_norm": 0.7176111340522766, + "learning_rate": 7.965653695966253e-06, + "loss": 0.8478, + "step": 10895 + }, + { + "epoch": 0.5997027904672795, + "grad_norm": 0.7026060819625854, + "learning_rate": 7.965304698179349e-06, + "loss": 0.7111, + "step": 10896 + }, + { + "epoch": 0.5997578292696351, + "grad_norm": 0.6383810639381409, + "learning_rate": 7.964955678106005e-06, + "loss": 0.6429, + "step": 10897 + }, + { + "epoch": 0.5998128680719907, + "grad_norm": 0.8024059534072876, + "learning_rate": 7.96460663574885e-06, + "loss": 0.7308, + "step": 10898 + }, + { + "epoch": 0.5998679068743464, + "grad_norm": 0.7378466725349426, + "learning_rate": 7.964257571110504e-06, + "loss": 0.7593, + "step": 10899 + }, + { + "epoch": 0.5999229456767021, + "grad_norm": 0.7089043855667114, + "learning_rate": 7.963908484193593e-06, + "loss": 0.6862, + "step": 10900 + }, + { + "epoch": 0.5999779844790577, + "grad_norm": 0.765295684337616, + "learning_rate": 7.963559375000738e-06, + "loss": 0.6759, + "step": 10901 + }, + { + "epoch": 0.6000330232814134, + "grad_norm": 0.7040783166885376, + "learning_rate": 7.963210243534565e-06, + "loss": 0.7754, + "step": 10902 + }, + { + "epoch": 0.6000880620837691, + "grad_norm": 0.8593736886978149, + "learning_rate": 7.962861089797698e-06, + "loss": 0.8765, + "step": 10903 + }, + { + "epoch": 0.6001431008861248, + "grad_norm": 0.6613926291465759, + "learning_rate": 7.962511913792758e-06, + "loss": 0.6697, + "step": 10904 + }, + { + "epoch": 0.6001981396884803, + "grad_norm": 0.6369597911834717, + "learning_rate": 7.962162715522372e-06, + "loss": 0.7145, + "step": 10905 + }, + { + "epoch": 0.600253178490836, + "grad_norm": 1.1790162324905396, + "learning_rate": 7.961813494989164e-06, + "loss": 0.8067, + "step": 10906 + }, + { + "epoch": 0.6003082172931917, + "grad_norm": 0.7548268437385559, + "learning_rate": 7.961464252195759e-06, + "loss": 0.7936, + "step": 10907 + }, + { + "epoch": 0.6003632560955474, + "grad_norm": 0.6204384565353394, + "learning_rate": 7.961114987144781e-06, + "loss": 0.6374, + "step": 10908 } ], "logging_steps": 1, @@ -70019,7 +76382,7 @@ "attributes": {} } }, - "total_flos": 2.9507661855341937e+19, + "total_flos": 3.219017656946393e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null