diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.01, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 1.6317715205474108, + "learning_rate": 3e-06, + "loss": 10.867, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 1.6173147870740345, + "learning_rate": 6e-06, + "loss": 10.8657, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 1.6387509359885835, + "learning_rate": 9e-06, + "loss": 10.8658, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 1.597552357815991, + "learning_rate": 1.2e-05, + "loss": 10.865, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 1.6454471252189307, + "learning_rate": 1.5e-05, + "loss": 10.8617, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 1.6407925722175996, + "learning_rate": 1.8e-05, + "loss": 10.8593, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 1.6096088910322361, + "learning_rate": 2.1000000000000002e-05, + "loss": 10.8456, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 1.4682698515009915, + "learning_rate": 2.4e-05, + "loss": 10.8184, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 1.3934246427009196, + "learning_rate": 2.7e-05, + "loss": 10.8113, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 1.3326087040550991, + "learning_rate": 3e-05, + "loss": 10.7969, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 1.2173090118888668, + "learning_rate": 3.2999999999999996e-05, + "loss": 10.7801, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 1.176457724285593, + "learning_rate": 3.6e-05, + "loss": 10.7688, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 1.1304424318539597, + "learning_rate": 3.9e-05, + "loss": 10.7498, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 1.1158244568462428, + "learning_rate": 4.2000000000000004e-05, + "loss": 10.739, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 1.10654756415174, + "learning_rate": 4.4999999999999996e-05, + "loss": 10.7299, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 1.0751770608444569, + "learning_rate": 4.8e-05, + "loss": 10.7126, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 1.046143788290158, + "learning_rate": 5.1000000000000006e-05, + "loss": 10.6968, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 1.0230391412556632, + "learning_rate": 5.4e-05, + "loss": 10.6806, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.9869982733638543, + "learning_rate": 5.7e-05, + "loss": 10.6649, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.9728818553338922, + "learning_rate": 6e-05, + "loss": 10.6526, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.9439994749998407, + "learning_rate": 6.3e-05, + "loss": 10.639, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.9279528910342688, + "learning_rate": 6.599999999999999e-05, + "loss": 10.6244, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.9208663519602571, + "learning_rate": 6.9e-05, + "loss": 10.6103, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.9178551557561957, + "learning_rate": 7.2e-05, + "loss": 10.5993, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.9193923250060233, + "learning_rate": 7.500000000000001e-05, + "loss": 10.5847, + "step": 25 + }, + { + "epoch": 0.00026, 
+ "grad_norm": 0.9190901609677985, + "learning_rate": 7.8e-05, + "loss": 10.5717, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.913753327244254, + "learning_rate": 8.1e-05, + "loss": 10.5597, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.9119625217070774, + "learning_rate": 8.400000000000001e-05, + "loss": 10.5467, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.9131038863398008, + "learning_rate": 8.7e-05, + "loss": 10.5323, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 0.9186172829723749, + "learning_rate": 8.999999999999999e-05, + "loss": 10.517, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.9155453723962563, + "learning_rate": 9.3e-05, + "loss": 10.5024, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.909575589137279, + "learning_rate": 9.6e-05, + "loss": 10.4882, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.9102175928717151, + "learning_rate": 9.900000000000001e-05, + "loss": 10.4717, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.9103991171564593, + "learning_rate": 0.00010200000000000001, + "loss": 10.4554, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.9164468459870767, + "learning_rate": 0.00010500000000000002, + "loss": 10.4388, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.9129850089149896, + "learning_rate": 0.000108, + "loss": 10.4213, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.9029808327462479, + "learning_rate": 0.000111, + "loss": 10.4047, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.9074583652458479, + "learning_rate": 0.000114, + "loss": 10.3859, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.9101621604618185, + "learning_rate": 0.000117, + "loss": 10.3665, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.9144345472354501, + "learning_rate": 0.00012, + "loss": 10.3443, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.9045315909874942, + "learning_rate": 0.000123, + "loss": 10.3264, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.9101518170592343, + "learning_rate": 0.000126, + "loss": 10.3059, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.9147491310031046, + "learning_rate": 0.000129, + "loss": 10.2831, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.9138166723808987, + "learning_rate": 0.00013199999999999998, + "loss": 10.2617, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.9166634386783579, + "learning_rate": 0.000135, + "loss": 10.2388, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.9061319871747918, + "learning_rate": 0.000138, + "loss": 10.2171, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.9134144094551727, + "learning_rate": 0.000141, + "loss": 10.1906, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.9111015850262806, + "learning_rate": 0.000144, + "loss": 10.1669, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.9106086112424904, + "learning_rate": 0.000147, + "loss": 10.1439, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.9135108001899231, + "learning_rate": 0.00015000000000000001, + "loss": 10.1178, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.9152031108656089, + "learning_rate": 0.000153, + "loss": 10.0918, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.9133043896843657, + "learning_rate": 0.000156, + "loss": 10.0658, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.9039676544194273, + "learning_rate": 0.000159, + "loss": 10.0419, + "step": 53 
+ }, + { + "epoch": 0.00054, + "grad_norm": 0.9245050218484777, + "learning_rate": 0.000162, + "loss": 10.0099, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.915653013423474, + "learning_rate": 0.000165, + "loss": 9.9858, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.90743999026624, + "learning_rate": 0.00016800000000000002, + "loss": 9.9567, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.9125740935300273, + "learning_rate": 0.000171, + "loss": 9.9292, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.9103134473221595, + "learning_rate": 0.000174, + "loss": 9.9046, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.91502039796166, + "learning_rate": 0.000177, + "loss": 9.8727, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.902549468432534, + "learning_rate": 0.00017999999999999998, + "loss": 9.8467, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.9011753674575653, + "learning_rate": 0.000183, + "loss": 9.8184, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.9073829944096542, + "learning_rate": 0.000186, + "loss": 9.7865, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.900277782228101, + "learning_rate": 0.000189, + "loss": 9.7594, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.8964836174343672, + "learning_rate": 0.000192, + "loss": 9.7292, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.9040253232587138, + "learning_rate": 0.00019500000000000002, + "loss": 9.6969, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.8969700410935529, + "learning_rate": 0.00019800000000000002, + "loss": 9.6728, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.8981910064021, + "learning_rate": 0.000201, + "loss": 9.643, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.9049113812894196, + "learning_rate": 0.00020400000000000003, + "loss": 9.61, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.8903730719674341, + "learning_rate": 0.00020700000000000002, + "loss": 9.5824, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.9034192157313848, + "learning_rate": 0.00021000000000000004, + "loss": 9.5456, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.8933343163190056, + "learning_rate": 0.00021299999999999997, + "loss": 9.5189, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.8968593008835964, + "learning_rate": 0.000216, + "loss": 9.4914, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.8960167287531013, + "learning_rate": 0.00021899999999999998, + "loss": 9.4574, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.8965354004907367, + "learning_rate": 0.000222, + "loss": 9.4304, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.8923958382533757, + "learning_rate": 0.000225, + "loss": 9.3946, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.9011054851249423, + "learning_rate": 0.000228, + "loss": 9.3667, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.8959973023769764, + "learning_rate": 0.000231, + "loss": 9.3374, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.8901362352309407, + "learning_rate": 0.000234, + "loss": 9.3055, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.8931237127830396, + "learning_rate": 0.00023700000000000001, + "loss": 9.2694, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8937972757596185, + "learning_rate": 0.00024, + "loss": 9.2355, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.8984186266430717, + "learning_rate": 
0.00024300000000000002, + "loss": 9.2033, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.9060711515558022, + "learning_rate": 0.000246, + "loss": 9.1789, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.8964035250431631, + "learning_rate": 0.00024900000000000004, + "loss": 9.1434, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.8986377804082708, + "learning_rate": 0.000252, + "loss": 9.1119, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.8933390971667627, + "learning_rate": 0.000255, + "loss": 9.0885, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.8953786211031305, + "learning_rate": 0.000258, + "loss": 9.0521, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.8876773666743288, + "learning_rate": 0.000261, + "loss": 9.0302, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.8929152197347487, + "learning_rate": 0.00026399999999999997, + "loss": 8.9927, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.8872457913370299, + "learning_rate": 0.000267, + "loss": 8.9669, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.8862509419810688, + "learning_rate": 0.00027, + "loss": 8.9393, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.8807880266520192, + "learning_rate": 0.000273, + "loss": 8.9089, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.8831207589668301, + "learning_rate": 0.000276, + "loss": 8.876, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.883866988348631, + "learning_rate": 0.000279, + "loss": 8.8462, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.8824853161719922, + "learning_rate": 0.000282, + "loss": 8.8199, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.8794572899807177, + "learning_rate": 0.000285, + "loss": 8.789, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.8879520784944948, + "learning_rate": 0.000288, + "loss": 8.7571, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.8798042444972031, + "learning_rate": 0.000291, + "loss": 8.7321, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.8802597894834375, + "learning_rate": 0.000294, + "loss": 8.6954, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.8806466323910314, + "learning_rate": 0.000297, + "loss": 8.6749, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.8777097574069823, + "learning_rate": 0.00030000000000000003, + "loss": 8.6485, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.8786755147609817, + "learning_rate": 0.00030300000000000005, + "loss": 8.614, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.8680143868447665, + "learning_rate": 0.000306, + "loss": 8.5949, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.8748449452730288, + "learning_rate": 0.000309, + "loss": 8.5706, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.8670215859388973, + "learning_rate": 0.000312, + "loss": 8.5498, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.8687292008731472, + "learning_rate": 0.000315, + "loss": 8.5231, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.8611486197845404, + "learning_rate": 0.000318, + "loss": 8.4945, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.8521924348129856, + "learning_rate": 0.000321, + "loss": 8.4693, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.8581933475380797, + "learning_rate": 0.000324, + "loss": 8.4407, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.8524146875206363, + "learning_rate": 0.000327, + 
"loss": 8.421, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.8682563584613229, + "learning_rate": 0.00033, + "loss": 8.3983, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.8871469803064874, + "learning_rate": 0.000333, + "loss": 8.3685, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.9236879668329372, + "learning_rate": 0.00033600000000000004, + "loss": 8.3463, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.9129864456504505, + "learning_rate": 0.000339, + "loss": 8.3063, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.8352263501003522, + "learning_rate": 0.000342, + "loss": 8.2966, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.8592375580090957, + "learning_rate": 0.00034500000000000004, + "loss": 8.2718, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.8674294753896091, + "learning_rate": 0.000348, + "loss": 8.2506, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.8276917493567776, + "learning_rate": 0.000351, + "loss": 8.2188, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.8476744963131545, + "learning_rate": 0.000354, + "loss": 8.2045, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.844358071644388, + "learning_rate": 0.000357, + "loss": 8.1926, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8166594269287538, + "learning_rate": 0.00035999999999999997, + "loss": 8.1658, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 0.8239930081024902, + "learning_rate": 0.000363, + "loss": 8.1389, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.8099951959348987, + "learning_rate": 0.000366, + "loss": 8.1225, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.830800320388625, + "learning_rate": 0.000369, + "loss": 8.1005, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.8139169053139192, + "learning_rate": 0.000372, + "loss": 8.0791, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.8112246790149765, + "learning_rate": 0.000375, + "loss": 8.0547, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.7922278873371895, + "learning_rate": 0.000378, + "loss": 8.0424, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.7755075943975184, + "learning_rate": 0.000381, + "loss": 8.0182, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.8028212720713388, + "learning_rate": 0.000384, + "loss": 8.0, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 0.8765129391436198, + "learning_rate": 0.00038700000000000003, + "loss": 7.9904, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 1.1486399983200042, + "learning_rate": 0.00039000000000000005, + "loss": 7.9724, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 1.0507777578095836, + "learning_rate": 0.000393, + "loss": 7.9382, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 0.7511305165281239, + "learning_rate": 0.00039600000000000003, + "loss": 7.9248, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.8108077692812641, + "learning_rate": 0.00039900000000000005, + "loss": 7.906, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.8301313874662418, + "learning_rate": 0.000402, + "loss": 7.8952, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.7123568807247732, + "learning_rate": 0.00040500000000000003, + "loss": 7.8651, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.7697556904537746, + "learning_rate": 0.00040800000000000005, + "loss": 7.8515, + "step": 136 + }, + { + "epoch": 0.00137, + 
"grad_norm": 0.7190977621725152, + "learning_rate": 0.000411, + "loss": 7.8299, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.7147305335216294, + "learning_rate": 0.00041400000000000003, + "loss": 7.8066, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.747529428120578, + "learning_rate": 0.00041700000000000005, + "loss": 7.7954, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.6748198408281931, + "learning_rate": 0.00042000000000000007, + "loss": 7.7774, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.662142968172009, + "learning_rate": 0.000423, + "loss": 7.7644, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.6859796391897652, + "learning_rate": 0.00042599999999999995, + "loss": 7.7534, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.6828514310354903, + "learning_rate": 0.00042899999999999997, + "loss": 7.7255, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.6490687938821236, + "learning_rate": 0.000432, + "loss": 7.7078, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.6453156151137228, + "learning_rate": 0.000435, + "loss": 7.7035, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.7166050341593803, + "learning_rate": 0.00043799999999999997, + "loss": 7.6672, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.7443563124698165, + "learning_rate": 0.000441, + "loss": 7.6627, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.7124453009383569, + "learning_rate": 0.000444, + "loss": 7.648, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 0.6775306354557482, + "learning_rate": 0.00044699999999999997, + "loss": 7.6419, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.8906380813028638, + "learning_rate": 0.00045, + "loss": 7.6135, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 1.2892607147030477, + "learning_rate": 0.000453, + "loss": 7.605, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.6492280537852009, + "learning_rate": 0.000456, + "loss": 7.5875, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.8226748559002907, + "learning_rate": 0.000459, + "loss": 7.5783, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 1.073498340899344, + "learning_rate": 0.000462, + "loss": 7.5662, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.772416311968319, + "learning_rate": 0.000465, + "loss": 7.5481, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 0.7999364848474875, + "learning_rate": 0.000468, + "loss": 7.5209, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.9435570004081977, + "learning_rate": 0.000471, + "loss": 7.5196, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.8364296006508578, + "learning_rate": 0.00047400000000000003, + "loss": 7.4997, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 0.5596750978008624, + "learning_rate": 0.000477, + "loss": 7.4825, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.5748856794890025, + "learning_rate": 0.00048, + "loss": 7.4616, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.5986411025981033, + "learning_rate": 0.00048300000000000003, + "loss": 7.4419, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.5117747222458712, + "learning_rate": 0.00048600000000000005, + "loss": 7.4429, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.5509815158764758, + "learning_rate": 0.0004890000000000001, + "loss": 7.4259, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.46393808675226217, + 
"learning_rate": 0.000492, + "loss": 7.4236, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.5390679177469344, + "learning_rate": 0.000495, + "loss": 7.4006, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.645354949444588, + "learning_rate": 0.0004980000000000001, + "loss": 7.3773, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 1.0877276724000633, + "learning_rate": 0.000501, + "loss": 7.3741, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 1.0909247587015876, + "learning_rate": 0.000504, + "loss": 7.3697, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.5084368948335112, + "learning_rate": 0.0005070000000000001, + "loss": 7.3418, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 1.5053615947271437, + "learning_rate": 0.00051, + "loss": 7.3492, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.6120595065498764, + "learning_rate": 0.000513, + "loss": 7.3094, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.9401489103136018, + "learning_rate": 0.000516, + "loss": 7.3189, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.5558816443971099, + "learning_rate": 0.0005189999999999999, + "loss": 7.2906, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.7810658223347703, + "learning_rate": 0.000522, + "loss": 7.314, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.5409187817835382, + "learning_rate": 0.000525, + "loss": 7.2727, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.630636606756854, + "learning_rate": 0.0005279999999999999, + "loss": 7.2484, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.5206138592215499, + "learning_rate": 0.000531, + "loss": 7.2507, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.5577166707239251, + "learning_rate": 0.000534, + "loss": 7.2428, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.4980094016420717, + "learning_rate": 0.000537, + "loss": 7.2333, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.5436132024149352, + "learning_rate": 0.00054, + "loss": 7.2053, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.4850436660416719, + "learning_rate": 0.000543, + "loss": 7.1915, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.686701132410735, + "learning_rate": 0.000546, + "loss": 7.1909, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.68400997444651, + "learning_rate": 0.000549, + "loss": 7.1802, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.8396027257351396, + "learning_rate": 0.000552, + "loss": 7.171, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.7665365037410753, + "learning_rate": 0.000555, + "loss": 7.1476, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.5359175667027454, + "learning_rate": 0.000558, + "loss": 7.1506, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.5513800735908173, + "learning_rate": 0.000561, + "loss": 7.1261, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.5352093004255375, + "learning_rate": 0.000564, + "loss": 7.1044, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.5938457818726526, + "learning_rate": 0.000567, + "loss": 7.1004, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.5273842405001533, + "learning_rate": 0.00057, + "loss": 7.0834, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.47487115515279366, + "learning_rate": 0.000573, + "loss": 7.0721, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.6036837698051599, + 
"learning_rate": 0.000576, + "loss": 7.0655, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.3951184212196986, + "learning_rate": 0.000579, + "loss": 7.061, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.442083567688087, + "learning_rate": 0.000582, + "loss": 7.0548, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.439546882682468, + "learning_rate": 0.000585, + "loss": 7.0348, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.46247531692771043, + "learning_rate": 0.000588, + "loss": 7.0228, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.4140335072217301, + "learning_rate": 0.000591, + "loss": 7.0171, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 0.3685986320410548, + "learning_rate": 0.000594, + "loss": 7.0081, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.4020373564129086, + "learning_rate": 0.0005970000000000001, + "loss": 6.9898, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.37126410475941546, + "learning_rate": 0.0006000000000000001, + "loss": 6.9867, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.3773154493828028, + "learning_rate": 0.000603, + "loss": 6.9617, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.3540017416986532, + "learning_rate": 0.0006060000000000001, + "loss": 6.9491, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.403279648640721, + "learning_rate": 0.0006090000000000001, + "loss": 6.9534, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.5112949618253247, + "learning_rate": 0.000612, + "loss": 6.9385, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 0.7200998739972175, + "learning_rate": 0.000615, + "loss": 6.931, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 1.209379808685074, + "learning_rate": 0.000618, + "loss": 6.9351, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 1.0040442357645134, + "learning_rate": 0.000621, + "loss": 6.9279, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.7878464521205251, + "learning_rate": 0.000624, + "loss": 6.9129, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 1.3096135062236434, + "learning_rate": 0.000627, + "loss": 6.9067, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 0.910504669978176, + "learning_rate": 0.00063, + "loss": 6.8878, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 1.3015817966038044, + "learning_rate": 0.000633, + "loss": 6.8987, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.5587575104994011, + "learning_rate": 0.000636, + "loss": 6.8696, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.8081412049208773, + "learning_rate": 0.000639, + "loss": 6.8608, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.6397075273457759, + "learning_rate": 0.000642, + "loss": 6.8511, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.5875043250740225, + "learning_rate": 0.000645, + "loss": 6.8482, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.6060774535893669, + "learning_rate": 0.000648, + "loss": 6.8413, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.5183751970166313, + "learning_rate": 0.000651, + "loss": 6.8119, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.5539011900924167, + "learning_rate": 0.000654, + "loss": 6.8171, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.5004312163301685, + "learning_rate": 0.000657, + "loss": 6.7882, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.4762494747244133, + 
"learning_rate": 0.00066, + "loss": 6.8062, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.353452768224107, + "learning_rate": 0.0006630000000000001, + "loss": 6.7814, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.3879096289808107, + "learning_rate": 0.000666, + "loss": 6.7696, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.4253518811476648, + "learning_rate": 0.000669, + "loss": 6.7658, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.4076338977034843, + "learning_rate": 0.0006720000000000001, + "loss": 6.7609, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.4491881195721512, + "learning_rate": 0.000675, + "loss": 6.7489, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 0.4195112098951784, + "learning_rate": 0.000678, + "loss": 6.7444, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.371663908330708, + "learning_rate": 0.0006810000000000001, + "loss": 6.7174, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 0.3462952066328263, + "learning_rate": 0.000684, + "loss": 6.7197, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.5288966810878937, + "learning_rate": 0.000687, + "loss": 6.7178, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.7623592789505088, + "learning_rate": 0.0006900000000000001, + "loss": 6.6993, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.9880780315432149, + "learning_rate": 0.000693, + "loss": 6.6923, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 1.028691756937799, + "learning_rate": 0.000696, + "loss": 6.7142, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.9501761132787978, + "learning_rate": 0.0006990000000000001, + "loss": 6.6946, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.9999298841530961, + "learning_rate": 0.000702, + "loss": 6.6948, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.6945930996245869, + "learning_rate": 0.000705, + "loss": 6.6675, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.6422608041910052, + "learning_rate": 0.000708, + "loss": 6.6513, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.47985797682797676, + "learning_rate": 0.0007109999999999999, + "loss": 6.657, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.6191094472441181, + "learning_rate": 0.000714, + "loss": 6.6482, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.5181523072026278, + "learning_rate": 0.000717, + "loss": 6.628, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 0.5894783578801835, + "learning_rate": 0.0007199999999999999, + "loss": 6.645, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 0.5639698176641863, + "learning_rate": 0.000723, + "loss": 6.6279, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.9095170652542525, + "learning_rate": 0.000726, + "loss": 6.6087, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 1.3373514416355459, + "learning_rate": 0.000729, + "loss": 6.6191, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.5837452319331187, + "learning_rate": 0.000732, + "loss": 6.5991, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 1.0261084178764917, + "learning_rate": 0.000735, + "loss": 6.6035, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.5663979442820745, + "learning_rate": 0.000738, + "loss": 6.574, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 0.5896636858778472, + "learning_rate": 0.000741, + "loss": 6.5719, + "step": 247 + }, + { + "epoch": 0.00248, + 
"grad_norm": 0.47326080430149503, + "learning_rate": 0.000744, + "loss": 6.5757, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 0.6248164732961499, + "learning_rate": 0.000747, + "loss": 6.5582, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 0.48362025373458484, + "learning_rate": 0.00075, + "loss": 6.5567, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.4858269118610639, + "learning_rate": 0.000753, + "loss": 6.5227, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.41862369731289734, + "learning_rate": 0.000756, + "loss": 6.5302, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.47200180330590646, + "learning_rate": 0.000759, + "loss": 6.522, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 0.3651546358356223, + "learning_rate": 0.000762, + "loss": 6.5136, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.4285192093499382, + "learning_rate": 0.0007650000000000001, + "loss": 6.5149, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.42767490819993825, + "learning_rate": 0.000768, + "loss": 6.4917, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.48143237273687123, + "learning_rate": 0.000771, + "loss": 6.4974, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.5467090762573645, + "learning_rate": 0.0007740000000000001, + "loss": 6.4744, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.6750367098607939, + "learning_rate": 0.000777, + "loss": 6.4781, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.8320211245129605, + "learning_rate": 0.0007800000000000001, + "loss": 6.4681, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.905841412497731, + "learning_rate": 0.0007830000000000001, + "loss": 6.4809, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 1.0687360869781928, + "learning_rate": 0.000786, + "loss": 6.4647, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 1.2051408681263374, + "learning_rate": 0.0007890000000000001, + "loss": 6.4874, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.8690142485653533, + "learning_rate": 0.0007920000000000001, + "loss": 6.455, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 1.2774066489819682, + "learning_rate": 0.000795, + "loss": 6.4641, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 0.7288978979341997, + "learning_rate": 0.0007980000000000001, + "loss": 6.4454, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.6506287971604123, + "learning_rate": 0.0008010000000000001, + "loss": 6.4371, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.6866545943797145, + "learning_rate": 0.000804, + "loss": 6.4338, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 1.0440114151727509, + "learning_rate": 0.0008070000000000001, + "loss": 6.4227, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 1.170351969791303, + "learning_rate": 0.0008100000000000001, + "loss": 6.4362, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 0.6832222674646221, + "learning_rate": 0.000813, + "loss": 6.4253, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.617936452008115, + "learning_rate": 0.0008160000000000001, + "loss": 6.4063, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 0.6750066600068259, + "learning_rate": 0.0008190000000000001, + "loss": 6.4008, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 0.6315560378177079, + "learning_rate": 0.000822, + "loss": 6.379, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 
0.6774266517760174, + "learning_rate": 0.0008250000000000001, + "loss": 6.3852, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.7272895822836024, + "learning_rate": 0.0008280000000000001, + "loss": 6.3794, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.6752471037637485, + "learning_rate": 0.0008310000000000001, + "loss": 6.3735, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.5678457826039285, + "learning_rate": 0.0008340000000000001, + "loss": 6.354, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.4611700189072147, + "learning_rate": 0.0008370000000000001, + "loss": 6.3529, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.343285643042232, + "learning_rate": 0.0008400000000000001, + "loss": 6.3329, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.4519631747446028, + "learning_rate": 0.0008430000000000001, + "loss": 6.3253, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.3255189118052276, + "learning_rate": 0.000846, + "loss": 6.3232, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.4297016476682907, + "learning_rate": 0.0008489999999999999, + "loss": 6.3149, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.37515936714697207, + "learning_rate": 0.0008519999999999999, + "loss": 6.3058, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.3458870104505622, + "learning_rate": 0.000855, + "loss": 6.3075, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.39943359732325345, + "learning_rate": 0.0008579999999999999, + "loss": 6.2857, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.3631751365570726, + "learning_rate": 0.000861, + "loss": 6.2875, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.552280032213235, + "learning_rate": 0.000864, + "loss": 6.2707, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.9396362484724781, + "learning_rate": 0.0008669999999999999, + "loss": 6.2923, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 1.3959756273937387, + "learning_rate": 0.00087, + "loss": 6.2945, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 0.6791923155853262, + "learning_rate": 0.000873, + "loss": 6.2663, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.9879732835250642, + "learning_rate": 0.0008759999999999999, + "loss": 6.2794, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 1.0140384817226566, + "learning_rate": 0.000879, + "loss": 6.2652, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.9889225551341856, + "learning_rate": 0.000882, + "loss": 6.2822, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 1.0429477871581094, + "learning_rate": 0.0008849999999999999, + "loss": 6.2468, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 1.1774929945794055, + "learning_rate": 0.000888, + "loss": 6.2705, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 0.7364156162079134, + "learning_rate": 0.000891, + "loss": 6.2278, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 0.8424638476384282, + "learning_rate": 0.0008939999999999999, + "loss": 6.2455, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 0.8668489286879963, + "learning_rate": 0.000897, + "loss": 6.2225, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 0.9039057952602142, + "learning_rate": 0.0009, + "loss": 6.2236, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 1.0344858724084711, + "learning_rate": 0.0009029999999999999, + "loss": 6.2222, + "step": 301 + }, + { + "epoch": 0.00302, + 
"grad_norm": 0.801399966246171, + "learning_rate": 0.000906, + "loss": 6.2007, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 0.7276255151675343, + "learning_rate": 0.000909, + "loss": 6.2106, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.8306162070729353, + "learning_rate": 0.000912, + "loss": 6.205, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 0.7650178489304597, + "learning_rate": 0.000915, + "loss": 6.2045, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.7024963687074245, + "learning_rate": 0.000918, + "loss": 6.1878, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.5687961336654864, + "learning_rate": 0.000921, + "loss": 6.1778, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 0.4515866134049927, + "learning_rate": 0.000924, + "loss": 6.1586, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.5454081565882548, + "learning_rate": 0.000927, + "loss": 6.1763, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.4033959865123679, + "learning_rate": 0.00093, + "loss": 6.1549, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.5004494164305024, + "learning_rate": 0.000933, + "loss": 6.1478, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.470361361901935, + "learning_rate": 0.000936, + "loss": 6.1275, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.4973667290148138, + "learning_rate": 0.0009390000000000001, + "loss": 6.1399, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.47398584855820086, + "learning_rate": 0.000942, + "loss": 6.1453, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.39081338977861474, + "learning_rate": 0.000945, + "loss": 6.1206, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.308452368547838, + "learning_rate": 0.0009480000000000001, + "loss": 6.0925, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.33735261718955184, + "learning_rate": 0.000951, + "loss": 6.1112, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.3843792856632324, + "learning_rate": 0.000954, + "loss": 6.1055, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.45015697169720664, + "learning_rate": 0.0009570000000000001, + "loss": 6.0951, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 0.6347836654615971, + "learning_rate": 0.00096, + "loss": 6.097, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.9264517947757075, + "learning_rate": 0.000963, + "loss": 6.0955, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 1.3311662164937155, + "learning_rate": 0.0009660000000000001, + "loss": 6.0984, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.7192768486088142, + "learning_rate": 0.000969, + "loss": 6.081, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 0.8815686637890671, + "learning_rate": 0.0009720000000000001, + "loss": 6.0921, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.9399377971403509, + "learning_rate": 0.0009750000000000001, + "loss": 6.0559, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 1.1161084705724094, + "learning_rate": 0.0009780000000000001, + "loss": 6.0866, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.9798577225908143, + "learning_rate": 0.000981, + "loss": 6.08, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 1.4594200076446016, + "learning_rate": 0.000984, + "loss": 6.0949, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 0.7530414187502621, + "learning_rate": 0.000987, + "loss": 6.0592, + "step": 329 + 
}, + { + "epoch": 0.0033, + "grad_norm": 0.9983029089238832, + "learning_rate": 0.00099, + "loss": 6.0707, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 0.835334238631822, + "learning_rate": 0.0009930000000000002, + "loss": 6.0609, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 1.2633352311024129, + "learning_rate": 0.0009960000000000001, + "loss": 6.0632, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.9415989578137678, + "learning_rate": 0.000999, + "loss": 6.0388, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 0.752057040577036, + "learning_rate": 0.001002, + "loss": 6.0434, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 0.6368712711427076, + "learning_rate": 0.001005, + "loss": 6.039, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.5277321252477392, + "learning_rate": 0.001008, + "loss": 6.0255, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 0.545640031716998, + "learning_rate": 0.0010110000000000002, + "loss": 6.0051, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 0.5605817392374447, + "learning_rate": 0.0010140000000000001, + "loss": 6.0183, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.4963166785854256, + "learning_rate": 0.0010170000000000001, + "loss": 6.0214, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.496225629593103, + "learning_rate": 0.00102, + "loss": 5.9915, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.45013704339915594, + "learning_rate": 0.001023, + "loss": 5.9719, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.37899658405778774, + "learning_rate": 0.001026, + "loss": 5.9765, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.4216072327604619, + "learning_rate": 0.0010290000000000002, + "loss": 5.9773, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.4964437378733662, + "learning_rate": 0.001032, + "loss": 5.9678, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 0.6031142308234815, + "learning_rate": 0.001035, + "loss": 5.9605, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.6349778009023861, + "learning_rate": 0.0010379999999999999, + "loss": 5.9675, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 0.6213366174245898, + "learning_rate": 0.001041, + "loss": 5.9393, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 0.684857817555668, + "learning_rate": 0.001044, + "loss": 5.9499, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.8421661368559449, + "learning_rate": 0.001047, + "loss": 5.9398, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 0.8062802732253019, + "learning_rate": 0.00105, + "loss": 5.9568, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.7669273491138234, + "learning_rate": 0.001053, + "loss": 5.9549, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 1.0673384228730578, + "learning_rate": 0.0010559999999999999, + "loss": 5.9491, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 1.313057737158452, + "learning_rate": 0.001059, + "loss": 5.9596, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 0.8748293544965601, + "learning_rate": 0.001062, + "loss": 5.9228, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 0.7617258072087358, + "learning_rate": 0.001065, + "loss": 5.9348, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 0.9561923051718775, + "learning_rate": 0.001068, + "loss": 5.9306, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 0.8631216371911231, + "learning_rate": 
0.001071, + "loss": 5.9199, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.9525953351992685, + "learning_rate": 0.001074, + "loss": 5.9184, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.6858307664974747, + "learning_rate": 0.001077, + "loss": 5.8973, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.7167037627907079, + "learning_rate": 0.00108, + "loss": 5.9065, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.7969047134484166, + "learning_rate": 0.001083, + "loss": 5.8986, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 0.8186204145394074, + "learning_rate": 0.001086, + "loss": 5.888, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 0.7436182013517663, + "learning_rate": 0.001089, + "loss": 5.8766, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.6658023261534547, + "learning_rate": 0.001092, + "loss": 5.8872, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.7206668214896482, + "learning_rate": 0.001095, + "loss": 5.8872, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.6345413922647961, + "learning_rate": 0.001098, + "loss": 5.8617, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 0.6094924231218852, + "learning_rate": 0.001101, + "loss": 5.8618, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 0.7732260654201254, + "learning_rate": 0.001104, + "loss": 5.8653, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.9451132874875877, + "learning_rate": 0.001107, + "loss": 5.8779, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 1.047387946938651, + "learning_rate": 0.00111, + "loss": 5.8603, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 0.8709497379832931, + "learning_rate": 0.001113, + "loss": 5.8262, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.6993621521276565, + "learning_rate": 0.001116, + "loss": 5.8434, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 0.6889019297226876, + "learning_rate": 0.001119, + "loss": 5.8344, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.8187808414499578, + "learning_rate": 0.001122, + "loss": 5.8424, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.7395556739972736, + "learning_rate": 0.0011250000000000001, + "loss": 5.8404, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.5027446756427529, + "learning_rate": 0.001128, + "loss": 5.8296, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 0.45297103786338255, + "learning_rate": 0.001131, + "loss": 5.8239, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 0.5150298222384522, + "learning_rate": 0.001134, + "loss": 5.82, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.4216428009711753, + "learning_rate": 0.001137, + "loss": 5.8036, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.43574801532624385, + "learning_rate": 0.00114, + "loss": 5.8234, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 0.4737821231317218, + "learning_rate": 0.0011430000000000001, + "loss": 5.7908, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.5236457098681065, + "learning_rate": 0.001146, + "loss": 5.7778, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 0.5006340354259897, + "learning_rate": 0.001149, + "loss": 5.787, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 0.5762330042414852, + "learning_rate": 0.001152, + "loss": 5.7915, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 0.6623935085819848, + "learning_rate": 0.001155, + "loss": 5.7808, + "step": 
385 + }, + { + "epoch": 0.00386, + "grad_norm": 0.6780931079980513, + "learning_rate": 0.001158, + "loss": 5.7864, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 0.6283557380392781, + "learning_rate": 0.0011610000000000001, + "loss": 5.76, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 0.8661947460887196, + "learning_rate": 0.001164, + "loss": 5.7642, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 1.1877567285678448, + "learning_rate": 0.001167, + "loss": 5.7898, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.5285139722189788, + "learning_rate": 0.00117, + "loss": 5.7491, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.6430118692881639, + "learning_rate": 0.001173, + "loss": 5.7622, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 0.6748100403552353, + "learning_rate": 0.001176, + "loss": 5.7713, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.7344199345621275, + "learning_rate": 0.0011790000000000001, + "loss": 5.7448, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 0.8611653440542995, + "learning_rate": 0.001182, + "loss": 5.7377, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 1.0200403565527223, + "learning_rate": 0.001185, + "loss": 5.7297, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 1.3219082304761296, + "learning_rate": 0.001188, + "loss": 5.7571, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 1.0648841185793536, + "learning_rate": 0.001191, + "loss": 5.7452, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.8818852137031271, + "learning_rate": 0.0011940000000000002, + "loss": 5.7476, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.7229082271352473, + "learning_rate": 0.0011970000000000001, + "loss": 5.7455, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.7464785543636239, + "learning_rate": 0.0012000000000000001, + "loss": 5.7335, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.788008045277313, + "learning_rate": 0.001203, + "loss": 5.7147, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.8844811042429299, + "learning_rate": 0.001206, + "loss": 5.7255, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 0.8786909754931423, + "learning_rate": 0.001209, + "loss": 5.7201, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.9763167280044874, + "learning_rate": 0.0012120000000000002, + "loss": 5.7188, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.9708490732732808, + "learning_rate": 0.0012150000000000002, + "loss": 5.7216, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 0.7953769652219763, + "learning_rate": 0.0012180000000000001, + "loss": 5.6987, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.7231086032909518, + "learning_rate": 0.0012209999999999999, + "loss": 5.6945, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.6539281657127057, + "learning_rate": 0.001224, + "loss": 5.7108, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.48725126694443294, + "learning_rate": 0.001227, + "loss": 5.6827, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 0.5935989194477762, + "learning_rate": 0.00123, + "loss": 5.6771, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 0.6524672694061662, + "learning_rate": 0.001233, + "loss": 5.6945, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.5686252583506046, + "learning_rate": 0.001236, + "loss": 5.6796, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 0.5206794027062916, + 
"learning_rate": 0.0012389999999999999, + "loss": 5.676, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.5100681259570212, + "learning_rate": 0.001242, + "loss": 5.6561, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 0.5323494958966052, + "learning_rate": 0.001245, + "loss": 5.6572, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 0.593833682795931, + "learning_rate": 0.001248, + "loss": 5.6419, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 0.5881502982050868, + "learning_rate": 0.001251, + "loss": 5.6711, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 0.6571826832460801, + "learning_rate": 0.001254, + "loss": 5.6451, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 0.8242389183582979, + "learning_rate": 0.0012569999999999999, + "loss": 5.6393, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.7322278119135938, + "learning_rate": 0.00126, + "loss": 5.6531, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.6080486142741918, + "learning_rate": 0.001263, + "loss": 5.6415, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 0.6271805214461816, + "learning_rate": 0.001266, + "loss": 5.6327, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 0.5934268893913589, + "learning_rate": 0.001269, + "loss": 5.6286, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.6795296945522938, + "learning_rate": 0.001272, + "loss": 5.6299, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.7845276448161947, + "learning_rate": 0.001275, + "loss": 5.6344, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.9176562117983922, + "learning_rate": 0.001278, + "loss": 5.6402, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 1.0306205830362438, + "learning_rate": 0.001281, + "loss": 5.6293, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 1.0252381178312269, + "learning_rate": 0.001284, + "loss": 5.6086, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 1.3332162612637855, + "learning_rate": 0.001287, + "loss": 5.6393, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.8174291079939794, + "learning_rate": 0.00129, + "loss": 5.6202, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.821028463418781, + "learning_rate": 0.001293, + "loss": 5.6102, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.7475137161763143, + "learning_rate": 0.001296, + "loss": 5.6163, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.7571870052992741, + "learning_rate": 0.001299, + "loss": 5.604, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 1.0543560255015263, + "learning_rate": 0.001302, + "loss": 5.6184, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.8758758304266553, + "learning_rate": 0.001305, + "loss": 5.6066, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.9137062421440546, + "learning_rate": 0.001308, + "loss": 5.5859, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 1.301736875083812, + "learning_rate": 0.001311, + "loss": 5.6173, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.9356953917037294, + "learning_rate": 0.001314, + "loss": 5.5919, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.8522821992819578, + "learning_rate": 0.001317, + "loss": 5.6158, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.7182761753028103, + "learning_rate": 0.00132, + "loss": 5.5821, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.6293266892726601, + "learning_rate": 0.001323, + "loss": 
5.577, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.8189921873128356, + "learning_rate": 0.0013260000000000001, + "loss": 5.5762, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 1.0212422821054057, + "learning_rate": 0.001329, + "loss": 5.5904, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.8951785018901781, + "learning_rate": 0.001332, + "loss": 5.5851, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.7868769140150608, + "learning_rate": 0.001335, + "loss": 5.5661, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.790102612629763, + "learning_rate": 0.001338, + "loss": 5.5718, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.8396194874372788, + "learning_rate": 0.001341, + "loss": 5.5716, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.9120841535821665, + "learning_rate": 0.0013440000000000001, + "loss": 5.5589, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.8573073152890212, + "learning_rate": 0.001347, + "loss": 5.5603, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.7420732830276576, + "learning_rate": 0.00135, + "loss": 5.5551, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.7054051762730813, + "learning_rate": 0.001353, + "loss": 5.5451, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.5383194985068459, + "learning_rate": 0.001356, + "loss": 5.5229, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.5845224529530345, + "learning_rate": 0.001359, + "loss": 5.5144, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 0.5197277965757966, + "learning_rate": 0.0013620000000000001, + "loss": 5.5269, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 0.44901215490386587, + "learning_rate": 0.0013650000000000001, + "loss": 5.5227, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 0.40823042735319937, + "learning_rate": 0.001368, + "loss": 5.5155, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 0.4077054091161063, + "learning_rate": 0.001371, + "loss": 5.5045, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.4051721280659754, + "learning_rate": 0.001374, + "loss": 5.5115, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 0.3819222938644956, + "learning_rate": 0.0013770000000000002, + "loss": 5.4986, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 0.4096791477469412, + "learning_rate": 0.0013800000000000002, + "loss": 5.4851, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.44167018915391465, + "learning_rate": 0.0013830000000000001, + "loss": 5.4805, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.49346826910496583, + "learning_rate": 0.001386, + "loss": 5.4928, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 0.463489633996114, + "learning_rate": 0.001389, + "loss": 5.4652, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 0.507205937400668, + "learning_rate": 0.001392, + "loss": 5.4859, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 0.5318639728571777, + "learning_rate": 0.0013950000000000002, + "loss": 5.4572, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 0.6489252257104292, + "learning_rate": 0.0013980000000000002, + "loss": 5.4792, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.7957789059873086, + "learning_rate": 0.0014010000000000001, + "loss": 5.4727, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.7621042481808248, + "learning_rate": 0.001404, + "loss": 5.462, + "step": 468 + }, + { + "epoch": 0.00469, + 
"grad_norm": 0.573822393217689, + "learning_rate": 0.001407, + "loss": 5.457, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.7543746323219513, + "learning_rate": 0.00141, + "loss": 5.4709, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 0.8641651762100534, + "learning_rate": 0.001413, + "loss": 5.4567, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.9026674726231502, + "learning_rate": 0.001416, + "loss": 5.4699, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.8212885101559565, + "learning_rate": 0.001419, + "loss": 5.4778, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.8914030740906659, + "learning_rate": 0.0014219999999999999, + "loss": 5.461, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 1.0570094425693455, + "learning_rate": 0.001425, + "loss": 5.4652, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.9736444976311589, + "learning_rate": 0.001428, + "loss": 5.4875, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 1.1550380737092787, + "learning_rate": 0.001431, + "loss": 5.4568, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.848321570803796, + "learning_rate": 0.001434, + "loss": 5.4695, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.9517827225501269, + "learning_rate": 0.001437, + "loss": 5.4501, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 1.0883787540754652, + "learning_rate": 0.0014399999999999999, + "loss": 5.4562, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.9422991164230814, + "learning_rate": 0.001443, + "loss": 5.4516, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.9159499791385636, + "learning_rate": 0.001446, + "loss": 5.4273, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.9688645055474768, + "learning_rate": 0.001449, + "loss": 5.44, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 1.1114303023214132, + "learning_rate": 0.001452, + "loss": 5.4474, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.9051569573634253, + "learning_rate": 0.001455, + "loss": 5.4468, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.9247454458721566, + "learning_rate": 0.001458, + "loss": 5.4217, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.852928162562673, + "learning_rate": 0.001461, + "loss": 5.4339, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.8142513932978471, + "learning_rate": 0.001464, + "loss": 5.41, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.9329231917883894, + "learning_rate": 0.001467, + "loss": 5.403, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.9855383918059676, + "learning_rate": 0.00147, + "loss": 5.4217, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.7649189111405409, + "learning_rate": 0.001473, + "loss": 5.4131, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.7909135941762935, + "learning_rate": 0.001476, + "loss": 5.4088, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.9895574066894314, + "learning_rate": 0.001479, + "loss": 5.412, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 1.132991334341666, + "learning_rate": 0.001482, + "loss": 5.429, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.761507737933228, + "learning_rate": 0.001485, + "loss": 5.4134, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.9726159326361932, + "learning_rate": 0.001488, + "loss": 5.4067, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 1.1482864163713484, + 
"learning_rate": 0.001491, + "loss": 5.3997, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.8327332046897746, + "learning_rate": 0.001494, + "loss": 5.4147, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.725916611519047, + "learning_rate": 0.001497, + "loss": 5.3792, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.6719782846245154, + "learning_rate": 0.0015, + "loss": 5.3842, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 0.7463076928465905, + "learning_rate": 0.001503, + "loss": 5.3796, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.8408104186601356, + "learning_rate": 0.001506, + "loss": 5.389, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.860909611402231, + "learning_rate": 0.0015090000000000001, + "loss": 5.3926, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.7026490274993983, + "learning_rate": 0.001512, + "loss": 5.3646, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.7184807991697565, + "learning_rate": 0.001515, + "loss": 5.3547, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 0.7839475253802514, + "learning_rate": 0.001518, + "loss": 5.3611, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 0.7039499110993044, + "learning_rate": 0.001521, + "loss": 5.3522, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.5587706859739108, + "learning_rate": 0.001524, + "loss": 5.3512, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.4952941325228141, + "learning_rate": 0.0015270000000000001, + "loss": 5.326, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.5131490664852795, + "learning_rate": 0.0015300000000000001, + "loss": 5.3428, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 0.5539213487194597, + "learning_rate": 0.001533, + "loss": 5.3196, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.5937876471208409, + "learning_rate": 0.001536, + "loss": 5.3158, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.5441672327389838, + "learning_rate": 0.001539, + "loss": 5.327, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.5371789671410057, + "learning_rate": 0.001542, + "loss": 5.3038, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.5194765862771661, + "learning_rate": 0.0015450000000000001, + "loss": 5.3109, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 0.5575198815834714, + "learning_rate": 0.0015480000000000001, + "loss": 5.3083, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.5237583962443445, + "learning_rate": 0.001551, + "loss": 5.2962, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.5237181167506534, + "learning_rate": 0.001554, + "loss": 5.2786, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.6652146969315359, + "learning_rate": 0.001557, + "loss": 5.2847, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.8817258231902962, + "learning_rate": 0.0015600000000000002, + "loss": 5.2966, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 0.9047592596397305, + "learning_rate": 0.0015630000000000002, + "loss": 5.2732, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 0.6139926424217688, + "learning_rate": 0.0015660000000000001, + "loss": 5.2701, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 0.6292464658556638, + "learning_rate": 0.001569, + "loss": 5.2826, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.752030715547053, + "learning_rate": 0.001572, + "loss": 5.2856, + "step": 524 + }, + { + "epoch": 0.00525, 
+ "grad_norm": 0.929770536798091, + "learning_rate": 0.001575, + "loss": 5.287, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.887370520628206, + "learning_rate": 0.0015780000000000002, + "loss": 5.2635, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.7819104471156305, + "learning_rate": 0.0015810000000000002, + "loss": 5.2728, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.9038037239389326, + "learning_rate": 0.0015840000000000001, + "loss": 5.2534, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.8898068580069259, + "learning_rate": 0.001587, + "loss": 5.265, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 1.0010848916209774, + "learning_rate": 0.00159, + "loss": 5.2764, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 1.0010984765594055, + "learning_rate": 0.001593, + "loss": 5.2677, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 1.026224264336229, + "learning_rate": 0.0015960000000000002, + "loss": 5.2779, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.9759289805508353, + "learning_rate": 0.0015990000000000002, + "loss": 5.2678, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 1.1376558518204782, + "learning_rate": 0.0016020000000000001, + "loss": 5.2612, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 1.0517455067238486, + "learning_rate": 0.001605, + "loss": 5.2672, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.9398243649272562, + "learning_rate": 0.001608, + "loss": 5.2627, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.9512995727424398, + "learning_rate": 0.0016110000000000002, + "loss": 5.2558, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 1.2746657535312511, + "learning_rate": 0.0016140000000000002, + "loss": 5.2451, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 1.067518036012326, + "learning_rate": 0.0016170000000000002, + "loss": 5.2415, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 1.1859628541053806, + "learning_rate": 0.0016200000000000001, + "loss": 5.2618, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.8882936824028492, + "learning_rate": 0.001623, + "loss": 5.2308, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.8517075205384302, + "learning_rate": 0.001626, + "loss": 5.2545, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 0.8283552605034004, + "learning_rate": 0.0016290000000000002, + "loss": 5.2098, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.9087829134911761, + "learning_rate": 0.0016320000000000002, + "loss": 5.2265, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.8034144620978907, + "learning_rate": 0.0016350000000000002, + "loss": 5.2348, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.7091235133563132, + "learning_rate": 0.0016380000000000001, + "loss": 5.2004, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.6683331586466694, + "learning_rate": 0.001641, + "loss": 5.1915, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.5441552662279447, + "learning_rate": 0.001644, + "loss": 5.1843, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.5462993858197037, + "learning_rate": 0.0016470000000000002, + "loss": 5.1806, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.5691406737163984, + "learning_rate": 0.0016500000000000002, + "loss": 5.1914, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.57998143312047, + "learning_rate": 0.0016530000000000002, + "loss": 5.1789, + "step": 551 
+ }, + { + "epoch": 0.00552, + "grad_norm": 0.5286482487653069, + "learning_rate": 0.0016560000000000001, + "loss": 5.158, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.48759014943874474, + "learning_rate": 0.001659, + "loss": 5.1465, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 0.4792524526847805, + "learning_rate": 0.0016620000000000003, + "loss": 5.1537, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 0.5054837577144806, + "learning_rate": 0.0016650000000000002, + "loss": 5.1496, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.5148667639200912, + "learning_rate": 0.0016680000000000002, + "loss": 5.1288, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 0.5122706189056161, + "learning_rate": 0.0016710000000000002, + "loss": 5.1153, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 0.6046224327964763, + "learning_rate": 0.0016740000000000001, + "loss": 5.1358, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 0.6851636176151574, + "learning_rate": 0.001677, + "loss": 5.1268, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 0.6922547745331437, + "learning_rate": 0.0016800000000000003, + "loss": 5.096, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.7411075733344746, + "learning_rate": 0.0016830000000000003, + "loss": 5.1176, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.8981627852593407, + "learning_rate": 0.0016860000000000002, + "loss": 5.1206, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 1.1260200882948381, + "learning_rate": 0.001689, + "loss": 5.1239, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 1.1027210513289374, + "learning_rate": 0.001692, + "loss": 5.113, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.890213024695838, + "learning_rate": 0.001695, + "loss": 5.1071, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.8627979775394023, + "learning_rate": 0.0016979999999999999, + "loss": 5.0976, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.9823945407746334, + "learning_rate": 0.0017009999999999998, + "loss": 5.101, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.9715849000742567, + "learning_rate": 0.0017039999999999998, + "loss": 5.1139, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.9989913016974431, + "learning_rate": 0.001707, + "loss": 5.0861, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 1.0927877338999235, + "learning_rate": 0.00171, + "loss": 5.1262, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 1.0328737042176641, + "learning_rate": 0.001713, + "loss": 5.1192, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 1.3722388350456287, + "learning_rate": 0.0017159999999999999, + "loss": 5.1049, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.9525979203379623, + "learning_rate": 0.0017189999999999998, + "loss": 5.1081, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 1.0626128534442882, + "learning_rate": 0.001722, + "loss": 5.1048, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.9331527599734185, + "learning_rate": 0.001725, + "loss": 5.074, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.9277735357052385, + "learning_rate": 0.001728, + "loss": 5.0843, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.9070967517565243, + "learning_rate": 0.001731, + "loss": 5.0908, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.8451551366430134, + "learning_rate": 0.0017339999999999999, + "loss": 5.0704, + 
"step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.7590562073625285, + "learning_rate": 0.0017369999999999998, + "loss": 5.058, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.6385326977156373, + "learning_rate": 0.00174, + "loss": 5.0662, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.5982129206576257, + "learning_rate": 0.001743, + "loss": 5.0624, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 0.7358321954275717, + "learning_rate": 0.001746, + "loss": 5.0371, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.790704997209863, + "learning_rate": 0.001749, + "loss": 5.0641, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.7351931257233056, + "learning_rate": 0.0017519999999999999, + "loss": 5.0549, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.6167540927458872, + "learning_rate": 0.0017549999999999998, + "loss": 5.0362, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.6143004439139178, + "learning_rate": 0.001758, + "loss": 5.0316, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 0.619830997492515, + "learning_rate": 0.001761, + "loss": 5.018, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 0.676754014852622, + "learning_rate": 0.001764, + "loss": 5.0113, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.8152367183756798, + "learning_rate": 0.001767, + "loss": 5.0063, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.8067730505459064, + "learning_rate": 0.0017699999999999999, + "loss": 5.0009, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 0.7857115694134444, + "learning_rate": 0.001773, + "loss": 5.0014, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 1.0305248086573016, + "learning_rate": 0.001776, + "loss": 5.0384, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 1.1034322728795254, + "learning_rate": 0.001779, + "loss": 5.0387, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 0.796782800390064, + "learning_rate": 0.001782, + "loss": 5.0138, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.947951394101208, + "learning_rate": 0.001785, + "loss": 5.0288, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.948001541672118, + "learning_rate": 0.0017879999999999999, + "loss": 5.0236, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.8614311642183788, + "learning_rate": 0.001791, + "loss": 4.983, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.7822502963641336, + "learning_rate": 0.001794, + "loss": 5.0032, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.662502314672726, + "learning_rate": 0.001797, + "loss": 4.9824, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.612691797581944, + "learning_rate": 0.0018, + "loss": 4.9865, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.6872791409552744, + "learning_rate": 0.001803, + "loss": 4.9768, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.6805787880992017, + "learning_rate": 0.0018059999999999999, + "loss": 4.9611, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 0.6726518476757681, + "learning_rate": 0.001809, + "loss": 4.9863, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 0.6569683723636671, + "learning_rate": 0.001812, + "loss": 4.9594, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.7635898698708745, + "learning_rate": 0.001815, + "loss": 4.9733, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.7455554600622951, + "learning_rate": 0.001818, + "loss": 
4.9743, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 0.7493825131119356, + "learning_rate": 0.001821, + "loss": 4.968, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 0.7957828614510537, + "learning_rate": 0.001824, + "loss": 4.9536, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 0.8304899179729943, + "learning_rate": 0.001827, + "loss": 4.9564, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.8109281284885322, + "learning_rate": 0.00183, + "loss": 4.9141, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.7476158062529945, + "learning_rate": 0.001833, + "loss": 4.936, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.7704555466663776, + "learning_rate": 0.001836, + "loss": 4.9262, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.7179188189545482, + "learning_rate": 0.001839, + "loss": 4.9285, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.6487628040476678, + "learning_rate": 0.001842, + "loss": 4.9483, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 0.6346341426047015, + "learning_rate": 0.001845, + "loss": 4.9336, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.6919482296512011, + "learning_rate": 0.001848, + "loss": 4.92, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.8470324326487232, + "learning_rate": 0.001851, + "loss": 4.937, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.9780451420741652, + "learning_rate": 0.001854, + "loss": 4.918, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 1.0393940937639112, + "learning_rate": 0.001857, + "loss": 4.9393, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 1.0864888594503201, + "learning_rate": 0.00186, + "loss": 4.9315, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 1.1243573748169302, + "learning_rate": 0.001863, + "loss": 4.931, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 1.2701399731469665, + "learning_rate": 0.001866, + "loss": 4.9502, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.8273605340089976, + "learning_rate": 0.001869, + "loss": 4.9139, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.9643311427939416, + "learning_rate": 0.001872, + "loss": 4.9253, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 1.2310098040523034, + "learning_rate": 0.001875, + "loss": 4.9539, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.8268848993203812, + "learning_rate": 0.0018780000000000001, + "loss": 4.9199, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.9575639973355906, + "learning_rate": 0.001881, + "loss": 4.909, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.9391450767019313, + "learning_rate": 0.001884, + "loss": 4.9487, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.8387975784827085, + "learning_rate": 0.001887, + "loss": 4.8976, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.7745062985579545, + "learning_rate": 0.00189, + "loss": 4.9062, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.8213366097020988, + "learning_rate": 0.0018930000000000002, + "loss": 4.9006, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.8670608316656828, + "learning_rate": 0.0018960000000000001, + "loss": 4.9226, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 0.9183940724690223, + "learning_rate": 0.001899, + "loss": 4.9055, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.8953336214105924, + "learning_rate": 0.001902, + "loss": 4.8931, + "step": 634 + }, + 
{ + "epoch": 0.00635, + "grad_norm": 0.8869843611928877, + "learning_rate": 0.001905, + "loss": 4.8851, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 1.014617544478078, + "learning_rate": 0.001908, + "loss": 4.911, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.8681767489690893, + "learning_rate": 0.0019110000000000002, + "loss": 4.909, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.9792692499883113, + "learning_rate": 0.0019140000000000001, + "loss": 4.9073, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.8702384519183142, + "learning_rate": 0.001917, + "loss": 4.8724, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7656979903996737, + "learning_rate": 0.00192, + "loss": 4.8809, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.7895438458212233, + "learning_rate": 0.001923, + "loss": 4.8762, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 0.8281699495124627, + "learning_rate": 0.001926, + "loss": 4.8749, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.7845360432833325, + "learning_rate": 0.0019290000000000002, + "loss": 4.8856, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.6857666802048429, + "learning_rate": 0.0019320000000000001, + "loss": 4.8583, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.5056941566313082, + "learning_rate": 0.001935, + "loss": 4.8522, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.47964630717910517, + "learning_rate": 0.001938, + "loss": 4.8628, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.519046406898008, + "learning_rate": 0.001941, + "loss": 4.8552, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.4935185080219269, + "learning_rate": 0.0019440000000000002, + "loss": 4.8488, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.5205599594018984, + "learning_rate": 0.0019470000000000002, + "loss": 4.8333, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.45295352715192905, + "learning_rate": 0.0019500000000000001, + "loss": 4.8617, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.4378817965282719, + "learning_rate": 0.001953, + "loss": 4.84, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.48395025642257355, + "learning_rate": 0.0019560000000000003, + "loss": 4.8266, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.5194845024092328, + "learning_rate": 0.0019590000000000002, + "loss": 4.8294, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.5707068593609262, + "learning_rate": 0.001962, + "loss": 4.8026, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.6725172209521785, + "learning_rate": 0.001965, + "loss": 4.834, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.7729568432311581, + "learning_rate": 0.001968, + "loss": 4.8398, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.9182317536069756, + "learning_rate": 0.001971, + "loss": 4.8175, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.9241834469814327, + "learning_rate": 0.001974, + "loss": 4.8482, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.7554289117841486, + "learning_rate": 0.001977, + "loss": 4.8199, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.786222644002383, + "learning_rate": 0.00198, + "loss": 4.8336, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.7486956414167568, + "learning_rate": 0.001983, + "loss": 4.8035, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.8224936329580866, + 
"learning_rate": 0.0019860000000000004, + "loss": 4.8038, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 1.0333891900276588, + "learning_rate": 0.0019890000000000003, + "loss": 4.8216, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 1.0454184895583787, + "learning_rate": 0.0019920000000000003, + "loss": 4.826, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.9684168839706867, + "learning_rate": 0.0019950000000000002, + "loss": 4.7952, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 1.0353944208678731, + "learning_rate": 0.001998, + "loss": 4.8213, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 0.9360230286599108, + "learning_rate": 0.002001, + "loss": 4.8166, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.7256915823313501, + "learning_rate": 0.002004, + "loss": 4.83, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 0.8394069732991961, + "learning_rate": 0.002007, + "loss": 4.8494, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 1.077577977883947, + "learning_rate": 0.00201, + "loss": 4.8532, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 0.944781077919628, + "learning_rate": 0.002013, + "loss": 4.8236, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.7024086289087288, + "learning_rate": 0.002016, + "loss": 4.7966, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.6931174948166909, + "learning_rate": 0.002019, + "loss": 4.809, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.6108100930625565, + "learning_rate": 0.0020220000000000004, + "loss": 4.8005, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 0.5498762136107717, + "learning_rate": 0.0020250000000000003, + "loss": 4.8171, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.6244881777878202, + "learning_rate": 0.0020280000000000003, + "loss": 4.7951, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 0.5815667573949113, + "learning_rate": 0.0020310000000000003, + "loss": 4.7913, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 0.521491074875552, + "learning_rate": 0.0020340000000000002, + "loss": 4.7959, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.5801192243287052, + "learning_rate": 0.002037, + "loss": 4.7864, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.6398788453868361, + "learning_rate": 0.00204, + "loss": 4.7875, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 0.6456567912241706, + "learning_rate": 0.002043, + "loss": 4.7807, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 0.6889535888719373, + "learning_rate": 0.002046, + "loss": 4.7776, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.68132819276425, + "learning_rate": 0.002049, + "loss": 4.7612, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.6724100911918068, + "learning_rate": 0.002052, + "loss": 4.7814, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.815928121483196, + "learning_rate": 0.0020550000000000004, + "loss": 4.7913, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.7537150884203632, + "learning_rate": 0.0020580000000000004, + "loss": 4.7584, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.7326770278926477, + "learning_rate": 0.0020610000000000003, + "loss": 4.7655, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.7145131916015852, + "learning_rate": 0.002064, + "loss": 4.7631, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.694828893634318, + "learning_rate": 0.002067, + "loss": 
4.7685, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.7402628986076095, + "learning_rate": 0.00207, + "loss": 4.7629, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.7809038473743037, + "learning_rate": 0.0020729999999999998, + "loss": 4.734, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.9321737974219415, + "learning_rate": 0.0020759999999999997, + "loss": 4.7623, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 1.0940875671633443, + "learning_rate": 0.0020789999999999997, + "loss": 4.788, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.9030072391970948, + "learning_rate": 0.002082, + "loss": 4.7582, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.8857004379777869, + "learning_rate": 0.002085, + "loss": 4.7649, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.8991843577664433, + "learning_rate": 0.002088, + "loss": 4.7712, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 1.0094726318481275, + "learning_rate": 0.002091, + "loss": 4.7869, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 1.09376996979269, + "learning_rate": 0.002094, + "loss": 4.7773, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.8766228601427865, + "learning_rate": 0.002097, + "loss": 4.7548, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.8580408760742435, + "learning_rate": 0.0021, + "loss": 4.7811, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.7398805390269545, + "learning_rate": 0.002103, + "loss": 4.7496, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.8329034135969645, + "learning_rate": 0.002106, + "loss": 4.762, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.8505784581176613, + "learning_rate": 0.0021089999999999998, + "loss": 4.7427, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.8003313240374758, + "learning_rate": 0.0021119999999999997, + "loss": 4.7516, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 0.7109629500256387, + "learning_rate": 0.002115, + "loss": 4.7635, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 0.6289559048526784, + "learning_rate": 0.002118, + "loss": 4.7473, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.5482382237711074, + "learning_rate": 0.002121, + "loss": 4.7193, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.5598390325768088, + "learning_rate": 0.002124, + "loss": 4.7015, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.5529551760214286, + "learning_rate": 0.002127, + "loss": 4.7331, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.6671084870548735, + "learning_rate": 0.00213, + "loss": 4.7271, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 1.1336515667563218, + "learning_rate": 0.002133, + "loss": 4.7225, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.8878745395415352, + "learning_rate": 0.002136, + "loss": 4.761, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.6767692274074788, + "learning_rate": 0.002139, + "loss": 4.7547, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.7698487468076475, + "learning_rate": 0.002142, + "loss": 4.7544, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 0.6189384208707256, + "learning_rate": 0.0021449999999999998, + "loss": 4.7077, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.5968344999809513, + "learning_rate": 0.002148, + "loss": 4.7064, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.5117629580972962, + "learning_rate": 0.002151, 
+ "loss": 4.7111, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.5586203984201494, + "learning_rate": 0.002154, + "loss": 4.7079, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.5075479633130432, + "learning_rate": 0.002157, + "loss": 4.7121, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.4840284295584723, + "learning_rate": 0.00216, + "loss": 4.7045, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.4971167150526394, + "learning_rate": 0.002163, + "loss": 4.7056, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.5347204422021664, + "learning_rate": 0.002166, + "loss": 4.6807, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.5424206330484346, + "learning_rate": 0.002169, + "loss": 4.6942, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.5219918511718452, + "learning_rate": 0.002172, + "loss": 4.6912, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.4975444213381179, + "learning_rate": 0.002175, + "loss": 4.6834, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.5840922524665152, + "learning_rate": 0.002178, + "loss": 4.7008, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.7633698368434719, + "learning_rate": 0.0021809999999999998, + "loss": 4.6777, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.9052599955462425, + "learning_rate": 0.002184, + "loss": 4.7084, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.9052862109699649, + "learning_rate": 0.002187, + "loss": 4.7065, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.9788098741089467, + "learning_rate": 0.00219, + "loss": 4.7163, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.9218858714566311, + "learning_rate": 0.002193, + "loss": 4.7193, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 1.1412145043722297, + "learning_rate": 0.002196, + "loss": 4.7513, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 1.0851884555194036, + "learning_rate": 0.002199, + "loss": 4.7135, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.9295060612046938, + "learning_rate": 0.002202, + "loss": 4.745, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 1.1744848346988581, + "learning_rate": 0.002205, + "loss": 4.766, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.8278931321658332, + "learning_rate": 0.002208, + "loss": 4.7413, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 0.8170748474782394, + "learning_rate": 0.002211, + "loss": 4.701, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.772640277783885, + "learning_rate": 0.002214, + "loss": 4.7083, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.6328057525928463, + "learning_rate": 0.0022170000000000002, + "loss": 4.6886, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.6456777453177256, + "learning_rate": 0.00222, + "loss": 4.7001, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.5912580097221507, + "learning_rate": 0.002223, + "loss": 4.6652, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.4622238730689768, + "learning_rate": 0.002226, + "loss": 4.6972, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.4904966133460914, + "learning_rate": 0.002229, + "loss": 4.6585, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.5611948954376664, + "learning_rate": 0.002232, + "loss": 4.6643, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.6222946629538655, + "learning_rate": 0.002235, + "loss": 4.6777, + "step": 745 + }, + 
{ + "epoch": 0.00746, + "grad_norm": 0.6678221674623648, + "learning_rate": 0.002238, + "loss": 4.6735, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 0.7945455347777014, + "learning_rate": 0.002241, + "loss": 4.6637, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.9027593299924651, + "learning_rate": 0.002244, + "loss": 4.683, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.8224517051186598, + "learning_rate": 0.002247, + "loss": 4.6612, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 0.6737194715747743, + "learning_rate": 0.0022500000000000003, + "loss": 4.6955, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 0.7232822830562855, + "learning_rate": 0.0022530000000000002, + "loss": 4.6893, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.6449225627821611, + "learning_rate": 0.002256, + "loss": 4.637, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.6558342053271724, + "learning_rate": 0.002259, + "loss": 4.6782, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.7056102529212358, + "learning_rate": 0.002262, + "loss": 4.6906, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.9043185481447295, + "learning_rate": 0.002265, + "loss": 4.6606, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.8770375990036015, + "learning_rate": 0.002268, + "loss": 4.6551, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.784655485815756, + "learning_rate": 0.002271, + "loss": 4.6655, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 0.6912079950422929, + "learning_rate": 0.002274, + "loss": 4.655, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.6281371231708264, + "learning_rate": 0.002277, + "loss": 4.6629, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.5992557531443571, + "learning_rate": 0.00228, + "loss": 4.6463, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 0.6034804931159479, + "learning_rate": 0.002283, + "loss": 4.6146, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.6060668832918386, + "learning_rate": 0.0022860000000000003, + "loss": 4.5953, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.5816892837581542, + "learning_rate": 0.0022890000000000002, + "loss": 4.6459, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 0.599786845424844, + "learning_rate": 0.002292, + "loss": 4.6554, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 0.6640243443171511, + "learning_rate": 0.002295, + "loss": 4.6451, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 0.6663176934335011, + "learning_rate": 0.002298, + "loss": 4.6438, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.6384162747773746, + "learning_rate": 0.002301, + "loss": 4.6392, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 0.8537658256313299, + "learning_rate": 0.002304, + "loss": 4.6457, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 1.0098120726516953, + "learning_rate": 0.002307, + "loss": 4.6694, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.7423003820485509, + "learning_rate": 0.00231, + "loss": 4.6223, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.6691351211965346, + "learning_rate": 0.002313, + "loss": 4.6569, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.8185609860643415, + "learning_rate": 0.002316, + "loss": 4.6531, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 1.0039613061235502, + "learning_rate": 0.0023190000000000003, + "loss": 4.6664, + "step": 773 + }, + 
{ + "epoch": 0.00774, + "grad_norm": 0.8500607234716588, + "learning_rate": 0.0023220000000000003, + "loss": 4.6845, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.7057192127032131, + "learning_rate": 0.0023250000000000002, + "loss": 4.6688, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.8527617729361273, + "learning_rate": 0.002328, + "loss": 4.6732, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.7987783133918631, + "learning_rate": 0.002331, + "loss": 4.6469, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.8221364860163118, + "learning_rate": 0.002334, + "loss": 4.6645, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.8901185821524535, + "learning_rate": 0.002337, + "loss": 4.6243, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.889956887452623, + "learning_rate": 0.00234, + "loss": 4.646, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.763195026878423, + "learning_rate": 0.002343, + "loss": 4.6465, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.6941360203492353, + "learning_rate": 0.002346, + "loss": 4.6378, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.6504570243922513, + "learning_rate": 0.002349, + "loss": 4.6195, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.5612286515783087, + "learning_rate": 0.002352, + "loss": 4.6293, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.6491841258536649, + "learning_rate": 0.0023550000000000003, + "loss": 4.6258, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.7618604140557245, + "learning_rate": 0.0023580000000000003, + "loss": 4.6353, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.9289251225559676, + "learning_rate": 0.0023610000000000003, + "loss": 4.6478, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 1.0160410825626025, + "learning_rate": 0.002364, + "loss": 4.6569, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.9904632547103471, + "learning_rate": 0.002367, + "loss": 4.6841, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.978549262142993, + "learning_rate": 0.00237, + "loss": 4.6525, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.7626000247604355, + "learning_rate": 0.002373, + "loss": 4.6008, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.7032433100729657, + "learning_rate": 0.002376, + "loss": 4.6361, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.6164393933780811, + "learning_rate": 0.002379, + "loss": 4.6263, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.6849838631141278, + "learning_rate": 0.002382, + "loss": 4.6402, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.7541453624973187, + "learning_rate": 0.002385, + "loss": 4.6181, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.7900314117496404, + "learning_rate": 0.0023880000000000004, + "loss": 4.6154, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.8032302751261208, + "learning_rate": 0.0023910000000000003, + "loss": 4.6598, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.7037506066431396, + "learning_rate": 0.0023940000000000003, + "loss": 4.6206, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.6873143294249761, + "learning_rate": 0.0023970000000000003, + "loss": 4.6565, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.5611968184575689, + "learning_rate": 0.0024000000000000002, + "loss": 4.6002, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.6431524925156966, + 
"learning_rate": 0.002403, + "loss": 4.6326, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.664361851854149, + "learning_rate": 0.002406, + "loss": 4.624, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.7385532765716779, + "learning_rate": 0.002409, + "loss": 4.6034, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.6763586866376322, + "learning_rate": 0.002412, + "loss": 4.609, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.4984516329675046, + "learning_rate": 0.002415, + "loss": 4.5958, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.6422151194456196, + "learning_rate": 0.002418, + "loss": 4.6202, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.6520347813599764, + "learning_rate": 0.0024210000000000004, + "loss": 4.6111, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.6466117231395049, + "learning_rate": 0.0024240000000000004, + "loss": 4.5938, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 0.7173763198404414, + "learning_rate": 0.0024270000000000003, + "loss": 4.5875, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 0.7630708316428362, + "learning_rate": 0.0024300000000000003, + "loss": 4.6257, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 0.6597258634486322, + "learning_rate": 0.0024330000000000003, + "loss": 4.5916, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.5596135689444884, + "learning_rate": 0.0024360000000000002, + "loss": 4.5639, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.5455291997717118, + "learning_rate": 0.0024389999999999998, + "loss": 4.6039, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.643394229982758, + "learning_rate": 0.0024419999999999997, + "loss": 4.6125, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 0.7154153020141445, + "learning_rate": 0.0024449999999999997, + "loss": 4.6204, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.7061775094598466, + "learning_rate": 0.002448, + "loss": 4.6005, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 0.7497949470997279, + "learning_rate": 0.002451, + "loss": 4.6054, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 0.8303821582765404, + "learning_rate": 0.002454, + "loss": 4.6047, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.7870772660210968, + "learning_rate": 0.002457, + "loss": 4.6129, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.8045799279983024, + "learning_rate": 0.00246, + "loss": 4.5959, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 0.6847814476141592, + "learning_rate": 0.002463, + "loss": 4.5753, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 0.6767342952639601, + "learning_rate": 0.002466, + "loss": 4.5743, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.7814972414925468, + "learning_rate": 0.002469, + "loss": 4.5892, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 0.7970591329920511, + "learning_rate": 0.002472, + "loss": 4.6124, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.7237059608391628, + "learning_rate": 0.0024749999999999998, + "loss": 4.6158, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 0.8825063151832095, + "learning_rate": 0.0024779999999999997, + "loss": 4.5838, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 0.895197450200386, + "learning_rate": 0.002481, + "loss": 4.6224, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 0.7760924275661016, + "learning_rate": 0.002484, + "loss": 4.6087, 
+ "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 0.8835354563964355, + "learning_rate": 0.002487, + "loss": 4.6157, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.8984013262238549, + "learning_rate": 0.00249, + "loss": 4.6232, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.8653834946214679, + "learning_rate": 0.002493, + "loss": 4.6038, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.8641563564291244, + "learning_rate": 0.002496, + "loss": 4.6089, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.8849940351412556, + "learning_rate": 0.002499, + "loss": 4.6069, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.9333438912141526, + "learning_rate": 0.002502, + "loss": 4.6165, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.8853225829488065, + "learning_rate": 0.002505, + "loss": 4.6108, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.9042165265638229, + "learning_rate": 0.002508, + "loss": 4.6433, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.8034248166459536, + "learning_rate": 0.0025109999999999998, + "loss": 4.5745, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.8086998635281932, + "learning_rate": 0.0025139999999999997, + "loss": 4.6114, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.587375456181757, + "learning_rate": 0.002517, + "loss": 4.5954, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.6561997865962053, + "learning_rate": 0.00252, + "loss": 4.5818, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.6860624256233325, + "learning_rate": 0.002523, + "loss": 4.5813, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.6384378914075728, + "learning_rate": 0.002526, + "loss": 4.5783, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.5397302637540236, + "learning_rate": 0.002529, + "loss": 4.5728, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.5753282450552862, + "learning_rate": 0.002532, + "loss": 4.5912, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.605188801053789, + "learning_rate": 0.002535, + "loss": 4.5588, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.523231421872222, + "learning_rate": 0.002538, + "loss": 4.5771, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.4442194357732046, + "learning_rate": 0.002541, + "loss": 4.5539, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.4340595323225716, + "learning_rate": 0.002544, + "loss": 4.5562, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.40987686572717924, + "learning_rate": 0.002547, + "loss": 4.5471, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.38823479672015115, + "learning_rate": 0.00255, + "loss": 4.5547, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.42997075207568086, + "learning_rate": 0.002553, + "loss": 4.5733, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.5192269912554525, + "learning_rate": 0.002556, + "loss": 4.5624, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.6626364019198889, + "learning_rate": 0.002559, + "loss": 4.5465, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.9475750496728129, + "learning_rate": 0.002562, + "loss": 4.5544, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.9196861875673602, + "learning_rate": 0.002565, + "loss": 4.5554, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.6972970172901616, + "learning_rate": 0.002568, + "loss": 4.5747, + "step": 856 + }, + { + "epoch": 
0.00857, + "grad_norm": 0.7425487517589463, + "learning_rate": 0.002571, + "loss": 4.5717, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.678911102843075, + "learning_rate": 0.002574, + "loss": 4.5576, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.6680695283315139, + "learning_rate": 0.002577, + "loss": 4.5494, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.7323389776740598, + "learning_rate": 0.00258, + "loss": 4.5797, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.6564756555111457, + "learning_rate": 0.0025830000000000002, + "loss": 4.5928, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.6458920384469554, + "learning_rate": 0.002586, + "loss": 4.5402, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.56307833723133, + "learning_rate": 0.002589, + "loss": 4.5659, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.6301527121328465, + "learning_rate": 0.002592, + "loss": 4.5169, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.5741592962502369, + "learning_rate": 0.002595, + "loss": 4.5342, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.6185631950272453, + "learning_rate": 0.002598, + "loss": 4.5624, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.7226342011827854, + "learning_rate": 0.002601, + "loss": 4.5538, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.6871063796143311, + "learning_rate": 0.002604, + "loss": 4.5354, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 0.6146544871717051, + "learning_rate": 0.002607, + "loss": 4.5411, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.5850681754527672, + "learning_rate": 0.00261, + "loss": 4.5279, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.6619050810997609, + "learning_rate": 0.002613, + "loss": 4.5496, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.6309510333250544, + "learning_rate": 0.002616, + "loss": 4.5312, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.620458951387438, + "learning_rate": 0.0026190000000000002, + "loss": 4.5409, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.7675582961358233, + "learning_rate": 0.002622, + "loss": 4.5679, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.8413199287183839, + "learning_rate": 0.002625, + "loss": 4.5217, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.6439305106538762, + "learning_rate": 0.002628, + "loss": 4.5435, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.6155973146282673, + "learning_rate": 0.002631, + "loss": 4.5607, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.6572664983016872, + "learning_rate": 0.002634, + "loss": 4.4959, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.678308084092591, + "learning_rate": 0.002637, + "loss": 4.526, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.682644138366955, + "learning_rate": 0.00264, + "loss": 4.5397, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.6916960221367301, + "learning_rate": 0.002643, + "loss": 4.5424, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.8625989079873627, + "learning_rate": 0.002646, + "loss": 4.5523, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.9115056000231379, + "learning_rate": 0.002649, + "loss": 4.5538, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 1.1386535317654105, + "learning_rate": 0.0026520000000000003, + "loss": 4.5528, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 
0.9401738564991132, + "learning_rate": 0.0026550000000000002, + "loss": 4.5478, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 1.0261918984073035, + "learning_rate": 0.002658, + "loss": 4.5785, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 1.2427377192417122, + "learning_rate": 0.002661, + "loss": 4.6, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 1.061546131863703, + "learning_rate": 0.002664, + "loss": 4.5885, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.907593068920597, + "learning_rate": 0.002667, + "loss": 4.535, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.8129313603021157, + "learning_rate": 0.00267, + "loss": 4.5855, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.8985699151758613, + "learning_rate": 0.002673, + "loss": 4.5762, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.9328845387585828, + "learning_rate": 0.002676, + "loss": 4.5962, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.9658499469408992, + "learning_rate": 0.002679, + "loss": 4.5743, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.9262060176852961, + "learning_rate": 0.002682, + "loss": 4.5624, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 1.0340643070320608, + "learning_rate": 0.0026850000000000003, + "loss": 4.5798, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 0.961568519948311, + "learning_rate": 0.0026880000000000003, + "loss": 4.5724, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.9522979094666822, + "learning_rate": 0.0026910000000000002, + "loss": 4.5901, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.90558868323236, + "learning_rate": 0.002694, + "loss": 4.5802, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.7752291080346148, + "learning_rate": 0.002697, + "loss": 4.5848, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.7198757797557334, + "learning_rate": 0.0027, + "loss": 4.5998, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.7328950523365488, + "learning_rate": 0.002703, + "loss": 4.5552, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.5974121750232518, + "learning_rate": 0.002706, + "loss": 4.558, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 0.5753314236304967, + "learning_rate": 0.002709, + "loss": 4.5534, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.6128643778394687, + "learning_rate": 0.002712, + "loss": 4.5329, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.5691916638541509, + "learning_rate": 0.002715, + "loss": 4.5446, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.5122473095411203, + "learning_rate": 0.002718, + "loss": 4.5538, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.4479956748150508, + "learning_rate": 0.0027210000000000003, + "loss": 4.5603, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.4387217017050332, + "learning_rate": 0.0027240000000000003, + "loss": 4.5304, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.4358600325368994, + "learning_rate": 0.0027270000000000003, + "loss": 4.5454, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.3849373702796595, + "learning_rate": 0.0027300000000000002, + "loss": 4.5477, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.4574591880095763, + "learning_rate": 0.002733, + "loss": 4.5038, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.6121326743360266, + "learning_rate": 0.002736, + "loss": 4.5131, + "step": 912 + }, + { + 
"epoch": 0.00913, + "grad_norm": 0.6845161679319078, + "learning_rate": 0.002739, + "loss": 4.513, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.7333722528690678, + "learning_rate": 0.002742, + "loss": 4.5432, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.6714442153045657, + "learning_rate": 0.002745, + "loss": 4.5048, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.5861682792098877, + "learning_rate": 0.002748, + "loss": 4.4768, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.7568015257717708, + "learning_rate": 0.002751, + "loss": 4.4985, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.67845085346177, + "learning_rate": 0.0027540000000000004, + "loss": 4.5185, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.6376249584521229, + "learning_rate": 0.0027570000000000003, + "loss": 4.4976, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.5953295490629925, + "learning_rate": 0.0027600000000000003, + "loss": 4.5104, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.5470113372090761, + "learning_rate": 0.0027630000000000003, + "loss": 4.4892, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.5586775976924615, + "learning_rate": 0.0027660000000000002, + "loss": 4.5007, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.5602976125001545, + "learning_rate": 0.002769, + "loss": 4.4767, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.5512257756150963, + "learning_rate": 0.002772, + "loss": 4.4992, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.6104014154605217, + "learning_rate": 0.002775, + "loss": 4.4804, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.627882987139063, + "learning_rate": 0.002778, + "loss": 4.4994, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.5637389395797978, + "learning_rate": 0.002781, + "loss": 4.4582, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.5793012970445044, + "learning_rate": 0.002784, + "loss": 4.505, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.5538600481661593, + "learning_rate": 0.0027870000000000004, + "loss": 4.4867, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.4915384406563099, + "learning_rate": 0.0027900000000000004, + "loss": 4.4652, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.5291025545236148, + "learning_rate": 0.0027930000000000003, + "loss": 4.4796, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 0.5189072536981689, + "learning_rate": 0.0027960000000000003, + "loss": 4.4881, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 0.4391692113712953, + "learning_rate": 0.0027990000000000003, + "loss": 4.4495, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.5067956527040228, + "learning_rate": 0.0028020000000000002, + "loss": 4.4811, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.5752082887405049, + "learning_rate": 0.002805, + "loss": 4.4893, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.8597610157431208, + "learning_rate": 0.002808, + "loss": 4.5012, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 1.0923006968336397, + "learning_rate": 0.002811, + "loss": 4.5462, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.8211742236965874, + "learning_rate": 0.002814, + "loss": 4.5052, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.699189062953733, + "learning_rate": 0.002817, + "loss": 4.4989, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 
0.6251265662758575, + "learning_rate": 0.00282, + "loss": 4.495, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.8614570213641495, + "learning_rate": 0.002823, + "loss": 4.4823, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 0.9710835908546839, + "learning_rate": 0.002826, + "loss": 4.5282, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.8255156288963836, + "learning_rate": 0.002829, + "loss": 4.5175, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.8529303551550831, + "learning_rate": 0.002832, + "loss": 4.5233, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.8805636588256177, + "learning_rate": 0.002835, + "loss": 4.5307, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.8815394682869104, + "learning_rate": 0.002838, + "loss": 4.545, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.9792537736888025, + "learning_rate": 0.0028409999999999998, + "loss": 4.5259, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 1.16480627354709, + "learning_rate": 0.0028439999999999997, + "loss": 4.5258, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.9311558570406092, + "learning_rate": 0.002847, + "loss": 4.5678, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.9151500308490546, + "learning_rate": 0.00285, + "loss": 4.5566, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.8788038193040383, + "learning_rate": 0.002853, + "loss": 4.539, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.6883343465241535, + "learning_rate": 0.002856, + "loss": 4.5492, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.7428316161957196, + "learning_rate": 0.002859, + "loss": 4.4912, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.6589075049765231, + "learning_rate": 0.002862, + "loss": 4.5478, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.6374906159512967, + "learning_rate": 0.002865, + "loss": 4.5305, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.7037593758453927, + "learning_rate": 0.002868, + "loss": 4.5123, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.7575837270625414, + "learning_rate": 0.002871, + "loss": 4.5364, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.8136537306721928, + "learning_rate": 0.002874, + "loss": 4.5221, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.7529303499273822, + "learning_rate": 0.002877, + "loss": 4.5134, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7723176518474248, + "learning_rate": 0.0028799999999999997, + "loss": 4.5128, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.6826701565045171, + "learning_rate": 0.002883, + "loss": 4.4802, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.6022941634910601, + "learning_rate": 0.002886, + "loss": 4.5086, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.5423609760386227, + "learning_rate": 0.002889, + "loss": 4.5025, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.5430502092899187, + "learning_rate": 0.002892, + "loss": 4.493, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.5046734063237629, + "learning_rate": 0.002895, + "loss": 4.4643, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.4625107724617308, + "learning_rate": 0.002898, + "loss": 4.4758, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.4403845150464526, + "learning_rate": 0.002901, + "loss": 4.5117, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.40739936008519134, + 
"learning_rate": 0.002904, + "loss": 4.4717, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.41620271754109195, + "learning_rate": 0.002907, + "loss": 4.5059, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.3884080596123629, + "learning_rate": 0.00291, + "loss": 4.4561, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.40963591049792675, + "learning_rate": 0.002913, + "loss": 4.4624, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.427860611994679, + "learning_rate": 0.002916, + "loss": 4.4693, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.6266830722962007, + "learning_rate": 0.002919, + "loss": 4.4689, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.7795241673951117, + "learning_rate": 0.002922, + "loss": 4.4734, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.7392088002205494, + "learning_rate": 0.002925, + "loss": 4.4915, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.6090919745791438, + "learning_rate": 0.002928, + "loss": 4.4688, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.6721667952819499, + "learning_rate": 0.002931, + "loss": 4.5114, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.7157395676490952, + "learning_rate": 0.002934, + "loss": 4.4828, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.6673604586868628, + "learning_rate": 0.002937, + "loss": 4.4724, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.6241827666303977, + "learning_rate": 0.00294, + "loss": 4.4641, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.6477735028588306, + "learning_rate": 0.002943, + "loss": 4.479, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.5903229373466107, + "learning_rate": 0.002946, + "loss": 4.4717, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.5367081021491611, + "learning_rate": 0.0029490000000000002, + "loss": 4.4684, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.55855952891459, + "learning_rate": 0.002952, + "loss": 4.455, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.6566800838357816, + "learning_rate": 0.002955, + "loss": 4.4789, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.7602203626787503, + "learning_rate": 0.002958, + "loss": 4.4461, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.7515002206199092, + "learning_rate": 0.002961, + "loss": 4.4656, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.7510028045825011, + "learning_rate": 0.002964, + "loss": 4.497, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.6686524266681442, + "learning_rate": 0.002967, + "loss": 4.438, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.7110181433203292, + "learning_rate": 0.00297, + "loss": 4.4759, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.9487700940206629, + "learning_rate": 0.002973, + "loss": 4.4909, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 1.0128664486589338, + "learning_rate": 0.002976, + "loss": 4.4936, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 0.776315556008045, + "learning_rate": 0.002979, + "loss": 4.4927, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 0.6954881784010936, + "learning_rate": 0.002982, + "loss": 4.4894, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.8343170542469196, + "learning_rate": 0.0029850000000000002, + "loss": 4.4966, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 0.8064256733311401, + "learning_rate": 0.002988, + "loss": 
4.4974, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.7600837252115415, + "learning_rate": 0.002991, + "loss": 4.4859, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.7413851219310601, + "learning_rate": 0.002994, + "loss": 4.4878, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.7789142980889687, + "learning_rate": 0.002997, + "loss": 4.4774, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.5973093879700552, + "learning_rate": 0.003, + "loss": 4.4903, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.9643642855424e+16, + "train_batch_size": 1024, + "trial_name": null, + "trial_params": null +}