{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996087515448103, "eval_steps": 50, "global_step": 2515, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00039745874812914924, "grad_norm": 4.6875, "learning_rate": 4.000000000000001e-06, "lm_loss": 13.1874, "loss": 34.0326, "mask_loss": 8.446, "step": 1, "topk_loss": 12.3992 }, { "epoch": 0.0007949174962582985, "grad_norm": 4.5, "learning_rate": 8.000000000000001e-06, "lm_loss": 13.1965, "loss": 34.0618, "mask_loss": 8.4655, "step": 2, "topk_loss": 12.3999 }, { "epoch": 0.0011923762443874478, "grad_norm": 4.375, "learning_rate": 1.2e-05, "lm_loss": 13.1916, "loss": 34.0373, "mask_loss": 8.4476, "step": 3, "topk_loss": 12.398 }, { "epoch": 0.001589834992516597, "grad_norm": 4.5, "learning_rate": 1.6000000000000003e-05, "lm_loss": 13.1915, "loss": 34.0283, "mask_loss": 8.4445, "step": 4, "topk_loss": 12.3922 }, { "epoch": 0.0019872937406457463, "grad_norm": 4.5625, "learning_rate": 2e-05, "lm_loss": 13.181, "loss": 33.9921, "mask_loss": 8.4261, "step": 5, "topk_loss": 12.385 }, { "epoch": 0.0023847524887748955, "grad_norm": 4.28125, "learning_rate": 2.4e-05, "lm_loss": 13.1677, "loss": 33.9502, "mask_loss": 8.4174, "step": 6, "topk_loss": 12.365 }, { "epoch": 0.0027822112369040447, "grad_norm": 4.46875, "learning_rate": 2.8000000000000003e-05, "lm_loss": 13.15, "loss": 33.8695, "mask_loss": 8.3807, "step": 7, "topk_loss": 12.3388 }, { "epoch": 0.003179669985033194, "grad_norm": 4.34375, "learning_rate": 3.2000000000000005e-05, "lm_loss": 13.1204, "loss": 33.7783, "mask_loss": 8.3465, "step": 8, "topk_loss": 12.3114 }, { "epoch": 0.003577128733162343, "grad_norm": 4.28125, "learning_rate": 3.6e-05, "lm_loss": 13.0919, "loss": 33.688, "mask_loss": 8.3141, "step": 9, "topk_loss": 12.2821 }, { "epoch": 0.003974587481291493, "grad_norm": 4.125, "learning_rate": 4e-05, "lm_loss": 13.0259, "loss": 33.4608, "mask_loss": 8.2243, "step": 10, "topk_loss": 12.2106 }, { "epoch": 0.004372046229420641, "grad_norm": 4.09375, "learning_rate": 4.4000000000000006e-05, "lm_loss": 12.9687, "loss": 33.24, "mask_loss": 8.1515, "step": 11, "topk_loss": 12.1198 }, { "epoch": 0.004769504977549791, "grad_norm": 3.96875, "learning_rate": 4.8e-05, "lm_loss": 12.8958, "loss": 33.001, "mask_loss": 8.0599, "step": 12, "topk_loss": 12.0454 }, { "epoch": 0.005166963725678941, "grad_norm": 3.859375, "learning_rate": 5.2000000000000004e-05, "lm_loss": 12.8306, "loss": 32.7889, "mask_loss": 7.9851, "step": 13, "topk_loss": 11.9732 }, { "epoch": 0.005564422473808089, "grad_norm": 3.9375, "learning_rate": 5.6000000000000006e-05, "lm_loss": 12.7439, "loss": 32.4743, "mask_loss": 7.8639, "step": 14, "topk_loss": 11.8666 }, { "epoch": 0.005961881221937239, "grad_norm": 3.828125, "learning_rate": 6e-05, "lm_loss": 12.663, "loss": 32.2566, "mask_loss": 7.7964, "step": 15, "topk_loss": 11.7973 }, { "epoch": 0.006359339970066388, "grad_norm": 3.828125, "learning_rate": 6.400000000000001e-05, "lm_loss": 12.6021, "loss": 32.0481, "mask_loss": 7.7306, "step": 16, "topk_loss": 11.7154 }, { "epoch": 0.006756798718195537, "grad_norm": 4.1875, "learning_rate": 6.800000000000001e-05, "lm_loss": 12.4821, "loss": 31.6167, "mask_loss": 7.5745, "step": 17, "topk_loss": 11.5601 }, { "epoch": 0.007154257466324686, "grad_norm": 4.46875, "learning_rate": 7.2e-05, "lm_loss": 12.3375, "loss": 31.0941, "mask_loss": 7.4022, "step": 18, "topk_loss": 11.3544 }, { "epoch": 0.007551716214453836, "grad_norm": 5.0, "learning_rate": 7.6e-05, "lm_loss": 12.1452, "loss": 30.3741, "mask_loss": 7.1468, "step": 19, "topk_loss": 11.0821 }, { "epoch": 0.007949174962582985, "grad_norm": 5.375, "learning_rate": 8e-05, "lm_loss": 11.9773, "loss": 29.7729, "mask_loss": 6.9488, "step": 20, "topk_loss": 10.8468 }, { "epoch": 0.008346633710712134, "grad_norm": 6.03125, "learning_rate": 8.4e-05, "lm_loss": 11.777, "loss": 28.9801, "mask_loss": 6.6822, "step": 21, "topk_loss": 10.5208 }, { "epoch": 0.008744092458841283, "grad_norm": 6.21875, "learning_rate": 8.800000000000001e-05, "lm_loss": 11.6339, "loss": 28.5436, "mask_loss": 6.5708, "step": 22, "topk_loss": 10.3388 }, { "epoch": 0.009141551206970433, "grad_norm": 6.5625, "learning_rate": 9.200000000000001e-05, "lm_loss": 11.478, "loss": 27.9254, "mask_loss": 6.3739, "step": 23, "topk_loss": 10.0736 }, { "epoch": 0.009539009955099582, "grad_norm": 7.28125, "learning_rate": 9.6e-05, "lm_loss": 11.2339, "loss": 26.8817, "mask_loss": 6.0258, "step": 24, "topk_loss": 9.622 }, { "epoch": 0.00993646870322873, "grad_norm": 7.3125, "learning_rate": 0.0001, "lm_loss": 11.0631, "loss": 26.26, "mask_loss": 5.8523, "step": 25, "topk_loss": 9.3446 }, { "epoch": 0.010333927451357881, "grad_norm": 7.6875, "learning_rate": 0.00010400000000000001, "lm_loss": 10.8143, "loss": 25.2251, "mask_loss": 5.5267, "step": 26, "topk_loss": 8.8841 }, { "epoch": 0.01073138619948703, "grad_norm": 7.75, "learning_rate": 0.00010800000000000001, "lm_loss": 10.5906, "loss": 24.3879, "mask_loss": 5.2855, "step": 27, "topk_loss": 8.5119 }, { "epoch": 0.011128844947616179, "grad_norm": 7.90625, "learning_rate": 0.00011200000000000001, "lm_loss": 10.3778, "loss": 23.3374, "mask_loss": 4.96, "step": 28, "topk_loss": 7.9996 }, { "epoch": 0.011526303695745328, "grad_norm": 7.53125, "learning_rate": 0.000116, "lm_loss": 10.1656, "loss": 22.4997, "mask_loss": 4.7425, "step": 29, "topk_loss": 7.5915 }, { "epoch": 0.011923762443874478, "grad_norm": 7.65625, "learning_rate": 0.00012, "lm_loss": 9.8681, "loss": 21.171, "mask_loss": 4.3273, "step": 30, "topk_loss": 6.9756 }, { "epoch": 0.012321221192003627, "grad_norm": 7.0625, "learning_rate": 0.000124, "lm_loss": 9.6834, "loss": 20.4646, "mask_loss": 4.1626, "step": 31, "topk_loss": 6.6186 }, { "epoch": 0.012718679940132776, "grad_norm": 6.65625, "learning_rate": 0.00012800000000000002, "lm_loss": 9.5271, "loss": 19.8425, "mask_loss": 4.0218, "step": 32, "topk_loss": 6.2936 }, { "epoch": 0.013116138688261926, "grad_norm": 7.59375, "learning_rate": 0.000132, "lm_loss": 9.1918, "loss": 17.9938, "mask_loss": 3.4106, "step": 33, "topk_loss": 5.3914 }, { "epoch": 0.013513597436391075, "grad_norm": 7.4375, "learning_rate": 0.00013600000000000003, "lm_loss": 9.1679, "loss": 18.1523, "mask_loss": 3.5831, "step": 34, "topk_loss": 5.4013 }, { "epoch": 0.013911056184520224, "grad_norm": 7.46875, "learning_rate": 0.00014, "lm_loss": 8.9109, "loss": 16.9051, "mask_loss": 3.196, "step": 35, "topk_loss": 4.7982 }, { "epoch": 0.014308514932649372, "grad_norm": 7.1875, "learning_rate": 0.000144, "lm_loss": 8.6916, "loss": 16.0099, "mask_loss": 2.9918, "step": 36, "topk_loss": 4.3265 }, { "epoch": 0.014705973680778523, "grad_norm": 6.0, "learning_rate": 0.000148, "lm_loss": 8.6587, "loss": 15.9112, "mask_loss": 3.0149, "step": 37, "topk_loss": 4.2376 }, { "epoch": 0.015103432428907672, "grad_norm": 5.75, "learning_rate": 0.000152, "lm_loss": 8.5764, "loss": 15.4146, "mask_loss": 2.9393, "step": 38, "topk_loss": 3.8989 }, { "epoch": 0.01550089117703682, "grad_norm": 5.78125, "learning_rate": 0.00015600000000000002, "lm_loss": 8.5832, "loss": 15.6764, "mask_loss": 3.089, "step": 39, "topk_loss": 4.0043 }, { "epoch": 0.01589834992516597, "grad_norm": 5.78125, "learning_rate": 0.00016, "lm_loss": 8.4336, "loss": 14.6859, "mask_loss": 2.7272, "step": 40, "topk_loss": 3.525 }, { "epoch": 0.01629580867329512, "grad_norm": 6.0, "learning_rate": 0.000164, "lm_loss": 8.4196, "loss": 14.891, "mask_loss": 2.8416, "step": 41, "topk_loss": 3.6298 }, { "epoch": 0.016693267421424268, "grad_norm": 5.90625, "learning_rate": 0.000168, "lm_loss": 8.2859, "loss": 13.974, "mask_loss": 2.506, "step": 42, "topk_loss": 3.1822 }, { "epoch": 0.017090726169553417, "grad_norm": 6.4375, "learning_rate": 0.000172, "lm_loss": 8.3544, "loss": 14.6417, "mask_loss": 2.8646, "step": 43, "topk_loss": 3.4227 }, { "epoch": 0.017488184917682566, "grad_norm": 6.28125, "learning_rate": 0.00017600000000000002, "lm_loss": 8.1956, "loss": 14.0252, "mask_loss": 2.6556, "step": 44, "topk_loss": 3.174 }, { "epoch": 0.017885643665811718, "grad_norm": 5.9375, "learning_rate": 0.00018, "lm_loss": 8.0939, "loss": 12.9191, "mask_loss": 2.2004, "step": 45, "topk_loss": 2.6249 }, { "epoch": 0.018283102413940867, "grad_norm": 6.03125, "learning_rate": 0.00018400000000000003, "lm_loss": 8.0049, "loss": 13.2306, "mask_loss": 2.4107, "step": 46, "topk_loss": 2.815 }, { "epoch": 0.018680561162070015, "grad_norm": 5.34375, "learning_rate": 0.000188, "lm_loss": 7.8148, "loss": 12.1767, "mask_loss": 2.0091, "step": 47, "topk_loss": 2.3528 }, { "epoch": 0.019078019910199164, "grad_norm": 5.40625, "learning_rate": 0.000192, "lm_loss": 7.7777, "loss": 12.071, "mask_loss": 1.9772, "step": 48, "topk_loss": 2.3161 }, { "epoch": 0.019475478658328313, "grad_norm": 5.28125, "learning_rate": 0.000196, "lm_loss": 7.7451, "loss": 12.0437, "mask_loss": 1.9371, "step": 49, "topk_loss": 2.3615 }, { "epoch": 0.01987293740645746, "grad_norm": 5.6875, "learning_rate": 0.0002, "lm_loss": 7.7121, "loss": 12.1355, "mask_loss": 2.017, "step": 50, "topk_loss": 2.4065 }, { "epoch": 0.01987293740645746, "eval_lm_loss": 320.9280700683594, "eval_loss": 324.5454406738281, "eval_mask_hit_rate": 0.18616949021816254, "eval_mask_loss": 1.6387584209442139, "eval_mask_top_10_hit_rate": 0.5981898903846741, "eval_mask_top_1_hit_rate": 0.719202995300293, "eval_mask_top_20_hit_rate": 0.5433949828147888, "eval_mask_top_5_hit_rate": 0.6420404314994812, "eval_runtime": 147.9637, "eval_samples_per_second": 13.841, "eval_steps_per_second": 6.921, "eval_token_accuracy": 0.17364394664764404, "eval_top_k_diff": 15844.38671875, "eval_topk_loss": 1.9786218404769897, "step": 50 }, { "epoch": 0.02027039615458661, "grad_norm": 5.375, "learning_rate": 0.00019999991878507574, "lm_loss": 7.4662, "loss": 11.3416, "mask_loss": 1.7451, "step": 51, "topk_loss": 2.1303 }, { "epoch": 0.020667854902715763, "grad_norm": 4.96875, "learning_rate": 0.00019999967514043482, "lm_loss": 7.3426, "loss": 11.0435, "mask_loss": 1.6574, "step": 52, "topk_loss": 2.0435 }, { "epoch": 0.02106531365084491, "grad_norm": 4.875, "learning_rate": 0.000199999269066473, "lm_loss": 7.27, "loss": 10.9138, "mask_loss": 1.6493, "step": 53, "topk_loss": 1.9945 }, { "epoch": 0.02146277239897406, "grad_norm": 3.578125, "learning_rate": 0.00019999870056384994, "lm_loss": 6.9719, "loss": 9.7904, "mask_loss": 1.2467, "step": 54, "topk_loss": 1.5717 }, { "epoch": 0.02186023114710321, "grad_norm": 4.09375, "learning_rate": 0.00019999796963348897, "lm_loss": 7.0271, "loss": 10.1004, "mask_loss": 1.3678, "step": 55, "topk_loss": 1.7055 }, { "epoch": 0.022257689895232358, "grad_norm": 3.578125, "learning_rate": 0.00019999707627657736, "lm_loss": 6.9151, "loss": 9.7666, "mask_loss": 1.2558, "step": 56, "topk_loss": 1.5958 }, { "epoch": 0.022655148643361506, "grad_norm": 3.546875, "learning_rate": 0.0001999960204945662, "lm_loss": 6.85, "loss": 9.5896, "mask_loss": 1.1859, "step": 57, "topk_loss": 1.5536 }, { "epoch": 0.023052607391490655, "grad_norm": 3.0625, "learning_rate": 0.0001999948022891704, "lm_loss": 6.6166, "loss": 9.1052, "mask_loss": 1.1085, "step": 58, "topk_loss": 1.3801 }, { "epoch": 0.023450066139619807, "grad_norm": 2.65625, "learning_rate": 0.00019999342166236868, "lm_loss": 6.6206, "loss": 9.1253, "mask_loss": 1.1119, "step": 59, "topk_loss": 1.3928 }, { "epoch": 0.023847524887748956, "grad_norm": 2.375, "learning_rate": 0.00019999187861640362, "lm_loss": 6.6699, "loss": 8.8312, "mask_loss": 0.9332, "step": 60, "topk_loss": 1.2282 }, { "epoch": 0.024244983635878105, "grad_norm": 1.8515625, "learning_rate": 0.00019999017315378153, "lm_loss": 6.5315, "loss": 8.4229, "mask_loss": 0.8263, "step": 61, "topk_loss": 1.0651 }, { "epoch": 0.024642442384007254, "grad_norm": 1.75, "learning_rate": 0.00019998830527727263, "lm_loss": 6.5239, "loss": 8.435, "mask_loss": 0.8378, "step": 62, "topk_loss": 1.0733 }, { "epoch": 0.025039901132136402, "grad_norm": 1.5859375, "learning_rate": 0.0001999862749899109, "lm_loss": 6.2827, "loss": 7.9781, "mask_loss": 0.7475, "step": 63, "topk_loss": 0.9479 }, { "epoch": 0.02543735988026555, "grad_norm": 1.5, "learning_rate": 0.0001999840822949941, "lm_loss": 6.351, "loss": 7.812, "mask_loss": 0.6495, "step": 64, "topk_loss": 0.8115 }, { "epoch": 0.0258348186283947, "grad_norm": 1.4453125, "learning_rate": 0.0001999817271960839, "lm_loss": 6.3347, "loss": 7.8494, "mask_loss": 0.6792, "step": 65, "topk_loss": 0.8355 }, { "epoch": 0.026232277376523852, "grad_norm": 1.546875, "learning_rate": 0.0001999792096970056, "lm_loss": 6.231, "loss": 7.4894, "mask_loss": 0.572, "step": 66, "topk_loss": 0.6865 }, { "epoch": 0.026629736124653, "grad_norm": 1.359375, "learning_rate": 0.00019997652980184843, "lm_loss": 6.2884, "loss": 7.6097, "mask_loss": 0.6205, "step": 67, "topk_loss": 0.7008 }, { "epoch": 0.02702719487278215, "grad_norm": 1.359375, "learning_rate": 0.00019997368751496528, "lm_loss": 6.0524, "loss": 7.2628, "mask_loss": 0.5931, "step": 68, "topk_loss": 0.6173 }, { "epoch": 0.0274246536209113, "grad_norm": 1.359375, "learning_rate": 0.00019997068284097295, "lm_loss": 6.1849, "loss": 7.393, "mask_loss": 0.5898, "step": 69, "topk_loss": 0.6183 }, { "epoch": 0.027822112369040447, "grad_norm": 1.1796875, "learning_rate": 0.00019996751578475186, "lm_loss": 6.0249, "loss": 7.0865, "mask_loss": 0.5251, "step": 70, "topk_loss": 0.5364 }, { "epoch": 0.028219571117169596, "grad_norm": 1.078125, "learning_rate": 0.0001999641863514463, "lm_loss": 5.9085, "loss": 6.8733, "mask_loss": 0.5183, "step": 71, "topk_loss": 0.4465 }, { "epoch": 0.028617029865298745, "grad_norm": 0.98046875, "learning_rate": 0.0001999606945464642, "lm_loss": 5.8834, "loss": 6.853, "mask_loss": 0.5229, "step": 72, "topk_loss": 0.4467 }, { "epoch": 0.029014488613427897, "grad_norm": 0.8828125, "learning_rate": 0.0001999570403754774, "lm_loss": 5.796, "loss": 6.6022, "mask_loss": 0.4409, "step": 73, "topk_loss": 0.3653 }, { "epoch": 0.029411947361557046, "grad_norm": 0.8671875, "learning_rate": 0.00019995322384442127, "lm_loss": 5.7359, "loss": 6.5091, "mask_loss": 0.4397, "step": 74, "topk_loss": 0.3336 }, { "epoch": 0.029809406109686194, "grad_norm": 0.921875, "learning_rate": 0.00019994924495949504, "lm_loss": 5.6994, "loss": 6.5209, "mask_loss": 0.46, "step": 75, "topk_loss": 0.3615 }, { "epoch": 0.030206864857815343, "grad_norm": 1.0390625, "learning_rate": 0.00019994510372716158, "lm_loss": 5.5887, "loss": 6.4227, "mask_loss": 0.4811, "step": 76, "topk_loss": 0.353 }, { "epoch": 0.030604323605944492, "grad_norm": 0.875, "learning_rate": 0.0001999408001541475, "lm_loss": 5.472, "loss": 6.2222, "mask_loss": 0.445, "step": 77, "topk_loss": 0.3052 }, { "epoch": 0.03100178235407364, "grad_norm": 0.80859375, "learning_rate": 0.00019993633424744307, "lm_loss": 5.5286, "loss": 6.2379, "mask_loss": 0.4314, "step": 78, "topk_loss": 0.2779 }, { "epoch": 0.03139924110220279, "grad_norm": 0.78515625, "learning_rate": 0.0001999317060143023, "lm_loss": 5.5386, "loss": 6.2345, "mask_loss": 0.4308, "step": 79, "topk_loss": 0.2652 }, { "epoch": 0.03179669985033194, "grad_norm": 0.7421875, "learning_rate": 0.00019992691546224278, "lm_loss": 5.3942, "loss": 6.0123, "mask_loss": 0.3858, "step": 80, "topk_loss": 0.2323 }, { "epoch": 0.03219415859846109, "grad_norm": 0.6171875, "learning_rate": 0.00019992196259904584, "lm_loss": 5.4596, "loss": 5.9972, "mask_loss": 0.3486, "step": 81, "topk_loss": 0.189 }, { "epoch": 0.03259161734659024, "grad_norm": 0.70703125, "learning_rate": 0.00019991684743275636, "lm_loss": 5.3695, "loss": 5.9697, "mask_loss": 0.3813, "step": 82, "topk_loss": 0.219 }, { "epoch": 0.03298907609471939, "grad_norm": 0.60546875, "learning_rate": 0.00019991156997168293, "lm_loss": 5.2743, "loss": 5.8273, "mask_loss": 0.3639, "step": 83, "topk_loss": 0.1891 }, { "epoch": 0.033386534842848536, "grad_norm": 0.953125, "learning_rate": 0.0001999061302243977, "lm_loss": 5.3097, "loss": 6.0197, "mask_loss": 0.4563, "step": 84, "topk_loss": 0.2538 }, { "epoch": 0.03378399359097769, "grad_norm": 0.6953125, "learning_rate": 0.00019990052819973642, "lm_loss": 5.1893, "loss": 5.7923, "mask_loss": 0.3903, "step": 85, "topk_loss": 0.2126 }, { "epoch": 0.034181452339106834, "grad_norm": 0.5078125, "learning_rate": 0.00019989476390679854, "lm_loss": 5.1706, "loss": 5.6086, "mask_loss": 0.3068, "step": 86, "topk_loss": 0.1312 }, { "epoch": 0.034578911087235986, "grad_norm": 0.6171875, "learning_rate": 0.0001998888373549469, "lm_loss": 5.0552, "loss": 5.563, "mask_loss": 0.3447, "step": 87, "topk_loss": 0.1631 }, { "epoch": 0.03497636983536513, "grad_norm": 0.56640625, "learning_rate": 0.00019988274855380804, "lm_loss": 5.1613, "loss": 5.6357, "mask_loss": 0.32, "step": 88, "topk_loss": 0.1545 }, { "epoch": 0.035373828583494284, "grad_norm": 0.72265625, "learning_rate": 0.00019987649751327196, "lm_loss": 5.0626, "loss": 5.607, "mask_loss": 0.3565, "step": 89, "topk_loss": 0.1879 }, { "epoch": 0.035771287331623436, "grad_norm": 0.478515625, "learning_rate": 0.00019987008424349225, "lm_loss": 4.9735, "loss": 5.4065, "mask_loss": 0.3099, "step": 90, "topk_loss": 0.1231 }, { "epoch": 0.03616874607975258, "grad_norm": 0.72265625, "learning_rate": 0.00019986350875488595, "lm_loss": 4.9142, "loss": 5.4443, "mask_loss": 0.3508, "step": 91, "topk_loss": 0.1793 }, { "epoch": 0.03656620482788173, "grad_norm": 0.6953125, "learning_rate": 0.00019985677105813362, "lm_loss": 4.9241, "loss": 5.4404, "mask_loss": 0.3437, "step": 92, "topk_loss": 0.1725 }, { "epoch": 0.03696366357601088, "grad_norm": 0.5078125, "learning_rate": 0.00019984987116417928, "lm_loss": 4.9039, "loss": 5.3879, "mask_loss": 0.3354, "step": 93, "topk_loss": 0.1485 }, { "epoch": 0.03736112232414003, "grad_norm": 0.41796875, "learning_rate": 0.00019984280908423043, "lm_loss": 4.7759, "loss": 5.1349, "mask_loss": 0.2757, "step": 94, "topk_loss": 0.0832 }, { "epoch": 0.037758581072269176, "grad_norm": 0.453125, "learning_rate": 0.00019983558482975799, "lm_loss": 4.8718, "loss": 5.2221, "mask_loss": 0.2656, "step": 95, "topk_loss": 0.0848 }, { "epoch": 0.03815603982039833, "grad_norm": 0.59765625, "learning_rate": 0.00019982819841249632, "lm_loss": 4.7451, "loss": 5.2327, "mask_loss": 0.3364, "step": 96, "topk_loss": 0.1513 }, { "epoch": 0.03855349856852748, "grad_norm": 0.625, "learning_rate": 0.00019982064984444317, "lm_loss": 4.7885, "loss": 5.2653, "mask_loss": 0.3325, "step": 97, "topk_loss": 0.1443 }, { "epoch": 0.038950957316656626, "grad_norm": 0.48046875, "learning_rate": 0.00019981293913785963, "lm_loss": 4.7141, "loss": 5.13, "mask_loss": 0.3013, "step": 98, "topk_loss": 0.1147 }, { "epoch": 0.03934841606478578, "grad_norm": 0.40625, "learning_rate": 0.00019980506630527022, "lm_loss": 4.5939, "loss": 4.974, "mask_loss": 0.2834, "step": 99, "topk_loss": 0.0967 }, { "epoch": 0.03974587481291492, "grad_norm": 0.380859375, "learning_rate": 0.00019979703135946278, "lm_loss": 4.7149, "loss": 5.0904, "mask_loss": 0.2803, "step": 100, "topk_loss": 0.0952 }, { "epoch": 0.03974587481291492, "eval_lm_loss": 706.3309326171875, "eval_loss": 706.7132568359375, "eval_mask_hit_rate": 0.3013326823711395, "eval_mask_loss": 0.27427732944488525, "eval_mask_top_10_hit_rate": 0.7967365980148315, "eval_mask_top_1_hit_rate": 0.8906242847442627, "eval_mask_top_20_hit_rate": 0.7520042657852173, "eval_mask_top_5_hit_rate": 0.8314727544784546, "eval_runtime": 147.1012, "eval_samples_per_second": 13.922, "eval_steps_per_second": 6.961, "eval_token_accuracy": 0.36604347825050354, "eval_top_k_diff": -257.53240966796875, "eval_topk_loss": 0.10801860690116882, "step": 100 }, { "epoch": 0.040143333561044076, "grad_norm": 0.55859375, "learning_rate": 0.00019978883431348845, "lm_loss": 4.6222, "loss": 5.0187, "mask_loss": 0.2857, "step": 101, "topk_loss": 0.1108 }, { "epoch": 0.04054079230917322, "grad_norm": 0.4453125, "learning_rate": 0.00019978047518066165, "lm_loss": 4.5105, "loss": 4.9176, "mask_loss": 0.2925, "step": 102, "topk_loss": 0.1146 }, { "epoch": 0.04093825105730237, "grad_norm": 0.453125, "learning_rate": 0.0001997719539745601, "lm_loss": 4.577, "loss": 4.9669, "mask_loss": 0.2823, "step": 103, "topk_loss": 0.1076 }, { "epoch": 0.041335709805431525, "grad_norm": 0.443359375, "learning_rate": 0.0001997632707090249, "lm_loss": 4.5435, "loss": 4.9416, "mask_loss": 0.2884, "step": 104, "topk_loss": 0.1098 }, { "epoch": 0.04173316855356067, "grad_norm": 0.373046875, "learning_rate": 0.00019975442539816012, "lm_loss": 4.4278, "loss": 4.7529, "mask_loss": 0.2572, "step": 105, "topk_loss": 0.0679 }, { "epoch": 0.04213062730168982, "grad_norm": 0.380859375, "learning_rate": 0.00019974541805633324, "lm_loss": 4.4459, "loss": 4.8022, "mask_loss": 0.2692, "step": 106, "topk_loss": 0.0871 }, { "epoch": 0.04252808604981897, "grad_norm": 0.33984375, "learning_rate": 0.0001997362486981749, "lm_loss": 4.4027, "loss": 4.7272, "mask_loss": 0.2551, "step": 107, "topk_loss": 0.0694 }, { "epoch": 0.04292554479794812, "grad_norm": 0.421875, "learning_rate": 0.00019972691733857883, "lm_loss": 4.3911, "loss": 4.756, "mask_loss": 0.2738, "step": 108, "topk_loss": 0.091 }, { "epoch": 0.043323003546077266, "grad_norm": 0.38671875, "learning_rate": 0.00019971742399270195, "lm_loss": 4.3224, "loss": 4.6388, "mask_loss": 0.251, "step": 109, "topk_loss": 0.0655 }, { "epoch": 0.04372046229420642, "grad_norm": 0.48046875, "learning_rate": 0.0001997077686759643, "lm_loss": 4.3758, "loss": 4.7341, "mask_loss": 0.2682, "step": 110, "topk_loss": 0.0901 }, { "epoch": 0.04411792104233557, "grad_norm": 0.3125, "learning_rate": 0.000199697951404049, "lm_loss": 4.2691, "loss": 4.5957, "mask_loss": 0.2554, "step": 111, "topk_loss": 0.0712 }, { "epoch": 0.044515379790464715, "grad_norm": 0.3671875, "learning_rate": 0.00019968797219290226, "lm_loss": 4.1656, "loss": 4.4926, "mask_loss": 0.2545, "step": 112, "topk_loss": 0.0725 }, { "epoch": 0.04491283853859387, "grad_norm": 0.50390625, "learning_rate": 0.00019967783105873324, "lm_loss": 4.275, "loss": 4.6403, "mask_loss": 0.2683, "step": 113, "topk_loss": 0.097 }, { "epoch": 0.04531029728672301, "grad_norm": 0.333984375, "learning_rate": 0.00019966752801801416, "lm_loss": 4.2514, "loss": 4.579, "mask_loss": 0.2551, "step": 114, "topk_loss": 0.0725 }, { "epoch": 0.045707756034852165, "grad_norm": 0.408203125, "learning_rate": 0.00019965706308748028, "lm_loss": 4.3101, "loss": 4.6724, "mask_loss": 0.2659, "step": 115, "topk_loss": 0.0964 }, { "epoch": 0.04610521478298131, "grad_norm": 0.396484375, "learning_rate": 0.0001996464362841298, "lm_loss": 4.2544, "loss": 4.5976, "mask_loss": 0.2575, "step": 116, "topk_loss": 0.0857 }, { "epoch": 0.04650267353111046, "grad_norm": 0.3203125, "learning_rate": 0.00019963564762522372, "lm_loss": 4.166, "loss": 4.4859, "mask_loss": 0.2479, "step": 117, "topk_loss": 0.072 }, { "epoch": 0.046900132279239615, "grad_norm": 0.328125, "learning_rate": 0.00019962469712828614, "lm_loss": 4.1945, "loss": 4.5042, "mask_loss": 0.2447, "step": 118, "topk_loss": 0.0651 }, { "epoch": 0.04729759102736876, "grad_norm": 0.3515625, "learning_rate": 0.00019961358481110385, "lm_loss": 4.2637, "loss": 4.5898, "mask_loss": 0.2511, "step": 119, "topk_loss": 0.0751 }, { "epoch": 0.04769504977549791, "grad_norm": 0.333984375, "learning_rate": 0.0001996023106917267, "lm_loss": 4.1716, "loss": 4.4768, "mask_loss": 0.2403, "step": 120, "topk_loss": 0.065 }, { "epoch": 0.04809250852362706, "grad_norm": 0.29296875, "learning_rate": 0.00019959087478846707, "lm_loss": 4.1371, "loss": 4.4277, "mask_loss": 0.2343, "step": 121, "topk_loss": 0.0564 }, { "epoch": 0.04848996727175621, "grad_norm": 0.478515625, "learning_rate": 0.00019957927711990035, "lm_loss": 4.1211, "loss": 4.563, "mask_loss": 0.3337, "step": 122, "topk_loss": 0.1081 }, { "epoch": 0.048887426019885355, "grad_norm": 0.322265625, "learning_rate": 0.00019956751770486462, "lm_loss": 4.0737, "loss": 4.382, "mask_loss": 0.2407, "step": 123, "topk_loss": 0.0676 }, { "epoch": 0.04928488476801451, "grad_norm": 0.287109375, "learning_rate": 0.00019955559656246067, "lm_loss": 3.9877, "loss": 4.2723, "mask_loss": 0.2326, "step": 124, "topk_loss": 0.052 }, { "epoch": 0.04968234351614366, "grad_norm": 0.341796875, "learning_rate": 0.00019954351371205203, "lm_loss": 3.9643, "loss": 4.2797, "mask_loss": 0.2451, "step": 125, "topk_loss": 0.0702 }, { "epoch": 0.050079802264272805, "grad_norm": 0.306640625, "learning_rate": 0.00019953126917326478, "lm_loss": 4.0202, "loss": 4.3116, "mask_loss": 0.2359, "step": 126, "topk_loss": 0.0555 }, { "epoch": 0.05047726101240196, "grad_norm": 0.326171875, "learning_rate": 0.0001995188629659878, "lm_loss": 4.0063, "loss": 4.3179, "mask_loss": 0.245, "step": 127, "topk_loss": 0.0665 }, { "epoch": 0.0508747197605311, "grad_norm": 0.359375, "learning_rate": 0.00019950629511037237, "lm_loss": 3.9746, "loss": 4.2866, "mask_loss": 0.2411, "step": 128, "topk_loss": 0.0708 }, { "epoch": 0.051272178508660254, "grad_norm": 0.310546875, "learning_rate": 0.00019949356562683256, "lm_loss": 3.9167, "loss": 4.2184, "mask_loss": 0.2379, "step": 129, "topk_loss": 0.0637 }, { "epoch": 0.0516696372567894, "grad_norm": 0.30078125, "learning_rate": 0.00019948067453604473, "lm_loss": 3.9231, "loss": 4.2166, "mask_loss": 0.2347, "step": 130, "topk_loss": 0.0588 }, { "epoch": 0.05206709600491855, "grad_norm": 0.310546875, "learning_rate": 0.00019946762185894793, "lm_loss": 3.835, "loss": 4.1341, "mask_loss": 0.2372, "step": 131, "topk_loss": 0.062 }, { "epoch": 0.052464554753047704, "grad_norm": 0.3125, "learning_rate": 0.00019945440761674359, "lm_loss": 3.9297, "loss": 4.228, "mask_loss": 0.2332, "step": 132, "topk_loss": 0.0652 }, { "epoch": 0.05286201350117685, "grad_norm": 0.3125, "learning_rate": 0.00019944103183089558, "lm_loss": 3.9599, "loss": 4.2613, "mask_loss": 0.2367, "step": 133, "topk_loss": 0.0647 }, { "epoch": 0.053259472249306, "grad_norm": 0.2734375, "learning_rate": 0.00019942749452313017, "lm_loss": 3.8308, "loss": 4.1095, "mask_loss": 0.2253, "step": 134, "topk_loss": 0.0534 }, { "epoch": 0.05365693099743515, "grad_norm": 0.3046875, "learning_rate": 0.00019941379571543596, "lm_loss": 3.8412, "loss": 4.1291, "mask_loss": 0.2316, "step": 135, "topk_loss": 0.0563 }, { "epoch": 0.0540543897455643, "grad_norm": 0.294921875, "learning_rate": 0.00019939993543006395, "lm_loss": 3.8049, "loss": 4.0803, "mask_loss": 0.2271, "step": 136, "topk_loss": 0.0483 }, { "epoch": 0.054451848493693444, "grad_norm": 0.3984375, "learning_rate": 0.0001993859136895274, "lm_loss": 3.8622, "loss": 4.1821, "mask_loss": 0.2446, "step": 137, "topk_loss": 0.0753 }, { "epoch": 0.0548493072418226, "grad_norm": 0.2890625, "learning_rate": 0.00019937173051660172, "lm_loss": 3.8942, "loss": 4.1659, "mask_loss": 0.2219, "step": 138, "topk_loss": 0.0498 }, { "epoch": 0.05524676598995175, "grad_norm": 0.283203125, "learning_rate": 0.00019935738593432464, "lm_loss": 3.7593, "loss": 4.0358, "mask_loss": 0.2235, "step": 139, "topk_loss": 0.053 }, { "epoch": 0.055644224738080894, "grad_norm": 0.2890625, "learning_rate": 0.00019934287996599607, "lm_loss": 3.775, "loss": 4.0379, "mask_loss": 0.2182, "step": 140, "topk_loss": 0.0447 }, { "epoch": 0.056041683486210046, "grad_norm": 0.279296875, "learning_rate": 0.00019932821263517805, "lm_loss": 3.7231, "loss": 3.9992, "mask_loss": 0.2219, "step": 141, "topk_loss": 0.0542 }, { "epoch": 0.05643914223433919, "grad_norm": 0.28125, "learning_rate": 0.00019931338396569467, "lm_loss": 3.7311, "loss": 3.9954, "mask_loss": 0.2175, "step": 142, "topk_loss": 0.0469 }, { "epoch": 0.056836600982468344, "grad_norm": 0.3203125, "learning_rate": 0.00019929839398163214, "lm_loss": 3.6278, "loss": 3.9166, "mask_loss": 0.2331, "step": 143, "topk_loss": 0.0557 }, { "epoch": 0.05723405973059749, "grad_norm": 0.314453125, "learning_rate": 0.00019928324270733862, "lm_loss": 3.6726, "loss": 3.9472, "mask_loss": 0.2248, "step": 144, "topk_loss": 0.0499 }, { "epoch": 0.05763151847872664, "grad_norm": 0.2890625, "learning_rate": 0.00019926793016742435, "lm_loss": 3.6402, "loss": 3.9064, "mask_loss": 0.2184, "step": 145, "topk_loss": 0.0478 }, { "epoch": 0.058028977226855794, "grad_norm": 0.283203125, "learning_rate": 0.00019925245638676144, "lm_loss": 3.6263, "loss": 3.8756, "mask_loss": 0.2114, "step": 146, "topk_loss": 0.0378 }, { "epoch": 0.05842643597498494, "grad_norm": 0.310546875, "learning_rate": 0.00019923682139048396, "lm_loss": 3.61, "loss": 3.8891, "mask_loss": 0.2262, "step": 147, "topk_loss": 0.0528 }, { "epoch": 0.05882389472311409, "grad_norm": 0.33984375, "learning_rate": 0.00019922102520398776, "lm_loss": 3.6744, "loss": 3.9671, "mask_loss": 0.2268, "step": 148, "topk_loss": 0.066 }, { "epoch": 0.059221353471243236, "grad_norm": 0.359375, "learning_rate": 0.0001992050678529306, "lm_loss": 3.6987, "loss": 4.0077, "mask_loss": 0.2327, "step": 149, "topk_loss": 0.0763 }, { "epoch": 0.05961881221937239, "grad_norm": 0.30859375, "learning_rate": 0.00019918894936323195, "lm_loss": 3.6849, "loss": 3.9491, "mask_loss": 0.2151, "step": 150, "topk_loss": 0.0491 }, { "epoch": 0.05961881221937239, "eval_lm_loss": 722.8411254882812, "eval_loss": 723.1029663085938, "eval_mask_hit_rate": 0.3664598762989044, "eval_mask_loss": 0.20979231595993042, "eval_mask_top_10_hit_rate": 0.8763496279716492, "eval_mask_top_1_hit_rate": 0.9443230628967285, "eval_mask_top_20_hit_rate": 0.8418170809745789, "eval_mask_top_5_hit_rate": 0.9021064639091492, "eval_runtime": 144.2832, "eval_samples_per_second": 14.194, "eval_steps_per_second": 7.097, "eval_token_accuracy": 0.44723033905029297, "eval_top_k_diff": -527.2366943359375, "eval_topk_loss": 0.05206891894340515, "step": 150 }, { "epoch": 0.060016270967501534, "grad_norm": 0.302734375, "learning_rate": 0.00019917266976107308, "lm_loss": 3.6228, "loss": 3.8777, "mask_loss": 0.2122, "step": 151, "topk_loss": 0.0426 }, { "epoch": 0.060413729715630686, "grad_norm": 0.275390625, "learning_rate": 0.00019915622907289694, "lm_loss": 3.5653, "loss": 3.8254, "mask_loss": 0.2136, "step": 152, "topk_loss": 0.0465 }, { "epoch": 0.06081118846375984, "grad_norm": 0.322265625, "learning_rate": 0.00019913962732540807, "lm_loss": 3.5934, "loss": 3.8731, "mask_loss": 0.2234, "step": 153, "topk_loss": 0.0563 }, { "epoch": 0.061208647211888983, "grad_norm": 0.28125, "learning_rate": 0.00019912286454557267, "lm_loss": 3.5041, "loss": 3.7657, "mask_loss": 0.216, "step": 154, "topk_loss": 0.0455 }, { "epoch": 0.061606105960018136, "grad_norm": 0.263671875, "learning_rate": 0.00019910594076061853, "lm_loss": 3.5847, "loss": 3.8352, "mask_loss": 0.2104, "step": 155, "topk_loss": 0.0401 }, { "epoch": 0.06200356470814728, "grad_norm": 0.248046875, "learning_rate": 0.0001990888559980349, "lm_loss": 3.4814, "loss": 3.7271, "mask_loss": 0.2087, "step": 156, "topk_loss": 0.037 }, { "epoch": 0.06240102345627643, "grad_norm": 0.287109375, "learning_rate": 0.00019907161028557253, "lm_loss": 3.4867, "loss": 3.7611, "mask_loss": 0.2215, "step": 157, "topk_loss": 0.0529 }, { "epoch": 0.06279848220440558, "grad_norm": 0.298828125, "learning_rate": 0.00019905420365124362, "lm_loss": 3.4163, "loss": 3.6892, "mask_loss": 0.2213, "step": 158, "topk_loss": 0.0516 }, { "epoch": 0.06319594095253474, "grad_norm": 0.3046875, "learning_rate": 0.00019903663612332175, "lm_loss": 3.495, "loss": 3.7624, "mask_loss": 0.2175, "step": 159, "topk_loss": 0.0499 }, { "epoch": 0.06359339970066388, "grad_norm": 0.26953125, "learning_rate": 0.0001990189077303418, "lm_loss": 3.4781, "loss": 3.7132, "mask_loss": 0.2029, "step": 160, "topk_loss": 0.0321 }, { "epoch": 0.06399085844879303, "grad_norm": 0.251953125, "learning_rate": 0.00019900101850109999, "lm_loss": 3.4428, "loss": 3.6969, "mask_loss": 0.2104, "step": 161, "topk_loss": 0.0436 }, { "epoch": 0.06438831719692217, "grad_norm": 0.275390625, "learning_rate": 0.00019898296846465377, "lm_loss": 3.3954, "loss": 3.6293, "mask_loss": 0.2024, "step": 162, "topk_loss": 0.0315 }, { "epoch": 0.06478577594505133, "grad_norm": 0.251953125, "learning_rate": 0.00019896475765032175, "lm_loss": 3.5172, "loss": 3.7485, "mask_loss": 0.1987, "step": 163, "topk_loss": 0.0325 }, { "epoch": 0.06518323469318048, "grad_norm": 0.27734375, "learning_rate": 0.0001989463860876838, "lm_loss": 3.4891, "loss": 3.7279, "mask_loss": 0.2038, "step": 164, "topk_loss": 0.035 }, { "epoch": 0.06558069344130962, "grad_norm": 0.263671875, "learning_rate": 0.00019892785380658078, "lm_loss": 3.3805, "loss": 3.6412, "mask_loss": 0.2121, "step": 165, "topk_loss": 0.0487 }, { "epoch": 0.06597815218943878, "grad_norm": 0.27734375, "learning_rate": 0.0001989091608371146, "lm_loss": 3.4758, "loss": 3.72, "mask_loss": 0.2043, "step": 166, "topk_loss": 0.0399 }, { "epoch": 0.06637561093756793, "grad_norm": 0.2890625, "learning_rate": 0.0001988903072096483, "lm_loss": 3.4014, "loss": 3.6362, "mask_loss": 0.2012, "step": 167, "topk_loss": 0.0337 }, { "epoch": 0.06677306968569707, "grad_norm": 0.2333984375, "learning_rate": 0.00019887129295480577, "lm_loss": 3.3535, "loss": 3.5857, "mask_loss": 0.2002, "step": 168, "topk_loss": 0.0321 }, { "epoch": 0.06717052843382622, "grad_norm": 0.251953125, "learning_rate": 0.00019885211810347184, "lm_loss": 3.348, "loss": 3.5855, "mask_loss": 0.2034, "step": 169, "topk_loss": 0.0341 }, { "epoch": 0.06756798718195538, "grad_norm": 0.26171875, "learning_rate": 0.00019883278268679216, "lm_loss": 3.326, "loss": 3.5556, "mask_loss": 0.1982, "step": 170, "topk_loss": 0.0314 }, { "epoch": 0.06796544593008452, "grad_norm": 0.271484375, "learning_rate": 0.00019881328673617327, "lm_loss": 3.3377, "loss": 3.5755, "mask_loss": 0.2037, "step": 171, "topk_loss": 0.0342 }, { "epoch": 0.06836290467821367, "grad_norm": 0.318359375, "learning_rate": 0.00019879363028328236, "lm_loss": 3.3258, "loss": 3.5867, "mask_loss": 0.2129, "step": 172, "topk_loss": 0.048 }, { "epoch": 0.06876036342634283, "grad_norm": 0.27734375, "learning_rate": 0.0001987738133600474, "lm_loss": 3.2987, "loss": 3.5528, "mask_loss": 0.2083, "step": 173, "topk_loss": 0.0458 }, { "epoch": 0.06915782217447197, "grad_norm": 0.30078125, "learning_rate": 0.000198753835998657, "lm_loss": 3.2923, "loss": 3.5251, "mask_loss": 0.1979, "step": 174, "topk_loss": 0.0349 }, { "epoch": 0.06955528092260112, "grad_norm": 0.283203125, "learning_rate": 0.00019873369823156036, "lm_loss": 3.3049, "loss": 3.5294, "mask_loss": 0.1948, "step": 175, "topk_loss": 0.0298 }, { "epoch": 0.06995273967073026, "grad_norm": 0.232421875, "learning_rate": 0.0001987134000914672, "lm_loss": 3.2683, "loss": 3.5029, "mask_loss": 0.1999, "step": 176, "topk_loss": 0.0348 }, { "epoch": 0.07035019841885942, "grad_norm": 0.26171875, "learning_rate": 0.00019869294161134777, "lm_loss": 3.2072, "loss": 3.4514, "mask_loss": 0.2033, "step": 177, "topk_loss": 0.0409 }, { "epoch": 0.07074765716698857, "grad_norm": 0.265625, "learning_rate": 0.00019867232282443277, "lm_loss": 3.3398, "loss": 3.5819, "mask_loss": 0.2012, "step": 178, "topk_loss": 0.0409 }, { "epoch": 0.07114511591511771, "grad_norm": 0.275390625, "learning_rate": 0.00019865154376421323, "lm_loss": 3.2698, "loss": 3.5008, "mask_loss": 0.1976, "step": 179, "topk_loss": 0.0335 }, { "epoch": 0.07154257466324687, "grad_norm": 0.28125, "learning_rate": 0.00019863060446444054, "lm_loss": 3.19, "loss": 3.4156, "mask_loss": 0.1943, "step": 180, "topk_loss": 0.0312 }, { "epoch": 0.07194003341137602, "grad_norm": 0.26953125, "learning_rate": 0.00019860950495912642, "lm_loss": 3.2296, "loss": 3.4531, "mask_loss": 0.1938, "step": 181, "topk_loss": 0.0298 }, { "epoch": 0.07233749215950516, "grad_norm": 0.2734375, "learning_rate": 0.0001985882452825427, "lm_loss": 3.2934, "loss": 3.5372, "mask_loss": 0.2014, "step": 182, "topk_loss": 0.0424 }, { "epoch": 0.07273495090763431, "grad_norm": 0.279296875, "learning_rate": 0.00019856682546922155, "lm_loss": 3.2163, "loss": 3.4633, "mask_loss": 0.2032, "step": 183, "topk_loss": 0.0438 }, { "epoch": 0.07313240965576347, "grad_norm": 0.27734375, "learning_rate": 0.00019854524555395502, "lm_loss": 3.1944, "loss": 3.4276, "mask_loss": 0.1967, "step": 184, "topk_loss": 0.0365 }, { "epoch": 0.07352986840389261, "grad_norm": 0.2177734375, "learning_rate": 0.00019852350557179538, "lm_loss": 3.1588, "loss": 3.3794, "mask_loss": 0.1912, "step": 185, "topk_loss": 0.0295 }, { "epoch": 0.07392732715202176, "grad_norm": 0.25390625, "learning_rate": 0.00019850160555805486, "lm_loss": 3.196, "loss": 3.4323, "mask_loss": 0.1969, "step": 186, "topk_loss": 0.0393 }, { "epoch": 0.07432478590015092, "grad_norm": 0.24609375, "learning_rate": 0.00019847954554830558, "lm_loss": 3.1731, "loss": 3.3968, "mask_loss": 0.1939, "step": 187, "topk_loss": 0.0298 }, { "epoch": 0.07472224464828006, "grad_norm": 0.24609375, "learning_rate": 0.00019845732557837966, "lm_loss": 3.186, "loss": 3.4056, "mask_loss": 0.1901, "step": 188, "topk_loss": 0.0295 }, { "epoch": 0.07511970339640921, "grad_norm": 0.2431640625, "learning_rate": 0.0001984349456843689, "lm_loss": 3.1528, "loss": 3.3835, "mask_loss": 0.1955, "step": 189, "topk_loss": 0.0352 }, { "epoch": 0.07551716214453835, "grad_norm": 0.26171875, "learning_rate": 0.00019841240590262493, "lm_loss": 3.131, "loss": 3.3606, "mask_loss": 0.1956, "step": 190, "topk_loss": 0.034 }, { "epoch": 0.07591462089266751, "grad_norm": 0.234375, "learning_rate": 0.0001983897062697591, "lm_loss": 3.213, "loss": 3.4263, "mask_loss": 0.1881, "step": 191, "topk_loss": 0.0252 }, { "epoch": 0.07631207964079666, "grad_norm": 0.24609375, "learning_rate": 0.00019836684682264242, "lm_loss": 3.1288, "loss": 3.3551, "mask_loss": 0.192, "step": 192, "topk_loss": 0.0343 }, { "epoch": 0.0767095383889258, "grad_norm": 0.2451171875, "learning_rate": 0.00019834382759840538, "lm_loss": 3.1637, "loss": 3.3819, "mask_loss": 0.1889, "step": 193, "topk_loss": 0.0293 }, { "epoch": 0.07710699713705496, "grad_norm": 0.2255859375, "learning_rate": 0.0001983206486344381, "lm_loss": 3.0662, "loss": 3.2871, "mask_loss": 0.1928, "step": 194, "topk_loss": 0.0282 }, { "epoch": 0.0775044558851841, "grad_norm": 0.251953125, "learning_rate": 0.0001982973099683902, "lm_loss": 3.0626, "loss": 3.2906, "mask_loss": 0.1945, "step": 195, "topk_loss": 0.0336 }, { "epoch": 0.07790191463331325, "grad_norm": 0.2412109375, "learning_rate": 0.00019827381163817055, "lm_loss": 3.0142, "loss": 3.2344, "mask_loss": 0.1899, "step": 196, "topk_loss": 0.0303 }, { "epoch": 0.0782993733814424, "grad_norm": 0.2392578125, "learning_rate": 0.00019825015368194748, "lm_loss": 3.0554, "loss": 3.2809, "mask_loss": 0.1919, "step": 197, "topk_loss": 0.0335 }, { "epoch": 0.07869683212957156, "grad_norm": 0.2255859375, "learning_rate": 0.00019822633613814862, "lm_loss": 3.0914, "loss": 3.304, "mask_loss": 0.1855, "step": 198, "topk_loss": 0.0271 }, { "epoch": 0.0790942908777007, "grad_norm": 0.2314453125, "learning_rate": 0.0001982023590454607, "lm_loss": 3.0515, "loss": 3.2819, "mask_loss": 0.1956, "step": 199, "topk_loss": 0.0348 }, { "epoch": 0.07949174962582985, "grad_norm": 0.2216796875, "learning_rate": 0.00019817822244282973, "lm_loss": 3.0209, "loss": 3.2367, "mask_loss": 0.1891, "step": 200, "topk_loss": 0.0267 }, { "epoch": 0.07949174962582985, "eval_lm_loss": 729.3817749023438, "eval_loss": 729.6011962890625, "eval_mask_hit_rate": 0.40478789806365967, "eval_mask_loss": 0.18504393100738525, "eval_mask_top_10_hit_rate": 0.9154680371284485, "eval_mask_top_1_hit_rate": 0.9666690826416016, "eval_mask_top_20_hit_rate": 0.8877524137496948, "eval_mask_top_5_hit_rate": 0.9352514743804932, "eval_runtime": 144.7547, "eval_samples_per_second": 14.148, "eval_steps_per_second": 7.074, "eval_token_accuracy": 0.4949284791946411, "eval_top_k_diff": -554.7236328125, "eval_topk_loss": 0.03440730273723602, "step": 200 }, { "epoch": 0.079889208373959, "grad_norm": 0.29296875, "learning_rate": 0.00019815392636946073, "lm_loss": 3.0973, "loss": 3.3482, "mask_loss": 0.2018, "step": 201, "topk_loss": 0.0491 }, { "epoch": 0.08028666712208815, "grad_norm": 0.29296875, "learning_rate": 0.0001981294708648178, "lm_loss": 2.9858, "loss": 3.197, "mask_loss": 0.1852, "step": 202, "topk_loss": 0.026 }, { "epoch": 0.0806841258702173, "grad_norm": 0.263671875, "learning_rate": 0.00019810485596862392, "lm_loss": 3.0715, "loss": 3.298, "mask_loss": 0.1895, "step": 203, "topk_loss": 0.037 }, { "epoch": 0.08108158461834644, "grad_norm": 0.232421875, "learning_rate": 0.0001980800817208611, "lm_loss": 3.0564, "loss": 3.2737, "mask_loss": 0.188, "step": 204, "topk_loss": 0.0293 }, { "epoch": 0.0814790433664756, "grad_norm": 0.2392578125, "learning_rate": 0.00019805514816177006, "lm_loss": 3.062, "loss": 3.2838, "mask_loss": 0.1892, "step": 205, "topk_loss": 0.0326 }, { "epoch": 0.08187650211460475, "grad_norm": 0.22265625, "learning_rate": 0.00019803005533185038, "lm_loss": 3.0045, "loss": 3.2168, "mask_loss": 0.1863, "step": 206, "topk_loss": 0.0261 }, { "epoch": 0.08227396086273389, "grad_norm": 0.2333984375, "learning_rate": 0.0001980048032718603, "lm_loss": 3.0125, "loss": 3.2284, "mask_loss": 0.1866, "step": 207, "topk_loss": 0.0293 }, { "epoch": 0.08267141961086305, "grad_norm": 0.2265625, "learning_rate": 0.00019797939202281664, "lm_loss": 2.9925, "loss": 3.2016, "mask_loss": 0.1839, "step": 208, "topk_loss": 0.0252 }, { "epoch": 0.0830688783589922, "grad_norm": 0.2275390625, "learning_rate": 0.00019795382162599495, "lm_loss": 2.9526, "loss": 3.1611, "mask_loss": 0.1821, "step": 209, "topk_loss": 0.0264 }, { "epoch": 0.08346633710712134, "grad_norm": 0.236328125, "learning_rate": 0.00019792809212292912, "lm_loss": 3.0184, "loss": 3.2271, "mask_loss": 0.1825, "step": 210, "topk_loss": 0.0263 }, { "epoch": 0.08386379585525049, "grad_norm": 0.20703125, "learning_rate": 0.0001979022035554116, "lm_loss": 2.987, "loss": 3.201, "mask_loss": 0.1875, "step": 211, "topk_loss": 0.0264 }, { "epoch": 0.08426125460337965, "grad_norm": 0.24609375, "learning_rate": 0.00019787615596549308, "lm_loss": 2.9766, "loss": 3.1854, "mask_loss": 0.183, "step": 212, "topk_loss": 0.0257 }, { "epoch": 0.08465871335150879, "grad_norm": 0.21484375, "learning_rate": 0.00019784994939548266, "lm_loss": 2.9765, "loss": 3.1829, "mask_loss": 0.1792, "step": 213, "topk_loss": 0.0272 }, { "epoch": 0.08505617209963794, "grad_norm": 0.2119140625, "learning_rate": 0.00019782358388794767, "lm_loss": 2.9522, "loss": 3.1674, "mask_loss": 0.1847, "step": 214, "topk_loss": 0.0305 }, { "epoch": 0.0854536308477671, "grad_norm": 0.2099609375, "learning_rate": 0.00019779705948571346, "lm_loss": 3.0139, "loss": 3.2169, "mask_loss": 0.1792, "step": 215, "topk_loss": 0.0239 }, { "epoch": 0.08585108959589624, "grad_norm": 0.2197265625, "learning_rate": 0.0001977703762318637, "lm_loss": 2.8918, "loss": 3.0996, "mask_loss": 0.1829, "step": 216, "topk_loss": 0.0249 }, { "epoch": 0.08624854834402539, "grad_norm": 0.2412109375, "learning_rate": 0.0001977435341697399, "lm_loss": 2.9047, "loss": 3.1255, "mask_loss": 0.1885, "step": 217, "topk_loss": 0.0323 }, { "epoch": 0.08664600709215453, "grad_norm": 0.2119140625, "learning_rate": 0.00019771653334294152, "lm_loss": 2.9129, "loss": 3.1139, "mask_loss": 0.179, "step": 218, "topk_loss": 0.022 }, { "epoch": 0.08704346584028369, "grad_norm": 0.201171875, "learning_rate": 0.00019768937379532604, "lm_loss": 2.8712, "loss": 3.075, "mask_loss": 0.1812, "step": 219, "topk_loss": 0.0225 }, { "epoch": 0.08744092458841284, "grad_norm": 0.2119140625, "learning_rate": 0.00019766205557100868, "lm_loss": 2.9447, "loss": 3.1567, "mask_loss": 0.1842, "step": 220, "topk_loss": 0.0278 }, { "epoch": 0.08783838333654198, "grad_norm": 0.224609375, "learning_rate": 0.00019763457871436235, "lm_loss": 2.9517, "loss": 3.1604, "mask_loss": 0.181, "step": 221, "topk_loss": 0.0276 }, { "epoch": 0.08823584208467114, "grad_norm": 0.2099609375, "learning_rate": 0.00019760694327001767, "lm_loss": 2.9037, "loss": 3.1047, "mask_loss": 0.179, "step": 222, "topk_loss": 0.022 }, { "epoch": 0.08863330083280029, "grad_norm": 0.2177734375, "learning_rate": 0.00019757914928286287, "lm_loss": 2.8908, "loss": 3.0943, "mask_loss": 0.1776, "step": 223, "topk_loss": 0.0259 }, { "epoch": 0.08903075958092943, "grad_norm": 0.212890625, "learning_rate": 0.00019755119679804367, "lm_loss": 2.8657, "loss": 3.066, "mask_loss": 0.1779, "step": 224, "topk_loss": 0.0223 }, { "epoch": 0.08942821832905858, "grad_norm": 0.21484375, "learning_rate": 0.00019752308586096326, "lm_loss": 2.9197, "loss": 3.1269, "mask_loss": 0.1807, "step": 225, "topk_loss": 0.0265 }, { "epoch": 0.08982567707718773, "grad_norm": 0.203125, "learning_rate": 0.00019749481651728216, "lm_loss": 2.9023, "loss": 3.1059, "mask_loss": 0.1783, "step": 226, "topk_loss": 0.0253 }, { "epoch": 0.09022313582531688, "grad_norm": 0.2080078125, "learning_rate": 0.00019746638881291829, "lm_loss": 2.9175, "loss": 3.1209, "mask_loss": 0.1786, "step": 227, "topk_loss": 0.0248 }, { "epoch": 0.09062059457344603, "grad_norm": 0.2197265625, "learning_rate": 0.0001974378027940466, "lm_loss": 2.8023, "loss": 3.0019, "mask_loss": 0.1784, "step": 228, "topk_loss": 0.0212 }, { "epoch": 0.09101805332157518, "grad_norm": 0.205078125, "learning_rate": 0.00019740905850709948, "lm_loss": 2.8686, "loss": 3.0719, "mask_loss": 0.1806, "step": 229, "topk_loss": 0.0226 }, { "epoch": 0.09141551206970433, "grad_norm": 0.205078125, "learning_rate": 0.0001973801559987661, "lm_loss": 2.8833, "loss": 3.0931, "mask_loss": 0.1829, "step": 230, "topk_loss": 0.0269 }, { "epoch": 0.09181297081783348, "grad_norm": 0.2138671875, "learning_rate": 0.00019735109531599285, "lm_loss": 2.8533, "loss": 3.057, "mask_loss": 0.1784, "step": 231, "topk_loss": 0.0253 }, { "epoch": 0.09221042956596262, "grad_norm": 0.220703125, "learning_rate": 0.0001973218765059829, "lm_loss": 2.7831, "loss": 2.9885, "mask_loss": 0.1791, "step": 232, "topk_loss": 0.0263 }, { "epoch": 0.09260788831409178, "grad_norm": 0.21875, "learning_rate": 0.00019729249961619635, "lm_loss": 2.8501, "loss": 3.0536, "mask_loss": 0.1774, "step": 233, "topk_loss": 0.0261 }, { "epoch": 0.09300534706222092, "grad_norm": 0.2216796875, "learning_rate": 0.00019726296469435, "lm_loss": 2.9035, "loss": 3.1098, "mask_loss": 0.1781, "step": 234, "topk_loss": 0.0282 }, { "epoch": 0.09340280581035007, "grad_norm": 0.2177734375, "learning_rate": 0.00019723327178841743, "lm_loss": 2.7768, "loss": 2.9769, "mask_loss": 0.1772, "step": 235, "topk_loss": 0.0229 }, { "epoch": 0.09380026455847923, "grad_norm": 0.1982421875, "learning_rate": 0.0001972034209466287, "lm_loss": 2.7848, "loss": 2.9854, "mask_loss": 0.1756, "step": 236, "topk_loss": 0.0249 }, { "epoch": 0.09419772330660837, "grad_norm": 0.2353515625, "learning_rate": 0.00019717341221747056, "lm_loss": 2.9198, "loss": 3.1178, "mask_loss": 0.1749, "step": 237, "topk_loss": 0.0232 }, { "epoch": 0.09459518205473752, "grad_norm": 0.1982421875, "learning_rate": 0.00019714324564968613, "lm_loss": 2.768, "loss": 2.9666, "mask_loss": 0.1745, "step": 238, "topk_loss": 0.024 }, { "epoch": 0.09499264080286667, "grad_norm": 0.2373046875, "learning_rate": 0.0001971129212922749, "lm_loss": 2.7249, "loss": 2.9168, "mask_loss": 0.1725, "step": 239, "topk_loss": 0.0195 }, { "epoch": 0.09539009955099582, "grad_norm": 0.1982421875, "learning_rate": 0.0001970824391944927, "lm_loss": 2.802, "loss": 2.9983, "mask_loss": 0.1741, "step": 240, "topk_loss": 0.0221 }, { "epoch": 0.09578755829912497, "grad_norm": 0.244140625, "learning_rate": 0.00019705179940585155, "lm_loss": 2.8494, "loss": 3.0489, "mask_loss": 0.1753, "step": 241, "topk_loss": 0.0241 }, { "epoch": 0.09618501704725411, "grad_norm": 0.220703125, "learning_rate": 0.00019702100197611962, "lm_loss": 2.8158, "loss": 3.0133, "mask_loss": 0.174, "step": 242, "topk_loss": 0.0234 }, { "epoch": 0.09658247579538327, "grad_norm": 0.26171875, "learning_rate": 0.0001969900469553211, "lm_loss": 2.8639, "loss": 3.086, "mask_loss": 0.1847, "step": 243, "topk_loss": 0.0374 }, { "epoch": 0.09697993454351242, "grad_norm": 0.2158203125, "learning_rate": 0.00019695893439373622, "lm_loss": 2.8073, "loss": 3.0075, "mask_loss": 0.1754, "step": 244, "topk_loss": 0.0248 }, { "epoch": 0.09737739329164156, "grad_norm": 0.24609375, "learning_rate": 0.00019692766434190105, "lm_loss": 2.7724, "loss": 2.9628, "mask_loss": 0.1706, "step": 245, "topk_loss": 0.0199 }, { "epoch": 0.09777485203977071, "grad_norm": 0.224609375, "learning_rate": 0.00019689623685060744, "lm_loss": 2.7779, "loss": 2.9715, "mask_loss": 0.1706, "step": 246, "topk_loss": 0.0231 }, { "epoch": 0.09817231078789987, "grad_norm": 0.2373046875, "learning_rate": 0.0001968646519709031, "lm_loss": 2.731, "loss": 2.9322, "mask_loss": 0.1761, "step": 247, "topk_loss": 0.0251 }, { "epoch": 0.09856976953602901, "grad_norm": 0.29296875, "learning_rate": 0.00019683290975409126, "lm_loss": 2.7375, "loss": 2.9383, "mask_loss": 0.1772, "step": 248, "topk_loss": 0.0236 }, { "epoch": 0.09896722828415816, "grad_norm": 0.224609375, "learning_rate": 0.00019680101025173073, "lm_loss": 2.7414, "loss": 2.9383, "mask_loss": 0.1749, "step": 249, "topk_loss": 0.022 }, { "epoch": 0.09936468703228732, "grad_norm": 0.21484375, "learning_rate": 0.00019676895351563588, "lm_loss": 2.6959, "loss": 2.8897, "mask_loss": 0.1725, "step": 250, "topk_loss": 0.0214 }, { "epoch": 0.09936468703228732, "eval_lm_loss": 725.5792236328125, "eval_loss": 725.7776489257812, "eval_mask_hit_rate": 0.42889782786369324, "eval_mask_loss": 0.1703386902809143, "eval_mask_top_10_hit_rate": 0.9372355937957764, "eval_mask_top_1_hit_rate": 0.978151798248291, "eval_mask_top_20_hit_rate": 0.9138137102127075, "eval_mask_top_5_hit_rate": 0.9532917141914368, "eval_runtime": 144.6138, "eval_samples_per_second": 14.162, "eval_steps_per_second": 7.081, "eval_token_accuracy": 0.5259637832641602, "eval_top_k_diff": -537.6810913085938, "eval_topk_loss": 0.028092004358768463, "step": 250 }, { "epoch": 0.09976214578041646, "grad_norm": 0.2578125, "learning_rate": 0.00019673673959787639, "lm_loss": 2.6879, "loss": 2.9018, "mask_loss": 0.1815, "step": 251, "topk_loss": 0.0324 }, { "epoch": 0.10015960452854561, "grad_norm": 0.314453125, "learning_rate": 0.00019670436855077726, "lm_loss": 2.7854, "loss": 2.9788, "mask_loss": 0.1692, "step": 252, "topk_loss": 0.0242 }, { "epoch": 0.10055706327667475, "grad_norm": 0.212890625, "learning_rate": 0.00019667184042691875, "lm_loss": 2.764, "loss": 2.959, "mask_loss": 0.1714, "step": 253, "topk_loss": 0.0235 }, { "epoch": 0.10095452202480391, "grad_norm": 0.2314453125, "learning_rate": 0.00019663915527913625, "lm_loss": 2.787, "loss": 2.9914, "mask_loss": 0.1759, "step": 254, "topk_loss": 0.0285 }, { "epoch": 0.10135198077293306, "grad_norm": 0.2578125, "learning_rate": 0.00019660631316052021, "lm_loss": 2.7066, "loss": 2.9002, "mask_loss": 0.1721, "step": 255, "topk_loss": 0.0215 }, { "epoch": 0.1017494395210622, "grad_norm": 0.25390625, "learning_rate": 0.00019657331412441598, "lm_loss": 2.7055, "loss": 2.901, "mask_loss": 0.1719, "step": 256, "topk_loss": 0.0236 }, { "epoch": 0.10214689826919136, "grad_norm": 0.23046875, "learning_rate": 0.0001965401582244239, "lm_loss": 2.7156, "loss": 2.9156, "mask_loss": 0.1733, "step": 257, "topk_loss": 0.0267 }, { "epoch": 0.10254435701732051, "grad_norm": 0.22265625, "learning_rate": 0.000196506845514399, "lm_loss": 2.7108, "loss": 2.902, "mask_loss": 0.1704, "step": 258, "topk_loss": 0.0208 }, { "epoch": 0.10294181576544965, "grad_norm": 0.26953125, "learning_rate": 0.00019647337604845107, "lm_loss": 2.7099, "loss": 2.9024, "mask_loss": 0.1696, "step": 259, "topk_loss": 0.0229 }, { "epoch": 0.1033392745135788, "grad_norm": 0.236328125, "learning_rate": 0.00019643974988094458, "lm_loss": 2.6189, "loss": 2.8054, "mask_loss": 0.1682, "step": 260, "topk_loss": 0.0184 }, { "epoch": 0.10373673326170796, "grad_norm": 0.2158203125, "learning_rate": 0.00019640596706649841, "lm_loss": 2.6854, "loss": 2.884, "mask_loss": 0.1734, "step": 261, "topk_loss": 0.0252 }, { "epoch": 0.1041341920098371, "grad_norm": 0.265625, "learning_rate": 0.00019637202765998592, "lm_loss": 2.6226, "loss": 2.8346, "mask_loss": 0.1796, "step": 262, "topk_loss": 0.0324 }, { "epoch": 0.10453165075796625, "grad_norm": 0.2138671875, "learning_rate": 0.00019633793171653488, "lm_loss": 2.6852, "loss": 2.8731, "mask_loss": 0.1679, "step": 263, "topk_loss": 0.02 }, { "epoch": 0.10492910950609541, "grad_norm": 0.2177734375, "learning_rate": 0.00019630367929152724, "lm_loss": 2.6046, "loss": 2.8038, "mask_loss": 0.1743, "step": 264, "topk_loss": 0.0249 }, { "epoch": 0.10532656825422455, "grad_norm": 0.2109375, "learning_rate": 0.00019626927044059914, "lm_loss": 2.6704, "loss": 2.8629, "mask_loss": 0.1709, "step": 265, "topk_loss": 0.0216 }, { "epoch": 0.1057240270023537, "grad_norm": 0.203125, "learning_rate": 0.00019623470521964092, "lm_loss": 2.6652, "loss": 2.8514, "mask_loss": 0.167, "step": 266, "topk_loss": 0.0193 }, { "epoch": 0.10612148575048284, "grad_norm": 0.2216796875, "learning_rate": 0.00019619998368479674, "lm_loss": 2.5714, "loss": 2.7607, "mask_loss": 0.17, "step": 267, "topk_loss": 0.0192 }, { "epoch": 0.106518944498612, "grad_norm": 0.1875, "learning_rate": 0.00019616510589246474, "lm_loss": 2.6796, "loss": 2.8766, "mask_loss": 0.1737, "step": 268, "topk_loss": 0.0233 }, { "epoch": 0.10691640324674115, "grad_norm": 0.2294921875, "learning_rate": 0.00019613007189929688, "lm_loss": 2.6008, "loss": 2.7886, "mask_loss": 0.1672, "step": 269, "topk_loss": 0.0206 }, { "epoch": 0.1073138619948703, "grad_norm": 0.2421875, "learning_rate": 0.00019609488176219886, "lm_loss": 2.6117, "loss": 2.807, "mask_loss": 0.1725, "step": 270, "topk_loss": 0.0228 }, { "epoch": 0.10771132074299945, "grad_norm": 0.19921875, "learning_rate": 0.00019605953553832988, "lm_loss": 2.6278, "loss": 2.8085, "mask_loss": 0.1629, "step": 271, "topk_loss": 0.0179 }, { "epoch": 0.1081087794911286, "grad_norm": 0.20703125, "learning_rate": 0.0001960240332851028, "lm_loss": 2.6579, "loss": 2.8498, "mask_loss": 0.169, "step": 272, "topk_loss": 0.0229 }, { "epoch": 0.10850623823925774, "grad_norm": 0.2099609375, "learning_rate": 0.00019598837506018391, "lm_loss": 2.7048, "loss": 2.8962, "mask_loss": 0.1682, "step": 273, "topk_loss": 0.0232 }, { "epoch": 0.10890369698738689, "grad_norm": 0.22265625, "learning_rate": 0.0001959525609214928, "lm_loss": 2.6585, "loss": 2.8469, "mask_loss": 0.17, "step": 274, "topk_loss": 0.0184 }, { "epoch": 0.10930115573551605, "grad_norm": 0.197265625, "learning_rate": 0.00019591659092720227, "lm_loss": 2.6604, "loss": 2.8479, "mask_loss": 0.1682, "step": 275, "topk_loss": 0.0193 }, { "epoch": 0.1096986144836452, "grad_norm": 0.1826171875, "learning_rate": 0.00019588046513573839, "lm_loss": 2.4789, "loss": 2.6631, "mask_loss": 0.167, "step": 276, "topk_loss": 0.0172 }, { "epoch": 0.11009607323177434, "grad_norm": 0.2470703125, "learning_rate": 0.00019584418360578016, "lm_loss": 2.592, "loss": 2.7834, "mask_loss": 0.168, "step": 277, "topk_loss": 0.0234 }, { "epoch": 0.1104935319799035, "grad_norm": 0.2041015625, "learning_rate": 0.00019580774639625968, "lm_loss": 2.586, "loss": 2.7701, "mask_loss": 0.1666, "step": 278, "topk_loss": 0.0174 }, { "epoch": 0.11089099072803264, "grad_norm": 0.171875, "learning_rate": 0.00019577115356636182, "lm_loss": 2.5704, "loss": 2.7551, "mask_loss": 0.1656, "step": 279, "topk_loss": 0.019 }, { "epoch": 0.11128844947616179, "grad_norm": 0.1904296875, "learning_rate": 0.00019573440517552427, "lm_loss": 2.6662, "loss": 2.8529, "mask_loss": 0.1666, "step": 280, "topk_loss": 0.0201 }, { "epoch": 0.11168590822429093, "grad_norm": 0.205078125, "learning_rate": 0.0001956975012834374, "lm_loss": 2.5903, "loss": 2.7714, "mask_loss": 0.1629, "step": 281, "topk_loss": 0.0182 }, { "epoch": 0.11208336697242009, "grad_norm": 0.1826171875, "learning_rate": 0.0001956604419500441, "lm_loss": 2.5635, "loss": 2.7467, "mask_loss": 0.1665, "step": 282, "topk_loss": 0.0167 }, { "epoch": 0.11248082572054924, "grad_norm": 0.1875, "learning_rate": 0.00019562322723553984, "lm_loss": 2.5431, "loss": 2.7258, "mask_loss": 0.1658, "step": 283, "topk_loss": 0.017 }, { "epoch": 0.11287828446867838, "grad_norm": 0.2041015625, "learning_rate": 0.00019558585720037236, "lm_loss": 2.635, "loss": 2.8242, "mask_loss": 0.166, "step": 284, "topk_loss": 0.0232 }, { "epoch": 0.11327574321680754, "grad_norm": 0.1875, "learning_rate": 0.00019554833190524182, "lm_loss": 2.57, "loss": 2.757, "mask_loss": 0.1669, "step": 285, "topk_loss": 0.0201 }, { "epoch": 0.11367320196493669, "grad_norm": 0.181640625, "learning_rate": 0.00019551065141110047, "lm_loss": 2.5794, "loss": 2.7671, "mask_loss": 0.1669, "step": 286, "topk_loss": 0.0208 }, { "epoch": 0.11407066071306583, "grad_norm": 0.205078125, "learning_rate": 0.00019547281577915267, "lm_loss": 2.5847, "loss": 2.7863, "mask_loss": 0.1732, "step": 287, "topk_loss": 0.0285 }, { "epoch": 0.11446811946119498, "grad_norm": 0.203125, "learning_rate": 0.00019543482507085482, "lm_loss": 2.4931, "loss": 2.682, "mask_loss": 0.1675, "step": 288, "topk_loss": 0.0213 }, { "epoch": 0.11486557820932414, "grad_norm": 0.185546875, "learning_rate": 0.00019539667934791513, "lm_loss": 2.5702, "loss": 2.7578, "mask_loss": 0.168, "step": 289, "topk_loss": 0.0197 }, { "epoch": 0.11526303695745328, "grad_norm": 0.18359375, "learning_rate": 0.00019535837867229363, "lm_loss": 2.614, "loss": 2.7945, "mask_loss": 0.1625, "step": 290, "topk_loss": 0.018 }, { "epoch": 0.11566049570558243, "grad_norm": 0.2578125, "learning_rate": 0.0001953199231062021, "lm_loss": 2.6642, "loss": 2.8608, "mask_loss": 0.1715, "step": 291, "topk_loss": 0.0251 }, { "epoch": 0.11605795445371159, "grad_norm": 0.2197265625, "learning_rate": 0.00019528131271210383, "lm_loss": 2.534, "loss": 2.7202, "mask_loss": 0.1677, "step": 292, "topk_loss": 0.0185 }, { "epoch": 0.11645541320184073, "grad_norm": 0.1943359375, "learning_rate": 0.0001952425475527136, "lm_loss": 2.6175, "loss": 2.7969, "mask_loss": 0.1626, "step": 293, "topk_loss": 0.0168 }, { "epoch": 0.11685287194996988, "grad_norm": 0.2158203125, "learning_rate": 0.00019520362769099764, "lm_loss": 2.5311, "loss": 2.7126, "mask_loss": 0.1629, "step": 294, "topk_loss": 0.0186 }, { "epoch": 0.11725033069809902, "grad_norm": 0.251953125, "learning_rate": 0.0001951645531901734, "lm_loss": 2.542, "loss": 2.7468, "mask_loss": 0.1748, "step": 295, "topk_loss": 0.03 }, { "epoch": 0.11764778944622818, "grad_norm": 0.310546875, "learning_rate": 0.00019512532411370954, "lm_loss": 2.5719, "loss": 2.756, "mask_loss": 0.1638, "step": 296, "topk_loss": 0.0203 }, { "epoch": 0.11804524819435733, "grad_norm": 0.1962890625, "learning_rate": 0.0001950859405253258, "lm_loss": 2.5602, "loss": 2.7346, "mask_loss": 0.1598, "step": 297, "topk_loss": 0.0146 }, { "epoch": 0.11844270694248647, "grad_norm": 0.24609375, "learning_rate": 0.00019504640248899286, "lm_loss": 2.5456, "loss": 2.7311, "mask_loss": 0.1654, "step": 298, "topk_loss": 0.0201 }, { "epoch": 0.11884016569061563, "grad_norm": 0.220703125, "learning_rate": 0.0001950067100689323, "lm_loss": 2.5376, "loss": 2.7266, "mask_loss": 0.1673, "step": 299, "topk_loss": 0.0216 }, { "epoch": 0.11923762443874478, "grad_norm": 0.201171875, "learning_rate": 0.00019496686332961646, "lm_loss": 2.5539, "loss": 2.7356, "mask_loss": 0.1625, "step": 300, "topk_loss": 0.0192 }, { "epoch": 0.11923762443874478, "eval_lm_loss": 725.0694580078125, "eval_loss": 725.2548217773438, "eval_mask_hit_rate": 0.4468006491661072, "eval_mask_loss": 0.1619417518377304, "eval_mask_top_10_hit_rate": 0.9498538970947266, "eval_mask_top_1_hit_rate": 0.9841432571411133, "eval_mask_top_20_hit_rate": 0.9292661547660828, "eval_mask_top_5_hit_rate": 0.9635714888572693, "eval_runtime": 144.0678, "eval_samples_per_second": 14.216, "eval_steps_per_second": 7.108, "eval_token_accuracy": 0.5460589528083801, "eval_top_k_diff": -557.624267578125, "eval_topk_loss": 0.02342473529279232, "step": 300 }, { "epoch": 0.11963508318687392, "grad_norm": 0.181640625, "learning_rate": 0.00019492686233576833, "lm_loss": 2.5287, "loss": 2.7131, "mask_loss": 0.165, "step": 301, "topk_loss": 0.0194 }, { "epoch": 0.12003254193500307, "grad_norm": 0.19140625, "learning_rate": 0.0001948867071523615, "lm_loss": 2.5342, "loss": 2.7148, "mask_loss": 0.1631, "step": 302, "topk_loss": 0.0175 }, { "epoch": 0.12043000068313223, "grad_norm": 0.203125, "learning_rate": 0.00019484639784461994, "lm_loss": 2.4958, "loss": 2.6775, "mask_loss": 0.1647, "step": 303, "topk_loss": 0.017 }, { "epoch": 0.12082745943126137, "grad_norm": 0.1826171875, "learning_rate": 0.00019480593447801799, "lm_loss": 2.489, "loss": 2.6758, "mask_loss": 0.166, "step": 304, "topk_loss": 0.0208 }, { "epoch": 0.12122491817939052, "grad_norm": 0.2216796875, "learning_rate": 0.00019476531711828027, "lm_loss": 2.512, "loss": 2.705, "mask_loss": 0.1678, "step": 305, "topk_loss": 0.0252 }, { "epoch": 0.12162237692751968, "grad_norm": 0.201171875, "learning_rate": 0.00019472454583138144, "lm_loss": 2.5055, "loss": 2.6908, "mask_loss": 0.164, "step": 306, "topk_loss": 0.0213 }, { "epoch": 0.12201983567564882, "grad_norm": 0.2255859375, "learning_rate": 0.0001946836206835463, "lm_loss": 2.5713, "loss": 2.7526, "mask_loss": 0.162, "step": 307, "topk_loss": 0.0193 }, { "epoch": 0.12241729442377797, "grad_norm": 0.220703125, "learning_rate": 0.0001946425417412495, "lm_loss": 2.4884, "loss": 2.6637, "mask_loss": 0.1589, "step": 308, "topk_loss": 0.0164 }, { "epoch": 0.12281475317190711, "grad_norm": 0.25, "learning_rate": 0.00019460130907121545, "lm_loss": 2.5331, "loss": 2.726, "mask_loss": 0.1689, "step": 309, "topk_loss": 0.024 }, { "epoch": 0.12321221192003627, "grad_norm": 0.255859375, "learning_rate": 0.00019455992274041835, "lm_loss": 2.547, "loss": 2.733, "mask_loss": 0.1649, "step": 310, "topk_loss": 0.0211 }, { "epoch": 0.12360967066816542, "grad_norm": 0.203125, "learning_rate": 0.00019451838281608197, "lm_loss": 2.536, "loss": 2.7128, "mask_loss": 0.1605, "step": 311, "topk_loss": 0.0163 }, { "epoch": 0.12400712941629456, "grad_norm": 0.19140625, "learning_rate": 0.00019447668936567952, "lm_loss": 2.4844, "loss": 2.6624, "mask_loss": 0.1608, "step": 312, "topk_loss": 0.0172 }, { "epoch": 0.12440458816442372, "grad_norm": 0.205078125, "learning_rate": 0.0001944348424569336, "lm_loss": 2.4448, "loss": 2.6258, "mask_loss": 0.1618, "step": 313, "topk_loss": 0.0192 }, { "epoch": 0.12480204691255287, "grad_norm": 0.23828125, "learning_rate": 0.00019439284215781613, "lm_loss": 2.4238, "loss": 2.6091, "mask_loss": 0.1635, "step": 314, "topk_loss": 0.0219 }, { "epoch": 0.12519950566068203, "grad_norm": 0.1923828125, "learning_rate": 0.00019435068853654807, "lm_loss": 2.4857, "loss": 2.667, "mask_loss": 0.1626, "step": 315, "topk_loss": 0.0187 }, { "epoch": 0.12559696440881116, "grad_norm": 0.2353515625, "learning_rate": 0.00019430838166159954, "lm_loss": 2.5535, "loss": 2.7497, "mask_loss": 0.1674, "step": 316, "topk_loss": 0.0288 }, { "epoch": 0.12599442315694032, "grad_norm": 0.2158203125, "learning_rate": 0.0001942659216016895, "lm_loss": 2.4172, "loss": 2.5921, "mask_loss": 0.1587, "step": 317, "topk_loss": 0.0162 }, { "epoch": 0.12639188190506948, "grad_norm": 0.24609375, "learning_rate": 0.00019422330842578577, "lm_loss": 2.4766, "loss": 2.6646, "mask_loss": 0.1639, "step": 318, "topk_loss": 0.0241 }, { "epoch": 0.1267893406531986, "grad_norm": 0.23046875, "learning_rate": 0.00019418054220310483, "lm_loss": 2.5006, "loss": 2.6787, "mask_loss": 0.1625, "step": 319, "topk_loss": 0.0155 }, { "epoch": 0.12718679940132777, "grad_norm": 0.1943359375, "learning_rate": 0.00019413762300311182, "lm_loss": 2.455, "loss": 2.6357, "mask_loss": 0.1624, "step": 320, "topk_loss": 0.0183 }, { "epoch": 0.1275842581494569, "grad_norm": 0.23828125, "learning_rate": 0.00019409455089552038, "lm_loss": 2.5089, "loss": 2.6893, "mask_loss": 0.16, "step": 321, "topk_loss": 0.0204 }, { "epoch": 0.12798171689758606, "grad_norm": 0.1904296875, "learning_rate": 0.0001940513259502924, "lm_loss": 2.4325, "loss": 2.6111, "mask_loss": 0.1624, "step": 322, "topk_loss": 0.0162 }, { "epoch": 0.12837917564571522, "grad_norm": 0.2314453125, "learning_rate": 0.00019400794823763815, "lm_loss": 2.4237, "loss": 2.5945, "mask_loss": 0.1567, "step": 323, "topk_loss": 0.0142 }, { "epoch": 0.12877663439384435, "grad_norm": 0.212890625, "learning_rate": 0.00019396441782801592, "lm_loss": 2.5473, "loss": 2.729, "mask_loss": 0.1632, "step": 324, "topk_loss": 0.0185 }, { "epoch": 0.1291740931419735, "grad_norm": 0.1953125, "learning_rate": 0.00019392073479213213, "lm_loss": 2.4768, "loss": 2.6561, "mask_loss": 0.1606, "step": 325, "topk_loss": 0.0188 }, { "epoch": 0.12957155189010267, "grad_norm": 0.2451171875, "learning_rate": 0.00019387689920094103, "lm_loss": 2.4692, "loss": 2.6498, "mask_loss": 0.1615, "step": 326, "topk_loss": 0.019 }, { "epoch": 0.1299690106382318, "grad_norm": 0.1923828125, "learning_rate": 0.00019383291112564478, "lm_loss": 2.4854, "loss": 2.6632, "mask_loss": 0.16, "step": 327, "topk_loss": 0.0178 }, { "epoch": 0.13036646938636096, "grad_norm": 0.2060546875, "learning_rate": 0.00019378877063769309, "lm_loss": 2.4537, "loss": 2.6312, "mask_loss": 0.1587, "step": 328, "topk_loss": 0.0188 }, { "epoch": 0.13076392813449011, "grad_norm": 0.2265625, "learning_rate": 0.00019374447780878327, "lm_loss": 2.5265, "loss": 2.7126, "mask_loss": 0.1639, "step": 329, "topk_loss": 0.0222 }, { "epoch": 0.13116138688261925, "grad_norm": 0.2177734375, "learning_rate": 0.0001937000327108601, "lm_loss": 2.4116, "loss": 2.5915, "mask_loss": 0.1613, "step": 330, "topk_loss": 0.0185 }, { "epoch": 0.1315588456307484, "grad_norm": 0.232421875, "learning_rate": 0.00019365543541611575, "lm_loss": 2.4401, "loss": 2.6144, "mask_loss": 0.1587, "step": 331, "topk_loss": 0.0156 }, { "epoch": 0.13195630437887756, "grad_norm": 0.2001953125, "learning_rate": 0.00019361068599698945, "lm_loss": 2.4014, "loss": 2.5804, "mask_loss": 0.1605, "step": 332, "topk_loss": 0.0185 }, { "epoch": 0.1323537631270067, "grad_norm": 0.19921875, "learning_rate": 0.00019356578452616772, "lm_loss": 2.4467, "loss": 2.6252, "mask_loss": 0.1604, "step": 333, "topk_loss": 0.0181 }, { "epoch": 0.13275122187513586, "grad_norm": 0.2255859375, "learning_rate": 0.00019352073107658385, "lm_loss": 2.3818, "loss": 2.5607, "mask_loss": 0.1606, "step": 334, "topk_loss": 0.0183 }, { "epoch": 0.133148680623265, "grad_norm": 0.177734375, "learning_rate": 0.0001934755257214181, "lm_loss": 2.4268, "loss": 2.6045, "mask_loss": 0.1592, "step": 335, "topk_loss": 0.0184 }, { "epoch": 0.13354613937139415, "grad_norm": 0.1845703125, "learning_rate": 0.00019343016853409754, "lm_loss": 2.3896, "loss": 2.5674, "mask_loss": 0.1611, "step": 336, "topk_loss": 0.0166 }, { "epoch": 0.1339435981195233, "grad_norm": 0.2021484375, "learning_rate": 0.00019338465958829572, "lm_loss": 2.4972, "loss": 2.6721, "mask_loss": 0.1585, "step": 337, "topk_loss": 0.0163 }, { "epoch": 0.13434105686765244, "grad_norm": 0.251953125, "learning_rate": 0.00019333899895793272, "lm_loss": 2.4311, "loss": 2.6161, "mask_loss": 0.1619, "step": 338, "topk_loss": 0.0231 }, { "epoch": 0.1347385156157816, "grad_norm": 0.1865234375, "learning_rate": 0.0001932931867171751, "lm_loss": 2.3507, "loss": 2.5257, "mask_loss": 0.1579, "step": 339, "topk_loss": 0.0171 }, { "epoch": 0.13513597436391075, "grad_norm": 0.2353515625, "learning_rate": 0.00019324722294043558, "lm_loss": 2.427, "loss": 2.6035, "mask_loss": 0.1595, "step": 340, "topk_loss": 0.0171 }, { "epoch": 0.13553343311203989, "grad_norm": 0.2451171875, "learning_rate": 0.00019320110770237308, "lm_loss": 2.4311, "loss": 2.6085, "mask_loss": 0.1604, "step": 341, "topk_loss": 0.017 }, { "epoch": 0.13593089186016905, "grad_norm": 0.1865234375, "learning_rate": 0.00019315484107789246, "lm_loss": 2.3972, "loss": 2.5739, "mask_loss": 0.159, "step": 342, "topk_loss": 0.0177 }, { "epoch": 0.1363283506082982, "grad_norm": 0.1904296875, "learning_rate": 0.00019310842314214458, "lm_loss": 2.46, "loss": 2.6425, "mask_loss": 0.1619, "step": 343, "topk_loss": 0.0205 }, { "epoch": 0.13672580935642734, "grad_norm": 0.224609375, "learning_rate": 0.000193061853970526, "lm_loss": 2.2869, "loss": 2.4596, "mask_loss": 0.159, "step": 344, "topk_loss": 0.0138 }, { "epoch": 0.1371232681045565, "grad_norm": 0.2119140625, "learning_rate": 0.00019301513363867895, "lm_loss": 2.3823, "loss": 2.5584, "mask_loss": 0.1607, "step": 345, "topk_loss": 0.0154 }, { "epoch": 0.13752072685268565, "grad_norm": 0.19140625, "learning_rate": 0.0001929682622224912, "lm_loss": 2.4612, "loss": 2.6359, "mask_loss": 0.1581, "step": 346, "topk_loss": 0.0165 }, { "epoch": 0.13791818560081479, "grad_norm": 0.2001953125, "learning_rate": 0.00019292123979809594, "lm_loss": 2.4941, "loss": 2.6731, "mask_loss": 0.1604, "step": 347, "topk_loss": 0.0187 }, { "epoch": 0.13831564434894394, "grad_norm": 0.310546875, "learning_rate": 0.00019287406644187156, "lm_loss": 2.4155, "loss": 2.59, "mask_loss": 0.1558, "step": 348, "topk_loss": 0.0188 }, { "epoch": 0.13871310309707308, "grad_norm": 0.216796875, "learning_rate": 0.00019282674223044177, "lm_loss": 2.4682, "loss": 2.64, "mask_loss": 0.1555, "step": 349, "topk_loss": 0.0163 }, { "epoch": 0.13911056184520224, "grad_norm": 0.251953125, "learning_rate": 0.0001927792672406751, "lm_loss": 2.4956, "loss": 2.6801, "mask_loss": 0.1615, "step": 350, "topk_loss": 0.0231 }, { "epoch": 0.13911056184520224, "eval_lm_loss": 721.0567016601562, "eval_loss": 721.2329711914062, "eval_mask_hit_rate": 0.4606643319129944, "eval_mask_loss": 0.1554771065711975, "eval_mask_top_10_hit_rate": 0.9582056999206543, "eval_mask_top_1_hit_rate": 0.9878551959991455, "eval_mask_top_20_hit_rate": 0.9397642612457275, "eval_mask_top_5_hit_rate": 0.9702033996582031, "eval_runtime": 145.1773, "eval_samples_per_second": 14.107, "eval_steps_per_second": 7.053, "eval_token_accuracy": 0.5597802400588989, "eval_top_k_diff": -564.8240356445312, "eval_topk_loss": 0.02081306278705597, "step": 350 }, { "epoch": 0.1395080205933314, "grad_norm": 0.234375, "learning_rate": 0.00019273164154968522, "lm_loss": 2.5178, "loss": 2.6936, "mask_loss": 0.1571, "step": 351, "topk_loss": 0.0187 }, { "epoch": 0.13990547934146053, "grad_norm": 0.203125, "learning_rate": 0.00019268386523483037, "lm_loss": 2.3764, "loss": 2.5479, "mask_loss": 0.156, "step": 352, "topk_loss": 0.0155 }, { "epoch": 0.14030293808958968, "grad_norm": 0.1875, "learning_rate": 0.0001926359383737136, "lm_loss": 2.3557, "loss": 2.5295, "mask_loss": 0.1579, "step": 353, "topk_loss": 0.0158 }, { "epoch": 0.14070039683771884, "grad_norm": 0.1884765625, "learning_rate": 0.00019258786104418244, "lm_loss": 2.3476, "loss": 2.5195, "mask_loss": 0.1562, "step": 354, "topk_loss": 0.0157 }, { "epoch": 0.14109785558584798, "grad_norm": 0.267578125, "learning_rate": 0.00019253963332432878, "lm_loss": 2.4405, "loss": 2.6102, "mask_loss": 0.1543, "step": 355, "topk_loss": 0.0154 }, { "epoch": 0.14149531433397713, "grad_norm": 0.21484375, "learning_rate": 0.0001924912552924889, "lm_loss": 2.4723, "loss": 2.6404, "mask_loss": 0.1524, "step": 356, "topk_loss": 0.0157 }, { "epoch": 0.1418927730821063, "grad_norm": 0.22265625, "learning_rate": 0.0001924427270272431, "lm_loss": 2.4031, "loss": 2.5813, "mask_loss": 0.1572, "step": 357, "topk_loss": 0.021 }, { "epoch": 0.14229023183023543, "grad_norm": 0.1787109375, "learning_rate": 0.00019239404860741578, "lm_loss": 2.2449, "loss": 2.4173, "mask_loss": 0.1549, "step": 358, "topk_loss": 0.0175 }, { "epoch": 0.14268769057836458, "grad_norm": 0.17578125, "learning_rate": 0.00019234522011207528, "lm_loss": 2.3971, "loss": 2.5702, "mask_loss": 0.157, "step": 359, "topk_loss": 0.016 }, { "epoch": 0.14308514932649374, "grad_norm": 0.1982421875, "learning_rate": 0.00019229624162053356, "lm_loss": 2.3401, "loss": 2.5133, "mask_loss": 0.1566, "step": 360, "topk_loss": 0.0167 }, { "epoch": 0.14348260807462287, "grad_norm": 0.1650390625, "learning_rate": 0.0001922471132123464, "lm_loss": 2.3972, "loss": 2.567, "mask_loss": 0.1534, "step": 361, "topk_loss": 0.0164 }, { "epoch": 0.14388006682275203, "grad_norm": 0.2080078125, "learning_rate": 0.00019219783496731292, "lm_loss": 2.3702, "loss": 2.5355, "mask_loss": 0.152, "step": 362, "topk_loss": 0.0133 }, { "epoch": 0.14427752557088117, "grad_norm": 0.193359375, "learning_rate": 0.00019214840696547575, "lm_loss": 2.3916, "loss": 2.5707, "mask_loss": 0.1599, "step": 363, "topk_loss": 0.0192 }, { "epoch": 0.14467498431901032, "grad_norm": 0.1875, "learning_rate": 0.0001920988292871207, "lm_loss": 2.3686, "loss": 2.5369, "mask_loss": 0.1522, "step": 364, "topk_loss": 0.0161 }, { "epoch": 0.14507244306713948, "grad_norm": 0.1904296875, "learning_rate": 0.00019204910201277672, "lm_loss": 2.3517, "loss": 2.5237, "mask_loss": 0.1555, "step": 365, "topk_loss": 0.0165 }, { "epoch": 0.14546990181526862, "grad_norm": 0.1904296875, "learning_rate": 0.00019199922522321574, "lm_loss": 2.3763, "loss": 2.5485, "mask_loss": 0.1553, "step": 366, "topk_loss": 0.0169 }, { "epoch": 0.14586736056339777, "grad_norm": 0.1953125, "learning_rate": 0.0001919491989994526, "lm_loss": 2.4421, "loss": 2.6105, "mask_loss": 0.1537, "step": 367, "topk_loss": 0.0147 }, { "epoch": 0.14626481931152693, "grad_norm": 0.181640625, "learning_rate": 0.00019189902342274471, "lm_loss": 2.3083, "loss": 2.4815, "mask_loss": 0.1568, "step": 368, "topk_loss": 0.0164 }, { "epoch": 0.14666227805965606, "grad_norm": 0.2001953125, "learning_rate": 0.00019184869857459232, "lm_loss": 2.3493, "loss": 2.5219, "mask_loss": 0.1557, "step": 369, "topk_loss": 0.0169 }, { "epoch": 0.14705973680778522, "grad_norm": 0.23046875, "learning_rate": 0.0001917982245367379, "lm_loss": 2.3418, "loss": 2.5175, "mask_loss": 0.1564, "step": 370, "topk_loss": 0.0193 }, { "epoch": 0.14745719555591438, "grad_norm": 0.1962890625, "learning_rate": 0.00019174760139116642, "lm_loss": 2.4028, "loss": 2.5704, "mask_loss": 0.1531, "step": 371, "topk_loss": 0.0145 }, { "epoch": 0.14785465430404351, "grad_norm": 0.2041015625, "learning_rate": 0.00019169682922010492, "lm_loss": 2.34, "loss": 2.5094, "mask_loss": 0.1543, "step": 372, "topk_loss": 0.0151 }, { "epoch": 0.14825211305217267, "grad_norm": 0.220703125, "learning_rate": 0.00019164590810602262, "lm_loss": 2.2568, "loss": 2.4285, "mask_loss": 0.1558, "step": 373, "topk_loss": 0.016 }, { "epoch": 0.14864957180030183, "grad_norm": 0.20703125, "learning_rate": 0.00019159483813163054, "lm_loss": 2.3312, "loss": 2.5006, "mask_loss": 0.1551, "step": 374, "topk_loss": 0.0143 }, { "epoch": 0.14904703054843096, "grad_norm": 0.205078125, "learning_rate": 0.00019154361937988163, "lm_loss": 2.3235, "loss": 2.4962, "mask_loss": 0.1544, "step": 375, "topk_loss": 0.0183 }, { "epoch": 0.14944448929656012, "grad_norm": 0.197265625, "learning_rate": 0.00019149225193397043, "lm_loss": 2.3933, "loss": 2.5681, "mask_loss": 0.1555, "step": 376, "topk_loss": 0.0193 }, { "epoch": 0.14984194804468925, "grad_norm": 0.2109375, "learning_rate": 0.00019144073587733294, "lm_loss": 2.355, "loss": 2.5333, "mask_loss": 0.1579, "step": 377, "topk_loss": 0.0205 }, { "epoch": 0.15023940679281841, "grad_norm": 0.201171875, "learning_rate": 0.00019138907129364664, "lm_loss": 2.3831, "loss": 2.5541, "mask_loss": 0.1546, "step": 378, "topk_loss": 0.0164 }, { "epoch": 0.15063686554094757, "grad_norm": 0.2197265625, "learning_rate": 0.0001913372582668303, "lm_loss": 2.3562, "loss": 2.5254, "mask_loss": 0.1544, "step": 379, "topk_loss": 0.0148 }, { "epoch": 0.1510343242890767, "grad_norm": 0.26953125, "learning_rate": 0.00019128529688104364, "lm_loss": 2.4145, "loss": 2.6087, "mask_loss": 0.1629, "step": 380, "topk_loss": 0.0314 }, { "epoch": 0.15143178303720586, "grad_norm": 0.1845703125, "learning_rate": 0.0001912331872206875, "lm_loss": 2.4485, "loss": 2.6143, "mask_loss": 0.1507, "step": 381, "topk_loss": 0.0151 }, { "epoch": 0.15182924178533502, "grad_norm": 0.1875, "learning_rate": 0.00019118092937040352, "lm_loss": 2.358, "loss": 2.5224, "mask_loss": 0.1491, "step": 382, "topk_loss": 0.0153 }, { "epoch": 0.15222670053346415, "grad_norm": 0.2421875, "learning_rate": 0.0001911285234150741, "lm_loss": 2.3632, "loss": 2.5428, "mask_loss": 0.1573, "step": 383, "topk_loss": 0.0223 }, { "epoch": 0.1526241592815933, "grad_norm": 0.197265625, "learning_rate": 0.0001910759694398221, "lm_loss": 2.3099, "loss": 2.4799, "mask_loss": 0.1533, "step": 384, "topk_loss": 0.0167 }, { "epoch": 0.15302161802972247, "grad_norm": 0.1708984375, "learning_rate": 0.00019102326753001086, "lm_loss": 2.3235, "loss": 2.4979, "mask_loss": 0.1567, "step": 385, "topk_loss": 0.0177 }, { "epoch": 0.1534190767778516, "grad_norm": 0.208984375, "learning_rate": 0.000190970417771244, "lm_loss": 2.3149, "loss": 2.4784, "mask_loss": 0.1492, "step": 386, "topk_loss": 0.0143 }, { "epoch": 0.15381653552598076, "grad_norm": 0.20703125, "learning_rate": 0.00019091742024936537, "lm_loss": 2.372, "loss": 2.5421, "mask_loss": 0.1525, "step": 387, "topk_loss": 0.0175 }, { "epoch": 0.15421399427410992, "grad_norm": 0.1728515625, "learning_rate": 0.0001908642750504587, "lm_loss": 2.2879, "loss": 2.4548, "mask_loss": 0.1521, "step": 388, "topk_loss": 0.0148 }, { "epoch": 0.15461145302223905, "grad_norm": 0.201171875, "learning_rate": 0.0001908109822608477, "lm_loss": 2.3243, "loss": 2.5138, "mask_loss": 0.1627, "step": 389, "topk_loss": 0.0268 }, { "epoch": 0.1550089117703682, "grad_norm": 0.259765625, "learning_rate": 0.00019075754196709572, "lm_loss": 2.3223, "loss": 2.4913, "mask_loss": 0.1537, "step": 390, "topk_loss": 0.0153 }, { "epoch": 0.15540637051849734, "grad_norm": 0.220703125, "learning_rate": 0.00019070395425600578, "lm_loss": 2.341, "loss": 2.5162, "mask_loss": 0.155, "step": 391, "topk_loss": 0.0202 }, { "epoch": 0.1558038292666265, "grad_norm": 0.34375, "learning_rate": 0.0001906502192146203, "lm_loss": 2.2998, "loss": 2.5121, "mask_loss": 0.1694, "step": 392, "topk_loss": 0.0429 }, { "epoch": 0.15620128801475566, "grad_norm": 0.1865234375, "learning_rate": 0.00019059633693022104, "lm_loss": 2.387, "loss": 2.5585, "mask_loss": 0.1538, "step": 393, "topk_loss": 0.0177 }, { "epoch": 0.1565987467628848, "grad_norm": 0.16015625, "learning_rate": 0.00019054230749032894, "lm_loss": 2.3027, "loss": 2.4735, "mask_loss": 0.154, "step": 394, "topk_loss": 0.0168 }, { "epoch": 0.15699620551101395, "grad_norm": 0.1650390625, "learning_rate": 0.00019048813098270387, "lm_loss": 2.299, "loss": 2.4641, "mask_loss": 0.1508, "step": 395, "topk_loss": 0.0143 }, { "epoch": 0.1573936642591431, "grad_norm": 0.1767578125, "learning_rate": 0.00019043380749534473, "lm_loss": 2.3667, "loss": 2.5338, "mask_loss": 0.1512, "step": 396, "topk_loss": 0.0159 }, { "epoch": 0.15779112300727224, "grad_norm": 0.1669921875, "learning_rate": 0.000190379337116489, "lm_loss": 2.2272, "loss": 2.3972, "mask_loss": 0.1545, "step": 397, "topk_loss": 0.0155 }, { "epoch": 0.1581885817554014, "grad_norm": 0.166015625, "learning_rate": 0.0001903247199346129, "lm_loss": 2.3447, "loss": 2.5147, "mask_loss": 0.1537, "step": 398, "topk_loss": 0.0164 }, { "epoch": 0.15858604050353056, "grad_norm": 0.17578125, "learning_rate": 0.00019026995603843097, "lm_loss": 2.2948, "loss": 2.4652, "mask_loss": 0.1534, "step": 399, "topk_loss": 0.0169 }, { "epoch": 0.1589834992516597, "grad_norm": 0.162109375, "learning_rate": 0.0001902150455168962, "lm_loss": 2.3454, "loss": 2.5136, "mask_loss": 0.1525, "step": 400, "topk_loss": 0.0157 }, { "epoch": 0.1589834992516597, "eval_lm_loss": 713.7522583007812, "eval_loss": 713.9226684570312, "eval_mask_hit_rate": 0.4711623787879944, "eval_mask_loss": 0.15088751912117004, "eval_mask_top_10_hit_rate": 0.964043378829956, "eval_mask_top_1_hit_rate": 0.9903273582458496, "eval_mask_top_20_hit_rate": 0.9471466541290283, "eval_mask_top_5_hit_rate": 0.9748400449752808, "eval_runtime": 148.3333, "eval_samples_per_second": 13.807, "eval_steps_per_second": 6.903, "eval_token_accuracy": 0.5702993869781494, "eval_top_k_diff": -536.0791015625, "eval_topk_loss": 0.019542653113603592, "step": 400 }, { "epoch": 0.15938095799978885, "grad_norm": 0.173828125, "learning_rate": 0.0001901599884591996, "lm_loss": 2.3265, "loss": 2.4894, "mask_loss": 0.1491, "step": 401, "topk_loss": 0.0138 }, { "epoch": 0.159778416747918, "grad_norm": 0.177734375, "learning_rate": 0.0001901047849547703, "lm_loss": 2.3177, "loss": 2.4838, "mask_loss": 0.1497, "step": 402, "topk_loss": 0.0164 }, { "epoch": 0.16017587549604714, "grad_norm": 0.16796875, "learning_rate": 0.00019004943509327523, "lm_loss": 2.3049, "loss": 2.4731, "mask_loss": 0.1531, "step": 403, "topk_loss": 0.0151 }, { "epoch": 0.1605733342441763, "grad_norm": 0.1904296875, "learning_rate": 0.00018999393896461917, "lm_loss": 2.2954, "loss": 2.4605, "mask_loss": 0.151, "step": 404, "topk_loss": 0.014 }, { "epoch": 0.16097079299230543, "grad_norm": 0.2109375, "learning_rate": 0.0001899382966589443, "lm_loss": 2.3559, "loss": 2.5239, "mask_loss": 0.1516, "step": 405, "topk_loss": 0.0163 }, { "epoch": 0.1613682517404346, "grad_norm": 0.16796875, "learning_rate": 0.0001898825082666304, "lm_loss": 2.3218, "loss": 2.4847, "mask_loss": 0.1495, "step": 406, "topk_loss": 0.0134 }, { "epoch": 0.16176571048856375, "grad_norm": 0.244140625, "learning_rate": 0.00018982657387829445, "lm_loss": 2.2732, "loss": 2.4401, "mask_loss": 0.1522, "step": 407, "topk_loss": 0.0146 }, { "epoch": 0.16216316923669288, "grad_norm": 0.1611328125, "learning_rate": 0.00018977049358479057, "lm_loss": 2.2258, "loss": 2.3913, "mask_loss": 0.1505, "step": 408, "topk_loss": 0.015 }, { "epoch": 0.16256062798482204, "grad_norm": 0.185546875, "learning_rate": 0.00018971426747720993, "lm_loss": 2.3396, "loss": 2.5083, "mask_loss": 0.1517, "step": 409, "topk_loss": 0.017 }, { "epoch": 0.1629580867329512, "grad_norm": 0.1669921875, "learning_rate": 0.0001896578956468805, "lm_loss": 2.3099, "loss": 2.4746, "mask_loss": 0.1501, "step": 410, "topk_loss": 0.0146 }, { "epoch": 0.16335554548108033, "grad_norm": 0.16796875, "learning_rate": 0.00018960137818536694, "lm_loss": 2.3023, "loss": 2.471, "mask_loss": 0.1535, "step": 411, "topk_loss": 0.0152 }, { "epoch": 0.1637530042292095, "grad_norm": 0.15625, "learning_rate": 0.00018954471518447052, "lm_loss": 2.3396, "loss": 2.5091, "mask_loss": 0.1522, "step": 412, "topk_loss": 0.0173 }, { "epoch": 0.16415046297733865, "grad_norm": 0.1923828125, "learning_rate": 0.00018948790673622884, "lm_loss": 2.2395, "loss": 2.4175, "mask_loss": 0.1569, "step": 413, "topk_loss": 0.0211 }, { "epoch": 0.16454792172546778, "grad_norm": 0.1962890625, "learning_rate": 0.00018943095293291572, "lm_loss": 2.217, "loss": 2.3831, "mask_loss": 0.1516, "step": 414, "topk_loss": 0.0145 }, { "epoch": 0.16494538047359694, "grad_norm": 0.173828125, "learning_rate": 0.00018937385386704126, "lm_loss": 2.3289, "loss": 2.5003, "mask_loss": 0.1517, "step": 415, "topk_loss": 0.0196 }, { "epoch": 0.1653428392217261, "grad_norm": 0.1494140625, "learning_rate": 0.00018931660963135126, "lm_loss": 2.2105, "loss": 2.376, "mask_loss": 0.1511, "step": 416, "topk_loss": 0.0144 }, { "epoch": 0.16574029796985523, "grad_norm": 0.189453125, "learning_rate": 0.00018925922031882758, "lm_loss": 2.2829, "loss": 2.4448, "mask_loss": 0.1483, "step": 417, "topk_loss": 0.0136 }, { "epoch": 0.1661377567179844, "grad_norm": 0.1689453125, "learning_rate": 0.00018920168602268748, "lm_loss": 2.2297, "loss": 2.3947, "mask_loss": 0.1514, "step": 418, "topk_loss": 0.0136 }, { "epoch": 0.16653521546611352, "grad_norm": 0.169921875, "learning_rate": 0.00018914400683638384, "lm_loss": 2.3595, "loss": 2.5301, "mask_loss": 0.1528, "step": 419, "topk_loss": 0.0178 }, { "epoch": 0.16693267421424268, "grad_norm": 0.1689453125, "learning_rate": 0.00018908618285360494, "lm_loss": 2.2403, "loss": 2.4043, "mask_loss": 0.1505, "step": 420, "topk_loss": 0.0135 }, { "epoch": 0.16733013296237184, "grad_norm": 0.1650390625, "learning_rate": 0.00018902821416827412, "lm_loss": 2.2461, "loss": 2.4134, "mask_loss": 0.1519, "step": 421, "topk_loss": 0.0154 }, { "epoch": 0.16772759171050097, "grad_norm": 0.16796875, "learning_rate": 0.00018897010087454987, "lm_loss": 2.2979, "loss": 2.4568, "mask_loss": 0.1468, "step": 422, "topk_loss": 0.0121 }, { "epoch": 0.16812505045863013, "grad_norm": 0.15234375, "learning_rate": 0.00018891184306682552, "lm_loss": 2.205, "loss": 2.3694, "mask_loss": 0.1502, "step": 423, "topk_loss": 0.0142 }, { "epoch": 0.1685225092067593, "grad_norm": 0.2138671875, "learning_rate": 0.00018885344083972914, "lm_loss": 2.2649, "loss": 2.4345, "mask_loss": 0.153, "step": 424, "topk_loss": 0.0167 }, { "epoch": 0.16891996795488842, "grad_norm": 0.396484375, "learning_rate": 0.00018879489428812334, "lm_loss": 2.2908, "loss": 2.5763, "mask_loss": 0.2318, "step": 425, "topk_loss": 0.0537 }, { "epoch": 0.16931742670301758, "grad_norm": 0.212890625, "learning_rate": 0.00018873620350710527, "lm_loss": 2.2308, "loss": 2.3953, "mask_loss": 0.1496, "step": 426, "topk_loss": 0.0149 }, { "epoch": 0.16971488545114674, "grad_norm": 0.2314453125, "learning_rate": 0.0001886773685920062, "lm_loss": 2.2539, "loss": 2.4188, "mask_loss": 0.1513, "step": 427, "topk_loss": 0.0136 }, { "epoch": 0.17011234419927587, "grad_norm": 0.16796875, "learning_rate": 0.00018861838963839164, "lm_loss": 2.1987, "loss": 2.3669, "mask_loss": 0.1523, "step": 428, "topk_loss": 0.0158 }, { "epoch": 0.17050980294740503, "grad_norm": 0.1708984375, "learning_rate": 0.00018855926674206098, "lm_loss": 2.2432, "loss": 2.4087, "mask_loss": 0.1501, "step": 429, "topk_loss": 0.0154 }, { "epoch": 0.1709072616955342, "grad_norm": 0.3046875, "learning_rate": 0.0001884999999990475, "lm_loss": 2.2507, "loss": 2.4245, "mask_loss": 0.1537, "step": 430, "topk_loss": 0.0202 }, { "epoch": 0.17130472044366332, "grad_norm": 0.1630859375, "learning_rate": 0.00018844058950561805, "lm_loss": 2.2252, "loss": 2.3899, "mask_loss": 0.1511, "step": 431, "topk_loss": 0.0136 }, { "epoch": 0.17170217919179248, "grad_norm": 0.234375, "learning_rate": 0.00018838103535827297, "lm_loss": 2.2232, "loss": 2.3882, "mask_loss": 0.1487, "step": 432, "topk_loss": 0.0163 }, { "epoch": 0.1720996379399216, "grad_norm": 0.189453125, "learning_rate": 0.00018832133765374606, "lm_loss": 2.2149, "loss": 2.3849, "mask_loss": 0.1534, "step": 433, "topk_loss": 0.0166 }, { "epoch": 0.17249709668805077, "grad_norm": 0.1572265625, "learning_rate": 0.00018826149648900416, "lm_loss": 2.2296, "loss": 2.3944, "mask_loss": 0.1496, "step": 434, "topk_loss": 0.0152 }, { "epoch": 0.17289455543617993, "grad_norm": 0.158203125, "learning_rate": 0.00018820151196124717, "lm_loss": 2.2515, "loss": 2.418, "mask_loss": 0.1514, "step": 435, "topk_loss": 0.0151 }, { "epoch": 0.17329201418430906, "grad_norm": 0.16796875, "learning_rate": 0.00018814138416790787, "lm_loss": 2.2328, "loss": 2.3971, "mask_loss": 0.1501, "step": 436, "topk_loss": 0.0142 }, { "epoch": 0.17368947293243822, "grad_norm": 0.1640625, "learning_rate": 0.00018808111320665173, "lm_loss": 2.2423, "loss": 2.4073, "mask_loss": 0.1507, "step": 437, "topk_loss": 0.0144 }, { "epoch": 0.17408693168056738, "grad_norm": 0.169921875, "learning_rate": 0.00018802069917537686, "lm_loss": 2.241, "loss": 2.4033, "mask_loss": 0.1486, "step": 438, "topk_loss": 0.0136 }, { "epoch": 0.1744843904286965, "grad_norm": 0.1669921875, "learning_rate": 0.0001879601421722136, "lm_loss": 2.1755, "loss": 2.3374, "mask_loss": 0.1476, "step": 439, "topk_loss": 0.0143 }, { "epoch": 0.17488184917682567, "grad_norm": 0.166015625, "learning_rate": 0.0001878994422955246, "lm_loss": 2.2717, "loss": 2.4436, "mask_loss": 0.1528, "step": 440, "topk_loss": 0.0191 }, { "epoch": 0.17527930792495483, "grad_norm": 0.1708984375, "learning_rate": 0.00018783859964390464, "lm_loss": 2.2874, "loss": 2.4501, "mask_loss": 0.1482, "step": 441, "topk_loss": 0.0146 }, { "epoch": 0.17567676667308396, "grad_norm": 0.1669921875, "learning_rate": 0.0001877776143161803, "lm_loss": 2.2562, "loss": 2.4221, "mask_loss": 0.1501, "step": 442, "topk_loss": 0.0158 }, { "epoch": 0.17607422542121312, "grad_norm": 0.1806640625, "learning_rate": 0.00018771648641140995, "lm_loss": 2.2138, "loss": 2.3753, "mask_loss": 0.1495, "step": 443, "topk_loss": 0.012 }, { "epoch": 0.17647168416934228, "grad_norm": 0.1650390625, "learning_rate": 0.0001876552160288836, "lm_loss": 2.2031, "loss": 2.3683, "mask_loss": 0.15, "step": 444, "topk_loss": 0.0152 }, { "epoch": 0.1768691429174714, "grad_norm": 0.1875, "learning_rate": 0.00018759380326812257, "lm_loss": 2.2839, "loss": 2.4543, "mask_loss": 0.1508, "step": 445, "topk_loss": 0.0196 }, { "epoch": 0.17726660166560057, "grad_norm": 0.1552734375, "learning_rate": 0.00018753224822887958, "lm_loss": 2.2664, "loss": 2.4313, "mask_loss": 0.1495, "step": 446, "topk_loss": 0.0155 }, { "epoch": 0.1776640604137297, "grad_norm": 0.18359375, "learning_rate": 0.00018747055101113832, "lm_loss": 2.2679, "loss": 2.4332, "mask_loss": 0.1499, "step": 447, "topk_loss": 0.0154 }, { "epoch": 0.17806151916185886, "grad_norm": 0.17578125, "learning_rate": 0.00018740871171511357, "lm_loss": 2.255, "loss": 2.4222, "mask_loss": 0.1507, "step": 448, "topk_loss": 0.0166 }, { "epoch": 0.17845897790998802, "grad_norm": 0.1630859375, "learning_rate": 0.00018734673044125072, "lm_loss": 2.2597, "loss": 2.4192, "mask_loss": 0.1447, "step": 449, "topk_loss": 0.0148 }, { "epoch": 0.17885643665811715, "grad_norm": 0.1728515625, "learning_rate": 0.0001872846072902259, "lm_loss": 2.201, "loss": 2.3614, "mask_loss": 0.1471, "step": 450, "topk_loss": 0.0133 }, { "epoch": 0.17885643665811715, "eval_lm_loss": 708.869384765625, "eval_loss": 709.0347900390625, "eval_mask_hit_rate": 0.4796638786792755, "eval_mask_loss": 0.1468515545129776, "eval_mask_top_10_hit_rate": 0.9682042598724365, "eval_mask_top_1_hit_rate": 0.9918677806854248, "eval_mask_top_20_hit_rate": 0.9525398015975952, "eval_mask_top_5_hit_rate": 0.978020429611206, "eval_runtime": 143.8801, "eval_samples_per_second": 14.234, "eval_steps_per_second": 7.117, "eval_token_accuracy": 0.5777865052223206, "eval_top_k_diff": -531.690185546875, "eval_topk_loss": 0.018510261550545692, "step": 450 }, { "epoch": 0.1792538954062463, "grad_norm": 0.16015625, "learning_rate": 0.00018722234236294568, "lm_loss": 2.2775, "loss": 2.4381, "mask_loss": 0.1468, "step": 451, "topk_loss": 0.0138 }, { "epoch": 0.17965135415437547, "grad_norm": 0.1640625, "learning_rate": 0.00018715993576054685, "lm_loss": 2.1349, "loss": 2.3018, "mask_loss": 0.1513, "step": 452, "topk_loss": 0.0157 }, { "epoch": 0.1800488129025046, "grad_norm": 0.17578125, "learning_rate": 0.00018709738758439635, "lm_loss": 2.2279, "loss": 2.3951, "mask_loss": 0.1484, "step": 453, "topk_loss": 0.0189 }, { "epoch": 0.18044627165063376, "grad_norm": 0.154296875, "learning_rate": 0.00018703469793609112, "lm_loss": 2.1836, "loss": 2.344, "mask_loss": 0.1476, "step": 454, "topk_loss": 0.0128 }, { "epoch": 0.18084373039876292, "grad_norm": 0.1826171875, "learning_rate": 0.00018697186691745782, "lm_loss": 2.2809, "loss": 2.4444, "mask_loss": 0.148, "step": 455, "topk_loss": 0.0155 }, { "epoch": 0.18124118914689205, "grad_norm": 0.1806640625, "learning_rate": 0.00018690889463055283, "lm_loss": 2.2363, "loss": 2.4038, "mask_loss": 0.151, "step": 456, "topk_loss": 0.0166 }, { "epoch": 0.1816386478950212, "grad_norm": 0.1748046875, "learning_rate": 0.0001868457811776619, "lm_loss": 2.2923, "loss": 2.4541, "mask_loss": 0.1475, "step": 457, "topk_loss": 0.0143 }, { "epoch": 0.18203610664315037, "grad_norm": 0.189453125, "learning_rate": 0.00018678252666130013, "lm_loss": 2.2225, "loss": 2.386, "mask_loss": 0.1488, "step": 458, "topk_loss": 0.0147 }, { "epoch": 0.1824335653912795, "grad_norm": 0.1748046875, "learning_rate": 0.00018671913118421175, "lm_loss": 2.1708, "loss": 2.3386, "mask_loss": 0.1522, "step": 459, "topk_loss": 0.0155 }, { "epoch": 0.18283102413940866, "grad_norm": 0.1640625, "learning_rate": 0.0001866555948493699, "lm_loss": 2.2273, "loss": 2.3916, "mask_loss": 0.1488, "step": 460, "topk_loss": 0.0155 }, { "epoch": 0.1832284828875378, "grad_norm": 0.1650390625, "learning_rate": 0.0001865919177599766, "lm_loss": 2.2133, "loss": 2.3763, "mask_loss": 0.1481, "step": 461, "topk_loss": 0.0149 }, { "epoch": 0.18362594163566695, "grad_norm": 0.251953125, "learning_rate": 0.00018652810001946243, "lm_loss": 2.2823, "loss": 2.4524, "mask_loss": 0.149, "step": 462, "topk_loss": 0.0211 }, { "epoch": 0.1840234003837961, "grad_norm": 0.2177734375, "learning_rate": 0.00018646414173148642, "lm_loss": 2.1885, "loss": 2.3479, "mask_loss": 0.1469, "step": 463, "topk_loss": 0.0125 }, { "epoch": 0.18442085913192524, "grad_norm": 0.166015625, "learning_rate": 0.00018640004299993597, "lm_loss": 2.1878, "loss": 2.3487, "mask_loss": 0.1478, "step": 464, "topk_loss": 0.0131 }, { "epoch": 0.1848183178800544, "grad_norm": 0.1708984375, "learning_rate": 0.00018633580392892648, "lm_loss": 2.1575, "loss": 2.319, "mask_loss": 0.1474, "step": 465, "topk_loss": 0.0141 }, { "epoch": 0.18521577662818356, "grad_norm": 0.169921875, "learning_rate": 0.00018627142462280144, "lm_loss": 2.2481, "loss": 2.4132, "mask_loss": 0.1479, "step": 466, "topk_loss": 0.0172 }, { "epoch": 0.1856132353763127, "grad_norm": 0.1875, "learning_rate": 0.00018620690518613206, "lm_loss": 2.2096, "loss": 2.3743, "mask_loss": 0.148, "step": 467, "topk_loss": 0.0168 }, { "epoch": 0.18601069412444185, "grad_norm": 0.177734375, "learning_rate": 0.00018614224572371715, "lm_loss": 2.2322, "loss": 2.3952, "mask_loss": 0.1483, "step": 468, "topk_loss": 0.0148 }, { "epoch": 0.186408152872571, "grad_norm": 0.2041015625, "learning_rate": 0.00018607744634058294, "lm_loss": 2.1991, "loss": 2.3597, "mask_loss": 0.1448, "step": 469, "topk_loss": 0.0159 }, { "epoch": 0.18680561162070014, "grad_norm": 0.162109375, "learning_rate": 0.00018601250714198302, "lm_loss": 2.1219, "loss": 2.282, "mask_loss": 0.147, "step": 470, "topk_loss": 0.0131 }, { "epoch": 0.1872030703688293, "grad_norm": 0.15625, "learning_rate": 0.00018594742823339802, "lm_loss": 2.1696, "loss": 2.331, "mask_loss": 0.1484, "step": 471, "topk_loss": 0.013 }, { "epoch": 0.18760052911695846, "grad_norm": 0.1708984375, "learning_rate": 0.0001858822097205355, "lm_loss": 2.2675, "loss": 2.4326, "mask_loss": 0.1487, "step": 472, "topk_loss": 0.0163 }, { "epoch": 0.1879979878650876, "grad_norm": 0.1640625, "learning_rate": 0.0001858168517093298, "lm_loss": 2.2102, "loss": 2.3716, "mask_loss": 0.1471, "step": 473, "topk_loss": 0.0143 }, { "epoch": 0.18839544661321675, "grad_norm": 0.1650390625, "learning_rate": 0.00018575135430594185, "lm_loss": 2.2098, "loss": 2.3708, "mask_loss": 0.1452, "step": 474, "topk_loss": 0.0158 }, { "epoch": 0.1887929053613459, "grad_norm": 0.173828125, "learning_rate": 0.00018568571761675893, "lm_loss": 2.2549, "loss": 2.415, "mask_loss": 0.1457, "step": 475, "topk_loss": 0.0144 }, { "epoch": 0.18919036410947504, "grad_norm": 0.1552734375, "learning_rate": 0.00018561994174839467, "lm_loss": 2.1715, "loss": 2.3349, "mask_loss": 0.1468, "step": 476, "topk_loss": 0.0166 }, { "epoch": 0.1895878228576042, "grad_norm": 0.1630859375, "learning_rate": 0.0001855540268076887, "lm_loss": 2.2258, "loss": 2.3837, "mask_loss": 0.1443, "step": 477, "topk_loss": 0.0136 }, { "epoch": 0.18998528160573333, "grad_norm": 0.1533203125, "learning_rate": 0.0001854879729017066, "lm_loss": 2.1926, "loss": 2.3549, "mask_loss": 0.1479, "step": 478, "topk_loss": 0.0144 }, { "epoch": 0.1903827403538625, "grad_norm": 0.1630859375, "learning_rate": 0.00018542178013773955, "lm_loss": 2.1665, "loss": 2.3269, "mask_loss": 0.1462, "step": 479, "topk_loss": 0.0142 }, { "epoch": 0.19078019910199165, "grad_norm": 0.1767578125, "learning_rate": 0.00018535544862330436, "lm_loss": 2.2233, "loss": 2.3878, "mask_loss": 0.1466, "step": 480, "topk_loss": 0.0178 }, { "epoch": 0.19117765785012078, "grad_norm": 0.23828125, "learning_rate": 0.0001852889784661433, "lm_loss": 2.167, "loss": 2.3412, "mask_loss": 0.1538, "step": 481, "topk_loss": 0.0204 }, { "epoch": 0.19157511659824994, "grad_norm": 0.1787109375, "learning_rate": 0.00018522236977422363, "lm_loss": 2.1695, "loss": 2.328, "mask_loss": 0.1447, "step": 482, "topk_loss": 0.0138 }, { "epoch": 0.1919725753463791, "grad_norm": 0.2216796875, "learning_rate": 0.00018515562265573784, "lm_loss": 2.181, "loss": 2.3646, "mask_loss": 0.1569, "step": 483, "topk_loss": 0.0268 }, { "epoch": 0.19237003409450823, "grad_norm": 0.25390625, "learning_rate": 0.00018508873721910315, "lm_loss": 2.231, "loss": 2.3881, "mask_loss": 0.1436, "step": 484, "topk_loss": 0.0134 }, { "epoch": 0.1927674928426374, "grad_norm": 0.166015625, "learning_rate": 0.00018502171357296144, "lm_loss": 2.2171, "loss": 2.3809, "mask_loss": 0.1497, "step": 485, "topk_loss": 0.0141 }, { "epoch": 0.19316495159076655, "grad_norm": 0.2294921875, "learning_rate": 0.00018495455182617913, "lm_loss": 2.2345, "loss": 2.3997, "mask_loss": 0.1487, "step": 486, "topk_loss": 0.0165 }, { "epoch": 0.19356241033889568, "grad_norm": 0.216796875, "learning_rate": 0.00018488725208784694, "lm_loss": 2.2348, "loss": 2.3944, "mask_loss": 0.1462, "step": 487, "topk_loss": 0.0133 }, { "epoch": 0.19395986908702484, "grad_norm": 0.150390625, "learning_rate": 0.00018481981446727977, "lm_loss": 2.1731, "loss": 2.3324, "mask_loss": 0.1465, "step": 488, "topk_loss": 0.0128 }, { "epoch": 0.194357327835154, "grad_norm": 0.1875, "learning_rate": 0.00018475223907401638, "lm_loss": 2.1905, "loss": 2.3522, "mask_loss": 0.146, "step": 489, "topk_loss": 0.0157 }, { "epoch": 0.19475478658328313, "grad_norm": 0.1630859375, "learning_rate": 0.00018468452601781948, "lm_loss": 2.1728, "loss": 2.3308, "mask_loss": 0.1447, "step": 490, "topk_loss": 0.0133 }, { "epoch": 0.1951522453314123, "grad_norm": 0.2021484375, "learning_rate": 0.0001846166754086752, "lm_loss": 2.197, "loss": 2.357, "mask_loss": 0.1457, "step": 491, "topk_loss": 0.0143 }, { "epoch": 0.19554970407954142, "grad_norm": 0.2275390625, "learning_rate": 0.0001845486873567932, "lm_loss": 2.1466, "loss": 2.3035, "mask_loss": 0.1441, "step": 492, "topk_loss": 0.0128 }, { "epoch": 0.19594716282767058, "grad_norm": 0.1796875, "learning_rate": 0.00018448056197260636, "lm_loss": 2.1935, "loss": 2.3586, "mask_loss": 0.149, "step": 493, "topk_loss": 0.0161 }, { "epoch": 0.19634462157579974, "grad_norm": 0.1552734375, "learning_rate": 0.00018441229936677064, "lm_loss": 2.149, "loss": 2.3075, "mask_loss": 0.1457, "step": 494, "topk_loss": 0.0129 }, { "epoch": 0.19674208032392887, "grad_norm": 0.1611328125, "learning_rate": 0.00018434389965016495, "lm_loss": 2.1451, "loss": 2.3038, "mask_loss": 0.1457, "step": 495, "topk_loss": 0.0129 }, { "epoch": 0.19713953907205803, "grad_norm": 0.1904296875, "learning_rate": 0.00018427536293389075, "lm_loss": 2.1242, "loss": 2.2799, "mask_loss": 0.144, "step": 496, "topk_loss": 0.0118 }, { "epoch": 0.1975369978201872, "grad_norm": 0.17578125, "learning_rate": 0.0001842066893292722, "lm_loss": 2.206, "loss": 2.3713, "mask_loss": 0.1482, "step": 497, "topk_loss": 0.0172 }, { "epoch": 0.19793445656831632, "grad_norm": 0.1494140625, "learning_rate": 0.0001841378789478557, "lm_loss": 2.1557, "loss": 2.3138, "mask_loss": 0.1442, "step": 498, "topk_loss": 0.014 }, { "epoch": 0.19833191531644548, "grad_norm": 0.197265625, "learning_rate": 0.0001840689319014098, "lm_loss": 2.1505, "loss": 2.3097, "mask_loss": 0.1457, "step": 499, "topk_loss": 0.0135 }, { "epoch": 0.19872937406457464, "grad_norm": 0.1650390625, "learning_rate": 0.00018399984830192522, "lm_loss": 2.199, "loss": 2.3564, "mask_loss": 0.1445, "step": 500, "topk_loss": 0.0129 }, { "epoch": 0.19872937406457464, "eval_lm_loss": 705.312744140625, "eval_loss": 705.47412109375, "eval_mask_hit_rate": 0.4869436025619507, "eval_mask_loss": 0.1438208818435669, "eval_mask_top_10_hit_rate": 0.9714464545249939, "eval_mask_top_1_hit_rate": 0.9930768013000488, "eval_mask_top_20_hit_rate": 0.9567796587944031, "eval_mask_top_5_hit_rate": 0.9805311560630798, "eval_runtime": 144.86, "eval_samples_per_second": 14.138, "eval_steps_per_second": 7.069, "eval_token_accuracy": 0.5837854146957397, "eval_top_k_diff": -524.4144287109375, "eval_topk_loss": 0.017527751624584198, "step": 500 }, { "epoch": 0.19912683281270377, "grad_norm": 0.1669921875, "learning_rate": 0.00018393062826161418, "lm_loss": 2.174, "loss": 2.3281, "mask_loss": 0.1425, "step": 501, "topk_loss": 0.0116 }, { "epoch": 0.19952429156083293, "grad_norm": 0.1875, "learning_rate": 0.00018386127189291084, "lm_loss": 2.1827, "loss": 2.3404, "mask_loss": 0.1451, "step": 502, "topk_loss": 0.0126 }, { "epoch": 0.1999217503089621, "grad_norm": 0.1591796875, "learning_rate": 0.0001837917793084705, "lm_loss": 2.2056, "loss": 2.363, "mask_loss": 0.1436, "step": 503, "topk_loss": 0.0138 }, { "epoch": 0.20031920905709122, "grad_norm": 0.18359375, "learning_rate": 0.00018372215062116998, "lm_loss": 2.1284, "loss": 2.2903, "mask_loss": 0.1454, "step": 504, "topk_loss": 0.0165 }, { "epoch": 0.20071666780522038, "grad_norm": 0.1591796875, "learning_rate": 0.00018365238594410696, "lm_loss": 2.0537, "loss": 2.2151, "mask_loss": 0.1473, "step": 505, "topk_loss": 0.0141 }, { "epoch": 0.2011141265533495, "grad_norm": 0.166015625, "learning_rate": 0.00018358248539060017, "lm_loss": 2.164, "loss": 2.3251, "mask_loss": 0.1462, "step": 506, "topk_loss": 0.0149 }, { "epoch": 0.20151158530147867, "grad_norm": 0.1708984375, "learning_rate": 0.00018351244907418893, "lm_loss": 2.1705, "loss": 2.3272, "mask_loss": 0.1424, "step": 507, "topk_loss": 0.0142 }, { "epoch": 0.20190904404960783, "grad_norm": 0.1669921875, "learning_rate": 0.00018344227710863316, "lm_loss": 2.1265, "loss": 2.2824, "mask_loss": 0.1426, "step": 508, "topk_loss": 0.0133 }, { "epoch": 0.20230650279773696, "grad_norm": 0.1630859375, "learning_rate": 0.00018337196960791302, "lm_loss": 2.109, "loss": 2.2652, "mask_loss": 0.1434, "step": 509, "topk_loss": 0.0128 }, { "epoch": 0.20270396154586612, "grad_norm": 0.1982421875, "learning_rate": 0.00018330152668622892, "lm_loss": 2.1532, "loss": 2.3208, "mask_loss": 0.1476, "step": 510, "topk_loss": 0.0201 }, { "epoch": 0.20310142029399528, "grad_norm": 0.158203125, "learning_rate": 0.00018323094845800123, "lm_loss": 2.1261, "loss": 2.2814, "mask_loss": 0.1429, "step": 511, "topk_loss": 0.0123 }, { "epoch": 0.2034988790421244, "grad_norm": 0.1787109375, "learning_rate": 0.00018316023503786997, "lm_loss": 2.1097, "loss": 2.2726, "mask_loss": 0.1463, "step": 512, "topk_loss": 0.0165 }, { "epoch": 0.20389633779025357, "grad_norm": 0.1591796875, "learning_rate": 0.00018308938654069487, "lm_loss": 2.2056, "loss": 2.3602, "mask_loss": 0.1418, "step": 513, "topk_loss": 0.0128 }, { "epoch": 0.20429379653838273, "grad_norm": 0.1513671875, "learning_rate": 0.00018301840308155507, "lm_loss": 2.1918, "loss": 2.3471, "mask_loss": 0.1432, "step": 514, "topk_loss": 0.0121 }, { "epoch": 0.20469125528651186, "grad_norm": 0.181640625, "learning_rate": 0.00018294728477574886, "lm_loss": 2.2104, "loss": 2.365, "mask_loss": 0.1416, "step": 515, "topk_loss": 0.013 }, { "epoch": 0.20508871403464102, "grad_norm": 0.15625, "learning_rate": 0.00018287603173879364, "lm_loss": 2.192, "loss": 2.3463, "mask_loss": 0.1417, "step": 516, "topk_loss": 0.0125 }, { "epoch": 0.20548617278277018, "grad_norm": 0.15625, "learning_rate": 0.00018280464408642556, "lm_loss": 2.1694, "loss": 2.3299, "mask_loss": 0.1452, "step": 517, "topk_loss": 0.0153 }, { "epoch": 0.2058836315308993, "grad_norm": 0.171875, "learning_rate": 0.00018273312193459952, "lm_loss": 2.2149, "loss": 2.3693, "mask_loss": 0.1418, "step": 518, "topk_loss": 0.0125 }, { "epoch": 0.20628109027902847, "grad_norm": 0.169921875, "learning_rate": 0.00018266146539948878, "lm_loss": 2.1073, "loss": 2.267, "mask_loss": 0.1466, "step": 519, "topk_loss": 0.013 }, { "epoch": 0.2066785490271576, "grad_norm": 0.17578125, "learning_rate": 0.000182589674597485, "lm_loss": 2.1863, "loss": 2.3462, "mask_loss": 0.1452, "step": 520, "topk_loss": 0.0147 }, { "epoch": 0.20707600777528676, "grad_norm": 0.216796875, "learning_rate": 0.00018251774964519785, "lm_loss": 2.174, "loss": 2.3344, "mask_loss": 0.1461, "step": 521, "topk_loss": 0.0144 }, { "epoch": 0.20747346652341592, "grad_norm": 0.146484375, "learning_rate": 0.00018244569065945494, "lm_loss": 2.1521, "loss": 2.3099, "mask_loss": 0.144, "step": 522, "topk_loss": 0.0138 }, { "epoch": 0.20787092527154505, "grad_norm": 0.15625, "learning_rate": 0.00018237349775730152, "lm_loss": 2.1465, "loss": 2.3039, "mask_loss": 0.1437, "step": 523, "topk_loss": 0.0137 }, { "epoch": 0.2082683840196742, "grad_norm": 0.1435546875, "learning_rate": 0.00018230117105600047, "lm_loss": 2.1609, "loss": 2.3177, "mask_loss": 0.1445, "step": 524, "topk_loss": 0.0123 }, { "epoch": 0.20866584276780337, "grad_norm": 0.162109375, "learning_rate": 0.00018222871067303192, "lm_loss": 2.1812, "loss": 2.3414, "mask_loss": 0.1465, "step": 525, "topk_loss": 0.0136 }, { "epoch": 0.2090633015159325, "grad_norm": 0.1455078125, "learning_rate": 0.00018215611672609317, "lm_loss": 2.1737, "loss": 2.3348, "mask_loss": 0.1469, "step": 526, "topk_loss": 0.0143 }, { "epoch": 0.20946076026406166, "grad_norm": 0.25390625, "learning_rate": 0.00018208338933309843, "lm_loss": 2.1444, "loss": 2.316, "mask_loss": 0.1503, "step": 527, "topk_loss": 0.0213 }, { "epoch": 0.20985821901219082, "grad_norm": 0.203125, "learning_rate": 0.0001820105286121787, "lm_loss": 2.0979, "loss": 2.2516, "mask_loss": 0.1413, "step": 528, "topk_loss": 0.0124 }, { "epoch": 0.21025567776031995, "grad_norm": 0.2294921875, "learning_rate": 0.00018193753468168154, "lm_loss": 2.0986, "loss": 2.2581, "mask_loss": 0.1445, "step": 529, "topk_loss": 0.015 }, { "epoch": 0.2106531365084491, "grad_norm": 0.162109375, "learning_rate": 0.0001818644076601709, "lm_loss": 2.2358, "loss": 2.3915, "mask_loss": 0.1417, "step": 530, "topk_loss": 0.014 }, { "epoch": 0.21105059525657827, "grad_norm": 0.1787109375, "learning_rate": 0.0001817911476664269, "lm_loss": 2.1233, "loss": 2.2813, "mask_loss": 0.1459, "step": 531, "topk_loss": 0.0121 }, { "epoch": 0.2114480540047074, "grad_norm": 0.158203125, "learning_rate": 0.00018171775481944563, "lm_loss": 2.1246, "loss": 2.2851, "mask_loss": 0.1454, "step": 532, "topk_loss": 0.0151 }, { "epoch": 0.21184551275283656, "grad_norm": 0.1865234375, "learning_rate": 0.00018164422923843893, "lm_loss": 2.1321, "loss": 2.291, "mask_loss": 0.1458, "step": 533, "topk_loss": 0.0131 }, { "epoch": 0.2122429715009657, "grad_norm": 0.169921875, "learning_rate": 0.00018157057104283431, "lm_loss": 2.1354, "loss": 2.2932, "mask_loss": 0.1449, "step": 534, "topk_loss": 0.0129 }, { "epoch": 0.21264043024909485, "grad_norm": 0.1533203125, "learning_rate": 0.00018149678035227473, "lm_loss": 2.0942, "loss": 2.2499, "mask_loss": 0.1424, "step": 535, "topk_loss": 0.0132 }, { "epoch": 0.213037888997224, "grad_norm": 0.240234375, "learning_rate": 0.0001814228572866182, "lm_loss": 2.1389, "loss": 2.2964, "mask_loss": 0.1438, "step": 536, "topk_loss": 0.0137 }, { "epoch": 0.21343534774535314, "grad_norm": 0.1943359375, "learning_rate": 0.00018134880196593795, "lm_loss": 2.1755, "loss": 2.3365, "mask_loss": 0.1442, "step": 537, "topk_loss": 0.0167 }, { "epoch": 0.2138328064934823, "grad_norm": 0.2060546875, "learning_rate": 0.00018127461451052183, "lm_loss": 2.1824, "loss": 2.3344, "mask_loss": 0.141, "step": 538, "topk_loss": 0.011 }, { "epoch": 0.21423026524161146, "grad_norm": 0.181640625, "learning_rate": 0.00018120029504087247, "lm_loss": 2.1616, "loss": 2.3154, "mask_loss": 0.141, "step": 539, "topk_loss": 0.0128 }, { "epoch": 0.2146277239897406, "grad_norm": 0.154296875, "learning_rate": 0.00018112584367770685, "lm_loss": 2.1107, "loss": 2.2671, "mask_loss": 0.1428, "step": 540, "topk_loss": 0.0136 }, { "epoch": 0.21502518273786975, "grad_norm": 0.2197265625, "learning_rate": 0.00018105126054195617, "lm_loss": 2.1106, "loss": 2.2669, "mask_loss": 0.1431, "step": 541, "topk_loss": 0.0132 }, { "epoch": 0.2154226414859989, "grad_norm": 0.1689453125, "learning_rate": 0.00018097654575476577, "lm_loss": 2.1814, "loss": 2.3328, "mask_loss": 0.1399, "step": 542, "topk_loss": 0.0115 }, { "epoch": 0.21582010023412804, "grad_norm": 0.1533203125, "learning_rate": 0.00018090169943749476, "lm_loss": 2.1029, "loss": 2.2617, "mask_loss": 0.1445, "step": 543, "topk_loss": 0.0143 }, { "epoch": 0.2162175589822572, "grad_norm": 0.1884765625, "learning_rate": 0.00018082672171171584, "lm_loss": 2.1275, "loss": 2.2836, "mask_loss": 0.1421, "step": 544, "topk_loss": 0.014 }, { "epoch": 0.21661501773038636, "grad_norm": 0.2236328125, "learning_rate": 0.00018075161269921527, "lm_loss": 2.1511, "loss": 2.303, "mask_loss": 0.1411, "step": 545, "topk_loss": 0.0108 }, { "epoch": 0.2170124764785155, "grad_norm": 0.1943359375, "learning_rate": 0.0001806763725219925, "lm_loss": 2.1007, "loss": 2.2545, "mask_loss": 0.1413, "step": 546, "topk_loss": 0.0125 }, { "epoch": 0.21740993522664465, "grad_norm": 0.1728515625, "learning_rate": 0.00018060100130226002, "lm_loss": 2.1629, "loss": 2.3212, "mask_loss": 0.1444, "step": 547, "topk_loss": 0.0139 }, { "epoch": 0.21780739397477378, "grad_norm": 0.1923828125, "learning_rate": 0.0001805254991624432, "lm_loss": 2.1271, "loss": 2.2895, "mask_loss": 0.1456, "step": 548, "topk_loss": 0.0169 }, { "epoch": 0.21820485272290294, "grad_norm": 0.18359375, "learning_rate": 0.00018044986622518002, "lm_loss": 2.1949, "loss": 2.3501, "mask_loss": 0.141, "step": 549, "topk_loss": 0.0142 }, { "epoch": 0.2186023114710321, "grad_norm": 0.173828125, "learning_rate": 0.000180374102613321, "lm_loss": 2.1611, "loss": 2.321, "mask_loss": 0.1446, "step": 550, "topk_loss": 0.0153 }, { "epoch": 0.2186023114710321, "eval_lm_loss": 704.4793701171875, "eval_loss": 704.6373291015625, "eval_mask_hit_rate": 0.49326711893081665, "eval_mask_loss": 0.14139045774936676, "eval_mask_top_10_hit_rate": 0.9738826751708984, "eval_mask_top_1_hit_rate": 0.9939601421356201, "eval_mask_top_20_hit_rate": 0.959991455078125, "eval_mask_top_5_hit_rate": 0.9823801517486572, "eval_runtime": 143.9303, "eval_samples_per_second": 14.229, "eval_steps_per_second": 7.115, "eval_token_accuracy": 0.5886777639389038, "eval_top_k_diff": -542.8375244140625, "eval_topk_loss": 0.01653169095516205, "step": 550 }, { "epoch": 0.21899977021916123, "grad_norm": 0.17578125, "learning_rate": 0.00018029820844992883, "lm_loss": 2.1458, "loss": 2.3007, "mask_loss": 0.1428, "step": 551, "topk_loss": 0.0122 }, { "epoch": 0.2193972289672904, "grad_norm": 0.1708984375, "learning_rate": 0.00018022218385827828, "lm_loss": 2.1524, "loss": 2.308, "mask_loss": 0.1433, "step": 552, "topk_loss": 0.0124 }, { "epoch": 0.21979468771541955, "grad_norm": 0.1513671875, "learning_rate": 0.00018014602896185595, "lm_loss": 2.1102, "loss": 2.2644, "mask_loss": 0.1418, "step": 553, "topk_loss": 0.0124 }, { "epoch": 0.22019214646354868, "grad_norm": 0.1552734375, "learning_rate": 0.0001800697438843602, "lm_loss": 2.1335, "loss": 2.2837, "mask_loss": 0.1388, "step": 554, "topk_loss": 0.0114 }, { "epoch": 0.22058960521167784, "grad_norm": 0.181640625, "learning_rate": 0.0001799933287497007, "lm_loss": 2.1569, "loss": 2.3133, "mask_loss": 0.1417, "step": 555, "topk_loss": 0.0146 }, { "epoch": 0.220987063959807, "grad_norm": 0.1728515625, "learning_rate": 0.00017991678368199846, "lm_loss": 2.2034, "loss": 2.3606, "mask_loss": 0.1401, "step": 556, "topk_loss": 0.0171 }, { "epoch": 0.22138452270793613, "grad_norm": 0.14453125, "learning_rate": 0.00017984010880558554, "lm_loss": 2.129, "loss": 2.2839, "mask_loss": 0.1419, "step": 557, "topk_loss": 0.0131 }, { "epoch": 0.22178198145606529, "grad_norm": 0.2890625, "learning_rate": 0.00017976330424500478, "lm_loss": 2.0838, "loss": 2.244, "mask_loss": 0.1447, "step": 558, "topk_loss": 0.0155 }, { "epoch": 0.22217944020419444, "grad_norm": 0.1708984375, "learning_rate": 0.00017968637012500972, "lm_loss": 2.0653, "loss": 2.2237, "mask_loss": 0.1442, "step": 559, "topk_loss": 0.0142 }, { "epoch": 0.22257689895232358, "grad_norm": 0.1806640625, "learning_rate": 0.00017960930657056438, "lm_loss": 2.1554, "loss": 2.3135, "mask_loss": 0.1431, "step": 560, "topk_loss": 0.015 }, { "epoch": 0.22297435770045274, "grad_norm": 0.22265625, "learning_rate": 0.00017953211370684295, "lm_loss": 2.1087, "loss": 2.2637, "mask_loss": 0.1415, "step": 561, "topk_loss": 0.0136 }, { "epoch": 0.22337181644858187, "grad_norm": 0.1640625, "learning_rate": 0.00017945479165922966, "lm_loss": 2.141, "loss": 2.292, "mask_loss": 0.138, "step": 562, "topk_loss": 0.0129 }, { "epoch": 0.22376927519671103, "grad_norm": 0.181640625, "learning_rate": 0.0001793773405533186, "lm_loss": 2.1095, "loss": 2.2669, "mask_loss": 0.1423, "step": 563, "topk_loss": 0.015 }, { "epoch": 0.22416673394484019, "grad_norm": 0.58203125, "learning_rate": 0.0001792997605149135, "lm_loss": 2.1214, "loss": 2.3774, "mask_loss": 0.201, "step": 564, "topk_loss": 0.0549 }, { "epoch": 0.22456419269296932, "grad_norm": 0.2451171875, "learning_rate": 0.00017922205167002754, "lm_loss": 2.0831, "loss": 2.2377, "mask_loss": 0.1414, "step": 565, "topk_loss": 0.0133 }, { "epoch": 0.22496165144109848, "grad_norm": 0.228515625, "learning_rate": 0.00017914421414488298, "lm_loss": 2.1192, "loss": 2.2754, "mask_loss": 0.1429, "step": 566, "topk_loss": 0.0133 }, { "epoch": 0.22535911018922763, "grad_norm": 0.1728515625, "learning_rate": 0.00017906624806591126, "lm_loss": 2.1005, "loss": 2.2539, "mask_loss": 0.1405, "step": 567, "topk_loss": 0.0129 }, { "epoch": 0.22575656893735677, "grad_norm": 0.169921875, "learning_rate": 0.00017898815355975255, "lm_loss": 2.11, "loss": 2.2656, "mask_loss": 0.1429, "step": 568, "topk_loss": 0.0127 }, { "epoch": 0.22615402768548593, "grad_norm": 0.166015625, "learning_rate": 0.00017890993075325565, "lm_loss": 2.1541, "loss": 2.3053, "mask_loss": 0.14, "step": 569, "topk_loss": 0.0112 }, { "epoch": 0.22655148643361508, "grad_norm": 0.1630859375, "learning_rate": 0.00017883157977347774, "lm_loss": 2.0569, "loss": 2.2116, "mask_loss": 0.1428, "step": 570, "topk_loss": 0.0119 }, { "epoch": 0.22694894518174422, "grad_norm": 0.201171875, "learning_rate": 0.00017875310074768418, "lm_loss": 2.1304, "loss": 2.2855, "mask_loss": 0.1429, "step": 571, "topk_loss": 0.0123 }, { "epoch": 0.22734640392987338, "grad_norm": 0.15234375, "learning_rate": 0.00017867449380334834, "lm_loss": 2.1621, "loss": 2.318, "mask_loss": 0.1417, "step": 572, "topk_loss": 0.0142 }, { "epoch": 0.22774386267800253, "grad_norm": 0.1689453125, "learning_rate": 0.00017859575906815139, "lm_loss": 2.0412, "loss": 2.1965, "mask_loss": 0.1427, "step": 573, "topk_loss": 0.0126 }, { "epoch": 0.22814132142613167, "grad_norm": 0.197265625, "learning_rate": 0.00017851689666998198, "lm_loss": 2.0992, "loss": 2.2534, "mask_loss": 0.1416, "step": 574, "topk_loss": 0.0127 }, { "epoch": 0.22853878017426082, "grad_norm": 0.1640625, "learning_rate": 0.00017843790673693625, "lm_loss": 2.1353, "loss": 2.2892, "mask_loss": 0.1407, "step": 575, "topk_loss": 0.0132 }, { "epoch": 0.22893623892238996, "grad_norm": 0.1572265625, "learning_rate": 0.0001783587893973174, "lm_loss": 2.1064, "loss": 2.2618, "mask_loss": 0.141, "step": 576, "topk_loss": 0.0143 }, { "epoch": 0.22933369767051912, "grad_norm": 0.181640625, "learning_rate": 0.00017827954477963557, "lm_loss": 2.1688, "loss": 2.3245, "mask_loss": 0.1431, "step": 577, "topk_loss": 0.0125 }, { "epoch": 0.22973115641864827, "grad_norm": 0.2138671875, "learning_rate": 0.00017820017301260776, "lm_loss": 2.015, "loss": 2.1693, "mask_loss": 0.1418, "step": 578, "topk_loss": 0.0126 }, { "epoch": 0.2301286151667774, "grad_norm": 0.16015625, "learning_rate": 0.00017812067422515732, "lm_loss": 2.1014, "loss": 2.2561, "mask_loss": 0.1435, "step": 579, "topk_loss": 0.0113 }, { "epoch": 0.23052607391490657, "grad_norm": 0.140625, "learning_rate": 0.00017804104854641408, "lm_loss": 2.1788, "loss": 2.333, "mask_loss": 0.1413, "step": 580, "topk_loss": 0.0129 }, { "epoch": 0.23092353266303572, "grad_norm": 0.251953125, "learning_rate": 0.00017796129610571384, "lm_loss": 2.1014, "loss": 2.258, "mask_loss": 0.143, "step": 581, "topk_loss": 0.0136 }, { "epoch": 0.23132099141116486, "grad_norm": 0.21875, "learning_rate": 0.0001778814170325984, "lm_loss": 2.1588, "loss": 2.3118, "mask_loss": 0.1398, "step": 582, "topk_loss": 0.0132 }, { "epoch": 0.23171845015929401, "grad_norm": 0.1435546875, "learning_rate": 0.0001778014114568153, "lm_loss": 2.0827, "loss": 2.2349, "mask_loss": 0.1409, "step": 583, "topk_loss": 0.0113 }, { "epoch": 0.23211590890742317, "grad_norm": 0.158203125, "learning_rate": 0.00017772127950831733, "lm_loss": 2.0314, "loss": 2.1852, "mask_loss": 0.1416, "step": 584, "topk_loss": 0.0122 }, { "epoch": 0.2325133676555523, "grad_norm": 0.2314453125, "learning_rate": 0.0001776410213172628, "lm_loss": 2.1545, "loss": 2.3171, "mask_loss": 0.1438, "step": 585, "topk_loss": 0.0188 }, { "epoch": 0.23291082640368146, "grad_norm": 0.240234375, "learning_rate": 0.00017756063701401492, "lm_loss": 2.0725, "loss": 2.2282, "mask_loss": 0.1413, "step": 586, "topk_loss": 0.0144 }, { "epoch": 0.23330828515181062, "grad_norm": 0.162109375, "learning_rate": 0.00017748012672914176, "lm_loss": 2.0879, "loss": 2.2427, "mask_loss": 0.1416, "step": 587, "topk_loss": 0.0132 }, { "epoch": 0.23370574389993976, "grad_norm": 0.1787109375, "learning_rate": 0.00017739949059341617, "lm_loss": 2.0445, "loss": 2.2004, "mask_loss": 0.142, "step": 588, "topk_loss": 0.0139 }, { "epoch": 0.23410320264806891, "grad_norm": 0.2041015625, "learning_rate": 0.00017731872873781517, "lm_loss": 2.097, "loss": 2.2535, "mask_loss": 0.1421, "step": 589, "topk_loss": 0.0145 }, { "epoch": 0.23450066139619805, "grad_norm": 0.19921875, "learning_rate": 0.00017723784129352018, "lm_loss": 2.1099, "loss": 2.2612, "mask_loss": 0.1395, "step": 590, "topk_loss": 0.0119 }, { "epoch": 0.2348981201443272, "grad_norm": 0.1494140625, "learning_rate": 0.0001771568283919166, "lm_loss": 2.0986, "loss": 2.2533, "mask_loss": 0.1422, "step": 591, "topk_loss": 0.0124 }, { "epoch": 0.23529557889245636, "grad_norm": 0.255859375, "learning_rate": 0.00017707569016459348, "lm_loss": 2.1375, "loss": 2.2998, "mask_loss": 0.1441, "step": 592, "topk_loss": 0.0182 }, { "epoch": 0.2356930376405855, "grad_norm": 0.1474609375, "learning_rate": 0.00017699442674334358, "lm_loss": 2.1091, "loss": 2.2608, "mask_loss": 0.1403, "step": 593, "topk_loss": 0.0113 }, { "epoch": 0.23609049638871465, "grad_norm": 0.18359375, "learning_rate": 0.0001769130382601629, "lm_loss": 2.1533, "loss": 2.3117, "mask_loss": 0.1425, "step": 594, "topk_loss": 0.016 }, { "epoch": 0.2364879551368438, "grad_norm": 0.1513671875, "learning_rate": 0.00017683152484725066, "lm_loss": 2.0379, "loss": 2.1905, "mask_loss": 0.1403, "step": 595, "topk_loss": 0.0122 }, { "epoch": 0.23688541388497295, "grad_norm": 0.150390625, "learning_rate": 0.00017674988663700898, "lm_loss": 2.0963, "loss": 2.2508, "mask_loss": 0.1414, "step": 596, "topk_loss": 0.0131 }, { "epoch": 0.2372828726331021, "grad_norm": 0.162109375, "learning_rate": 0.00017666812376204266, "lm_loss": 2.0917, "loss": 2.2451, "mask_loss": 0.1403, "step": 597, "topk_loss": 0.013 }, { "epoch": 0.23768033138123126, "grad_norm": 0.1630859375, "learning_rate": 0.00017658623635515906, "lm_loss": 2.092, "loss": 2.2446, "mask_loss": 0.1392, "step": 598, "topk_loss": 0.0134 }, { "epoch": 0.2380777901293604, "grad_norm": 0.1484375, "learning_rate": 0.00017650422454936772, "lm_loss": 2.1547, "loss": 2.3048, "mask_loss": 0.139, "step": 599, "topk_loss": 0.0111 }, { "epoch": 0.23847524887748955, "grad_norm": 0.169921875, "learning_rate": 0.00017642208847788032, "lm_loss": 2.173, "loss": 2.3238, "mask_loss": 0.1388, "step": 600, "topk_loss": 0.012 }, { "epoch": 0.23847524887748955, "eval_lm_loss": 699.9920043945312, "eval_loss": 700.147216796875, "eval_mask_hit_rate": 0.49874863028526306, "eval_mask_loss": 0.1390666365623474, "eval_mask_top_10_hit_rate": 0.975847601890564, "eval_mask_top_1_hit_rate": 0.9946634769439697, "eval_mask_top_20_hit_rate": 0.9626063108444214, "eval_mask_top_5_hit_rate": 0.9838546514511108, "eval_runtime": 143.9171, "eval_samples_per_second": 14.23, "eval_steps_per_second": 7.115, "eval_token_accuracy": 0.5928311347961426, "eval_top_k_diff": -522.0046997070312, "eval_topk_loss": 0.0161855798214674, "step": 600 }, { "epoch": 0.2388727076256187, "grad_norm": 0.169921875, "learning_rate": 0.00017633982827411032, "lm_loss": 2.0913, "loss": 2.2444, "mask_loss": 0.14, "step": 601, "topk_loss": 0.0131 }, { "epoch": 0.23927016637374784, "grad_norm": 0.1494140625, "learning_rate": 0.00017625744407167288, "lm_loss": 2.0633, "loss": 2.2181, "mask_loss": 0.1417, "step": 602, "topk_loss": 0.0131 }, { "epoch": 0.239667625121877, "grad_norm": 0.1494140625, "learning_rate": 0.0001761749360043845, "lm_loss": 2.0286, "loss": 2.1887, "mask_loss": 0.1452, "step": 603, "topk_loss": 0.0149 }, { "epoch": 0.24006508387000614, "grad_norm": 0.1904296875, "learning_rate": 0.00017609230420626297, "lm_loss": 2.0473, "loss": 2.1985, "mask_loss": 0.1397, "step": 604, "topk_loss": 0.0116 }, { "epoch": 0.2404625426181353, "grad_norm": 0.146484375, "learning_rate": 0.00017600954881152693, "lm_loss": 2.0933, "loss": 2.2414, "mask_loss": 0.1379, "step": 605, "topk_loss": 0.0102 }, { "epoch": 0.24086000136626445, "grad_norm": 0.154296875, "learning_rate": 0.0001759266699545959, "lm_loss": 2.1098, "loss": 2.263, "mask_loss": 0.1397, "step": 606, "topk_loss": 0.0136 }, { "epoch": 0.24125746011439358, "grad_norm": 0.1669921875, "learning_rate": 0.00017584366777008984, "lm_loss": 2.0171, "loss": 2.1688, "mask_loss": 0.1391, "step": 607, "topk_loss": 0.0126 }, { "epoch": 0.24165491886252274, "grad_norm": 0.15234375, "learning_rate": 0.0001757605423928291, "lm_loss": 2.1159, "loss": 2.2668, "mask_loss": 0.138, "step": 608, "topk_loss": 0.0128 }, { "epoch": 0.2420523776106519, "grad_norm": 0.154296875, "learning_rate": 0.00017567729395783405, "lm_loss": 2.0309, "loss": 2.1834, "mask_loss": 0.1404, "step": 609, "topk_loss": 0.012 }, { "epoch": 0.24244983635878103, "grad_norm": 0.1474609375, "learning_rate": 0.00017559392260032506, "lm_loss": 2.072, "loss": 2.2261, "mask_loss": 0.1417, "step": 610, "topk_loss": 0.0125 }, { "epoch": 0.2428472951069102, "grad_norm": 0.158203125, "learning_rate": 0.00017551042845572208, "lm_loss": 2.0899, "loss": 2.2414, "mask_loss": 0.1399, "step": 611, "topk_loss": 0.0116 }, { "epoch": 0.24324475385503935, "grad_norm": 0.1435546875, "learning_rate": 0.0001754268116596445, "lm_loss": 2.0962, "loss": 2.2487, "mask_loss": 0.1398, "step": 612, "topk_loss": 0.0127 }, { "epoch": 0.24364221260316848, "grad_norm": 0.1640625, "learning_rate": 0.00017534307234791098, "lm_loss": 2.1056, "loss": 2.2622, "mask_loss": 0.1442, "step": 613, "topk_loss": 0.0124 }, { "epoch": 0.24403967135129764, "grad_norm": 0.1416015625, "learning_rate": 0.00017525921065653918, "lm_loss": 2.1073, "loss": 2.257, "mask_loss": 0.1376, "step": 614, "topk_loss": 0.0121 }, { "epoch": 0.2444371300994268, "grad_norm": 0.177734375, "learning_rate": 0.00017517522672174548, "lm_loss": 2.0809, "loss": 2.2406, "mask_loss": 0.1426, "step": 615, "topk_loss": 0.0172 }, { "epoch": 0.24483458884755593, "grad_norm": 0.259765625, "learning_rate": 0.00017509112067994487, "lm_loss": 2.1177, "loss": 2.2708, "mask_loss": 0.1405, "step": 616, "topk_loss": 0.0125 }, { "epoch": 0.2452320475956851, "grad_norm": 0.138671875, "learning_rate": 0.00017500689266775063, "lm_loss": 2.0753, "loss": 2.2273, "mask_loss": 0.1397, "step": 617, "topk_loss": 0.0123 }, { "epoch": 0.24562950634381422, "grad_norm": 0.140625, "learning_rate": 0.00017492254282197424, "lm_loss": 2.1113, "loss": 2.2631, "mask_loss": 0.1406, "step": 618, "topk_loss": 0.0113 }, { "epoch": 0.24602696509194338, "grad_norm": 0.1787109375, "learning_rate": 0.00017483807127962502, "lm_loss": 2.0832, "loss": 2.2345, "mask_loss": 0.1396, "step": 619, "topk_loss": 0.0117 }, { "epoch": 0.24642442384007254, "grad_norm": 0.15625, "learning_rate": 0.00017475347817790996, "lm_loss": 2.096, "loss": 2.2442, "mask_loss": 0.1382, "step": 620, "topk_loss": 0.0099 }, { "epoch": 0.24682188258820167, "grad_norm": 0.1494140625, "learning_rate": 0.0001746687636542335, "lm_loss": 2.0826, "loss": 2.2342, "mask_loss": 0.1389, "step": 621, "topk_loss": 0.0127 }, { "epoch": 0.24721934133633083, "grad_norm": 0.166015625, "learning_rate": 0.00017458392784619735, "lm_loss": 2.1393, "loss": 2.2944, "mask_loss": 0.1395, "step": 622, "topk_loss": 0.0156 }, { "epoch": 0.24761680008446, "grad_norm": 0.173828125, "learning_rate": 0.00017449897089160014, "lm_loss": 2.0229, "loss": 2.1723, "mask_loss": 0.1387, "step": 623, "topk_loss": 0.0106 }, { "epoch": 0.24801425883258912, "grad_norm": 0.1767578125, "learning_rate": 0.00017441389292843733, "lm_loss": 2.1188, "loss": 2.2722, "mask_loss": 0.1396, "step": 624, "topk_loss": 0.0137 }, { "epoch": 0.24841171758071828, "grad_norm": 0.1552734375, "learning_rate": 0.00017432869409490093, "lm_loss": 2.148, "loss": 2.3017, "mask_loss": 0.1405, "step": 625, "topk_loss": 0.0131 }, { "epoch": 0.24880917632884744, "grad_norm": 0.19921875, "learning_rate": 0.0001742433745293793, "lm_loss": 2.1267, "loss": 2.2854, "mask_loss": 0.1423, "step": 626, "topk_loss": 0.0163 }, { "epoch": 0.24920663507697657, "grad_norm": 0.2060546875, "learning_rate": 0.00017415793437045685, "lm_loss": 2.1103, "loss": 2.2611, "mask_loss": 0.1395, "step": 627, "topk_loss": 0.0113 }, { "epoch": 0.24960409382510573, "grad_norm": 0.158203125, "learning_rate": 0.00017407237375691392, "lm_loss": 2.0104, "loss": 2.1611, "mask_loss": 0.1391, "step": 628, "topk_loss": 0.0116 }, { "epoch": 0.25000155257323486, "grad_norm": 0.19140625, "learning_rate": 0.00017398669282772645, "lm_loss": 2.0692, "loss": 2.2181, "mask_loss": 0.1372, "step": 629, "topk_loss": 0.0117 }, { "epoch": 0.25039901132136405, "grad_norm": 0.193359375, "learning_rate": 0.00017390089172206592, "lm_loss": 2.0427, "loss": 2.1912, "mask_loss": 0.1357, "step": 630, "topk_loss": 0.0128 }, { "epoch": 0.2507964700694932, "grad_norm": 0.19140625, "learning_rate": 0.00017381497057929884, "lm_loss": 2.0333, "loss": 2.1853, "mask_loss": 0.1395, "step": 631, "topk_loss": 0.0125 }, { "epoch": 0.2511939288176223, "grad_norm": 0.1630859375, "learning_rate": 0.00017372892953898688, "lm_loss": 2.1063, "loss": 2.2558, "mask_loss": 0.1371, "step": 632, "topk_loss": 0.0124 }, { "epoch": 0.2515913875657515, "grad_norm": 0.1953125, "learning_rate": 0.00017364276874088633, "lm_loss": 2.0446, "loss": 2.1945, "mask_loss": 0.1377, "step": 633, "topk_loss": 0.0122 }, { "epoch": 0.25198884631388063, "grad_norm": 0.205078125, "learning_rate": 0.00017355648832494803, "lm_loss": 2.0793, "loss": 2.2304, "mask_loss": 0.1373, "step": 634, "topk_loss": 0.0138 }, { "epoch": 0.25238630506200976, "grad_norm": 0.162109375, "learning_rate": 0.00017347008843131712, "lm_loss": 2.0977, "loss": 2.2481, "mask_loss": 0.1377, "step": 635, "topk_loss": 0.0128 }, { "epoch": 0.25278376381013895, "grad_norm": 0.2236328125, "learning_rate": 0.0001733835692003329, "lm_loss": 2.053, "loss": 2.2053, "mask_loss": 0.1398, "step": 636, "topk_loss": 0.0125 }, { "epoch": 0.2531812225582681, "grad_norm": 0.16796875, "learning_rate": 0.0001732969307725283, "lm_loss": 1.9659, "loss": 2.1155, "mask_loss": 0.1381, "step": 637, "topk_loss": 0.0115 }, { "epoch": 0.2535786813063972, "grad_norm": 0.1484375, "learning_rate": 0.00017321017328863009, "lm_loss": 2.0408, "loss": 2.1949, "mask_loss": 0.1394, "step": 638, "topk_loss": 0.0147 }, { "epoch": 0.25397614005452634, "grad_norm": 0.1484375, "learning_rate": 0.00017312329688955828, "lm_loss": 2.0357, "loss": 2.188, "mask_loss": 0.1403, "step": 639, "topk_loss": 0.012 }, { "epoch": 0.25437359880265553, "grad_norm": 0.2578125, "learning_rate": 0.00017303630171642607, "lm_loss": 2.1687, "loss": 2.3187, "mask_loss": 0.1377, "step": 640, "topk_loss": 0.0123 }, { "epoch": 0.25477105755078466, "grad_norm": 0.2255859375, "learning_rate": 0.0001729491879105396, "lm_loss": 2.0333, "loss": 2.1854, "mask_loss": 0.138, "step": 641, "topk_loss": 0.0141 }, { "epoch": 0.2551685162989138, "grad_norm": 0.2255859375, "learning_rate": 0.0001728619556133977, "lm_loss": 2.1377, "loss": 2.2926, "mask_loss": 0.1386, "step": 642, "topk_loss": 0.0163 }, { "epoch": 0.255565975047043, "grad_norm": 0.1787109375, "learning_rate": 0.0001727746049666916, "lm_loss": 2.0641, "loss": 2.2196, "mask_loss": 0.1395, "step": 643, "topk_loss": 0.0161 }, { "epoch": 0.2559634337951721, "grad_norm": 0.21875, "learning_rate": 0.0001726871361123049, "lm_loss": 2.0548, "loss": 2.2032, "mask_loss": 0.1365, "step": 644, "topk_loss": 0.0119 }, { "epoch": 0.25636089254330124, "grad_norm": 0.169921875, "learning_rate": 0.0001725995491923131, "lm_loss": 2.1623, "loss": 2.3127, "mask_loss": 0.1378, "step": 645, "topk_loss": 0.0126 }, { "epoch": 0.25675835129143043, "grad_norm": 0.212890625, "learning_rate": 0.00017251184434898347, "lm_loss": 2.0753, "loss": 2.2262, "mask_loss": 0.1376, "step": 646, "topk_loss": 0.0133 }, { "epoch": 0.25715581003955956, "grad_norm": 0.232421875, "learning_rate": 0.0001724240217247749, "lm_loss": 2.1105, "loss": 2.2604, "mask_loss": 0.1378, "step": 647, "topk_loss": 0.0122 }, { "epoch": 0.2575532687876887, "grad_norm": 0.1865234375, "learning_rate": 0.00017233608146233754, "lm_loss": 2.0786, "loss": 2.2304, "mask_loss": 0.1396, "step": 648, "topk_loss": 0.0122 }, { "epoch": 0.2579507275358179, "grad_norm": 0.1455078125, "learning_rate": 0.00017224802370451262, "lm_loss": 1.9832, "loss": 2.1332, "mask_loss": 0.1381, "step": 649, "topk_loss": 0.0119 }, { "epoch": 0.258348186283947, "grad_norm": 0.2138671875, "learning_rate": 0.0001721598485943322, "lm_loss": 2.0282, "loss": 2.1852, "mask_loss": 0.1409, "step": 650, "topk_loss": 0.016 }, { "epoch": 0.258348186283947, "eval_lm_loss": 697.0224609375, "eval_loss": 697.17529296875, "eval_mask_hit_rate": 0.5033272504806519, "eval_mask_loss": 0.13698282837867737, "eval_mask_top_10_hit_rate": 0.9774259328842163, "eval_mask_top_1_hit_rate": 0.9951572418212891, "eval_mask_top_20_hit_rate": 0.9647897481918335, "eval_mask_top_5_hit_rate": 0.9849950075149536, "eval_runtime": 144.1684, "eval_samples_per_second": 14.206, "eval_steps_per_second": 7.103, "eval_token_accuracy": 0.5961358547210693, "eval_top_k_diff": -516.7088012695312, "eval_topk_loss": 0.015864230692386627, "step": 650 }, { "epoch": 0.25874564503207614, "grad_norm": 0.1875, "learning_rate": 0.00017207155627501898, "lm_loss": 2.0546, "loss": 2.2045, "mask_loss": 0.1381, "step": 651, "topk_loss": 0.0119 }, { "epoch": 0.25914310378020533, "grad_norm": 0.138671875, "learning_rate": 0.00017198314688998608, "lm_loss": 2.0694, "loss": 2.2223, "mask_loss": 0.1398, "step": 652, "topk_loss": 0.0131 }, { "epoch": 0.25954056252833446, "grad_norm": 0.169921875, "learning_rate": 0.00017189462058283668, "lm_loss": 2.0493, "loss": 2.203, "mask_loss": 0.1399, "step": 653, "topk_loss": 0.0137 }, { "epoch": 0.2599380212764636, "grad_norm": 0.1484375, "learning_rate": 0.00017180597749736395, "lm_loss": 2.074, "loss": 2.2212, "mask_loss": 0.1358, "step": 654, "topk_loss": 0.0114 }, { "epoch": 0.2603354800245928, "grad_norm": 0.16015625, "learning_rate": 0.00017171721777755074, "lm_loss": 2.0825, "loss": 2.2314, "mask_loss": 0.1369, "step": 655, "topk_loss": 0.0119 }, { "epoch": 0.2607329387727219, "grad_norm": 0.138671875, "learning_rate": 0.0001716283415675693, "lm_loss": 2.019, "loss": 2.1732, "mask_loss": 0.1421, "step": 656, "topk_loss": 0.012 }, { "epoch": 0.26113039752085104, "grad_norm": 0.13671875, "learning_rate": 0.00017153934901178113, "lm_loss": 2.0657, "loss": 2.2213, "mask_loss": 0.1419, "step": 657, "topk_loss": 0.0137 }, { "epoch": 0.26152785626898023, "grad_norm": 0.134765625, "learning_rate": 0.0001714502402547367, "lm_loss": 1.9909, "loss": 2.1457, "mask_loss": 0.1424, "step": 658, "topk_loss": 0.0124 }, { "epoch": 0.26192531501710936, "grad_norm": 0.1796875, "learning_rate": 0.00017136101544117525, "lm_loss": 2.0786, "loss": 2.2269, "mask_loss": 0.137, "step": 659, "topk_loss": 0.0113 }, { "epoch": 0.2623227737652385, "grad_norm": 0.146484375, "learning_rate": 0.00017127167471602447, "lm_loss": 2.0891, "loss": 2.2369, "mask_loss": 0.1367, "step": 660, "topk_loss": 0.011 }, { "epoch": 0.2627202325133677, "grad_norm": 0.158203125, "learning_rate": 0.0001711822182244004, "lm_loss": 2.0156, "loss": 2.1625, "mask_loss": 0.1363, "step": 661, "topk_loss": 0.0107 }, { "epoch": 0.2631176912614968, "grad_norm": 0.1923828125, "learning_rate": 0.00017109264611160708, "lm_loss": 2.0307, "loss": 2.18, "mask_loss": 0.1379, "step": 662, "topk_loss": 0.0114 }, { "epoch": 0.26351515000962594, "grad_norm": 0.16015625, "learning_rate": 0.00017100295852313634, "lm_loss": 2.0537, "loss": 2.2058, "mask_loss": 0.139, "step": 663, "topk_loss": 0.0131 }, { "epoch": 0.26391260875775513, "grad_norm": 0.1494140625, "learning_rate": 0.0001709131556046676, "lm_loss": 2.0956, "loss": 2.2443, "mask_loss": 0.1375, "step": 664, "topk_loss": 0.0113 }, { "epoch": 0.26431006750588426, "grad_norm": 0.1611328125, "learning_rate": 0.00017082323750206761, "lm_loss": 2.0686, "loss": 2.2177, "mask_loss": 0.1365, "step": 665, "topk_loss": 0.0126 }, { "epoch": 0.2647075262540134, "grad_norm": 0.1669921875, "learning_rate": 0.00017073320436139023, "lm_loss": 2.0032, "loss": 2.1575, "mask_loss": 0.1415, "step": 666, "topk_loss": 0.0128 }, { "epoch": 0.2651049850021425, "grad_norm": 0.1435546875, "learning_rate": 0.0001706430563288761, "lm_loss": 2.0073, "loss": 2.1573, "mask_loss": 0.1382, "step": 667, "topk_loss": 0.0118 }, { "epoch": 0.2655024437502717, "grad_norm": 0.169921875, "learning_rate": 0.0001705527935509526, "lm_loss": 2.0847, "loss": 2.237, "mask_loss": 0.1385, "step": 668, "topk_loss": 0.0139 }, { "epoch": 0.26589990249840084, "grad_norm": 0.2060546875, "learning_rate": 0.00017046241617423336, "lm_loss": 2.0493, "loss": 2.201, "mask_loss": 0.1388, "step": 669, "topk_loss": 0.0129 }, { "epoch": 0.26629736124653, "grad_norm": 0.1513671875, "learning_rate": 0.00017037192434551823, "lm_loss": 2.0498, "loss": 2.1971, "mask_loss": 0.1358, "step": 670, "topk_loss": 0.0115 }, { "epoch": 0.26669481999465916, "grad_norm": 0.1923828125, "learning_rate": 0.00017028131821179295, "lm_loss": 2.1096, "loss": 2.2599, "mask_loss": 0.1383, "step": 671, "topk_loss": 0.012 }, { "epoch": 0.2670922787427883, "grad_norm": 0.1845703125, "learning_rate": 0.000170190597920229, "lm_loss": 2.0352, "loss": 2.1855, "mask_loss": 0.1383, "step": 672, "topk_loss": 0.0121 }, { "epoch": 0.2674897374909174, "grad_norm": 0.166015625, "learning_rate": 0.0001700997636181831, "lm_loss": 2.0313, "loss": 2.1832, "mask_loss": 0.1399, "step": 673, "topk_loss": 0.012 }, { "epoch": 0.2678871962390466, "grad_norm": 0.18359375, "learning_rate": 0.00017000881545319735, "lm_loss": 2.0797, "loss": 2.2309, "mask_loss": 0.1378, "step": 674, "topk_loss": 0.0135 }, { "epoch": 0.26828465498717574, "grad_norm": 0.203125, "learning_rate": 0.00016991775357299866, "lm_loss": 2.0063, "loss": 2.1545, "mask_loss": 0.1373, "step": 675, "topk_loss": 0.0109 }, { "epoch": 0.2686821137353049, "grad_norm": 0.208984375, "learning_rate": 0.00016982657812549874, "lm_loss": 2.0277, "loss": 2.1847, "mask_loss": 0.1407, "step": 676, "topk_loss": 0.0163 }, { "epoch": 0.26907957248343406, "grad_norm": 0.1474609375, "learning_rate": 0.00016973528925879374, "lm_loss": 1.9607, "loss": 2.1099, "mask_loss": 0.137, "step": 677, "topk_loss": 0.0122 }, { "epoch": 0.2694770312315632, "grad_norm": 0.21875, "learning_rate": 0.000169643887121164, "lm_loss": 2.0658, "loss": 2.2251, "mask_loss": 0.1411, "step": 678, "topk_loss": 0.0182 }, { "epoch": 0.2698744899796923, "grad_norm": 0.255859375, "learning_rate": 0.00016955237186107387, "lm_loss": 2.0325, "loss": 2.1816, "mask_loss": 0.1364, "step": 679, "topk_loss": 0.0128 }, { "epoch": 0.2702719487278215, "grad_norm": 0.1875, "learning_rate": 0.00016946074362717147, "lm_loss": 2.0734, "loss": 2.2269, "mask_loss": 0.1391, "step": 680, "topk_loss": 0.0144 }, { "epoch": 0.27066940747595064, "grad_norm": 0.181640625, "learning_rate": 0.00016936900256828838, "lm_loss": 2.0314, "loss": 2.1812, "mask_loss": 0.1394, "step": 681, "topk_loss": 0.0104 }, { "epoch": 0.27106686622407977, "grad_norm": 0.18359375, "learning_rate": 0.00016927714883343948, "lm_loss": 2.0189, "loss": 2.1689, "mask_loss": 0.1388, "step": 682, "topk_loss": 0.0111 }, { "epoch": 0.27146432497220896, "grad_norm": 0.228515625, "learning_rate": 0.00016918518257182265, "lm_loss": 2.0225, "loss": 2.1695, "mask_loss": 0.136, "step": 683, "topk_loss": 0.011 }, { "epoch": 0.2718617837203381, "grad_norm": 0.154296875, "learning_rate": 0.00016909310393281856, "lm_loss": 2.0208, "loss": 2.1713, "mask_loss": 0.1388, "step": 684, "topk_loss": 0.0117 }, { "epoch": 0.2722592424684672, "grad_norm": 0.142578125, "learning_rate": 0.00016900091306599042, "lm_loss": 2.0877, "loss": 2.2388, "mask_loss": 0.1388, "step": 685, "topk_loss": 0.0122 }, { "epoch": 0.2726567012165964, "grad_norm": 0.1650390625, "learning_rate": 0.00016890861012108365, "lm_loss": 2.0105, "loss": 2.1592, "mask_loss": 0.1366, "step": 686, "topk_loss": 0.0122 }, { "epoch": 0.27305415996472554, "grad_norm": 0.18359375, "learning_rate": 0.00016881619524802583, "lm_loss": 2.0659, "loss": 2.2165, "mask_loss": 0.1384, "step": 687, "topk_loss": 0.0122 }, { "epoch": 0.27345161871285467, "grad_norm": 0.142578125, "learning_rate": 0.00016872366859692627, "lm_loss": 2.042, "loss": 2.188, "mask_loss": 0.1355, "step": 688, "topk_loss": 0.0105 }, { "epoch": 0.27384907746098386, "grad_norm": 0.1513671875, "learning_rate": 0.0001686310303180759, "lm_loss": 2.0872, "loss": 2.2323, "mask_loss": 0.1338, "step": 689, "topk_loss": 0.0112 }, { "epoch": 0.274246536209113, "grad_norm": 0.1669921875, "learning_rate": 0.00016853828056194697, "lm_loss": 2.0472, "loss": 2.1952, "mask_loss": 0.1368, "step": 690, "topk_loss": 0.0112 }, { "epoch": 0.2746439949572421, "grad_norm": 0.224609375, "learning_rate": 0.00016844541947919268, "lm_loss": 2.0788, "loss": 2.2304, "mask_loss": 0.1373, "step": 691, "topk_loss": 0.0143 }, { "epoch": 0.2750414537053713, "grad_norm": 0.15234375, "learning_rate": 0.00016835244722064716, "lm_loss": 1.9852, "loss": 2.1361, "mask_loss": 0.1386, "step": 692, "topk_loss": 0.0123 }, { "epoch": 0.27543891245350044, "grad_norm": 0.2333984375, "learning_rate": 0.0001682593639373252, "lm_loss": 2.0409, "loss": 2.1966, "mask_loss": 0.1405, "step": 693, "topk_loss": 0.0151 }, { "epoch": 0.27583637120162957, "grad_norm": 0.17578125, "learning_rate": 0.00016816616978042174, "lm_loss": 2.042, "loss": 2.1896, "mask_loss": 0.1356, "step": 694, "topk_loss": 0.012 }, { "epoch": 0.2762338299497587, "grad_norm": 0.2490234375, "learning_rate": 0.00016807286490131196, "lm_loss": 2.0005, "loss": 2.15, "mask_loss": 0.1373, "step": 695, "topk_loss": 0.0122 }, { "epoch": 0.2766312886978879, "grad_norm": 0.181640625, "learning_rate": 0.0001679794494515508, "lm_loss": 2.0451, "loss": 2.1984, "mask_loss": 0.1384, "step": 696, "topk_loss": 0.0149 }, { "epoch": 0.277028747446017, "grad_norm": 0.3671875, "learning_rate": 0.00016788592358287286, "lm_loss": 2.0757, "loss": 2.2443, "mask_loss": 0.1449, "step": 697, "topk_loss": 0.0238 }, { "epoch": 0.27742620619414615, "grad_norm": 0.19140625, "learning_rate": 0.00016779228744719205, "lm_loss": 2.0335, "loss": 2.1829, "mask_loss": 0.1376, "step": 698, "topk_loss": 0.0117 }, { "epoch": 0.27782366494227534, "grad_norm": 0.185546875, "learning_rate": 0.0001676985411966014, "lm_loss": 2.0207, "loss": 2.1695, "mask_loss": 0.1379, "step": 699, "topk_loss": 0.011 }, { "epoch": 0.27822112369040447, "grad_norm": 0.1904296875, "learning_rate": 0.00016760468498337283, "lm_loss": 1.9973, "loss": 2.1477, "mask_loss": 0.1369, "step": 700, "topk_loss": 0.0134 }, { "epoch": 0.27822112369040447, "eval_lm_loss": 695.3560180664062, "eval_loss": 695.5069580078125, "eval_mask_hit_rate": 0.5073809623718262, "eval_mask_loss": 0.13550084829330444, "eval_mask_top_10_hit_rate": 0.9787076711654663, "eval_mask_top_1_hit_rate": 0.995525598526001, "eval_mask_top_20_hit_rate": 0.9665408134460449, "eval_mask_top_5_hit_rate": 0.9859187602996826, "eval_runtime": 144.1276, "eval_samples_per_second": 14.21, "eval_steps_per_second": 7.105, "eval_token_accuracy": 0.5988088846206665, "eval_top_k_diff": -515.3989868164062, "eval_topk_loss": 0.015472196973860264, "step": 700 }, { "epoch": 0.2786185824385336, "grad_norm": 0.1923828125, "learning_rate": 0.00016751071895995684, "lm_loss": 1.9556, "loss": 2.1084, "mask_loss": 0.1402, "step": 701, "topk_loss": 0.0126 }, { "epoch": 0.2790160411866628, "grad_norm": 0.2119140625, "learning_rate": 0.00016741664327898223, "lm_loss": 2.0031, "loss": 2.1516, "mask_loss": 0.1363, "step": 702, "topk_loss": 0.0123 }, { "epoch": 0.2794134999347919, "grad_norm": 0.2109375, "learning_rate": 0.0001673224580932561, "lm_loss": 2.0067, "loss": 2.1557, "mask_loss": 0.137, "step": 703, "topk_loss": 0.012 }, { "epoch": 0.27981095868292105, "grad_norm": 0.197265625, "learning_rate": 0.00016722816355576323, "lm_loss": 2.0194, "loss": 2.1701, "mask_loss": 0.1381, "step": 704, "topk_loss": 0.0126 }, { "epoch": 0.28020841743105024, "grad_norm": 0.19921875, "learning_rate": 0.00016713375981966612, "lm_loss": 2.0297, "loss": 2.1771, "mask_loss": 0.1363, "step": 705, "topk_loss": 0.0111 }, { "epoch": 0.28060587617917937, "grad_norm": 0.177734375, "learning_rate": 0.0001670392470383046, "lm_loss": 2.0465, "loss": 2.1973, "mask_loss": 0.1373, "step": 706, "topk_loss": 0.0134 }, { "epoch": 0.2810033349273085, "grad_norm": 0.1689453125, "learning_rate": 0.00016694462536519561, "lm_loss": 2.0325, "loss": 2.1811, "mask_loss": 0.1352, "step": 707, "topk_loss": 0.0134 }, { "epoch": 0.2814007936754377, "grad_norm": 0.16015625, "learning_rate": 0.00016684989495403308, "lm_loss": 2.0532, "loss": 2.2063, "mask_loss": 0.138, "step": 708, "topk_loss": 0.0151 }, { "epoch": 0.2817982524235668, "grad_norm": 0.2236328125, "learning_rate": 0.0001667550559586874, "lm_loss": 2.0983, "loss": 2.247, "mask_loss": 0.1364, "step": 709, "topk_loss": 0.0122 }, { "epoch": 0.28219571117169595, "grad_norm": 0.1708984375, "learning_rate": 0.00016666010853320543, "lm_loss": 1.977, "loss": 2.127, "mask_loss": 0.1376, "step": 710, "topk_loss": 0.0124 }, { "epoch": 0.28259316991982514, "grad_norm": 0.1572265625, "learning_rate": 0.0001665650528318101, "lm_loss": 2.0323, "loss": 2.1819, "mask_loss": 0.1378, "step": 711, "topk_loss": 0.0118 }, { "epoch": 0.28299062866795427, "grad_norm": 0.173828125, "learning_rate": 0.0001664698890089003, "lm_loss": 1.9691, "loss": 2.12, "mask_loss": 0.137, "step": 712, "topk_loss": 0.0138 }, { "epoch": 0.2833880874160834, "grad_norm": 0.1669921875, "learning_rate": 0.00016637461721905045, "lm_loss": 2.065, "loss": 2.214, "mask_loss": 0.1377, "step": 713, "topk_loss": 0.0113 }, { "epoch": 0.2837855461642126, "grad_norm": 0.1748046875, "learning_rate": 0.00016627923761701038, "lm_loss": 2.0362, "loss": 2.1902, "mask_loss": 0.14, "step": 714, "topk_loss": 0.014 }, { "epoch": 0.2841830049123417, "grad_norm": 0.140625, "learning_rate": 0.00016618375035770498, "lm_loss": 2.0468, "loss": 2.1948, "mask_loss": 0.1373, "step": 715, "topk_loss": 0.0107 }, { "epoch": 0.28458046366047085, "grad_norm": 0.150390625, "learning_rate": 0.00016608815559623414, "lm_loss": 2.0417, "loss": 2.1922, "mask_loss": 0.138, "step": 716, "topk_loss": 0.0125 }, { "epoch": 0.28497792240860004, "grad_norm": 0.1533203125, "learning_rate": 0.0001659924534878723, "lm_loss": 2.0316, "loss": 2.1754, "mask_loss": 0.1327, "step": 717, "topk_loss": 0.011 }, { "epoch": 0.28537538115672917, "grad_norm": 0.1513671875, "learning_rate": 0.00016589664418806814, "lm_loss": 2.0287, "loss": 2.1845, "mask_loss": 0.14, "step": 718, "topk_loss": 0.0158 }, { "epoch": 0.2857728399048583, "grad_norm": 0.1806640625, "learning_rate": 0.00016580072785244462, "lm_loss": 2.0515, "loss": 2.2143, "mask_loss": 0.1435, "step": 719, "topk_loss": 0.0193 }, { "epoch": 0.2861702986529875, "grad_norm": 0.166015625, "learning_rate": 0.00016570470463679856, "lm_loss": 2.1156, "loss": 2.2641, "mask_loss": 0.1365, "step": 720, "topk_loss": 0.012 }, { "epoch": 0.2865677574011166, "grad_norm": 0.1416015625, "learning_rate": 0.00016560857469710022, "lm_loss": 2.0524, "loss": 2.1965, "mask_loss": 0.1328, "step": 721, "topk_loss": 0.0113 }, { "epoch": 0.28696521614924575, "grad_norm": 0.1474609375, "learning_rate": 0.00016551233818949337, "lm_loss": 2.0192, "loss": 2.1676, "mask_loss": 0.1359, "step": 722, "topk_loss": 0.0124 }, { "epoch": 0.28736267489737494, "grad_norm": 0.14453125, "learning_rate": 0.0001654159952702948, "lm_loss": 2.0376, "loss": 2.1842, "mask_loss": 0.1342, "step": 723, "topk_loss": 0.0124 }, { "epoch": 0.28776013364550407, "grad_norm": 0.1494140625, "learning_rate": 0.0001653195460959942, "lm_loss": 2.0463, "loss": 2.1955, "mask_loss": 0.1367, "step": 724, "topk_loss": 0.0126 }, { "epoch": 0.2881575923936332, "grad_norm": 0.1328125, "learning_rate": 0.00016522299082325382, "lm_loss": 2.0002, "loss": 2.1447, "mask_loss": 0.1336, "step": 725, "topk_loss": 0.0109 }, { "epoch": 0.28855505114176233, "grad_norm": 0.1591796875, "learning_rate": 0.00016512632960890823, "lm_loss": 2.063, "loss": 2.2123, "mask_loss": 0.1358, "step": 726, "topk_loss": 0.0135 }, { "epoch": 0.2889525098898915, "grad_norm": 0.16015625, "learning_rate": 0.00016502956260996404, "lm_loss": 2.0324, "loss": 2.181, "mask_loss": 0.1365, "step": 727, "topk_loss": 0.0121 }, { "epoch": 0.28934996863802065, "grad_norm": 0.158203125, "learning_rate": 0.00016493268998359986, "lm_loss": 1.9948, "loss": 2.1435, "mask_loss": 0.1368, "step": 728, "topk_loss": 0.0119 }, { "epoch": 0.2897474273861498, "grad_norm": 0.1962890625, "learning_rate": 0.00016483571188716562, "lm_loss": 2.0435, "loss": 2.1896, "mask_loss": 0.1348, "step": 729, "topk_loss": 0.0114 }, { "epoch": 0.29014488613427897, "grad_norm": 0.16015625, "learning_rate": 0.00016473862847818277, "lm_loss": 2.0547, "loss": 2.2039, "mask_loss": 0.1362, "step": 730, "topk_loss": 0.013 }, { "epoch": 0.2905423448824081, "grad_norm": 0.1591796875, "learning_rate": 0.00016464143991434373, "lm_loss": 1.989, "loss": 2.1381, "mask_loss": 0.1372, "step": 731, "topk_loss": 0.0118 }, { "epoch": 0.29093980363053723, "grad_norm": 0.13671875, "learning_rate": 0.00016454414635351175, "lm_loss": 2.0332, "loss": 2.1796, "mask_loss": 0.1357, "step": 732, "topk_loss": 0.0107 }, { "epoch": 0.2913372623786664, "grad_norm": 0.14453125, "learning_rate": 0.00016444674795372058, "lm_loss": 2.0188, "loss": 2.1681, "mask_loss": 0.137, "step": 733, "topk_loss": 0.0122 }, { "epoch": 0.29173472112679555, "grad_norm": 0.138671875, "learning_rate": 0.0001643492448731743, "lm_loss": 1.9968, "loss": 2.1447, "mask_loss": 0.136, "step": 734, "topk_loss": 0.0118 }, { "epoch": 0.2921321798749247, "grad_norm": 0.15234375, "learning_rate": 0.00016425163727024707, "lm_loss": 2.0086, "loss": 2.1536, "mask_loss": 0.1341, "step": 735, "topk_loss": 0.0109 }, { "epoch": 0.29252963862305387, "grad_norm": 0.146484375, "learning_rate": 0.0001641539253034827, "lm_loss": 2.0357, "loss": 2.1823, "mask_loss": 0.1349, "step": 736, "topk_loss": 0.0118 }, { "epoch": 0.292927097371183, "grad_norm": 0.271484375, "learning_rate": 0.00016405610913159465, "lm_loss": 1.9641, "loss": 2.1237, "mask_loss": 0.1402, "step": 737, "topk_loss": 0.0194 }, { "epoch": 0.29332455611931213, "grad_norm": 0.21875, "learning_rate": 0.0001639581889134655, "lm_loss": 2.0021, "loss": 2.1475, "mask_loss": 0.1339, "step": 738, "topk_loss": 0.0116 }, { "epoch": 0.2937220148674413, "grad_norm": 0.1904296875, "learning_rate": 0.000163860164808147, "lm_loss": 1.9545, "loss": 2.101, "mask_loss": 0.1357, "step": 739, "topk_loss": 0.0109 }, { "epoch": 0.29411947361557045, "grad_norm": 0.1396484375, "learning_rate": 0.0001637620369748595, "lm_loss": 2.0343, "loss": 2.1775, "mask_loss": 0.132, "step": 740, "topk_loss": 0.0111 }, { "epoch": 0.2945169323636996, "grad_norm": 0.1484375, "learning_rate": 0.0001636638055729919, "lm_loss": 1.9935, "loss": 2.1405, "mask_loss": 0.136, "step": 741, "topk_loss": 0.0109 }, { "epoch": 0.29491439111182877, "grad_norm": 0.1943359375, "learning_rate": 0.00016356547076210135, "lm_loss": 2.0936, "loss": 2.2431, "mask_loss": 0.1348, "step": 742, "topk_loss": 0.0148 }, { "epoch": 0.2953118498599579, "grad_norm": 0.1513671875, "learning_rate": 0.0001634670327019129, "lm_loss": 2.0426, "loss": 2.1898, "mask_loss": 0.1351, "step": 743, "topk_loss": 0.0121 }, { "epoch": 0.29570930860808703, "grad_norm": 0.1923828125, "learning_rate": 0.00016336849155231935, "lm_loss": 1.9431, "loss": 2.0942, "mask_loss": 0.1381, "step": 744, "topk_loss": 0.013 }, { "epoch": 0.2961067673562162, "grad_norm": 0.12890625, "learning_rate": 0.00016326984747338095, "lm_loss": 1.9955, "loss": 2.1404, "mask_loss": 0.1338, "step": 745, "topk_loss": 0.0112 }, { "epoch": 0.29650422610434535, "grad_norm": 0.1396484375, "learning_rate": 0.0001631711006253251, "lm_loss": 1.9625, "loss": 2.1121, "mask_loss": 0.1382, "step": 746, "topk_loss": 0.0114 }, { "epoch": 0.2969016848524745, "grad_norm": 0.1484375, "learning_rate": 0.00016307225116854622, "lm_loss": 2.021, "loss": 2.1701, "mask_loss": 0.1363, "step": 747, "topk_loss": 0.0129 }, { "epoch": 0.29729914360060367, "grad_norm": 0.1435546875, "learning_rate": 0.00016297329926360523, "lm_loss": 1.9784, "loss": 2.1291, "mask_loss": 0.138, "step": 748, "topk_loss": 0.0127 }, { "epoch": 0.2976966023487328, "grad_norm": 0.130859375, "learning_rate": 0.00016287424507122964, "lm_loss": 1.9802, "loss": 2.1269, "mask_loss": 0.1356, "step": 749, "topk_loss": 0.0111 }, { "epoch": 0.29809406109686193, "grad_norm": 0.1494140625, "learning_rate": 0.00016277508875231302, "lm_loss": 2.0315, "loss": 2.1765, "mask_loss": 0.1338, "step": 750, "topk_loss": 0.0112 }, { "epoch": 0.29809406109686193, "eval_lm_loss": 694.0836181640625, "eval_loss": 694.2323608398438, "eval_mask_hit_rate": 0.5113680362701416, "eval_mask_loss": 0.133888840675354, "eval_mask_top_10_hit_rate": 0.9798060059547424, "eval_mask_top_1_hit_rate": 0.9958555698394775, "eval_mask_top_20_hit_rate": 0.9680401086807251, "eval_mask_top_5_hit_rate": 0.986741304397583, "eval_runtime": 143.6233, "eval_samples_per_second": 14.26, "eval_steps_per_second": 7.13, "eval_token_accuracy": 0.6011416912078857, "eval_top_k_diff": -520.20947265625, "eval_topk_loss": 0.01484230998903513, "step": 750 }, { "epoch": 0.2984915198449911, "grad_norm": 0.15234375, "learning_rate": 0.0001626758304679148, "lm_loss": 2.0184, "loss": 2.1698, "mask_loss": 0.1377, "step": 751, "topk_loss": 0.0137 }, { "epoch": 0.29888897859312025, "grad_norm": 0.1435546875, "learning_rate": 0.00016257647037926006, "lm_loss": 1.9852, "loss": 2.1327, "mask_loss": 0.1355, "step": 752, "topk_loss": 0.012 }, { "epoch": 0.2992864373412494, "grad_norm": 0.142578125, "learning_rate": 0.00016247700864773927, "lm_loss": 2.0101, "loss": 2.158, "mask_loss": 0.135, "step": 753, "topk_loss": 0.0129 }, { "epoch": 0.2996838960893785, "grad_norm": 0.1435546875, "learning_rate": 0.00016237744543490796, "lm_loss": 1.9929, "loss": 2.1393, "mask_loss": 0.1342, "step": 754, "topk_loss": 0.0121 }, { "epoch": 0.3000813548375077, "grad_norm": 0.1474609375, "learning_rate": 0.00016227778090248648, "lm_loss": 2.0294, "loss": 2.1785, "mask_loss": 0.1378, "step": 755, "topk_loss": 0.0113 }, { "epoch": 0.30047881358563683, "grad_norm": 0.171875, "learning_rate": 0.0001621780152123598, "lm_loss": 2.038, "loss": 2.1829, "mask_loss": 0.1337, "step": 756, "topk_loss": 0.0112 }, { "epoch": 0.30087627233376596, "grad_norm": 0.1552734375, "learning_rate": 0.0001620781485265772, "lm_loss": 1.9621, "loss": 2.1093, "mask_loss": 0.1368, "step": 757, "topk_loss": 0.0105 }, { "epoch": 0.30127373108189515, "grad_norm": 0.1806640625, "learning_rate": 0.00016197818100735197, "lm_loss": 1.9951, "loss": 2.1406, "mask_loss": 0.1345, "step": 758, "topk_loss": 0.0111 }, { "epoch": 0.3016711898300243, "grad_norm": 0.173828125, "learning_rate": 0.00016187811281706115, "lm_loss": 2.0192, "loss": 2.1718, "mask_loss": 0.1378, "step": 759, "topk_loss": 0.0148 }, { "epoch": 0.3020686485781534, "grad_norm": 0.2265625, "learning_rate": 0.00016177794411824544, "lm_loss": 2.0547, "loss": 2.1978, "mask_loss": 0.1334, "step": 760, "topk_loss": 0.0096 }, { "epoch": 0.3024661073262826, "grad_norm": 0.1396484375, "learning_rate": 0.00016167767507360866, "lm_loss": 1.9905, "loss": 2.1363, "mask_loss": 0.1345, "step": 761, "topk_loss": 0.0113 }, { "epoch": 0.3028635660744117, "grad_norm": 0.1650390625, "learning_rate": 0.00016157730584601764, "lm_loss": 2.0099, "loss": 2.1553, "mask_loss": 0.1334, "step": 762, "topk_loss": 0.012 }, { "epoch": 0.30326102482254086, "grad_norm": 0.162109375, "learning_rate": 0.00016147683659850198, "lm_loss": 1.9952, "loss": 2.1452, "mask_loss": 0.1368, "step": 763, "topk_loss": 0.0132 }, { "epoch": 0.30365848357067005, "grad_norm": 0.1337890625, "learning_rate": 0.00016137626749425377, "lm_loss": 1.9443, "loss": 2.0912, "mask_loss": 0.1364, "step": 764, "topk_loss": 0.0106 }, { "epoch": 0.3040559423187992, "grad_norm": 0.15625, "learning_rate": 0.00016127559869662722, "lm_loss": 2.0581, "loss": 2.2016, "mask_loss": 0.1316, "step": 765, "topk_loss": 0.0119 }, { "epoch": 0.3044534010669283, "grad_norm": 0.1416015625, "learning_rate": 0.0001611748303691385, "lm_loss": 1.9773, "loss": 2.1237, "mask_loss": 0.136, "step": 766, "topk_loss": 0.0104 }, { "epoch": 0.3048508598150575, "grad_norm": 0.14453125, "learning_rate": 0.00016107396267546546, "lm_loss": 1.9836, "loss": 2.1356, "mask_loss": 0.1385, "step": 767, "topk_loss": 0.0134 }, { "epoch": 0.3052483185631866, "grad_norm": 0.19140625, "learning_rate": 0.00016097299577944735, "lm_loss": 2.0708, "loss": 2.2217, "mask_loss": 0.1358, "step": 768, "topk_loss": 0.015 }, { "epoch": 0.30564577731131576, "grad_norm": 0.150390625, "learning_rate": 0.00016087192984508451, "lm_loss": 1.9767, "loss": 2.124, "mask_loss": 0.1366, "step": 769, "topk_loss": 0.0107 }, { "epoch": 0.30604323605944495, "grad_norm": 0.1669921875, "learning_rate": 0.00016077076503653826, "lm_loss": 2.0156, "loss": 2.1636, "mask_loss": 0.1353, "step": 770, "topk_loss": 0.0128 }, { "epoch": 0.3064406948075741, "grad_norm": 0.138671875, "learning_rate": 0.00016066950151813033, "lm_loss": 2.012, "loss": 2.1566, "mask_loss": 0.1334, "step": 771, "topk_loss": 0.0112 }, { "epoch": 0.3068381535557032, "grad_norm": 0.1630859375, "learning_rate": 0.00016056813945434302, "lm_loss": 2.0295, "loss": 2.1749, "mask_loss": 0.1353, "step": 772, "topk_loss": 0.0101 }, { "epoch": 0.3072356123038324, "grad_norm": 0.138671875, "learning_rate": 0.0001604666790098185, "lm_loss": 2.0299, "loss": 2.1751, "mask_loss": 0.1341, "step": 773, "topk_loss": 0.0111 }, { "epoch": 0.3076330710519615, "grad_norm": 0.255859375, "learning_rate": 0.00016036512034935886, "lm_loss": 1.9921, "loss": 2.1371, "mask_loss": 0.1334, "step": 774, "topk_loss": 0.0116 }, { "epoch": 0.30803052980009066, "grad_norm": 0.140625, "learning_rate": 0.00016026346363792567, "lm_loss": 1.967, "loss": 2.1142, "mask_loss": 0.136, "step": 775, "topk_loss": 0.0112 }, { "epoch": 0.30842798854821984, "grad_norm": 0.1748046875, "learning_rate": 0.0001601617090406397, "lm_loss": 2.0243, "loss": 2.1723, "mask_loss": 0.1356, "step": 776, "topk_loss": 0.0124 }, { "epoch": 0.308825447296349, "grad_norm": 0.1240234375, "learning_rate": 0.00016005985672278093, "lm_loss": 1.9735, "loss": 2.1208, "mask_loss": 0.1353, "step": 777, "topk_loss": 0.012 }, { "epoch": 0.3092229060444781, "grad_norm": 0.158203125, "learning_rate": 0.0001599579068497878, "lm_loss": 2.0466, "loss": 2.1879, "mask_loss": 0.1309, "step": 778, "topk_loss": 0.0104 }, { "epoch": 0.3096203647926073, "grad_norm": 0.1884765625, "learning_rate": 0.00015985585958725736, "lm_loss": 2.0242, "loss": 2.1745, "mask_loss": 0.1364, "step": 779, "topk_loss": 0.0139 }, { "epoch": 0.3100178235407364, "grad_norm": 0.1455078125, "learning_rate": 0.00015975371510094485, "lm_loss": 1.9809, "loss": 2.1286, "mask_loss": 0.1363, "step": 780, "topk_loss": 0.0115 }, { "epoch": 0.31041528228886556, "grad_norm": 0.189453125, "learning_rate": 0.00015965147355676343, "lm_loss": 1.9497, "loss": 2.0973, "mask_loss": 0.1358, "step": 781, "topk_loss": 0.0118 }, { "epoch": 0.3108127410369947, "grad_norm": 0.1689453125, "learning_rate": 0.00015954913512078385, "lm_loss": 1.9924, "loss": 2.1432, "mask_loss": 0.1369, "step": 782, "topk_loss": 0.0139 }, { "epoch": 0.3112101997851239, "grad_norm": 0.1728515625, "learning_rate": 0.00015944669995923426, "lm_loss": 2.0464, "loss": 2.1883, "mask_loss": 0.131, "step": 783, "topk_loss": 0.0108 }, { "epoch": 0.311607658533253, "grad_norm": 0.166015625, "learning_rate": 0.00015934416823849997, "lm_loss": 1.9721, "loss": 2.1281, "mask_loss": 0.1398, "step": 784, "topk_loss": 0.0162 }, { "epoch": 0.31200511728138214, "grad_norm": 0.1630859375, "learning_rate": 0.00015924154012512308, "lm_loss": 2.0275, "loss": 2.1707, "mask_loss": 0.1315, "step": 785, "topk_loss": 0.0117 }, { "epoch": 0.3124025760295113, "grad_norm": 0.1298828125, "learning_rate": 0.00015913881578580227, "lm_loss": 1.9875, "loss": 2.1333, "mask_loss": 0.135, "step": 786, "topk_loss": 0.0108 }, { "epoch": 0.31280003477764046, "grad_norm": 0.140625, "learning_rate": 0.00015903599538739254, "lm_loss": 1.9729, "loss": 2.1233, "mask_loss": 0.1376, "step": 787, "topk_loss": 0.0128 }, { "epoch": 0.3131974935257696, "grad_norm": 0.1435546875, "learning_rate": 0.00015893307909690493, "lm_loss": 1.976, "loss": 2.1287, "mask_loss": 0.1389, "step": 788, "topk_loss": 0.0137 }, { "epoch": 0.3135949522738988, "grad_norm": 0.1357421875, "learning_rate": 0.00015883006708150623, "lm_loss": 1.9939, "loss": 2.1413, "mask_loss": 0.1358, "step": 789, "topk_loss": 0.0116 }, { "epoch": 0.3139924110220279, "grad_norm": 0.13671875, "learning_rate": 0.0001587269595085186, "lm_loss": 2.0191, "loss": 2.1641, "mask_loss": 0.1336, "step": 790, "topk_loss": 0.0115 }, { "epoch": 0.31438986977015704, "grad_norm": 0.14453125, "learning_rate": 0.00015862375654541964, "lm_loss": 2.005, "loss": 2.1523, "mask_loss": 0.1346, "step": 791, "topk_loss": 0.0127 }, { "epoch": 0.3147873285182862, "grad_norm": 0.142578125, "learning_rate": 0.0001585204583598417, "lm_loss": 1.9805, "loss": 2.1273, "mask_loss": 0.1354, "step": 792, "topk_loss": 0.0114 }, { "epoch": 0.31518478726641536, "grad_norm": 0.1416015625, "learning_rate": 0.00015841706511957184, "lm_loss": 1.9668, "loss": 2.1207, "mask_loss": 0.1385, "step": 793, "topk_loss": 0.0154 }, { "epoch": 0.3155822460145445, "grad_norm": 0.134765625, "learning_rate": 0.00015831357699255157, "lm_loss": 2.0282, "loss": 2.172, "mask_loss": 0.1334, "step": 794, "topk_loss": 0.0104 }, { "epoch": 0.3159797047626737, "grad_norm": 0.140625, "learning_rate": 0.00015820999414687656, "lm_loss": 1.9895, "loss": 2.1362, "mask_loss": 0.1339, "step": 795, "topk_loss": 0.0128 }, { "epoch": 0.3163771635108028, "grad_norm": 0.1416015625, "learning_rate": 0.00015810631675079617, "lm_loss": 1.9351, "loss": 2.0815, "mask_loss": 0.1344, "step": 796, "topk_loss": 0.012 }, { "epoch": 0.31677462225893194, "grad_norm": 0.16796875, "learning_rate": 0.00015800254497271352, "lm_loss": 1.9859, "loss": 2.1291, "mask_loss": 0.1328, "step": 797, "topk_loss": 0.0104 }, { "epoch": 0.3171720810070611, "grad_norm": 0.1533203125, "learning_rate": 0.0001578986789811849, "lm_loss": 1.9865, "loss": 2.1324, "mask_loss": 0.1349, "step": 798, "topk_loss": 0.011 }, { "epoch": 0.31756953975519026, "grad_norm": 0.150390625, "learning_rate": 0.00015779471894491966, "lm_loss": 2.0869, "loss": 2.2302, "mask_loss": 0.1321, "step": 799, "topk_loss": 0.0112 }, { "epoch": 0.3179669985033194, "grad_norm": 0.171875, "learning_rate": 0.00015769066503277997, "lm_loss": 2.0015, "loss": 2.1486, "mask_loss": 0.1341, "step": 800, "topk_loss": 0.0129 }, { "epoch": 0.3179669985033194, "eval_lm_loss": 693.9171752929688, "eval_loss": 694.0645141601562, "eval_mask_hit_rate": 0.5143714547157288, "eval_mask_loss": 0.13279682397842407, "eval_mask_top_10_hit_rate": 0.9807169437408447, "eval_mask_top_1_hit_rate": 0.9961609840393066, "eval_mask_top_20_hit_rate": 0.9693107008934021, "eval_mask_top_5_hit_rate": 0.9874145984649658, "eval_runtime": 144.2716, "eval_samples_per_second": 14.195, "eval_steps_per_second": 7.098, "eval_token_accuracy": 0.6030571460723877, "eval_top_k_diff": -527.3522338867188, "eval_topk_loss": 0.014561420306563377, "step": 800 }, { "epoch": 0.3183644572514486, "grad_norm": 0.142578125, "learning_rate": 0.0001575865174137805, "lm_loss": 1.9426, "loss": 2.0876, "mask_loss": 0.1347, "step": 801, "topk_loss": 0.0103 }, { "epoch": 0.3187619159995777, "grad_norm": 0.146484375, "learning_rate": 0.00015748227625708797, "lm_loss": 1.9777, "loss": 2.1199, "mask_loss": 0.1313, "step": 802, "topk_loss": 0.0109 }, { "epoch": 0.31915937474770684, "grad_norm": 0.1689453125, "learning_rate": 0.0001573779417320212, "lm_loss": 2.0242, "loss": 2.1683, "mask_loss": 0.1337, "step": 803, "topk_loss": 0.0104 }, { "epoch": 0.319556833495836, "grad_norm": 0.1337890625, "learning_rate": 0.00015727351400805052, "lm_loss": 2.007, "loss": 2.1524, "mask_loss": 0.1331, "step": 804, "topk_loss": 0.0123 }, { "epoch": 0.31995429224396515, "grad_norm": 0.1337890625, "learning_rate": 0.0001571689932547978, "lm_loss": 2.0379, "loss": 2.1856, "mask_loss": 0.1353, "step": 805, "topk_loss": 0.0125 }, { "epoch": 0.3203517509920943, "grad_norm": 0.181640625, "learning_rate": 0.00015706437964203596, "lm_loss": 2.0187, "loss": 2.1612, "mask_loss": 0.1315, "step": 806, "topk_loss": 0.0109 }, { "epoch": 0.3207492097402235, "grad_norm": 0.1455078125, "learning_rate": 0.0001569596733396886, "lm_loss": 2.0469, "loss": 2.1899, "mask_loss": 0.1325, "step": 807, "topk_loss": 0.0104 }, { "epoch": 0.3211466684883526, "grad_norm": 0.1435546875, "learning_rate": 0.00015685487451783017, "lm_loss": 1.9933, "loss": 2.1382, "mask_loss": 0.1333, "step": 808, "topk_loss": 0.0116 }, { "epoch": 0.32154412723648174, "grad_norm": 0.2119140625, "learning_rate": 0.0001567499833466852, "lm_loss": 2.0374, "loss": 2.1831, "mask_loss": 0.1335, "step": 809, "topk_loss": 0.0122 }, { "epoch": 0.32194158598461087, "grad_norm": 0.1484375, "learning_rate": 0.00015664499999662815, "lm_loss": 1.9767, "loss": 2.1208, "mask_loss": 0.1327, "step": 810, "topk_loss": 0.0114 }, { "epoch": 0.32233904473274005, "grad_norm": 0.193359375, "learning_rate": 0.0001565399246381834, "lm_loss": 1.9659, "loss": 2.1118, "mask_loss": 0.1334, "step": 811, "topk_loss": 0.0125 }, { "epoch": 0.3227365034808692, "grad_norm": 0.1875, "learning_rate": 0.0001564347574420247, "lm_loss": 1.8997, "loss": 2.0483, "mask_loss": 0.1366, "step": 812, "topk_loss": 0.0121 }, { "epoch": 0.3231339622289983, "grad_norm": 0.150390625, "learning_rate": 0.00015632949857897498, "lm_loss": 1.9574, "loss": 2.1024, "mask_loss": 0.1344, "step": 813, "topk_loss": 0.0107 }, { "epoch": 0.3235314209771275, "grad_norm": 0.158203125, "learning_rate": 0.000156224148220006, "lm_loss": 2.0218, "loss": 2.168, "mask_loss": 0.1347, "step": 814, "topk_loss": 0.0115 }, { "epoch": 0.32392887972525664, "grad_norm": 0.25390625, "learning_rate": 0.00015611870653623825, "lm_loss": 2.0542, "loss": 2.201, "mask_loss": 0.1344, "step": 815, "topk_loss": 0.0124 }, { "epoch": 0.32432633847338577, "grad_norm": 0.142578125, "learning_rate": 0.00015601317369894044, "lm_loss": 1.976, "loss": 2.1171, "mask_loss": 0.1323, "step": 816, "topk_loss": 0.0089 }, { "epoch": 0.32472379722151495, "grad_norm": 0.146484375, "learning_rate": 0.00015590754987952944, "lm_loss": 1.9774, "loss": 2.1179, "mask_loss": 0.1297, "step": 817, "topk_loss": 0.0108 }, { "epoch": 0.3251212559696441, "grad_norm": 0.169921875, "learning_rate": 0.00015580183524956982, "lm_loss": 1.9778, "loss": 2.1274, "mask_loss": 0.1365, "step": 818, "topk_loss": 0.0132 }, { "epoch": 0.3255187147177732, "grad_norm": 0.1435546875, "learning_rate": 0.0001556960299807737, "lm_loss": 1.9972, "loss": 2.1413, "mask_loss": 0.1333, "step": 819, "topk_loss": 0.0108 }, { "epoch": 0.3259161734659024, "grad_norm": 0.287109375, "learning_rate": 0.00015559013424500047, "lm_loss": 2.0236, "loss": 2.1729, "mask_loss": 0.1348, "step": 820, "topk_loss": 0.0145 }, { "epoch": 0.32631363221403153, "grad_norm": 0.15625, "learning_rate": 0.00015548414821425638, "lm_loss": 1.9912, "loss": 2.1343, "mask_loss": 0.1323, "step": 821, "topk_loss": 0.0109 }, { "epoch": 0.32671109096216067, "grad_norm": 0.2099609375, "learning_rate": 0.00015537807206069434, "lm_loss": 1.9572, "loss": 2.1075, "mask_loss": 0.1351, "step": 822, "topk_loss": 0.0152 }, { "epoch": 0.32710854971028985, "grad_norm": 0.28515625, "learning_rate": 0.00015527190595661375, "lm_loss": 2.0038, "loss": 2.1511, "mask_loss": 0.1346, "step": 823, "topk_loss": 0.0127 }, { "epoch": 0.327506008458419, "grad_norm": 0.1533203125, "learning_rate": 0.00015516565007446, "lm_loss": 1.9312, "loss": 2.0774, "mask_loss": 0.1342, "step": 824, "topk_loss": 0.012 }, { "epoch": 0.3279034672065481, "grad_norm": 0.25390625, "learning_rate": 0.0001550593045868244, "lm_loss": 2.0119, "loss": 2.1579, "mask_loss": 0.1327, "step": 825, "topk_loss": 0.0133 }, { "epoch": 0.3283009259546773, "grad_norm": 0.197265625, "learning_rate": 0.00015495286966644373, "lm_loss": 2.0199, "loss": 2.1609, "mask_loss": 0.131, "step": 826, "topk_loss": 0.01 }, { "epoch": 0.32869838470280643, "grad_norm": 0.1669921875, "learning_rate": 0.0001548463454862001, "lm_loss": 1.955, "loss": 2.1022, "mask_loss": 0.1357, "step": 827, "topk_loss": 0.0115 }, { "epoch": 0.32909584345093557, "grad_norm": 0.169921875, "learning_rate": 0.00015473973221912055, "lm_loss": 1.9188, "loss": 2.0655, "mask_loss": 0.1343, "step": 828, "topk_loss": 0.0125 }, { "epoch": 0.32949330219906475, "grad_norm": 0.244140625, "learning_rate": 0.0001546330300383769, "lm_loss": 1.9589, "loss": 2.1009, "mask_loss": 0.1308, "step": 829, "topk_loss": 0.0112 }, { "epoch": 0.3298907609471939, "grad_norm": 0.265625, "learning_rate": 0.00015452623911728523, "lm_loss": 1.9411, "loss": 2.0862, "mask_loss": 0.1334, "step": 830, "topk_loss": 0.0117 }, { "epoch": 0.330288219695323, "grad_norm": 0.146484375, "learning_rate": 0.00015441935962930598, "lm_loss": 2.0214, "loss": 2.1666, "mask_loss": 0.1343, "step": 831, "topk_loss": 0.0108 }, { "epoch": 0.3306856784434522, "grad_norm": 0.21484375, "learning_rate": 0.00015431239174804328, "lm_loss": 1.9706, "loss": 2.113, "mask_loss": 0.1324, "step": 832, "topk_loss": 0.0101 }, { "epoch": 0.33108313719158133, "grad_norm": 0.1826171875, "learning_rate": 0.00015420533564724495, "lm_loss": 1.995, "loss": 2.137, "mask_loss": 0.1317, "step": 833, "topk_loss": 0.0102 }, { "epoch": 0.33148059593971047, "grad_norm": 0.1416015625, "learning_rate": 0.000154098191500802, "lm_loss": 2.0297, "loss": 2.1746, "mask_loss": 0.1339, "step": 834, "topk_loss": 0.0111 }, { "epoch": 0.33187805468783965, "grad_norm": 0.158203125, "learning_rate": 0.00015399095948274852, "lm_loss": 1.9266, "loss": 2.0701, "mask_loss": 0.1332, "step": 835, "topk_loss": 0.0103 }, { "epoch": 0.3322755134359688, "grad_norm": 0.158203125, "learning_rate": 0.00015388363976726133, "lm_loss": 2.0069, "loss": 2.1489, "mask_loss": 0.1307, "step": 836, "topk_loss": 0.0113 }, { "epoch": 0.3326729721840979, "grad_norm": 0.1591796875, "learning_rate": 0.00015377623252865968, "lm_loss": 1.9808, "loss": 2.1257, "mask_loss": 0.1344, "step": 837, "topk_loss": 0.0105 }, { "epoch": 0.33307043093222705, "grad_norm": 0.16015625, "learning_rate": 0.00015366873794140498, "lm_loss": 1.9781, "loss": 2.1324, "mask_loss": 0.1383, "step": 838, "topk_loss": 0.016 }, { "epoch": 0.33346788968035623, "grad_norm": 0.146484375, "learning_rate": 0.0001535611561801005, "lm_loss": 1.9886, "loss": 2.1363, "mask_loss": 0.1348, "step": 839, "topk_loss": 0.0129 }, { "epoch": 0.33386534842848536, "grad_norm": 0.1337890625, "learning_rate": 0.00015345348741949117, "lm_loss": 1.9809, "loss": 2.1258, "mask_loss": 0.1341, "step": 840, "topk_loss": 0.0108 }, { "epoch": 0.3342628071766145, "grad_norm": 0.1337890625, "learning_rate": 0.0001533457318344632, "lm_loss": 1.939, "loss": 2.0832, "mask_loss": 0.1327, "step": 841, "topk_loss": 0.0116 }, { "epoch": 0.3346602659247437, "grad_norm": 0.146484375, "learning_rate": 0.00015323788960004377, "lm_loss": 1.999, "loss": 2.1412, "mask_loss": 0.1319, "step": 842, "topk_loss": 0.0103 }, { "epoch": 0.3350577246728728, "grad_norm": 0.1494140625, "learning_rate": 0.00015312996089140088, "lm_loss": 1.943, "loss": 2.0855, "mask_loss": 0.1318, "step": 843, "topk_loss": 0.0107 }, { "epoch": 0.33545518342100195, "grad_norm": 0.138671875, "learning_rate": 0.00015302194588384302, "lm_loss": 1.9981, "loss": 2.1457, "mask_loss": 0.1353, "step": 844, "topk_loss": 0.0124 }, { "epoch": 0.33585264216913113, "grad_norm": 0.1328125, "learning_rate": 0.00015291384475281877, "lm_loss": 2.0021, "loss": 2.1457, "mask_loss": 0.1328, "step": 845, "topk_loss": 0.0107 }, { "epoch": 0.33625010091726026, "grad_norm": 0.1494140625, "learning_rate": 0.00015280565767391657, "lm_loss": 1.9705, "loss": 2.1158, "mask_loss": 0.1341, "step": 846, "topk_loss": 0.0112 }, { "epoch": 0.3366475596653894, "grad_norm": 0.13671875, "learning_rate": 0.0001526973848228646, "lm_loss": 1.9768, "loss": 2.1185, "mask_loss": 0.1309, "step": 847, "topk_loss": 0.0108 }, { "epoch": 0.3370450184135186, "grad_norm": 0.1416015625, "learning_rate": 0.0001525890263755303, "lm_loss": 1.9851, "loss": 2.1264, "mask_loss": 0.1305, "step": 848, "topk_loss": 0.0108 }, { "epoch": 0.3374424771616477, "grad_norm": 0.14453125, "learning_rate": 0.00015248058250792008, "lm_loss": 1.9555, "loss": 2.1018, "mask_loss": 0.1332, "step": 849, "topk_loss": 0.0131 }, { "epoch": 0.33783993590977685, "grad_norm": 0.126953125, "learning_rate": 0.00015237205339617917, "lm_loss": 1.9254, "loss": 2.0689, "mask_loss": 0.1324, "step": 850, "topk_loss": 0.0111 }, { "epoch": 0.33783993590977685, "eval_lm_loss": 693.2371826171875, "eval_loss": 693.3829345703125, "eval_mask_hit_rate": 0.5174924731254578, "eval_mask_loss": 0.13158242404460907, "eval_mask_top_10_hit_rate": 0.9814284443855286, "eval_mask_top_1_hit_rate": 0.996345043182373, "eval_mask_top_20_hit_rate": 0.9703196883201599, "eval_mask_top_5_hit_rate": 0.9878877997398376, "eval_runtime": 144.2145, "eval_samples_per_second": 14.201, "eval_steps_per_second": 7.101, "eval_token_accuracy": 0.6048117876052856, "eval_top_k_diff": -530.045654296875, "eval_topk_loss": 0.014150199480354786, "step": 850 }, { "epoch": 0.33823739465790603, "grad_norm": 0.1767578125, "learning_rate": 0.00015226343921659124, "lm_loss": 1.9664, "loss": 2.1133, "mask_loss": 0.1343, "step": 851, "topk_loss": 0.0126 }, { "epoch": 0.33863485340603516, "grad_norm": 0.1630859375, "learning_rate": 0.00015215474014557815, "lm_loss": 2.0229, "loss": 2.1646, "mask_loss": 0.1322, "step": 852, "topk_loss": 0.0095 }, { "epoch": 0.3390323121541643, "grad_norm": 0.166015625, "learning_rate": 0.00015204595635969964, "lm_loss": 2.012, "loss": 2.1594, "mask_loss": 0.1336, "step": 853, "topk_loss": 0.0137 }, { "epoch": 0.3394297709022935, "grad_norm": 0.134765625, "learning_rate": 0.00015193708803565303, "lm_loss": 1.9471, "loss": 2.0903, "mask_loss": 0.133, "step": 854, "topk_loss": 0.0102 }, { "epoch": 0.3398272296504226, "grad_norm": 0.185546875, "learning_rate": 0.000151828135350273, "lm_loss": 2.0118, "loss": 2.1573, "mask_loss": 0.1326, "step": 855, "topk_loss": 0.0129 }, { "epoch": 0.34022468839855174, "grad_norm": 0.14453125, "learning_rate": 0.00015171909848053119, "lm_loss": 1.9994, "loss": 2.1399, "mask_loss": 0.1312, "step": 856, "topk_loss": 0.0092 }, { "epoch": 0.34062214714668093, "grad_norm": 0.130859375, "learning_rate": 0.00015160997760353605, "lm_loss": 2.0021, "loss": 2.1448, "mask_loss": 0.1322, "step": 857, "topk_loss": 0.0104 }, { "epoch": 0.34101960589481006, "grad_norm": 0.1474609375, "learning_rate": 0.00015150077289653244, "lm_loss": 1.9989, "loss": 2.1424, "mask_loss": 0.132, "step": 858, "topk_loss": 0.0115 }, { "epoch": 0.3414170646429392, "grad_norm": 0.13671875, "learning_rate": 0.00015139148453690145, "lm_loss": 1.9976, "loss": 2.1409, "mask_loss": 0.1329, "step": 859, "topk_loss": 0.0104 }, { "epoch": 0.3418145233910684, "grad_norm": 0.134765625, "learning_rate": 0.00015128211270215992, "lm_loss": 1.9472, "loss": 2.0914, "mask_loss": 0.1332, "step": 860, "topk_loss": 0.011 }, { "epoch": 0.3422119821391975, "grad_norm": 0.1416015625, "learning_rate": 0.0001511726575699604, "lm_loss": 1.9626, "loss": 2.1048, "mask_loss": 0.1309, "step": 861, "topk_loss": 0.0112 }, { "epoch": 0.34260944088732664, "grad_norm": 0.2001953125, "learning_rate": 0.0001510631193180907, "lm_loss": 2.0179, "loss": 2.1569, "mask_loss": 0.1293, "step": 862, "topk_loss": 0.0097 }, { "epoch": 0.34300689963545583, "grad_norm": 0.1357421875, "learning_rate": 0.0001509534981244736, "lm_loss": 1.9051, "loss": 2.0497, "mask_loss": 0.1341, "step": 863, "topk_loss": 0.0106 }, { "epoch": 0.34340435838358496, "grad_norm": 0.1376953125, "learning_rate": 0.0001508437941671667, "lm_loss": 1.9047, "loss": 2.0507, "mask_loss": 0.1345, "step": 864, "topk_loss": 0.0115 }, { "epoch": 0.3438018171317141, "grad_norm": 0.1650390625, "learning_rate": 0.00015073400762436197, "lm_loss": 1.9328, "loss": 2.0763, "mask_loss": 0.1319, "step": 865, "topk_loss": 0.0116 }, { "epoch": 0.3441992758798432, "grad_norm": 0.1376953125, "learning_rate": 0.0001506241386743854, "lm_loss": 2.0323, "loss": 2.1749, "mask_loss": 0.1313, "step": 866, "topk_loss": 0.0112 }, { "epoch": 0.3445967346279724, "grad_norm": 0.1337890625, "learning_rate": 0.0001505141874956971, "lm_loss": 1.9805, "loss": 2.1269, "mask_loss": 0.1344, "step": 867, "topk_loss": 0.0119 }, { "epoch": 0.34499419337610154, "grad_norm": 0.15234375, "learning_rate": 0.00015040415426689055, "lm_loss": 2.0136, "loss": 2.1543, "mask_loss": 0.1288, "step": 868, "topk_loss": 0.0119 }, { "epoch": 0.3453916521242307, "grad_norm": 0.138671875, "learning_rate": 0.00015029403916669258, "lm_loss": 1.9408, "loss": 2.0839, "mask_loss": 0.1327, "step": 869, "topk_loss": 0.0104 }, { "epoch": 0.34578911087235986, "grad_norm": 0.1474609375, "learning_rate": 0.00015018384237396292, "lm_loss": 1.98, "loss": 2.1231, "mask_loss": 0.1319, "step": 870, "topk_loss": 0.0111 }, { "epoch": 0.346186569620489, "grad_norm": 0.15234375, "learning_rate": 0.0001500735640676941, "lm_loss": 1.9363, "loss": 2.0827, "mask_loss": 0.1337, "step": 871, "topk_loss": 0.0127 }, { "epoch": 0.3465840283686181, "grad_norm": 0.138671875, "learning_rate": 0.00014996320442701102, "lm_loss": 2.0109, "loss": 2.1533, "mask_loss": 0.1324, "step": 872, "topk_loss": 0.01 }, { "epoch": 0.3469814871167473, "grad_norm": 0.134765625, "learning_rate": 0.00014985276363117065, "lm_loss": 1.9425, "loss": 2.0854, "mask_loss": 0.132, "step": 873, "topk_loss": 0.0109 }, { "epoch": 0.34737894586487644, "grad_norm": 0.1494140625, "learning_rate": 0.00014974224185956186, "lm_loss": 1.9894, "loss": 2.1354, "mask_loss": 0.1339, "step": 874, "topk_loss": 0.012 }, { "epoch": 0.3477764046130056, "grad_norm": 0.140625, "learning_rate": 0.0001496316392917049, "lm_loss": 1.9791, "loss": 2.1316, "mask_loss": 0.1382, "step": 875, "topk_loss": 0.0142 }, { "epoch": 0.34817386336113476, "grad_norm": 0.134765625, "learning_rate": 0.00014952095610725139, "lm_loss": 1.9584, "loss": 2.1019, "mask_loss": 0.1331, "step": 876, "topk_loss": 0.0104 }, { "epoch": 0.3485713221092639, "grad_norm": 0.1416015625, "learning_rate": 0.0001494101924859839, "lm_loss": 1.9915, "loss": 2.1382, "mask_loss": 0.133, "step": 877, "topk_loss": 0.0136 }, { "epoch": 0.348968780857393, "grad_norm": 0.1328125, "learning_rate": 0.0001492993486078156, "lm_loss": 1.9783, "loss": 2.121, "mask_loss": 0.1323, "step": 878, "topk_loss": 0.0103 }, { "epoch": 0.3493662396055222, "grad_norm": 0.126953125, "learning_rate": 0.00014918842465279, "lm_loss": 1.9671, "loss": 2.1064, "mask_loss": 0.1298, "step": 879, "topk_loss": 0.0096 }, { "epoch": 0.34976369835365134, "grad_norm": 0.1357421875, "learning_rate": 0.00014907742080108073, "lm_loss": 1.9153, "loss": 2.0578, "mask_loss": 0.1318, "step": 880, "topk_loss": 0.0107 }, { "epoch": 0.3501611571017805, "grad_norm": 0.1357421875, "learning_rate": 0.0001489663372329912, "lm_loss": 1.9766, "loss": 2.1174, "mask_loss": 0.1315, "step": 881, "topk_loss": 0.0093 }, { "epoch": 0.35055861584990966, "grad_norm": 0.1484375, "learning_rate": 0.00014885517412895424, "lm_loss": 1.9713, "loss": 2.1244, "mask_loss": 0.1373, "step": 882, "topk_loss": 0.0158 }, { "epoch": 0.3509560745980388, "grad_norm": 0.1416015625, "learning_rate": 0.00014874393166953192, "lm_loss": 1.9796, "loss": 2.1193, "mask_loss": 0.1301, "step": 883, "topk_loss": 0.0097 }, { "epoch": 0.3513535333461679, "grad_norm": 0.205078125, "learning_rate": 0.00014863261003541525, "lm_loss": 1.954, "loss": 2.0957, "mask_loss": 0.1314, "step": 884, "topk_loss": 0.0104 }, { "epoch": 0.3517509920942971, "grad_norm": 0.140625, "learning_rate": 0.0001485212094074237, "lm_loss": 1.9849, "loss": 2.1253, "mask_loss": 0.1303, "step": 885, "topk_loss": 0.0101 }, { "epoch": 0.35214845084242624, "grad_norm": 0.21484375, "learning_rate": 0.00014840972996650525, "lm_loss": 1.9263, "loss": 2.0688, "mask_loss": 0.1315, "step": 886, "topk_loss": 0.011 }, { "epoch": 0.3525459095905554, "grad_norm": 0.1845703125, "learning_rate": 0.0001482981718937357, "lm_loss": 1.8775, "loss": 2.0324, "mask_loss": 0.1371, "step": 887, "topk_loss": 0.0178 }, { "epoch": 0.35294336833868456, "grad_norm": 0.1259765625, "learning_rate": 0.00014818653537031868, "lm_loss": 1.9684, "loss": 2.1109, "mask_loss": 0.1323, "step": 888, "topk_loss": 0.0102 }, { "epoch": 0.3533408270868137, "grad_norm": 0.1689453125, "learning_rate": 0.00014807482057758528, "lm_loss": 1.9892, "loss": 2.1299, "mask_loss": 0.1304, "step": 889, "topk_loss": 0.0104 }, { "epoch": 0.3537382858349428, "grad_norm": 0.13671875, "learning_rate": 0.0001479630276969936, "lm_loss": 1.9411, "loss": 2.0891, "mask_loss": 0.1357, "step": 890, "topk_loss": 0.0123 }, { "epoch": 0.354135744583072, "grad_norm": 0.1435546875, "learning_rate": 0.00014785115691012864, "lm_loss": 2.0491, "loss": 2.1903, "mask_loss": 0.1302, "step": 891, "topk_loss": 0.0111 }, { "epoch": 0.35453320333120114, "grad_norm": 0.12890625, "learning_rate": 0.000147739208398702, "lm_loss": 1.9524, "loss": 2.0971, "mask_loss": 0.1335, "step": 892, "topk_loss": 0.0113 }, { "epoch": 0.3549306620793303, "grad_norm": 0.15234375, "learning_rate": 0.00014762718234455151, "lm_loss": 1.9716, "loss": 2.1133, "mask_loss": 0.1307, "step": 893, "topk_loss": 0.011 }, { "epoch": 0.3553281208274594, "grad_norm": 0.1328125, "learning_rate": 0.00014751507892964082, "lm_loss": 1.9312, "loss": 2.0766, "mask_loss": 0.1334, "step": 894, "topk_loss": 0.012 }, { "epoch": 0.3557255795755886, "grad_norm": 0.1494140625, "learning_rate": 0.00014740289833605939, "lm_loss": 1.9993, "loss": 2.1397, "mask_loss": 0.1293, "step": 895, "topk_loss": 0.0111 }, { "epoch": 0.3561230383237177, "grad_norm": 0.1416015625, "learning_rate": 0.00014729064074602198, "lm_loss": 1.9107, "loss": 2.0586, "mask_loss": 0.1354, "step": 896, "topk_loss": 0.0125 }, { "epoch": 0.35652049707184685, "grad_norm": 0.1337890625, "learning_rate": 0.00014717830634186844, "lm_loss": 1.9405, "loss": 2.0837, "mask_loss": 0.131, "step": 897, "topk_loss": 0.0122 }, { "epoch": 0.35691795581997604, "grad_norm": 0.1416015625, "learning_rate": 0.00014706589530606335, "lm_loss": 2.0209, "loss": 2.163, "mask_loss": 0.1316, "step": 898, "topk_loss": 0.0106 }, { "epoch": 0.35731541456810517, "grad_norm": 0.162109375, "learning_rate": 0.0001469534078211958, "lm_loss": 2.0017, "loss": 2.1479, "mask_loss": 0.1329, "step": 899, "topk_loss": 0.0133 }, { "epoch": 0.3577128733162343, "grad_norm": 0.1787109375, "learning_rate": 0.00014684084406997903, "lm_loss": 2.0326, "loss": 2.1736, "mask_loss": 0.1308, "step": 900, "topk_loss": 0.0101 }, { "epoch": 0.3577128733162343, "eval_lm_loss": 692.6829833984375, "eval_loss": 692.8275146484375, "eval_mask_hit_rate": 0.5198476314544678, "eval_mask_loss": 0.13041795790195465, "eval_mask_top_10_hit_rate": 0.982144832611084, "eval_mask_top_1_hit_rate": 0.9965412616729736, "eval_mask_top_20_hit_rate": 0.9712841510772705, "eval_mask_top_5_hit_rate": 0.9884326457977295, "eval_runtime": 143.7303, "eval_samples_per_second": 14.249, "eval_steps_per_second": 7.124, "eval_token_accuracy": 0.606438159942627, "eval_top_k_diff": -526.1043701171875, "eval_topk_loss": 0.014123985543847084, "step": 900 }, { "epoch": 0.3581103320643635, "grad_norm": 0.1396484375, "learning_rate": 0.0001467282042352502, "lm_loss": 2.0374, "loss": 2.1807, "mask_loss": 0.1308, "step": 901, "topk_loss": 0.0124 }, { "epoch": 0.3585077908124926, "grad_norm": 0.1396484375, "learning_rate": 0.00014661548849996997, "lm_loss": 1.9942, "loss": 2.136, "mask_loss": 0.1317, "step": 902, "topk_loss": 0.0101 }, { "epoch": 0.35890524956062175, "grad_norm": 0.1337890625, "learning_rate": 0.00014650269704722237, "lm_loss": 1.8926, "loss": 2.0358, "mask_loss": 0.1325, "step": 903, "topk_loss": 0.0107 }, { "epoch": 0.35930270830875094, "grad_norm": 0.1513671875, "learning_rate": 0.00014638983006021438, "lm_loss": 2.0248, "loss": 2.1703, "mask_loss": 0.1331, "step": 904, "topk_loss": 0.0125 }, { "epoch": 0.35970016705688007, "grad_norm": 0.1455078125, "learning_rate": 0.0001462768877222757, "lm_loss": 1.9501, "loss": 2.0948, "mask_loss": 0.1327, "step": 905, "topk_loss": 0.012 }, { "epoch": 0.3600976258050092, "grad_norm": 0.171875, "learning_rate": 0.00014616387021685836, "lm_loss": 1.9497, "loss": 2.0882, "mask_loss": 0.1288, "step": 906, "topk_loss": 0.0098 }, { "epoch": 0.3604950845531384, "grad_norm": 0.12353515625, "learning_rate": 0.00014605077772753656, "lm_loss": 1.8988, "loss": 2.0387, "mask_loss": 0.1309, "step": 907, "topk_loss": 0.0089 }, { "epoch": 0.3608925433012675, "grad_norm": 0.1474609375, "learning_rate": 0.00014593761043800622, "lm_loss": 1.9902, "loss": 2.1325, "mask_loss": 0.1315, "step": 908, "topk_loss": 0.0108 }, { "epoch": 0.36129000204939665, "grad_norm": 0.212890625, "learning_rate": 0.00014582436853208483, "lm_loss": 1.9879, "loss": 2.1295, "mask_loss": 0.1306, "step": 909, "topk_loss": 0.011 }, { "epoch": 0.36168746079752584, "grad_norm": 0.14453125, "learning_rate": 0.00014571105219371102, "lm_loss": 1.9871, "loss": 2.1328, "mask_loss": 0.1344, "step": 910, "topk_loss": 0.0113 }, { "epoch": 0.36208491954565497, "grad_norm": 0.2041015625, "learning_rate": 0.00014559766160694436, "lm_loss": 1.9585, "loss": 2.101, "mask_loss": 0.1306, "step": 911, "topk_loss": 0.0119 }, { "epoch": 0.3624823782937841, "grad_norm": 0.1474609375, "learning_rate": 0.00014548419695596505, "lm_loss": 1.9858, "loss": 2.127, "mask_loss": 0.1307, "step": 912, "topk_loss": 0.0104 }, { "epoch": 0.3628798370419133, "grad_norm": 0.177734375, "learning_rate": 0.0001453706584250735, "lm_loss": 1.9632, "loss": 2.106, "mask_loss": 0.1315, "step": 913, "topk_loss": 0.0112 }, { "epoch": 0.3632772957900424, "grad_norm": 0.1279296875, "learning_rate": 0.00014525704619869015, "lm_loss": 1.9146, "loss": 2.0587, "mask_loss": 0.1334, "step": 914, "topk_loss": 0.0107 }, { "epoch": 0.36367475453817155, "grad_norm": 0.203125, "learning_rate": 0.00014514336046135518, "lm_loss": 1.9857, "loss": 2.1289, "mask_loss": 0.1318, "step": 915, "topk_loss": 0.0113 }, { "epoch": 0.36407221328630074, "grad_norm": 0.2021484375, "learning_rate": 0.00014502960139772824, "lm_loss": 2.0455, "loss": 2.1842, "mask_loss": 0.1287, "step": 916, "topk_loss": 0.0101 }, { "epoch": 0.36446967203442987, "grad_norm": 0.154296875, "learning_rate": 0.00014491576919258792, "lm_loss": 1.9393, "loss": 2.082, "mask_loss": 0.1333, "step": 917, "topk_loss": 0.0094 }, { "epoch": 0.364867130782559, "grad_norm": 0.1640625, "learning_rate": 0.00014480186403083173, "lm_loss": 2.0212, "loss": 2.1674, "mask_loss": 0.1345, "step": 918, "topk_loss": 0.0117 }, { "epoch": 0.3652645895306882, "grad_norm": 0.2431640625, "learning_rate": 0.00014468788609747565, "lm_loss": 1.9828, "loss": 2.1276, "mask_loss": 0.1322, "step": 919, "topk_loss": 0.0126 }, { "epoch": 0.3656620482788173, "grad_norm": 0.265625, "learning_rate": 0.00014457383557765386, "lm_loss": 1.9425, "loss": 2.0874, "mask_loss": 0.1335, "step": 920, "topk_loss": 0.0114 }, { "epoch": 0.36605950702694645, "grad_norm": 0.1796875, "learning_rate": 0.00014445971265661842, "lm_loss": 1.9202, "loss": 2.064, "mask_loss": 0.1333, "step": 921, "topk_loss": 0.0105 }, { "epoch": 0.3664569657750756, "grad_norm": 0.3046875, "learning_rate": 0.00014434551751973907, "lm_loss": 1.967, "loss": 2.1158, "mask_loss": 0.1341, "step": 922, "topk_loss": 0.0148 }, { "epoch": 0.36685442452320477, "grad_norm": 0.26171875, "learning_rate": 0.00014423125035250276, "lm_loss": 2.0018, "loss": 2.1425, "mask_loss": 0.1295, "step": 923, "topk_loss": 0.0112 }, { "epoch": 0.3672518832713339, "grad_norm": 0.224609375, "learning_rate": 0.00014411691134051348, "lm_loss": 1.9848, "loss": 2.1317, "mask_loss": 0.1317, "step": 924, "topk_loss": 0.0152 }, { "epoch": 0.36764934201946303, "grad_norm": 0.1513671875, "learning_rate": 0.00014400250066949196, "lm_loss": 1.924, "loss": 2.066, "mask_loss": 0.1314, "step": 925, "topk_loss": 0.0106 }, { "epoch": 0.3680468007675922, "grad_norm": 0.1630859375, "learning_rate": 0.00014388801852527526, "lm_loss": 1.9335, "loss": 2.0793, "mask_loss": 0.1345, "step": 926, "topk_loss": 0.0113 }, { "epoch": 0.36844425951572135, "grad_norm": 0.28125, "learning_rate": 0.00014377346509381647, "lm_loss": 2.0293, "loss": 2.1739, "mask_loss": 0.1308, "step": 927, "topk_loss": 0.0138 }, { "epoch": 0.3688417182638505, "grad_norm": 0.17578125, "learning_rate": 0.00014365884056118466, "lm_loss": 1.9228, "loss": 2.064, "mask_loss": 0.1312, "step": 928, "topk_loss": 0.0101 }, { "epoch": 0.36923917701197967, "grad_norm": 0.181640625, "learning_rate": 0.00014354414511356427, "lm_loss": 1.9399, "loss": 2.0858, "mask_loss": 0.1336, "step": 929, "topk_loss": 0.0123 }, { "epoch": 0.3696366357601088, "grad_norm": 0.173828125, "learning_rate": 0.00014342937893725488, "lm_loss": 1.9367, "loss": 2.0863, "mask_loss": 0.133, "step": 930, "topk_loss": 0.0166 }, { "epoch": 0.37003409450823793, "grad_norm": 0.2236328125, "learning_rate": 0.00014331454221867108, "lm_loss": 1.9506, "loss": 2.0912, "mask_loss": 0.1298, "step": 931, "topk_loss": 0.0109 }, { "epoch": 0.3704315532563671, "grad_norm": 0.2412109375, "learning_rate": 0.0001431996351443419, "lm_loss": 1.9137, "loss": 2.0563, "mask_loss": 0.1311, "step": 932, "topk_loss": 0.0115 }, { "epoch": 0.37082901200449625, "grad_norm": 0.158203125, "learning_rate": 0.00014308465790091086, "lm_loss": 2.0159, "loss": 2.157, "mask_loss": 0.1303, "step": 933, "topk_loss": 0.0108 }, { "epoch": 0.3712264707526254, "grad_norm": 0.1357421875, "learning_rate": 0.0001429696106751352, "lm_loss": 1.9211, "loss": 2.0639, "mask_loss": 0.1328, "step": 934, "topk_loss": 0.01 }, { "epoch": 0.37162392950075457, "grad_norm": 0.1552734375, "learning_rate": 0.00014285449365388598, "lm_loss": 1.9324, "loss": 2.0745, "mask_loss": 0.1312, "step": 935, "topk_loss": 0.0109 }, { "epoch": 0.3720213882488837, "grad_norm": 0.197265625, "learning_rate": 0.00014273930702414766, "lm_loss": 1.9688, "loss": 2.1088, "mask_loss": 0.13, "step": 936, "topk_loss": 0.0099 }, { "epoch": 0.37241884699701283, "grad_norm": 0.2099609375, "learning_rate": 0.00014262405097301763, "lm_loss": 1.9681, "loss": 2.109, "mask_loss": 0.1295, "step": 937, "topk_loss": 0.0114 }, { "epoch": 0.372816305745142, "grad_norm": 0.1865234375, "learning_rate": 0.0001425087256877062, "lm_loss": 1.916, "loss": 2.0563, "mask_loss": 0.1298, "step": 938, "topk_loss": 0.0105 }, { "epoch": 0.37321376449327115, "grad_norm": 0.1376953125, "learning_rate": 0.00014239333135553596, "lm_loss": 1.9135, "loss": 2.0549, "mask_loss": 0.1314, "step": 939, "topk_loss": 0.0099 }, { "epoch": 0.3736112232414003, "grad_norm": 0.1767578125, "learning_rate": 0.00014227786816394184, "lm_loss": 1.9723, "loss": 2.1155, "mask_loss": 0.1313, "step": 940, "topk_loss": 0.0118 }, { "epoch": 0.37400868198952947, "grad_norm": 0.17578125, "learning_rate": 0.0001421623363004705, "lm_loss": 1.9524, "loss": 2.0944, "mask_loss": 0.1313, "step": 941, "topk_loss": 0.0107 }, { "epoch": 0.3744061407376586, "grad_norm": 0.23828125, "learning_rate": 0.00014204673595278016, "lm_loss": 1.9616, "loss": 2.1092, "mask_loss": 0.1335, "step": 942, "topk_loss": 0.014 }, { "epoch": 0.37480359948578773, "grad_norm": 0.169921875, "learning_rate": 0.00014193106730864025, "lm_loss": 2.0416, "loss": 2.1849, "mask_loss": 0.1309, "step": 943, "topk_loss": 0.0125 }, { "epoch": 0.3752010582339169, "grad_norm": 0.13671875, "learning_rate": 0.00014181533055593123, "lm_loss": 1.9677, "loss": 2.1085, "mask_loss": 0.1299, "step": 944, "topk_loss": 0.0109 }, { "epoch": 0.37559851698204605, "grad_norm": 0.171875, "learning_rate": 0.00014169952588264417, "lm_loss": 1.9138, "loss": 2.0575, "mask_loss": 0.1322, "step": 945, "topk_loss": 0.0115 }, { "epoch": 0.3759959757301752, "grad_norm": 0.2353515625, "learning_rate": 0.00014158365347688033, "lm_loss": 1.9805, "loss": 2.1201, "mask_loss": 0.1288, "step": 946, "topk_loss": 0.0108 }, { "epoch": 0.37639343447830437, "grad_norm": 0.2890625, "learning_rate": 0.00014146771352685112, "lm_loss": 1.984, "loss": 2.1205, "mask_loss": 0.1266, "step": 947, "topk_loss": 0.0099 }, { "epoch": 0.3767908932264335, "grad_norm": 0.1376953125, "learning_rate": 0.00014135170622087763, "lm_loss": 1.9521, "loss": 2.0922, "mask_loss": 0.1306, "step": 948, "topk_loss": 0.0095 }, { "epoch": 0.37718835197456263, "grad_norm": 0.169921875, "learning_rate": 0.00014123563174739037, "lm_loss": 1.9175, "loss": 2.0623, "mask_loss": 0.1333, "step": 949, "topk_loss": 0.0114 }, { "epoch": 0.3775858107226918, "grad_norm": 0.2080078125, "learning_rate": 0.0001411194902949289, "lm_loss": 1.9277, "loss": 2.0684, "mask_loss": 0.13, "step": 950, "topk_loss": 0.0107 }, { "epoch": 0.3775858107226918, "eval_lm_loss": 690.0902709960938, "eval_loss": 690.23388671875, "eval_mask_hit_rate": 0.5220222473144531, "eval_mask_loss": 0.12960419058799744, "eval_mask_top_10_hit_rate": 0.9827470183372498, "eval_mask_top_1_hit_rate": 0.9967126846313477, "eval_mask_top_20_hit_rate": 0.9721159934997559, "eval_mask_top_5_hit_rate": 0.9888770580291748, "eval_runtime": 144.1331, "eval_samples_per_second": 14.209, "eval_steps_per_second": 7.105, "eval_token_accuracy": 0.607653021812439, "eval_top_k_diff": -511.9417724609375, "eval_topk_loss": 0.013996293768286705, "step": 950 }, { "epoch": 0.37798326947082095, "grad_norm": 0.173828125, "learning_rate": 0.0001410032820521416, "lm_loss": 1.9364, "loss": 2.0792, "mask_loss": 0.131, "step": 951, "topk_loss": 0.0117 }, { "epoch": 0.3783807282189501, "grad_norm": 0.203125, "learning_rate": 0.00014088700720778542, "lm_loss": 1.9916, "loss": 2.132, "mask_loss": 0.1295, "step": 952, "topk_loss": 0.0109 }, { "epoch": 0.3787781869670792, "grad_norm": 0.140625, "learning_rate": 0.0001407706659507253, "lm_loss": 1.9614, "loss": 2.1022, "mask_loss": 0.131, "step": 953, "topk_loss": 0.0097 }, { "epoch": 0.3791756457152084, "grad_norm": 0.17578125, "learning_rate": 0.00014065425846993424, "lm_loss": 1.9445, "loss": 2.0854, "mask_loss": 0.1298, "step": 954, "topk_loss": 0.0112 }, { "epoch": 0.37957310446333753, "grad_norm": 0.1953125, "learning_rate": 0.0001405377849544927, "lm_loss": 1.9137, "loss": 2.0556, "mask_loss": 0.1321, "step": 955, "topk_loss": 0.0099 }, { "epoch": 0.37997056321146666, "grad_norm": 0.1884765625, "learning_rate": 0.00014042124559358846, "lm_loss": 1.9787, "loss": 2.1205, "mask_loss": 0.1304, "step": 956, "topk_loss": 0.0114 }, { "epoch": 0.38036802195959585, "grad_norm": 0.140625, "learning_rate": 0.00014030464057651626, "lm_loss": 1.9185, "loss": 2.0577, "mask_loss": 0.1299, "step": 957, "topk_loss": 0.0093 }, { "epoch": 0.380765480707725, "grad_norm": 0.1494140625, "learning_rate": 0.00014018797009267736, "lm_loss": 1.9063, "loss": 2.0482, "mask_loss": 0.1313, "step": 958, "topk_loss": 0.0106 }, { "epoch": 0.3811629394558541, "grad_norm": 0.1708984375, "learning_rate": 0.00014007123433157953, "lm_loss": 1.9527, "loss": 2.09, "mask_loss": 0.1279, "step": 959, "topk_loss": 0.0094 }, { "epoch": 0.3815603982039833, "grad_norm": 0.1953125, "learning_rate": 0.00013995443348283645, "lm_loss": 1.9635, "loss": 2.1111, "mask_loss": 0.1325, "step": 960, "topk_loss": 0.0151 }, { "epoch": 0.38195785695211243, "grad_norm": 0.16015625, "learning_rate": 0.00013983756773616762, "lm_loss": 1.9174, "loss": 2.06, "mask_loss": 0.1322, "step": 961, "topk_loss": 0.0105 }, { "epoch": 0.38235531570024156, "grad_norm": 0.1318359375, "learning_rate": 0.0001397206372813978, "lm_loss": 1.9535, "loss": 2.0936, "mask_loss": 0.1304, "step": 962, "topk_loss": 0.0098 }, { "epoch": 0.38275277444837075, "grad_norm": 0.1455078125, "learning_rate": 0.000139603642308457, "lm_loss": 1.9739, "loss": 2.1229, "mask_loss": 0.1342, "step": 963, "topk_loss": 0.0148 }, { "epoch": 0.3831502331964999, "grad_norm": 0.177734375, "learning_rate": 0.00013948658300737998, "lm_loss": 1.9771, "loss": 2.1228, "mask_loss": 0.1316, "step": 964, "topk_loss": 0.014 }, { "epoch": 0.383547691944629, "grad_norm": 0.1474609375, "learning_rate": 0.00013936945956830602, "lm_loss": 1.9432, "loss": 2.0818, "mask_loss": 0.1284, "step": 965, "topk_loss": 0.0101 }, { "epoch": 0.3839451506927582, "grad_norm": 0.16796875, "learning_rate": 0.00013925227218147847, "lm_loss": 1.9558, "loss": 2.0974, "mask_loss": 0.1299, "step": 966, "topk_loss": 0.0117 }, { "epoch": 0.38434260944088733, "grad_norm": 0.134765625, "learning_rate": 0.00013913502103724468, "lm_loss": 1.9696, "loss": 2.1086, "mask_loss": 0.1284, "step": 967, "topk_loss": 0.0106 }, { "epoch": 0.38474006818901646, "grad_norm": 0.18359375, "learning_rate": 0.00013901770632605547, "lm_loss": 2.0012, "loss": 2.144, "mask_loss": 0.1296, "step": 968, "topk_loss": 0.0132 }, { "epoch": 0.38513752693714565, "grad_norm": 0.126953125, "learning_rate": 0.00013890032823846496, "lm_loss": 1.8701, "loss": 2.0099, "mask_loss": 0.1298, "step": 969, "topk_loss": 0.01 }, { "epoch": 0.3855349856852748, "grad_norm": 0.1552734375, "learning_rate": 0.00013878288696513022, "lm_loss": 1.9742, "loss": 2.1138, "mask_loss": 0.1303, "step": 970, "topk_loss": 0.0092 }, { "epoch": 0.3859324444334039, "grad_norm": 0.1396484375, "learning_rate": 0.0001386653826968109, "lm_loss": 1.8972, "loss": 2.0379, "mask_loss": 0.1309, "step": 971, "topk_loss": 0.0099 }, { "epoch": 0.3863299031815331, "grad_norm": 0.12890625, "learning_rate": 0.00013854781562436906, "lm_loss": 1.954, "loss": 2.0933, "mask_loss": 0.1298, "step": 972, "topk_loss": 0.0096 }, { "epoch": 0.38672736192966223, "grad_norm": 0.1455078125, "learning_rate": 0.00013843018593876868, "lm_loss": 1.9546, "loss": 2.101, "mask_loss": 0.1339, "step": 973, "topk_loss": 0.0126 }, { "epoch": 0.38712482067779136, "grad_norm": 0.19140625, "learning_rate": 0.00013831249383107545, "lm_loss": 1.9676, "loss": 2.1118, "mask_loss": 0.1325, "step": 974, "topk_loss": 0.0116 }, { "epoch": 0.38752227942592055, "grad_norm": 0.1494140625, "learning_rate": 0.00013819473949245654, "lm_loss": 1.9611, "loss": 2.1034, "mask_loss": 0.1309, "step": 975, "topk_loss": 0.0114 }, { "epoch": 0.3879197381740497, "grad_norm": 0.138671875, "learning_rate": 0.0001380769231141801, "lm_loss": 1.962, "loss": 2.1035, "mask_loss": 0.131, "step": 976, "topk_loss": 0.0106 }, { "epoch": 0.3883171969221788, "grad_norm": 0.1494140625, "learning_rate": 0.00013795904488761516, "lm_loss": 1.9095, "loss": 2.0576, "mask_loss": 0.1336, "step": 977, "topk_loss": 0.0145 }, { "epoch": 0.388714655670308, "grad_norm": 0.259765625, "learning_rate": 0.00013784110500423104, "lm_loss": 1.937, "loss": 2.0796, "mask_loss": 0.1311, "step": 978, "topk_loss": 0.0114 }, { "epoch": 0.3891121144184371, "grad_norm": 0.1396484375, "learning_rate": 0.0001377231036555974, "lm_loss": 2.0023, "loss": 2.1457, "mask_loss": 0.1311, "step": 979, "topk_loss": 0.0123 }, { "epoch": 0.38950957316656626, "grad_norm": 0.134765625, "learning_rate": 0.0001376050410333836, "lm_loss": 1.9727, "loss": 2.1118, "mask_loss": 0.1298, "step": 980, "topk_loss": 0.0092 }, { "epoch": 0.3899070319146954, "grad_norm": 0.2060546875, "learning_rate": 0.00013748691732935864, "lm_loss": 1.9375, "loss": 2.0856, "mask_loss": 0.1326, "step": 981, "topk_loss": 0.0155 }, { "epoch": 0.3903044906628246, "grad_norm": 0.216796875, "learning_rate": 0.00013736873273539058, "lm_loss": 1.9246, "loss": 2.0686, "mask_loss": 0.1329, "step": 982, "topk_loss": 0.0111 }, { "epoch": 0.3907019494109537, "grad_norm": 0.1572265625, "learning_rate": 0.00013725048744344658, "lm_loss": 1.9293, "loss": 2.0731, "mask_loss": 0.1322, "step": 983, "topk_loss": 0.0116 }, { "epoch": 0.39109940815908284, "grad_norm": 0.1748046875, "learning_rate": 0.00013713218164559222, "lm_loss": 2.0246, "loss": 2.1643, "mask_loss": 0.1287, "step": 984, "topk_loss": 0.0109 }, { "epoch": 0.391496866907212, "grad_norm": 0.162109375, "learning_rate": 0.00013701381553399145, "lm_loss": 1.9579, "loss": 2.0978, "mask_loss": 0.1298, "step": 985, "topk_loss": 0.0101 }, { "epoch": 0.39189432565534116, "grad_norm": 0.1982421875, "learning_rate": 0.00013689538930090618, "lm_loss": 1.9602, "loss": 2.1008, "mask_loss": 0.1293, "step": 986, "topk_loss": 0.0113 }, { "epoch": 0.3922917844034703, "grad_norm": 0.1376953125, "learning_rate": 0.00013677690313869593, "lm_loss": 1.9438, "loss": 2.0823, "mask_loss": 0.1284, "step": 987, "topk_loss": 0.0102 }, { "epoch": 0.3926892431515995, "grad_norm": 0.1416015625, "learning_rate": 0.0001366583572398176, "lm_loss": 1.9037, "loss": 2.0451, "mask_loss": 0.1307, "step": 988, "topk_loss": 0.0107 }, { "epoch": 0.3930867018997286, "grad_norm": 0.138671875, "learning_rate": 0.00013653975179682515, "lm_loss": 2.012, "loss": 2.15, "mask_loss": 0.1281, "step": 989, "topk_loss": 0.0099 }, { "epoch": 0.39348416064785774, "grad_norm": 0.18359375, "learning_rate": 0.00013642108700236916, "lm_loss": 1.9912, "loss": 2.1324, "mask_loss": 0.1305, "step": 990, "topk_loss": 0.0107 }, { "epoch": 0.3938816193959869, "grad_norm": 0.173828125, "learning_rate": 0.00013630236304919673, "lm_loss": 1.9203, "loss": 2.0621, "mask_loss": 0.1303, "step": 991, "topk_loss": 0.0115 }, { "epoch": 0.39427907814411606, "grad_norm": 0.1279296875, "learning_rate": 0.00013618358013015098, "lm_loss": 1.9615, "loss": 2.1036, "mask_loss": 0.1313, "step": 992, "topk_loss": 0.0109 }, { "epoch": 0.3946765368922452, "grad_norm": 0.1357421875, "learning_rate": 0.00013606473843817086, "lm_loss": 1.9148, "loss": 2.0565, "mask_loss": 0.1302, "step": 993, "topk_loss": 0.0114 }, { "epoch": 0.3950739956403744, "grad_norm": 0.146484375, "learning_rate": 0.0001359458381662907, "lm_loss": 2.0316, "loss": 2.1717, "mask_loss": 0.1303, "step": 994, "topk_loss": 0.0098 }, { "epoch": 0.3954714543885035, "grad_norm": 0.158203125, "learning_rate": 0.00013582687950764, "lm_loss": 1.9824, "loss": 2.1226, "mask_loss": 0.1288, "step": 995, "topk_loss": 0.0115 }, { "epoch": 0.39586891313663264, "grad_norm": 0.21875, "learning_rate": 0.0001357078626554432, "lm_loss": 1.9552, "loss": 2.0961, "mask_loss": 0.1303, "step": 996, "topk_loss": 0.0106 }, { "epoch": 0.3962663718847618, "grad_norm": 0.1591796875, "learning_rate": 0.00013558878780301918, "lm_loss": 1.8611, "loss": 2.0037, "mask_loss": 0.1321, "step": 997, "topk_loss": 0.0104 }, { "epoch": 0.39666383063289096, "grad_norm": 0.2177734375, "learning_rate": 0.000135469655143781, "lm_loss": 1.9426, "loss": 2.0828, "mask_loss": 0.1294, "step": 998, "topk_loss": 0.0108 }, { "epoch": 0.3970612893810201, "grad_norm": 0.16015625, "learning_rate": 0.0001353504648712357, "lm_loss": 1.9499, "loss": 2.0884, "mask_loss": 0.1291, "step": 999, "topk_loss": 0.0095 }, { "epoch": 0.3974587481291493, "grad_norm": 0.142578125, "learning_rate": 0.00013523121717898387, "lm_loss": 1.9975, "loss": 2.1404, "mask_loss": 0.1318, "step": 1000, "topk_loss": 0.0111 }, { "epoch": 0.3974587481291493, "eval_lm_loss": 690.0011596679688, "eval_loss": 690.1435546875, "eval_mask_hit_rate": 0.5240002870559692, "eval_mask_loss": 0.1288737952709198, "eval_mask_top_10_hit_rate": 0.9831773638725281, "eval_mask_top_1_hit_rate": 0.9968259334564209, "eval_mask_top_20_hit_rate": 0.9727537631988525, "eval_mask_top_5_hit_rate": 0.9891763925552368, "eval_runtime": 144.1675, "eval_samples_per_second": 14.206, "eval_steps_per_second": 7.103, "eval_token_accuracy": 0.6087426543235779, "eval_top_k_diff": -517.8064575195312, "eval_topk_loss": 0.013575425371527672, "step": 1000 }, { "epoch": 0.3978562068772784, "grad_norm": 0.1318359375, "learning_rate": 0.00013511191226071932, "lm_loss": 1.9351, "loss": 2.0778, "mask_loss": 0.1313, "step": 1001, "topk_loss": 0.0114 }, { "epoch": 0.39825366562540754, "grad_norm": 0.2451171875, "learning_rate": 0.00013499255031022885, "lm_loss": 1.9526, "loss": 2.0903, "mask_loss": 0.1272, "step": 1002, "topk_loss": 0.0105 }, { "epoch": 0.3986511243735367, "grad_norm": 0.197265625, "learning_rate": 0.0001348731315213919, "lm_loss": 1.9861, "loss": 2.126, "mask_loss": 0.1294, "step": 1003, "topk_loss": 0.0105 }, { "epoch": 0.39904858312166586, "grad_norm": 0.14453125, "learning_rate": 0.00013475365608818027, "lm_loss": 1.927, "loss": 2.0676, "mask_loss": 0.1305, "step": 1004, "topk_loss": 0.0102 }, { "epoch": 0.399446041869795, "grad_norm": 0.140625, "learning_rate": 0.00013463412420465767, "lm_loss": 1.9248, "loss": 2.0663, "mask_loss": 0.1296, "step": 1005, "topk_loss": 0.0119 }, { "epoch": 0.3998435006179242, "grad_norm": 0.171875, "learning_rate": 0.00013451453606497956, "lm_loss": 1.9101, "loss": 2.0537, "mask_loss": 0.1307, "step": 1006, "topk_loss": 0.0129 }, { "epoch": 0.4002409593660533, "grad_norm": 0.1943359375, "learning_rate": 0.00013439489186339282, "lm_loss": 1.9559, "loss": 2.0934, "mask_loss": 0.1275, "step": 1007, "topk_loss": 0.0101 }, { "epoch": 0.40063841811418244, "grad_norm": 0.130859375, "learning_rate": 0.00013427519179423528, "lm_loss": 1.8883, "loss": 2.0283, "mask_loss": 0.1303, "step": 1008, "topk_loss": 0.0097 }, { "epoch": 0.40103587686231157, "grad_norm": 0.134765625, "learning_rate": 0.00013415543605193567, "lm_loss": 1.9815, "loss": 2.1222, "mask_loss": 0.13, "step": 1009, "topk_loss": 0.0107 }, { "epoch": 0.40143333561044076, "grad_norm": 0.1845703125, "learning_rate": 0.00013403562483101298, "lm_loss": 1.9476, "loss": 2.0912, "mask_loss": 0.1309, "step": 1010, "topk_loss": 0.0127 }, { "epoch": 0.4018307943585699, "grad_norm": 0.142578125, "learning_rate": 0.00013391575832607643, "lm_loss": 2.0251, "loss": 2.1652, "mask_loss": 0.1289, "step": 1011, "topk_loss": 0.0111 }, { "epoch": 0.402228253106699, "grad_norm": 0.1416015625, "learning_rate": 0.000133795836731825, "lm_loss": 1.9069, "loss": 2.0469, "mask_loss": 0.1302, "step": 1012, "topk_loss": 0.0098 }, { "epoch": 0.4026257118548282, "grad_norm": 0.1279296875, "learning_rate": 0.00013367586024304714, "lm_loss": 1.9828, "loss": 2.1196, "mask_loss": 0.1276, "step": 1013, "topk_loss": 0.0093 }, { "epoch": 0.40302317060295734, "grad_norm": 0.189453125, "learning_rate": 0.0001335558290546205, "lm_loss": 2.0127, "loss": 2.1539, "mask_loss": 0.1295, "step": 1014, "topk_loss": 0.0118 }, { "epoch": 0.40342062935108647, "grad_norm": 0.169921875, "learning_rate": 0.00013343574336151153, "lm_loss": 1.918, "loss": 2.0595, "mask_loss": 0.1293, "step": 1015, "topk_loss": 0.0123 }, { "epoch": 0.40381808809921566, "grad_norm": 0.1376953125, "learning_rate": 0.00013331560335877525, "lm_loss": 1.9773, "loss": 2.1191, "mask_loss": 0.1314, "step": 1016, "topk_loss": 0.0104 }, { "epoch": 0.4042155468473448, "grad_norm": 0.1328125, "learning_rate": 0.0001331954092415549, "lm_loss": 1.98, "loss": 2.1209, "mask_loss": 0.1302, "step": 1017, "topk_loss": 0.0106 }, { "epoch": 0.4046130055954739, "grad_norm": 0.1396484375, "learning_rate": 0.00013307516120508161, "lm_loss": 1.9573, "loss": 2.0995, "mask_loss": 0.1317, "step": 1018, "topk_loss": 0.0105 }, { "epoch": 0.4050104643436031, "grad_norm": 0.138671875, "learning_rate": 0.00013295485944467405, "lm_loss": 2.0041, "loss": 2.1453, "mask_loss": 0.1314, "step": 1019, "topk_loss": 0.0099 }, { "epoch": 0.40540792309173224, "grad_norm": 0.14453125, "learning_rate": 0.0001328345041557382, "lm_loss": 1.8941, "loss": 2.0372, "mask_loss": 0.1321, "step": 1020, "topk_loss": 0.011 }, { "epoch": 0.40580538183986137, "grad_norm": 0.12890625, "learning_rate": 0.000132714095533767, "lm_loss": 1.8778, "loss": 2.0211, "mask_loss": 0.1318, "step": 1021, "topk_loss": 0.0115 }, { "epoch": 0.40620284058799055, "grad_norm": 0.12353515625, "learning_rate": 0.00013259363377433994, "lm_loss": 1.962, "loss": 2.0981, "mask_loss": 0.1272, "step": 1022, "topk_loss": 0.0089 }, { "epoch": 0.4066002993361197, "grad_norm": 0.12353515625, "learning_rate": 0.0001324731190731229, "lm_loss": 1.9555, "loss": 2.0984, "mask_loss": 0.1309, "step": 1023, "topk_loss": 0.012 }, { "epoch": 0.4069977580842488, "grad_norm": 0.1484375, "learning_rate": 0.00013235255162586773, "lm_loss": 1.9699, "loss": 2.1104, "mask_loss": 0.1297, "step": 1024, "topk_loss": 0.0108 }, { "epoch": 0.407395216832378, "grad_norm": 0.1572265625, "learning_rate": 0.000132231931628412, "lm_loss": 1.8687, "loss": 2.0083, "mask_loss": 0.1294, "step": 1025, "topk_loss": 0.0103 }, { "epoch": 0.40779267558050714, "grad_norm": 0.2080078125, "learning_rate": 0.0001321112592766785, "lm_loss": 1.8635, "loss": 2.0047, "mask_loss": 0.1306, "step": 1026, "topk_loss": 0.0106 }, { "epoch": 0.40819013432863627, "grad_norm": 0.130859375, "learning_rate": 0.00013199053476667518, "lm_loss": 1.9358, "loss": 2.0763, "mask_loss": 0.1294, "step": 1027, "topk_loss": 0.0111 }, { "epoch": 0.40858759307676545, "grad_norm": 0.259765625, "learning_rate": 0.0001318697582944947, "lm_loss": 1.9778, "loss": 2.12, "mask_loss": 0.1299, "step": 1028, "topk_loss": 0.0122 }, { "epoch": 0.4089850518248946, "grad_norm": 0.2060546875, "learning_rate": 0.00013174893005631414, "lm_loss": 1.9636, "loss": 2.1044, "mask_loss": 0.1295, "step": 1029, "topk_loss": 0.0113 }, { "epoch": 0.4093825105730237, "grad_norm": 0.1259765625, "learning_rate": 0.00013162805024839448, "lm_loss": 1.9506, "loss": 2.0864, "mask_loss": 0.1263, "step": 1030, "topk_loss": 0.0094 }, { "epoch": 0.4097799693211529, "grad_norm": 0.1416015625, "learning_rate": 0.00013150711906708077, "lm_loss": 1.9486, "loss": 2.0921, "mask_loss": 0.1325, "step": 1031, "topk_loss": 0.0109 }, { "epoch": 0.41017742806928204, "grad_norm": 0.177734375, "learning_rate": 0.00013138613670880123, "lm_loss": 1.9334, "loss": 2.0708, "mask_loss": 0.1281, "step": 1032, "topk_loss": 0.0093 }, { "epoch": 0.41057488681741117, "grad_norm": 0.16015625, "learning_rate": 0.0001312651033700674, "lm_loss": 1.9784, "loss": 2.118, "mask_loss": 0.1295, "step": 1033, "topk_loss": 0.0101 }, { "epoch": 0.41097234556554035, "grad_norm": 0.2119140625, "learning_rate": 0.0001311440192474735, "lm_loss": 1.9494, "loss": 2.0924, "mask_loss": 0.1305, "step": 1034, "topk_loss": 0.0125 }, { "epoch": 0.4113698043136695, "grad_norm": 0.1689453125, "learning_rate": 0.00013102288453769632, "lm_loss": 1.9732, "loss": 2.1122, "mask_loss": 0.1275, "step": 1035, "topk_loss": 0.0115 }, { "epoch": 0.4117672630617986, "grad_norm": 0.13671875, "learning_rate": 0.00013090169943749476, "lm_loss": 1.9036, "loss": 2.0438, "mask_loss": 0.1301, "step": 1036, "topk_loss": 0.0101 }, { "epoch": 0.41216472180992775, "grad_norm": 0.19140625, "learning_rate": 0.0001307804641437096, "lm_loss": 1.88, "loss": 2.0215, "mask_loss": 0.1311, "step": 1037, "topk_loss": 0.0104 }, { "epoch": 0.41256218055805693, "grad_norm": 0.318359375, "learning_rate": 0.00013065917885326313, "lm_loss": 1.9213, "loss": 2.1418, "mask_loss": 0.1812, "step": 1038, "topk_loss": 0.0393 }, { "epoch": 0.41295963930618607, "grad_norm": 0.2470703125, "learning_rate": 0.00013053784376315888, "lm_loss": 1.8941, "loss": 2.0317, "mask_loss": 0.1272, "step": 1039, "topk_loss": 0.0105 }, { "epoch": 0.4133570980543152, "grad_norm": 0.1962890625, "learning_rate": 0.0001304164590704813, "lm_loss": 1.9242, "loss": 2.0638, "mask_loss": 0.1287, "step": 1040, "topk_loss": 0.0109 }, { "epoch": 0.4137545568024444, "grad_norm": 0.1748046875, "learning_rate": 0.00013029502497239533, "lm_loss": 1.9485, "loss": 2.0907, "mask_loss": 0.1297, "step": 1041, "topk_loss": 0.0125 }, { "epoch": 0.4141520155505735, "grad_norm": 0.1689453125, "learning_rate": 0.00013017354166614613, "lm_loss": 1.9211, "loss": 2.061, "mask_loss": 0.1294, "step": 1042, "topk_loss": 0.0106 }, { "epoch": 0.41454947429870265, "grad_norm": 0.2021484375, "learning_rate": 0.0001300520093490589, "lm_loss": 1.9634, "loss": 2.1031, "mask_loss": 0.1288, "step": 1043, "topk_loss": 0.0109 }, { "epoch": 0.41494693304683183, "grad_norm": 0.193359375, "learning_rate": 0.0001299304282185384, "lm_loss": 1.9653, "loss": 2.1087, "mask_loss": 0.1307, "step": 1044, "topk_loss": 0.0126 }, { "epoch": 0.41534439179496097, "grad_norm": 0.142578125, "learning_rate": 0.0001298087984720687, "lm_loss": 1.9501, "loss": 2.0898, "mask_loss": 0.1295, "step": 1045, "topk_loss": 0.0102 }, { "epoch": 0.4157418505430901, "grad_norm": 0.14453125, "learning_rate": 0.00012968712030721278, "lm_loss": 1.9013, "loss": 2.0384, "mask_loss": 0.1279, "step": 1046, "topk_loss": 0.0092 }, { "epoch": 0.4161393092912193, "grad_norm": 0.166015625, "learning_rate": 0.00012956539392161229, "lm_loss": 1.91, "loss": 2.0526, "mask_loss": 0.1311, "step": 1047, "topk_loss": 0.0115 }, { "epoch": 0.4165367680393484, "grad_norm": 0.1552734375, "learning_rate": 0.00012944361951298722, "lm_loss": 1.9418, "loss": 2.08, "mask_loss": 0.1285, "step": 1048, "topk_loss": 0.0097 }, { "epoch": 0.41693422678747755, "grad_norm": 0.1337890625, "learning_rate": 0.0001293217972791356, "lm_loss": 1.9176, "loss": 2.0586, "mask_loss": 0.1306, "step": 1049, "topk_loss": 0.0104 }, { "epoch": 0.41733168553560673, "grad_norm": 0.1220703125, "learning_rate": 0.00012919992741793307, "lm_loss": 1.9382, "loss": 2.0761, "mask_loss": 0.128, "step": 1050, "topk_loss": 0.0099 }, { "epoch": 0.41733168553560673, "eval_lm_loss": 689.869384765625, "eval_loss": 690.0109252929688, "eval_mask_hit_rate": 0.5258909463882446, "eval_mask_loss": 0.12821093201637268, "eval_mask_top_10_hit_rate": 0.9835715293884277, "eval_mask_top_1_hit_rate": 0.9969048500061035, "eval_mask_top_20_hit_rate": 0.9733020067214966, "eval_mask_top_5_hit_rate": 0.9894579648971558, "eval_runtime": 143.5444, "eval_samples_per_second": 14.267, "eval_steps_per_second": 7.134, "eval_token_accuracy": 0.6096411943435669, "eval_top_k_diff": -523.7191162109375, "eval_topk_loss": 0.013318357057869434, "step": 1050 }, { "epoch": 0.41772914428373586, "grad_norm": 0.1962890625, "learning_rate": 0.0001290780101273326, "lm_loss": 1.943, "loss": 2.1011, "mask_loss": 0.1379, "step": 1051, "topk_loss": 0.0202 }, { "epoch": 0.418126603031865, "grad_norm": 0.134765625, "learning_rate": 0.00012895604560536435, "lm_loss": 1.9621, "loss": 2.0999, "mask_loss": 0.1288, "step": 1052, "topk_loss": 0.009 }, { "epoch": 0.4185240617799942, "grad_norm": 0.1328125, "learning_rate": 0.0001288340340501351, "lm_loss": 1.9349, "loss": 2.0802, "mask_loss": 0.1337, "step": 1053, "topk_loss": 0.0115 }, { "epoch": 0.4189215205281233, "grad_norm": 0.173828125, "learning_rate": 0.000128711975659828, "lm_loss": 1.9396, "loss": 2.077, "mask_loss": 0.1274, "step": 1054, "topk_loss": 0.01 }, { "epoch": 0.41931897927625245, "grad_norm": 0.11962890625, "learning_rate": 0.0001285898706327023, "lm_loss": 1.92, "loss": 2.0605, "mask_loss": 0.1302, "step": 1055, "topk_loss": 0.0103 }, { "epoch": 0.41971643802438163, "grad_norm": 0.12890625, "learning_rate": 0.00012846771916709304, "lm_loss": 1.855, "loss": 1.9959, "mask_loss": 0.1316, "step": 1056, "topk_loss": 0.0093 }, { "epoch": 0.42011389677251076, "grad_norm": 0.134765625, "learning_rate": 0.00012834552146141065, "lm_loss": 1.9188, "loss": 2.0576, "mask_loss": 0.1292, "step": 1057, "topk_loss": 0.0095 }, { "epoch": 0.4205113555206399, "grad_norm": 0.1455078125, "learning_rate": 0.00012822327771414067, "lm_loss": 1.9327, "loss": 2.0795, "mask_loss": 0.1333, "step": 1058, "topk_loss": 0.0134 }, { "epoch": 0.4209088142687691, "grad_norm": 0.16796875, "learning_rate": 0.00012810098812384346, "lm_loss": 1.8952, "loss": 2.0352, "mask_loss": 0.129, "step": 1059, "topk_loss": 0.011 }, { "epoch": 0.4213062730168982, "grad_norm": 0.234375, "learning_rate": 0.0001279786528891538, "lm_loss": 1.8074, "loss": 1.9475, "mask_loss": 0.1296, "step": 1060, "topk_loss": 0.0105 }, { "epoch": 0.42170373176502735, "grad_norm": 0.1259765625, "learning_rate": 0.0001278562722087806, "lm_loss": 1.9386, "loss": 2.0806, "mask_loss": 0.1302, "step": 1061, "topk_loss": 0.0117 }, { "epoch": 0.42210119051315653, "grad_norm": 0.154296875, "learning_rate": 0.00012773384628150667, "lm_loss": 1.8896, "loss": 2.0308, "mask_loss": 0.13, "step": 1062, "topk_loss": 0.0112 }, { "epoch": 0.42249864926128566, "grad_norm": 0.12890625, "learning_rate": 0.0001276113753061882, "lm_loss": 1.9116, "loss": 2.0539, "mask_loss": 0.1319, "step": 1063, "topk_loss": 0.0104 }, { "epoch": 0.4228961080094148, "grad_norm": 0.1728515625, "learning_rate": 0.00012748885948175466, "lm_loss": 1.8769, "loss": 2.0163, "mask_loss": 0.1297, "step": 1064, "topk_loss": 0.0098 }, { "epoch": 0.4232935667575439, "grad_norm": 0.19921875, "learning_rate": 0.0001273662990072083, "lm_loss": 1.9988, "loss": 2.1388, "mask_loss": 0.1285, "step": 1065, "topk_loss": 0.0114 }, { "epoch": 0.4236910255056731, "grad_norm": 0.1796875, "learning_rate": 0.0001272436940816239, "lm_loss": 1.9298, "loss": 2.0734, "mask_loss": 0.1302, "step": 1066, "topk_loss": 0.0133 }, { "epoch": 0.42408848425380224, "grad_norm": 0.130859375, "learning_rate": 0.00012712104490414844, "lm_loss": 1.9186, "loss": 2.0582, "mask_loss": 0.129, "step": 1067, "topk_loss": 0.0106 }, { "epoch": 0.4244859430019314, "grad_norm": 0.2099609375, "learning_rate": 0.00012699835167400084, "lm_loss": 1.9561, "loss": 2.0928, "mask_loss": 0.128, "step": 1068, "topk_loss": 0.0088 }, { "epoch": 0.42488340175006056, "grad_norm": 0.22265625, "learning_rate": 0.0001268756145904715, "lm_loss": 1.9269, "loss": 2.0654, "mask_loss": 0.1281, "step": 1069, "topk_loss": 0.0104 }, { "epoch": 0.4252808604981897, "grad_norm": 0.1376953125, "learning_rate": 0.00012675283385292212, "lm_loss": 1.948, "loss": 2.0851, "mask_loss": 0.128, "step": 1070, "topk_loss": 0.0092 }, { "epoch": 0.4256783192463188, "grad_norm": 0.1376953125, "learning_rate": 0.00012663000966078516, "lm_loss": 1.9498, "loss": 2.0969, "mask_loss": 0.1344, "step": 1071, "topk_loss": 0.0128 }, { "epoch": 0.426075777994448, "grad_norm": 0.126953125, "learning_rate": 0.00012650714221356388, "lm_loss": 1.9578, "loss": 2.0974, "mask_loss": 0.1282, "step": 1072, "topk_loss": 0.0114 }, { "epoch": 0.42647323674257714, "grad_norm": 0.1328125, "learning_rate": 0.00012638423171083165, "lm_loss": 1.9249, "loss": 2.0654, "mask_loss": 0.1309, "step": 1073, "topk_loss": 0.0096 }, { "epoch": 0.4268706954907063, "grad_norm": 0.1806640625, "learning_rate": 0.00012626127835223177, "lm_loss": 1.9757, "loss": 2.1239, "mask_loss": 0.1333, "step": 1074, "topk_loss": 0.0149 }, { "epoch": 0.42726815423883546, "grad_norm": 0.224609375, "learning_rate": 0.00012613828233747727, "lm_loss": 2.0369, "loss": 2.1774, "mask_loss": 0.1285, "step": 1075, "topk_loss": 0.0119 }, { "epoch": 0.4276656129869646, "grad_norm": 0.1259765625, "learning_rate": 0.00012601524386635036, "lm_loss": 1.9857, "loss": 2.1217, "mask_loss": 0.1263, "step": 1076, "topk_loss": 0.0097 }, { "epoch": 0.4280630717350937, "grad_norm": 0.1298828125, "learning_rate": 0.00012589216313870223, "lm_loss": 1.8912, "loss": 2.0309, "mask_loss": 0.1286, "step": 1077, "topk_loss": 0.011 }, { "epoch": 0.4284605304832229, "grad_norm": 0.15625, "learning_rate": 0.0001257690403544527, "lm_loss": 1.8908, "loss": 2.0333, "mask_loss": 0.1294, "step": 1078, "topk_loss": 0.0131 }, { "epoch": 0.42885798923135204, "grad_norm": 0.1259765625, "learning_rate": 0.00012564587571359, "lm_loss": 1.9254, "loss": 2.0659, "mask_loss": 0.1293, "step": 1079, "topk_loss": 0.0112 }, { "epoch": 0.4292554479794812, "grad_norm": 0.2099609375, "learning_rate": 0.00012552266941617018, "lm_loss": 1.9031, "loss": 2.0396, "mask_loss": 0.1272, "step": 1080, "topk_loss": 0.0093 }, { "epoch": 0.42965290672761036, "grad_norm": 0.1279296875, "learning_rate": 0.00012539942166231712, "lm_loss": 2.0051, "loss": 2.1465, "mask_loss": 0.1295, "step": 1081, "topk_loss": 0.0119 }, { "epoch": 0.4300503654757395, "grad_norm": 0.1298828125, "learning_rate": 0.00012527613265222187, "lm_loss": 1.9319, "loss": 2.0702, "mask_loss": 0.1279, "step": 1082, "topk_loss": 0.0104 }, { "epoch": 0.4304478242238686, "grad_norm": 0.158203125, "learning_rate": 0.00012515280258614266, "lm_loss": 1.9023, "loss": 2.0417, "mask_loss": 0.1298, "step": 1083, "topk_loss": 0.0096 }, { "epoch": 0.4308452829719978, "grad_norm": 0.1357421875, "learning_rate": 0.0001250294316644043, "lm_loss": 1.9735, "loss": 2.1158, "mask_loss": 0.1309, "step": 1084, "topk_loss": 0.0115 }, { "epoch": 0.43124274172012694, "grad_norm": 0.16015625, "learning_rate": 0.000124906020087398, "lm_loss": 1.8906, "loss": 2.0301, "mask_loss": 0.1297, "step": 1085, "topk_loss": 0.0097 }, { "epoch": 0.4316402004682561, "grad_norm": 0.1396484375, "learning_rate": 0.000124782568055581, "lm_loss": 1.9388, "loss": 2.0832, "mask_loss": 0.1321, "step": 1086, "topk_loss": 0.0123 }, { "epoch": 0.43203765921638526, "grad_norm": 0.12890625, "learning_rate": 0.00012465907576947622, "lm_loss": 1.8991, "loss": 2.0419, "mask_loss": 0.1317, "step": 1087, "topk_loss": 0.0111 }, { "epoch": 0.4324351179645144, "grad_norm": 0.130859375, "learning_rate": 0.000124535543429672, "lm_loss": 1.937, "loss": 2.0774, "mask_loss": 0.1283, "step": 1088, "topk_loss": 0.0121 }, { "epoch": 0.4328325767126435, "grad_norm": 0.17578125, "learning_rate": 0.0001244119712368218, "lm_loss": 1.8791, "loss": 2.016, "mask_loss": 0.1273, "step": 1089, "topk_loss": 0.0096 }, { "epoch": 0.4332300354607727, "grad_norm": 0.125, "learning_rate": 0.00012428835939164363, "lm_loss": 1.9085, "loss": 2.0468, "mask_loss": 0.1281, "step": 1090, "topk_loss": 0.0102 }, { "epoch": 0.43362749420890184, "grad_norm": 0.1806640625, "learning_rate": 0.00012416470809492011, "lm_loss": 2.0234, "loss": 2.1647, "mask_loss": 0.1301, "step": 1091, "topk_loss": 0.0111 }, { "epoch": 0.434024952957031, "grad_norm": 0.134765625, "learning_rate": 0.00012404101754749782, "lm_loss": 1.9514, "loss": 2.0922, "mask_loss": 0.1295, "step": 1092, "topk_loss": 0.0112 }, { "epoch": 0.4344224117051601, "grad_norm": 0.173828125, "learning_rate": 0.00012391728795028718, "lm_loss": 1.9315, "loss": 2.0693, "mask_loss": 0.1275, "step": 1093, "topk_loss": 0.0102 }, { "epoch": 0.4348198704532893, "grad_norm": 0.13671875, "learning_rate": 0.00012379351950426187, "lm_loss": 1.9659, "loss": 2.1042, "mask_loss": 0.1273, "step": 1094, "topk_loss": 0.011 }, { "epoch": 0.4352173292014184, "grad_norm": 0.1298828125, "learning_rate": 0.00012366971241045894, "lm_loss": 1.95, "loss": 2.089, "mask_loss": 0.1298, "step": 1095, "topk_loss": 0.0092 }, { "epoch": 0.43561478794954756, "grad_norm": 0.12255859375, "learning_rate": 0.00012354586686997792, "lm_loss": 1.9592, "loss": 2.0967, "mask_loss": 0.1276, "step": 1096, "topk_loss": 0.0099 }, { "epoch": 0.43601224669767674, "grad_norm": 0.1904296875, "learning_rate": 0.00012342198308398108, "lm_loss": 1.9012, "loss": 2.0601, "mask_loss": 0.1373, "step": 1097, "topk_loss": 0.0216 }, { "epoch": 0.4364097054458059, "grad_norm": 0.2001953125, "learning_rate": 0.00012329806125369253, "lm_loss": 1.8855, "loss": 2.0276, "mask_loss": 0.1309, "step": 1098, "topk_loss": 0.0112 }, { "epoch": 0.436807164193935, "grad_norm": 0.185546875, "learning_rate": 0.0001231741015803984, "lm_loss": 1.9489, "loss": 2.0897, "mask_loss": 0.1292, "step": 1099, "topk_loss": 0.0117 }, { "epoch": 0.4372046229420642, "grad_norm": 0.1318359375, "learning_rate": 0.00012305010426544614, "lm_loss": 1.8913, "loss": 2.0306, "mask_loss": 0.1289, "step": 1100, "topk_loss": 0.0105 }, { "epoch": 0.4372046229420642, "eval_lm_loss": 689.3663330078125, "eval_loss": 689.50732421875, "eval_mask_hit_rate": 0.5272812843322754, "eval_mask_loss": 0.12767279148101807, "eval_mask_top_10_hit_rate": 0.9839221835136414, "eval_mask_top_1_hit_rate": 0.9970452785491943, "eval_mask_top_20_hit_rate": 0.9737967252731323, "eval_mask_top_5_hit_rate": 0.9897100925445557, "eval_runtime": 144.2783, "eval_samples_per_second": 14.195, "eval_steps_per_second": 7.097, "eval_token_accuracy": 0.6103806495666504, "eval_top_k_diff": -526.74169921875, "eval_topk_loss": 0.013399062678217888, "step": 1100 }, { "epoch": 0.4376020816901933, "grad_norm": 0.2060546875, "learning_rate": 0.00012292606951024447, "lm_loss": 1.9511, "loss": 2.0901, "mask_loss": 0.1288, "step": 1101, "topk_loss": 0.0102 }, { "epoch": 0.43799954043832245, "grad_norm": 0.1279296875, "learning_rate": 0.00012280199751626278, "lm_loss": 1.8895, "loss": 2.0293, "mask_loss": 0.13, "step": 1102, "topk_loss": 0.0098 }, { "epoch": 0.43839699918645164, "grad_norm": 0.1318359375, "learning_rate": 0.00012267788848503106, "lm_loss": 1.9403, "loss": 2.0793, "mask_loss": 0.1289, "step": 1103, "topk_loss": 0.0101 }, { "epoch": 0.4387944579345808, "grad_norm": 0.14453125, "learning_rate": 0.00012255374261813944, "lm_loss": 1.9576, "loss": 2.0953, "mask_loss": 0.1273, "step": 1104, "topk_loss": 0.0104 }, { "epoch": 0.4391919166827099, "grad_norm": 0.1552734375, "learning_rate": 0.00012242956011723782, "lm_loss": 1.9051, "loss": 2.0509, "mask_loss": 0.1335, "step": 1105, "topk_loss": 0.0124 }, { "epoch": 0.4395893754308391, "grad_norm": 0.12890625, "learning_rate": 0.00012230534118403568, "lm_loss": 1.9667, "loss": 2.1039, "mask_loss": 0.1273, "step": 1106, "topk_loss": 0.0098 }, { "epoch": 0.4399868341789682, "grad_norm": 0.1435546875, "learning_rate": 0.00012218108602030163, "lm_loss": 1.9088, "loss": 2.0525, "mask_loss": 0.1317, "step": 1107, "topk_loss": 0.012 }, { "epoch": 0.44038429292709735, "grad_norm": 0.1376953125, "learning_rate": 0.00012205679482786317, "lm_loss": 2.0079, "loss": 2.1486, "mask_loss": 0.1294, "step": 1108, "topk_loss": 0.0113 }, { "epoch": 0.44078175167522654, "grad_norm": 0.1513671875, "learning_rate": 0.00012193246780860628, "lm_loss": 1.973, "loss": 2.1148, "mask_loss": 0.1301, "step": 1109, "topk_loss": 0.0117 }, { "epoch": 0.44117921042335567, "grad_norm": 0.1513671875, "learning_rate": 0.00012180810516447512, "lm_loss": 2.0177, "loss": 2.1566, "mask_loss": 0.1278, "step": 1110, "topk_loss": 0.0111 }, { "epoch": 0.4415766691714848, "grad_norm": 0.13671875, "learning_rate": 0.00012168370709747177, "lm_loss": 1.9501, "loss": 2.0943, "mask_loss": 0.132, "step": 1111, "topk_loss": 0.0122 }, { "epoch": 0.441974127919614, "grad_norm": 0.140625, "learning_rate": 0.00012155927380965582, "lm_loss": 1.9107, "loss": 2.0503, "mask_loss": 0.1302, "step": 1112, "topk_loss": 0.0095 }, { "epoch": 0.4423715866677431, "grad_norm": 0.16015625, "learning_rate": 0.0001214348055031441, "lm_loss": 1.9949, "loss": 2.1317, "mask_loss": 0.127, "step": 1113, "topk_loss": 0.0097 }, { "epoch": 0.44276904541587225, "grad_norm": 0.11865234375, "learning_rate": 0.00012131030238011025, "lm_loss": 1.8971, "loss": 2.0372, "mask_loss": 0.1292, "step": 1114, "topk_loss": 0.0109 }, { "epoch": 0.44316650416400144, "grad_norm": 0.15625, "learning_rate": 0.0001211857646427845, "lm_loss": 1.9859, "loss": 2.1258, "mask_loss": 0.128, "step": 1115, "topk_loss": 0.0119 }, { "epoch": 0.44356396291213057, "grad_norm": 0.15234375, "learning_rate": 0.00012106119249345336, "lm_loss": 1.9206, "loss": 2.0607, "mask_loss": 0.1296, "step": 1116, "topk_loss": 0.0105 }, { "epoch": 0.4439614216602597, "grad_norm": 0.140625, "learning_rate": 0.00012093658613445913, "lm_loss": 1.9308, "loss": 2.0717, "mask_loss": 0.1296, "step": 1117, "topk_loss": 0.0113 }, { "epoch": 0.4443588804083889, "grad_norm": 0.1630859375, "learning_rate": 0.00012081194576819974, "lm_loss": 1.9135, "loss": 2.0519, "mask_loss": 0.1285, "step": 1118, "topk_loss": 0.0099 }, { "epoch": 0.444756339156518, "grad_norm": 0.15625, "learning_rate": 0.00012068727159712838, "lm_loss": 1.9549, "loss": 2.0941, "mask_loss": 0.1289, "step": 1119, "topk_loss": 0.0103 }, { "epoch": 0.44515379790464715, "grad_norm": 0.1640625, "learning_rate": 0.00012056256382375308, "lm_loss": 1.9009, "loss": 2.0382, "mask_loss": 0.1278, "step": 1120, "topk_loss": 0.0095 }, { "epoch": 0.4455512566527763, "grad_norm": 0.1474609375, "learning_rate": 0.0001204378226506365, "lm_loss": 1.9945, "loss": 2.1307, "mask_loss": 0.127, "step": 1121, "topk_loss": 0.0092 }, { "epoch": 0.44594871540090547, "grad_norm": 0.12255859375, "learning_rate": 0.00012031304828039554, "lm_loss": 1.9242, "loss": 2.0615, "mask_loss": 0.1277, "step": 1122, "topk_loss": 0.0096 }, { "epoch": 0.4463461741490346, "grad_norm": 0.138671875, "learning_rate": 0.00012018824091570103, "lm_loss": 1.9284, "loss": 2.0676, "mask_loss": 0.129, "step": 1123, "topk_loss": 0.0101 }, { "epoch": 0.44674363289716373, "grad_norm": 0.1787109375, "learning_rate": 0.00012006340075927736, "lm_loss": 1.9781, "loss": 2.1182, "mask_loss": 0.1283, "step": 1124, "topk_loss": 0.0118 }, { "epoch": 0.4471410916452929, "grad_norm": 0.138671875, "learning_rate": 0.00011993852801390226, "lm_loss": 1.9751, "loss": 2.1202, "mask_loss": 0.1317, "step": 1125, "topk_loss": 0.0134 }, { "epoch": 0.44753855039342205, "grad_norm": 0.12890625, "learning_rate": 0.00011981362288240627, "lm_loss": 1.8825, "loss": 2.0205, "mask_loss": 0.1295, "step": 1126, "topk_loss": 0.0085 }, { "epoch": 0.4479360091415512, "grad_norm": 0.134765625, "learning_rate": 0.00011968868556767266, "lm_loss": 1.9584, "loss": 2.0997, "mask_loss": 0.1308, "step": 1127, "topk_loss": 0.0105 }, { "epoch": 0.44833346788968037, "grad_norm": 0.1337890625, "learning_rate": 0.00011956371627263687, "lm_loss": 1.9445, "loss": 2.0809, "mask_loss": 0.1263, "step": 1128, "topk_loss": 0.0101 }, { "epoch": 0.4487309266378095, "grad_norm": 0.1376953125, "learning_rate": 0.00011943871520028642, "lm_loss": 1.9005, "loss": 2.0399, "mask_loss": 0.1296, "step": 1129, "topk_loss": 0.0098 }, { "epoch": 0.44912838538593863, "grad_norm": 0.138671875, "learning_rate": 0.00011931368255366027, "lm_loss": 1.9449, "loss": 2.0849, "mask_loss": 0.1299, "step": 1130, "topk_loss": 0.0101 }, { "epoch": 0.4495258441340678, "grad_norm": 0.134765625, "learning_rate": 0.0001191886185358488, "lm_loss": 1.9556, "loss": 2.0947, "mask_loss": 0.1281, "step": 1131, "topk_loss": 0.011 }, { "epoch": 0.44992330288219695, "grad_norm": 0.1376953125, "learning_rate": 0.00011906352334999331, "lm_loss": 1.926, "loss": 2.0596, "mask_loss": 0.1247, "step": 1132, "topk_loss": 0.0089 }, { "epoch": 0.4503207616303261, "grad_norm": 0.1396484375, "learning_rate": 0.00011893839719928573, "lm_loss": 1.8941, "loss": 2.0384, "mask_loss": 0.1325, "step": 1133, "topk_loss": 0.0118 }, { "epoch": 0.45071822037845527, "grad_norm": 0.12890625, "learning_rate": 0.00011881324028696824, "lm_loss": 1.9472, "loss": 2.0868, "mask_loss": 0.1292, "step": 1134, "topk_loss": 0.0103 }, { "epoch": 0.4511156791265844, "grad_norm": 0.15625, "learning_rate": 0.00011868805281633304, "lm_loss": 1.9391, "loss": 2.0752, "mask_loss": 0.1268, "step": 1135, "topk_loss": 0.0093 }, { "epoch": 0.45151313787471353, "grad_norm": 0.14453125, "learning_rate": 0.00011856283499072196, "lm_loss": 1.8981, "loss": 2.0365, "mask_loss": 0.1284, "step": 1136, "topk_loss": 0.0101 }, { "epoch": 0.4519105966228427, "grad_norm": 0.1865234375, "learning_rate": 0.00011843758701352614, "lm_loss": 1.8743, "loss": 2.0243, "mask_loss": 0.1346, "step": 1137, "topk_loss": 0.0154 }, { "epoch": 0.45230805537097185, "grad_norm": 0.130859375, "learning_rate": 0.00011831230908818563, "lm_loss": 1.9063, "loss": 2.0504, "mask_loss": 0.1299, "step": 1138, "topk_loss": 0.0143 }, { "epoch": 0.452705514119101, "grad_norm": 0.169921875, "learning_rate": 0.00011818700141818921, "lm_loss": 1.9056, "loss": 2.0454, "mask_loss": 0.1293, "step": 1139, "topk_loss": 0.0105 }, { "epoch": 0.45310297286723017, "grad_norm": 0.16796875, "learning_rate": 0.00011806166420707392, "lm_loss": 1.9614, "loss": 2.0986, "mask_loss": 0.1276, "step": 1140, "topk_loss": 0.0096 }, { "epoch": 0.4535004316153593, "grad_norm": 0.1357421875, "learning_rate": 0.00011793629765842482, "lm_loss": 1.9674, "loss": 2.112, "mask_loss": 0.1318, "step": 1141, "topk_loss": 0.0128 }, { "epoch": 0.45389789036348843, "grad_norm": 0.126953125, "learning_rate": 0.00011781090197587459, "lm_loss": 1.9271, "loss": 2.0674, "mask_loss": 0.1295, "step": 1142, "topk_loss": 0.0108 }, { "epoch": 0.4542953491116176, "grad_norm": 0.203125, "learning_rate": 0.00011768547736310327, "lm_loss": 1.8767, "loss": 2.0153, "mask_loss": 0.1282, "step": 1143, "topk_loss": 0.0104 }, { "epoch": 0.45469280785974675, "grad_norm": 0.1552734375, "learning_rate": 0.00011756002402383783, "lm_loss": 1.9228, "loss": 2.0596, "mask_loss": 0.1268, "step": 1144, "topk_loss": 0.01 }, { "epoch": 0.4550902666078759, "grad_norm": 0.13671875, "learning_rate": 0.00011743454216185201, "lm_loss": 1.9901, "loss": 2.1275, "mask_loss": 0.1275, "step": 1145, "topk_loss": 0.0099 }, { "epoch": 0.45548772535600507, "grad_norm": 0.16796875, "learning_rate": 0.00011730903198096573, "lm_loss": 1.9429, "loss": 2.0844, "mask_loss": 0.1297, "step": 1146, "topk_loss": 0.0118 }, { "epoch": 0.4558851841041342, "grad_norm": 0.1328125, "learning_rate": 0.000117183493685045, "lm_loss": 1.9209, "loss": 2.0591, "mask_loss": 0.1285, "step": 1147, "topk_loss": 0.0097 }, { "epoch": 0.45628264285226333, "grad_norm": 0.125, "learning_rate": 0.00011705792747800153, "lm_loss": 1.8713, "loss": 2.0083, "mask_loss": 0.1286, "step": 1148, "topk_loss": 0.0084 }, { "epoch": 0.45668010160039246, "grad_norm": 0.140625, "learning_rate": 0.0001169323335637923, "lm_loss": 1.9579, "loss": 2.0947, "mask_loss": 0.1269, "step": 1149, "topk_loss": 0.0098 }, { "epoch": 0.45707756034852165, "grad_norm": 0.14453125, "learning_rate": 0.00011680671214641927, "lm_loss": 1.9345, "loss": 2.0771, "mask_loss": 0.1301, "step": 1150, "topk_loss": 0.0124 }, { "epoch": 0.45707756034852165, "eval_lm_loss": 689.210693359375, "eval_loss": 689.3509521484375, "eval_mask_hit_rate": 0.5287200808525085, "eval_mask_loss": 0.12717577815055847, "eval_mask_top_10_hit_rate": 0.9842155575752258, "eval_mask_top_1_hit_rate": 0.9971096515655518, "eval_mask_top_20_hit_rate": 0.9742197394371033, "eval_mask_top_5_hit_rate": 0.9899111390113831, "eval_runtime": 144.4636, "eval_samples_per_second": 14.177, "eval_steps_per_second": 7.088, "eval_token_accuracy": 0.6111284494400024, "eval_top_k_diff": -525.6437377929688, "eval_topk_loss": 0.013032329268753529, "step": 1150 }, { "epoch": 0.4574750190966508, "grad_norm": 0.1396484375, "learning_rate": 0.00011668106342992917, "lm_loss": 1.939, "loss": 2.0791, "mask_loss": 0.1299, "step": 1151, "topk_loss": 0.0103 }, { "epoch": 0.4578724778447799, "grad_norm": 0.130859375, "learning_rate": 0.000116555387618413, "lm_loss": 1.9299, "loss": 2.0699, "mask_loss": 0.1301, "step": 1152, "topk_loss": 0.01 }, { "epoch": 0.4582699365929091, "grad_norm": 0.158203125, "learning_rate": 0.00011642968491600581, "lm_loss": 1.9045, "loss": 2.0441, "mask_loss": 0.128, "step": 1153, "topk_loss": 0.0115 }, { "epoch": 0.45866739534103823, "grad_norm": 0.1640625, "learning_rate": 0.0001163039555268863, "lm_loss": 1.8923, "loss": 2.0362, "mask_loss": 0.1296, "step": 1154, "topk_loss": 0.0142 }, { "epoch": 0.45906485408916736, "grad_norm": 0.1943359375, "learning_rate": 0.0001161781996552765, "lm_loss": 1.9283, "loss": 2.0677, "mask_loss": 0.1269, "step": 1155, "topk_loss": 0.0125 }, { "epoch": 0.45946231283729655, "grad_norm": 0.2236328125, "learning_rate": 0.0001160524175054415, "lm_loss": 2.0017, "loss": 2.1393, "mask_loss": 0.1267, "step": 1156, "topk_loss": 0.0109 }, { "epoch": 0.4598597715854257, "grad_norm": 0.14453125, "learning_rate": 0.00011592660928168904, "lm_loss": 1.9245, "loss": 2.0614, "mask_loss": 0.1265, "step": 1157, "topk_loss": 0.0105 }, { "epoch": 0.4602572303335548, "grad_norm": 0.2373046875, "learning_rate": 0.00011580077518836927, "lm_loss": 1.9163, "loss": 2.0628, "mask_loss": 0.1307, "step": 1158, "topk_loss": 0.0158 }, { "epoch": 0.460654689081684, "grad_norm": 0.2001953125, "learning_rate": 0.00011567491542987427, "lm_loss": 1.8958, "loss": 2.0348, "mask_loss": 0.128, "step": 1159, "topk_loss": 0.0111 }, { "epoch": 0.46105214782981313, "grad_norm": 0.11767578125, "learning_rate": 0.0001155490302106379, "lm_loss": 1.8862, "loss": 2.0216, "mask_loss": 0.126, "step": 1160, "topk_loss": 0.0094 }, { "epoch": 0.46144960657794226, "grad_norm": 0.1455078125, "learning_rate": 0.00011542311973513534, "lm_loss": 1.905, "loss": 2.0475, "mask_loss": 0.1318, "step": 1161, "topk_loss": 0.0107 }, { "epoch": 0.46184706532607145, "grad_norm": 0.1630859375, "learning_rate": 0.00011529718420788269, "lm_loss": 1.9248, "loss": 2.0633, "mask_loss": 0.1284, "step": 1162, "topk_loss": 0.0101 }, { "epoch": 0.4622445240742006, "grad_norm": 0.15234375, "learning_rate": 0.00011517122383343692, "lm_loss": 1.9848, "loss": 2.1228, "mask_loss": 0.1269, "step": 1163, "topk_loss": 0.0111 }, { "epoch": 0.4626419828223297, "grad_norm": 0.12255859375, "learning_rate": 0.00011504523881639526, "lm_loss": 1.9251, "loss": 2.0624, "mask_loss": 0.1272, "step": 1164, "topk_loss": 0.01 }, { "epoch": 0.4630394415704589, "grad_norm": 0.17578125, "learning_rate": 0.00011491922936139499, "lm_loss": 1.8857, "loss": 2.0289, "mask_loss": 0.1295, "step": 1165, "topk_loss": 0.0137 }, { "epoch": 0.46343690031858803, "grad_norm": 0.1552734375, "learning_rate": 0.00011479319567311304, "lm_loss": 1.9295, "loss": 2.0796, "mask_loss": 0.1347, "step": 1166, "topk_loss": 0.0154 }, { "epoch": 0.46383435906671716, "grad_norm": 0.138671875, "learning_rate": 0.00011466713795626576, "lm_loss": 1.9211, "loss": 2.0586, "mask_loss": 0.1266, "step": 1167, "topk_loss": 0.0109 }, { "epoch": 0.46423181781484635, "grad_norm": 0.1298828125, "learning_rate": 0.0001145410564156085, "lm_loss": 1.9013, "loss": 2.0452, "mask_loss": 0.133, "step": 1168, "topk_loss": 0.0109 }, { "epoch": 0.4646292765629755, "grad_norm": 0.1494140625, "learning_rate": 0.00011441495125593538, "lm_loss": 1.9573, "loss": 2.0961, "mask_loss": 0.128, "step": 1169, "topk_loss": 0.0108 }, { "epoch": 0.4650267353111046, "grad_norm": 0.1943359375, "learning_rate": 0.00011428882268207872, "lm_loss": 1.9372, "loss": 2.0773, "mask_loss": 0.1288, "step": 1170, "topk_loss": 0.0114 }, { "epoch": 0.4654241940592338, "grad_norm": 0.15234375, "learning_rate": 0.00011416267089890901, "lm_loss": 1.8922, "loss": 2.0304, "mask_loss": 0.1266, "step": 1171, "topk_loss": 0.0116 }, { "epoch": 0.46582165280736293, "grad_norm": 0.1279296875, "learning_rate": 0.00011403649611133444, "lm_loss": 1.9008, "loss": 2.0439, "mask_loss": 0.1317, "step": 1172, "topk_loss": 0.0114 }, { "epoch": 0.46621911155549206, "grad_norm": 0.1806640625, "learning_rate": 0.00011391029852430048, "lm_loss": 1.9365, "loss": 2.0713, "mask_loss": 0.126, "step": 1173, "topk_loss": 0.0087 }, { "epoch": 0.46661657030362125, "grad_norm": 0.17578125, "learning_rate": 0.0001137840783427897, "lm_loss": 1.8781, "loss": 2.0154, "mask_loss": 0.1264, "step": 1174, "topk_loss": 0.0109 }, { "epoch": 0.4670140290517504, "grad_norm": 0.189453125, "learning_rate": 0.00011365783577182132, "lm_loss": 1.9147, "loss": 2.0551, "mask_loss": 0.1299, "step": 1175, "topk_loss": 0.0106 }, { "epoch": 0.4674114877998795, "grad_norm": 0.158203125, "learning_rate": 0.000113531571016451, "lm_loss": 1.9203, "loss": 2.0579, "mask_loss": 0.1271, "step": 1176, "topk_loss": 0.0106 }, { "epoch": 0.4678089465480087, "grad_norm": 0.1884765625, "learning_rate": 0.0001134052842817704, "lm_loss": 1.8985, "loss": 2.0398, "mask_loss": 0.1293, "step": 1177, "topk_loss": 0.012 }, { "epoch": 0.46820640529613783, "grad_norm": 0.16015625, "learning_rate": 0.0001132789757729068, "lm_loss": 1.9963, "loss": 2.1341, "mask_loss": 0.127, "step": 1178, "topk_loss": 0.0109 }, { "epoch": 0.46860386404426696, "grad_norm": 0.140625, "learning_rate": 0.00011315264569502298, "lm_loss": 1.9405, "loss": 2.0809, "mask_loss": 0.1279, "step": 1179, "topk_loss": 0.0126 }, { "epoch": 0.4690013227923961, "grad_norm": 0.1357421875, "learning_rate": 0.00011302629425331666, "lm_loss": 1.9246, "loss": 2.0608, "mask_loss": 0.1264, "step": 1180, "topk_loss": 0.0098 }, { "epoch": 0.4693987815405253, "grad_norm": 0.138671875, "learning_rate": 0.00011289992165302035, "lm_loss": 1.9812, "loss": 2.1207, "mask_loss": 0.1285, "step": 1181, "topk_loss": 0.011 }, { "epoch": 0.4697962402886544, "grad_norm": 0.12353515625, "learning_rate": 0.00011277352809940081, "lm_loss": 1.921, "loss": 2.0608, "mask_loss": 0.1283, "step": 1182, "topk_loss": 0.0115 }, { "epoch": 0.47019369903678354, "grad_norm": 0.2734375, "learning_rate": 0.00011264711379775892, "lm_loss": 1.8834, "loss": 2.0426, "mask_loss": 0.1373, "step": 1183, "topk_loss": 0.0219 }, { "epoch": 0.47059115778491273, "grad_norm": 0.203125, "learning_rate": 0.00011252067895342923, "lm_loss": 1.9036, "loss": 2.0438, "mask_loss": 0.1287, "step": 1184, "topk_loss": 0.0114 }, { "epoch": 0.47098861653304186, "grad_norm": 0.13671875, "learning_rate": 0.00011239422377177973, "lm_loss": 1.8994, "loss": 2.0381, "mask_loss": 0.1285, "step": 1185, "topk_loss": 0.0101 }, { "epoch": 0.471386075281171, "grad_norm": 0.126953125, "learning_rate": 0.00011226774845821129, "lm_loss": 1.8618, "loss": 2.0016, "mask_loss": 0.129, "step": 1186, "topk_loss": 0.0108 }, { "epoch": 0.4717835340293002, "grad_norm": 0.1298828125, "learning_rate": 0.0001121412532181576, "lm_loss": 1.9478, "loss": 2.0871, "mask_loss": 0.1287, "step": 1187, "topk_loss": 0.0106 }, { "epoch": 0.4721809927774293, "grad_norm": 0.16015625, "learning_rate": 0.00011201473825708471, "lm_loss": 1.7894, "loss": 1.9291, "mask_loss": 0.1295, "step": 1188, "topk_loss": 0.0102 }, { "epoch": 0.47257845152555844, "grad_norm": 0.18359375, "learning_rate": 0.00011188820378049065, "lm_loss": 1.9324, "loss": 2.0723, "mask_loss": 0.1277, "step": 1189, "topk_loss": 0.0121 }, { "epoch": 0.4729759102736876, "grad_norm": 0.1357421875, "learning_rate": 0.00011176164999390522, "lm_loss": 1.991, "loss": 2.1278, "mask_loss": 0.1275, "step": 1190, "topk_loss": 0.0092 }, { "epoch": 0.47337336902181676, "grad_norm": 0.130859375, "learning_rate": 0.0001116350771028895, "lm_loss": 1.9111, "loss": 2.0493, "mask_loss": 0.1281, "step": 1191, "topk_loss": 0.0101 }, { "epoch": 0.4737708277699459, "grad_norm": 0.1396484375, "learning_rate": 0.00011150848531303567, "lm_loss": 1.8928, "loss": 2.0307, "mask_loss": 0.1284, "step": 1192, "topk_loss": 0.0096 }, { "epoch": 0.4741682865180751, "grad_norm": 0.125, "learning_rate": 0.00011138187482996658, "lm_loss": 1.9357, "loss": 2.0759, "mask_loss": 0.1292, "step": 1193, "topk_loss": 0.011 }, { "epoch": 0.4745657452662042, "grad_norm": 0.1416015625, "learning_rate": 0.00011125524585933542, "lm_loss": 1.9198, "loss": 2.0555, "mask_loss": 0.1259, "step": 1194, "topk_loss": 0.0098 }, { "epoch": 0.47496320401433334, "grad_norm": 0.173828125, "learning_rate": 0.00011112859860682547, "lm_loss": 1.8896, "loss": 2.0298, "mask_loss": 0.1293, "step": 1195, "topk_loss": 0.0109 }, { "epoch": 0.4753606627624625, "grad_norm": 0.1484375, "learning_rate": 0.00011100193327814964, "lm_loss": 1.9365, "loss": 2.0732, "mask_loss": 0.1269, "step": 1196, "topk_loss": 0.0098 }, { "epoch": 0.47575812151059166, "grad_norm": 0.1923828125, "learning_rate": 0.00011087525007905031, "lm_loss": 1.9849, "loss": 2.1198, "mask_loss": 0.1255, "step": 1197, "topk_loss": 0.0094 }, { "epoch": 0.4761555802587208, "grad_norm": 0.1611328125, "learning_rate": 0.00011074854921529869, "lm_loss": 1.8791, "loss": 2.0175, "mask_loss": 0.1283, "step": 1198, "topk_loss": 0.01 }, { "epoch": 0.47655303900685, "grad_norm": 0.1767578125, "learning_rate": 0.00011062183089269487, "lm_loss": 1.8634, "loss": 2.0048, "mask_loss": 0.1296, "step": 1199, "topk_loss": 0.0118 }, { "epoch": 0.4769504977549791, "grad_norm": 0.140625, "learning_rate": 0.0001104950953170672, "lm_loss": 1.9092, "loss": 2.0484, "mask_loss": 0.1275, "step": 1200, "topk_loss": 0.0117 }, { "epoch": 0.4769504977549791, "eval_lm_loss": 688.0737915039062, "eval_loss": 688.2135009765625, "eval_mask_hit_rate": 0.5299395322799683, "eval_mask_loss": 0.12668579816818237, "eval_mask_top_10_hit_rate": 0.9844464659690857, "eval_mask_top_1_hit_rate": 0.9971840381622314, "eval_mask_top_20_hit_rate": 0.9745547771453857, "eval_mask_top_5_hit_rate": 0.9900786876678467, "eval_runtime": 144.1965, "eval_samples_per_second": 14.203, "eval_steps_per_second": 7.101, "eval_token_accuracy": 0.611724853515625, "eval_top_k_diff": -523.8722534179688, "eval_topk_loss": 0.013040510006248951, "step": 1200 }, { "epoch": 0.47734795650310824, "grad_norm": 0.130859375, "learning_rate": 0.00011036834269427214, "lm_loss": 1.9617, "loss": 2.101, "mask_loss": 0.1289, "step": 1201, "topk_loss": 0.0104 }, { "epoch": 0.4777454152512374, "grad_norm": 0.138671875, "learning_rate": 0.00011024157323019373, "lm_loss": 1.9129, "loss": 2.0465, "mask_loss": 0.1241, "step": 1202, "topk_loss": 0.0095 }, { "epoch": 0.47814287399936656, "grad_norm": 0.150390625, "learning_rate": 0.00011011478713074343, "lm_loss": 1.9386, "loss": 2.0754, "mask_loss": 0.1265, "step": 1203, "topk_loss": 0.0103 }, { "epoch": 0.4785403327474957, "grad_norm": 0.138671875, "learning_rate": 0.00010998798460185971, "lm_loss": 1.9397, "loss": 2.0819, "mask_loss": 0.1297, "step": 1204, "topk_loss": 0.0126 }, { "epoch": 0.4789377914956249, "grad_norm": 0.1376953125, "learning_rate": 0.00010986116584950774, "lm_loss": 1.9978, "loss": 2.1365, "mask_loss": 0.1284, "step": 1205, "topk_loss": 0.0103 }, { "epoch": 0.479335250243754, "grad_norm": 0.1298828125, "learning_rate": 0.00010973433107967902, "lm_loss": 1.943, "loss": 2.0842, "mask_loss": 0.1298, "step": 1206, "topk_loss": 0.0113 }, { "epoch": 0.47973270899188314, "grad_norm": 0.1259765625, "learning_rate": 0.00010960748049839103, "lm_loss": 1.919, "loss": 2.0543, "mask_loss": 0.1259, "step": 1207, "topk_loss": 0.0094 }, { "epoch": 0.48013016774001227, "grad_norm": 0.12109375, "learning_rate": 0.00010948061431168701, "lm_loss": 1.9268, "loss": 2.0629, "mask_loss": 0.1261, "step": 1208, "topk_loss": 0.0099 }, { "epoch": 0.48052762648814146, "grad_norm": 0.2314453125, "learning_rate": 0.00010935373272563556, "lm_loss": 1.9396, "loss": 2.0815, "mask_loss": 0.1301, "step": 1209, "topk_loss": 0.0118 }, { "epoch": 0.4809250852362706, "grad_norm": 0.12451171875, "learning_rate": 0.00010922683594633021, "lm_loss": 1.8694, "loss": 2.006, "mask_loss": 0.1261, "step": 1210, "topk_loss": 0.0105 }, { "epoch": 0.4813225439843997, "grad_norm": 0.1572265625, "learning_rate": 0.00010909992417988919, "lm_loss": 1.8895, "loss": 2.0313, "mask_loss": 0.1302, "step": 1211, "topk_loss": 0.0116 }, { "epoch": 0.4817200027325289, "grad_norm": 0.130859375, "learning_rate": 0.00010897299763245512, "lm_loss": 1.9207, "loss": 2.0579, "mask_loss": 0.1274, "step": 1212, "topk_loss": 0.0097 }, { "epoch": 0.48211746148065804, "grad_norm": 0.123046875, "learning_rate": 0.00010884605651019459, "lm_loss": 1.9332, "loss": 2.0717, "mask_loss": 0.1286, "step": 1213, "topk_loss": 0.0099 }, { "epoch": 0.48251492022878717, "grad_norm": 0.1328125, "learning_rate": 0.00010871910101929785, "lm_loss": 1.9124, "loss": 2.0518, "mask_loss": 0.1289, "step": 1214, "topk_loss": 0.0106 }, { "epoch": 0.48291237897691636, "grad_norm": 0.1259765625, "learning_rate": 0.00010859213136597853, "lm_loss": 1.9437, "loss": 2.0799, "mask_loss": 0.1255, "step": 1215, "topk_loss": 0.0107 }, { "epoch": 0.4833098377250455, "grad_norm": 0.177734375, "learning_rate": 0.00010846514775647325, "lm_loss": 1.9137, "loss": 2.053, "mask_loss": 0.1276, "step": 1216, "topk_loss": 0.0117 }, { "epoch": 0.4837072964731746, "grad_norm": 0.1279296875, "learning_rate": 0.00010833815039704132, "lm_loss": 1.9353, "loss": 2.0706, "mask_loss": 0.1256, "step": 1217, "topk_loss": 0.0096 }, { "epoch": 0.4841047552213038, "grad_norm": 0.12451171875, "learning_rate": 0.00010821113949396428, "lm_loss": 1.9319, "loss": 2.0705, "mask_loss": 0.1275, "step": 1218, "topk_loss": 0.0111 }, { "epoch": 0.48450221396943294, "grad_norm": 0.1669921875, "learning_rate": 0.0001080841152535458, "lm_loss": 1.9325, "loss": 2.0684, "mask_loss": 0.1266, "step": 1219, "topk_loss": 0.0093 }, { "epoch": 0.48489967271756207, "grad_norm": 0.1640625, "learning_rate": 0.00010795707788211118, "lm_loss": 1.9437, "loss": 2.0846, "mask_loss": 0.1302, "step": 1220, "topk_loss": 0.0108 }, { "epoch": 0.48529713146569126, "grad_norm": 0.169921875, "learning_rate": 0.00010783002758600702, "lm_loss": 1.9576, "loss": 2.1026, "mask_loss": 0.1297, "step": 1221, "topk_loss": 0.0153 }, { "epoch": 0.4856945902138204, "grad_norm": 0.1171875, "learning_rate": 0.00010770296457160088, "lm_loss": 1.9115, "loss": 2.0482, "mask_loss": 0.1265, "step": 1222, "topk_loss": 0.0101 }, { "epoch": 0.4860920489619495, "grad_norm": 0.16796875, "learning_rate": 0.00010757588904528106, "lm_loss": 1.931, "loss": 2.0681, "mask_loss": 0.1263, "step": 1223, "topk_loss": 0.0107 }, { "epoch": 0.4864895077100787, "grad_norm": 0.140625, "learning_rate": 0.00010744880121345613, "lm_loss": 1.9292, "loss": 2.064, "mask_loss": 0.1255, "step": 1224, "topk_loss": 0.0093 }, { "epoch": 0.48688696645820784, "grad_norm": 0.1357421875, "learning_rate": 0.00010732170128255468, "lm_loss": 1.9025, "loss": 2.0443, "mask_loss": 0.1313, "step": 1225, "topk_loss": 0.0105 }, { "epoch": 0.48728442520633697, "grad_norm": 0.12451171875, "learning_rate": 0.00010719458945902492, "lm_loss": 1.9129, "loss": 2.0516, "mask_loss": 0.1286, "step": 1226, "topk_loss": 0.0101 }, { "epoch": 0.48768188395446616, "grad_norm": 0.1279296875, "learning_rate": 0.00010706746594933438, "lm_loss": 1.9273, "loss": 2.0636, "mask_loss": 0.1254, "step": 1227, "topk_loss": 0.0108 }, { "epoch": 0.4880793427025953, "grad_norm": 0.1552734375, "learning_rate": 0.00010694033095996962, "lm_loss": 1.946, "loss": 2.0808, "mask_loss": 0.1253, "step": 1228, "topk_loss": 0.0095 }, { "epoch": 0.4884768014507244, "grad_norm": 0.126953125, "learning_rate": 0.00010681318469743582, "lm_loss": 1.9932, "loss": 2.13, "mask_loss": 0.1268, "step": 1229, "topk_loss": 0.01 }, { "epoch": 0.4888742601988536, "grad_norm": 0.142578125, "learning_rate": 0.00010668602736825641, "lm_loss": 1.9252, "loss": 2.0662, "mask_loss": 0.1287, "step": 1230, "topk_loss": 0.0123 }, { "epoch": 0.48927171894698274, "grad_norm": 0.126953125, "learning_rate": 0.00010655885917897286, "lm_loss": 1.9524, "loss": 2.0886, "mask_loss": 0.1261, "step": 1231, "topk_loss": 0.0101 }, { "epoch": 0.48966917769511187, "grad_norm": 0.134765625, "learning_rate": 0.0001064316803361443, "lm_loss": 1.935, "loss": 2.0723, "mask_loss": 0.1281, "step": 1232, "topk_loss": 0.0092 }, { "epoch": 0.49006663644324105, "grad_norm": 0.1875, "learning_rate": 0.00010630449104634712, "lm_loss": 1.9682, "loss": 2.1198, "mask_loss": 0.1327, "step": 1233, "topk_loss": 0.0189 }, { "epoch": 0.4904640951913702, "grad_norm": 0.1943359375, "learning_rate": 0.00010617729151617465, "lm_loss": 1.9365, "loss": 2.0733, "mask_loss": 0.1257, "step": 1234, "topk_loss": 0.0111 }, { "epoch": 0.4908615539394993, "grad_norm": 0.15625, "learning_rate": 0.00010605008195223694, "lm_loss": 1.9279, "loss": 2.065, "mask_loss": 0.1257, "step": 1235, "topk_loss": 0.0114 }, { "epoch": 0.49125901268762845, "grad_norm": 0.115234375, "learning_rate": 0.00010592286256116027, "lm_loss": 1.9797, "loss": 2.1129, "mask_loss": 0.1242, "step": 1236, "topk_loss": 0.0089 }, { "epoch": 0.49165647143575764, "grad_norm": 0.177734375, "learning_rate": 0.00010579563354958692, "lm_loss": 1.9539, "loss": 2.0972, "mask_loss": 0.1304, "step": 1237, "topk_loss": 0.0129 }, { "epoch": 0.49205393018388677, "grad_norm": 0.1298828125, "learning_rate": 0.00010566839512417479, "lm_loss": 1.8372, "loss": 1.975, "mask_loss": 0.1287, "step": 1238, "topk_loss": 0.0091 }, { "epoch": 0.4924513889320159, "grad_norm": 0.12890625, "learning_rate": 0.000105541147491597, "lm_loss": 1.9374, "loss": 2.0735, "mask_loss": 0.1268, "step": 1239, "topk_loss": 0.0093 }, { "epoch": 0.4928488476801451, "grad_norm": 0.12255859375, "learning_rate": 0.00010541389085854176, "lm_loss": 1.897, "loss": 2.0366, "mask_loss": 0.1284, "step": 1240, "topk_loss": 0.0113 }, { "epoch": 0.4932463064282742, "grad_norm": 0.20703125, "learning_rate": 0.0001052866254317118, "lm_loss": 1.9831, "loss": 2.1197, "mask_loss": 0.1252, "step": 1241, "topk_loss": 0.0113 }, { "epoch": 0.49364376517640335, "grad_norm": 0.1552734375, "learning_rate": 0.00010515935141782414, "lm_loss": 1.8972, "loss": 2.0329, "mask_loss": 0.1258, "step": 1242, "topk_loss": 0.0099 }, { "epoch": 0.49404122392453254, "grad_norm": 0.12353515625, "learning_rate": 0.0001050320690236098, "lm_loss": 1.8784, "loss": 2.012, "mask_loss": 0.1247, "step": 1243, "topk_loss": 0.0089 }, { "epoch": 0.49443868267266167, "grad_norm": 0.1611328125, "learning_rate": 0.00010490477845581337, "lm_loss": 1.9397, "loss": 2.0775, "mask_loss": 0.127, "step": 1244, "topk_loss": 0.0108 }, { "epoch": 0.4948361414207908, "grad_norm": 0.185546875, "learning_rate": 0.00010477747992119273, "lm_loss": 1.8701, "loss": 2.0037, "mask_loss": 0.1244, "step": 1245, "topk_loss": 0.0092 }, { "epoch": 0.49523360016892, "grad_norm": 0.177734375, "learning_rate": 0.00010465017362651868, "lm_loss": 1.9222, "loss": 2.0605, "mask_loss": 0.1279, "step": 1246, "topk_loss": 0.0103 }, { "epoch": 0.4956310589170491, "grad_norm": 0.1826171875, "learning_rate": 0.00010452285977857463, "lm_loss": 1.9269, "loss": 2.0687, "mask_loss": 0.1297, "step": 1247, "topk_loss": 0.0122 }, { "epoch": 0.49602851766517825, "grad_norm": 0.1240234375, "learning_rate": 0.0001043955385841563, "lm_loss": 1.9156, "loss": 2.0507, "mask_loss": 0.1259, "step": 1248, "topk_loss": 0.0092 }, { "epoch": 0.49642597641330743, "grad_norm": 0.1748046875, "learning_rate": 0.00010426821025007134, "lm_loss": 1.962, "loss": 2.0986, "mask_loss": 0.126, "step": 1249, "topk_loss": 0.0106 }, { "epoch": 0.49682343516143657, "grad_norm": 0.197265625, "learning_rate": 0.0001041408749831389, "lm_loss": 1.927, "loss": 2.0636, "mask_loss": 0.1273, "step": 1250, "topk_loss": 0.0093 }, { "epoch": 0.49682343516143657, "eval_lm_loss": 689.285888671875, "eval_loss": 689.4251708984375, "eval_mask_hit_rate": 0.530980110168457, "eval_mask_loss": 0.126395121216774, "eval_mask_top_10_hit_rate": 0.9846255779266357, "eval_mask_top_1_hit_rate": 0.997197151184082, "eval_mask_top_20_hit_rate": 0.9748088717460632, "eval_mask_top_5_hit_rate": 0.9901968240737915, "eval_runtime": 144.3204, "eval_samples_per_second": 14.191, "eval_steps_per_second": 7.095, "eval_token_accuracy": 0.6121336221694946, "eval_top_k_diff": -530.7422485351562, "eval_topk_loss": 0.012827267870306969, "step": 1250 }, { "epoch": 0.4972208939095657, "grad_norm": 0.16796875, "learning_rate": 0.0001040135329901895, "lm_loss": 1.984, "loss": 2.1308, "mask_loss": 0.131, "step": 1251, "topk_loss": 0.0158 }, { "epoch": 0.4976183526576949, "grad_norm": 0.1220703125, "learning_rate": 0.00010388618447806455, "lm_loss": 1.903, "loss": 2.0404, "mask_loss": 0.1279, "step": 1252, "topk_loss": 0.0095 }, { "epoch": 0.498015811405824, "grad_norm": 0.1162109375, "learning_rate": 0.00010375882965361605, "lm_loss": 1.913, "loss": 2.0495, "mask_loss": 0.1271, "step": 1253, "topk_loss": 0.0094 }, { "epoch": 0.49841327015395315, "grad_norm": 0.134765625, "learning_rate": 0.00010363146872370622, "lm_loss": 1.9371, "loss": 2.076, "mask_loss": 0.1283, "step": 1254, "topk_loss": 0.0106 }, { "epoch": 0.49881072890208233, "grad_norm": 0.134765625, "learning_rate": 0.00010350410189520723, "lm_loss": 1.9335, "loss": 2.072, "mask_loss": 0.1289, "step": 1255, "topk_loss": 0.0096 }, { "epoch": 0.49920818765021147, "grad_norm": 0.1220703125, "learning_rate": 0.00010337672937500085, "lm_loss": 1.9149, "loss": 2.0508, "mask_loss": 0.1261, "step": 1256, "topk_loss": 0.0099 }, { "epoch": 0.4996056463983406, "grad_norm": 0.13671875, "learning_rate": 0.00010324935136997806, "lm_loss": 1.9897, "loss": 2.1257, "mask_loss": 0.126, "step": 1257, "topk_loss": 0.0101 }, { "epoch": 0.5000031051464697, "grad_norm": 0.1220703125, "learning_rate": 0.00010312196808703876, "lm_loss": 1.9753, "loss": 2.1177, "mask_loss": 0.13, "step": 1258, "topk_loss": 0.0124 }, { "epoch": 0.5004005638945989, "grad_norm": 0.12451171875, "learning_rate": 0.00010299457973309142, "lm_loss": 1.9811, "loss": 2.1195, "mask_loss": 0.128, "step": 1259, "topk_loss": 0.0104 }, { "epoch": 0.5007980226427281, "grad_norm": 0.1259765625, "learning_rate": 0.00010286718651505275, "lm_loss": 1.9495, "loss": 2.0852, "mask_loss": 0.1253, "step": 1260, "topk_loss": 0.0103 }, { "epoch": 0.5011954813908572, "grad_norm": 0.1259765625, "learning_rate": 0.00010273978863984742, "lm_loss": 1.9871, "loss": 2.1249, "mask_loss": 0.1265, "step": 1261, "topk_loss": 0.0113 }, { "epoch": 0.5015929401389864, "grad_norm": 0.1318359375, "learning_rate": 0.00010261238631440748, "lm_loss": 1.8924, "loss": 2.0294, "mask_loss": 0.1266, "step": 1262, "topk_loss": 0.0104 }, { "epoch": 0.5019903988871155, "grad_norm": 0.11962890625, "learning_rate": 0.00010248497974567244, "lm_loss": 1.885, "loss": 2.0211, "mask_loss": 0.1253, "step": 1263, "topk_loss": 0.0107 }, { "epoch": 0.5023878576352446, "grad_norm": 0.1455078125, "learning_rate": 0.00010235756914058856, "lm_loss": 1.9563, "loss": 2.0923, "mask_loss": 0.126, "step": 1264, "topk_loss": 0.01 }, { "epoch": 0.5027853163833738, "grad_norm": 0.12109375, "learning_rate": 0.00010223015470610871, "lm_loss": 1.8781, "loss": 2.0166, "mask_loss": 0.1285, "step": 1265, "topk_loss": 0.01 }, { "epoch": 0.503182775131503, "grad_norm": 0.1220703125, "learning_rate": 0.00010210273664919191, "lm_loss": 1.893, "loss": 2.03, "mask_loss": 0.1268, "step": 1266, "topk_loss": 0.0101 }, { "epoch": 0.5035802338796321, "grad_norm": 0.1640625, "learning_rate": 0.00010197531517680319, "lm_loss": 1.9048, "loss": 2.0391, "mask_loss": 0.1253, "step": 1267, "topk_loss": 0.009 }, { "epoch": 0.5039776926277613, "grad_norm": 0.15234375, "learning_rate": 0.00010184789049591299, "lm_loss": 1.9125, "loss": 2.0491, "mask_loss": 0.1248, "step": 1268, "topk_loss": 0.0118 }, { "epoch": 0.5043751513758904, "grad_norm": 0.12451171875, "learning_rate": 0.0001017204628134971, "lm_loss": 1.9724, "loss": 2.1081, "mask_loss": 0.1257, "step": 1269, "topk_loss": 0.01 }, { "epoch": 0.5047726101240195, "grad_norm": 0.1533203125, "learning_rate": 0.00010159303233653604, "lm_loss": 1.9138, "loss": 2.0603, "mask_loss": 0.1319, "step": 1270, "topk_loss": 0.0147 }, { "epoch": 0.5051700688721487, "grad_norm": 0.1240234375, "learning_rate": 0.00010146559927201495, "lm_loss": 1.9232, "loss": 2.0605, "mask_loss": 0.1272, "step": 1271, "topk_loss": 0.0101 }, { "epoch": 0.5055675276202779, "grad_norm": 0.12060546875, "learning_rate": 0.0001013381638269232, "lm_loss": 1.9513, "loss": 2.0924, "mask_loss": 0.1277, "step": 1272, "topk_loss": 0.0134 }, { "epoch": 0.505964986368407, "grad_norm": 0.1259765625, "learning_rate": 0.00010121072620825397, "lm_loss": 1.9005, "loss": 2.0375, "mask_loss": 0.1265, "step": 1273, "topk_loss": 0.0104 }, { "epoch": 0.5063624451165362, "grad_norm": 0.1416015625, "learning_rate": 0.000101083286623004, "lm_loss": 1.9435, "loss": 2.0812, "mask_loss": 0.129, "step": 1274, "topk_loss": 0.0087 }, { "epoch": 0.5067599038646653, "grad_norm": 0.12109375, "learning_rate": 0.00010095584527817319, "lm_loss": 1.8898, "loss": 2.0306, "mask_loss": 0.1292, "step": 1275, "topk_loss": 0.0116 }, { "epoch": 0.5071573626127944, "grad_norm": 0.130859375, "learning_rate": 0.00010082840238076436, "lm_loss": 1.8348, "loss": 1.971, "mask_loss": 0.1276, "step": 1276, "topk_loss": 0.0087 }, { "epoch": 0.5075548213609236, "grad_norm": 0.173828125, "learning_rate": 0.00010070095813778281, "lm_loss": 1.9219, "loss": 2.0633, "mask_loss": 0.1317, "step": 1277, "topk_loss": 0.0098 }, { "epoch": 0.5079522801090527, "grad_norm": 0.1298828125, "learning_rate": 0.000100573512756236, "lm_loss": 2.0069, "loss": 2.1456, "mask_loss": 0.1267, "step": 1278, "topk_loss": 0.012 }, { "epoch": 0.5083497388571819, "grad_norm": 0.11962890625, "learning_rate": 0.0001004460664431333, "lm_loss": 1.8486, "loss": 1.9884, "mask_loss": 0.1294, "step": 1279, "topk_loss": 0.0104 }, { "epoch": 0.5087471976053111, "grad_norm": 0.12255859375, "learning_rate": 0.00010031861940548555, "lm_loss": 1.9759, "loss": 2.1135, "mask_loss": 0.1276, "step": 1280, "topk_loss": 0.0099 }, { "epoch": 0.5091446563534402, "grad_norm": 0.12451171875, "learning_rate": 0.00010019117185030478, "lm_loss": 1.9389, "loss": 2.0731, "mask_loss": 0.1249, "step": 1281, "topk_loss": 0.0092 }, { "epoch": 0.5095421151015693, "grad_norm": 0.11962890625, "learning_rate": 0.00010006372398460387, "lm_loss": 1.9281, "loss": 2.0706, "mask_loss": 0.1299, "step": 1282, "topk_loss": 0.0126 }, { "epoch": 0.5099395738496985, "grad_norm": 0.1220703125, "learning_rate": 9.993627601539617e-05, "lm_loss": 1.9432, "loss": 2.0786, "mask_loss": 0.1259, "step": 1283, "topk_loss": 0.0096 }, { "epoch": 0.5103370325978276, "grad_norm": 0.1328125, "learning_rate": 9.980882814969524e-05, "lm_loss": 1.9283, "loss": 2.0659, "mask_loss": 0.1277, "step": 1284, "topk_loss": 0.0099 }, { "epoch": 0.5107344913459568, "grad_norm": 0.1162109375, "learning_rate": 9.968138059451446e-05, "lm_loss": 1.9759, "loss": 2.1111, "mask_loss": 0.1254, "step": 1285, "topk_loss": 0.0098 }, { "epoch": 0.511131950094086, "grad_norm": 0.11279296875, "learning_rate": 9.955393355686671e-05, "lm_loss": 1.8777, "loss": 2.0146, "mask_loss": 0.126, "step": 1286, "topk_loss": 0.0109 }, { "epoch": 0.5115294088422151, "grad_norm": 0.1591796875, "learning_rate": 9.942648724376403e-05, "lm_loss": 1.9045, "loss": 2.0417, "mask_loss": 0.1267, "step": 1287, "topk_loss": 0.0105 }, { "epoch": 0.5119268675903442, "grad_norm": 0.119140625, "learning_rate": 9.929904186221722e-05, "lm_loss": 1.9832, "loss": 2.1178, "mask_loss": 0.1253, "step": 1288, "topk_loss": 0.0093 }, { "epoch": 0.5123243263384734, "grad_norm": 0.125, "learning_rate": 9.917159761923566e-05, "lm_loss": 2.0023, "loss": 2.139, "mask_loss": 0.126, "step": 1289, "topk_loss": 0.0107 }, { "epoch": 0.5127217850866025, "grad_norm": 0.12451171875, "learning_rate": 9.904415472182682e-05, "lm_loss": 1.9215, "loss": 2.0587, "mask_loss": 0.1279, "step": 1290, "topk_loss": 0.0092 }, { "epoch": 0.5131192438347317, "grad_norm": 0.1220703125, "learning_rate": 9.891671337699602e-05, "lm_loss": 1.8964, "loss": 2.0349, "mask_loss": 0.1294, "step": 1291, "topk_loss": 0.0091 }, { "epoch": 0.5135167025828609, "grad_norm": 0.12451171875, "learning_rate": 9.878927379174605e-05, "lm_loss": 1.9059, "loss": 2.0463, "mask_loss": 0.1285, "step": 1292, "topk_loss": 0.0118 }, { "epoch": 0.51391416133099, "grad_norm": 0.1572265625, "learning_rate": 9.866183617307682e-05, "lm_loss": 1.9395, "loss": 2.0755, "mask_loss": 0.1266, "step": 1293, "topk_loss": 0.0095 }, { "epoch": 0.5143116200791191, "grad_norm": 0.140625, "learning_rate": 9.853440072798507e-05, "lm_loss": 1.9447, "loss": 2.0812, "mask_loss": 0.1273, "step": 1294, "topk_loss": 0.0092 }, { "epoch": 0.5147090788272483, "grad_norm": 0.1279296875, "learning_rate": 9.840696766346401e-05, "lm_loss": 1.9092, "loss": 2.0484, "mask_loss": 0.1283, "step": 1295, "topk_loss": 0.0109 }, { "epoch": 0.5151065375753774, "grad_norm": 0.1259765625, "learning_rate": 9.827953718650295e-05, "lm_loss": 1.8731, "loss": 2.0116, "mask_loss": 0.1287, "step": 1296, "topk_loss": 0.0098 }, { "epoch": 0.5155039963235066, "grad_norm": 0.123046875, "learning_rate": 9.815210950408704e-05, "lm_loss": 1.9524, "loss": 2.0892, "mask_loss": 0.1272, "step": 1297, "topk_loss": 0.0096 }, { "epoch": 0.5159014550716358, "grad_norm": 0.125, "learning_rate": 9.802468482319683e-05, "lm_loss": 1.8773, "loss": 2.0144, "mask_loss": 0.127, "step": 1298, "topk_loss": 0.0102 }, { "epoch": 0.5162989138197649, "grad_norm": 0.150390625, "learning_rate": 9.78972633508081e-05, "lm_loss": 1.9349, "loss": 2.0831, "mask_loss": 0.1314, "step": 1299, "topk_loss": 0.0167 }, { "epoch": 0.516696372567894, "grad_norm": 0.15234375, "learning_rate": 9.776984529389131e-05, "lm_loss": 1.9471, "loss": 2.0846, "mask_loss": 0.1275, "step": 1300, "topk_loss": 0.01 }, { "epoch": 0.516696372567894, "eval_lm_loss": 688.7545166015625, "eval_loss": 688.8933715820312, "eval_mask_hit_rate": 0.5318499803543091, "eval_mask_loss": 0.12599800527095795, "eval_mask_top_10_hit_rate": 0.98480224609375, "eval_mask_top_1_hit_rate": 0.9972474575042725, "eval_mask_top_20_hit_rate": 0.9750704765319824, "eval_mask_top_5_hit_rate": 0.9903174638748169, "eval_runtime": 144.2764, "eval_samples_per_second": 14.195, "eval_steps_per_second": 7.097, "eval_token_accuracy": 0.6126667857170105, "eval_top_k_diff": -528.0156860351562, "eval_topk_loss": 0.012844683602452278, "step": 1300 }, { "epoch": 0.5170938313160232, "grad_norm": 0.1171875, "learning_rate": 9.764243085941145e-05, "lm_loss": 1.8988, "loss": 2.0387, "mask_loss": 0.1302, "step": 1301, "topk_loss": 0.0097 }, { "epoch": 0.5174912900641523, "grad_norm": 0.11669921875, "learning_rate": 9.751502025432756e-05, "lm_loss": 1.916, "loss": 2.0535, "mask_loss": 0.1283, "step": 1302, "topk_loss": 0.0091 }, { "epoch": 0.5178887488122815, "grad_norm": 0.11279296875, "learning_rate": 9.738761368559256e-05, "lm_loss": 1.859, "loss": 1.9954, "mask_loss": 0.127, "step": 1303, "topk_loss": 0.0095 }, { "epoch": 0.5182862075604107, "grad_norm": 0.11865234375, "learning_rate": 9.726021136015265e-05, "lm_loss": 1.9261, "loss": 2.0612, "mask_loss": 0.1262, "step": 1304, "topk_loss": 0.0088 }, { "epoch": 0.5186836663085398, "grad_norm": 0.15234375, "learning_rate": 9.713281348494726e-05, "lm_loss": 1.938, "loss": 2.0769, "mask_loss": 0.1278, "step": 1305, "topk_loss": 0.0111 }, { "epoch": 0.5190811250566689, "grad_norm": 0.115234375, "learning_rate": 9.700542026690859e-05, "lm_loss": 1.894, "loss": 2.0337, "mask_loss": 0.1293, "step": 1306, "topk_loss": 0.0104 }, { "epoch": 0.519478583804798, "grad_norm": 0.11669921875, "learning_rate": 9.687803191296126e-05, "lm_loss": 1.8523, "loss": 1.9875, "mask_loss": 0.1264, "step": 1307, "topk_loss": 0.0088 }, { "epoch": 0.5198760425529272, "grad_norm": 0.1875, "learning_rate": 9.675064863002196e-05, "lm_loss": 1.9044, "loss": 2.0509, "mask_loss": 0.1314, "step": 1308, "topk_loss": 0.0151 }, { "epoch": 0.5202735013010563, "grad_norm": 0.12158203125, "learning_rate": 9.662327062499918e-05, "lm_loss": 1.9658, "loss": 2.1023, "mask_loss": 0.1264, "step": 1309, "topk_loss": 0.0101 }, { "epoch": 0.5206709600491856, "grad_norm": 0.1201171875, "learning_rate": 9.64958981047928e-05, "lm_loss": 1.9269, "loss": 2.0627, "mask_loss": 0.1263, "step": 1310, "topk_loss": 0.0095 }, { "epoch": 0.5210684187973147, "grad_norm": 0.134765625, "learning_rate": 9.636853127629383e-05, "lm_loss": 1.9253, "loss": 2.0637, "mask_loss": 0.1285, "step": 1311, "topk_loss": 0.01 }, { "epoch": 0.5214658775454438, "grad_norm": 0.12890625, "learning_rate": 9.6241170346384e-05, "lm_loss": 1.9359, "loss": 2.0784, "mask_loss": 0.13, "step": 1312, "topk_loss": 0.0124 }, { "epoch": 0.521863336293573, "grad_norm": 0.11865234375, "learning_rate": 9.611381552193548e-05, "lm_loss": 1.8707, "loss": 2.0083, "mask_loss": 0.1269, "step": 1313, "topk_loss": 0.0107 }, { "epoch": 0.5222607950417021, "grad_norm": 0.1552734375, "learning_rate": 9.598646700981051e-05, "lm_loss": 1.9509, "loss": 2.1019, "mask_loss": 0.1345, "step": 1314, "topk_loss": 0.0164 }, { "epoch": 0.5226582537898312, "grad_norm": 0.138671875, "learning_rate": 9.585912501686111e-05, "lm_loss": 1.8712, "loss": 2.0107, "mask_loss": 0.1292, "step": 1315, "topk_loss": 0.0104 }, { "epoch": 0.5230557125379605, "grad_norm": 0.1552734375, "learning_rate": 9.57317897499287e-05, "lm_loss": 1.9163, "loss": 2.0533, "mask_loss": 0.1266, "step": 1316, "topk_loss": 0.0104 }, { "epoch": 0.5234531712860896, "grad_norm": 0.1162109375, "learning_rate": 9.56044614158437e-05, "lm_loss": 1.9554, "loss": 2.0914, "mask_loss": 0.1266, "step": 1317, "topk_loss": 0.0094 }, { "epoch": 0.5238506300342187, "grad_norm": 0.1220703125, "learning_rate": 9.547714022142537e-05, "lm_loss": 1.9215, "loss": 2.0598, "mask_loss": 0.1283, "step": 1318, "topk_loss": 0.0101 }, { "epoch": 0.5242480887823479, "grad_norm": 0.12451171875, "learning_rate": 9.534982637348137e-05, "lm_loss": 1.8686, "loss": 2.0076, "mask_loss": 0.1275, "step": 1319, "topk_loss": 0.0115 }, { "epoch": 0.524645547530477, "grad_norm": 0.1201171875, "learning_rate": 9.522252007880732e-05, "lm_loss": 1.8746, "loss": 2.0121, "mask_loss": 0.1273, "step": 1320, "topk_loss": 0.0101 }, { "epoch": 0.5250430062786061, "grad_norm": 0.1328125, "learning_rate": 9.509522154418667e-05, "lm_loss": 1.9392, "loss": 2.0784, "mask_loss": 0.1276, "step": 1321, "topk_loss": 0.0116 }, { "epoch": 0.5254404650267354, "grad_norm": 0.12255859375, "learning_rate": 9.496793097639022e-05, "lm_loss": 1.9044, "loss": 2.0384, "mask_loss": 0.1251, "step": 1322, "topk_loss": 0.0089 }, { "epoch": 0.5258379237748645, "grad_norm": 0.12353515625, "learning_rate": 9.484064858217587e-05, "lm_loss": 1.8868, "loss": 2.0242, "mask_loss": 0.1276, "step": 1323, "topk_loss": 0.0099 }, { "epoch": 0.5262353825229936, "grad_norm": 0.2001953125, "learning_rate": 9.471337456828822e-05, "lm_loss": 1.8827, "loss": 2.0208, "mask_loss": 0.1269, "step": 1324, "topk_loss": 0.0112 }, { "epoch": 0.5266328412711228, "grad_norm": 0.12060546875, "learning_rate": 9.458610914145826e-05, "lm_loss": 1.8432, "loss": 1.9847, "mask_loss": 0.1299, "step": 1325, "topk_loss": 0.0115 }, { "epoch": 0.5270303000192519, "grad_norm": 0.11865234375, "learning_rate": 9.4458852508403e-05, "lm_loss": 1.9125, "loss": 2.0457, "mask_loss": 0.1245, "step": 1326, "topk_loss": 0.0087 }, { "epoch": 0.527427758767381, "grad_norm": 0.1552734375, "learning_rate": 9.433160487582526e-05, "lm_loss": 1.8939, "loss": 2.0335, "mask_loss": 0.1291, "step": 1327, "topk_loss": 0.0105 }, { "epoch": 0.5278252175155103, "grad_norm": 0.1474609375, "learning_rate": 9.420436645041311e-05, "lm_loss": 1.9032, "loss": 2.0403, "mask_loss": 0.1259, "step": 1328, "topk_loss": 0.0111 }, { "epoch": 0.5282226762636394, "grad_norm": 0.12890625, "learning_rate": 9.407713743883976e-05, "lm_loss": 1.9396, "loss": 2.0754, "mask_loss": 0.1262, "step": 1329, "topk_loss": 0.0096 }, { "epoch": 0.5286201350117685, "grad_norm": 0.12109375, "learning_rate": 9.394991804776309e-05, "lm_loss": 1.9457, "loss": 2.0857, "mask_loss": 0.1294, "step": 1330, "topk_loss": 0.0106 }, { "epoch": 0.5290175937598977, "grad_norm": 0.1689453125, "learning_rate": 9.382270848382537e-05, "lm_loss": 1.9432, "loss": 2.0787, "mask_loss": 0.1254, "step": 1331, "topk_loss": 0.0102 }, { "epoch": 0.5294150525080268, "grad_norm": 0.1904296875, "learning_rate": 9.369550895365291e-05, "lm_loss": 1.8668, "loss": 2.004, "mask_loss": 0.1281, "step": 1332, "topk_loss": 0.0091 }, { "epoch": 0.5298125112561559, "grad_norm": 0.181640625, "learning_rate": 9.356831966385571e-05, "lm_loss": 1.9468, "loss": 2.0846, "mask_loss": 0.1272, "step": 1333, "topk_loss": 0.0106 }, { "epoch": 0.530209970004285, "grad_norm": 0.1337890625, "learning_rate": 9.344114082102712e-05, "lm_loss": 1.9696, "loss": 2.1073, "mask_loss": 0.1265, "step": 1334, "topk_loss": 0.0112 }, { "epoch": 0.5306074287524143, "grad_norm": 0.21484375, "learning_rate": 9.331397263174364e-05, "lm_loss": 1.8924, "loss": 2.0298, "mask_loss": 0.1271, "step": 1335, "topk_loss": 0.0103 }, { "epoch": 0.5310048875005434, "grad_norm": 0.27734375, "learning_rate": 9.318681530256423e-05, "lm_loss": 1.9154, "loss": 2.0501, "mask_loss": 0.125, "step": 1336, "topk_loss": 0.0098 }, { "epoch": 0.5314023462486726, "grad_norm": 0.1337890625, "learning_rate": 9.30596690400304e-05, "lm_loss": 1.9739, "loss": 2.1086, "mask_loss": 0.1259, "step": 1337, "topk_loss": 0.0089 }, { "epoch": 0.5317998049968017, "grad_norm": 0.12451171875, "learning_rate": 9.293253405066563e-05, "lm_loss": 1.9695, "loss": 2.1053, "mask_loss": 0.127, "step": 1338, "topk_loss": 0.0088 }, { "epoch": 0.5321972637449308, "grad_norm": 0.1826171875, "learning_rate": 9.28054105409751e-05, "lm_loss": 1.9149, "loss": 2.0536, "mask_loss": 0.1274, "step": 1339, "topk_loss": 0.0113 }, { "epoch": 0.53259472249306, "grad_norm": 0.154296875, "learning_rate": 9.267829871744536e-05, "lm_loss": 1.8955, "loss": 2.0342, "mask_loss": 0.1282, "step": 1340, "topk_loss": 0.0104 }, { "epoch": 0.5329921812411892, "grad_norm": 0.1787109375, "learning_rate": 9.25511987865439e-05, "lm_loss": 1.9207, "loss": 2.056, "mask_loss": 0.1264, "step": 1341, "topk_loss": 0.0089 }, { "epoch": 0.5333896399893183, "grad_norm": 0.1259765625, "learning_rate": 9.242411095471897e-05, "lm_loss": 1.9638, "loss": 2.1008, "mask_loss": 0.1277, "step": 1342, "topk_loss": 0.0093 }, { "epoch": 0.5337870987374475, "grad_norm": 0.1220703125, "learning_rate": 9.229703542839917e-05, "lm_loss": 1.9143, "loss": 2.0539, "mask_loss": 0.1284, "step": 1343, "topk_loss": 0.0112 }, { "epoch": 0.5341845574855766, "grad_norm": 0.166015625, "learning_rate": 9.216997241399303e-05, "lm_loss": 1.8686, "loss": 2.0033, "mask_loss": 0.125, "step": 1344, "topk_loss": 0.0097 }, { "epoch": 0.5345820162337057, "grad_norm": 0.1376953125, "learning_rate": 9.204292211788884e-05, "lm_loss": 1.8998, "loss": 2.0334, "mask_loss": 0.1255, "step": 1345, "topk_loss": 0.0081 }, { "epoch": 0.5349794749818348, "grad_norm": 0.12890625, "learning_rate": 9.19158847464542e-05, "lm_loss": 1.8785, "loss": 2.0162, "mask_loss": 0.1277, "step": 1346, "topk_loss": 0.0099 }, { "epoch": 0.5353769337299641, "grad_norm": 0.150390625, "learning_rate": 9.178886050603574e-05, "lm_loss": 1.9795, "loss": 2.1147, "mask_loss": 0.1253, "step": 1347, "topk_loss": 0.0099 }, { "epoch": 0.5357743924780932, "grad_norm": 0.12158203125, "learning_rate": 9.166184960295872e-05, "lm_loss": 1.9062, "loss": 2.0437, "mask_loss": 0.1265, "step": 1348, "topk_loss": 0.011 }, { "epoch": 0.5361718512262224, "grad_norm": 0.1435546875, "learning_rate": 9.153485224352675e-05, "lm_loss": 1.8742, "loss": 2.0098, "mask_loss": 0.1269, "step": 1349, "topk_loss": 0.0087 }, { "epoch": 0.5365693099743515, "grad_norm": 0.12255859375, "learning_rate": 9.140786863402147e-05, "lm_loss": 1.8814, "loss": 2.017, "mask_loss": 0.126, "step": 1350, "topk_loss": 0.0096 }, { "epoch": 0.5365693099743515, "eval_lm_loss": 688.3827514648438, "eval_loss": 688.5211791992188, "eval_mask_hit_rate": 0.5326009392738342, "eval_mask_loss": 0.12567125260829926, "eval_mask_top_10_hit_rate": 0.9849717020988464, "eval_mask_top_1_hit_rate": 0.9972808361053467, "eval_mask_top_20_hit_rate": 0.9753085374832153, "eval_mask_top_5_hit_rate": 0.990444004535675, "eval_runtime": 143.7784, "eval_samples_per_second": 14.244, "eval_steps_per_second": 7.122, "eval_token_accuracy": 0.6129680275917053, "eval_top_k_diff": -529.1483764648438, "eval_topk_loss": 0.012769084423780441, "step": 1350 }, { "epoch": 0.5369667687224806, "grad_norm": 0.1943359375, "learning_rate": 9.12808989807022e-05, "lm_loss": 1.9485, "loss": 2.0909, "mask_loss": 0.1276, "step": 1351, "topk_loss": 0.0148 }, { "epoch": 0.5373642274706097, "grad_norm": 0.11083984375, "learning_rate": 9.115394348980546e-05, "lm_loss": 1.8064, "loss": 1.9452, "mask_loss": 0.1287, "step": 1352, "topk_loss": 0.0101 }, { "epoch": 0.537761686218739, "grad_norm": 0.11865234375, "learning_rate": 9.102700236754492e-05, "lm_loss": 1.9177, "loss": 2.0579, "mask_loss": 0.1282, "step": 1353, "topk_loss": 0.0119 }, { "epoch": 0.5381591449668681, "grad_norm": 0.1484375, "learning_rate": 9.090007582011082e-05, "lm_loss": 1.8761, "loss": 2.0117, "mask_loss": 0.1252, "step": 1354, "topk_loss": 0.0104 }, { "epoch": 0.5385566037149973, "grad_norm": 0.1298828125, "learning_rate": 9.077316405366981e-05, "lm_loss": 1.9223, "loss": 2.0624, "mask_loss": 0.1284, "step": 1355, "topk_loss": 0.0116 }, { "epoch": 0.5389540624631264, "grad_norm": 0.150390625, "learning_rate": 9.064626727436445e-05, "lm_loss": 1.9325, "loss": 2.0709, "mask_loss": 0.128, "step": 1356, "topk_loss": 0.0104 }, { "epoch": 0.5393515212112555, "grad_norm": 0.11865234375, "learning_rate": 9.051938568831298e-05, "lm_loss": 1.9355, "loss": 2.0713, "mask_loss": 0.1257, "step": 1357, "topk_loss": 0.0101 }, { "epoch": 0.5397489799593846, "grad_norm": 0.1220703125, "learning_rate": 9.039251950160899e-05, "lm_loss": 1.8959, "loss": 2.0302, "mask_loss": 0.1255, "step": 1358, "topk_loss": 0.0088 }, { "epoch": 0.5401464387075139, "grad_norm": 0.1220703125, "learning_rate": 9.026566892032105e-05, "lm_loss": 1.9949, "loss": 2.1293, "mask_loss": 0.1238, "step": 1359, "topk_loss": 0.0106 }, { "epoch": 0.540543897455643, "grad_norm": 0.16796875, "learning_rate": 9.01388341504923e-05, "lm_loss": 1.884, "loss": 2.0216, "mask_loss": 0.1278, "step": 1360, "topk_loss": 0.0097 }, { "epoch": 0.5409413562037722, "grad_norm": 0.125, "learning_rate": 9.001201539814031e-05, "lm_loss": 1.865, "loss": 2.0017, "mask_loss": 0.1267, "step": 1361, "topk_loss": 0.01 }, { "epoch": 0.5413388149519013, "grad_norm": 0.12158203125, "learning_rate": 8.98852128692566e-05, "lm_loss": 1.9142, "loss": 2.0569, "mask_loss": 0.1315, "step": 1362, "topk_loss": 0.0112 }, { "epoch": 0.5417362737000304, "grad_norm": 0.1318359375, "learning_rate": 8.975842676980629e-05, "lm_loss": 1.9219, "loss": 2.0595, "mask_loss": 0.1277, "step": 1363, "topk_loss": 0.01 }, { "epoch": 0.5421337324481595, "grad_norm": 0.1552734375, "learning_rate": 8.963165730572787e-05, "lm_loss": 1.9269, "loss": 2.0639, "mask_loss": 0.1272, "step": 1364, "topk_loss": 0.0097 }, { "epoch": 0.5425311911962887, "grad_norm": 0.166015625, "learning_rate": 8.950490468293279e-05, "lm_loss": 1.9045, "loss": 2.0434, "mask_loss": 0.1278, "step": 1365, "topk_loss": 0.0111 }, { "epoch": 0.5429286499444179, "grad_norm": 0.1279296875, "learning_rate": 8.937816910730513e-05, "lm_loss": 1.9077, "loss": 2.0504, "mask_loss": 0.1304, "step": 1366, "topk_loss": 0.0123 }, { "epoch": 0.543326108692547, "grad_norm": 0.1220703125, "learning_rate": 8.925145078470135e-05, "lm_loss": 1.8535, "loss": 1.9929, "mask_loss": 0.1286, "step": 1367, "topk_loss": 0.0107 }, { "epoch": 0.5437235674406762, "grad_norm": 0.1455078125, "learning_rate": 8.912474992094974e-05, "lm_loss": 1.9703, "loss": 2.1097, "mask_loss": 0.1262, "step": 1368, "topk_loss": 0.0132 }, { "epoch": 0.5441210261888053, "grad_norm": 0.197265625, "learning_rate": 8.899806672185037e-05, "lm_loss": 1.9328, "loss": 2.0695, "mask_loss": 0.1276, "step": 1369, "topk_loss": 0.0091 }, { "epoch": 0.5445184849369344, "grad_norm": 0.1376953125, "learning_rate": 8.887140139317454e-05, "lm_loss": 1.9, "loss": 2.039, "mask_loss": 0.1276, "step": 1370, "topk_loss": 0.0114 }, { "epoch": 0.5449159436850636, "grad_norm": 0.1240234375, "learning_rate": 8.87447541406646e-05, "lm_loss": 1.9248, "loss": 2.0604, "mask_loss": 0.1252, "step": 1371, "topk_loss": 0.0104 }, { "epoch": 0.5453134024331928, "grad_norm": 0.12255859375, "learning_rate": 8.861812517003345e-05, "lm_loss": 1.9264, "loss": 2.0684, "mask_loss": 0.1294, "step": 1372, "topk_loss": 0.0126 }, { "epoch": 0.545710861181322, "grad_norm": 0.12890625, "learning_rate": 8.849151468696434e-05, "lm_loss": 1.89, "loss": 2.0247, "mask_loss": 0.1253, "step": 1373, "topk_loss": 0.0095 }, { "epoch": 0.5461083199294511, "grad_norm": 0.1357421875, "learning_rate": 8.836492289711051e-05, "lm_loss": 1.9543, "loss": 2.0929, "mask_loss": 0.1274, "step": 1374, "topk_loss": 0.0112 }, { "epoch": 0.5465057786775802, "grad_norm": 0.1484375, "learning_rate": 8.823835000609482e-05, "lm_loss": 1.9542, "loss": 2.0914, "mask_loss": 0.1274, "step": 1375, "topk_loss": 0.0099 }, { "epoch": 0.5469032374257093, "grad_norm": 0.12890625, "learning_rate": 8.811179621950936e-05, "lm_loss": 1.9566, "loss": 2.0955, "mask_loss": 0.1284, "step": 1376, "topk_loss": 0.0105 }, { "epoch": 0.5473006961738385, "grad_norm": 0.1259765625, "learning_rate": 8.798526174291531e-05, "lm_loss": 1.9385, "loss": 2.0803, "mask_loss": 0.1292, "step": 1377, "topk_loss": 0.0126 }, { "epoch": 0.5476981549219677, "grad_norm": 0.1181640625, "learning_rate": 8.785874678184242e-05, "lm_loss": 1.8737, "loss": 2.0062, "mask_loss": 0.1237, "step": 1378, "topk_loss": 0.0088 }, { "epoch": 0.5480956136700968, "grad_norm": 0.177734375, "learning_rate": 8.773225154178873e-05, "lm_loss": 1.92, "loss": 2.058, "mask_loss": 0.127, "step": 1379, "topk_loss": 0.011 }, { "epoch": 0.548493072418226, "grad_norm": 0.138671875, "learning_rate": 8.76057762282203e-05, "lm_loss": 1.9239, "loss": 2.0607, "mask_loss": 0.1255, "step": 1380, "topk_loss": 0.0112 }, { "epoch": 0.5488905311663551, "grad_norm": 0.12451171875, "learning_rate": 8.747932104657076e-05, "lm_loss": 1.9414, "loss": 2.0752, "mask_loss": 0.1244, "step": 1381, "topk_loss": 0.0095 }, { "epoch": 0.5492879899144842, "grad_norm": 0.12353515625, "learning_rate": 8.73528862022411e-05, "lm_loss": 1.9191, "loss": 2.0573, "mask_loss": 0.128, "step": 1382, "topk_loss": 0.0102 }, { "epoch": 0.5496854486626134, "grad_norm": 0.166015625, "learning_rate": 8.722647190059924e-05, "lm_loss": 1.8907, "loss": 2.0265, "mask_loss": 0.1266, "step": 1383, "topk_loss": 0.0091 }, { "epoch": 0.5500829074107426, "grad_norm": 0.1435546875, "learning_rate": 8.710007834697969e-05, "lm_loss": 1.8894, "loss": 2.0261, "mask_loss": 0.1269, "step": 1384, "topk_loss": 0.0099 }, { "epoch": 0.5504803661588717, "grad_norm": 0.130859375, "learning_rate": 8.697370574668335e-05, "lm_loss": 1.8602, "loss": 1.9982, "mask_loss": 0.1286, "step": 1385, "topk_loss": 0.0094 }, { "epoch": 0.5508778249070009, "grad_norm": 0.11474609375, "learning_rate": 8.684735430497704e-05, "lm_loss": 1.9372, "loss": 2.0721, "mask_loss": 0.1264, "step": 1386, "topk_loss": 0.0085 }, { "epoch": 0.55127528365513, "grad_norm": 0.119140625, "learning_rate": 8.672102422709323e-05, "lm_loss": 1.9999, "loss": 2.1386, "mask_loss": 0.1274, "step": 1387, "topk_loss": 0.0114 }, { "epoch": 0.5516727424032591, "grad_norm": 0.171875, "learning_rate": 8.659471571822964e-05, "lm_loss": 1.9412, "loss": 2.078, "mask_loss": 0.1265, "step": 1388, "topk_loss": 0.0102 }, { "epoch": 0.5520702011513883, "grad_norm": 0.1533203125, "learning_rate": 8.6468428983549e-05, "lm_loss": 1.8598, "loss": 1.9957, "mask_loss": 0.1269, "step": 1389, "topk_loss": 0.009 }, { "epoch": 0.5524676598995174, "grad_norm": 0.11962890625, "learning_rate": 8.634216422817867e-05, "lm_loss": 1.8608, "loss": 1.9996, "mask_loss": 0.1274, "step": 1390, "topk_loss": 0.0113 }, { "epoch": 0.5528651186476466, "grad_norm": 0.1123046875, "learning_rate": 8.621592165721034e-05, "lm_loss": 1.9219, "loss": 2.0558, "mask_loss": 0.1253, "step": 1391, "topk_loss": 0.0087 }, { "epoch": 0.5532625773957758, "grad_norm": 0.12353515625, "learning_rate": 8.608970147569954e-05, "lm_loss": 1.9326, "loss": 2.0663, "mask_loss": 0.1248, "step": 1392, "topk_loss": 0.0089 }, { "epoch": 0.5536600361439049, "grad_norm": 0.142578125, "learning_rate": 8.596350388866558e-05, "lm_loss": 1.916, "loss": 2.0553, "mask_loss": 0.1274, "step": 1393, "topk_loss": 0.0119 }, { "epoch": 0.554057494892034, "grad_norm": 0.134765625, "learning_rate": 8.5837329101091e-05, "lm_loss": 1.8629, "loss": 2.0017, "mask_loss": 0.1272, "step": 1394, "topk_loss": 0.0117 }, { "epoch": 0.5544549536401632, "grad_norm": 0.140625, "learning_rate": 8.57111773179213e-05, "lm_loss": 1.8755, "loss": 2.0101, "mask_loss": 0.1261, "step": 1395, "topk_loss": 0.0085 }, { "epoch": 0.5548524123882923, "grad_norm": 0.126953125, "learning_rate": 8.558504874406464e-05, "lm_loss": 1.9013, "loss": 2.0408, "mask_loss": 0.1271, "step": 1396, "topk_loss": 0.0124 }, { "epoch": 0.5552498711364215, "grad_norm": 0.11865234375, "learning_rate": 8.545894358439148e-05, "lm_loss": 1.896, "loss": 2.0308, "mask_loss": 0.1265, "step": 1397, "topk_loss": 0.0083 }, { "epoch": 0.5556473298845507, "grad_norm": 0.1162109375, "learning_rate": 8.533286204373424e-05, "lm_loss": 1.9127, "loss": 2.052, "mask_loss": 0.1289, "step": 1398, "topk_loss": 0.0104 }, { "epoch": 0.5560447886326798, "grad_norm": 0.1435546875, "learning_rate": 8.520680432688702e-05, "lm_loss": 1.9136, "loss": 2.0554, "mask_loss": 0.1288, "step": 1399, "topk_loss": 0.013 }, { "epoch": 0.5564422473808089, "grad_norm": 0.16015625, "learning_rate": 8.508077063860506e-05, "lm_loss": 1.9322, "loss": 2.0681, "mask_loss": 0.1249, "step": 1400, "topk_loss": 0.0111 }, { "epoch": 0.5564422473808089, "eval_lm_loss": 687.5111083984375, "eval_loss": 687.6494140625, "eval_mask_hit_rate": 0.5331037044525146, "eval_mask_loss": 0.12541639804840088, "eval_mask_top_10_hit_rate": 0.9850775599479675, "eval_mask_top_1_hit_rate": 0.997307538986206, "eval_mask_top_20_hit_rate": 0.9754698276519775, "eval_mask_top_5_hit_rate": 0.9905154705047607, "eval_runtime": 144.4147, "eval_samples_per_second": 14.181, "eval_steps_per_second": 7.091, "eval_token_accuracy": 0.6132440567016602, "eval_top_k_diff": -523.2880249023438, "eval_topk_loss": 0.012859683483839035, "step": 1400 }, { "epoch": 0.5568397061289381, "grad_norm": 0.1123046875, "learning_rate": 8.495476118360477e-05, "lm_loss": 1.912, "loss": 2.0477, "mask_loss": 0.1263, "step": 1401, "topk_loss": 0.0094 }, { "epoch": 0.5572371648770672, "grad_norm": 0.123046875, "learning_rate": 8.48287761665631e-05, "lm_loss": 1.9322, "loss": 2.0712, "mask_loss": 0.1292, "step": 1402, "topk_loss": 0.0098 }, { "epoch": 0.5576346236251964, "grad_norm": 0.119140625, "learning_rate": 8.470281579211733e-05, "lm_loss": 1.9025, "loss": 2.0405, "mask_loss": 0.1283, "step": 1403, "topk_loss": 0.0097 }, { "epoch": 0.5580320823733256, "grad_norm": 0.12060546875, "learning_rate": 8.45768802648647e-05, "lm_loss": 1.8724, "loss": 2.0123, "mask_loss": 0.1289, "step": 1404, "topk_loss": 0.011 }, { "epoch": 0.5584295411214547, "grad_norm": 0.12890625, "learning_rate": 8.44509697893621e-05, "lm_loss": 1.8755, "loss": 2.0127, "mask_loss": 0.1258, "step": 1405, "topk_loss": 0.0114 }, { "epoch": 0.5588269998695838, "grad_norm": 0.1455078125, "learning_rate": 8.432508457012571e-05, "lm_loss": 1.8924, "loss": 2.0313, "mask_loss": 0.1269, "step": 1406, "topk_loss": 0.012 }, { "epoch": 0.559224458617713, "grad_norm": 0.1162109375, "learning_rate": 8.419922481163075e-05, "lm_loss": 1.9442, "loss": 2.0822, "mask_loss": 0.1283, "step": 1407, "topk_loss": 0.0097 }, { "epoch": 0.5596219173658421, "grad_norm": 0.12890625, "learning_rate": 8.407339071831097e-05, "lm_loss": 1.9364, "loss": 2.0706, "mask_loss": 0.1246, "step": 1408, "topk_loss": 0.0096 }, { "epoch": 0.5600193761139713, "grad_norm": 0.11962890625, "learning_rate": 8.394758249455853e-05, "lm_loss": 1.8795, "loss": 2.0169, "mask_loss": 0.1279, "step": 1409, "topk_loss": 0.0096 }, { "epoch": 0.5604168348621005, "grad_norm": 0.1171875, "learning_rate": 8.382180034472353e-05, "lm_loss": 1.8734, "loss": 2.0089, "mask_loss": 0.1263, "step": 1410, "topk_loss": 0.0092 }, { "epoch": 0.5608142936102296, "grad_norm": 0.12060546875, "learning_rate": 8.369604447311373e-05, "lm_loss": 1.9064, "loss": 2.0414, "mask_loss": 0.1263, "step": 1411, "topk_loss": 0.0086 }, { "epoch": 0.5612117523583587, "grad_norm": 0.1201171875, "learning_rate": 8.35703150839942e-05, "lm_loss": 1.8829, "loss": 2.0187, "mask_loss": 0.1255, "step": 1412, "topk_loss": 0.0104 }, { "epoch": 0.5616092111064879, "grad_norm": 0.1611328125, "learning_rate": 8.344461238158699e-05, "lm_loss": 1.9118, "loss": 2.0582, "mask_loss": 0.131, "step": 1413, "topk_loss": 0.0154 }, { "epoch": 0.562006669854617, "grad_norm": 0.1259765625, "learning_rate": 8.331893657007082e-05, "lm_loss": 1.8842, "loss": 2.0185, "mask_loss": 0.1253, "step": 1414, "topk_loss": 0.0091 }, { "epoch": 0.5624041286027462, "grad_norm": 0.11279296875, "learning_rate": 8.319328785358078e-05, "lm_loss": 1.9075, "loss": 2.0437, "mask_loss": 0.1263, "step": 1415, "topk_loss": 0.0099 }, { "epoch": 0.5628015873508754, "grad_norm": 0.1435546875, "learning_rate": 8.306766643620774e-05, "lm_loss": 1.903, "loss": 2.0387, "mask_loss": 0.1261, "step": 1416, "topk_loss": 0.0096 }, { "epoch": 0.5631990460990045, "grad_norm": 0.1259765625, "learning_rate": 8.29420725219985e-05, "lm_loss": 1.8671, "loss": 2.0027, "mask_loss": 0.1268, "step": 1417, "topk_loss": 0.0088 }, { "epoch": 0.5635965048471336, "grad_norm": 0.140625, "learning_rate": 8.281650631495501e-05, "lm_loss": 1.8995, "loss": 2.0343, "mask_loss": 0.1248, "step": 1418, "topk_loss": 0.01 }, { "epoch": 0.5639939635952628, "grad_norm": 0.146484375, "learning_rate": 8.26909680190343e-05, "lm_loss": 1.9331, "loss": 2.071, "mask_loss": 0.1264, "step": 1419, "topk_loss": 0.0115 }, { "epoch": 0.5643914223433919, "grad_norm": 0.1923828125, "learning_rate": 8.256545783814802e-05, "lm_loss": 1.853, "loss": 1.9906, "mask_loss": 0.1275, "step": 1420, "topk_loss": 0.0102 }, { "epoch": 0.564788881091521, "grad_norm": 0.123046875, "learning_rate": 8.243997597616217e-05, "lm_loss": 1.9117, "loss": 2.0458, "mask_loss": 0.1249, "step": 1421, "topk_loss": 0.0091 }, { "epoch": 0.5651863398396503, "grad_norm": 0.12890625, "learning_rate": 8.231452263689674e-05, "lm_loss": 1.9307, "loss": 2.0671, "mask_loss": 0.1261, "step": 1422, "topk_loss": 0.0102 }, { "epoch": 0.5655837985877794, "grad_norm": 0.140625, "learning_rate": 8.218909802412542e-05, "lm_loss": 1.9443, "loss": 2.0841, "mask_loss": 0.1291, "step": 1423, "topk_loss": 0.0107 }, { "epoch": 0.5659812573359085, "grad_norm": 0.12451171875, "learning_rate": 8.20637023415752e-05, "lm_loss": 1.8607, "loss": 1.9997, "mask_loss": 0.1284, "step": 1424, "topk_loss": 0.0106 }, { "epoch": 0.5663787160840377, "grad_norm": 0.1376953125, "learning_rate": 8.19383357929261e-05, "lm_loss": 1.9362, "loss": 2.0714, "mask_loss": 0.1248, "step": 1425, "topk_loss": 0.0103 }, { "epoch": 0.5667761748321668, "grad_norm": 0.119140625, "learning_rate": 8.181299858181082e-05, "lm_loss": 1.8738, "loss": 2.0099, "mask_loss": 0.1261, "step": 1426, "topk_loss": 0.01 }, { "epoch": 0.5671736335802959, "grad_norm": 0.12890625, "learning_rate": 8.168769091181438e-05, "lm_loss": 1.9215, "loss": 2.0634, "mask_loss": 0.1293, "step": 1427, "topk_loss": 0.0126 }, { "epoch": 0.5675710923284252, "grad_norm": 0.11767578125, "learning_rate": 8.156241298647388e-05, "lm_loss": 1.8747, "loss": 2.014, "mask_loss": 0.1293, "step": 1428, "topk_loss": 0.01 }, { "epoch": 0.5679685510765543, "grad_norm": 0.1337890625, "learning_rate": 8.143716500927804e-05, "lm_loss": 1.9372, "loss": 2.0732, "mask_loss": 0.1263, "step": 1429, "topk_loss": 0.0096 }, { "epoch": 0.5683660098246834, "grad_norm": 0.13671875, "learning_rate": 8.131194718366696e-05, "lm_loss": 1.9328, "loss": 2.0675, "mask_loss": 0.1254, "step": 1430, "topk_loss": 0.0093 }, { "epoch": 0.5687634685728126, "grad_norm": 0.12109375, "learning_rate": 8.11867597130318e-05, "lm_loss": 1.9498, "loss": 2.0898, "mask_loss": 0.1283, "step": 1431, "topk_loss": 0.0117 }, { "epoch": 0.5691609273209417, "grad_norm": 0.173828125, "learning_rate": 8.106160280071431e-05, "lm_loss": 1.9142, "loss": 2.0487, "mask_loss": 0.126, "step": 1432, "topk_loss": 0.0086 }, { "epoch": 0.5695583860690708, "grad_norm": 0.166015625, "learning_rate": 8.093647665000672e-05, "lm_loss": 1.9168, "loss": 2.0526, "mask_loss": 0.1262, "step": 1433, "topk_loss": 0.0097 }, { "epoch": 0.5699558448172001, "grad_norm": 0.1318359375, "learning_rate": 8.081138146415121e-05, "lm_loss": 1.9281, "loss": 2.0671, "mask_loss": 0.1288, "step": 1434, "topk_loss": 0.0102 }, { "epoch": 0.5703533035653292, "grad_norm": 0.126953125, "learning_rate": 8.068631744633976e-05, "lm_loss": 1.951, "loss": 2.0878, "mask_loss": 0.1256, "step": 1435, "topk_loss": 0.0112 }, { "epoch": 0.5707507623134583, "grad_norm": 0.12060546875, "learning_rate": 8.056128479971361e-05, "lm_loss": 1.977, "loss": 2.1121, "mask_loss": 0.125, "step": 1436, "topk_loss": 0.0101 }, { "epoch": 0.5711482210615875, "grad_norm": 0.1416015625, "learning_rate": 8.043628372736311e-05, "lm_loss": 1.9565, "loss": 2.0922, "mask_loss": 0.1252, "step": 1437, "topk_loss": 0.0106 }, { "epoch": 0.5715456798097166, "grad_norm": 0.17578125, "learning_rate": 8.031131443232734e-05, "lm_loss": 1.9105, "loss": 2.0467, "mask_loss": 0.1267, "step": 1438, "topk_loss": 0.0095 }, { "epoch": 0.5719431385578457, "grad_norm": 0.162109375, "learning_rate": 8.018637711759377e-05, "lm_loss": 1.8987, "loss": 2.0331, "mask_loss": 0.1252, "step": 1439, "topk_loss": 0.0093 }, { "epoch": 0.572340597305975, "grad_norm": 0.1357421875, "learning_rate": 8.006147198609778e-05, "lm_loss": 1.9341, "loss": 2.0673, "mask_loss": 0.1235, "step": 1440, "topk_loss": 0.0097 }, { "epoch": 0.5727380560541041, "grad_norm": 0.123046875, "learning_rate": 7.993659924072265e-05, "lm_loss": 1.9626, "loss": 2.1014, "mask_loss": 0.1275, "step": 1441, "topk_loss": 0.0113 }, { "epoch": 0.5731355148022332, "grad_norm": 0.11865234375, "learning_rate": 7.9811759084299e-05, "lm_loss": 1.8526, "loss": 1.9898, "mask_loss": 0.1268, "step": 1442, "topk_loss": 0.0104 }, { "epoch": 0.5735329735503624, "grad_norm": 0.123046875, "learning_rate": 7.968695171960449e-05, "lm_loss": 1.8952, "loss": 2.0323, "mask_loss": 0.1271, "step": 1443, "topk_loss": 0.01 }, { "epoch": 0.5739304322984915, "grad_norm": 0.11962890625, "learning_rate": 7.956217734936353e-05, "lm_loss": 1.9943, "loss": 2.1317, "mask_loss": 0.1265, "step": 1444, "topk_loss": 0.0109 }, { "epoch": 0.5743278910466206, "grad_norm": 0.11865234375, "learning_rate": 7.943743617624695e-05, "lm_loss": 1.9151, "loss": 2.0512, "mask_loss": 0.1258, "step": 1445, "topk_loss": 0.0103 }, { "epoch": 0.5747253497947499, "grad_norm": 0.123046875, "learning_rate": 7.931272840287165e-05, "lm_loss": 1.9121, "loss": 2.048, "mask_loss": 0.1262, "step": 1446, "topk_loss": 0.0097 }, { "epoch": 0.575122808542879, "grad_norm": 0.15234375, "learning_rate": 7.918805423180029e-05, "lm_loss": 1.9777, "loss": 2.1201, "mask_loss": 0.1269, "step": 1447, "topk_loss": 0.0155 }, { "epoch": 0.5755202672910081, "grad_norm": 0.1318359375, "learning_rate": 7.90634138655409e-05, "lm_loss": 1.9178, "loss": 2.0617, "mask_loss": 0.1302, "step": 1448, "topk_loss": 0.0137 }, { "epoch": 0.5759177260391373, "grad_norm": 0.1279296875, "learning_rate": 7.893880750654668e-05, "lm_loss": 1.906, "loss": 2.0468, "mask_loss": 0.1286, "step": 1449, "topk_loss": 0.0123 }, { "epoch": 0.5763151847872664, "grad_norm": 0.1318359375, "learning_rate": 7.881423535721551e-05, "lm_loss": 1.9239, "loss": 2.0578, "mask_loss": 0.1246, "step": 1450, "topk_loss": 0.0092 }, { "epoch": 0.5763151847872664, "eval_lm_loss": 687.342041015625, "eval_loss": 687.4800415039062, "eval_mask_hit_rate": 0.5336586236953735, "eval_mask_loss": 0.12521842122077942, "eval_mask_top_10_hit_rate": 0.9852107763290405, "eval_mask_top_1_hit_rate": 0.9973337650299072, "eval_mask_top_20_hit_rate": 0.9756519198417664, "eval_mask_top_5_hit_rate": 0.9906177520751953, "eval_runtime": 144.3265, "eval_samples_per_second": 14.19, "eval_steps_per_second": 7.095, "eval_token_accuracy": 0.6135073304176331, "eval_top_k_diff": -522.5317993164062, "eval_topk_loss": 0.012827243655920029, "step": 1450 }, { "epoch": 0.5767126435353955, "grad_norm": 0.337890625, "learning_rate": 7.868969761988978e-05, "lm_loss": 1.8985, "loss": 2.0948, "mask_loss": 0.1625, "step": 1451, "topk_loss": 0.0338 }, { "epoch": 0.5771101022835247, "grad_norm": 0.19921875, "learning_rate": 7.856519449685591e-05, "lm_loss": 1.9234, "loss": 2.0611, "mask_loss": 0.1265, "step": 1452, "topk_loss": 0.0112 }, { "epoch": 0.5775075610316539, "grad_norm": 0.123046875, "learning_rate": 7.844072619034417e-05, "lm_loss": 1.9635, "loss": 2.1034, "mask_loss": 0.1275, "step": 1453, "topk_loss": 0.0124 }, { "epoch": 0.577905019779783, "grad_norm": 0.125, "learning_rate": 7.831629290252823e-05, "lm_loss": 1.9108, "loss": 2.0478, "mask_loss": 0.1269, "step": 1454, "topk_loss": 0.0101 }, { "epoch": 0.5783024785279122, "grad_norm": 0.1318359375, "learning_rate": 7.819189483552493e-05, "lm_loss": 1.9113, "loss": 2.0471, "mask_loss": 0.1258, "step": 1455, "topk_loss": 0.01 }, { "epoch": 0.5786999372760413, "grad_norm": 0.123046875, "learning_rate": 7.806753219139377e-05, "lm_loss": 1.8646, "loss": 1.9995, "mask_loss": 0.1265, "step": 1456, "topk_loss": 0.0084 }, { "epoch": 0.5790973960241704, "grad_norm": 0.111328125, "learning_rate": 7.794320517213687e-05, "lm_loss": 1.9129, "loss": 2.0482, "mask_loss": 0.1259, "step": 1457, "topk_loss": 0.0094 }, { "epoch": 0.5794948547722996, "grad_norm": 0.123046875, "learning_rate": 7.781891397969838e-05, "lm_loss": 1.8934, "loss": 2.0337, "mask_loss": 0.127, "step": 1458, "topk_loss": 0.0133 }, { "epoch": 0.5798923135204288, "grad_norm": 0.11669921875, "learning_rate": 7.769465881596434e-05, "lm_loss": 1.9682, "loss": 2.1037, "mask_loss": 0.1254, "step": 1459, "topk_loss": 0.0102 }, { "epoch": 0.5802897722685579, "grad_norm": 0.11474609375, "learning_rate": 7.75704398827622e-05, "lm_loss": 1.876, "loss": 2.0089, "mask_loss": 0.1245, "step": 1460, "topk_loss": 0.0083 }, { "epoch": 0.5806872310166871, "grad_norm": 0.306640625, "learning_rate": 7.744625738186059e-05, "lm_loss": 1.9677, "loss": 2.1455, "mask_loss": 0.1494, "step": 1461, "topk_loss": 0.0285 }, { "epoch": 0.5810846897648162, "grad_norm": 0.12255859375, "learning_rate": 7.732211151496895e-05, "lm_loss": 1.9315, "loss": 2.066, "mask_loss": 0.1256, "step": 1462, "topk_loss": 0.009 }, { "epoch": 0.5814821485129453, "grad_norm": 0.125, "learning_rate": 7.719800248373726e-05, "lm_loss": 1.9285, "loss": 2.0643, "mask_loss": 0.1263, "step": 1463, "topk_loss": 0.0095 }, { "epoch": 0.5818796072610745, "grad_norm": 0.12451171875, "learning_rate": 7.707393048975558e-05, "lm_loss": 1.8748, "loss": 2.0142, "mask_loss": 0.1282, "step": 1464, "topk_loss": 0.0113 }, { "epoch": 0.5822770660092037, "grad_norm": 0.134765625, "learning_rate": 7.694989573455388e-05, "lm_loss": 1.8607, "loss": 1.9976, "mask_loss": 0.1269, "step": 1465, "topk_loss": 0.0099 }, { "epoch": 0.5826745247573328, "grad_norm": 0.1435546875, "learning_rate": 7.682589841960164e-05, "lm_loss": 1.8428, "loss": 1.9787, "mask_loss": 0.1261, "step": 1466, "topk_loss": 0.0098 }, { "epoch": 0.583071983505462, "grad_norm": 0.1796875, "learning_rate": 7.670193874630749e-05, "lm_loss": 1.9415, "loss": 2.0803, "mask_loss": 0.1276, "step": 1467, "topk_loss": 0.0113 }, { "epoch": 0.5834694422535911, "grad_norm": 0.1328125, "learning_rate": 7.657801691601896e-05, "lm_loss": 2.0485, "loss": 2.181, "mask_loss": 0.1232, "step": 1468, "topk_loss": 0.0093 }, { "epoch": 0.5838669010017202, "grad_norm": 0.126953125, "learning_rate": 7.645413313002207e-05, "lm_loss": 1.84, "loss": 1.9775, "mask_loss": 0.1289, "step": 1469, "topk_loss": 0.0086 }, { "epoch": 0.5842643597498494, "grad_norm": 0.130859375, "learning_rate": 7.633028758954109e-05, "lm_loss": 1.8724, "loss": 2.0092, "mask_loss": 0.1262, "step": 1470, "topk_loss": 0.0106 }, { "epoch": 0.5846618184979786, "grad_norm": 0.15234375, "learning_rate": 7.620648049573815e-05, "lm_loss": 1.8695, "loss": 2.0039, "mask_loss": 0.1254, "step": 1471, "topk_loss": 0.009 }, { "epoch": 0.5850592772461077, "grad_norm": 0.130859375, "learning_rate": 7.608271204971287e-05, "lm_loss": 1.9123, "loss": 2.0483, "mask_loss": 0.1257, "step": 1472, "topk_loss": 0.0103 }, { "epoch": 0.5854567359942369, "grad_norm": 0.1455078125, "learning_rate": 7.59589824525022e-05, "lm_loss": 1.8812, "loss": 2.0175, "mask_loss": 0.1274, "step": 1473, "topk_loss": 0.0089 }, { "epoch": 0.585854194742366, "grad_norm": 0.1298828125, "learning_rate": 7.583529190507992e-05, "lm_loss": 1.9804, "loss": 2.1196, "mask_loss": 0.1275, "step": 1474, "topk_loss": 0.0118 }, { "epoch": 0.5862516534904951, "grad_norm": 0.1142578125, "learning_rate": 7.571164060835641e-05, "lm_loss": 1.9199, "loss": 2.0533, "mask_loss": 0.1244, "step": 1475, "topk_loss": 0.009 }, { "epoch": 0.5866491122386243, "grad_norm": 0.1201171875, "learning_rate": 7.558802876317825e-05, "lm_loss": 1.9074, "loss": 2.0467, "mask_loss": 0.1278, "step": 1476, "topk_loss": 0.0115 }, { "epoch": 0.5870465709867534, "grad_norm": 0.1357421875, "learning_rate": 7.546445657032801e-05, "lm_loss": 1.9905, "loss": 2.1273, "mask_loss": 0.1257, "step": 1477, "topk_loss": 0.011 }, { "epoch": 0.5874440297348826, "grad_norm": 0.11962890625, "learning_rate": 7.534092423052381e-05, "lm_loss": 1.9266, "loss": 2.0633, "mask_loss": 0.1269, "step": 1478, "topk_loss": 0.0098 }, { "epoch": 0.5878414884830118, "grad_norm": 0.1259765625, "learning_rate": 7.521743194441904e-05, "lm_loss": 1.9371, "loss": 2.0692, "mask_loss": 0.1234, "step": 1479, "topk_loss": 0.0087 }, { "epoch": 0.5882389472311409, "grad_norm": 0.125, "learning_rate": 7.509397991260202e-05, "lm_loss": 1.9036, "loss": 2.0376, "mask_loss": 0.1244, "step": 1480, "topk_loss": 0.0096 }, { "epoch": 0.58863640597927, "grad_norm": 0.171875, "learning_rate": 7.497056833559573e-05, "lm_loss": 1.9135, "loss": 2.0566, "mask_loss": 0.1294, "step": 1481, "topk_loss": 0.0137 }, { "epoch": 0.5890338647273992, "grad_norm": 0.17578125, "learning_rate": 7.484719741385735e-05, "lm_loss": 1.8347, "loss": 1.9715, "mask_loss": 0.1274, "step": 1482, "topk_loss": 0.0094 }, { "epoch": 0.5894313234755283, "grad_norm": 0.12158203125, "learning_rate": 7.472386734777814e-05, "lm_loss": 1.8663, "loss": 2.002, "mask_loss": 0.1258, "step": 1483, "topk_loss": 0.01 }, { "epoch": 0.5898287822236575, "grad_norm": 0.1220703125, "learning_rate": 7.460057833768292e-05, "lm_loss": 1.9451, "loss": 2.0813, "mask_loss": 0.1264, "step": 1484, "topk_loss": 0.0098 }, { "epoch": 0.5902262409717867, "grad_norm": 0.1142578125, "learning_rate": 7.447733058382981e-05, "lm_loss": 1.9263, "loss": 2.0637, "mask_loss": 0.1265, "step": 1485, "topk_loss": 0.0109 }, { "epoch": 0.5906236997199158, "grad_norm": 0.1357421875, "learning_rate": 7.435412428641001e-05, "lm_loss": 1.9, "loss": 2.0355, "mask_loss": 0.1259, "step": 1486, "topk_loss": 0.0096 }, { "epoch": 0.5910211584680449, "grad_norm": 0.126953125, "learning_rate": 7.423095964554731e-05, "lm_loss": 1.8908, "loss": 2.0295, "mask_loss": 0.1278, "step": 1487, "topk_loss": 0.011 }, { "epoch": 0.5914186172161741, "grad_norm": 0.1416015625, "learning_rate": 7.41078368612978e-05, "lm_loss": 1.9473, "loss": 2.0858, "mask_loss": 0.1276, "step": 1488, "topk_loss": 0.0108 }, { "epoch": 0.5918160759643032, "grad_norm": 0.1259765625, "learning_rate": 7.398475613364969e-05, "lm_loss": 1.9053, "loss": 2.0394, "mask_loss": 0.1249, "step": 1489, "topk_loss": 0.0093 }, { "epoch": 0.5922135347124324, "grad_norm": 0.1787109375, "learning_rate": 7.386171766252274e-05, "lm_loss": 1.9353, "loss": 2.0831, "mask_loss": 0.1312, "step": 1490, "topk_loss": 0.0165 }, { "epoch": 0.5926109934605616, "grad_norm": 0.146484375, "learning_rate": 7.373872164776824e-05, "lm_loss": 1.8609, "loss": 2.0012, "mask_loss": 0.1276, "step": 1491, "topk_loss": 0.0126 }, { "epoch": 0.5930084522086907, "grad_norm": 0.5, "learning_rate": 7.361576828916839e-05, "lm_loss": 1.8879, "loss": 2.0956, "mask_loss": 0.1665, "step": 1492, "topk_loss": 0.0412 }, { "epoch": 0.5934059109568198, "grad_norm": 0.1298828125, "learning_rate": 7.349285778643614e-05, "lm_loss": 1.908, "loss": 2.045, "mask_loss": 0.1267, "step": 1493, "topk_loss": 0.0102 }, { "epoch": 0.593803369704949, "grad_norm": 0.1572265625, "learning_rate": 7.336999033921486e-05, "lm_loss": 1.9472, "loss": 2.0836, "mask_loss": 0.1257, "step": 1494, "topk_loss": 0.0107 }, { "epoch": 0.5942008284530781, "grad_norm": 0.1943359375, "learning_rate": 7.324716614707793e-05, "lm_loss": 1.8706, "loss": 2.0106, "mask_loss": 0.1298, "step": 1495, "topk_loss": 0.0102 }, { "epoch": 0.5945982872012073, "grad_norm": 0.1396484375, "learning_rate": 7.312438540952852e-05, "lm_loss": 1.8157, "loss": 1.9582, "mask_loss": 0.1301, "step": 1496, "topk_loss": 0.0124 }, { "epoch": 0.5949957459493365, "grad_norm": 0.1484375, "learning_rate": 7.300164832599917e-05, "lm_loss": 1.8506, "loss": 1.9867, "mask_loss": 0.1268, "step": 1497, "topk_loss": 0.0092 }, { "epoch": 0.5953932046974656, "grad_norm": 0.158203125, "learning_rate": 7.287895509585156e-05, "lm_loss": 1.8594, "loss": 1.9964, "mask_loss": 0.1277, "step": 1498, "topk_loss": 0.0093 }, { "epoch": 0.5957906634455947, "grad_norm": 0.123046875, "learning_rate": 7.275630591837613e-05, "lm_loss": 1.9108, "loss": 2.0471, "mask_loss": 0.1255, "step": 1499, "topk_loss": 0.0107 }, { "epoch": 0.5961881221937239, "grad_norm": 0.1533203125, "learning_rate": 7.263370099279172e-05, "lm_loss": 1.9664, "loss": 2.1044, "mask_loss": 0.1264, "step": 1500, "topk_loss": 0.0115 }, { "epoch": 0.5961881221937239, "eval_lm_loss": 688.2054443359375, "eval_loss": 688.3433837890625, "eval_mask_hit_rate": 0.5340866446495056, "eval_mask_loss": 0.12516987323760986, "eval_mask_top_10_hit_rate": 0.9852725267410278, "eval_mask_top_1_hit_rate": 0.9973559379577637, "eval_mask_top_20_hit_rate": 0.9757484197616577, "eval_mask_top_5_hit_rate": 0.9906572103500366, "eval_runtime": 143.6792, "eval_samples_per_second": 14.254, "eval_steps_per_second": 7.127, "eval_token_accuracy": 0.6136693358421326, "eval_top_k_diff": -533.968017578125, "eval_topk_loss": 0.012739567086100578, "step": 1500 }, { "epoch": 0.596585580941853, "grad_norm": 0.162109375, "learning_rate": 7.251114051824534e-05, "lm_loss": 1.8898, "loss": 2.0319, "mask_loss": 0.1297, "step": 1501, "topk_loss": 0.0124 }, { "epoch": 0.5969830396899822, "grad_norm": 0.203125, "learning_rate": 7.238862469381177e-05, "lm_loss": 1.9536, "loss": 2.0893, "mask_loss": 0.126, "step": 1502, "topk_loss": 0.0096 }, { "epoch": 0.5973804984381114, "grad_norm": 0.1376953125, "learning_rate": 7.226615371849337e-05, "lm_loss": 1.9464, "loss": 2.0878, "mask_loss": 0.1285, "step": 1503, "topk_loss": 0.013 }, { "epoch": 0.5977779571862405, "grad_norm": 0.158203125, "learning_rate": 7.214372779121942e-05, "lm_loss": 1.9762, "loss": 2.1121, "mask_loss": 0.126, "step": 1504, "topk_loss": 0.0099 }, { "epoch": 0.5981754159343696, "grad_norm": 0.126953125, "learning_rate": 7.202134711084624e-05, "lm_loss": 1.882, "loss": 2.0148, "mask_loss": 0.1242, "step": 1505, "topk_loss": 0.0087 }, { "epoch": 0.5985728746824988, "grad_norm": 0.1220703125, "learning_rate": 7.189901187615658e-05, "lm_loss": 1.8998, "loss": 2.0348, "mask_loss": 0.1255, "step": 1506, "topk_loss": 0.0094 }, { "epoch": 0.5989703334306279, "grad_norm": 0.126953125, "learning_rate": 7.177672228585935e-05, "lm_loss": 1.8359, "loss": 1.9746, "mask_loss": 0.1276, "step": 1507, "topk_loss": 0.0111 }, { "epoch": 0.599367792178757, "grad_norm": 0.1689453125, "learning_rate": 7.165447853858937e-05, "lm_loss": 1.9679, "loss": 2.1039, "mask_loss": 0.1256, "step": 1508, "topk_loss": 0.0103 }, { "epoch": 0.5997652509268863, "grad_norm": 0.2041015625, "learning_rate": 7.153228083290698e-05, "lm_loss": 1.8872, "loss": 2.0229, "mask_loss": 0.1256, "step": 1509, "topk_loss": 0.01 }, { "epoch": 0.6001627096750154, "grad_norm": 0.140625, "learning_rate": 7.141012936729771e-05, "lm_loss": 1.8764, "loss": 2.0122, "mask_loss": 0.126, "step": 1510, "topk_loss": 0.0098 }, { "epoch": 0.6005601684231445, "grad_norm": 0.1162109375, "learning_rate": 7.128802434017205e-05, "lm_loss": 1.8735, "loss": 2.0111, "mask_loss": 0.1272, "step": 1511, "topk_loss": 0.0104 }, { "epoch": 0.6009576271712737, "grad_norm": 0.146484375, "learning_rate": 7.116596594986494e-05, "lm_loss": 1.917, "loss": 2.0519, "mask_loss": 0.1257, "step": 1512, "topk_loss": 0.0092 }, { "epoch": 0.6013550859194028, "grad_norm": 0.1435546875, "learning_rate": 7.104395439463567e-05, "lm_loss": 1.9171, "loss": 2.0525, "mask_loss": 0.1263, "step": 1513, "topk_loss": 0.0091 }, { "epoch": 0.6017525446675319, "grad_norm": 0.1337890625, "learning_rate": 7.092198987266742e-05, "lm_loss": 1.9347, "loss": 2.0748, "mask_loss": 0.1287, "step": 1514, "topk_loss": 0.0114 }, { "epoch": 0.6021500034156612, "grad_norm": 0.1171875, "learning_rate": 7.080007258206698e-05, "lm_loss": 1.9195, "loss": 2.0541, "mask_loss": 0.1258, "step": 1515, "topk_loss": 0.0088 }, { "epoch": 0.6025474621637903, "grad_norm": 0.189453125, "learning_rate": 7.067820272086443e-05, "lm_loss": 1.854, "loss": 1.9898, "mask_loss": 0.1266, "step": 1516, "topk_loss": 0.0091 }, { "epoch": 0.6029449209119194, "grad_norm": 0.154296875, "learning_rate": 7.055638048701278e-05, "lm_loss": 1.8665, "loss": 2.0022, "mask_loss": 0.126, "step": 1517, "topk_loss": 0.0097 }, { "epoch": 0.6033423796600486, "grad_norm": 0.2255859375, "learning_rate": 7.043460607838772e-05, "lm_loss": 1.8717, "loss": 2.0117, "mask_loss": 0.1279, "step": 1518, "topk_loss": 0.0121 }, { "epoch": 0.6037398384081777, "grad_norm": 0.15625, "learning_rate": 7.031287969278728e-05, "lm_loss": 1.9536, "loss": 2.093, "mask_loss": 0.1274, "step": 1519, "topk_loss": 0.012 }, { "epoch": 0.6041372971563068, "grad_norm": 0.134765625, "learning_rate": 7.019120152793135e-05, "lm_loss": 1.9424, "loss": 2.0781, "mask_loss": 0.1255, "step": 1520, "topk_loss": 0.0103 }, { "epoch": 0.6045347559044361, "grad_norm": 0.1181640625, "learning_rate": 7.006957178146162e-05, "lm_loss": 1.9302, "loss": 2.0649, "mask_loss": 0.1254, "step": 1521, "topk_loss": 0.0093 }, { "epoch": 0.6049322146525652, "grad_norm": 0.1533203125, "learning_rate": 6.994799065094113e-05, "lm_loss": 1.9078, "loss": 2.0483, "mask_loss": 0.1288, "step": 1522, "topk_loss": 0.0117 }, { "epoch": 0.6053296734006943, "grad_norm": 0.1533203125, "learning_rate": 6.982645833385391e-05, "lm_loss": 1.9466, "loss": 2.0862, "mask_loss": 0.1265, "step": 1523, "topk_loss": 0.0132 }, { "epoch": 0.6057271321488235, "grad_norm": 0.14453125, "learning_rate": 6.970497502760471e-05, "lm_loss": 1.8732, "loss": 2.0105, "mask_loss": 0.1271, "step": 1524, "topk_loss": 0.0102 }, { "epoch": 0.6061245908969526, "grad_norm": 0.158203125, "learning_rate": 6.95835409295187e-05, "lm_loss": 1.8879, "loss": 2.0242, "mask_loss": 0.1254, "step": 1525, "topk_loss": 0.0109 }, { "epoch": 0.6065220496450817, "grad_norm": 0.150390625, "learning_rate": 6.94621562368411e-05, "lm_loss": 1.9548, "loss": 2.0923, "mask_loss": 0.1263, "step": 1526, "topk_loss": 0.0112 }, { "epoch": 0.606919508393211, "grad_norm": 0.1162109375, "learning_rate": 6.934082114673688e-05, "lm_loss": 1.8601, "loss": 1.9986, "mask_loss": 0.1276, "step": 1527, "topk_loss": 0.0109 }, { "epoch": 0.6073169671413401, "grad_norm": 0.1171875, "learning_rate": 6.921953585629043e-05, "lm_loss": 1.8455, "loss": 1.9823, "mask_loss": 0.1275, "step": 1528, "topk_loss": 0.0093 }, { "epoch": 0.6077144258894692, "grad_norm": 0.12255859375, "learning_rate": 6.909830056250527e-05, "lm_loss": 1.93, "loss": 2.0652, "mask_loss": 0.1246, "step": 1529, "topk_loss": 0.0106 }, { "epoch": 0.6081118846375984, "grad_norm": 0.126953125, "learning_rate": 6.89771154623037e-05, "lm_loss": 1.9597, "loss": 2.0993, "mask_loss": 0.1271, "step": 1530, "topk_loss": 0.0124 }, { "epoch": 0.6085093433857275, "grad_norm": 0.15234375, "learning_rate": 6.88559807525265e-05, "lm_loss": 1.8741, "loss": 2.0109, "mask_loss": 0.1274, "step": 1531, "topk_loss": 0.0094 }, { "epoch": 0.6089068021338566, "grad_norm": 0.1328125, "learning_rate": 6.873489662993261e-05, "lm_loss": 1.9553, "loss": 2.0951, "mask_loss": 0.1274, "step": 1532, "topk_loss": 0.0124 }, { "epoch": 0.6093042608819857, "grad_norm": 0.11962890625, "learning_rate": 6.861386329119877e-05, "lm_loss": 1.8683, "loss": 2.0041, "mask_loss": 0.1267, "step": 1533, "topk_loss": 0.0091 }, { "epoch": 0.609701719630115, "grad_norm": 0.1259765625, "learning_rate": 6.849288093291924e-05, "lm_loss": 1.9182, "loss": 2.0571, "mask_loss": 0.1285, "step": 1534, "topk_loss": 0.0103 }, { "epoch": 0.6100991783782441, "grad_norm": 0.1416015625, "learning_rate": 6.837194975160554e-05, "lm_loss": 1.9849, "loss": 2.1179, "mask_loss": 0.1241, "step": 1535, "topk_loss": 0.0089 }, { "epoch": 0.6104966371263733, "grad_norm": 0.12109375, "learning_rate": 6.825106994368593e-05, "lm_loss": 1.8937, "loss": 2.0317, "mask_loss": 0.1267, "step": 1536, "topk_loss": 0.0112 }, { "epoch": 0.6108940958745024, "grad_norm": 0.1181640625, "learning_rate": 6.813024170550531e-05, "lm_loss": 1.9574, "loss": 2.0939, "mask_loss": 0.1254, "step": 1537, "topk_loss": 0.0111 }, { "epoch": 0.6112915546226315, "grad_norm": 0.11767578125, "learning_rate": 6.800946523332484e-05, "lm_loss": 1.936, "loss": 2.073, "mask_loss": 0.1268, "step": 1538, "topk_loss": 0.0102 }, { "epoch": 0.6116890133707606, "grad_norm": 0.1435546875, "learning_rate": 6.788874072332152e-05, "lm_loss": 1.9401, "loss": 2.0826, "mask_loss": 0.1284, "step": 1539, "topk_loss": 0.0141 }, { "epoch": 0.6120864721188899, "grad_norm": 0.1220703125, "learning_rate": 6.776806837158802e-05, "lm_loss": 1.8665, "loss": 2.0022, "mask_loss": 0.1264, "step": 1540, "topk_loss": 0.0093 }, { "epoch": 0.612483930867019, "grad_norm": 0.1142578125, "learning_rate": 6.764744837413225e-05, "lm_loss": 1.848, "loss": 1.9839, "mask_loss": 0.1263, "step": 1541, "topk_loss": 0.0096 }, { "epoch": 0.6128813896151482, "grad_norm": 0.12353515625, "learning_rate": 6.75268809268771e-05, "lm_loss": 1.9316, "loss": 2.0693, "mask_loss": 0.1266, "step": 1542, "topk_loss": 0.0111 }, { "epoch": 0.6132788483632773, "grad_norm": 0.1171875, "learning_rate": 6.74063662256601e-05, "lm_loss": 1.9418, "loss": 2.0779, "mask_loss": 0.1252, "step": 1543, "topk_loss": 0.011 }, { "epoch": 0.6136763071114064, "grad_norm": 0.12353515625, "learning_rate": 6.728590446623305e-05, "lm_loss": 1.9308, "loss": 2.0664, "mask_loss": 0.1256, "step": 1544, "topk_loss": 0.0099 }, { "epoch": 0.6140737658595355, "grad_norm": 0.119140625, "learning_rate": 6.716549584426182e-05, "lm_loss": 1.8845, "loss": 2.0173, "mask_loss": 0.1246, "step": 1545, "topk_loss": 0.0082 }, { "epoch": 0.6144712246076648, "grad_norm": 0.123046875, "learning_rate": 6.704514055532597e-05, "lm_loss": 1.9182, "loss": 2.0568, "mask_loss": 0.1265, "step": 1546, "topk_loss": 0.012 }, { "epoch": 0.6148686833557939, "grad_norm": 0.11962890625, "learning_rate": 6.692483879491841e-05, "lm_loss": 1.9522, "loss": 2.091, "mask_loss": 0.1285, "step": 1547, "topk_loss": 0.0102 }, { "epoch": 0.615266142103923, "grad_norm": 0.12353515625, "learning_rate": 6.68045907584451e-05, "lm_loss": 1.9256, "loss": 2.0618, "mask_loss": 0.127, "step": 1548, "topk_loss": 0.0092 }, { "epoch": 0.6156636008520522, "grad_norm": 0.126953125, "learning_rate": 6.668439664122475e-05, "lm_loss": 1.9176, "loss": 2.0529, "mask_loss": 0.1261, "step": 1549, "topk_loss": 0.0092 }, { "epoch": 0.6160610596001813, "grad_norm": 0.1591796875, "learning_rate": 6.656425663848848e-05, "lm_loss": 1.8485, "loss": 1.9852, "mask_loss": 0.1272, "step": 1550, "topk_loss": 0.0095 }, { "epoch": 0.6160610596001813, "eval_lm_loss": 687.9506225585938, "eval_loss": 688.0882568359375, "eval_mask_hit_rate": 0.534498929977417, "eval_mask_loss": 0.12500986456871033, "eval_mask_top_10_hit_rate": 0.9853236675262451, "eval_mask_top_1_hit_rate": 0.9973630905151367, "eval_mask_top_20_hit_rate": 0.975848376750946, "eval_mask_top_5_hit_rate": 0.9906688928604126, "eval_runtime": 144.4709, "eval_samples_per_second": 14.176, "eval_steps_per_second": 7.088, "eval_token_accuracy": 0.6139100193977356, "eval_top_k_diff": -530.8353271484375, "eval_topk_loss": 0.012646865099668503, "step": 1550 }, { "epoch": 0.6164585183483104, "grad_norm": 0.11474609375, "learning_rate": 6.644417094537956e-05, "lm_loss": 1.9334, "loss": 2.072, "mask_loss": 0.1269, "step": 1551, "topk_loss": 0.0117 }, { "epoch": 0.6168559770964397, "grad_norm": 0.11328125, "learning_rate": 6.63241397569529e-05, "lm_loss": 1.8446, "loss": 1.9813, "mask_loss": 0.1269, "step": 1552, "topk_loss": 0.0098 }, { "epoch": 0.6172534358445688, "grad_norm": 0.11767578125, "learning_rate": 6.620416326817504e-05, "lm_loss": 1.9193, "loss": 2.052, "mask_loss": 0.1234, "step": 1553, "topk_loss": 0.0094 }, { "epoch": 0.617650894592698, "grad_norm": 0.11767578125, "learning_rate": 6.60842416739236e-05, "lm_loss": 1.8713, "loss": 2.0097, "mask_loss": 0.1282, "step": 1554, "topk_loss": 0.0102 }, { "epoch": 0.6180483533408271, "grad_norm": 0.1455078125, "learning_rate": 6.596437516898703e-05, "lm_loss": 1.9221, "loss": 2.0639, "mask_loss": 0.1295, "step": 1555, "topk_loss": 0.0122 }, { "epoch": 0.6184458120889562, "grad_norm": 0.11474609375, "learning_rate": 6.584456394806434e-05, "lm_loss": 1.921, "loss": 2.0569, "mask_loss": 0.1265, "step": 1556, "topk_loss": 0.0094 }, { "epoch": 0.6188432708370853, "grad_norm": 0.11328125, "learning_rate": 6.572480820576469e-05, "lm_loss": 1.8918, "loss": 2.0312, "mask_loss": 0.128, "step": 1557, "topk_loss": 0.0114 }, { "epoch": 0.6192407295852146, "grad_norm": 0.1162109375, "learning_rate": 6.560510813660719e-05, "lm_loss": 1.8826, "loss": 2.0195, "mask_loss": 0.1273, "step": 1558, "topk_loss": 0.0096 }, { "epoch": 0.6196381883333437, "grad_norm": 0.1689453125, "learning_rate": 6.548546393502045e-05, "lm_loss": 1.9157, "loss": 2.0527, "mask_loss": 0.1269, "step": 1559, "topk_loss": 0.01 }, { "epoch": 0.6200356470814729, "grad_norm": 0.130859375, "learning_rate": 6.536587579534236e-05, "lm_loss": 1.8873, "loss": 2.0228, "mask_loss": 0.1252, "step": 1560, "topk_loss": 0.0103 }, { "epoch": 0.620433105829602, "grad_norm": 0.11962890625, "learning_rate": 6.524634391181975e-05, "lm_loss": 1.8931, "loss": 2.0282, "mask_loss": 0.1253, "step": 1561, "topk_loss": 0.0099 }, { "epoch": 0.6208305645777311, "grad_norm": 0.12255859375, "learning_rate": 6.51268684786081e-05, "lm_loss": 1.9159, "loss": 2.0564, "mask_loss": 0.1285, "step": 1562, "topk_loss": 0.012 }, { "epoch": 0.6212280233258602, "grad_norm": 0.1328125, "learning_rate": 6.500744968977116e-05, "lm_loss": 1.9496, "loss": 2.0841, "mask_loss": 0.1245, "step": 1563, "topk_loss": 0.01 }, { "epoch": 0.6216254820739894, "grad_norm": 0.12890625, "learning_rate": 6.48880877392807e-05, "lm_loss": 1.8963, "loss": 2.0317, "mask_loss": 0.1262, "step": 1564, "topk_loss": 0.0093 }, { "epoch": 0.6220229408221186, "grad_norm": 0.123046875, "learning_rate": 6.476878282101614e-05, "lm_loss": 1.9197, "loss": 2.0538, "mask_loss": 0.1241, "step": 1565, "topk_loss": 0.01 }, { "epoch": 0.6224203995702478, "grad_norm": 0.1201171875, "learning_rate": 6.46495351287643e-05, "lm_loss": 1.9183, "loss": 2.0556, "mask_loss": 0.1268, "step": 1566, "topk_loss": 0.0105 }, { "epoch": 0.6228178583183769, "grad_norm": 0.126953125, "learning_rate": 6.453034485621904e-05, "lm_loss": 1.9571, "loss": 2.0942, "mask_loss": 0.1272, "step": 1567, "topk_loss": 0.0099 }, { "epoch": 0.623215317066506, "grad_norm": 0.1357421875, "learning_rate": 6.441121219698087e-05, "lm_loss": 1.9259, "loss": 2.0597, "mask_loss": 0.125, "step": 1568, "topk_loss": 0.0089 }, { "epoch": 0.6236127758146351, "grad_norm": 0.11376953125, "learning_rate": 6.429213734455683e-05, "lm_loss": 1.9268, "loss": 2.0627, "mask_loss": 0.126, "step": 1569, "topk_loss": 0.0099 }, { "epoch": 0.6240102345627643, "grad_norm": 0.1259765625, "learning_rate": 6.417312049236004e-05, "lm_loss": 1.9038, "loss": 2.0373, "mask_loss": 0.124, "step": 1570, "topk_loss": 0.0094 }, { "epoch": 0.6244076933108935, "grad_norm": 0.1220703125, "learning_rate": 6.405416183370936e-05, "lm_loss": 1.8937, "loss": 2.0307, "mask_loss": 0.1275, "step": 1571, "topk_loss": 0.0095 }, { "epoch": 0.6248051520590227, "grad_norm": 0.1435546875, "learning_rate": 6.393526156182918e-05, "lm_loss": 1.8852, "loss": 2.0238, "mask_loss": 0.1267, "step": 1572, "topk_loss": 0.012 }, { "epoch": 0.6252026108071518, "grad_norm": 0.1328125, "learning_rate": 6.381641986984901e-05, "lm_loss": 1.9827, "loss": 2.1183, "mask_loss": 0.1256, "step": 1573, "topk_loss": 0.0101 }, { "epoch": 0.6256000695552809, "grad_norm": 0.115234375, "learning_rate": 6.369763695080327e-05, "lm_loss": 1.9191, "loss": 2.0537, "mask_loss": 0.1248, "step": 1574, "topk_loss": 0.0098 }, { "epoch": 0.62599752830341, "grad_norm": 0.1220703125, "learning_rate": 6.357891299763086e-05, "lm_loss": 1.8599, "loss": 1.9967, "mask_loss": 0.1272, "step": 1575, "topk_loss": 0.0096 }, { "epoch": 0.6263949870515392, "grad_norm": 0.1630859375, "learning_rate": 6.346024820317488e-05, "lm_loss": 1.894, "loss": 2.0287, "mask_loss": 0.1251, "step": 1576, "topk_loss": 0.0096 }, { "epoch": 0.6267924457996684, "grad_norm": 0.1416015625, "learning_rate": 6.334164276018242e-05, "lm_loss": 1.8849, "loss": 2.0216, "mask_loss": 0.1267, "step": 1577, "topk_loss": 0.0099 }, { "epoch": 0.6271899045477975, "grad_norm": 0.126953125, "learning_rate": 6.32230968613041e-05, "lm_loss": 1.8525, "loss": 1.9909, "mask_loss": 0.1278, "step": 1578, "topk_loss": 0.0106 }, { "epoch": 0.6275873632959267, "grad_norm": 0.1650390625, "learning_rate": 6.310461069909384e-05, "lm_loss": 1.8428, "loss": 1.9792, "mask_loss": 0.1258, "step": 1579, "topk_loss": 0.0106 }, { "epoch": 0.6279848220440558, "grad_norm": 0.1845703125, "learning_rate": 6.298618446600856e-05, "lm_loss": 1.8653, "loss": 1.999, "mask_loss": 0.1242, "step": 1580, "topk_loss": 0.0095 }, { "epoch": 0.6283822807921849, "grad_norm": 0.1376953125, "learning_rate": 6.286781835440778e-05, "lm_loss": 1.9132, "loss": 2.0546, "mask_loss": 0.1285, "step": 1581, "topk_loss": 0.013 }, { "epoch": 0.6287797395403141, "grad_norm": 0.11962890625, "learning_rate": 6.274951255655344e-05, "lm_loss": 1.9113, "loss": 2.0469, "mask_loss": 0.1258, "step": 1582, "topk_loss": 0.0098 }, { "epoch": 0.6291771982884433, "grad_norm": 0.1142578125, "learning_rate": 6.263126726460945e-05, "lm_loss": 1.8686, "loss": 2.0055, "mask_loss": 0.1281, "step": 1583, "topk_loss": 0.0088 }, { "epoch": 0.6295746570365724, "grad_norm": 0.1376953125, "learning_rate": 6.251308267064143e-05, "lm_loss": 1.943, "loss": 2.087, "mask_loss": 0.1282, "step": 1584, "topk_loss": 0.0157 }, { "epoch": 0.6299721157847016, "grad_norm": 0.11767578125, "learning_rate": 6.239495896661643e-05, "lm_loss": 1.8985, "loss": 2.0348, "mask_loss": 0.127, "step": 1585, "topk_loss": 0.0094 }, { "epoch": 0.6303695745328307, "grad_norm": 0.1279296875, "learning_rate": 6.227689634440263e-05, "lm_loss": 1.893, "loss": 2.0332, "mask_loss": 0.1281, "step": 1586, "topk_loss": 0.012 }, { "epoch": 0.6307670332809598, "grad_norm": 0.1416015625, "learning_rate": 6.215889499576898e-05, "lm_loss": 1.8923, "loss": 2.0254, "mask_loss": 0.124, "step": 1587, "topk_loss": 0.0091 }, { "epoch": 0.631164492029089, "grad_norm": 0.1455078125, "learning_rate": 6.204095511238487e-05, "lm_loss": 1.8733, "loss": 2.0108, "mask_loss": 0.1275, "step": 1588, "topk_loss": 0.01 }, { "epoch": 0.6315619507772181, "grad_norm": 0.12109375, "learning_rate": 6.192307688581989e-05, "lm_loss": 1.9041, "loss": 2.0397, "mask_loss": 0.1248, "step": 1589, "topk_loss": 0.0109 }, { "epoch": 0.6319594095253473, "grad_norm": 0.119140625, "learning_rate": 6.180526050754346e-05, "lm_loss": 1.9108, "loss": 2.045, "mask_loss": 0.1254, "step": 1590, "topk_loss": 0.0088 }, { "epoch": 0.6323568682734765, "grad_norm": 0.138671875, "learning_rate": 6.168750616892459e-05, "lm_loss": 1.9126, "loss": 2.048, "mask_loss": 0.1255, "step": 1591, "topk_loss": 0.0099 }, { "epoch": 0.6327543270216056, "grad_norm": 0.1259765625, "learning_rate": 6.156981406123137e-05, "lm_loss": 1.9135, "loss": 2.0517, "mask_loss": 0.1274, "step": 1592, "topk_loss": 0.0108 }, { "epoch": 0.6331517857697347, "grad_norm": 0.12890625, "learning_rate": 6.145218437563097e-05, "lm_loss": 1.9114, "loss": 2.046, "mask_loss": 0.1257, "step": 1593, "topk_loss": 0.0089 }, { "epoch": 0.6335492445178639, "grad_norm": 0.130859375, "learning_rate": 6.133461730318911e-05, "lm_loss": 1.9025, "loss": 2.0381, "mask_loss": 0.1265, "step": 1594, "topk_loss": 0.0091 }, { "epoch": 0.633946703265993, "grad_norm": 0.138671875, "learning_rate": 6.12171130348698e-05, "lm_loss": 1.9223, "loss": 2.0587, "mask_loss": 0.1267, "step": 1595, "topk_loss": 0.0097 }, { "epoch": 0.6343441620141222, "grad_norm": 0.1474609375, "learning_rate": 6.109967176153506e-05, "lm_loss": 1.8486, "loss": 1.9804, "mask_loss": 0.1234, "step": 1596, "topk_loss": 0.0084 }, { "epoch": 0.6347416207622514, "grad_norm": 0.11376953125, "learning_rate": 6.0982293673944544e-05, "lm_loss": 1.9579, "loss": 2.0944, "mask_loss": 0.1267, "step": 1597, "topk_loss": 0.0098 }, { "epoch": 0.6351390795103805, "grad_norm": 0.11572265625, "learning_rate": 6.0864978962755335e-05, "lm_loss": 1.8544, "loss": 1.9925, "mask_loss": 0.1269, "step": 1598, "topk_loss": 0.0112 }, { "epoch": 0.6355365382585096, "grad_norm": 0.11474609375, "learning_rate": 6.074772781852158e-05, "lm_loss": 1.9056, "loss": 2.0393, "mask_loss": 0.1248, "step": 1599, "topk_loss": 0.0089 }, { "epoch": 0.6359339970066388, "grad_norm": 0.1376953125, "learning_rate": 6.0630540431694026e-05, "lm_loss": 1.8817, "loss": 2.0173, "mask_loss": 0.1259, "step": 1600, "topk_loss": 0.0097 }, { "epoch": 0.6359339970066388, "eval_lm_loss": 686.8424682617188, "eval_loss": 686.9800415039062, "eval_mask_hit_rate": 0.534833550453186, "eval_mask_loss": 0.12482748180627823, "eval_mask_top_10_hit_rate": 0.9854109287261963, "eval_mask_top_1_hit_rate": 0.9973936080932617, "eval_mask_top_20_hit_rate": 0.9759554862976074, "eval_mask_top_5_hit_rate": 0.9907490015029907, "eval_runtime": 144.1701, "eval_samples_per_second": 14.205, "eval_steps_per_second": 7.103, "eval_token_accuracy": 0.6140152215957642, "eval_top_k_diff": -523.09375, "eval_topk_loss": 0.01275416649878025, "step": 1600 }, { "epoch": 0.6363314557547679, "grad_norm": 0.123046875, "learning_rate": 6.051341699262003e-05, "lm_loss": 1.8735, "loss": 2.0167, "mask_loss": 0.1314, "step": 1601, "topk_loss": 0.0119 }, { "epoch": 0.6367289145028971, "grad_norm": 0.146484375, "learning_rate": 6.039635769154301e-05, "lm_loss": 1.8738, "loss": 2.0069, "mask_loss": 0.1247, "step": 1602, "topk_loss": 0.0085 }, { "epoch": 0.6371263732510263, "grad_norm": 0.1806640625, "learning_rate": 6.027936271860223e-05, "lm_loss": 1.9179, "loss": 2.0517, "mask_loss": 0.1249, "step": 1603, "topk_loss": 0.0089 }, { "epoch": 0.6375238319991554, "grad_norm": 0.115234375, "learning_rate": 6.016243226383241e-05, "lm_loss": 1.9301, "loss": 2.0631, "mask_loss": 0.1237, "step": 1604, "topk_loss": 0.0093 }, { "epoch": 0.6379212907472845, "grad_norm": 0.111328125, "learning_rate": 6.004556651716354e-05, "lm_loss": 1.8686, "loss": 2.0035, "mask_loss": 0.1266, "step": 1605, "topk_loss": 0.0083 }, { "epoch": 0.6383187494954137, "grad_norm": 0.138671875, "learning_rate": 5.992876566842047e-05, "lm_loss": 1.9002, "loss": 2.0353, "mask_loss": 0.1248, "step": 1606, "topk_loss": 0.0103 }, { "epoch": 0.6387162082435428, "grad_norm": 0.11328125, "learning_rate": 5.981202990732267e-05, "lm_loss": 1.9001, "loss": 2.0344, "mask_loss": 0.1255, "step": 1607, "topk_loss": 0.0087 }, { "epoch": 0.639113666991672, "grad_norm": 0.1318359375, "learning_rate": 5.969535942348379e-05, "lm_loss": 1.943, "loss": 2.0812, "mask_loss": 0.1265, "step": 1608, "topk_loss": 0.0116 }, { "epoch": 0.6395111257398012, "grad_norm": 0.1171875, "learning_rate": 5.957875440641155e-05, "lm_loss": 1.9564, "loss": 2.0918, "mask_loss": 0.1257, "step": 1609, "topk_loss": 0.0096 }, { "epoch": 0.6399085844879303, "grad_norm": 0.13671875, "learning_rate": 5.946221504550732e-05, "lm_loss": 1.8903, "loss": 2.0278, "mask_loss": 0.1279, "step": 1610, "topk_loss": 0.0096 }, { "epoch": 0.6403060432360594, "grad_norm": 0.1484375, "learning_rate": 5.934574153006579e-05, "lm_loss": 1.9521, "loss": 2.0879, "mask_loss": 0.1263, "step": 1611, "topk_loss": 0.0095 }, { "epoch": 0.6407035019841886, "grad_norm": 0.1162109375, "learning_rate": 5.922933404927473e-05, "lm_loss": 1.9203, "loss": 2.0542, "mask_loss": 0.1239, "step": 1612, "topk_loss": 0.01 }, { "epoch": 0.6411009607323177, "grad_norm": 0.11865234375, "learning_rate": 5.911299279221463e-05, "lm_loss": 1.8034, "loss": 1.9406, "mask_loss": 0.128, "step": 1613, "topk_loss": 0.0092 }, { "epoch": 0.641498419480447, "grad_norm": 0.1181640625, "learning_rate": 5.899671794785839e-05, "lm_loss": 1.9372, "loss": 2.0712, "mask_loss": 0.1243, "step": 1614, "topk_loss": 0.0096 }, { "epoch": 0.6418958782285761, "grad_norm": 0.14453125, "learning_rate": 5.888050970507114e-05, "lm_loss": 1.8709, "loss": 2.0097, "mask_loss": 0.1276, "step": 1615, "topk_loss": 0.0111 }, { "epoch": 0.6422933369767052, "grad_norm": 0.1162109375, "learning_rate": 5.876436825260967e-05, "lm_loss": 1.9412, "loss": 2.0764, "mask_loss": 0.1249, "step": 1616, "topk_loss": 0.0103 }, { "epoch": 0.6426907957248343, "grad_norm": 0.1162109375, "learning_rate": 5.86482937791224e-05, "lm_loss": 1.8946, "loss": 2.0269, "mask_loss": 0.1251, "step": 1617, "topk_loss": 0.0072 }, { "epoch": 0.6430882544729635, "grad_norm": 0.1376953125, "learning_rate": 5.85322864731489e-05, "lm_loss": 1.9194, "loss": 2.0567, "mask_loss": 0.1281, "step": 1618, "topk_loss": 0.0092 }, { "epoch": 0.6434857132210926, "grad_norm": 0.12353515625, "learning_rate": 5.841634652311969e-05, "lm_loss": 1.8678, "loss": 2.0049, "mask_loss": 0.1277, "step": 1619, "topk_loss": 0.0095 }, { "epoch": 0.6438831719692217, "grad_norm": 0.1123046875, "learning_rate": 5.830047411735588e-05, "lm_loss": 1.9259, "loss": 2.0574, "mask_loss": 0.1223, "step": 1620, "topk_loss": 0.0093 }, { "epoch": 0.644280630717351, "grad_norm": 0.1201171875, "learning_rate": 5.818466944406877e-05, "lm_loss": 1.8894, "loss": 2.0248, "mask_loss": 0.1255, "step": 1621, "topk_loss": 0.0099 }, { "epoch": 0.6446780894654801, "grad_norm": 0.11376953125, "learning_rate": 5.8068932691359753e-05, "lm_loss": 1.9152, "loss": 2.0512, "mask_loss": 0.1264, "step": 1622, "topk_loss": 0.0096 }, { "epoch": 0.6450755482136092, "grad_norm": 0.12158203125, "learning_rate": 5.795326404721988e-05, "lm_loss": 1.9404, "loss": 2.0807, "mask_loss": 0.1271, "step": 1623, "topk_loss": 0.0132 }, { "epoch": 0.6454730069617384, "grad_norm": 0.12109375, "learning_rate": 5.783766369952952e-05, "lm_loss": 1.8673, "loss": 1.9987, "mask_loss": 0.1229, "step": 1624, "topk_loss": 0.0086 }, { "epoch": 0.6458704657098675, "grad_norm": 0.11279296875, "learning_rate": 5.772213183605817e-05, "lm_loss": 1.9092, "loss": 2.0442, "mask_loss": 0.1246, "step": 1625, "topk_loss": 0.0104 }, { "epoch": 0.6462679244579966, "grad_norm": 0.11669921875, "learning_rate": 5.760666864446403e-05, "lm_loss": 1.8945, "loss": 2.0302, "mask_loss": 0.1266, "step": 1626, "topk_loss": 0.009 }, { "epoch": 0.6466653832061259, "grad_norm": 0.130859375, "learning_rate": 5.7491274312293816e-05, "lm_loss": 1.8766, "loss": 2.0137, "mask_loss": 0.1274, "step": 1627, "topk_loss": 0.0097 }, { "epoch": 0.647062841954255, "grad_norm": 0.11572265625, "learning_rate": 5.7375949026982365e-05, "lm_loss": 1.9554, "loss": 2.092, "mask_loss": 0.1258, "step": 1628, "topk_loss": 0.0108 }, { "epoch": 0.6474603007023841, "grad_norm": 0.1298828125, "learning_rate": 5.726069297585235e-05, "lm_loss": 1.8913, "loss": 2.0263, "mask_loss": 0.1246, "step": 1629, "topk_loss": 0.0103 }, { "epoch": 0.6478577594505133, "grad_norm": 0.2333984375, "learning_rate": 5.714550634611401e-05, "lm_loss": 1.8887, "loss": 2.0472, "mask_loss": 0.1386, "step": 1630, "topk_loss": 0.0199 }, { "epoch": 0.6482552181986424, "grad_norm": 0.11669921875, "learning_rate": 5.703038932486484e-05, "lm_loss": 1.8684, "loss": 2.0043, "mask_loss": 0.1265, "step": 1631, "topk_loss": 0.0094 }, { "epoch": 0.6486526769467715, "grad_norm": 0.169921875, "learning_rate": 5.691534209908919e-05, "lm_loss": 1.904, "loss": 2.0369, "mask_loss": 0.1234, "step": 1632, "topk_loss": 0.0096 }, { "epoch": 0.6490501356949008, "grad_norm": 0.1767578125, "learning_rate": 5.680036485565811e-05, "lm_loss": 1.9141, "loss": 2.048, "mask_loss": 0.1243, "step": 1633, "topk_loss": 0.0096 }, { "epoch": 0.6494475944430299, "grad_norm": 0.1357421875, "learning_rate": 5.668545778132897e-05, "lm_loss": 1.8989, "loss": 2.0371, "mask_loss": 0.1274, "step": 1634, "topk_loss": 0.0108 }, { "epoch": 0.649845053191159, "grad_norm": 0.1796875, "learning_rate": 5.6570621062745146e-05, "lm_loss": 1.9017, "loss": 2.0359, "mask_loss": 0.1239, "step": 1635, "topk_loss": 0.0103 }, { "epoch": 0.6502425119392882, "grad_norm": 0.12158203125, "learning_rate": 5.6455854886435765e-05, "lm_loss": 1.869, "loss": 2.0046, "mask_loss": 0.1262, "step": 1636, "topk_loss": 0.0094 }, { "epoch": 0.6506399706874173, "grad_norm": 0.1259765625, "learning_rate": 5.634115943881535e-05, "lm_loss": 1.9075, "loss": 2.0437, "mask_loss": 0.1252, "step": 1637, "topk_loss": 0.011 }, { "epoch": 0.6510374294355464, "grad_norm": 0.15625, "learning_rate": 5.622653490618353e-05, "lm_loss": 1.982, "loss": 2.1171, "mask_loss": 0.1254, "step": 1638, "topk_loss": 0.0097 }, { "epoch": 0.6514348881836757, "grad_norm": 0.1279296875, "learning_rate": 5.611198147472481e-05, "lm_loss": 1.9015, "loss": 2.04, "mask_loss": 0.1291, "step": 1639, "topk_loss": 0.0094 }, { "epoch": 0.6518323469318048, "grad_norm": 0.1533203125, "learning_rate": 5.5997499330508066e-05, "lm_loss": 1.8138, "loss": 1.9527, "mask_loss": 0.1291, "step": 1640, "topk_loss": 0.0098 }, { "epoch": 0.6522298056799339, "grad_norm": 0.1376953125, "learning_rate": 5.5883088659486525e-05, "lm_loss": 1.8897, "loss": 2.0261, "mask_loss": 0.1262, "step": 1641, "topk_loss": 0.0102 }, { "epoch": 0.6526272644280631, "grad_norm": 0.11376953125, "learning_rate": 5.576874964749727e-05, "lm_loss": 1.8951, "loss": 2.0294, "mask_loss": 0.1247, "step": 1642, "topk_loss": 0.0096 }, { "epoch": 0.6530247231761922, "grad_norm": 0.12451171875, "learning_rate": 5.5654482480260964e-05, "lm_loss": 1.923, "loss": 2.0618, "mask_loss": 0.1273, "step": 1643, "topk_loss": 0.0115 }, { "epoch": 0.6534221819243213, "grad_norm": 0.1474609375, "learning_rate": 5.5540287343381606e-05, "lm_loss": 1.8869, "loss": 2.0227, "mask_loss": 0.1261, "step": 1644, "topk_loss": 0.0096 }, { "epoch": 0.6538196406724506, "grad_norm": 0.1982421875, "learning_rate": 5.542616442234618e-05, "lm_loss": 1.8922, "loss": 2.047, "mask_loss": 0.135, "step": 1645, "topk_loss": 0.0198 }, { "epoch": 0.6542170994205797, "grad_norm": 0.1396484375, "learning_rate": 5.531211390252438e-05, "lm_loss": 1.9581, "loss": 2.0917, "mask_loss": 0.1247, "step": 1646, "topk_loss": 0.009 }, { "epoch": 0.6546145581687088, "grad_norm": 0.12451171875, "learning_rate": 5.519813596916833e-05, "lm_loss": 1.9184, "loss": 2.0532, "mask_loss": 0.1247, "step": 1647, "topk_loss": 0.0101 }, { "epoch": 0.655012016916838, "grad_norm": 0.125, "learning_rate": 5.5084230807412126e-05, "lm_loss": 1.9282, "loss": 2.0674, "mask_loss": 0.1273, "step": 1648, "topk_loss": 0.012 }, { "epoch": 0.6554094756649671, "grad_norm": 0.14453125, "learning_rate": 5.497039860227181e-05, "lm_loss": 1.8658, "loss": 1.9997, "mask_loss": 0.124, "step": 1649, "topk_loss": 0.0098 }, { "epoch": 0.6558069344130962, "grad_norm": 0.11376953125, "learning_rate": 5.485663953864484e-05, "lm_loss": 1.9037, "loss": 2.0386, "mask_loss": 0.1262, "step": 1650, "topk_loss": 0.0087 }, { "epoch": 0.6558069344130962, "eval_lm_loss": 687.2271118164062, "eval_loss": 687.3645629882812, "eval_mask_hit_rate": 0.5350625514984131, "eval_mask_loss": 0.12471777945756912, "eval_mask_top_10_hit_rate": 0.9854594469070435, "eval_mask_top_1_hit_rate": 0.9974088668823242, "eval_mask_top_20_hit_rate": 0.9760203957557678, "eval_mask_top_5_hit_rate": 0.9907782077789307, "eval_runtime": 144.1065, "eval_samples_per_second": 14.212, "eval_steps_per_second": 7.106, "eval_token_accuracy": 0.6141659617424011, "eval_top_k_diff": -526.4982299804688, "eval_topk_loss": 0.012702615931630135, "step": 1650 }, { "epoch": 0.6562043931612254, "grad_norm": 0.111328125, "learning_rate": 5.474295380130989e-05, "lm_loss": 1.8784, "loss": 2.013, "mask_loss": 0.1255, "step": 1651, "topk_loss": 0.0091 }, { "epoch": 0.6566018519093546, "grad_norm": 0.1220703125, "learning_rate": 5.462934157492656e-05, "lm_loss": 1.9161, "loss": 2.0519, "mask_loss": 0.1261, "step": 1652, "topk_loss": 0.0097 }, { "epoch": 0.6569993106574837, "grad_norm": 0.1484375, "learning_rate": 5.4515803044034985e-05, "lm_loss": 1.9306, "loss": 2.0692, "mask_loss": 0.1257, "step": 1653, "topk_loss": 0.0129 }, { "epoch": 0.6573967694056129, "grad_norm": 0.1162109375, "learning_rate": 5.440233839305564e-05, "lm_loss": 1.9124, "loss": 2.045, "mask_loss": 0.1233, "step": 1654, "topk_loss": 0.0093 }, { "epoch": 0.657794228153742, "grad_norm": 0.134765625, "learning_rate": 5.428894780628899e-05, "lm_loss": 1.8946, "loss": 2.0295, "mask_loss": 0.1243, "step": 1655, "topk_loss": 0.0105 }, { "epoch": 0.6581916869018711, "grad_norm": 0.10888671875, "learning_rate": 5.417563146791519e-05, "lm_loss": 1.9002, "loss": 2.0332, "mask_loss": 0.1238, "step": 1656, "topk_loss": 0.0092 }, { "epoch": 0.6585891456500003, "grad_norm": 0.11376953125, "learning_rate": 5.4062389561993786e-05, "lm_loss": 1.9347, "loss": 2.0702, "mask_loss": 0.1256, "step": 1657, "topk_loss": 0.0099 }, { "epoch": 0.6589866043981295, "grad_norm": 0.1103515625, "learning_rate": 5.3949222272463464e-05, "lm_loss": 1.9164, "loss": 2.0519, "mask_loss": 0.1258, "step": 1658, "topk_loss": 0.0097 }, { "epoch": 0.6593840631462586, "grad_norm": 0.1416015625, "learning_rate": 5.383612978314164e-05, "lm_loss": 1.8827, "loss": 2.0163, "mask_loss": 0.1246, "step": 1659, "topk_loss": 0.0089 }, { "epoch": 0.6597815218943878, "grad_norm": 0.11474609375, "learning_rate": 5.372311227772431e-05, "lm_loss": 1.9462, "loss": 2.0799, "mask_loss": 0.1236, "step": 1660, "topk_loss": 0.0101 }, { "epoch": 0.6601789806425169, "grad_norm": 0.1201171875, "learning_rate": 5.3610169939785615e-05, "lm_loss": 1.9548, "loss": 2.0921, "mask_loss": 0.126, "step": 1661, "topk_loss": 0.0113 }, { "epoch": 0.660576439390646, "grad_norm": 0.11767578125, "learning_rate": 5.349730295277764e-05, "lm_loss": 1.8905, "loss": 2.025, "mask_loss": 0.1253, "step": 1662, "topk_loss": 0.0093 }, { "epoch": 0.6609738981387752, "grad_norm": 0.115234375, "learning_rate": 5.338451150003008e-05, "lm_loss": 1.868, "loss": 2.0035, "mask_loss": 0.1255, "step": 1663, "topk_loss": 0.01 }, { "epoch": 0.6613713568869044, "grad_norm": 0.1474609375, "learning_rate": 5.3271795764749856e-05, "lm_loss": 1.9277, "loss": 2.0632, "mask_loss": 0.1252, "step": 1664, "topk_loss": 0.0102 }, { "epoch": 0.6617688156350335, "grad_norm": 0.11962890625, "learning_rate": 5.3159155930021e-05, "lm_loss": 1.9127, "loss": 2.0476, "mask_loss": 0.1254, "step": 1665, "topk_loss": 0.0095 }, { "epoch": 0.6621662743831627, "grad_norm": 0.166015625, "learning_rate": 5.304659217880423e-05, "lm_loss": 1.905, "loss": 2.0432, "mask_loss": 0.128, "step": 1666, "topk_loss": 0.0102 }, { "epoch": 0.6625637331312918, "grad_norm": 0.123046875, "learning_rate": 5.293410469393667e-05, "lm_loss": 1.8786, "loss": 2.0161, "mask_loss": 0.1277, "step": 1667, "topk_loss": 0.0097 }, { "epoch": 0.6629611918794209, "grad_norm": 0.11181640625, "learning_rate": 5.282169365813158e-05, "lm_loss": 1.9189, "loss": 2.0558, "mask_loss": 0.1259, "step": 1668, "topk_loss": 0.011 }, { "epoch": 0.6633586506275501, "grad_norm": 0.11474609375, "learning_rate": 5.270935925397802e-05, "lm_loss": 1.9022, "loss": 2.0397, "mask_loss": 0.1267, "step": 1669, "topk_loss": 0.0108 }, { "epoch": 0.6637561093756793, "grad_norm": 0.1513671875, "learning_rate": 5.259710166394062e-05, "lm_loss": 1.886, "loss": 2.0255, "mask_loss": 0.1273, "step": 1670, "topk_loss": 0.0122 }, { "epoch": 0.6641535681238084, "grad_norm": 0.115234375, "learning_rate": 5.2484921070359226e-05, "lm_loss": 1.8727, "loss": 2.0109, "mask_loss": 0.1276, "step": 1671, "topk_loss": 0.0107 }, { "epoch": 0.6645510268719376, "grad_norm": 0.146484375, "learning_rate": 5.237281765544852e-05, "lm_loss": 1.8136, "loss": 1.9479, "mask_loss": 0.1251, "step": 1672, "topk_loss": 0.0092 }, { "epoch": 0.6649484856200667, "grad_norm": 0.109375, "learning_rate": 5.2260791601298e-05, "lm_loss": 1.8949, "loss": 2.0347, "mask_loss": 0.1284, "step": 1673, "topk_loss": 0.0115 }, { "epoch": 0.6653459443681958, "grad_norm": 0.1162109375, "learning_rate": 5.214884308987136e-05, "lm_loss": 1.8865, "loss": 2.0207, "mask_loss": 0.1248, "step": 1674, "topk_loss": 0.0094 }, { "epoch": 0.665743403116325, "grad_norm": 0.130859375, "learning_rate": 5.2036972303006426e-05, "lm_loss": 1.8873, "loss": 2.024, "mask_loss": 0.1266, "step": 1675, "topk_loss": 0.01 }, { "epoch": 0.6661408618644541, "grad_norm": 0.130859375, "learning_rate": 5.192517942241474e-05, "lm_loss": 1.9327, "loss": 2.0659, "mask_loss": 0.1239, "step": 1676, "topk_loss": 0.0092 }, { "epoch": 0.6665383206125833, "grad_norm": 0.11279296875, "learning_rate": 5.181346462968131e-05, "lm_loss": 1.8702, "loss": 2.0044, "mask_loss": 0.125, "step": 1677, "topk_loss": 0.0092 }, { "epoch": 0.6669357793607125, "grad_norm": 0.1123046875, "learning_rate": 5.1701828106264305e-05, "lm_loss": 1.9382, "loss": 2.0733, "mask_loss": 0.1237, "step": 1678, "topk_loss": 0.0114 }, { "epoch": 0.6673332381088416, "grad_norm": 0.1279296875, "learning_rate": 5.159027003349479e-05, "lm_loss": 1.9099, "loss": 2.0431, "mask_loss": 0.1241, "step": 1679, "topk_loss": 0.0091 }, { "epoch": 0.6677306968569707, "grad_norm": 0.1455078125, "learning_rate": 5.147879059257632e-05, "lm_loss": 1.9072, "loss": 2.0402, "mask_loss": 0.1238, "step": 1680, "topk_loss": 0.0092 }, { "epoch": 0.6681281556050999, "grad_norm": 0.13671875, "learning_rate": 5.13673899645848e-05, "lm_loss": 1.8836, "loss": 2.0254, "mask_loss": 0.1291, "step": 1681, "topk_loss": 0.0128 }, { "epoch": 0.668525614353229, "grad_norm": 0.10986328125, "learning_rate": 5.12560683304681e-05, "lm_loss": 1.9045, "loss": 2.0388, "mask_loss": 0.1247, "step": 1682, "topk_loss": 0.0096 }, { "epoch": 0.6689230731013582, "grad_norm": 0.1171875, "learning_rate": 5.1144825871045796e-05, "lm_loss": 1.8755, "loss": 2.0099, "mask_loss": 0.1265, "step": 1683, "topk_loss": 0.0079 }, { "epoch": 0.6693205318494874, "grad_norm": 0.15234375, "learning_rate": 5.103366276700884e-05, "lm_loss": 1.9191, "loss": 2.0561, "mask_loss": 0.1273, "step": 1684, "topk_loss": 0.0097 }, { "epoch": 0.6697179905976165, "grad_norm": 0.12060546875, "learning_rate": 5.092257919891929e-05, "lm_loss": 1.9203, "loss": 2.0551, "mask_loss": 0.1249, "step": 1685, "topk_loss": 0.0099 }, { "epoch": 0.6701154493457456, "grad_norm": 0.11376953125, "learning_rate": 5.081157534721002e-05, "lm_loss": 1.8822, "loss": 2.0158, "mask_loss": 0.124, "step": 1686, "topk_loss": 0.0096 }, { "epoch": 0.6705129080938748, "grad_norm": 0.1552734375, "learning_rate": 5.070065139218443e-05, "lm_loss": 1.8402, "loss": 1.9812, "mask_loss": 0.1281, "step": 1687, "topk_loss": 0.0129 }, { "epoch": 0.6709103668420039, "grad_norm": 0.1103515625, "learning_rate": 5.05898075140161e-05, "lm_loss": 1.9395, "loss": 2.0754, "mask_loss": 0.1265, "step": 1688, "topk_loss": 0.0094 }, { "epoch": 0.6713078255901331, "grad_norm": 0.1328125, "learning_rate": 5.04790438927486e-05, "lm_loss": 1.8916, "loss": 2.0311, "mask_loss": 0.1285, "step": 1689, "topk_loss": 0.011 }, { "epoch": 0.6717052843382623, "grad_norm": 0.11767578125, "learning_rate": 5.036836070829512e-05, "lm_loss": 1.8803, "loss": 2.0128, "mask_loss": 0.1232, "step": 1690, "topk_loss": 0.0093 }, { "epoch": 0.6721027430863914, "grad_norm": 0.1162109375, "learning_rate": 5.025775814043816e-05, "lm_loss": 1.883, "loss": 2.0221, "mask_loss": 0.1281, "step": 1691, "topk_loss": 0.011 }, { "epoch": 0.6725002018345205, "grad_norm": 0.12158203125, "learning_rate": 5.014723636882932e-05, "lm_loss": 1.9106, "loss": 2.0437, "mask_loss": 0.1237, "step": 1692, "topk_loss": 0.0094 }, { "epoch": 0.6728976605826497, "grad_norm": 0.12890625, "learning_rate": 5.003679557298896e-05, "lm_loss": 1.8627, "loss": 2.0023, "mask_loss": 0.1283, "step": 1693, "topk_loss": 0.0113 }, { "epoch": 0.6732951193307788, "grad_norm": 0.125, "learning_rate": 4.992643593230587e-05, "lm_loss": 1.9231, "loss": 2.0596, "mask_loss": 0.1272, "step": 1694, "topk_loss": 0.0093 }, { "epoch": 0.673692578078908, "grad_norm": 0.1318359375, "learning_rate": 4.98161576260371e-05, "lm_loss": 1.9269, "loss": 2.0619, "mask_loss": 0.1244, "step": 1695, "topk_loss": 0.0106 }, { "epoch": 0.6740900368270372, "grad_norm": 0.1103515625, "learning_rate": 4.9705960833307455e-05, "lm_loss": 1.9213, "loss": 2.0574, "mask_loss": 0.1252, "step": 1696, "topk_loss": 0.0109 }, { "epoch": 0.6744874955751663, "grad_norm": 0.11962890625, "learning_rate": 4.9595845733109455e-05, "lm_loss": 1.9245, "loss": 2.0593, "mask_loss": 0.1258, "step": 1697, "topk_loss": 0.009 }, { "epoch": 0.6748849543232954, "grad_norm": 0.109375, "learning_rate": 4.948581250430291e-05, "lm_loss": 1.9287, "loss": 2.0645, "mask_loss": 0.1248, "step": 1698, "topk_loss": 0.011 }, { "epoch": 0.6752824130714246, "grad_norm": 0.1279296875, "learning_rate": 4.9375861325614606e-05, "lm_loss": 1.951, "loss": 2.0861, "mask_loss": 0.1255, "step": 1699, "topk_loss": 0.0097 }, { "epoch": 0.6756798718195537, "grad_norm": 0.140625, "learning_rate": 4.926599237563807e-05, "lm_loss": 1.9685, "loss": 2.1018, "mask_loss": 0.1235, "step": 1700, "topk_loss": 0.0098 }, { "epoch": 0.6756798718195537, "eval_lm_loss": 687.1553955078125, "eval_loss": 687.2926025390625, "eval_mask_hit_rate": 0.5353307723999023, "eval_mask_loss": 0.1246270090341568, "eval_mask_top_10_hit_rate": 0.985496997833252, "eval_mask_top_1_hit_rate": 0.9974105358123779, "eval_mask_top_20_hit_rate": 0.9760777950286865, "eval_mask_top_5_hit_rate": 0.9908045530319214, "eval_runtime": 144.3717, "eval_samples_per_second": 14.186, "eval_steps_per_second": 7.093, "eval_token_accuracy": 0.6143229007720947, "eval_top_k_diff": -527.6005859375, "eval_topk_loss": 0.012607835233211517, "step": 1700 }, { "epoch": 0.6760773305676829, "grad_norm": 0.12158203125, "learning_rate": 4.915620583283329e-05, "lm_loss": 1.8965, "loss": 2.0348, "mask_loss": 0.1282, "step": 1701, "topk_loss": 0.01 }, { "epoch": 0.6764747893158121, "grad_norm": 0.1494140625, "learning_rate": 4.904650187552637e-05, "lm_loss": 1.866, "loss": 2.0022, "mask_loss": 0.1255, "step": 1702, "topk_loss": 0.0106 }, { "epoch": 0.6768722480639412, "grad_norm": 0.11083984375, "learning_rate": 4.893688068190932e-05, "lm_loss": 1.8394, "loss": 1.9773, "mask_loss": 0.1282, "step": 1703, "topk_loss": 0.0096 }, { "epoch": 0.6772697068120703, "grad_norm": 0.1337890625, "learning_rate": 4.8827342430039624e-05, "lm_loss": 1.8682, "loss": 2.0079, "mask_loss": 0.1286, "step": 1704, "topk_loss": 0.0111 }, { "epoch": 0.6776671655601995, "grad_norm": 0.1259765625, "learning_rate": 4.87178872978401e-05, "lm_loss": 1.8977, "loss": 2.0312, "mask_loss": 0.1247, "step": 1705, "topk_loss": 0.0088 }, { "epoch": 0.6780646243083286, "grad_norm": 0.138671875, "learning_rate": 4.860851546309858e-05, "lm_loss": 1.8222, "loss": 1.96, "mask_loss": 0.1259, "step": 1706, "topk_loss": 0.0119 }, { "epoch": 0.6784620830564577, "grad_norm": 0.138671875, "learning_rate": 4.8499227103467574e-05, "lm_loss": 1.8512, "loss": 1.9842, "mask_loss": 0.1239, "step": 1707, "topk_loss": 0.0091 }, { "epoch": 0.678859541804587, "grad_norm": 0.12353515625, "learning_rate": 4.8390022396463965e-05, "lm_loss": 1.8797, "loss": 2.0186, "mask_loss": 0.1273, "step": 1708, "topk_loss": 0.0116 }, { "epoch": 0.6792570005527161, "grad_norm": 0.1357421875, "learning_rate": 4.828090151946882e-05, "lm_loss": 1.9118, "loss": 2.0482, "mask_loss": 0.1246, "step": 1709, "topk_loss": 0.0118 }, { "epoch": 0.6796544593008452, "grad_norm": 0.130859375, "learning_rate": 4.817186464972702e-05, "lm_loss": 1.958, "loss": 2.093, "mask_loss": 0.1255, "step": 1710, "topk_loss": 0.0095 }, { "epoch": 0.6800519180489744, "grad_norm": 0.1748046875, "learning_rate": 4.8062911964347004e-05, "lm_loss": 1.9112, "loss": 2.0502, "mask_loss": 0.1251, "step": 1711, "topk_loss": 0.0139 }, { "epoch": 0.6804493767971035, "grad_norm": 0.1494140625, "learning_rate": 4.7954043640300394e-05, "lm_loss": 1.8771, "loss": 2.0308, "mask_loss": 0.1357, "step": 1712, "topk_loss": 0.018 }, { "epoch": 0.6808468355452326, "grad_norm": 0.1513671875, "learning_rate": 4.7845259854421875e-05, "lm_loss": 1.9122, "loss": 2.0563, "mask_loss": 0.1302, "step": 1713, "topk_loss": 0.0139 }, { "epoch": 0.6812442942933619, "grad_norm": 0.11669921875, "learning_rate": 4.773656078340879e-05, "lm_loss": 1.9049, "loss": 2.0427, "mask_loss": 0.1274, "step": 1714, "topk_loss": 0.0104 }, { "epoch": 0.681641753041491, "grad_norm": 0.11962890625, "learning_rate": 4.762794660382086e-05, "lm_loss": 1.9272, "loss": 2.0626, "mask_loss": 0.1263, "step": 1715, "topk_loss": 0.0092 }, { "epoch": 0.6820392117896201, "grad_norm": 0.1259765625, "learning_rate": 4.751941749207995e-05, "lm_loss": 1.9171, "loss": 2.0481, "mask_loss": 0.123, "step": 1716, "topk_loss": 0.0079 }, { "epoch": 0.6824366705377493, "grad_norm": 0.1513671875, "learning_rate": 4.741097362446973e-05, "lm_loss": 1.9016, "loss": 2.0418, "mask_loss": 0.1281, "step": 1717, "topk_loss": 0.0122 }, { "epoch": 0.6828341292858784, "grad_norm": 0.1416015625, "learning_rate": 4.730261517713541e-05, "lm_loss": 1.8347, "loss": 1.9794, "mask_loss": 0.1306, "step": 1718, "topk_loss": 0.0141 }, { "epoch": 0.6832315880340075, "grad_norm": 0.1533203125, "learning_rate": 4.719434232608345e-05, "lm_loss": 1.8892, "loss": 2.024, "mask_loss": 0.1244, "step": 1719, "topk_loss": 0.0104 }, { "epoch": 0.6836290467821368, "grad_norm": 0.1328125, "learning_rate": 4.708615524718128e-05, "lm_loss": 1.9153, "loss": 2.0522, "mask_loss": 0.1255, "step": 1720, "topk_loss": 0.0114 }, { "epoch": 0.6840265055302659, "grad_norm": 0.1259765625, "learning_rate": 4.6978054116156987e-05, "lm_loss": 1.9998, "loss": 2.1383, "mask_loss": 0.1255, "step": 1721, "topk_loss": 0.013 }, { "epoch": 0.684423964278395, "grad_norm": 0.1123046875, "learning_rate": 4.687003910859911e-05, "lm_loss": 1.9427, "loss": 2.0796, "mask_loss": 0.127, "step": 1722, "topk_loss": 0.0099 }, { "epoch": 0.6848214230265242, "grad_norm": 0.1240234375, "learning_rate": 4.676211039995623e-05, "lm_loss": 1.893, "loss": 2.027, "mask_loss": 0.1244, "step": 1723, "topk_loss": 0.0096 }, { "epoch": 0.6852188817746533, "grad_norm": 0.13671875, "learning_rate": 4.6654268165536805e-05, "lm_loss": 1.8517, "loss": 1.9876, "mask_loss": 0.1266, "step": 1724, "topk_loss": 0.0092 }, { "epoch": 0.6856163405227824, "grad_norm": 0.126953125, "learning_rate": 4.6546512580508804e-05, "lm_loss": 1.9012, "loss": 2.0353, "mask_loss": 0.1243, "step": 1725, "topk_loss": 0.0098 }, { "epoch": 0.6860137992709117, "grad_norm": 0.12109375, "learning_rate": 4.643884381989947e-05, "lm_loss": 1.8417, "loss": 1.9781, "mask_loss": 0.1267, "step": 1726, "topk_loss": 0.0097 }, { "epoch": 0.6864112580190408, "grad_norm": 0.1240234375, "learning_rate": 4.633126205859504e-05, "lm_loss": 1.9173, "loss": 2.0508, "mask_loss": 0.124, "step": 1727, "topk_loss": 0.0095 }, { "epoch": 0.6868087167671699, "grad_norm": 0.115234375, "learning_rate": 4.6223767471340326e-05, "lm_loss": 1.8859, "loss": 2.0179, "mask_loss": 0.1228, "step": 1728, "topk_loss": 0.0092 }, { "epoch": 0.6872061755152991, "grad_norm": 0.11279296875, "learning_rate": 4.6116360232738675e-05, "lm_loss": 1.9024, "loss": 2.0398, "mask_loss": 0.1267, "step": 1729, "topk_loss": 0.0107 }, { "epoch": 0.6876036342634282, "grad_norm": 0.1142578125, "learning_rate": 4.600904051725148e-05, "lm_loss": 1.9572, "loss": 2.0923, "mask_loss": 0.1261, "step": 1730, "topk_loss": 0.0089 }, { "epoch": 0.6880010930115573, "grad_norm": 0.21875, "learning_rate": 4.5901808499198004e-05, "lm_loss": 1.8948, "loss": 2.0469, "mask_loss": 0.1327, "step": 1731, "topk_loss": 0.0194 }, { "epoch": 0.6883985517596865, "grad_norm": 0.11572265625, "learning_rate": 4.5794664352755055e-05, "lm_loss": 1.9081, "loss": 2.0427, "mask_loss": 0.1249, "step": 1732, "topk_loss": 0.0097 }, { "epoch": 0.6887960105078157, "grad_norm": 0.1259765625, "learning_rate": 4.5687608251956714e-05, "lm_loss": 1.8729, "loss": 2.0116, "mask_loss": 0.1266, "step": 1733, "topk_loss": 0.0122 }, { "epoch": 0.6891934692559448, "grad_norm": 0.1494140625, "learning_rate": 4.5580640370694027e-05, "lm_loss": 1.9346, "loss": 2.0687, "mask_loss": 0.1247, "step": 1734, "topk_loss": 0.0094 }, { "epoch": 0.689590928004074, "grad_norm": 0.11474609375, "learning_rate": 4.54737608827148e-05, "lm_loss": 1.8316, "loss": 1.9689, "mask_loss": 0.1269, "step": 1735, "topk_loss": 0.0104 }, { "epoch": 0.6899883867522031, "grad_norm": 0.11767578125, "learning_rate": 4.5366969961623166e-05, "lm_loss": 1.9245, "loss": 2.0613, "mask_loss": 0.1263, "step": 1736, "topk_loss": 0.0105 }, { "epoch": 0.6903858455003322, "grad_norm": 0.11328125, "learning_rate": 4.526026778087947e-05, "lm_loss": 1.8769, "loss": 2.0139, "mask_loss": 0.1277, "step": 1737, "topk_loss": 0.0094 }, { "epoch": 0.6907833042484613, "grad_norm": 0.1279296875, "learning_rate": 4.515365451379993e-05, "lm_loss": 1.9314, "loss": 2.0678, "mask_loss": 0.1262, "step": 1738, "topk_loss": 0.0102 }, { "epoch": 0.6911807629965906, "grad_norm": 0.1123046875, "learning_rate": 4.504713033355629e-05, "lm_loss": 1.9246, "loss": 2.0592, "mask_loss": 0.1249, "step": 1739, "topk_loss": 0.0097 }, { "epoch": 0.6915782217447197, "grad_norm": 0.1376953125, "learning_rate": 4.4940695413175626e-05, "lm_loss": 1.9354, "loss": 2.0684, "mask_loss": 0.1242, "step": 1740, "topk_loss": 0.0089 }, { "epoch": 0.6919756804928489, "grad_norm": 0.1103515625, "learning_rate": 4.483434992554001e-05, "lm_loss": 1.8529, "loss": 1.9866, "mask_loss": 0.1249, "step": 1741, "topk_loss": 0.0088 }, { "epoch": 0.692373139240978, "grad_norm": 0.12451171875, "learning_rate": 4.472809404338627e-05, "lm_loss": 1.9255, "loss": 2.0644, "mask_loss": 0.1275, "step": 1742, "topk_loss": 0.0113 }, { "epoch": 0.6927705979891071, "grad_norm": 0.12158203125, "learning_rate": 4.4621927939305695e-05, "lm_loss": 1.8837, "loss": 2.0211, "mask_loss": 0.1272, "step": 1743, "topk_loss": 0.0102 }, { "epoch": 0.6931680567372362, "grad_norm": 0.11376953125, "learning_rate": 4.451585178574368e-05, "lm_loss": 1.8989, "loss": 2.032, "mask_loss": 0.1237, "step": 1744, "topk_loss": 0.0094 }, { "epoch": 0.6935655154853655, "grad_norm": 0.1083984375, "learning_rate": 4.440986575499956e-05, "lm_loss": 1.883, "loss": 2.0159, "mask_loss": 0.1233, "step": 1745, "topk_loss": 0.0096 }, { "epoch": 0.6939629742334946, "grad_norm": 0.1875, "learning_rate": 4.430397001922631e-05, "lm_loss": 1.8949, "loss": 2.0337, "mask_loss": 0.126, "step": 1746, "topk_loss": 0.0127 }, { "epoch": 0.6943604329816238, "grad_norm": 0.1796875, "learning_rate": 4.4198164750430217e-05, "lm_loss": 1.9357, "loss": 2.0683, "mask_loss": 0.1232, "step": 1747, "topk_loss": 0.0094 }, { "epoch": 0.6947578917297529, "grad_norm": 0.1279296875, "learning_rate": 4.40924501204706e-05, "lm_loss": 1.8786, "loss": 2.0171, "mask_loss": 0.1264, "step": 1748, "topk_loss": 0.0121 }, { "epoch": 0.695155350477882, "grad_norm": 0.1328125, "learning_rate": 4.398682630105958e-05, "lm_loss": 1.9416, "loss": 2.0772, "mask_loss": 0.1267, "step": 1749, "topk_loss": 0.009 }, { "epoch": 0.6955528092260111, "grad_norm": 0.1279296875, "learning_rate": 4.388129346376178e-05, "lm_loss": 1.9031, "loss": 2.0414, "mask_loss": 0.1276, "step": 1750, "topk_loss": 0.0107 }, { "epoch": 0.6955528092260111, "eval_lm_loss": 687.1192626953125, "eval_loss": 687.2565307617188, "eval_mask_hit_rate": 0.5354921817779541, "eval_mask_loss": 0.12459547817707062, "eval_mask_top_10_hit_rate": 0.9855087995529175, "eval_mask_top_1_hit_rate": 0.9974174499511719, "eval_mask_top_20_hit_rate": 0.9761062264442444, "eval_mask_top_5_hit_rate": 0.990810751914978, "eval_runtime": 144.4662, "eval_samples_per_second": 14.176, "eval_steps_per_second": 7.088, "eval_token_accuracy": 0.6143444180488586, "eval_top_k_diff": -527.2138671875, "eval_topk_loss": 0.012662166729569435, "step": 1750 }, { "epoch": 0.6959502679741404, "grad_norm": 0.1201171875, "learning_rate": 4.377585177999404e-05, "lm_loss": 1.9144, "loss": 2.051, "mask_loss": 0.1258, "step": 1751, "topk_loss": 0.0108 }, { "epoch": 0.6963477267222695, "grad_norm": 0.10986328125, "learning_rate": 4.367050142102507e-05, "lm_loss": 1.9316, "loss": 2.0626, "mask_loss": 0.1224, "step": 1752, "topk_loss": 0.0086 }, { "epoch": 0.6967451854703987, "grad_norm": 0.1142578125, "learning_rate": 4.3565242557975326e-05, "lm_loss": 1.9602, "loss": 2.0933, "mask_loss": 0.1231, "step": 1753, "topk_loss": 0.01 }, { "epoch": 0.6971426442185278, "grad_norm": 0.1220703125, "learning_rate": 4.3460075361816635e-05, "lm_loss": 1.9415, "loss": 2.0767, "mask_loss": 0.124, "step": 1754, "topk_loss": 0.0112 }, { "epoch": 0.6975401029666569, "grad_norm": 0.11865234375, "learning_rate": 4.335500000337189e-05, "lm_loss": 1.9125, "loss": 2.0484, "mask_loss": 0.1261, "step": 1755, "topk_loss": 0.0098 }, { "epoch": 0.697937561714786, "grad_norm": 0.1103515625, "learning_rate": 4.3250016653314864e-05, "lm_loss": 1.8968, "loss": 2.0319, "mask_loss": 0.1254, "step": 1756, "topk_loss": 0.0097 }, { "epoch": 0.6983350204629153, "grad_norm": 0.177734375, "learning_rate": 4.314512548216985e-05, "lm_loss": 1.8136, "loss": 1.9476, "mask_loss": 0.1251, "step": 1757, "topk_loss": 0.0089 }, { "epoch": 0.6987324792110444, "grad_norm": 0.1201171875, "learning_rate": 4.304032666031139e-05, "lm_loss": 1.9155, "loss": 2.0515, "mask_loss": 0.1257, "step": 1758, "topk_loss": 0.0102 }, { "epoch": 0.6991299379591736, "grad_norm": 0.1279296875, "learning_rate": 4.2935620357964076e-05, "lm_loss": 1.8805, "loss": 2.0169, "mask_loss": 0.1266, "step": 1759, "topk_loss": 0.0098 }, { "epoch": 0.6995273967073027, "grad_norm": 0.130859375, "learning_rate": 4.283100674520219e-05, "lm_loss": 1.903, "loss": 2.0378, "mask_loss": 0.1252, "step": 1760, "topk_loss": 0.0096 }, { "epoch": 0.6999248554554318, "grad_norm": 0.1171875, "learning_rate": 4.272648599194948e-05, "lm_loss": 1.9202, "loss": 2.0537, "mask_loss": 0.1248, "step": 1761, "topk_loss": 0.0087 }, { "epoch": 0.700322314203561, "grad_norm": 0.130859375, "learning_rate": 4.262205826797883e-05, "lm_loss": 1.9085, "loss": 2.0459, "mask_loss": 0.1252, "step": 1762, "topk_loss": 0.0122 }, { "epoch": 0.7007197729516901, "grad_norm": 0.115234375, "learning_rate": 4.251772374291203e-05, "lm_loss": 1.8532, "loss": 1.9879, "mask_loss": 0.1254, "step": 1763, "topk_loss": 0.0093 }, { "epoch": 0.7011172316998193, "grad_norm": 0.12158203125, "learning_rate": 4.24134825862195e-05, "lm_loss": 1.9942, "loss": 2.1308, "mask_loss": 0.1265, "step": 1764, "topk_loss": 0.0101 }, { "epoch": 0.7015146904479485, "grad_norm": 0.12158203125, "learning_rate": 4.2309334967219995e-05, "lm_loss": 1.8729, "loss": 2.0136, "mask_loss": 0.1285, "step": 1765, "topk_loss": 0.0122 }, { "epoch": 0.7019121491960776, "grad_norm": 0.1279296875, "learning_rate": 4.2205281055080325e-05, "lm_loss": 1.9088, "loss": 2.043, "mask_loss": 0.1245, "step": 1766, "topk_loss": 0.0097 }, { "epoch": 0.7023096079442067, "grad_norm": 0.11279296875, "learning_rate": 4.210132101881516e-05, "lm_loss": 1.869, "loss": 2.0075, "mask_loss": 0.1285, "step": 1767, "topk_loss": 0.0099 }, { "epoch": 0.7027070666923358, "grad_norm": 0.11279296875, "learning_rate": 4.1997455027286525e-05, "lm_loss": 1.8832, "loss": 2.0195, "mask_loss": 0.1252, "step": 1768, "topk_loss": 0.011 }, { "epoch": 0.703104525440465, "grad_norm": 0.11279296875, "learning_rate": 4.189368324920385e-05, "lm_loss": 1.8982, "loss": 2.0335, "mask_loss": 0.1255, "step": 1769, "topk_loss": 0.0098 }, { "epoch": 0.7035019841885942, "grad_norm": 0.1123046875, "learning_rate": 4.179000585312347e-05, "lm_loss": 1.8529, "loss": 1.9909, "mask_loss": 0.1276, "step": 1770, "topk_loss": 0.0105 }, { "epoch": 0.7038994429367234, "grad_norm": 0.1318359375, "learning_rate": 4.1686423007448426e-05, "lm_loss": 1.9087, "loss": 2.0481, "mask_loss": 0.1284, "step": 1771, "topk_loss": 0.011 }, { "epoch": 0.7042969016848525, "grad_norm": 0.1181640625, "learning_rate": 4.158293488042818e-05, "lm_loss": 1.8858, "loss": 2.0233, "mask_loss": 0.1268, "step": 1772, "topk_loss": 0.0107 }, { "epoch": 0.7046943604329816, "grad_norm": 0.11279296875, "learning_rate": 4.147954164015832e-05, "lm_loss": 1.9356, "loss": 2.0718, "mask_loss": 0.1262, "step": 1773, "topk_loss": 0.01 }, { "epoch": 0.7050918191811107, "grad_norm": 0.11474609375, "learning_rate": 4.1376243454580366e-05, "lm_loss": 1.8602, "loss": 1.9965, "mask_loss": 0.126, "step": 1774, "topk_loss": 0.0103 }, { "epoch": 0.7054892779292399, "grad_norm": 0.11083984375, "learning_rate": 4.127304049148142e-05, "lm_loss": 1.9244, "loss": 2.0568, "mask_loss": 0.1232, "step": 1775, "topk_loss": 0.0091 }, { "epoch": 0.7058867366773691, "grad_norm": 0.12451171875, "learning_rate": 4.116993291849381e-05, "lm_loss": 1.9718, "loss": 2.106, "mask_loss": 0.125, "step": 1776, "topk_loss": 0.0092 }, { "epoch": 0.7062841954254983, "grad_norm": 0.1181640625, "learning_rate": 4.1066920903095076e-05, "lm_loss": 1.9779, "loss": 2.1117, "mask_loss": 0.1247, "step": 1777, "topk_loss": 0.0091 }, { "epoch": 0.7066816541736274, "grad_norm": 0.1181640625, "learning_rate": 4.0964004612607465e-05, "lm_loss": 1.9201, "loss": 2.0557, "mask_loss": 0.1251, "step": 1778, "topk_loss": 0.0105 }, { "epoch": 0.7070791129217565, "grad_norm": 0.11767578125, "learning_rate": 4.086118421419774e-05, "lm_loss": 1.929, "loss": 2.0673, "mask_loss": 0.1269, "step": 1779, "topk_loss": 0.0113 }, { "epoch": 0.7074765716698856, "grad_norm": 0.109375, "learning_rate": 4.0758459874876954e-05, "lm_loss": 1.9014, "loss": 2.0363, "mask_loss": 0.1256, "step": 1780, "topk_loss": 0.0093 }, { "epoch": 0.7078740304180148, "grad_norm": 0.1279296875, "learning_rate": 4.065583176150005e-05, "lm_loss": 1.936, "loss": 2.0691, "mask_loss": 0.1232, "step": 1781, "topk_loss": 0.0099 }, { "epoch": 0.708271489166144, "grad_norm": 0.11572265625, "learning_rate": 4.0553300040765755e-05, "lm_loss": 1.9247, "loss": 2.059, "mask_loss": 0.1245, "step": 1782, "topk_loss": 0.0098 }, { "epoch": 0.7086689479142732, "grad_norm": 0.11181640625, "learning_rate": 4.04508648792162e-05, "lm_loss": 1.8667, "loss": 2.0013, "mask_loss": 0.125, "step": 1783, "topk_loss": 0.0096 }, { "epoch": 0.7090664066624023, "grad_norm": 0.1328125, "learning_rate": 4.034852644323661e-05, "lm_loss": 1.8723, "loss": 2.0095, "mask_loss": 0.1266, "step": 1784, "topk_loss": 0.0106 }, { "epoch": 0.7094638654105314, "grad_norm": 0.126953125, "learning_rate": 4.024628489905517e-05, "lm_loss": 1.8956, "loss": 2.0314, "mask_loss": 0.1262, "step": 1785, "topk_loss": 0.0096 }, { "epoch": 0.7098613241586605, "grad_norm": 0.1171875, "learning_rate": 4.014414041274267e-05, "lm_loss": 1.9571, "loss": 2.0909, "mask_loss": 0.1242, "step": 1786, "topk_loss": 0.0095 }, { "epoch": 0.7102587829067897, "grad_norm": 0.11865234375, "learning_rate": 4.004209315021225e-05, "lm_loss": 1.8871, "loss": 2.0244, "mask_loss": 0.1273, "step": 1787, "topk_loss": 0.01 }, { "epoch": 0.7106562416549188, "grad_norm": 0.111328125, "learning_rate": 3.994014327721912e-05, "lm_loss": 1.9109, "loss": 2.0447, "mask_loss": 0.1248, "step": 1788, "topk_loss": 0.009 }, { "epoch": 0.711053700403048, "grad_norm": 0.1162109375, "learning_rate": 3.9838290959360304e-05, "lm_loss": 1.8853, "loss": 2.0224, "mask_loss": 0.1262, "step": 1789, "topk_loss": 0.0108 }, { "epoch": 0.7114511591511772, "grad_norm": 0.126953125, "learning_rate": 3.973653636207437e-05, "lm_loss": 1.8864, "loss": 2.0219, "mask_loss": 0.1268, "step": 1790, "topk_loss": 0.0087 }, { "epoch": 0.7118486178993063, "grad_norm": 0.11083984375, "learning_rate": 3.9634879650641153e-05, "lm_loss": 1.8685, "loss": 2.0048, "mask_loss": 0.1272, "step": 1791, "topk_loss": 0.0091 }, { "epoch": 0.7122460766474354, "grad_norm": 0.1513671875, "learning_rate": 3.953332099018151e-05, "lm_loss": 1.8939, "loss": 2.0353, "mask_loss": 0.1287, "step": 1792, "topk_loss": 0.0128 }, { "epoch": 0.7126435353955646, "grad_norm": 0.126953125, "learning_rate": 3.943186054565699e-05, "lm_loss": 1.9455, "loss": 2.0885, "mask_loss": 0.1282, "step": 1793, "topk_loss": 0.0147 }, { "epoch": 0.7130409941436937, "grad_norm": 0.1162109375, "learning_rate": 3.933049848186967e-05, "lm_loss": 1.9081, "loss": 2.0451, "mask_loss": 0.126, "step": 1794, "topk_loss": 0.011 }, { "epoch": 0.713438452891823, "grad_norm": 0.14453125, "learning_rate": 3.9229234963461766e-05, "lm_loss": 1.8904, "loss": 2.0273, "mask_loss": 0.1267, "step": 1795, "topk_loss": 0.0102 }, { "epoch": 0.7138359116399521, "grad_norm": 0.1376953125, "learning_rate": 3.9128070154915496e-05, "lm_loss": 1.9129, "loss": 2.0478, "mask_loss": 0.124, "step": 1796, "topk_loss": 0.0108 }, { "epoch": 0.7142333703880812, "grad_norm": 0.13671875, "learning_rate": 3.902700422055266e-05, "lm_loss": 1.8739, "loss": 2.0057, "mask_loss": 0.1231, "step": 1797, "topk_loss": 0.0088 }, { "epoch": 0.7146308291362103, "grad_norm": 0.24609375, "learning_rate": 3.892603732453455e-05, "lm_loss": 1.9116, "loss": 2.0648, "mask_loss": 0.1329, "step": 1798, "topk_loss": 0.0204 }, { "epoch": 0.7150282878843395, "grad_norm": 0.11474609375, "learning_rate": 3.882516963086154e-05, "lm_loss": 1.911, "loss": 2.0459, "mask_loss": 0.1254, "step": 1799, "topk_loss": 0.0095 }, { "epoch": 0.7154257466324686, "grad_norm": 0.12890625, "learning_rate": 3.872440130337281e-05, "lm_loss": 1.8986, "loss": 2.0375, "mask_loss": 0.1264, "step": 1800, "topk_loss": 0.0125 }, { "epoch": 0.7154257466324686, "eval_lm_loss": 687.646728515625, "eval_loss": 687.7838745117188, "eval_mask_hit_rate": 0.5356104969978333, "eval_mask_loss": 0.12455646693706512, "eval_mask_top_10_hit_rate": 0.9855278730392456, "eval_mask_top_1_hit_rate": 0.9974215030670166, "eval_mask_top_20_hit_rate": 0.9761320948600769, "eval_mask_top_5_hit_rate": 0.9908208847045898, "eval_runtime": 144.5288, "eval_samples_per_second": 14.17, "eval_steps_per_second": 7.085, "eval_token_accuracy": 0.6144205331802368, "eval_top_k_diff": -531.4647216796875, "eval_topk_loss": 0.012550951912999153, "step": 1800 }, { "epoch": 0.7158232053805978, "grad_norm": 0.1103515625, "learning_rate": 3.862373250574626e-05, "lm_loss": 1.8972, "loss": 2.0315, "mask_loss": 0.1249, "step": 1801, "topk_loss": 0.0094 }, { "epoch": 0.716220664128727, "grad_norm": 0.12255859375, "learning_rate": 3.852316340149803e-05, "lm_loss": 1.9338, "loss": 2.0749, "mask_loss": 0.1287, "step": 1802, "topk_loss": 0.0125 }, { "epoch": 0.7166181228768561, "grad_norm": 0.1201171875, "learning_rate": 3.842269415398239e-05, "lm_loss": 1.9396, "loss": 2.0795, "mask_loss": 0.1284, "step": 1803, "topk_loss": 0.0115 }, { "epoch": 0.7170155816249852, "grad_norm": 0.1240234375, "learning_rate": 3.832232492639137e-05, "lm_loss": 1.9292, "loss": 2.0664, "mask_loss": 0.1268, "step": 1804, "topk_loss": 0.0105 }, { "epoch": 0.7174130403731144, "grad_norm": 0.11376953125, "learning_rate": 3.822205588175457e-05, "lm_loss": 1.8633, "loss": 2.0003, "mask_loss": 0.1259, "step": 1805, "topk_loss": 0.0111 }, { "epoch": 0.7178104991212435, "grad_norm": 0.134765625, "learning_rate": 3.8121887182938845e-05, "lm_loss": 1.8854, "loss": 2.0247, "mask_loss": 0.1274, "step": 1806, "topk_loss": 0.0119 }, { "epoch": 0.7182079578693727, "grad_norm": 0.1162109375, "learning_rate": 3.802181899264809e-05, "lm_loss": 1.887, "loss": 2.0228, "mask_loss": 0.1258, "step": 1807, "topk_loss": 0.01 }, { "epoch": 0.7186054166175019, "grad_norm": 0.11767578125, "learning_rate": 3.7921851473422834e-05, "lm_loss": 1.9839, "loss": 2.1216, "mask_loss": 0.1265, "step": 1808, "topk_loss": 0.0112 }, { "epoch": 0.719002875365631, "grad_norm": 0.162109375, "learning_rate": 3.782198478764021e-05, "lm_loss": 1.8774, "loss": 2.0124, "mask_loss": 0.1252, "step": 1809, "topk_loss": 0.0099 }, { "epoch": 0.7194003341137601, "grad_norm": 0.11376953125, "learning_rate": 3.772221909751353e-05, "lm_loss": 1.8666, "loss": 2.0032, "mask_loss": 0.127, "step": 1810, "topk_loss": 0.0096 }, { "epoch": 0.7197977928618893, "grad_norm": 0.166015625, "learning_rate": 3.762255456509206e-05, "lm_loss": 1.8814, "loss": 2.0285, "mask_loss": 0.1326, "step": 1811, "topk_loss": 0.0146 }, { "epoch": 0.7201952516100184, "grad_norm": 0.125, "learning_rate": 3.752299135226074e-05, "lm_loss": 1.9509, "loss": 2.0859, "mask_loss": 0.1252, "step": 1812, "topk_loss": 0.0098 }, { "epoch": 0.7205927103581476, "grad_norm": 0.1240234375, "learning_rate": 3.742352962073995e-05, "lm_loss": 1.9578, "loss": 2.097, "mask_loss": 0.127, "step": 1813, "topk_loss": 0.0121 }, { "epoch": 0.7209901691062768, "grad_norm": 0.115234375, "learning_rate": 3.732416953208522e-05, "lm_loss": 1.8663, "loss": 2.0015, "mask_loss": 0.1259, "step": 1814, "topk_loss": 0.0093 }, { "epoch": 0.7213876278544059, "grad_norm": 0.11767578125, "learning_rate": 3.722491124768702e-05, "lm_loss": 1.9073, "loss": 2.0426, "mask_loss": 0.1268, "step": 1815, "topk_loss": 0.0084 }, { "epoch": 0.721785086602535, "grad_norm": 0.12353515625, "learning_rate": 3.71257549287704e-05, "lm_loss": 1.9045, "loss": 2.0425, "mask_loss": 0.1269, "step": 1816, "topk_loss": 0.0111 }, { "epoch": 0.7221825453506642, "grad_norm": 0.11181640625, "learning_rate": 3.70267007363948e-05, "lm_loss": 1.8537, "loss": 1.9912, "mask_loss": 0.1267, "step": 1817, "topk_loss": 0.0108 }, { "epoch": 0.7225800040987933, "grad_norm": 0.11279296875, "learning_rate": 3.6927748831453836e-05, "lm_loss": 1.831, "loss": 1.9661, "mask_loss": 0.125, "step": 1818, "topk_loss": 0.0101 }, { "epoch": 0.7229774628469224, "grad_norm": 0.119140625, "learning_rate": 3.682889937467493e-05, "lm_loss": 1.8503, "loss": 1.9856, "mask_loss": 0.1254, "step": 1819, "topk_loss": 0.0099 }, { "epoch": 0.7233749215950517, "grad_norm": 0.1923828125, "learning_rate": 3.673015252661909e-05, "lm_loss": 1.859, "loss": 1.993, "mask_loss": 0.1248, "step": 1820, "topk_loss": 0.0093 }, { "epoch": 0.7237723803431808, "grad_norm": 0.134765625, "learning_rate": 3.6631508447680675e-05, "lm_loss": 1.8928, "loss": 2.0283, "mask_loss": 0.1249, "step": 1821, "topk_loss": 0.0105 }, { "epoch": 0.7241698390913099, "grad_norm": 0.1123046875, "learning_rate": 3.653296729808712e-05, "lm_loss": 1.9698, "loss": 2.1035, "mask_loss": 0.1236, "step": 1822, "topk_loss": 0.0101 }, { "epoch": 0.7245672978394391, "grad_norm": 0.1103515625, "learning_rate": 3.643452923789866e-05, "lm_loss": 1.8551, "loss": 1.9914, "mask_loss": 0.1265, "step": 1823, "topk_loss": 0.0098 }, { "epoch": 0.7249647565875682, "grad_norm": 0.1201171875, "learning_rate": 3.633619442700811e-05, "lm_loss": 1.9069, "loss": 2.0422, "mask_loss": 0.1253, "step": 1824, "topk_loss": 0.01 }, { "epoch": 0.7253622153356973, "grad_norm": 0.11279296875, "learning_rate": 3.623796302514051e-05, "lm_loss": 1.9117, "loss": 2.0472, "mask_loss": 0.1257, "step": 1825, "topk_loss": 0.0098 }, { "epoch": 0.7257596740838266, "grad_norm": 0.130859375, "learning_rate": 3.613983519185301e-05, "lm_loss": 1.9091, "loss": 2.0477, "mask_loss": 0.1268, "step": 1826, "topk_loss": 0.0118 }, { "epoch": 0.7261571328319557, "grad_norm": 0.1376953125, "learning_rate": 3.604181108653449e-05, "lm_loss": 1.8876, "loss": 2.0223, "mask_loss": 0.1254, "step": 1827, "topk_loss": 0.0093 }, { "epoch": 0.7265545915800848, "grad_norm": 0.11572265625, "learning_rate": 3.594389086840537e-05, "lm_loss": 1.791, "loss": 1.9253, "mask_loss": 0.1257, "step": 1828, "topk_loss": 0.0086 }, { "epoch": 0.726952050328214, "grad_norm": 0.1162109375, "learning_rate": 3.58460746965173e-05, "lm_loss": 1.9217, "loss": 2.0565, "mask_loss": 0.1252, "step": 1829, "topk_loss": 0.0096 }, { "epoch": 0.7273495090763431, "grad_norm": 0.12109375, "learning_rate": 3.574836272975293e-05, "lm_loss": 1.8529, "loss": 1.9888, "mask_loss": 0.1268, "step": 1830, "topk_loss": 0.0092 }, { "epoch": 0.7277469678244722, "grad_norm": 0.11376953125, "learning_rate": 3.5650755126825706e-05, "lm_loss": 1.8983, "loss": 2.0354, "mask_loss": 0.1279, "step": 1831, "topk_loss": 0.0091 }, { "epoch": 0.7281444265726015, "grad_norm": 0.11572265625, "learning_rate": 3.555325204627944e-05, "lm_loss": 1.9628, "loss": 2.1015, "mask_loss": 0.1284, "step": 1832, "topk_loss": 0.0103 }, { "epoch": 0.7285418853207306, "grad_norm": 0.1103515625, "learning_rate": 3.545585364648828e-05, "lm_loss": 1.8873, "loss": 2.0248, "mask_loss": 0.1269, "step": 1833, "topk_loss": 0.0106 }, { "epoch": 0.7289393440688597, "grad_norm": 0.130859375, "learning_rate": 3.5358560085656276e-05, "lm_loss": 1.9359, "loss": 2.0735, "mask_loss": 0.1275, "step": 1834, "topk_loss": 0.0101 }, { "epoch": 0.7293368028169889, "grad_norm": 0.150390625, "learning_rate": 3.5261371521817244e-05, "lm_loss": 1.8569, "loss": 1.9914, "mask_loss": 0.1251, "step": 1835, "topk_loss": 0.0094 }, { "epoch": 0.729734261565118, "grad_norm": 0.11376953125, "learning_rate": 3.516428811283439e-05, "lm_loss": 1.8744, "loss": 2.007, "mask_loss": 0.1238, "step": 1836, "topk_loss": 0.0088 }, { "epoch": 0.7301317203132471, "grad_norm": 0.1298828125, "learning_rate": 3.506731001640017e-05, "lm_loss": 1.9186, "loss": 2.0535, "mask_loss": 0.1244, "step": 1837, "topk_loss": 0.0105 }, { "epoch": 0.7305291790613764, "grad_norm": 0.1083984375, "learning_rate": 3.497043739003594e-05, "lm_loss": 1.861, "loss": 1.9987, "mask_loss": 0.1279, "step": 1838, "topk_loss": 0.0099 }, { "epoch": 0.7309266378095055, "grad_norm": 0.1171875, "learning_rate": 3.487367039109182e-05, "lm_loss": 1.9308, "loss": 2.0705, "mask_loss": 0.1279, "step": 1839, "topk_loss": 0.0118 }, { "epoch": 0.7313240965576346, "grad_norm": 0.11865234375, "learning_rate": 3.47770091767462e-05, "lm_loss": 1.8843, "loss": 2.0196, "mask_loss": 0.1245, "step": 1840, "topk_loss": 0.0108 }, { "epoch": 0.7317215553057638, "grad_norm": 0.1201171875, "learning_rate": 3.4680453904005805e-05, "lm_loss": 1.9344, "loss": 2.0687, "mask_loss": 0.1252, "step": 1841, "topk_loss": 0.0091 }, { "epoch": 0.7321190140538929, "grad_norm": 0.154296875, "learning_rate": 3.4584004729705213e-05, "lm_loss": 1.9301, "loss": 2.071, "mask_loss": 0.1279, "step": 1842, "topk_loss": 0.013 }, { "epoch": 0.732516472802022, "grad_norm": 0.1416015625, "learning_rate": 3.4487661810506656e-05, "lm_loss": 1.9054, "loss": 2.0481, "mask_loss": 0.129, "step": 1843, "topk_loss": 0.0137 }, { "epoch": 0.7329139315501512, "grad_norm": 0.11572265625, "learning_rate": 3.439142530289981e-05, "lm_loss": 1.8682, "loss": 2.0045, "mask_loss": 0.1258, "step": 1844, "topk_loss": 0.0105 }, { "epoch": 0.7333113902982804, "grad_norm": 0.11376953125, "learning_rate": 3.4295295363201476e-05, "lm_loss": 1.9143, "loss": 2.0493, "mask_loss": 0.125, "step": 1845, "topk_loss": 0.01 }, { "epoch": 0.7337088490464095, "grad_norm": 0.1337890625, "learning_rate": 3.419927214755538e-05, "lm_loss": 1.8642, "loss": 2.0035, "mask_loss": 0.1295, "step": 1846, "topk_loss": 0.0098 }, { "epoch": 0.7341063077945387, "grad_norm": 0.1181640625, "learning_rate": 3.4103355811931915e-05, "lm_loss": 1.9921, "loss": 2.1293, "mask_loss": 0.1262, "step": 1847, "topk_loss": 0.0111 }, { "epoch": 0.7345037665426678, "grad_norm": 0.1396484375, "learning_rate": 3.400754651212776e-05, "lm_loss": 1.8553, "loss": 1.9919, "mask_loss": 0.1265, "step": 1848, "topk_loss": 0.0101 }, { "epoch": 0.7349012252907969, "grad_norm": 0.13671875, "learning_rate": 3.391184440376588e-05, "lm_loss": 2.0014, "loss": 2.1335, "mask_loss": 0.1232, "step": 1849, "topk_loss": 0.0089 }, { "epoch": 0.7352986840389261, "grad_norm": 0.138671875, "learning_rate": 3.381624964229504e-05, "lm_loss": 2.0318, "loss": 2.1653, "mask_loss": 0.1232, "step": 1850, "topk_loss": 0.0103 }, { "epoch": 0.7352986840389261, "eval_lm_loss": 687.4080810546875, "eval_loss": 687.5452270507812, "eval_mask_hit_rate": 0.5357130765914917, "eval_mask_loss": 0.12451068311929703, "eval_mask_top_10_hit_rate": 0.9855594038963318, "eval_mask_top_1_hit_rate": 0.9974288940429688, "eval_mask_top_20_hit_rate": 0.9761731624603271, "eval_mask_top_5_hit_rate": 0.9908478260040283, "eval_runtime": 144.0268, "eval_samples_per_second": 14.22, "eval_steps_per_second": 7.11, "eval_token_accuracy": 0.6144834756851196, "eval_top_k_diff": -528.8475341796875, "eval_topk_loss": 0.012559626251459122, "step": 1850 }, { "epoch": 0.7356961427870553, "grad_norm": 0.12890625, "learning_rate": 3.3720762382989654e-05, "lm_loss": 1.9008, "loss": 2.0361, "mask_loss": 0.1252, "step": 1851, "topk_loss": 0.0101 }, { "epoch": 0.7360936015351844, "grad_norm": 0.140625, "learning_rate": 3.3625382780949574e-05, "lm_loss": 1.8123, "loss": 1.9497, "mask_loss": 0.1269, "step": 1852, "topk_loss": 0.0105 }, { "epoch": 0.7364910602833136, "grad_norm": 0.11572265625, "learning_rate": 3.3530110991099706e-05, "lm_loss": 1.8366, "loss": 1.9694, "mask_loss": 0.1253, "step": 1853, "topk_loss": 0.0075 }, { "epoch": 0.7368885190314427, "grad_norm": 0.1259765625, "learning_rate": 3.343494716818989e-05, "lm_loss": 1.9022, "loss": 2.0385, "mask_loss": 0.1269, "step": 1854, "topk_loss": 0.0095 }, { "epoch": 0.7372859777795718, "grad_norm": 0.12451171875, "learning_rate": 3.333989146679458e-05, "lm_loss": 1.943, "loss": 2.0801, "mask_loss": 0.127, "step": 1855, "topk_loss": 0.0101 }, { "epoch": 0.737683436527701, "grad_norm": 0.11962890625, "learning_rate": 3.324494404131261e-05, "lm_loss": 1.918, "loss": 2.0573, "mask_loss": 0.1274, "step": 1856, "topk_loss": 0.0119 }, { "epoch": 0.7380808952758302, "grad_norm": 0.11865234375, "learning_rate": 3.315010504596692e-05, "lm_loss": 1.9292, "loss": 2.0654, "mask_loss": 0.1267, "step": 1857, "topk_loss": 0.0095 }, { "epoch": 0.7384783540239593, "grad_norm": 0.13671875, "learning_rate": 3.305537463480437e-05, "lm_loss": 1.9136, "loss": 2.0482, "mask_loss": 0.1258, "step": 1858, "topk_loss": 0.0088 }, { "epoch": 0.7388758127720885, "grad_norm": 0.138671875, "learning_rate": 3.296075296169542e-05, "lm_loss": 1.9337, "loss": 2.0683, "mask_loss": 0.1246, "step": 1859, "topk_loss": 0.01 }, { "epoch": 0.7392732715202176, "grad_norm": 0.12451171875, "learning_rate": 3.286624018033389e-05, "lm_loss": 1.8758, "loss": 2.0088, "mask_loss": 0.1239, "step": 1860, "topk_loss": 0.009 }, { "epoch": 0.7396707302683467, "grad_norm": 0.11474609375, "learning_rate": 3.277183644423677e-05, "lm_loss": 1.8901, "loss": 2.0252, "mask_loss": 0.1258, "step": 1861, "topk_loss": 0.0093 }, { "epoch": 0.7400681890164759, "grad_norm": 0.11767578125, "learning_rate": 3.267754190674389e-05, "lm_loss": 1.9397, "loss": 2.079, "mask_loss": 0.1285, "step": 1862, "topk_loss": 0.0108 }, { "epoch": 0.7404656477646051, "grad_norm": 0.1181640625, "learning_rate": 3.258335672101778e-05, "lm_loss": 1.9947, "loss": 2.1311, "mask_loss": 0.1259, "step": 1863, "topk_loss": 0.0105 }, { "epoch": 0.7408631065127342, "grad_norm": 0.1376953125, "learning_rate": 3.248928104004321e-05, "lm_loss": 1.9536, "loss": 2.0884, "mask_loss": 0.1249, "step": 1864, "topk_loss": 0.01 }, { "epoch": 0.7412605652608634, "grad_norm": 0.111328125, "learning_rate": 3.2395315016627195e-05, "lm_loss": 1.8897, "loss": 2.0264, "mask_loss": 0.1269, "step": 1865, "topk_loss": 0.0098 }, { "epoch": 0.7416580240089925, "grad_norm": 0.1162109375, "learning_rate": 3.230145880339861e-05, "lm_loss": 1.8932, "loss": 2.029, "mask_loss": 0.1258, "step": 1866, "topk_loss": 0.0099 }, { "epoch": 0.7420554827571216, "grad_norm": 0.11376953125, "learning_rate": 3.220771255280797e-05, "lm_loss": 1.9349, "loss": 2.0697, "mask_loss": 0.1252, "step": 1867, "topk_loss": 0.0095 }, { "epoch": 0.7424529415052508, "grad_norm": 0.125, "learning_rate": 3.211407641712716e-05, "lm_loss": 1.8691, "loss": 2.0071, "mask_loss": 0.1282, "step": 1868, "topk_loss": 0.0098 }, { "epoch": 0.74285040025338, "grad_norm": 0.11767578125, "learning_rate": 3.202055054844921e-05, "lm_loss": 1.9568, "loss": 2.0909, "mask_loss": 0.1245, "step": 1869, "topk_loss": 0.0095 }, { "epoch": 0.7432478590015091, "grad_norm": 0.1533203125, "learning_rate": 3.1927135098688056e-05, "lm_loss": 1.8673, "loss": 2.0069, "mask_loss": 0.128, "step": 1870, "topk_loss": 0.0116 }, { "epoch": 0.7436453177496383, "grad_norm": 0.1201171875, "learning_rate": 3.1833830219578284e-05, "lm_loss": 1.8598, "loss": 1.9951, "mask_loss": 0.1265, "step": 1871, "topk_loss": 0.0088 }, { "epoch": 0.7440427764977674, "grad_norm": 0.1328125, "learning_rate": 3.174063606267483e-05, "lm_loss": 1.893, "loss": 2.0251, "mask_loss": 0.1226, "step": 1872, "topk_loss": 0.0095 }, { "epoch": 0.7444402352458965, "grad_norm": 0.11279296875, "learning_rate": 3.164755277935284e-05, "lm_loss": 1.8708, "loss": 2.0078, "mask_loss": 0.1278, "step": 1873, "topk_loss": 0.0092 }, { "epoch": 0.7448376939940257, "grad_norm": 0.1513671875, "learning_rate": 3.155458052080735e-05, "lm_loss": 1.9627, "loss": 2.1058, "mask_loss": 0.1296, "step": 1874, "topk_loss": 0.0135 }, { "epoch": 0.7452351527421548, "grad_norm": 0.11083984375, "learning_rate": 3.146171943805307e-05, "lm_loss": 1.8988, "loss": 2.0342, "mask_loss": 0.1266, "step": 1875, "topk_loss": 0.0088 }, { "epoch": 0.745632611490284, "grad_norm": 0.1259765625, "learning_rate": 3.13689696819241e-05, "lm_loss": 1.9575, "loss": 2.099, "mask_loss": 0.1284, "step": 1876, "topk_loss": 0.0131 }, { "epoch": 0.7460300702384132, "grad_norm": 0.1171875, "learning_rate": 3.1276331403073735e-05, "lm_loss": 1.9339, "loss": 2.0669, "mask_loss": 0.1239, "step": 1877, "topk_loss": 0.0092 }, { "epoch": 0.7464275289865423, "grad_norm": 0.1298828125, "learning_rate": 3.118380475197419e-05, "lm_loss": 1.9466, "loss": 2.081, "mask_loss": 0.1251, "step": 1878, "topk_loss": 0.0094 }, { "epoch": 0.7468249877346714, "grad_norm": 0.1240234375, "learning_rate": 3.109138987891639e-05, "lm_loss": 1.8772, "loss": 2.0131, "mask_loss": 0.1256, "step": 1879, "topk_loss": 0.0103 }, { "epoch": 0.7472224464828006, "grad_norm": 0.11279296875, "learning_rate": 3.0999086934009625e-05, "lm_loss": 1.8922, "loss": 2.0279, "mask_loss": 0.1268, "step": 1880, "topk_loss": 0.0089 }, { "epoch": 0.7476199052309297, "grad_norm": 0.11279296875, "learning_rate": 3.090689606718146e-05, "lm_loss": 1.9026, "loss": 2.0382, "mask_loss": 0.1263, "step": 1881, "topk_loss": 0.0093 }, { "epoch": 0.7480173639790589, "grad_norm": 0.12109375, "learning_rate": 3.081481742817736e-05, "lm_loss": 1.8625, "loss": 1.9959, "mask_loss": 0.1244, "step": 1882, "topk_loss": 0.0089 }, { "epoch": 0.7484148227271881, "grad_norm": 0.138671875, "learning_rate": 3.072285116656053e-05, "lm_loss": 1.8993, "loss": 2.0387, "mask_loss": 0.1282, "step": 1883, "topk_loss": 0.0112 }, { "epoch": 0.7488122814753172, "grad_norm": 0.11181640625, "learning_rate": 3.0630997431711636e-05, "lm_loss": 1.9531, "loss": 2.0886, "mask_loss": 0.1254, "step": 1884, "topk_loss": 0.0101 }, { "epoch": 0.7492097402234463, "grad_norm": 0.12109375, "learning_rate": 3.053925637282856e-05, "lm_loss": 1.9536, "loss": 2.0873, "mask_loss": 0.124, "step": 1885, "topk_loss": 0.0098 }, { "epoch": 0.7496071989715755, "grad_norm": 0.11181640625, "learning_rate": 3.0447628138926156e-05, "lm_loss": 1.9276, "loss": 2.0615, "mask_loss": 0.125, "step": 1886, "topk_loss": 0.0089 }, { "epoch": 0.7500046577197046, "grad_norm": 0.126953125, "learning_rate": 3.035611287883603e-05, "lm_loss": 1.8919, "loss": 2.0299, "mask_loss": 0.1256, "step": 1887, "topk_loss": 0.0125 }, { "epoch": 0.7504021164678338, "grad_norm": 0.109375, "learning_rate": 3.0264710741206283e-05, "lm_loss": 1.8436, "loss": 1.9778, "mask_loss": 0.1251, "step": 1888, "topk_loss": 0.0091 }, { "epoch": 0.750799575215963, "grad_norm": 0.1494140625, "learning_rate": 3.0173421874501262e-05, "lm_loss": 1.9214, "loss": 2.065, "mask_loss": 0.1287, "step": 1889, "topk_loss": 0.0149 }, { "epoch": 0.7511970339640921, "grad_norm": 0.111328125, "learning_rate": 3.0082246427001347e-05, "lm_loss": 1.8264, "loss": 1.9638, "mask_loss": 0.1273, "step": 1890, "topk_loss": 0.01 }, { "epoch": 0.7515944927122212, "grad_norm": 0.10693359375, "learning_rate": 2.9991184546802663e-05, "lm_loss": 1.8666, "loss": 2.0018, "mask_loss": 0.1258, "step": 1891, "topk_loss": 0.0094 }, { "epoch": 0.7519919514603504, "grad_norm": 0.1162109375, "learning_rate": 2.9900236381816893e-05, "lm_loss": 1.9626, "loss": 2.0946, "mask_loss": 0.1234, "step": 1892, "topk_loss": 0.0085 }, { "epoch": 0.7523894102084795, "grad_norm": 0.115234375, "learning_rate": 2.980940207977101e-05, "lm_loss": 1.9205, "loss": 2.0548, "mask_loss": 0.1246, "step": 1893, "topk_loss": 0.0096 }, { "epoch": 0.7527868689566087, "grad_norm": 0.123046875, "learning_rate": 2.9718681788207016e-05, "lm_loss": 1.8521, "loss": 1.9935, "mask_loss": 0.1283, "step": 1894, "topk_loss": 0.013 }, { "epoch": 0.7531843277047379, "grad_norm": 0.173828125, "learning_rate": 2.962807565448179e-05, "lm_loss": 1.8949, "loss": 2.0285, "mask_loss": 0.1249, "step": 1895, "topk_loss": 0.0088 }, { "epoch": 0.753581786452867, "grad_norm": 0.10986328125, "learning_rate": 2.9537583825766667e-05, "lm_loss": 1.8918, "loss": 2.0257, "mask_loss": 0.1246, "step": 1896, "topk_loss": 0.0092 }, { "epoch": 0.7539792452009961, "grad_norm": 0.1181640625, "learning_rate": 2.9447206449047427e-05, "lm_loss": 1.9062, "loss": 2.0406, "mask_loss": 0.1241, "step": 1897, "topk_loss": 0.0104 }, { "epoch": 0.7543767039491253, "grad_norm": 0.1142578125, "learning_rate": 2.9356943671123904e-05, "lm_loss": 1.8916, "loss": 2.0297, "mask_loss": 0.1287, "step": 1898, "topk_loss": 0.0094 }, { "epoch": 0.7547741626972544, "grad_norm": 0.126953125, "learning_rate": 2.926679563860978e-05, "lm_loss": 1.9112, "loss": 2.0523, "mask_loss": 0.1278, "step": 1899, "topk_loss": 0.0133 }, { "epoch": 0.7551716214453836, "grad_norm": 0.1240234375, "learning_rate": 2.9176762497932375e-05, "lm_loss": 1.9086, "loss": 2.0441, "mask_loss": 0.1252, "step": 1900, "topk_loss": 0.0103 }, { "epoch": 0.7551716214453836, "eval_lm_loss": 687.020751953125, "eval_loss": 687.1578369140625, "eval_mask_hit_rate": 0.5357862710952759, "eval_mask_loss": 0.12445982545614243, "eval_mask_top_10_hit_rate": 0.9855801463127136, "eval_mask_top_1_hit_rate": 0.9974265098571777, "eval_mask_top_20_hit_rate": 0.9762018918991089, "eval_mask_top_5_hit_rate": 0.9908568859100342, "eval_runtime": 144.5163, "eval_samples_per_second": 14.171, "eval_steps_per_second": 7.086, "eval_token_accuracy": 0.6144884824752808, "eval_top_k_diff": -527.6876220703125, "eval_topk_loss": 0.012589771300554276, "step": 1900 }, { "epoch": 0.7555690801935128, "grad_norm": 0.1376953125, "learning_rate": 2.9086844395332392e-05, "lm_loss": 1.8859, "loss": 2.0197, "mask_loss": 0.1241, "step": 1901, "topk_loss": 0.0097 }, { "epoch": 0.7559665389416419, "grad_norm": 0.12353515625, "learning_rate": 2.899704147686366e-05, "lm_loss": 1.9076, "loss": 2.0417, "mask_loss": 0.1253, "step": 1902, "topk_loss": 0.0089 }, { "epoch": 0.756363997689771, "grad_norm": 0.126953125, "learning_rate": 2.890735388839295e-05, "lm_loss": 1.9016, "loss": 2.038, "mask_loss": 0.1258, "step": 1903, "topk_loss": 0.0106 }, { "epoch": 0.7567614564379002, "grad_norm": 0.1318359375, "learning_rate": 2.8817781775599618e-05, "lm_loss": 1.9314, "loss": 2.0643, "mask_loss": 0.1244, "step": 1904, "topk_loss": 0.0086 }, { "epoch": 0.7571589151860293, "grad_norm": 0.11474609375, "learning_rate": 2.8728325283975553e-05, "lm_loss": 1.8348, "loss": 1.9699, "mask_loss": 0.1253, "step": 1905, "topk_loss": 0.0098 }, { "epoch": 0.7575563739341584, "grad_norm": 0.1328125, "learning_rate": 2.8638984558824777e-05, "lm_loss": 1.9063, "loss": 2.0438, "mask_loss": 0.1278, "step": 1906, "topk_loss": 0.0096 }, { "epoch": 0.7579538326822877, "grad_norm": 0.11181640625, "learning_rate": 2.8549759745263314e-05, "lm_loss": 1.9223, "loss": 2.0585, "mask_loss": 0.1249, "step": 1907, "topk_loss": 0.0112 }, { "epoch": 0.7583512914304168, "grad_norm": 0.1162109375, "learning_rate": 2.8460650988218886e-05, "lm_loss": 1.9308, "loss": 2.0659, "mask_loss": 0.1254, "step": 1908, "topk_loss": 0.0096 }, { "epoch": 0.7587487501785459, "grad_norm": 0.11767578125, "learning_rate": 2.8371658432430716e-05, "lm_loss": 1.9609, "loss": 2.0927, "mask_loss": 0.1235, "step": 1909, "topk_loss": 0.0083 }, { "epoch": 0.7591462089266751, "grad_norm": 0.126953125, "learning_rate": 2.8282782222449267e-05, "lm_loss": 1.9922, "loss": 2.1295, "mask_loss": 0.1249, "step": 1910, "topk_loss": 0.0125 }, { "epoch": 0.7595436676748042, "grad_norm": 0.1142578125, "learning_rate": 2.8194022502636075e-05, "lm_loss": 1.8613, "loss": 1.9952, "mask_loss": 0.1254, "step": 1911, "topk_loss": 0.0086 }, { "epoch": 0.7599411264229333, "grad_norm": 0.130859375, "learning_rate": 2.8105379417163357e-05, "lm_loss": 1.8442, "loss": 1.9784, "mask_loss": 0.125, "step": 1912, "topk_loss": 0.0092 }, { "epoch": 0.7603385851710626, "grad_norm": 0.115234375, "learning_rate": 2.801685311001396e-05, "lm_loss": 1.832, "loss": 1.9689, "mask_loss": 0.1266, "step": 1913, "topk_loss": 0.0103 }, { "epoch": 0.7607360439191917, "grad_norm": 0.119140625, "learning_rate": 2.7928443724981045e-05, "lm_loss": 1.8346, "loss": 1.9715, "mask_loss": 0.1263, "step": 1914, "topk_loss": 0.0105 }, { "epoch": 0.7611335026673208, "grad_norm": 0.109375, "learning_rate": 2.7840151405667837e-05, "lm_loss": 1.9061, "loss": 2.0427, "mask_loss": 0.1274, "step": 1915, "topk_loss": 0.0092 }, { "epoch": 0.76153096141545, "grad_norm": 0.130859375, "learning_rate": 2.7751976295487402e-05, "lm_loss": 1.9256, "loss": 2.062, "mask_loss": 0.1253, "step": 1916, "topk_loss": 0.0112 }, { "epoch": 0.7619284201635791, "grad_norm": 0.12158203125, "learning_rate": 2.766391853766247e-05, "lm_loss": 1.8934, "loss": 2.028, "mask_loss": 0.1253, "step": 1917, "topk_loss": 0.0092 }, { "epoch": 0.7623258789117082, "grad_norm": 0.1220703125, "learning_rate": 2.757597827522509e-05, "lm_loss": 1.9198, "loss": 2.0545, "mask_loss": 0.1249, "step": 1918, "topk_loss": 0.0098 }, { "epoch": 0.7627233376598375, "grad_norm": 0.10888671875, "learning_rate": 2.7488155651016556e-05, "lm_loss": 1.9409, "loss": 2.0746, "mask_loss": 0.1248, "step": 1919, "topk_loss": 0.009 }, { "epoch": 0.7631207964079666, "grad_norm": 0.1171875, "learning_rate": 2.7400450807686938e-05, "lm_loss": 1.9052, "loss": 2.0384, "mask_loss": 0.1241, "step": 1920, "topk_loss": 0.0091 }, { "epoch": 0.7635182551560957, "grad_norm": 0.10791015625, "learning_rate": 2.731286388769514e-05, "lm_loss": 1.872, "loss": 2.0074, "mask_loss": 0.1257, "step": 1921, "topk_loss": 0.0097 }, { "epoch": 0.7639157139042249, "grad_norm": 0.1142578125, "learning_rate": 2.722539503330843e-05, "lm_loss": 1.9372, "loss": 2.0737, "mask_loss": 0.1271, "step": 1922, "topk_loss": 0.0094 }, { "epoch": 0.764313172652354, "grad_norm": 0.1796875, "learning_rate": 2.7138044386602358e-05, "lm_loss": 1.9238, "loss": 2.0597, "mask_loss": 0.1267, "step": 1923, "topk_loss": 0.0092 }, { "epoch": 0.7647106314004831, "grad_norm": 0.1220703125, "learning_rate": 2.705081208946043e-05, "lm_loss": 1.8969, "loss": 2.0332, "mask_loss": 0.1248, "step": 1924, "topk_loss": 0.0115 }, { "epoch": 0.7651080901486124, "grad_norm": 0.1103515625, "learning_rate": 2.6963698283573958e-05, "lm_loss": 1.9095, "loss": 2.0487, "mask_loss": 0.1293, "step": 1925, "topk_loss": 0.0099 }, { "epoch": 0.7655055488967415, "grad_norm": 0.1142578125, "learning_rate": 2.6876703110441747e-05, "lm_loss": 1.8824, "loss": 2.017, "mask_loss": 0.1241, "step": 1926, "topk_loss": 0.0105 }, { "epoch": 0.7659030076448706, "grad_norm": 0.11279296875, "learning_rate": 2.6789826711369924e-05, "lm_loss": 1.8939, "loss": 2.0288, "mask_loss": 0.1253, "step": 1927, "topk_loss": 0.0096 }, { "epoch": 0.7663004663929998, "grad_norm": 0.11328125, "learning_rate": 2.670306922747171e-05, "lm_loss": 1.8682, "loss": 2.0054, "mask_loss": 0.1277, "step": 1928, "topk_loss": 0.0096 }, { "epoch": 0.7666979251411289, "grad_norm": 0.1103515625, "learning_rate": 2.6616430799667136e-05, "lm_loss": 1.9249, "loss": 2.0587, "mask_loss": 0.125, "step": 1929, "topk_loss": 0.0088 }, { "epoch": 0.767095383889258, "grad_norm": 0.126953125, "learning_rate": 2.6529911568682876e-05, "lm_loss": 1.8887, "loss": 2.0276, "mask_loss": 0.1284, "step": 1930, "topk_loss": 0.0104 }, { "epoch": 0.7674928426373872, "grad_norm": 0.11865234375, "learning_rate": 2.644351167505199e-05, "lm_loss": 1.9812, "loss": 2.1151, "mask_loss": 0.1255, "step": 1931, "topk_loss": 0.0084 }, { "epoch": 0.7678903013855164, "grad_norm": 0.1279296875, "learning_rate": 2.635723125911368e-05, "lm_loss": 1.9342, "loss": 2.0737, "mask_loss": 0.1276, "step": 1932, "topk_loss": 0.0118 }, { "epoch": 0.7682877601336455, "grad_norm": 0.12255859375, "learning_rate": 2.6271070461013116e-05, "lm_loss": 1.8474, "loss": 1.9836, "mask_loss": 0.126, "step": 1933, "topk_loss": 0.0103 }, { "epoch": 0.7686852188817747, "grad_norm": 0.1396484375, "learning_rate": 2.6185029420701136e-05, "lm_loss": 1.921, "loss": 2.0654, "mask_loss": 0.1302, "step": 1934, "topk_loss": 0.0142 }, { "epoch": 0.7690826776299038, "grad_norm": 0.11279296875, "learning_rate": 2.6099108277934103e-05, "lm_loss": 1.8728, "loss": 2.0099, "mask_loss": 0.1264, "step": 1935, "topk_loss": 0.0108 }, { "epoch": 0.7694801363780329, "grad_norm": 0.11767578125, "learning_rate": 2.6013307172273548e-05, "lm_loss": 1.8983, "loss": 2.0338, "mask_loss": 0.1256, "step": 1936, "topk_loss": 0.0098 }, { "epoch": 0.769877595126162, "grad_norm": 0.11376953125, "learning_rate": 2.59276262430861e-05, "lm_loss": 1.8407, "loss": 1.9774, "mask_loss": 0.1257, "step": 1937, "topk_loss": 0.0111 }, { "epoch": 0.7702750538742913, "grad_norm": 0.1240234375, "learning_rate": 2.5842065629543166e-05, "lm_loss": 1.8745, "loss": 2.0144, "mask_loss": 0.127, "step": 1938, "topk_loss": 0.0129 }, { "epoch": 0.7706725126224204, "grad_norm": 0.12353515625, "learning_rate": 2.575662547062071e-05, "lm_loss": 1.8346, "loss": 1.9679, "mask_loss": 0.126, "step": 1939, "topk_loss": 0.0073 }, { "epoch": 0.7710699713705496, "grad_norm": 0.1123046875, "learning_rate": 2.5671305905099075e-05, "lm_loss": 1.8815, "loss": 2.0147, "mask_loss": 0.1244, "step": 1940, "topk_loss": 0.0088 }, { "epoch": 0.7714674301186787, "grad_norm": 0.1201171875, "learning_rate": 2.558610707156268e-05, "lm_loss": 1.8496, "loss": 1.9857, "mask_loss": 0.1262, "step": 1941, "topk_loss": 0.0099 }, { "epoch": 0.7718648888668078, "grad_norm": 0.173828125, "learning_rate": 2.5501029108399866e-05, "lm_loss": 1.8575, "loss": 1.9934, "mask_loss": 0.1258, "step": 1942, "topk_loss": 0.01 }, { "epoch": 0.772262347614937, "grad_norm": 0.12158203125, "learning_rate": 2.5416072153802683e-05, "lm_loss": 1.8578, "loss": 1.9934, "mask_loss": 0.1256, "step": 1943, "topk_loss": 0.0099 }, { "epoch": 0.7726598063630662, "grad_norm": 0.12451171875, "learning_rate": 2.5331236345766517e-05, "lm_loss": 1.9165, "loss": 2.0485, "mask_loss": 0.1232, "step": 1944, "topk_loss": 0.0089 }, { "epoch": 0.7730572651111953, "grad_norm": 0.11083984375, "learning_rate": 2.5246521822090064e-05, "lm_loss": 1.8737, "loss": 2.0102, "mask_loss": 0.1256, "step": 1945, "topk_loss": 0.0109 }, { "epoch": 0.7734547238593245, "grad_norm": 0.11376953125, "learning_rate": 2.5161928720374993e-05, "lm_loss": 1.9129, "loss": 2.0471, "mask_loss": 0.1249, "step": 1946, "topk_loss": 0.0092 }, { "epoch": 0.7738521826074536, "grad_norm": 0.12890625, "learning_rate": 2.5077457178025777e-05, "lm_loss": 1.8962, "loss": 2.0343, "mask_loss": 0.1265, "step": 1947, "topk_loss": 0.0116 }, { "epoch": 0.7742496413555827, "grad_norm": 0.1171875, "learning_rate": 2.4993107332249387e-05, "lm_loss": 1.9555, "loss": 2.0913, "mask_loss": 0.1255, "step": 1948, "topk_loss": 0.0103 }, { "epoch": 0.7746471001037118, "grad_norm": 0.1171875, "learning_rate": 2.4908879320055167e-05, "lm_loss": 1.86, "loss": 1.9944, "mask_loss": 0.1253, "step": 1949, "topk_loss": 0.0091 }, { "epoch": 0.7750445588518411, "grad_norm": 0.1220703125, "learning_rate": 2.4824773278254544e-05, "lm_loss": 1.9012, "loss": 2.0344, "mask_loss": 0.1239, "step": 1950, "topk_loss": 0.0093 }, { "epoch": 0.7750445588518411, "eval_lm_loss": 686.8341064453125, "eval_loss": 686.97119140625, "eval_mask_hit_rate": 0.5358555316925049, "eval_mask_loss": 0.12442468106746674, "eval_mask_top_10_hit_rate": 0.9855944514274597, "eval_mask_top_1_hit_rate": 0.9974312782287598, "eval_mask_top_20_hit_rate": 0.9762206077575684, "eval_mask_top_5_hit_rate": 0.9908714294433594, "eval_runtime": 144.2029, "eval_samples_per_second": 14.202, "eval_steps_per_second": 7.101, "eval_token_accuracy": 0.6145423650741577, "eval_top_k_diff": -526.3795166015625, "eval_topk_loss": 0.012643979862332344, "step": 1950 }, { "epoch": 0.7754420175999702, "grad_norm": 0.1318359375, "learning_rate": 2.4740789343460857e-05, "lm_loss": 1.916, "loss": 2.054, "mask_loss": 0.1272, "step": 1951, "topk_loss": 0.0108 }, { "epoch": 0.7758394763480994, "grad_norm": 0.1591796875, "learning_rate": 2.4656927652089034e-05, "lm_loss": 1.9277, "loss": 2.0645, "mask_loss": 0.1259, "step": 1952, "topk_loss": 0.0108 }, { "epoch": 0.7762369350962285, "grad_norm": 0.11083984375, "learning_rate": 2.457318834035551e-05, "lm_loss": 1.8682, "loss": 2.0037, "mask_loss": 0.1262, "step": 1953, "topk_loss": 0.0093 }, { "epoch": 0.7766343938443576, "grad_norm": 0.1103515625, "learning_rate": 2.4489571544277945e-05, "lm_loss": 1.8163, "loss": 1.9471, "mask_loss": 0.1226, "step": 1954, "topk_loss": 0.0081 }, { "epoch": 0.7770318525924867, "grad_norm": 0.1328125, "learning_rate": 2.4406077399674963e-05, "lm_loss": 1.8933, "loss": 2.027, "mask_loss": 0.1231, "step": 1955, "topk_loss": 0.0106 }, { "epoch": 0.777429311340616, "grad_norm": 0.1396484375, "learning_rate": 2.4322706042165967e-05, "lm_loss": 1.9016, "loss": 2.0375, "mask_loss": 0.1262, "step": 1956, "topk_loss": 0.0097 }, { "epoch": 0.7778267700887451, "grad_norm": 0.138671875, "learning_rate": 2.4239457607170946e-05, "lm_loss": 1.8388, "loss": 1.9749, "mask_loss": 0.1264, "step": 1957, "topk_loss": 0.0097 }, { "epoch": 0.7782242288368743, "grad_norm": 0.11376953125, "learning_rate": 2.4156332229910182e-05, "lm_loss": 1.8868, "loss": 2.0214, "mask_loss": 0.1259, "step": 1958, "topk_loss": 0.0087 }, { "epoch": 0.7786216875850034, "grad_norm": 0.11083984375, "learning_rate": 2.4073330045404118e-05, "lm_loss": 1.939, "loss": 2.0748, "mask_loss": 0.1247, "step": 1959, "topk_loss": 0.0111 }, { "epoch": 0.7790191463331325, "grad_norm": 0.13671875, "learning_rate": 2.3990451188473073e-05, "lm_loss": 1.9089, "loss": 2.0474, "mask_loss": 0.1275, "step": 1960, "topk_loss": 0.011 }, { "epoch": 0.7794166050812616, "grad_norm": 0.1259765625, "learning_rate": 2.390769579373705e-05, "lm_loss": 1.8258, "loss": 1.9621, "mask_loss": 0.1266, "step": 1961, "topk_loss": 0.0097 }, { "epoch": 0.7798140638293908, "grad_norm": 0.1083984375, "learning_rate": 2.3825063995615505e-05, "lm_loss": 1.8292, "loss": 1.9649, "mask_loss": 0.1264, "step": 1962, "topk_loss": 0.0092 }, { "epoch": 0.78021152257752, "grad_norm": 0.1416015625, "learning_rate": 2.3742555928327137e-05, "lm_loss": 1.8748, "loss": 2.012, "mask_loss": 0.1279, "step": 1963, "topk_loss": 0.0093 }, { "epoch": 0.7806089813256492, "grad_norm": 0.11376953125, "learning_rate": 2.36601717258897e-05, "lm_loss": 1.9319, "loss": 2.0692, "mask_loss": 0.1264, "step": 1964, "topk_loss": 0.0109 }, { "epoch": 0.7810064400737783, "grad_norm": 0.1181640625, "learning_rate": 2.35779115221197e-05, "lm_loss": 1.9059, "loss": 2.0435, "mask_loss": 0.1269, "step": 1965, "topk_loss": 0.0107 }, { "epoch": 0.7814038988219074, "grad_norm": 0.11279296875, "learning_rate": 2.3495775450632283e-05, "lm_loss": 1.9232, "loss": 2.0584, "mask_loss": 0.1263, "step": 1966, "topk_loss": 0.0089 }, { "epoch": 0.7818013575700365, "grad_norm": 0.1259765625, "learning_rate": 2.341376364484097e-05, "lm_loss": 1.9051, "loss": 2.0416, "mask_loss": 0.1266, "step": 1967, "topk_loss": 0.0099 }, { "epoch": 0.7821988163181657, "grad_norm": 0.126953125, "learning_rate": 2.333187623795734e-05, "lm_loss": 1.9181, "loss": 2.0577, "mask_loss": 0.1278, "step": 1968, "topk_loss": 0.0118 }, { "epoch": 0.7825962750662949, "grad_norm": 0.11572265625, "learning_rate": 2.325011336299103e-05, "lm_loss": 1.9011, "loss": 2.0339, "mask_loss": 0.1235, "step": 1969, "topk_loss": 0.0094 }, { "epoch": 0.782993733814424, "grad_norm": 0.11376953125, "learning_rate": 2.3168475152749346e-05, "lm_loss": 1.8592, "loss": 1.9931, "mask_loss": 0.1244, "step": 1970, "topk_loss": 0.0095 }, { "epoch": 0.7833911925625532, "grad_norm": 0.1201171875, "learning_rate": 2.308696173983711e-05, "lm_loss": 1.9293, "loss": 2.0637, "mask_loss": 0.1257, "step": 1971, "topk_loss": 0.0087 }, { "epoch": 0.7837886513106823, "grad_norm": 0.1083984375, "learning_rate": 2.3005573256656443e-05, "lm_loss": 1.8757, "loss": 2.0112, "mask_loss": 0.1256, "step": 1972, "topk_loss": 0.0099 }, { "epoch": 0.7841861100588114, "grad_norm": 0.1240234375, "learning_rate": 2.292430983540652e-05, "lm_loss": 1.927, "loss": 2.067, "mask_loss": 0.1275, "step": 1973, "topk_loss": 0.0124 }, { "epoch": 0.7845835688069406, "grad_norm": 0.1162109375, "learning_rate": 2.2843171608083414e-05, "lm_loss": 1.8984, "loss": 2.0312, "mask_loss": 0.1242, "step": 1974, "topk_loss": 0.0085 }, { "epoch": 0.7849810275550698, "grad_norm": 0.13671875, "learning_rate": 2.276215870647983e-05, "lm_loss": 1.9559, "loss": 2.0891, "mask_loss": 0.1237, "step": 1975, "topk_loss": 0.0095 }, { "epoch": 0.785378486303199, "grad_norm": 0.1318359375, "learning_rate": 2.2681271262184856e-05, "lm_loss": 1.8774, "loss": 2.0193, "mask_loss": 0.1289, "step": 1976, "topk_loss": 0.013 }, { "epoch": 0.7857759450513281, "grad_norm": 0.1123046875, "learning_rate": 2.260050940658388e-05, "lm_loss": 1.9079, "loss": 2.0424, "mask_loss": 0.1248, "step": 1977, "topk_loss": 0.0097 }, { "epoch": 0.7861734037994572, "grad_norm": 0.11962890625, "learning_rate": 2.251987327085825e-05, "lm_loss": 1.9246, "loss": 2.0578, "mask_loss": 0.123, "step": 1978, "topk_loss": 0.0102 }, { "epoch": 0.7865708625475863, "grad_norm": 0.1630859375, "learning_rate": 2.2439362985985124e-05, "lm_loss": 1.8482, "loss": 1.9903, "mask_loss": 0.1281, "step": 1979, "topk_loss": 0.014 }, { "epoch": 0.7869683212957155, "grad_norm": 0.11376953125, "learning_rate": 2.235897868273723e-05, "lm_loss": 1.8265, "loss": 1.9625, "mask_loss": 0.1257, "step": 1980, "topk_loss": 0.0103 }, { "epoch": 0.7873657800438447, "grad_norm": 0.12890625, "learning_rate": 2.2278720491682682e-05, "lm_loss": 1.9383, "loss": 2.0739, "mask_loss": 0.1263, "step": 1981, "topk_loss": 0.0093 }, { "epoch": 0.7877632387919739, "grad_norm": 0.12451171875, "learning_rate": 2.2198588543184728e-05, "lm_loss": 1.8872, "loss": 2.0209, "mask_loss": 0.1233, "step": 1982, "topk_loss": 0.0105 }, { "epoch": 0.788160697540103, "grad_norm": 0.134765625, "learning_rate": 2.2118582967401604e-05, "lm_loss": 1.9392, "loss": 2.0782, "mask_loss": 0.1266, "step": 1983, "topk_loss": 0.0124 }, { "epoch": 0.7885581562882321, "grad_norm": 0.11474609375, "learning_rate": 2.2038703894286182e-05, "lm_loss": 1.8916, "loss": 2.0264, "mask_loss": 0.126, "step": 1984, "topk_loss": 0.0088 }, { "epoch": 0.7889556150363612, "grad_norm": 0.119140625, "learning_rate": 2.1958951453585964e-05, "lm_loss": 1.9144, "loss": 2.0487, "mask_loss": 0.1246, "step": 1985, "topk_loss": 0.0097 }, { "epoch": 0.7893530737844904, "grad_norm": 0.1142578125, "learning_rate": 2.187932577484271e-05, "lm_loss": 1.8908, "loss": 2.0243, "mask_loss": 0.1248, "step": 1986, "topk_loss": 0.0087 }, { "epoch": 0.7897505325326195, "grad_norm": 0.1083984375, "learning_rate": 2.179982698739228e-05, "lm_loss": 1.8755, "loss": 2.0111, "mask_loss": 0.1267, "step": 1987, "topk_loss": 0.0089 }, { "epoch": 0.7901479912807488, "grad_norm": 0.1162109375, "learning_rate": 2.1720455220364444e-05, "lm_loss": 1.919, "loss": 2.0554, "mask_loss": 0.1264, "step": 1988, "topk_loss": 0.01 }, { "epoch": 0.7905454500288779, "grad_norm": 0.109375, "learning_rate": 2.1641210602682637e-05, "lm_loss": 1.9149, "loss": 2.0494, "mask_loss": 0.1248, "step": 1989, "topk_loss": 0.0097 }, { "epoch": 0.790942908777007, "grad_norm": 0.11083984375, "learning_rate": 2.1562093263063777e-05, "lm_loss": 1.8652, "loss": 2.0003, "mask_loss": 0.1255, "step": 1990, "topk_loss": 0.0097 }, { "epoch": 0.7913403675251361, "grad_norm": 0.1181640625, "learning_rate": 2.148310333001804e-05, "lm_loss": 1.9265, "loss": 2.0623, "mask_loss": 0.1266, "step": 1991, "topk_loss": 0.0093 }, { "epoch": 0.7917378262732653, "grad_norm": 0.18359375, "learning_rate": 2.140424093184864e-05, "lm_loss": 1.8628, "loss": 1.9956, "mask_loss": 0.1235, "step": 1992, "topk_loss": 0.0093 }, { "epoch": 0.7921352850213944, "grad_norm": 0.126953125, "learning_rate": 2.132550619665168e-05, "lm_loss": 1.825, "loss": 1.9607, "mask_loss": 0.1272, "step": 1993, "topk_loss": 0.0085 }, { "epoch": 0.7925327437695237, "grad_norm": 0.11328125, "learning_rate": 2.1246899252315843e-05, "lm_loss": 1.8818, "loss": 2.0198, "mask_loss": 0.1276, "step": 1994, "topk_loss": 0.0104 }, { "epoch": 0.7929302025176528, "grad_norm": 0.12109375, "learning_rate": 2.116842022652228e-05, "lm_loss": 1.9266, "loss": 2.0608, "mask_loss": 0.1245, "step": 1995, "topk_loss": 0.0097 }, { "epoch": 0.7933276612657819, "grad_norm": 0.11328125, "learning_rate": 2.109006924674436e-05, "lm_loss": 1.9416, "loss": 2.0784, "mask_loss": 0.1262, "step": 1996, "topk_loss": 0.0106 }, { "epoch": 0.793725120013911, "grad_norm": 0.1787109375, "learning_rate": 2.101184644024745e-05, "lm_loss": 1.8627, "loss": 2.0, "mask_loss": 0.1266, "step": 1997, "topk_loss": 0.0107 }, { "epoch": 0.7941225787620402, "grad_norm": 0.107421875, "learning_rate": 2.0933751934088743e-05, "lm_loss": 1.9036, "loss": 2.038, "mask_loss": 0.1254, "step": 1998, "topk_loss": 0.0091 }, { "epoch": 0.7945200375101693, "grad_norm": 0.115234375, "learning_rate": 2.085578585511705e-05, "lm_loss": 1.9016, "loss": 2.0373, "mask_loss": 0.1251, "step": 1999, "topk_loss": 0.0105 }, { "epoch": 0.7949174962582986, "grad_norm": 0.11474609375, "learning_rate": 2.0777948329972497e-05, "lm_loss": 1.9414, "loss": 2.0777, "mask_loss": 0.1254, "step": 2000, "topk_loss": 0.0109 }, { "epoch": 0.7949174962582986, "eval_lm_loss": 687.2774658203125, "eval_loss": 687.4144287109375, "eval_mask_hit_rate": 0.5359091758728027, "eval_mask_loss": 0.12442217767238617, "eval_mask_top_10_hit_rate": 0.9855918884277344, "eval_mask_top_1_hit_rate": 0.9974300861358643, "eval_mask_top_20_hit_rate": 0.9762183427810669, "eval_mask_top_5_hit_rate": 0.9908632636070251, "eval_runtime": 143.846, "eval_samples_per_second": 14.237, "eval_steps_per_second": 7.119, "eval_token_accuracy": 0.6145159006118774, "eval_top_k_diff": -529.1758422851562, "eval_topk_loss": 0.012563161551952362, "step": 2000 }, { "epoch": 0.7953149550064277, "grad_norm": 0.12109375, "learning_rate": 2.0700239485086505e-05, "lm_loss": 1.8677, "loss": 2.0028, "mask_loss": 0.1262, "step": 2001, "topk_loss": 0.0089 }, { "epoch": 0.7957124137545568, "grad_norm": 0.11865234375, "learning_rate": 2.06226594466814e-05, "lm_loss": 1.9036, "loss": 2.0411, "mask_loss": 0.1271, "step": 2002, "topk_loss": 0.0104 }, { "epoch": 0.796109872502686, "grad_norm": 0.10546875, "learning_rate": 2.054520834077036e-05, "lm_loss": 1.8778, "loss": 2.015, "mask_loss": 0.1279, "step": 2003, "topk_loss": 0.0093 }, { "epoch": 0.7965073312508151, "grad_norm": 0.10595703125, "learning_rate": 2.046788629315707e-05, "lm_loss": 1.8541, "loss": 1.9886, "mask_loss": 0.1247, "step": 2004, "topk_loss": 0.0099 }, { "epoch": 0.7969047899989442, "grad_norm": 0.1298828125, "learning_rate": 2.0390693429435627e-05, "lm_loss": 1.9962, "loss": 2.1304, "mask_loss": 0.1244, "step": 2005, "topk_loss": 0.0098 }, { "epoch": 0.7973022487470734, "grad_norm": 0.1376953125, "learning_rate": 2.031362987499027e-05, "lm_loss": 1.9665, "loss": 2.0999, "mask_loss": 0.1246, "step": 2006, "topk_loss": 0.0089 }, { "epoch": 0.7976997074952026, "grad_norm": 0.11572265625, "learning_rate": 2.023669575499526e-05, "lm_loss": 1.9415, "loss": 2.0783, "mask_loss": 0.1253, "step": 2007, "topk_loss": 0.0115 }, { "epoch": 0.7980971662433317, "grad_norm": 0.1220703125, "learning_rate": 2.0159891194414504e-05, "lm_loss": 1.9272, "loss": 2.0644, "mask_loss": 0.1267, "step": 2008, "topk_loss": 0.0105 }, { "epoch": 0.7984946249914608, "grad_norm": 0.1171875, "learning_rate": 2.0083216318001564e-05, "lm_loss": 1.9089, "loss": 2.0452, "mask_loss": 0.1251, "step": 2009, "topk_loss": 0.0112 }, { "epoch": 0.79889208373959, "grad_norm": 0.1123046875, "learning_rate": 2.0006671250299337e-05, "lm_loss": 1.8893, "loss": 2.0264, "mask_loss": 0.1266, "step": 2010, "topk_loss": 0.0105 }, { "epoch": 0.7992895424877191, "grad_norm": 0.125, "learning_rate": 1.9930256115639832e-05, "lm_loss": 1.7778, "loss": 1.916, "mask_loss": 0.1284, "step": 2011, "topk_loss": 0.0099 }, { "epoch": 0.7996870012358483, "grad_norm": 0.11328125, "learning_rate": 1.985397103814407e-05, "lm_loss": 1.8974, "loss": 2.0344, "mask_loss": 0.1263, "step": 2012, "topk_loss": 0.0107 }, { "epoch": 0.8000844599839775, "grad_norm": 0.1337890625, "learning_rate": 1.977781614172176e-05, "lm_loss": 1.8861, "loss": 2.0212, "mask_loss": 0.1253, "step": 2013, "topk_loss": 0.0098 }, { "epoch": 0.8004819187321066, "grad_norm": 0.11669921875, "learning_rate": 1.9701791550071202e-05, "lm_loss": 1.8094, "loss": 1.9439, "mask_loss": 0.1254, "step": 2014, "topk_loss": 0.009 }, { "epoch": 0.8008793774802357, "grad_norm": 0.11865234375, "learning_rate": 1.9625897386679038e-05, "lm_loss": 1.8877, "loss": 2.0223, "mask_loss": 0.1257, "step": 2015, "topk_loss": 0.0089 }, { "epoch": 0.8012768362283649, "grad_norm": 0.1103515625, "learning_rate": 1.9550133774820002e-05, "lm_loss": 1.9153, "loss": 2.0523, "mask_loss": 0.1264, "step": 2016, "topk_loss": 0.0106 }, { "epoch": 0.801674294976494, "grad_norm": 0.1357421875, "learning_rate": 1.9474500837556842e-05, "lm_loss": 1.9657, "loss": 2.0985, "mask_loss": 0.1237, "step": 2017, "topk_loss": 0.0091 }, { "epoch": 0.8020717537246231, "grad_norm": 0.1171875, "learning_rate": 1.9398998697740002e-05, "lm_loss": 1.8875, "loss": 2.0217, "mask_loss": 0.1255, "step": 2018, "topk_loss": 0.0087 }, { "epoch": 0.8024692124727524, "grad_norm": 0.1103515625, "learning_rate": 1.9323627478007522e-05, "lm_loss": 1.9256, "loss": 2.0609, "mask_loss": 0.1257, "step": 2019, "topk_loss": 0.0096 }, { "epoch": 0.8028666712208815, "grad_norm": 0.1123046875, "learning_rate": 1.924838730078474e-05, "lm_loss": 1.8953, "loss": 2.0295, "mask_loss": 0.125, "step": 2020, "topk_loss": 0.0092 }, { "epoch": 0.8032641299690106, "grad_norm": 0.11962890625, "learning_rate": 1.917327828828417e-05, "lm_loss": 1.9119, "loss": 2.049, "mask_loss": 0.1267, "step": 2021, "topk_loss": 0.0104 }, { "epoch": 0.8036615887171398, "grad_norm": 0.11865234375, "learning_rate": 1.9098300562505266e-05, "lm_loss": 1.9271, "loss": 2.0639, "mask_loss": 0.1258, "step": 2022, "topk_loss": 0.0109 }, { "epoch": 0.8040590474652689, "grad_norm": 0.123046875, "learning_rate": 1.902345424523423e-05, "lm_loss": 1.9097, "loss": 2.048, "mask_loss": 0.1268, "step": 2023, "topk_loss": 0.0115 }, { "epoch": 0.804456506213398, "grad_norm": 0.111328125, "learning_rate": 1.894873945804383e-05, "lm_loss": 1.9408, "loss": 2.0737, "mask_loss": 0.1237, "step": 2024, "topk_loss": 0.0092 }, { "epoch": 0.8048539649615273, "grad_norm": 0.10791015625, "learning_rate": 1.887415632229318e-05, "lm_loss": 1.8618, "loss": 1.9936, "mask_loss": 0.123, "step": 2025, "topk_loss": 0.0088 }, { "epoch": 0.8052514237096564, "grad_norm": 0.1435546875, "learning_rate": 1.879970495912755e-05, "lm_loss": 1.9102, "loss": 2.05, "mask_loss": 0.1281, "step": 2026, "topk_loss": 0.0117 }, { "epoch": 0.8056488824577855, "grad_norm": 0.11474609375, "learning_rate": 1.8725385489478176e-05, "lm_loss": 1.8907, "loss": 2.028, "mask_loss": 0.1261, "step": 2027, "topk_loss": 0.0112 }, { "epoch": 0.8060463412059147, "grad_norm": 0.1455078125, "learning_rate": 1.8651198034062058e-05, "lm_loss": 1.9389, "loss": 2.0798, "mask_loss": 0.1271, "step": 2028, "topk_loss": 0.0138 }, { "epoch": 0.8064437999540438, "grad_norm": 0.12890625, "learning_rate": 1.857714271338178e-05, "lm_loss": 1.9314, "loss": 2.0663, "mask_loss": 0.1254, "step": 2029, "topk_loss": 0.0095 }, { "epoch": 0.8068412587021729, "grad_norm": 0.11279296875, "learning_rate": 1.850321964772528e-05, "lm_loss": 1.9748, "loss": 2.1091, "mask_loss": 0.1245, "step": 2030, "topk_loss": 0.0098 }, { "epoch": 0.8072387174503022, "grad_norm": 0.1171875, "learning_rate": 1.8429428957165696e-05, "lm_loss": 1.8888, "loss": 2.0244, "mask_loss": 0.1255, "step": 2031, "topk_loss": 0.0102 }, { "epoch": 0.8076361761984313, "grad_norm": 0.11865234375, "learning_rate": 1.8355770761561098e-05, "lm_loss": 1.9131, "loss": 2.0502, "mask_loss": 0.1255, "step": 2032, "topk_loss": 0.0116 }, { "epoch": 0.8080336349465604, "grad_norm": 0.11376953125, "learning_rate": 1.8282245180554413e-05, "lm_loss": 1.934, "loss": 2.0718, "mask_loss": 0.1279, "step": 2033, "topk_loss": 0.01 }, { "epoch": 0.8084310936946896, "grad_norm": 0.1162109375, "learning_rate": 1.820885233357311e-05, "lm_loss": 1.9349, "loss": 2.0669, "mask_loss": 0.1228, "step": 2034, "topk_loss": 0.0092 }, { "epoch": 0.8088285524428187, "grad_norm": 0.11669921875, "learning_rate": 1.8135592339829098e-05, "lm_loss": 1.9668, "loss": 2.1008, "mask_loss": 0.1246, "step": 2035, "topk_loss": 0.0094 }, { "epoch": 0.8092260111909478, "grad_norm": 0.11865234375, "learning_rate": 1.8062465318318454e-05, "lm_loss": 1.9336, "loss": 2.0664, "mask_loss": 0.1231, "step": 2036, "topk_loss": 0.0098 }, { "epoch": 0.8096234699390771, "grad_norm": 0.1162109375, "learning_rate": 1.798947138782131e-05, "lm_loss": 1.8986, "loss": 2.0331, "mask_loss": 0.1257, "step": 2037, "topk_loss": 0.0087 }, { "epoch": 0.8100209286872062, "grad_norm": 0.109375, "learning_rate": 1.791661066690159e-05, "lm_loss": 1.8422, "loss": 1.9779, "mask_loss": 0.1263, "step": 2038, "topk_loss": 0.0094 }, { "epoch": 0.8104183874353353, "grad_norm": 0.1201171875, "learning_rate": 1.784388327390687e-05, "lm_loss": 1.8936, "loss": 2.0325, "mask_loss": 0.1274, "step": 2039, "topk_loss": 0.0116 }, { "epoch": 0.8108158461834645, "grad_norm": 0.115234375, "learning_rate": 1.7771289326968098e-05, "lm_loss": 1.9652, "loss": 2.096, "mask_loss": 0.1214, "step": 2040, "topk_loss": 0.0095 }, { "epoch": 0.8112133049315936, "grad_norm": 0.1162109375, "learning_rate": 1.7698828943999545e-05, "lm_loss": 1.8781, "loss": 2.0122, "mask_loss": 0.1251, "step": 2041, "topk_loss": 0.0091 }, { "epoch": 0.8116107636797227, "grad_norm": 0.11376953125, "learning_rate": 1.7626502242698484e-05, "lm_loss": 1.8796, "loss": 2.0157, "mask_loss": 0.1256, "step": 2042, "topk_loss": 0.0105 }, { "epoch": 0.8120082224278519, "grad_norm": 0.115234375, "learning_rate": 1.7554309340545084e-05, "lm_loss": 1.8795, "loss": 2.0135, "mask_loss": 0.1251, "step": 2043, "topk_loss": 0.0089 }, { "epoch": 0.8124056811759811, "grad_norm": 0.1181640625, "learning_rate": 1.7482250354802156e-05, "lm_loss": 1.896, "loss": 2.0344, "mask_loss": 0.1283, "step": 2044, "topk_loss": 0.0101 }, { "epoch": 0.8128031399241102, "grad_norm": 0.1298828125, "learning_rate": 1.7410325402515003e-05, "lm_loss": 1.859, "loss": 1.9968, "mask_loss": 0.128, "step": 2045, "topk_loss": 0.0097 }, { "epoch": 0.8132005986722394, "grad_norm": 0.11865234375, "learning_rate": 1.7338534600511224e-05, "lm_loss": 1.9085, "loss": 2.0445, "mask_loss": 0.1256, "step": 2046, "topk_loss": 0.0104 }, { "epoch": 0.8135980574203685, "grad_norm": 0.1083984375, "learning_rate": 1.7266878065400527e-05, "lm_loss": 1.9167, "loss": 2.0494, "mask_loss": 0.1244, "step": 2047, "topk_loss": 0.0083 }, { "epoch": 0.8139955161684976, "grad_norm": 0.11376953125, "learning_rate": 1.719535591357446e-05, "lm_loss": 1.8987, "loss": 2.0348, "mask_loss": 0.1256, "step": 2048, "topk_loss": 0.0106 }, { "epoch": 0.8143929749166268, "grad_norm": 0.1123046875, "learning_rate": 1.712396826120639e-05, "lm_loss": 1.8859, "loss": 2.0243, "mask_loss": 0.1277, "step": 2049, "topk_loss": 0.0106 }, { "epoch": 0.814790433664756, "grad_norm": 0.11279296875, "learning_rate": 1.7052715224251147e-05, "lm_loss": 1.9431, "loss": 2.0784, "mask_loss": 0.1251, "step": 2050, "topk_loss": 0.0102 }, { "epoch": 0.814790433664756, "eval_lm_loss": 687.176025390625, "eval_loss": 687.31298828125, "eval_mask_hit_rate": 0.5359328389167786, "eval_mask_loss": 0.12440869212150574, "eval_mask_top_10_hit_rate": 0.9855953454971313, "eval_mask_top_1_hit_rate": 0.9974322319030762, "eval_mask_top_20_hit_rate": 0.9762266874313354, "eval_mask_top_5_hit_rate": 0.9908630847930908, "eval_runtime": 144.3186, "eval_samples_per_second": 14.191, "eval_steps_per_second": 7.095, "eval_token_accuracy": 0.6145559549331665, "eval_top_k_diff": -528.7608642578125, "eval_topk_loss": 0.0125643415376544, "step": 2050 }, { "epoch": 0.8151878924128851, "grad_norm": 0.11865234375, "learning_rate": 1.6981596918444953e-05, "lm_loss": 1.9253, "loss": 2.0592, "mask_loss": 0.1247, "step": 2051, "topk_loss": 0.0092 }, { "epoch": 0.8155853511610143, "grad_norm": 0.12109375, "learning_rate": 1.6910613459305146e-05, "lm_loss": 1.9252, "loss": 2.0608, "mask_loss": 0.1256, "step": 2052, "topk_loss": 0.01 }, { "epoch": 0.8159828099091434, "grad_norm": 0.130859375, "learning_rate": 1.6839764962130057e-05, "lm_loss": 1.7789, "loss": 1.9157, "mask_loss": 0.1272, "step": 2053, "topk_loss": 0.0096 }, { "epoch": 0.8163802686572725, "grad_norm": 0.11865234375, "learning_rate": 1.6769051541998803e-05, "lm_loss": 1.8885, "loss": 2.0259, "mask_loss": 0.1276, "step": 2054, "topk_loss": 0.0097 }, { "epoch": 0.8167777274054017, "grad_norm": 0.11279296875, "learning_rate": 1.669847331377109e-05, "lm_loss": 1.9007, "loss": 2.0387, "mask_loss": 0.1283, "step": 2055, "topk_loss": 0.0098 }, { "epoch": 0.8171751861535309, "grad_norm": 0.1171875, "learning_rate": 1.6628030392087e-05, "lm_loss": 1.9148, "loss": 2.0535, "mask_loss": 0.1284, "step": 2056, "topk_loss": 0.0103 }, { "epoch": 0.81757264490166, "grad_norm": 0.1181640625, "learning_rate": 1.6557722891366878e-05, "lm_loss": 1.9268, "loss": 2.0637, "mask_loss": 0.1273, "step": 2057, "topk_loss": 0.0097 }, { "epoch": 0.8179701036497892, "grad_norm": 0.1220703125, "learning_rate": 1.6487550925811092e-05, "lm_loss": 1.8928, "loss": 2.0282, "mask_loss": 0.1265, "step": 2058, "topk_loss": 0.0089 }, { "epoch": 0.8183675623979183, "grad_norm": 0.1201171875, "learning_rate": 1.6417514609399865e-05, "lm_loss": 1.958, "loss": 2.0927, "mask_loss": 0.1248, "step": 2059, "topk_loss": 0.0099 }, { "epoch": 0.8187650211460474, "grad_norm": 0.12890625, "learning_rate": 1.6347614055893055e-05, "lm_loss": 1.8695, "loss": 2.0049, "mask_loss": 0.1256, "step": 2060, "topk_loss": 0.0098 }, { "epoch": 0.8191624798941766, "grad_norm": 0.12255859375, "learning_rate": 1.6277849378830057e-05, "lm_loss": 1.8947, "loss": 2.0258, "mask_loss": 0.123, "step": 2061, "topk_loss": 0.0081 }, { "epoch": 0.8195599386423058, "grad_norm": 0.1083984375, "learning_rate": 1.620822069152952e-05, "lm_loss": 1.9621, "loss": 2.0934, "mask_loss": 0.1222, "step": 2062, "topk_loss": 0.0091 }, { "epoch": 0.8199573973904349, "grad_norm": 0.11376953125, "learning_rate": 1.613872810708921e-05, "lm_loss": 1.8747, "loss": 2.0133, "mask_loss": 0.1287, "step": 2063, "topk_loss": 0.0099 }, { "epoch": 0.8203548561385641, "grad_norm": 0.1416015625, "learning_rate": 1.606937173838582e-05, "lm_loss": 1.7899, "loss": 1.927, "mask_loss": 0.1278, "step": 2064, "topk_loss": 0.0093 }, { "epoch": 0.8207523148866932, "grad_norm": 0.10888671875, "learning_rate": 1.6000151698074816e-05, "lm_loss": 1.9232, "loss": 2.0579, "mask_loss": 0.125, "step": 2065, "topk_loss": 0.0098 }, { "epoch": 0.8211497736348223, "grad_norm": 0.125, "learning_rate": 1.5931068098590186e-05, "lm_loss": 1.8724, "loss": 2.007, "mask_loss": 0.1244, "step": 2066, "topk_loss": 0.0103 }, { "epoch": 0.8215472323829515, "grad_norm": 0.12353515625, "learning_rate": 1.586212105214432e-05, "lm_loss": 1.9352, "loss": 2.0713, "mask_loss": 0.1256, "step": 2067, "topk_loss": 0.0104 }, { "epoch": 0.8219446911310807, "grad_norm": 0.1103515625, "learning_rate": 1.5793310670727814e-05, "lm_loss": 1.8989, "loss": 2.0332, "mask_loss": 0.124, "step": 2068, "topk_loss": 0.0103 }, { "epoch": 0.8223421498792098, "grad_norm": 0.12109375, "learning_rate": 1.5724637066109248e-05, "lm_loss": 1.9171, "loss": 2.0565, "mask_loss": 0.1268, "step": 2069, "topk_loss": 0.0126 }, { "epoch": 0.822739608627339, "grad_norm": 0.10302734375, "learning_rate": 1.5656100349835057e-05, "lm_loss": 1.9178, "loss": 2.0523, "mask_loss": 0.1252, "step": 2070, "topk_loss": 0.0093 }, { "epoch": 0.8231370673754681, "grad_norm": 0.1123046875, "learning_rate": 1.5587700633229363e-05, "lm_loss": 1.8394, "loss": 1.9755, "mask_loss": 0.1272, "step": 2071, "topk_loss": 0.0089 }, { "epoch": 0.8235345261235972, "grad_norm": 0.10595703125, "learning_rate": 1.5519438027393662e-05, "lm_loss": 1.8679, "loss": 1.998, "mask_loss": 0.1218, "step": 2072, "topk_loss": 0.0083 }, { "epoch": 0.8239319848717264, "grad_norm": 0.1279296875, "learning_rate": 1.5451312643206827e-05, "lm_loss": 1.9522, "loss": 2.088, "mask_loss": 0.1257, "step": 2073, "topk_loss": 0.0101 }, { "epoch": 0.8243294436198555, "grad_norm": 0.1123046875, "learning_rate": 1.538332459132482e-05, "lm_loss": 1.8928, "loss": 2.0295, "mask_loss": 0.1271, "step": 2074, "topk_loss": 0.0095 }, { "epoch": 0.8247269023679847, "grad_norm": 0.12158203125, "learning_rate": 1.531547398218053e-05, "lm_loss": 1.8445, "loss": 1.9775, "mask_loss": 0.1244, "step": 2075, "topk_loss": 0.0085 }, { "epoch": 0.8251243611161139, "grad_norm": 0.111328125, "learning_rate": 1.5247760925983601e-05, "lm_loss": 1.9218, "loss": 2.0537, "mask_loss": 0.1243, "step": 2076, "topk_loss": 0.0077 }, { "epoch": 0.825521819864243, "grad_norm": 0.11083984375, "learning_rate": 1.5180185532720237e-05, "lm_loss": 1.9045, "loss": 2.0409, "mask_loss": 0.126, "step": 2077, "topk_loss": 0.0104 }, { "epoch": 0.8259192786123721, "grad_norm": 0.11962890625, "learning_rate": 1.5112747912153057e-05, "lm_loss": 1.8387, "loss": 1.9758, "mask_loss": 0.1274, "step": 2078, "topk_loss": 0.0097 }, { "epoch": 0.8263167373605013, "grad_norm": 0.11376953125, "learning_rate": 1.5045448173820908e-05, "lm_loss": 1.8736, "loss": 2.0055, "mask_loss": 0.1237, "step": 2079, "topk_loss": 0.0081 }, { "epoch": 0.8267141961086304, "grad_norm": 0.11376953125, "learning_rate": 1.4978286427038601e-05, "lm_loss": 1.971, "loss": 2.1062, "mask_loss": 0.1252, "step": 2080, "topk_loss": 0.01 }, { "epoch": 0.8271116548567596, "grad_norm": 0.11767578125, "learning_rate": 1.4911262780896884e-05, "lm_loss": 1.8702, "loss": 2.0083, "mask_loss": 0.128, "step": 2081, "topk_loss": 0.0101 }, { "epoch": 0.8275091136048888, "grad_norm": 0.1298828125, "learning_rate": 1.4844377344262172e-05, "lm_loss": 1.895, "loss": 2.0298, "mask_loss": 0.1256, "step": 2082, "topk_loss": 0.0092 }, { "epoch": 0.8279065723530179, "grad_norm": 0.11181640625, "learning_rate": 1.4777630225776374e-05, "lm_loss": 1.8548, "loss": 1.9897, "mask_loss": 0.1253, "step": 2083, "topk_loss": 0.0096 }, { "epoch": 0.828304031101147, "grad_norm": 0.123046875, "learning_rate": 1.4711021533856728e-05, "lm_loss": 1.9464, "loss": 2.0808, "mask_loss": 0.1257, "step": 2084, "topk_loss": 0.0087 }, { "epoch": 0.8287014898492762, "grad_norm": 0.173828125, "learning_rate": 1.4644551376695636e-05, "lm_loss": 1.8577, "loss": 1.9984, "mask_loss": 0.1267, "step": 2085, "topk_loss": 0.014 }, { "epoch": 0.8290989485974053, "grad_norm": 0.1083984375, "learning_rate": 1.4578219862260478e-05, "lm_loss": 1.8929, "loss": 2.0281, "mask_loss": 0.1265, "step": 2086, "topk_loss": 0.0087 }, { "epoch": 0.8294964073455345, "grad_norm": 0.11572265625, "learning_rate": 1.4512027098293445e-05, "lm_loss": 1.8918, "loss": 2.0291, "mask_loss": 0.126, "step": 2087, "topk_loss": 0.0113 }, { "epoch": 0.8298938660936637, "grad_norm": 0.107421875, "learning_rate": 1.4445973192311312e-05, "lm_loss": 1.8956, "loss": 2.03, "mask_loss": 0.1252, "step": 2088, "topk_loss": 0.0092 }, { "epoch": 0.8302913248417928, "grad_norm": 0.11181640625, "learning_rate": 1.4380058251605343e-05, "lm_loss": 1.873, "loss": 2.01, "mask_loss": 0.1255, "step": 2089, "topk_loss": 0.0115 }, { "epoch": 0.8306887835899219, "grad_norm": 0.126953125, "learning_rate": 1.4314282383241096e-05, "lm_loss": 1.9036, "loss": 2.0418, "mask_loss": 0.1276, "step": 2090, "topk_loss": 0.0106 }, { "epoch": 0.8310862423380511, "grad_norm": 0.1513671875, "learning_rate": 1.4248645694058193e-05, "lm_loss": 1.8892, "loss": 2.0246, "mask_loss": 0.1255, "step": 2091, "topk_loss": 0.0099 }, { "epoch": 0.8314837010861802, "grad_norm": 0.1083984375, "learning_rate": 1.4183148290670223e-05, "lm_loss": 1.9051, "loss": 2.0387, "mask_loss": 0.1245, "step": 2092, "topk_loss": 0.0091 }, { "epoch": 0.8318811598343094, "grad_norm": 0.1298828125, "learning_rate": 1.4117790279464526e-05, "lm_loss": 1.8981, "loss": 2.0332, "mask_loss": 0.1252, "step": 2093, "topk_loss": 0.0098 }, { "epoch": 0.8322786185824386, "grad_norm": 0.126953125, "learning_rate": 1.4052571766601996e-05, "lm_loss": 1.8772, "loss": 2.0117, "mask_loss": 0.1258, "step": 2094, "topk_loss": 0.0087 }, { "epoch": 0.8326760773305677, "grad_norm": 0.125, "learning_rate": 1.3987492858016994e-05, "lm_loss": 1.9037, "loss": 2.0373, "mask_loss": 0.1247, "step": 2095, "topk_loss": 0.0089 }, { "epoch": 0.8330735360786968, "grad_norm": 0.1318359375, "learning_rate": 1.392255365941707e-05, "lm_loss": 1.9019, "loss": 2.0371, "mask_loss": 0.125, "step": 2096, "topk_loss": 0.0102 }, { "epoch": 0.833470994826826, "grad_norm": 0.11083984375, "learning_rate": 1.3857754276282875e-05, "lm_loss": 1.813, "loss": 1.9519, "mask_loss": 0.1284, "step": 2097, "topk_loss": 0.0106 }, { "epoch": 0.8338684535749551, "grad_norm": 0.12890625, "learning_rate": 1.3793094813867947e-05, "lm_loss": 1.9538, "loss": 2.0897, "mask_loss": 0.1252, "step": 2098, "topk_loss": 0.0108 }, { "epoch": 0.8342659123230842, "grad_norm": 0.11572265625, "learning_rate": 1.372857537719855e-05, "lm_loss": 1.9157, "loss": 2.0535, "mask_loss": 0.1253, "step": 2099, "topk_loss": 0.0126 }, { "epoch": 0.8346633710712135, "grad_norm": 0.1376953125, "learning_rate": 1.3664196071073521e-05, "lm_loss": 1.8927, "loss": 2.0387, "mask_loss": 0.1296, "step": 2100, "topk_loss": 0.0164 }, { "epoch": 0.8346633710712135, "eval_lm_loss": 687.4456787109375, "eval_loss": 687.58251953125, "eval_mask_hit_rate": 0.5359505414962769, "eval_mask_loss": 0.12440785765647888, "eval_mask_top_10_hit_rate": 0.9855968952178955, "eval_mask_top_1_hit_rate": 0.9974315166473389, "eval_mask_top_20_hit_rate": 0.9762290120124817, "eval_mask_top_5_hit_rate": 0.9908639192581177, "eval_runtime": 144.4261, "eval_samples_per_second": 14.18, "eval_steps_per_second": 7.09, "eval_token_accuracy": 0.6145721673965454, "eval_top_k_diff": -530.545166015625, "eval_topk_loss": 0.012532277032732964, "step": 2100 }, { "epoch": 0.8350608298193426, "grad_norm": 0.1259765625, "learning_rate": 1.3599957000064057e-05, "lm_loss": 1.9435, "loss": 2.0832, "mask_loss": 0.1273, "step": 2101, "topk_loss": 0.0125 }, { "epoch": 0.8354582885674717, "grad_norm": 0.111328125, "learning_rate": 1.353585826851358e-05, "lm_loss": 1.9445, "loss": 2.0803, "mask_loss": 0.1255, "step": 2102, "topk_loss": 0.0103 }, { "epoch": 0.8358557473156009, "grad_norm": 0.10986328125, "learning_rate": 1.3471899980537594e-05, "lm_loss": 1.8861, "loss": 2.0213, "mask_loss": 0.126, "step": 2103, "topk_loss": 0.0092 }, { "epoch": 0.83625320606373, "grad_norm": 0.1103515625, "learning_rate": 1.3408082240023412e-05, "lm_loss": 1.8881, "loss": 2.0225, "mask_loss": 0.1237, "step": 2104, "topk_loss": 0.0107 }, { "epoch": 0.8366506648118591, "grad_norm": 0.10791015625, "learning_rate": 1.334440515063009e-05, "lm_loss": 1.9355, "loss": 2.0751, "mask_loss": 0.1289, "step": 2105, "topk_loss": 0.0106 }, { "epoch": 0.8370481235599884, "grad_norm": 0.11083984375, "learning_rate": 1.3280868815788249e-05, "lm_loss": 1.8827, "loss": 2.018, "mask_loss": 0.126, "step": 2106, "topk_loss": 0.0093 }, { "epoch": 0.8374455823081175, "grad_norm": 0.1083984375, "learning_rate": 1.3217473338699859e-05, "lm_loss": 1.9252, "loss": 2.0583, "mask_loss": 0.1246, "step": 2107, "topk_loss": 0.0085 }, { "epoch": 0.8378430410562466, "grad_norm": 0.10888671875, "learning_rate": 1.3154218822338094e-05, "lm_loss": 1.8615, "loss": 1.9949, "mask_loss": 0.1244, "step": 2108, "topk_loss": 0.009 }, { "epoch": 0.8382404998043758, "grad_norm": 0.10888671875, "learning_rate": 1.3091105369447165e-05, "lm_loss": 1.9219, "loss": 2.0596, "mask_loss": 0.1279, "step": 2109, "topk_loss": 0.0098 }, { "epoch": 0.8386379585525049, "grad_norm": 0.11279296875, "learning_rate": 1.3028133082542172e-05, "lm_loss": 1.9221, "loss": 2.0598, "mask_loss": 0.1286, "step": 2110, "topk_loss": 0.0091 }, { "epoch": 0.839035417300634, "grad_norm": 0.1083984375, "learning_rate": 1.2965302063908902e-05, "lm_loss": 1.9019, "loss": 2.035, "mask_loss": 0.1242, "step": 2111, "topk_loss": 0.0089 }, { "epoch": 0.8394328760487633, "grad_norm": 0.1416015625, "learning_rate": 1.2902612415603665e-05, "lm_loss": 1.9063, "loss": 2.0412, "mask_loss": 0.1248, "step": 2112, "topk_loss": 0.0101 }, { "epoch": 0.8398303347968924, "grad_norm": 0.126953125, "learning_rate": 1.2840064239453176e-05, "lm_loss": 1.898, "loss": 2.0321, "mask_loss": 0.1235, "step": 2113, "topk_loss": 0.0106 }, { "epoch": 0.8402277935450215, "grad_norm": 0.1318359375, "learning_rate": 1.277765763705434e-05, "lm_loss": 1.9033, "loss": 2.0409, "mask_loss": 0.1264, "step": 2114, "topk_loss": 0.0112 }, { "epoch": 0.8406252522931507, "grad_norm": 0.1083984375, "learning_rate": 1.2715392709774099e-05, "lm_loss": 1.9219, "loss": 2.0598, "mask_loss": 0.1271, "step": 2115, "topk_loss": 0.0109 }, { "epoch": 0.8410227110412798, "grad_norm": 0.1943359375, "learning_rate": 1.2653269558749292e-05, "lm_loss": 1.8536, "loss": 1.9936, "mask_loss": 0.128, "step": 2116, "topk_loss": 0.012 }, { "epoch": 0.8414201697894089, "grad_norm": 0.1337890625, "learning_rate": 1.259128828488646e-05, "lm_loss": 1.8852, "loss": 2.0253, "mask_loss": 0.1286, "step": 2117, "topk_loss": 0.0115 }, { "epoch": 0.8418176285375382, "grad_norm": 0.125, "learning_rate": 1.252944898886168e-05, "lm_loss": 1.9582, "loss": 2.0964, "mask_loss": 0.1274, "step": 2118, "topk_loss": 0.0108 }, { "epoch": 0.8422150872856673, "grad_norm": 0.11083984375, "learning_rate": 1.2467751771120462e-05, "lm_loss": 1.9098, "loss": 2.0448, "mask_loss": 0.1252, "step": 2119, "topk_loss": 0.0098 }, { "epoch": 0.8426125460337964, "grad_norm": 0.10693359375, "learning_rate": 1.2406196731877462e-05, "lm_loss": 1.8932, "loss": 2.0318, "mask_loss": 0.1289, "step": 2120, "topk_loss": 0.0097 }, { "epoch": 0.8430100047819256, "grad_norm": 0.142578125, "learning_rate": 1.2344783971116436e-05, "lm_loss": 1.8694, "loss": 2.0124, "mask_loss": 0.1287, "step": 2121, "topk_loss": 0.0143 }, { "epoch": 0.8434074635300547, "grad_norm": 0.111328125, "learning_rate": 1.2283513588590067e-05, "lm_loss": 1.8623, "loss": 2.0009, "mask_loss": 0.1279, "step": 2122, "topk_loss": 0.0107 }, { "epoch": 0.8438049222781838, "grad_norm": 0.1298828125, "learning_rate": 1.2222385683819714e-05, "lm_loss": 2.0004, "loss": 2.1375, "mask_loss": 0.1256, "step": 2123, "topk_loss": 0.0114 }, { "epoch": 0.8442023810263131, "grad_norm": 0.111328125, "learning_rate": 1.2161400356095375e-05, "lm_loss": 1.9151, "loss": 2.0496, "mask_loss": 0.1242, "step": 2124, "topk_loss": 0.0103 }, { "epoch": 0.8445998397744422, "grad_norm": 0.140625, "learning_rate": 1.2100557704475401e-05, "lm_loss": 1.8272, "loss": 1.9643, "mask_loss": 0.1275, "step": 2125, "topk_loss": 0.0095 }, { "epoch": 0.8449972985225713, "grad_norm": 0.11083984375, "learning_rate": 1.2039857827786416e-05, "lm_loss": 1.9475, "loss": 2.0824, "mask_loss": 0.1259, "step": 2126, "topk_loss": 0.009 }, { "epoch": 0.8453947572707005, "grad_norm": 0.11669921875, "learning_rate": 1.1979300824623163e-05, "lm_loss": 1.9046, "loss": 2.0357, "mask_loss": 0.1225, "step": 2127, "topk_loss": 0.0086 }, { "epoch": 0.8457922160188296, "grad_norm": 0.1181640625, "learning_rate": 1.191888679334826e-05, "lm_loss": 1.9204, "loss": 2.0547, "mask_loss": 0.1249, "step": 2128, "topk_loss": 0.0093 }, { "epoch": 0.8461896747669587, "grad_norm": 0.10595703125, "learning_rate": 1.1858615832092156e-05, "lm_loss": 1.8909, "loss": 2.0276, "mask_loss": 0.1274, "step": 2129, "topk_loss": 0.0092 }, { "epoch": 0.8465871335150879, "grad_norm": 0.1298828125, "learning_rate": 1.1798488038752853e-05, "lm_loss": 1.9399, "loss": 2.0785, "mask_loss": 0.1252, "step": 2130, "topk_loss": 0.0134 }, { "epoch": 0.8469845922632171, "grad_norm": 0.130859375, "learning_rate": 1.1738503510995857e-05, "lm_loss": 1.9269, "loss": 2.0617, "mask_loss": 0.1252, "step": 2131, "topk_loss": 0.0096 }, { "epoch": 0.8473820510113462, "grad_norm": 0.11376953125, "learning_rate": 1.1678662346253933e-05, "lm_loss": 1.9157, "loss": 2.0515, "mask_loss": 0.1258, "step": 2132, "topk_loss": 0.01 }, { "epoch": 0.8477795097594754, "grad_norm": 0.115234375, "learning_rate": 1.1618964641727004e-05, "lm_loss": 1.8925, "loss": 2.0287, "mask_loss": 0.1263, "step": 2133, "topk_loss": 0.0099 }, { "epoch": 0.8481769685076045, "grad_norm": 0.10986328125, "learning_rate": 1.1559410494381951e-05, "lm_loss": 1.9129, "loss": 2.0473, "mask_loss": 0.1256, "step": 2134, "topk_loss": 0.0089 }, { "epoch": 0.8485744272557336, "grad_norm": 0.1455078125, "learning_rate": 1.1500000000952516e-05, "lm_loss": 1.9484, "loss": 2.0885, "mask_loss": 0.1276, "step": 2135, "topk_loss": 0.0126 }, { "epoch": 0.8489718860038628, "grad_norm": 0.11328125, "learning_rate": 1.1440733257939018e-05, "lm_loss": 1.923, "loss": 2.0559, "mask_loss": 0.1241, "step": 2136, "topk_loss": 0.0087 }, { "epoch": 0.849369344751992, "grad_norm": 0.130859375, "learning_rate": 1.1381610361608374e-05, "lm_loss": 1.8942, "loss": 2.0304, "mask_loss": 0.1254, "step": 2137, "topk_loss": 0.0109 }, { "epoch": 0.8497668035001211, "grad_norm": 0.11962890625, "learning_rate": 1.1322631407993811e-05, "lm_loss": 1.9726, "loss": 2.1108, "mask_loss": 0.1256, "step": 2138, "topk_loss": 0.0125 }, { "epoch": 0.8501642622482503, "grad_norm": 0.11474609375, "learning_rate": 1.1263796492894751e-05, "lm_loss": 1.9186, "loss": 2.053, "mask_loss": 0.1252, "step": 2139, "topk_loss": 0.0091 }, { "epoch": 0.8505617209963794, "grad_norm": 0.10888671875, "learning_rate": 1.1205105711876651e-05, "lm_loss": 1.8365, "loss": 1.9712, "mask_loss": 0.1248, "step": 2140, "topk_loss": 0.0099 }, { "epoch": 0.8509591797445085, "grad_norm": 0.12060546875, "learning_rate": 1.1146559160270875e-05, "lm_loss": 1.8962, "loss": 2.0291, "mask_loss": 0.1239, "step": 2141, "topk_loss": 0.009 }, { "epoch": 0.8513566384926377, "grad_norm": 0.11279296875, "learning_rate": 1.1088156933174487e-05, "lm_loss": 1.8732, "loss": 2.0136, "mask_loss": 0.1299, "step": 2142, "topk_loss": 0.0106 }, { "epoch": 0.8517540972407669, "grad_norm": 0.1103515625, "learning_rate": 1.102989912545015e-05, "lm_loss": 1.9038, "loss": 2.0408, "mask_loss": 0.127, "step": 2143, "topk_loss": 0.0099 }, { "epoch": 0.852151555988896, "grad_norm": 0.11181640625, "learning_rate": 1.0971785831725901e-05, "lm_loss": 1.8961, "loss": 2.0278, "mask_loss": 0.1231, "step": 2144, "topk_loss": 0.0087 }, { "epoch": 0.8525490147370252, "grad_norm": 0.111328125, "learning_rate": 1.0913817146395088e-05, "lm_loss": 1.9004, "loss": 2.0378, "mask_loss": 0.1271, "step": 2145, "topk_loss": 0.0104 }, { "epoch": 0.8529464734851543, "grad_norm": 0.125, "learning_rate": 1.0855993163616174e-05, "lm_loss": 1.9038, "loss": 2.0462, "mask_loss": 0.1294, "step": 2146, "topk_loss": 0.0129 }, { "epoch": 0.8533439322332834, "grad_norm": 0.11669921875, "learning_rate": 1.0798313977312557e-05, "lm_loss": 1.9402, "loss": 2.0766, "mask_loss": 0.1274, "step": 2147, "topk_loss": 0.0091 }, { "epoch": 0.8537413909814126, "grad_norm": 0.11083984375, "learning_rate": 1.0740779681172453e-05, "lm_loss": 1.9036, "loss": 2.0357, "mask_loss": 0.1231, "step": 2148, "topk_loss": 0.009 }, { "epoch": 0.8541388497295418, "grad_norm": 0.1611328125, "learning_rate": 1.0683390368648726e-05, "lm_loss": 1.9036, "loss": 2.0408, "mask_loss": 0.1265, "step": 2149, "topk_loss": 0.0107 }, { "epoch": 0.8545363084776709, "grad_norm": 0.11328125, "learning_rate": 1.0626146132958759e-05, "lm_loss": 1.8856, "loss": 2.0193, "mask_loss": 0.1244, "step": 2150, "topk_loss": 0.0093 }, { "epoch": 0.8545363084776709, "eval_lm_loss": 687.17578125, "eval_loss": 687.3126831054688, "eval_mask_hit_rate": 0.5359647274017334, "eval_mask_loss": 0.12439900636672974, "eval_mask_top_10_hit_rate": 0.9855988025665283, "eval_mask_top_1_hit_rate": 0.9974305629730225, "eval_mask_top_20_hit_rate": 0.9762325286865234, "eval_mask_top_5_hit_rate": 0.9908649921417236, "eval_runtime": 143.8341, "eval_samples_per_second": 14.239, "eval_steps_per_second": 7.119, "eval_token_accuracy": 0.6145526170730591, "eval_top_k_diff": -529.2546997070312, "eval_topk_loss": 0.012550394050776958, "step": 2150 }, { "epoch": 0.8549337672258001, "grad_norm": 0.13671875, "learning_rate": 1.0569047067084293e-05, "lm_loss": 1.8935, "loss": 2.035, "mask_loss": 0.128, "step": 2151, "topk_loss": 0.0136 }, { "epoch": 0.8553312259739292, "grad_norm": 0.111328125, "learning_rate": 1.0512093263771206e-05, "lm_loss": 1.8717, "loss": 2.0083, "mask_loss": 0.1273, "step": 2152, "topk_loss": 0.0094 }, { "epoch": 0.8557286847220583, "grad_norm": 0.1181640625, "learning_rate": 1.045528481552951e-05, "lm_loss": 1.8821, "loss": 2.0174, "mask_loss": 0.126, "step": 2153, "topk_loss": 0.0093 }, { "epoch": 0.8561261434701875, "grad_norm": 0.107421875, "learning_rate": 1.039862181463307e-05, "lm_loss": 1.9685, "loss": 2.1001, "mask_loss": 0.1233, "step": 2154, "topk_loss": 0.0084 }, { "epoch": 0.8565236022183167, "grad_norm": 0.1064453125, "learning_rate": 1.034210435311952e-05, "lm_loss": 1.8213, "loss": 1.9588, "mask_loss": 0.1274, "step": 2155, "topk_loss": 0.0101 }, { "epoch": 0.8569210609664458, "grad_norm": 0.12451171875, "learning_rate": 1.0285732522790092e-05, "lm_loss": 1.9379, "loss": 2.0686, "mask_loss": 0.1221, "step": 2156, "topk_loss": 0.0086 }, { "epoch": 0.857318519714575, "grad_norm": 0.11328125, "learning_rate": 1.0229506415209444e-05, "lm_loss": 1.8874, "loss": 2.0255, "mask_loss": 0.1271, "step": 2157, "topk_loss": 0.0111 }, { "epoch": 0.8577159784627041, "grad_norm": 0.154296875, "learning_rate": 1.0173426121705576e-05, "lm_loss": 1.8887, "loss": 2.0259, "mask_loss": 0.1263, "step": 2158, "topk_loss": 0.0109 }, { "epoch": 0.8581134372108332, "grad_norm": 0.11865234375, "learning_rate": 1.0117491733369611e-05, "lm_loss": 1.8654, "loss": 2.0052, "mask_loss": 0.1283, "step": 2159, "topk_loss": 0.0115 }, { "epoch": 0.8585108959589624, "grad_norm": 0.107421875, "learning_rate": 1.0061703341055706e-05, "lm_loss": 1.8766, "loss": 2.0102, "mask_loss": 0.1251, "step": 2160, "topk_loss": 0.0085 }, { "epoch": 0.8589083547070915, "grad_norm": 0.12890625, "learning_rate": 1.0006061035380843e-05, "lm_loss": 1.7927, "loss": 1.937, "mask_loss": 0.1311, "step": 2161, "topk_loss": 0.0132 }, { "epoch": 0.8593058134552207, "grad_norm": 0.11279296875, "learning_rate": 9.950564906724757e-06, "lm_loss": 1.8876, "loss": 2.0232, "mask_loss": 0.1269, "step": 2162, "topk_loss": 0.0087 }, { "epoch": 0.8597032722033499, "grad_norm": 0.12890625, "learning_rate": 9.89521504522971e-06, "lm_loss": 1.9252, "loss": 2.0581, "mask_loss": 0.1238, "step": 2163, "topk_loss": 0.0091 }, { "epoch": 0.860100730951479, "grad_norm": 0.11181640625, "learning_rate": 9.840011540800409e-06, "lm_loss": 1.8699, "loss": 2.0043, "mask_loss": 0.1248, "step": 2164, "topk_loss": 0.0096 }, { "epoch": 0.8604981896996081, "grad_norm": 0.1318359375, "learning_rate": 9.784954483103803e-06, "lm_loss": 1.901, "loss": 2.0414, "mask_loss": 0.1277, "step": 2165, "topk_loss": 0.0127 }, { "epoch": 0.8608956484477372, "grad_norm": 0.11279296875, "learning_rate": 9.730043961569013e-06, "lm_loss": 1.9087, "loss": 2.0432, "mask_loss": 0.1244, "step": 2166, "topk_loss": 0.0101 }, { "epoch": 0.8612931071958664, "grad_norm": 0.111328125, "learning_rate": 9.675280065387116e-06, "lm_loss": 1.9338, "loss": 2.0717, "mask_loss": 0.1273, "step": 2167, "topk_loss": 0.0106 }, { "epoch": 0.8616905659439956, "grad_norm": 0.130859375, "learning_rate": 9.620662883511e-06, "lm_loss": 1.8762, "loss": 2.0116, "mask_loss": 0.1256, "step": 2168, "topk_loss": 0.0098 }, { "epoch": 0.8620880246921248, "grad_norm": 0.111328125, "learning_rate": 9.56619250465528e-06, "lm_loss": 1.9275, "loss": 2.0628, "mask_loss": 0.1256, "step": 2169, "topk_loss": 0.0097 }, { "epoch": 0.8624854834402539, "grad_norm": 0.1396484375, "learning_rate": 9.511869017296116e-06, "lm_loss": 1.9117, "loss": 2.0558, "mask_loss": 0.1293, "step": 2170, "topk_loss": 0.0148 }, { "epoch": 0.862882942188383, "grad_norm": 0.1240234375, "learning_rate": 9.457692509671069e-06, "lm_loss": 1.9475, "loss": 2.087, "mask_loss": 0.1279, "step": 2171, "topk_loss": 0.0115 }, { "epoch": 0.8632804009365121, "grad_norm": 0.1201171875, "learning_rate": 9.403663069778945e-06, "lm_loss": 1.8539, "loss": 1.9927, "mask_loss": 0.1277, "step": 2172, "topk_loss": 0.011 }, { "epoch": 0.8636778596846413, "grad_norm": 0.1376953125, "learning_rate": 9.349780785379703e-06, "lm_loss": 1.9392, "loss": 2.0739, "mask_loss": 0.1252, "step": 2173, "topk_loss": 0.0095 }, { "epoch": 0.8640753184327705, "grad_norm": 0.11083984375, "learning_rate": 9.29604574399423e-06, "lm_loss": 1.8761, "loss": 2.0089, "mask_loss": 0.1241, "step": 2174, "topk_loss": 0.0087 }, { "epoch": 0.8644727771808997, "grad_norm": 0.203125, "learning_rate": 9.242458032904311e-06, "lm_loss": 1.9129, "loss": 2.0492, "mask_loss": 0.1249, "step": 2175, "topk_loss": 0.0114 }, { "epoch": 0.8648702359290288, "grad_norm": 0.111328125, "learning_rate": 9.189017739152328e-06, "lm_loss": 1.9449, "loss": 2.0813, "mask_loss": 0.1248, "step": 2176, "topk_loss": 0.0115 }, { "epoch": 0.8652676946771579, "grad_norm": 0.11767578125, "learning_rate": 9.135724949541314e-06, "lm_loss": 1.9331, "loss": 2.0664, "mask_loss": 0.1247, "step": 2177, "topk_loss": 0.0086 }, { "epoch": 0.865665153425287, "grad_norm": 0.1171875, "learning_rate": 9.082579750634646e-06, "lm_loss": 1.93, "loss": 2.0705, "mask_loss": 0.1283, "step": 2178, "topk_loss": 0.0123 }, { "epoch": 0.8660626121734162, "grad_norm": 0.1435546875, "learning_rate": 9.029582228755996e-06, "lm_loss": 1.9148, "loss": 2.0521, "mask_loss": 0.127, "step": 2179, "topk_loss": 0.0104 }, { "epoch": 0.8664600709215454, "grad_norm": 0.1259765625, "learning_rate": 8.976732469989157e-06, "lm_loss": 1.8945, "loss": 2.0325, "mask_loss": 0.1264, "step": 2180, "topk_loss": 0.0115 }, { "epoch": 0.8668575296696746, "grad_norm": 0.11767578125, "learning_rate": 8.924030560177921e-06, "lm_loss": 1.828, "loss": 1.9669, "mask_loss": 0.1283, "step": 2181, "topk_loss": 0.0106 }, { "epoch": 0.8672549884178037, "grad_norm": 0.11572265625, "learning_rate": 8.871476584925909e-06, "lm_loss": 1.8724, "loss": 2.0077, "mask_loss": 0.1255, "step": 2182, "topk_loss": 0.0097 }, { "epoch": 0.8676524471659328, "grad_norm": 0.11328125, "learning_rate": 8.819070629596482e-06, "lm_loss": 1.8596, "loss": 1.9933, "mask_loss": 0.1251, "step": 2183, "topk_loss": 0.0086 }, { "epoch": 0.868049905914062, "grad_norm": 0.1171875, "learning_rate": 8.766812779312528e-06, "lm_loss": 1.9529, "loss": 2.0883, "mask_loss": 0.1261, "step": 2184, "topk_loss": 0.0093 }, { "epoch": 0.8684473646621911, "grad_norm": 0.1435546875, "learning_rate": 8.714703118956402e-06, "lm_loss": 1.8389, "loss": 1.976, "mask_loss": 0.1273, "step": 2185, "topk_loss": 0.0098 }, { "epoch": 0.8688448234103202, "grad_norm": 0.111328125, "learning_rate": 8.662741733169743e-06, "lm_loss": 1.9521, "loss": 2.0861, "mask_loss": 0.1248, "step": 2186, "topk_loss": 0.0092 }, { "epoch": 0.8692422821584495, "grad_norm": 0.224609375, "learning_rate": 8.61092870635336e-06, "lm_loss": 1.9036, "loss": 2.0395, "mask_loss": 0.1252, "step": 2187, "topk_loss": 0.0107 }, { "epoch": 0.8696397409065786, "grad_norm": 0.140625, "learning_rate": 8.559264122667087e-06, "lm_loss": 1.9506, "loss": 2.0836, "mask_loss": 0.1239, "step": 2188, "topk_loss": 0.0091 }, { "epoch": 0.8700371996547077, "grad_norm": 0.115234375, "learning_rate": 8.507748066029597e-06, "lm_loss": 1.9317, "loss": 2.0702, "mask_loss": 0.1274, "step": 2189, "topk_loss": 0.0111 }, { "epoch": 0.8704346584028368, "grad_norm": 0.10986328125, "learning_rate": 8.456380620118365e-06, "lm_loss": 1.9114, "loss": 2.0483, "mask_loss": 0.1265, "step": 2190, "topk_loss": 0.0104 }, { "epoch": 0.870832117150966, "grad_norm": 0.1552734375, "learning_rate": 8.405161868369448e-06, "lm_loss": 1.9055, "loss": 2.0511, "mask_loss": 0.1299, "step": 2191, "topk_loss": 0.0157 }, { "epoch": 0.8712295758990951, "grad_norm": 0.11865234375, "learning_rate": 8.354091893977401e-06, "lm_loss": 1.912, "loss": 2.0495, "mask_loss": 0.1259, "step": 2192, "topk_loss": 0.0116 }, { "epoch": 0.8716270346472244, "grad_norm": 0.115234375, "learning_rate": 8.303170779895086e-06, "lm_loss": 1.8498, "loss": 1.9863, "mask_loss": 0.1258, "step": 2193, "topk_loss": 0.0108 }, { "epoch": 0.8720244933953535, "grad_norm": 0.119140625, "learning_rate": 8.2523986088336e-06, "lm_loss": 1.8843, "loss": 2.0206, "mask_loss": 0.1265, "step": 2194, "topk_loss": 0.0099 }, { "epoch": 0.8724219521434826, "grad_norm": 0.11328125, "learning_rate": 8.201775463262107e-06, "lm_loss": 1.8533, "loss": 1.9879, "mask_loss": 0.1252, "step": 2195, "topk_loss": 0.0094 }, { "epoch": 0.8728194108916117, "grad_norm": 0.1162109375, "learning_rate": 8.151301425407699e-06, "lm_loss": 1.8667, "loss": 2.0021, "mask_loss": 0.1261, "step": 2196, "topk_loss": 0.0093 }, { "epoch": 0.8732168696397409, "grad_norm": 0.10498046875, "learning_rate": 8.100976577255281e-06, "lm_loss": 1.8093, "loss": 1.9459, "mask_loss": 0.1273, "step": 2197, "topk_loss": 0.0094 }, { "epoch": 0.87361432838787, "grad_norm": 0.11572265625, "learning_rate": 8.050801000547426e-06, "lm_loss": 1.9094, "loss": 2.05, "mask_loss": 0.1277, "step": 2198, "topk_loss": 0.0128 }, { "epoch": 0.8740117871359993, "grad_norm": 0.1064453125, "learning_rate": 8.00077477678427e-06, "lm_loss": 1.8488, "loss": 1.9849, "mask_loss": 0.126, "step": 2199, "topk_loss": 0.0101 }, { "epoch": 0.8744092458841284, "grad_norm": 0.1279296875, "learning_rate": 7.950897987223304e-06, "lm_loss": 1.8628, "loss": 1.999, "mask_loss": 0.1271, "step": 2200, "topk_loss": 0.0091 }, { "epoch": 0.8744092458841284, "eval_lm_loss": 687.38037109375, "eval_loss": 687.517333984375, "eval_mask_hit_rate": 0.5359687805175781, "eval_mask_loss": 0.12440390139818192, "eval_mask_top_10_hit_rate": 0.9855976104736328, "eval_mask_top_1_hit_rate": 0.997429609298706, "eval_mask_top_20_hit_rate": 0.9762320518493652, "eval_mask_top_5_hit_rate": 0.990863561630249, "eval_runtime": 144.9294, "eval_samples_per_second": 14.131, "eval_steps_per_second": 7.066, "eval_token_accuracy": 0.614575982093811, "eval_top_k_diff": -530.4586181640625, "eval_topk_loss": 0.012535331770777702, "step": 2200 }, { "epoch": 0.8748067046322575, "grad_norm": 0.1083984375, "learning_rate": 7.901170712879325e-06, "lm_loss": 1.9053, "loss": 2.0407, "mask_loss": 0.1261, "step": 2201, "topk_loss": 0.0093 }, { "epoch": 0.8752041633803866, "grad_norm": 0.1181640625, "learning_rate": 7.851593034524262e-06, "lm_loss": 1.9185, "loss": 2.0577, "mask_loss": 0.1274, "step": 2202, "topk_loss": 0.0118 }, { "epoch": 0.8756016221285158, "grad_norm": 0.1162109375, "learning_rate": 7.802165032687092e-06, "lm_loss": 1.8895, "loss": 2.025, "mask_loss": 0.1259, "step": 2203, "topk_loss": 0.0096 }, { "epoch": 0.8759990808766449, "grad_norm": 0.1279296875, "learning_rate": 7.752886787653624e-06, "lm_loss": 1.8695, "loss": 2.0027, "mask_loss": 0.1239, "step": 2204, "topk_loss": 0.0094 }, { "epoch": 0.8763965396247742, "grad_norm": 0.11181640625, "learning_rate": 7.703758379466441e-06, "lm_loss": 1.969, "loss": 2.1055, "mask_loss": 0.1254, "step": 2205, "topk_loss": 0.0111 }, { "epoch": 0.8767939983729033, "grad_norm": 0.115234375, "learning_rate": 7.654779887924734e-06, "lm_loss": 1.9584, "loss": 2.0925, "mask_loss": 0.1249, "step": 2206, "topk_loss": 0.0092 }, { "epoch": 0.8771914571210324, "grad_norm": 0.12158203125, "learning_rate": 7.605951392584221e-06, "lm_loss": 1.9129, "loss": 2.049, "mask_loss": 0.1256, "step": 2207, "topk_loss": 0.0105 }, { "epoch": 0.8775889158691615, "grad_norm": 0.1162109375, "learning_rate": 7.557272972756923e-06, "lm_loss": 1.9447, "loss": 2.0829, "mask_loss": 0.126, "step": 2208, "topk_loss": 0.0122 }, { "epoch": 0.8779863746172907, "grad_norm": 0.12451171875, "learning_rate": 7.508744707511117e-06, "lm_loss": 1.8635, "loss": 1.9989, "mask_loss": 0.1262, "step": 2209, "topk_loss": 0.0093 }, { "epoch": 0.8783838333654198, "grad_norm": 0.10791015625, "learning_rate": 7.460366675671215e-06, "lm_loss": 1.9124, "loss": 2.0458, "mask_loss": 0.1239, "step": 2210, "topk_loss": 0.0094 }, { "epoch": 0.878781292113549, "grad_norm": 0.142578125, "learning_rate": 7.412138955817571e-06, "lm_loss": 1.9115, "loss": 2.0482, "mask_loss": 0.1263, "step": 2211, "topk_loss": 0.0105 }, { "epoch": 0.8791787508616782, "grad_norm": 0.1728515625, "learning_rate": 7.3640616262864e-06, "lm_loss": 1.9172, "loss": 2.0554, "mask_loss": 0.1261, "step": 2212, "topk_loss": 0.0121 }, { "epoch": 0.8795762096098073, "grad_norm": 0.1298828125, "learning_rate": 7.316134765169635e-06, "lm_loss": 1.9638, "loss": 2.0982, "mask_loss": 0.1238, "step": 2213, "topk_loss": 0.0105 }, { "epoch": 0.8799736683579364, "grad_norm": 0.11376953125, "learning_rate": 7.268358450314794e-06, "lm_loss": 1.8659, "loss": 2.0009, "mask_loss": 0.1261, "step": 2214, "topk_loss": 0.0088 }, { "epoch": 0.8803711271060656, "grad_norm": 0.134765625, "learning_rate": 7.220732759324911e-06, "lm_loss": 1.9214, "loss": 2.0551, "mask_loss": 0.1236, "step": 2215, "topk_loss": 0.0101 }, { "epoch": 0.8807685858541947, "grad_norm": 0.134765625, "learning_rate": 7.173257769558262e-06, "lm_loss": 1.849, "loss": 1.9857, "mask_loss": 0.1275, "step": 2216, "topk_loss": 0.0092 }, { "epoch": 0.8811660446023238, "grad_norm": 0.126953125, "learning_rate": 7.125933558128451e-06, "lm_loss": 1.8932, "loss": 2.0279, "mask_loss": 0.1239, "step": 2217, "topk_loss": 0.0108 }, { "epoch": 0.8815635033504531, "grad_norm": 0.1259765625, "learning_rate": 7.078760201904089e-06, "lm_loss": 1.9196, "loss": 2.0543, "mask_loss": 0.126, "step": 2218, "topk_loss": 0.0087 }, { "epoch": 0.8819609620985822, "grad_norm": 0.1162109375, "learning_rate": 7.031737777508818e-06, "lm_loss": 1.9288, "loss": 2.062, "mask_loss": 0.1236, "step": 2219, "topk_loss": 0.0097 }, { "epoch": 0.8823584208467113, "grad_norm": 0.11376953125, "learning_rate": 6.984866361321063e-06, "lm_loss": 1.9207, "loss": 2.0537, "mask_loss": 0.1242, "step": 2220, "topk_loss": 0.0088 }, { "epoch": 0.8827558795948405, "grad_norm": 0.11181640625, "learning_rate": 6.938146029474013e-06, "lm_loss": 1.8107, "loss": 1.9457, "mask_loss": 0.1258, "step": 2221, "topk_loss": 0.0093 }, { "epoch": 0.8831533383429696, "grad_norm": 0.125, "learning_rate": 6.891576857855431e-06, "lm_loss": 1.8534, "loss": 1.9932, "mask_loss": 0.1276, "step": 2222, "topk_loss": 0.0122 }, { "epoch": 0.8835507970910987, "grad_norm": 0.11376953125, "learning_rate": 6.845158922107553e-06, "lm_loss": 1.8817, "loss": 2.0154, "mask_loss": 0.1241, "step": 2223, "topk_loss": 0.0096 }, { "epoch": 0.883948255839228, "grad_norm": 0.1259765625, "learning_rate": 6.798892297626946e-06, "lm_loss": 1.9011, "loss": 2.0375, "mask_loss": 0.126, "step": 2224, "topk_loss": 0.0104 }, { "epoch": 0.8843457145873571, "grad_norm": 0.1318359375, "learning_rate": 6.75277705956443e-06, "lm_loss": 1.9335, "loss": 2.0707, "mask_loss": 0.1265, "step": 2225, "topk_loss": 0.0107 }, { "epoch": 0.8847431733354862, "grad_norm": 0.130859375, "learning_rate": 6.70681328282492e-06, "lm_loss": 1.8614, "loss": 2.004, "mask_loss": 0.1301, "step": 2226, "topk_loss": 0.0125 }, { "epoch": 0.8851406320836154, "grad_norm": 0.126953125, "learning_rate": 6.661001042067294e-06, "lm_loss": 1.8986, "loss": 2.0315, "mask_loss": 0.1234, "step": 2227, "topk_loss": 0.0095 }, { "epoch": 0.8855380908317445, "grad_norm": 0.1162109375, "learning_rate": 6.615340411704318e-06, "lm_loss": 1.8793, "loss": 2.0171, "mask_loss": 0.1272, "step": 2228, "topk_loss": 0.0107 }, { "epoch": 0.8859355495798736, "grad_norm": 0.12890625, "learning_rate": 6.569831465902488e-06, "lm_loss": 1.9577, "loss": 2.0932, "mask_loss": 0.1242, "step": 2229, "topk_loss": 0.0113 }, { "epoch": 0.8863330083280029, "grad_norm": 0.134765625, "learning_rate": 6.524474278581905e-06, "lm_loss": 1.9069, "loss": 2.0411, "mask_loss": 0.1243, "step": 2230, "topk_loss": 0.0098 }, { "epoch": 0.886730467076132, "grad_norm": 0.1494140625, "learning_rate": 6.479268923416182e-06, "lm_loss": 1.8512, "loss": 1.9893, "mask_loss": 0.1276, "step": 2231, "topk_loss": 0.0105 }, { "epoch": 0.8871279258242611, "grad_norm": 0.1181640625, "learning_rate": 6.4342154738323054e-06, "lm_loss": 1.9095, "loss": 2.0453, "mask_loss": 0.1255, "step": 2232, "topk_loss": 0.0103 }, { "epoch": 0.8875253845723903, "grad_norm": 0.1279296875, "learning_rate": 6.389314003010538e-06, "lm_loss": 1.8588, "loss": 1.9968, "mask_loss": 0.1277, "step": 2233, "topk_loss": 0.0103 }, { "epoch": 0.8879228433205194, "grad_norm": 0.11279296875, "learning_rate": 6.344564583884271e-06, "lm_loss": 1.9211, "loss": 2.0571, "mask_loss": 0.1265, "step": 2234, "topk_loss": 0.0095 }, { "epoch": 0.8883203020686485, "grad_norm": 0.115234375, "learning_rate": 6.299967289139896e-06, "lm_loss": 1.8947, "loss": 2.0331, "mask_loss": 0.1279, "step": 2235, "topk_loss": 0.0105 }, { "epoch": 0.8887177608167778, "grad_norm": 0.11865234375, "learning_rate": 6.255522191216756e-06, "lm_loss": 1.9274, "loss": 2.0611, "mask_loss": 0.1243, "step": 2236, "topk_loss": 0.0095 }, { "epoch": 0.8891152195649069, "grad_norm": 0.11376953125, "learning_rate": 6.211229362306947e-06, "lm_loss": 1.8775, "loss": 2.0134, "mask_loss": 0.1267, "step": 2237, "topk_loss": 0.0091 }, { "epoch": 0.889512678313036, "grad_norm": 0.12353515625, "learning_rate": 6.167088874355231e-06, "lm_loss": 1.9621, "loss": 2.097, "mask_loss": 0.1244, "step": 2238, "topk_loss": 0.0105 }, { "epoch": 0.8899101370611652, "grad_norm": 0.1552734375, "learning_rate": 6.123100799058978e-06, "lm_loss": 1.9197, "loss": 2.0546, "mask_loss": 0.1254, "step": 2239, "topk_loss": 0.0094 }, { "epoch": 0.8903075958092943, "grad_norm": 0.11328125, "learning_rate": 6.079265207867901e-06, "lm_loss": 1.973, "loss": 2.1072, "mask_loss": 0.1248, "step": 2240, "topk_loss": 0.0094 }, { "epoch": 0.8907050545574234, "grad_norm": 0.12255859375, "learning_rate": 6.0355821719841e-06, "lm_loss": 1.9375, "loss": 2.0719, "mask_loss": 0.1249, "step": 2241, "topk_loss": 0.0095 }, { "epoch": 0.8911025133055526, "grad_norm": 0.11328125, "learning_rate": 5.992051762361883e-06, "lm_loss": 1.8657, "loss": 2.0021, "mask_loss": 0.1263, "step": 2242, "topk_loss": 0.0101 }, { "epoch": 0.8914999720536818, "grad_norm": 0.10986328125, "learning_rate": 5.948674049707603e-06, "lm_loss": 1.8879, "loss": 2.0215, "mask_loss": 0.1245, "step": 2243, "topk_loss": 0.0091 }, { "epoch": 0.8918974308018109, "grad_norm": 0.1103515625, "learning_rate": 5.905449104479632e-06, "lm_loss": 1.9105, "loss": 2.0441, "mask_loss": 0.1245, "step": 2244, "topk_loss": 0.0091 }, { "epoch": 0.8922948895499401, "grad_norm": 0.11181640625, "learning_rate": 5.862376996888175e-06, "lm_loss": 1.9115, "loss": 2.0469, "mask_loss": 0.125, "step": 2245, "topk_loss": 0.0104 }, { "epoch": 0.8926923482980692, "grad_norm": 0.1220703125, "learning_rate": 5.819457796895189e-06, "lm_loss": 1.919, "loss": 2.0514, "mask_loss": 0.1235, "step": 2246, "topk_loss": 0.009 }, { "epoch": 0.8930898070461983, "grad_norm": 0.11474609375, "learning_rate": 5.776691574214277e-06, "lm_loss": 1.893, "loss": 2.0296, "mask_loss": 0.1263, "step": 2247, "topk_loss": 0.0102 }, { "epoch": 0.8934872657943275, "grad_norm": 0.10986328125, "learning_rate": 5.734078398310538e-06, "lm_loss": 1.8622, "loss": 1.9989, "mask_loss": 0.1263, "step": 2248, "topk_loss": 0.0105 }, { "epoch": 0.8938847245424567, "grad_norm": 0.1171875, "learning_rate": 5.691618338400484e-06, "lm_loss": 1.9444, "loss": 2.0784, "mask_loss": 0.1236, "step": 2249, "topk_loss": 0.0104 }, { "epoch": 0.8942821832905858, "grad_norm": 0.11279296875, "learning_rate": 5.6493114634519455e-06, "lm_loss": 1.8951, "loss": 2.0316, "mask_loss": 0.1268, "step": 2250, "topk_loss": 0.0098 }, { "epoch": 0.8942821832905858, "eval_lm_loss": 687.3623046875, "eval_loss": 687.499267578125, "eval_mask_hit_rate": 0.5359718799591064, "eval_mask_loss": 0.12440043687820435, "eval_mask_top_10_hit_rate": 0.985600471496582, "eval_mask_top_1_hit_rate": 0.9974288940429688, "eval_mask_top_20_hit_rate": 0.9762358665466309, "eval_mask_top_5_hit_rate": 0.9908660054206848, "eval_runtime": 144.2344, "eval_samples_per_second": 14.199, "eval_steps_per_second": 7.1, "eval_token_accuracy": 0.6145902872085571, "eval_top_k_diff": -529.9525146484375, "eval_topk_loss": 0.012543203309178352, "step": 2250 }, { "epoch": 0.894679642038715, "grad_norm": 0.1396484375, "learning_rate": 5.607157842183896e-06, "lm_loss": 1.8832, "loss": 2.0298, "mask_loss": 0.1305, "step": 2251, "topk_loss": 0.0161 }, { "epoch": 0.8950771007868441, "grad_norm": 0.125, "learning_rate": 5.565157543066402e-06, "lm_loss": 1.9263, "loss": 2.0671, "mask_loss": 0.1276, "step": 2252, "topk_loss": 0.0133 }, { "epoch": 0.8954745595349732, "grad_norm": 0.1474609375, "learning_rate": 5.5233106343205e-06, "lm_loss": 1.9293, "loss": 2.0669, "mask_loss": 0.1268, "step": 2253, "topk_loss": 0.0108 }, { "epoch": 0.8958720182831024, "grad_norm": 0.111328125, "learning_rate": 5.481617183918053e-06, "lm_loss": 1.9071, "loss": 2.0405, "mask_loss": 0.125, "step": 2254, "topk_loss": 0.0084 }, { "epoch": 0.8962694770312316, "grad_norm": 0.10888671875, "learning_rate": 5.4400772595816774e-06, "lm_loss": 1.8233, "loss": 1.9602, "mask_loss": 0.1265, "step": 2255, "topk_loss": 0.0103 }, { "epoch": 0.8966669357793607, "grad_norm": 0.115234375, "learning_rate": 5.398690928784578e-06, "lm_loss": 1.8864, "loss": 2.0203, "mask_loss": 0.1243, "step": 2256, "topk_loss": 0.0096 }, { "epoch": 0.8970643945274899, "grad_norm": 0.11279296875, "learning_rate": 5.357458258750547e-06, "lm_loss": 1.8947, "loss": 2.0277, "mask_loss": 0.1239, "step": 2257, "topk_loss": 0.0091 }, { "epoch": 0.897461853275619, "grad_norm": 0.11767578125, "learning_rate": 5.316379316453713e-06, "lm_loss": 1.9164, "loss": 2.0509, "mask_loss": 0.1248, "step": 2258, "topk_loss": 0.0098 }, { "epoch": 0.8978593120237481, "grad_norm": 0.119140625, "learning_rate": 5.275454168618577e-06, "lm_loss": 1.8878, "loss": 2.0233, "mask_loss": 0.1262, "step": 2259, "topk_loss": 0.0093 }, { "epoch": 0.8982567707718773, "grad_norm": 0.12060546875, "learning_rate": 5.2346828817197655e-06, "lm_loss": 1.9441, "loss": 2.083, "mask_loss": 0.1273, "step": 2260, "topk_loss": 0.0117 }, { "epoch": 0.8986542295200065, "grad_norm": 0.119140625, "learning_rate": 5.194065521982028e-06, "lm_loss": 1.8663, "loss": 2.0009, "mask_loss": 0.1246, "step": 2261, "topk_loss": 0.0099 }, { "epoch": 0.8990516882681356, "grad_norm": 0.11376953125, "learning_rate": 5.153602155380089e-06, "lm_loss": 1.9223, "loss": 2.0597, "mask_loss": 0.1272, "step": 2262, "topk_loss": 0.0102 }, { "epoch": 0.8994491470162648, "grad_norm": 0.109375, "learning_rate": 5.113292847638518e-06, "lm_loss": 1.8611, "loss": 1.9966, "mask_loss": 0.1266, "step": 2263, "topk_loss": 0.0089 }, { "epoch": 0.8998466057643939, "grad_norm": 0.10888671875, "learning_rate": 5.073137664231675e-06, "lm_loss": 1.9406, "loss": 2.0761, "mask_loss": 0.1255, "step": 2264, "topk_loss": 0.01 }, { "epoch": 0.900244064512523, "grad_norm": 0.10693359375, "learning_rate": 5.033136670383554e-06, "lm_loss": 1.9223, "loss": 2.0605, "mask_loss": 0.1285, "step": 2265, "topk_loss": 0.0097 }, { "epoch": 0.9006415232606522, "grad_norm": 0.11376953125, "learning_rate": 4.993289931067713e-06, "lm_loss": 1.921, "loss": 2.0529, "mask_loss": 0.123, "step": 2266, "topk_loss": 0.0089 }, { "epoch": 0.9010389820087814, "grad_norm": 0.109375, "learning_rate": 4.953597511007158e-06, "lm_loss": 1.9313, "loss": 2.063, "mask_loss": 0.1236, "step": 2267, "topk_loss": 0.0081 }, { "epoch": 0.9014364407569105, "grad_norm": 0.216796875, "learning_rate": 4.914059474674216e-06, "lm_loss": 1.9744, "loss": 2.1198, "mask_loss": 0.1299, "step": 2268, "topk_loss": 0.0155 }, { "epoch": 0.9018338995050397, "grad_norm": 0.115234375, "learning_rate": 4.874675886290459e-06, "lm_loss": 1.8485, "loss": 1.984, "mask_loss": 0.1253, "step": 2269, "topk_loss": 0.0102 }, { "epoch": 0.9022313582531688, "grad_norm": 0.11181640625, "learning_rate": 4.835446809826604e-06, "lm_loss": 1.8822, "loss": 2.0156, "mask_loss": 0.1252, "step": 2270, "topk_loss": 0.0082 }, { "epoch": 0.9026288170012979, "grad_norm": 0.12060546875, "learning_rate": 4.796372309002372e-06, "lm_loss": 1.9152, "loss": 2.0512, "mask_loss": 0.1263, "step": 2271, "topk_loss": 0.0098 }, { "epoch": 0.9030262757494271, "grad_norm": 0.10595703125, "learning_rate": 4.757452447286415e-06, "lm_loss": 1.8863, "loss": 2.0192, "mask_loss": 0.1239, "step": 2272, "topk_loss": 0.009 }, { "epoch": 0.9034237344975562, "grad_norm": 0.154296875, "learning_rate": 4.718687287896195e-06, "lm_loss": 1.9199, "loss": 2.0688, "mask_loss": 0.1321, "step": 2273, "topk_loss": 0.0168 }, { "epoch": 0.9038211932456854, "grad_norm": 0.111328125, "learning_rate": 4.680076893797914e-06, "lm_loss": 1.8638, "loss": 1.9998, "mask_loss": 0.1263, "step": 2274, "topk_loss": 0.0097 }, { "epoch": 0.9042186519938146, "grad_norm": 0.10693359375, "learning_rate": 4.641621327706369e-06, "lm_loss": 1.938, "loss": 2.0717, "mask_loss": 0.1243, "step": 2275, "topk_loss": 0.0094 }, { "epoch": 0.9046161107419437, "grad_norm": 0.1474609375, "learning_rate": 4.603320652084886e-06, "lm_loss": 1.8922, "loss": 2.0352, "mask_loss": 0.1287, "step": 2276, "topk_loss": 0.0143 }, { "epoch": 0.9050135694900728, "grad_norm": 0.111328125, "learning_rate": 4.565174929145188e-06, "lm_loss": 1.8313, "loss": 1.966, "mask_loss": 0.1254, "step": 2277, "topk_loss": 0.0093 }, { "epoch": 0.905411028238202, "grad_norm": 0.12255859375, "learning_rate": 4.527184220847325e-06, "lm_loss": 1.8758, "loss": 2.0109, "mask_loss": 0.126, "step": 2278, "topk_loss": 0.0091 }, { "epoch": 0.9058084869863311, "grad_norm": 0.109375, "learning_rate": 4.489348588899556e-06, "lm_loss": 1.8638, "loss": 1.9987, "mask_loss": 0.1257, "step": 2279, "topk_loss": 0.0092 }, { "epoch": 0.9062059457344603, "grad_norm": 0.11669921875, "learning_rate": 4.451668094758199e-06, "lm_loss": 1.9263, "loss": 2.0598, "mask_loss": 0.1241, "step": 2280, "topk_loss": 0.0094 }, { "epoch": 0.9066034044825895, "grad_norm": 0.1455078125, "learning_rate": 4.414142799627663e-06, "lm_loss": 1.8887, "loss": 2.0264, "mask_loss": 0.1277, "step": 2281, "topk_loss": 0.01 }, { "epoch": 0.9070008632307186, "grad_norm": 0.1103515625, "learning_rate": 4.3767727644602015e-06, "lm_loss": 1.9005, "loss": 2.039, "mask_loss": 0.1272, "step": 2282, "topk_loss": 0.0112 }, { "epoch": 0.9073983219788477, "grad_norm": 0.115234375, "learning_rate": 4.339558049955927e-06, "lm_loss": 1.8799, "loss": 2.0178, "mask_loss": 0.1259, "step": 2283, "topk_loss": 0.012 }, { "epoch": 0.9077957807269769, "grad_norm": 0.1103515625, "learning_rate": 4.3024987165626305e-06, "lm_loss": 1.956, "loss": 2.0933, "mask_loss": 0.1261, "step": 2284, "topk_loss": 0.0113 }, { "epoch": 0.908193239475106, "grad_norm": 0.1103515625, "learning_rate": 4.265594824475738e-06, "lm_loss": 1.8609, "loss": 1.9953, "mask_loss": 0.1254, "step": 2285, "topk_loss": 0.009 }, { "epoch": 0.9085906982232352, "grad_norm": 0.138671875, "learning_rate": 4.22884643363819e-06, "lm_loss": 1.8658, "loss": 2.0039, "mask_loss": 0.1276, "step": 2286, "topk_loss": 0.0105 }, { "epoch": 0.9089881569713644, "grad_norm": 0.1201171875, "learning_rate": 4.192253603740337e-06, "lm_loss": 1.9286, "loss": 2.065, "mask_loss": 0.1256, "step": 2287, "topk_loss": 0.0108 }, { "epoch": 0.9093856157194935, "grad_norm": 0.11669921875, "learning_rate": 4.155816394219858e-06, "lm_loss": 1.9051, "loss": 2.0402, "mask_loss": 0.1246, "step": 2288, "topk_loss": 0.0105 }, { "epoch": 0.9097830744676226, "grad_norm": 0.1142578125, "learning_rate": 4.119534864261643e-06, "lm_loss": 1.9425, "loss": 2.0796, "mask_loss": 0.128, "step": 2289, "topk_loss": 0.009 }, { "epoch": 0.9101805332157518, "grad_norm": 0.1328125, "learning_rate": 4.0834090727977505e-06, "lm_loss": 1.9699, "loss": 2.1058, "mask_loss": 0.1257, "step": 2290, "topk_loss": 0.0101 }, { "epoch": 0.9105779919638809, "grad_norm": 0.12109375, "learning_rate": 4.04743907850722e-06, "lm_loss": 1.8334, "loss": 1.9683, "mask_loss": 0.1262, "step": 2291, "topk_loss": 0.0087 }, { "epoch": 0.9109754507120101, "grad_norm": 0.11279296875, "learning_rate": 4.011624939816094e-06, "lm_loss": 1.9264, "loss": 2.0581, "mask_loss": 0.1231, "step": 2292, "topk_loss": 0.0085 }, { "epoch": 0.9113729094601393, "grad_norm": 0.1279296875, "learning_rate": 3.975966714897195e-06, "lm_loss": 1.9287, "loss": 2.0642, "mask_loss": 0.1264, "step": 2293, "topk_loss": 0.0091 }, { "epoch": 0.9117703682082684, "grad_norm": 0.10498046875, "learning_rate": 3.940464461670135e-06, "lm_loss": 1.9425, "loss": 2.0769, "mask_loss": 0.1253, "step": 2294, "topk_loss": 0.0092 }, { "epoch": 0.9121678269563975, "grad_norm": 0.1171875, "learning_rate": 3.9051182378011755e-06, "lm_loss": 1.9558, "loss": 2.0917, "mask_loss": 0.1264, "step": 2295, "topk_loss": 0.0095 }, { "epoch": 0.9125652857045267, "grad_norm": 0.125, "learning_rate": 3.8699281007031245e-06, "lm_loss": 1.8112, "loss": 1.9456, "mask_loss": 0.126, "step": 2296, "topk_loss": 0.0085 }, { "epoch": 0.9129627444526558, "grad_norm": 0.166015625, "learning_rate": 3.834894107535269e-06, "lm_loss": 1.956, "loss": 2.0907, "mask_loss": 0.1249, "step": 2297, "topk_loss": 0.0098 }, { "epoch": 0.9133602032007849, "grad_norm": 0.14453125, "learning_rate": 3.8000163152032697e-06, "lm_loss": 1.8895, "loss": 2.0251, "mask_loss": 0.1246, "step": 2298, "topk_loss": 0.011 }, { "epoch": 0.9137576619489142, "grad_norm": 0.1240234375, "learning_rate": 3.7652947803590855e-06, "lm_loss": 1.8623, "loss": 1.9993, "mask_loss": 0.1275, "step": 2299, "topk_loss": 0.0094 }, { "epoch": 0.9141551206970433, "grad_norm": 0.10888671875, "learning_rate": 3.7307295594008472e-06, "lm_loss": 1.9201, "loss": 2.0575, "mask_loss": 0.127, "step": 2300, "topk_loss": 0.0103 }, { "epoch": 0.9141551206970433, "eval_lm_loss": 687.35888671875, "eval_loss": 687.495849609375, "eval_mask_hit_rate": 0.5359721183776855, "eval_mask_loss": 0.12440076470375061, "eval_mask_top_10_hit_rate": 0.9856007695198059, "eval_mask_top_1_hit_rate": 0.9974291324615479, "eval_mask_top_20_hit_rate": 0.9762364029884338, "eval_mask_top_5_hit_rate": 0.9908666610717773, "eval_runtime": 143.7299, "eval_samples_per_second": 14.249, "eval_steps_per_second": 7.124, "eval_token_accuracy": 0.614601731300354, "eval_top_k_diff": -530.08642578125, "eval_topk_loss": 0.012541667558252811, "step": 2300 }, { "epoch": 0.9145525794451724, "grad_norm": 0.12109375, "learning_rate": 3.696320708472778e-06, "lm_loss": 1.917, "loss": 2.0521, "mask_loss": 0.125, "step": 2301, "topk_loss": 0.0101 }, { "epoch": 0.9149500381933016, "grad_norm": 0.130859375, "learning_rate": 3.6620682834651366e-06, "lm_loss": 1.9343, "loss": 2.0674, "mask_loss": 0.124, "step": 2302, "topk_loss": 0.0092 }, { "epoch": 0.9153474969414307, "grad_norm": 0.11181640625, "learning_rate": 3.627972340014085e-06, "lm_loss": 1.8732, "loss": 2.0106, "mask_loss": 0.1276, "step": 2303, "topk_loss": 0.0097 }, { "epoch": 0.9157449556895598, "grad_norm": 0.126953125, "learning_rate": 3.594032933501601e-06, "lm_loss": 1.9487, "loss": 2.0853, "mask_loss": 0.1256, "step": 2304, "topk_loss": 0.0109 }, { "epoch": 0.9161424144376891, "grad_norm": 0.119140625, "learning_rate": 3.5602501190554193e-06, "lm_loss": 1.9035, "loss": 2.0389, "mask_loss": 0.1253, "step": 2305, "topk_loss": 0.0102 }, { "epoch": 0.9165398731858182, "grad_norm": 0.1123046875, "learning_rate": 3.526623951548913e-06, "lm_loss": 1.8585, "loss": 1.9931, "mask_loss": 0.1261, "step": 2306, "topk_loss": 0.0085 }, { "epoch": 0.9169373319339473, "grad_norm": 0.1611328125, "learning_rate": 3.4931544856010133e-06, "lm_loss": 1.975, "loss": 2.1077, "mask_loss": 0.1238, "step": 2307, "topk_loss": 0.0088 }, { "epoch": 0.9173347906820765, "grad_norm": 0.1298828125, "learning_rate": 3.4598417755761225e-06, "lm_loss": 1.89, "loss": 2.0273, "mask_loss": 0.1263, "step": 2308, "topk_loss": 0.011 }, { "epoch": 0.9177322494302056, "grad_norm": 0.1376953125, "learning_rate": 3.4266858755840346e-06, "lm_loss": 1.8809, "loss": 2.0172, "mask_loss": 0.1261, "step": 2309, "topk_loss": 0.0103 }, { "epoch": 0.9181297081783347, "grad_norm": 0.10888671875, "learning_rate": 3.393686839479815e-06, "lm_loss": 1.9265, "loss": 2.0603, "mask_loss": 0.1251, "step": 2310, "topk_loss": 0.0088 }, { "epoch": 0.918527166926464, "grad_norm": 0.12255859375, "learning_rate": 3.360844720863765e-06, "lm_loss": 1.9214, "loss": 2.0567, "mask_loss": 0.125, "step": 2311, "topk_loss": 0.0103 }, { "epoch": 0.9189246256745931, "grad_norm": 0.15234375, "learning_rate": 3.3281595730812575e-06, "lm_loss": 1.9702, "loss": 2.1047, "mask_loss": 0.1229, "step": 2312, "topk_loss": 0.0116 }, { "epoch": 0.9193220844227222, "grad_norm": 0.12158203125, "learning_rate": 3.295631449222758e-06, "lm_loss": 1.9301, "loss": 2.0679, "mask_loss": 0.1269, "step": 2313, "topk_loss": 0.0109 }, { "epoch": 0.9197195431708514, "grad_norm": 0.10791015625, "learning_rate": 3.2632604021236358e-06, "lm_loss": 1.8932, "loss": 2.029, "mask_loss": 0.1258, "step": 2314, "topk_loss": 0.01 }, { "epoch": 0.9201170019189805, "grad_norm": 0.13671875, "learning_rate": 3.2310464843641307e-06, "lm_loss": 1.8941, "loss": 2.0359, "mask_loss": 0.1295, "step": 2315, "topk_loss": 0.0124 }, { "epoch": 0.9205144606671096, "grad_norm": 0.140625, "learning_rate": 3.1989897482692767e-06, "lm_loss": 1.8964, "loss": 2.0293, "mask_loss": 0.1244, "step": 2316, "topk_loss": 0.0085 }, { "epoch": 0.9209119194152389, "grad_norm": 0.1298828125, "learning_rate": 3.1670902459087658e-06, "lm_loss": 1.9011, "loss": 2.0372, "mask_loss": 0.1267, "step": 2317, "topk_loss": 0.0094 }, { "epoch": 0.921309378163368, "grad_norm": 0.1201171875, "learning_rate": 3.1353480290969183e-06, "lm_loss": 1.9406, "loss": 2.0769, "mask_loss": 0.1265, "step": 2318, "topk_loss": 0.0099 }, { "epoch": 0.9217068369114971, "grad_norm": 0.1376953125, "learning_rate": 3.103763149392569e-06, "lm_loss": 1.859, "loss": 1.995, "mask_loss": 0.1266, "step": 2319, "topk_loss": 0.0094 }, { "epoch": 0.9221042956596263, "grad_norm": 0.216796875, "learning_rate": 3.0723356580989904e-06, "lm_loss": 1.8902, "loss": 2.0444, "mask_loss": 0.1333, "step": 2320, "topk_loss": 0.0209 }, { "epoch": 0.9225017544077554, "grad_norm": 0.1162109375, "learning_rate": 3.041065606263804e-06, "lm_loss": 1.8954, "loss": 2.0335, "mask_loss": 0.1276, "step": 2321, "topk_loss": 0.0105 }, { "epoch": 0.9228992131558845, "grad_norm": 0.2197265625, "learning_rate": 3.0099530446789036e-06, "lm_loss": 1.8985, "loss": 2.0366, "mask_loss": 0.1275, "step": 2322, "topk_loss": 0.0106 }, { "epoch": 0.9232966719040138, "grad_norm": 0.12158203125, "learning_rate": 2.978998023880386e-06, "lm_loss": 1.8974, "loss": 2.0356, "mask_loss": 0.1276, "step": 2323, "topk_loss": 0.0107 }, { "epoch": 0.9236941306521429, "grad_norm": 0.1103515625, "learning_rate": 2.9482005941484423e-06, "lm_loss": 1.883, "loss": 2.0155, "mask_loss": 0.1234, "step": 2324, "topk_loss": 0.0091 }, { "epoch": 0.924091589400272, "grad_norm": 0.11474609375, "learning_rate": 2.9175608055072913e-06, "lm_loss": 1.9774, "loss": 2.1142, "mask_loss": 0.1252, "step": 2325, "topk_loss": 0.0116 }, { "epoch": 0.9244890481484012, "grad_norm": 0.109375, "learning_rate": 2.8870787077250994e-06, "lm_loss": 1.92, "loss": 2.0525, "mask_loss": 0.1234, "step": 2326, "topk_loss": 0.0092 }, { "epoch": 0.9248865068965303, "grad_norm": 0.11669921875, "learning_rate": 2.856754350313873e-06, "lm_loss": 1.896, "loss": 2.032, "mask_loss": 0.126, "step": 2327, "topk_loss": 0.0099 }, { "epoch": 0.9252839656446594, "grad_norm": 0.1328125, "learning_rate": 2.826587782529444e-06, "lm_loss": 1.9167, "loss": 2.0525, "mask_loss": 0.1252, "step": 2328, "topk_loss": 0.0106 }, { "epoch": 0.9256814243927886, "grad_norm": 0.109375, "learning_rate": 2.7965790533713064e-06, "lm_loss": 1.9363, "loss": 2.068, "mask_loss": 0.1231, "step": 2329, "topk_loss": 0.0086 }, { "epoch": 0.9260788831409178, "grad_norm": 0.1142578125, "learning_rate": 2.7667282115826033e-06, "lm_loss": 1.9306, "loss": 2.0698, "mask_loss": 0.1275, "step": 2330, "topk_loss": 0.0117 }, { "epoch": 0.9264763418890469, "grad_norm": 0.12451171875, "learning_rate": 2.737035305650004e-06, "lm_loss": 1.8215, "loss": 1.9577, "mask_loss": 0.1264, "step": 2331, "topk_loss": 0.0099 }, { "epoch": 0.9268738006371761, "grad_norm": 0.1484375, "learning_rate": 2.707500383803663e-06, "lm_loss": 1.8949, "loss": 2.0284, "mask_loss": 0.1248, "step": 2332, "topk_loss": 0.0087 }, { "epoch": 0.9272712593853052, "grad_norm": 0.12353515625, "learning_rate": 2.678123494017093e-06, "lm_loss": 1.9689, "loss": 2.1045, "mask_loss": 0.1259, "step": 2333, "topk_loss": 0.0097 }, { "epoch": 0.9276687181334343, "grad_norm": 0.169921875, "learning_rate": 2.6489046840071475e-06, "lm_loss": 1.8665, "loss": 2.0033, "mask_loss": 0.1255, "step": 2334, "topk_loss": 0.0113 }, { "epoch": 0.9280661768815635, "grad_norm": 0.109375, "learning_rate": 2.619844001233884e-06, "lm_loss": 1.9569, "loss": 2.0919, "mask_loss": 0.1246, "step": 2335, "topk_loss": 0.0104 }, { "epoch": 0.9284636356296927, "grad_norm": 0.115234375, "learning_rate": 2.590941492900534e-06, "lm_loss": 1.8497, "loss": 1.9884, "mask_loss": 0.1285, "step": 2336, "topk_loss": 0.0102 }, { "epoch": 0.9288610943778218, "grad_norm": 0.1318359375, "learning_rate": 2.562197205953376e-06, "lm_loss": 1.9076, "loss": 2.0424, "mask_loss": 0.1242, "step": 2337, "topk_loss": 0.0106 }, { "epoch": 0.929258553125951, "grad_norm": 0.115234375, "learning_rate": 2.5336111870817414e-06, "lm_loss": 1.8494, "loss": 1.9835, "mask_loss": 0.1246, "step": 2338, "topk_loss": 0.0095 }, { "epoch": 0.9296560118740801, "grad_norm": 0.15234375, "learning_rate": 2.5051834827178432e-06, "lm_loss": 1.9085, "loss": 2.0447, "mask_loss": 0.1255, "step": 2339, "topk_loss": 0.0107 }, { "epoch": 0.9300534706222092, "grad_norm": 0.1162109375, "learning_rate": 2.4769141390367567e-06, "lm_loss": 1.8833, "loss": 2.0206, "mask_loss": 0.1276, "step": 2340, "topk_loss": 0.0097 }, { "epoch": 0.9304509293703384, "grad_norm": 0.115234375, "learning_rate": 2.4488032019563402e-06, "lm_loss": 1.9008, "loss": 2.0353, "mask_loss": 0.125, "step": 2341, "topk_loss": 0.0094 }, { "epoch": 0.9308483881184676, "grad_norm": 0.1279296875, "learning_rate": 2.4208507171371353e-06, "lm_loss": 1.8745, "loss": 2.01, "mask_loss": 0.1263, "step": 2342, "topk_loss": 0.0092 }, { "epoch": 0.9312458468665967, "grad_norm": 0.115234375, "learning_rate": 2.3930567299823457e-06, "lm_loss": 1.9383, "loss": 2.0736, "mask_loss": 0.1251, "step": 2343, "topk_loss": 0.0103 }, { "epoch": 0.9316433056147259, "grad_norm": 0.11865234375, "learning_rate": 2.36542128563767e-06, "lm_loss": 1.9577, "loss": 2.0925, "mask_loss": 0.1252, "step": 2344, "topk_loss": 0.0096 }, { "epoch": 0.932040764362855, "grad_norm": 0.1064453125, "learning_rate": 2.3379444289913342e-06, "lm_loss": 1.9382, "loss": 2.0704, "mask_loss": 0.1237, "step": 2345, "topk_loss": 0.0086 }, { "epoch": 0.9324382231109841, "grad_norm": 0.1279296875, "learning_rate": 2.31062620467396e-06, "lm_loss": 1.903, "loss": 2.0397, "mask_loss": 0.1272, "step": 2346, "topk_loss": 0.0094 }, { "epoch": 0.9328356818591133, "grad_norm": 0.11865234375, "learning_rate": 2.2834666570584862e-06, "lm_loss": 1.9193, "loss": 2.0559, "mask_loss": 0.127, "step": 2347, "topk_loss": 0.0096 }, { "epoch": 0.9332331406072425, "grad_norm": 0.162109375, "learning_rate": 2.256465830260135e-06, "lm_loss": 1.9004, "loss": 2.0394, "mask_loss": 0.1266, "step": 2348, "topk_loss": 0.0124 }, { "epoch": 0.9336305993553716, "grad_norm": 0.11328125, "learning_rate": 2.229623768136313e-06, "lm_loss": 1.8747, "loss": 2.0122, "mask_loss": 0.1273, "step": 2349, "topk_loss": 0.0102 }, { "epoch": 0.9340280581035008, "grad_norm": 0.1259765625, "learning_rate": 2.2029405142865224e-06, "lm_loss": 1.8982, "loss": 2.0336, "mask_loss": 0.1252, "step": 2350, "topk_loss": 0.0102 }, { "epoch": 0.9340280581035008, "eval_lm_loss": 687.3653564453125, "eval_loss": 687.5023193359375, "eval_mask_hit_rate": 0.5359728336334229, "eval_mask_loss": 0.12440057098865509, "eval_mask_top_10_hit_rate": 0.9856017231941223, "eval_mask_top_1_hit_rate": 0.997429370880127, "eval_mask_top_20_hit_rate": 0.9762368202209473, "eval_mask_top_5_hit_rate": 0.9908671975135803, "eval_runtime": 144.3473, "eval_samples_per_second": 14.188, "eval_steps_per_second": 7.094, "eval_token_accuracy": 0.6146026849746704, "eval_top_k_diff": -530.0986328125, "eval_topk_loss": 0.012541640549898148, "step": 2350 }, { "epoch": 0.9344255168516299, "grad_norm": 0.1123046875, "learning_rate": 2.1764161120523484e-06, "lm_loss": 1.8715, "loss": 2.0062, "mask_loss": 0.1254, "step": 2351, "topk_loss": 0.0093 }, { "epoch": 0.934822975599759, "grad_norm": 0.11865234375, "learning_rate": 2.1500506045173287e-06, "lm_loss": 1.8771, "loss": 2.0135, "mask_loss": 0.126, "step": 2352, "topk_loss": 0.0104 }, { "epoch": 0.9352204343478882, "grad_norm": 0.12158203125, "learning_rate": 2.123844034506928e-06, "lm_loss": 1.8795, "loss": 2.0134, "mask_loss": 0.1244, "step": 2353, "topk_loss": 0.0096 }, { "epoch": 0.9356178930960174, "grad_norm": 0.10888671875, "learning_rate": 2.097796444588418e-06, "lm_loss": 1.9088, "loss": 2.0436, "mask_loss": 0.1248, "step": 2354, "topk_loss": 0.0101 }, { "epoch": 0.9360153518441465, "grad_norm": 0.1171875, "learning_rate": 2.0719078770708777e-06, "lm_loss": 1.8938, "loss": 2.0272, "mask_loss": 0.1244, "step": 2355, "topk_loss": 0.009 }, { "epoch": 0.9364128105922757, "grad_norm": 0.1259765625, "learning_rate": 2.046178374005059e-06, "lm_loss": 1.9042, "loss": 2.0408, "mask_loss": 0.1266, "step": 2356, "topk_loss": 0.01 }, { "epoch": 0.9368102693404048, "grad_norm": 0.11376953125, "learning_rate": 2.020607977183353e-06, "lm_loss": 1.8682, "loss": 2.0092, "mask_loss": 0.1288, "step": 2357, "topk_loss": 0.0122 }, { "epoch": 0.9372077280885339, "grad_norm": 0.1513671875, "learning_rate": 1.9951967281397257e-06, "lm_loss": 1.8533, "loss": 1.9906, "mask_loss": 0.1277, "step": 2358, "topk_loss": 0.0096 }, { "epoch": 0.937605186836663, "grad_norm": 0.10986328125, "learning_rate": 1.969944668149637e-06, "lm_loss": 1.8471, "loss": 1.9827, "mask_loss": 0.1269, "step": 2359, "topk_loss": 0.0088 }, { "epoch": 0.9380026455847922, "grad_norm": 0.11572265625, "learning_rate": 1.9448518382299553e-06, "lm_loss": 1.8907, "loss": 2.0257, "mask_loss": 0.1254, "step": 2360, "topk_loss": 0.0096 }, { "epoch": 0.9384001043329214, "grad_norm": 0.1279296875, "learning_rate": 1.9199182791389214e-06, "lm_loss": 1.9252, "loss": 2.0615, "mask_loss": 0.126, "step": 2361, "topk_loss": 0.0103 }, { "epoch": 0.9387975630810506, "grad_norm": 0.126953125, "learning_rate": 1.8951440313760837e-06, "lm_loss": 1.8604, "loss": 1.9968, "mask_loss": 0.1262, "step": 2362, "topk_loss": 0.0103 }, { "epoch": 0.9391950218291797, "grad_norm": 0.10986328125, "learning_rate": 1.8705291351822307e-06, "lm_loss": 1.9192, "loss": 2.0565, "mask_loss": 0.1267, "step": 2363, "topk_loss": 0.0107 }, { "epoch": 0.9395924805773088, "grad_norm": 0.10888671875, "learning_rate": 1.84607363053928e-06, "lm_loss": 1.9336, "loss": 2.0659, "mask_loss": 0.1234, "step": 2364, "topk_loss": 0.0089 }, { "epoch": 0.939989939325438, "grad_norm": 0.1240234375, "learning_rate": 1.8217775571702677e-06, "lm_loss": 1.9271, "loss": 2.0636, "mask_loss": 0.1259, "step": 2365, "topk_loss": 0.0106 }, { "epoch": 0.9403873980735671, "grad_norm": 0.109375, "learning_rate": 1.7976409545392924e-06, "lm_loss": 1.9143, "loss": 2.0514, "mask_loss": 0.1263, "step": 2366, "topk_loss": 0.0108 }, { "epoch": 0.9407848568216963, "grad_norm": 0.1162109375, "learning_rate": 1.7736638618513934e-06, "lm_loss": 1.8904, "loss": 2.0244, "mask_loss": 0.1258, "step": 2367, "topk_loss": 0.0082 }, { "epoch": 0.9411823155698255, "grad_norm": 0.126953125, "learning_rate": 1.7498463180525172e-06, "lm_loss": 1.9554, "loss": 2.0874, "mask_loss": 0.1224, "step": 2368, "topk_loss": 0.0096 }, { "epoch": 0.9415797743179546, "grad_norm": 0.109375, "learning_rate": 1.7261883618294616e-06, "lm_loss": 1.8866, "loss": 2.0236, "mask_loss": 0.1267, "step": 2369, "topk_loss": 0.0103 }, { "epoch": 0.9419772330660837, "grad_norm": 0.115234375, "learning_rate": 1.7026900316098215e-06, "lm_loss": 1.89, "loss": 2.0293, "mask_loss": 0.1291, "step": 2370, "topk_loss": 0.0101 }, { "epoch": 0.9423746918142129, "grad_norm": 0.13671875, "learning_rate": 1.6793513655618986e-06, "lm_loss": 1.9824, "loss": 2.1146, "mask_loss": 0.122, "step": 2371, "topk_loss": 0.0103 }, { "epoch": 0.942772150562342, "grad_norm": 0.1083984375, "learning_rate": 1.6561724015946356e-06, "lm_loss": 1.8736, "loss": 2.0121, "mask_loss": 0.1274, "step": 2372, "topk_loss": 0.011 }, { "epoch": 0.9431696093104712, "grad_norm": 0.1337890625, "learning_rate": 1.6331531773576048e-06, "lm_loss": 1.9437, "loss": 2.0775, "mask_loss": 0.1243, "step": 2373, "topk_loss": 0.0095 }, { "epoch": 0.9435670680586004, "grad_norm": 0.11376953125, "learning_rate": 1.6102937302408972e-06, "lm_loss": 1.8901, "loss": 2.0256, "mask_loss": 0.1254, "step": 2374, "topk_loss": 0.0101 }, { "epoch": 0.9439645268067295, "grad_norm": 0.11767578125, "learning_rate": 1.587594097375078e-06, "lm_loss": 1.925, "loss": 2.0596, "mask_loss": 0.1254, "step": 2375, "topk_loss": 0.0092 }, { "epoch": 0.9443619855548586, "grad_norm": 0.10888671875, "learning_rate": 1.5650543156311205e-06, "lm_loss": 1.9367, "loss": 2.0719, "mask_loss": 0.1247, "step": 2376, "topk_loss": 0.0105 }, { "epoch": 0.9447594443029877, "grad_norm": 0.1552734375, "learning_rate": 1.5426744216203493e-06, "lm_loss": 1.899, "loss": 2.0373, "mask_loss": 0.1273, "step": 2377, "topk_loss": 0.011 }, { "epoch": 0.9451569030511169, "grad_norm": 0.142578125, "learning_rate": 1.5204544516944198e-06, "lm_loss": 1.8555, "loss": 1.9928, "mask_loss": 0.1273, "step": 2378, "topk_loss": 0.01 }, { "epoch": 0.9455543617992461, "grad_norm": 0.11376953125, "learning_rate": 1.4983944419451613e-06, "lm_loss": 1.9303, "loss": 2.0642, "mask_loss": 0.1238, "step": 2379, "topk_loss": 0.0102 }, { "epoch": 0.9459518205473753, "grad_norm": 0.1591796875, "learning_rate": 1.4764944282046445e-06, "lm_loss": 1.8519, "loss": 1.9877, "mask_loss": 0.1254, "step": 2380, "topk_loss": 0.0105 }, { "epoch": 0.9463492792955044, "grad_norm": 0.11474609375, "learning_rate": 1.4547544460450035e-06, "lm_loss": 1.8738, "loss": 2.01, "mask_loss": 0.1255, "step": 2381, "topk_loss": 0.0108 }, { "epoch": 0.9467467380436335, "grad_norm": 0.115234375, "learning_rate": 1.4331745307784805e-06, "lm_loss": 1.8965, "loss": 2.0315, "mask_loss": 0.1252, "step": 2382, "topk_loss": 0.0097 }, { "epoch": 0.9471441967917626, "grad_norm": 0.1103515625, "learning_rate": 1.411754717457292e-06, "lm_loss": 1.8521, "loss": 1.9849, "mask_loss": 0.1239, "step": 2383, "topk_loss": 0.0089 }, { "epoch": 0.9475416555398918, "grad_norm": 0.1103515625, "learning_rate": 1.3904950408735962e-06, "lm_loss": 1.8487, "loss": 1.9845, "mask_loss": 0.1262, "step": 2384, "topk_loss": 0.0096 }, { "epoch": 0.9479391142880209, "grad_norm": 0.11181640625, "learning_rate": 1.36939553555947e-06, "lm_loss": 1.9358, "loss": 2.0704, "mask_loss": 0.1248, "step": 2385, "topk_loss": 0.0099 }, { "epoch": 0.9483365730361502, "grad_norm": 0.1103515625, "learning_rate": 1.3484562357867992e-06, "lm_loss": 1.8867, "loss": 2.021, "mask_loss": 0.1248, "step": 2386, "topk_loss": 0.0095 }, { "epoch": 0.9487340317842793, "grad_norm": 0.44140625, "learning_rate": 1.3276771755672545e-06, "lm_loss": 1.9017, "loss": 2.1053, "mask_loss": 0.1622, "step": 2387, "topk_loss": 0.0414 }, { "epoch": 0.9491314905324084, "grad_norm": 0.11181640625, "learning_rate": 1.307058388652238e-06, "lm_loss": 1.8856, "loss": 2.0201, "mask_loss": 0.1256, "step": 2388, "topk_loss": 0.009 }, { "epoch": 0.9495289492805375, "grad_norm": 0.119140625, "learning_rate": 1.286599908532815e-06, "lm_loss": 1.8799, "loss": 2.014, "mask_loss": 0.1247, "step": 2389, "topk_loss": 0.0093 }, { "epoch": 0.9499264080286667, "grad_norm": 0.1494140625, "learning_rate": 1.2663017684396593e-06, "lm_loss": 1.8059, "loss": 1.9435, "mask_loss": 0.1279, "step": 2390, "topk_loss": 0.0097 }, { "epoch": 0.9503238667767958, "grad_norm": 0.1181640625, "learning_rate": 1.2461640013430087e-06, "lm_loss": 1.9366, "loss": 2.0719, "mask_loss": 0.1246, "step": 2391, "topk_loss": 0.0107 }, { "epoch": 0.950721325524925, "grad_norm": 0.12353515625, "learning_rate": 1.22618663995262e-06, "lm_loss": 1.8743, "loss": 2.0077, "mask_loss": 0.1249, "step": 2392, "topk_loss": 0.0085 }, { "epoch": 0.9511187842730542, "grad_norm": 0.2412109375, "learning_rate": 1.20636971671767e-06, "lm_loss": 1.888, "loss": 2.0355, "mask_loss": 0.1307, "step": 2393, "topk_loss": 0.0168 }, { "epoch": 0.9515162430211833, "grad_norm": 0.1171875, "learning_rate": 1.1867132638267664e-06, "lm_loss": 1.8891, "loss": 2.0248, "mask_loss": 0.1261, "step": 2394, "topk_loss": 0.0096 }, { "epoch": 0.9519137017693124, "grad_norm": 0.1435546875, "learning_rate": 1.167217313207858e-06, "lm_loss": 1.8765, "loss": 2.0168, "mask_loss": 0.1266, "step": 2395, "topk_loss": 0.0137 }, { "epoch": 0.9523111605174416, "grad_norm": 0.1337890625, "learning_rate": 1.1478818965281911e-06, "lm_loss": 1.9205, "loss": 2.0549, "mask_loss": 0.1258, "step": 2396, "topk_loss": 0.0085 }, { "epoch": 0.9527086192655707, "grad_norm": 0.1162109375, "learning_rate": 1.1287070451942438e-06, "lm_loss": 1.9174, "loss": 2.0505, "mask_loss": 0.1241, "step": 2397, "topk_loss": 0.009 }, { "epoch": 0.9531060780137, "grad_norm": 0.10791015625, "learning_rate": 1.109692790351713e-06, "lm_loss": 1.9071, "loss": 2.0398, "mask_loss": 0.1243, "step": 2398, "topk_loss": 0.0085 }, { "epoch": 0.9535035367618291, "grad_norm": 0.11962890625, "learning_rate": 1.0908391628854041e-06, "lm_loss": 1.8462, "loss": 1.9832, "mask_loss": 0.1274, "step": 2399, "topk_loss": 0.0096 }, { "epoch": 0.9539009955099582, "grad_norm": 0.11865234375, "learning_rate": 1.0721461934192545e-06, "lm_loss": 1.9195, "loss": 2.0519, "mask_loss": 0.1236, "step": 2400, "topk_loss": 0.0087 }, { "epoch": 0.9539009955099582, "eval_lm_loss": 687.3690795898438, "eval_loss": 687.5060424804688, "eval_mask_hit_rate": 0.5359731912612915, "eval_mask_loss": 0.12440052628517151, "eval_mask_top_10_hit_rate": 0.9856013059616089, "eval_mask_top_1_hit_rate": 0.997429609298706, "eval_mask_top_20_hit_rate": 0.9762364625930786, "eval_mask_top_5_hit_rate": 0.990867018699646, "eval_runtime": 144.1892, "eval_samples_per_second": 14.204, "eval_steps_per_second": 7.102, "eval_token_accuracy": 0.6146031618118286, "eval_top_k_diff": -530.1141357421875, "eval_topk_loss": 0.012541300617158413, "step": 2400 }, { "epoch": 0.9542984542580873, "grad_norm": 0.11572265625, "learning_rate": 1.0536139123162093e-06, "lm_loss": 1.8699, "loss": 2.0035, "mask_loss": 0.1253, "step": 2401, "topk_loss": 0.0083 }, { "epoch": 0.9546959130062165, "grad_norm": 0.1162109375, "learning_rate": 1.035242349678245e-06, "lm_loss": 1.8848, "loss": 2.0193, "mask_loss": 0.1261, "step": 2402, "topk_loss": 0.0085 }, { "epoch": 0.9550933717543456, "grad_norm": 0.1318359375, "learning_rate": 1.0170315353462466e-06, "lm_loss": 1.9585, "loss": 2.0925, "mask_loss": 0.1241, "step": 2403, "topk_loss": 0.0099 }, { "epoch": 0.9554908305024749, "grad_norm": 0.12109375, "learning_rate": 9.9898149890002e-07, "lm_loss": 1.9355, "loss": 2.0712, "mask_loss": 0.1262, "step": 2404, "topk_loss": 0.0096 }, { "epoch": 0.955888289250604, "grad_norm": 0.1259765625, "learning_rate": 9.810922696582014e-07, "lm_loss": 1.9173, "loss": 2.0542, "mask_loss": 0.1258, "step": 2405, "topk_loss": 0.011 }, { "epoch": 0.9562857479987331, "grad_norm": 0.11474609375, "learning_rate": 9.633638766782582e-07, "lm_loss": 1.9092, "loss": 2.044, "mask_loss": 0.1253, "step": 2406, "topk_loss": 0.0095 }, { "epoch": 0.9566832067468622, "grad_norm": 0.11572265625, "learning_rate": 9.457963487563781e-07, "lm_loss": 1.8532, "loss": 1.9889, "mask_loss": 0.1254, "step": 2407, "topk_loss": 0.0103 }, { "epoch": 0.9570806654949914, "grad_norm": 0.1162109375, "learning_rate": 9.283897144274689e-07, "lm_loss": 1.8861, "loss": 2.0216, "mask_loss": 0.1258, "step": 2408, "topk_loss": 0.0097 }, { "epoch": 0.9574781242431205, "grad_norm": 0.126953125, "learning_rate": 9.11144001965103e-07, "lm_loss": 1.881, "loss": 2.0185, "mask_loss": 0.1271, "step": 2409, "topk_loss": 0.0104 }, { "epoch": 0.9578755829912498, "grad_norm": 0.1240234375, "learning_rate": 8.940592393814728e-07, "lm_loss": 1.8996, "loss": 2.0353, "mask_loss": 0.1259, "step": 2410, "topk_loss": 0.0099 }, { "epoch": 0.9582730417393789, "grad_norm": 0.1162109375, "learning_rate": 8.771354544273247e-07, "lm_loss": 1.8675, "loss": 2.0009, "mask_loss": 0.1237, "step": 2411, "topk_loss": 0.0096 }, { "epoch": 0.958670500487508, "grad_norm": 0.15625, "learning_rate": 8.603726745919361e-07, "lm_loss": 1.8671, "loss": 2.0062, "mask_loss": 0.129, "step": 2412, "topk_loss": 0.0102 }, { "epoch": 0.9590679592356371, "grad_norm": 0.11669921875, "learning_rate": 8.437709271030603e-07, "lm_loss": 1.9812, "loss": 2.1134, "mask_loss": 0.1229, "step": 2413, "topk_loss": 0.0094 }, { "epoch": 0.9594654179837663, "grad_norm": 0.11083984375, "learning_rate": 8.273302389269044e-07, "lm_loss": 1.9474, "loss": 2.0823, "mask_loss": 0.1258, "step": 2414, "topk_loss": 0.0091 }, { "epoch": 0.9598628767318954, "grad_norm": 0.10595703125, "learning_rate": 8.110506367680515e-07, "lm_loss": 1.845, "loss": 1.979, "mask_loss": 0.1244, "step": 2415, "topk_loss": 0.0096 }, { "epoch": 0.9602603354800245, "grad_norm": 0.11669921875, "learning_rate": 7.94932147069416e-07, "lm_loss": 1.8422, "loss": 1.9828, "mask_loss": 0.1291, "step": 2416, "topk_loss": 0.0115 }, { "epoch": 0.9606577942281538, "grad_norm": 0.11376953125, "learning_rate": 7.789747960122551e-07, "lm_loss": 1.874, "loss": 2.0101, "mask_loss": 0.1265, "step": 2417, "topk_loss": 0.0096 }, { "epoch": 0.9610552529762829, "grad_norm": 0.11328125, "learning_rate": 7.631786095160687e-07, "lm_loss": 1.8978, "loss": 2.0364, "mask_loss": 0.128, "step": 2418, "topk_loss": 0.0107 }, { "epoch": 0.961452711724412, "grad_norm": 0.1103515625, "learning_rate": 7.47543613238566e-07, "lm_loss": 1.8937, "loss": 2.0272, "mask_loss": 0.1238, "step": 2419, "topk_loss": 0.0097 }, { "epoch": 0.9618501704725412, "grad_norm": 0.140625, "learning_rate": 7.320698325756658e-07, "lm_loss": 1.8951, "loss": 2.0322, "mask_loss": 0.1261, "step": 2420, "topk_loss": 0.0109 }, { "epoch": 0.9622476292206703, "grad_norm": 0.11181640625, "learning_rate": 7.167572926613853e-07, "lm_loss": 1.9415, "loss": 2.0742, "mask_loss": 0.1237, "step": 2421, "topk_loss": 0.009 }, { "epoch": 0.9626450879687994, "grad_norm": 0.130859375, "learning_rate": 7.01606018367873e-07, "lm_loss": 1.9043, "loss": 2.0436, "mask_loss": 0.1273, "step": 2422, "topk_loss": 0.0121 }, { "epoch": 0.9630425467169287, "grad_norm": 0.1171875, "learning_rate": 6.866160343053318e-07, "lm_loss": 1.8718, "loss": 2.0095, "mask_loss": 0.1265, "step": 2423, "topk_loss": 0.0112 }, { "epoch": 0.9634400054650578, "grad_norm": 0.11669921875, "learning_rate": 6.71787364821952e-07, "lm_loss": 1.9277, "loss": 2.0597, "mask_loss": 0.1235, "step": 2424, "topk_loss": 0.0085 }, { "epoch": 0.963837464213187, "grad_norm": 0.1279296875, "learning_rate": 6.571200340039218e-07, "lm_loss": 1.8771, "loss": 2.0154, "mask_loss": 0.1264, "step": 2425, "topk_loss": 0.0119 }, { "epoch": 0.9642349229613161, "grad_norm": 0.1201171875, "learning_rate": 6.426140656753621e-07, "lm_loss": 1.892, "loss": 2.0291, "mask_loss": 0.1251, "step": 2426, "topk_loss": 0.0119 }, { "epoch": 0.9646323817094452, "grad_norm": 0.115234375, "learning_rate": 6.282694833983138e-07, "lm_loss": 1.8803, "loss": 2.0187, "mask_loss": 0.127, "step": 2427, "topk_loss": 0.0114 }, { "epoch": 0.9650298404575743, "grad_norm": 0.1220703125, "learning_rate": 6.140863104726391e-07, "lm_loss": 1.9055, "loss": 2.045, "mask_loss": 0.1271, "step": 2428, "topk_loss": 0.0124 }, { "epoch": 0.9654272992057036, "grad_norm": 0.11865234375, "learning_rate": 6.000645699360541e-07, "lm_loss": 1.9126, "loss": 2.0477, "mask_loss": 0.125, "step": 2429, "topk_loss": 0.0101 }, { "epoch": 0.9658247579538327, "grad_norm": 0.109375, "learning_rate": 5.862042845640403e-07, "lm_loss": 1.9581, "loss": 2.0898, "mask_loss": 0.1228, "step": 2430, "topk_loss": 0.0089 }, { "epoch": 0.9662222167019618, "grad_norm": 0.185546875, "learning_rate": 5.725054768698557e-07, "lm_loss": 1.9026, "loss": 2.0387, "mask_loss": 0.1244, "step": 2431, "topk_loss": 0.0117 }, { "epoch": 0.966619675450091, "grad_norm": 0.1142578125, "learning_rate": 5.589681691044346e-07, "lm_loss": 1.9114, "loss": 2.0518, "mask_loss": 0.1274, "step": 2432, "topk_loss": 0.0131 }, { "epoch": 0.9670171341982201, "grad_norm": 0.11767578125, "learning_rate": 5.455923832564214e-07, "lm_loss": 1.8213, "loss": 1.9585, "mask_loss": 0.1263, "step": 2433, "topk_loss": 0.0109 }, { "epoch": 0.9674145929463492, "grad_norm": 0.1220703125, "learning_rate": 5.323781410520812e-07, "lm_loss": 1.92, "loss": 2.0552, "mask_loss": 0.1256, "step": 2434, "topk_loss": 0.0096 }, { "epoch": 0.9678120516944785, "grad_norm": 0.11181640625, "learning_rate": 5.19325463955278e-07, "lm_loss": 1.9386, "loss": 2.0739, "mask_loss": 0.1254, "step": 2435, "topk_loss": 0.0099 }, { "epoch": 0.9682095104426076, "grad_norm": 0.11572265625, "learning_rate": 5.064343731674637e-07, "lm_loss": 1.8839, "loss": 2.0184, "mask_loss": 0.1259, "step": 2436, "topk_loss": 0.0086 }, { "epoch": 0.9686069691907367, "grad_norm": 0.1279296875, "learning_rate": 4.93704889627622e-07, "lm_loss": 1.9107, "loss": 2.0489, "mask_loss": 0.1263, "step": 2437, "topk_loss": 0.0119 }, { "epoch": 0.9690044279388659, "grad_norm": 0.1279296875, "learning_rate": 4.811370340122134e-07, "lm_loss": 1.8769, "loss": 2.0172, "mask_loss": 0.1284, "step": 2438, "topk_loss": 0.0119 }, { "epoch": 0.969401886686995, "grad_norm": 0.1298828125, "learning_rate": 4.6873082673521975e-07, "lm_loss": 1.8932, "loss": 2.0289, "mask_loss": 0.125, "step": 2439, "topk_loss": 0.0107 }, { "epoch": 0.9697993454351241, "grad_norm": 0.11328125, "learning_rate": 4.564862879479881e-07, "lm_loss": 1.8464, "loss": 1.9827, "mask_loss": 0.1256, "step": 2440, "topk_loss": 0.0108 }, { "epoch": 0.9701968041832533, "grad_norm": 0.146484375, "learning_rate": 4.4440343753933135e-07, "lm_loss": 1.8917, "loss": 2.0308, "mask_loss": 0.1269, "step": 2441, "topk_loss": 0.0122 }, { "epoch": 0.9705942629313825, "grad_norm": 0.1162109375, "learning_rate": 4.324822951353946e-07, "lm_loss": 1.8782, "loss": 2.0162, "mask_loss": 0.1272, "step": 2442, "topk_loss": 0.0108 }, { "epoch": 0.9709917216795116, "grad_norm": 0.11572265625, "learning_rate": 4.2072288009966654e-07, "lm_loss": 1.8861, "loss": 2.0231, "mask_loss": 0.1266, "step": 2443, "topk_loss": 0.0103 }, { "epoch": 0.9713891804276408, "grad_norm": 0.25390625, "learning_rate": 4.091252115329569e-07, "lm_loss": 1.8934, "loss": 2.0477, "mask_loss": 0.1333, "step": 2444, "topk_loss": 0.0211 }, { "epoch": 0.9717866391757699, "grad_norm": 0.1142578125, "learning_rate": 3.976893082733413e-07, "lm_loss": 1.8902, "loss": 2.0255, "mask_loss": 0.1259, "step": 2445, "topk_loss": 0.0094 }, { "epoch": 0.972184097923899, "grad_norm": 0.1513671875, "learning_rate": 3.864151888961387e-07, "lm_loss": 1.9002, "loss": 2.0379, "mask_loss": 0.1258, "step": 2446, "topk_loss": 0.0119 }, { "epoch": 0.9725815566720282, "grad_norm": 0.10498046875, "learning_rate": 3.7530287171387843e-07, "lm_loss": 1.8868, "loss": 2.024, "mask_loss": 0.1269, "step": 2447, "topk_loss": 0.0103 }, { "epoch": 0.9729790154201574, "grad_norm": 0.12890625, "learning_rate": 3.6435237477627784e-07, "lm_loss": 1.8634, "loss": 1.998, "mask_loss": 0.1254, "step": 2448, "topk_loss": 0.0092 }, { "epoch": 0.9733764741682865, "grad_norm": 0.1259765625, "learning_rate": 3.5356371587021987e-07, "lm_loss": 1.9244, "loss": 2.0558, "mask_loss": 0.1229, "step": 2449, "topk_loss": 0.0085 }, { "epoch": 0.9737739329164157, "grad_norm": 0.109375, "learning_rate": 3.429369125197091e-07, "lm_loss": 1.9097, "loss": 2.044, "mask_loss": 0.1255, "step": 2450, "topk_loss": 0.0089 }, { "epoch": 0.9737739329164157, "eval_lm_loss": 687.3634033203125, "eval_loss": 687.5003662109375, "eval_mask_hit_rate": 0.535973310470581, "eval_mask_loss": 0.12440047413110733, "eval_mask_top_10_hit_rate": 0.985601544380188, "eval_mask_top_1_hit_rate": 0.997429609298706, "eval_mask_top_20_hit_rate": 0.9762364625930786, "eval_mask_top_5_hit_rate": 0.9908676147460938, "eval_runtime": 143.7229, "eval_samples_per_second": 14.25, "eval_steps_per_second": 7.125, "eval_token_accuracy": 0.6146076917648315, "eval_top_k_diff": -530.109619140625, "eval_topk_loss": 0.012541374191641808, "step": 2450 }, { "epoch": 0.9741713916645448, "grad_norm": 0.146484375, "learning_rate": 3.3247198198583793e-07, "lm_loss": 1.9452, "loss": 2.0893, "mask_loss": 0.1291, "step": 2451, "topk_loss": 0.0149 }, { "epoch": 0.9745688504126739, "grad_norm": 0.12255859375, "learning_rate": 3.221689412667872e-07, "lm_loss": 1.8813, "loss": 2.0189, "mask_loss": 0.1272, "step": 2452, "topk_loss": 0.0104 }, { "epoch": 0.9749663091608031, "grad_norm": 0.1181640625, "learning_rate": 3.1202780709775894e-07, "lm_loss": 1.8595, "loss": 1.9945, "mask_loss": 0.1261, "step": 2453, "topk_loss": 0.0089 }, { "epoch": 0.9753637679089323, "grad_norm": 0.1611328125, "learning_rate": 3.020485959509989e-07, "lm_loss": 1.8525, "loss": 1.9929, "mask_loss": 0.1275, "step": 2454, "topk_loss": 0.0129 }, { "epoch": 0.9757612266570614, "grad_norm": 0.11669921875, "learning_rate": 2.9223132403570773e-07, "lm_loss": 1.9263, "loss": 2.0603, "mask_loss": 0.1246, "step": 2455, "topk_loss": 0.0094 }, { "epoch": 0.9761586854051906, "grad_norm": 0.1240234375, "learning_rate": 2.825760072980632e-07, "lm_loss": 1.9789, "loss": 2.1153, "mask_loss": 0.1253, "step": 2456, "topk_loss": 0.0112 }, { "epoch": 0.9765561441533197, "grad_norm": 0.11572265625, "learning_rate": 2.7308266142119785e-07, "lm_loss": 1.9255, "loss": 2.0597, "mask_loss": 0.1245, "step": 2457, "topk_loss": 0.0098 }, { "epoch": 0.9769536029014488, "grad_norm": 0.126953125, "learning_rate": 2.637513018251325e-07, "lm_loss": 1.9166, "loss": 2.0541, "mask_loss": 0.1268, "step": 2458, "topk_loss": 0.0107 }, { "epoch": 0.977351061649578, "grad_norm": 0.14453125, "learning_rate": 2.545819436667651e-07, "lm_loss": 1.8749, "loss": 2.0101, "mask_loss": 0.1256, "step": 2459, "topk_loss": 0.0096 }, { "epoch": 0.9777485203977072, "grad_norm": 0.1171875, "learning_rate": 2.45574601839893e-07, "lm_loss": 1.921, "loss": 2.0566, "mask_loss": 0.1262, "step": 2460, "topk_loss": 0.0094 }, { "epoch": 0.9781459791458363, "grad_norm": 0.12890625, "learning_rate": 2.3672929097512396e-07, "lm_loss": 1.859, "loss": 1.9979, "mask_loss": 0.1273, "step": 2461, "topk_loss": 0.0116 }, { "epoch": 0.9785434378939655, "grad_norm": 0.1123046875, "learning_rate": 2.280460254398764e-07, "lm_loss": 1.8034, "loss": 1.9411, "mask_loss": 0.1275, "step": 2462, "topk_loss": 0.0102 }, { "epoch": 0.9789408966420946, "grad_norm": 0.12109375, "learning_rate": 2.19524819338357e-07, "lm_loss": 1.8752, "loss": 2.0141, "mask_loss": 0.1273, "step": 2463, "topk_loss": 0.0116 }, { "epoch": 0.9793383553902237, "grad_norm": 0.11376953125, "learning_rate": 2.1116568651156076e-07, "lm_loss": 1.884, "loss": 2.0205, "mask_loss": 0.1264, "step": 2464, "topk_loss": 0.0101 }, { "epoch": 0.9797358141383529, "grad_norm": 0.10498046875, "learning_rate": 2.0296864053721555e-07, "lm_loss": 1.8769, "loss": 2.0129, "mask_loss": 0.1257, "step": 2465, "topk_loss": 0.0103 }, { "epoch": 0.9801332728864821, "grad_norm": 0.11474609375, "learning_rate": 1.9493369472977086e-07, "lm_loss": 1.9309, "loss": 2.0648, "mask_loss": 0.1241, "step": 2466, "topk_loss": 0.0098 }, { "epoch": 0.9805307316346112, "grad_norm": 0.1103515625, "learning_rate": 1.8706086214036467e-07, "lm_loss": 1.9119, "loss": 2.0473, "mask_loss": 0.1258, "step": 2467, "topk_loss": 0.0097 }, { "epoch": 0.9809281903827404, "grad_norm": 0.1171875, "learning_rate": 1.7935015555683444e-07, "lm_loss": 1.9403, "loss": 2.0755, "mask_loss": 0.1246, "step": 2468, "topk_loss": 0.0106 }, { "epoch": 0.9813256491308695, "grad_norm": 0.1123046875, "learning_rate": 1.718015875036727e-07, "lm_loss": 1.9041, "loss": 2.0384, "mask_loss": 0.1253, "step": 2469, "topk_loss": 0.009 }, { "epoch": 0.9817231078789986, "grad_norm": 0.11474609375, "learning_rate": 1.6441517024200492e-07, "lm_loss": 1.8488, "loss": 1.9795, "mask_loss": 0.1221, "step": 2470, "topk_loss": 0.0086 }, { "epoch": 0.9821205666271278, "grad_norm": 0.1298828125, "learning_rate": 1.5719091576957835e-07, "lm_loss": 1.9321, "loss": 2.0716, "mask_loss": 0.1265, "step": 2471, "topk_loss": 0.013 }, { "epoch": 0.9825180253752569, "grad_norm": 0.119140625, "learning_rate": 1.5012883582073976e-07, "lm_loss": 1.8822, "loss": 2.0182, "mask_loss": 0.1261, "step": 2472, "topk_loss": 0.0099 }, { "epoch": 0.9829154841233861, "grad_norm": 0.119140625, "learning_rate": 1.4322894186640236e-07, "lm_loss": 1.9432, "loss": 2.0766, "mask_loss": 0.1222, "step": 2473, "topk_loss": 0.0112 }, { "epoch": 0.9833129428715153, "grad_norm": 0.109375, "learning_rate": 1.3649124511406764e-07, "lm_loss": 1.9177, "loss": 2.0534, "mask_loss": 0.1258, "step": 2474, "topk_loss": 0.01 }, { "epoch": 0.9837104016196444, "grad_norm": 0.1328125, "learning_rate": 1.299157565077702e-07, "lm_loss": 1.8753, "loss": 2.0122, "mask_loss": 0.1263, "step": 2475, "topk_loss": 0.0106 }, { "epoch": 0.9841078603677735, "grad_norm": 0.1513671875, "learning_rate": 1.2350248672804433e-07, "lm_loss": 1.8758, "loss": 2.0142, "mask_loss": 0.1289, "step": 2476, "topk_loss": 0.0095 }, { "epoch": 0.9845053191159027, "grad_norm": 0.12255859375, "learning_rate": 1.1725144619197937e-07, "lm_loss": 1.9304, "loss": 2.0647, "mask_loss": 0.1253, "step": 2477, "topk_loss": 0.009 }, { "epoch": 0.9849027778640318, "grad_norm": 0.11279296875, "learning_rate": 1.1116264505310891e-07, "lm_loss": 1.8625, "loss": 1.9944, "mask_loss": 0.1231, "step": 2478, "topk_loss": 0.0088 }, { "epoch": 0.985300236612161, "grad_norm": 0.11328125, "learning_rate": 1.0523609320147732e-07, "lm_loss": 1.8549, "loss": 1.9947, "mask_loss": 0.1293, "step": 2479, "topk_loss": 0.0106 }, { "epoch": 0.9856976953602902, "grad_norm": 0.10546875, "learning_rate": 9.947180026357305e-08, "lm_loss": 1.858, "loss": 1.9912, "mask_loss": 0.1249, "step": 2480, "topk_loss": 0.0084 }, { "epoch": 0.9860951541084193, "grad_norm": 0.1103515625, "learning_rate": 9.386977560232879e-08, "lm_loss": 1.8755, "loss": 2.0128, "mask_loss": 0.1277, "step": 2481, "topk_loss": 0.0096 }, { "epoch": 0.9864926128565484, "grad_norm": 0.109375, "learning_rate": 8.843002831709912e-08, "lm_loss": 1.8747, "loss": 2.0086, "mask_loss": 0.1244, "step": 2482, "topk_loss": 0.0095 }, { "epoch": 0.9868900716046776, "grad_norm": 0.12353515625, "learning_rate": 8.31525672436606e-08, "lm_loss": 1.9447, "loss": 2.0818, "mask_loss": 0.1265, "step": 2483, "topk_loss": 0.0106 }, { "epoch": 0.9872875303528067, "grad_norm": 0.12060546875, "learning_rate": 7.803740095417844e-08, "lm_loss": 1.8844, "loss": 2.0219, "mask_loss": 0.1271, "step": 2484, "topk_loss": 0.0104 }, { "epoch": 0.9876849891009359, "grad_norm": 0.1181640625, "learning_rate": 7.308453775721758e-08, "lm_loss": 1.953, "loss": 2.0876, "mask_loss": 0.1245, "step": 2485, "topk_loss": 0.01 }, { "epoch": 0.9880824478490651, "grad_norm": 0.107421875, "learning_rate": 6.829398569770939e-08, "lm_loss": 1.8718, "loss": 2.0061, "mask_loss": 0.125, "step": 2486, "topk_loss": 0.0093 }, { "epoch": 0.9884799065971942, "grad_norm": 0.10498046875, "learning_rate": 6.366575255694062e-08, "lm_loss": 1.8023, "loss": 1.9392, "mask_loss": 0.127, "step": 2487, "topk_loss": 0.0099 }, { "epoch": 0.9888773653453233, "grad_norm": 0.11669921875, "learning_rate": 5.919984585253113e-08, "lm_loss": 1.8762, "loss": 2.0138, "mask_loss": 0.1271, "step": 2488, "topk_loss": 0.0105 }, { "epoch": 0.9892748240934525, "grad_norm": 0.11181640625, "learning_rate": 5.4896272838445004e-08, "lm_loss": 1.9008, "loss": 2.0385, "mask_loss": 0.1282, "step": 2489, "topk_loss": 0.0094 }, { "epoch": 0.9896722828415816, "grad_norm": 0.10986328125, "learning_rate": 5.075504050499058e-08, "lm_loss": 1.9343, "loss": 2.0691, "mask_loss": 0.1255, "step": 2490, "topk_loss": 0.0093 }, { "epoch": 0.9900697415897108, "grad_norm": 0.11474609375, "learning_rate": 4.677615557874271e-08, "lm_loss": 1.8501, "loss": 1.9853, "mask_loss": 0.1257, "step": 2491, "topk_loss": 0.0094 }, { "epoch": 0.99046720033784, "grad_norm": 0.1123046875, "learning_rate": 4.295962452262048e-08, "lm_loss": 1.8687, "loss": 2.0074, "mask_loss": 0.1276, "step": 2492, "topk_loss": 0.0111 }, { "epoch": 0.9908646590859691, "grad_norm": 0.10986328125, "learning_rate": 3.93054535357873e-08, "lm_loss": 1.913, "loss": 2.0455, "mask_loss": 0.1233, "step": 2493, "topk_loss": 0.0092 }, { "epoch": 0.9912621178340982, "grad_norm": 0.1162109375, "learning_rate": 3.5813648553717494e-08, "lm_loss": 1.8775, "loss": 2.0153, "mask_loss": 0.1268, "step": 2494, "topk_loss": 0.011 }, { "epoch": 0.9916595765822274, "grad_norm": 0.11474609375, "learning_rate": 3.2484215248140824e-08, "lm_loss": 1.9635, "loss": 2.1013, "mask_loss": 0.1252, "step": 2495, "topk_loss": 0.0126 }, { "epoch": 0.9920570353303565, "grad_norm": 0.1171875, "learning_rate": 2.9317159027064666e-08, "lm_loss": 1.8992, "loss": 2.0349, "mask_loss": 0.1254, "step": 2496, "topk_loss": 0.0103 }, { "epoch": 0.9924544940784856, "grad_norm": 0.1123046875, "learning_rate": 2.6312485034718504e-08, "lm_loss": 1.9095, "loss": 2.0447, "mask_loss": 0.1253, "step": 2497, "topk_loss": 0.0099 }, { "epoch": 0.9928519528266149, "grad_norm": 0.11279296875, "learning_rate": 2.347019815158724e-08, "lm_loss": 1.8812, "loss": 2.0148, "mask_loss": 0.1252, "step": 2498, "topk_loss": 0.0085 }, { "epoch": 0.993249411574744, "grad_norm": 0.125, "learning_rate": 2.0790302994411204e-08, "lm_loss": 1.816, "loss": 1.957, "mask_loss": 0.1293, "step": 2499, "topk_loss": 0.0118 }, { "epoch": 0.9936468703228731, "grad_norm": 0.12353515625, "learning_rate": 1.827280391611952e-08, "lm_loss": 1.8924, "loss": 2.0265, "mask_loss": 0.1254, "step": 2500, "topk_loss": 0.0087 }, { "epoch": 0.9936468703228731, "eval_lm_loss": 687.3634033203125, "eval_loss": 687.5003662109375, "eval_mask_hit_rate": 0.535973310470581, "eval_mask_loss": 0.12440047413110733, "eval_mask_top_10_hit_rate": 0.9856016039848328, "eval_mask_top_1_hit_rate": 0.997429609298706, "eval_mask_top_20_hit_rate": 0.9762364625930786, "eval_mask_top_5_hit_rate": 0.9908676147460938, "eval_runtime": 144.2467, "eval_samples_per_second": 14.198, "eval_steps_per_second": 7.099, "eval_token_accuracy": 0.6146076917648315, "eval_top_k_diff": -530.1094970703125, "eval_topk_loss": 0.012541375122964382, "step": 2500 }, { "epoch": 0.9940443290710023, "grad_norm": 0.109375, "learning_rate": 1.591770500589673e-08, "lm_loss": 1.8376, "loss": 1.9718, "mask_loss": 0.1253, "step": 2501, "topk_loss": 0.0089 }, { "epoch": 0.9944417878191314, "grad_norm": 0.1162109375, "learning_rate": 1.3725010089116198e-08, "lm_loss": 1.8811, "loss": 2.0195, "mask_loss": 0.1277, "step": 2502, "topk_loss": 0.0107 }, { "epoch": 0.9948392465672605, "grad_norm": 0.125, "learning_rate": 1.1694722727384477e-08, "lm_loss": 1.8401, "loss": 1.9748, "mask_loss": 0.126, "step": 2503, "topk_loss": 0.0087 }, { "epoch": 0.9952367053153898, "grad_norm": 0.1376953125, "learning_rate": 9.82684621847474e-09, "lm_loss": 1.9323, "loss": 2.0686, "mask_loss": 0.1253, "step": 2504, "topk_loss": 0.011 }, { "epoch": 0.9956341640635189, "grad_norm": 0.12890625, "learning_rate": 8.121383596393362e-09, "lm_loss": 1.8994, "loss": 2.0352, "mask_loss": 0.1258, "step": 2505, "topk_loss": 0.0101 }, { "epoch": 0.996031622811648, "grad_norm": 0.10791015625, "learning_rate": 6.578337631313325e-09, "lm_loss": 1.8886, "loss": 2.0258, "mask_loss": 0.1274, "step": 2506, "topk_loss": 0.0098 }, { "epoch": 0.9964290815597772, "grad_norm": 0.1142578125, "learning_rate": 5.197710829596414e-09, "lm_loss": 1.8999, "loss": 2.0346, "mask_loss": 0.1258, "step": 2507, "topk_loss": 0.0088 }, { "epoch": 0.9968265403079063, "grad_norm": 0.11328125, "learning_rate": 3.9795054337932184e-09, "lm_loss": 1.8878, "loss": 2.0225, "mask_loss": 0.1253, "step": 2508, "topk_loss": 0.0094 }, { "epoch": 0.9972239990560354, "grad_norm": 0.1103515625, "learning_rate": 2.9237234226431322e-09, "lm_loss": 1.9018, "loss": 2.0338, "mask_loss": 0.1234, "step": 2509, "topk_loss": 0.0086 }, { "epoch": 0.9976214578041647, "grad_norm": 0.11181640625, "learning_rate": 2.0303665110410484e-09, "lm_loss": 1.8768, "loss": 2.0138, "mask_loss": 0.1267, "step": 2510, "topk_loss": 0.0102 }, { "epoch": 0.9980189165522938, "grad_norm": 0.1318359375, "learning_rate": 1.2994361500706652e-09, "lm_loss": 1.8978, "loss": 2.0396, "mask_loss": 0.1287, "step": 2511, "topk_loss": 0.0131 }, { "epoch": 0.9984163753004229, "grad_norm": 0.10693359375, "learning_rate": 7.309335269822804e-10, "lm_loss": 1.8741, "loss": 2.0061, "mask_loss": 0.1239, "step": 2512, "topk_loss": 0.008 }, { "epoch": 0.9988138340485521, "grad_norm": 0.11474609375, "learning_rate": 3.2485956518168994e-10, "lm_loss": 1.9016, "loss": 2.0378, "mask_loss": 0.1248, "step": 2513, "topk_loss": 0.0114 }, { "epoch": 0.9992112927966812, "grad_norm": 0.115234375, "learning_rate": 8.121492427459742e-11, "lm_loss": 1.9225, "loss": 2.0591, "mask_loss": 0.1265, "step": 2514, "topk_loss": 0.0101 }, { "epoch": 0.9996087515448103, "grad_norm": 0.111328125, "learning_rate": 0.0, "lm_loss": 1.9003, "loss": 2.0373, "mask_loss": 0.1264, "step": 2515, "topk_loss": 0.0106 } ], "logging_steps": 1, "max_steps": 2515, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5540471456590725e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }