diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -389,4 +389,65 @@ command outputs: 71%|███████ | 7600/10682 [1:14:06<25:23, 2.02it/s] 71%|███████ | 7601/10682 [1:14:07<25:23, 2.02it/s] 71%|███████ | 7602/10682 [1:14:07<25:28, 2.02it/s] 71%|███████ | 7603/10682 [1:14:08<25:25, 2.02it/s] 71%|███████ | 7604/10682 [1:14:08<25:24, 2.02it/s] 71%|███████ | 7605/10682 [1:14:09<25:21, 2.02it/s] 71%|███████ | 7606/10682 [1:14:09<25:21, 2.02it/s] 71%|███████ | 7607/10682 [1:14:10<25:20, 2.02it/s] 71%|███████ | 7608/10682 [1:14:10<25:22, 2.02it/s] 71%|███████ | 7609/10682 [1:14:11<25:20, 2.02it/s] 71%|███████ | 7610/10682 [1:14:11<25:20, 2.02it/s] 71%|███████▏ | 7611/10682 [1:14:12<25:18, 2.02it/s] 71%|███████▏ | 7612/10682 [1:14:12<25:19, 2.02it/s] 71%|███████▏ | 7613/10682 [1:14:13<25:18, 2.02it/s] 71%|███████▏ | 7614/10682 [1:14:13<25:18, 2.02it/s] 71%|███████▏ | 7615/10682 [1:14:14<25:18, 2.02it/s] 71%|███████▏ | 7616/10682 [1:14:14<25:18, 2.02it/s] 71%|███████▏ | 7617/10682 [1:14:15<25:18, 2.02it/s] 71%|███████▏ | 7618/10682 [1:14:15<25:17, 2.02it/s] 71%|███████▏ | 7619/10682 [1:14:16<25:17, 2.02it/s] 71%|███████▏ | 7620/10682 [1:14:16<25:16, 2.02it/s] 71%|███████▏ | 7621/10682 [1:14:17<25:17, 2.02it/s] 71%|███████▏ | 7622/10682 [1:14:17<25:15, 2.02it/s] 71%|███████▏ | 7623/10682 [1:14:18<25:15, 2.02it/s] 71%|███████▏ | 7624/10682 [1:14:18<25:13, 2.02it/s] 71%|███████▏ | 7625/10682 [1:14:19<25:14, 2.02it/s]{'loss': 2.8268, 'grad_norm': 0.2409941554069519, 'learning_rate': 0.00022944844326774121, 'epoch': 9.99} 71%|███████▏ | 7625/10682 [1:14:19<25:14, 2.02it/s] 71%|███████▏ | 7626/10682 [1:14:19<25:20, 2.01it/s] 71%|███████▏ | 7627/10682 [1:14:20<25:16, 2.01it/s] 71%|███████▏ | 7628/10682 [1:14:20<25:17, 2.01it/s] 71%|███████▏ | 7629/10682 [1:14:21<25:13, 2.02it/s] 71%|███████▏ | 7630/10682 [1:14:21<24:57, 2.04it/s] 71%|███████▏ | 7631/10682 [1:15:37<19:34:31, 23.10s/it] 71%|███████▏ | 7632/10682 [1:15:37<13:49:27, 16.32s/it] 71%|███████▏ | 7633/10682 [1:15:38<9:47:57, 11.57s/it] 71%|███████▏ | 7634/10682 [1:15:38<6:59:04, 8.25s/it] 71%|███████▏ | 7635/10682 [1:15:39<5:00:57, 5.93s/it] 71%|███████▏ | 7636/10682 [1:15:39<3:38:07, 4.30s/it] 71%|███████▏ | 7637/10682 [1:15:40<2:40:09, 3.16s/it] 72%|███████▏ | 7638/10682 [1:15:40<1:59:44, 2.36s/it] 72%|███████▏ | 7639/10682 [1:15:41<1:31:17, 1.80s/it] 72%|███████▏ | 7640/10682 [1:15:41<1:11:25, 1.41s/it] 72%|███████▏ | 7641/10682 [1:15:42<57:30, 1.13s/it] 72%|███████▏ | 7642/10682 [1:15:42<47:44, 1.06it/s] 72%|███████▏ | 7643/10682 [1:15:43<40:55, 1.24it/s] 72%|███████▏ | 7644/10682 [1:15:43<36:09, 1.40it/s] 72%|███████▏ | 7645/10682 [1:15:44<32:47, 1.54it/s] 72%|███████▏ | 7646/10682 [1:15:44<30:27, 1.66it/s] 72%|███████▏ | 7647/10682 [1:15:45<28:50, 1.75it/s] 72%|███████▏ | 7648/10682 [1:15:45<27:41, 1.83it/s] 72%|███████▏ | 7649/10682 [1:15:46<26:59, 1.87it/s] 72%|███████▏ | 7650/10682 [1:15:46<26:21, 1.92it/s] {'loss': 2.7373, 'grad_norm': 0.24934782087802887, 'learning_rate': 0.00022602213475715589, 'epoch': 10.03} 72%|███████▏ | 7650/10682 [1:15:46<26:21, 1.92it/s] 72%|███████▏ | 7651/10682 [1:15:47<26:03, 1.94it/s] 72%|███████▏ | 7652/10682 [1:15:47<25:49, 1.96it/s] 72%|███████▏ | 7653/10682 [1:15:48<27:34, 1.83it/s] 72%|███████▏ | 7654/10682 [1:15:49<26:49, 1.88it/s] 72%|███████▏ | 7655/10682 [1:15:49<26:16, 1.92it/s] 72%|███████▏ | 7656/10682 [1:15:50<25:52, 1.95it/s] 72%|███████▏ | 7657/10682 [1:15:50<25:35, 1.97it/s] 72%|███████▏ | 7658/10682 [1:15:51<25:23, 1.98it/s] 72%|███████▏ | 7659/10682 [1:15:51<25:14, 2.00it/s] 72%|███████▏ | 7660/10682 [1:15:52<25:08, 2.00it/s] 72%|███████▏ | 7661/10682 [1:15:52<25:12, 2.00it/s] 72%|███████▏ | 7662/10682 [1:15:53<25:11, 2.00it/s] 72%|███████▏ | 7663/10682 [1:15:53<25:07, 2.00it/s] 72%|███████▏ | 7664/10682 [1:15:54<25:04, 2.01it/s] 72%|███████▏ | 7665/10682 [1:15:54<25:03, 2.01it/s] 72%|███████▏ | 7666/10682 [1:15:55<24:58, 2.01it/s] 72%|███████▏ | 7667/10682 [1:15:55<24:57, 2.01it/s] 72%|███████▏ | 7668/10682 [1:15:56<24:54, 2.02it/s] 72%|███████▏ | 7669/10682 [1:15:56<24:53, 2.02it/s] 72%|███████▏ | 7670/10682 [1:15:56<24:51, 2.02it/s] 72%|███████▏ | 7671/10682 [1:15:57<24:52, 2.02it/s] 72%|███████▏ | 7672/10682 [1:15:57<24:52, 2.02it/s] 72%|███████▏ | 7673/10682 [1:15:58<24:55, 2.01it/s] 72%|███████▏ | 7674/10682 [1:15:58<24:51, 2.02it/s] 72%|███████▏ | 7675/10682 [1:15:59<24:50, 2.02it/s] {'loss': 2.7203, 'grad_norm': 0.24969111382961273, 'learning_rate': 0.0002226141146140523, 'epoch': 10.06} - 72%|███████▏ | 7675/10682 [1:15:59<24:50, 2.02it/s] 72%|███████▏ | 7676/10682 [1:15:59<24:58, 2.01it/s] 72%|███████▏ | 7677/10682 [1:16:00<24:53, 2.01it/s] 72%|███████▏ | 7678/10682 [1:16:00<24:53, 2.01it/s] 72%|███████▏ | 7679/10682 [1:16:01<24:51, 2.01it/s] \ No newline at end of file + 72%|███████▏ | 7675/10682 [1:15:59<24:50, 2.02it/s] 72%|███████▏ | 7676/10682 [1:15:59<24:58, 2.01it/s] 72%|███████▏ | 7677/10682 [1:16:00<24:53, 2.01it/s] 72%|███████▏ | 7678/10682 [1:16:00<24:53, 2.01it/s] 72%|███████▏ | 7679/10682 [1:16:01<24:51, 2.01it/s] 72%|███████▏ | 7680/10682 [1:16:01<24:50, 2.01it/s] 72%|███████▏ | 7681/10682 [1:16:02<24:46, 2.02it/s] 72%|███████▏ | 7682/10682 [1:16:02<24:45, 2.02it/s] 72%|███████▏ | 7683/10682 [1:16:03<24:43, 2.02it/s] 72%|███████▏ | 7684/10682 [1:16:03<24:44, 2.02it/s] 72%|███████▏ | 7685/10682 [1:16:04<24:45, 2.02it/s] 72%|███████▏ | 7686/10682 [1:16:04<24:44, 2.02it/s] 72%|███████▏ | 7687/10682 [1:16:05<24:45, 2.02it/s] 72%|███████▏ | 7688/10682 [1:16:05<24:43, 2.02it/s] 72%|███████▏ | 7689/10682 [1:16:06<24:43, 2.02it/s] 72%|███████▏ | 7690/10682 [1:16:06<24:41, 2.02it/s] 72%|███████▏ | 7691/10682 [1:16:07<24:40, 2.02it/s] 72%|███████▏ | 7692/10682 [1:16:07<24:39, 2.02it/s] 72%|███████▏ | 7693/10682 [1:16:08<24:37, 2.02it/s] 72%|███████▏ | 7694/10682 [1:16:08<24:36, 2.02it/s] 72%|███████▏ | 7695/10682 [1:16:09<24:36, 2.02it/s] 72%|███████▏ | 7696/10682 [1:16:09<24:36, 2.02it/s] 72%|███████▏ | 7697/10682 [1:16:10<24:34, 2.02it/s] 72%|███████▏ | 7698/10682 [1:16:10<24:34, 2.02it/s] 72%|███████▏ | 7699/10682 [1:16:11<24:33, 2.02it/s] 72%|███████▏ | 7700/10682 [1:16:11<24:35, 2.02it/s] {'loss': 2.7166, 'grad_norm': 0.24718637764453888, 'learning_rate': 0.00021922461032806601, 'epoch': 10.09} + 72%|███████▏ | 7700/10682 [1:16:11<24:35, 2.02it/s] 72%|███████▏ | 7701/10682 [1:16:12<24:38, 2.02it/s] 72%|███████▏ | 7702/10682 [1:16:12<24:36, 2.02it/s] 72%|███████▏ | 7703/10682 [1:16:13<24:34, 2.02it/s] 72%|███████▏ | 7704/10682 [1:16:13<24:32, 2.02it/s] 72%|███████▏ | 7705/10682 [1:16:14<24:31, 2.02it/s] 72%|███████▏ | 7706/10682 [1:16:14<24:32, 2.02it/s] 72%|███████▏ | 7707/10682 [1:16:15<24:31, 2.02it/s] 72%|███████▏ | 7708/10682 [1:16:15<24:31, 2.02it/s] 72%|███████▏ | 7709/10682 [1:16:16<24:29, 2.02it/s] 72%|███████▏ | 7710/10682 [1:16:16<24:28, 2.02it/s] 72%|███████▏ | 7711/10682 [1:16:17<24:29, 2.02it/s] 72%|███████▏ | 7712/10682 [1:16:17<24:29, 2.02it/s] 72%|███████▏ | 7713/10682 [1:16:18<24:27, 2.02it/s] 72%|███████▏ | 7714/10682 [1:16:18<24:28, 2.02it/s] 72%|███████▏ | 7715/10682 [1:16:19<24:27, 2.02it/s] 72%|███████▏ | 7716/10682 [1:16:19<24:27, 2.02it/s] 72%|███████▏ | 7717/10682 [1:16:20<24:26, 2.02it/s] 72%|███████▏ | 7718/10682 [1:16:20<24:25, 2.02it/s] 72%|███████▏ | 7719/10682 [1:16:21<24:24, 2.02it/s] 72%|███████▏ | 7720/10682 [1:16:21<24:25, 2.02it/s] 72%|███████▏ | 7721/10682 [1:16:22<24:24, 2.02it/s] 72%|███████▏ | 7722/10682 [1:16:22<24:24, 2.02it/s] 72%|███████▏ | 7723/10682 [1:16:23<24:22, 2.02it/s] 72%|███████▏ | 7724/10682 [1:16:23<24:23, 2.02it/s] 72%|███████▏ | 7725/10682 [1:16:24<24:22, 2.02it/s]{'loss': 2.7256, 'grad_norm': 0.2528446316719055, 'learning_rate': 0.0002158538481528759, 'epoch': 10.12} + 72%|███████▏ | 7725/10682 [1:16:24<24:22, 2.02it/s] 72%|███████▏ | 7726/10682 [1:16:24<24:23, 2.02it/s] 72%|███████▏ | 7727/10682 [1:16:25<24:22, 2.02it/s] 72%|███████▏ | 7728/10682 [1:16:25<24:22, 2.02it/s] 72%|███████▏ | 7729/10682 [1:16:26<24:20, 2.02it/s] 72%|███████▏ | 7730/10682 [1:16:26<24:20, 2.02it/s] 72%|███████▏ | 7731/10682 [1:16:27<24:19, 2.02it/s] 72%|███████▏ | 7732/10682 [1:16:27<24:20, 2.02it/s] 72%|███████▏ | 7733/10682 [1:16:28<24:18, 2.02it/s] 72%|███████▏ | 7734/10682 [1:16:28<24:19, 2.02it/s] 72%|███████▏ | 7735/10682 [1:16:29<24:18, 2.02it/s] 72%|███████▏ | 7736/10682 [1:16:29<24:17, 2.02it/s] 72%|███████▏ | 7737/10682 [1:16:30<24:17, 2.02it/s] 72%|███████▏ | 7738/10682 [1:16:30<24:17, 2.02it/s] 72%|███████▏ | 7739/10682 [1:16:31<24:16, 2.02it/s] 72%|███████▏ | 7740/10682 [1:16:31<24:15, 2.02it/s] 72%|███████▏ | 7741/10682 [1:16:32<24:13, 2.02it/s] 72%|███████▏ | 7742/10682 [1:16:32<24:13, 2.02it/s] 72%|███████▏ | 7743/10682 [1:16:33<24:12, 2.02it/s] 72%|███████▏ | 7744/10682 [1:16:33<24:15, 2.02it/s] 73%|███████▎ | 7745/10682 [1:16:34<24:13, 2.02it/s] 73%|███████▎ | 7746/10682 [1:16:34<24:14, 2.02it/s] 73%|███████▎ | 7747/10682 [1:16:35<24:11, 2.02it/s] 73%|███████▎ | 7748/10682 [1:16:35<24:11, 2.02it/s] 73%|███████▎ | 7749/10682 [1:16:36<24:09, 2.02it/s] 73%|███████▎ | 7750/10682 [1:16:36<24:10, 2.02it/s] {'loss': 2.7293, 'grad_norm': 0.2552899718284607, 'learning_rate': 0.00021250205309110155, 'epoch': 10.16} + 73%|███████▎ | 7750/10682 [1:16:36<24:10, 2.02it/s] 73%|███████▎ | 7751/10682 [1:16:37<24:11, 2.02it/s] 73%|███████▎ | 7752/10682 [1:16:37<24:10, 2.02it/s] 73%|███████▎ | 7753/10682 [1:16:38<24:08, 2.02it/s] 73%|███████▎ | 7754/10682 [1:16:38<24:09, 2.02it/s] 73%|███████▎ | 7755/10682 [1:16:39<24:09, 2.02it/s] 73%|███████▎ | 7756/10682 [1:16:39<24:07, 2.02it/s] 73%|███████▎ | 7757/10682 [1:16:40<24:06, 2.02it/s] 73%|███████▎ | 7758/10682 [1:16:40<24:05, 2.02it/s] 73%|███████▎ | 7759/10682 [1:16:41<24:04, 2.02it/s] 73%|███████▎ | 7760/10682 [1:16:41<24:03, 2.02it/s] 73%|███████▎ | 7761/10682 [1:16:42<24:02, 2.02it/s] 73%|███████▎ | 7762/10682 [1:16:42<24:03, 2.02it/s] 73%|███████▎ | 7763/10682 [1:16:43<24:02, 2.02it/s] 73%|███████▎ | 7764/10682 [1:16:43<24:01, 2.02it/s] 73%|███████▎ | 7765/10682 [1:16:44<24:00, 2.03it/s] 73%|███████▎ | 7766/10682 [1:16:44<24:01, 2.02it/s] 73%|███████▎ | 7767/10682 [1:16:44<24:02, 2.02it/s] 73%|███████▎ | 7768/10682 [1:16:45<24:03, 2.02it/s] 73%|███████▎ | 7769/10682 [1:16:45<24:01, 2.02it/s] 73%|███████▎ | 7770/10682 [1:16:46<24:01, 2.02it/s] 73%|███████▎ | 7771/10682 [1:16:46<23:59, 2.02it/s] 73%|███████▎ | 7772/10682 [1:16:47<24:00, 2.02it/s] 73%|███████▎ | 7773/10682 [1:16:47<23:59, 2.02it/s] 73%|███████▎ | 7774/10682 [1:16:48<23:58, 2.02it/s] 73%|███████▎ | 7775/10682 [1:16:48<23:57, 2.02it/s] {'loss': 2.7374, 'grad_norm': 0.25173649191856384, 'learning_rate': 0.00020916944887928359, 'epoch': 10.19} + 73%|███████▎ | 7775/10682 [1:16:48<23:57, 2.02it/s] 73%|███████▎ | 7776/10682 [1:16:49<23:58, 2.02it/s] 73%|███████▎ | 7777/10682 [1:16:49<23:58, 2.02it/s] 73%|███████▎ | 7778/10682 [1:16:50<23:57, 2.02it/s] 73%|███████▎ | 7779/10682 [1:16:50<23:56, 2.02it/s] 73%|███████▎ | 7780/10682 [1:16:51<23:55, 2.02it/s] 73%|███████▎ | 7781/10682 [1:16:51<23:54, 2.02it/s] 73%|███████▎ | 7782/10682 [1:16:52<23:54, 2.02it/s] 73%|███████▎ | 7783/10682 [1:16:52<23:55, 2.02it/s] 73%|███████▎ | 7784/10682 [1:16:53<23:53, 2.02it/s] 73%|███████▎ | 7785/10682 [1:16:53<23:52, 2.02it/s] 73%|███████▎ | 7786/10682 [1:16:54<23:51, 2.02it/s] 73%|███████▎ | 7787/10682 [1:16:54<23:51, 2.02it/s] 73%|███████▎ | 7788/10682 [1:16:55<23:50, 2.02it/s] 73%|███████▎ | 7789/10682 [1:16:55<23:49, 2.02it/s] 73%|███████▎ | 7790/10682 [1:16:56<23:49, 2.02it/s] 73%|███████▎ | 7791/10682 [1:16:56<23:49, 2.02it/s] 73%|███████▎ | 7792/10682 [1:16:57<23:49, 2.02it/s] 73%|███████▎ | 7793/10682 [1:16:57<23:48, 2.02it/s] 73%|███████▎ | 7794/10682 [1:16:58<23:47, 2.02it/s] 73%|███████▎ | 7795/10682 [1:16:58<23:46, 2.02it/s] 73%|███████▎ | 7796/10682 [1:16:59<23:46, 2.02it/s] 73%|███████▎ | 7797/10682 [1:16:59<23:45, 2.02it/s] 73%|███████▎ | 7798/10682 [1:17:00<23:46, 2.02it/s] 73%|███████▎ | 7799/10682 [1:17:00<23:46, 2.02it/s] 73%|███████▎ | 7800/10682 [1:17:01<23:46, 2.02it/s] {'loss': 2.7275, 'grad_norm': 0.24839505553245544, 'learning_rate': 0.00020585625797294927, 'epoch': 10.22} + 73%|███████▎ | 7800/10682 [1:17:01<23:46, 2.02it/s] 73%|███████▎ | 7801/10682 [1:17:01<23:52, 2.01it/s] 73%|███████▎ | 7802/10682 [1:17:02<23:49, 2.01it/s] 73%|███████▎ | 7803/10682 [1:17:02<23:50, 2.01it/s] 73%|███████▎ | 7804/10682 [1:17:03<23:48, 2.02it/s] 73%|███████▎ | 7805/10682 [1:17:03<23:46, 2.02it/s] 73%|███████▎ | 7806/10682 [1:17:04<23:43, 2.02it/s] 73%|███████▎ | 7807/10682 [1:17:04<23:44, 2.02it/s] 73%|███████▎ | 7808/10682 [1:17:05<23:41, 2.02it/s] 73%|███████▎ | 7809/10682 [1:17:05<23:42, 2.02it/s] 73%|███████▎ | 7810/10682 [1:17:06<23:40, 2.02it/s] 73%|███████▎ | 7811/10682 [1:17:06<23:40, 2.02it/s] 73%|███████▎ | 7812/10682 [1:17:07<23:38, 2.02it/s] 73%|███████▎ | 7813/10682 [1:17:07<23:38, 2.02it/s] 73%|███████▎ | 7814/10682 [1:17:08<23:37, 2.02it/s] 73%|███████▎ | 7815/10682 [1:17:08<23:38, 2.02it/s] 73%|███████▎ | 7816/10682 [1:17:09<23:36, 2.02it/s] 73%|███████▎ | 7817/10682 [1:17:09<23:38, 2.02it/s] 73%|███████▎ | 7818/10682 [1:17:10<23:36, 2.02it/s] 73%|███████▎ | 7819/10682 [1:17:10<23:36, 2.02it/s] 73%|███████▎ | 7820/10682 [1:17:11<23:34, 2.02it/s] 73%|███████▎ | 7821/10682 [1:17:11<23:35, 2.02it/s] 73%|███████▎ | 7822/10682 [1:17:12<23:34, 2.02it/s] 73%|███████▎ | 7823/10682 [1:17:12<23:34, 2.02it/s] 73%|███████▎ | 7824/10682 [1:17:13<23:33, 2.02it/s] 73%|███████▎ | 7825/10682 [1:17:13<23:33, 2.02it/s]{'loss': 2.7346, 'grad_norm': 0.2532835602760315, 'learning_rate': 0.00020256270153176371, 'epoch': 10.26} + 73%|███████▎ | 7825/10682 [1:17:13<23:33, 2.02it/s] 73%|███████▎ | 7826/10682 [1:17:14<23:36, 2.02it/s] 73%|███████▎ | 7827/10682 [1:17:14<23:37, 2.01it/s] 73%|███████▎ | 7828/10682 [1:17:15<23:34, 2.02it/s] 73%|███████▎ | 7829/10682 [1:17:15<23:33, 2.02it/s] 73%|███████▎ | 7830/10682 [1:17:16<23:31, 2.02it/s] 73%|███████▎ | 7831/10682 [1:17:16<23:31, 2.02it/s] 73%|███████▎ | 7832/10682 [1:17:17<23:29, 2.02it/s] 73%|███████▎ | 7833/10682 [1:17:17<23:30, 2.02it/s] 73%|███████▎ | 7834/10682 [1:17:18<23:29, 2.02it/s] 73%|███████▎ | 7835/10682 [1:17:18<23:30, 2.02it/s] 73%|███████▎ | 7836/10682 [1:17:19<23:30, 2.02it/s] 73%|███████▎ | 7837/10682 [1:17:19<23:29, 2.02it/s] 73%|███████▎ | 7838/10682 [1:17:20<23:29, 2.02it/s] 73%|███████▎ | 7839/10682 [1:17:20<23:28, 2.02it/s] 73%|███████▎ | 7840/10682 [1:17:21<23:26, 2.02it/s] 73%|███████▎ | 7841/10682 [1:17:21<23:26, 2.02it/s] 73%|███████▎ | 7842/10682 [1:17:22<23:25, 2.02it/s] 73%|███████▎ | 7843/10682 [1:17:22<23:25, 2.02it/s] 73%|███████▎ | 7844/10682 [1:17:23<23:23, 2.02it/s] 73%|███████▎ | 7845/10682 [1:17:23<23:23, 2.02it/s] 73%|███████▎ | 7846/10682 [1:17:24<23:22, 2.02it/s] 73%|███████▎ | 7847/10682 [1:17:24<23:23, 2.02it/s] 73%|███████▎ | 7848/10682 [1:17:25<23:22, 2.02it/s] 73%|███████▎ | 7849/10682 [1:17:25<23:22, 2.02it/s] 73%|███████▎ | 7850/10682 [1:17:26<23:21, 2.02it/s]{'loss': 2.7399, 'grad_norm': 0.24879726767539978, 'learning_rate': 0.00019928899940476624, 'epoch': 10.29} + 73%|███████▎ | 7850/10682 [1:17:26<23:21, 2.02it/s] 73%|███████▎ | 7851/10682 [1:17:26<23:23, 2.02it/s] 74%|███████▎ | 7852/10682 [1:17:27<23:22, 2.02it/s] 74%|███████▎ | 7853/10682 [1:17:27<23:22, 2.02it/s] 74%|███████▎ | 7854/10682 [1:17:28<23:21, 2.02it/s] 74%|███████▎ | 7855/10682 [1:17:28<23:19, 2.02it/s] 74%|███████▎ | 7856/10682 [1:17:29<23:18, 2.02it/s] 74%|███████▎ | 7857/10682 [1:17:29<23:18, 2.02it/s] 74%|███████▎ | 7858/10682 [1:17:30<23:18, 2.02it/s] 74%|███████▎ | 7859/10682 [1:17:30<23:18, 2.02it/s] 74%|███████▎ | 7860/10682 [1:17:31<23:17, 2.02it/s] 74%|███████▎ | 7861/10682 [1:17:31<23:17, 2.02it/s] 74%|███████▎ | 7862/10682 [1:17:32<23:15, 2.02it/s] 74%|███████▎ | 7863/10682 [1:17:32<23:14, 2.02it/s] 74%|███████▎ | 7864/10682 [1:17:33<23:14, 2.02it/s] 74%|███████▎ | 7865/10682 [1:17:33<23:14, 2.02it/s] 74%|███████▎ | 7866/10682 [1:17:33<23:12, 2.02it/s] 74%|███████▎ | 7867/10682 [1:17:34<23:12, 2.02it/s] 74%|███████▎ | 7868/10682 [1:17:34<23:13, 2.02it/s] 74%|███████▎ | 7869/10682 [1:17:35<23:12, 2.02it/s] 74%|███████▎ | 7870/10682 [1:17:35<23:11, 2.02it/s] 74%|███████▎ | 7871/10682 [1:17:36<23:11, 2.02it/s] 74%|███████▎ | 7872/10682 [1:17:36<23:10, 2.02it/s] 74%|███████▎ | 7873/10682 [1:17:37<23:11, 2.02it/s] 74%|███████▎ | 7874/10682 [1:17:37<23:09, 2.02it/s] 74%|███████▎ | 7875/10682 [1:17:38<23:10, 2.02it/s]{'loss': 2.7341, 'grad_norm': 0.25345608592033386, 'learning_rate': 0.00019603537011569566, 'epoch': 10.32} + 74%|███████▎ | 7875/10682 [1:17:38<23:10, 2.02it/s] 74%|███████▎ | 7876/10682 [1:17:38<23:10, 2.02it/s] 74%|███████▎ | 7877/10682 [1:17:39<23:10, 2.02it/s] 74%|███████▍ | 7878/10682 [1:17:39<23:09, 2.02it/s] 74%|███████▍ | 7879/10682 [1:17:40<23:08, 2.02it/s] 74%|███████▍ | 7880/10682 [1:17:40<23:05, 2.02it/s] 74%|███████▍ | 7881/10682 [1:17:41<23:05, 2.02it/s] 74%|███████▍ | 7882/10682 [1:17:41<23:04, 2.02it/s] 74%|███████▍ | 7883/10682 [1:17:42<23:03, 2.02it/s] 74%|███████▍ | 7884/10682 [1:17:42<23:05, 2.02it/s] 74%|███████▍ | 7885/10682 [1:17:43<23:06, 2.02it/s] 74%|███████▍ | 7886/10682 [1:17:43<23:04, 2.02it/s] 74%|███████▍ | 7887/10682 [1:17:44<23:05, 2.02it/s] 74%|███████▍ | 7888/10682 [1:17:44<23:03, 2.02it/s] 74%|███████▍ | 7889/10682 [1:17:45<23:03, 2.02it/s] 74%|███████▍ | 7890/10682 [1:17:45<23:03, 2.02it/s] 74%|███████▍ | 7891/10682 [1:17:46<23:02, 2.02it/s] 74%|███████▍ | 7892/10682 [1:17:46<23:01, 2.02it/s] 74%|███████▍ | 7893/10682 [1:17:47<23:01, 2.02it/s] 74%|███████▍ | 7894/10682 [1:17:47<23:00, 2.02it/s] 74%|███████▍ | 7895/10682 [1:17:48<23:00, 2.02it/s] 74%|███████▍ | 7896/10682 [1:17:48<22:59, 2.02it/s] 74%|███████▍ | 7897/10682 [1:17:49<22:57, 2.02it/s] 74%|███████▍ | 7898/10682 [1:17:49<22:58, 2.02it/s] 74%|███████▍ | 7899/10682 [1:17:50<22:57, 2.02it/s] 74%|███████▍ | 7900/10682 [1:17:50<22:58, 2.02it/s] {'loss': 2.7434, 'grad_norm': 0.25620734691619873, 'learning_rate': 0.0001928020308484042, 'epoch': 10.35} + 74%|███████▍ | 7900/10682 [1:17:50<22:58, 2.02it/s] 74%|███████▍ | 7901/10682 [1:17:51<22:57, 2.02it/s] 74%|███████▍ | 7902/10682 [1:17:51<22:57, 2.02it/s] 74%|███████▍ | 7903/10682 [1:17:52<22:55, 2.02it/s] 74%|███████▍ | 7904/10682 [1:17:52<22:55, 2.02it/s] 74%|███████▍ | 7905/10682 [1:17:53<22:53, 2.02it/s] 74%|███████▍ | 7906/10682 [1:17:53<22:56, 2.02it/s] 74%|███████▍ | 7907/10682 [1:17:54<22:55, 2.02it/s] 74%|███████▍ | 7908/10682 [1:17:54<22:54, 2.02it/s] 74%|███████▍ | 7909/10682 [1:17:55<22:53, 2.02it/s] 74%|███████▍ | 7910/10682 [1:17:55<22:52, 2.02it/s] 74%|███████▍ | 7911/10682 [1:17:56<22:51, 2.02it/s] 74%|███████▍ | 7912/10682 [1:17:56<22:50, 2.02it/s] 74%|███████▍ | 7913/10682 [1:17:57<22:50, 2.02it/s] 74%|███████▍ | 7914/10682 [1:17:57<22:49, 2.02it/s] 74%|███████▍ | 7915/10682 [1:17:58<22:49, 2.02it/s] 74%|███████▍ | 7916/10682 [1:17:58<22:47, 2.02it/s] 74%|███████▍ | 7917/10682 [1:17:59<22:48, 2.02it/s] 74%|███████▍ | 7918/10682 [1:17:59<22:47, 2.02it/s] 74%|███████▍ | 7919/10682 [1:18:00<22:49, 2.02it/s] 74%|███████▍ | 7920/10682 [1:18:00<22:47, 2.02it/s] 74%|███████▍ | 7921/10682 [1:18:01<22:46, 2.02it/s] 74%|███████▍ | 7922/10682 [1:18:01<22:46, 2.02it/s] 74%|███████▍ | 7923/10682 [1:18:02<22:45, 2.02it/s] 74%|███████▍ | 7924/10682 [1:18:02<22:45, 2.02it/s] 74%|███████▍ | 7925/10682 [1:18:03<22:45, 2.02it/s]{'loss': 2.7412, 'grad_norm': 0.24962928891181946, 'learning_rate': 0.00018958919743235897, 'epoch': 10.39} + 74%|███████▍ | 7925/10682 [1:18:03<22:45, 2.02it/s] 74%|███████▍ | 7926/10682 [1:18:03<22:46, 2.02it/s] 74%|███████▍ | 7927/10682 [1:18:04<22:46, 2.02it/s] 74%|███████▍ | 7928/10682 [1:18:04<22:43, 2.02it/s] 74%|███████▍ | 7929/10682 [1:18:05<22:43, 2.02it/s] 74%|███████▍ | 7930/10682 [1:18:05<22:42, 2.02it/s] 74%|███████▍ | 7931/10682 [1:18:06<22:42, 2.02it/s] 74%|███████▍ | 7932/10682 [1:18:06<22:41, 2.02it/s] 74%|███████▍ | 7933/10682 [1:18:07<22:40, 2.02it/s] 74%|███████▍ | 7934/10682 [1:18:07<22:40, 2.02it/s] 74%|███████▍ | 7935/10682 [1:18:08<22:39, 2.02it/s] 74%|███████▍ | 7936/10682 [1:18:08<22:39, 2.02it/s] 74%|███████▍ | 7937/10682 [1:18:09<22:38, 2.02it/s] 74%|███████▍ | 7938/10682 [1:18:09<22:37, 2.02it/s] 74%|███████▍ | 7939/10682 [1:18:10<22:36, 2.02it/s] 74%|███████▍ | 7940/10682 [1:18:10<22:35, 2.02it/s] 74%|███████▍ | 7941/10682 [1:18:11<22:34, 2.02it/s] 74%|███████▍ | 7942/10682 [1:18:11<22:35, 2.02it/s] 74%|███████▍ | 7943/10682 [1:18:12<22:35, 2.02it/s] 74%|███████▍ | 7944/10682 [1:18:12<22:34, 2.02it/s] 74%|███████▍ | 7945/10682 [1:18:13<22:34, 2.02it/s] 74%|███████▍ | 7946/10682 [1:18:13<22:33, 2.02it/s] 74%|███████▍ | 7947/10682 [1:18:14<22:34, 2.02it/s] 74%|███████▍ | 7948/10682 [1:18:14<22:32, 2.02it/s] 74%|███████▍ | 7949/10682 [1:18:15<22:32, 2.02it/s] 74%|███████▍ | 7950/10682 [1:18:15<22:30, 2.02it/s]{'loss': 2.7484, 'grad_norm': 0.25517159700393677, 'learning_rate': 0.0001863970843282357, 'epoch': 10.42} + 74%|███████▍ | 7950/10682 [1:18:15<22:30, 2.02it/s] 74%|███████▍ | 7951/10682 [1:18:16<22:31, 2.02it/s] 74%|███████▍ | 7952/10682 [1:18:16<22:30, 2.02it/s] 74%|███████▍ | 7953/10682 [1:18:17<22:30, 2.02it/s] 74%|███████▍ | 7954/10682 [1:18:17<22:28, 2.02it/s] 74%|███████▍ | 7955/10682 [1:18:18<22:29, 2.02it/s] 74%|███████▍ | 7956/10682 [1:18:18<22:28, 2.02it/s] 74%|███████▍ | 7957/10682 [1:18:19<22:30, 2.02it/s] 74%|███████▍ | 7958/10682 [1:18:19<22:29, 2.02it/s] 75%|███████▍ | 7959/10682 [1:18:20<22:29, 2.02it/s] 75%|███████▍ | 7960/10682 [1:18:20<22:26, 2.02it/s] 75%|███████▍ | 7961/10682 [1:18:21<22:26, 2.02it/s] 75%|███████▍ | 7962/10682 [1:18:21<22:24, 2.02it/s] 75%|███████▍ | 7963/10682 [1:18:22<22:25, 2.02it/s] 75%|███████▍ | 7964/10682 [1:18:22<22:23, 2.02it/s] 75%|███████▍ | 7965/10682 [1:18:23<22:24, 2.02it/s] 75%|███████▍ | 7966/10682 [1:18:23<22:21, 2.02it/s] 75%|███████▍ | 7967/10682 [1:18:23<22:21, 2.02it/s] 75%|███████▍ | 7968/10682 [1:18:24<22:21, 2.02it/s] 75%|███████▍ | 7969/10682 [1:18:24<22:20, 2.02it/s] 75%|███████▍ | 7970/10682 [1:18:25<22:21, 2.02it/s] 75%|███████▍ | 7971/10682 [1:18:25<22:20, 2.02it/s] 75%|███████▍ | 7972/10682 [1:18:26<22:19, 2.02it/s] 75%|███████▍ | 7973/10682 [1:18:26<22:19, 2.02it/s] 75%|███████▍ | 7974/10682 [1:18:27<22:18, 2.02it/s] 75%|███████▍ | 7975/10682 [1:18:27<22:18, 2.02it/s] {'loss': 2.7486, 'grad_norm': 0.255978524684906, 'learning_rate': 0.00018322590461360383, 'epoch': 10.45} + 75%|███████▍ | 7975/10682 [1:18:27<22:18, 2.02it/s] 75%|███████▍ | 7976/10682 [1:18:28<22:18, 2.02it/s] 75%|███████▍ | 7977/10682 [1:18:28<22:18, 2.02it/s] 75%|███████▍ | 7978/10682 [1:18:29<22:17, 2.02it/s] 75%|███████▍ | 7979/10682 [1:18:29<22:16, 2.02it/s] 75%|███████▍ | 7980/10682 [1:18:30<22:17, 2.02it/s] 75%|███████▍ | 7981/10682 [1:18:30<22:16, 2.02it/s] 75%|███████▍ | 7982/10682 [1:18:31<22:16, 2.02it/s] 75%|███████▍ | 7983/10682 [1:18:31<22:15, 2.02it/s] 75%|███████▍ | 7984/10682 [1:18:32<22:14, 2.02it/s] 75%|███████▍ | 7985/10682 [1:18:32<22:14, 2.02it/s] 75%|███████▍ | 7986/10682 [1:18:33<22:12, 2.02it/s] 75%|███████▍ | 7987/10682 [1:18:33<22:13, 2.02it/s] 75%|███████▍ | 7988/10682 [1:18:34<22:13, 2.02it/s] 75%|███████▍ | 7989/10682 [1:18:34<22:13, 2.02it/s] 75%|███████▍ | 7990/10682 [1:18:35<22:12, 2.02it/s] 75%|███████▍ | 7991/10682 [1:18:35<22:12, 2.02it/s] 75%|███████▍ | 7992/10682 [1:18:36<22:09, 2.02it/s] 75%|███████▍ | 7993/10682 [1:18:36<22:09, 2.02it/s] 75%|███████▍ | 7994/10682 [1:18:37<22:08, 2.02it/s] 75%|███████▍ | 7995/10682 [1:18:37<22:09, 2.02it/s] 75%|███████▍ | 7996/10682 [1:18:38<22:08, 2.02it/s] 75%|███████▍ | 7997/10682 [1:18:38<22:09, 2.02it/s] 75%|███████▍ | 7998/10682 [1:18:39<22:09, 2.02it/s] 75%|███████▍ | 7999/10682 [1:18:39<22:08, 2.02it/s] 75%|███████▍ | 8000/10682 [1:18:40<22:08, 2.02it/s] {'loss': 2.76, 'grad_norm': 0.2518884539604187, 'learning_rate': 0.00018007586996870206, 'epoch': 10.48} + 75%|███████▍ | 8000/10682 [1:18:40<22:08, 2.02it/s] 75%|███████▍ | 8001/10682 [1:18:40<22:10, 2.02it/s] 75%|███████▍ | 8002/10682 [1:18:41<22:08, 2.02it/s] 75%|███████▍ | 8003/10682 [1:18:41<22:08, 2.02it/s] 75%|███████▍ | 8004/10682 [1:18:42<22:06, 2.02it/s] 75%|███████▍ | 8005/10682 [1:18:42<22:06, 2.02it/s] 75%|███████▍ | 8006/10682 [1:18:43<22:04, 2.02it/s] 75%|███████▍ | 8007/10682 [1:18:43<22:04, 2.02it/s] 75%|███████▍ | 8008/10682 [1:18:44<22:03, 2.02it/s] 75%|███████▍ | 8009/10682 [1:18:44<22:03, 2.02it/s] 75%|███████▍ | 8010/10682 [1:18:45<22:04, 2.02it/s] 75%|███████▍ | 8011/10682 [1:18:45<22:02, 2.02it/s] 75%|███████▌ | 8012/10682 [1:18:46<22:02, 2.02it/s] 75%|███████▌ | 8013/10682 [1:18:46<22:02, 2.02it/s] 75%|███████▌ | 8014/10682 [1:18:47<22:01, 2.02it/s] 75%|███████▌ | 8015/10682 [1:18:47<22:01, 2.02it/s] 75%|███████▌ | 8016/10682 [1:18:48<22:00, 2.02it/s] 75%|███████▌ | 8017/10682 [1:18:48<21:59, 2.02it/s] 75%|███████▌ | 8018/10682 [1:18:49<21:59, 2.02it/s] 75%|███████▌ | 8019/10682 [1:18:49<21:57, 2.02it/s] 75%|███████▌ | 8020/10682 [1:18:50<21:57, 2.02it/s] 75%|███████▌ | 8021/10682 [1:18:50<21:56, 2.02it/s] 75%|███████▌ | 8022/10682 [1:18:51<21:56, 2.02it/s] 75%|███████▌ | 8023/10682 [1:18:51<21:55, 2.02it/s] 75%|███████▌ | 8024/10682 [1:18:52<21:55, 2.02it/s] 75%|███████▌ | 8025/10682 [1:18:52<21:54, 2.02it/s]{'loss': 2.7441, 'grad_norm': 0.24857422709465027, 'learning_rate': 0.00017694719066230924, 'epoch': 10.52} + 75%|███████▌ | 8025/10682 [1:18:52<21:54, 2.02it/s] 75%|███████▌ | 8026/10682 [1:18:53<21:55, 2.02it/s] 75%|███████▌ | 8027/10682 [1:18:53<21:55, 2.02it/s] 75%|███████▌ | 8028/10682 [1:18:54<21:55, 2.02it/s] 75%|███████▌ | 8029/10682 [1:18:54<21:53, 2.02it/s] 75%|███████▌ | 8030/10682 [1:18:55<21:53, 2.02it/s] 75%|███████▌ | 8031/10682 [1:18:55<21:51, 2.02it/s] 75%|███████▌ | 8032/10682 [1:18:56<21:50, 2.02it/s] 75%|███████▌ | 8033/10682 [1:18:56<21:47, 2.03it/s] 75%|███████▌ | 8034/10682 [1:18:57<21:48, 2.02it/s] 75%|███████▌ | 8035/10682 [1:18:57<21:47, 2.02it/s] 75%|███████▌ | 8036/10682 [1:18:58<21:48, 2.02it/s] 75%|███████▌ | 8037/10682 [1:18:58<21:46, 2.02it/s] 75%|███████▌ | 8038/10682 [1:18:59<21:48, 2.02it/s] 75%|███████▌ | 8039/10682 [1:18:59<21:45, 2.02it/s] 75%|███████▌ | 8040/10682 [1:19:00<21:47, 2.02it/s] 75%|███████▌ | 8041/10682 [1:19:00<21:45, 2.02it/s] 75%|███████▌ | 8042/10682 [1:19:01<21:46, 2.02it/s] 75%|███████▌ | 8043/10682 [1:19:01<21:44, 2.02it/s] 75%|███████▌ | 8044/10682 [1:19:02<21:45, 2.02it/s] 75%|███████▌ | 8045/10682 [1:19:02<21:44, 2.02it/s] 75%|███████▌ | 8046/10682 [1:19:03<21:44, 2.02it/s] 75%|███████▌ | 8047/10682 [1:19:03<21:43, 2.02it/s] 75%|███████▌ | 8048/10682 [1:19:04<21:43, 2.02it/s] 75%|███████▌ | 8049/10682 [1:19:04<21:42, 2.02it/s] 75%|███████▌ | 8050/10682 [1:19:05<21:42, 2.02it/s] {'loss': 2.7611, 'grad_norm': 0.250192254781723, 'learning_rate': 0.00017384007553770858, 'epoch': 10.55} + 75%|███████▌ | 8050/10682 [1:19:05<21:42, 2.02it/s] 75%|███████▌ | 8051/10682 [1:19:05<21:44, 2.02it/s] 75%|███████▌ | 8052/10682 [1:19:06<21:42, 2.02it/s] 75%|███████▌ | 8053/10682 [1:19:06<21:42, 2.02it/s] 75%|███████▌ | 8054/10682 [1:19:07<21:40, 2.02it/s] 75%|███████▌ | 8055/10682 [1:19:07<21:39, 2.02it/s] 75%|███████▌ | 8056/10682 [1:19:08<21:38, 2.02it/s] 75%|███████▌ | 8057/10682 [1:19:08<21:39, 2.02it/s] 75%|███████▌ | 8058/10682 [1:19:09<21:37, 2.02it/s] 75%|███████▌ | 8059/10682 [1:19:09<21:37, 2.02it/s] 75%|███████▌ | 8060/10682 [1:19:10<21:36, 2.02it/s] 75%|███████▌ | 8061/10682 [1:19:10<21:36, 2.02it/s] 75%|███████▌ | 8062/10682 [1:19:11<21:37, 2.02it/s] 75%|███████▌ | 8063/10682 [1:19:11<21:36, 2.02it/s] 75%|███████▌ | 8064/10682 [1:19:11<21:34, 2.02it/s] 76%|███████▌ | 8065/10682 [1:19:12<21:34, 2.02it/s] 76%|███████▌ | 8066/10682 [1:19:12<21:33, 2.02it/s] 76%|███████▌ | 8067/10682 [1:19:13<21:34, 2.02it/s] 76%|███████▌ | 8068/10682 [1:19:13<21:34, 2.02it/s] 76%|███████▌ | 8069/10682 [1:19:14<21:33, 2.02it/s] 76%|███████▌ | 8070/10682 [1:19:14<21:32, 2.02it/s] 76%|███████▌ | 8071/10682 [1:19:15<21:31, 2.02it/s] 76%|███████▌ | 8072/10682 [1:19:15<21:30, 2.02it/s] 76%|███████▌ | 8073/10682 [1:19:16<21:31, 2.02it/s] 76%|███████▌ | 8074/10682 [1:19:16<21:30, 2.02it/s] 76%|███████▌ | 8075/10682 [1:19:17<21:30, 2.02it/s]{'loss': 2.7562, 'grad_norm': 0.2497178018093109, 'learning_rate': 0.00017075473199874692, 'epoch': 10.58} + 76%|███████▌ | 8075/10682 [1:19:17<21:30, 2.02it/s] 76%|███████▌ | 8076/10682 [1:19:17<21:31, 2.02it/s] 76%|███████▌ | 8077/10682 [1:19:18<21:30, 2.02it/s] 76%|███████▌ | 8078/10682 [1:19:18<21:29, 2.02it/s] 76%|███████▌ | 8079/10682 [1:19:19<21:29, 2.02it/s] 76%|███████▌ | 8080/10682 [1:19:19<21:27, 2.02it/s] 76%|███████▌ | 8081/10682 [1:19:20<21:27, 2.02it/s] 76%|███████▌ | 8082/10682 [1:19:20<21:26, 2.02it/s] 76%|███████▌ | 8083/10682 [1:19:21<21:26, 2.02it/s] 76%|███████▌ | 8084/10682 [1:19:21<21:27, 2.02it/s] 76%|███████▌ | 8085/10682 [1:19:22<21:25, 2.02it/s] 76%|███████▌ | 8086/10682 [1:19:22<21:24, 2.02it/s] 76%|███████▌ | 8087/10682 [1:19:23<21:24, 2.02it/s] 76%|███████▌ | 8088/10682 [1:19:23<21:23, 2.02it/s] 76%|███████▌ | 8089/10682 [1:19:24<21:23, 2.02it/s] 76%|███████▌ | 8090/10682 [1:19:24<21:22, 2.02it/s] 76%|███████▌ | 8091/10682 [1:19:25<21:22, 2.02it/s] 76%|███████▌ | 8092/10682 [1:19:25<21:21, 2.02it/s] 76%|███████▌ | 8093/10682 [1:19:26<21:22, 2.02it/s] 76%|███████▌ | 8094/10682 [1:19:26<21:20, 2.02it/s] 76%|███████▌ | 8095/10682 [1:19:27<21:21, 2.02it/s] 76%|███████▌ | 8096/10682 [1:19:27<21:19, 2.02it/s] 76%|███████▌ | 8097/10682 [1:19:28<21:18, 2.02it/s] 76%|███████▌ | 8098/10682 [1:19:28<21:18, 2.02it/s] 76%|███████▌ | 8099/10682 [1:19:29<21:18, 2.02it/s] 76%|███████▌ | 8100/10682 [1:19:29<21:18, 2.02it/s] {'loss': 2.7643, 'grad_norm': 0.24932293593883514, 'learning_rate': 0.00016769136599599017, 'epoch': 10.62} + 76%|███████▌ | 8100/10682 [1:19:29<21:18, 2.02it/s] 76%|███████▌ | 8101/10682 [1:19:30<21:18, 2.02it/s] 76%|███████▌ | 8102/10682 [1:19:30<21:17, 2.02it/s] 76%|███████▌ | 8103/10682 [1:19:31<21:16, 2.02it/s] 76%|███████▌ | 8104/10682 [1:19:31<21:15, 2.02it/s] 76%|███████▌ | 8105/10682 [1:19:32<21:14, 2.02it/s] 76%|███████▌ | 8106/10682 [1:19:32<21:14, 2.02it/s] 76%|███████▌ | 8107/10682 [1:19:33<21:13, 2.02it/s] 76%|███████▌ | 8108/10682 [1:19:33<21:12, 2.02it/s] 76%|███████▌ | 8109/10682 [1:19:34<21:12, 2.02it/s] 76%|███████▌ | 8110/10682 [1:19:34<21:26, 2.00it/s] 76%|███████▌ | 8111/10682 [1:19:35<21:21, 2.01it/s] 76%|███████▌ | 8112/10682 [1:19:35<21:17, 2.01it/s] 76%|███████▌ | 8113/10682 [1:19:36<21:14, 2.02it/s] 76%|███████▌ | 8114/10682 [1:19:36<21:13, 2.02it/s] 76%|███████▌ | 8115/10682 [1:19:37<21:12, 2.02it/s] 76%|███████▌ | 8116/10682 [1:19:37<21:10, 2.02it/s] 76%|███████▌ | 8117/10682 [1:19:38<21:09, 2.02it/s] 76%|███████▌ | 8118/10682 [1:19:38<21:09, 2.02it/s] 76%|███████▌ | 8119/10682 [1:19:39<21:09, 2.02it/s] 76%|███████▌ | 8120/10682 [1:19:39<21:08, 2.02it/s] 76%|███████▌ | 8121/10682 [1:19:40<21:07, 2.02it/s] 76%|███████▌ | 8122/10682 [1:19:40<21:06, 2.02it/s] 76%|███████▌ | 8123/10682 [1:19:41<21:05, 2.02it/s] 76%|███████▌ | 8124/10682 [1:19:41<21:04, 2.02it/s] 76%|███████▌ | 8125/10682 [1:19:42<21:05, 2.02it/s]{'loss': 2.7628, 'grad_norm': 0.250422865152359, 'learning_rate': 0.0001646501820129766, 'epoch': 10.65} + 76%|███████▌ | 8125/10682 [1:19:42<21:05, 2.02it/s] 76%|███████▌ | 8126/10682 [1:19:42<21:05, 2.02it/s] 76%|███████▌ | 8127/10682 [1:19:43<21:04, 2.02it/s] 76%|███████▌ | 8128/10682 [1:19:43<21:03, 2.02it/s] 76%|███████▌ | 8129/10682 [1:19:44<21:03, 2.02it/s] 76%|███████▌ | 8130/10682 [1:19:44<21:03, 2.02it/s] 76%|███████▌ | 8131/10682 [1:19:45<21:02, 2.02it/s] 76%|███████▌ | 8132/10682 [1:19:45<21:02, 2.02it/s] 76%|███████▌ | 8133/10682 [1:19:46<21:00, 2.02it/s] 76%|███████▌ | 8134/10682 [1:19:46<20:59, 2.02it/s] 76%|███████▌ | 8135/10682 [1:19:47<20:59, 2.02it/s] 76%|███████▌ | 8136/10682 [1:19:47<20:58, 2.02it/s] 76%|███████▌ | 8137/10682 [1:19:48<20:59, 2.02it/s] 76%|███████▌ | 8138/10682 [1:19:48<20:57, 2.02it/s] 76%|███████▌ | 8139/10682 [1:19:49<20:57, 2.02it/s] 76%|███████▌ | 8140/10682 [1:19:49<20:57, 2.02it/s] 76%|███████▌ | 8141/10682 [1:19:50<20:56, 2.02it/s] 76%|███████▌ | 8142/10682 [1:19:50<20:57, 2.02it/s] 76%|███████▌ | 8143/10682 [1:19:51<20:56, 2.02it/s] 76%|███████▌ | 8144/10682 [1:19:51<20:57, 2.02it/s] 76%|███████▌ | 8145/10682 [1:19:52<20:55, 2.02it/s] 76%|███████▋ | 8146/10682 [1:19:52<20:54, 2.02it/s] 76%|███████▋ | 8147/10682 [1:19:53<20:54, 2.02it/s] 76%|███████▋ | 8148/10682 [1:19:53<20:54, 2.02it/s] 76%|███████▋ | 8149/10682 [1:19:54<20:55, 2.02it/s] 76%|███████▋ | 8150/10682 [1:19:54<20:53, 2.02it/s] {'loss': 2.7547, 'grad_norm': 0.25278058648109436, 'learning_rate': 0.00016163138305256598, 'epoch': 10.68} + 76%|███████▋ | 8150/10682 [1:19:54<20:53, 2.02it/s] 76%|███████▋ | 8151/10682 [1:19:55<20:54, 2.02it/s] 76%|███████▋ | 8152/10682 [1:19:55<20:52, 2.02it/s] 76%|███████▋ | 8153/10682 [1:19:56<20:52, 2.02it/s] 76%|███████▋ | 8154/10682 [1:19:56<20:51, 2.02it/s] 76%|███████▋ | 8155/10682 [1:19:57<20:51, 2.02it/s] 76%|███████▋ | 8156/10682 [1:19:57<20:51, 2.02it/s] 76%|███████▋ | 8157/10682 [1:19:58<20:50, 2.02it/s] 76%|███████▋ | 8158/10682 [1:19:58<20:49, 2.02it/s] 76%|███████▋ | 8159/10682 [1:19:59<20:48, 2.02it/s] 76%|███████▋ | 8160/10682 [1:19:59<20:47, 2.02it/s] 76%|███████▋ | 8161/10682 [1:20:00<20:46, 2.02it/s] 76%|███████▋ | 8162/10682 [1:20:00<20:45, 2.02it/s] 76%|███████▋ | 8163/10682 [1:20:01<20:44, 2.02it/s] 76%|███████▋ | 8164/10682 [1:20:01<20:44, 2.02it/s] 76%|███████▋ | 8165/10682 [1:20:01<20:44, 2.02it/s] 76%|███████▋ | 8166/10682 [1:20:02<20:44, 2.02it/s] 76%|███████▋ | 8167/10682 [1:20:02<20:44, 2.02it/s] 76%|███████▋ | 8168/10682 [1:20:03<20:44, 2.02it/s] 76%|███████▋ | 8169/10682 [1:20:03<20:42, 2.02it/s] 76%|█���█████▋ | 8170/10682 [1:20:04<20:42, 2.02it/s] 76%|███████▋ | 8171/10682 [1:20:04<20:41, 2.02it/s] 77%|███████▋ | 8172/10682 [1:20:05<20:40, 2.02it/s] 77%|███████▋ | 8173/10682 [1:20:05<20:41, 2.02it/s] 77%|███████▋ | 8174/10682 [1:20:06<20:41, 2.02it/s] 77%|███████▋ | 8175/10682 [1:20:06<20:40, 2.02it/s] {'loss': 2.7612, 'grad_norm': 0.24929340183734894, 'learning_rate': 0.00015863517062339038, 'epoch': 10.71} + 77%|███████▋ | 8175/10682 [1:20:06<20:40, 2.02it/s] 77%|███████▋ | 8176/10682 [1:20:07<20:42, 2.02it/s] 77%|███████▋ | 8177/10682 [1:20:07<20:40, 2.02it/s] 77%|███████▋ | 8178/10682 [1:20:08<20:41, 2.02it/s] 77%|███████▋ | 8179/10682 [1:20:08<20:37, 2.02it/s] 77%|███████▋ | 8180/10682 [1:20:09<20:38, 2.02it/s] 77%|███████▋ | 8181/10682 [1:20:09<20:35, 2.02it/s] 77%|███████▋ | 8182/10682 [1:20:10<20:36, 2.02it/s] 77%|███████▋ | 8183/10682 [1:20:10<20:34, 2.02it/s] 77%|███████▋ | 8184/10682 [1:20:11<20:35, 2.02it/s] 77%|███████▋ | 8185/10682 [1:20:11<20:34, 2.02it/s] 77%|███████▋ | 8186/10682 [1:20:12<20:34, 2.02it/s] 77%|███████▋ | 8187/10682 [1:20:12<20:34, 2.02it/s] 77%|███████▋ | 8188/10682 [1:20:13<20:35, 2.02it/s] 77%|███████▋ | 8189/10682 [1:20:13<20:32, 2.02it/s] 77%|███████▋ | 8190/10682 [1:20:14<20:33, 2.02it/s] 77%|███████▋ | 8191/10682 [1:20:14<20:31, 2.02it/s] 77%|███████▋ | 8192/10682 [1:20:15<20:31, 2.02it/s] 77%|███████▋ | 8193/10682 [1:20:15<22:16, 1.86it/s] 77%|███████▋ | 8194/10682 [1:20:16<21:46, 1.90it/s] 77%|███████▋ | 8195/10682 [1:20:16<21:21, 1.94it/s] 77%|███████▋ | 8196/10682 [1:20:17<21:06, 1.96it/s] 77%|███████▋ | 8197/10682 [1:20:17<20:53, 1.98it/s] 77%|███████▋ | 8198/10682 [1:20:18<20:46, 1.99it/s] 77%|███████▋ | 8199/10682 [1:20:18<20:38, 2.00it/s] 77%|███████▋ | 8200/10682 [1:20:19<20:35, 2.01it/s] {'loss': 2.7629, 'grad_norm': 0.24858906865119934, 'learning_rate': 0.00015566174472640188, 'epoch': 10.75} + 77%|███████▋ | 8200/10682 [1:20:19<20:35, 2.01it/s] 77%|███████▋ | 8201/10682 [1:20:19<20:33, 2.01it/s] 77%|███████▋ | 8202/10682 [1:20:20<20:32, 2.01it/s] 77%|███████▋ | 8203/10682 [1:20:20<20:29, 2.02it/s] 77%|███████▋ | 8204/10682 [1:20:21<20:28, 2.02it/s] 77%|███████▋ | 8205/10682 [1:20:21<20:25, 2.02it/s] 77%|███████▋ | 8206/10682 [1:20:22<20:25, 2.02it/s] 77%|███████▋ | 8207/10682 [1:20:22<20:23, 2.02it/s] 77%|███████▋ | 8208/10682 [1:20:23<20:23, 2.02it/s] 77%|███████▋ | 8209/10682 [1:20:23<20:22, 2.02it/s] 77%|███████▋ | 8210/10682 [1:20:24<20:22, 2.02it/s] 77%|███████▋ | 8211/10682 [1:20:24<20:21, 2.02it/s] 77%|███████▋ | 8212/10682 [1:20:25<20:21, 2.02it/s] 77%|███████▋ | 8213/10682 [1:20:25<20:20, 2.02it/s] 77%|███████▋ | 8214/10682 [1:20:26<20:20, 2.02it/s] 77%|███████▋ | 8215/10682 [1:20:26<20:19, 2.02it/s] 77%|███████▋ | 8216/10682 [1:20:27<20:18, 2.02it/s] 77%|███████▋ | 8217/10682 [1:20:27<20:18, 2.02it/s] 77%|███████▋ | 8218/10682 [1:20:28<20:17, 2.02it/s] 77%|███████▋ | 8219/10682 [1:20:28<20:18, 2.02it/s] 77%|███████▋ | 8220/10682 [1:20:29<20:18, 2.02it/s] 77%|███████▋ | 8221/10682 [1:20:29<20:18, 2.02it/s] 77%|███████▋ | 8222/10682 [1:20:30<20:16, 2.02it/s] 77%|███████▋ | 8223/10682 [1:20:30<20:17, 2.02it/s] 77%|███████▋ | 8224/10682 [1:20:31<20:16, 2.02it/s] 77%|███████▋ | 8225/10682 [1:20:31<20:16, 2.02it/s] {'loss': 2.7648, 'grad_norm': 0.25700506567955017, 'learning_rate': 0.0001527113038415231, 'epoch': 10.78} + 77%|███████▋ | 8225/10682 [1:20:31<20:16, 2.02it/s] 77%|███████▋ | 8226/10682 [1:20:32<20:21, 2.01it/s] 77%|███████▋ | 8227/10682 [1:20:32<20:20, 2.01it/s] 77%|███████▋ | 8228/10682 [1:20:33<20:18, 2.01it/s] 77%|███████▋ | 8229/10682 [1:20:33<20:17, 2.01it/s] 77%|███████▋ | 8230/10682 [1:20:34<20:15, 2.02it/s] 77%|███████▋ | 8231/10682 [1:20:34<20:20, 2.01it/s] 77%|███████▋ | 8232/10682 [1:20:35<20:18, 2.01it/s] 77%|███████▋ | 8233/10682 [1:20:35<20:16, 2.01it/s] 77%|███████▋ | 8234/10682 [1:20:36<20:14, 2.02it/s] 77%|███████▋ | 8235/10682 [1:20:36<20:14, 2.01it/s] 77%|███████▋ | 8236/10682 [1:20:37<20:13, 2.02it/s] 77%|███████▋ | 8237/10682 [1:20:37<20:11, 2.02it/s] 77%|███████▋ | 8238/10682 [1:20:38<20:09, 2.02it/s] 77%|███████▋ | 8239/10682 [1:20:38<20:09, 2.02it/s] 77%|███████▋ | 8240/10682 [1:20:39<20:08, 2.02it/s] 77%|███████▋ | 8241/10682 [1:20:39<20:08, 2.02it/s] 77%|███████▋ | 8242/10682 [1:20:40<20:06, 2.02it/s] 77%|███████▋ | 8243/10682 [1:20:40<20:05, 2.02it/s] 77%|███████▋ | 8244/10682 [1:20:41<20:05, 2.02it/s] 77%|███████▋ | 8245/10682 [1:20:41<20:04, 2.02it/s] 77%|███████▋ | 8246/10682 [1:20:42<20:03, 2.02it/s] 77%|███████▋ | 8247/10682 [1:20:42<20:04, 2.02it/s] 77%|███████▋ | 8248/10682 [1:20:43<20:02, 2.02it/s] 77%|███████▋ | 8249/10682 [1:20:43<20:03, 2.02it/s] 77%|███████▋ | 8250/10682 [1:20:44<20:01, 2.02it/s] {'loss': 2.762, 'grad_norm': 0.2492111623287201, 'learning_rate': 0.00014978404491439802, 'epoch': 10.81} + 77%|███████▋ | 8250/10682 [1:20:44<20:01, 2.02it/s] 77%|███████▋ | 8251/10682 [1:20:44<20:06, 2.01it/s] 77%|███████▋ | 8252/10682 [1:20:45<20:02, 2.02it/s] 77%|███████▋ | 8253/10682 [1:20:45<20:03, 2.02it/s] 77%|███████▋ | 8254/10682 [1:20:46<20:01, 2.02it/s] 77%|███████▋ | 8255/10682 [1:20:46<20:01, 2.02it/s] 77%|███████▋ | 8256/10682 [1:20:47<19:58, 2.02it/s] 77%|███████▋ | 8257/10682 [1:20:47<20:00, 2.02it/s] 77%|███████▋ | 8258/10682 [1:20:48<19:59, 2.02it/s] 77%|███████▋ | 8259/10682 [1:20:48<19:59, 2.02it/s] 77%|███████▋ | 8260/10682 [1:20:49<19:56, 2.02it/s] 77%|███████▋ | 8261/10682 [1:20:49<19:57, 2.02it/s] 77%|███████▋ | 8262/10682 [1:20:50<19:55, 2.02it/s] 77%|███████▋ | 8263/10682 [1:20:50<19:56, 2.02it/s] 77%|███████▋ | 8264/10682 [1:20:51<19:54, 2.02it/s] 77%|███████▋ | 8265/10682 [1:20:51<19:55, 2.02it/s] 77%|███████▋ | 8266/10682 [1:20:52<19:54, 2.02it/s] 77%|███████▋ | 8267/10682 [1:20:52<19:54, 2.02it/s] 77%|███████▋ | 8268/10682 [1:20:53<19:52, 2.02it/s] 77%|███████▋ | 8269/10682 [1:20:53<19:51, 2.03it/s] 77%|███████▋ | 8270/10682 [1:20:54<19:52, 2.02it/s] 77%|███████▋ | 8271/10682 [1:20:54<19:51, 2.02it/s] 77%|███████▋ | 8272/10682 [1:20:55<19:51, 2.02it/s] 77%|███████▋ | 8273/10682 [1:20:55<19:49, 2.03it/s] 77%|███████▋ | 8274/10682 [1:20:56<19:49, 2.03it/s] 77%|███████▋ | 8275/10682 [1:20:56<19:49, 2.02it/s] {'loss': 2.7657, 'grad_norm': 0.2531864643096924, 'learning_rate': 0.00014688016334324605, 'epoch': 10.85} + 77%|███████▋ | 8275/10682 [1:20:56<19:49, 2.02it/s] 77%|███████▋ | 8276/10682 [1:20:57<19:49, 2.02it/s] 77%|███████▋ | 8277/10682 [1:20:57<19:50, 2.02it/s] 77%|███████▋ | 8278/10682 [1:20:58<19:51, 2.02it/s] 78%|███████▊ | 8279/10682 [1:20:58<19:50, 2.02it/s] 78%|███████▊ | 8280/10682 [1:20:59<19:48, 2.02it/s] 78%|███████▊ | 8281/10682 [1:20:59<19:47, 2.02it/s] 78%|███████▊ | 8282/10682 [1:21:00<19:47, 2.02it/s] 78%|███████▊ | 8283/10682 [1:21:00<19:46, 2.02it/s] 78%|███████▊ | 8284/10682 [1:21:01<19:44, 2.02it/s] 78%|███████▊ | 8285/10682 [1:21:01<19:44, 2.02it/s] 78%|███████▊ | 8286/10682 [1:21:02<19:44, 2.02it/s] 78%|███████▊ | 8287/10682 [1:21:02<19:42, 2.02it/s] 78%|███████▊ | 8288/10682 [1:21:02<19:44, 2.02it/s] 78%|███████▊ | 8289/10682 [1:21:03<19:42, 2.02it/s] 78%|███████▊ | 8290/10682 [1:21:04<21:22, 1.87it/s] 78%|███████▊ | 8291/10682 [1:21:04<20:53, 1.91it/s] 78%|███████▊ | 8292/10682 [1:21:05<20:31, 1.94it/s] 78%|███████▊ | 8293/10682 [1:21:05<20:17, 1.96it/s] 78%|███████▊ | 8294/10682 [1:21:06<20:07, 1.98it/s] 78%|███████▊ | 8295/10682 [1:21:06<19:58, 1.99it/s] 78%|███████▊ | 8296/10682 [1:21:07<19:52, 2.00it/s] 78%|███████▊ | 8297/10682 [1:21:07<19:47, 2.01it/s] 78%|███████▊ | 8298/10682 [1:21:08<19:44, 2.01it/s] 78%|███████▊ | 8299/10682 [1:21:08<19:43, 2.01it/s] 78%|███████▊ | 8300/10682 [1:21:09<19:41, 2.02it/s] {'loss': 2.7693, 'grad_norm': 0.2504305839538574, 'learning_rate': 0.00014399985296581835, 'epoch': 10.88} + 78%|███████▊ | 8300/10682 [1:21:09<19:41, 2.02it/s] 78%|███████▊ | 8301/10682 [1:21:09<19:41, 2.02it/s] 78%|███████▊ | 8302/10682 [1:21:10<19:39, 2.02it/s] 78%|███████▊ | 8303/10682 [1:21:10<19:38, 2.02it/s] 78%|███████▊ | 8304/10682 [1:21:11<19:38, 2.02it/s] 78%|███████▊ | 8305/10682 [1:21:11<19:36, 2.02it/s] 78%|███████▊ | 8306/10682 [1:21:12<19:36, 2.02it/s] 78%|███████▊ | 8307/10682 [1:21:12<19:36, 2.02it/s] 78%|███████▊ | 8308/10682 [1:21:13<19:36, 2.02it/s] 78%|███████▊ | 8309/10682 [1:21:13<19:35, 2.02it/s] 78%|███████▊ | 8310/10682 [1:21:14<19:34, 2.02it/s] 78%|███████▊ | 8311/10682 [1:21:14<19:33, 2.02it/s] 78%|███████▊ | 8312/10682 [1:21:15<19:33, 2.02it/s] 78%|███████▊ | 8313/10682 [1:21:15<19:31, 2.02it/s] 78%|███████▊ | 8314/10682 [1:21:16<19:31, 2.02it/s] 78%|███████▊ | 8315/10682 [1:21:16<19:31, 2.02it/s] 78%|███████▊ | 8316/10682 [1:21:16<19:32, 2.02it/s] 78%|███████▊ | 8317/10682 [1:21:17<19:31, 2.02it/s] 78%|███████▊ | 8318/10682 [1:21:17<19:29, 2.02it/s] 78%|███████▊ | 8319/10682 [1:21:18<19:28, 2.02it/s] 78%|███████▊ | 8320/10682 [1:21:18<19:28, 2.02it/s] 78%|███████▊ | 8321/10682 [1:21:19<19:26, 2.02it/s] 78%|███████▊ | 8322/10682 [1:21:19<19:27, 2.02it/s] 78%|███████▊ | 8323/10682 [1:21:20<19:26, 2.02it/s] 78%|███████▊ | 8324/10682 [1:21:20<19:26, 2.02it/s] 78%|███████▊ | 8325/10682 [1:21:21<19:25, 2.02it/s]{'loss': 2.7622, 'grad_norm': 0.25592100620269775, 'learning_rate': 0.00014114330604645943, 'epoch': 10.91} + 78%|███████▊ | 8325/10682 [1:21:21<19:25, 2.02it/s] 78%|███████▊ | 8326/10682 [1:21:21<19:25, 2.02it/s] 78%|███████▊ | 8327/10682 [1:21:22<19:24, 2.02it/s] 78%|███████▊ | 8328/10682 [1:21:22<19:24, 2.02it/s] 78%|███████▊ | 8329/10682 [1:21:23<19:23, 2.02it/s] 78%|███████▊ | 8330/10682 [1:21:23<19:23, 2.02it/s] 78%|███████▊ | 8331/10682 [1:21:24<19:24, 2.02it/s] 78%|███████▊ | 8332/10682 [1:21:24<19:24, 2.02it/s] 78%|███████▊ | 8333/10682 [1:21:25<19:22, 2.02it/s] 78%|███████▊ | 8334/10682 [1:21:25<19:22, 2.02it/s] 78%|███████▊ | 8335/10682 [1:21:26<19:21, 2.02it/s] 78%|███████▊ | 8336/10682 [1:21:26<19:21, 2.02it/s] 78%|███████▊ | 8337/10682 [1:21:27<19:21, 2.02it/s] 78%|███████▊ | 8338/10682 [1:21:27<19:20, 2.02it/s] 78%|███████▊ | 8339/10682 [1:21:28<19:17, 2.02it/s] 78%|███████▊ | 8340/10682 [1:21:28<19:18, 2.02it/s] 78%|███████▊ | 8341/10682 [1:21:29<19:16, 2.02it/s] 78%|███████▊ | 8342/10682 [1:21:29<19:17, 2.02it/s] 78%|███████▊ | 8343/10682 [1:21:30<19:16, 2.02it/s] 78%|███████▊ | 8344/10682 [1:21:30<19:16, 2.02it/s] 78%|███████▊ | 8345/10682 [1:21:31<19:14, 2.02it/s] 78%|███████▊ | 8346/10682 [1:21:31<19:15, 2.02it/s] 78%|███████▊ | 8347/10682 [1:21:32<19:13, 2.02it/s] 78%|███████▊ | 8348/10682 [1:21:32<19:14, 2.02it/s] 78%|███████▊ | 8349/10682 [1:21:33<19:14, 2.02it/s] 78%|███████▊ | 8350/10682 [1:21:33<19:14, 2.02it/s] {'loss': 2.7691, 'grad_norm': 0.24885424971580505, 'learning_rate': 0.00013831071326327282, 'epoch': 10.94} + 78%|███████▊ | 8350/10682 [1:21:33<19:14, 2.02it/s] 78%|███████▊ | 8351/10682 [1:21:34<19:14, 2.02it/s] 78%|███████▊ | 8352/10682 [1:21:34<19:15, 2.02it/s] 78%|███████▊ | 8353/10682 [1:21:35<19:13, 2.02it/s] 78%|███████▊ | 8354/10682 [1:21:35<19:12, 2.02it/s] 78%|███████▊ | 8355/10682 [1:21:36<19:11, 2.02it/s] 78%|███████▊ | 8356/10682 [1:21:36<19:12, 2.02it/s] 78%|███████▊ | 8357/10682 [1:21:37<19:10, 2.02it/s] 78%|███████▊ | 8358/10682 [1:21:37<19:11, 2.02it/s] 78%|███████▊ | 8359/10682 [1:21:38<19:08, 2.02it/s] 78%|███████▊ | 8360/10682 [1:21:38<19:07, 2.02it/s] 78%|███████▊ | 8361/10682 [1:21:39<19:06, 2.02it/s] 78%|███████▊ | 8362/10682 [1:21:39<19:07, 2.02it/s] 78%|███████▊ | 8363/10682 [1:21:40<19:05, 2.02it/s] 78%|███████▊ | 8364/10682 [1:21:40<19:06, 2.02it/s] 78%|███████▊ | 8365/10682 [1:21:41<19:06, 2.02it/s] 78%|███████▊ | 8366/10682 [1:21:41<19:06, 2.02it/s] 78%|███████▊ | 8367/10682 [1:21:42<19:03, 2.02it/s] 78%|███████▊ | 8368/10682 [1:21:42<19:04, 2.02it/s] 78%|███████▊ | 8369/10682 [1:21:43<19:03, 2.02it/s] 78%|███████▊ | 8370/10682 [1:21:43<19:03, 2.02it/s] 78%|███████▊ | 8371/10682 [1:21:44<19:03, 2.02it/s] 78%|███████▊ | 8372/10682 [1:21:44<19:02, 2.02it/s] 78%|███████▊ | 8373/10682 [1:21:45<19:02, 2.02it/s] 78%|███████▊ | 8374/10682 [1:21:45<19:02, 2.02it/s] 78%|███████▊ | 8375/10682 [1:21:46<19:02, 2.02it/s]{'loss': 2.7674, 'grad_norm': 0.2501627802848816, 'learning_rate': 0.0001355022636953933, 'epoch': 10.98} + 78%|███████▊ | 8375/10682 [1:21:46<19:02, 2.02it/s] 78%|███████▊ | 8376/10682 [1:21:46<19:01, 2.02it/s] 78%|███████▊ | 8377/10682 [1:21:47<19:01, 2.02it/s] 78%|███████▊ | 8378/10682 [1:21:47<19:00, 2.02it/s] 78%|███████▊ | 8379/10682 [1:21:48<18:59, 2.02it/s] 78%|███████▊ | 8380/10682 [1:21:48<18:59, 2.02it/s] 78%|███████▊ | 8381/10682 [1:21:49<18:58, 2.02it/s] 78%|███████▊ | 8382/10682 [1:21:49<18:57, 2.02it/s] 78%|███████▊ | 8383/10682 [1:21:50<18:56, 2.02it/s] 78%|███████▊ | 8384/10682 [1:21:50<18:55, 2.02it/s] 78%|███████▊ | 8385/10682 [1:21:51<18:56, 2.02it/s] 79%|███████▊ | 8386/10682 [1:21:51<18:54, 2.02it/s] 79%|███████▊ | 8387/10682 [1:21:52<18:56, 2.02it/s] 79%|███████▊ | 8388/10682 [1:21:52<18:55, 2.02it/s] 79%|███████▊ | 8389/10682 [1:21:53<18:54, 2.02it/s] 79%|███████▊ | 8390/10682 [1:21:53<18:53, 2.02it/s] 79%|███████▊ | 8391/10682 [1:21:54<18:53, 2.02it/s] 79%|███████▊ | 8392/10682 [1:21:54<18:53, 2.02it/s] 79%|███████▊ | 8393/10682 [1:21:55<18:42, 2.04it/s] 79%|███████▊ | 8394/10682 [1:22:55<11:46:12, 18.52s/it] 79%|███████▊ | 8395/10682 [1:22:56<8:19:47, 13.11s/it] 79%|███████▊ | 8396/10682 [1:22:56<5:55:25, 9.33s/it] 79%|███████▊ | 8397/10682 [1:22:57<4:14:21, 6.68s/it] 79%|███████▊ | 8398/10682 [1:22:57<3:03:44, 4.83s/it] 79%|███████▊ | 8399/10682 [1:22:58<2:14:20, 3.53s/it] 79%|███████▊ | 8400/10682 [1:22:58<1:39:37, 2.62s/it]{'loss': 2.7408, 'grad_norm': 0.2503925561904907, 'learning_rate': 0.0001327181448103661, 'epoch': 11.01} + 79%|███████▊ | 8400/10682 [1:22:58<1:39:37, 2.62s/it] 79%|███████▊ | 8401/10682 [1:22:59<1:15:23, 1.98s/it] 79%|███████▊ | 8402/10682 [1:22:59<58:22, 1.54s/it] 79%|███████▊ | 8403/10682 [1:23:00<46:30, 1.22s/it] 79%|███████▊ | 8404/10682 [1:23:00<38:09, 1.00s/it] 79%|███████▊ | 8405/10682 [1:23:01<32:21, 1.17it/s] 79%|███████▊ | 8406/10682 [1:23:01<28:16, 1.34it/s] 79%|███████▊ | 8407/10682 [1:23:02<25:25, 1.49it/s] 79%|███████▊ | 8408/10682 [1:23:02<23:25, 1.62it/s] 79%|███████▊ | 8409/10682 [1:23:03<22:02, 1.72it/s] 79%|███████▊ | 8410/10682 [1:23:03<21:09, 1.79it/s] 79%|███████▊ | 8411/10682 [1:23:04<20:25, 1.85it/s] 79%|███████▊ | 8412/10682 [1:23:04<19:55, 1.90it/s] 79%|███████▉ | 8413/10682 [1:23:05<19:34, 1.93it/s] 79%|███████▉ | 8414/10682 [1:23:05<19:19, 1.96it/s] 79%|███████▉ | 8415/10682 [1:23:06<19:08, 1.97it/s] 79%|███████▉ | 8416/10682 [1:23:06<19:00, 1.99it/s] 79%|███████▉ | 8417/10682 [1:23:07<18:54, 2.00it/s] 79%|███████▉ | 8418/10682 [1:23:07<18:50, 2.00it/s] 79%|███████▉ | 8419/10682 [1:23:08<18:48, 2.01it/s] 79%|███████▉ | 8420/10682 [1:23:08<18:45, 2.01it/s] 79%|███████▉ | 8421/10682 [1:23:09<18:42, 2.01it/s] 79%|███████▉ | 8422/10682 [1:23:09<18:43, 2.01it/s] 79%|███████▉ | 8423/10682 [1:23:10<18:41, 2.01it/s] 79%|███████▉ | 8424/10682 [1:23:10<18:41, 2.01it/s] 79%|█���█████▉ | 8425/10682 [1:23:11<18:41, 2.01it/s] {'loss': 2.6711, 'grad_norm': 0.2549366354942322, 'learning_rate': 0.00012995854245163207, 'epoch': 11.04} + 79%|███████▉ | 8425/10682 [1:23:11<18:41, 2.01it/s] 79%|███████▉ | 8426/10682 [1:23:11<18:41, 2.01it/s] 79%|███████▉ | 8427/10682 [1:23:12<18:39, 2.02it/s] 79%|███████▉ | 8428/10682 [1:23:12<18:42, 2.01it/s] 79%|███████▉ | 8429/10682 [1:23:13<18:41, 2.01it/s] 79%|███████▉ | 8430/10682 [1:23:13<18:37, 2.02it/s] 79%|███████▉ | 8431/10682 [1:23:14<18:36, 2.02it/s] 79%|███████▉ | 8432/10682 [1:23:14<18:35, 2.02it/s] 79%|███████▉ | 8433/10682 [1:23:15<18:36, 2.01it/s] 79%|███████▉ | 8434/10682 [1:23:15<18:34, 2.02it/s] 79%|███████▉ | 8435/10682 [1:23:16<18:33, 2.02it/s] 79%|███████▉ | 8436/10682 [1:23:16<18:32, 2.02it/s] 79%|███████▉ | 8437/10682 [1:23:17<18:31, 2.02it/s] 79%|███████▉ | 8438/10682 [1:23:17<18:30, 2.02it/s] 79%|███████▉ | 8439/10682 [1:23:18<18:29, 2.02it/s] 79%|███████▉ | 8440/10682 [1:23:18<18:30, 2.02it/s] 79%|███████▉ | 8441/10682 [1:23:19<18:29, 2.02it/s] 79%|███████▉ | 8442/10682 [1:23:19<18:29, 2.02it/s] 79%|███████▉ | 8443/10682 [1:23:19<18:28, 2.02it/s] 79%|███████▉ | 8444/10682 [1:23:20<18:27, 2.02it/s] 79%|███████▉ | 8445/10682 [1:23:20<18:26, 2.02it/s] 79%|███████▉ | 8446/10682 [1:23:21<18:26, 2.02it/s] 79%|███████▉ | 8447/10682 [1:23:21<18:25, 2.02it/s] 79%|███████▉ | 8448/10682 [1:23:22<18:25, 2.02it/s] 79%|███████▉ | 8449/10682 [1:23:22<18:25, 2.02it/s] 79%|███████▉ | 8450/10682 [1:23:23<18:23, 2.02it/s] {'loss': 2.6797, 'grad_norm': 0.25918829441070557, 'learning_rate': 0.0001272236408261237, 'epoch': 11.07} + 79%|███████▉ | 8450/10682 [1:23:23<18:23, 2.02it/s] 79%|███████▉ | 8451/10682 [1:23:23<18:25, 2.02it/s] 79%|███████▉ | 8452/10682 [1:23:24<18:23, 2.02it/s] 79%|███████▉ | 8453/10682 [1:23:24<18:23, 2.02it/s] 79%|███████▉ | 8454/10682 [1:23:25<18:22, 2.02it/s] 79%|███████▉ | 8455/10682 [1:23:25<18:23, 2.02it/s] 79%|███████▉ | 8456/10682 [1:23:26<18:22, 2.02it/s] 79%|███████▉ | 8457/10682 [1:23:26<18:22, 2.02it/s] 79%|███████▉ | 8458/10682 [1:23:27<18:20, 2.02it/s] 79%|███████▉ | 8459/10682 [1:23:27<18:20, 2.02it/s] 79%|███████▉ | 8460/10682 [1:23:28<18:19, 2.02it/s] 79%|███████▉ | 8461/10682 [1:23:28<18:19, 2.02it/s] 79%|███████▉ | 8462/10682 [1:23:29<18:18, 2.02it/s] 79%|███████▉ | 8463/10682 [1:23:29<18:17, 2.02it/s] 79%|███████▉ | 8464/10682 [1:23:30<18:18, 2.02it/s] 79%|███████▉ | 8465/10682 [1:23:30<18:17, 2.02it/s] 79%|███████▉ | 8466/10682 [1:23:31<18:18, 2.02it/s] 79%|███████▉ | 8467/10682 [1:23:31<18:16, 2.02it/s] 79%|███████▉ | 8468/10682 [1:23:32<18:16, 2.02it/s] 79%|███████▉ | 8469/10682 [1:23:32<18:15, 2.02it/s] 79%|███████▉ | 8470/10682 [1:23:33<18:15, 2.02it/s] 79%|███████▉ | 8471/10682 [1:23:33<18:14, 2.02it/s] 79%|███████▉ | 8472/10682 [1:23:34<18:14, 2.02it/s] 79%|███████▉ | 8473/10682 [1:23:34<18:14, 2.02it/s] 79%|███████▉ | 8474/10682 [1:23:35<18:13, 2.02it/s] 79%|███████▉ | 8475/10682 [1:23:35<18:11, 2.02it/s]{'loss': 2.68, 'grad_norm': 0.25277212262153625, 'learning_rate': 0.00012451362249196797, 'epoch': 11.11} + 79%|███████▉ | 8475/10682 [1:23:35<18:11, 2.02it/s] 79%|███████▉ | 8476/10682 [1:23:36<18:11, 2.02it/s] 79%|███████▉ | 8477/10682 [1:23:36<18:12, 2.02it/s] 79%|███████▉ | 8478/10682 [1:23:37<18:10, 2.02it/s] 79%|███████▉ | 8479/10682 [1:23:37<18:10, 2.02it/s] 79%|███████▉ | 8480/10682 [1:23:38<18:08, 2.02it/s] 79%|███████▉ | 8481/10682 [1:23:38<18:12, 2.02it/s] 79%|███████▉ | 8482/10682 [1:23:39<18:10, 2.02it/s] 79%|███████▉ | 8483/10682 [1:23:39<18:09, 2.02it/s] 79%|███████▉ | 8484/10682 [1:23:40<18:08, 2.02it/s] 79%|███████▉ | 8485/10682 [1:23:40<18:08, 2.02it/s] 79%|███████▉ | 8486/10682 [1:23:41<18:05, 2.02it/s] 79%|███████▉ | 8487/10682 [1:23:41<18:05, 2.02it/s] 79%|███████▉ | 8488/10682 [1:23:42<18:04, 2.02it/s] 79%|███████▉ | 8489/10682 [1:23:42<18:05, 2.02it/s] 79%|███████▉ | 8490/10682 [1:23:43<18:04, 2.02it/s] 79%|███████▉ | 8491/10682 [1:23:43<18:03, 2.02it/s] 79%|███████▉ | 8492/10682 [1:23:44<18:02, 2.02it/s] 80%|███████▉ | 8493/10682 [1:23:44<18:02, 2.02it/s] 80%|███████▉ | 8494/10682 [1:23:45<18:01, 2.02it/s] 80%|███████▉ | 8495/10682 [1:23:45<18:02, 2.02it/s] 80%|███████▉ | 8496/10682 [1:23:46<18:02, 2.02it/s] 80%|███████▉ | 8497/10682 [1:23:46<18:01, 2.02it/s] 80%|███████▉ | 8498/10682 [1:23:47<18:03, 2.02it/s] 80%|███████▉ | 8499/10682 [1:23:47<18:02, 2.02it/s] 80%|███████▉ | 8500/10682 [1:23:48<18:00, 2.02it/s] {'loss': 2.6882, 'grad_norm': 0.2537059783935547, 'learning_rate': 0.00012182866834630096, 'epoch': 11.14} + 80%|███████▉ | 8500/10682 [1:23:48<18:00, 2.02it/s] 80%|███████▉ | 8501/10682 [1:23:48<17:59, 2.02it/s] 80%|███████▉ | 8502/10682 [1:23:49<18:00, 2.02it/s] 80%|███████▉ | 8503/10682 [1:23:49<17:58, 2.02it/s] 80%|███████▉ | 8504/10682 [1:23:50<17:57, 2.02it/s] 80%|███████▉ | 8505/10682 [1:23:50<17:57, 2.02it/s] 80%|███████▉ | 8506/10682 [1:23:51<17:57, 2.02it/s] 80%|███████▉ | 8507/10682 [1:23:51<17:57, 2.02it/s] 80%|███████▉ | 8508/10682 [1:23:52<17:57, 2.02it/s] 80%|███████▉ | 8509/10682 [1:23:52<17:56, 2.02it/s] 80%|███████▉ | 8510/10682 [1:23:53<17:57, 2.02it/s] 80%|███████▉ | 8511/10682 [1:23:53<17:56, 2.02it/s] 80%|███████▉ | 8512/10682 [1:23:54<17:54, 2.02it/s] 80%|███████▉ | 8513/10682 [1:23:54<17:54, 2.02it/s] 80%|███████▉ | 8514/10682 [1:23:55<17:53, 2.02it/s] 80%|███████▉ | 8515/10682 [1:23:55<17:53, 2.02it/s] 80%|███████▉ | 8516/10682 [1:23:56<17:52, 2.02it/s] 80%|███████▉ | 8517/10682 [1:23:56<17:51, 2.02it/s] 80%|███████▉ | 8518/10682 [1:23:57<17:51, 2.02it/s] 80%|███████▉ | 8519/10682 [1:23:57<17:51, 2.02it/s] 80%|███████▉ | 8520/10682 [1:23:58<17:50, 2.02it/s] 80%|███████▉ | 8521/10682 [1:23:58<17:51, 2.02it/s] 80%|███████▉ | 8522/10682 [1:23:59<17:50, 2.02it/s] 80%|███████▉ | 8523/10682 [1:23:59<17:50, 2.02it/s] 80%|███████▉ | 8524/10682 [1:24:00<17:49, 2.02it/s] 80%|███████▉ | 8525/10682 [1:24:00<17:49, 2.02it/s] {'loss': 2.6859, 'grad_norm': 0.2521715760231018, 'learning_rate': 0.00011916895761319264, 'epoch': 11.17} + 80%|███████▉ | 8525/10682 [1:24:00<17:49, 2.02it/s] 80%|███████▉ | 8526/10682 [1:24:01<17:48, 2.02it/s] 80%|███████▉ | 8527/10682 [1:24:01<17:47, 2.02it/s] 80%|███████▉ | 8528/10682 [1:24:02<17:45, 2.02it/s] 80%|███████▉ | 8529/10682 [1:24:02<17:46, 2.02it/s] 80%|███████▉ | 8530/10682 [1:24:03<17:45, 2.02it/s] 80%|███████▉ | 8531/10682 [1:24:03<17:44, 2.02it/s] 80%|███████▉ | 8532/10682 [1:24:04<17:43, 2.02it/s] 80%|███████▉ | 8533/10682 [1:24:04<17:43, 2.02it/s] 80%|███████▉ | 8534/10682 [1:24:05<17:45, 2.02it/s] 80%|███████▉ | 8535/10682 [1:24:05<17:43, 2.02it/s] 80%|███████▉ | 8536/10682 [1:24:06<17:44, 2.02it/s] 80%|███████▉ | 8537/10682 [1:24:06<17:42, 2.02it/s] 80%|███████▉ | 8538/10682 [1:24:07<17:42, 2.02it/s] 80%|███████▉ | 8539/10682 [1:24:07<17:40, 2.02it/s] 80%|███████▉ | 8540/10682 [1:24:08<17:39, 2.02it/s] 80%|███████▉ | 8541/10682 [1:24:08<17:39, 2.02it/s] 80%|███████▉ | 8542/10682 [1:24:09<17:38, 2.02it/s] 80%|███████▉ | 8543/10682 [1:24:09<17:37, 2.02it/s] 80%|███████▉ | 8544/10682 [1:24:10<17:38, 2.02it/s] 80%|███████▉ | 8545/10682 [1:24:10<17:36, 2.02it/s] 80%|████████ | 8546/10682 [1:24:10<17:37, 2.02it/s] 80%|████████ | 8547/10682 [1:24:11<17:36, 2.02it/s] 80%|████████ | 8548/10682 [1:24:11<17:36, 2.02it/s] 80%|████████ | 8549/10682 [1:24:12<17:34, 2.02it/s] 80%|████████ | 8550/10682 [1:24:12<17:34, 2.02it/s] {'loss': 2.6865, 'grad_norm': 0.25585633516311646, 'learning_rate': 0.0001165346678316832, 'epoch': 11.21} + 80%|████████ | 8550/10682 [1:24:12<17:34, 2.02it/s] 80%|████████ | 8551/10682 [1:24:13<17:34, 2.02it/s] 80%|████████ | 8552/10682 [1:24:13<17:33, 2.02it/s] 80%|████████ | 8553/10682 [1:24:14<17:32, 2.02it/s] 80%|████████ | 8554/10682 [1:24:14<17:33, 2.02it/s] 80%|████████ | 8555/10682 [1:24:15<17:31, 2.02it/s] 80%|████████ | 8556/10682 [1:24:15<17:33, 2.02it/s] 80%|████████ | 8557/10682 [1:24:16<17:32, 2.02it/s] 80%|████████ | 8558/10682 [1:24:16<17:32, 2.02it/s] 80%|████████ | 8559/10682 [1:24:17<17:31, 2.02it/s] 80%|████████ | 8560/10682 [1:24:17<17:30, 2.02it/s] 80%|████████ | 8561/10682 [1:24:18<17:30, 2.02it/s] 80%|████████ | 8562/10682 [1:24:18<17:30, 2.02it/s] 80%|████████ | 8563/10682 [1:24:19<17:29, 2.02it/s] 80%|████████ | 8564/10682 [1:24:19<17:28, 2.02it/s] 80%|████████ | 8565/10682 [1:24:20<17:27, 2.02it/s] 80%|████████ | 8566/10682 [1:24:20<17:28, 2.02it/s] 80%|████████ | 8567/10682 [1:24:21<17:27, 2.02it/s] 80%|████████ | 8568/10682 [1:24:21<17:28, 2.02it/s] 80%|████████ | 8569/10682 [1:24:22<17:27, 2.02it/s] 80%|████████ | 8570/10682 [1:24:22<17:28, 2.01it/s] 80%|████████ | 8571/10682 [1:24:23<17:26, 2.02it/s] 80%|████████ | 8572/10682 [1:24:23<17:26, 2.02it/s] 80%|████████ | 8573/10682 [1:24:24<17:26, 2.02it/s] 80%|████████ | 8574/10682 [1:24:24<17:25, 2.02it/s] 80%|████████ | 8575/10682 [1:24:25<17:24, 2.02it/s] {'loss': 2.6856, 'grad_norm': 0.25255653262138367, 'learning_rate': 0.00011392597484393285, 'epoch': 11.24} + 80%|████████ | 8575/10682 [1:24:25<17:24, 2.02it/s] 80%|████████ | 8576/10682 [1:24:25<17:24, 2.02it/s] 80%|████████ | 8577/10682 [1:24:26<17:22, 2.02it/s] 80%|████████ | 8578/10682 [1:24:26<17:22, 2.02it/s] 80%|████████ | 8579/10682 [1:24:27<17:21, 2.02it/s] 80%|████████ | 8580/10682 [1:24:27<17:20, 2.02it/s] 80%|████████ | 8581/10682 [1:24:28<17:20, 2.02it/s] 80%|████████ | 8582/10682 [1:24:28<17:19, 2.02it/s] 80%|████████ | 8583/10682 [1:24:29<17:19, 2.02it/s] 80%|████████ | 8584/10682 [1:24:29<17:18, 2.02it/s] 80%|████████ | 8585/10682 [1:24:30<17:18, 2.02it/s] 80%|████████ | 8586/10682 [1:24:30<17:17, 2.02it/s] 80%|████████ | 8587/10682 [1:24:31<17:18, 2.02it/s] 80%|████████ | 8588/10682 [1:24:31<17:16, 2.02it/s] 80%|████████ | 8589/10682 [1:24:32<17:18, 2.02it/s] 80%|████████ | 8590/10682 [1:24:32<17:16, 2.02it/s] 80%|████████ | 8591/10682 [1:24:33<17:15, 2.02it/s] 80%|████████ | 8592/10682 [1:24:33<17:15, 2.02it/s] 80%|████████ | 8593/10682 [1:24:34<17:14, 2.02it/s] 80%|████████ | 8594/10682 [1:24:34<17:17, 2.01it/s] 80%|████████ | 8595/10682 [1:24:35<17:16, 2.01it/s] 80%|████████ | 8596/10682 [1:24:35<17:16, 2.01it/s] 80%|████████ | 8597/10682 [1:24:36<17:15, 2.01it/s] 80%|████████ | 8598/10682 [1:24:36<17:14, 2.02it/s] 80%|████████ | 8599/10682 [1:24:37<17:13, 2.02it/s] 81%|████████ | 8600/10682 [1:24:37<17:11, 2.02it/s]{'loss': 2.6888, 'grad_norm': 0.2578771114349365, 'learning_rate': 0.00011134305278348312, 'epoch': 11.27} + 81%|████████ | 8600/10682 [1:24:37<17:11, 2.02it/s] 81%|████████ | 8601/10682 [1:24:38<17:12, 2.02it/s] 81%|████████ | 8602/10682 [1:24:38<17:11, 2.02it/s] 81%|████████ | 8603/10682 [1:24:39<17:10, 2.02it/s] 81%|████████ | 8604/10682 [1:24:39<17:09, 2.02it/s] 81%|████████ | 8605/10682 [1:24:40<17:10, 2.02it/s] 81%|████████ | 8606/10682 [1:24:40<17:08, 2.02it/s] 81%|████████ | 8607/10682 [1:24:41<17:07, 2.02it/s] 81%|████████ | 8608/10682 [1:24:41<17:06, 2.02it/s] 81%|████████ | 8609/10682 [1:24:42<17:05, 2.02it/s] 81%|████████ | 8610/10682 [1:24:42<17:04, 2.02it/s] 81%|████████ | 8611/10682 [1:24:43<17:05, 2.02it/s] 81%|████████ | 8612/10682 [1:24:43<17:04, 2.02it/s] 81%|████████ | 8613/10682 [1:24:44<17:04, 2.02it/s] 81%|████████ | 8614/10682 [1:24:44<17:03, 2.02it/s] 81%|████████ | 8615/10682 [1:24:45<17:04, 2.02it/s] 81%|████████ | 8616/10682 [1:24:45<17:03, 2.02it/s] 81%|████████ | 8617/10682 [1:24:46<17:02, 2.02it/s] 81%|████████ | 8618/10682 [1:24:46<17:02, 2.02it/s] 81%|████████ | 8619/10682 [1:24:47<17:01, 2.02it/s] 81%|████████ | 8620/10682 [1:24:47<17:01, 2.02it/s] 81%|████████ | 8621/10682 [1:24:48<17:01, 2.02it/s] 81%|████████ | 8622/10682 [1:24:48<17:00, 2.02it/s] 81%|████████ | 8623/10682 [1:24:49<16:58, 2.02it/s] 81%|████████ | 8624/10682 [1:24:49<16:59, 2.02it/s] 81%|████████ | 8625/10682 [1:24:50<17:00, 2.02it/s] {'loss': 2.6909, 'grad_norm': 0.25223779678344727, 'learning_rate': 0.00010878607406363367, 'epoch': 11.3} + 81%|████████ | 8625/10682 [1:24:50<17:00, 2.02it/s] 81%|████████ | 8626/10682 [1:24:50<17:01, 2.01it/s] 81%|████████ | 8627/10682 [1:24:51<17:00, 2.01it/s] 81%|████████ | 8628/10682 [1:24:51<16:58, 2.02it/s] 81%|████████ | 8629/10682 [1:24:52<16:57, 2.02it/s] 81%|████████ | 8630/10682 [1:24:52<16:56, 2.02it/s] 81%|████████ | 8631/10682 [1:24:53<16:56, 2.02it/s] 81%|████████ | 8632/10682 [1:24:53<16:54, 2.02it/s] 81%|████████ | 8633/10682 [1:24:54<16:54, 2.02it/s] 81%|████████ | 8634/10682 [1:24:54<16:52, 2.02it/s] 81%|████████ | 8635/10682 [1:24:55<16:53, 2.02it/s] 81%|████████ | 8636/10682 [1:24:55<16:52, 2.02it/s] 81%|████████ | 8637/10682 [1:24:56<16:52, 2.02it/s] 81%|████████ | 8638/10682 [1:24:56<16:53, 2.02it/s] 81%|████████ | 8639/10682 [1:24:57<16:53, 2.02it/s] 81%|████████ | 8640/10682 [1:24:57<16:52, 2.02it/s] 81%|████████ | 8641/10682 [1:24:58<16:51, 2.02it/s] 81%|████████ | 8642/10682 [1:24:58<16:51, 2.02it/s] 81%|████████ | 8643/10682 [1:24:59<16:50, 2.02it/s] 81%|████████ | 8644/10682 [1:24:59<16:49, 2.02it/s] 81%|████████ | 8645/10682 [1:25:00<16:49, 2.02it/s] 81%|████████ | 8646/10682 [1:25:00<16:49, 2.02it/s] 81%|████████ | 8647/10682 [1:25:01<16:48, 2.02it/s] 81%|████████ | 8648/10682 [1:25:01<16:49, 2.02it/s] 81%|████████ | 8649/10682 [1:25:02<16:47, 2.02it/s] 81%|████████ | 8650/10682 [1:25:02<16:49, 2.01it/s] {'loss': 2.6978, 'grad_norm': 0.251982182264328, 'learning_rate': 0.00010625520936593375, 'epoch': 11.34} + 81%|████████ | 8650/10682 [1:25:02<16:49, 2.01it/s] 81%|████████ | 8651/10682 [1:25:03<16:48, 2.01it/s] 81%|████████ | 8652/10682 [1:25:03<16:47, 2.02it/s] 81%|████████ | 8653/10682 [1:25:04<16:46, 2.02it/s] 81%|████████ | 8654/10682 [1:25:04<16:45, 2.02it/s] 81%|████████ | 8655/10682 [1:25:05<16:59, 1.99it/s] 81%|████████ | 8656/10682 [1:25:05<16:53, 2.00it/s] 81%|████████ | 8657/10682 [1:25:06<16:50, 2.00it/s] 81%|████████ | 8658/10682 [1:25:06<16:46, 2.01it/s] 81%|████████ | 8659/10682 [1:25:07<16:45, 2.01it/s] 81%|████████ | 8660/10682 [1:25:07<16:44, 2.01it/s] 81%|████████ | 8661/10682 [1:25:07<16:43, 2.01it/s] 81%|████████ | 8662/10682 [1:25:08<16:40, 2.02it/s] 81%|████████ | 8663/10682 [1:25:08<16:40, 2.02it/s] 81%|████████ | 8664/10682 [1:25:09<16:39, 2.02it/s] 81%|████████ | 8665/10682 [1:25:09<16:38, 2.02it/s] 81%|████████ | 8666/10682 [1:25:10<16:38, 2.02it/s] 81%|████████ | 8667/10682 [1:25:10<16:38, 2.02it/s] 81%|████████ | 8668/10682 [1:25:11<16:38, 2.02it/s] 81%|████████ | 8669/10682 [1:25:11<16:37, 2.02it/s] 81%|████████ | 8670/10682 [1:25:12<16:36, 2.02it/s] 81%|████████ | 8671/10682 [1:25:12<16:35, 2.02it/s] 81%|████████ | 8672/10682 [1:25:13<16:36, 2.02it/s] 81%|████████ | 8673/10682 [1:25:13<16:35, 2.02it/s] 81%|████████ | 8674/10682 [1:25:14<16:35, 2.02it/s] 81%|████████ | 8675/10682 [1:25:14<16:34, 2.02it/s]{'loss': 2.6954, 'grad_norm': 0.25424081087112427, 'learning_rate': 0.0001037506276287885, 'epoch': 11.37} + 81%|████████ | 8675/10682 [1:25:14<16:34, 2.02it/s] 81%|████████ | 8676/10682 [1:25:15<16:35, 2.02it/s] 81%|██��█████ | 8677/10682 [1:25:15<16:34, 2.02it/s] 81%|████████ | 8678/10682 [1:25:16<16:33, 2.02it/s] 81%|████████ | 8679/10682 [1:25:16<16:32, 2.02it/s] 81%|████████▏ | 8680/10682 [1:25:17<16:31, 2.02it/s] 81%|████████▏ | 8681/10682 [1:25:17<16:30, 2.02it/s] 81%|████████▏ | 8682/10682 [1:25:18<16:30, 2.02it/s] 81%|████████▏ | 8683/10682 [1:25:18<16:29, 2.02it/s] 81%|████████▏ | 8684/10682 [1:25:19<16:29, 2.02it/s] 81%|████████▏ | 8685/10682 [1:25:19<16:29, 2.02it/s] 81%|████████▏ | 8686/10682 [1:25:20<16:27, 2.02it/s] 81%|████████▏ | 8687/10682 [1:25:20<16:28, 2.02it/s] 81%|████████▏ | 8688/10682 [1:25:21<16:27, 2.02it/s] 81%|████████▏ | 8689/10682 [1:25:21<16:27, 2.02it/s] 81%|████████▏ | 8690/10682 [1:25:22<16:26, 2.02it/s] 81%|████████▏ | 8691/10682 [1:25:22<16:27, 2.02it/s] 81%|████████▏ | 8692/10682 [1:25:23<16:25, 2.02it/s] 81%|████████▏ | 8693/10682 [1:25:23<16:25, 2.02it/s] 81%|████████▏ | 8694/10682 [1:25:24<16:23, 2.02it/s] 81%|████████▏ | 8695/10682 [1:25:24<16:24, 2.02it/s] 81%|████████▏ | 8696/10682 [1:25:25<16:23, 2.02it/s] 81%|████████▏ | 8697/10682 [1:25:25<16:23, 2.02it/s] 81%|████████▏ | 8698/10682 [1:25:26<16:22, 2.02it/s] 81%|████████▏ | 8699/10682 [1:25:26<16:21, 2.02it/s] 81%|████████▏ | 8700/10682 [1:25:27<16:20, 2.02it/s] {'loss': 2.6988, 'grad_norm': 0.2529226243495941, 'learning_rate': 0.0001012724960361826, 'epoch': 11.4} + 81%|████████▏ | 8700/10682 [1:25:27<16:20, 2.02it/s] 81%|████████▏ | 8701/10682 [1:25:27<16:21, 2.02it/s] 81%|████████▏ | 8702/10682 [1:25:28<16:20, 2.02it/s] 81%|████████▏ | 8703/10682 [1:25:28<16:18, 2.02it/s] 81%|████████▏ | 8704/10682 [1:25:29<16:18, 2.02it/s] 81%|████████▏ | 8705/10682 [1:25:29<16:17, 2.02it/s] 82%|████████▏ | 8706/10682 [1:25:30<16:20, 2.02it/s] 82%|████████▏ | 8707/10682 [1:25:30<16:18, 2.02it/s] 82%|████████▏ | 8708/10682 [1:25:31<16:18, 2.02it/s] 82%|████████▏ | 8709/10682 [1:25:31<16:16, 2.02it/s] 82%|████████▏ | 8710/10682 [1:25:32<16:17, 2.02it/s] 82%|████████▏ | 8711/10682 [1:25:32<16:15, 2.02it/s] 82%|████████▏ | 8712/10682 [1:25:33<16:15, 2.02it/s] 82%|████████▏ | 8713/10682 [1:25:33<16:15, 2.02it/s] 82%|████████▏ | 8714/10682 [1:25:34<16:14, 2.02it/s] 82%|████████▏ | 8715/10682 [1:25:34<16:15, 2.02it/s] 82%|████████▏ | 8716/10682 [1:25:35<16:14, 2.02it/s] 82%|████████▏ | 8717/10682 [1:25:35<16:13, 2.02it/s] 82%|████████▏ | 8718/10682 [1:25:36<16:12, 2.02it/s] 82%|████████▏ | 8719/10682 [1:25:36<16:12, 2.02it/s] 82%|████████▏ | 8720/10682 [1:25:37<16:10, 2.02it/s] 82%|████████▏ | 8721/10682 [1:25:37<16:11, 2.02it/s] 82%|████████▏ | 8722/10682 [1:25:38<16:10, 2.02it/s] 82%|████████▏ | 8723/10682 [1:25:38<16:11, 2.02it/s] 82%|████████▏ | 8724/10682 [1:25:39<16:10, 2.02it/s] 82%|████████▏ | 8725/10682 [1:25:39<16:10, 2.02it/s] {'loss': 2.7008, 'grad_norm': 0.2554323971271515, 'learning_rate': 9.882098000652034e-05, 'epoch': 11.44} + 82%|████████▏ | 8725/10682 [1:25:39<16:10, 2.02it/s] 82%|████████▏ | 8726/10682 [1:25:40<16:12, 2.01it/s] 82%|████████▏ | 8727/10682 [1:25:40<16:10, 2.02it/s] 82%|████████▏ | 8728/10682 [1:25:41<16:09, 2.01it/s] 82%|████████▏ | 8729/10682 [1:25:41<16:08, 2.02it/s] 82%|████████▏ | 8730/10682 [1:25:42<16:08, 2.02it/s] 82%|████████▏ | 8731/10682 [1:25:42<16:09, 2.01it/s] 82%|████████▏ | 8732/10682 [1:25:43<16:07, 2.02it/s] 82%|████████▏ | 8733/10682 [1:25:43<16:06, 2.02it/s] 82%|████████▏ | 8734/10682 [1:25:44<16:04, 2.02it/s] 82%|████████▏ | 8735/10682 [1:25:44<16:05, 2.02it/s] 82%|████████▏ | 8736/10682 [1:25:45<16:04, 2.02it/s] 82%|████████▏ | 8737/10682 [1:25:45<16:04, 2.02it/s] 82%|████████▏ | 8738/10682 [1:25:46<16:02, 2.02it/s] 82%|████████▏ | 8739/10682 [1:25:46<16:02, 2.02it/s] 82%|████████▏ | 8740/10682 [1:25:47<16:01, 2.02it/s] 82%|████████▏ | 8741/10682 [1:25:47<16:00, 2.02it/s] 82%|████████▏ | 8742/10682 [1:25:48<16:00, 2.02it/s] 82%|████████▏ | 8743/10682 [1:25:48<15:59, 2.02it/s] 82%|████████▏ | 8744/10682 [1:25:49<15:59, 2.02it/s] 82%|████████▏ | 8745/10682 [1:25:49<15:58, 2.02it/s] 82%|████████▏ | 8746/10682 [1:25:50<15:58, 2.02it/s] 82%|████████▏ | 8747/10682 [1:25:50<15:57, 2.02it/s] 82%|████████▏ | 8748/10682 [1:25:51<15:57, 2.02it/s] 82%|████████▏ | 8749/10682 [1:25:51<15:56, 2.02it/s] 82%|████████▏ | 8750/10682 [1:25:52<15:55, 2.02it/s] {'loss': 2.6909, 'grad_norm': 0.25148528814315796, 'learning_rate': 9.639624318158335e-05, 'epoch': 11.47} + 82%|████████▏ | 8750/10682 [1:25:52<15:55, 2.02it/s] 82%|████████▏ | 8751/10682 [1:25:52<15:55, 2.02it/s] 82%|████████▏ | 8752/10682 [1:25:53<15:56, 2.02it/s] 82%|████████▏ | 8753/10682 [1:25:53<15:54, 2.02it/s] 82%|████████▏ | 8754/10682 [1:25:54<15:55, 2.02it/s] 82%|████████▏ | 8755/10682 [1:25:54<15:54, 2.02it/s] 82%|████████▏ | 8756/10682 [1:25:55<15:53, 2.02it/s] 82%|████████▏ | 8757/10682 [1:25:55<15:54, 2.02it/s] 82%|████████▏ | 8758/10682 [1:25:56<15:52, 2.02it/s] 82%|████████▏ | 8759/10682 [1:25:56<15:51, 2.02it/s] 82%|████████▏ | 8760/10682 [1:25:57<15:50, 2.02it/s] 82%|████████▏ | 8761/10682 [1:25:57<15:49, 2.02it/s] 82%|████████▏ | 8762/10682 [1:25:58<15:49, 2.02it/s] 82%|████████▏ | 8763/10682 [1:25:58<15:48, 2.02it/s] 82%|████████▏ | 8764/10682 [1:25:59<15:48, 2.02it/s] 82%|████████▏ | 8765/10682 [1:25:59<15:48, 2.02it/s] 82%|████████▏ | 8766/10682 [1:26:00<15:47, 2.02it/s] 82%|████████▏ | 8767/10682 [1:26:00<15:46, 2.02it/s] 82%|████████▏ | 8768/10682 [1:26:00<15:47, 2.02it/s] 82%|████████▏ | 8769/10682 [1:26:01<15:47, 2.02it/s] 82%|████████▏ | 8770/10682 [1:26:01<15:46, 2.02it/s] 82%|████████▏ | 8771/10682 [1:26:02<15:46, 2.02it/s] 82%|████████▏ | 8772/10682 [1:26:02<15:45, 2.02it/s] 82%|████████▏ | 8773/10682 [1:26:03<15:44, 2.02it/s] 82%|████████▏ | 8774/10682 [1:26:03<15:44, 2.02it/s] 82%|████████▏ | 8775/10682 [1:26:04<15:45, 2.02it/s] {'loss': 2.6974, 'grad_norm': 0.2538389563560486, 'learning_rate': 9.399844741560781e-05, 'epoch': 11.5} + 82%|████████▏ | 8775/10682 [1:26:04<15:45, 2.02it/s] 82%|████████▏ | 8776/10682 [1:26:04<15:46, 2.01it/s] 82%|████████▏ | 8777/10682 [1:26:05<15:45, 2.01it/s] 82%|████████▏ | 8778/10682 [1:26:05<15:44, 2.02it/s] 82%|████████▏ | 8779/10682 [1:26:06<15:43, 2.02it/s] 82%|████████▏ | 8780/10682 [1:26:06<15:42, 2.02it/s] 82%|████████▏ | 8781/10682 [1:26:07<15:42, 2.02it/s] 82%|████████▏ | 8782/10682 [1:26:07<15:42, 2.02it/s] 82%|████████▏ | 8783/10682 [1:26:08<15:41, 2.02it/s] 82%|████████▏ | 8784/10682 [1:26:08<15:40, 2.02it/s] 82%|████████▏ | 8785/10682 [1:26:09<15:39, 2.02it/s] 82%|████████▏ | 8786/10682 [1:26:09<15:38, 2.02it/s] 82%|████████▏ | 8787/10682 [1:26:10<15:37, 2.02it/s] 82%|████████▏ | 8788/10682 [1:26:10<15:38, 2.02it/s] 82%|████████▏ | 8789/10682 [1:26:11<15:37, 2.02it/s] 82%|████████▏ | 8790/10682 [1:26:11<15:39, 2.01it/s] 82%|████████▏ | 8791/10682 [1:26:12<15:37, 2.02it/s] 82%|████████▏ | 8792/10682 [1:26:12<15:38, 2.01it/s] 82%|████████▏ | 8793/10682 [1:26:13<15:36, 2.02it/s] 82%|████████▏ | 8794/10682 [1:26:13<15:36, 2.02it/s] 82%|████████▏ | 8795/10682 [1:26:14<15:35, 2.02it/s] 82%|████████▏ | 8796/10682 [1:26:14<15:35, 2.02it/s] 82%|████████▏ | 8797/10682 [1:26:15<15:35, 2.01it/s] 82%|████████▏ | 8798/10682 [1:26:15<15:34, 2.02it/s] 82%|████████▏ | 8799/10682 [1:26:16<15:33, 2.02it/s] 82%|████████▏ | 8800/10682 [1:26:16<15:33, 2.02it/s]{'loss': 2.6965, 'grad_norm': 0.2544967532157898, 'learning_rate': 9.162775276448015e-05, 'epoch': 11.53} + 82%|████████▏ | 8800/10682 [1:26:16<15:33, 2.02it/s] 82%|███████��▏ | 8801/10682 [1:26:17<15:34, 2.01it/s] 82%|████████▏ | 8802/10682 [1:26:17<15:32, 2.02it/s] 82%|████████▏ | 8803/10682 [1:26:18<15:31, 2.02it/s] 82%|████████▏ | 8804/10682 [1:26:18<15:29, 2.02it/s] 82%|████████▏ | 8805/10682 [1:26:19<15:30, 2.02it/s] 82%|████████▏ | 8806/10682 [1:26:19<15:28, 2.02it/s] 82%|████████▏ | 8807/10682 [1:26:20<15:28, 2.02it/s] 82%|████████▏ | 8808/10682 [1:26:20<15:28, 2.02it/s] 82%|████████▏ | 8809/10682 [1:26:21<15:27, 2.02it/s] 82%|████████▏ | 8810/10682 [1:26:21<15:26, 2.02it/s] 82%|████████▏ | 8811/10682 [1:26:22<15:26, 2.02it/s] 82%|████████▏ | 8812/10682 [1:26:22<15:28, 2.01it/s] 83%|████████▎ | 8813/10682 [1:26:23<15:27, 2.02it/s] 83%|████████▎ | 8814/10682 [1:26:23<15:26, 2.02it/s] 83%|████████▎ | 8815/10682 [1:26:24<15:25, 2.02it/s] 83%|████████▎ | 8816/10682 [1:26:24<15:24, 2.02it/s] 83%|████████▎ | 8817/10682 [1:26:25<15:24, 2.02it/s] 83%|████████▎ | 8818/10682 [1:26:25<15:23, 2.02it/s] 83%|████████▎ | 8819/10682 [1:26:26<15:22, 2.02it/s] 83%|████████▎ | 8820/10682 [1:26:26<15:22, 2.02it/s] 83%|████████▎ | 8821/10682 [1:26:27<15:21, 2.02it/s] 83%|████████▎ | 8822/10682 [1:26:27<15:21, 2.02it/s] 83%|████████▎ | 8823/10682 [1:26:28<16:43, 1.85it/s] 83%|████████▎ | 8824/10682 [1:26:28<16:17, 1.90it/s] 83%|████████▎ | 8825/10682 [1:26:29<16:00, 1.93it/s] {'loss': 2.6901, 'grad_norm': 0.25129666924476624, 'learning_rate': 8.928431747505355e-05, 'epoch': 11.57} + 83%|████████▎ | 8825/10682 [1:26:29<16:00, 1.93it/s] 83%|████████▎ | 8826/10682 [1:26:29<15:48, 1.96it/s] 83%|████████▎ | 8827/10682 [1:26:30<15:38, 1.98it/s] 83%|████████▎ | 8828/10682 [1:26:30<15:31, 1.99it/s] 83%|████████▎ | 8829/10682 [1:26:31<15:26, 2.00it/s] 83%|████████▎ | 8830/10682 [1:26:31<15:22, 2.01it/s] 83%|████████▎ | 8831/10682 [1:26:32<15:22, 2.01it/s] 83%|████████▎ | 8832/10682 [1:26:32<15:20, 2.01it/s] 83%|████████▎ | 8833/10682 [1:26:33<15:18, 2.01it/s] 83%|████████▎ | 8834/10682 [1:26:33<15:16, 2.02it/s] 83%|████████▎ | 8835/10682 [1:26:34<15:16, 2.01it/s] 83%|████████▎ | 8836/10682 [1:26:34<15:15, 2.02it/s] 83%|████████▎ | 8837/10682 [1:26:35<15:13, 2.02it/s] 83%|████████▎ | 8838/10682 [1:26:35<15:12, 2.02it/s] 83%|████████▎ | 8839/10682 [1:26:36<15:12, 2.02it/s] 83%|████████▎ | 8840/10682 [1:26:36<15:12, 2.02it/s] 83%|████████▎ | 8841/10682 [1:26:37<15:10, 2.02it/s] 83%|████████▎ | 8842/10682 [1:26:37<15:10, 2.02it/s] 83%|████████▎ | 8843/10682 [1:26:38<15:09, 2.02it/s] 83%|████████▎ | 8844/10682 [1:26:38<15:10, 2.02it/s] 83%|████████▎ | 8845/10682 [1:26:39<15:09, 2.02it/s] 83%|████████▎ | 8846/10682 [1:26:39<15:09, 2.02it/s] 83%|████████▎ | 8847/10682 [1:26:40<15:07, 2.02it/s] 83%|████████▎ | 8848/10682 [1:26:40<15:07, 2.02it/s] 83%|████████▎ | 8849/10682 [1:26:41<15:06, 2.02it/s] 83%|████████▎ | 8850/10682 [1:26:41<15:05, 2.02it/s]{'loss': 2.6973, 'grad_norm': 0.2536369860172272, 'learning_rate': 8.6968297974584e-05, 'epoch': 11.6} + 83%|████████▎ | 8850/10682 [1:26:41<15:05, 2.02it/s] 83%|████████▎ | 8851/10682 [1:26:42<15:06, 2.02it/s] 83%|████████▎ | 8852/10682 [1:26:42<15:06, 2.02it/s] 83%|████████▎ | 8853/10682 [1:26:43<15:04, 2.02it/s] 83%|████████▎ | 8854/10682 [1:26:43<15:04, 2.02it/s] 83%|████████▎ | 8855/10682 [1:26:44<15:02, 2.02it/s] 83%|████████▎ | 8856/10682 [1:26:44<15:02, 2.02it/s] 83%|████████▎ | 8857/10682 [1:26:45<15:02, 2.02it/s] 83%|████████▎ | 8858/10682 [1:26:45<15:03, 2.02it/s] 83%|████████▎ | 8859/10682 [1:26:46<15:04, 2.02it/s] 83%|████████▎ | 8860/10682 [1:26:46<15:02, 2.02it/s] 83%|████████▎ | 8861/10682 [1:26:47<15:02, 2.02it/s] 83%|████████▎ | 8862/10682 [1:26:47<15:00, 2.02it/s] 83%|████████▎ | 8863/10682 [1:26:48<15:01, 2.02it/s] 83%|████████▎ | 8864/10682 [1:26:48<14:59, 2.02it/s] 83%|���███████▎ | 8865/10682 [1:26:49<14:59, 2.02it/s] 83%|████████▎ | 8866/10682 [1:26:49<14:58, 2.02it/s] 83%|████████▎ | 8867/10682 [1:26:50<14:58, 2.02it/s] 83%|████████▎ | 8868/10682 [1:26:50<14:56, 2.02it/s] 83%|████████▎ | 8869/10682 [1:26:51<14:56, 2.02it/s] 83%|████████▎ | 8870/10682 [1:26:51<14:56, 2.02it/s] 83%|████████▎ | 8871/10682 [1:26:52<14:56, 2.02it/s] 83%|████████▎ | 8872/10682 [1:26:52<14:54, 2.02it/s] 83%|████████▎ | 8873/10682 [1:26:53<14:56, 2.02it/s] 83%|████████▎ | 8874/10682 [1:26:53<14:54, 2.02it/s] 83%|████████▎ | 8875/10682 [1:26:54<14:55, 2.02it/s] {'loss': 2.7137, 'grad_norm': 0.25195086002349854, 'learning_rate': 8.467984886028967e-05, 'epoch': 11.63} + 83%|████████▎ | 8875/10682 [1:26:54<14:55, 2.02it/s] 83%|████████▎ | 8876/10682 [1:26:54<14:55, 2.02it/s] 83%|████████▎ | 8877/10682 [1:26:55<14:55, 2.02it/s] 83%|████████▎ | 8878/10682 [1:26:55<14:52, 2.02it/s] 83%|████████▎ | 8879/10682 [1:26:56<14:52, 2.02it/s] 83%|████████▎ | 8880/10682 [1:26:56<14:51, 2.02it/s] 83%|████████▎ | 8881/10682 [1:26:57<14:51, 2.02it/s] 83%|████████▎ | 8882/10682 [1:26:57<14:50, 2.02it/s] 83%|████████▎ | 8883/10682 [1:26:58<14:50, 2.02it/s] 83%|████████▎ | 8884/10682 [1:26:58<14:48, 2.02it/s] 83%|████████▎ | 8885/10682 [1:26:59<14:49, 2.02it/s] 83%|████████▎ | 8886/10682 [1:26:59<14:48, 2.02it/s] 83%|████████▎ | 8887/10682 [1:27:00<14:48, 2.02it/s] 83%|████████▎ | 8888/10682 [1:27:00<14:47, 2.02it/s] 83%|████████▎ | 8889/10682 [1:27:01<14:47, 2.02it/s] 83%|████████▎ | 8890/10682 [1:27:01<14:46, 2.02it/s] 83%|████████▎ | 8891/10682 [1:27:02<14:46, 2.02it/s] 83%|████████▎ | 8892/10682 [1:27:02<14:45, 2.02it/s] 83%|████████▎ | 8893/10682 [1:27:03<14:44, 2.02it/s] 83%|████████▎ | 8894/10682 [1:27:03<14:43, 2.02it/s] 83%|████████▎ | 8895/10682 [1:27:04<14:43, 2.02it/s] 83%|████████▎ | 8896/10682 [1:27:04<14:42, 2.02it/s] 83%|████████▎ | 8897/10682 [1:27:05<14:54, 2.00it/s] 83%|████████▎ | 8898/10682 [1:27:05<14:50, 2.00it/s] 83%|████████▎ | 8899/10682 [1:27:06<14:47, 2.01it/s] 83%|████████▎ | 8900/10682 [1:27:06<14:45, 2.01it/s] {'loss': 2.7131, 'grad_norm': 0.2521492540836334, 'learning_rate': 8.24191228890303e-05, 'epoch': 11.66} + 83%|████████▎ | 8900/10682 [1:27:06<14:45, 2.01it/s] 83%|████████▎ | 8901/10682 [1:27:07<14:44, 2.01it/s] 83%|████████▎ | 8902/10682 [1:27:07<14:44, 2.01it/s] 83%|████████▎ | 8903/10682 [1:27:08<14:42, 2.02it/s] 83%|████████▎ | 8904/10682 [1:27:08<14:42, 2.01it/s] 83%|████████▎ | 8905/10682 [1:27:09<14:42, 2.01it/s] 83%|████████▎ | 8906/10682 [1:27:09<14:41, 2.02it/s] 83%|████████▎ | 8907/10682 [1:27:10<14:40, 2.02it/s] 83%|████████▎ | 8908/10682 [1:27:10<14:39, 2.02it/s] 83%|████████▎ | 8909/10682 [1:27:10<14:38, 2.02it/s] 83%|████████▎ | 8910/10682 [1:27:11<14:37, 2.02it/s] 83%|████████▎ | 8911/10682 [1:27:11<14:38, 2.02it/s] 83%|████████▎ | 8912/10682 [1:27:12<14:36, 2.02it/s] 83%|████████▎ | 8913/10682 [1:27:12<14:36, 2.02it/s] 83%|████████▎ | 8914/10682 [1:27:13<14:35, 2.02it/s] 83%|████████▎ | 8915/10682 [1:27:13<14:36, 2.02it/s] 83%|████████▎ | 8916/10682 [1:27:14<14:34, 2.02it/s] 83%|████████▎ | 8917/10682 [1:27:14<14:36, 2.01it/s] 83%|████████▎ | 8918/10682 [1:27:15<14:33, 2.02it/s] 83%|████████▎ | 8919/10682 [1:27:15<14:33, 2.02it/s] 84%|████████▎ | 8920/10682 [1:27:16<14:32, 2.02it/s] 84%|████████▎ | 8921/10682 [1:27:16<14:32, 2.02it/s] 84%|████████▎ | 8922/10682 [1:27:17<14:31, 2.02it/s] 84%|████████▎ | 8923/10682 [1:27:17<14:31, 2.02it/s] 84%|████████▎ | 8924/10682 [1:27:18<14:30, 2.02it/s] 84%|████████▎ | 8925/10682 [1:27:18<14:30, 2.02it/s] {'loss': 2.7041, 'grad_norm': 0.254290372133255, 'learning_rate': 8.018627096711106e-05, 'epoch': 11.7} + 84%|████████▎ | 8925/10682 [1:27:18<14:30, 2.02it/s] 84%|████████▎ | 8926/10682 [1:27:19<14:30, 2.02it/s] 84%|████████▎ | 8927/10682 [1:27:19<14:29, 2.02it/s] 84%|████████▎ | 8928/10682 [1:27:20<14:28, 2.02it/s] 84%|████████▎ | 8929/10682 [1:27:20<14:28, 2.02it/s] 84%|████████▎ | 8930/10682 [1:27:21<14:28, 2.02it/s] 84%|████████▎ | 8931/10682 [1:27:21<14:27, 2.02it/s] 84%|████████▎ | 8932/10682 [1:27:22<15:40, 1.86it/s] 84%|████████▎ | 8933/10682 [1:27:23<15:18, 1.90it/s] 84%|████████▎ | 8934/10682 [1:27:23<15:02, 1.94it/s] 84%|████████▎ | 8935/10682 [1:27:24<14:51, 1.96it/s] 84%|████████▎ | 8936/10682 [1:27:24<14:42, 1.98it/s] 84%|████████▎ | 8937/10682 [1:27:25<14:35, 1.99it/s] 84%|████████▎ | 8938/10682 [1:27:25<14:31, 2.00it/s] 84%|████████▎ | 8939/10682 [1:27:25<14:27, 2.01it/s] 84%|████████▎ | 8940/10682 [1:27:26<14:26, 2.01it/s] 84%|████████▎ | 8941/10682 [1:27:26<14:24, 2.01it/s] 84%|████████▎ | 8942/10682 [1:27:27<14:24, 2.01it/s] 84%|████████▎ | 8943/10682 [1:27:27<14:22, 2.02it/s] 84%|████████▎ | 8944/10682 [1:27:28<14:21, 2.02it/s] 84%|████████▎ | 8945/10682 [1:27:28<14:21, 2.02it/s] 84%|████████▎ | 8946/10682 [1:27:29<14:21, 2.02it/s] 84%|████████▍ | 8947/10682 [1:27:29<14:20, 2.02it/s] 84%|████████▍ | 8948/10682 [1:27:30<14:20, 2.02it/s] 84%|████████▍ | 8949/10682 [1:27:30<14:19, 2.02it/s] 84%|████████▍ | 8950/10682 [1:27:31<14:18, 2.02it/s]{'loss': 2.7036, 'grad_norm': 0.2529490888118744, 'learning_rate': 7.798144214020909e-05, 'epoch': 11.73} + 84%|████████▍ | 8950/10682 [1:27:31<14:18, 2.02it/s] 84%|████████▍ | 8951/10682 [1:27:31<14:19, 2.01it/s] 84%|████████▍ | 8952/10682 [1:27:32<14:18, 2.01it/s] 84%|████████▍ | 8953/10682 [1:27:32<14:17, 2.02it/s] 84%|████████▍ | 8954/10682 [1:27:33<14:15, 2.02it/s] 84%|████████▍ | 8955/10682 [1:27:33<14:16, 2.02it/s] 84%|████████▍ | 8956/10682 [1:27:34<14:15, 2.02it/s] 84%|████████▍ | 8957/10682 [1:27:34<14:16, 2.02it/s] 84%|████████▍ | 8958/10682 [1:27:35<14:15, 2.02it/s] 84%|████████▍ | 8959/10682 [1:27:35<14:13, 2.02it/s] 84%|████████▍ | 8960/10682 [1:27:36<14:13, 2.02it/s] 84%|████████▍ | 8961/10682 [1:27:36<14:13, 2.02it/s] 84%|████████▍ | 8962/10682 [1:27:37<14:14, 2.01it/s] 84%|████████▍ | 8963/10682 [1:27:37<14:13, 2.01it/s] 84%|████████▍ | 8964/10682 [1:27:38<14:12, 2.01it/s] 84%|████████▍ | 8965/10682 [1:27:38<14:10, 2.02it/s] 84%|████████▍ | 8966/10682 [1:27:39<14:11, 2.02it/s] 84%|████████▍ | 8967/10682 [1:27:39<14:08, 2.02it/s] 84%|████████▍ | 8968/10682 [1:27:40<14:08, 2.02it/s] 84%|████████▍ | 8969/10682 [1:27:40<14:07, 2.02it/s] 84%|████████▍ | 8970/10682 [1:27:41<14:07, 2.02it/s] 84%|████████▍ | 8971/10682 [1:27:41<14:06, 2.02it/s] 84%|████████▍ | 8972/10682 [1:27:42<14:05, 2.02it/s] 84%|████████▍ | 8973/10682 [1:27:42<14:06, 2.02it/s] 84%|████████▍ | 8974/10682 [1:27:43<14:04, 2.02it/s] 84%|████████▍ | 8975/10682 [1:27:43<14:04, 2.02it/s]{'loss': 2.709, 'grad_norm': 0.25795498490333557, 'learning_rate': 7.58047835834249e-05, 'epoch': 11.76} + 84%|████████▍ | 8975/10682 [1:27:43<14:04, 2.02it/s] 84%|████████▍ | 8976/10682 [1:27:44<14:04, 2.02it/s] 84%|████████▍ | 8977/10682 [1:27:44<14:05, 2.02it/s] 84%|████████▍ | 8978/10682 [1:27:45<14:03, 2.02it/s] 84%|████████▍ | 8979/10682 [1:27:45<14:03, 2.02it/s] 84%|████████▍ | 8980/10682 [1:27:46<14:02, 2.02it/s] 84%|████████▍ | 8981/10682 [1:27:46<14:02, 2.02it/s] 84%|████████▍ | 8982/10682 [1:27:47<14:01, 2.02it/s] 84%|████████▍ | 8983/10682 [1:27:47<14:02, 2.02it/s] 84%|████████▍ | 8984/10682 [1:27:48<14:00, 2.02it/s] 84%|████████▍ | 8985/10682 [1:27:48<14:02, 2.01it/s] 84%|████████▍ | 8986/10682 [1:27:49<14:00, 2.02it/s] 84%|████████▍ | 8987/10682 [1:27:49<13:59, 2.02it/s] 84%|████████▍ | 8988/10682 [1:27:50<13:59, 2.02it/s] 84%|████████▍ | 8989/10682 [1:27:50<13:58, 2.02it/s] 84%|████████▍ | 8990/10682 [1:27:51<13:57, 2.02it/s] 84%|████████▍ | 8991/10682 [1:27:51<13:57, 2.02it/s] 84%|████████▍ | 8992/10682 [1:27:52<13:57, 2.02it/s] 84%|████████▍ | 8993/10682 [1:27:52<13:56, 2.02it/s] 84%|████████▍ | 8994/10682 [1:27:53<13:57, 2.01it/s] 84%|████████▍ | 8995/10682 [1:27:53<13:56, 2.02it/s] 84%|████████▍ | 8996/10682 [1:27:54<13:55, 2.02it/s] 84%|████████▍ | 8997/10682 [1:27:54<13:55, 2.02it/s] 84%|████████▍ | 8998/10682 [1:27:55<13:53, 2.02it/s] 84%|████████▍ | 8999/10682 [1:27:55<13:52, 2.02it/s] 84%|████████▍ | 9000/10682 [1:27:56<13:52, 2.02it/s] {'loss': 2.7046, 'grad_norm': 0.25507137179374695, 'learning_rate': 7.365644059145782e-05, 'epoch': 11.8} + 84%|████████▍ | 9000/10682 [1:27:56<13:52, 2.02it/s] 84%|████████▍ | 9001/10682 [1:27:56<13:53, 2.02it/s] 84%|████████▍ | 9002/10682 [1:27:57<13:51, 2.02it/s] 84%|████████▍ | 9003/10682 [1:27:57<13:52, 2.02it/s] 84%|████████▍ | 9004/10682 [1:27:58<13:51, 2.02it/s] 84%|████████▍ | 9005/10682 [1:27:58<13:51, 2.02it/s] 84%|████████▍ | 9006/10682 [1:27:59<13:50, 2.02it/s] 84%|████████▍ | 9007/10682 [1:27:59<13:50, 2.02it/s] 84%|████████▍ | 9008/10682 [1:28:00<13:50, 2.02it/s] 84%|████████▍ | 9009/10682 [1:28:00<13:48, 2.02it/s] 84%|████████▍ | 9010/10682 [1:28:01<13:47, 2.02it/s] 84%|████████▍ | 9011/10682 [1:28:01<13:47, 2.02it/s] 84%|████████▍ | 9012/10682 [1:28:02<13:46, 2.02it/s] 84%|████████▍ | 9013/10682 [1:28:02<13:46, 2.02it/s] 84%|████████▍ | 9014/10682 [1:28:03<13:46, 2.02it/s] 84%|████████▍ | 9015/10682 [1:28:03<13:45, 2.02it/s] 84%|████████▍ | 9016/10682 [1:28:04<13:45, 2.02it/s] 84%|████████▍ | 9017/10682 [1:28:04<13:44, 2.02it/s] 84%|████████▍ | 9018/10682 [1:28:05<13:45, 2.02it/s] 84%|████████▍ | 9019/10682 [1:28:05<13:43, 2.02it/s] 84%|████████▍ | 9020/10682 [1:28:06<13:44, 2.02it/s] 84%|████████▍ | 9021/10682 [1:28:06<13:43, 2.02it/s] 84%|████████▍ | 9022/10682 [1:28:07<13:43, 2.02it/s] 84%|████████▍ | 9023/10682 [1:28:07<13:42, 2.02it/s] 84%|████████▍ | 9024/10682 [1:28:08<13:41, 2.02it/s] 84%|████████▍ | 9025/10682 [1:28:08<13:42, 2.02it/s]{'loss': 2.7064, 'grad_norm': 0.25425198674201965, 'learning_rate': 7.153655656890773e-05, 'epoch': 11.83} + 84%|████████▍ | 9025/10682 [1:28:08<13:42, 2.02it/s] 84%|████████▍ | 9026/10682 [1:28:09<13:41, 2.02it/s] 85%|████████▍ | 9027/10682 [1:28:09<13:40, 2.02it/s] 85%|████████▍ | 9028/10682 [1:28:10<13:38, 2.02it/s] 85%|████████▍ | 9029/10682 [1:28:10<13:38, 2.02it/s] 85%|████████▍ | 9030/10682 [1:28:11<13:37, 2.02it/s] 85%|████████▍ | 9031/10682 [1:28:11<13:37, 2.02it/s] 85%|████████▍ | 9032/10682 [1:28:12<13:36, 2.02it/s] 85%|████████▍ | 9033/10682 [1:28:12<13:36, 2.02it/s] 85%|████████▍ | 9034/10682 [1:28:13<13:34, 2.02it/s] 85%|████████▍ | 9035/10682 [1:28:13<13:34, 2.02it/s] 85%|████████▍ | 9036/10682 [1:28:14<13:33, 2.02it/s] 85%|████████▍ | 9037/10682 [1:28:14<13:34, 2.02it/s] 85%|████████▍ | 9038/10682 [1:28:15<13:33, 2.02it/s] 85%|████████▍ | 9039/10682 [1:28:15<13:33, 2.02it/s] 85%|████████▍ | 9040/10682 [1:28:16<13:32, 2.02it/s] 85%|████████▍ | 9041/10682 [1:28:16<13:32, 2.02it/s] 85%|████████▍ | 9042/10682 [1:28:17<13:31, 2.02it/s] 85%|████████▍ | 9043/10682 [1:28:17<13:32, 2.02it/s] 85%|████████▍ | 9044/10682 [1:28:18<13:32, 2.02it/s] 85%|████████▍ | 9045/10682 [1:28:18<13:30, 2.02it/s] 85%|████████▍ | 9046/10682 [1:28:19<13:30, 2.02it/s] 85%|████████▍ | 9047/10682 [1:28:19<13:29, 2.02it/s] 85%|████████▍ | 9048/10682 [1:28:20<13:29, 2.02it/s] 85%|████████▍ | 9049/10682 [1:28:20<13:28, 2.02it/s] 85%|████████▍ | 9050/10682 [1:28:20<13:29, 2.02it/s]{'loss': 2.7188, 'grad_norm': 0.2525677978992462, 'learning_rate': 6.94452730207023e-05, 'epoch': 11.86} + 85%|████████▍ | 9050/10682 [1:28:20<13:29, 2.02it/s] 85%|████████▍ | 9051/10682 [1:28:21<13:28, 2.02it/s] 85%|████████▍ | 9052/10682 [1:28:21<13:28, 2.02it/s] 85%|████████▍ | 9053/10682 [1:28:22<13:27, 2.02it/s] 85%|████████▍ | 9054/10682 [1:28:22<13:28, 2.01it/s] 85%|████████▍ | 9055/10682 [1:28:23<13:26, 2.02it/s] 85%|████████▍ | 9056/10682 [1:28:23<13:26, 2.02it/s] 85%|████████▍ | 9057/10682 [1:28:24<13:24, 2.02it/s] 85%|████████▍ | 9058/10682 [1:28:24<13:24, 2.02it/s] 85%|████████▍ | 9059/10682 [1:28:25<13:22, 2.02it/s] 85%|████████▍ | 9060/10682 [1:28:25<13:22, 2.02it/s] 85%|████████▍ | 9061/10682 [1:28:26<13:22, 2.02it/s] 85%|████████▍ | 9062/10682 [1:28:26<13:22, 2.02it/s] 85%|████████▍ | 9063/10682 [1:28:27<13:21, 2.02it/s] 85%|████████▍ | 9064/10682 [1:28:27<13:21, 2.02it/s] 85%|████████▍ | 9065/10682 [1:28:28<13:20, 2.02it/s] 85%|████████▍ | 9066/10682 [1:28:28<13:20, 2.02it/s] 85%|████████▍ | 9067/10682 [1:28:29<13:19, 2.02it/s] 85%|████████▍ | 9068/10682 [1:28:29<13:19, 2.02it/s] 85%|████████▍ | 9069/10682 [1:28:30<13:19, 2.02it/s] 85%|████████▍ | 9070/10682 [1:28:30<13:18, 2.02it/s] 85%|████████▍ | 9071/10682 [1:28:31<13:18, 2.02it/s] 85%|████████▍ | 9072/10682 [1:28:31<13:17, 2.02it/s] 85%|████████▍ | 9073/10682 [1:28:32<13:16, 2.02it/s] 85%|████████▍ | 9074/10682 [1:28:32<13:15, 2.02it/s] 85%|████████▍ | 9075/10682 [1:28:33<13:15, 2.02it/s] {'loss': 2.7163, 'grad_norm': 0.2548724412918091, 'learning_rate': 6.738272954265156e-05, 'epoch': 11.89} + 85%|████████▍ | 9075/10682 [1:28:33<13:15, 2.02it/s] 85%|████████▍ | 9076/10682 [1:28:33<13:15, 2.02it/s] 85%|████████▍ | 9077/10682 [1:28:34<13:15, 2.02it/s] 85%|████████▍ | 9078/10682 [1:28:34<13:16, 2.01it/s] 85%|████████▍ | 9079/10682 [1:28:35<13:15, 2.02it/s] 85%|████████▌ | 9080/10682 [1:28:35<13:13, 2.02it/s] 85%|████████▌ | 9081/10682 [1:28:36<13:13, 2.02it/s] 85%|████████▌ | 9082/10682 [1:28:36<13:13, 2.02it/s] 85%|████████▌ | 9083/10682 [1:28:37<13:12, 2.02it/s] 85%|████████▌ | 9084/10682 [1:28:37<13:11, 2.02it/s] 85%|████████▌ | 9085/10682 [1:28:38<13:10, 2.02it/s] 85%|████████▌ | 9086/10682 [1:28:38<13:09, 2.02it/s] 85%|████████▌ | 9087/10682 [1:28:39<13:09, 2.02it/s] 85%|████████▌ | 9088/10682 [1:28:39<13:08, 2.02it/s] 85%|████████▌ | 9089/10682 [1:28:40<13:08, 2.02it/s] 85%|████████▌ | 9090/10682 [1:28:40<13:07, 2.02it/s] 85%|████████▌ | 9091/10682 [1:28:41<13:07, 2.02it/s] 85%|████████▌ | 9092/10682 [1:28:41<13:06, 2.02it/s] 85%|████████▌ | 9093/10682 [1:28:42<13:06, 2.02it/s] 85%|████████▌ | 9094/10682 [1:28:42<13:06, 2.02it/s] 85%|████████▌ | 9095/10682 [1:28:43<13:04, 2.02it/s] 85%|████████▌ | 9096/10682 [1:28:43<13:04, 2.02it/s] 85%|████████▌ | 9097/10682 [1:28:44<13:03, 2.02it/s] 85%|████████▌ | 9098/10682 [1:28:44<13:03, 2.02it/s] 85%|████████▌ | 9099/10682 [1:28:45<13:03, 2.02it/s] 85%|████████▌ | 9100/10682 [1:28:45<13:02, 2.02it/s]{'loss': 2.7077, 'grad_norm': 0.25056371092796326, 'learning_rate': 6.534906381212979e-05, 'epoch': 11.93} + 85%|████████▌ | 9100/10682 [1:28:45<13:02, 2.02it/s] 85%|████████▌ | 9101/10682 [1:28:46<13:04, 2.02it/s] 85%|████████▌ | 9102/10682 [1:28:46<13:02, 2.02it/s] 85%|████████▌ | 9103/10682 [1:28:47<13:01, 2.02it/s] 85%|████████▌ | 9104/10682 [1:28:47<13:00, 2.02it/s] 85%|████████▌ | 9105/10682 [1:28:48<13:00, 2.02it/s] 85%|████████▌ | 9106/10682 [1:28:48<12:59, 2.02it/s] 85%|████████▌ | 9107/10682 [1:28:49<13:00, 2.02it/s] 85%|████████▌ | 9108/10682 [1:28:49<12:58, 2.02it/s] 85%|████████▌ | 9109/10682 [1:28:50<12:58, 2.02it/s] 85%|████████▌ | 9110/10682 [1:28:50<12:57, 2.02it/s] 85%|████████▌ | 9111/10682 [1:28:51<12:56, 2.02it/s] 85%|████████▌ | 9112/10682 [1:28:51<12:57, 2.02it/s] 85%|██████���█▌ | 9113/10682 [1:28:52<12:55, 2.02it/s] 85%|████████▌ | 9114/10682 [1:28:52<12:56, 2.02it/s] 85%|████████▌ | 9115/10682 [1:28:53<12:57, 2.02it/s] 85%|████████▌ | 9116/10682 [1:28:53<12:56, 2.02it/s] 85%|████████▌ | 9117/10682 [1:28:54<12:54, 2.02it/s] 85%|████████▌ | 9118/10682 [1:28:54<12:54, 2.02it/s] 85%|████████▌ | 9119/10682 [1:28:55<12:54, 2.02it/s] 85%|████████▌ | 9120/10682 [1:28:55<12:53, 2.02it/s] 85%|████████▌ | 9121/10682 [1:28:56<12:52, 2.02it/s] 85%|████████▌ | 9122/10682 [1:28:56<12:52, 2.02it/s] 85%|████████▌ | 9123/10682 [1:28:57<12:51, 2.02it/s] 85%|████████▌ | 9124/10682 [1:28:57<12:51, 2.02it/s] 85%|████████▌ | 9125/10682 [1:28:58<12:50, 2.02it/s] {'loss': 2.7022, 'grad_norm': 0.25071099400520325, 'learning_rate': 6.334441157888504e-05, 'epoch': 11.96} + 85%|████████▌ | 9125/10682 [1:28:58<12:50, 2.02it/s] 85%|████████▌ | 9126/10682 [1:28:58<12:50, 2.02it/s] 85%|████████▌ | 9127/10682 [1:28:59<12:50, 2.02it/s] 85%|████████▌ | 9128/10682 [1:28:59<12:48, 2.02it/s] 85%|████████▌ | 9129/10682 [1:29:00<12:48, 2.02it/s] 85%|████████▌ | 9130/10682 [1:29:00<12:47, 2.02it/s] 85%|████████▌ | 9131/10682 [1:29:01<12:47, 2.02it/s] 85%|████████▌ | 9132/10682 [1:29:01<12:46, 2.02it/s] 85%|████████▌ | 9133/10682 [1:29:02<12:46, 2.02it/s] 86%|████████▌ | 9134/10682 [1:29:02<12:45, 2.02it/s] 86%|████████▌ | 9135/10682 [1:29:03<12:45, 2.02it/s] 86%|████████▌ | 9136/10682 [1:29:03<12:44, 2.02it/s] 86%|████████▌ | 9137/10682 [1:29:04<12:45, 2.02it/s] 86%|████████▌ | 9138/10682 [1:29:04<12:43, 2.02it/s] 86%|████████▌ | 9139/10682 [1:29:05<12:44, 2.02it/s] 86%|████████▌ | 9140/10682 [1:29:05<12:43, 2.02it/s] 86%|████████▌ | 9141/10682 [1:29:06<12:43, 2.02it/s] 86%|████████▌ | 9142/10682 [1:29:06<12:41, 2.02it/s] 86%|████████▌ | 9143/10682 [1:29:07<12:42, 2.02it/s] 86%|████████▌ | 9144/10682 [1:29:07<12:40, 2.02it/s] 86%|████████▌ | 9145/10682 [1:29:08<12:40, 2.02it/s] 86%|████████▌ | 9146/10682 [1:29:08<12:40, 2.02it/s] 86%|████████▌ | 9147/10682 [1:29:09<12:40, 2.02it/s] 86%|████████▌ | 9148/10682 [1:29:09<12:39, 2.02it/s] 86%|████████▌ | 9149/10682 [1:29:10<12:39, 2.02it/s] 86%|████████▌ | 9150/10682 [1:29:10<12:38, 2.02it/s]{'loss': 2.7139, 'grad_norm': 0.2529626786708832, 'learning_rate': 6.1368906655978e-05, 'epoch': 11.99} + 86%|████████▌ | 9150/10682 [1:29:10<12:38, 2.02it/s] 86%|████████▌ | 9151/10682 [1:29:11<12:38, 2.02it/s] 86%|████████▌ | 9152/10682 [1:29:11<12:38, 2.02it/s] 86%|████████▌ | 9153/10682 [1:29:11<12:36, 2.02it/s] 86%|████████▌ | 9154/10682 [1:29:12<12:36, 2.02it/s] 86%|████████▌ | 9155/10682 [1:29:12<12:36, 2.02it/s] 86%|████████▌ | 9156/10682 [1:29:13<12:28, 2.04it/s] 86%|████████▌ | 9157/10682 [1:29:58<5:54:28, 13.95s/it] 86%|████████▌ | 9158/10682 [1:29:59<4:11:48, 9.91s/it] 86%|████████▌ | 9159/10682 [1:29:59<2:59:54, 7.09s/it] 86%|████████▌ | 9160/10682 [1:30:00<2:09:42, 5.11s/it] 86%|████████▌ | 9161/10682 [1:30:00<1:34:29, 3.73s/it] 86%|████████▌ | 9162/10682 [1:30:01<1:09:52, 2.76s/it] 86%|████████▌ | 9163/10682 [1:30:01<52:38, 2.08s/it] 86%|████████▌ | 9164/10682 [1:30:02<40:34, 1.60s/it] 86%|████████▌ | 9165/10682 [1:30:02<32:08, 1.27s/it] 86%|████████▌ | 9166/10682 [1:30:03<26:14, 1.04s/it] 86%|████████▌ | 9167/10682 [1:30:03<22:06, 1.14it/s] 86%|████████▌ | 9168/10682 [1:30:04<19:12, 1.31it/s] 86%|████████▌ | 9169/10682 [1:30:04<17:16, 1.46it/s] 86%|████████▌ | 9170/10682 [1:30:05<15:49, 1.59it/s] 86%|████████▌ | 9171/10682 [1:30:05<14:49, 1.70it/s] 86%|████████▌ | 9172/10682 [1:30:06<14:05, 1.78it/s] 86%|████████▌ | 9173/10682 [1:30:06<13:36, 1.85it/s] 86%|████████▌ | 9174/10682 [1:30:07<13:15, 1.90it/s] 86%|████████▌ | 9175/10682 [1:30:07<13:00, 1.93it/s]{'loss': 2.6615, 'grad_norm': 0.25235533714294434, 'learning_rate': 5.94226809108499e-05, 'epoch': 12.02} + 86%|████████▌ | 9175/10682 [1:30:07<13:00, 1.93it/s] 86%|████████▌ | 9176/10682 [1:30:08<12:51, 1.95it/s] 86%|████████▌ | 9177/10682 [1:30:08<12:43, 1.97it/s] 86%|████████▌ | 9178/10682 [1:30:09<12:38, 1.98it/s] 86%|████████▌ | 9179/10682 [1:30:09<12:33, 1.99it/s] 86%|████████▌ | 9180/10682 [1:30:10<12:30, 2.00it/s] 86%|████████▌ | 9181/10682 [1:30:10<12:27, 2.01it/s] 86%|████████▌ | 9182/10682 [1:30:11<12:26, 2.01it/s] 86%|████████▌ | 9183/10682 [1:30:11<12:24, 2.01it/s] 86%|████████▌ | 9184/10682 [1:30:12<12:23, 2.01it/s] 86%|████████▌ | 9185/10682 [1:30:12<12:23, 2.01it/s] 86%|████████▌ | 9186/10682 [1:30:13<12:27, 2.00it/s] 86%|████████▌ | 9187/10682 [1:30:13<12:22, 2.01it/s] 86%|████████▌ | 9188/10682 [1:30:14<12:20, 2.02it/s] 86%|████████▌ | 9189/10682 [1:30:14<12:20, 2.02it/s] 86%|████████▌ | 9190/10682 [1:30:15<12:21, 2.01it/s] 86%|████████▌ | 9191/10682 [1:30:15<12:20, 2.01it/s] 86%|████████▌ | 9192/10682 [1:30:16<12:19, 2.02it/s] 86%|████████▌ | 9193/10682 [1:30:16<12:19, 2.01it/s] 86%|████████▌ | 9194/10682 [1:30:17<12:18, 2.01it/s] 86%|████████▌ | 9195/10682 [1:30:17<12:18, 2.01it/s] 86%|████████▌ | 9196/10682 [1:30:18<12:16, 2.02it/s] 86%|████████▌ | 9197/10682 [1:30:18<12:18, 2.01it/s] 86%|████████▌ | 9198/10682 [1:30:19<12:16, 2.02it/s] 86%|████████▌ | 9199/10682 [1:30:19<12:15, 2.02it/s] 86%|████████▌ | 9200/10682 [1:30:20<12:16, 2.01it/s] {'loss': 2.65, 'grad_norm': 0.254749059677124, 'learning_rate': 5.7505864256519716e-05, 'epoch': 12.06} + 86%|████████▌ | 9200/10682 [1:30:20<12:16, 2.01it/s] 86%|████████▌ | 9201/10682 [1:30:20<12:17, 2.01it/s] 86%|████████▌ | 9202/10682 [1:30:21<12:15, 2.01it/s] 86%|████████▌ | 9203/10682 [1:30:21<12:13, 2.02it/s] 86%|████████▌ | 9204/10682 [1:30:22<12:12, 2.02it/s] 86%|████████▌ | 9205/10682 [1:30:22<12:11, 2.02it/s] 86%|████████▌ | 9206/10682 [1:30:23<12:11, 2.02it/s] 86%|████████▌ | 9207/10682 [1:30:23<12:10, 2.02it/s] 86%|████████▌ | 9208/10682 [1:30:24<12:09, 2.02it/s] 86%|████████▌ | 9209/10682 [1:30:24<12:09, 2.02it/s] \ No newline at end of file