2023-10-24 16:25:58,391 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,392 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-24 16:25:58,392 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 MultiCorpus: 7936 train + 992 dev + 992 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 7936 train + 992 dev + 992 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/fr
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Train: 7936 sentences
2023-10-24 16:25:58,393 (train_with_dev=False, train_with_test=False)
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Training Params:
2023-10-24 16:25:58,393  - learning_rate: "3e-05"
2023-10-24 16:25:58,393  - mini_batch_size: "8"
2023-10-24 16:25:58,393  - max_epochs: "10"
2023-10-24 16:25:58,393  - shuffle: "True"
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Plugins:
2023-10-24 16:25:58,393  - TensorboardLogger
2023-10-24 16:25:58,393  - LinearScheduler | warmup_fraction: '0.1'
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Final evaluation on model from best epoch (best-model.pt)
2023-10-24 16:25:58,393  - metric: "('micro avg', 'f1-score')"
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Computation:
2023-10-24 16:25:58,393  - compute on device: cuda:0
2023-10-24 16:25:58,393  - embedding storage: none
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Model training base path: "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-2"
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 ----------------------------------------------------------------------------------------------------
2023-10-24 16:25:58,393 Logging anything other than scalars to TensorBoard is currently not supported.
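For reference, a minimal Flair script that would produce a configuration like the one recorded above (ICDAR Europeana French NER, hmBERT-64k embeddings, lr 3e-05, batch size 8, 10 epochs, linear warmup fraction 0.1, no CRF) might look as follows. This is a sketch, not the exact script behind this log: the dataset loader name is inferred from the corpus name in the log, `hidden_size` is unused with a plain linear head, and `ModelTrainer.fine_tune()` applies the linear scheduler with `warmup_fraction=0.1` by default.

```python
# Sketch of a Flair fine-tuning run matching the hyperparameters logged above.
from flair.datasets import NER_ICDAR_EUROPEANA
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Corpus name and path in the log suggest Flair's ICDAR Europeana loader (French split).
corpus = NER_ICDAR_EUROPEANA(language="fr")
label_dict = corpus.make_label_dictionary(label_type="ner")

embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",               # "layers-1" in the training base path
    subtoken_pooling="first",  # "poolingfirst" in the training base path
    fine_tune=True,
)

tagger = SequenceTagger(
    hidden_size=256,           # ignored with use_rnn=False; value is an assumption
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,             # "crfFalse" in the training base path
    use_rnn=False,             # linear head only, as in the model dump above
)

trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-2",
    learning_rate=3e-5,
    mini_batch_size=8,
    max_epochs=10,
)
```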
2023-10-24 16:26:06,342 epoch 1 - iter 99/992 - loss 1.84117328 - time (sec): 7.95 - samples/sec: 1981.13 - lr: 0.000003 - momentum: 0.000000
2023-10-24 16:26:14,855 epoch 1 - iter 198/992 - loss 1.10138130 - time (sec): 16.46 - samples/sec: 1996.76 - lr: 0.000006 - momentum: 0.000000
2023-10-24 16:26:23,310 epoch 1 - iter 297/992 - loss 0.81486082 - time (sec): 24.92 - samples/sec: 2001.63 - lr: 0.000009 - momentum: 0.000000
2023-10-24 16:26:31,932 epoch 1 - iter 396/992 - loss 0.64687710 - time (sec): 33.54 - samples/sec: 2015.43 - lr: 0.000012 - momentum: 0.000000
2023-10-24 16:26:39,914 epoch 1 - iter 495/992 - loss 0.55583741 - time (sec): 41.52 - samples/sec: 1997.30 - lr: 0.000015 - momentum: 0.000000
2023-10-24 16:26:48,202 epoch 1 - iter 594/992 - loss 0.48695047 - time (sec): 49.81 - samples/sec: 1991.39 - lr: 0.000018 - momentum: 0.000000
2023-10-24 16:26:56,126 epoch 1 - iter 693/992 - loss 0.44276893 - time (sec): 57.73 - samples/sec: 1982.49 - lr: 0.000021 - momentum: 0.000000
2023-10-24 16:27:04,291 epoch 1 - iter 792/992 - loss 0.40513633 - time (sec): 65.90 - samples/sec: 1978.17 - lr: 0.000024 - momentum: 0.000000
2023-10-24 16:27:13,005 epoch 1 - iter 891/992 - loss 0.37397311 - time (sec): 74.61 - samples/sec: 1975.31 - lr: 0.000027 - momentum: 0.000000
2023-10-24 16:27:21,230 epoch 1 - iter 990/992 - loss 0.35036716 - time (sec): 82.84 - samples/sec: 1973.58 - lr: 0.000030 - momentum: 0.000000
2023-10-24 16:27:21,425 ----------------------------------------------------------------------------------------------------
2023-10-24 16:27:21,425 EPOCH 1 done: loss 0.3496 - lr: 0.000030
2023-10-24 16:27:24,457 DEV : loss 0.09255984425544739 - f1-score (micro avg) 0.7088
2023-10-24 16:27:24,472 saving best model
2023-10-24 16:27:24,943 ----------------------------------------------------------------------------------------------------
2023-10-24 16:27:33,171 epoch 2 - iter 99/992 - loss 0.10446376 - time (sec): 8.23 - samples/sec: 2021.90 - lr: 0.000030 - momentum: 0.000000
2023-10-24 16:27:41,671 epoch 2 - iter 198/992 - loss 0.10760078 - time (sec): 16.73 - samples/sec: 1971.38 - lr: 0.000029 - momentum: 0.000000
2023-10-24 16:27:49,862 epoch 2 - iter 297/992 - loss 0.10514711 - time (sec): 24.92 - samples/sec: 1966.95 - lr: 0.000029 - momentum: 0.000000
2023-10-24 16:27:58,454 epoch 2 - iter 396/992 - loss 0.10491517 - time (sec): 33.51 - samples/sec: 1965.67 - lr: 0.000029 - momentum: 0.000000
2023-10-24 16:28:06,674 epoch 2 - iter 495/992 - loss 0.10303159 - time (sec): 41.73 - samples/sec: 1962.28 - lr: 0.000028 - momentum: 0.000000
2023-10-24 16:28:15,062 epoch 2 - iter 594/992 - loss 0.10348052 - time (sec): 50.12 - samples/sec: 1961.05 - lr: 0.000028 - momentum: 0.000000
2023-10-24 16:28:23,656 epoch 2 - iter 693/992 - loss 0.10177488 - time (sec): 58.71 - samples/sec: 1964.56 - lr: 0.000028 - momentum: 0.000000
2023-10-24 16:28:32,361 epoch 2 - iter 792/992 - loss 0.10172499 - time (sec): 67.42 - samples/sec: 1957.58 - lr: 0.000027 - momentum: 0.000000
2023-10-24 16:28:40,452 epoch 2 - iter 891/992 - loss 0.10087184 - time (sec): 75.51 - samples/sec: 1956.03 - lr: 0.000027 - momentum: 0.000000
2023-10-24 16:28:48,445 epoch 2 - iter 990/992 - loss 0.09922248 - time (sec): 83.50 - samples/sec: 1961.60 - lr: 0.000027 - momentum: 0.000000
2023-10-24 16:28:48,581 ----------------------------------------------------------------------------------------------------
2023-10-24 16:28:48,581 EPOCH 2 done: loss 0.0993 - lr: 0.000027
2023-10-24 16:28:51,691 DEV : loss 0.09279114753007889 - f1-score (micro avg) 0.7279
2023-10-24 16:28:51,706 saving best model
2023-10-24 16:28:52,375 ----------------------------------------------------------------------------------------------------
2023-10-24 16:29:01,133 epoch 3 - iter 99/992 - loss 0.07605989 - time (sec): 8.76 - samples/sec: 1917.87 - lr: 0.000026 - momentum: 0.000000
2023-10-24 16:29:09,138 epoch 3 - iter 198/992 - loss 0.07095587 - time (sec): 16.76 - samples/sec: 1941.89 - lr: 0.000026 - momentum: 0.000000
2023-10-24 16:29:17,484 epoch 3 - iter 297/992 - loss 0.06933994 - time (sec): 25.11 - samples/sec: 1968.54 - lr: 0.000026 - momentum: 0.000000
2023-10-24 16:29:25,795 epoch 3 - iter 396/992 - loss 0.06953657 - time (sec): 33.42 - samples/sec: 1984.05 - lr: 0.000025 - momentum: 0.000000
2023-10-24 16:29:34,059 epoch 3 - iter 495/992 - loss 0.06985299 - time (sec): 41.68 - samples/sec: 1966.49 - lr: 0.000025 - momentum: 0.000000
2023-10-24 16:29:42,455 epoch 3 - iter 594/992 - loss 0.07018513 - time (sec): 50.08 - samples/sec: 1957.35 - lr: 0.000025 - momentum: 0.000000
2023-10-24 16:29:50,658 epoch 3 - iter 693/992 - loss 0.06885542 - time (sec): 58.28 - samples/sec: 1963.79 - lr: 0.000024 - momentum: 0.000000
2023-10-24 16:29:58,686 epoch 3 - iter 792/992 - loss 0.06830171 - time (sec): 66.31 - samples/sec: 1969.72 - lr: 0.000024 - momentum: 0.000000
2023-10-24 16:30:06,906 epoch 3 - iter 891/992 - loss 0.06866294 - time (sec): 74.53 - samples/sec: 1970.78 - lr: 0.000024 - momentum: 0.000000
2023-10-24 16:30:15,479 epoch 3 - iter 990/992 - loss 0.06874566 - time (sec): 83.10 - samples/sec: 1970.06 - lr: 0.000023 - momentum: 0.000000
2023-10-24 16:30:15,620 ----------------------------------------------------------------------------------------------------
2023-10-24 16:30:15,621 EPOCH 3 done: loss 0.0687 - lr: 0.000023
2023-10-24 16:30:19,034 DEV : loss 0.10878178477287292 - f1-score (micro avg) 0.7642
2023-10-24 16:30:19,049 saving best model
2023-10-24 16:30:19,637 ----------------------------------------------------------------------------------------------------
2023-10-24 16:30:28,163 epoch 4 - iter 99/992 - loss 0.04392941 - time (sec): 8.52 - samples/sec: 1987.79 - lr: 0.000023 - momentum: 0.000000
2023-10-24 16:30:36,328 epoch 4 - iter 198/992 - loss 0.04639438 - time (sec): 16.69 - samples/sec: 1952.85 - lr: 0.000023 - momentum: 0.000000
2023-10-24 16:30:44,969 epoch 4 - iter 297/992 - loss 0.04736008 - time (sec): 25.33 - samples/sec: 1973.99 - lr: 0.000022 - momentum: 0.000000
2023-10-24 16:30:53,150 epoch 4 - iter 396/992 - loss 0.04778313 - time (sec): 33.51 - samples/sec: 1968.61 - lr: 0.000022 - momentum: 0.000000
2023-10-24 16:31:01,433 epoch 4 - iter 495/992 - loss 0.04940814 - time (sec): 41.79 - samples/sec: 1968.99 - lr: 0.000022 - momentum: 0.000000
2023-10-24 16:31:09,947 epoch 4 - iter 594/992 - loss 0.04959742 - time (sec): 50.31 - samples/sec: 1965.94 - lr: 0.000021 - momentum: 0.000000
2023-10-24 16:31:17,969 epoch 4 - iter 693/992 - loss 0.04901512 - time (sec): 58.33 - samples/sec: 1965.80 - lr: 0.000021 - momentum: 0.000000
2023-10-24 16:31:26,565 epoch 4 - iter 792/992 - loss 0.05033168 - time (sec): 66.93 - samples/sec: 1958.37 - lr: 0.000021 - momentum: 0.000000
2023-10-24 16:31:34,725 epoch 4 - iter 891/992 - loss 0.05069359 - time (sec): 75.09 - samples/sec: 1965.14 - lr: 0.000020 - momentum: 0.000000
2023-10-24 16:31:42,979 epoch 4 - iter 990/992 - loss 0.04985751 - time (sec): 83.34 - samples/sec: 1964.11 - lr: 0.000020 - momentum: 0.000000
2023-10-24 16:31:43,127 ----------------------------------------------------------------------------------------------------
2023-10-24 16:31:43,127 EPOCH 4 done: loss 0.0498 - lr: 0.000020
2023-10-24 16:31:46,247 DEV : loss 0.12828028202056885 - f1-score (micro avg) 0.7563
2023-10-24 16:31:46,262 ----------------------------------------------------------------------------------------------------
2023-10-24 16:31:54,899 epoch 5 - iter 99/992 - loss 0.03290449 - time (sec): 8.64 - samples/sec: 1954.44 - lr: 0.000020 - momentum: 0.000000
2023-10-24 16:32:03,134 epoch 5 - iter 198/992 - loss 0.03381169 - time (sec): 16.87 - samples/sec: 1924.32 - lr: 0.000019 - momentum: 0.000000
2023-10-24 16:32:11,746 epoch 5 - iter 297/992 - loss 0.03697508 - time (sec): 25.48 - samples/sec: 1942.23 - lr: 0.000019 - momentum: 0.000000
2023-10-24 16:32:19,881 epoch 5 - iter 396/992 - loss 0.03788595 - time (sec): 33.62 - samples/sec: 1937.06 - lr: 0.000019 - momentum: 0.000000
2023-10-24 16:32:28,085 epoch 5 - iter 495/992 - loss 0.03765117 - time (sec): 41.82 - samples/sec: 1939.00 - lr: 0.000018 - momentum: 0.000000
2023-10-24 16:32:36,427 epoch 5 - iter 594/992 - loss 0.03686130 - time (sec): 50.16 - samples/sec: 1949.84 - lr: 0.000018 - momentum: 0.000000
2023-10-24 16:32:44,439 epoch 5 - iter 693/992 - loss 0.03765527 - time (sec): 58.18 - samples/sec: 1951.39 - lr: 0.000018 - momentum: 0.000000
2023-10-24 16:32:52,622 epoch 5 - iter 792/992 - loss 0.03734430 - time (sec): 66.36 - samples/sec: 1952.28 - lr: 0.000017 - momentum: 0.000000
2023-10-24 16:33:01,356 epoch 5 - iter 891/992 - loss 0.03750261 - time (sec): 75.09 - samples/sec: 1957.40 - lr: 0.000017 - momentum: 0.000000
2023-10-24 16:33:09,599 epoch 5 - iter 990/992 - loss 0.03737905 - time (sec): 83.34 - samples/sec: 1964.25 - lr: 0.000017 - momentum: 0.000000
2023-10-24 16:33:09,763 ----------------------------------------------------------------------------------------------------
2023-10-24 16:33:09,763 EPOCH 5 done: loss 0.0373 - lr: 0.000017
2023-10-24 16:33:13,201 DEV : loss 0.16802850365638733 - f1-score (micro avg) 0.7613
2023-10-24 16:33:13,216 ----------------------------------------------------------------------------------------------------
2023-10-24 16:33:21,537 epoch 6 - iter 99/992 - loss 0.02894776 - time (sec): 8.32 - samples/sec: 1949.39 - lr: 0.000016 - momentum: 0.000000
2023-10-24 16:33:29,913 epoch 6 - iter 198/992 - loss 0.02934176 - time (sec): 16.70 - samples/sec: 1933.41 - lr: 0.000016 - momentum: 0.000000
2023-10-24 16:33:38,373 epoch 6 - iter 297/992 - loss 0.02785056 - time (sec): 25.16 - samples/sec: 1915.53 - lr: 0.000016 - momentum: 0.000000
2023-10-24 16:33:46,336 epoch 6 - iter 396/992 - loss 0.02582194 - time (sec): 33.12 - samples/sec: 1931.95 - lr: 0.000015 - momentum: 0.000000
2023-10-24 16:33:54,785 epoch 6 - iter 495/992 - loss 0.02658002 - time (sec): 41.57 - samples/sec: 1938.95 - lr: 0.000015 - momentum: 0.000000
2023-10-24 16:34:03,288 epoch 6 - iter 594/992 - loss 0.02696841 - time (sec): 50.07 - samples/sec: 1958.83 - lr: 0.000015 - momentum: 0.000000
2023-10-24 16:34:11,609 epoch 6 - iter 693/992 - loss 0.02669713 - time (sec): 58.39 - samples/sec: 1959.03 - lr: 0.000014 - momentum: 0.000000
2023-10-24 16:34:19,905 epoch 6 - iter 792/992 - loss 0.02835216 - time (sec): 66.69 - samples/sec: 1955.74 - lr: 0.000014 - momentum: 0.000000
2023-10-24 16:34:28,428 epoch 6 - iter 891/992 - loss 0.02822978 - time (sec): 75.21 - samples/sec: 1950.65 - lr: 0.000014 - momentum: 0.000000
2023-10-24 16:34:36,703 epoch 6 - iter 990/992 - loss 0.02826272 - time (sec): 83.49 - samples/sec: 1960.26 - lr: 0.000013 - momentum: 0.000000
2023-10-24 16:34:36,863 ----------------------------------------------------------------------------------------------------
2023-10-24 16:34:36,863 EPOCH 6 done: loss 0.0282 - lr: 0.000013
2023-10-24 16:34:39,974 DEV : loss 0.1790362298488617 - f1-score (micro avg) 0.7511
2023-10-24 16:34:39,989 ----------------------------------------------------------------------------------------------------
2023-10-24 16:34:48,498 epoch 7 - iter 99/992 - loss 0.01644419 - time (sec): 8.51 - samples/sec: 1981.82 - lr: 0.000013 - momentum: 0.000000
2023-10-24 16:34:56,792 epoch 7 - iter 198/992 - loss 0.02013642 - time (sec): 16.80 - samples/sec: 2028.95 - lr: 0.000013 - momentum: 0.000000
2023-10-24 16:35:05,121 epoch 7 - iter 297/992 - loss 0.02125966 - time (sec): 25.13 - samples/sec: 1985.38 - lr: 0.000012 - momentum: 0.000000
2023-10-24 16:35:13,293 epoch 7 - iter 396/992 - loss 0.02244887 - time (sec): 33.30 - samples/sec: 1971.50 - lr: 0.000012 - momentum: 0.000000
2023-10-24 16:35:21,764 epoch 7 - iter 495/992 - loss 0.02220930 - time (sec): 41.77 - samples/sec: 1972.33 - lr: 0.000012 - momentum: 0.000000
2023-10-24 16:35:29,832 epoch 7 - iter 594/992 - loss 0.02281129 - time (sec): 49.84 - samples/sec: 1973.45 - lr: 0.000011 - momentum: 0.000000
2023-10-24 16:35:38,344 epoch 7 - iter 693/992 - loss 0.02207634 - time (sec): 58.35 - samples/sec: 1975.07 - lr: 0.000011 - momentum: 0.000000
2023-10-24 16:35:46,894 epoch 7 - iter 792/992 - loss 0.02167285 - time (sec): 66.90 - samples/sec: 1970.54 - lr: 0.000011 - momentum: 0.000000
2023-10-24 16:35:55,536 epoch 7 - iter 891/992 - loss 0.02151418 - time (sec): 75.55 - samples/sec: 1963.93 - lr: 0.000010 - momentum: 0.000000
2023-10-24 16:36:03,450 epoch 7 - iter 990/992 - loss 0.02174271 - time (sec): 83.46 - samples/sec: 1960.72 - lr: 0.000010 - momentum: 0.000000
2023-10-24 16:36:03,610 ----------------------------------------------------------------------------------------------------
2023-10-24 16:36:03,610 EPOCH 7 done: loss 0.0219 - lr: 0.000010
2023-10-24 16:36:07,061 DEV : loss 0.21934953331947327 - f1-score (micro avg) 0.7551
2023-10-24 16:36:07,077 ----------------------------------------------------------------------------------------------------
2023-10-24 16:36:15,500 epoch 8 - iter 99/992 - loss 0.01765916 - time (sec): 8.42 - samples/sec: 1960.11 - lr: 0.000010 - momentum: 0.000000
2023-10-24 16:36:24,237 epoch 8 - iter 198/992 - loss 0.01884836 - time (sec): 17.16 - samples/sec: 1946.40 - lr: 0.000009 - momentum: 0.000000
2023-10-24 16:36:32,496 epoch 8 - iter 297/992 - loss 0.01792273 - time (sec): 25.42 - samples/sec: 1946.76 - lr: 0.000009 - momentum: 0.000000
2023-10-24 16:36:40,614 epoch 8 - iter 396/992 - loss 0.01573405 - time (sec): 33.54 - samples/sec: 1946.65 - lr: 0.000009 - momentum: 0.000000
2023-10-24 16:36:48,919 epoch 8 - iter 495/992 - loss 0.01514155 - time (sec): 41.84 - samples/sec: 1953.36 - lr: 0.000008 - momentum: 0.000000
2023-10-24 16:36:57,378 epoch 8 - iter 594/992 - loss 0.01537701 - time (sec): 50.30 - samples/sec: 1952.83 - lr: 0.000008 - momentum: 0.000000
2023-10-24 16:37:05,565 epoch 8 - iter 693/992 - loss 0.01515308 - time (sec): 58.49 - samples/sec: 1957.64 - lr: 0.000008 - momentum: 0.000000
2023-10-24 16:37:13,947 epoch 8 - iter 792/992 - loss 0.01497357 - time (sec): 66.87 - samples/sec: 1960.07 - lr: 0.000007 - momentum: 0.000000
2023-10-24 16:37:22,132 epoch 8 - iter 891/992 - loss 0.01482836 - time (sec): 75.05 - samples/sec: 1968.70 - lr: 0.000007 - momentum: 0.000000
2023-10-24 16:37:30,512 epoch 8 - iter 990/992 - loss 0.01497237 - time (sec): 83.43 - samples/sec: 1962.21 - lr: 0.000007 - momentum: 0.000000
2023-10-24 16:37:30,659 ----------------------------------------------------------------------------------------------------
2023-10-24 16:37:30,660 EPOCH 8 done: loss 0.0150 - lr: 0.000007
2023-10-24 16:37:33,778 DEV : loss 0.2332099825143814 - f1-score (micro avg) 0.7553
2023-10-24 16:37:33,794 ----------------------------------------------------------------------------------------------------
2023-10-24 16:37:41,933 epoch 9 - iter 99/992 - loss 0.01078508 - time (sec): 8.14 - samples/sec: 1968.47 - lr: 0.000006 - momentum: 0.000000
2023-10-24 16:37:50,232 epoch 9 - iter 198/992 - loss 0.01190655 - time (sec): 16.44 - samples/sec: 1970.40 - lr: 0.000006 - momentum: 0.000000
2023-10-24 16:37:58,290 epoch 9 - iter 297/992 - loss 0.01143782 - time (sec): 24.50 - samples/sec: 1969.81 - lr: 0.000006 - momentum: 0.000000
2023-10-24 16:38:06,384 epoch 9 - iter 396/992 - loss 0.01071707 - time (sec): 32.59 - samples/sec: 1984.83 - lr: 0.000005 - momentum: 0.000000
2023-10-24 16:38:15,150 epoch 9 - iter 495/992 - loss 0.01063424 - time (sec): 41.36 - samples/sec: 1971.48 - lr: 0.000005 - momentum: 0.000000
2023-10-24 16:38:23,284 epoch 9 - iter 594/992 - loss 0.01044319 - time (sec): 49.49 - samples/sec: 1967.33 - lr: 0.000005 - momentum: 0.000000
2023-10-24 16:38:31,709 epoch 9 - iter 693/992 - loss 0.01001900 - time (sec): 57.91 - samples/sec: 1959.44 - lr: 0.000004 - momentum: 0.000000
2023-10-24 16:38:40,254 epoch 9 - iter 792/992 - loss 0.00985411 - time (sec): 66.46 - samples/sec: 1969.84 - lr: 0.000004 - momentum: 0.000000
2023-10-24 16:38:48,649 epoch 9 - iter 891/992 - loss 0.01079216 - time (sec): 74.85 - samples/sec: 1977.22 - lr: 0.000004 - momentum: 0.000000
2023-10-24 16:38:57,185 epoch 9 - iter 990/992 - loss 0.01138071 - time (sec): 83.39 - samples/sec: 1963.27 - lr: 0.000003 - momentum: 0.000000
2023-10-24 16:38:57,344 ----------------------------------------------------------------------------------------------------
2023-10-24 16:38:57,344 EPOCH 9 done: loss 0.0114 - lr: 0.000003
2023-10-24 16:39:00,468 DEV : loss 0.228724405169487 - f1-score (micro avg) 0.7601
2023-10-24 16:39:00,484 ----------------------------------------------------------------------------------------------------
2023-10-24 16:39:08,677 epoch 10 - iter 99/992 - loss 0.00781916 - time (sec): 8.19 - samples/sec: 2036.06 - lr: 0.000003 - momentum: 0.000000
2023-10-24 16:39:16,773 epoch 10 - iter 198/992 - loss 0.00897859 - time (sec): 16.29 - samples/sec: 1997.68 - lr: 0.000003 - momentum: 0.000000
2023-10-24 16:39:25,014 epoch 10 - iter 297/992 - loss 0.00853106 - time (sec): 24.53 - samples/sec: 1999.32 - lr: 0.000002 - momentum: 0.000000
2023-10-24 16:39:34,307 epoch 10 - iter 396/992 - loss 0.00758245 - time (sec): 33.82 - samples/sec: 1977.32 - lr: 0.000002 - momentum: 0.000000
2023-10-24 16:39:42,756 epoch 10 - iter 495/992 - loss 0.00749694 - time (sec): 42.27 - samples/sec: 1963.08 - lr: 0.000002 - momentum: 0.000000
2023-10-24 16:39:51,210 epoch 10 - iter 594/992 - loss 0.00702956 - time (sec): 50.73 - samples/sec: 1972.74 - lr: 0.000001 - momentum: 0.000000
2023-10-24 16:39:59,286 epoch 10 - iter 693/992 - loss 0.00687848 - time (sec): 58.80 - samples/sec: 1971.60 - lr: 0.000001 - momentum: 0.000000
2023-10-24 16:40:07,435 epoch 10 - iter 792/992 - loss 0.00674550 - time (sec): 66.95 - samples/sec: 1981.65 - lr: 0.000001 - momentum: 0.000000
2023-10-24 16:40:15,573 epoch 10 - iter 891/992 - loss 0.00706231 - time (sec): 75.09 - samples/sec: 1972.65 - lr: 0.000000 - momentum: 0.000000
2023-10-24 16:40:23,725 epoch 10 - iter 990/992 - loss 0.00727311 - time (sec): 83.24 - samples/sec: 1964.44 - lr: 0.000000 - momentum: 0.000000
2023-10-24 16:40:23,948 ----------------------------------------------------------------------------------------------------
2023-10-24 16:40:23,948 EPOCH 10 done: loss 0.0073 - lr: 0.000000
2023-10-24 16:40:27,065 DEV : loss 0.24580398201942444 - f1-score (micro avg) 0.7635
2023-10-24 16:40:27,554 ----------------------------------------------------------------------------------------------------
2023-10-24 16:40:27,555 Loading model from best epoch ...
2023-10-24 16:40:29,030 SequenceTagger predicts: Dictionary with 13 tags: O, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-24 16:40:32,100 Results:
- F-score (micro) 0.7594
- F-score (macro) 0.6798
- Accuracy 0.633

By class:
              precision    recall  f1-score   support

         LOC     0.8125    0.8137    0.8131       655
         PER     0.7322    0.7848    0.7576       223
         ORG     0.5000    0.4409    0.4686       127

   micro avg     0.7587    0.7602    0.7594      1005
   macro avg     0.6816    0.6798    0.6798      1005
weighted avg     0.7552    0.7602    0.7573      1005

2023-10-24 16:40:32,100 ----------------------------------------------------------------------------------------------------
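The final micro F-score (0.7594) is computed over all 1005 gold spans, while the macro F-score is the unweighted mean of the three per-class F1 values: (0.8131 + 0.7576 + 0.4686) / 3 ≈ 0.6798, matching the report. Below is a short sketch of loading the saved best-model.pt and tagging a sentence with it, assuming the standard Flair inference API; the example sentence is illustrative.

```python
# Sketch: load the best checkpoint produced by the run above and tag a sentence.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load(
    "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-2/best-model.pt"
)

sentence = Sentence("Victor Hugo est né à Besançon .")  # illustrative input
tagger.predict(sentence)

# Entity spans are decoded from the BIOES tags listed above
# (S-/B-/E-/I- prefixes over PER, LOC, and ORG).
for span in sentence.get_spans("ner"):
    label = span.get_label("ner")
    print(span.text, label.value, f"{label.score:.2f}")
```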