2023-10-23 22:00:02,468 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,469 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=21, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-23 22:00:02,469 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,469 MultiCorpus: 3575 train + 1235 dev + 1266 test sentences
 - NER_HIPE_2022 Corpus: 3575 train + 1235 dev + 1266 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/hipe2020/de/with_doc_seperator
2023-10-23 22:00:02,469 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,469 Train:  3575 sentences
2023-10-23 22:00:02,469 (train_with_dev=False, train_with_test=False)
2023-10-23 22:00:02,469 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,469 Training Params:
2023-10-23 22:00:02,469  - learning_rate: "3e-05"
2023-10-23 22:00:02,469  - mini_batch_size: "4"
2023-10-23 22:00:02,469  - max_epochs: "10"
2023-10-23 22:00:02,470  - shuffle: "True"
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 Plugins:
2023-10-23 22:00:02,470  - TensorboardLogger
2023-10-23 22:00:02,470  - LinearScheduler | warmup_fraction: '0.1'
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 Final evaluation on model from best epoch (best-model.pt)
2023-10-23 22:00:02,470  - metric: "('micro avg', 'f1-score')"
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 Computation:
2023-10-23 22:00:02,470  - compute on device: cuda:0
2023-10-23 22:00:02,470  - embedding storage: none
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 Model training base path: "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4"
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:02,470 Logging anything other than scalars to TensorBoard is currently not supported.
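For reference, the run configured above can be approximated with the following Flair script. This is a minimal sketch, not the original training code: the hyperparameters (learning rate 3e-05, mini-batch size 4, 10 epochs, linear warmup fraction 0.1, no CRF, first-subtoken pooling over the final layer) are taken from the log, while the exact corpus-loading keyword arguments and the hidden_size value are assumptions.

# Minimal reconstruction of this run (sketch only; see caveats above).
from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# HIPE-2020 German split, v2.1 per the dataset path above; the exact
# corpus-loading kwargs (incl. the document-separator flag) are assumptions.
corpus = NER_HIPE_2022(dataset_name="hipe2020", language="de",
                       add_document_separator=True)
label_dict = corpus.make_label_dictionary(label_type="ner")

# "poolingfirst-layers-1" in the base path: first-subtoken pooling over the
# final transformer layer only, with the encoder fine-tuned.
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# Matches the printed model: no RNN, no CRF ("crfFalse"), a single
# Linear(768 -> 21) head behind LockedDropout(p=0.5). hidden_size is
# unused when use_rnn=False; the value here is an assumption.
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

trainer = ModelTrainer(tagger, corpus)

# fine_tune() trains with AdamW and a linear schedule with 10% warmup by
# default, matching the LinearScheduler plugin logged above.
trainer.fine_tune(
    "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4",
    learning_rate=3e-05,
    mini_batch_size=4,
    max_epochs=10,
)

The warmup is visible in the epoch-1 log below: the lr column climbs from 0.000003 to the peak 0.000030 over the first 894 iterations (10% of the 8940 total), then decays linearly to zero by the end of epoch 10.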
2023-10-23 22:00:08,104 epoch 1 - iter 89/894 - loss 2.62343567 - time (sec): 5.63 - samples/sec: 1469.36 - lr: 0.000003 - momentum: 0.000000
2023-10-23 22:00:13,867 epoch 1 - iter 178/894 - loss 1.57512476 - time (sec): 11.40 - samples/sec: 1513.53 - lr: 0.000006 - momentum: 0.000000
2023-10-23 22:00:19,441 epoch 1 - iter 267/894 - loss 1.20032023 - time (sec): 16.97 - samples/sec: 1504.38 - lr: 0.000009 - momentum: 0.000000
2023-10-23 22:00:24,975 epoch 1 - iter 356/894 - loss 1.00779245 - time (sec): 22.50 - samples/sec: 1503.59 - lr: 0.000012 - momentum: 0.000000
2023-10-23 22:00:30,549 epoch 1 - iter 445/894 - loss 0.85753315 - time (sec): 28.08 - samples/sec: 1512.61 - lr: 0.000015 - momentum: 0.000000
2023-10-23 22:00:36,026 epoch 1 - iter 534/894 - loss 0.75991655 - time (sec): 33.56 - samples/sec: 1510.60 - lr: 0.000018 - momentum: 0.000000
2023-10-23 22:00:41,741 epoch 1 - iter 623/894 - loss 0.68458506 - time (sec): 39.27 - samples/sec: 1518.10 - lr: 0.000021 - momentum: 0.000000
2023-10-23 22:00:47,325 epoch 1 - iter 712/894 - loss 0.62574853 - time (sec): 44.85 - samples/sec: 1520.57 - lr: 0.000024 - momentum: 0.000000
2023-10-23 22:00:52,887 epoch 1 - iter 801/894 - loss 0.58111481 - time (sec): 50.42 - samples/sec: 1517.67 - lr: 0.000027 - momentum: 0.000000
2023-10-23 22:00:58,859 epoch 1 - iter 890/894 - loss 0.53984463 - time (sec): 56.39 - samples/sec: 1524.89 - lr: 0.000030 - momentum: 0.000000
2023-10-23 22:00:59,171 ----------------------------------------------------------------------------------------------------
2023-10-23 22:00:59,172 EPOCH 1 done: loss 0.5387 - lr: 0.000030
2023-10-23 22:01:04,018 DEV : loss 0.14558294415473938 - f1-score (micro avg) 0.6342
2023-10-23 22:01:04,039 saving best model
2023-10-23 22:01:04,602 ----------------------------------------------------------------------------------------------------
2023-10-23 22:01:10,103 epoch 2 - iter 89/894 - loss 0.14667895 - time (sec): 5.50 - samples/sec: 1469.57 - lr: 0.000030 - momentum: 0.000000
2023-10-23 22:01:15,704 epoch 2 - iter 178/894 - loss 0.15235244 - time (sec): 11.10 - samples/sec: 1534.04 - lr: 0.000029 - momentum: 0.000000
2023-10-23 22:01:21,587 epoch 2 - iter 267/894 - loss 0.15030861 - time (sec): 16.98 - samples/sec: 1547.95 - lr: 0.000029 - momentum: 0.000000
2023-10-23 22:01:27,195 epoch 2 - iter 356/894 - loss 0.15389929 - time (sec): 22.59 - samples/sec: 1533.96 - lr: 0.000029 - momentum: 0.000000
2023-10-23 22:01:32,890 epoch 2 - iter 445/894 - loss 0.15549779 - time (sec): 28.29 - samples/sec: 1531.29 - lr: 0.000028 - momentum: 0.000000
2023-10-23 22:01:38,600 epoch 2 - iter 534/894 - loss 0.14895964 - time (sec): 34.00 - samples/sec: 1517.66 - lr: 0.000028 - momentum: 0.000000
2023-10-23 22:01:44,182 epoch 2 - iter 623/894 - loss 0.15223696 - time (sec): 39.58 - samples/sec: 1512.68 - lr: 0.000028 - momentum: 0.000000
2023-10-23 22:01:49,668 epoch 2 - iter 712/894 - loss 0.14480688 - time (sec): 45.06 - samples/sec: 1499.66 - lr: 0.000027 - momentum: 0.000000
2023-10-23 22:01:55,550 epoch 2 - iter 801/894 - loss 0.14199992 - time (sec): 50.95 - samples/sec: 1512.58 - lr: 0.000027 - momentum: 0.000000
2023-10-23 22:02:01,279 epoch 2 - iter 890/894 - loss 0.14023230 - time (sec): 56.68 - samples/sec: 1520.24 - lr: 0.000027 - momentum: 0.000000
2023-10-23 22:02:01,528 ----------------------------------------------------------------------------------------------------
2023-10-23 22:02:01,528 EPOCH 2 done: loss 0.1398 - lr: 0.000027
2023-10-23 22:02:08,034 DEV : loss 0.15009824931621552 - f1-score (micro avg) 0.7154
2023-10-23 22:02:08,054 saving best model
2023-10-23 22:02:08,783 ----------------------------------------------------------------------------------------------------
2023-10-23 22:02:14,310 epoch 3 - iter 89/894 - loss 0.07903434 - time (sec): 5.53 - samples/sec: 1419.56 - lr: 0.000026 - momentum: 0.000000
2023-10-23 22:02:20,077 epoch 3 - iter 178/894 - loss 0.08627761 - time (sec): 11.29 - samples/sec: 1445.44 - lr: 0.000026 - momentum: 0.000000
2023-10-23 22:02:25,634 epoch 3 - iter 267/894 - loss 0.08164110 - time (sec): 16.85 - samples/sec: 1471.01 - lr: 0.000026 - momentum: 0.000000
2023-10-23 22:02:31,429 epoch 3 - iter 356/894 - loss 0.08568194 - time (sec): 22.64 - samples/sec: 1501.09 - lr: 0.000025 - momentum: 0.000000
2023-10-23 22:02:36,950 epoch 3 - iter 445/894 - loss 0.08356653 - time (sec): 28.17 - samples/sec: 1476.31 - lr: 0.000025 - momentum: 0.000000
2023-10-23 22:02:42,852 epoch 3 - iter 534/894 - loss 0.08369738 - time (sec): 34.07 - samples/sec: 1491.63 - lr: 0.000025 - momentum: 0.000000
2023-10-23 22:02:48,743 epoch 3 - iter 623/894 - loss 0.08210453 - time (sec): 39.96 - samples/sec: 1504.93 - lr: 0.000024 - momentum: 0.000000
2023-10-23 22:02:54,331 epoch 3 - iter 712/894 - loss 0.08028534 - time (sec): 45.55 - samples/sec: 1517.67 - lr: 0.000024 - momentum: 0.000000
2023-10-23 22:02:59,876 epoch 3 - iter 801/894 - loss 0.08231477 - time (sec): 51.09 - samples/sec: 1515.30 - lr: 0.000024 - momentum: 0.000000
2023-10-23 22:03:05,538 epoch 3 - iter 890/894 - loss 0.08214108 - time (sec): 56.75 - samples/sec: 1518.46 - lr: 0.000023 - momentum: 0.000000
2023-10-23 22:03:05,783 ----------------------------------------------------------------------------------------------------
2023-10-23 22:03:05,783 EPOCH 3 done: loss 0.0828 - lr: 0.000023
2023-10-23 22:03:12,290 DEV : loss 0.1573496311903 - f1-score (micro avg) 0.7352
2023-10-23 22:03:12,310 saving best model
2023-10-23 22:03:13,023 ----------------------------------------------------------------------------------------------------
2023-10-23 22:03:18,641 epoch 4 - iter 89/894 - loss 0.06023096 - time (sec): 5.62 - samples/sec: 1509.49 - lr: 0.000023 - momentum: 0.000000
2023-10-23 22:03:24,166 epoch 4 - iter 178/894 - loss 0.05491063 - time (sec): 11.14 - samples/sec: 1486.24 - lr: 0.000023 - momentum: 0.000000
2023-10-23 22:03:29,752 epoch 4 - iter 267/894 - loss 0.04891097 - time (sec): 16.73 - samples/sec: 1506.70 - lr: 0.000022 - momentum: 0.000000
2023-10-23 22:03:35,652 epoch 4 - iter 356/894 - loss 0.04752950 - time (sec): 22.63 - samples/sec: 1527.14 - lr: 0.000022 - momentum: 0.000000
2023-10-23 22:03:41,423 epoch 4 - iter 445/894 - loss 0.04766369 - time (sec): 28.40 - samples/sec: 1527.67 - lr: 0.000022 - momentum: 0.000000
2023-10-23 22:03:47,052 epoch 4 - iter 534/894 - loss 0.05016494 - time (sec): 34.03 - samples/sec: 1521.48 - lr: 0.000021 - momentum: 0.000000
2023-10-23 22:03:52,551 epoch 4 - iter 623/894 - loss 0.04807291 - time (sec): 39.53 - samples/sec: 1522.71 - lr: 0.000021 - momentum: 0.000000
2023-10-23 22:03:58,196 epoch 4 - iter 712/894 - loss 0.04922704 - time (sec): 45.17 - samples/sec: 1521.89 - lr: 0.000021 - momentum: 0.000000
2023-10-23 22:04:03,969 epoch 4 - iter 801/894 - loss 0.05047950 - time (sec): 50.95 - samples/sec: 1520.51 - lr: 0.000020 - momentum: 0.000000
2023-10-23 22:04:09,644 epoch 4 - iter 890/894 - loss 0.05249311 - time (sec): 56.62 - samples/sec: 1522.65 - lr: 0.000020 - momentum: 0.000000
2023-10-23 22:04:09,891 ----------------------------------------------------------------------------------------------------
2023-10-23 22:04:09,891 EPOCH 4 done: loss 0.0524 - lr: 0.000020
2023-10-23 22:04:16,372 DEV : loss 0.21319110691547394 - f1-score (micro avg) 0.748
2023-10-23 22:04:16,393 saving best model
2023-10-23 22:04:17,106 ----------------------------------------------------------------------------------------------------
2023-10-23 22:04:22,892 epoch 5 - iter 89/894 - loss 0.03139489 - time (sec): 5.79 - samples/sec: 1555.16 - lr: 0.000020 - momentum: 0.000000
2023-10-23 22:04:28,574 epoch 5 - iter 178/894 - loss 0.02921430 - time (sec): 11.47 - samples/sec: 1527.76 - lr: 0.000019 - momentum: 0.000000
2023-10-23 22:04:34,137 epoch 5 - iter 267/894 - loss 0.03308587 - time (sec): 17.03 - samples/sec: 1517.29 - lr: 0.000019 - momentum: 0.000000
2023-10-23 22:04:39,873 epoch 5 - iter 356/894 - loss 0.03277134 - time (sec): 22.77 - samples/sec: 1533.13 - lr: 0.000019 - momentum: 0.000000
2023-10-23 22:04:45,803 epoch 5 - iter 445/894 - loss 0.03290898 - time (sec): 28.70 - samples/sec: 1557.66 - lr: 0.000018 - momentum: 0.000000
2023-10-23 22:04:51,276 epoch 5 - iter 534/894 - loss 0.03073298 - time (sec): 34.17 - samples/sec: 1537.89 - lr: 0.000018 - momentum: 0.000000
2023-10-23 22:04:57,009 epoch 5 - iter 623/894 - loss 0.03132323 - time (sec): 39.90 - samples/sec: 1528.69 - lr: 0.000018 - momentum: 0.000000
2023-10-23 22:05:02,566 epoch 5 - iter 712/894 - loss 0.03142016 - time (sec): 45.46 - samples/sec: 1531.50 - lr: 0.000017 - momentum: 0.000000
2023-10-23 22:05:08,111 epoch 5 - iter 801/894 - loss 0.03248282 - time (sec): 51.00 - samples/sec: 1519.37 - lr: 0.000017 - momentum: 0.000000
2023-10-23 22:05:13,728 epoch 5 - iter 890/894 - loss 0.03245031 - time (sec): 56.62 - samples/sec: 1517.85 - lr: 0.000017 - momentum: 0.000000
2023-10-23 22:05:14,035 ----------------------------------------------------------------------------------------------------
2023-10-23 22:05:14,036 EPOCH 5 done: loss 0.0323 - lr: 0.000017
2023-10-23 22:05:20,524 DEV : loss 0.24947355687618256 - f1-score (micro avg) 0.7729
2023-10-23 22:05:20,545 saving best model
2023-10-23 22:05:21,250 ----------------------------------------------------------------------------------------------------
2023-10-23 22:05:26,639 epoch 6 - iter 89/894 - loss 0.01792561 - time (sec): 5.39 - samples/sec: 1388.32 - lr: 0.000016 - momentum: 0.000000
2023-10-23 22:05:32,260 epoch 6 - iter 178/894 - loss 0.01992217 - time (sec): 11.01 - samples/sec: 1458.36 - lr: 0.000016 - momentum: 0.000000
2023-10-23 22:05:38,009 epoch 6 - iter 267/894 - loss 0.02269894 - time (sec): 16.76 - samples/sec: 1509.74 - lr: 0.000016 - momentum: 0.000000
2023-10-23 22:05:43,702 epoch 6 - iter 356/894 - loss 0.02500337 - time (sec): 22.45 - samples/sec: 1515.16 - lr: 0.000015 - momentum: 0.000000
2023-10-23 22:05:49,609 epoch 6 - iter 445/894 - loss 0.02486673 - time (sec): 28.36 - samples/sec: 1537.92 - lr: 0.000015 - momentum: 0.000000
2023-10-23 22:05:55,122 epoch 6 - iter 534/894 - loss 0.02638849 - time (sec): 33.87 - samples/sec: 1526.69 - lr: 0.000015 - momentum: 0.000000
2023-10-23 22:06:00,860 epoch 6 - iter 623/894 - loss 0.02529988 - time (sec): 39.61 - samples/sec: 1531.21 - lr: 0.000014 - momentum: 0.000000
2023-10-23 22:06:06,606 epoch 6 - iter 712/894 - loss 0.02507466 - time (sec): 45.36 - samples/sec: 1522.93 - lr: 0.000014 - momentum: 0.000000
2023-10-23 22:06:12,290 epoch 6 - iter 801/894 - loss 0.02413790 - time (sec): 51.04 - samples/sec: 1522.67 - lr: 0.000014 - momentum: 0.000000
2023-10-23 22:06:17,946 epoch 6 - iter 890/894 - loss 0.02403350 - time (sec): 56.70 - samples/sec: 1520.89 - lr: 0.000013 - momentum: 0.000000
2023-10-23 22:06:18,185 ----------------------------------------------------------------------------------------------------
2023-10-23 22:06:18,185 EPOCH 6 done: loss 0.0242 - lr: 0.000013
2023-10-23 22:06:24,684 DEV : loss 0.26065516471862793 - f1-score (micro avg) 0.7557
2023-10-23 22:06:24,704 ----------------------------------------------------------------------------------------------------
2023-10-23 22:06:30,636 epoch 7 - iter 89/894 - loss 0.01279211 - time (sec): 5.93 - samples/sec: 1607.14 - lr: 0.000013 - momentum: 0.000000
2023-10-23 22:06:36,220 epoch 7 - iter 178/894 - loss 0.01603667 - time (sec): 11.52 - samples/sec: 1543.94 - lr: 0.000013 - momentum: 0.000000
2023-10-23 22:06:41,728 epoch 7 - iter 267/894 - loss 0.01679587 - time (sec): 17.02 - samples/sec: 1505.93 - lr: 0.000012 - momentum: 0.000000
2023-10-23 22:06:47,208 epoch 7 - iter 356/894 - loss 0.02168416 - time (sec): 22.50 - samples/sec: 1483.39 - lr: 0.000012 - momentum: 0.000000
2023-10-23 22:06:52,959 epoch 7 - iter 445/894 - loss 0.02179523 - time (sec): 28.25 - samples/sec: 1488.98 - lr: 0.000012 - momentum: 0.000000
2023-10-23 22:06:58,548 epoch 7 - iter 534/894 - loss 0.02050756 - time (sec): 33.84 - samples/sec: 1492.08 - lr: 0.000011 - momentum: 0.000000
2023-10-23 22:07:04,349 epoch 7 - iter 623/894 - loss 0.01912097 - time (sec): 39.64 - samples/sec: 1505.91 - lr: 0.000011 - momentum: 0.000000
2023-10-23 22:07:10,221 epoch 7 - iter 712/894 - loss 0.01820047 - time (sec): 45.52 - samples/sec: 1528.48 - lr: 0.000011 - momentum: 0.000000
2023-10-23 22:07:15,794 epoch 7 - iter 801/894 - loss 0.01769554 - time (sec): 51.09 - samples/sec: 1523.06 - lr: 0.000010 - momentum: 0.000000
2023-10-23 22:07:21,354 epoch 7 - iter 890/894 - loss 0.01702500 - time (sec): 56.65 - samples/sec: 1521.23 - lr: 0.000010 - momentum: 0.000000
2023-10-23 22:07:21,596 ----------------------------------------------------------------------------------------------------
2023-10-23 22:07:21,596 EPOCH 7 done: loss 0.0170 - lr: 0.000010
2023-10-23 22:07:28,083 DEV : loss 0.28167280554771423 - f1-score (micro avg) 0.7629
2023-10-23 22:07:28,104 ----------------------------------------------------------------------------------------------------
2023-10-23 22:07:33,751 epoch 8 - iter 89/894 - loss 0.01867921 - time (sec): 5.65 - samples/sec: 1520.34 - lr: 0.000010 - momentum: 0.000000
2023-10-23 22:07:39,538 epoch 8 - iter 178/894 - loss 0.01577457 - time (sec): 11.43 - samples/sec: 1510.75 - lr: 0.000009 - momentum: 0.000000
2023-10-23 22:07:45,307 epoch 8 - iter 267/894 - loss 0.01153896 - time (sec): 17.20 - samples/sec: 1532.20 - lr: 0.000009 - momentum: 0.000000
2023-10-23 22:07:51,236 epoch 8 - iter 356/894 - loss 0.01015795 - time (sec): 23.13 - samples/sec: 1547.33 - lr: 0.000009 - momentum: 0.000000
2023-10-23 22:07:56,657 epoch 8 - iter 445/894 - loss 0.01021030 - time (sec): 28.55 - samples/sec: 1523.00 - lr: 0.000008 - momentum: 0.000000
2023-10-23 22:08:02,222 epoch 8 - iter 534/894 - loss 0.01037248 - time (sec): 34.12 - samples/sec: 1526.13 - lr: 0.000008 - momentum: 0.000000
2023-10-23 22:08:07,768 epoch 8 - iter 623/894 - loss 0.01003113 - time (sec): 39.66 - samples/sec: 1524.13 - lr: 0.000008 - momentum: 0.000000
2023-10-23 22:08:13,205 epoch 8 - iter 712/894 - loss 0.01000431 - time (sec): 45.10 - samples/sec: 1508.44 - lr: 0.000007 - momentum: 0.000000
2023-10-23 22:08:19,069 epoch 8 - iter 801/894 - loss 0.01026783 - time (sec): 50.96 - samples/sec: 1518.12 - lr: 0.000007 - momentum: 0.000000
2023-10-23 22:08:24,766 epoch 8 - iter 890/894 - loss 0.01030903 - time (sec): 56.66 - samples/sec: 1522.06 - lr: 0.000007 - momentum: 0.000000
2023-10-23 22:08:25,014 ----------------------------------------------------------------------------------------------------
2023-10-23 22:08:25,014 EPOCH 8 done: loss 0.0103 - lr: 0.000007
2023-10-23 22:08:31,510 DEV : loss 0.281236857175827 - f1-score (micro avg) 0.7692
2023-10-23 22:08:31,530 ----------------------------------------------------------------------------------------------------
2023-10-23 22:08:37,018 epoch 9 - iter 89/894 - loss 0.01133282 - time (sec): 5.49 - samples/sec: 1487.95 - lr: 0.000006 - momentum: 0.000000
2023-10-23 22:08:42,585 epoch 9 - iter 178/894 - loss 0.01040047 - time (sec): 11.05 - samples/sec: 1483.85 - lr: 0.000006 - momentum: 0.000000
2023-10-23 22:08:48,138 epoch 9 - iter 267/894 - loss 0.00836655 - time (sec): 16.61 - samples/sec: 1507.47 - lr: 0.000006 - momentum: 0.000000
2023-10-23 22:08:53,608 epoch 9 - iter 356/894 - loss 0.00715496 - time (sec): 22.08 - samples/sec: 1502.77 - lr: 0.000005 - momentum: 0.000000
2023-10-23 22:08:59,232 epoch 9 - iter 445/894 - loss 0.00763303 - time (sec): 27.70 - samples/sec: 1507.64 - lr: 0.000005 - momentum: 0.000000
2023-10-23 22:09:05,207 epoch 9 - iter 534/894 - loss 0.00813773 - time (sec): 33.68 - samples/sec: 1536.08 - lr: 0.000005 - momentum: 0.000000
2023-10-23 22:09:10,929 epoch 9 - iter 623/894 - loss 0.00743043 - time (sec): 39.40 - samples/sec: 1530.64 - lr: 0.000004 - momentum: 0.000000
2023-10-23 22:09:16,928 epoch 9 - iter 712/894 - loss 0.00654784 - time (sec): 45.40 - samples/sec: 1538.13 - lr: 0.000004 - momentum: 0.000000
2023-10-23 22:09:22,403 epoch 9 - iter 801/894 - loss 0.00640737 - time (sec): 50.87 - samples/sec: 1525.27 - lr: 0.000004 - momentum: 0.000000
2023-10-23 22:09:28,095 epoch 9 - iter 890/894 - loss 0.00639578 - time (sec): 56.56 - samples/sec: 1526.16 - lr: 0.000003 - momentum: 0.000000
2023-10-23 22:09:28,330 ----------------------------------------------------------------------------------------------------
2023-10-23 22:09:28,330 EPOCH 9 done: loss 0.0064 - lr: 0.000003
2023-10-23 22:09:34,553 DEV : loss 0.27773818373680115 - f1-score (micro avg) 0.7747
2023-10-23 22:09:34,574 saving best model
2023-10-23 22:09:35,270 ----------------------------------------------------------------------------------------------------
2023-10-23 22:09:41,400 epoch 10 - iter 89/894 - loss 0.00131393 - time (sec): 6.13 - samples/sec: 1458.45 - lr: 0.000003 - momentum: 0.000000
2023-10-23 22:09:47,083 epoch 10 - iter 178/894 - loss 0.00128416 - time (sec): 11.81 - samples/sec: 1466.61 - lr: 0.000003 - momentum: 0.000000
2023-10-23 22:09:52,565 epoch 10 - iter 267/894 - loss 0.00231588 - time (sec): 17.29 - samples/sec: 1504.94 - lr: 0.000002 - momentum: 0.000000
2023-10-23 22:09:58,365 epoch 10 - iter 356/894 - loss 0.00225184 - time (sec): 23.09 - samples/sec: 1530.37 - lr: 0.000002 - momentum: 0.000000
2023-10-23 22:10:03,874 epoch 10 - iter 445/894 - loss 0.00250123 - time (sec): 28.60 - samples/sec: 1508.55 - lr: 0.000002 - momentum: 0.000000
2023-10-23 22:10:09,384 epoch 10 - iter 534/894 - loss 0.00292605 - time (sec): 34.11 - samples/sec: 1505.48 - lr: 0.000001 - momentum: 0.000000
2023-10-23 22:10:15,106 epoch 10 - iter 623/894 - loss 0.00273991 - time (sec): 39.84 - samples/sec: 1509.10 - lr: 0.000001 - momentum: 0.000000
2023-10-23 22:10:20,592 epoch 10 - iter 712/894 - loss 0.00327829 - time (sec): 45.32 - samples/sec: 1503.66 - lr: 0.000001 - momentum: 0.000000
2023-10-23 22:10:26,277 epoch 10 - iter 801/894 - loss 0.00401238 - time (sec): 51.01 - samples/sec: 1506.60 - lr: 0.000000 - momentum: 0.000000
2023-10-23 22:10:31,962 epoch 10 - iter 890/894 - loss 0.00370280 - time (sec): 56.69 - samples/sec: 1507.83 - lr: 0.000000 - momentum: 0.000000
2023-10-23 22:10:32,457 ----------------------------------------------------------------------------------------------------
2023-10-23 22:10:32,457 EPOCH 10 done: loss 0.0038 - lr: 0.000000
2023-10-23 22:10:38,700 DEV : loss 0.28241854906082153 - f1-score (micro avg) 0.774
2023-10-23 22:10:39,272 ----------------------------------------------------------------------------------------------------
2023-10-23 22:10:39,272 Loading model from best epoch ...
2023-10-23 22:10:41,022 SequenceTagger predicts: Dictionary with 21 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org, S-prod, B-prod, E-prod, I-prod, S-time, B-time, E-time, I-time
2023-10-23 22:10:45,841 Results:
- F-score (micro) 0.7534
- F-score (macro) 0.6778
- Accuracy 0.6245

By class:
              precision    recall  f1-score   support

         loc     0.8088    0.8658    0.8363       596
        pers     0.7077    0.7417    0.7243       333
         org     0.5487    0.4697    0.5061       132
        prod     0.6441    0.5758    0.6080        66
        time     0.7143    0.7143    0.7143        49

   micro avg     0.7434    0.7636    0.7534      1176
   macro avg     0.6847    0.6735    0.6778      1176
weighted avg     0.7378    0.7636    0.7496      1176
2023-10-23 22:10:45,841 ----------------------------------------------------------------------------------------------------
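For downstream use, the best-model.pt checkpoint selected above can be loaded with Flair's standard prediction API. A minimal sketch: the checkpoint path is assumed to sit under the training base path logged above, and the example sentence is hypothetical.

# Sketch: load the best checkpoint from this run and tag a sentence.
from flair.data import Sentence
from flair.models import SequenceTagger

# Path assumption: best-model.pt inside the training base path logged above.
tagger = SequenceTagger.load(
    "hmbench-hipe2020/de-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-4/best-model.pt"
)

# Hypothetical historic-German input.
sentence = Sentence("Theodor Mommsen lehrte an der Universität zu Berlin .")
tagger.predict(sentence)

# The 21 BIOES tags listed above decode into spans over the five entity
# types scored in the final table: loc, pers, org, prod, time.
for span in sentence.get_spans("ner"):
    print(span)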