In [70]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [77]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, XLMRobertaForTokenClassification, AdamW, get_linear_schedule_with_warmup
from datasets import DatasetDict
from seqeval.metrics import classification_report

In [78]:
from datasets import load_dataset
# Load the 'kin' configuration of the masakhane/masakhaner2 dataset.
dataset = load_dataset(path='masakhane/masakhaner2', name='kin')

In [79]:
# Report how many examples each split holds before any preprocessing.
print(
    "Original dataset sizes:",
    f"Train: {len(dataset['train'])}",
    f"Validation: {len(dataset['validation'])}",
    f"Test: {len(dataset['test'])}",
    sep="\n",
)

Original dataset sizes:
Train: 7825
Validation: 1118
Test: 2235


In [80]:
def tokenize_and_align_labels(examples, tokenizer, max_length=128):
    """Tokenize pre-split sentences and project word-level tags onto sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one word list per
            sentence) and an "ner_tags" entry (one tag-id list per sentence).
        tokenizer: Callable invoked on the word lists; its result must offer
            ``word_ids(batch_index=...)`` and accept item assignment.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The tokenizer output with an extra "labels" entry. For each token the
        label is the tag of its word if the token is the first piece of that
        word, and -100 otherwise (special tokens, padding, continuation
        pieces).
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        token_word_ids = list(encoding.word_ids(batch_index=row))
        # Pair every token's word id with the word id of the token before it
        # (None for the very first token).
        preceding = [None] + token_word_ids[:-1]
        aligned.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(token_word_ids, preceding)
        ])

    encoding["labels"] = aligned
    return encoding

In [83]:
# Hugging Face Hub checkpoint id; the same name is used later to load the model weights.
model_name = 'Davlan/afro-xlmr-base'
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [84]:
# Tokenize every split in batches; the tokenizer reaches the mapped function through fn_kwargs.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
)

Map:   0%|          | 0/7825 [00:00<?, ? examples/s]

Map:   0%|          | 0/1118 [00:00<?, ? examples/s]

Map:   0%|          | 0/2235 [00:00<?, ? examples/s]

In [85]:
# Display the tokenized DatasetDict to check the added input_ids / attention_mask / labels columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7825
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1118
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2235
    })
})

In [86]:
print(
    "Dataset sizes after tokenization:",
    f"Train: {len(tokenized_datasets['train'])}",
    f"Validation: {len(tokenized_datasets['validation'])}",
    f"Test: {len(tokenized_datasets['test'])}",
    sep="\n",
)

# Keep only the columns that are fed to the model; everything else is dropped.
columns_to_keep = ['input_ids', 'attention_mask', 'labels']
columns_to_drop = [
    col
    for col in tokenized_datasets['train'].column_names
    if col not in columns_to_keep
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

print(
    "Dataset sizes after column selection:",
    f"Train: {len(tokenized_datasets['train'])}",
    f"Validation: {len(tokenized_datasets['validation'])}",
    f"Test: {len(tokenized_datasets['test'])}",
    sep="\n",
)

# Set the format of the datasets to PyTorch tensors
tokenized_datasets = tokenized_datasets.with_format("torch")

Dataset sizes after tokenization:
Train: 7825
Validation: 1118
Test: 2235
Dataset sizes after column selection:
Train: 7825
Validation: 1118
Test: 2235


In [87]:
# One DataLoader per split, all with the same batch size; only training data is shuffled.
BATCH_SIZE = 16
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader, test_dataloader = (
    DataLoader(tokenized_datasets[split], batch_size=BATCH_SIZE)
    for split in ("validation", "test")
)

# len() of a DataLoader is its number of batches.
print(
    "DataLoader sizes:",
    f"Train: {len(train_dataloader)}",
    f"Validation: {len(eval_dataloader)}",
    f"Test: {len(test_dataloader)}",
    sep="\n",
)

DataLoader sizes:
Train: 490
Validation: 70
Test: 140


In [88]:
# Read the tag names from the training split's ner_tags feature and build both lookup directions.
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Load the checkpoint with a token-classification head sized to the label set.
model = XLMRobertaForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Setup training: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [89]:
# Use PyTorch's own AdamW. The AdamW class exported by `transformers` is deprecated in
# favour of torch.optim.AdamW and is no longer shipped by newer transformers releases,
# so this cell no longer depends on it.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are intended to mirror the defaults of the
# deprecated transformers class so results stay comparable -- confirm for your version.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
# The scheduler is stepped once per batch, so the horizon is epochs * batches-per-epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear learning-rate decay over the whole run, with no warmup steps.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [90]:
# Training loop
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Every batch is moved to the module-level ``device``, the model's own loss
    is back-propagated, and the optimizer and scheduler are each stepped once
    before gradients are cleared. A completion line using the module-level
    ``num_epochs`` is printed at the end.

    Args:
        dataloader: Iterable of dict batches whose values have a ``.to`` method.
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        optimizer: Optimizer stepped after each backward pass.
        lr_scheduler: Scheduler stepped right after the optimizer.
        epoch: Zero-based epoch index, used only for the progress message.
    """
    model.train()
    for raw_batch in dataloader:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        model(**inputs).loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear accumulated gradients so the next batch starts from zero.
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}/{num_epochs} completed")

# Evaluation loop
def eval_loop(dataloader, model):
    """Predict tags for every batch and return string label sequences.

    Batches are moved to the module-level ``device`` and run without gradient
    tracking. Positions whose gold label is -100 are dropped from both the
    predictions and the gold labels; the remaining ids are translated to tag
    names through the module-level ``label_list``.

    Args:
        dataloader: Iterable of dict batches that include a "labels" entry.
        model: Model called as ``model(**batch)``; its output must expose ``.logits``.

    Returns:
        ``(true_predictions, true_labels)``: two parallel lists with one list
        of tag names per sequence.
    """
    model.eval()
    predicted_rows, gold_rows = [], []
    for raw_batch in dataloader:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        # Highest-scoring class along the last (class) axis of the logits.
        predicted_rows.extend(logits.argmax(dim=2).cpu().numpy())
        gold_rows.extend(inputs["labels"].cpu().numpy())

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_rows, gold_rows):
        # Keep only the positions that carry a real gold label.
        scored = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in scored])
        true_labels.append([label_list[g] for _, g in scored])
    return true_predictions, true_labels

In [91]:
# Fine-tune for num_epochs epochs; after each epoch, evaluate on the validation
# split and print the seqeval classification report.
for epoch in range(num_epochs):
    train_loop(train_dataloader, model, optimizer, lr_scheduler, epoch)
    # eval_loop returns (predictions, labels); the report is called with the labels first.
    predictions, labels = eval_loop(eval_dataloader, model)
    print(classification_report(labels, predictions))

# Final evaluation on test set
test_predictions, test_labels = eval_loop(test_dataloader, model)
print("Final Test Results:")
print(classification_report(test_labels, test_predictions))

Epoch 1/3 completed
              precision    recall  f1-score   support

        DATE       0.65      0.78      0.71       373
         LOC       0.85      0.88      0.87       524
         ORG       0.69      0.88      0.78       512
         PER       0.95      0.91      0.93       667

   micro avg       0.79      0.87      0.83      2076
   macro avg       0.79      0.86      0.82      2076
weighted avg       0.81      0.87      0.84      2076

Epoch 2/3 completed
              precision    recall  f1-score   support

        DATE       0.72      0.81      0.76       373
         LOC       0.87      0.90      0.88       524
         ORG       0.77      0.88      0.82       512
         PER       0.95      0.94      0.94       667

   micro avg       0.84      0.89      0.86      2076
   macro avg       0.83      0.88      0.85      2076
weighted avg       0.84      0.89      0.87      2076

Epoch 3/3 completed
              precision    recall  f1-score   support

        DATE   

In [92]:
# Write the fine-tuned weights and the tokenizer files into the same directory.
output_dir = 'afro_xlmr_ner'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('afro_xlmr_ner/tokenizer_config.json',
 'afro_xlmr_ner/special_tokens_map.json',
 'afro_xlmr_ner/sentencepiece.bpe.model',
 'afro_xlmr_ner/added_tokens.json',
 'afro_xlmr_ner/tokenizer.json')