# muril_base_cased_hindi_ner

A named-entity recognition (NER) model for Hindi.
The base model is google/muril-base-cased, a BERT model pre-trained on 17 Indian languages and their transliterated counterparts. It is fine-tuned for Hindi NER on the HiNER dataset.
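If you want to inspect the training data yourself, a minimal sketch follows. It assumes HiNER is published on the Hugging Face Hub under `cfilt/HiNER-original` with the usual token-classification fields; neither the Hub ID nor the field names are stated in this card, so treat both as assumptions.

```python
from datasets import load_dataset

# Assumption: HiNER is available on the Hub as "cfilt/HiNER-original"
dataset = load_dataset("cfilt/HiNER-original")

# Assumption: examples follow the common token-classification schema
# with whitespace-split "tokens" and integer "ner_tags"
print(dataset["train"][0]["tokens"])
print(dataset["train"][0]["ner_tags"])
```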
## Usage

Example:
```python
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

model = AutoModelForTokenClassification.from_pretrained("MichaelHuang/muril_base_cased_hindi_ner")
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

# Map class indices to HiNER label strings
labels_dict = {
    0: "B-FESTIVAL",
    1: "B-GAME",
    2: "B-LANGUAGE",
    3: "B-LITERATURE",
    4: "B-LOCATION",
    5: "B-MISC",
    6: "B-NUMEX",
    7: "B-ORGANIZATION",
    8: "B-PERSON",
    9: "B-RELIGION",
    10: "B-TIMEX",
    11: "I-FESTIVAL",
    12: "I-GAME",
    13: "I-LANGUAGE",
    14: "I-LITERATURE",
    15: "I-LOCATION",
    16: "I-MISC",
    17: "I-NUMEX",
    18: "I-ORGANIZATION",
    19: "I-PERSON",
    20: "I-RELIGION",
    21: "I-TIMEX",
    22: "O",
}


def ner_predict(sentence, model, tokenizer, labels_dict):
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # Pick the highest-scoring label index for each token
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # Convert token ids back to tokens, and label ids to a flat list
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = predicted_labels.squeeze().tolist()

    # Map numeric labels to string labels
    predicted_labels = [labels_dict[label] for label in labels]

    # Pair each token with its predicted label
    return list(zip(tokens, predicted_labels))


test_sentence = "अकबर ईद पर टेनिस खेलता है"  # "Akbar plays tennis on Eid"
predictions = ner_predict(test_sentence, model, tokenizer, labels_dict)
for token, label in predictions:
    print(f"{token}: {label}")
```
## Eval results

| eval_loss | eval_accuracy | eval_f1 | epoch | eval_precision | eval_recall |
|---|---|---|---|---|---|
| 0.11 | 0.97 | 0.88 | 3.0 | 0.87 | 0.89 |
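The card does not say how these figures were computed. A typical recipe for span-level NER metrics compares gold and predicted BIO tag sequences with seqeval; the sketch below uses toy sequences rather than the actual HiNER test split, so it illustrates the mechanics only:

```python
from seqeval.metrics import f1_score, precision_score, recall_score

# Toy gold/predicted tag sequences; a real evaluation would run the model
# over the HiNER test split and collect word-level predictions
y_true = [["B-PERSON", "O", "O", "B-GAME", "O", "O"]]
y_pred = [["B-PERSON", "O", "O", "B-GAME", "O", "O"]]

print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
```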