!pip install -q transformers

from transformers import RemBertForTokenClassification, RemBertTokenizerFast
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast
import torch

# Hugging Face Hub repo hosting the three fine-tuned NER checkpoints
main_path = "Misha24-10/MultiCoNER-2-recognition-model"

# XLM-RoBERTa-large checkpoint
model_1 = XLMRobertaForTokenClassification.from_pretrained(
    main_path, subfolder="xlm_roberta_large_mountain")
tokenizer_1 = XLMRobertaTokenizerFast.from_pretrained(
    main_path, subfolder="xlm_roberta_large_mountain")

# RemBERT checkpoint (v3)
model_2 = RemBertForTokenClassification.from_pretrained(
    main_path, subfolder="google-rembert-ft_for_multi_ner_v3")
tokenizer_2 = RemBertTokenizerFast.from_pretrained(
    main_path, subfolder="google-rembert-ft_for_multi_ner_v3")

# RemBERT checkpoint (sky)
model_3 = RemBertForTokenClassification.from_pretrained(
    main_path, subfolder="google-rembert-ft_for_multi_ner_sky")
tokenizer_3 = RemBertTokenizerFast.from_pretrained(
    main_path, subfolder="google-rembert-ft_for_multi_ner_sky")


def compute_last_layer_probs(model, tokenizer, sentence):
    """Run one model on the sentence and return the logits of the first
    sub-token of every word."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    number_of_tokens = tokenizer.encode_plus(sentence, return_tensors='pt')['input_ids'].shape[-1]
    list_of_words = sentence.split()

    inputs = tokenizer(list_of_words, is_split_into_words=True, padding='max_length',
                       max_length=min(number_of_tokens, 512), truncation=True, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    # 1 marks the first sub-token of each word, -100 everything else
    label_ids = torch.Tensor(align_word_ids(inputs.word_ids()))

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    # keep only the logits of the first sub-token of each word
    return logits[:, (label_ids == 1), :]


# contribution of each model to the ensemble vote
weights = {'model_1': 1, 'model_2': 1, 'model_3': 1}
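
# The uniform weights above reduce the vote to a plain logit sum; they can be
# re-tuned per model, e.g. (illustrative values, not from the original notebook):
# weights = {'model_1': 0.5, 'model_2': 1.0, 'model_3': 1.5}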


def align_word_ids(word_ids, return_word_ids=False):
    """Build a mask over sub-tokens: 1 for the first sub-token of every word,
    -100 for special tokens and word continuations."""
    previous_word_idx = None
    label_ids = []
    index_list = []
    for idx, word_idx in enumerate(word_ids):

        if word_idx is None:
            # special tokens ([CLS], [SEP], padding)
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            # first sub-token of a new word
            label_ids.append(1)
            index_list.append(idx)

        else:
            # continuation sub-token; scored only if label_all_tokens is
            # defined and truthy (it is left undefined in this script)
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except NameError:
                label_ids.append(-100)
        previous_word_idx = word_idx

    if return_word_ids:
        return label_ids, index_list
    else:
        return label_ids
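
# Illustrative sanity check (not in the original notebook): assuming
# label_all_tokens is left undefined, as above, only the first sub-token of
# each word is kept. For sub-token word_ids like [None, 0, 1, 1, 2, None]:
assert align_word_ids([None, 0, 1, 1, 2, None]) == [-100, 1, 1, -100, 1, -100]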


def weighted_voting(sentence):
    """Ensemble the three models by summing their weighted per-word logits
    and taking the argmax over the shared label set."""
    predictions = []
    for idx, (model, tokenizer) in enumerate([(model_1, tokenizer_1), (model_2, tokenizer_2), (model_3, tokenizer_3)]):
        logits = compute_last_layer_probs(model, tokenizer, sentence)
        predictions.append(logits * weights[f'model_{idx+1}'])
    final_logits = sum(predictions)
    final_predictions = torch.argmax(final_logits, dim=2)
    labels = [model_1.config.id2label[i] for i in final_predictions.tolist()[0]]
    return labels


sent_ex = "Elon Musk 's brother sits on the boards of tesla".lower()

weighted_voting(sent_ex)
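
# To inspect predictions word by word (usage sketch, not part of the original
# notebook), pair the whitespace-split words with the returned labels:
for word, label in zip(sent_ex.split(), weighted_voting(sent_ex)):
    print(word, label)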