denis-gordeev
committed on
Commit
•
b700fd2
1
Parent(s):
8e998c8
Update README.md
Browse files
README.md
CHANGED
@@ -20,7 +20,58 @@ should probably proofread and complete it, then remove this comment. -->
|
|
20 |
Russian NER model fine-tuned on RURED2.
|
21 |
https://github.com/denis-gordeev/rured2
|
22 |
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
This model is a fine-tuned version of [microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base) on the RURED2 dataset.
|
26 |
It achieves the following results on the evaluation set:
|
|
|
20 |
Russian NER model fine-tuned on RURED2.
|
21 |
https://github.com/denis-gordeev/rured2
|
22 |
|
23 |
+
This model is multi-label: it can assign several labels to a single token. To use it properly, run it as shown in the following code:
|
24 |
+
```
|
25 |
+
import torch
|
26 |
+
from torch import nn
|
27 |
+
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
|
28 |
+
TrainingArguments, Trainer)
|
29 |
+
|
30 |
+
model_name = "denis-gordeev/rured2-ner-microsoft-mdeberta-v3-base"

# Fall back to the CPU so the example also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)
|
35 |
+
|
36 |
+
def predict(text: str, glue_tokens=False, output_together=True, glue_words=True):
    """Tag ``text`` with every label the model assigns to each token.

    Each class is scored independently with a sigmoid, so a token may
    receive several labels. A class is kept when its score exceeds 0.5;
    if no class passes the threshold, the single highest-scoring class
    is used instead.

    Args:
        text: Raw input text.
        glue_tokens: Unused; kept so existing callers keep working.
        output_together: If true, return a list of ``[token, labels]``
            pairs; otherwise return the tokens and the label lists as
            two parallel lists.
        glue_words: If true, a sub-word piece that does not start with
            the "▁" word-boundary marker is appended to the previous
            output token, and that token keeps the labels predicted for
            its first piece.

    Returns:
        ``[[token, [label, ...]], ...]`` when ``output_together`` is
        true, else the tuple ``(tokens, label_lists)``.
    """
    # Read the label map from the loaded checkpoint instead of relying on
    # a global that the surrounding snippet never defines.
    id_to_label = model.config.id2label
    # Keep the inputs on whatever device the model lives on.
    device = model.device

    tokenized = tokenizer(text)
    inputs = {
        key: torch.tensor([tokenized[key]], dtype=torch.long, device=device)
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        probs = torch.sigmoid(model(**inputs).logits)[0]

    output_tokens = []
    output_preds = []
    for token_id, token_probs in zip(inputs["input_ids"][0].tolist(), probs):
        # NOTE(review): ids 0-3 are presumably this tokenizer's special
        # tokens (pad/cls/sep/unk) -- confirm against the vocabulary.
        if token_id <= 3:
            continue

        class_ids = (token_probs > 0.5).nonzero().flatten().tolist()
        if not class_ids:
            class_ids = [int(token_probs.argmax())]
        class_names = [id_to_label[class_id] for class_id in class_ids]

        piece = tokenizer.convert_ids_to_tokens([token_id])[0]
        starts_new_word = piece.startswith("▁")
        piece = piece.replace("▁", "")

        if glue_words and not starts_new_word and output_tokens:
            output_tokens[-1] += piece
        else:
            output_tokens.append(piece)
            output_preds.append(class_names)

    if output_together:
        return [[token, labels] for token, labels in zip(output_tokens, output_preds)]
    return output_tokens, output_preds
|
73 |
+
```
|
74 |
+
# denis-gordeev/rured2-ner-microsoft-mdeberta-v3-base
|
75 |
|
76 |
This model is a fine-tuned version of [microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base) on the RURED2 dataset.
|
77 |
It achieves the following results on the evaluation set:
|