KoichiYasuoka committed
Commit d8ebfde • Parent(s): a31ff2e
initial release

Files changed:
- README.md +58 -0
- config.json +0 -0
- maker.py +59 -0
- merges.txt +0 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +15 -0
- tokenizer.json +0 -0
- tokenizer_config.json +15 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,58 @@
---
language:
- "la"
tags:
- "latin"
- "token-classification"
- "pos"
- "dependency-parsing"
datasets:
- "universal_dependencies"
license: "cc-by-sa-4.0"
pipeline_tag: "token-classification"
widget:
- text: "deus videt te non sentientem"
---

# roberta-base-latin-ud-goeswith

## Model Description

This is a RoBERTa model pre-trained on CC-100 Latin texts, derived from [roberta-base-latin-v2](https://huggingface.co/ClassCat/roberta-base-latin-v2) and fine-tuned for POS-tagging and dependency-parsing (using `goeswith` labels for the non-first subwords of multi-subword tokens).
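Concretely, each token is given a single composite token-classification label of the form UPOS|FEATS|DEPREL, as built by `maker.py` below. A minimal sketch (not part of the original card) of how such a label decomposes, using a hypothetical label string rather than one taken from the model:

```py
# Hypothetical example label; real labels come from model.config.id2label.
label = "NOUN|Case=Acc|Number=Sing|obj"
q = label.split("|")
upos, feats, deprel = q[0], "|".join(q[1:-1]), q[-1]
print(upos, feats, deprel)  # NOUN Case=Acc|Number=Sing obj
```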

## How to Use

```py
class UDgoeswith(object):
  def __init__(self,bert):
    from transformers import AutoTokenizer,AutoModelForTokenClassification
    self.tokenizer=AutoTokenizer.from_pretrained(bert)
    self.model=AutoModelForTokenClassification.from_pretrained(bert)
  def __call__(self,text):
    import numpy,torch,ufal.chu_liu_edmonds
    w=self.tokenizer(text,return_offsets_mapping=True)
    v=w["input_ids"]
    n=len(v)-1
    with torch.no_grad():
      # one input per token: replace token i with the mask token and append the original token after the separator
      d=self.model(input_ids=torch.tensor([v[0:i]+[self.tokenizer.mask_token_id]+v[i+1:]+[v[i]] for i in range(1,n)]))
    # e[i,j]: label scores of token j+1 in the copy where token i+1 is masked
    e=d.logits.numpy()[:,1:n,:]
    e[:,:,0]=numpy.nan  # ignore the filler label (id 0)
    m=numpy.full((n,n),numpy.nan)
    m[1:,1:]=numpy.nanmax(e,axis=2).transpose()
    p=numpy.zeros((n,n))
    p[1:,1:]=numpy.nanargmax(e,axis=2).transpose()
    # move each token's self-head score to column 0, i.e. treat it as the root score
    for i in range(1,n):
      m[i,0],m[i,i],p[i,0]=m[i,i],numpy.nan,p[i,i]
    # maximum spanning tree over the head-score matrix
    h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
    u="# text = "+text+"\n"
    v=[(s,e) for s,e in w["offset_mapping"] if s<e]
    # emit CoNLL-U: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
    for i,(s,e) in enumerate(v,1):
      q=self.model.config.id2label[p[i,h[i]]].split("|")
      u+="\t".join([str(i),text[s:e],"_",q[0],"_","|".join(q[1:-1]),str(h[i]),q[-1],"_","_" if i<len(v) and e<v[i][0] else "SpaceAfter=No"])+"\n"
    return u+"\n"

nlp=UDgoeswith("KoichiYasuoka/roberta-base-latin-ud-goeswith")
print(nlp("deus videt te non sentientem"))
```

[ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/) is required.
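The package is on PyPI, so `pip install ufal.chu-liu-edmonds` should suffice. As a quick check (not part of the original card), the CoNLL-U string returned by the `nlp` object defined above can be split back into columns, for example:

```py
# Minimal sketch: parse the CoNLL-U string returned by nlp(...) above.
conllu = nlp("deus videt te non sentientem")
for row in conllu.splitlines():
    if row and not row.startswith("#"):   # skip blank lines and the "# text = ..." comment
        cols = row.split("\t")            # ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        print(cols[0], cols[1], cols[6], cols[7])
```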
config.json
ADDED
The diff for this file is too large to render.
See raw diff
maker.py
ADDED
@@ -0,0 +1,59 @@
#! /usr/bin/python3
src="ClassCat/roberta-base-latin-v2"
tgt="KoichiYasuoka/roberta-base-latin-ud-goeswith"
import os
# fetch the two Latin UD treebanks and concatenate their train/dev/test files
url="https://github.com/UniversalDependencies/UD_Latin-ITTB"
os.system("test -d "+os.path.basename(url)+" || git clone --depth=1 "+url)
url="https://github.com/UniversalDependencies/UD_Latin-LLCT"
os.system("test -d "+os.path.basename(url)+" || git clone --depth=1 "+url)
os.system("for F in train dev test ; do cat UD_Latin-*/*-$F.conllu > $F.conllu ; done")
class UDgoeswithDataset(object):
  def __init__(self,conllu,tokenizer):
    self.ids,self.tags,label=[],[],set()
    with open(conllu,"r",encoding="utf-8") as r:
      cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id
      dep,c="-|_|dep",[]
      for s in r:
        t=s.split("\t")
        if len(t)==10 and t[0].isdecimal():
          c.append(t)
        elif c!=[]:
          v=tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
          # insert an extra "goeswith" row for every additional subword of a word
          for i in range(len(v)-1,-1,-1):
            for j in range(1,len(v[i])):
              c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
          y=["0"]+[t[0] for t in c]
          h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
          # label = UPOS|FEATS|DEPREL
          p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[])
          if len(v)<tokenizer.model_max_length-3:
            self.ids.append([cls]+v+[sep])
            self.tags.append([dep]+p+[dep])
            label=set(sum([self.tags[-1],list(label)],[]))
            # one masked copy per token: only tokens headed by the masked token keep their label
            for i,k in enumerate(v):
              self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
              self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
          c=[]
    self.label2id={l:i for i,l in enumerate(sorted(label))}
  def __call__(*args):
    # merge the label sets of several datasets into one shared label2id
    label=set(sum([list(t.label2id) for t in args],[]))
    lid={l:i for i,l in enumerate(sorted(label))}
    for t in args:
      t.label2id=lid
    return lid
  __len__=lambda self:len(self.ids)
  __getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
from tokenizers.pre_tokenizers import ByteLevel
tkz=AutoTokenizer.from_pretrained(src)
tkz.backend_tokenizer.pre_tokenizer=ByteLevel(add_prefix_space=True)
tkz.model_max_length=514
trainDS=UDgoeswithDataset("train.conllu",tkz)
devDS=UDgoeswithDataset("dev.conllu",tkz)
testDS=UDgoeswithDataset("test.conllu",tkz)
lid=trainDS(devDS,testDS)
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
trn.train()
trn.save_model(tgt)
tkz.save_pretrained(tgt)
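For orientation (this sketch is not part of the committed script), the heart of `UDgoeswithDataset` is the masked-copy construction: besides one plain copy of every sentence carrying all labels, it emits one copy per token in which that token is replaced by the mask token and appended after the separator, and only the tokens whose head is the masked token keep their full label. A minimal illustration with made-up token ids and labels:

```py
cls, sep, msk = 0, 2, 4                  # special token ids (hypothetical)
v = [11, 12, 13]                         # one subword per word; ids are made up
p = ["NOUN|Case=Nom|nsubj", "VERB|_|root", "PRON|Case=Acc|obj"]
h = [2, 2, 2]                            # 1-based heads; the root token points to itself
dep = "-|_|dep"                          # filler label for "not headed by the masked token"
rows = [([cls] + v + [sep], [dep] + p + [dep])]       # plain copy with all labels
for i, k in enumerate(v):
    ids = [cls] + v[:i] + [msk] + v[i+1:] + [sep, k]  # mask token i+1, append it after the separator
    tags = [dep] + [t if h[j] == i + 1 else dep for j, t in enumerate(p)] + [dep, dep]
    rows.append((ids, tags))
for ids, tags in rows:
    print(ids, tags)
```

The root token is encoded as its own head, which is why the inference code in README.md moves each token's self-head score into the root column before running the spanning-tree step.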
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:12b13f895f382b674d4cb369aee50ff34804ecafddf30f676f675355a3102acf
size 529322353
special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
{
  "add_prefix_space": true,
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "keep_accents": true,
  "mask_token": "<mask>",
  "model_max_length": 514,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "RobertaTokenizerFast",
  "trim_offsets": true,
  "unk_token": "<unk>"
}
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff