#! /usr/bin/python3
#pip3 install transformers accelerate deepspeed triton datasets
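# Builds modernbert-base-thai-wikipedia: a ModernBERT masked language model
# pretrained from scratch on Thai Wikipedia with a newly trained Unigram tokenizer.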
tgt="modernbert-base-thai-wikipedia"
import os,json
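# Fetch the ModernBERT-base checkpoint and a shallow clone of transformers, then
# derive standalone configuration_modernbert.py / modeling_modernbert.py (with
# relative imports rewritten) so the model can ship its own remote code.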
os.system("""
if test -d transformers
then :
else git clone --depth=1 https://github.com/huggingface/transformers transformers-all
ln -s transformers-all/src/transformers transformers
fi
test -d ModernBERT-base || git clone --depth=1 https://huggingface.co./answerdotai/ModernBERT-base
test -f ModernBERT-base/configuration_modernbert.py || sed 's/^from \\.\\.\\./from transformers./' transformers/models/modernbert/configuration_modernbert.py > ModernBERT-base/configuration_modernbert.py
test -f ModernBERT-base/modeling_modernbert.py || sed -e 's/^from \\.\\.\\./from transformers./' -e 's/^from .* import is_triton_available/import importlib\\nis_triton_available = lambda: importlib.util.find_spec("triton") is not None/' transformers/models/modernbert/modeling_modernbert.py > ModernBERT-base/modeling_modernbert.py
""")
with open("ModernBERT-base/config.json","r",encoding="utf-8") as r:
d=json.load(r)
if not "auto_map" in d:
d["auto_map"]={
"AutoConfig":"configuration_modernbert.ModernBertConfig",
"AutoModel":"modeling_modernbert.ModernBertModel",
"AutoModelForMaskedLM":"modeling_modernbert.ModernBertForMaskedLM",
"AutoModelForSequenceClassification":"modeling_modernbert.ModernBertForSequenceClassification",
"AutoModelForTokenClassification":"modeling_modernbert.ModernBertForTokenClassification"
}
with open("ModernBERT-base/config.json","w",encoding="utf-8") as w:
json.dump(d,w,indent=2)
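# Build token.txt, the corpus for tokenizer training, from the UD_Thai-Corpora
# treebanks (spaCy-Thai) and the VISTEC-TP-TH-2021 texts bundled with OSKut:
# sed strips markup tags and word separators, awk extracts word forms from CoNLL-U.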
if not os.path.isfile("token.txt"):
  os.system("""
D=spaCy-Thai/UD_Thai-Corpora
test -d $D || git clone --depth=1 https://github.com/KoichiYasuoka/spaCy-Thai
cat $D/*-ud-dev.conllu > dev.conllu
cat $D/*-ud-test.conllu > test.conllu
cat $D/*-ud-train*.conllu $D/*-ud-orchid.conllu > train.conllu
V=OSKut/VISTEC-TP-TH-2021
test -d $V || git clone --depth=1 https://github.com/mrpeerat/OSKut
( sed -e 's/<[^>]*>//g' -e 's/[|_]/ /g' $V/*/*processed.txt
awk -F '\\t' '{if(NF==10&&$1~/^[1-9][0-9]*$/)printf($1>1?" %s":"%s",$2);else if(NF==0)print}' *.conllu
) > token.txt""")
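# Build train.txt from Thai Wikipedia (wikimedia/wikipedia, 20231101.th), packing
# lines into chunks of just under 10000 characters per training example.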
if not os.path.isfile("train.txt"):
  from datasets import load_dataset
  with open("train.txt","w",encoding="utf-8") as w:
    d,u=load_dataset("wikimedia/wikipedia","20231101.th"),""
    for t in d["train"]:
      for s in t["text"].split("\n"):
        if len(s)+len(u)<10000:
          u+=" "+s
        else:
          print(u,file=w)
          u=s
    print(u,file=w)
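# Train a Unigram tokenizer (vocab_size=3000, pieces of at most 4 characters) on
# token.txt, then drop multi-character pieces that start with a combining mark or
# end with a preposed Thai vowel (U+0E40-U+0E46).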
if not os.path.isfile("tokenizer.json"):
  from tokenizers import Tokenizer,models,pre_tokenizers,normalizers,processors,decoders,trainers
  import unicodedata
  s=["[CLS]","[PAD]","[SEP]","[UNK]","[MASK]"]
  spt=Tokenizer(models.Unigram())
  spt.pre_tokenizer=pre_tokenizers.Sequence([pre_tokenizers.Whitespace(),pre_tokenizers.Punctuation()])
  spt.normalizer=normalizers.Sequence([normalizers.Nmt(),normalizers.NFKC()])
  spt.post_processor=processors.TemplateProcessing(single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",special_tokens=[("[CLS]",0),("[SEP]",2)])
  spt.decoder=decoders.WordPiece(prefix="",cleanup=True)
  spt.train(trainer=trainers.UnigramTrainer(vocab_size=3000,max_piece_length=4,special_tokens=s,unk_token="[UNK]",n_sub_iterations=2),files=["token.txt"])
  d=json.loads(spt.to_str())
  d["model"]["vocab"]=[t for t in d["model"]["vocab"] if len(t[0])<2 or unicodedata.category(t[0][0])!="Mn" and int((ord(t[0][-1])-1)/7)!=521]
  spt.from_str(json.dumps(d)).save("tokenizer.json")
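# Wrap tokenizer.json as a DebertaV2TokenizerFast and save it into the target
# model directory.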
from transformers import DebertaV2TokenizerFast
tkz=DebertaV2TokenizerFast(tokenizer_file="tokenizer.json",vocab_file="/dev/null",split_by_punct=True,do_lower_case=False,keep_accents=True,model_input_names=["input_ids","attention_mask"])
tkz.save_pretrained(tgt)
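# Write train.py, a deepspeed-launched script that builds a ModernBERT config
# matching the new tokenizer and pretrains ModernBertForMaskedLM on train.txt.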
with open("train.py","w",encoding="utf-8") as w:
print(f'#! /usr/bin/env deepspeed\ntgt="{tgt}"'+'''
from transformers import DebertaV2TokenizerFast,ModernBertForMaskedLM,AutoConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer
tkz=DebertaV2TokenizerFast.from_pretrained(tgt)
c={"trust_remote_code":True,"vocab_size":len(tkz),"tokenizer_class":type(tkz).__name__}
for k,v in tkz.special_tokens_map.items():
c[k+"_id"]=tkz.convert_tokens_to_ids(v)
cfg=AutoConfig.from_pretrained("ModernBERT-base",**c)
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,save_safetensors=False)
class ReadLineDS(object):
def __init__(self,file,tokenizer):
self.tokenizer=tokenizer
with open(file,"r",encoding="utf-8") as r:
self.lines=[s.strip() for s in r if s.strip()>""]
__len__=lambda self:len(self.lines)
__getitem__=lambda self,i:self.tokenizer(self.lines[i],truncation=True,add_special_tokens=True,max_length=8190)
trn=Trainer(args=arg,data_collator=DataCollatorForLanguageModeling(tkz),model=ModernBertForMaskedLM(cfg),train_dataset=ReadLineDS("train.txt",tkz))
trn.train()
trn.save_model(tgt)''',file=w)
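# Run the training script, then copy the remote-code files into the released model.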
os.system("chmod 755 train.py ; ./train.py")
os.system(f"cp ModernBERT-base/*.py {tgt}")