martynawck committed on
Commit
eab0a00
·
1 Parent(s): 2636b4e

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +39 -0
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model description
2
+
3
+ - Morphosyntactic analyzer: Stanza
4
+ - Tagset: UD
5
+ - Embedding vectors: Fasttext (wiki)
6
+ - Dataset: PDB (http://git.nlp.ipipan.waw.pl/alina/PDBUD/tree/master/PDB-UD/PDB-UD)
7
+
8
+ # How to use
9
+
10
+ ## Clone
11
+
12
+ ```
13
+ git clone git@hf.co:ipipan/nlpre_stanza_ud_fasttext_pdb
14
+ ```
15
+
16
+ ## Load model
17
+
18
+ ```
19
+ import os
+ import stanza
20
+ lang = 'pl'
21
+ model_name = 'nlpre_stanza_ud_fasttext_pdb'
22
+ prefix = 'pdb1809'
23
+ config = \
24
+ {
25
+ # Comma-separated list of processors to use
26
+ 'processors': 'tokenize,mwt,pos,lemma',
27
+ # Language code for the language to build the Pipeline in
28
+ 'lang': lang,
29
+ # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
30
+ # You only need model paths if you have a specific model outside of stanza_resources
31
+ 'tokenize_model_path': os.path.join(model_name, f'{lang}_{prefix}_tokenizer.pt'),
32
+ 'mwt_model_path': os.path.join(model_name, f'{lang}_{prefix}_mwt_expander.pt'),
33
+ 'pos_model_path': os.path.join(model_name, f'{lang}_{prefix}_tagger.pt'),
34
+ 'pos_pretrain_path': os.path.join(model_name, f'{lang}_{prefix}.pretrain.pt'),
35
+ 'lemma_model_path': os.path.join(model_name, f'{lang}_{prefix}_lemmatizer.pt'),
36
+ # Use pretokenized text as input and disable tokenization
37
+ 'tokenize_pretokenized': True
38
+ }
39
+ model = stanza.Pipeline(**config)