Update README.md
Browse files
README.md
CHANGED
@@ -21,7 +21,63 @@ The model is intended to be used with the dialect identification system that is
|
|
21 |
|
22 |
|
23 |
## How to use
|
|
|
|
|
|
|
|
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
## Citation
|
27 |
```bibtex
|
|
|
21 |
|
22 |
|
23 |
## How to use
|
24 |
+
```python
|
25 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
26 |
+
from camel_tools.dialectid import DIDModel6
|
27 |
+
import torch
|
28 |
|
29 |
+
DID = DIDModel6.pretrained()
|
30 |
+
DA_PHRASE_MAP = {'BEI': 'في بيروت منقول',
|
31 |
+
'CAI': 'في القاهرة بنقول',
|
32 |
+
'DOH': 'في الدوحة نقول',
|
33 |
+
'RAB': 'في الرباط كنقولو',
|
34 |
+
'TUN': 'في تونس نقولو'}
|
35 |
+
|
36 |
+
|
37 |
+
def predict_dialect(sent):
    """Predict the dialect of a sentence using the CAMeL Tools MADAR-6 DID model.

    Args:
        sent (str): Input sentence.

    Returns:
        tuple: ``(name, score)`` — the highest-scoring dialect label and its
        score. When the model's top prediction is MSA, the runner-up label is
        returned instead, so the result is always a city-dialect key usable
        with ``DA_PHRASE_MAP``.
    """
    prediction = DID.predict([sent])[0]

    # Rank all labels by score once (descending). The original code duplicated
    # this sort and the `.scores` lookup in both branches of the if/else,
    # differing only in which rank was taken.
    ranked = sorted(prediction.scores.items(), key=lambda item: item[1],
                    reverse=True)

    # If MSA wins, fall back to the second-best label; otherwise take the top.
    name, score = ranked[1] if prediction.top == "MSA" else ranked[0]
    return name, score
|
58 |
+
|
59 |
+
tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/arat5-coda-did')
|
60 |
+
model = AutoModelForSeq2SeqLM.from_pretrained('CAMeL-Lab/arat5-coda-did')
|
61 |
+
|
62 |
+
text = 'اتنين هامبورجر و اتنين قهوة، لو سمحت. عايزهم تيك اواي.'
|
63 |
+
|
64 |
+
pred_dialect, _ = predict_dialect(text)
|
65 |
+
text = DA_PHRASE_MAP[pred_dialect] + ' ' + text
|
66 |
+
|
67 |
+
inputs = tokenizer(text, return_tensors='pt')
|
68 |
+
gen_kwargs = {'num_beams': 5, 'max_length': 200,
|
69 |
+
'num_return_sequences': 1,
|
70 |
+
'no_repeat_ngram_size': 0, 'early_stopping': False
|
71 |
+
}
|
72 |
+
|
73 |
+
codafied_text = model.generate(**inputs, **gen_kwargs)
|
74 |
+
codafied_text = tokenizer.batch_decode(codafied_text,
|
75 |
+
skip_special_tokens=True,
|
76 |
+
clean_up_tokenization_spaces=False)[0]
|
77 |
+
|
78 |
+
print(codafied_text)
|
79 |
+
"اثنين هامبورجر واثنين قهوة، لو سمحت. عايزهم تيك اوي."
|
80 |
+
```
|
81 |
|
82 |
## Citation
|
83 |
```bibtex
|