balhafni committed on
Commit 83267f3
1 Parent(s): 3c25f31

Update README.md

Files changed (1)
  1. README.md +56 -0
README.md CHANGED
@@ -21,7 +21,63 @@ The model is intended to be used with the dialect identification system that is
 
 
  ## How to use
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from camel_tools.dialectid import DIDModel6
+ import torch
+
+ DID = DIDModel6.pretrained()
+ DA_PHRASE_MAP = {'BEI': 'في بيروت منقول',
+                  'CAI': 'في القاهرة بنقول',
+                  'DOH': 'في الدوحة نقول',
+                  'RAB': 'في الرباط كنقولو',
+                  'TUN': 'في تونس نقولو'}
+
+
+ def predict_dialect(sent):
+     """Predicts the dialect of a sentence using the
+     CAMeL Tools MADAR 6 DID model"""
+
+     predictions = DID.predict([sent])
+
+     if predictions[0].top != "MSA":
+         scores = predictions[0].scores
+         highest = sorted(
+             scores.items(), key=lambda x: x[1], reverse=True)[0]
+         name = highest[0]
+         score = highest[1]
+
+     else:
+         scores = predictions[0].scores
+         second_highest = sorted(
+             scores.items(), key=lambda x: x[1], reverse=True)[1]
+         name = second_highest[0]
+         score = second_highest[1]
+
+     return name, score
+
+ tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/arat5-coda-did')
+ model = AutoModelForSeq2SeqLM.from_pretrained('CAMeL-Lab/arat5-coda-did')
+
+ text = 'اتنين هامبورجر و اتنين قهوة، لو سمحت. عايزهم تيك اواي.'
+
+ pred_dialect, _ = predict_dialect(text)
+ text = DA_PHRASE_MAP[pred_dialect] + ' ' + text
+
+ inputs = tokenizer(text, return_tensors='pt')
+ gen_kwargs = {'num_beams': 5, 'max_length': 200,
+               'num_return_sequences': 1,
+               'no_repeat_ngram_size': 0, 'early_stopping': False
+               }
+
+ codafied_text = model.generate(**inputs, **gen_kwargs)
+ codafied_text = tokenizer.batch_decode(codafied_text,
+                                        skip_special_tokens=True,
+                                        clean_up_tokenization_spaces=False)[0]
+
+ print(codafied_text)
+ "اثنين هامبورجر واثنين قهوة، لو سمحت. عايزهم تيك اوي."
+ ```
 
  ## Citation
  ```bibtex