amineabdaoui committed

Commit • 3d18153
1 Parent(s): 43e7ad4

initial commit

- README.md +53 -0
- config.json +25 -0
- pytorch_model.bin +3 -0
- tf_model.h5 +3 -0
- tokenizer_config.json +1 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,53 @@
---
language:
- ar
- dz

tags:
- pytorch
- bert
- multilingual
- ar
- dz

license: apache-2.0

widget:
- text: " أنا من الجزائر من ولاية [MASK] "
- text: "rabi [MASK] khouya sami"
- text: " ربي [MASK] خويا لعزيز"
- text: "tahya el [MASK]."
- text: "rouhi ya dzayer [MASK]"

inference: true
---
# DziriBERT

DziriBERT is the first Transformer-based language model pre-trained specifically for the Algerian dialect. It handles Algerian text written in both Arabic and Latin characters, and it sets new state-of-the-art results on Algerian text classification datasets even though it was pre-trained on much less data (~1 million tweets).

For more information, please see our paper: link.
## How to use

```python
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("amine/dziribert")
model = BertForMaskedLM.from_pretrained("amine/dziribert")
```
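As a quick sanity check, the following is a minimal sketch (not part of the original model card) that runs the `transformers` fill-mask pipeline on one of the widget examples above; it assumes the library is installed and the `amine/dziribert` checkpoint can be downloaded.

```python
from transformers import pipeline

# Build a fill-mask pipeline on the same checkpoint used above.
fill_mask = pipeline("fill-mask", model="amine/dziribert", tokenizer="amine/dziribert")

# One of the widget examples from the model card (Latin-script Algerian dialect).
for prediction in fill_mask("rouhi ya dzayer [MASK]"):
    print(prediction["token_str"], round(prediction["score"], 3))
```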
### How to cite

```bibtex
@article{dziribert,
  title={DziriBERT: a Pre-trained Language Model for the Algerian Dialect},
  author={Abdaoui, Amine and Berrimi, Mohamed and Oussalah, Mourad and Moussaoui, Abdelouahab},
  journal={arXiv preprint arXiv:XXX.XXXXX},
  year={2021}
}
```
## Contact

Please contact [email protected] for any questions, feedback, or requests.
config.json
ADDED
@@ -0,0 +1,25 @@
{
  "_name_or_path": ".",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}
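This config describes a standard BERT-base architecture (12 layers, 12 attention heads, hidden size 768) with a 50,000-token vocabulary. As an illustration only (not part of this commit), a short sketch that loads the published config and prints those fields, assuming `transformers` is installed:

```python
from transformers import AutoConfig

# Load the configuration published in this repository.
config = AutoConfig.from_pretrained("amine/dziribert")

# Key architecture fields from config.json above.
print(config.model_type)           # "bert"
print(config.num_hidden_layers)    # 12
print(config.num_attention_heads)  # 12
print(config.hidden_size)          # 768
print(config.vocab_size)           # 50000
```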
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0717245dc2d2aa9108bb5266e2541a7df0a11467265e746d4972532e9876d084
size 498061650
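The weights themselves live in Git LFS; the file committed here is only a pointer carrying the object's sha256 and size. As a hedged illustration (the local file path is hypothetical), the digest of a downloaded `pytorch_model.bin` can be checked against the `oid` above:

```python
import hashlib

# Path to the locally downloaded weights file (hypothetical location).
path = "pytorch_model.bin"

# Expected digest from the Git LFS pointer above.
expected = "0717245dc2d2aa9108bb5266e2541a7df0a11467265e746d4972532e9876d084"

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print(sha256.hexdigest() == expected)  # True if the download matches the pointer
```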
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8875357e722fde8db51ab392b59adbfef9ac1694afef4f3ed7ab9523a1d5bd5d
size 653438424
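Since the repository also ships TensorFlow weights (`tf_model.h5`), the checkpoint can presumably be loaded through the TensorFlow model classes as well; a minimal sketch, assuming `transformers` and TensorFlow are installed:

```python
from transformers import BertTokenizer, TFBertForMaskedLM

# Load the TensorFlow weights (tf_model.h5) from the same repository.
tokenizer = BertTokenizer.from_pretrained("amine/dziribert")
model = TFBertForMaskedLM.from_pretrained("amine/dziribert")
```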
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "model_max_length": 512, "max_len": 512}
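Per this config, the tokenizer lowercases input, uses the standard BERT special tokens, and caps sequences at 512 tokens. A small illustrative sketch (not part of the commit) that tokenizes one of the widget examples, assuming `transformers` is installed:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("amine/dziribert")

# Tokenize a Latin-script widget example; [MASK] maps to the mask token id.
encoding = tokenizer("rabi [MASK] khouya sami")
print(encoding["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
```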
vocab.txt
ADDED
The diff for this file is too large to render.