Spaces:
Sleeping
Sleeping
import inspect

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (per-class scores, or a tuple whose first
            element holds them).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``. Precision, recall and F1 use ``average='weighted'``.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # NOTE(review): predictions may arrive as a tuple when the model returns
    # more than the logits; the class scores are expected to come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)

    # zero_division=0 keeps the same numeric result as the default ("warn")
    # but avoids a warning on every evaluation when some class is never
    # predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
class ArabicDialectTrainer:
    """Fine-tune a pretrained transformer to classify Arabic dialects.

    The constructor loads the tokenizer and a sequence-classification model
    for ``model_name`` and moves the model to CUDA when it is available.
    ``prepare_dataset`` tokenizes a dataset and ``train`` runs a ``Trainer``
    loop and saves the resulting model and tokenizer.
    """

    def __init__(self, model_name="CAMeL-Lab/bert-base-arabic-camelbert-msa"):
        """Load tokenizer and model and select the compute device.

        Args:
            model_name: Identifier or path handed to ``from_pretrained``.
        """
        # Class id -> country code of the dialect.
        self.dialect_mapping = {
            0: 'OM',   # Oman
            1: 'SD',   # Sudan
            2: 'SA',   # Saudi Arabia
            3: 'KW',   # Kuwait
            4: 'QA',   # Qatar
            5: 'LB',   # Lebanon
            6: 'JO',   # Jordan
            7: 'SY',   # Syria
            8: 'IQ',   # Iraq
            9: 'MA',   # Morocco
            10: 'EG',  # Egypt
            11: 'PL',  # Palestine
            12: 'YE',  # Yemen
            13: 'BH',  # Bahrain
            14: 'DZ',  # Algeria
            15: 'AE',  # United Arab Emirates
            16: 'TN',  # Tunisia
            17: 'LY',  # Libya
        }
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # One output class per dialect (18); derived from the mapping so the
        # classification head and the label table cannot drift apart.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=len(self.dialect_mapping)
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def tokenize_data(self, examples):
        """Tokenize the ``'text'`` field of a batch.

        Every sequence is padded or truncated to exactly 128 tokens.

        Args:
            examples: Mapping with a ``'text'`` entry.

        Returns:
            Whatever ``self.tokenizer`` returns for that text.
        """
        return self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128,
        )

    def prepare_dataset(self, dataset):
        """Tokenize ``dataset`` and shape it for training.

        The raw ``text`` and ``id`` columns are dropped, ``label`` is renamed
        to ``labels`` and the output format is set to ``'torch'``. Columns
        that are not present are skipped instead of raising.

        Args:
            dataset: Object offering ``map``, ``remove_columns``,
                ``rename_column`` and ``set_format``.

        Returns:
            The tokenized dataset.
        """
        tokenized = dataset.map(self.tokenize_data, batched=True)

        columns = getattr(tokenized, "column_names", None)
        if isinstance(columns, (list, tuple)):
            to_drop = [name for name in ('text', 'id') if name in columns]
            rename_label = 'label' in columns and 'labels' not in columns
        else:
            # Column list unavailable in a usable form: apply the fixed
            # schema and let the dataset report any mismatch.
            to_drop = ['text', 'id']
            rename_label = True

        if to_drop:
            tokenized = tokenized.remove_columns(to_drop)
        if rename_label:
            tokenized = tokenized.rename_column('label', 'labels')
        tokenized.set_format('torch')
        return tokenized

    def train(self, train_dataset, eval_dataset=None, output_dir="./trained_model", num_train_epochs=3):
        """Fine-tune the model, optionally evaluate it, and save it.

        When a non-empty ``eval_dataset`` is given, evaluation runs every
        epoch, the best checkpoint by ``f1`` is loaded at the end, and a
        final evaluation is printed.

        Args:
            train_dataset: Dataset used for training.
            eval_dataset: Optional dataset used for evaluation; ``None`` or
                an empty dataset disables evaluation.
            output_dir: Directory for checkpoints and the final model and
                tokenizer.
            num_train_epochs: Number of training epochs.

        Returns:
            The metrics returned by ``trainer.evaluate()`` when evaluation
            is enabled, otherwise ``None``.
        """
        # Decide once whether evaluation is enabled so every dependent
        # setting below stays consistent.
        has_eval = bool(eval_dataset)

        print("تهيئة معلمات التدريب...")
        # Newer transformers releases call this argument `eval_strategy`,
        # older ones `evaluation_strategy`; use the name the installed
        # TrainingArguments accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        if "eval_strategy" in accepted:
            strategy_key = "eval_strategy"
        else:
            strategy_key = "evaluation_strategy"

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            save_strategy="epoch",
            load_best_model_at_end=has_eval,
            metric_for_best_model="f1" if has_eval else None,
            **{strategy_key: "epoch" if has_eval else "no"},
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        print("بدء التدريب...")
        trainer.train()

        results = None
        if has_eval:
            print("تقييم النموذج...")
            results = trainer.evaluate()
            print(f"نتائج التقييم: {results}")

        print("حفظ النموذج...")
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print("تم حفظ النموذج بنجاح!")
        return results
def main():
    """Load the dialect dataset, tokenize its splits and run training.

    Uses the ``train`` split for training and the ``validation`` split for
    evaluation, with a default-configured ``ArabicDialectTrainer``.
    """
    print("تحميل مجموعة البيانات...")
    raw_splits = load_dataset("Abdelrahman-Rezk/Arabic_Dialect_Identification")
    dialect_trainer = ArabicDialectTrainer()

    print("تجهيز البيانات للتدريب...")
    # Prepared in order: training split first, then validation.
    train_split, validation_split = [
        dialect_trainer.prepare_dataset(raw_splits[split_name])
        for split_name in ('train', 'validation')
    ]

    print("بدء عملية التدريب...")
    dialect_trainer.train(train_split, validation_split)


# Start the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()