"""Fine-tune a BERT sequence classifier to identify Arabic dialects by country."""

import dataclasses

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Class index -> country code of the dialect. The number of entries is the
# single source of truth for the size of the classification head.
DIALECT_MAPPING = {
    0: 'OM',   # Oman
    1: 'SD',   # Sudan
    2: 'SA',   # Saudi Arabia
    3: 'KW',   # Kuwait
    4: 'QA',   # Qatar
    5: 'LB',   # Lebanon
    6: 'JO',   # Jordan
    7: 'SY',   # Syria
    8: 'IQ',   # Iraq
    9: 'MA',   # Morocco
    10: 'EG',  # Egypt
    11: 'PL',  # Palestine
    12: 'YE',  # Yemen
    13: 'BH',  # Bahrain
    14: 'DZ',  # Algeria
    15: 'AE',  # United Arab Emirates
    16: 'TN',  # Tunisia
    17: 'LY',  # Libya
}


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: Object exposing ``label_ids`` (true class indices) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the score sklearn already assigns to classes that
    # are never predicted (0) but silences the warning it would emit for them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


def _eval_strategy_key():
    """Return the TrainingArguments keyword that sets the evaluation strategy.

    Newer transformers releases name the field ``eval_strategy``; older ones
    only know ``evaluation_strategy``. Inspecting the dataclass fields lets the
    same script run against either.
    """
    field_names = {field.name for field in dataclasses.fields(TrainingArguments)}
    if "eval_strategy" in field_names:
        return "eval_strategy"
    return "evaluation_strategy"


class ArabicDialectTrainer:
    """Wrap tokenizer, model and training loop for dialect classification."""

    def __init__(self, model_name="CAMeL-Lab/bert-base-arabic-camelbert-msa"):
        """Load the tokenizer and a classification model for ``model_name``.

        Args:
            model_name: Identifier or path handed to ``from_pretrained`` for
                both the tokenizer and the model.
        """
        # Per-instance copy so callers may edit it without touching the
        # module-level table.
        self.dialect_mapping = dict(DIALECT_MAPPING)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # One output class per dialect; the label names are stored in the
        # model config so a saved checkpoint carries readable labels.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(self.dialect_mapping),
            id2label=self.dialect_mapping,
            label2id={code: idx for idx, code in self.dialect_mapping.items()},
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def tokenize_data(self, examples):
        """Tokenize ``examples['text']``, padded/truncated to 128 tokens."""
        return self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128,
        )

    def prepare_dataset(self, dataset):
        """Tokenize a dataset split and shape it for training.

        The split must have a ``text`` column (tokenized, then dropped) and a
        ``label`` column (renamed to ``labels``). An ``id`` column is dropped
        when present.

        Args:
            dataset: Dataset split providing ``map``, ``column_names``,
                ``remove_columns``, ``rename_column`` and ``set_format``.

        Returns:
            The tokenized dataset, formatted as torch tensors.
        """
        tokenized_dataset = dataset.map(self.tokenize_data, batched=True)
        # Only drop columns that actually exist, so a split without an 'id'
        # column does not make remove_columns fail.
        removable = [
            column for column in ('text', 'id')
            if column in tokenized_dataset.column_names
        ]
        tokenized_dataset = tokenized_dataset.remove_columns(removable)
        tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
        tokenized_dataset.set_format('torch')
        return tokenized_dataset

    def train(self, train_dataset, eval_dataset=None,
              output_dir="./trained_model", num_train_epochs=3):
        """Fine-tune the model, optionally evaluate it, and save the result.

        Args:
            train_dataset: Prepared training split.
            eval_dataset: Optional prepared evaluation split. When it is
                missing or empty, per-epoch evaluation and best-model
                selection are turned off.
            output_dir: Directory for checkpoints and the final model and
                tokenizer.
            num_train_epochs: Number of training epochs.
        """
        print("تهيئة معلمات التدريب...")
        has_eval = bool(eval_dataset)

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            save_strategy="epoch",
            load_best_model_at_end=has_eval,
            metric_for_best_model="f1" if has_eval else None,
            # Keyword name depends on the installed transformers version.
            **{_eval_strategy_key(): "epoch" if has_eval else "no"},
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        print("بدء التدريب...")
        trainer.train()

        if has_eval:
            print("تقييم النموذج...")
            results = trainer.evaluate()
            print(f"نتائج التقييم: {results}")

        print("حفظ النموذج...")
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print("تم حفظ النموذج بنجاح!")


def main():
    """Load the dialect dataset, prepare its splits, and run training."""
    print("تحميل مجموعة البيانات...")
    dataset = load_dataset("Abdelrahman-Rezk/Arabic_Dialect_Identification")

    trainer = ArabicDialectTrainer()

    print("تجهيز البيانات للتدريب...")
    train_dataset = trainer.prepare_dataset(dataset['train'])
    eval_dataset = trainer.prepare_dataset(dataset['validation'])

    print("بدء عملية التدريب...")
    trainer.train(train_dataset, eval_dataset)


if __name__ == "__main__":
    main()