Omartificial-Intelligence-Space committed
update submit

src/submission/submit.py  CHANGED  (+24 -13)
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -71,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name):
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
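This hunk only widens the signature of `evaluate_model_accuracy_by_subject` to take `num_questions_per_subject` (default 30). The `get_top_prediction` helper named in the hunk header is unchanged and its body is not part of this diff; below is a rough sketch of what such a helper usually does, scoring the next-token logits of the four option letters. The function name, signature, and scoring details here are assumptions, not the code from `submit.py`.

```python
import torch

def get_top_prediction_sketch(text, tokenizer, model):
    """Illustrative only: the real get_top_prediction in submit.py is not
    shown in this diff and may tokenize or score the options differently."""
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # logits for the next token

    # Compare the logit of the first sub-token of each option letter.
    scores = {}
    for option in ["A", "B", "C", "D"]:
        token_ids = tokenizer.encode(option, add_special_tokens=False)
        scores[option] = logits[token_ids[0]].item()
    return max(scores, key=scores.get)
```

Whatever the exact scoring, the helper returns one of "A" to "D", which the evaluation loop presumably compares against the gold answer when counting `correct_predictions`.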
@@ -86,13 +86,12 @@ def evaluate_model_accuracy_by_subject(model_name):
         else:
             model = model.cpu()
 
-        # Load
-
-
-            return "Fixed questions file not found. Please run the preselection step.", {}
+        # Load your custom MMMLU dataset from HuggingFace
+        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
 
-
-
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
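Instead of reading a preselected questions file from disk (the removed branch that could return "Fixed questions file not found. Please run the preselection step."), the function now pulls the test split of `Omartificial-Intelligence-Space/Arabic_Openai_MMMLU` directly from the Hub and drops excluded subjects before anything is scored. A standalone sketch of just this step, assuming `excluded_subjects` is the list already defined elsewhere in `submit.py`:

```python
from datasets import load_dataset

# Assumption: submit.py defines this list elsewhere; a placeholder is used
# here only so the sketch runs on its own.
excluded_subjects = ["example_subject_to_skip"]

# Load the Arabic MMMLU dataset from the Hub and keep its test split.
dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
dataset = dataset["test"]

# Drop subjects that should not be evaluated.
dataset = dataset.filter(lambda x: x["Subject"] not in excluded_subjects)

print(len(dataset), dataset.unique("Subject")[:5])  # quick sanity check
```

Filtering on the `Subject` column up front means the per-subject loop added in the next hunk only ever sees subjects that are meant to be scored.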
@@ -110,12 +109,24 @@ Answer:"""
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-
+        subjects = dataset.unique('Subject')
+        for subject in subjects:
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
+
+            # Sample num_questions_per_subject from each subject
+            if num_questions_per_subject > 0:
+                if len(subject_data) < num_questions_per_subject:
+                    print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
+                    selected_indices = range(len(subject_data))
+                else:
+                    selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
+                subject_data = subject_data.select(selected_indices)
+
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
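Most of the new lines are the per-subject sampling loop: each unique `Subject` is filtered into its own subset, at most `num_questions_per_subject` rows are drawn with `random.sample`, and the existing per-question scoring runs over that subset. The same logic pulled out into a helper for illustration; the helper name and the seed comment are not in the diff:

```python
import random
from datasets import Dataset

def sample_per_subject(dataset: Dataset, num_questions_per_subject: int = 30):
    """Yield (subject, subset) pairs, mirroring the loop added in this commit."""
    for subject in dataset.unique("Subject"):
        subject_data = dataset.filter(lambda x: x["Subject"] == subject)

        if num_questions_per_subject > 0:
            if len(subject_data) < num_questions_per_subject:
                # Not enough rows for this subject: keep everything, as the diff does.
                print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
                selected_indices = range(len(subject_data))
            else:
                selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
            subject_data = subject_data.select(selected_indices)

        yield subject, subject_data

# random.seed(0)  # not in the diff; without a seed, resubmissions sample different questions
```

Because the commit does not seed `random`, two evaluations of the same model can be scored on different 30-question samples per subject, which is worth keeping in mind when comparing leaderboard numbers.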
@@ -224,7 +235,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=30)
        if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
            return styled_error(overall_accuracy)
    except Exception as e:
@@ -233,7 +244,7 @@ def add_new_eval(
     # Prepare results for storage
     results_dict = {
         "config": {
-            "
+            "model_name": model,
             "base_model": base_model,
             "revision": revision,
             "precision": precision,
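`results_dict["config"]` now records the submitted repo id under `model_name`. The surrounding code in `add_new_eval` (not shown in these hunks) writes the dict to a local JSON file, presumably pushes it to `RESULTS_REPO` through the `API` client imported from `src.envs`, and, as the final hunk shows, removes the local file afterwards. A hedged sketch of that write, upload, and cleanup flow; the file-name pattern and the sample values are made up for illustration:

```python
import json
import os

from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO  # same objects submit.py imports

# Illustrative values; in submit.py these come from the submission form and the evaluation.
model = "org/some-model"
results_dict = {
    "config": {
        "model_name": model,  # field added in this commit
        "base_model": "",
        "revision": "main",
        "precision": "float16",
    },
}

# Assumed flow: write locally, upload to the results dataset repo, then clean up.
results_file_path = os.path.join(EVAL_RESULTS_PATH, f"{model.replace('/', '_')}_results.json")
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)
with open(results_file_path, "w") as f:
    json.dump(results_dict, f, indent=2)

API.upload_file(
    path_or_fileobj=results_file_path,
    path_in_repo=os.path.basename(results_file_path),
    repo_id=RESULTS_REPO,
    repo_type="dataset",
)
os.remove(results_file_path)
```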
@@ -272,4 +283,4 @@ def add_new_eval(
     # Remove the local results file
     os.remove(results_file_path)
 
-    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
+    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")