Omartificial-Intelligence-Space committed on
Commit 4159f5f · verified · 1 Parent(s): 0b82379

update submit

Files changed (1)
  1. src/submission/submit.py +24 -13
src/submission/submit.py CHANGED
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO, FIXED_QUESTIONS_FILE
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -71,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name):
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -86,13 +86,12 @@ def evaluate_model_accuracy_by_subject(model_name):
         else:
             model = model.cpu()
 
-        # Load fixed questions from JSON file
-        fixed_questions_path = os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE)
-        if not os.path.exists(fixed_questions_path):
-            return "Fixed questions file not found. Please run the preselection step.", {}
+        # Load your custom MMMLU dataset from HuggingFace
+        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
 
-        with open(fixed_questions_path, 'r') as f:
-            fixed_questions = json.load(f)
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
@@ -110,12 +109,24 @@ Answer:"""
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-        for subject, questions in fixed_questions.items():
+        subjects = dataset.unique('Subject')
+        for subject in subjects:
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
+
+            # Sample num_questions_per_subject from each subject
+            if num_questions_per_subject > 0:
+                if len(subject_data) < num_questions_per_subject:
+                    print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
+                    selected_indices = range(len(subject_data))
+                else:
+                    selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
+                subject_data = subject_data.select(selected_indices)
+
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in questions:
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -224,7 +235,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=30)
        if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
            return styled_error(overall_accuracy)
    except Exception as e:
@@ -233,7 +244,7 @@ def add_new_eval(
    # Prepare results for storage
    results_dict = {
        "config": {
-            "model": model,  # Ensure 'model' key is present
+            "model_name": model,
            "base_model": base_model,
            "revision": revision,
            "precision": precision,
@@ -272,4 +283,4 @@ def add_new_eval(
    # Remove the local results file
    os.remove(results_file_path)
 
-    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
+    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
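For context, this commit swaps the pre-selected JSON question file for on-the-fly sampling from the Hugging Face dataset. Below is a minimal, self-contained sketch of just that sampling step, assuming the `datasets` library is installed, the dataset's `test` split exposes `Subject` and `Question` columns as the diff implies, and using an empty `excluded_subjects` placeholder (submit.py defines the real exclusion list elsewhere).

```python
# Sketch of the per-subject sampling introduced in this commit (not the full submit.py).
import random

from datasets import load_dataset

excluded_subjects = []          # placeholder; submit.py defines the actual list
num_questions_per_subject = 30  # same default as the new function signature

# Load the test split and drop excluded subjects, mirroring the diff.
dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU", split="test")
dataset = dataset.filter(lambda x: x["Subject"] not in excluded_subjects)

sampled = {}
for subject in dataset.unique("Subject"):
    subject_data = dataset.filter(lambda x: x["Subject"] == subject)
    # Take up to num_questions_per_subject random questions per subject.
    k = min(num_questions_per_subject, len(subject_data))
    indices = random.sample(range(len(subject_data)), k)
    sampled[subject] = subject_data.select(indices)
    print(f"{subject}: sampled {len(sampled[subject])} questions")
```

One consequence of this change worth noting: because `random.sample` is unseeded, repeated submissions are scored on different question subsets; seeding `random` (or pre-selecting questions, as the removed code path did) would make evaluations reproducible.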