Omartificial-Intelligence-Space committed
update submit

src/submission/submit.py  CHANGED  (+24 -13)
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -71,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name):
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
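This hunk only widens the signature of `evaluate_model_accuracy_by_subject` to take `num_questions_per_subject` (default 30). The `get_top_prediction` helper named in the hunk header is unchanged and its body is not part of this diff; below is a rough sketch of what such a helper usually does, scoring the next-token logits of the four option letters. The function name, signature, and scoring details here are assumptions, not the code from `submit.py`.

```python
import torch

def get_top_prediction_sketch(text, tokenizer, model):
    """Illustrative only: the real get_top_prediction in submit.py is not
    shown in this diff and may tokenize or score the options differently."""
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # logits for the next token

    # Compare the logit of the first sub-token of each option letter.
    scores = {}
    for option in ["A", "B", "C", "D"]:
        token_ids = tokenizer.encode(option, add_special_tokens=False)
        scores[option] = logits[token_ids[0]].item()
    return max(scores, key=scores.get)
```

Whatever the exact scoring, the helper returns one of "A" to "D", which the evaluation loop presumably compares against the gold answer when counting `correct_predictions`.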
@@ -86,13 +86,12 @@ def evaluate_model_accuracy_by_subject(model_name):
         else:
             model = model.cpu()
 
-        # Load
-
-
-            return "Fixed questions file not found. Please run the preselection step.", {}
+        # Load your custom MMMLU dataset from HuggingFace
+        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
 
-
-
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
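Instead of reading a preselected questions file from disk (the removed branch that could return "Fixed questions file not found. Please run the preselection step."), the function now pulls the test split of `Omartificial-Intelligence-Space/Arabic_Openai_MMMLU` directly from the Hub and drops excluded subjects before anything is scored. A standalone sketch of just this step, assuming `excluded_subjects` is the list already defined elsewhere in `submit.py`:

```python
from datasets import load_dataset

# Assumption: submit.py defines this list elsewhere; a placeholder is used
# here only so the sketch runs on its own.
excluded_subjects = ["example_subject_to_skip"]

# Load the Arabic MMMLU dataset from the Hub and keep its test split.
dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
dataset = dataset["test"]

# Drop subjects that should not be evaluated.
dataset = dataset.filter(lambda x: x["Subject"] not in excluded_subjects)

print(len(dataset), dataset.unique("Subject")[:5])  # quick sanity check
```

Filtering on the `Subject` column up front means the per-subject loop added in the next hunk only ever sees subjects that are meant to be scored.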
@@ -110,12 +109,24 @@ Answer:"""
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-
+        subjects = dataset.unique('Subject')
+        for subject in subjects:
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
+
+            # Sample num_questions_per_subject from each subject
+            if num_questions_per_subject > 0:
+                if len(subject_data) < num_questions_per_subject:
+                    print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
+                    selected_indices = range(len(subject_data))
+                else:
+                    selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
+                subject_data = subject_data.select(selected_indices)
+
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
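Most of the new lines are the per-subject sampling loop: each unique `Subject` is filtered into its own subset, at most `num_questions_per_subject` rows are drawn with `random.sample`, and the existing per-question scoring runs over that subset. The same logic pulled out into a helper for illustration; the helper name and the seed comment are not in the diff:

```python
import random
from datasets import Dataset

def sample_per_subject(dataset: Dataset, num_questions_per_subject: int = 30):
    """Yield (subject, subset) pairs, mirroring the loop added in this commit."""
    for subject in dataset.unique("Subject"):
        subject_data = dataset.filter(lambda x: x["Subject"] == subject)

        if num_questions_per_subject > 0:
            if len(subject_data) < num_questions_per_subject:
                # Not enough rows for this subject: keep everything, as the diff does.
                print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
                selected_indices = range(len(subject_data))
            else:
                selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
            subject_data = subject_data.select(selected_indices)

        yield subject, subject_data

# random.seed(0)  # not in the diff; without a seed, resubmissions sample different questions
```

Because the commit does not seed `random`, two evaluations of the same model can be scored on different 30-question samples per subject, which is worth keeping in mind when comparing leaderboard numbers.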
@@ -224,7 +235,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=30)
        if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
            return styled_error(overall_accuracy)
    except Exception as e:
@@ -233,7 +244,7 @@ def add_new_eval(
     # Prepare results for storage
     results_dict = {
         "config": {
-            "
+            "model_name": model,
             "base_model": base_model,
             "revision": revision,
             "precision": precision,
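`results_dict["config"]` now records the submitted repo id under `model_name`. The surrounding code in `add_new_eval` (not shown in these hunks) writes the dict to a local JSON file, presumably pushes it to `RESULTS_REPO` through the `API` client imported from `src.envs`, and, as the final hunk shows, removes the local file afterwards. A hedged sketch of that write, upload, and cleanup flow; the file-name pattern and the sample values are made up for illustration:

```python
import json
import os

from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO  # same objects submit.py imports

# Illustrative values; in submit.py these come from the submission form and the evaluation.
model = "org/some-model"
results_dict = {
    "config": {
        "model_name": model,  # field added in this commit
        "base_model": "",
        "revision": "main",
        "precision": "float16",
    },
}

# Assumed flow: write locally, upload to the results dataset repo, then clean up.
results_file_path = os.path.join(EVAL_RESULTS_PATH, f"{model.replace('/', '_')}_results.json")
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)
with open(results_file_path, "w") as f:
    json.dump(results_dict, f, indent=2)

API.upload_file(
    path_or_fileobj=results_file_path,
    path_in_repo=os.path.basename(results_file_path),
    repo_id=RESULTS_REPO,
    repo_type="dataset",
)
os.remove(results_file_path)
```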
@@ -272,4 +283,4 @@ def add_new_eval(
     # Remove the local results file
     os.remove(results_file_path)
 
-    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
+    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")