Omartificial-Intelligence-Space committed
Commit c842ee2 · verified · 1 Parent(s): 7cbbcca

update submit

Files changed (1)
  1. src/submission/submit.py +250 -56
src/submission/submit.py CHANGED
@@ -1,8 +1,17 @@
-import json
 import os
+
+# Set environment variable for better memory management
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
+
+import json
 from datetime import datetime, timezone
 import random
 
 import torch
 import pandas as pd
 import numpy as np
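The new PYTORCH_CUDA_ALLOC_CONF line sits above the other imports because the CUDA caching allocator only reads the variable when it is first initialized; setting it after the first CUDA allocation has no effect. A minimal sketch of the intended ordering (illustrative only, not part of the commit):

import os
# Must be set before torch initializes CUDA, otherwise the allocator ignores it.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

import torch  # imported only after the allocator configuration is in place
print(torch.cuda.is_available())  # the allocator picks up max_split_size_mb on first CUDA use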
@@ -10,6 +19,9 @@ from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
+
+
+
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
@@ -19,11 +31,20 @@ from src.submission.check_validity import (
     is_model_on_hub,
 )
 
+
+
+
 import spaces
 
+
+
+
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
+
+
 # List of subjects to exclude from evaluation
 excluded_subjects = [
     "human_sexuality",
@@ -35,8 +56,11 @@ excluded_subjects = [
     "world_religions"
 ]
 
-def get_top_prediction(text, tokenizer, model):
-    inputs = tokenizer(text, return_tensors='pt')
+
+
+
+def get_top_prediction(batch_texts, tokenizer, model):
+    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
     if torch.cuda.is_available():
         model = model.cuda()
         inputs = {k: v.cuda() for k, v in inputs.items()}
@@ -44,48 +68,93 @@ def get_top_prediction(text, tokenizer, model):
         model = model.cpu()
         inputs = {k: v.cpu() for k, v in inputs.items()}
 
     with torch.no_grad():
         outputs = model(**inputs)
-        logits = outputs.logits[0, -1]  # Get logits of the last token
+        logits = outputs.logits[:, -1, :]  # Get logits of the last token for each input in the batch
 
     options = [' A', ' B', ' C', ' D']
-    option_logits = []
-
-    for option in options:
-        option_ids = tokenizer(option).input_ids
-        if option_ids and option_ids[-1] < logits.size(0):
-            option_id = option_ids[-1]
-            option_logit = logits[option_id]
-            option_logits.append((option_logit.item(), option.strip()))
-        else:
-            print(f"Skipping option '{option}' due to index out of range.")
-
-    if not option_logits:
-        return "No valid options"
-
-    top_option = max(option_logits, key=lambda x: x[0])[1]
-    return top_option
+    predictions = []
+
+    for i in range(len(batch_texts)):
+        option_logits = []
+        for option in options:
+            option_ids = tokenizer(option).input_ids
+            if option_ids and option_ids[-1] < logits.size(1):
+                option_logit = logits[i, option_ids[-1]].item()
+                option_logits.append((option_logit, option.strip()))
+            else:
+                print(f"Skipping option '{option}' due to index out of range for input {i}.")
+
+        if not option_logits:
+            predictions.append("No valid options")
+        else:
+            top_option = max(option_logits, key=lambda x: x[0])[1]
+            predictions.append(top_option)
+
+    return predictions
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100):
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100, batch_size=32):
     try:
+        # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
 
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             trust_remote_code=True
         )
+
+        # Convert model to FP16 (half precision) to reduce memory usage
+        model = model.half()
+
         if torch.cuda.is_available():
             model = model.cuda()  # Move model to GPU if available
         else:
             model = model.cpu()
 
+        # Load your custom MMMLU dataset from HuggingFace
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
         dataset = dataset['test']
 
+        # Filter out excluded subjects
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
+        # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
 Question: {Question}
 A) {A}
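A quick way to exercise the new batched scorer in isolation is to call it with a couple of hand-written prompts and a small public model; "gpt2" below is only a stand-in for illustration, not the model under evaluation:

from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # padding=True needs a pad token, mirroring the submit code
mdl = AutoModelForCausalLM.from_pretrained("gpt2")

prompts = [
    "Question: 2 + 2 = ?\nA) 3\nB) 4\nC) 5\nD) 6\nAnswer:",
    "Question: The capital of France?\nA) Paris\nB) Rome\nC) Cairo\nD) Lima\nAnswer:",
]
# Returns one letter (or "No valid options") per prompt, e.g. ['B', 'A'].
print(get_top_prediction(prompts, tok, mdl))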
@@ -94,20 +163,30 @@ C) {C}
 D) {D}
 Answer:"""
 
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
 
+        # Initialize results storage
         subject_results = {}
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-        subjects = dataset.unique('Subject')
-
-        # To track best performance per subject
-        best_in_class = {subject: {"model_name": None, "accuracy": 0} for subject in subjects}
-
+        subjects = dataset.unique('Subject')
         for subject in subjects:
             subject_data = dataset.filter(lambda x: x['Subject'] == subject)
 
+            # Sample num_questions_per_subject from each subject
             if num_questions_per_subject > 0:
                 if len(subject_data) < num_questions_per_subject:
                     print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
@@ -116,40 +195,74 @@ Answer:"""
                 selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
                 subject_data = subject_data.select(selected_indices)
 
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in subject_data:
-                text = prompt_template.format(
-                    Question=data['Question'],
-                    A=data['A'],
-                    B=data['B'],
-                    C=data['C'],
-                    D=data['D']
-                )
-
-                top_prediction = get_top_prediction(text, tokenizer, model)
-                is_correct = (top_prediction == data['Answer'])
-                correct_predictions += int(is_correct)
-                total_questions += 1
-                overall_correct_predictions += int(is_correct)
-                overall_total_questions += 1
-
-                results.append({
-                    'Question': data['Question'],
-                    'Answer': data['Answer'],
-                    'Prediction': top_prediction,
-                    'Correct': is_correct
-                })
+            model.eval()
+            # Batch processing
+            for i in range(0, len(subject_data), batch_size):
+                batch_data = subject_data[i:i + batch_size]
+
+                # Generate batch texts
+                batch_texts = [
+                    prompt_template.format(
+                        Question=batch_data['Question'][j],
+                        A=batch_data['A'][j],
+                        B=batch_data['B'][j],
+                        C=batch_data['C'][j],
+                        D=batch_data['D'][j]
+                    ) for j in range(len(batch_data['Question']))
+                ]
+
+                # Get the top predictions for the batch
+                batch_predictions = get_top_prediction(batch_texts, tokenizer, model)
+
+                for j in range(len(batch_data['Question'])):
+                    top_prediction = batch_predictions[j]
+                    is_correct = (top_prediction == batch_data['Answer'][j])
+                    correct_predictions += int(is_correct)
+                    total_questions += 1
+                    overall_correct_predictions += int(is_correct)
+                    overall_total_questions += 1
+
+                    results.append({
+                        'Question': batch_data['Question'][j],
+                        'Answer': batch_data['Answer'][j],
+                        'Prediction': top_prediction,
+                        'Correct': is_correct
+                    })
+
+            # Clear GPU memory after processing each subject
+            torch.cuda.empty_cache()
 
             accuracy = correct_predictions / total_questions if total_questions > 0 else 0
 
-            # Check if this model is the best for the current subject
-            if accuracy > best_in_class[subject]['accuracy']:
-                best_in_class[subject]['model_name'] = model_name
-                best_in_class[subject]['accuracy'] = accuracy
 
+            # Store results for this subject
             subject_results[subject] = {
                 'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
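The batch loop indexes batch_data['Question'][j] rather than iterating rows because slicing a datasets.Dataset returns a dict of column lists, not a list of row dicts. A small sketch with a toy dataset (the column names are illustrative):

from datasets import Dataset

toy = Dataset.from_dict({"Question": ["q1", "q2", "q3"], "Answer": ["A", "B", "C"]})
batch = toy[0:2]                  # dict of columns: {'Question': ['q1', 'q2'], 'Answer': ['A', 'B']}
print(batch["Question"][1])       # -> 'q2'
print(len(batch["Question"]))     # batch size, which is why the loop uses len(batch_data['Question'])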
@@ -157,20 +270,27 @@ Answer:"""
                 'Results DataFrame': pd.DataFrame(results)
             }
 
         overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
 
-        return overall_accuracy, subject_results, best_in_class
+        return overall_accuracy, subject_results
 
     except Exception as e:
         import traceback
         tb = traceback.format_exc()
         print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
-        return f"Error: {str(e)}", {}, {}
-
-def display_best_in_class(best_in_class):
-    print("\nBest Model in Each Subject:\n")
-    for subject, info in best_in_class.items():
-        print(f"{subject}: {info['model_name']} with accuracy: {info['accuracy'] * 100:.2f}%")
+        return f"Error: {str(e)}", {}
 
 def add_new_eval(
     model: str,
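With best_in_class removed, the evaluation function now returns a 2-tuple, and failures are signalled by an "Error: ..." string in place of the accuracy, which add_new_eval checks for explicitly. A sketch of how a caller handles both outcomes (the model id below is hypothetical):

overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(
    "some-org/some-model",  # hypothetical model id
    num_questions_per_subject=10,
    batch_size=8,
)
if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
    print("Evaluation failed:", overall_accuracy)
else:
    print(f"Overall accuracy: {overall_accuracy:.2f}%")
    for subject, data in subject_results.items():
        print(subject, data["Correct Predictions"], "/", data["Total Questions"])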
@@ -185,41 +305,101 @@ def add_new_eval(
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
     user_name = ""
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
 
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
 
     # Does the model actually exist?
     if revision == "":
         revision = "main"
 
+    # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
+    # Is the model info correctly filled?
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        return styled_error("Could not get your model information. Please fill it up properly.")
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+
+    # Were the model card and license filled?
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        return styled_error("Please select a license for your model")
+
+    modelcard_OK, error_msg = check_model_card(model)
+    if not modelcard_OK:
+        return styled_error(error_msg)
+
+    # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
+
+    # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results, best_in_class = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100, batch_size=32)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
         return styled_error(f"An error occurred during evaluation: {str(e)}")
 
-    # Display the best in class results
-    display_best_in_class(best_in_class)
-
     # Prepare results for storage
     results_dict = {
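The new pre-evaluation checks lean on Hugging Face Hub metadata: assuming API is a huggingface_hub.HfApi instance (as its model_info/upload_file calls suggest), model_info returns a ModelInfo whose likes and cardData fields feed the stored results. A minimal sketch of the calls involved, using "gpt2" purely as an example repo:

from huggingface_hub import HfApi

api = HfApi()
info = api.model_info(repo_id="gpt2", revision="main")
print(info.likes)                # integer like count, stored as "likes"
# cardData may be missing or lack a license, which is why the submit code wraps this in try/except.
print(info.cardData["license"])  # e.g. 'mit'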
@@ -231,22 +411,35 @@ def add_new_eval(
             "weight_type": weight_type,
             "model_type": model_type,
             "submitted_time": current_time,
+            "license": license,
+            "likes": model_info.likes,
+            "params": model_size,
+            "still_on_hub": True,
         },
         "results": {
             "average": overall_accuracy,
         },
     }
 
     # Include per-subject accuracies
     for subject, data in subject_results.items():
         accuracy = data['Accuracy']
         results_dict['results'][subject] = accuracy
 
     # Save results to a JSON file
     results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
     with open(results_file_path, "w") as f:
         json.dump(results_dict, f, indent=4)
 
     # Upload the results file
     API.upload_file(
         path_or_fileobj=results_file_path,
@@ -256,6 +449,7 @@ def add_new_eval(
         commit_message=f"Add results for {model}"
     )
 
+    # Remove the local results file
     os.remove(results_file_path)
 
     return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
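For reference, the upload at the end of add_new_eval follows the standard HfApi.upload_file pattern; the arguments elided in the hunk above are not reproduced here, and the paths and repo id below are purely illustrative:

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="local_results.json",          # illustrative local file
    path_in_repo="results/example_results.json",   # illustrative destination path
    repo_id="some-org/some-results-repo",          # illustrative; the real repo comes from src.envs
    repo_type="dataset",
    commit_message="Add results for some-org/some-model",
)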
 
 