Open source eval metrics and library?
#1 · opened by er1k0
Hi,
We've been evaluating the DeepSeek 1.5B model and noticed some significant differences between our evaluation scores and the ones you published. We're particularly interested in understanding how you achieved the reported 42 on MMLU and 74 on GSM8K, as our results on these datasets are considerably lower.
It would be great if you could share more details about your eval process:
- Would it be possible to open-source your evaluation metrics and library code?
- Are you using a perplexity-based approach for the multiple-choice datasets, where you score the option text? (See the sketch below for what we mean.)
- Is there any prompt engineering involved to instruct the model?
Thanks in advance!
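For context, here's a rough sketch of the kind of perplexity/log-likelihood option scoring we have in mind. It's purely illustrative (the prompt format and the helper function are our own assumptions, not your code): it picks the option whose continuation gets the highest total log-likelihood under the model.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"
tok = AutoTokenizer.from_pretrained(model_id)
lm = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")

@torch.no_grad()
def option_loglikelihood(prompt, option):
    # Score only the option tokens, conditioned on the prompt.
    # Assumes the prompt's tokenization is a prefix of the full tokenization
    # (usually true, but worth checking for your tokenizer).
    prompt_ids = tok(prompt, return_tensors="pt").input_ids.to(lm.device)
    full_ids = tok(prompt + option, return_tensors="pt").input_ids.to(lm.device)
    logits = lm(full_ids).logits[:, :-1]            # position t predicts token t+1
    targets = full_ids[:, 1:]
    logprobs = torch.log_softmax(logits.float(), dim=-1)
    token_lp = logprobs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    n_prompt = prompt_ids.shape[1]
    return token_lp[:, n_prompt - 1:].sum().item()  # log-likelihood of the option tokens only

question = "Question: What is the capital of France?\nAnswer:"
options = [" Paris", " Berlin", " Rome", " Madrid"]
scores = [option_loglikelihood(question, o) for o in options]
print(options[int(torch.tensor(scores).argmax())])  # pick the highest-scoring option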
Hi, you're probably not running it with the right settings. Below is the code; I just ran it and it works fine:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
compute_dtype = torch.bfloat16
cache_dir = None
attn_implem = "flash_attention_2"  # requires the flash-attn package; "sdpa" also works if it's not installed
model_id = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir, torch_dtype=compute_dtype, attn_implementation=attn_implem, device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
###########################################################################################
import numpy as np
import lm_eval
model.eval()
model.config.use_cache = False

# Older versions of lm-eval-harness require explicit task registration
try:
    lm_eval.tasks.initialize_tasks()
except Exception:
    pass

# Wrap the model/tokenizer so the harness can drive them
model_eval = lm_eval.models.huggingface.HFLM(pretrained=model, tokenizer=tokenizer)

eval_batch_size = 1
results = {}
########################################################################################################################
# GSM8K (5-shot)
for task in [("gsm8k", 5)]:
    tag, fewshot = task
    results[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results[tag])
    # Keep only the strict-match exact-match score
    results[tag] = results[tag][tag]['exact_match,strict-match']
# 'exact_match,strict-match': np.float64(0.7475360121304018)
########################################################################################################################
# MMLU (5-shot): run the full benchmark, then average accuracy over the 57 subtasks
results_mmlu = {}
for task in [("mmlu", 5)]:
    tag, fewshot = task
    results_mmlu[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results_mmlu[tag])

# The 57 MMLU subtasks (legacy hendrycksTest-* names, stripped below to the harness's subtask names)
mmlu_list = "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
mmlu_list = [l.replace('hendrycksTest-','') for l in mmlu_list.split(',')]
results_mmlu = results_mmlu['mmlu']
k = []
for r in results_mmlu:
    # Collect the accuracy of every result entry that matches one of the 57 subtasks
    if np.any([(l in r) for l in mmlu_list]):
        k.append(results_mmlu[r]['acc,none'])
assert len(k) == 57

# Unweighted (macro) average accuracy across the 57 subtasks
results['mmlu'] = np.mean(k)
print('MMLU avg acc', results['mmlu'])
# MMLU avg acc 0.4183564871541478
########################################################################################################################
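In case it helps, on recent lm-eval-harness versions (0.4.x-style API) you can also let simple_evaluate load the model straight from the Hub instead of wrapping it in HFLM yourself. A minimal sketch; double-check the argument names against your installed harness version:

import lm_eval

# Assumption: 0.4.x-style API where `model` can be the backend name ("hf")
# and `model_args` a comma-separated string of HF loading kwargs.
out = lm_eval.evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1,dtype=bfloat16",
    tasks=["gsm8k"],
    num_fewshot=5,
    batch_size=1,
)
print(out["results"]["gsm8k"])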