Open source eval metrics and library?

#1
by er1k0 - opened

Hi,

We've been evaluating the DeepSeek 1.5B model and noticed some significant differences between our evaluation scores and the ones you published. We're particularly interested in understanding how you achieved the reported 42 on MMLU and 74 on GSM8K, as our results on these datasets are considerably lower.

It would be great if you could share more details about your eval process:

  • Would it be possible to open-source your evaluation metrics and library code?
  • Are you using a perplexity-based approach for the scoring datasets, where you score the option text? (A rough sketch of what we mean is below.)
  • Is there any prompt engineering involved to instruct the model?
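For context, this is roughly what we mean by scoring the option text: pick the choice whose continuation has the highest log-likelihood under the model. A minimal sketch of our approach, assuming a standard HF causal LM and a made-up question/choices pair (not your actual eval code):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"
tok = AutoTokenizer.from_pretrained(model_id)
lm  = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")

prompt  = "Question: What is 2 + 2?\nAnswer:"   # hypothetical example
choices = [" 3", " 4", " 5", " 22"]

scores = []
for choice in choices:
    # note: re-tokenizing prompt + choice can shift token boundaries; fine for a sketch
    ctx_len  = tok(prompt, return_tensors="pt").input_ids.shape[1]
    full_ids = tok(prompt + choice, return_tensors="pt").input_ids.to(lm.device)
    with torch.no_grad():
        logprobs = torch.log_softmax(lm(full_ids).logits[:, :-1], dim=-1)
    # sum the log-probs of the continuation (choice) tokens only
    cont = full_ids[:, ctx_len:]
    scores.append(logprobs[:, ctx_len - 1:, :].gather(-1, cont.unsqueeze(-1)).sum().item())

print("predicted option index:", scores.index(max(scores)))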

Thanks in advance!

Mobius Labs GmbH org

Hi, you're probably not running it with the right settings. Below is the code; I just ran it and it works fine:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

compute_dtype  = torch.bfloat16 
cache_dir      = None
attn_implem    = "flash_attention_2" 
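# Note: "flash_attention_2" requires the flash-attn package to be installed;
# if it isn't, "sdpa" (or "eager") works as a drop-in replacement for this script.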
model_id       = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"

#Load model
model     = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir, torch_dtype=compute_dtype, attn_implementation=attn_implem, device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
###########################################################################################

import numpy as np
import copy
import lm_eval
model.eval();
model.config.use_cache = False
try:
    # some lm_eval versions need tasks registered explicitly; others don't
    # expose initialize_tasks() at all, hence the guard
    lm_eval.tasks.initialize_tasks()
except Exception:
    pass
model_eval      = lm_eval.models.huggingface.HFLM(pretrained=model, tokenizer=tokenizer)
eval_batch_size = 1 

results = {}

########################################################################################################################
#GSM8K
for task in [("gsm8k", 5)]:
    tag, fewshot = task
    results[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results[tag])
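# (optional) the harness also reports a looser answer-extraction metric; printed here
# for comparison -- key name assumed from recent lm_eval GSM8K configs
print('flexible-extract:', results[tag][tag].get('exact_match,flexible-extract'))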
results[tag] = results[tag][tag]['exact_match,strict-match']  # keep only the strict-match score (the published number)

#'exact_match,strict-match': np.float64(0.7475360121304018)

########################################################################################################################
#MMLU
results_mmlu = {}
for task in [("mmlu", 5)]:
    tag, fewshot = task
    results_mmlu[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results_mmlu[tag])

mmlu_list    = "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
mmlu_list    = [l.replace('hendrycksTest-','') for l in mmlu_list.split(',')]
results_mmlu = results_mmlu['mmlu']

k = []
for r in results_mmlu:
    if np.any([(l in r) for l in mmlu_list]):
        k.append(results_mmlu[r]['acc,none'])

assert len(k)==57

results['mmlu'] = np.mean(k)
print('MMLU avg acc', results['mmlu']) 

#MMLU avg acc 0.4183564871541478
########################################################################################################################
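Side note: depending on your lm_eval version, simple_evaluate may already report a group-level 'mmlu' row, in which case the manual averaging above is just a sanity check. A small sketch of that shortcut, plus printing package versions so you can compare environments (assumes pip-installed packages; not part of the run above):

from importlib.metadata import version

# group-level aggregate, if the installed harness emits one
group_row = results_mmlu.get('mmlu')
if group_row is not None:
    print('MMLU group aggregate acc', group_row.get('acc,none'))

# versions matter when trying to reproduce the numbers above
for pkg in ('lm_eval', 'transformers', 'torch'):
    print(pkg, version(pkg))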
