Open source eval metrics and library?

#1
by er1k0 - opened

Hi,

We've been evaluating the DeepSeek 1.5B model and noticed some significant differences between our evaluation scores and the ones you published. We're particularly interested in understanding how you achieved the reported 42 on MMLU and 74 on GSM8K, as our results on these datasets are considerably lower.

It would be great if you could share more details about your eval process:

  • Would it be possible to open-source your evaluation metrics and library code?
  • Are you using a perplexity-based approach for the scoring datasets, where you score the option text? (A rough sketch of what we mean is below.)
  • Is there any prompt engineering involved to instruct the model?
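For context, this is roughly what we mean by scoring the option text: pick the choice whose continuation has the highest log-likelihood under the model. A minimal sketch of our approach, assuming a standard HF causal LM and a made-up question/choices pair (not your actual eval code):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"
tok = AutoTokenizer.from_pretrained(model_id)
lm  = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")

prompt  = "Question: What is 2 + 2?\nAnswer:"   # hypothetical example
choices = [" 3", " 4", " 5", " 22"]

scores = []
for choice in choices:
    # note: re-tokenizing prompt + choice can shift token boundaries; fine for a sketch
    ctx_len  = tok(prompt, return_tensors="pt").input_ids.shape[1]
    full_ids = tok(prompt + choice, return_tensors="pt").input_ids.to(lm.device)
    with torch.no_grad():
        logprobs = torch.log_softmax(lm(full_ids).logits[:, :-1], dim=-1)
    # sum the log-probs of the continuation (choice) tokens only
    cont = full_ids[:, ctx_len:]
    scores.append(logprobs[:, ctx_len - 1:, :].gather(-1, cont.unsqueeze(-1)).sum().item())

print("predicted option index:", scores.index(max(scores)))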

Thanks in advance!

Mobius Labs GmbH org

Hi, you're probably not running it with the right settings. Below is the code; I just ran it and it works fine:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

compute_dtype  = torch.bfloat16 
cache_dir      = None
attn_implem    = "flash_attention_2" 
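# Note: "flash_attention_2" requires the flash-attn package to be installed;
# if it isn't, "sdpa" (or "eager") works as a drop-in replacement for this script.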
model_id       = "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1"

#Load model
model     = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir, torch_dtype=compute_dtype, attn_implementation=attn_implem, device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
###########################################################################################

import numpy as np
import copy
import lm_eval
model.eval();
model.config.use_cache = False
try:
    # some lm_eval versions need tasks registered explicitly; others don't
    # expose initialize_tasks() at all, hence the guard
    lm_eval.tasks.initialize_tasks()
except Exception:
    pass
model_eval      = lm_eval.models.huggingface.HFLM(pretrained=model, tokenizer=tokenizer)
eval_batch_size = 1 

results = {}

########################################################################################################################
#GSM8K
for task in [("gsm8k", 5)]:
    tag, fewshot = task
    results[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results[tag])
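# (optional) the harness also reports a looser answer-extraction metric; printed here
# for comparison -- key name assumed from recent lm_eval GSM8K configs
print('flexible-extract:', results[tag][tag].get('exact_match,flexible-extract'))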
results[tag] = results[tag][tag]['exact_match,strict-match']  # keep only the strict-match score (the published number)

#'exact_match,strict-match': np.float64(0.7475360121304018)

########################################################################################################################
#MMLU
results_mmlu = {}
for task in [("mmlu", 5)]:
    tag, fewshot = task
    results_mmlu[tag] = lm_eval.evaluator.simple_evaluate(model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size)['results']
    print(tag, results_mmlu[tag])

mmlu_list    = "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
mmlu_list    = [l.replace('hendrycksTest-','') for l in mmlu_list.split(',')]
results_mmlu = results_mmlu['mmlu']

k = []
for r in results_mmlu:
    if np.any([(l in r) for l in mmlu_list]):
        k.append(results_mmlu[r]['acc,none'])

assert len(k)==57

results['mmlu'] = np.mean(k)
print('MMLU avg acc', results['mmlu']) 

#MMLU avg acc 0.4183564871541478
########################################################################################################################
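Side note: depending on your lm_eval version, simple_evaluate may already report a group-level 'mmlu' row, in which case the manual averaging above is just a sanity check. A small sketch of that shortcut, plus printing package versions so you can compare environments (assumes pip-installed packages; not part of the run above):

from importlib.metadata import version

# group-level aggregate, if the installed harness emits one
group_row = results_mmlu.get('mmlu')
if group_row is not None:
    print('MMLU group aggregate acc', group_row.get('acc,none'))

# versions matter when trying to reproduce the numbers above
for pkg in ('lm_eval', 'transformers', 'torch'):
    print(pkg, version(pkg))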
