import json
import os
from dataclasses import dataclass
from typing import Dict, List, Tuple

import dateutil.parser
import numpy as np

from src.display_models.utils import AutoEvalColumn, make_clickable_model

# Each benchmark is scored with the metric at the same index in METRICS
# (ARC and HellaSwag use acc_norm, MMLU uses acc, TruthfulQA uses mc2).
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
BENCH_TO_NAME = {
    "arc:challenge": AutoEvalColumn.arc.name,
    "hellaswag": AutoEvalColumn.hellaswag.name,
    "hendrycksTest": AutoEvalColumn.mmlu.name,
    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
}


@dataclass
class EvalResult:
    """One model's evaluation (all benchmarks merged), convertible to a leaderboard row."""

    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = "Original"
    date: str = ""

    def to_dict(self):
        from src.load_from_hub import is_model_on_hub

        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        data_dict = {}
        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
        data_dict[AutoEvalColumn.still_on_hub.name] = (
            is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
        )

        # Make sure every benchmark column exists, even if the score is missing.
        for benchmark in BENCHMARKS:
            if benchmark not in self.results.keys():
                self.results[benchmark] = None

        for k, v in BENCH_TO_NAME.items():
            data_dict[v] = self.results[k]

        return data_dict


def parse_eval_result(json_filepath: str) -> Tuple[str, List[EvalResult]]:
    """Parse one result JSON file into a result key and a list of per-benchmark EvalResults."""
    with open(json_filepath) as fp:
        data = json.load(fp)

    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
            return None, []  # we skip models evaluated with the wrong MMLU version

    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]

    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)
    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)

    precision = config.get("model_dtype")
    if precision == "None":
        precision = "GPTQ"

    model = model_split[-1]

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key = f"{org}_{model}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0 or any([acc is None for acc in accs]):
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(
            EvalResult(
                eval_name=result_key,
                org=org,
                model=model,
                revision=model_sha,
                results={benchmark: mean_acc},
                precision=precision,  # todo model_type=, weight_type=
                date=config.get("submission_date"),
            )
        )

    return result_key, eval_results


def get_eval_results(results_path: str) -> List[EvalResult]:
    json_filepaths = []

    for root, _dirs, files in os.walk(results_path):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date (the timestamp is embedded in the file name)
        # store results by precision maybe?
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser.ParserError:
            files = [files[-1]]

        # up_to_date = files[-1]
        for file in files:
            json_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        # Merge per-benchmark scores that belong to the same model/precision into one entry.
        for eval_result in results:
            if result_key in eval_results.keys():
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    return list(eval_results.values())


def get_eval_results_dicts(results_path: str) -> List[Dict]:
    eval_results = get_eval_results(results_path)

    return [e.to_dict() for e in eval_results]
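

# Illustrative usage sketch, not part of the leaderboard app itself: the "eval-results"
# directory name below is an assumption; point it at whatever local folder holds the
# result JSON files produced by the evaluation harness.
if __name__ == "__main__":
    import pprint

    results_dir = "eval-results"  # hypothetical path, replace with your local results folder
    if os.path.isdir(results_dir):
        leaderboard_rows = get_eval_results_dicts(results_dir)
        print(f"Parsed {len(leaderboard_rows)} leaderboard rows")
        pprint.pprint(leaderboard_rows[:2])  # preview the first two rows
    else:
        print(f"No '{results_dir}' directory found, nothing to parse")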