import os
from typing import Dict, List

import pandas as pd

from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load evaluation results from every results*.json file found under results_path.
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue

        # collect every results*.json file in this directory
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # index results by timestamp; a later file with the same timestamp overwrites the earlier one
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # create evaluation results
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed. {model_result_filepath}")
            continue
        print(f"file loaded: {model_result_filepath}")
        timestamp = eval_result.timestamp
        eval_results[timestamp] = eval_result

    results = []
    for k, v in eval_results.items():
        try:
            v.to_dict()
            results.append(v)
        except KeyError:
            print(f"loading failed: {k}")
            continue
    return results
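
# A rough sketch of the directory layout these loaders work over (the names
# below are illustrative assumptions, not paths defined in this repository):
# `load_eval_results` appends one sub-directory per benchmark version, and
# `load_raw_eval_results` then walks that tree for files matching "results*.json".
#
#   <results_path>/
#       <benchmark_version>/
#           <model_name>/
#               results_<timestamp>.json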


def load_leaderboard_datastore(file_path: str, version: str) -> LeaderboardDataStore:
    ds = LeaderboardDataStore(version, get_safe_name(version), None, None, None, None, None, None, None, None)
    ds.raw_data = load_raw_eval_results(file_path)
    print(f"raw data: {len(ds.raw_data)}")

    ds.qa_raw_df = get_leaderboard_df(ds, TaskType.qa, DEFAULT_METRIC_QA)
    print(f"QA data loaded: {ds.qa_raw_df.shape}")
    ds.qa_fmt_df = ds.qa_raw_df.copy()
    qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
    ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
    ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
    print(f"Long-Doc data loaded: {len(ds.doc_raw_df)}")
    ds.doc_fmt_df = ds.doc_raw_df.copy()
    doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
    ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
    ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    ds.reranking_models = sorted({eval_result.reranking_model for eval_result in ds.raw_data})
    return ds


def load_eval_results(file_path: str) -> Dict[str, LeaderboardDataStore]:
    output = {}
    for version in BENCHMARK_VERSION_LIST:
        fn = f"{file_path}/{version}"
        output[version] = load_leaderboard_datastore(fn, version)
    return output
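

# A minimal usage sketch, assuming the evaluation results live in a local
# "eval-results" directory containing one sub-directory per entry of
# BENCHMARK_VERSION_LIST (the path below is an assumption, not defined here):
if __name__ == "__main__":
    datastores = load_eval_results("eval-results")
    for version, datastore in datastores.items():
        print(
            f"{version}: {len(datastore.raw_data)} raw results, "
            f"QA table {datastore.qa_fmt_df.shape}, "
            f"Long-Doc table {datastore.doc_fmt_df.shape}"
        )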