import os.path
from typing import Dict, List

import pandas as pd

from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load evaluation results from the JSON files found under `results_path`.
    """
    model_result_filepaths = []
    for root, _dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # keep only result files, i.e. results*.json
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # parse each result file; entries sharing a timestamp overwrite earlier ones
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed. {model_result_filepath}")
            continue
        print(f"file loaded: {model_result_filepath}")
        eval_results[eval_result.timestamp] = eval_result

    # keep only results that can be serialized without missing keys
    results = []
    for timestamp, eval_result in eval_results.items():
        try:
            eval_result.to_dict()
            results.append(eval_result)
        except KeyError:
            print(f"loading failed: {timestamp}")
            continue
    return results
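
# Minimal usage sketch for load_raw_eval_results. The directory layout and path
# below are illustrative assumptions, not values defined in this module:
#
#   eval-results/
#       <model_name>/<reranker_name>/results_<timestamp>.json
#
#   raw_results = load_raw_eval_results("./eval-results")
#   print(f"{len(raw_results)} result entries loaded")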


def load_leaderboard_datastore(file_path: str, version: str) -> LeaderboardDataStore:
    """
    Build the LeaderboardDataStore for one benchmark version from the raw results under `file_path`.
    """
    ds = LeaderboardDataStore(version, get_safe_name(version), None, None, None, None, None, None, None, None)
    ds.raw_data = load_raw_eval_results(file_path)
    print(f"raw data: {len(ds.raw_data)}")

    # QA task: build the raw dataframe, then a display copy without anonymous
    # submissions and without the revision/timestamp columns
    ds.qa_raw_df = get_leaderboard_df(ds, TaskType.qa, DEFAULT_METRIC_QA)
    print(f"QA data loaded: {ds.qa_raw_df.shape}")
    ds.qa_fmt_df = ds.qa_raw_df.copy()
    qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
    ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
    ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # Long-Doc task: same treatment as QA
    ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
    print(f"Long-Doc data loaded: {len(ds.doc_raw_df)}")
    ds.doc_fmt_df = ds.doc_raw_df.copy()
    doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
    ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
    ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    ds.reranking_models = sorted({eval_result.reranking_model for eval_result in ds.raw_data})
    return ds
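
# Hedged example of building a single version's datastore; the results path and
# version string are placeholders, not values defined in this repository:
#
#   ds = load_leaderboard_datastore("./eval-results/<version>", "<version>")
#   print(ds.qa_fmt_df.head())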


def load_eval_results(file_path: str) -> Dict[str, LeaderboardDataStore]:
    """
    Load a LeaderboardDataStore for every benchmark version found under `file_path`.
    """
    output = {}
    for version in BENCHMARK_VERSION_LIST:
        fn = f"{file_path}/{version}"
        output[version] = load_leaderboard_datastore(fn, version)
    return output
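

if __name__ == "__main__":
    # Ad-hoc smoke test: load every benchmark version from a local results
    # directory. The "./eval-results" path is an assumption and may differ
    # per deployment; each version is expected as a subdirectory of it.
    stores = load_eval_results("./eval-results")
    for version, store in stores.items():
        print(version, store.qa_fmt_df.shape, store.doc_fmt_df.shape)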