Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
4eb64b4
·
1 Parent(s): 592bb62

refactor: move the data model

Browse files
Files changed (3) hide show
  1. src/read_evals.py +3 -123
  2. src/utils.py +2 -1
  3. tests/src/test_read_evals.py +2 -1
src/read_evals.py CHANGED
@@ -1,18 +1,13 @@
1
- import json
2
  import os.path
3
- from collections import defaultdict
4
- from dataclasses import dataclass
5
  from typing import List
6
 
7
  import pandas as pd
8
 
9
- from src.benchmarks import get_safe_name, BenchmarksQA, BenchmarksLongDoc
10
  from src.display.utils import COLS_QA, COLS_LONG_DOC
11
- from src.display.column_names import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, \
12
- COL_NAME_RETRIEVAL_MODEL_LINK, COL_NAME_RERANKING_MODEL_LINK, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
13
- COL_NAME_IS_ANONYMOUS
14
 
15
- from src.display.formatting import make_clickable_model
16
 
17
  pd.options.mode.copy_on_write = True
18
 
@@ -24,121 +19,6 @@ def calculate_mean(row):
24
  return row.mean()
25
 
26
 
27
- @dataclass
28
- class EvalResult:
29
- """
30
- Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
31
- domains, languages, and datasets
32
- """
33
- eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
34
- retrieval_model: str
35
- reranking_model: str
36
- results: list # results on all the benchmarks stored as dict
37
- task: str
38
- metric: str
39
- timestamp: str = "" # submission timestamp
40
- revision: str = ""
41
- is_anonymous: bool = False
42
-
43
-
44
- @dataclass
45
- class FullEvalResult:
46
- """
47
- Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
48
- """
49
- eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
50
- retrieval_model: str
51
- reranking_model: str
52
- retrieval_model_link: str
53
- reranking_model_link: str
54
- results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
55
- timestamp: str = ""
56
- revision: str = ""
57
- is_anonymous: bool = False
58
-
59
- @classmethod
60
- def init_from_json_file(cls, json_filepath):
61
- """
62
- Initiate from the result json file for a single model.
63
- The json file will be written only when the status is FINISHED.
64
- """
65
- with open(json_filepath) as fp:
66
- model_data = json.load(fp)
67
-
68
- # store all the results for different metrics and tasks
69
- result_list = []
70
- retrieval_model_link = ""
71
- reranking_model_link = ""
72
- revision = ""
73
- for item in model_data:
74
- config = item.get("config", {})
75
- # eval results for different metrics
76
- results = item.get("results", [])
77
- retrieval_model_link = config["retrieval_model_link"]
78
- if config["reranking_model_link"] is None:
79
- reranking_model_link = ""
80
- else:
81
- reranking_model_link = config["reranking_model_link"]
82
- eval_result = EvalResult(
83
- eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
84
- retrieval_model=config["retrieval_model"],
85
- reranking_model=config["reranking_model"],
86
- results=results,
87
- task=config["task"],
88
- metric=config["metric"],
89
- timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
90
- revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
91
- is_anonymous=config.get("is_anonymous", False)
92
- )
93
- result_list.append(eval_result)
94
- return cls(
95
- eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
96
- retrieval_model=result_list[0].retrieval_model,
97
- reranking_model=result_list[0].reranking_model,
98
- retrieval_model_link=retrieval_model_link,
99
- reranking_model_link=reranking_model_link,
100
- results=result_list,
101
- timestamp=result_list[0].timestamp,
102
- revision=result_list[0].revision,
103
- is_anonymous=result_list[0].is_anonymous
104
- )
105
-
106
- def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
107
- """
108
- Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
109
- """
110
- results = defaultdict(dict)
111
- for eval_result in self.results:
112
- if eval_result.metric != metric:
113
- continue
114
- if eval_result.task != task:
115
- continue
116
- results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
117
- results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
118
- make_clickable_model(self.retrieval_model, self.retrieval_model_link))
119
- results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
120
- make_clickable_model(self.reranking_model, self.reranking_model_link))
121
- results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
122
- results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
123
- results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
124
- results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
125
- results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous
126
-
127
- # print(f'result loaded: {eval_result.eval_name}')
128
- for result in eval_result.results:
129
- # add result for each domain, language, and dataset
130
- domain = result["domain"]
131
- lang = result["lang"]
132
- dataset = result["dataset"]
133
- value = result["value"] * 100
134
- if dataset == 'default':
135
- benchmark_name = f"{domain}_{lang}"
136
- else:
137
- benchmark_name = f"{domain}_{lang}_{dataset}"
138
- results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
139
- return [v for v in results.values()]
140
-
141
-
142
  def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
143
  """
144
  Load the evaluation results from a json file
 
 
1
  import os.path
 
 
2
  from typing import List
3
 
4
  import pandas as pd
5
 
6
+ from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
7
  from src.display.utils import COLS_QA, COLS_LONG_DOC
8
+ from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
 
 
9
 
10
+ from src.models import FullEvalResult
11
 
12
  pd.options.mode.copy_on_write = True
13
 
 
19
  return row.mean()
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
23
  """
24
  Load the evaluation results from a json file
src/utils.py CHANGED
@@ -12,7 +12,8 @@ from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC,
12
  from src.display.column_names import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
13
  COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
14
  from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
15
- from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
 
16
 
17
  import re
18
 
 
12
  from src.display.column_names import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
13
  COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
14
  from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
15
+ from src.read_evals import get_leaderboard_df, calculate_mean
16
+ from src.models import FullEvalResult
17
 
18
  import re
19
 
tests/src/test_read_evals.py CHANGED
@@ -1,6 +1,7 @@
1
  from pathlib import Path
2
 
3
- from src.read_evals import FullEvalResult, get_raw_eval_results, get_leaderboard_df
 
4
 
5
  cur_fp = Path(__file__)
6
 
 
1
  from pathlib import Path
2
 
3
+ from src.read_evals import get_raw_eval_results, get_leaderboard_df
4
+ from src.models import FullEvalResult
5
 
6
  cur_fp = Path(__file__)
7