Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
3014147
·
1 Parent(s): b671337

test: add unit tests for utils

Browse files
Files changed (3) hide show
  1. src/loaders.py +3 -1
  2. src/utils.py +28 -21
  3. tests/src/test_utils.py +57 -19
src/loaders.py CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
6
  from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
7
  from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
8
  from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
9
- from src.utils import get_default_cols, get_leaderboard_df
10
 
11
  pd.options.mode.copy_on_write = True
12
 
@@ -60,6 +60,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
60
  ds.qa_fmt_df = ds.qa_raw_df.copy()
61
  qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
62
  ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
 
63
  ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
64
 
65
  ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
@@ -67,6 +68,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
67
  ds.doc_fmt_df = ds.doc_raw_df.copy()
68
  doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
69
  ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
 
70
  ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
71
 
72
  ds.reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in ds.raw_data])))
 
6
  from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
7
  from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
8
  from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
9
+ from src.utils import get_default_cols, get_leaderboard_df, reset_rank
10
 
11
  pd.options.mode.copy_on_write = True
12
 
 
60
  ds.qa_fmt_df = ds.qa_raw_df.copy()
61
  qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
62
  ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
63
+ ds.qa_fmt_df = reset_rank(ds.qa_fmt_df)
64
  ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
65
 
66
  ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
 
68
  ds.doc_fmt_df = ds.doc_raw_df.copy()
69
  doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
70
  ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
71
+ ds.doc_fmt_df = reset_rank(ds.doc_fmt_df)
72
  ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
73
 
74
  ds.reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in ds.raw_data])))
src/utils.py CHANGED
@@ -334,42 +334,49 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
334
  """
335
  Creates a dataframe from all the individual experiment results
336
  """
337
- raw_data = datastore.raw_data
338
- cols = [
339
- COL_NAME_IS_ANONYMOUS,
340
- ]
 
 
 
341
  if task == TaskType.qa:
342
  benchmarks = QABenchmarks[datastore.slug]
343
  elif task == TaskType.long_doc:
344
  benchmarks = LongDocBenchmarks[datastore.slug]
345
  else:
346
  raise NotImplementedError
347
- cols_qa, _ = get_default_col_names_and_types(benchmarks)
348
- cols += cols_qa
349
- benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
350
- all_data_json = []
351
- for v in raw_data:
352
- all_data_json += v.to_dict(task=task.value, metric=metric)
353
- df = pd.DataFrame.from_records(all_data_json)
354
-
355
- _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
356
 
357
- # calculate the average score for selected benchmarks
358
- df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
 
 
 
 
359
  df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
360
  df.reset_index(inplace=True, drop=True)
361
 
362
- _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
363
- df = df[_cols].round(decimals=2)
 
 
 
 
 
364
 
365
- # filter out if any of the benchmarks have not been produced
366
- df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
367
 
368
  # shorten the revision
369
  df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
370
 
371
- # # replace "0" with "-" for average score
372
- # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
373
  return df
374
 
375
 
 
334
  """
335
  Creates a dataframe from all the individual experiment results
336
  """
337
+ # load the selected metrics into a DataFrame from the raw json
338
+ all_data_json = []
339
+ for v in datastore.raw_data:
340
+ all_data_json += v.to_dict(task=task.value, metric=metric)
341
+ df = pd.DataFrame.from_records(all_data_json)
342
+
343
+ # calculate the average scores for selected task
344
  if task == TaskType.qa:
345
  benchmarks = QABenchmarks[datastore.slug]
346
  elif task == TaskType.long_doc:
347
  benchmarks = LongDocBenchmarks[datastore.slug]
348
  else:
349
  raise NotImplementedError
350
+ valid_cols = frozenset(df.columns.to_list())
351
+ benchmark_cols = []
352
+ for t in list(benchmarks.value):
353
+ if t.value.col_name not in valid_cols:
354
+ continue
355
+ benchmark_cols.append(t.value.col_name)
 
 
 
356
 
357
+ ## filter out the columns that are not in the data
358
+ df[COL_NAME_AVG] = (
359
+ df[list(benchmark_cols)]
360
+ .apply(calculate_mean, axis=1)
361
+ .round(decimals=2)
362
+ )
363
  df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
364
  df.reset_index(inplace=True, drop=True)
365
 
366
+ # filter out columns that are not in the data
367
+ display_cols = [COL_NAME_IS_ANONYMOUS, COL_NAME_AVG]
368
+ default_cols, _ = get_default_col_names_and_types(benchmarks)
369
+ for col in default_cols:
370
+ if col in valid_cols:
371
+ display_cols.append(col)
372
+ df = df[display_cols].round(decimals=2)
373
 
374
+ # rank the scores
375
+ df = reset_rank(df)
376
 
377
  # shorten the revision
378
  df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
379
 
 
 
380
  return df
381
 
382
 
tests/src/test_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import pytest
2
  import pandas as pd
3
 
4
- from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols
5
  from src.models import model_hyperlink, TaskType
6
  from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
7
 
@@ -29,13 +29,14 @@ def toy_df():
29
  "NoReranker"
30
  ],
31
  "Rank 🏆": [1, 2, 3, 4],
32
- "Revision": ["", "", "", ""],
33
  "Submission Date": ["", "", "", ""],
34
  "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
35
  "wiki_en": [0.8, 0.7, 0.2, 0.1],
36
  "wiki_zh": [0.4, 0.1, 0.4, 0.3],
37
  "news_en": [0.8, 0.7, 0.2, 0.1],
38
- "news_zh": [0.4, 0.1, 0.4, 0.3],
 
39
  }
40
  )
41
 
@@ -94,18 +95,22 @@ def test_filter_queries(query, expected):
94
 
95
 
96
  @pytest.mark.parametrize(
97
- "task_type, slug, expected",
98
  [
99
- (TaskType.qa, "air_bench_2404", NUM_QA_BENCHMARKS_24_04),
100
- (TaskType.long_doc, "air_bench_2404", NUM_DOC_BENCHMARKS_24_04),
101
- (TaskType.qa, "air_bench_2405", NUM_QA_BENCHMARKS_24_05),
102
- (TaskType.long_doc, "air_bench_2405", NUM_DOC_BENCHMARKS_24_05),
103
  ]
104
  )
105
- def test_get_default_cols(task_type, slug, expected):
106
  attr_cols = ['Rank 🏆', 'Retrieval Method', 'Reranking Model', 'Revision', 'Submission Date', 'Average ⬆️']
107
  cols, types = get_default_cols(task_type, slug)
108
- benchmark_cols = list(frozenset(cols).difference(frozenset(attr_cols)))
 
 
 
 
109
  assert len(benchmark_cols) == expected
110
 
111
 
@@ -133,8 +138,8 @@ def test_get_selected_cols(task_type, domains, languages, expected):
133
  cols = get_selected_cols(task_type, slug, domains, languages)
134
  assert sorted(cols) == sorted(expected)
135
 
136
-
137
- def test_select_columns(toy_df):
138
  expected = [
139
  'Rank 🏆',
140
  'Retrieval Method',
@@ -145,13 +150,46 @@ def test_select_columns(toy_df):
145
  'news_zh']
146
  df_result = select_columns(
147
  toy_df,
148
- [
149
- "news",
150
- ],
151
- [
152
- "zh",
153
- ],
154
  version_slug="air_bench_2404",
 
155
  )
156
  assert len(df_result.columns) == len(expected)
157
- assert df_result["Average ⬆️"].equals(df_result["news_zh"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pytest
2
  import pandas as pd
3
 
4
+ from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
5
  from src.models import model_hyperlink, TaskType
6
  from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
7
 
 
29
  "NoReranker"
30
  ],
31
  "Rank 🏆": [1, 2, 3, 4],
32
+ "Revision": ["123", "234", "345", "456"],
33
  "Submission Date": ["", "", "", ""],
34
  "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
35
  "wiki_en": [0.8, 0.7, 0.2, 0.1],
36
  "wiki_zh": [0.4, 0.1, 0.4, 0.3],
37
  "news_en": [0.8, 0.7, 0.2, 0.1],
38
+ "news_zh": [0.4, 0.1, 0.2, 0.3],
39
+ "Anonymous Submission": [False, False, False, True],
40
  }
41
  )
42
 
 
95
 
96
 
97
  @pytest.mark.parametrize(
98
+ "task_type, slug, add_fix_cols, expected",
99
  [
100
+ (TaskType.qa, "air_bench_2404", True, NUM_QA_BENCHMARKS_24_04),
101
+ (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
102
+ (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
103
+ (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
104
  ]
105
  )
106
+ def test_get_default_cols(task_type, slug, add_fix_cols, expected):
107
  attr_cols = ['Rank 🏆', 'Retrieval Method', 'Reranking Model', 'Revision', 'Submission Date', 'Average ⬆️']
108
  cols, types = get_default_cols(task_type, slug)
109
+ cols_set = frozenset(cols)
110
+ attrs_set = frozenset(attr_cols)
111
+ if add_fix_cols:
112
+ assert attrs_set.issubset(cols_set)
113
+ benchmark_cols = list(cols_set.difference(attrs_set))
114
  assert len(benchmark_cols) == expected
115
 
116
 
 
138
  cols = get_selected_cols(task_type, slug, domains, languages)
139
  assert sorted(cols) == sorted(expected)
140
 
141
+ @pytest.mark.parametrize("reset_rank", [False])
142
+ def test_select_columns(toy_df, reset_rank):
143
  expected = [
144
  'Rank 🏆',
145
  'Retrieval Method',
 
150
  'news_zh']
151
  df_result = select_columns(
152
  toy_df,
153
+ ["news"],
154
+ ["zh"],
 
 
 
 
155
  version_slug="air_bench_2404",
156
+ reset_ranking=reset_rank
157
  )
158
  assert len(df_result.columns) == len(expected)
159
+ if reset_rank:
160
+ assert df_result["Average ⬆️"].equals(df_result["news_zh"])
161
+ else:
162
+ assert df_result["Average ⬆️"].equals(toy_df["Average ⬆️"])
163
+
164
+
165
+ @pytest.mark.parametrize(
166
+ "reset_rank, show_anony",
167
+ [
168
+ (False, True),
169
+ (True, True),
170
+ (True, False),
171
+ ]
172
+ )
173
+ def test__update_df_elem(toy_df, reset_rank, show_anony):
174
+ df = _update_df_elem(
175
+ TaskType.qa,
176
+ "AIR-Bench_24.04",
177
+ toy_df,
178
+ ["news"],
179
+ ["zh"],
180
+ [],
181
+ "",
182
+ show_anony,
183
+ reset_rank
184
+ )
185
+ if show_anony:
186
+ assert df.shape[0] == 4
187
+ else:
188
+ assert df.shape[0] == 3
189
+ if show_anony:
190
+ if reset_rank:
191
+ assert df["Average ⬆️"].equals(df["news_zh"])
192
+ else:
193
+ assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
194
+
195
+