Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
729aa2a
·
1 Parent(s): 7a743dd

test: add unit tests for columns

Browse files
src/columns.py CHANGED
@@ -1,7 +1,7 @@
1
  from dataclasses import dataclass, make_dataclass
2
 
3
 
4
- def fields(raw_class):
5
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
6
 
7
 
@@ -19,45 +19,104 @@ class ColumnContent:
19
 
20
  def get_default_auto_eval_column_dict():
21
  auto_eval_column_dict = []
22
- # Init
23
- auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)])
 
 
 
 
 
 
 
 
 
24
  auto_eval_column_dict.append(
25
  [
26
  "retrieval_model",
27
  ColumnContent,
28
- ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, hidden=False, never_hidden=True),
 
 
 
 
 
29
  ]
30
  )
31
  auto_eval_column_dict.append(
32
  [
33
  "reranking_model",
34
  ColumnContent,
35
- ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, hidden=False, never_hidden=True),
 
 
 
 
 
36
  ]
37
  )
38
  auto_eval_column_dict.append(
39
- ["revision", ColumnContent, ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)]
 
 
 
 
 
 
 
 
 
40
  )
41
  auto_eval_column_dict.append(
42
- ["timestamp", ColumnContent, ColumnContent(COL_NAME_TIMESTAMP, "date", True, never_hidden=True)]
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  )
44
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)])
45
  auto_eval_column_dict.append(
46
  [
47
  "retrieval_model_link",
48
  ColumnContent,
49
- ColumnContent(COL_NAME_RETRIEVAL_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False),
 
 
 
 
 
50
  ]
51
  )
52
  auto_eval_column_dict.append(
53
  [
54
  "reranking_model_link",
55
  ColumnContent,
56
- ColumnContent(COL_NAME_RERANKING_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False),
 
 
 
 
 
57
  ]
58
  )
59
  auto_eval_column_dict.append(
60
- ["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
 
 
 
 
 
 
 
 
 
61
  )
62
  return auto_eval_column_dict
63
 
@@ -76,8 +135,8 @@ def make_autoevalcolumn(cls_name, benchmarks):
76
 
77
  def get_default_col_names_and_types(benchmarks):
78
  AutoEvalColumn = make_autoevalcolumn("AutoEvalColumn", benchmarks)
79
- col_names = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
80
- col_types = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
81
  return col_names, col_types
82
 
83
 
 
1
  from dataclasses import dataclass, make_dataclass
2
 
3
 
4
+ def _fields(raw_class):
5
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
6
 
7
 
 
19
 
20
  def get_default_auto_eval_column_dict():
21
  auto_eval_column_dict = []
22
+ auto_eval_column_dict.append(
23
+ [
24
+ "rank",
25
+ ColumnContent,
26
+ ColumnContent(
27
+ COL_NAME_RANK,
28
+ "number",
29
+ True
30
+ )
31
+ ]
32
+ )
33
  auto_eval_column_dict.append(
34
  [
35
  "retrieval_model",
36
  ColumnContent,
37
+ ColumnContent(
38
+ COL_NAME_RETRIEVAL_MODEL,
39
+ "markdown",
40
+ True,
41
+ never_hidden=True
42
+ )
43
  ]
44
  )
45
  auto_eval_column_dict.append(
46
  [
47
  "reranking_model",
48
  ColumnContent,
49
+ ColumnContent(
50
+ COL_NAME_RERANKING_MODEL,
51
+ "markdown",
52
+ True,
53
+ never_hidden=True
54
+ )
55
  ]
56
  )
57
  auto_eval_column_dict.append(
58
+ [
59
+ "revision",
60
+ ColumnContent,
61
+ ColumnContent(
62
+ COL_NAME_REVISION,
63
+ "markdown",
64
+ True,
65
+ never_hidden=True
66
+ )
67
+ ]
68
  )
69
  auto_eval_column_dict.append(
70
+ [
71
+ "timestamp",
72
+ ColumnContent,
73
+ ColumnContent(
74
+ COL_NAME_TIMESTAMP, "date", True, never_hidden=True
75
+ )
76
+ ]
77
+ )
78
+ auto_eval_column_dict.append(
79
+ [
80
+ "average",
81
+ ColumnContent,
82
+ ColumnContent(COL_NAME_AVG, "number", True)
83
+ ]
84
  )
 
85
  auto_eval_column_dict.append(
86
  [
87
  "retrieval_model_link",
88
  ColumnContent,
89
+ ColumnContent(
90
+ COL_NAME_RETRIEVAL_MODEL_LINK,
91
+ "markdown",
92
+ False,
93
+ hidden=True,
94
+ )
95
  ]
96
  )
97
  auto_eval_column_dict.append(
98
  [
99
  "reranking_model_link",
100
  ColumnContent,
101
+ ColumnContent(
102
+ COL_NAME_RERANKING_MODEL_LINK,
103
+ "markdown",
104
+ False,
105
+ hidden=True,
106
+ )
107
  ]
108
  )
109
  auto_eval_column_dict.append(
110
+ [
111
+ "is_anonymous",
112
+ ColumnContent,
113
+ ColumnContent(
114
+ COL_NAME_IS_ANONYMOUS,
115
+ "bool",
116
+ False,
117
+ hidden=True
118
+ )
119
+ ]
120
  )
121
  return auto_eval_column_dict
122
 
 
135
 
136
  def get_default_col_names_and_types(benchmarks):
137
  AutoEvalColumn = make_autoevalcolumn("AutoEvalColumn", benchmarks)
138
+ col_names = [c.name for c in _fields(AutoEvalColumn) if not c.hidden]
139
+ col_types = [c.type for c in _fields(AutoEvalColumn) if not c.hidden]
140
  return col_names, col_types
141
 
142
 
tests/src/test_columns.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from src.benchmarks import QABenchmarks, LongDocBenchmarks
4
+ from src.columns import get_default_auto_eval_column_dict, \
5
+ get_fixed_col_names_and_types, get_default_col_names_and_types, make_autoevalcolumn, COL_NAME_RANK, \
6
+ COL_NAME_RETRIEVAL_MODEL, \
7
+ COL_NAME_RERANKING_MODEL, COL_NAME_REVISION, \
8
+ COL_NAME_TIMESTAMP, COL_NAME_AVG
9
+
10
+
11
+ # Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
12
+ # 24.05
13
+ # | Task | dev | test |
14
+ # | ---- | --- | ---- |
15
+ # | Long-Doc | 4 | 11 |
16
+ # | QA | 54 | 53 |
17
+ #
18
+ # 24.04
19
+ # | Task | test |
20
+ # | ---- | ---- |
21
+ # | Long-Doc | 15 |
22
+ # | QA | 13 |
23
+
24
+ @pytest.fixture()
25
+ def expected_col_names():
26
+ return [
27
+ "rank",
28
+ "retrieval_model",
29
+ "reranking_model",
30
+ "revision",
31
+ "timestamp",
32
+ "average",
33
+ "retrieval_model_link",
34
+ "reranking_model_link",
35
+ "is_anonymous",
36
+ ]
37
+
38
+
39
+ @pytest.fixture()
40
+ def expected_hidden_col_names():
41
+ return [
42
+ "retrieval_model_link",
43
+ "reranking_model_link",
44
+ "is_anonymous",
45
+ ]
46
+
47
+
48
+ def test_get_default_auto_eval_column_dict(
49
+ expected_col_names, expected_hidden_col_names):
50
+ col_list = get_default_auto_eval_column_dict()
51
+ assert len(col_list) == 9
52
+ hidden_cols = []
53
+ for col_tuple, expected_col in zip(col_list, expected_col_names):
54
+ col, _, col_content = col_tuple
55
+ assert col == expected_col
56
+ if col_content.hidden:
57
+ hidden_cols.append(col)
58
+ assert hidden_cols == expected_hidden_col_names
59
+
60
+
61
+ def test_get_fixed_col_names_and_types():
62
+ col_names, col_types = get_fixed_col_names_and_types()
63
+ assert len(col_names) == 6
64
+ assert len(col_types) == 6
65
+ expected_col_and_type = [
66
+ (COL_NAME_RANK, "number"),
67
+ (COL_NAME_RETRIEVAL_MODEL, "markdown"),
68
+ (COL_NAME_RERANKING_MODEL, "markdown"),
69
+ (COL_NAME_REVISION, "markdown"),
70
+ (COL_NAME_TIMESTAMP, "date"),
71
+ (COL_NAME_AVG, "number"),
72
+ ]
73
+ for col_name, col_type, (c_name, c_type) in zip(col_names, col_types, expected_col_and_type):
74
+ assert col_name == c_name
75
+ assert col_type == c_type
76
+
77
+
78
+ @pytest.mark.parametrize(
79
+ 'benchmarks, expected_benchmark_len',
80
+ [
81
+ (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
82
+ (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
83
+ ]
84
+ )
85
+ def test_make_autoevalcolumn(
86
+ benchmarks, expected_benchmark_len, expected_col_names):
87
+ expected_default_attrs = frozenset(expected_col_names)
88
+ for benchmark in benchmarks:
89
+ TestEvalColumn = make_autoevalcolumn("TestEvalColumn", benchmark)
90
+ attrs = []
91
+ for k, v in TestEvalColumn.__dict__.items():
92
+ if not k.startswith("__"):
93
+ attrs.append(k)
94
+ attrs = frozenset(attrs)
95
+ assert expected_default_attrs.issubset(attrs)
96
+ benchmark_attrs = attrs.difference(expected_default_attrs)
97
+ assert len(benchmark_attrs) == expected_benchmark_len[benchmark.name]
98
+
99
+
100
+ @pytest.mark.parametrize(
101
+ 'benchmarks, expected_benchmark_len',
102
+ [
103
+ (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
104
+ (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11})
105
+ ]
106
+ )
107
+ def test_get_default_col_names_and_types(
108
+ benchmarks,
109
+ expected_benchmark_len,
110
+ expected_col_names,
111
+ expected_hidden_col_names):
112
+ default_col_len = len(expected_col_names)
113
+ hidden_col_len = len(expected_hidden_col_names)
114
+ for benchmark in benchmarks:
115
+ col_names, col_types = get_default_col_names_and_types(benchmark)
116
+ assert len(col_names) == expected_benchmark_len[benchmark.name] + default_col_len - hidden_col_len
tests/src/{display/test_utils.py → test_utils.py} RENAMED
File without changes