Lillianwei committed on
Commit
94bd921
1 Parent(s): 730f0f9

feat: adapt to MMIE

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50):
  1. .gitattributes +0 -1
  2. README.md +2 -10
  3. app.py +112 -145
  4. evals/.gitattributes +0 -55
  5. evals/README.md +0 -6
  6. evals/mjbench-results/detailed-results/AestheticsPredictor.json +0 -47
  7. evals/mjbench-results/detailed-results/BLIP-v2.json +0 -47
  8. evals/mjbench-results/detailed-results/CLIP-v2.json +0 -47
  9. evals/mjbench-results/detailed-results/Claude 3 Opus.json +0 -47
  10. evals/mjbench-results/detailed-results/GPT-4-vision.json +0 -47
  11. evals/mjbench-results/detailed-results/GPT-4o.json +0 -47
  12. evals/mjbench-results/detailed-results/Gemini Ultra.json +0 -47
  13. evals/mjbench-results/detailed-results/HPS-v2.1.json +0 -47
  14. evals/mjbench-results/detailed-results/Idefics2-8b.json +0 -47
  15. evals/mjbench-results/detailed-results/ImageReward.json +0 -47
  16. evals/mjbench-results/detailed-results/Instructblip-7b.json +0 -47
  17. evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json +0 -47
  18. evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json +0 -47
  19. evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json +0 -47
  20. evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -47
  21. evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -35
  22. evals/mjbench-results/detailed-results/MiniGPT4-v2.json +0 -47
  23. evals/mjbench-results/detailed-results/PickScore-v1.json +0 -47
  24. evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json +0 -47
  25. evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json +0 -47
  26. evals/mjbench-results/detailed-results/Qwen-VL-Chat.json +0 -47
  27. evals/mjbench-results/overall-results/AestheticsPredictor.json +0 -12
  28. evals/mjbench-results/overall-results/BLIP-v2.json +0 -12
  29. evals/mjbench-results/overall-results/CLIP-v2.json +0 -12
  30. evals/mjbench-results/overall-results/Claude 3 Opus.json +0 -12
  31. evals/mjbench-results/overall-results/GPT-4-vision.json +0 -12
  32. evals/mjbench-results/overall-results/GPT-4o.json +0 -12
  33. evals/mjbench-results/overall-results/Gemini Ultra.json +0 -12
  34. evals/mjbench-results/overall-results/HPS-v2.1.json +0 -12
  35. evals/mjbench-results/overall-results/Idefics2-8b.json +0 -12
  36. evals/mjbench-results/overall-results/ImageReward.json +0 -12
  37. evals/mjbench-results/overall-results/Instructblip-7b.json +0 -12
  38. evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json +0 -12
  39. evals/mjbench-results/overall-results/LLaVA-1.5-13b.json +0 -12
  40. evals/mjbench-results/overall-results/LLaVA-1.5-7b.json +0 -12
  41. evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json +0 -12
  42. evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -12
  43. evals/mjbench-results/overall-results/MiniGPT4-v2.json +0 -12
  44. evals/mjbench-results/overall-results/PickScore-v1.json +0 -12
  45. evals/mjbench-results/overall-results/Prometheus-Vision-13b.json +0 -12
  46. evals/mjbench-results/overall-results/Prometheus-Vision-7b.json +0 -12
  47. evals/mjbench-results/overall-results/Qwen-VL-Chat.json +0 -12
  48. src/about.py +4 -5
  49. src/envs.py +3 -3
  50. src/logo.png +0 -0
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
-mj-bench-logo.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: MJ Bench Leaderboard
+title: MMIE Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -45,13 +45,5 @@ You'll find
 
 ## Citation
 ```
-@misc{chen2024mjbenchmultimodalrewardmodel,
-      title={MJ-Bench: Is Your Multimodal Reward Model Really a Good Judge for Text-to-Image Generation?},
-      author={Zhaorun Chen and Yichao Du and Zichen Wen and Yiyang Zhou and Chenhang Cui and Zhenzhen Weng and Haoqin Tu and Chaoqi Wang and Zhengwei Tong and Qinglan Huang and Canyu Chen and Qinghao Ye and Zhihong Zhu and Yuqing Zhang and Jiawei Zhou and Zhuokai Zhao and Rafael Rafailov and Chelsea Finn and Huaxiu Yao},
-      year={2024},
-      eprint={2407.04842},
-      archivePrefix={arXiv},
-      primaryClass={cs.CV},
-      url={https://arxiv.org/abs/2407.04842},
-}
+
 ```

app.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
 from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from datasets import load_dataset
 
 
 from src.about import (
@@ -20,19 +19,19 @@ from src.about import (
     ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
+# from src.display.utils import (
+#     BENCHMARK_COLS,
+#     COLS,
+#     EVAL_COLS,
+#     EVAL_TYPES,
+#     NUMERIC_INTERVALS,
+#     TYPES,
+#     AutoEvalColumn,
+#     ModelType,
+#     fields,
+#     WeightType,
+#     Precision
+# )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
 try:
@@ -76,7 +75,7 @@ PERSPECTIVE_COUNTS= {
 
 
 
-META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
+META_DATA = ['Model']
 
 
 
@@ -84,36 +83,36 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-color_map = {
-    "Score Model": "#7497db",
-    "Opensource VLM": "#E8ECF2",
-    "Closesource VLM": "#ffcd75",
-    "Others": "#75809c",
-
-    # #7497db #E8ECF2 #ffcd75 #75809c
-}
-def color_model_type_column(df, color_map):
-    """
-    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
-
-    Parameters:
-    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
-    color_map (dict): A dictionary mapping model types to colors.
-
-    Returns:
-    pd.Styler: The styled DataFrame.
-    """
-    # Function to apply color based on the model type
-    def apply_color(val):
-        color = color_map.get(val, "default") # Default color if not specified in color_map
-        return f'background-color: {color}'
+# color_map = {
+#     "Score Model": "#7497db",
+#     "Opensource VLM": "#E8ECF2",
+#     "Closesource VLM": "#ffcd75",
+#     "Others": "#75809c",
+
+#     # #7497db #E8ECF2 #ffcd75 #75809c
+# }
+# def color_model_type_column(df, color_map):
+#     """
+#     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+#     Parameters:
+#     df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+#     color_map (dict): A dictionary mapping model types to colors.
+
+#     Returns:
+#     pd.Styler: The styled DataFrame.
+#     """
+#     # Function to apply color based on the model type
+#     def apply_color(val):
+#         color = color_map.get(val, "default") # Default color if not specified in color_map
+#         return f'background-color: {color}'
 
-    # Format for different columns
-    format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
-    format_dict['Overall Score'] = "{:.2f}"
-    format_dict[''] = "{:d}"
+#     # Format for different columns
+#     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+#     format_dict['Overall Score'] = "{:.2f}"
+#     format_dict[''] = "{:d}"
 
-    return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+#     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
 
 def regex_table(dataframe, regex, filter_button, style=True):
     """
@@ -127,14 +126,10 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # if filter_button, remove all rows with "ai2" in the model name
     update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Score Model" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Score Model", case=False, na=False)]
-        if "Opensource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Opensource VLM", case=False, na=False)]
-        if "Closesource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Closesource VLM", case=False, na=False)]
-        if "Others" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Others", case=False, na=False)]
+        if "Integrated LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
+        if "Interleaved LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
 
@@ -143,9 +138,9 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # replace column '' with count/rank
     data.insert(0, '', range(1, 1 + len(data)))
 
-    if style:
-        # apply color
-        data = color_model_type_column(data, color_map)
+    # if style:
+    #     # apply color
+    #     data = color_model_type_column(data, color_map)
 
     return data
 
@@ -164,27 +159,6 @@ def get_leaderboard_results(results_path):
     df.reset_index(drop=True, inplace=True)
     return df
 
-def avg_all_subset(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
-    new_df = orig_df.copy()[meta_data + columns_name]
-
-    # Filter the dictionary to include only the counts relevant to the specified columns
-    new_subset_counts = {col: subset_counts[col] for col in columns_name}
-
-    # Calculate the weights for each subset
-    total_count = sum(new_subset_counts.values())
-    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
-
-    # Calculate the weight_avg value for each row
-    def calculate_weighted_avg(row):
-        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
-        return weighted_sum
-
-    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
-
-    cols = meta_data + ["Overall Score"] + columns_name
-    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
-    return new_df
-
 
 def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
     new_df = orig_df[meta_data + columns_name]
@@ -200,28 +174,63 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
     return new_df
 
-
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(SUBSET_COUNTS.keys())
-detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
-
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(PERSPECTIVE_COUNTS.keys())
-perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
-
-total_models = len(detailed_df)
+data = {
+    "Model": [
+        "MiniGPT-5", "EMU-2", "GILL", "Anole",
+        "GPT-4o - Openjourney", "GPT-4o - SD-3", "GPT-4o - SD-XL", "GPT-4o - Flux",
+        "Gemini-1.5 - Openjourney", "Gemini-1.5 - SD-3", "Gemini-1.5 - SD-XL", "Gemini-1.5 - Flux",
+        "LLAVA-34b - Openjourney", "LLAVA-34b - SD-3", "LLAVA-34b - SD-XL", "LLAVA-34b - Flux",
+        "Qwen-VL-70b - Openjourney", "Qwen-VL-70b - SD-3", "Qwen-VL-70b - SD-XL", "Qwen-VL-70b - Flux"
+    ],
+    "Model Type": [
+        "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+    ],
+    "Situational analysis": [
+        47.63, 39.65, 46.72, 48.95,
+        53.05, 53.00, 56.12, 54.97,
+        48.08, 47.48, 49.43, 47.07,
+        54.12, 54.72, 55.97, 54.23,
+        52.73, 54.98, 52.58, 54.23
+    ],
+    "Project-based learning": [
+        55.12, 46.12, 57.57, 59.05,
+        71.40, 71.20, 73.25, 68.80,
+        67.93, 68.70, 71.85, 68.33,
+        73.47, 72.55, 74.60, 71.32,
+        71.63, 71.87, 73.57, 69.47
+    ],
+    "Multi-step reasoning": [
+        42.17, 50.75, 39.33, 51.72,
+        53.67, 53.67, 53.67, 53.67,
+        60.05, 60.05, 60.05, 60.05,
+        47.28, 47.28, 47.28, 47.28,
+        55.63, 55.63, 55.63, 55.63
+    ],
+    "AVG": [
+        50.92, 45.33, 51.58, 55.22,
+        63.65, 63.52, 65.47, 62.63,
+        61.57, 61.87, 64.15, 61.55,
+        63.93, 63.57, 65.05, 62.73,
+        64.05, 64.75, 65.12, 63.18
+    ]
+}
+df = pd.DataFrame(data)
+total_models = len(df)
+
 with gr.Blocks(css=custom_css) as app:
     with gr.Row():
         with gr.Column(scale=6):
             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
         with gr.Column(scale=4):
-            gr.Markdown("![](https://huggingface.co/spaces/MJ-Bench/MJ-Bench-Leaderboard/resolve/main/src/mj-bench-logo.jpg)")
+            gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/resolve/main/src/logo.png)")
         # gr.HTML(BGB_LOGO, elem_classes="logo")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏆 MJ-Bench Leaderboard"):
+        with gr.TabItem("🏆 MMIE Leaderboard"):
             with gr.Row():
                 search_overall = gr.Textbox(
                     label="Model Search (delimit with , )",
@@ -229,88 +238,46 @@ with gr.Blocks(css=custom_css) as app:
                     show_label=False
                 )
                 model_type_overall = gr.CheckboxGroup(
-                    choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    label="Model Types",
+                    choices=["Interleaved LVLM", "Integrated LVLM"],
+                    value=["Interleaved LVLM", "Integrated LVLM"],
+                    label="Model Type",
                     show_label=False,
                     interactive=True,
                 )
             with gr.Row():
-                mjbench_table_overall_hidden = gr.Dataframe(
-                    perspective_df,
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall_hidden",
+                mmie_table_overall_hidden = gr.Dataframe(
+                    df,
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall_hidden",
                     wrap=True,
                     visible=False,
                 )
-                mjbench_table_overall = gr.Dataframe(
+                mmie_table_overall = gr.Dataframe(
                     regex_table(
-                        perspective_df.copy(),
+                        df.copy(),
                         "",
-                        ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
+                        ["Interleaved LVLM", "Integrated LVLM"]
                     ),
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall",
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall",
                     wrap=True,
                     height=1000,
                 )
-        # with gr.TabItem("🔍 MJ-Bench Detailed Results"):
-        #     with gr.Row():
-        #         search_detail = gr.Textbox(
-        #             label="Model Search (delimit with , )",
-        #             placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
-        #             show_label=False
-        #         )
-        #         model_type_detail = gr.CheckboxGroup(
-        #             choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             label="Model Types",
-        #             show_label=False,
-        #             interactive=True,
-        #         )
-        #     with gr.Row():
-        #         mjbench_table_detail_hidden = gr.Dataframe(
-        #             detailed_df,
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed_hidden",
-        #             # column_widths = ["500px", "500px"],
-        #             wrap=True,
-        #             visible=False,
-        #         )
-        #         mjbench_table_detail = gr.Dataframe(
-        #             regex_table(
-        #                 detailed_df.copy(),
-        #                 "",
-        #                 ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
-        #             ),
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed",
-        #             column_widths = ["40px", "200px", "180px", "130px", "150px"] + ["130px"]*50,
-        #             wrap=True,
-        #             height=1000,
-        #         )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
-            value=r"""@misc{mjbench2024mjbench,
-                title={MJ-BENCH: Is Your Multimodal Reward Model Really a Good Judge?},
-                author={Chen*, Zhaorun and Du*, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and HUANG, Leria and Chen, Canyu and Ye Qinghao and Zhu, Zhihong and Zhang, Yuqing and Zhou, Jiawei and Zhao, Zhuokai and Rafailov, Rafael and Finn, Chelsea and Yao, Huaxiu},
-                year={2024}
-            }""",
+            value=r"""""",
             lines=7,
             label="Copy the following to cite these results.",
            elem_id="citation-button",
             show_copy_button=True,
         )
 
-    search_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-    model_type_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-
-    # search_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
-    # model_type_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
+    search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+    model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=18000) # restarted every 3h

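A note on the scoring logic above: the removed `avg_all_subset` and the retained `avg_all_perspective` share the same count-weighted averaging scheme. The sketch below restates that scheme in isolation, assuming placeholder `PERSPECTIVE_COUNTS` values (the real dictionary is defined earlier in `app.py` and is not part of this diff); `overall_score` is a hypothetical helper name, not code from the Space.

```python
import pandas as pd

# Illustrative counts only -- the real PERSPECTIVE_COUNTS is defined earlier
# in app.py and is not visible in this diff.
PERSPECTIVE_COUNTS = {
    "Situational analysis": 100,
    "Project-based learning": 100,
    "Multi-step reasoning": 100,
}
META_DATA = ["Model"]  # as set by this commit


def overall_score(df: pd.DataFrame, columns: list, counts: dict) -> pd.DataFrame:
    # Weight each subset score by that subset's share of the total example
    # count, mirroring avg_all_subset / avg_all_perspective in the diff above.
    total = sum(counts[col] for col in columns)
    weights = {col: counts[col] / total for col in columns}
    out = df[META_DATA + columns].copy()
    out["Overall Score"] = sum(out[col] * weights[col] for col in columns)
    return (
        out[META_DATA + ["Overall Score"] + columns]
        .sort_values(by="Overall Score", ascending=False)
        .reset_index(drop=True)
    )
```

In the code as committed, this path appears unused for the MMIE table: `df` is hardcoded and its `AVG` column ships precomputed, so `avg_all_perspective` survives only as dead code.
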
evals/.gitattributes DELETED
@@ -1,55 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.lz4 filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-# Audio files - uncompressed
-*.pcm filter=lfs diff=lfs merge=lfs -text
-*.sam filter=lfs diff=lfs merge=lfs -text
-*.raw filter=lfs diff=lfs merge=lfs -text
-# Audio files - compressed
-*.aac filter=lfs diff=lfs merge=lfs -text
-*.flac filter=lfs diff=lfs merge=lfs -text
-*.mp3 filter=lfs diff=lfs merge=lfs -text
-*.ogg filter=lfs diff=lfs merge=lfs -text
-*.wav filter=lfs diff=lfs merge=lfs -text
-# Image files - uncompressed
-*.bmp filter=lfs diff=lfs merge=lfs -text
-*.gif filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.tiff filter=lfs diff=lfs merge=lfs -text
-# Image files - compressed
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.jpeg filter=lfs diff=lfs merge=lfs -text
-*.webp filter=lfs diff=lfs merge=lfs -text

evals/README.md DELETED
@@ -1,6 +0,0 @@
----
-# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
-# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
-{}
----
-# Coming Soon

evals/mjbench-results/detailed-results/AestheticsPredictor.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "AestheticsPredictor",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment-Object": 35.9,
-        "Alignment-Attribute": 38.4,
-        "Alignment-Action": 43.6,
-        "Alignment-Location": 31.6,
-        "Alignment-Count": 35.7,
-        "Alignment-Avg": 34.8,
-        "Safety-Toxicity-Crime": 51.7,
-        "Safety-Toxicity-Shocking": 58.6,
-        "Safety-Toxicity-Disgust": 64.3,
-        "Safety-Toxicity-Avg": 57.3,
-        "Safety-Nsfw-Evident": 14.6,
-        "Safety-Nsfw-Evasive": 55.2,
-        "Safety-Nsfw-Subtle": 14.2,
-        "Safety-Nsfw-Avg": 37.5,
-        "Quality-Distortion-Human_face": 78.7,
-        "Quality-Distortion-Human_limb": 57.1,
-        "Quality-Distortion-Object": 51.3,
-        "Quality-Distortion-Avg": 52.1,
-        "Quality-Blurry-Defocused": 90.1,
-        "Quality-Blurry-Motion": 93.4,
-        "Quality-Blurry-Avg": 91.6,
-        "Bias-Age": 59.4,
-        "Bias-Gender": 62.0,
-        "Bias-Race": 64.2,
-        "Bias-Nationality": 62.4,
-        "Bias-Religion": 61.0,
-        "Bias-Avg": 62.0,
-        "Bias-Age-NDS": 85.3,
-        "Bias-Gender-NDS": 85.9,
-        "Bias-Race-NDS": 86.3,
-        "Bias-Nationality-NDS": 85.8,
-        "Bias-Religion-NDS": 86.2,
-        "Bias-Avg-NDS": 85.9,
-        "Bias-Age-GES": 91.9,
-        "Bias-Gender-GES": 92.1,
-        "Bias-Race-GES": 92.4,
-        "Bias-Nationality-GES": 92.1,
-        "Bias-Religion-GES": 92.3,
-        "Bias-Avg-GES": 92.1
-    }
-]

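Each deleted results file in this directory (this one and the per-model files below) is a JSON array holding a single record keyed by metric name. The diff never shows the body of `get_leaderboard_results`, so the following is only a sketch of how such a folder can be flattened into the DataFrame the pre-commit code expected; the helper name `load_results` and the `*.json` glob are assumptions, not the Space's actual implementation.

```python
import json
from pathlib import Path

import pandas as pd


def load_results(results_path: Path) -> pd.DataFrame:
    # Each file contributes one model record; extend() unwraps the
    # single-element JSON arrays shown in this diff.
    records = []
    for json_file in sorted(results_path.glob("*.json")):
        with open(json_file, encoding="utf-8") as fh:
            records.extend(json.load(fh))
    df = pd.DataFrame(records)
    df.reset_index(drop=True, inplace=True)  # matches the visible tail of get_leaderboard_results
    return df


# e.g. load_results(Path("evals/mjbench-results/detailed-results"))
```
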
evals/mjbench-results/detailed-results/BLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "BLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment-Object": 23.5,
-        "Alignment-Attribute": 22.7,
-        "Alignment-Action": 24.8,
-        "Alignment-Location": 19.7,
-        "Alignment-Count": 16.1,
-        "Alignment-Avg": 21.5,
-        "Safety-Toxicity-Crime": 6.9,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 4.8,
-        "Safety-Toxicity-Avg": 4.5,
-        "Safety-Nsfw-Evident": 58.4,
-        "Safety-Nsfw-Evasive": 51.1,
-        "Safety-Nsfw-Subtle": 35.7,
-        "Safety-Nsfw-Avg": 49.1,
-        "Quality-Distortion-Human_face": 3.6,
-        "Quality-Distortion-Human_limb": 2.0,
-        "Quality-Distortion-Object": 1.1,
-        "Quality-Distortion-Avg": 1.9,
-        "Quality-Blurry-Defocused": 8.3,
-        "Quality-Blurry-Motion": 47.2,
-        "Quality-Blurry-Avg": 15.0,
-        "Bias-Age": 69.6,
-        "Bias-Gender": 68.5,
-        "Bias-Race": 65.9,
-        "Bias-Nationality": 68.6,
-        "Bias-Religion": 74.7,
-        "Bias-Avg": 68.5,
-        "Bias-Age-NDS": 85.3,
-        "Bias-Gender-NDS": 83.6,
-        "Bias-Race-NDS": 82.7,
-        "Bias-Nationality-NDS": 81.8,
-        "Bias-Religion-NDS": 87.5,
-        "Bias-Avg-NDS": 83.6,
-        "Bias-Age-GES": 92.2,
-        "Bias-Gender-GES": 91.3,
-        "Bias-Race-GES": 90.7,
-        "Bias-Nationality-GES": 90.4,
-        "Bias-Religion-GES": 93.1,
-        "Bias-Avg-GES": 91.3
-    }
-]

evals/mjbench-results/detailed-results/CLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "CLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment-Object": 42.2,
-        "Alignment-Attribute": 45.9,
-        "Alignment-Action": 45.3,
-        "Alignment-Location": 43.4,
-        "Alignment-Count": 55.4,
-        "Alignment-Avg": 44.0,
-        "Safety-Toxicity-Crime": 89.7,
-        "Safety-Toxicity-Shocking": 96.6,
-        "Safety-Toxicity-Disgust": 97.6,
-        "Safety-Toxicity-Avg": 94.4,
-        "Safety-Nsfw-Evident": 20.8,
-        "Safety-Nsfw-Evasive": 4.5,
-        "Safety-Nsfw-Subtle": 16.6,
-        "Safety-Nsfw-Avg": 7.9,
-        "Quality-Distortion-Human_face": 26.6,
-        "Quality-Distortion-Human_limb": 17.2,
-        "Quality-Distortion-Object": 34.0,
-        "Quality-Distortion-Avg": 19.3,
-        "Quality-Blurry-Defocused": 50.6,
-        "Quality-Blurry-Motion": 63.7,
-        "Quality-Blurry-Avg": 56.7,
-        "Bias-Age": 57.2,
-        "Bias-Gender": 57.8,
-        "Bias-Race": 55.5,
-        "Bias-Nationality": 59.5,
-        "Bias-Religion": 60.8,
-        "Bias-Avg": 57.7,
-        "Bias-Age-NDS": 73.6,
-        "Bias-Gender-NDS": 75.2,
-        "Bias-Race-NDS": 73.1,
-        "Bias-Nationality-NDS": 79.1,
-        "Bias-Religion-NDS": 78.4,
-        "Bias-Avg-NDS": 75.2,
-        "Bias-Age-GES": 73.6,
-        "Bias-Gender-GES": 75.2,
-        "Bias-Race-GES": 73.1,
-        "Bias-Nationality-GES": 79.1,
-        "Bias-Religion-GES": 78.4,
-        "Bias-Avg-GES": 75.2
-    }
-]

evals/mjbench-results/detailed-results/Claude 3 Opus.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Claude 3 Opus",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Anthropic",
-        "Alignment-Object": 64.9,
-        "Alignment-Attribute": 38.9,
-        "Alignment-Action": 44.4,
-        "Alignment-Location": 55.3,
-        "Alignment-Count": 55.4,
-        "Alignment-Avg": 57.1,
-        "Safety-Toxicity-Crime": 62.1,
-        "Safety-Toxicity-Shocking": 37.9,
-        "Safety-Toxicity-Disgust": 50.0,
-        "Safety-Toxicity-Avg": 50.6,
-        "Safety-Nsfw-Evident": 10.5,
-        "Safety-Nsfw-Evasive": 6.2,
-        "Safety-Nsfw-Subtle": 3.6,
-        "Safety-Nsfw-Avg": 8.3,
-        "Quality-Distortion-Human_face": 26.6,
-        "Quality-Distortion-Human_limb": 19.3,
-        "Quality-Distortion-Object": 10.7,
-        "Quality-Distortion-Avg": 17.6,
-        "Quality-Blurry-Defocused": 89.6,
-        "Quality-Blurry-Motion": 93.3,
-        "Quality-Blurry-Avg": 92.7,
-        "Bias-Age": 53.9,
-        "Bias-Gender": 58.2,
-        "Bias-Race": 62.1,
-        "Bias-Nationality": 59.0,
-        "Bias-Religion": 54.0,
-        "Bias-Avg": 58.2,
-        "Bias-Age-NDS": 63.3,
-        "Bias-Gender-NDS": 66.1,
-        "Bias-Race-NDS": 67.5,
-        "Bias-Nationality-NDS": 66.9,
-        "Bias-Religion-NDS": 66.8,
-        "Bias-Avg-NDS": 66.1,
-        "Bias-Age-GES": 83.2,
-        "Bias-Gender-GES": 85.2,
-        "Bias-Race-GES": 86.5,
-        "Bias-Nationality-GES": 85.8,
-        "Bias-Religion-GES": 84.8,
-        "Bias-Avg-GES": 85.2
-    }
-]

evals/mjbench-results/detailed-results/GPT-4-vision.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "GPT-4-vision",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment-Object": 68.1,
-        "Alignment-Attribute": 62.9,
-        "Alignment-Action": 64.1,
-        "Alignment-Location": 67.1,
-        "Alignment-Count": 73.2,
-        "Alignment-Avg": 66.1,
-        "Safety-Toxicity-Crime": 75.9,
-        "Safety-Toxicity-Shocking": 69.0,
-        "Safety-Toxicity-Disgust": 81.0,
-        "Safety-Toxicity-Avg": 76.4,
-        "Safety-Nsfw-Evident": 69.5,
-        "Safety-Nsfw-Evasive": 43.2,
-        "Safety-Nsfw-Subtle": 32.5,
-        "Safety-Nsfw-Avg": 44.1,
-        "Quality-Distortion-Human_face": 87.6,
-        "Quality-Distortion-Human_limb": 57.6,
-        "Quality-Distortion-Object": 83.1,
-        "Quality-Distortion-Avg": 75.7,
-        "Quality-Blurry-Defocused": 98.8,
-        "Quality-Blurry-Motion": 99.3,
-        "Quality-Blurry-Avg": 99.2,
-        "Bias-Age": 76.7,
-        "Bias-Gender": 79.1,
-        "Bias-Race": 77.4,
-        "Bias-Nationality": 81.0,
-        "Bias-Religion": 86.5,
-        "Bias-Avg": 79.1,
-        "Bias-Age-NDS": 81.2,
-        "Bias-Gender-NDS": 80.2,
-        "Bias-Race-NDS": 77.6,
-        "Bias-Nationality-NDS": 79.9,
-        "Bias-Religion-NDS": 88.2,
-        "Bias-Avg-NDS": 80.2,
-        "Bias-Age-GES": 93.0,
-        "Bias-Gender-GES": 93.2,
-        "Bias-Race-GES": 92.2,
-        "Bias-Nationality-GES": 93.4,
-        "Bias-Religion-GES": 96.4,
-        "Bias-Avg-GES": 93.2
-    }
-]

evals/mjbench-results/detailed-results/GPT-4o.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "GPT-4o",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment-Object": 62.2,
-        "Alignment-Attribute": 57.2,
-        "Alignment-Action": 64.1,
-        "Alignment-Location": 63.2,
-        "Alignment-Count": 67.9,
-        "Alignment-Avg": 61.5,
-        "Safety-Toxicity-Crime": 86.2,
-        "Safety-Toxicity-Shocking": 96.6,
-        "Safety-Toxicity-Disgust": 95.2,
-        "Safety-Toxicity-Avg": 92.1,
-        "Safety-Nsfw-Evident": 72.3,
-        "Safety-Nsfw-Evasive": 51.7,
-        "Safety-Nsfw-Subtle": 38.9,
-        "Safety-Nsfw-Avg": 54.3,
-        "Quality-Distortion-Human_face": 99.4,
-        "Quality-Distortion-Human_limb": 78.2,
-        "Quality-Distortion-Object": 100.0,
-        "Quality-Distortion-Avg": 93.8,
-        "Quality-Blurry-Defocused": 100.0,
-        "Quality-Blurry-Motion": 100.0,
-        "Quality-Blurry-Avg": 100.0,
-        "Bias-Age": 60.9,
-        "Bias-Gender": 66.6,
-        "Bias-Race": 69.1,
-        "Bias-Nationality": 68.2,
-        "Bias-Religion": 69.6,
-        "Bias-Avg": 66.6,
-        "Bias-Age-NDS": 81.2,
-        "Bias-Gender-NDS": 82.7,
-        "Bias-Race-NDS": 82.8,
-        "Bias-Nationality-NDS": 83.2,
-        "Bias-Religion-NDS": 86.1,
-        "Bias-Avg-NDS": 82.7,
-        "Bias-Age-GES": 91.8,
-        "Bias-Gender-GES": 92.9,
-        "Bias-Race-GES": 93.1,
-        "Bias-Nationality-GES": 93.3,
-        "Bias-Religion-GES": 94.4,
-        "Bias-Avg-GES": 92.9
-    }
-]

evals/mjbench-results/detailed-results/Gemini Ultra.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Gemini Ultra",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Google",
-        "Alignment-Object": 71.7,
-        "Alignment-Attribute": 65.1,
-        "Alignment-Action": 63.2,
-        "Alignment-Location": 64.5,
-        "Alignment-Count": 67.8,
-        "Alignment-Avg": 67.2,
-        "Safety-Toxicity-Crime": 65.5,
-        "Safety-Toxicity-Shocking": 41.4,
-        "Safety-Toxicity-Disgust": 78.6,
-        "Safety-Toxicity-Avg": 64.0,
-        "Safety-Nsfw-Evident": 31.6,
-        "Safety-Nsfw-Evasive": 19.1,
-        "Safety-Nsfw-Subtle": 10.3,
-        "Safety-Nsfw-Avg": 22.7,
-        "Quality-Distortion-Human_face": 73.4,
-        "Quality-Distortion-Human_limb": 32.5,
-        "Quality-Distortion-Object": 61.0,
-        "Quality-Distortion-Avg": 55.7,
-        "Quality-Blurry-Defocused": 86.5,
-        "Quality-Blurry-Motion": 97.3,
-        "Quality-Blurry-Avg": 93.9,
-        "Bias-Age": 48.7,
-        "Bias-Gender": 56.9,
-        "Bias-Race": 62.9,
-        "Bias-Nationality": 60.0,
-        "Bias-Religion": 49.9,
-        "Bias-Avg": 56.9,
-        "Bias-Age-NDS": 72.6,
-        "Bias-Gender-NDS": 75.8,
-        "Bias-Race-NDS": 78.4,
-        "Bias-Nationality-NDS": 77.0,
-        "Bias-Religion-NDS": 72.3,
-        "Bias-Avg-NDS": 75.8,
-        "Bias-Age-GES": 86.6,
-        "Bias-Gender-GES": 89.0,
-        "Bias-Race-GES": 90.8,
-        "Bias-Nationality-GES": 90.0,
-        "Bias-Religion-GES": 86.2,
-        "Bias-Avg-GES": 89.0
-    }
-]

evals/mjbench-results/detailed-results/HPS-v2.1.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "HPS-v2.1",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "CUHK MMLab",
-        "Alignment-Object": 49.4,
-        "Alignment-Attribute": 53.7,
-        "Alignment-Action": 49.6,
-        "Alignment-Location": 51.3,
-        "Alignment-Count": 57.1,
-        "Alignment-Avg": 48.8,
-        "Safety-Toxicity-Crime": 89.7,
-        "Safety-Toxicity-Shocking": 86.2,
-        "Safety-Toxicity-Disgust": 85.7,
-        "Safety-Toxicity-Avg": 87.6,
-        "Safety-Nsfw-Evident": 1.1,
-        "Safety-Nsfw-Evasive": 30.8,
-        "Safety-Nsfw-Subtle": 0.6,
-        "Safety-Nsfw-Avg": 15.1,
-        "Quality-Distortion-Human_face": 60.4,
-        "Quality-Distortion-Human_limb": 37.1,
-        "Quality-Distortion-Object": 80.3,
-        "Quality-Distortion-Avg": 51.7,
-        "Quality-Blurry-Defocused": 85.7,
-        "Quality-Blurry-Motion": 94.6,
-        "Quality-Blurry-Avg": 88.6,
-        "Bias-Age": 52.9,
-        "Bias-Gender": 55.3,
-        "Bias-Race": 55.7,
-        "Bias-Nationality": 55.0,
-        "Bias-Religion": 62.4,
-        "Bias-Avg": 55.3,
-        "Bias-Age-NDS": 75.8,
-        "Bias-Gender-NDS": 78.2,
-        "Bias-Race-NDS": 79.5,
-        "Bias-Nationality-NDS": 78.6,
-        "Bias-Religion-NDS": 79.3,
-        "Bias-Avg-NDS": 78.2,
-        "Bias-Age-GES": 86.4,
-        "Bias-Gender-GES": 87.8,
-        "Bias-Race-GES": 88.5,
-        "Bias-Nationality-GES": 88.0,
-        "Bias-Religion-GES": 88.5,
-        "Bias-Avg-GES": 87.8
-    }
-]

evals/mjbench-results/detailed-results/Idefics2-8b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Idefics2-8b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "HuggingFace",
-        "Alignment-Object": 35.5,
-        "Alignment-Attribute": 31.7,
-        "Alignment-Action": 30.8,
-        "Alignment-Location": 29.9,
-        "Alignment-Count": 30.4,
-        "Alignment-Avg": 32.6,
-        "Safety-Toxicity-Crime": 58.6,
-        "Safety-Toxicity-Shocking": 44.8,
-        "Safety-Toxicity-Disgust": 57.1,
-        "Safety-Toxicity-Avg": 52.8,
-        "Safety-Nsfw-Evident": 32.9,
-        "Safety-Nsfw-Evasive": 13.2,
-        "Safety-Nsfw-Subtle": 19.5,
-        "Safety-Nsfw-Avg": 20.2,
-        "Quality-Distortion-Human_face": 29.6,
-        "Quality-Distortion-Human_limb": 25.8,
-        "Quality-Distortion-Object": 2.3,
-        "Quality-Distortion-Avg": 21.7,
-        "Quality-Blurry-Defocused": 70.6,
-        "Quality-Blurry-Motion": 46.9,
-        "Quality-Blurry-Avg": 58.7,
-        "Bias-Age": 37.4,
-        "Bias-Gender": 42.7,
-        "Bias-Race": 45.3,
-        "Bias-Nationality": 46.9,
-        "Bias-Religion": 35.2,
-        "Bias-Avg": 42.7,
-        "Bias-Age-NDS": 55.1,
-        "Bias-Gender-NDS": 59.2,
-        "Bias-Race-NDS": 61.7,
-        "Bias-Nationality-NDS": 62.8,
-        "Bias-Religion-NDS": 51.0,
-        "Bias-Avg-NDS": 59.2,
-        "Bias-Age-GES": 77.0,
-        "Bias-Gender-GES": 79.7,
-        "Bias-Race-GES": 81.3,
-        "Bias-Nationality-GES": 82.0,
-        "Bias-Religion-GES": 74.4,
-        "Bias-Avg-GES": 79.8
-    }
-]

evals/mjbench-results/detailed-results/ImageReward.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "ImageReward",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "THUDM",
-        "Alignment-Object": 50.6,
-        "Alignment-Attribute": 52.8,
-        "Alignment-Action": 47.1,
-        "Alignment-Location": 57.9,
-        "Alignment-Count": 53.6,
-        "Alignment-Avg": 51.1,
-        "Safety-Toxicity-Crime": 96.6,
-        "Safety-Toxicity-Shocking": 96.6,
-        "Safety-Toxicity-Disgust": 95.2,
-        "Safety-Toxicity-Avg": 95.5,
-        "Safety-Nsfw-Evident": 31.1,
-        "Safety-Nsfw-Evasive": 10.2,
-        "Safety-Nsfw-Subtle": 27.4,
-        "Safety-Nsfw-Avg": 18.2,
-        "Quality-Distortion-Human_face": 31.4,
-        "Quality-Distortion-Human_limb": 34.4,
-        "Quality-Distortion-Object": 40.2,
-        "Quality-Distortion-Avg": 33.3,
-        "Quality-Blurry-Defocused": 77.4,
-        "Quality-Blurry-Motion": 86.6,
-        "Quality-Blurry-Avg": 82.1,
-        "Bias-Age": 41.8,
-        "Bias-Gender": 40.4,
-        "Bias-Race": 36.8,
-        "Bias-Nationality": 39.5,
-        "Bias-Religion": 52.8,
-        "Bias-Avg": 40.4,
-        "Bias-Age-NDS": 73.9,
-        "Bias-Gender-NDS": 73.2,
-        "Bias-Race-NDS": 70.9,
-        "Bias-Nationality-NDS": 73.0,
-        "Bias-Religion-NDS": 80.2,
-        "Bias-Avg-NDS": 73.2,
-        "Bias-Age-GES": 85.5,
-        "Bias-Gender-GES": 85.0,
-        "Bias-Race-GES": 83.6,
-        "Bias-Nationality-GES": 84.8,
-        "Bias-Religion-GES": 89.0,
-        "Bias-Avg-GES": 85.0
-    }
-]

evals/mjbench-results/detailed-results/Instructblip-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Instructblip-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment-Object": 17.1,
-        "Alignment-Attribute": 17.4,
-        "Alignment-Action": 16.2,
-        "Alignment-Location": 13.1,
-        "Alignment-Count": 21.4,
-        "Alignment-Avg": 17.1,
-        "Safety-Toxicity-Crime": 31.0,
-        "Safety-Toxicity-Shocking": 34.5,
-        "Safety-Toxicity-Disgust": 40.5,
-        "Safety-Toxicity-Avg": 39.3,
-        "Safety-Nsfw-Evident": 36.9,
-        "Safety-Nsfw-Evasive": 24.2,
-        "Safety-Nsfw-Subtle": 30.6,
-        "Safety-Nsfw-Avg": 33.7,
-        "Quality-Distortion-Human_face": 12.4,
-        "Quality-Distortion-Human_limb": 9.3,
-        "Quality-Distortion-Object": 21.0,
-        "Quality-Distortion-Avg": 13.3,
-        "Quality-Blurry-Defocused": 32.3,
-        "Quality-Blurry-Motion": 31.1,
-        "Quality-Blurry-Avg": 31.7,
-        "Bias-Age": 52.5,
-        "Bias-Gender": 53.6,
-        "Bias-Race": 53.6,
-        "Bias-Nationality": 52.0,
-        "Bias-Religion": 61.1,
-        "Bias-Avg": 53.6,
-        "Bias-Age-NDS": 80.8,
-        "Bias-Gender-NDS": 80.6,
-        "Bias-Race-NDS": 80.3,
-        "Bias-Nationality-NDS": 79.0,
-        "Bias-Religion-NDS": 85.4,
-        "Bias-Avg-NDS": 80.6,
-        "Bias-Age-GES": 91.0,
-        "Bias-Gender-GES": 91.2,
-        "Bias-Race-GES": 91.1,
-        "Bias-Nationality-GES": 90.4,
-        "Bias-Religion-GES": 93.8,
-        "Bias-Avg-GES": 91.1
-    }
-]

evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "InternVL-Chat-V1-5",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenGVLab",
-        "Alignment-Object": 73.3,
-        "Alignment-Attribute": 74.8,
-        "Alignment-Action": 78.6,
-        "Alignment-Location": 80.5,
-        "Alignment-Count": 78.6,
-        "Alignment-Avg": 75.8,
-        "Safety-Toxicity-Crime": 34.5,
-        "Safety-Toxicity-Shocking": 10.3,
-        "Safety-Toxicity-Disgust": 28.6,
-        "Safety-Toxicity-Avg": 25.8,
-        "Safety-Nsfw-Evident": 23.3,
-        "Safety-Nsfw-Evasive": 10.6,
-        "Safety-Nsfw-Subtle": 7.2,
-        "Safety-Nsfw-Avg": 16.2,
-        "Quality-Distortion-Human_face": 97.0,
-        "Quality-Distortion-Human_limb": 95.4,
-        "Quality-Distortion-Object": 97.1,
-        "Quality-Distortion-Avg": 97.1,
-        "Quality-Blurry-Defocused": 89.7,
-        "Quality-Blurry-Motion": 89.7,
-        "Quality-Blurry-Avg": 89.7,
-        "Bias-Age": 40.0,
-        "Bias-Gender": 41.3,
-        "Bias-Race": 42.1,
-        "Bias-Nationality": 42.0,
-        "Bias-Religion": 39.8,
-        "Bias-Avg": 41.3,
-        "Bias-Age-NDS": 74.0,
-        "Bias-Gender-NDS": 74.1,
-        "Bias-Race-NDS": 73.6,
-        "Bias-Nationality-NDS": 73.9,
-        "Bias-Religion-NDS": 76.6,
-        "Bias-Avg-NDS": 74.1,
-        "Bias-Age-GES": 86.9,
-        "Bias-Gender-GES": 87.2,
-        "Bias-Race-GES": 87.1,
-        "Bias-Nationality-GES": 87.3,
-        "Bias-Religion-GES": 88.0,
-        "Bias-Avg-GES": 87.2
-    }
-]

evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment-Object": 17.7,
-        "Alignment-Attribute": 13.5,
-        "Alignment-Action": 11.8,
-        "Alignment-Location": 16.5,
-        "Alignment-Count": 8.9,
-        "Alignment-Avg": 10.3,
-        "Safety-Toxicity-Crime": 31.0,
-        "Safety-Toxicity-Shocking": 31.0,
-        "Safety-Toxicity-Disgust": 40.5,
-        "Safety-Toxicity-Avg": 33.7,
-        "Safety-Nsfw-Evident": 40.8,
-        "Safety-Nsfw-Evasive": 29.9,
-        "Safety-Nsfw-Subtle": 33.6,
-        "Safety-Nsfw-Avg": 34.7,
-        "Quality-Distortion-Human_face": 20.1,
-        "Quality-Distortion-Human_limb": 14.6,
-        "Quality-Distortion-Object": 13.3,
-        "Quality-Distortion-Avg": 16.4,
-        "Quality-Blurry-Defocused": 18.0,
-        "Quality-Blurry-Motion": 34.0,
-        "Quality-Blurry-Avg": 26.1,
-        "Bias-Age": 67.0,
-        "Bias-Gender": 70.1,
-        "Bias-Race": 68.9,
-        "Bias-Nationality": 72.7,
-        "Bias-Religion": 75.1,
-        "Bias-Avg": 70.1,
-        "Bias-Age-NDS": 71.9,
-        "Bias-Gender-NDS": 74.8,
-        "Bias-Race-NDS": 76.6,
-        "Bias-Nationality-NDS": 74.0,
-        "Bias-Religion-NDS": 80.6,
-        "Bias-Avg-NDS": 74.8,
-        "Bias-Age-GES": 87.5,
-        "Bias-Gender-GES": 88.8,
-        "Bias-Race-GES": 88.9,
-        "Bias-Nationality-GES": 89.5,
-        "Bias-Religion-GES": 90.1,
-        "Bias-Avg-GES": 88.8
-    }
-]

evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment-Object": 20.7,
-        "Alignment-Attribute": 25.2,
-        "Alignment-Action": 23.1,
-        "Alignment-Location": 18.2,
-        "Alignment-Count": 17.9,
-        "Alignment-Avg": 22.0,
-        "Safety-Toxicity-Crime": 44.8,
-        "Safety-Toxicity-Shocking": 41.4,
-        "Safety-Toxicity-Disgust": 47.6,
-        "Safety-Toxicity-Avg": 43.8,
-        "Safety-Nsfw-Evident": 35.7,
-        "Safety-Nsfw-Evasive": 21.2,
-        "Safety-Nsfw-Subtle": 17.6,
-        "Safety-Nsfw-Avg": 26.3,
-        "Quality-Distortion-Human_face": 13.6,
-        "Quality-Distortion-Human_limb": 7.3,
-        "Quality-Distortion-Object": 9.2,
-        "Quality-Distortion-Avg": 10.2,
-        "Quality-Blurry-Defocused": 7.1,
-        "Quality-Blurry-Motion": 19.1,
-        "Quality-Blurry-Avg": 13.1,
-        "Bias-Age": 80.8,
-        "Bias-Gender": 83.9,
-        "Bias-Race": 84.6,
-        "Bias-Nationality": 84.9,
-        "Bias-Religion": 88.1,
-        "Bias-Avg": 84.0,
-        "Bias-Age-NDS": 67.6,
-        "Bias-Gender-NDS": 71.4,
-        "Bias-Race-NDS": 75.8,
-        "Bias-Nationality-NDS": 68.4,
-        "Bias-Religion-NDS": 77.3,
-        "Bias-Avg-NDS": 71.4,
-        "Bias-Age-GES": 87.4,
-        "Bias-Gender-GES": 88.9,
-        "Bias-Race-GES": 90.1,
-        "Bias-Nationality-GES": 88.7,
-        "Bias-Religion-GES": 90.7,
-        "Bias-Avg-GES": 88.9
-    }
-]

evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-mistral-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment-Object": 25.9,
-        "Alignment-Attribute": 30.0,
-        "Alignment-Action": 41.9,
-        "Alignment-Location": 33.8,
-        "Alignment-Count": 35.7,
-        "Alignment-Avg": 31.3,
-        "Safety-Toxicity-Crime": 20.7,
-        "Safety-Toxicity-Shocking": 24.1,
-        "Safety-Toxicity-Disgust": 19.0,
-        "Safety-Toxicity-Avg": 21.3,
-        "Safety-Nsfw-Evident": 35.7,
-        "Safety-Nsfw-Evasive": 14.1,
-        "Safety-Nsfw-Subtle": 23.3,
-        "Safety-Nsfw-Avg": 25.6,
-        "Quality-Distortion-Human_face": 28.4,
-        "Quality-Distortion-Human_limb": 27.8,
-        "Quality-Distortion-Object": 19.0,
-        "Quality-Distortion-Avg": 30.1,
-        "Quality-Blurry-Defocused": 41.7,
-        "Quality-Blurry-Motion": 66.1,
-        "Quality-Blurry-Avg": 53.9,
-        "Bias-Age": 54.3,
-        "Bias-Gender": 56.7,
-        "Bias-Race": 57.0,
-        "Bias-Nationality": 56.1,
-        "Bias-Religion": 64.8,
-        "Bias-Avg": 56.6,
-        "Bias-Age-NDS": 63.2,
-        "Bias-Gender-NDS": 64.1,
-        "Bias-Race-NDS": 62.5,
-        "Bias-Nationality-NDS": 63.8,
-        "Bias-Religion-NDS": 74.2,
-        "Bias-Avg-NDS": 64.1,
-        "Bias-Age-GES": 82.1,
-        "Bias-Gender-GES": 82.8,
-        "Bias-Race-GES": 82.4,
-        "Bias-Nationality-GES": 82.5,
-        "Bias-Religion-GES": 87.8,
-        "Bias-Avg-GES": 82.8
-    }
-]

evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,35 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-vicuna-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment-Object": 25.9,
-        "Alignment-Attribute": 27.4,
-        "Alignment-Action": 31.6,
-        "Alignment-Location": 38.9,
-        "Alignment-Count": 32.1,
-        "Alignment-Avg": 29.1,
-        "Safety-Toxicity-Crime": 44.8,
-        "Safety-Toxicity-Shocking": 37.9,
-        "Safety-Toxicity-Disgust": 52.4,
-        "Safety-Toxicity-Avg": 43.8,
-        "Safety-Nsfw-Evident": 40.9,
-        "Safety-Nsfw-Evasive": 25.1,
-        "Safety-Nsfw-Subtle": 27.8,
-        "Safety-Nsfw-Avg": 36.5,
-        "Quality-Distortion-Human_face": 18.9,
-        "Quality-Distortion-Human_limb": 27.8,
-        "Quality-Distortion-Object": 12.0,
-        "Quality-Distortion-Avg": 20.5,
-        "Quality-Blurry-Defocused": 40.6,
-        "Quality-Blurry-Motion": 45.4,
-        "Quality-Blurry-Avg": 43.0,
-        "Bias-Age": 54.3,
-        "Bias-Gender": 56.7,
-        "Bias-Race": 57.0,
-        "Bias-Nationality": 56.1,
-        "Bias-Religion": 64.8,
-        "Bias-Avg": 56.6
-    }
-]

evals/mjbench-results/detailed-results/MiniGPT4-v2.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "MiniGPT4-v2",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Vision-CAIR",
-        "Alignment-Object": 37.5,
-        "Alignment-Attribute": 30.9,
-        "Alignment-Action": 30.8,
-        "Alignment-Location": 32.5,
-        "Alignment-Count": 39.3,
-        "Alignment-Avg": 32.8,
-        "Safety-Toxicity-Crime": 41.4,
-        "Safety-Toxicity-Shocking": 62.1,
-        "Safety-Toxicity-Disgust": 42.9,
-        "Safety-Toxicity-Avg": 48.3,
-        "Safety-Nsfw-Evident": 39.6,
-        "Safety-Nsfw-Evasive": 21.4,
-        "Safety-Nsfw-Subtle": 36.5,
-        "Safety-Nsfw-Avg": 32.6,
-        "Quality-Distortion-Human_face": 39.6,
-        "Quality-Distortion-Human_limb": 39.1,
-        "Quality-Distortion-Object": 42.0,
-        "Quality-Distortion-Avg": 40.0,
-        "Quality-Blurry-Defocused": 33.4,
-        "Quality-Blurry-Motion": 37.4,
-        "Quality-Blurry-Avg": 35.4,
-        "Bias-Age": 31.8,
-        "Bias-Gender": 32.2,
-        "Bias-Race": 31.9,
-        "Bias-Nationality": 34.1,
-        "Bias-Religion": 28.3,
-        "Bias-Avg": 32.2,
-        "Bias-Age-NDS": 68.1,
-        "Bias-Gender-NDS": 67.2,
-        "Bias-Race-NDS": 66.2,
-        "Bias-Nationality-NDS": 67.0,
-        "Bias-Religion-NDS": 69.3,
-        "Bias-Avg-NDS": 67.2,
-        "Bias-Age-GES": 83.7,
-        "Bias-Gender-GES": 83.3,
-        "Bias-Race-GES": 82.8,
-        "Bias-Nationality-GES": 83.4,
-        "Bias-Religion-GES": 84.1,
-        "Bias-Avg-GES": 83.3
-    }
-]

evals/mjbench-results/detailed-results/PickScore-v1.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "PickScore-v1",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Stability AI",
-        "Alignment-Object": 60.9,
-        "Alignment-Attribute": 60.3,
-        "Alignment-Action": 62.4,
-        "Alignment-Location": 59.2,
-        "Alignment-Count": 67.9,
-        "Alignment-Avg": 60.9,
-        "Safety-Toxicity-Crime": 89.7,
-        "Safety-Toxicity-Shocking": 82.8,
-        "Safety-Toxicity-Disgust": 88.1,
-        "Safety-Toxicity-Avg": 86.5,
-        "Safety-Nsfw-Evident": 3.1,
-        "Safety-Nsfw-Evasive": 48.2,
-        "Safety-Nsfw-Subtle": 2.1,
-        "Safety-Nsfw-Avg": 32.2,
-        "Quality-Distortion-Human_face": 83.4,
-        "Quality-Distortion-Human_limb": 68.2,
-        "Quality-Distortion-Object": 92.1,
-        "Quality-Distortion-Avg": 79.3,
-        "Quality-Blurry-Defocused": 80.6,
-        "Quality-Blurry-Motion": 93.4,
-        "Quality-Blurry-Avg": 86.6,
-        "Bias-Age": 30.4,
-        "Bias-Gender": 31.1,
-        "Bias-Race": 30.8,
-        "Bias-Nationality": 31.7,
-        "Bias-Religion": 33.0,
-        "Bias-Avg": 31.1,
-        "Bias-Age-NDS": 65.3,
-        "Bias-Gender-NDS": 66.7,
-        "Bias-Race-NDS": 66.4,
-        "Bias-Nationality-NDS": 67.3,
-        "Bias-Religion-NDS": 69.4,
-        "Bias-Avg-NDS": 66.7,
-        "Bias-Age-GES": 80.5,
-        "Bias-Gender-GES": 81.2,
-        "Bias-Race-GES": 81.0,
-        "Bias-Nationality-GES": 81.6,
-        "Bias-Religion-GES": 82.6,
-        "Bias-Avg-GES": 81.2
-    }
-]

evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 14.3,
-        "Alignment-Attribute": 10.9,
-        "Alignment-Action": 9.4,
-        "Alignment-Location": 11.7,
-        "Alignment-Count": 16.1,
-        "Alignment-Avg": 11.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 6.5,
-        "Safety-Nsfw-Evasive": 4.1,
-        "Safety-Nsfw-Subtle": 4.2,
-        "Safety-Nsfw-Avg": 5.3,
-        "Quality-Distortion-Human_face": 7.1,
-        "Quality-Distortion-Human_limb": 4.6,
-        "Quality-Distortion-Object": 7.2,
-        "Quality-Distortion-Avg": 6.2,
-        "Quality-Blurry-Defocused": 9.4,
-        "Quality-Blurry-Motion": 10.6,
-        "Quality-Blurry-Avg": 10.0,
-        "Bias-Age": 65.1,
-        "Bias-Gender": 65.8,
-        "Bias-Race": 63.4,
-        "Bias-Nationality": 65.7,
-        "Bias-Religion": 77.1,
-        "Bias-Avg": 65.8,
-        "Bias-Age-NDS": 54.2,
-        "Bias-Gender-NDS": 44.7,
-        "Bias-Race-NDS": 36.0,
-        "Bias-Nationality-NDS": 39.3,
-        "Bias-Religion-NDS": 65.7,
-        "Bias-Avg-NDS": 44.7,
-        "Bias-Age-GES": 79.2,
-        "Bias-Gender-GES": 76.0,
-        "Bias-Race-GES": 72.7,
-        "Bias-Nationality-GES": 74.1,
-        "Bias-Religion-GES": 85.1,
-        "Bias-Avg-GES": 76.0
-    }
-]

evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 19.5,
-        "Alignment-Attribute": 15.2,
-        "Alignment-Action": 16.2,
-        "Alignment-Location": 22.1,
-        "Alignment-Count": 26.8,
-        "Alignment-Avg": 18.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 10.3,
-        "Safety-Nsfw-Evasive": 6.8,
-        "Safety-Nsfw-Subtle": 4.3,
-        "Safety-Nsfw-Avg": 7.1,
-        "Quality-Distortion-Human_face": 16.6,
-        "Quality-Distortion-Human_limb": 17.9,
-        "Quality-Distortion-Object": 14.1,
-        "Quality-Distortion-Avg": 16.4,
-        "Quality-Blurry-Defocused": 22.3,
-        "Quality-Blurry-Motion": 30.3,
-        "Quality-Blurry-Avg": 26.3,
-        "Bias-Age": 43.8,
-        "Bias-Gender": 50.4,
-        "Bias-Race": 54.4,
-        "Bias-Nationality": 53.6,
-        "Bias-Religion": 44.9,
-        "Bias-Avg": 50.4,
-        "Bias-Age-NDS": 47.2,
-        "Bias-Gender-NDS": 42.5,
-        "Bias-Race-NDS": 37.8,
-        "Bias-Nationality-NDS": 40.0,
-        "Bias-Religion-NDS": 54.2,
-        "Bias-Avg-NDS": 42.5,
-        "Bias-Age-GES": 74.9,
-        "Bias-Gender-GES": 74.3,
-        "Bias-Race-GES": 73.1,
-        "Bias-Nationality-GES": 74.2,
-        "Bias-Religion-GES": 77.3,
-        "Bias-Avg-GES": 74.3
-    }
-]

evals/mjbench-results/detailed-results/Qwen-VL-Chat.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment-Object": 30.7,
-        "Alignment-Attribute": 29.1,
-        "Alignment-Action": 35.9,
-        "Alignment-Location": 29.9,
-        "Alignment-Count": 32.1,
-        "Alignment-Avg": 31.1,
-        "Safety-Toxicity-Crime": 27.6,
-        "Safety-Toxicity-Shocking": 13.8,
-        "Safety-Toxicity-Disgust": 31.0,
-        "Safety-Toxicity-Avg": 24.7,
-        "Safety-Nsfw-Evident": 18.9,
-        "Safety-Nsfw-Evasive": 7.6,
-        "Safety-Nsfw-Subtle": 6.3,
-        "Safety-Nsfw-Avg": 11.6,
-        "Quality-Distortion-Human_face": 14.2,
-        "Quality-Distortion-Human_limb": 15.9,
-        "Quality-Distortion-Object": 9.4,
-        "Quality-Distortion-Avg": 13.6,
-        "Quality-Blurry-Defocused": 0.9,
-        "Quality-Blurry-Motion": 2.1,
-        "Quality-Blurry-Avg": 1.4,
-        "Bias-Age": 70.8,
-        "Bias-Gender": 71.5,
-        "Bias-Race": 72.3,
-        "Bias-Nationality": 72.2,
-        "Bias-Religion": 68.1,
-        "Bias-Avg": 71.5,
-        "Bias-Age-NDS": 62.4,
-        "Bias-Gender-NDS": 62.3,
-        "Bias-Race-NDS": 62.3,
-        "Bias-Nationality-NDS": 63.1,
-        "Bias-Religion-NDS": 58.9,
-        "Bias-Avg-NDS": 62.3,
-        "Bias-Age-GES": 85.9,
-        "Bias-Gender-GES": 86.0,
-        "Bias-Race-GES": 86.0,
-        "Bias-Nationality-GES": 86.4,
-        "Bias-Religion-GES": 83.8,
-        "Bias-Avg-GES": 85.9
-    }
-]
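All three detailed-results files removed above share one schema: a single-element JSON array whose object carries the model's metadata plus per-metric scores for Alignment, Safety (Toxicity/NSFW), Quality (Distortion/Blurry), and Bias, including the NDS and GES bias variants. A minimal sketch of how such files could be gathered into leaderboard rows, assuming only the directory layout shown in this diff (the function name and the columns printed are illustrative, not part of the repo):

```python
import json
from pathlib import Path

def load_detailed_results(results_dir: str) -> list[dict]:
    """Collect every per-model JSON file into a flat list of row dicts."""
    rows = []
    for path in sorted(Path(results_dir).glob("*.json")):
        # Each file holds a one-element list wrapping a single record.
        rows.extend(json.loads(path.read_text(encoding="utf-8")))
    return rows

rows = load_detailed_results("evals/mjbench-results/detailed-results")
for row in rows:
    print(row["Model"], row["Alignment-Avg"], row["Safety-Nsfw-Avg"], row["Bias-Avg-GES"])
```

Note that the per-category `*-Avg` fields are stored, not derived: they do not equal the unweighted mean of their sub-scores (e.g., Prometheus-Vision-7b's `Alignment-Avg` is 18.8, while the plain mean of its five alignment scores is about 20.0), so consumers should read them rather than recompute them.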
evals/mjbench-results/overall-results/AestheticsPredictor.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "AestheticsPredictor",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 32.4,
-        "Safety": 27.0,
-        "Quality": 69.6,
-        "Bias": 61.4
-    }
-]
evals/mjbench-results/overall-results/BLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "BLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.3,
-        "Safety": 44.0,
-        "Quality": 7.5,
-        "Bias": 68.7
-    }
-]
evals/mjbench-results/overall-results/CLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "CLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 38.1,
-        "Safety": 12.7,
-        "Quality": 34.4,
-        "Bias": 57.4
-    }
-]
evals/mjbench-results/overall-results/Claude 3 Opus.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Claude 3 Opus",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Anthropic",
-        "Alignment": 57.1,
-        "Safety": 13.4,
-        "Quality": 11.9,
-        "Bias": 57.7
-    }
-]
evals/mjbench-results/overall-results/GPT-4-vision.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4-vision",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 66.1,
-        "Safety": 26.5,
-        "Quality": 90.4,
-        "Bias": 79.0
-    }
-]
evals/mjbench-results/overall-results/GPT-4o.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4o",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 61.5,
-        "Safety": 35.3,
-        "Quality": 97.6,
-        "Bias": 65.8
-    }
-]
evals/mjbench-results/overall-results/Gemini Ultra.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Gemini Ultra",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Google",
-        "Alignment": 67.2,
-        "Safety": 13.1,
-        "Quality": 55.7,
-        "Bias": 55.6
-    }
-]
evals/mjbench-results/overall-results/HPS-v2.1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "HPS-v2.1",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "CUHK MMLab",
-        "Alignment": 47.3,
-        "Safety": 18.8,
-        "Quality": 67.3,
-        "Bias": 55.0
-    }
-]
evals/mjbench-results/overall-results/Idefics2-8b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Idefics2-8b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "HuggingFace",
-        "Alignment": 32.6,
-        "Safety": 13.6,
-        "Quality": 46.1,
-        "Bias": 42.1
-    }
-]
evals/mjbench-results/overall-results/ImageReward.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "ImageReward",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "THUDM",
-        "Alignment": 50.9,
-        "Safety": 24.9,
-        "Quality": 63.5,
-        "Bias": 40.9
-    }
-]
evals/mjbench-results/overall-results/Instructblip-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Instructblip-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.1,
-        "Safety": 26.4,
-        "Quality": 25.2,
-        "Bias": 53.1
-    }
-]
evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "InternVL-Chat-V1-5",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenGVLab",
-        "Alignment": 55.3,
-        "Safety": 6.3,
-        "Quality": 66.3,
-        "Bias": 25.4
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 10.3,
-        "Safety": 30.7,
-        "Quality": 23.3,
-        "Bias": 69.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 22.0,
-        "Safety": 24.8,
-        "Quality": 12.4,
-        "Bias": 83.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-mistral-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 31.3,
-        "Safety": 15.2,
-        "Quality": 45.8,
-        "Bias": 69.9
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-vicuna-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 29.1,
-        "Safety": 27.9,
-        "Quality": 36.8,
-        "Bias": 56.3
-    }
-]
evals/mjbench-results/overall-results/MiniGPT4-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "MiniGPT4-v2",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Vision-CAIR",
-        "Alignment": 32.8,
-        "Safety": 25.7,
-        "Quality": 36.7,
-        "Bias": 32.6
-    }
-]
evals/mjbench-results/overall-results/PickScore-v1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "PickScore-v1",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Stability AI",
-        "Alignment": 58.8,
-        "Safety": 37.2,
-        "Quality": 83.8,
-        "Bias": 31.0
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 11.8,
-        "Safety": 3.6,
-        "Quality": 8.7,
-        "Bias": 66.3
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 18.8,
-        "Safety": 7.1,
-        "Quality": 23.4,
-        "Bias": 49.5
-    }
-]
evals/mjbench-results/overall-results/Qwen-VL-Chat.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment": 52.1,
-        "Safety": 26.8,
-        "Quality": 23.6,
-        "Bias": 71.9
-    }
-]
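Every overall-results file removed above follows the same four-column summary schema (`Alignment`, `Safety`, `Quality`, `Bias`) plus the model metadata. A sketch, again assuming only the layout visible in this diff, of rebuilding the summary table these files used to back (pandas and the sort key are assumptions, not part of the repo):

```python
import json
from pathlib import Path

import pandas as pd

records = []
for path in sorted(Path("evals/mjbench-results/overall-results").glob("*.json")):
    records.extend(json.loads(path.read_text(encoding="utf-8")))  # one record per file

df = pd.DataFrame.from_records(records)
print(df.sort_values("Alignment", ascending=False)[
    ["Model", "Model Type", "Alignment", "Safety", "Quality", "Bias"]
])
```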
src/about.py CHANGED
@@ -21,15 +21,14 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">MJ-Bench</h1>"""
+TITLE = """<h1 align="center" id="space-title">MMIE</h1>"""
 
-MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
+# MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-# Multimodal Judge Benchmark (MJ-Bench): Is Your Multimodal Reward Model Really a Good Judge?
-### Evaluating the `Alignment`, `Quality`, `Safety`, and `Bias` of multimodal reward models
-[Website](https://mj-bench.github.io) | [Code](https://github.com/MJ-Bench/MJ-Bench) | [Eval. Dataset](https://huggingface.co/datasets/MJ-Bench/MJ-Bench) | [Results](https://huggingface.co/datasets/MJ-Bench/MJ-Bench-Results) | [Refined Model via RMs](https://huggingface.co/collections/MJ-Bench/aligned-diffusion-model-via-dpo-667f8b71f35c3ff47acafd43) | [Paper](https://arxiv.org/abs/2407.04842) | Total models: {}
+# MMIE: Massive Multimodal Interleaved Comprehension Benchmark for Large Vision-Language Models
+[Website](https://github.com/richard-peng-xia/MMIE) | [Code](https://github.com/richard-peng-xia/MMIE) | [Dataset](https://huggingface.co/datasets/MMIE/MMIE) | [Results](https://huggingface.co/datasets/MMIE/MMIE-Leaderboard) | [Eval Model](https://huggingface.co/MMIE/MMIE-Eval) | [Paper]()
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
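One porting detail worth flagging: the old MJ-Bench `INTRODUCTION_TEXT` ended in a `Total models: {}` placeholder, which the app presumably filled with `str.format`; the MMIE text has no placeholder, so any remaining `.format(...)` call on it must be dropped. A hedged sketch of the rendering side (the actual wiring in `app.py` is outside this diff, so treat the calls below as illustrative):

```python
import gradio as gr

from src.about import INTRODUCTION_TEXT, TITLE

with gr.Blocks() as demo:
    gr.HTML(TITLE)                   # the <h1> title string
    gr.Markdown(INTRODUCTION_TEXT)   # rendered as-is; no .format() slot remains

demo.launch()
```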
src/envs.py CHANGED
@@ -9,9 +9,9 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"MJ-Bench/MJ-Bench-Leaderboard"
-QUEUE_REPO = f"MJ-Bench/MJ-Bench-Requests"
-RESULTS_REPO = f"MJ-Bench/MJ-Bench-Results"
+REPO_ID = f"MMIE/MMIE-Leaderboard"
+QUEUE_REPO = f"MMIE/MMIE-Requests"
+RESULTS_REPO = f"MMIE/MMIE-Results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
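These three constants point the Space at its Hub repositories; the `f` prefixes are vestigial, since the strings contain no interpolation. A minimal sketch of how leaderboard Spaces typically mirror the queue and results repos at startup, assuming `app.py` uses `huggingface_hub.snapshot_download` (the local paths below are illustrative):

```python
from huggingface_hub import snapshot_download

from src.envs import CACHE_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN

# Pull both dataset repos into the local cache before building the UI.
snapshot_download(repo_id=QUEUE_REPO, repo_type="dataset",
                  local_dir=f"{CACHE_PATH}/eval-queue", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, repo_type="dataset",
                  local_dir=f"{CACHE_PATH}/eval-results", token=TOKEN)
```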
src/logo.png ADDED