File size: 7,817 Bytes
3c75092
b1f12bd
b0464d1
14d9c3b
3c75092
5459fa4
 
 
 
 
3c75092
 
 
1018e52
 
 
0302c93
3c75092
 
efef417
779f80b
3c75092
 
14d9c3b
3647aad
 
 
 
 
 
 
7f008cb
0302c93
e401827
3647aad
 
 
 
0302c93
e06d81a
3c75092
 
 
 
 
f65a20c
0302c93
 
 
 
 
 
 
3c75092
 
f65a20c
0302c93
 
f65a20c
 
3c75092
 
0302c93
a59a0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c75092
 
 
0302c93
a59a0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c75092
 
0302c93
a59a0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f65a20c
 
0302c93
a59a0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
e2f94e9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# CONSTANTS-URL
# Remote JSON asset.
# NOTE(review): the path names an OpenVLM asset — confirm it is still the
# intended source for this leaderboard.
URL = 'http://opencompass.openxlab.space/assets/OpenVLM.json'
# Name of the JSON file holding the overall Shopping MMLU results.
RESULTS = "ShoppingMMLU_overall.json"
# Raw README of the ShoppingMMLU GitHub repository (split only for line length).
SHOPPINGMMLU_README = (
    "https://raw.githubusercontent.com/KL4805/ShoppingMMLU"
    "/refs/heads/main/README.md"
)
# CONSTANTS-CITATION
# BibTeX entry for the Shopping MMLU paper, assembled from its individual
# lines and joined with newlines (no trailing newline after the closing brace).
_CITATION_LINES = [
    "@article{jin2024shopping,",
    "  title={Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models},",
    "  author={Jin, Yilun and Li, Zheng and Zhang, Chenwei and Cao, Tianyu and Gao, Yifan and Jayarao, Pratik and Li, Mao and Liu, Xin and Sarkhel, Ritesh and Tang, Xianfeng and others},",
    "  journal={arXiv preprint arXiv:2410.20745},",
    "  year={2024}",
    "}",
]
CITATION_BUTTON_TEXT = "\n".join(_CITATION_LINES)
# Caption displayed next to the citation snippet.
CITATION_BUTTON_LABEL = 'Copy the following snippet to cite these results'
# CONSTANTS-TEXT
# Markdown header for the leaderboard page. It contains three positional `{}`
# placeholders, in this order: number of LLMs covered, number of main shopping
# skills, and the last-updated stamp (presumably filled with `str.format` by
# the caller — confirm).
# NOTE(review): the two "[email protected]" contacts look like addresses that
# were masked by e-mail obfuscation; restore the real addresses if so.
LEADERBORAD_INTRODUCTION = """# Shopping MMLU Leaderboard
### Welcome to Shopping MMLU Leaderboard! On this leaderboard we share the evaluation results of LLMs obtained by the OpenSource Framework:
### [Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models](https://github.com/KL4805/ShoppingMMLU) 🏆
### Currently, Shopping MMLU Leaderboard covers {} different LLMs and {} main online shopping skills. 

This leaderboard was last updated: {}. 

Shopping MMLU Leaderboard only includes open-source LLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [Shopping MMLU](https://github.com/KL4805/ShoppingMMLU) to support your LLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [email protected] and [email protected].
"""
# Correctly spelled alias. The misspelled name above is kept so existing
# imports of `LEADERBORAD_INTRODUCTION` keep working.
LEADERBOARD_INTRODUCTION = LEADERBORAD_INTRODUCTION
# CONSTANTS-FIELDS
# Names of the descriptive (non-score) fields.
META_FIELDS = ["Method", "Param (B)", "OpenSource", "Verified"]

# Disabled earlier value of MAIN_FIELDS, kept for reference:
# MAIN_FIELDS = [
#     'MMBench_V11', 'MMStar', 'MME',
#     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
#     'HallusionBench', 'SEEDBench_IMG', 'MMVet',
#     'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
#     'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
# ]

# The four main online shopping skills.
MAIN_FIELDS = [
    "Shopping Concept Understanding",
    "Shopping Knowledge Reasoning",
    "User Behavior Alignment",
    "Multi-lingual Abilities",
]

# Disabled earlier value of DEFAULT_BENCH, kept for reference:
# DEFAULT_BENCH = [
#     'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
#     'HallusionBench', 'MMVet'
# ]

# Currently lists the same four skills as MAIN_FIELDS, as a separate list object.
DEFAULT_BENCH = [
    "Shopping Concept Understanding",
    "Shopping Knowledge Reasoning",
    "User Behavior Alignment",
    "Multi-lingual Abilities",
]

# Parameter-count bucket labels and model-type labels.
MODEL_SIZE = ["<4B", "4B-10B", "10B-20B", "20B-40B", ">40B", "Unknown"]
MODEL_TYPE = ["API", "OpenSource", "Proprietary"]

# The README file for each benchmark
# Keyed by 'MAIN' and by skill name; each value is a Markdown string.
LEADERBOARD_MD = {}

# Plain (non-f) string on purpose: the text has no placeholders, and an
# f-prefix would make any literal brace added later be evaluated as code.
# Trailing spaces inside the text are kept as they are significant to Markdown.
LEADERBOARD_MD['MAIN'] = """
## Included Shopping Skills: 

- Shopping Concept Understanding: Understanding domain-specific short texts in online shopping (e.g. brands, product models). 
- Shopping Knowledge Reasoning: Reasoning over commonsense, numeric, and implicit product-product multi-hop knowledge. 
- User Behavior Alignment: Modeling heterogeneous and implicit user behaviors (e.g. click, query, purchase).  
- Multi-lingual Abilities: Online shopping across marketplaces around the globe. 

## Main Evaluation Results

- Metrics:
  - Avg Score: The average score on all 4 online shopping skills (normalized to 0 - 100, the higher the better). 
  - Detailed metrics and evaluation results for each skill are provided in the subsequent tabs. 
"""



# Markdown description for the 'Shopping Concept Understanding' skill:
# a short rationale followed by the list of sub-skills and their tasks.
# Trailing spaces inside the text are kept unchanged.
LEADERBOARD_MD['Shopping Concept Understanding'] = """
## Shopping Concept Understanding Evaluation Results

Online shopping concepts such as brands and product models are domain-specific and not often seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs) and thus no sufficient contexts are given to help understand them. Hence, failing to understand these concepts compromises the performance of LLMs on downstream tasks.

The included sub-skills and tasks include: 
- **Concept Normalization**: 
  - Product Category Synonym
  - Attribute Value Synonym
- **Elaboration**: 
  - Attribute Explanation
  - Product Category Explanation
- **Relational Inference**: 
  - Applicable Attribute to Product Category
  - Applicable Product Category to Attribute
  - Inapplicable Attributes
  - Valid Attribute Value Given Attribute and Product Category
  - Valid Attribute Given Attribute Value and Product Category
  - Product Category Classification
  - Product Category Generation
- **Sentiment Analysis**: 
  - Aspect-based Sentiment Classification
  - Aspect-based Review Retrieval
  - Aspect-based Review Selection
  - Aspect-based Reviews Overall Sentiment Classification
- **Information Extraction**: 
  - Attribute Value Extraction
  - Query Named Entity Recognition
  - Aspect-based Review Keyphrase Selection
  - Aspect-based Review Keyphrase Extraction
- **Summarization**: 
  - Attribute Naming from Description
  - Product Category Naming from Description
  - Review Aspect Retrieval
  - Single Conversation Topic Selection
  - Multi-Conversation Topic Retrieval
  - Product Keyphrase Selection
  - Product Keyphrase Retrieval
  - Product Title Generation
"""


# Markdown description for the 'Shopping Knowledge Reasoning' skill:
# a short rationale followed by the list of sub-skills and their tasks.
# Trailing spaces inside the text are kept unchanged.
LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
## Shopping Knowledge Reasoning Evaluation Results

This skill focuses on understanding and applying various implicit knowledge to perform reasoning over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph. 

The included sub-skills and tasks include: 
- **Numeric Reasoning**: 
  - Unit Conversion
  - Product Numeric Reasoning
- **Commonsense Reasoning**
- **Implicit Multi-Hop Reasoning**:
  - Product Compatibility
  - Complementary Product Categories
  - Implicit Attribute Reasoning
  - Related Brands Selection
  - Related Brands Retrieval
"""

# Markdown description for the 'User Behavior Alignment' skill: a short
# rationale followed by the list of sub-skills and their tasks.
# The literal is whitespace-sensitive (several lines end in a space), so it
# is left exactly as written.
LEADERBOARD_MD['User Behavior Alignment'] = """
## User Behavior Alignment Evaluation Results

Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text. 

Consequently, LLMs trained with general texts encounter challenges in aligning with the heterogeneous and implicit user behaviors as they rarely observe such inputs during pre-training.

The included sub-skills and tasks include: 
- **Query-Query Relations**: 
  - Query Re-Writing
  - Query-Query Intention Selection
  - Intention-Based Related Query Retrieval
- **Query-Product Relations**: 
  - Product Category Selection for Query
  - Query-Product Relation Selection
  - Query-Product Ranking
- **Sessions**: 
  - Session-based Query Recommendation
  - Session-based Next Query Selection
  - Session-based Next Product Selection
- **Purchases**:
  - Product Co-Purchase Selection
  - Product Co-Purchase Retrieval
- **Reviews and QA**: 
  - Review Rating Prediction 
  - Aspect-Sentiment-Based Review Generation
  - Review Helpfulness Selection
  - Product-Based Question Answering
"""

# Markdown description for the 'Multi-lingual Abilities' skill: a one-line
# rationale followed by the list of sub-skills and their tasks.
# The literal is whitespace-sensitive (several lines end in a space), so it
# is left exactly as written.
LEADERBOARD_MD['Multi-lingual Abilities'] = """
## Multi-lingual Abilities Evaluation Results

Multi-lingual models are desired in online shopping as they can be deployed in multiple marketplaces without re-training.

The included sub-skills and tasks include: 
- **Multi-lingual Shopping Concept Understanding**: 
  - Multi-lingual Product Title Generation
  - Multi-lingual Product Keyphrase Selection
  - Cross-lingual Product Title Translation
  - Cross-lingual Product Entity Alignment
- **Multi-lingual User Behavior Alignment**: 
  - Multi-lingual Query-product Relation Selection
  - Multi-lingual Query-product Ranking
  - Multi-lingual Session-based Product Recommendation 
"""