Spaces:
Runtime error
Runtime error
File size: 7,817 Bytes
3c75092 b1f12bd b0464d1 14d9c3b 3c75092 5459fa4 3c75092 1018e52 0302c93 3c75092 efef417 779f80b 3c75092 14d9c3b 3647aad 7f008cb 0302c93 e401827 3647aad 0302c93 e06d81a 3c75092 f65a20c 0302c93 3c75092 f65a20c 0302c93 f65a20c 3c75092 0302c93 a59a0d0 3c75092 0302c93 a59a0d0 3c75092 0302c93 a59a0d0 f65a20c 0302c93 a59a0d0 e2f94e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# CONSTANTS-URL
# Remote JSON endpoint.
# NOTE(review): the path still names OpenVLM.json rather than a Shopping MMLU
# asset -- confirm this is the intended source (or whether it is still used).
URL = 'http://opencompass.openxlab.space/assets/OpenVLM.json'
# Name of the overall-results JSON file (presumably resolved relative to the
# app's working directory -- verify against the loader).
RESULTS = "ShoppingMMLU_overall.json"
# Raw README of the KL4805/ShoppingMMLU GitHub repository, main branch.
SHOPPINGMMLU_README = (
    "https://raw.githubusercontent.com/"
    "KL4805/ShoppingMMLU/refs/heads/main/README.md"
)
# CONSTANTS-CITATION
# BibTeX entry for the Shopping MMLU paper, assembled line by line. The text
# contains no backslashes, so plain (non-raw) string literals yield the same
# value as a raw string would.
CITATION_BUTTON_TEXT = "\n".join([
    "@article{jin2024shopping,",
    "title={Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models},",
    "author={Jin, Yilun and Li, Zheng and Zhang, Chenwei and Cao, Tianyu and Gao, Yifan and Jayarao, Pratik and Li, Mao and Liu, Xin and Sarkhel, Ritesh and Tang, Xianfeng and others},",
    "journal={arXiv preprint arXiv:2410.20745},",
    "year={2024}",
    "}",
])
# Caption that accompanies the citation snippet.
CITATION_BUTTON_LABEL = 'Copy the following snippet to cite these results'
# CONSTANTS-TEXT
# Markdown header of the leaderboard page. It carries three positional `{}`
# placeholders, in this order: number of LLMs covered, number of main shopping
# skills, and the last-update time (presumably filled with str.format by the
# caller -- verify against the UI code).
# NOTE(review): the contact addresses below read "[email protected]", which
# looks like an e-mail obfuscation placeholder rather than real addresses --
# confirm and restore the intended contacts.
LEADERBORAD_INTRODUCTION = """# Shopping MMLU Leaderboard
### Welcome to Shopping MMLU Leaderboard! On this leaderboard we share the evaluation results of LLMs obtained by the OpenSource Framework:
### [Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models](https://github.com/KL4805/ShoppingMMLU) 🏆
### Currently, Shopping MMLU Leaderboard covers {} different LLMs and {} main online shopping skills.
This leaderboard was last updated: {}.
Shopping MMLU Leaderboard only includes open-source LLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [Shopping MMLU](https://github.com/KL4805/ShoppingMMLU) to support your LLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [email protected] and [email protected].
"""
# Correctly spelled alias. The misspelled name above is kept because other
# modules may already import it.
LEADERBOARD_INTRODUCTION = LEADERBORAD_INTRODUCTION
# CONSTANTS-FIELDS
# Names of the non-score fields (presumably the metadata columns of the
# results table -- verify against the table builder).
META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']

# The four Shopping MMLU skills. These strings are also used as keys of
# LEADERBOARD_MD, so they must stay in sync with it.
MAIN_FIELDS = [
    'Shopping Concept Understanding',
    'Shopping Knowledge Reasoning',
    'User Behavior Alignment',
    'Multi-lingual Abilities',
]

# Same four skills as MAIN_FIELDS. Derived instead of repeated so the two
# lists cannot drift apart; list() keeps it an independent object, so mutating
# one does not affect the other.
DEFAULT_BENCH = list(MAIN_FIELDS)

# Parameter-count buckets (in billions) plus a bucket for unknown sizes.
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
# Model availability categories.
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
# The README file for each benchmark
# Maps a tab name ('MAIN' or one of the four skill names) to the markdown
# text for that tab.
LEADERBOARD_MD = {}
# Overview text. This is a plain string: it has no placeholders, so the
# original `f` prefix did nothing and would make any future literal brace in
# the text a formatting hazard.
LEADERBOARD_MD['MAIN'] = """
## Included Shopping Skills:
- Shopping Concept Understanding: Understanding domain-specific short texts in online shopping (e.g. brands, product models).
- Shopping Knowledge Reasoning: Reasoning over commonsense, numeric, and implicit product-product multi-hop knowledge.
- User Behavior Alignment: Modeling heterogeneous and implicit user behaviors (e.g. click, query, purchase).
- Multi-lingual Abilities: Online shopping across marketplaces around the globe.
## Main Evaluation Results
- Metrics:
- Avg Score: The average score on all 4 online shopping skills (normalized to 0 - 100, the higher the better).
- Detailed metrics and evaluation results for each skill are provided in the subsequent tabs.
"""
# Markdown for the 'Shopping Concept Understanding' tab: a short motivation
# followed by the list of sub-skills (bold) and their tasks.
# NOTE(review): task bullets sit at the same list level as the bold sub-skill
# headings, so they render as one flat list -- confirm whether nesting was
# intended.
LEADERBOARD_MD['Shopping Concept Understanding'] = """
## Shopping Concept Understanding Evaluation Results
Online shopping concepts such as brands and product models are domain-specific and not often seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs) and thus no sufficient contexts are given to help understand them. Hence, failing to understand these concepts compromises the performance of LLMs on downstream tasks.
The included sub-skills and tasks include:
- **Concept Normalization**:
- Product Category Synonym
- Attribute Value Synonym
- **Elaboration**:
- Attribute Explanation
- Product Category Explanation
- **Relational Inference**:
- Applicable Attribute to Product Category
- Applicable Product Category to Attribute
- Inapplicable Attributes
- Valid Attribute Value Given Attribute and Product Category
- Valid Attribute Given Attribute Value and Product Category
- Product Category Classification
- Product Category Generation
- **Sentiment Analysis**:
- Aspect-based Sentiment Classification
- Aspect-based Review Retrieval
- Aspect-based Review Selection
- Aspect-based Reviews Overall Sentiment Classification
- **Information Extraction**:
- Attribute Value Extraction
- Query Named Entity Recognition
- Aspect-based Review Keyphrase Selection
- Aspect-based Review Keyphrase Extraction
- **Summarization**:
- Attribute Naming from Description
- Product Category Naming from Description
- Review Aspect Retrieval
- Single Conversation Topic Selection
- Multi-Conversation Topic Retrieval
- Product Keyphrase Selection
- Product Keyphrase Retrieval
- Product Title Generation
"""
# Markdown for the 'Shopping Knowledge Reasoning' tab: a short motivation
# followed by the list of sub-skills (bold) and their tasks.
# "Unit Conversion" replaces the earlier "Unit Conversation", a typo for the
# numeric-reasoning task.
LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
## Shopping Knowledge Reasoning Evaluation Results
This skill focuses on understanding and applying various implicit knowledge to perform reasoning over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph.
The included sub-skills and tasks include:
- **Numeric Reasoning**:
- Unit Conversion
- Product Numeric Reasoning
- **Commonsense Reasoning**
- **Implicit Multi-Hop Reasoning**:
- Product Compatibility
- Complementary Product Categories
- Implicit Attribute Reasoning
- Related Brands Selection
- Related Brands Retrieval
"""
# Markdown for the 'User Behavior Alignment' tab, assembled one line per
# element. The empty first and last elements reproduce the leading and
# trailing newline of the text.
LEADERBOARD_MD['User Behavior Alignment'] = "\n".join((
    "",
    "## User Behavior Alignment Evaluation Results",
    "Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text.",
    "Consequently, LLMs trained with general texts encounter challenges in aligning with the heterogeneous and implicit user behaviors as they rarely observe such inputs during pre-training.",
    "The included sub-skills and tasks include:",
    # Sub-skill: relations between queries.
    "- **Query-Query Relations**:",
    "- Query Re-Writing",
    "- Query-Query Intention Selection",
    "- Intention-Based Related Query Retrieval",
    # Sub-skill: relations between queries and products.
    "- **Query-Product Relations**:",
    "- Product Category Selection for Query",
    "- Query-Product Relation Selection",
    "- Query-Product Ranking",
    # Sub-skill: session-level behavior.
    "- **Sessions**:",
    "- Session-based Query Recommendation",
    "- Session-based Next Query Selection",
    "- Session-based Next Product Selection",
    # Sub-skill: purchase behavior.
    "- **Purchases**:",
    "- Product Co-Purchase Selection",
    "- Product Co-Purchase Retrieval",
    # Sub-skill: reviews and question answering.
    "- **Reviews and QA**:",
    "- Review Rating Prediction",
    "- Aspect-Sentiment-Based Review Generation",
    "- Review Helpfulness Selection",
    "- Product-Based Question Answering",
    "",
))
# Markdown for the 'Multi-lingual Abilities' tab, assembled one line per
# element. The empty first and last elements reproduce the leading and
# trailing newline of the text.
LEADERBOARD_MD['Multi-lingual Abilities'] = "\n".join((
    "",
    "## Multi-lingual Abilities Evaluation Results",
    "Multi-lingual models are desired in online shopping as they can be deployed in multiple marketplaces without re-training.",
    "The included sub-skills and tasks include:",
    # Sub-skill: concept understanding across languages.
    "- **Multi-lingual Shopping Concept Understanding**:",
    "- Multi-lingual Product Title Generation",
    "- Multi-lingual Product Keyphrase Selection",
    "- Cross-lingual Product Title Translation",
    "- Cross-lingual Product Entity Alignment",
    # Sub-skill: user behavior alignment across languages.
    "- **Multi-lingual User Behavior Alignment**:",
    "- Multi-lingual Query-product Relation Selection",
    "- Multi-lingual Query-product Ranking",
    "- Multi-lingual Session-based Product Recommendation",
    "",
))
|