Update scripts/eval_mteb.py
Browse files- scripts/eval_mteb.py +63 -2
scripts/eval_mteb.py
CHANGED
@@ -119,7 +119,19 @@ CMTEB_TASK_LIST = ['TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'Onl
|
|
119 |
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
|
120 |
'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
|
121 |
|
|
|
|
|
|
|
|
|
|
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
logging.basicConfig(
|
125 |
level=logging.INFO,
|
@@ -136,7 +148,6 @@ def get_detailed_instruct(task_description: str) -> str:
|
|
136 |
|
137 |
def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_instruct='Given a web search query, retrieve relevant passages that answer the query') -> str:
|
138 |
if task_type in ['STS']:
|
139 |
-
# return "Given a premise, retrieve a hypothesis that is entailed by the premise."
|
140 |
return "Retrieve semantically similar text"
|
141 |
|
142 |
if task_type in ['Summarization']:
|
@@ -166,6 +177,13 @@ def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_i
|
|
166 |
'JDReview': 'Classify the customer review for iPhone on e-commerce platform into positive or negative',
|
167 |
'OnlineShopping': 'Classify the customer review for online shopping into positive or negative',
|
168 |
'Waimai': 'Classify the customer review from a food takeaway platform into positive or negative',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
}
|
170 |
return task_name_to_instruct[task_name]
|
171 |
|
@@ -187,6 +205,16 @@ def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_i
|
|
187 |
'CLSClusteringP2P': 'Identify the main category of scholar papers based on the titles and abstracts',
|
188 |
'ThuNewsClusteringS2S': 'Identify the topic or theme of the given news articles based on the titles',
|
189 |
'ThuNewsClusteringP2P': 'Identify the topic or theme of the given news articles based on the titles and contents',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
}
|
191 |
return task_name_to_instruct[task_name]
|
192 |
|
@@ -206,6 +234,16 @@ def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_i
|
|
206 |
'CMedQAv2': 'Given a Chinese community medical question, retrieve replies that best answer the question',
|
207 |
'Ocnli': 'Retrieve semantically similar text.',
|
208 |
'Cmnli': 'Retrieve semantically similar text.',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
}
|
210 |
return task_name_to_instruct[task_name]
|
211 |
|
@@ -237,6 +275,24 @@ def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_i
|
|
237 |
'EcomRetrieval': 'Given a user query from an e-commerce website, retrieve description sentences of relevant products',
|
238 |
'MedicalRetrieval': 'Given a medical question, retrieve user replies that best answer the question',
|
239 |
'VideoRetrieval': 'Given a video search query, retrieve the titles of relevant videos',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
}
|
241 |
|
242 |
# add lower case keys to match some beir names
|
@@ -629,9 +685,14 @@ def main(args):
|
|
629 |
elif args.task == 'cmteb':
|
630 |
task_names = CMTEB_TASK_LIST
|
631 |
lang = ['zh','zh-CN']
|
|
|
|
|
|
|
|
|
|
|
632 |
else:
|
633 |
task_names = [args.task]
|
634 |
-
lang = ['en','zh','zh-CN']
|
635 |
for task in task_names:
|
636 |
evaluation = MTEB(tasks=[task], task_langs=lang)
|
637 |
task_cls = evaluation.tasks[0]
|
|
|
119 |
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
|
120 |
'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
|
121 |
|
122 |
+
MTEB_PL = [
|
123 |
+
"CBD","PolEmo2.0-IN","PolEmo2.0-OUT","AllegroReviews","PAC","MassiveIntentClassification","MassiveScenarioClassification",
|
124 |
+
"SICK-E-PL","PPC","CDSC-E","PSC","8TagsClustering","SICK-R-PL","CDSC-R","STS22",
|
125 |
+
"ArguAna-PL","DBPedia-PL","FiQA-PL","HotpotQA-PL","MSMARCO-PL","NFCorpus-PL","NQ-PL","Quora-PL","SCIDOCS-PL","SciFact-PL","TRECCOVID-PL"
|
126 |
+
]
|
127 |
|
128 |
+
MTEB_FR = [
|
129 |
+
"AmazonReviewsClassification","MasakhaNEWSClassification","MassiveIntentClassification",
|
130 |
+
"MassiveScenarioClassification","MTOPDomainClassification","MTOPIntentClassification","OpusparcusPC","PawsX",
|
131 |
+
"AlloProfClusteringP2P","AlloProfClusteringS2S","HALClusteringS2S","MasakhaNEWSClusteringP2P","MasakhaNEWSClusteringS2S","MLSUMClusteringP2P","MLSUMClusteringS2S",
|
132 |
+
"SyntecReranking","AlloprofReranking","AlloprofRetrieval","BSARDRetrieval","SyntecRetrieval","XPQARetrieval","MintakaRetrieval",
|
133 |
+
"SummEvalFr","STSBenchmarkMultilingualSTS","STS22","SICKFr"
|
134 |
+
]
|
135 |
|
136 |
logging.basicConfig(
|
137 |
level=logging.INFO,
|
|
|
148 |
|
149 |
def get_task_def_by_task_name_and_type(task_name: str, task_type: str, default_instruct='Given a web search query, retrieve relevant passages that answer the query') -> str:
|
150 |
if task_type in ['STS']:
|
|
|
151 |
return "Retrieve semantically similar text"
|
152 |
|
153 |
if task_type in ['Summarization']:
|
|
|
177 |
'JDReview': 'Classify the customer review for iPhone on e-commerce platform into positive or negative',
|
178 |
'OnlineShopping': 'Classify the customer review for online shopping into positive or negative',
|
179 |
'Waimai': 'Classify the customer review from a food takeaway platform into positive or negative',
|
180 |
+
# MTEB-pl eval instructions
|
181 |
+
"CBD":"Classify the sentiment of polish tweet reviews",
|
182 |
+
"PolEmo2.0-IN": "Classify the sentiment of in-domain (medicine and hotels) online reviews",
|
183 |
+
"PolEmo2.0-OUT":"Classify the sentiment of out-of-domain (products and school) online reviews",
|
184 |
+
"AllegroReviews": "Classify the sentiment of reviews from e-commerce marketplace Allegro",
|
185 |
+
"PAC": "Classify the sentence into one of the two types: \"BEZPIECZNE_POSTANOWIENIE_UMOWNE\" and \"KLAUZULA_ABUZYWNA\"",
|
186 |
+
|
187 |
}
|
188 |
return task_name_to_instruct[task_name]
|
189 |
|
|
|
205 |
'CLSClusteringP2P': 'Identify the main category of scholar papers based on the titles and abstracts',
|
206 |
'ThuNewsClusteringS2S': 'Identify the topic or theme of the given news articles based on the titles',
|
207 |
'ThuNewsClusteringP2P': 'Identify the topic or theme of the given news articles based on the titles and contents',
|
208 |
+
# MTEB-fr eval instructions
|
209 |
+
"AlloProfClusteringP2P": "Identify the main category of Allo Prof document based on the titles and descriptions",
|
210 |
+
"AlloProfClusteringS2S": "Identify the main category of Allo Prof document based on the titles",
|
211 |
+
"HALClusteringS2S": "Identify the main category of academic passage based on the titles and contents",
|
212 |
+
"MasakhaNEWSClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
|
213 |
+
"MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
|
214 |
+
"MLSUMClusteringP2P": "Identify the topic or theme of the given articles based on the titles and contents",
|
215 |
+
"MLSUMClusteringS2S": "Identify the topic or theme of the given articles based on the titles",
|
216 |
+
# MTEB-pl eval instructions
|
217 |
+
"8TagsClustering": "Identify of headlines from social media posts in Polish into 8 categories: film, history, food, medicine, motorization, work, sport and technology",
|
218 |
}
|
219 |
return task_name_to_instruct[task_name]
|
220 |
|
|
|
234 |
'CMedQAv2': 'Given a Chinese community medical question, retrieve replies that best answer the question',
|
235 |
'Ocnli': 'Retrieve semantically similar text.',
|
236 |
'Cmnli': 'Retrieve semantically similar text.',
|
237 |
+
# MTEB-fr eval instructions
|
238 |
+
"AlloprofReranking": "Given a question, retrieve passages that answer the question",
|
239 |
+
"OpusparcusPC":"Retrieve semantically similar text",
|
240 |
+
"PawsX":"Retrieve semantically similar text",
|
241 |
+
"SyntecReranking": "Given a question, retrieve passages that answer the question",
|
242 |
+
# MTEB-pl eval instructions
|
243 |
+
"SICK-E-PL": "Retrieve semantically similar text",
|
244 |
+
"PPC": "Retrieve semantically similar text",
|
245 |
+
"CDSC-E": "Retrieve semantically similar text",
|
246 |
+
"PSC": "Retrieve semantically similar text",
|
247 |
}
|
248 |
return task_name_to_instruct[task_name]
|
249 |
|
|
|
275 |
'EcomRetrieval': 'Given a user query from an e-commerce website, retrieve description sentences of relevant products',
|
276 |
'MedicalRetrieval': 'Given a medical question, retrieve user replies that best answer the question',
|
277 |
'VideoRetrieval': 'Given a video search query, retrieve the titles of relevant videos',
|
278 |
+
# MTEB-fr eval instructions
|
279 |
+
"AlloprofRetrieval": "Given a question, retrieve passages that answer the question",
|
280 |
+
"BSARDRetrieval": "Given a question, retrieve passages that answer the question",
|
281 |
+
"SyntecRetrieval": "Given a question, retrieve passages that answer the question",
|
282 |
+
"XPQARetrieval": "Given a question, retrieve passages that answer the question",
|
283 |
+
"MintakaRetrieval": "Given a question, retrieve passages that answer the question",
|
284 |
+
# MTEB-pl eval instructions
|
285 |
+
"ArguAna-PL": "Given a claim, find documents that refute the claim",
|
286 |
+
"DBPedia-PL": "Given a query, retrieve relevant entity descriptions from DBPedia",
|
287 |
+
"FiQA-PL": "Given a financial question, retrieve user replies that best answer the question",
|
288 |
+
"HotpotQA-PL": "Given a multi-hop question, retrieve documents that can help answer the question",
|
289 |
+
"MSMARCO-PL": "Given a web search query, retrieve relevant passages that answer the query",
|
290 |
+
"NFCorpus-PL": "Given a question, retrieve relevant documents that best answer the question",
|
291 |
+
"NQ-PL": "Given a question, retrieve Wikipedia passages that answer the question",
|
292 |
+
"Quora-PL": "Given a question, retrieve questions that are semantically equivalent to the given question",
|
293 |
+
"SCIDOCS-PL": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper",
|
294 |
+
"SciFact-PL": "Given a scientific claim, retrieve documents that support or refute the claim",
|
295 |
+
"TRECCOVID-PL": "Given a query on COVID-19, retrieve documents that answer the query"
|
296 |
}
|
297 |
|
298 |
# add lower case keys to match some beir names
|
|
|
685 |
elif args.task == 'cmteb':
|
686 |
task_names = CMTEB_TASK_LIST
|
687 |
lang = ['zh','zh-CN']
|
688 |
+
elif args.task == 'mteb-fr':
|
689 |
+
task_names = MTEB_FR
|
690 |
+
lang = ['fr']
|
691 |
+
elif args.task == 'mteb-pl':
|
692 |
+
task_names = MTEB_PL
lang = ['pl']
|
693 |
else:
|
694 |
task_names = [args.task]
|
695 |
+
lang = ['en','zh','zh-CN','pl','fr']
|
696 |
for task in task_names:
|
697 |
evaluation = MTEB(tasks=[task], task_langs=lang)
|
698 |
task_cls = evaluation.tasks[0]
|