# Global leaderboard settings.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped; the flattened source did not parse as YAML.
config:
  # presumably the id of the hosted leaderboard app — confirm in the consumer
  REPO_ID: "mteb/leaderboard"
  # presumably the repository the result files are read from — confirm
  RESULTS_REPO: "mteb/results"
  LEADERBOARD_NAME: "MTEB Leaderboard"
# Per-task-type display metadata: an icon, the headline metric key, and two
# human-readable descriptions.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped; the flattened source did not parse as YAML.
# NOTE(review): several values arrived damaged — icons were mis-decoded
# (UTF-8 bytes shown through a single-byte Greek code page) and Markdown
# links are cut off after "(". Only glyphs that the surviving bytes identify
# were repaired; everything marked "damaged" is kept exactly as found and
# must be restored from the upstream file.
tasks:
  BitextMining:
    icon: "π"  # damaged: glyph not recoverable from the surviving bytes
    metric: f1
    metric_description: "[F1]("  # damaged: link target truncated
    task_description: "Bitext mining is the task of finding parallel sentences in two languages."
  Classification:
    icon: "❤️"
    metric: accuracy
    metric_description: "[Accuracy]("  # damaged: link target truncated
    task_description: "Classification is the task of assigning a label to a text."
  Clustering:
    icon: "✨"
    metric: v_measure
    metric_description: "Validity Measure (V-measure)"
    task_description: "Clustering is the task of grouping similar documents together."
  PairClassification:
    icon: "π"  # damaged: glyph not recoverable from the surviving bytes
    metric: max_ap
    metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
    task_description: "Pair classification is the task of determining whether two texts are similar."
  Reranking:
    icon: "π₯"  # damaged: glyph not recoverable from the surviving bytes
    metric: map
    metric_description: "Mean Average Precision (MAP)"
    task_description: "Reranking is the task of reordering a list of documents to improve relevance."
  Retrieval:
    icon: "π"  # damaged: glyph not recoverable from the surviving bytes
    metric: ndcg_at_10
    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
    task_description: "Retrieval is the task of finding relevant documents for a query."
  STS:
    icon: "βοΈ"  # damaged: glyph not recoverable from the surviving bytes
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
  Summarization:
    icon: "π"  # damaged: glyph not recoverable from the surviving bytes
    # Same metric key and description as STS.
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Summarization is the task of generating a summary of a text."
  MultilabelClassification:
    icon: "🏷️"
    metric: accuracy
    metric_description: "Accuracy"
    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
  InstructionRetrieval:
    icon: "ππ"  # damaged: glyphs not recoverable from the surviving bytes
    metric: "p-MRR"
    metric_description: "paired mean reciprocal rank (p-MRR)"
    task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
# Leaderboard boards (tabs), keyed by board id.
# NOTE(review): the flattened source lost its indentation; the board entries
# that follow this key are assumed to nest under it — confirm when re-indenting.
boards:
# Board "en": English tasks, grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
en:
  title: English
  language_long: "English"
  # presumably enables an aggregate "overall" view for this board — confirm
  has_overall: true
  acronym: null
  icon: null
  special_icons: null
  credits: null
  tasks:
    Classification:
      - AmazonCounterfactualClassification (en)
      - AmazonPolarityClassification
      - AmazonReviewsClassification (en)
      - Banking77Classification
      - EmotionClassification
      - ImdbClassification
      - MassiveIntentClassification (en)
      - MassiveScenarioClassification (en)
      - MTOPDomainClassification (en)
      - MTOPIntentClassification (en)
      - ToxicConversationsClassification
      - TweetSentimentExtractionClassification
    Clustering:
      - ArxivClusteringP2P
      - ArxivClusteringS2S
      - BiorxivClusteringP2P
      - BiorxivClusteringS2S
      - MedrxivClusteringP2P
      - MedrxivClusteringS2S
      - RedditClustering
      - RedditClusteringP2P
      - StackExchangeClustering
      - StackExchangeClusteringP2P
      - TwentyNewsgroupsClustering
    PairClassification:
      - SprintDuplicateQuestions
      - TwitterSemEval2015
      - TwitterURLCorpus
    Reranking:
      - AskUbuntuDupQuestions
      - MindSmallReranking
      - SciDocsRR
      - StackOverflowDupQuestions
    Retrieval:
      - ArguAna
      - ClimateFEVER
      - CQADupstackRetrieval
      - DBPedia
      - FEVER
      - FiQA2018
      - HotpotQA
      - NFCorpus
      - NQ
      - QuoraRetrieval
      - SciFact
      - Touche2020
    STS:
      - SICK-R
      - STS12
      - STS13
      - STS14
      - STS15
      - STS16
      - STS17 (en-en)
      - STS22 (en)
      - STSBenchmark
    Summarization:
      - SummEval
# Board "en-x": bitext-mining pairs of English with another language.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The task list was a single flow sequence wrapped in the middle of a quoted
# entry; it is now a block sequence with the same entries in the same order.
en-x:
  title: "English-X"
  language_long: "117 (Pairs of: English & other language)"
  has_overall: false
  acronym: null
  icon: null
  special_icons: null
  credits: null
  tasks:
    BitextMining:
      - BUCC (de-en)
      - BUCC (fr-en)
      - BUCC (ru-en)
      - BUCC (zh-en)
      - Tatoeba (afr-eng)
      - Tatoeba (amh-eng)
      - Tatoeba (ang-eng)
      - Tatoeba (ara-eng)
      - Tatoeba (arq-eng)
      - Tatoeba (arz-eng)
      - Tatoeba (ast-eng)
      - Tatoeba (awa-eng)
      - Tatoeba (aze-eng)
      - Tatoeba (bel-eng)
      - Tatoeba (ben-eng)
      - Tatoeba (ber-eng)
      - Tatoeba (bos-eng)
      - Tatoeba (bre-eng)
      - Tatoeba (bul-eng)
      - Tatoeba (cat-eng)
      - Tatoeba (cbk-eng)
      - Tatoeba (ceb-eng)
      - Tatoeba (ces-eng)
      - Tatoeba (cha-eng)
      - Tatoeba (cmn-eng)
      - Tatoeba (cor-eng)
      - Tatoeba (csb-eng)
      - Tatoeba (cym-eng)
      - Tatoeba (dan-eng)
      - Tatoeba (deu-eng)
      - Tatoeba (dsb-eng)
      - Tatoeba (dtp-eng)
      - Tatoeba (ell-eng)
      - Tatoeba (epo-eng)
      - Tatoeba (est-eng)
      - Tatoeba (eus-eng)
      - Tatoeba (fao-eng)
      - Tatoeba (fin-eng)
      - Tatoeba (fra-eng)
      - Tatoeba (fry-eng)
      - Tatoeba (gla-eng)
      - Tatoeba (gle-eng)
      - Tatoeba (glg-eng)
      - Tatoeba (gsw-eng)
      - Tatoeba (heb-eng)
      - Tatoeba (hin-eng)
      - Tatoeba (hrv-eng)
      - Tatoeba (hsb-eng)
      - Tatoeba (hun-eng)
      - Tatoeba (hye-eng)
      - Tatoeba (ido-eng)
      - Tatoeba (ile-eng)
      - Tatoeba (ina-eng)
      - Tatoeba (ind-eng)
      - Tatoeba (isl-eng)
      - Tatoeba (ita-eng)
      - Tatoeba (jav-eng)
      - Tatoeba (jpn-eng)
      - Tatoeba (kab-eng)
      - Tatoeba (kat-eng)
      - Tatoeba (kaz-eng)
      - Tatoeba (khm-eng)
      - Tatoeba (kor-eng)
      - Tatoeba (kur-eng)
      - Tatoeba (kzj-eng)
      - Tatoeba (lat-eng)
      - Tatoeba (lfn-eng)
      - Tatoeba (lit-eng)
      - Tatoeba (lvs-eng)
      - Tatoeba (mal-eng)
      - Tatoeba (mar-eng)
      - Tatoeba (max-eng)
      - Tatoeba (mhr-eng)
      - Tatoeba (mkd-eng)
      - Tatoeba (mon-eng)
      - Tatoeba (nds-eng)
      - Tatoeba (nld-eng)
      - Tatoeba (nno-eng)
      - Tatoeba (nob-eng)
      - Tatoeba (nov-eng)
      - Tatoeba (oci-eng)
      - Tatoeba (orv-eng)
      - Tatoeba (pam-eng)
      - Tatoeba (pes-eng)
      - Tatoeba (pms-eng)
      - Tatoeba (pol-eng)
      - Tatoeba (por-eng)
      - Tatoeba (ron-eng)
      - Tatoeba (rus-eng)
      - Tatoeba (slk-eng)
      - Tatoeba (slv-eng)
      - Tatoeba (spa-eng)
      - Tatoeba (sqi-eng)
      - Tatoeba (srp-eng)
      - Tatoeba (swe-eng)
      - Tatoeba (swg-eng)
      - Tatoeba (swh-eng)
      - Tatoeba (tam-eng)
      - Tatoeba (tat-eng)
      - Tatoeba (tel-eng)
      - Tatoeba (tgl-eng)
      - Tatoeba (tha-eng)
      - Tatoeba (tuk-eng)
      - Tatoeba (tur-eng)
      - Tatoeba (tzl-eng)
      - Tatoeba (uig-eng)
      - Tatoeba (ukr-eng)
      - Tatoeba (urd-eng)
      - Tatoeba (uzb-eng)
      - Tatoeba (vie-eng)
      - Tatoeba (war-eng)
      - Tatoeba (wuu-eng)
      - Tatoeba (xho-eng)
      - Tatoeba (yid-eng)
      - Tatoeba (yue-eng)
      - Tatoeba (zsm-eng)
# Board "zh": Chinese tasks (acronym C-MTEB), grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag and heart icons were mis-decoded in the source and are repaired
# from their surviving bytes.
zh:
  title: Chinese
  language_long: Chinese
  has_overall: true
  acronym: C-MTEB
  icon: "🇨🇳"
  # Per-task-type icon overrides for this board.
  special_icons:
    Classification: "🧡"
  credits: "[FlagEmbedding]("  # damaged: link target truncated
  tasks:
    Classification:
      - AmazonReviewsClassification (zh)
      - IFlyTek
      - JDReview
      - MassiveIntentClassification (zh-CN)
      - MassiveScenarioClassification (zh-CN)
      - MultilingualSentiment
      - OnlineShopping
      - TNews
      - Waimai
    Clustering:
      - CLSClusteringP2P
      - CLSClusteringS2S
      - ThuNewsClusteringP2P
      - ThuNewsClusteringS2S
    PairClassification:
      - Cmnli
      - Ocnli
    Reranking:
      - CMedQAv1
      - CMedQAv2
      - MMarcoReranking
      - T2Reranking
    Retrieval:
      - CmedqaRetrieval
      - CovidRetrieval
      - DuRetrieval
      - EcomRetrieval
      - MedicalRetrieval
      - MMarcoRetrieval
      - T2Retrieval
      - VideoRetrieval
    STS:
      - AFQMC
      - ATEC
      - BQ
      - LCQMC
      - PAWSX
      - QBQTC
      - STS22 (zh)
      - STSB
# Board "da": Danish tasks, grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon was mis-decoded in the source and is repaired from its
# surviving bytes.
da:
  title: Danish
  language_long: Danish
  has_overall: false
  acronym: null
  icon: "🇩🇰"
  special_icons:
    Classification: "π€"  # damaged: glyph not recoverable from the surviving bytes
  # damaged: both link targets truncated
  credits: "[Kenneth Enevoldsen](, [scandinavian-embedding-benchmark]("
  tasks:
    BitextMining:
      - BornholmBitextMining
    Classification:
      - AngryTweetsClassification
      - DanishPoliticalCommentsClassification
      - DKHateClassification
      - LccSentimentClassification
      - MassiveIntentClassification (da)
      - MassiveScenarioClassification (da)
      - NordicLangClassification
      - ScalaDaClassification
# Board "fr": French tasks (acronym F-MTEB), grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon was mis-decoded in the source and is repaired from its
# surviving bytes.
fr:
  title: French
  language_long: "French"
  has_overall: true
  acronym: "F-MTEB"
  icon: "🇫🇷"
  special_icons:
    Classification: "π"  # damaged: glyph not recoverable from the surviving bytes
  # damaged: every link target truncated
  credits: "[Lyon-NLP]( [Gabriel Sequeira](, [Imene Kerboua](, [Wissam Siblini](, [Mathieu Ciancone](, [Marion Schaeffer]("
  tasks:
    Classification:
      - AmazonReviewsClassification (fr)
      - MasakhaNEWSClassification (fra)
      - MassiveIntentClassification (fr)
      - MassiveScenarioClassification (fr)
      - MTOPDomainClassification (fr)
      - MTOPIntentClassification (fr)
    Clustering:
      - AlloProfClusteringP2P
      - AlloProfClusteringS2S
      - HALClusteringS2S
      - MLSUMClusteringP2P (fr)
      - MLSUMClusteringS2S (fr)
      - MasakhaNEWSClusteringP2P (fra)
      - MasakhaNEWSClusteringS2S (fra)
    PairClassification:
      - OpusparcusPC (fr)
      - PawsXPairClassification (fr)
    Reranking:
      - AlloprofReranking
      - SyntecReranking
    Retrieval:
      - AlloprofRetrieval
      - BSARDRetrieval
      - MintakaRetrieval (fr)
      - SyntecRetrieval
      - XPQARetrieval (fr)
    STS:
      - STS22 (fr)
      - STSBenchmarkMultilingualSTS (fr)
      - SICKFr
    Summarization:
      - SummEvalFr
# Board "no": Norwegian Bokmål classification tasks.
# The key stays quoted: a bare `no` is read as boolean false by YAML 1.1
# parsers.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# "Bokmål" and the flag icon were mis-decoded in the source and are repaired
# from their surviving bytes.
'no':
  title: Norwegian
  language_long: "Norwegian Bokmål"
  has_overall: false
  acronym: null
  icon: "🇳🇴"
  special_icons:
    Classification: "π"  # damaged: glyph not recoverable from the surviving bytes
  # damaged: both link targets truncated
  credits: "[Kenneth Enevoldsen](, [scandinavian-embedding-benchmark]("
  tasks:
    Classification:
      - NoRecClassification
      - NordicLangClassification
      - NorwegianParliament
      - MassiveIntentClassification (nb)
      - MassiveScenarioClassification (nb)
      - ScalaNbClassification
# Board "instructions": English instruction-following retrieval tasks.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
instructions:
  title: English
  language_long: "English"
  has_overall: false
  acronym: null
  icon: null
  # Added: this was the only board without the key; every other board sets
  # it, using `null` when there are no overrides.
  special_icons: null
  credits: "[Orion Weller, FollowIR]("  # damaged: link target truncated
  tasks:
    InstructionRetrieval:
      - Robust04InstructionRetrieval
      - News21InstructionRetrieval
      - Core17InstructionRetrieval
# Board "de": German tasks, grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon was mis-decoded in the source and is repaired from its
# surviving bytes.
de:
  title: German
  language_long: "German"
  has_overall: false
  acronym: null
  icon: "🇩🇪"
  special_icons: null
  credits: "[Silvan](, [Sam Heymann]("  # damaged: link targets truncated
  tasks:
    Clustering:
      - BlurbsClusteringP2P
      - BlurbsClusteringS2S
      - TenKGnadClusteringP2P
      - TenKGnadClusteringS2S
    Retrieval:
      - GermanQuAD-Retrieval
      - GermanDPR
      - XMarket (de)
      - GerDaLIR
    STS:
      - GermanSTSBenchmark
      - STS22 (de-en)
    PairClassification:
      - FalseFriendsGermanEnglish
      - PawsXPairClassification (de)
    Reranking:
      - MIRACLReranking (de)
    Classification:
      - AmazonCounterfactualClassification (de)
      - AmazonReviewsClassification (de)
      - MTOPDomainClassification (de)
      - MTOPIntentClassification (de)
      - MassiveIntentClassification (de)
      - MassiveScenarioClassification (de)
# Board "pl": Polish tasks, grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon and the accented letters in the credited name were
# mis-decoded in the source and are repaired.
pl:
  title: Polish
  language_long: Polish
  has_overall: true
  acronym: null
  icon: "🇵🇱"
  special_icons:
    Classification: "π€"  # damaged: glyph not recoverable from the surviving bytes
  credits: "[Rafał Poświata]("  # damaged: link target truncated
  tasks:
    Classification:
      - AllegroReviews
      - CBD
      - MassiveIntentClassification (pl)
      - MassiveScenarioClassification (pl)
      - PAC
      - PolEmo2.0-IN
      - PolEmo2.0-OUT
    Clustering:
      - 8TagsClustering
    PairClassification:
      - CDSC-E
      - PPC
      - PSC
      - SICK-E-PL
    Retrieval:
      - ArguAna-PL
      - DBPedia-PL
      - FiQA-PL
      - HotpotQA-PL
      - NFCorpus-PL
      - NQ-PL
      - Quora-PL
      - SciFact-PL
    STS:
      - CDSC-R
      - SICK-R-PL
      - STS22 (pl)
# Board "ru": Russian tasks, grouped by task type.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon was mis-decoded in the source and is repaired from its
# surviving bytes.
ru:
  title: Russian
  language_long: "Russian"
  has_overall: true
  acronym: null
  icon: "🇷🇺"
  special_icons: null
  # damaged: every link target truncated
  credits: "[Roman Solomatin]( and SaluteDevices: [Alena Fenogenova](, [Aleksandr Abramov](, [Artem Snegirev](, [Anna Maksimova](, [Maria Tikhonova]("
  tasks:
    Classification:
      - GeoreviewClassification
      - HeadlineClassification
      - InappropriatenessClassification
      - KinopoiskClassification
      - RuReviewsClassification
      - RuSciBenchGRNTIClassification
      - RuSciBenchOECDClassification
      - MassiveIntentClassification (ru)
      - MassiveScenarioClassification (ru)
    Clustering:
      - GeoreviewClusteringP2P
      - RuSciBenchGRNTIClusteringP2P
      - RuSciBenchOECDClusteringP2P
    PairClassification:
      - TERRa
    Reranking:
      - RuBQReranking
      - MIRACLReranking (ru)
    Retrieval:
      - RiaNewsRetrieval
      - RuBQRetrieval
      - MIRACLRetrieval (ru)
    STS:
      - RUParaPhraserSTS
      - RuSTSBenchmarkSTS
      - STS22 (ru)
    MultilabelClassification:
      - CEDRClassification
      - SensitiveTopicsClassification
# Board "se": Swedish classification tasks.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The flag icon was mis-decoded in the source and is repaired from its
# surviving bytes.
se:
  title: Swedish
  language_long: Swedish
  has_overall: false
  acronym: null
  icon: "🇸🇪"
  special_icons:
    Classification: "π"  # damaged: glyph not recoverable from the surviving bytes
  # damaged: both link targets truncated
  credits: "[Kenneth Enevoldsen](, [scandinavian-embedding-benchmark]("
  tasks:
    # Fix: this list was a verbatim copy of the Norwegian board (NoRec,
    # NorwegianParliament, ScalaNb and the "(nb)" Massive subsets), so the
    # Swedish tab showed Norwegian results. Replaced with the Swedish tasks.
    # NOTE(review): confirm these task names against the results repository.
    Classification:
      - DalajClassification
      - MassiveIntentClassification (sv)
      - MassiveScenarioClassification (sv)
      - NordicLangClassification
      - ScalaSvClassification
      - SweRecClassification
# Board "other-cls": classification tasks for languages described as not
# covered by the other tabs.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The task list was a single flow sequence wrapped across two lines; it is
# now a block sequence with the same entries in the same order.
other-cls:
  title: "Other Languages"
  language_long: "47 (Only languages not included in the other tabs)"
  has_overall: false
  acronym: null
  icon: null
  special_icons:
    Classification: "πππ"  # damaged: glyphs not recoverable from the surviving bytes
  credits: null
  tasks:
    Classification:
      - AmazonCounterfactualClassification (de)
      - AmazonCounterfactualClassification (ja)
      - AmazonReviewsClassification (de)
      - AmazonReviewsClassification (es)
      - AmazonReviewsClassification (fr)
      - AmazonReviewsClassification (ja)
      - AmazonReviewsClassification (zh)
      - MTOPDomainClassification (de)
      - MTOPDomainClassification (es)
      - MTOPDomainClassification (fr)
      - MTOPDomainClassification (hi)
      - MTOPDomainClassification (th)
      - MTOPIntentClassification (de)
      - MTOPIntentClassification (es)
      - MTOPIntentClassification (fr)
      - MTOPIntentClassification (hi)
      - MTOPIntentClassification (th)
      - MassiveIntentClassification (af)
      - MassiveIntentClassification (am)
      - MassiveIntentClassification (ar)
      - MassiveIntentClassification (az)
      - MassiveIntentClassification (bn)
      - MassiveIntentClassification (cy)
      - MassiveIntentClassification (de)
      - MassiveIntentClassification (el)
      - MassiveIntentClassification (es)
      - MassiveIntentClassification (fa)
      - MassiveIntentClassification (fi)
      - MassiveIntentClassification (fr)
      - MassiveIntentClassification (he)
      - MassiveIntentClassification (hi)
      - MassiveIntentClassification (hu)
      - MassiveIntentClassification (hy)
      - MassiveIntentClassification (id)
      - MassiveIntentClassification (is)
      - MassiveIntentClassification (it)
      - MassiveIntentClassification (ja)
      - MassiveIntentClassification (jv)
      - MassiveIntentClassification (ka)
      - MassiveIntentClassification (km)
      - MassiveIntentClassification (kn)
      - MassiveIntentClassification (ko)
      - MassiveIntentClassification (lv)
      - MassiveIntentClassification (ml)
      - MassiveIntentClassification (mn)
      - MassiveIntentClassification (ms)
      - MassiveIntentClassification (my)
      - MassiveIntentClassification (nl)
      - MassiveIntentClassification (pt)
      - MassiveIntentClassification (ro)
      - MassiveIntentClassification (ru)
      - MassiveIntentClassification (sl)
      - MassiveIntentClassification (sq)
      - MassiveIntentClassification (sw)
      - MassiveIntentClassification (ta)
      - MassiveIntentClassification (te)
      - MassiveIntentClassification (th)
      - MassiveIntentClassification (tl)
      - MassiveIntentClassification (tr)
      - MassiveIntentClassification (ur)
      - MassiveIntentClassification (vi)
      - MassiveIntentClassification (zh-TW)
      - MassiveScenarioClassification (af)
      - MassiveScenarioClassification (am)
      - MassiveScenarioClassification (ar)
      - MassiveScenarioClassification (az)
      - MassiveScenarioClassification (bn)
      - MassiveScenarioClassification (cy)
      - MassiveScenarioClassification (de)
      - MassiveScenarioClassification (el)
      - MassiveScenarioClassification (es)
      - MassiveScenarioClassification (fa)
      - MassiveScenarioClassification (fi)
      - MassiveScenarioClassification (fr)
      - MassiveScenarioClassification (he)
      - MassiveScenarioClassification (hi)
      - MassiveScenarioClassification (hu)
      - MassiveScenarioClassification (hy)
      - MassiveScenarioClassification (id)
      - MassiveScenarioClassification (is)
      - MassiveScenarioClassification (it)
      - MassiveScenarioClassification (ja)
      - MassiveScenarioClassification (jv)
      - MassiveScenarioClassification (ka)
      - MassiveScenarioClassification (km)
      - MassiveScenarioClassification (kn)
      - MassiveScenarioClassification (ko)
      - MassiveScenarioClassification (lv)
      - MassiveScenarioClassification (ml)
      - MassiveScenarioClassification (mn)
      - MassiveScenarioClassification (ms)
      - MassiveScenarioClassification (my)
      - MassiveScenarioClassification (nl)
      - MassiveScenarioClassification (pt)
      - MassiveScenarioClassification (ro)
      - MassiveScenarioClassification (ru)
      - MassiveScenarioClassification (sl)
      - MassiveScenarioClassification (sq)
      - MassiveScenarioClassification (sw)
      - MassiveScenarioClassification (ta)
      - MassiveScenarioClassification (te)
      - MassiveScenarioClassification (th)
      - MassiveScenarioClassification (tl)
      - MassiveScenarioClassification (tr)
      - MassiveScenarioClassification (ur)
      - MassiveScenarioClassification (vi)
      - MassiveScenarioClassification (zh-TW)
# Board "other-sts": STS language combinations described as not covered by
# the other tabs.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
# The task list was a one-line flow sequence; it is now a block sequence
# with the same entries in the same order.
other-sts:
  title: Other
  language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)"
  has_overall: false
  acronym: null
  icon: null
  special_icons: null
  credits: null
  tasks:
    STS:
      - STS17 (ar-ar)
      - STS17 (en-ar)
      - STS17 (en-de)
      - STS17 (en-tr)
      - STS17 (es-en)
      - STS17 (es-es)
      - STS17 (fr-en)
      - STS17 (it-en)
      - STS17 (ko-ko)
      - STS17 (nl-en)
      - STS22 (ar)
      - STS22 (de)
      - STS22 (de-en)
      - STS22 (de-fr)
      - STS22 (de-pl)
      - STS22 (es)
      - STS22 (es-en)
      - STS22 (es-it)
      - STS22 (fr)
      - STS22 (fr-pl)
      - STS22 (it)
      - STS22 (pl)
      - STS22 (pl-en)
      - STS22 (ru)
      - STS22 (tr)
      - STS22 (zh-en)
      - STSBenchmark
# Board "law": legal-domain retrieval tasks (English, German, Chinese).
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
law:
  title: Law
  language_long: "English, German, Chinese"
  has_overall: false
  acronym: null
  icon: "βοΈ"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  credits: "[Voyage AI]("  # damaged: link target truncated
  tasks:
    Retrieval:
      - AILACasedocs
      - AILAStatutes
      - GerDaLIRSmall
      - LeCaRDv2
      - LegalBenchConsumerContractsQA
      - LegalBenchCorporateLobbying
      - LegalQuAD
      - LegalSummarization
# Board "longembed": LongEmbed retrieval tasks (English).
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
longembed:
  title: LongEmbed
  language_long: "English"
  has_overall: false
  acronym: null
  icon: "π"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  credits: "[LongEmbed (Dawei Zhu et al.)]("  # damaged: link target truncated
  # Board-level metric label. Quoted because the text contains "&" and "@";
  # the parsed string is unchanged.
  metric: "nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)"
  tasks:
    Retrieval:
      - LEMBNarrativeQARetrieval
      - LEMBNeedleRetrieval
      - LEMBPasskeyRetrieval
      - LEMBQMSumRetrieval
      - LEMBSummScreenFDRetrieval
      - LEMBWikimQARetrieval
# Board "rar-b": RAR-b retrieval tasks (English).
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
rar-b:
  title: RAR-b
  language_long: "English"
  has_overall: false
  acronym: null
  icon: "π"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  credits: "[RAR-b (Chenghao Xiao et al.)]("  # damaged: link target truncated
  metric: "nDCG@10"
  tasks:
    Retrieval:
      - ARCChallenge
      - AlphaNLI
      - HellaSwag
      - PIQA
      - Quail
      - RARbCode
      - RARbMath
      - SIQA
      - SpartQA
      - TempReasonL1
      - TempReasonL2Fact
      - TempReasonL2Pure
      - TempReasonL3Fact
      - TempReasonL3Pure
      - WinoGrande
# Board "bright": BRIGHT retrieval tasks on the "standard" split (English).
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
bright:
  title: BRIGHT
  language_long: "English"
  has_overall: false
  acronym: null
  icon: "π"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  # damaged: link target truncated
  credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)]("
  metric: "nDCG@10"
  split: standard
  desc: "This tab only allows submissions with the original queries; not results from LLM rewritten queries or using reranking."
  tasks:
    Retrieval:
      - BrightRetrieval (biology)
      - BrightRetrieval (earth_science)
      - BrightRetrieval (economics)
      - BrightRetrieval (psychology)
      - BrightRetrieval (robotics)
      - BrightRetrieval (stackoverflow)
      - BrightRetrieval (sustainable_living)
      - BrightRetrieval (pony)
      - BrightRetrieval (leetcode)
      - BrightRetrieval (aops)
      - BrightRetrieval (theoremqa_theorems)
      - BrightRetrieval (theoremqa_questions)
# Board "bright_long": BRIGHT retrieval tasks on the "long" split (English).
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
bright_long:
  title: BRIGHT Long
  language_long: "English"
  has_overall: false
  acronym: null
  icon: "π"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  # damaged: link target truncated
  credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)]("
  metric: "Recall@1"
  split: long
  desc: "This tab is for the long document setting of BRIGHT."
  tasks:
    Retrieval:
      - BrightRetrieval (biology)
      - BrightRetrieval (earth_science)
      - BrightRetrieval (economics)
      - BrightRetrieval (psychology)
      - BrightRetrieval (robotics)
      - BrightRetrieval (stackoverflow)
      - BrightRetrieval (sustainable_living)
      - BrightRetrieval (pony)
# Board "coir": CoIR code-retrieval tasks.
# NOTE(review): nesting restored from key order and the trailing " | |"
# residue dropped. This entry belongs under the top-level `boards:` key.
coir:
  title: CoIR
  language_long: "Code"
  has_overall: false
  acronym: null
  icon: "π»"  # damaged: glyph not recoverable from the surviving bytes
  special_icons: null
  # damaged: every link target truncated
  credits: "[Samoed]( and [monikernemo]( and [CoIR (Xiangyang Li, Kuicai Dong, Yi Quan Lee et al.)]("
  metric: "nDCG@10"
  tasks:
    Retrieval:
      - AppsRetrieval
      - CodeFeedbackMT
      - CodeFeedbackST
      - CodeSearchNetCCRetrieval (python)
      - CodeSearchNetCCRetrieval (javascript)
      - CodeSearchNetCCRetrieval (go)
      - CodeSearchNetCCRetrieval (ruby)
      - CodeSearchNetCCRetrieval (java)
      - CodeSearchNetCCRetrieval (php)
      - CodeSearchNetRetrieval (python)
      - CodeSearchNetRetrieval (javascript)
      - CodeSearchNetRetrieval (go)
      - CodeSearchNetRetrieval (ruby)
      - CodeSearchNetRetrieval (java)
      - CodeSearchNetRetrieval (php)
      - CodeTransOceanContest
      - CodeTransOceanDL
      - CosQA
      - StackOverflowQA
      - SyntheticText2SQL