diff --git "a/README.md" "b/README.md"
--- "a/README.md"
+++ "b/README.md"
@@ -1,3 +1,2827 @@
----
-license: apache-2.0
----
+---
+tags:
+- mteb
+- transformers.js
+- transformers
+model-index:
+- name: mxbai-angle-large-v1
+ results:
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/amazon_counterfactual
+ name: MTEB AmazonCounterfactualClassification (en)
+ config: en
+ split: test
+ revision: e8379541af4e31359cca9fbcf4b00f2671dba205
+ metrics:
+ - type: accuracy
+ value: 75.044776119403
+ - type: ap
+ value: 37.7362433623053
+ - type: f1
+ value: 68.92736573359774
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/amazon_polarity
+ name: MTEB AmazonPolarityClassification
+ config: default
+ split: test
+ revision: e2d317d38cd51312af73b3d32a06d1a08b442046
+ metrics:
+ - type: accuracy
+ value: 93.84025000000001
+ - type: ap
+ value: 90.93190875404055
+ - type: f1
+ value: 93.8297833897293
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/amazon_reviews_multi
+ name: MTEB AmazonReviewsClassification (en)
+ config: en
+ split: test
+ revision: 1399c76144fd37290681b995c656ef9b2e06e26d
+ metrics:
+ - type: accuracy
+ value: 49.184
+ - type: f1
+ value: 48.74163227751588
+ - task:
+ type: Retrieval
+ dataset:
+ type: arguana
+ name: MTEB ArguAna
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 41.252
+ - type: map_at_10
+ value: 57.778
+ - type: map_at_100
+ value: 58.233000000000004
+ - type: map_at_1000
+ value: 58.23700000000001
+ - type: map_at_3
+ value: 53.449999999999996
+ - type: map_at_5
+ value: 56.376000000000005
+ - type: mrr_at_1
+ value: 41.679
+ - type: mrr_at_10
+ value: 57.92699999999999
+ - type: mrr_at_100
+ value: 58.389
+ - type: mrr_at_1000
+ value: 58.391999999999996
+ - type: mrr_at_3
+ value: 53.651
+ - type: mrr_at_5
+ value: 56.521
+ - type: ndcg_at_1
+ value: 41.252
+ - type: ndcg_at_10
+ value: 66.018
+ - type: ndcg_at_100
+ value: 67.774
+ - type: ndcg_at_1000
+ value: 67.84400000000001
+ - type: ndcg_at_3
+ value: 57.372
+ - type: ndcg_at_5
+ value: 62.646
+ - type: precision_at_1
+ value: 41.252
+ - type: precision_at_10
+ value: 9.189
+ - type: precision_at_100
+ value: 0.991
+ - type: precision_at_1000
+ value: 0.1
+ - type: precision_at_3
+ value: 22.902
+ - type: precision_at_5
+ value: 16.302
+ - type: recall_at_1
+ value: 41.252
+ - type: recall_at_10
+ value: 91.892
+ - type: recall_at_100
+ value: 99.14699999999999
+ - type: recall_at_1000
+ value: 99.644
+ - type: recall_at_3
+ value: 68.706
+ - type: recall_at_5
+ value: 81.50800000000001
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/arxiv-clustering-p2p
+ name: MTEB ArxivClusteringP2P
+ config: default
+ split: test
+ revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
+ metrics:
+ - type: v_measure
+ value: 48.97294504317859
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/arxiv-clustering-s2s
+ name: MTEB ArxivClusteringS2S
+ config: default
+ split: test
+ revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
+ metrics:
+ - type: v_measure
+ value: 42.98071077674629
+ - task:
+ type: Reranking
+ dataset:
+ type: mteb/askubuntudupquestions-reranking
+ name: MTEB AskUbuntuDupQuestions
+ config: default
+ split: test
+ revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
+ metrics:
+ - type: map
+ value: 65.16477858490782
+ - type: mrr
+ value: 78.23583080508287
+ - task:
+ type: STS
+ dataset:
+ type: mteb/biosses-sts
+ name: MTEB BIOSSES
+ config: default
+ split: test
+ revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
+ metrics:
+ - type: cos_sim_pearson
+ value: 89.6277629421789
+ - type: cos_sim_spearman
+ value: 88.4056288400568
+ - type: euclidean_pearson
+ value: 87.94871847578163
+ - type: euclidean_spearman
+ value: 88.4056288400568
+ - type: manhattan_pearson
+ value: 87.73271254229648
+ - type: manhattan_spearman
+ value: 87.91826833762677
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/banking77
+ name: MTEB Banking77Classification
+ config: default
+ split: test
+ revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
+ metrics:
+ - type: accuracy
+ value: 87.81818181818181
+ - type: f1
+ value: 87.79879337316918
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/biorxiv-clustering-p2p
+ name: MTEB BiorxivClusteringP2P
+ config: default
+ split: test
+ revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
+ metrics:
+ - type: v_measure
+ value: 39.91773608582761
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/biorxiv-clustering-s2s
+ name: MTEB BiorxivClusteringS2S
+ config: default
+ split: test
+ revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
+ metrics:
+ - type: v_measure
+ value: 36.73059477462478
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackAndroidRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 32.745999999999995
+ - type: map_at_10
+ value: 43.632
+ - type: map_at_100
+ value: 45.206
+ - type: map_at_1000
+ value: 45.341
+ - type: map_at_3
+ value: 39.956
+ - type: map_at_5
+ value: 42.031
+ - type: mrr_at_1
+ value: 39.485
+ - type: mrr_at_10
+ value: 49.537
+ - type: mrr_at_100
+ value: 50.249
+ - type: mrr_at_1000
+ value: 50.294000000000004
+ - type: mrr_at_3
+ value: 46.757
+ - type: mrr_at_5
+ value: 48.481
+ - type: ndcg_at_1
+ value: 39.485
+ - type: ndcg_at_10
+ value: 50.058
+ - type: ndcg_at_100
+ value: 55.586
+ - type: ndcg_at_1000
+ value: 57.511
+ - type: ndcg_at_3
+ value: 44.786
+ - type: ndcg_at_5
+ value: 47.339999999999996
+ - type: precision_at_1
+ value: 39.485
+ - type: precision_at_10
+ value: 9.557
+ - type: precision_at_100
+ value: 1.552
+ - type: precision_at_1000
+ value: 0.202
+ - type: precision_at_3
+ value: 21.412
+ - type: precision_at_5
+ value: 15.479000000000001
+ - type: recall_at_1
+ value: 32.745999999999995
+ - type: recall_at_10
+ value: 62.056
+ - type: recall_at_100
+ value: 85.088
+ - type: recall_at_1000
+ value: 96.952
+ - type: recall_at_3
+ value: 46.959
+ - type: recall_at_5
+ value: 54.06999999999999
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackEnglishRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 31.898
+ - type: map_at_10
+ value: 42.142
+ - type: map_at_100
+ value: 43.349
+ - type: map_at_1000
+ value: 43.483
+ - type: map_at_3
+ value: 39.18
+ - type: map_at_5
+ value: 40.733000000000004
+ - type: mrr_at_1
+ value: 39.617999999999995
+ - type: mrr_at_10
+ value: 47.922
+ - type: mrr_at_100
+ value: 48.547000000000004
+ - type: mrr_at_1000
+ value: 48.597
+ - type: mrr_at_3
+ value: 45.86
+ - type: mrr_at_5
+ value: 46.949000000000005
+ - type: ndcg_at_1
+ value: 39.617999999999995
+ - type: ndcg_at_10
+ value: 47.739
+ - type: ndcg_at_100
+ value: 51.934999999999995
+ - type: ndcg_at_1000
+ value: 54.007000000000005
+ - type: ndcg_at_3
+ value: 43.748
+ - type: ndcg_at_5
+ value: 45.345
+ - type: precision_at_1
+ value: 39.617999999999995
+ - type: precision_at_10
+ value: 8.962
+ - type: precision_at_100
+ value: 1.436
+ - type: precision_at_1000
+ value: 0.192
+ - type: precision_at_3
+ value: 21.083
+ - type: precision_at_5
+ value: 14.752
+ - type: recall_at_1
+ value: 31.898
+ - type: recall_at_10
+ value: 57.587999999999994
+ - type: recall_at_100
+ value: 75.323
+ - type: recall_at_1000
+ value: 88.304
+ - type: recall_at_3
+ value: 45.275
+ - type: recall_at_5
+ value: 49.99
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackGamingRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 40.458
+ - type: map_at_10
+ value: 52.942
+ - type: map_at_100
+ value: 53.974
+ - type: map_at_1000
+ value: 54.031
+ - type: map_at_3
+ value: 49.559999999999995
+ - type: map_at_5
+ value: 51.408
+ - type: mrr_at_1
+ value: 46.27
+ - type: mrr_at_10
+ value: 56.31699999999999
+ - type: mrr_at_100
+ value: 56.95099999999999
+ - type: mrr_at_1000
+ value: 56.98
+ - type: mrr_at_3
+ value: 53.835
+ - type: mrr_at_5
+ value: 55.252
+ - type: ndcg_at_1
+ value: 46.27
+ - type: ndcg_at_10
+ value: 58.964000000000006
+ - type: ndcg_at_100
+ value: 62.875
+ - type: ndcg_at_1000
+ value: 63.969
+ - type: ndcg_at_3
+ value: 53.297000000000004
+ - type: ndcg_at_5
+ value: 55.938
+ - type: precision_at_1
+ value: 46.27
+ - type: precision_at_10
+ value: 9.549000000000001
+ - type: precision_at_100
+ value: 1.2409999999999999
+ - type: precision_at_1000
+ value: 0.13799999999999998
+ - type: precision_at_3
+ value: 23.762
+ - type: precision_at_5
+ value: 16.262999999999998
+ - type: recall_at_1
+ value: 40.458
+ - type: recall_at_10
+ value: 73.446
+ - type: recall_at_100
+ value: 90.12400000000001
+ - type: recall_at_1000
+ value: 97.795
+ - type: recall_at_3
+ value: 58.123000000000005
+ - type: recall_at_5
+ value: 64.68
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackGisRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 27.443
+ - type: map_at_10
+ value: 36.081
+ - type: map_at_100
+ value: 37.163000000000004
+ - type: map_at_1000
+ value: 37.232
+ - type: map_at_3
+ value: 33.308
+ - type: map_at_5
+ value: 34.724
+ - type: mrr_at_1
+ value: 29.492
+ - type: mrr_at_10
+ value: 38.138
+ - type: mrr_at_100
+ value: 39.065
+ - type: mrr_at_1000
+ value: 39.119
+ - type: mrr_at_3
+ value: 35.593
+ - type: mrr_at_5
+ value: 36.785000000000004
+ - type: ndcg_at_1
+ value: 29.492
+ - type: ndcg_at_10
+ value: 41.134
+ - type: ndcg_at_100
+ value: 46.300999999999995
+ - type: ndcg_at_1000
+ value: 48.106
+ - type: ndcg_at_3
+ value: 35.77
+ - type: ndcg_at_5
+ value: 38.032
+ - type: precision_at_1
+ value: 29.492
+ - type: precision_at_10
+ value: 6.249
+ - type: precision_at_100
+ value: 0.9299999999999999
+ - type: precision_at_1000
+ value: 0.11199999999999999
+ - type: precision_at_3
+ value: 15.065999999999999
+ - type: precision_at_5
+ value: 10.373000000000001
+ - type: recall_at_1
+ value: 27.443
+ - type: recall_at_10
+ value: 54.80199999999999
+ - type: recall_at_100
+ value: 78.21900000000001
+ - type: recall_at_1000
+ value: 91.751
+ - type: recall_at_3
+ value: 40.211000000000006
+ - type: recall_at_5
+ value: 45.599000000000004
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackMathematicaRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 18.731
+ - type: map_at_10
+ value: 26.717999999999996
+ - type: map_at_100
+ value: 27.897
+ - type: map_at_1000
+ value: 28.029
+ - type: map_at_3
+ value: 23.91
+ - type: map_at_5
+ value: 25.455
+ - type: mrr_at_1
+ value: 23.134
+ - type: mrr_at_10
+ value: 31.769
+ - type: mrr_at_100
+ value: 32.634
+ - type: mrr_at_1000
+ value: 32.707
+ - type: mrr_at_3
+ value: 28.938999999999997
+ - type: mrr_at_5
+ value: 30.531000000000002
+ - type: ndcg_at_1
+ value: 23.134
+ - type: ndcg_at_10
+ value: 32.249
+ - type: ndcg_at_100
+ value: 37.678
+ - type: ndcg_at_1000
+ value: 40.589999999999996
+ - type: ndcg_at_3
+ value: 26.985999999999997
+ - type: ndcg_at_5
+ value: 29.457
+ - type: precision_at_1
+ value: 23.134
+ - type: precision_at_10
+ value: 5.8709999999999996
+ - type: precision_at_100
+ value: 0.988
+ - type: precision_at_1000
+ value: 0.13799999999999998
+ - type: precision_at_3
+ value: 12.852
+ - type: precision_at_5
+ value: 9.428
+ - type: recall_at_1
+ value: 18.731
+ - type: recall_at_10
+ value: 44.419
+ - type: recall_at_100
+ value: 67.851
+ - type: recall_at_1000
+ value: 88.103
+ - type: recall_at_3
+ value: 29.919
+ - type: recall_at_5
+ value: 36.230000000000004
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackPhysicsRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 30.324
+ - type: map_at_10
+ value: 41.265
+ - type: map_at_100
+ value: 42.559000000000005
+ - type: map_at_1000
+ value: 42.669000000000004
+ - type: map_at_3
+ value: 38.138
+ - type: map_at_5
+ value: 39.881
+ - type: mrr_at_1
+ value: 36.67
+ - type: mrr_at_10
+ value: 46.774
+ - type: mrr_at_100
+ value: 47.554
+ - type: mrr_at_1000
+ value: 47.593
+ - type: mrr_at_3
+ value: 44.338
+ - type: mrr_at_5
+ value: 45.723
+ - type: ndcg_at_1
+ value: 36.67
+ - type: ndcg_at_10
+ value: 47.367
+ - type: ndcg_at_100
+ value: 52.623
+ - type: ndcg_at_1000
+ value: 54.59
+ - type: ndcg_at_3
+ value: 42.323
+ - type: ndcg_at_5
+ value: 44.727
+ - type: precision_at_1
+ value: 36.67
+ - type: precision_at_10
+ value: 8.518
+ - type: precision_at_100
+ value: 1.2890000000000001
+ - type: precision_at_1000
+ value: 0.163
+ - type: precision_at_3
+ value: 19.955000000000002
+ - type: precision_at_5
+ value: 14.11
+ - type: recall_at_1
+ value: 30.324
+ - type: recall_at_10
+ value: 59.845000000000006
+ - type: recall_at_100
+ value: 81.77499999999999
+ - type: recall_at_1000
+ value: 94.463
+ - type: recall_at_3
+ value: 46.019
+ - type: recall_at_5
+ value: 52.163000000000004
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackProgrammersRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 24.229
+ - type: map_at_10
+ value: 35.004000000000005
+ - type: map_at_100
+ value: 36.409000000000006
+ - type: map_at_1000
+ value: 36.521
+ - type: map_at_3
+ value: 31.793
+ - type: map_at_5
+ value: 33.432
+ - type: mrr_at_1
+ value: 30.365
+ - type: mrr_at_10
+ value: 40.502
+ - type: mrr_at_100
+ value: 41.372
+ - type: mrr_at_1000
+ value: 41.435
+ - type: mrr_at_3
+ value: 37.804
+ - type: mrr_at_5
+ value: 39.226
+ - type: ndcg_at_1
+ value: 30.365
+ - type: ndcg_at_10
+ value: 41.305
+ - type: ndcg_at_100
+ value: 47.028999999999996
+ - type: ndcg_at_1000
+ value: 49.375
+ - type: ndcg_at_3
+ value: 35.85
+ - type: ndcg_at_5
+ value: 38.12
+ - type: precision_at_1
+ value: 30.365
+ - type: precision_at_10
+ value: 7.808
+ - type: precision_at_100
+ value: 1.228
+ - type: precision_at_1000
+ value: 0.161
+ - type: precision_at_3
+ value: 17.352
+ - type: precision_at_5
+ value: 12.42
+ - type: recall_at_1
+ value: 24.229
+ - type: recall_at_10
+ value: 54.673
+ - type: recall_at_100
+ value: 78.766
+ - type: recall_at_1000
+ value: 94.625
+ - type: recall_at_3
+ value: 39.602
+ - type: recall_at_5
+ value: 45.558
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 26.695
+ - type: map_at_10
+ value: 36.0895
+ - type: map_at_100
+ value: 37.309416666666664
+ - type: map_at_1000
+ value: 37.42558333333334
+ - type: map_at_3
+ value: 33.19616666666666
+ - type: map_at_5
+ value: 34.78641666666667
+ - type: mrr_at_1
+ value: 31.486083333333337
+ - type: mrr_at_10
+ value: 40.34774999999999
+ - type: mrr_at_100
+ value: 41.17533333333333
+ - type: mrr_at_1000
+ value: 41.231583333333326
+ - type: mrr_at_3
+ value: 37.90075
+ - type: mrr_at_5
+ value: 39.266999999999996
+ - type: ndcg_at_1
+ value: 31.486083333333337
+ - type: ndcg_at_10
+ value: 41.60433333333334
+ - type: ndcg_at_100
+ value: 46.74525
+ - type: ndcg_at_1000
+ value: 48.96166666666667
+ - type: ndcg_at_3
+ value: 36.68825
+ - type: ndcg_at_5
+ value: 38.966499999999996
+ - type: precision_at_1
+ value: 31.486083333333337
+ - type: precision_at_10
+ value: 7.29675
+ - type: precision_at_100
+ value: 1.1621666666666666
+ - type: precision_at_1000
+ value: 0.1545
+ - type: precision_at_3
+ value: 16.8815
+ - type: precision_at_5
+ value: 11.974583333333333
+ - type: recall_at_1
+ value: 26.695
+ - type: recall_at_10
+ value: 53.651916666666665
+ - type: recall_at_100
+ value: 76.12083333333332
+ - type: recall_at_1000
+ value: 91.31191666666668
+ - type: recall_at_3
+ value: 40.03575
+ - type: recall_at_5
+ value: 45.876666666666665
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackStatsRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 25.668000000000003
+ - type: map_at_10
+ value: 32.486
+ - type: map_at_100
+ value: 33.371
+ - type: map_at_1000
+ value: 33.458
+ - type: map_at_3
+ value: 30.261
+ - type: map_at_5
+ value: 31.418000000000003
+ - type: mrr_at_1
+ value: 28.988000000000003
+ - type: mrr_at_10
+ value: 35.414
+ - type: mrr_at_100
+ value: 36.149
+ - type: mrr_at_1000
+ value: 36.215
+ - type: mrr_at_3
+ value: 33.333
+ - type: mrr_at_5
+ value: 34.43
+ - type: ndcg_at_1
+ value: 28.988000000000003
+ - type: ndcg_at_10
+ value: 36.732
+ - type: ndcg_at_100
+ value: 41.331
+ - type: ndcg_at_1000
+ value: 43.575
+ - type: ndcg_at_3
+ value: 32.413
+ - type: ndcg_at_5
+ value: 34.316
+ - type: precision_at_1
+ value: 28.988000000000003
+ - type: precision_at_10
+ value: 5.7059999999999995
+ - type: precision_at_100
+ value: 0.882
+ - type: precision_at_1000
+ value: 0.11299999999999999
+ - type: precision_at_3
+ value: 13.65
+ - type: precision_at_5
+ value: 9.417
+ - type: recall_at_1
+ value: 25.668000000000003
+ - type: recall_at_10
+ value: 47.147
+ - type: recall_at_100
+ value: 68.504
+ - type: recall_at_1000
+ value: 85.272
+ - type: recall_at_3
+ value: 35.19
+ - type: recall_at_5
+ value: 39.925
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackTexRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 17.256
+ - type: map_at_10
+ value: 24.58
+ - type: map_at_100
+ value: 25.773000000000003
+ - type: map_at_1000
+ value: 25.899
+ - type: map_at_3
+ value: 22.236
+ - type: map_at_5
+ value: 23.507
+ - type: mrr_at_1
+ value: 20.957
+ - type: mrr_at_10
+ value: 28.416000000000004
+ - type: mrr_at_100
+ value: 29.447000000000003
+ - type: mrr_at_1000
+ value: 29.524
+ - type: mrr_at_3
+ value: 26.245
+ - type: mrr_at_5
+ value: 27.451999999999998
+ - type: ndcg_at_1
+ value: 20.957
+ - type: ndcg_at_10
+ value: 29.285
+ - type: ndcg_at_100
+ value: 35.003
+ - type: ndcg_at_1000
+ value: 37.881
+ - type: ndcg_at_3
+ value: 25.063000000000002
+ - type: ndcg_at_5
+ value: 26.983
+ - type: precision_at_1
+ value: 20.957
+ - type: precision_at_10
+ value: 5.344
+ - type: precision_at_100
+ value: 0.958
+ - type: precision_at_1000
+ value: 0.13799999999999998
+ - type: precision_at_3
+ value: 11.918
+ - type: precision_at_5
+ value: 8.596
+ - type: recall_at_1
+ value: 17.256
+ - type: recall_at_10
+ value: 39.644
+ - type: recall_at_100
+ value: 65.279
+ - type: recall_at_1000
+ value: 85.693
+ - type: recall_at_3
+ value: 27.825
+ - type: recall_at_5
+ value: 32.792
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackUnixRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 26.700000000000003
+ - type: map_at_10
+ value: 36.205999999999996
+ - type: map_at_100
+ value: 37.316
+ - type: map_at_1000
+ value: 37.425000000000004
+ - type: map_at_3
+ value: 33.166000000000004
+ - type: map_at_5
+ value: 35.032999999999994
+ - type: mrr_at_1
+ value: 31.436999999999998
+ - type: mrr_at_10
+ value: 40.61
+ - type: mrr_at_100
+ value: 41.415
+ - type: mrr_at_1000
+ value: 41.48
+ - type: mrr_at_3
+ value: 37.966
+ - type: mrr_at_5
+ value: 39.599000000000004
+ - type: ndcg_at_1
+ value: 31.436999999999998
+ - type: ndcg_at_10
+ value: 41.771
+ - type: ndcg_at_100
+ value: 46.784
+ - type: ndcg_at_1000
+ value: 49.183
+ - type: ndcg_at_3
+ value: 36.437000000000005
+ - type: ndcg_at_5
+ value: 39.291
+ - type: precision_at_1
+ value: 31.436999999999998
+ - type: precision_at_10
+ value: 6.987
+ - type: precision_at_100
+ value: 1.072
+ - type: precision_at_1000
+ value: 0.13899999999999998
+ - type: precision_at_3
+ value: 16.448999999999998
+ - type: precision_at_5
+ value: 11.866
+ - type: recall_at_1
+ value: 26.700000000000003
+ - type: recall_at_10
+ value: 54.301
+ - type: recall_at_100
+ value: 75.871
+ - type: recall_at_1000
+ value: 92.529
+ - type: recall_at_3
+ value: 40.201
+ - type: recall_at_5
+ value: 47.208
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackWebmastersRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 24.296
+ - type: map_at_10
+ value: 33.116
+ - type: map_at_100
+ value: 34.81
+ - type: map_at_1000
+ value: 35.032000000000004
+ - type: map_at_3
+ value: 30.105999999999998
+ - type: map_at_5
+ value: 31.839000000000002
+ - type: mrr_at_1
+ value: 29.051
+ - type: mrr_at_10
+ value: 37.803
+ - type: mrr_at_100
+ value: 38.856
+ - type: mrr_at_1000
+ value: 38.903999999999996
+ - type: mrr_at_3
+ value: 35.211
+ - type: mrr_at_5
+ value: 36.545
+ - type: ndcg_at_1
+ value: 29.051
+ - type: ndcg_at_10
+ value: 39.007
+ - type: ndcg_at_100
+ value: 45.321
+ - type: ndcg_at_1000
+ value: 47.665
+ - type: ndcg_at_3
+ value: 34.1
+ - type: ndcg_at_5
+ value: 36.437000000000005
+ - type: precision_at_1
+ value: 29.051
+ - type: precision_at_10
+ value: 7.668
+ - type: precision_at_100
+ value: 1.542
+ - type: precision_at_1000
+ value: 0.24
+ - type: precision_at_3
+ value: 16.14
+ - type: precision_at_5
+ value: 11.897
+ - type: recall_at_1
+ value: 24.296
+ - type: recall_at_10
+ value: 49.85
+ - type: recall_at_100
+ value: 78.457
+ - type: recall_at_1000
+ value: 92.618
+ - type: recall_at_3
+ value: 36.138999999999996
+ - type: recall_at_5
+ value: 42.223
+ - task:
+ type: Retrieval
+ dataset:
+ type: BeIR/cqadupstack
+ name: MTEB CQADupstackWordpressRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 20.591
+ - type: map_at_10
+ value: 28.902
+ - type: map_at_100
+ value: 29.886000000000003
+ - type: map_at_1000
+ value: 29.987000000000002
+ - type: map_at_3
+ value: 26.740000000000002
+ - type: map_at_5
+ value: 27.976
+ - type: mrr_at_1
+ value: 22.366
+ - type: mrr_at_10
+ value: 30.971
+ - type: mrr_at_100
+ value: 31.865
+ - type: mrr_at_1000
+ value: 31.930999999999997
+ - type: mrr_at_3
+ value: 28.927999999999997
+ - type: mrr_at_5
+ value: 30.231
+ - type: ndcg_at_1
+ value: 22.366
+ - type: ndcg_at_10
+ value: 33.641
+ - type: ndcg_at_100
+ value: 38.477
+ - type: ndcg_at_1000
+ value: 41.088
+ - type: ndcg_at_3
+ value: 29.486
+ - type: ndcg_at_5
+ value: 31.612000000000002
+ - type: precision_at_1
+ value: 22.366
+ - type: precision_at_10
+ value: 5.3420000000000005
+ - type: precision_at_100
+ value: 0.828
+ - type: precision_at_1000
+ value: 0.11800000000000001
+ - type: precision_at_3
+ value: 12.939
+ - type: precision_at_5
+ value: 9.094
+ - type: recall_at_1
+ value: 20.591
+ - type: recall_at_10
+ value: 46.052
+ - type: recall_at_100
+ value: 68.193
+ - type: recall_at_1000
+ value: 87.638
+ - type: recall_at_3
+ value: 34.966
+ - type: recall_at_5
+ value: 40.082
+ - task:
+ type: Retrieval
+ dataset:
+ type: climate-fever
+ name: MTEB ClimateFEVER
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 15.091
+ - type: map_at_10
+ value: 26.38
+ - type: map_at_100
+ value: 28.421999999999997
+ - type: map_at_1000
+ value: 28.621999999999996
+ - type: map_at_3
+ value: 21.597
+ - type: map_at_5
+ value: 24.12
+ - type: mrr_at_1
+ value: 34.266999999999996
+ - type: mrr_at_10
+ value: 46.864
+ - type: mrr_at_100
+ value: 47.617
+ - type: mrr_at_1000
+ value: 47.644
+ - type: mrr_at_3
+ value: 43.312
+ - type: mrr_at_5
+ value: 45.501000000000005
+ - type: ndcg_at_1
+ value: 34.266999999999996
+ - type: ndcg_at_10
+ value: 36.095
+ - type: ndcg_at_100
+ value: 43.447
+ - type: ndcg_at_1000
+ value: 46.661
+ - type: ndcg_at_3
+ value: 29.337999999999997
+ - type: ndcg_at_5
+ value: 31.824
+ - type: precision_at_1
+ value: 34.266999999999996
+ - type: precision_at_10
+ value: 11.472
+ - type: precision_at_100
+ value: 1.944
+ - type: precision_at_1000
+ value: 0.255
+ - type: precision_at_3
+ value: 21.933
+ - type: precision_at_5
+ value: 17.224999999999998
+ - type: recall_at_1
+ value: 15.091
+ - type: recall_at_10
+ value: 43.022
+ - type: recall_at_100
+ value: 68.075
+ - type: recall_at_1000
+ value: 85.76
+ - type: recall_at_3
+ value: 26.564
+ - type: recall_at_5
+ value: 33.594
+ - task:
+ type: Retrieval
+ dataset:
+ type: dbpedia-entity
+ name: MTEB DBPedia
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 9.252
+ - type: map_at_10
+ value: 20.923
+ - type: map_at_100
+ value: 30.741000000000003
+ - type: map_at_1000
+ value: 32.542
+ - type: map_at_3
+ value: 14.442
+ - type: map_at_5
+ value: 17.399
+ - type: mrr_at_1
+ value: 70.25
+ - type: mrr_at_10
+ value: 78.17
+ - type: mrr_at_100
+ value: 78.444
+ - type: mrr_at_1000
+ value: 78.45100000000001
+ - type: mrr_at_3
+ value: 76.958
+ - type: mrr_at_5
+ value: 77.571
+ - type: ndcg_at_1
+ value: 58.375
+ - type: ndcg_at_10
+ value: 44.509
+ - type: ndcg_at_100
+ value: 49.897999999999996
+ - type: ndcg_at_1000
+ value: 57.269999999999996
+ - type: ndcg_at_3
+ value: 48.64
+ - type: ndcg_at_5
+ value: 46.697
+ - type: precision_at_1
+ value: 70.25
+ - type: precision_at_10
+ value: 36.05
+ - type: precision_at_100
+ value: 11.848
+ - type: precision_at_1000
+ value: 2.213
+ - type: precision_at_3
+ value: 52.917
+ - type: precision_at_5
+ value: 45.7
+ - type: recall_at_1
+ value: 9.252
+ - type: recall_at_10
+ value: 27.006999999999998
+ - type: recall_at_100
+ value: 57.008
+ - type: recall_at_1000
+ value: 80.697
+ - type: recall_at_3
+ value: 15.798000000000002
+ - type: recall_at_5
+ value: 20.4
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/emotion
+ name: MTEB EmotionClassification
+ config: default
+ split: test
+ revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
+ metrics:
+ - type: accuracy
+ value: 50.88
+ - type: f1
+ value: 45.545495028653384
+ - task:
+ type: Retrieval
+ dataset:
+ type: fever
+ name: MTEB FEVER
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 75.424
+ - type: map_at_10
+ value: 83.435
+ - type: map_at_100
+ value: 83.66900000000001
+ - type: map_at_1000
+ value: 83.685
+ - type: map_at_3
+ value: 82.39800000000001
+ - type: map_at_5
+ value: 83.07
+ - type: mrr_at_1
+ value: 81.113
+ - type: mrr_at_10
+ value: 87.77199999999999
+ - type: mrr_at_100
+ value: 87.862
+ - type: mrr_at_1000
+ value: 87.86500000000001
+ - type: mrr_at_3
+ value: 87.17099999999999
+ - type: mrr_at_5
+ value: 87.616
+ - type: ndcg_at_1
+ value: 81.113
+ - type: ndcg_at_10
+ value: 86.909
+ - type: ndcg_at_100
+ value: 87.746
+ - type: ndcg_at_1000
+ value: 88.017
+ - type: ndcg_at_3
+ value: 85.368
+ - type: ndcg_at_5
+ value: 86.28099999999999
+ - type: precision_at_1
+ value: 81.113
+ - type: precision_at_10
+ value: 10.363
+ - type: precision_at_100
+ value: 1.102
+ - type: precision_at_1000
+ value: 0.11399999999999999
+ - type: precision_at_3
+ value: 32.507999999999996
+ - type: precision_at_5
+ value: 20.138
+ - type: recall_at_1
+ value: 75.424
+ - type: recall_at_10
+ value: 93.258
+ - type: recall_at_100
+ value: 96.545
+ - type: recall_at_1000
+ value: 98.284
+ - type: recall_at_3
+ value: 89.083
+ - type: recall_at_5
+ value: 91.445
+ - task:
+ type: Retrieval
+ dataset:
+ type: fiqa
+ name: MTEB FiQA2018
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 22.532
+ - type: map_at_10
+ value: 37.141999999999996
+ - type: map_at_100
+ value: 39.162
+ - type: map_at_1000
+ value: 39.322
+ - type: map_at_3
+ value: 32.885
+ - type: map_at_5
+ value: 35.093999999999994
+ - type: mrr_at_1
+ value: 44.29
+ - type: mrr_at_10
+ value: 53.516
+ - type: mrr_at_100
+ value: 54.24
+ - type: mrr_at_1000
+ value: 54.273
+ - type: mrr_at_3
+ value: 51.286
+ - type: mrr_at_5
+ value: 52.413
+ - type: ndcg_at_1
+ value: 44.29
+ - type: ndcg_at_10
+ value: 45.268
+ - type: ndcg_at_100
+ value: 52.125
+ - type: ndcg_at_1000
+ value: 54.778000000000006
+ - type: ndcg_at_3
+ value: 41.829
+ - type: ndcg_at_5
+ value: 42.525
+ - type: precision_at_1
+ value: 44.29
+ - type: precision_at_10
+ value: 12.5
+ - type: precision_at_100
+ value: 1.9720000000000002
+ - type: precision_at_1000
+ value: 0.245
+ - type: precision_at_3
+ value: 28.035
+ - type: precision_at_5
+ value: 20.093
+ - type: recall_at_1
+ value: 22.532
+ - type: recall_at_10
+ value: 52.419000000000004
+ - type: recall_at_100
+ value: 77.43299999999999
+ - type: recall_at_1000
+ value: 93.379
+ - type: recall_at_3
+ value: 38.629000000000005
+ - type: recall_at_5
+ value: 43.858000000000004
+ - task:
+ type: Retrieval
+ dataset:
+ type: hotpotqa
+ name: MTEB HotpotQA
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 39.359
+ - type: map_at_10
+ value: 63.966
+ - type: map_at_100
+ value: 64.87
+ - type: map_at_1000
+ value: 64.92599999999999
+ - type: map_at_3
+ value: 60.409
+ - type: map_at_5
+ value: 62.627
+ - type: mrr_at_1
+ value: 78.717
+ - type: mrr_at_10
+ value: 84.468
+ - type: mrr_at_100
+ value: 84.655
+ - type: mrr_at_1000
+ value: 84.661
+ - type: mrr_at_3
+ value: 83.554
+ - type: mrr_at_5
+ value: 84.133
+ - type: ndcg_at_1
+ value: 78.717
+ - type: ndcg_at_10
+ value: 72.03399999999999
+ - type: ndcg_at_100
+ value: 75.158
+ - type: ndcg_at_1000
+ value: 76.197
+ - type: ndcg_at_3
+ value: 67.049
+ - type: ndcg_at_5
+ value: 69.808
+ - type: precision_at_1
+ value: 78.717
+ - type: precision_at_10
+ value: 15.201
+ - type: precision_at_100
+ value: 1.764
+ - type: precision_at_1000
+ value: 0.19
+ - type: precision_at_3
+ value: 43.313
+ - type: precision_at_5
+ value: 28.165000000000003
+ - type: recall_at_1
+ value: 39.359
+ - type: recall_at_10
+ value: 76.003
+ - type: recall_at_100
+ value: 88.197
+ - type: recall_at_1000
+ value: 95.003
+ - type: recall_at_3
+ value: 64.97
+ - type: recall_at_5
+ value: 70.41199999999999
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/imdb
+ name: MTEB ImdbClassification
+ config: default
+ split: test
+ revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
+ metrics:
+ - type: accuracy
+ value: 92.83200000000001
+ - type: ap
+ value: 89.33560571859861
+ - type: f1
+ value: 92.82322915005167
+ - task:
+ type: Retrieval
+ dataset:
+ type: msmarco
+ name: MTEB MSMARCO
+ config: default
+ split: dev
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 21.983
+ - type: map_at_10
+ value: 34.259
+ - type: map_at_100
+ value: 35.432
+ - type: map_at_1000
+ value: 35.482
+ - type: map_at_3
+ value: 30.275999999999996
+ - type: map_at_5
+ value: 32.566
+ - type: mrr_at_1
+ value: 22.579
+ - type: mrr_at_10
+ value: 34.882999999999996
+ - type: mrr_at_100
+ value: 35.984
+ - type: mrr_at_1000
+ value: 36.028
+ - type: mrr_at_3
+ value: 30.964999999999996
+ - type: mrr_at_5
+ value: 33.245000000000005
+ - type: ndcg_at_1
+ value: 22.564
+ - type: ndcg_at_10
+ value: 41.258
+ - type: ndcg_at_100
+ value: 46.824
+ - type: ndcg_at_1000
+ value: 48.037
+ - type: ndcg_at_3
+ value: 33.17
+ - type: ndcg_at_5
+ value: 37.263000000000005
+ - type: precision_at_1
+ value: 22.564
+ - type: precision_at_10
+ value: 6.572
+ - type: precision_at_100
+ value: 0.935
+ - type: precision_at_1000
+ value: 0.104
+ - type: precision_at_3
+ value: 14.130999999999998
+ - type: precision_at_5
+ value: 10.544
+ - type: recall_at_1
+ value: 21.983
+ - type: recall_at_10
+ value: 62.775000000000006
+ - type: recall_at_100
+ value: 88.389
+ - type: recall_at_1000
+ value: 97.603
+ - type: recall_at_3
+ value: 40.878
+ - type: recall_at_5
+ value: 50.690000000000005
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/mtop_domain
+ name: MTEB MTOPDomainClassification (en)
+ config: en
+ split: test
+ revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
+ metrics:
+ - type: accuracy
+ value: 93.95120839033288
+ - type: f1
+ value: 93.73824125055208
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/mtop_intent
+ name: MTEB MTOPIntentClassification (en)
+ config: en
+ split: test
+ revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
+ metrics:
+ - type: accuracy
+ value: 76.78978568171455
+ - type: f1
+ value: 57.50180552858304
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/amazon_massive_intent
+ name: MTEB MassiveIntentClassification (en)
+ config: en
+ split: test
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
+ metrics:
+ - type: accuracy
+ value: 76.24411566913248
+ - type: f1
+ value: 74.37851403532832
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/amazon_massive_scenario
+ name: MTEB MassiveScenarioClassification (en)
+ config: en
+ split: test
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
+ metrics:
+ - type: accuracy
+ value: 79.94620040349699
+ - type: f1
+ value: 80.21293397970435
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/medrxiv-clustering-p2p
+ name: MTEB MedrxivClusteringP2P
+ config: default
+ split: test
+ revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
+ metrics:
+ - type: v_measure
+ value: 33.44403096245675
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/medrxiv-clustering-s2s
+ name: MTEB MedrxivClusteringS2S
+ config: default
+ split: test
+ revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
+ metrics:
+ - type: v_measure
+ value: 31.659594631336812
+ - task:
+ type: Reranking
+ dataset:
+ type: mteb/mind_small
+ name: MTEB MindSmallReranking
+ config: default
+ split: test
+ revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
+ metrics:
+ - type: map
+ value: 32.53833075108798
+ - type: mrr
+ value: 33.78840823218308
+ - task:
+ type: Retrieval
+ dataset:
+ type: nfcorpus
+ name: MTEB NFCorpus
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 7.185999999999999
+ - type: map_at_10
+ value: 15.193999999999999
+ - type: map_at_100
+ value: 19.538
+ - type: map_at_1000
+ value: 21.178
+ - type: map_at_3
+ value: 11.208
+ - type: map_at_5
+ value: 12.745999999999999
+ - type: mrr_at_1
+ value: 48.916
+ - type: mrr_at_10
+ value: 58.141
+ - type: mrr_at_100
+ value: 58.656
+ - type: mrr_at_1000
+ value: 58.684999999999995
+ - type: mrr_at_3
+ value: 55.521
+ - type: mrr_at_5
+ value: 57.239
+ - type: ndcg_at_1
+ value: 47.059
+ - type: ndcg_at_10
+ value: 38.644
+ - type: ndcg_at_100
+ value: 36.272999999999996
+ - type: ndcg_at_1000
+ value: 44.996
+ - type: ndcg_at_3
+ value: 43.293
+ - type: ndcg_at_5
+ value: 40.819
+ - type: precision_at_1
+ value: 48.916
+ - type: precision_at_10
+ value: 28.607
+ - type: precision_at_100
+ value: 9.195
+ - type: precision_at_1000
+ value: 2.225
+ - type: precision_at_3
+ value: 40.454
+ - type: precision_at_5
+ value: 34.985
+ - type: recall_at_1
+ value: 7.185999999999999
+ - type: recall_at_10
+ value: 19.654
+ - type: recall_at_100
+ value: 37.224000000000004
+ - type: recall_at_1000
+ value: 68.663
+ - type: recall_at_3
+ value: 12.158
+ - type: recall_at_5
+ value: 14.674999999999999
+ - task:
+ type: Retrieval
+ dataset:
+ type: nq
+ name: MTEB NQ
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 31.552000000000003
+ - type: map_at_10
+ value: 47.75
+ - type: map_at_100
+ value: 48.728
+ - type: map_at_1000
+ value: 48.754
+ - type: map_at_3
+ value: 43.156
+ - type: map_at_5
+ value: 45.883
+ - type: mrr_at_1
+ value: 35.66
+ - type: mrr_at_10
+ value: 50.269
+ - type: mrr_at_100
+ value: 50.974
+ - type: mrr_at_1000
+ value: 50.991
+ - type: mrr_at_3
+ value: 46.519
+ - type: mrr_at_5
+ value: 48.764
+ - type: ndcg_at_1
+ value: 35.632000000000005
+ - type: ndcg_at_10
+ value: 55.786
+ - type: ndcg_at_100
+ value: 59.748999999999995
+ - type: ndcg_at_1000
+ value: 60.339
+ - type: ndcg_at_3
+ value: 47.292
+ - type: ndcg_at_5
+ value: 51.766999999999996
+ - type: precision_at_1
+ value: 35.632000000000005
+ - type: precision_at_10
+ value: 9.267
+ - type: precision_at_100
+ value: 1.149
+ - type: precision_at_1000
+ value: 0.12
+ - type: precision_at_3
+ value: 21.601
+ - type: precision_at_5
+ value: 15.539
+ - type: recall_at_1
+ value: 31.552000000000003
+ - type: recall_at_10
+ value: 77.62400000000001
+ - type: recall_at_100
+ value: 94.527
+ - type: recall_at_1000
+ value: 98.919
+ - type: recall_at_3
+ value: 55.898
+ - type: recall_at_5
+ value: 66.121
+ - task:
+ type: Retrieval
+ dataset:
+ type: quora
+ name: MTEB QuoraRetrieval
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 71.414
+ - type: map_at_10
+ value: 85.37400000000001
+ - type: map_at_100
+ value: 86.01100000000001
+ - type: map_at_1000
+ value: 86.027
+ - type: map_at_3
+ value: 82.562
+ - type: map_at_5
+ value: 84.284
+ - type: mrr_at_1
+ value: 82.24000000000001
+ - type: mrr_at_10
+ value: 88.225
+ - type: mrr_at_100
+ value: 88.324
+ - type: mrr_at_1000
+ value: 88.325
+ - type: mrr_at_3
+ value: 87.348
+ - type: mrr_at_5
+ value: 87.938
+ - type: ndcg_at_1
+ value: 82.24000000000001
+ - type: ndcg_at_10
+ value: 88.97699999999999
+ - type: ndcg_at_100
+ value: 90.16
+ - type: ndcg_at_1000
+ value: 90.236
+ - type: ndcg_at_3
+ value: 86.371
+ - type: ndcg_at_5
+ value: 87.746
+ - type: precision_at_1
+ value: 82.24000000000001
+ - type: precision_at_10
+ value: 13.481000000000002
+ - type: precision_at_100
+ value: 1.534
+ - type: precision_at_1000
+ value: 0.157
+ - type: precision_at_3
+ value: 37.86
+ - type: precision_at_5
+ value: 24.738
+ - type: recall_at_1
+ value: 71.414
+ - type: recall_at_10
+ value: 95.735
+ - type: recall_at_100
+ value: 99.696
+ - type: recall_at_1000
+ value: 99.979
+ - type: recall_at_3
+ value: 88.105
+ - type: recall_at_5
+ value: 92.17999999999999
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/reddit-clustering
+ name: MTEB RedditClustering
+ config: default
+ split: test
+ revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
+ metrics:
+ - type: v_measure
+ value: 60.22146692057259
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/reddit-clustering-p2p
+ name: MTEB RedditClusteringP2P
+ config: default
+ split: test
+ revision: 282350215ef01743dc01b456c7f5241fa8937f16
+ metrics:
+ - type: v_measure
+ value: 65.29273320614578
+ - task:
+ type: Retrieval
+ dataset:
+ type: scidocs
+ name: MTEB SCIDOCS
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 5.023
+ - type: map_at_10
+ value: 14.161000000000001
+ - type: map_at_100
+ value: 16.68
+ - type: map_at_1000
+ value: 17.072000000000003
+ - type: map_at_3
+ value: 9.763
+ - type: map_at_5
+ value: 11.977
+ - type: mrr_at_1
+ value: 24.8
+ - type: mrr_at_10
+ value: 37.602999999999994
+ - type: mrr_at_100
+ value: 38.618
+ - type: mrr_at_1000
+ value: 38.659
+ - type: mrr_at_3
+ value: 34.117
+ - type: mrr_at_5
+ value: 36.082
+ - type: ndcg_at_1
+ value: 24.8
+ - type: ndcg_at_10
+ value: 23.316
+ - type: ndcg_at_100
+ value: 32.613
+ - type: ndcg_at_1000
+ value: 38.609
+ - type: ndcg_at_3
+ value: 21.697
+ - type: ndcg_at_5
+ value: 19.241
+ - type: precision_at_1
+ value: 24.8
+ - type: precision_at_10
+ value: 12.36
+ - type: precision_at_100
+ value: 2.593
+ - type: precision_at_1000
+ value: 0.402
+ - type: precision_at_3
+ value: 20.767
+ - type: precision_at_5
+ value: 17.34
+ - type: recall_at_1
+ value: 5.023
+ - type: recall_at_10
+ value: 25.069999999999997
+ - type: recall_at_100
+ value: 52.563
+ - type: recall_at_1000
+ value: 81.525
+ - type: recall_at_3
+ value: 12.613
+ - type: recall_at_5
+ value: 17.583
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sickr-sts
+ name: MTEB SICK-R
+ config: default
+ split: test
+ revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
+ metrics:
+ - type: cos_sim_pearson
+ value: 87.71506247604255
+ - type: cos_sim_spearman
+ value: 82.91813463738802
+ - type: euclidean_pearson
+ value: 85.5154616194479
+ - type: euclidean_spearman
+ value: 82.91815254466314
+ - type: manhattan_pearson
+ value: 85.5280917850374
+ - type: manhattan_spearman
+ value: 82.92276537286398
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts12-sts
+ name: MTEB STS12
+ config: default
+ split: test
+ revision: a0d554a64d88156834ff5ae9920b964011b16384
+ metrics:
+ - type: cos_sim_pearson
+ value: 87.43772054228462
+ - type: cos_sim_spearman
+ value: 78.75750601716682
+ - type: euclidean_pearson
+ value: 85.76074482955764
+ - type: euclidean_spearman
+ value: 78.75651057223058
+ - type: manhattan_pearson
+ value: 85.73390291701668
+ - type: manhattan_spearman
+ value: 78.72699385957797
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts13-sts
+ name: MTEB STS13
+ config: default
+ split: test
+ revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
+ metrics:
+ - type: cos_sim_pearson
+ value: 89.58144067172472
+ - type: cos_sim_spearman
+ value: 90.3524512966946
+ - type: euclidean_pearson
+ value: 89.71365391594237
+ - type: euclidean_spearman
+ value: 90.35239632843408
+ - type: manhattan_pearson
+ value: 89.66905421746478
+ - type: manhattan_spearman
+ value: 90.31508211683513
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts14-sts
+ name: MTEB STS14
+ config: default
+ split: test
+ revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
+ metrics:
+ - type: cos_sim_pearson
+ value: 87.77692637102102
+ - type: cos_sim_spearman
+ value: 85.45710562643485
+ - type: euclidean_pearson
+ value: 87.42456979928723
+ - type: euclidean_spearman
+ value: 85.45709386240908
+ - type: manhattan_pearson
+ value: 87.40754529526272
+ - type: manhattan_spearman
+ value: 85.44834854173303
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts15-sts
+ name: MTEB STS15
+ config: default
+ split: test
+ revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
+ metrics:
+ - type: cos_sim_pearson
+ value: 88.28491331695997
+ - type: cos_sim_spearman
+ value: 89.62037029566964
+ - type: euclidean_pearson
+ value: 89.02479391362826
+ - type: euclidean_spearman
+ value: 89.62036733618466
+ - type: manhattan_pearson
+ value: 89.00394756040342
+ - type: manhattan_spearman
+ value: 89.60867744215236
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts16-sts
+ name: MTEB STS16
+ config: default
+ split: test
+ revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
+ metrics:
+ - type: cos_sim_pearson
+ value: 85.08911381280191
+ - type: cos_sim_spearman
+ value: 86.5791780765767
+ - type: euclidean_pearson
+ value: 86.16063473577861
+ - type: euclidean_spearman
+ value: 86.57917745378766
+ - type: manhattan_pearson
+ value: 86.13677924604175
+ - type: manhattan_spearman
+ value: 86.56115615768685
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts17-crosslingual-sts
+ name: MTEB STS17 (en-en)
+ config: en-en
+ split: test
+ revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
+ metrics:
+ - type: cos_sim_pearson
+ value: 89.58029496205235
+ - type: cos_sim_spearman
+ value: 89.49551253826998
+ - type: euclidean_pearson
+ value: 90.13714840963748
+ - type: euclidean_spearman
+ value: 89.49551253826998
+ - type: manhattan_pearson
+ value: 90.13039633601363
+ - type: manhattan_spearman
+ value: 89.4513453745516
+ - task:
+ type: STS
+ dataset:
+ type: mteb/sts22-crosslingual-sts
+ name: MTEB STS22 (en)
+ config: en
+ split: test
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
+ metrics:
+ - type: cos_sim_pearson
+ value: 69.01546399666435
+ - type: cos_sim_spearman
+ value: 69.33824484595624
+ - type: euclidean_pearson
+ value: 70.76511642998874
+ - type: euclidean_spearman
+ value: 69.33824484595624
+ - type: manhattan_pearson
+ value: 70.84320785047453
+ - type: manhattan_spearman
+ value: 69.54233632223537
+ - task:
+ type: STS
+ dataset:
+ type: mteb/stsbenchmark-sts
+ name: MTEB STSBenchmark
+ config: default
+ split: test
+ revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
+ metrics:
+ - type: cos_sim_pearson
+ value: 87.26389196390119
+ - type: cos_sim_spearman
+ value: 89.09721478341385
+ - type: euclidean_pearson
+ value: 88.97208685922517
+ - type: euclidean_spearman
+ value: 89.09720927308881
+ - type: manhattan_pearson
+ value: 88.97513670502573
+ - type: manhattan_spearman
+ value: 89.07647853984004
+ - task:
+ type: Reranking
+ dataset:
+ type: mteb/scidocs-reranking
+ name: MTEB SciDocsRR
+ config: default
+ split: test
+ revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
+ metrics:
+ - type: map
+ value: 87.53075025771936
+ - type: mrr
+ value: 96.24327651288436
+ - task:
+ type: Retrieval
+ dataset:
+ type: scifact
+ name: MTEB SciFact
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 60.428000000000004
+ - type: map_at_10
+ value: 70.088
+ - type: map_at_100
+ value: 70.589
+ - type: map_at_1000
+ value: 70.614
+ - type: map_at_3
+ value: 67.191
+ - type: map_at_5
+ value: 68.515
+ - type: mrr_at_1
+ value: 63.333
+ - type: mrr_at_10
+ value: 71.13000000000001
+ - type: mrr_at_100
+ value: 71.545
+ - type: mrr_at_1000
+ value: 71.569
+ - type: mrr_at_3
+ value: 68.944
+ - type: mrr_at_5
+ value: 70.078
+ - type: ndcg_at_1
+ value: 63.333
+ - type: ndcg_at_10
+ value: 74.72800000000001
+ - type: ndcg_at_100
+ value: 76.64999999999999
+ - type: ndcg_at_1000
+ value: 77.176
+ - type: ndcg_at_3
+ value: 69.659
+ - type: ndcg_at_5
+ value: 71.626
+ - type: precision_at_1
+ value: 63.333
+ - type: precision_at_10
+ value: 10
+ - type: precision_at_100
+ value: 1.09
+ - type: precision_at_1000
+ value: 0.11299999999999999
+ - type: precision_at_3
+ value: 27.111
+ - type: precision_at_5
+ value: 17.666999999999998
+ - type: recall_at_1
+ value: 60.428000000000004
+ - type: recall_at_10
+ value: 87.98899999999999
+ - type: recall_at_100
+ value: 96.167
+ - type: recall_at_1000
+ value: 100
+ - type: recall_at_3
+ value: 74.006
+ - type: recall_at_5
+ value: 79.05
+ - task:
+ type: PairClassification
+ dataset:
+ type: mteb/sprintduplicatequestions-pairclassification
+ name: MTEB SprintDuplicateQuestions
+ config: default
+ split: test
+ revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
+ metrics:
+ - type: cos_sim_accuracy
+ value: 99.87326732673267
+ - type: cos_sim_ap
+ value: 96.81770773701805
+ - type: cos_sim_f1
+ value: 93.6318407960199
+ - type: cos_sim_precision
+ value: 93.16831683168317
+ - type: cos_sim_recall
+ value: 94.1
+ - type: dot_accuracy
+ value: 99.87326732673267
+ - type: dot_ap
+ value: 96.8174218946665
+ - type: dot_f1
+ value: 93.6318407960199
+ - type: dot_precision
+ value: 93.16831683168317
+ - type: dot_recall
+ value: 94.1
+ - type: euclidean_accuracy
+ value: 99.87326732673267
+ - type: euclidean_ap
+ value: 96.81770773701807
+ - type: euclidean_f1
+ value: 93.6318407960199
+ - type: euclidean_precision
+ value: 93.16831683168317
+ - type: euclidean_recall
+ value: 94.1
+ - type: manhattan_accuracy
+ value: 99.87227722772278
+ - type: manhattan_ap
+ value: 96.83164126821747
+ - type: manhattan_f1
+ value: 93.54677338669335
+ - type: manhattan_precision
+ value: 93.5935935935936
+ - type: manhattan_recall
+ value: 93.5
+ - type: max_accuracy
+ value: 99.87326732673267
+ - type: max_ap
+ value: 96.83164126821747
+ - type: max_f1
+ value: 93.6318407960199
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/stackexchange-clustering
+ name: MTEB StackExchangeClustering
+ config: default
+ split: test
+ revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
+ metrics:
+ - type: v_measure
+ value: 65.6212042420246
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/stackexchange-clustering-p2p
+ name: MTEB StackExchangeClusteringP2P
+ config: default
+ split: test
+ revision: 815ca46b2622cec33ccafc3735d572c266efdb44
+ metrics:
+ - type: v_measure
+ value: 35.779230635982564
+ - task:
+ type: Reranking
+ dataset:
+ type: mteb/stackoverflowdupquestions-reranking
+ name: MTEB StackOverflowDupQuestions
+ config: default
+ split: test
+ revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
+ metrics:
+ - type: map
+ value: 55.217701909036286
+ - type: mrr
+ value: 56.17658995416349
+ - task:
+ type: Summarization
+ dataset:
+ type: mteb/summeval
+ name: MTEB SummEval
+ config: default
+ split: test
+ revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
+ metrics:
+ - type: cos_sim_pearson
+ value: 30.954206018888453
+ - type: cos_sim_spearman
+ value: 32.71062599450096
+ - type: dot_pearson
+ value: 30.95420929056943
+ - type: dot_spearman
+ value: 32.71062599450096
+ - task:
+ type: Retrieval
+ dataset:
+ type: trec-covid
+ name: MTEB TRECCOVID
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 0.22699999999999998
+ - type: map_at_10
+ value: 1.924
+ - type: map_at_100
+ value: 10.525
+ - type: map_at_1000
+ value: 24.973
+ - type: map_at_3
+ value: 0.638
+ - type: map_at_5
+ value: 1.0659999999999998
+ - type: mrr_at_1
+ value: 84
+ - type: mrr_at_10
+ value: 91.067
+ - type: mrr_at_100
+ value: 91.067
+ - type: mrr_at_1000
+ value: 91.067
+ - type: mrr_at_3
+ value: 90.667
+ - type: mrr_at_5
+ value: 91.067
+ - type: ndcg_at_1
+ value: 81
+ - type: ndcg_at_10
+ value: 75.566
+ - type: ndcg_at_100
+ value: 56.387
+ - type: ndcg_at_1000
+ value: 49.834
+ - type: ndcg_at_3
+ value: 80.899
+ - type: ndcg_at_5
+ value: 80.75099999999999
+ - type: precision_at_1
+ value: 84
+ - type: precision_at_10
+ value: 79
+ - type: precision_at_100
+ value: 57.56
+ - type: precision_at_1000
+ value: 21.8
+ - type: precision_at_3
+ value: 84.667
+ - type: precision_at_5
+ value: 85.2
+ - type: recall_at_1
+ value: 0.22699999999999998
+ - type: recall_at_10
+ value: 2.136
+ - type: recall_at_100
+ value: 13.861
+ - type: recall_at_1000
+ value: 46.299
+ - type: recall_at_3
+ value: 0.6649999999999999
+ - type: recall_at_5
+ value: 1.145
+ - task:
+ type: Retrieval
+ dataset:
+ type: webis-touche2020
+ name: MTEB Touche2020
+ config: default
+ split: test
+ revision: None
+ metrics:
+ - type: map_at_1
+ value: 2.752
+ - type: map_at_10
+ value: 9.951
+ - type: map_at_100
+ value: 16.794999999999998
+ - type: map_at_1000
+ value: 18.251
+ - type: map_at_3
+ value: 5.288
+ - type: map_at_5
+ value: 6.954000000000001
+ - type: mrr_at_1
+ value: 38.775999999999996
+ - type: mrr_at_10
+ value: 50.458000000000006
+ - type: mrr_at_100
+ value: 51.324999999999996
+ - type: mrr_at_1000
+ value: 51.339999999999996
+ - type: mrr_at_3
+ value: 46.939
+ - type: mrr_at_5
+ value: 47.857
+ - type: ndcg_at_1
+ value: 36.735
+ - type: ndcg_at_10
+ value: 25.198999999999998
+ - type: ndcg_at_100
+ value: 37.938
+ - type: ndcg_at_1000
+ value: 49.145
+ - type: ndcg_at_3
+ value: 29.348000000000003
+ - type: ndcg_at_5
+ value: 25.804
+ - type: precision_at_1
+ value: 38.775999999999996
+ - type: precision_at_10
+ value: 22.041
+ - type: precision_at_100
+ value: 7.939
+ - type: precision_at_1000
+ value: 1.555
+ - type: precision_at_3
+ value: 29.932
+ - type: precision_at_5
+ value: 24.490000000000002
+ - type: recall_at_1
+ value: 2.752
+ - type: recall_at_10
+ value: 16.197
+ - type: recall_at_100
+ value: 49.166
+ - type: recall_at_1000
+ value: 84.18900000000001
+ - type: recall_at_3
+ value: 6.438000000000001
+ - type: recall_at_5
+ value: 9.093
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/toxic_conversations_50k
+ name: MTEB ToxicConversationsClassification
+ config: default
+ split: test
+ revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
+ metrics:
+ - type: accuracy
+ value: 71.47980000000001
+ - type: ap
+ value: 14.605194452178754
+ - type: f1
+ value: 55.07362924988948
+ - task:
+ type: Classification
+ dataset:
+ type: mteb/tweet_sentiment_extraction
+ name: MTEB TweetSentimentExtractionClassification
+ config: default
+ split: test
+ revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
+ metrics:
+ - type: accuracy
+ value: 59.708545557441994
+ - type: f1
+ value: 60.04751270975683
+ - task:
+ type: Clustering
+ dataset:
+ type: mteb/twentynewsgroups-clustering
+ name: MTEB TwentyNewsgroupsClustering
+ config: default
+ split: test
+ revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
+ metrics:
+ - type: v_measure
+ value: 53.21105960597211
+ - task:
+ type: PairClassification
+ dataset:
+ type: mteb/twittersemeval2015-pairclassification
+ name: MTEB TwitterSemEval2015
+ config: default
+ split: test
+ revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
+ metrics:
+ - type: cos_sim_accuracy
+ value: 87.58419264469214
+ - type: cos_sim_ap
+ value: 78.55300004517404
+ - type: cos_sim_f1
+ value: 71.49673530889001
+ - type: cos_sim_precision
+ value: 68.20795400095831
+ - type: cos_sim_recall
+ value: 75.11873350923483
+ - type: dot_accuracy
+ value: 87.58419264469214
+ - type: dot_ap
+ value: 78.55297659559511
+ - type: dot_f1
+ value: 71.49673530889001
+ - type: dot_precision
+ value: 68.20795400095831
+ - type: dot_recall
+ value: 75.11873350923483
+ - type: euclidean_accuracy
+ value: 87.58419264469214
+ - type: euclidean_ap
+ value: 78.55300477331477
+ - type: euclidean_f1
+ value: 71.49673530889001
+ - type: euclidean_precision
+ value: 68.20795400095831
+ - type: euclidean_recall
+ value: 75.11873350923483
+ - type: manhattan_accuracy
+ value: 87.5663110210407
+ - type: manhattan_ap
+ value: 78.49982050876562
+ - type: manhattan_f1
+ value: 71.35488740722104
+ - type: manhattan_precision
+ value: 68.18946862226497
+ - type: manhattan_recall
+ value: 74.82849604221636
+ - type: max_accuracy
+ value: 87.58419264469214
+ - type: max_ap
+ value: 78.55300477331477
+ - type: max_f1
+ value: 71.49673530889001
+ - task:
+ type: PairClassification
+ dataset:
+ type: mteb/twitterurlcorpus-pairclassification
+ name: MTEB TwitterURLCorpus
+ config: default
+ split: test
+ revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
+ metrics:
+ - type: cos_sim_accuracy
+ value: 89.09069740365584
+ - type: cos_sim_ap
+ value: 86.22749303724757
+ - type: cos_sim_f1
+ value: 78.36863452005407
+ - type: cos_sim_precision
+ value: 76.49560117302053
+ - type: cos_sim_recall
+ value: 80.33569448721897
+ - type: dot_accuracy
+ value: 89.09069740365584
+ - type: dot_ap
+ value: 86.22750233655673
+ - type: dot_f1
+ value: 78.36863452005407
+ - type: dot_precision
+ value: 76.49560117302053
+ - type: dot_recall
+ value: 80.33569448721897
+ - type: euclidean_accuracy
+ value: 89.09069740365584
+ - type: euclidean_ap
+ value: 86.22749355597347
+ - type: euclidean_f1
+ value: 78.36863452005407
+ - type: euclidean_precision
+ value: 76.49560117302053
+ - type: euclidean_recall
+ value: 80.33569448721897
+ - type: manhattan_accuracy
+ value: 89.08293553770326
+ - type: manhattan_ap
+ value: 86.21913616084771
+ - type: manhattan_f1
+ value: 78.3907031479847
+ - type: manhattan_precision
+ value: 75.0352013517319
+ - type: manhattan_recall
+ value: 82.06036341238065
+ - type: max_accuracy
+ value: 89.09069740365584
+ - type: max_ap
+ value: 86.22750233655673
+ - type: max_f1
+ value: 78.3907031479847
+license: apache-2.0
+language:
+- en
+library_name: sentence-transformers
+pipeline_tag: feature-extraction
+---
+
+
+
+
+ +
+ ++The crispy sentence embedding family from Mixedbread. +
+# mxbai-embed-large-v1 + +Here, we provide several ways to produce sentence embeddings. Please note that you have to provide the prompt `Represent this sentence for searching relevant passages:` for the query if you want to use it for retrieval. Besides that, you don't need any prompt. Our model also supports [Matryoshka Representation Learning and binary quantization](https://www.mixedbread.ai/blog/binary-mrl). + +## Quickstart + +Here, we provide several ways to produce sentence embeddings. Please note that you have to provide the prompt `Represent this sentence for searching relevant passages:` for the query if you want to use it for retrieval. Besides that, you don't need any prompt. + +### sentence-transformers + +``` +python -m pip install -U sentence-transformers +``` + +```python +from sentence_transformers import SentenceTransformer +from sentence_transformers.util import cos_sim +from sentence_transformers.quantization import quantize_embeddings + +# 1. Specify preferred dimensions +dimensions = 512 + +# 2. load model +model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions) + +# For retrieval you need to pass this prompt. +query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread' + +docs = [ + query, + "A man is eating food.", + "A man is eating pasta.", + "The girl is carrying a baby.", + "A man is riding a horse.", +] + +# 3. Encode +embeddings = model.encode(docs) + +# Optional: Quantize the embeddings +binary_embeddings = quantize_embeddings(embeddings, precision="ubinary") + +similarities = cos_sim(embeddings[0], embeddings[1:]) +print('similarities:', similarities) + + +``` +### Transformers + +```python +from typing import Dict + +import torch +import numpy as np +from transformers import AutoModel, AutoTokenizer +from sentence_transformers.util import cos_sim + +# For retrieval you need to pass this prompt. Please find out more in our blog post. 
+def transform_query(query: str) -> str: + """ For retrieval, add the prompt for query (not for documents). + """ + return f'Represent this sentence for searching relevant passages: {query}' + +# The model works really well with cls pooling (default) but also with mean pooling. +def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray: + if strategy == 'cls': + outputs = outputs[:, 0] + elif strategy == 'mean': + outputs = torch.sum( + outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True) + else: + raise NotImplementedError + return outputs.detach().cpu().numpy() + +# 1. load model +model_id = 'mixedbread-ai/mxbai-embed-large-v1' +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModel.from_pretrained(model_id).cuda() + + +docs = [ + transform_query('A man is eating a piece of bread'), + "A man is eating food.", + "A man is eating pasta.", + "The girl is carrying a baby.", + "A man is riding a horse.", +] + +# 2. 
encode +inputs = tokenizer(docs, padding=True, return_tensors='pt') +for k, v in inputs.items(): + inputs[k] = v.cuda() +outputs = model(**inputs).last_hidden_state +embeddings = pooling(outputs, inputs, 'cls') + +similarities = cos_sim(embeddings[0], embeddings[1:]) +print('similarities:', similarities) +``` + +### Transformers.js + +If you haven't already, you can install the [Transformers.js](https://huggingface.co./docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@xenova/transformers) using: +```bash +npm i @xenova/transformers +``` + +You can then use the model to compute embeddings like this: + +```js +import { pipeline, cos_sim } from '@xenova/transformers'; + +// Create a feature extraction pipeline +const extractor = await pipeline('feature-extraction', 'mixedbread-ai/mxbai-embed-large-v1', { + quantized: false, // Comment out this line to use the quantized version +}); + +// Generate sentence embeddings +const docs = [ + 'Represent this sentence for searching relevant passages: A man is eating a piece of bread', + 'A man is eating food.', + 'A man is eating pasta.', + 'The girl is carrying a baby.', + 'A man is riding a horse.', +] +const output = await extractor(docs, { pooling: 'cls' }); + +// Compute similarity scores +const [source_embeddings, ...document_embeddings ] = output.tolist(); +const similarities = document_embeddings.map(x => cos_sim(source_embeddings, x)); +console.log(similarities); // [0.7919578577247139, 0.6369278664248345, 0.16512018371357193, 0.3620778366720027] +``` + +### Using API + +You can use the model via our API as follows: + +```python +from mixedbread_ai.client import MixedbreadAI, EncodingFormat +from sklearn.metrics.pairwise import cosine_similarity +import os + +mxbai = MixedbreadAI(api_key="{MIXEDBREAD_API_KEY}") + +english_sentences = [ + 'What is the capital of Australia?', + 'Canberra is the capital of Australia.' 
+] + +res = mxbai.embeddings( + input=english_sentences, + model="mixedbread-ai/mxbai-embed-large-v1", + normalized=True, + encoding_format=[EncodingFormat.FLOAT, EncodingFormat.UBINARY, EncodingFormat.INT_8], + dimensions=512 +) + +encoded_embeddings = res.data[0].embedding +print(res.dimensions, encoded_embeddings.ubinary, encoded_embeddings.float_, encoded_embeddings.int_8) +``` + +The API comes with native int8 and binary quantization support! Check out the [docs](https://mixedbread.ai/docs) for more information. + +## Evaluation +As of March 2024, our model achieves SOTA performance for BERT-large sized models on the [MTEB](https://huggingface.co./spaces/mteb/leaderboard). It outperforms commercial models like OpenAI's text-embedding-3-large and matches the performance of models 20x its size like the [echo-mistral-7b](https://huggingface.co./jspringer/echo-mistral-7b-instruct-lasttoken). Our model was trained with no overlap of the MTEB data, which indicates that our model generalizes well across several domains, tasks and text lengths. We know there are some limitations with this model, which will be fixed in v2. 
+ + +| Model | Avg (56 datasets) | Classification (12 datasets) | Clustering (11 datasets) | PairClassification (3 datasets) | Reranking (4 datasets) | Retrieval (15 datasets) | STS (10 datasets) | Summarization (1 dataset) | +| --------------------------------------------------------------------------------------------- | ----------------- | ---------------------------- | ------------------------ | ------------------------------- | ---------------------- | ----------------------- | ----------------- | ------------------------- | +| **mxbai-embed-large-v1** | **64.68** | 75.64 | 46.71 | 87.2 | 60.11 | 54.39 | 85.00 | 32.71 | +| [bge-large-en-v1.5](https://huggingface.co./BAAI/bge-large-en-v1.5) | 64.23 | 75.97 | 46.08 | 87.12 | 60.03 | 54.29 | 83.11 | 31.61 | +| [mxbai-embed-2d-large-v1](https://huggingface.co./mixedbread-ai/mxbai-embed-2d-large-v1) | 63.25 | 74.14 | 46.07 | 85.89 | 58.94 | 51.42 | 84.9 | 31.55 | +| [nomic-embed-text-v1](https://huggingface.co./nomic-ai/nomic-embed-text-v1) | 62.39 | 74.12 | 43.91 | 85.15 | 55.69 | 52.81 | 82.06 | 30.08 | +| [jina-embeddings-v2-base-en](https://huggingface.co./jinaai/jina-embeddings-v2-base-en) | 60.38 | 73.45 | 41.73 | 85.38 | 56.98 | 47.87 | 80.7 | 31.6 | +| *Proprietary Models* | | | | | | | | | +| [OpenAI text-embedding-3-large](https://openai.com/blog/new-embedding-models-and-api-updates) | 64.58 | 75.45 | 49.01 | 85.72 | 59.16 | 55.44 | 81.73 | 29.92 | +| [Cohere embed-english-v3.0](https://txt.cohere.com/introducing-embed-v3/) | 64.47 | 76.49 | 47.43 | 85.84 | 58.01 | 55.00 | 82.62 | 30.18 | +| [OpenAI text-embedding-ada-002](https://openai.com/blog/new-and-improved-embedding-model) | 60.99 | 70.93 | 45.90 | 84.89 | 56.32 | 49.25 | 80.97 | 30.80 | + + +Please find more information in our [blog post](https://mixedbread.ai/blog/mxbai-embed-large-v1). + +## Matryoshka and Binary Quantization + +Embeddings in their commonly used form (float arrays) have a high memory footprint when used at scale. 
Two approaches to solve this problem are Matryoshka Representation Learning (MRL) and (Binary) Quantization. While MRL reduces the number of dimensions of an embedding, binary quantization transforms the value of each dimension from a float32 into a lower precision (int8 or even binary). The model supports both approaches! + +You can also take it one step further, and combine both MRL and quantization. This combination of binary quantization and MRL allows you to reduce the memory usage of your embeddings significantly. This leads to much lower costs when using a vector database in particular. You can read more about the technology and its advantages in our [blog post](https://www.mixedbread.ai/blog/binary-mrl). + +## Community +Please join our [Discord Community](https://discord.gg/jDfMHzAVfU) and share your feedback and thoughts! We are here to help and also always happy to chat. + +## License +Apache 2.0 + +## Citation + +```bibtex +@online{emb2024mxbai, + title={Open Source Strikes Bread - New Fluffy Embeddings Model}, + author={Sean Lee and Aamir Shakir and Darius Koenig and Julius Lipp}, + year={2024}, + url={https://www.mixedbread.ai/blog/mxbai-embed-large-v1}, +} + +@article{li2023angle, + title={AnglE-optimized Text Embeddings}, + author={Li, Xianming and Li, Jing}, + journal={arXiv preprint arXiv:2309.12871}, + year={2023} +} +```