akashmaggon committed
Commit 0246a62
Parent: d12e42f

Add new SentenceTransformer model.

1_Pooling/config.json ADDED

{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": true,
  "pooling_mode_mean_tokens": false,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}

README.md ADDED

---
base_model: BAAI/bge-base-en-v1.5
datasets: []
language:
- en
library_name: sentence-transformers
license: apache-2.0
metrics:
- cosine_accuracy@1
- cosine_accuracy@3
- cosine_accuracy@5
- cosine_accuracy@10
- cosine_precision@1
- cosine_precision@3
- cosine_precision@5
- cosine_precision@10
- cosine_recall@1
- cosine_recall@3
- cosine_recall@5
- cosine_recall@10
- cosine_ndcg@10
- cosine_mrr@10
- cosine_map@100
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:6300
- loss:MatryoshkaLoss
- loss:MultipleNegativesRankingLoss
widget:
- source_sentence: The U.S. International Trade Commission (ITC) has become a significant forum to litigate intellectual property disputes. An adverse result in an ITC action can lead to a prohibition on importing infringing products, which, given the importance of the U.S. market, could significantly impact a company including preventing the importation of many important products or necessitating workarounds that may limit certain features of their products.
  sentences:
  - What was the overall impact of foreign currencies on net sales in 2023?
  - What potential consequences could result from intellectual property disputes in the U.S. International Trade Commission for the company?
  - What was the total purchase consideration for the VMware acquisition?
- source_sentence: Reinsurance contracts are normally classified as treaty or facultative contracts. Treaty reinsurance refers to reinsurance coverage for all or a portion of a specified group or class of risks ceded by a direct insurer or reinsurer, while facultative reinsurance involves coverage of specific individual underlying risks. Reinsurance contracts are further classified as quota-share or excess.
  sentences:
  - What type of information will you find under 'Note 13 — Commitments and Contingencies' in an Annual Report on Form 10-K?
  - What type of reinsurance contracts are offered by Berkshire Hathaway Reinsurance Group?
  - What are the consequences for a company violating anti-bribery laws in the U.S.?
- source_sentence: Commitments and contingencies related to legal proceedings are detailed in Part II, Item 8, under 'Financial Statements and Supplementary Data – Note 14'.
  sentences:
  - Where can one find commitments and contingencies related to legal proceedings in the context provided?
  - What is discussed in Item 3. Legal Proceedings of a company's report?
  - How are net realized capital gains and losses treated in the financial statements according to the Company?
- source_sentence: The “Glossary of Terms and Acronyms” is included on pages 315-321 in the set of financial documents.
  sentences:
  - What are the principles used in preparing the discussed financial statements?
  - What is the total remaining budget for future common stock repurchases under the company's stock repurchase programs as of December 31, 2023?
  - Where is the “Glossary of Terms and Acronyms” located in a set of financial documents?
- source_sentence: The table presents our market risk by asset category for positions accounted for at fair value or accounted for at the lower of cost or fair value, that are not included in VaR. As of December 2023, equity was at $1,562 million and debt was at $2,446 million.
  sentences:
  - What are the market risk values for Goldman Sachs' equity and debt positions not included in VaR as of December 2023?
  - What was the conclusion of the Company's review regarding the impact of the American Rescue Plan, the Consolidated Appropriations Act, 2021, and related tax provisions on its business for the fiscal year ended June 30, 2023?
  - How much did the company's finance lease obligations total as of December 31, 2023?
model-index:
- name: BGE base Financial Matryoshka
  results:
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: dim 768
      type: dim_768
    metrics:
    - type: cosine_accuracy@1
      value: 0.6957142857142857
      name: Cosine Accuracy@1
    - type: cosine_accuracy@3
      value: 0.8371428571428572
      name: Cosine Accuracy@3
    - type: cosine_accuracy@5
      value: 0.8714285714285714
      name: Cosine Accuracy@5
    - type: cosine_accuracy@10
      value: 0.9242857142857143
      name: Cosine Accuracy@10
    - type: cosine_precision@1
      value: 0.6957142857142857
      name: Cosine Precision@1
    - type: cosine_precision@3
      value: 0.27904761904761904
      name: Cosine Precision@3
    - type: cosine_precision@5
      value: 0.17428571428571424
      name: Cosine Precision@5
    - type: cosine_precision@10
      value: 0.09242857142857142
      name: Cosine Precision@10
    - type: cosine_recall@1
      value: 0.6957142857142857
      name: Cosine Recall@1
    - type: cosine_recall@3
      value: 0.8371428571428572
      name: Cosine Recall@3
    - type: cosine_recall@5
      value: 0.8714285714285714
      name: Cosine Recall@5
    - type: cosine_recall@10
      value: 0.9242857142857143
      name: Cosine Recall@10
    - type: cosine_ndcg@10
      value: 0.8105294489003092
      name: Cosine Ndcg@10
    - type: cosine_mrr@10
      value: 0.7741910430839002
      name: Cosine Mrr@10
    - type: cosine_map@100
      value: 0.7773317927980538
      name: Cosine Map@100
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: dim 512
      type: dim_512
    metrics:
    - type: cosine_accuracy@1
      value: 0.7
      name: Cosine Accuracy@1
    - type: cosine_accuracy@3
      value: 0.8285714285714286
      name: Cosine Accuracy@3
    - type: cosine_accuracy@5
      value: 0.8671428571428571
      name: Cosine Accuracy@5
    - type: cosine_accuracy@10
      value: 0.9185714285714286
      name: Cosine Accuracy@10
    - type: cosine_precision@1
      value: 0.7
      name: Cosine Precision@1
    - type: cosine_precision@3
      value: 0.27619047619047615
      name: Cosine Precision@3
    - type: cosine_precision@5
      value: 0.1734285714285714
      name: Cosine Precision@5
    - type: cosine_precision@10
      value: 0.09185714285714283
      name: Cosine Precision@10
    - type: cosine_recall@1
      value: 0.7
      name: Cosine Recall@1
    - type: cosine_recall@3
      value: 0.8285714285714286
      name: Cosine Recall@3
    - type: cosine_recall@5
      value: 0.8671428571428571
      name: Cosine Recall@5
    - type: cosine_recall@10
      value: 0.9185714285714286
      name: Cosine Recall@10
    - type: cosine_ndcg@10
      value: 0.8090367290103152
      name: Cosine Ndcg@10
    - type: cosine_mrr@10
      value: 0.7740351473922898
      name: Cosine Mrr@10
    - type: cosine_map@100
      value: 0.7776494145961331
      name: Cosine Map@100
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: dim 256
      type: dim_256
    metrics:
    - type: cosine_accuracy@1
      value: 0.6928571428571428
      name: Cosine Accuracy@1
    - type: cosine_accuracy@3
      value: 0.8185714285714286
      name: Cosine Accuracy@3
    - type: cosine_accuracy@5
      value: 0.8585714285714285
      name: Cosine Accuracy@5
    - type: cosine_accuracy@10
      value: 0.91
      name: Cosine Accuracy@10
    - type: cosine_precision@1
      value: 0.6928571428571428
      name: Cosine Precision@1
    - type: cosine_precision@3
      value: 0.27285714285714285
      name: Cosine Precision@3
    - type: cosine_precision@5
      value: 0.17171428571428568
      name: Cosine Precision@5
    - type: cosine_precision@10
      value: 0.09099999999999998
      name: Cosine Precision@10
    - type: cosine_recall@1
      value: 0.6928571428571428
      name: Cosine Recall@1
    - type: cosine_recall@3
      value: 0.8185714285714286
      name: Cosine Recall@3
    - type: cosine_recall@5
      value: 0.8585714285714285
      name: Cosine Recall@5
    - type: cosine_recall@10
      value: 0.91
      name: Cosine Recall@10
    - type: cosine_ndcg@10
      value: 0.8016663265681359
      name: Cosine Ndcg@10
    - type: cosine_mrr@10
      value: 0.7669977324263035
      name: Cosine Mrr@10
    - type: cosine_map@100
      value: 0.7711841838569463
      name: Cosine Map@100
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: dim 128
      type: dim_128
    metrics:
    - type: cosine_accuracy@1
      value: 0.6871428571428572
      name: Cosine Accuracy@1
    - type: cosine_accuracy@3
      value: 0.8071428571428572
      name: Cosine Accuracy@3
    - type: cosine_accuracy@5
      value: 0.8585714285714285
      name: Cosine Accuracy@5
    - type: cosine_accuracy@10
      value: 0.8985714285714286
      name: Cosine Accuracy@10
    - type: cosine_precision@1
      value: 0.6871428571428572
      name: Cosine Precision@1
    - type: cosine_precision@3
      value: 0.26904761904761904
      name: Cosine Precision@3
    - type: cosine_precision@5
      value: 0.1717142857142857
      name: Cosine Precision@5
    - type: cosine_precision@10
      value: 0.08985714285714283
      name: Cosine Precision@10
    - type: cosine_recall@1
      value: 0.6871428571428572
      name: Cosine Recall@1
    - type: cosine_recall@3
      value: 0.8071428571428572
      name: Cosine Recall@3
    - type: cosine_recall@5
      value: 0.8585714285714285
      name: Cosine Recall@5
    - type: cosine_recall@10
      value: 0.8985714285714286
      name: Cosine Recall@10
    - type: cosine_ndcg@10
      value: 0.7921056491431833
      name: Cosine Ndcg@10
    - type: cosine_mrr@10
      value: 0.7580946712018135
      name: Cosine Mrr@10
    - type: cosine_map@100
      value: 0.7627063166788922
      name: Cosine Map@100
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: dim 64
      type: dim_64
    metrics:
    - type: cosine_accuracy@1
      value: 0.6642857142857143
      name: Cosine Accuracy@1
    - type: cosine_accuracy@3
      value: 0.7842857142857143
      name: Cosine Accuracy@3
    - type: cosine_accuracy@5
      value: 0.8257142857142857
      name: Cosine Accuracy@5
    - type: cosine_accuracy@10
      value: 0.8728571428571429
      name: Cosine Accuracy@10
    - type: cosine_precision@1
      value: 0.6642857142857143
      name: Cosine Precision@1
    - type: cosine_precision@3
      value: 0.26142857142857145
      name: Cosine Precision@3
    - type: cosine_precision@5
      value: 0.16514285714285715
      name: Cosine Precision@5
    - type: cosine_precision@10
      value: 0.08728571428571427
      name: Cosine Precision@10
    - type: cosine_recall@1
      value: 0.6642857142857143
      name: Cosine Recall@1
    - type: cosine_recall@3
      value: 0.7842857142857143
      name: Cosine Recall@3
    - type: cosine_recall@5
      value: 0.8257142857142857
      name: Cosine Recall@5
    - type: cosine_recall@10
      value: 0.8728571428571429
      name: Cosine Recall@10
    - type: cosine_ndcg@10
      value: 0.7689727571743198
      name: Cosine Ndcg@10
    - type: cosine_mrr@10
      value: 0.7358214285714282
      name: Cosine Mrr@10
    - type: cosine_map@100
      value: 0.7406658506857838
      name: Cosine Map@100
---

# BGE base Financial Matryoshka

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) <!-- at revision a5beb1e3e68b9ab74eb54cfd186867f64f240e1a -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 768 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
- **Language:** en
- **License:** apache-2.0

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
```

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("akashmaggon/bge-base-financial-matryoshka")
# Run inference
sentences = [
    'The table presents our market risk by asset category for positions accounted for at fair value or accounted for at the lower of cost or fair value, that are not included in VaR. As of December 2023, equity was at $1,562 million and debt was at $2,446 million.',
    "What are the market risk values for Goldman Sachs' equity and debt positions not included in VaR as of December 2023?",
    "What was the conclusion of the Company's review regarding the impact of the American Rescue Plan, the Consolidated Appropriations Act, 2021, and related tax provisions on its business for the fiscal year ended June 30, 2023?",
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
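
Because the model was trained with `MatryoshkaLoss`, the leading dimensions of each embedding carry most of the information, so vectors can be truncated for cheaper storage and faster search. A minimal sketch, assuming Sentence Transformers >= 2.7 (which added the `truncate_dim` argument); the example texts are taken from the training samples shown further below:

```python
from sentence_transformers import SentenceTransformer

# Load the model so that encode() keeps only the first 256 dimensions.
# Dimensions this model was trained to support: 768, 512, 256, 128, 64.
model = SentenceTransformer("akashmaggon/bge-base-financial-matryoshka", truncate_dim=256)

embeddings = model.encode([
    "What was the total net earnings for Johnson & Johnson in 2023?",
    "Johnson & Johnson's consolidated statements of earnings for 2023 reported total net earnings of $35,153 million.",
])
print(embeddings.shape)
# (2, 256)

# Truncated vectors are no longer unit-length, but the default cosine
# similarity re-normalizes internally, so ranking still behaves as expected.
print(model.similarity(embeddings, embeddings))
```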

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

## Evaluation

### Metrics

#### Information Retrieval
* Dataset: `dim_768`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric              | Value      |
|:--------------------|:-----------|
| cosine_accuracy@1   | 0.6957     |
| cosine_accuracy@3   | 0.8371     |
| cosine_accuracy@5   | 0.8714     |
| cosine_accuracy@10  | 0.9243     |
| cosine_precision@1  | 0.6957     |
| cosine_precision@3  | 0.279      |
| cosine_precision@5  | 0.1743     |
| cosine_precision@10 | 0.0924     |
| cosine_recall@1     | 0.6957     |
| cosine_recall@3     | 0.8371     |
| cosine_recall@5     | 0.8714     |
| cosine_recall@10    | 0.9243     |
| cosine_ndcg@10      | 0.8105     |
| cosine_mrr@10       | 0.7742     |
| **cosine_map@100**  | **0.7773** |

#### Information Retrieval
* Dataset: `dim_512`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric              | Value      |
|:--------------------|:-----------|
| cosine_accuracy@1   | 0.7        |
| cosine_accuracy@3   | 0.8286     |
| cosine_accuracy@5   | 0.8671     |
| cosine_accuracy@10  | 0.9186     |
| cosine_precision@1  | 0.7        |
| cosine_precision@3  | 0.2762     |
| cosine_precision@5  | 0.1734     |
| cosine_precision@10 | 0.0919     |
| cosine_recall@1     | 0.7        |
| cosine_recall@3     | 0.8286     |
| cosine_recall@5     | 0.8671     |
| cosine_recall@10    | 0.9186     |
| cosine_ndcg@10      | 0.809      |
| cosine_mrr@10       | 0.774      |
| **cosine_map@100**  | **0.7776** |

#### Information Retrieval
* Dataset: `dim_256`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric              | Value      |
|:--------------------|:-----------|
| cosine_accuracy@1   | 0.6929     |
| cosine_accuracy@3   | 0.8186     |
| cosine_accuracy@5   | 0.8586     |
| cosine_accuracy@10  | 0.91       |
| cosine_precision@1  | 0.6929     |
| cosine_precision@3  | 0.2729     |
| cosine_precision@5  | 0.1717     |
| cosine_precision@10 | 0.091      |
| cosine_recall@1     | 0.6929     |
| cosine_recall@3     | 0.8186     |
| cosine_recall@5     | 0.8586     |
| cosine_recall@10    | 0.91       |
| cosine_ndcg@10      | 0.8017     |
| cosine_mrr@10       | 0.767      |
| **cosine_map@100**  | **0.7712** |

#### Information Retrieval
* Dataset: `dim_128`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric              | Value      |
|:--------------------|:-----------|
| cosine_accuracy@1   | 0.6871     |
| cosine_accuracy@3   | 0.8071     |
| cosine_accuracy@5   | 0.8586     |
| cosine_accuracy@10  | 0.8986     |
| cosine_precision@1  | 0.6871     |
| cosine_precision@3  | 0.269      |
| cosine_precision@5  | 0.1717     |
| cosine_precision@10 | 0.0899     |
| cosine_recall@1     | 0.6871     |
| cosine_recall@3     | 0.8071     |
| cosine_recall@5     | 0.8586     |
| cosine_recall@10    | 0.8986     |
| cosine_ndcg@10      | 0.7921     |
| cosine_mrr@10       | 0.7581     |
| **cosine_map@100**  | **0.7627** |

#### Information Retrieval
* Dataset: `dim_64`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric              | Value      |
|:--------------------|:-----------|
| cosine_accuracy@1   | 0.6643     |
| cosine_accuracy@3   | 0.7843     |
| cosine_accuracy@5   | 0.8257     |
| cosine_accuracy@10  | 0.8729     |
| cosine_precision@1  | 0.6643     |
| cosine_precision@3  | 0.2614     |
| cosine_precision@5  | 0.1651     |
| cosine_precision@10 | 0.0873     |
| cosine_recall@1     | 0.6643     |
| cosine_recall@3     | 0.7843     |
| cosine_recall@5     | 0.8257     |
| cosine_recall@10    | 0.8729     |
| cosine_ndcg@10      | 0.769      |
| cosine_mrr@10       | 0.7358     |
| **cosine_map@100**  | **0.7407** |

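All five reports come from the same retrieval test split, re-encoded at each Matryoshka dimension. A sketch of how such numbers can be reproduced with `InformationRetrievalEvaluator`; the toy query/corpus dictionaries and the `truncate_dim` loop here are illustrative stand-ins, not the exact evaluation script:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Illustrative stand-ins: queries are "anchor" questions, the corpus holds
# "positive" passages, and relevant_docs maps query IDs to relevant corpus IDs.
queries = {"q1": "What was the total net earnings for Johnson & Johnson in 2023?"}
corpus = {"d1": "Johnson & Johnson's consolidated statements of earnings for 2023 reported total net earnings of $35,153 million."}
relevant_docs = {"q1": {"d1"}}

for dim in (768, 512, 256, 128, 64):
    model = SentenceTransformer("akashmaggon/bge-base-financial-matryoshka", truncate_dim=dim)
    evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
    )
    print(dim, evaluator(model))  # accuracy@k, precision/recall@k, ndcg@10, mrr@10, map@100
```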

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 6,300 training samples
* Columns: <code>positive</code> and <code>anchor</code>
* Approximate statistics based on the first 1000 samples:
  |         | positive                                                                            | anchor                                                                             |
  |:--------|:------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
  | type    | string                                                                              | string                                                                             |
  | details | <ul><li>min: 7 tokens</li><li>mean: 44.39 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 20.64 tokens</li><li>max: 51 tokens</li></ul> |
* Samples:
  | positive                                                                                                                                      | anchor                                                                                                                      |
  |:----------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------|
  | <code>Johnson & Johnson reported cash and cash equivalents of $21,859 million as of the end of 2023.</code>                                  | <code>What was the amount of cash and cash equivalents reported by Johnson & Johnson at the end of 2023?</code>            |
  | <code>Johnson & Johnson's consolidated statements of earnings for 2023 reported total net earnings of $35,153 million.</code>                | <code>What was the total net earnings for Johnson & Johnson in 2023?</code>                                                |
  | <code>As of December 31, 2023, short-term investments were valued at $236,118 thousand and long-term investments at $86,676 thousand.</code> | <code>What is the total value of short-term and long-term investments held by the company as of December 31, 2023?</code> |
* Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters:
  ```json
  {
      "loss": "MultipleNegativesRankingLoss",
      "matryoshka_dims": [
          768,
          512,
          256,
          128,
          64
      ],
      "matryoshka_weights": [
          1,
          1,
          1,
          1,
          1
      ],
      "n_dims_per_step": -1
  }
  ```
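
This configuration corresponds to wrapping `MultipleNegativesRankingLoss` (in-batch negatives over the anchor/positive pairs) in `MatryoshkaLoss`, which applies the inner loss at every listed dimension with equal weight. A minimal sketch of the equivalent construction:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Ranks each anchor's true positive above all other in-batch passages.
inner_loss = MultipleNegativesRankingLoss(model)

# Applies inner_loss to embeddings truncated to each dimension, all weighted 1,
# so the leading 64/128/256/512 dimensions stay useful on their own.
loss = MatryoshkaLoss(model, inner_loss, matryoshka_dims=[768, 512, 256, 128, 64])
```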

### Training Hyperparameters
#### Non-Default Hyperparameters

- `eval_strategy`: epoch
- `per_device_train_batch_size`: 32
- `per_device_eval_batch_size`: 16
- `gradient_accumulation_steps`: 16
- `learning_rate`: 2e-05
- `num_train_epochs`: 4
- `lr_scheduler_type`: cosine
- `warmup_ratio`: 0.1
- `bf16`: True
- `load_best_model_at_end`: True
- `optim`: adamw_torch_fused
- `batch_sampler`: no_duplicates

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: epoch
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 32
- `per_device_eval_batch_size`: 16
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 16
- `eval_accumulation_steps`: None
- `learning_rate`: 2e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 4
- `max_steps`: -1
- `lr_scheduler_type`: cosine
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.1
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: True
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: True
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch_fused
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: False
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `batch_sampler`: no_duplicates
- `multi_dataset_batch_sampler`: proportional

</details>
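
For orientation, the non-default values above slot into the Sentence Transformers 3.0 trainer roughly as follows. This is a sketch, not the exact training script: the `output_dir` and the single-row dataset are placeholders, and `eval_strategy`/`load_best_model_at_end` are omitted because they additionally require an evaluation dataset or evaluator:

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

model = SentenceTransformer("BAAI/bge-base-en-v1.5")
loss = MatryoshkaLoss(
    model, MultipleNegativesRankingLoss(model), matryoshka_dims=[768, 512, 256, 128, 64]
)

# Placeholder for the 6,300-row dataset with "positive" and "anchor" columns.
train_dataset = Dataset.from_dict({
    "positive": ["Johnson & Johnson reported cash and cash equivalents of $21,859 million as of the end of 2023."],
    "anchor": ["What was the amount of cash and cash equivalents reported by Johnson & Johnson at the end of 2023?"],
})

args = SentenceTransformerTrainingArguments(
    output_dir="bge-base-financial-matryoshka",  # placeholder path
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=16,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    optim="adamw_torch_fused",
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # avoids duplicate in-batch negatives
)

trainer = SentenceTransformerTrainer(
    model=model, args=args, train_dataset=train_dataset, loss=loss
)
trainer.train()
```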

### Training Logs
| Epoch      | Step   | Training Loss | dim_128_cosine_map@100 | dim_256_cosine_map@100 | dim_512_cosine_map@100 | dim_64_cosine_map@100 | dim_768_cosine_map@100 |
|:----------:|:------:|:-------------:|:----------------------:|:----------------------:|:----------------------:|:---------------------:|:----------------------:|
| 0.8122     | 10     | 1.5779        | -                      | -                      | -                      | -                     | -                      |
| 0.9746     | 12     | -             | 0.7388                 | 0.7509                 | 0.7604                 | 0.7081                | 0.7579                 |
| 1.6244     | 20     | 0.6572        | -                      | -                      | -                      | -                     | -                      |
| 1.9492     | 24     | -             | 0.7612                 | 0.7670                 | 0.7729                 | 0.7269                | 0.7705                 |
| 2.4365     | 30     | 0.4661        | -                      | -                      | -                      | -                     | -                      |
| 2.9239     | 36     | -             | 0.7623                 | 0.7702                 | 0.7771                 | 0.7386                | 0.7758                 |
| 3.2487     | 40     | 0.3774        | -                      | -                      | -                      | -                     | -                      |
| **3.8985** | **48** | **-**         | **0.7627**             | **0.7712**             | **0.7776**             | **0.7407**            | **0.7773**             |

* The bold row denotes the saved checkpoint.

### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 3.0.1
- Transformers: 4.41.2
- PyTorch: 2.3.1+cu121
- Accelerate: 0.32.1
- Datasets: 2.19.1
- Tokenizers: 0.19.1

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### MatryoshkaLoss
```bibtex
@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning},
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
```

#### MultipleNegativesRankingLoss
```bibtex
@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply},
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->

config.json ADDED

{
  "_name_or_path": "BAAI/bge-base-en-v1.5",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

config_sentence_transformers.json ADDED

{
  "__version__": {
    "sentence_transformers": "3.0.1",
    "transformers": "4.41.2",
    "pytorch": "2.3.1+cu121"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": null
}

model.safetensors ADDED

version https://git-lfs.github.com/spec/v1
oid sha256:e232bb945ad1629abb00ae10af12914f8527da2d1fd8987588361b178b0968bf
size 437951328

modules.json ADDED

[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]

sentence_bert_config.json ADDED

{
  "max_seq_length": 512,
  "do_lower_case": true
}

special_tokens_map.json ADDED

{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}

tokenizer.json ADDED

The diff for this file is too large to render.

tokenizer_config.json ADDED

{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}

vocab.txt ADDED

The diff for this file is too large to render.