raidhon committed
Commit
d4d334b
1 Parent(s): 05e4370

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,233 @@
+ ---
+ language:
+ - en
+ license: apache-2.0
+ tags:
+ - text-generation
+ - large-language-model
+ - orpo
+
+ base_model:
+ - mistralai/Mistral-7B-Instruct-v0.2
+ model-index:
+ - name: Coven 7B 128K ORPO
+   description: "Coven 7B 128K ORPO is a derivative of Mistral-7B-Instruct-v0.2, fine-tuned to perform specialized tasks involving deeper understanding and reasoning over context. This model exhibits strong capabilities in both general language understanding and task-specific challenges."
+   results:
+   - task:
+       type: text-generation
+       name: Winogrande Challenge
+     dataset:
+       name: Winogrande
+       type: winogrande_xl
+       split: test
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 77.82
+       name: accuracy
+   - task:
+       type: text-generation
+       name: TruthfulQA Generation
+     dataset:
+       name: TruthfulQA
+       type: truthful_qa
+       config: multiple_choice
+       split: validation
+       args:
+         num_few_shot: 0
+     metrics:
+     - type: accuracy
+       value: 49.55
+       name: accuracy
+   - task:
+       type: text-generation
+       name: PIQA Problem Solving
+     dataset:
+       name: PIQA
+       type: piqa
+       split: validation
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 82.05
+       name: accuracy
+   - task:
+       type: text-generation
+       name: OpenBookQA Facts
+     dataset:
+       name: OpenBookQA
+       type: openbookqa
+       split: test
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 34.60
+       name: accuracy
+   - task:
+       type: text-generation
+       name: MMLU Knowledge Test
+     dataset:
+       name: MMLU
+       type: mmlu
+       config: all
+       split: test
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 63.00
+       name: accuracy
+   - task:
+       type: text-generation
+       name: Hellaswag Contextual Completions
+     dataset:
+       name: Hellaswag
+       type: hellaswag
+       split: validation
+       args:
+         num_few_shot: 10
+     metrics:
+     - type: accuracy
+       value: 65.37
+       name: accuracy
+   - task:
+       type: text-generation
+       name: GSM8k Mathematical Reasoning
+     dataset:
+       name: GSM8k
+       type: gsm8k
+       split: test
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 72.18
+       name: exact match (strict)
+     - type: accuracy
+       value: 72.63
+       name: exact match (flexible)
+   - task:
+       type: text-generation
+       name: BoolQ Question Answering
+     dataset:
+       name: BoolQ
+       type: boolq
+       split: validation
+       args:
+         num_few_shot: 5
+     metrics:
+     - type: accuracy
+       value: 87.43
+       name: accuracy
+   - task:
+       type: text-generation
+       name: ARC Challenge
+     dataset:
+       name: ARC Challenge
+       type: ai2_arc
+       split: test
+       args:
+         num_few_shot: 25
+     metrics:
+     - type: accuracy
+       value: 59.64
+       name: accuracy
+ ---
+
+ # 🧙 Coven 7B 128K ORPO
+
+ Coven 7B 128K ORPO is an improved iteration of Mistral-7B-Instruct-v0.2, refined to expand its processing capabilities and better align it with user preferences. The context window is extended to 128K tokens using the YaRN technique, which lets the model take in far longer inputs and reason over complex language scenarios. In addition, the model is fine-tuned with ORPO (Monolithic Preference Optimization without Reference Model). ORPO simplifies fine-tuning by directly optimizing the odds ratio between favored and disfavored generation styles, improving model performance without the need for a separate preference-alignment step.
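+
+ To make the odds-ratio idea concrete, here is a minimal sketch of the ORPO objective in PyTorch. This is a sketch only, not the training code used for this model; the tensor names and the `beta` weight are illustrative:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def orpo_loss(chosen_logps, rejected_logps, nll_loss, beta=0.1):
+     """Sketch of the ORPO objective (illustrative, not this model's trainer).
+     chosen_logps / rejected_logps: length-normalized sequence log-probs
+     log p(y|x) for the favored / disfavored responses (values < 0);
+     nll_loss: the usual SFT loss on the favored response."""
+     # odds(y) = p / (1 - p); in log space: log p - log(1 - p)
+     log_odds_chosen = chosen_logps - torch.log1p(-torch.exp(chosen_logps))
+     log_odds_rejected = rejected_logps - torch.log1p(-torch.exp(rejected_logps))
+     # Reward the model when the odds of the chosen response exceed the rejected one.
+     odds_ratio_term = F.logsigmoid(log_odds_chosen - log_odds_rejected)
+     return nll_loss - beta * odds_ratio_term.mean()
+ ```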
+
+ ## Eval
+
+ Relative change is measured against the Mistral-7B-Instruct-v0.2 baseline (e.g. Winogrande: 77.82 / 73.64 − 1 ≈ +5.67%).
+
+ | Task | Model | Metric | Value | Relative Change (%) |
+ |---------------------|--------------------------|---------------------|--------|---------------------|
+ | Winogrande | Mistral-7B-Instruct-v0.2 | Accuracy | 73.64% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 77.82% | +5.67% |
+ | TruthfulQA | Mistral-7B-Instruct-v0.2 | Accuracy | 59.54% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 49.55% | -16.78% |
+ | PIQA | Mistral-7B-Instruct-v0.2 | Accuracy | 80.03% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 82.05% | +2.52% |
+ | OpenBookQA | Mistral-7B-Instruct-v0.2 | Accuracy | 36.00% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 34.60% | -3.89% |
+ | | Mistral-7B-Instruct-v0.2 | Accuracy Normalized | 45.20% | - |
+ | | Coven 7B 128K ORPO | Accuracy Normalized | 48.00% | +6.19% |
+ | MMLU | Mistral-7B-Instruct-v0.2 | Accuracy | 58.79% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 63.00% | +7.16% |
+ | Hellaswag | Mistral-7B-Instruct-v0.2 | Accuracy | 66.08% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 65.37% | -1.08% |
+ | | Mistral-7B-Instruct-v0.2 | Accuracy Normalized | 83.68% | - |
+ | | Coven 7B 128K ORPO | Accuracy Normalized | 84.29% | +0.73% |
+ | GSM8K (Strict) | Mistral-7B-Instruct-v0.2 | Exact Match | 41.55% | - |
+ | | Coven 7B 128K ORPO | Exact Match | 72.18% | +73.65% |
+ | GSM8K (Flexible) | Mistral-7B-Instruct-v0.2 | Exact Match | 41.93% | - |
+ | | Coven 7B 128K ORPO | Exact Match | 72.63% | +73.29% |
+ | BoolQ | Mistral-7B-Instruct-v0.2 | Accuracy | 85.29% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 87.43% | +2.51% |
+ | ARC Easy | Mistral-7B-Instruct-v0.2 | Accuracy | 81.36% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 85.02% | +4.50% |
+ | | Mistral-7B-Instruct-v0.2 | Accuracy Normalized | 76.60% | - |
+ | | Coven 7B 128K ORPO | Accuracy Normalized | 82.95% | +8.29% |
+ | ARC Challenge | Mistral-7B-Instruct-v0.2 | Accuracy | 54.35% | - |
+ | | Coven 7B 128K ORPO | Accuracy | 59.64% | +9.74% |
+ | | Mistral-7B-Instruct-v0.2 | Accuracy Normalized | 55.80% | - |
+ | | Coven 7B 128K ORPO | Accuracy Normalized | 61.69% | +10.52% |
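+
+ The figures above correspond to the lm-evaluation-harness style raw tables in bench-coven.txt and bench-mistral.txt below. Assuming that harness produced them, a single row should be reproducible along these lines (task name and few-shot count taken from the model-index above; the exact API may vary by harness version):
+
+ ```python
+ # Hedged reproduction sketch: pip install lm-eval
+ # (EleutherAI lm-evaluation-harness, v0.4-style Python API assumed)
+ import lm_eval
+
+ results = lm_eval.simple_evaluate(
+     model="hf",
+     model_args="pretrained=raidhon/coven_7b_128k_orpo_alpha,dtype=float16",
+     tasks=["winogrande"],
+     num_fewshot=5,
+     batch_size="auto",
+ )
+ print(results["results"]["winogrande"])  # expect acc near 0.7782
+ ```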
+
+ ## Model Details
+
+ * **Model name**: Coven 7B 128K ORPO alpha
+ * **Fine-tuned by**: raidhon
+ * **Base model**: [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+ * **Parameters**: 7B
+ * **Context**: 128K
+ * **Language(s)**: Multilingual
+ * **License**: Apache 2.0
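+
+ The 128K context figure follows from the YaRN rope scaling in config.json below: the base 32768-token window times a scaling factor of 4.0. A quick sanity check (assuming transformers preserves the custom `rope_scaling` entry when loading the config):
+
+ ```python
+ from transformers import AutoConfig
+
+ cfg = AutoConfig.from_pretrained("raidhon/coven_7b_128k_orpo_alpha")
+ scaling = cfg.rope_scaling  # {"type": "yarn", "factor": 4.0, ...}
+ # 32768 * 4.0 = 131072 tokens (~128K)
+ print(int(cfg.max_position_embeddings * scaling["factor"]))
+ ```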
+
+ ## 💻 Usage
+
+ ```python
+ # Installing transformers from source is only needed for versions <= v4.34:
+ # pip install git+https://github.com/huggingface/transformers.git
+ # pip install accelerate
+
+ import torch
+ from transformers import pipeline
+
+ pipe = pipeline("text-generation", model="raidhon/coven_7b_128k_orpo_alpha", torch_dtype=torch.float16, device_map="auto")
+
+ messages = [
+     {
+         "role": "system",
+         "content": "You are a friendly chatbot who always responds in the style of a pirate",
+     },
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ # Render the conversation with the model's chat template before generation.
+ prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ # Sample a response (nucleus sampling with a mild temperature).
+ outputs = pipe(prompt, max_new_tokens=4096, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
+ print(outputs[0]["generated_text"])
+ ```
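+
+ Note that tokenizer_config.json (further down in this commit) ships a plain `Human:`/`Assistant:` chat template rather than Mistral's `[INST]` format. A quick way to inspect what `apply_chat_template` actually feeds the model (expected output derived from that template string):
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("raidhon/coven_7b_128k_orpo_alpha")
+ prompt = tok.apply_chat_template(
+     [
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": "Hello!"},
+     ],
+     tokenize=False,
+     add_generation_prompt=True,
+ )
+ print(prompt)
+ # You are a helpful assistant.
+ # Human: Hello!
+ # Assistant:
+ ```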
bench-coven.txt ADDED
@@ -0,0 +1,130 @@
+ raidhon/coven_7b_128k_orpo_alpha
+
+ | Tasks |Version| Filter |n-shot| Metric | Value | |Stderr|
+ |---------------------------------------|-------|----------------|-----:|-----------|------:|---|-----:|
+ |winogrande | 1|none | 0|acc | 0.7782|± |0.0117|
+ |truthfulqa |N/A |none | 0|rouge1_max |47.8575|± |0.8139|
+ | | |none | 0|bleu_max |21.9412|± |0.7280|
+ | | |none | 0|rouge2_max |32.7726|± |0.9228|
+ | | |none | 0|rougeL_diff|-1.4310|± |0.7806|
+ | | |none | 0|acc | 0.4955|± |0.0115|
+ | | |none | 0|bleu_diff |-0.2883|± |0.6228|
+ | | |none | 0|rouge2_acc | 0.3807|± |0.0170|
+ | | |none | 0|rougeL_max |44.1785|± |0.8274|
+ | | |none | 0|rougeL_acc | 0.4443|± |0.0174|
+ | | |none | 0|rouge2_diff|-1.5603|± |0.8950|
+ | | |none | 0|bleu_acc | 0.4321|± |0.0173|
+ | | |none | 0|rouge1_diff|-0.7276|± |0.7721|
+ | | |none | 0|rouge1_acc | 0.4774|± |0.0175|
+ | - truthfulqa_gen | 3|none | 0|bleu_max |21.9412|± |0.7280|
+ | | |none | 0|bleu_acc | 0.4321|± |0.0173|
+ | | |none | 0|bleu_diff |-0.2883|± |0.6228|
+ | | |none | 0|rouge1_max |47.8575|± |0.8139|
+ | | |none | 0|rouge1_acc | 0.4774|± |0.0175|
+ | | |none | 0|rouge1_diff|-0.7276|± |0.7721|
+ | | |none | 0|rouge2_max |32.7726|± |0.9228|
+ | | |none | 0|rouge2_acc | 0.3807|± |0.0170|
+ | | |none | 0|rouge2_diff|-1.5603|± |0.8950|
+ | | |none | 0|rougeL_max |44.1785|± |0.8274|
+ | | |none | 0|rougeL_acc | 0.4443|± |0.0174|
+ | | |none | 0|rougeL_diff|-1.4310|± |0.7806|
+ | - truthfulqa_mc1 | 2|none | 0|acc | 0.4174|± |0.0173|
+ | - truthfulqa_mc2 | 2|none | 0|acc | 0.5736|± |0.0151|
+ |piqa | 1|none | 0|acc | 0.8205|± |0.0090|
+ | | |none | 0|acc_norm | 0.8395|± |0.0086|
+ |openbookqa | 1|none | 0|acc | 0.3460|± |0.0213|
+ | | |none | 0|acc_norm | 0.4800|± |0.0224|
+ |mmlu |N/A |none | 0|acc | 0.6300|± |0.0038|
+ | - humanities |N/A |none | 0|acc | 0.5779|± |0.0066|
+ | - formal_logic | 0|none | 0|acc | 0.4127|± |0.0440|
+ | - high_school_european_history | 0|none | 0|acc | 0.8061|± |0.0309|
+ | - high_school_us_history | 0|none | 0|acc | 0.8480|± |0.0252|
+ | - high_school_world_history | 0|none | 0|acc | 0.8523|± |0.0231|
+ | - international_law | 0|none | 0|acc | 0.7934|± |0.0370|
+ | - jurisprudence | 0|none | 0|acc | 0.7685|± |0.0408|
+ | - logical_fallacies | 0|none | 0|acc | 0.7730|± |0.0329|
+ | - moral_disputes | 0|none | 0|acc | 0.7110|± |0.0244|
+ | - moral_scenarios | 0|none | 0|acc | 0.2894|± |0.0152|
+ | - philosophy | 0|none | 0|acc | 0.7106|± |0.0258|
+ | - prehistory | 0|none | 0|acc | 0.7685|± |0.0235|
+ | - professional_law | 0|none | 0|acc | 0.4824|± |0.0128|
+ | - world_religions | 0|none | 0|acc | 0.8129|± |0.0299|
+ | - other |N/A |none | 0|acc | 0.7090|± |0.0078|
+ | - business_ethics | 0|none | 0|acc | 0.5900|± |0.0494|
+ | - clinical_knowledge | 0|none | 0|acc | 0.7245|± |0.0275|
+ | - college_medicine | 0|none | 0|acc | 0.6532|± |0.0363|
+ | - global_facts | 0|none | 0|acc | 0.3200|± |0.0469|
+ | - human_aging | 0|none | 0|acc | 0.7040|± |0.0306|
+ | - management | 0|none | 0|acc | 0.7864|± |0.0406|
+ | - marketing | 0|none | 0|acc | 0.8632|± |0.0225|
+ | - medical_genetics | 0|none | 0|acc | 0.7500|± |0.0435|
+ | - miscellaneous | 0|none | 0|acc | 0.8212|± |0.0137|
+ | - nutrition | 0|none | 0|acc | 0.7451|± |0.0250|
+ | - professional_accounting | 0|none | 0|acc | 0.5000|± |0.0298|
+ | - professional_medicine | 0|none | 0|acc | 0.7059|± |0.0277|
+ | - virology | 0|none | 0|acc | 0.5301|± |0.0389|
+ | - social_sciences |N/A |none | 0|acc | 0.7358|± |0.0077|
+ | - econometrics | 0|none | 0|acc | 0.4474|± |0.0468|
+ | - high_school_geography | 0|none | 0|acc | 0.7525|± |0.0307|
+ | - high_school_government_and_politics| 0|none | 0|acc | 0.9016|± |0.0215|
+ | - high_school_macroeconomics | 0|none | 0|acc | 0.6564|± |0.0241|
+ | - high_school_microeconomics | 0|none | 0|acc | 0.6807|± |0.0303|
+ | - high_school_psychology | 0|none | 0|acc | 0.8404|± |0.0157|
+ | - human_sexuality | 0|none | 0|acc | 0.7405|± |0.0384|
+ | - professional_psychology | 0|none | 0|acc | 0.6552|± |0.0192|
+ | - public_relations | 0|none | 0|acc | 0.6727|± |0.0449|
+ | - security_studies | 0|none | 0|acc | 0.7673|± |0.0270|
+ | - sociology | 0|none | 0|acc | 0.8358|± |0.0262|
+ | - us_foreign_policy | 0|none | 0|acc | 0.8600|± |0.0349|
+ | - stem |N/A |none | 0|acc | 0.5265|± |0.0085|
+ | - abstract_algebra | 0|none | 0|acc | 0.3100|± |0.0465|
+ | - anatomy | 0|none | 0|acc | 0.6000|± |0.0423|
+ | - astronomy | 0|none | 0|acc | 0.6842|± |0.0378|
+ | - college_biology | 0|none | 0|acc | 0.7292|± |0.0372|
+ | - college_chemistry | 0|none | 0|acc | 0.4700|± |0.0502|
+ | - college_computer_science | 0|none | 0|acc | 0.5600|± |0.0499|
+ | - college_mathematics | 0|none | 0|acc | 0.3500|± |0.0479|
+ | - college_physics | 0|none | 0|acc | 0.3529|± |0.0476|
+ | - computer_security | 0|none | 0|acc | 0.7100|± |0.0456|
+ | - conceptual_physics | 0|none | 0|acc | 0.5574|± |0.0325|
+ | - electrical_engineering | 0|none | 0|acc | 0.5793|± |0.0411|
+ | - elementary_mathematics | 0|none | 0|acc | 0.4101|± |0.0253|
+ | - high_school_biology | 0|none | 0|acc | 0.7903|± |0.0232|
+ | - high_school_chemistry | 0|none | 0|acc | 0.4828|± |0.0352|
+ | - high_school_computer_science | 0|none | 0|acc | 0.6600|± |0.0476|
+ | - high_school_mathematics | 0|none | 0|acc | 0.3444|± |0.0290|
+ | - high_school_physics | 0|none | 0|acc | 0.3642|± |0.0393|
+ | - high_school_statistics | 0|none | 0|acc | 0.5000|± |0.0341|
+ | - machine_learning | 0|none | 0|acc | 0.5268|± |0.0474|
+ |hellaswag | 1|none | 0|acc | 0.6537|± |0.0047|
+ | | |none | 0|acc_norm | 0.8429|± |0.0036|
+ |gsm8k | 3|strict-match | 5|exact_match| 0.7218|± |0.0123|
+ | | |flexible-extract| 5|exact_match| 0.7263|± |0.0123|
+ |boolq | 2|none | 0|acc | 0.8743|± |0.0058|
+ |arc_easy | 1|none | 0|acc | 0.8502|± |0.0073|
+ | | |none | 0|acc_norm | 0.8295|± |0.0077|
+ |arc_challenge | 1|none | 0|acc | 0.5964|± |0.0143|
+ | | |none | 0|acc_norm | 0.6169|± |0.0142|
+
+ | Groups |Version|Filter|n-shot| Metric | Value | |Stderr|
+ |------------------|-------|------|-----:|-----------|------:|---|-----:|
+ |truthfulqa |N/A |none | 0|rouge1_max |47.8575|± |0.8139|
+ | | |none | 0|bleu_max |21.9412|± |0.7280|
+ | | |none | 0|rouge2_max |32.7726|± |0.9228|
+ | | |none | 0|rougeL_diff|-1.4310|± |0.7806|
+ | | |none | 0|acc | 0.4955|± |0.0115|
+ | | |none | 0|bleu_diff |-0.2883|± |0.6228|
+ | | |none | 0|rouge2_acc | 0.3807|± |0.0170|
+ | | |none | 0|rougeL_max |44.1785|± |0.8274|
+ | | |none | 0|rougeL_acc | 0.4443|± |0.0174|
+ | | |none | 0|rouge2_diff|-1.5603|± |0.8950|
+ | | |none | 0|bleu_acc | 0.4321|± |0.0173|
+ | | |none | 0|rouge1_diff|-0.7276|± |0.7721|
+ | | |none | 0|rouge1_acc | 0.4774|± |0.0175|
+ |mmlu |N/A |none | 0|acc | 0.6300|± |0.0038|
+ | - humanities |N/A |none | 0|acc | 0.5779|± |0.0066|
+ | - other |N/A |none | 0|acc | 0.7090|± |0.0078|
+ | - social_sciences|N/A |none | 0|acc | 0.7358|± |0.0077|
+ | - stem |N/A |none | 0|acc | 0.5265|± |0.0085|
bench-mistral.txt ADDED
@@ -0,0 +1,128 @@
+ mistralai/Mistral-7B-Instruct-v0.2
+
+ | Tasks |Version| Filter |n-shot| Metric | Value | |Stderr|
+ |---------------------------------------|-------|----------------|-----:|-----------|------:|---|-----:|
+ |winogrande | 1|none | 0|acc | 0.7364|± |0.0124|
+ |truthfulqa |N/A |none | 0|acc | 0.5954|± |0.0116|
+ | | |none | 0|rouge1_max |46.4534|± |0.8502|
+ | | |none | 0|rougeL_diff| 5.5378|± |0.8859|
+ | | |none | 0|bleu_acc | 0.5483|± |0.0174|
+ | | |none | 0|rouge2_max |31.1969|± |0.9785|
+ | | |none | 0|rougeL_max |43.4263|± |0.8666|
+ | | |none | 0|rougeL_acc | 0.5606|± |0.0174|
+ | | |none | 0|rouge2_acc | 0.4529|± |0.0174|
+ | | |none | 0|rouge2_diff| 5.3591|± |0.9416|
+ | | |none | 0|bleu_max |21.2977|± |0.7504|
+ | | |none | 0|rouge1_acc | 0.5741|± |0.0173|
+ | | |none | 0|bleu_diff | 4.3215|± |0.6161|
+ | | |none | 0|rouge1_diff| 5.7381|± |0.8786|
+ | - truthfulqa_gen | 3|none | 0|bleu_max |21.2977|± |0.7504|
+ | | |none | 0|bleu_acc | 0.5483|± |0.0174|
+ | | |none | 0|bleu_diff | 4.3215|± |0.6161|
+ | | |none | 0|rouge1_max |46.4534|± |0.8502|
+ | | |none | 0|rouge1_acc | 0.5741|± |0.0173|
+ | | |none | 0|rouge1_diff| 5.7381|± |0.8786|
+ | | |none | 0|rouge2_max |31.1969|± |0.9785|
+ | | |none | 0|rouge2_acc | 0.4529|± |0.0174|
+ | | |none | 0|rouge2_diff| 5.3591|± |0.9416|
+ | | |none | 0|rougeL_max |43.4263|± |0.8666|
+ | | |none | 0|rougeL_acc | 0.5606|± |0.0174|
+ | | |none | 0|rougeL_diff| 5.5378|± |0.8859|
+ | - truthfulqa_mc1 | 2|none | 0|acc | 0.5226|± |0.0175|
+ | - truthfulqa_mc2 | 2|none | 0|acc | 0.6681|± |0.0153|
+ |piqa | 1|none | 0|acc | 0.8003|± |0.0093|
+ | | |none | 0|acc_norm | 0.8047|± |0.0092|
+ |openbookqa | 1|none | 0|acc | 0.3600|± |0.0215|
+ | | |none | 0|acc_norm | 0.4520|± |0.0223|
+ |mmlu |N/A |none | 0|acc | 0.5879|± |0.0039|
+ | - humanities |N/A |none | 0|acc | 0.5396|± |0.0069|
+ | - formal_logic | 0|none | 0|acc | 0.3651|± |0.0431|
+ | - high_school_european_history | 0|none | 0|acc | 0.7273|± |0.0348|
+ | - high_school_us_history | 0|none | 0|acc | 0.7794|± |0.0291|
+ | - high_school_world_history | 0|none | 0|acc | 0.7764|± |0.0271|
+ | - international_law | 0|none | 0|acc | 0.7438|± |0.0398|
+ | - jurisprudence | 0|none | 0|acc | 0.7130|± |0.0437|
+ | - logical_fallacies | 0|none | 0|acc | 0.7546|± |0.0338|
+ | - moral_disputes | 0|none | 0|acc | 0.6532|± |0.0256|
+ | - moral_scenarios | 0|none | 0|acc | 0.3564|± |0.0160|
+ | - philosophy | 0|none | 0|acc | 0.6463|± |0.0272|
+ | - prehistory | 0|none | 0|acc | 0.6821|± |0.0259|
+ | - professional_law | 0|none | 0|acc | 0.4133|± |0.0126|
+ | - world_religions | 0|none | 0|acc | 0.8129|± |0.0299|
+ | - other |N/A |none | 0|acc | 0.6621|± |0.0082|
+ | - business_ethics | 0|none | 0|acc | 0.5900|± |0.0494|
+ | - clinical_knowledge | 0|none | 0|acc | 0.6491|± |0.0294|
+ | - college_medicine | 0|none | 0|acc | 0.5549|± |0.0379|
+ | - global_facts | 0|none | 0|acc | 0.3800|± |0.0488|
+ | - human_aging | 0|none | 0|acc | 0.6233|± |0.0325|
+ | - management | 0|none | 0|acc | 0.7184|± |0.0445|
+ | - marketing | 0|none | 0|acc | 0.8761|± |0.0216|
+ | - medical_genetics | 0|none | 0|acc | 0.6500|± |0.0479|
+ | - miscellaneous | 0|none | 0|acc | 0.7944|± |0.0145|
+ | - nutrition | 0|none | 0|acc | 0.6732|± |0.0269|
+ | - professional_accounting | 0|none | 0|acc | 0.4468|± |0.0297|
+ | - professional_medicine | 0|none | 0|acc | 0.6581|± |0.0288|
+ | - virology | 0|none | 0|acc | 0.4578|± |0.0388|
+ | - social_sciences |N/A |none | 0|acc | 0.6799|± |0.0082|
+ | - econometrics | 0|none | 0|acc | 0.4649|± |0.0469|
+ | - high_school_geography | 0|none | 0|acc | 0.7374|± |0.0314|
+ | - high_school_government_and_politics| 0|none | 0|acc | 0.8031|± |0.0287|
+ | - high_school_macroeconomics | 0|none | 0|acc | 0.5590|± |0.0252|
+ | - high_school_microeconomics | 0|none | 0|acc | 0.6387|± |0.0312|
+ | - high_school_psychology | 0|none | 0|acc | 0.7853|± |0.0176|
+ | - human_sexuality | 0|none | 0|acc | 0.6794|± |0.0409|
+ | - professional_psychology | 0|none | 0|acc | 0.5866|± |0.0199|
+ | - public_relations | 0|none | 0|acc | 0.6455|± |0.0458|
+ | - security_studies | 0|none | 0|acc | 0.6816|± |0.0298|
+ | - sociology | 0|none | 0|acc | 0.8408|± |0.0259|
+ | - us_foreign_policy | 0|none | 0|acc | 0.8500|± |0.0359|
+ | - stem |N/A |none | 0|acc | 0.4970|± |0.0087|
+ | - abstract_algebra | 0|none | 0|acc | 0.3200|± |0.0469|
+ | - anatomy | 0|none | 0|acc | 0.5704|± |0.0428|
+ | - astronomy | 0|none | 0|acc | 0.6382|± |0.0391|
+ | - college_biology | 0|none | 0|acc | 0.6597|± |0.0396|
+ | - college_chemistry | 0|none | 0|acc | 0.4100|± |0.0494|
+ | - college_computer_science | 0|none | 0|acc | 0.5400|± |0.0501|
+ | - college_mathematics | 0|none | 0|acc | 0.3400|± |0.0476|
+ | - college_physics | 0|none | 0|acc | 0.3725|± |0.0481|
+ | - computer_security | 0|none | 0|acc | 0.6700|± |0.0473|
+ | - conceptual_physics | 0|none | 0|acc | 0.4809|± |0.0327|
+ | - electrical_engineering | 0|none | 0|acc | 0.5931|± |0.0409|
+ | - elementary_mathematics | 0|none | 0|acc | 0.4233|± |0.0254|
+ | - high_school_biology | 0|none | 0|acc | 0.6774|± |0.0266|
+ | - high_school_chemistry | 0|none | 0|acc | 0.4877|± |0.0352|
+ | - high_school_computer_science | 0|none | 0|acc | 0.6100|± |0.0490|
+ | - high_school_mathematics | 0|none | 0|acc | 0.3556|± |0.0292|
+ | - high_school_physics | 0|none | 0|acc | 0.3642|± |0.0393|
+ | - high_school_statistics | 0|none | 0|acc | 0.4630|± |0.0340|
+ | - machine_learning | 0|none | 0|acc | 0.4643|± |0.0473|
+ |hellaswag | 1|none | 0|acc | 0.6608|± |0.0047|
+ | | |none | 0|acc_norm | 0.8368|± |0.0037|
+ |gsm8k | 3|strict-match | 5|exact_match| 0.4155|± |0.0136|
+ | | |flexible-extract| 5|exact_match| 0.4193|± |0.0136|
+ |boolq | 2|none | 0|acc | 0.8529|± |0.0062|
+ |arc_easy | 1|none | 0|acc | 0.8136|± |0.0080|
+ | | |none | 0|acc_norm | 0.7660|± |0.0087|
+ |arc_challenge | 1|none | 0|acc | 0.5435|± |0.0146|
+ | | |none | 0|acc_norm | 0.5580|± |0.0145|
+
+ | Groups |Version|Filter|n-shot| Metric | Value | |Stderr|
+ |------------------|-------|------|-----:|-----------|------:|---|-----:|
+ |truthfulqa |N/A |none | 0|acc | 0.5954|± |0.0116|
+ | | |none | 0|rouge1_max |46.4534|± |0.8502|
+ | | |none | 0|rougeL_diff| 5.5378|± |0.8859|
+ | | |none | 0|bleu_acc | 0.5483|± |0.0174|
+ | | |none | 0|rouge2_max |31.1969|± |0.9785|
+ | | |none | 0|rougeL_max |43.4263|± |0.8666|
+ | | |none | 0|rougeL_acc | 0.5606|± |0.0174|
+ | | |none | 0|rouge2_acc | 0.4529|± |0.0174|
+ | | |none | 0|rouge2_diff| 5.3591|± |0.9416|
+ | | |none | 0|bleu_max |21.2977|± |0.7504|
+ | | |none | 0|rouge1_acc | 0.5741|± |0.0173|
+ | | |none | 0|bleu_diff | 4.3215|± |0.6161|
+ | | |none | 0|rouge1_diff| 5.7381|± |0.8786|
+ |mmlu |N/A |none | 0|acc | 0.5879|± |0.0039|
+ | - humanities |N/A |none | 0|acc | 0.5396|± |0.0069|
+ | - other |N/A |none | 0|acc | 0.6621|± |0.0082|
+ | - social_sciences|N/A |none | 0|acc | 0.6799|± |0.0082|
+ | - stem |N/A |none | 0|acc | 0.4970|± |0.0087|
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_name_or_path": "raidhon/coven_7b_128k_orpo_alpha",
+   "architectures": ["MistralForCausalLM"],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 32768,
+   "max_sequence_length": 131072,
+   "model_type": "mistral",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-5,
+   "rope_scaling": {
+     "factor": 4.0,
+     "finetuned": true,
+     "original_max_position_embeddings": 32768,
+     "type": "yarn"
+   },
+   "rope_theta": 10000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.39.3",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae114d2ad6ffb2f589ba9085c22a114f63650b5e18b4bbc18355589df8006b48
+ size 9919903616
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9aca265234f2460495e81ead1f264d5ee78b8fb0dc3c10aeba69dc2e11bd089
+ size 4563594120
model.safetensors.index.json ADDED
@@ -0,0 +1 @@
+ {"metadata": {"mergekit_version": "0.0.4.2", "total_size": 14483464192}, "weight_map": {"lm_head.weight": "model-00001-of-00002.safetensors", "model.embed_tokens.weight": "model-00001-of-00002.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", 
"model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.norm.weight": "model-00002-of-00002.safetensors"}}
special_tokens_map.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "additional_special_tokens": [
+     "<unk>",
+     "<s>",
+     "</s>"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<unk>",
+     "<s>",
+     "</s>"
+   ],
+   "bos_token": "<s>",
+   "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message + '\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '\\nAssistant: ' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "split_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": true
+ }