lonestar108 committed on
Commit
a6c6ee5
·
1 Parent(s): 4f493f5
Files changed (19)
  1. models/PygmalionAI_pygmalion-350m/huggingface-metadata.txt +1 -1
  2. models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/README.md +0 -325
  3. models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/config.json +0 -44
  4. models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/llama_rope_scaled_monkey_patch.py +0 -65
  5. models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/modelling_llama.py +0 -894
  6. models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/special_tokens_map.json +0 -23
  7. models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/LICENSE.txt +126 -0
  8. models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/README.md +308 -0
  9. models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/USE_POLICY.md +50 -0
  10. models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/config.json +39 -0
  11. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/generation_config.json +1 -2
  12. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/huggingface-metadata.txt +3 -3
  13. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/model.safetensors +2 -2
  14. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/quantize_config.json +3 -2
  15. models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/special_tokens_map.json +6 -0
  16. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer.json +106 -100
  17. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer.model +0 -0
  18. models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ → TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer_config.json +11 -7
  19. run.py +2 -2
models/PygmalionAI_pygmalion-350m/huggingface-metadata.txt CHANGED
@@ -1,5 +1,5 @@
1
  url: https://huggingface.co/PygmalionAI/pygmalion-350m
2
  branch: main
3
- download date: 2023-09-08 12:27:24
4
  sha256sum:
5
  356aa4ab61193d13e3e7a097bb5f2c025dc2536d5f127154889202ba3c735ae2 pytorch_model.bin
 
1
  url: https://huggingface.co/PygmalionAI/pygmalion-350m
2
  branch: main
3
+ download date: 2023-09-20 03:12:34
4
  sha256sum:
5
  356aa4ab61193d13e3e7a097bb5f2c025dc2536d5f127154889202ba3c735ae2 pytorch_model.bin
models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/README.md DELETED
@@ -1,325 +0,0 @@
1
- ---
2
- inference: false
3
- license: other
4
- ---
5
-
6
- <!-- header start -->
7
- <!-- 200823 -->
8
- <div style="width: auto; margin-left: auto; margin-right: auto">
9
- <img src="https://i.imgur.com/EBdldam.jpg" alt="TheBlokeAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
10
- </div>
11
- <div style="display: flex; justify-content: space-between; width: 100%;">
12
- <div style="display: flex; flex-direction: column; align-items: flex-start;">
13
- <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://discord.gg/theblokeai">Chat & support: TheBloke's Discord server</a></p>
14
- </div>
15
- <div style="display: flex; flex-direction: column; align-items: flex-end;">
16
- <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://www.patreon.com/TheBlokeAI">Want to contribute? TheBloke's Patreon page</a></p>
17
- </div>
18
- </div>
19
- <div style="text-align:center; margin-top: 0em; margin-bottom: 0em"><p style="margin-top: 0.25em; margin-bottom: 0em;">TheBloke's LLM work is generously supported by a grant from <a href="https://a16z.com">andreessen horowitz (a16z)</a></p></div>
20
- <hr style="margin-top: 1.0em; margin-bottom: 1.0em;">
21
- <!-- header end -->
22
-
23
- # TehVenom's merge of PygmalionAI's Pygmalion 13B GPTQ
24
-
25
- These files are GPTQ 4bit model files for [TehVenom's merge of PygmalionAI's Pygmalion 13B](https://huggingface.co/TehVenom/Pygmalion-13b-Merged) merged with [Kaio Ken's SuperHOT 8K](https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test).
26
-
27
- It is the result of quantising to 4bit using [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa).
28
-
29
- **This is an experimental new GPTQ which offers up to 8K context size**
30
-
31
- The increased context is tested to work with [ExLlama](https://github.com/turboderp/exllama), via the latest release of [text-generation-webui](https://github.com/oobabooga/text-generation-webui).
32
-
33
- It has also been tested from Python code using AutoGPTQ, and `trust_remote_code=True`.
34
-
35
- Code credits:
36
- - Original concept and code for increasing context length: [kaiokendev](https://huggingface.co/kaiokendev)
37
- - Updated Llama modelling code that includes this automatically via trust_remote_code: [emozilla](https://huggingface.co/emozilla).
38
-
39
- Please read carefully below to see how to use it.
40
-
41
- GGML versions are not yet provided, as there is not yet support for SuperHOT in llama.cpp. This is being investigated and will hopefully come soon.
42
-
43
- ## Repositories available
44
-
45
- * [4-bit GPTQ models for GPU inference](https://huggingface.co/TheBloke/Pygmalion-13B-SuperHOT-8K-GPTQ)
46
- * [2, 3, 4, 5, 6 and 8-bit GGML models for CPU inference](https://huggingface.co/TheBloke/Pygmalion-13B-SuperHOT-8K-GGML)
47
- * [Unquantised SuperHOT fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/TheBloke/Pygmalion-13B-SuperHOT-8K-fp16)
48
- * [Unquantised base fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/PygmalionAI/pygmalion-13b)
49
-
50
- ## How to easily download and use this model in text-generation-webui with ExLlama
51
-
52
- Please make sure you're using the latest version of text-generation-webui
53
-
54
- 1. Click the **Model tab**.
55
- 2. Under **Download custom model or LoRA**, enter `TheBloke/Pygmalion-13B-SuperHOT-8K-GPTQ`.
56
- 3. Click **Download**.
57
- 4. The model will start downloading. Once it's finished it will say "Done"
58
- 5. Untick **Autoload the model**
59
- 6. In the top left, click the refresh icon next to **Model**.
60
- 7. In the **Model** dropdown, choose the model you just downloaded: `Pygmalion-13B-SuperHOT-8K-GPTQ`
61
- 8. To use the increased context, set the **Loader** to **ExLlama**, set **max_seq_len** to 8192 or 4096, and set **compress_pos_emb** to **4** for 8192 context, or to **2** for 4096 context.
62
- 9. Now click **Save Settings** followed by **Reload**
63
- 10. The model will automatically load, and is now ready for use!
64
- 11. Once you're ready, click the **Text Generation tab** and enter a prompt to get started!
65
-
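Not part of the original card: the values in step 8 follow from SuperHOT's linear RoPE interpolation over LLaMA's native 2048-token context, i.e. `compress_pos_emb` is simply the target context divided by 2048.

```python
# Hedged sketch of the scaling relationship assumed above (not from the card):
# compress_pos_emb = target context length / native 2048-token LLaMA context.
NATIVE_CTX = 2048
for max_seq_len in (4096, 8192):
    print(f"max_seq_len={max_seq_len} -> compress_pos_emb={max_seq_len // NATIVE_CTX}")
# max_seq_len=4096 -> compress_pos_emb=2
# max_seq_len=8192 -> compress_pos_emb=4
```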
66
- ## How to use this GPTQ model from Python code with AutoGPTQ
67
-
68
- First make sure you have AutoGPTQ and Einops installed:
69
-
70
- ```
71
- pip3 install einops auto-gptq
72
- ```
73
-
74
- Then run the following code. Note that in order to get this to work, `config.json` has been hardcoded to a sequence length of 8192.
75
-
76
- If you want to try 4096 instead to reduce VRAM usage, please manually edit `config.json` to set `max_position_embeddings` to the value you want.
77
-
78
- ```python
79
- from transformers import AutoTokenizer, pipeline, logging
80
- from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
81
- import argparse
82
-
83
- model_name_or_path = "TheBloke/Pygmalion-13B-SuperHOT-8K-GPTQ"
84
- model_basename = "pygmalion-13b-superhot-8k-GPTQ-4bit-128g.no-act.order"
85
-
86
- use_triton = False
87
-
88
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
89
-
90
- model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
91
- model_basename=model_basename,
92
- use_safetensors=True,
93
- trust_remote_code=True,
94
- device_map='auto',
95
- use_triton=use_triton,
96
- quantize_config=None)
97
-
98
- model.seqlen = 8192
99
-
100
- # Note: check the prompt template is correct for this model.
101
- prompt = "Tell me about AI"
102
- prompt_template=f'''USER: {prompt}
103
- ASSISTANT:'''
104
-
105
- print("\n\n*** Generate:")
106
-
107
- input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
108
- output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
109
- print(tokenizer.decode(output[0]))
110
-
111
- # Inference can also be done using transformers' pipeline
112
-
113
- # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
114
- logging.set_verbosity(logging.CRITICAL)
115
-
116
- print("*** Pipeline:")
117
- pipe = pipeline(
118
- "text-generation",
119
- model=model,
120
- tokenizer=tokenizer,
121
- max_new_tokens=512,
122
- temperature=0.7,
123
- top_p=0.95,
124
- repetition_penalty=1.15
125
- )
126
-
127
- print(pipe(prompt_template)[0]['generated_text'])
128
- ```
129
-
130
- ## Using other UIs: monkey patch
131
-
132
- Provided in the repo is `llama_rope_scaled_monkey_patch.py`, written by @kaiokendev.
133
-
134
- It can theoretically be added to any Python UI or custom code to achieve the same result as `trust_remote_code=True`. I have not tested this, and it is superseded by using `trust_remote_code=True`, but I include it for completeness and for interest.
135
-
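A minimal usage sketch (not from the original card), assuming `llama_rope_scaled_monkey_patch.py` from this repo is importable: the patch must be applied before the model is constructed, so that newly built LLaMA layers pick up the scaled rotary embedding.

```python
# Hedged sketch: apply the RoPE-scaling monkey patch before loading the model.
from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope

replace_llama_rope_with_scaled_rope()  # must run before any from_pretrained() call

# ...then load the model as in the AutoGPTQ example above, but without
# needing trust_remote_code=True.
```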
136
- ## Provided files
137
-
138
- **pygmalion-13b-superhot-8k-GPTQ-4bit-128g.no-act.order.safetensors**
139
-
140
- This will work with AutoGPTQ, ExLlama, and CUDA versions of GPTQ-for-LLaMa. There are reports of issues with Triton mode of recent GPTQ-for-LLaMa. If you have issues, please use AutoGPTQ instead.
141
-
142
- It was created with group_size 128 to increase inference accuracy, but without --act-order (desc_act) to increase compatibility and improve inference speed.
143
-
144
- * `pygmalion-13b-superhot-8k-GPTQ-4bit-128g.no-act.order.safetensors`
145
- * Works for use with ExLlama with increased context (4096 or 8192)
146
- * Works with AutoGPTQ in Python code, including with increased context, if `trust_remote_code=True` is set.
147
- * Should work with GPTQ-for-LLaMa in CUDA mode, but unknown if increased context works - TBC. May have issues with GPTQ-for-LLaMa Triton mode.
148
- * Works with text-generation-webui, including one-click-installers.
149
- * Parameters: Groupsize = 128. Act Order / desc_act = False.
150
-
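For reference (not part of the original card), the parameters above correspond approximately to this AutoGPTQ quantisation config:

```python
# Illustrative only: 4-bit, group size 128, act-order (desc_act) disabled.
from auto_gptq import BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
```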
151
- <!-- footer start -->
152
- <!-- 200823 -->
153
- ## Discord
154
-
155
- For further support, and discussions on these models and AI in general, join us at:
156
-
157
- [TheBloke AI's Discord server](https://discord.gg/theblokeai)
158
-
159
- ## Thanks, and how to contribute.
160
-
161
- Thanks to the [chirper.ai](https://chirper.ai) team!
162
-
163
- I've had a lot of people ask if they can contribute. I enjoy providing models and helping people, and would love to be able to spend even more time doing it, as well as expanding into new projects like fine tuning/training.
164
-
165
- If you're able and willing to contribute it will be most gratefully received and will help me to keep providing more models, and to start work on new AI projects.
166
-
167
- Donaters will get priority support on any and all AI/LLM/model questions and requests, access to a private Discord room, plus other benefits.
168
-
169
- * Patreon: https://patreon.com/TheBlokeAI
170
- * Ko-Fi: https://ko-fi.com/TheBlokeAI
171
-
172
- **Special thanks to**: Aemon Algiz.
173
-
174
- **Patreon special mentions**: Sam, theTransient, Jonathan Leane, Steven Wood, webtim, Johann-Peter Hartmann, Geoffrey Montalvo, Gabriel Tamborski, Willem Michiel, John Villwock, Derek Yates, Mesiah Bishop, Eugene Pentland, Pieter, Chadd, Stephen Murray, Daniel P. Andersen, terasurfer, Brandon Frisco, Thomas Belote, Sid, Nathan LeClaire, Magnesian, Alps Aficionado, Stanislav Ovsiannikov, Alex, Joseph William Delisle, Nikolai Manek, Michael Davis, Junyu Yang, K, J, Spencer Kim, Stefan Sabev, Olusegun Samson, transmissions 11, Michael Levine, Cory Kujawski, Rainer Wilmers, zynix, Kalila, Luke @flexchar, Ajan Kanaga, Mandus, vamX, Ai Maven, Mano Prime, Matthew Berman, subjectnull, Vitor Caleffi, Clay Pascal, biorpg, alfie_i, 阿明, Jeffrey Morgan, ya boyyy, Raymond Fosdick, knownsqashed, Olakabola, Leonard Tan, ReadyPlayerEmma, Enrico Ros, Dave, Talal Aujan, Illia Dulskyi, Sean Connelly, senxiiz, Artur Olbinski, Elle, Raven Klaugh, Fen Risland, Deep Realms, Imad Khwaja, Fred von Graf, Will Dee, usrbinkat, SuperWojo, Alexandros Triantafyllidis, Swaroop Kallakuri, Dan Guido, John Detwiler, Pedro Madruga, Iucharbius, Viktor Bowallius, Asp the Wyvern, Edmond Seymore, Trenton Dambrowitz, Space Cruiser, Spiking Neurons AB, Pyrater, LangChain4j, Tony Hughes, Kacper WikieΕ‚, Rishabh Srivastava, David Ziegler, Luke Pendergrass, Andrey, Gabriel Puliatti, Lone Striker, Sebastain Graf, Pierre Kircher, Randy H, NimbleBox.ai, Vadim, danny, Deo Leter
175
-
176
-
177
- Thank you to all my generous patrons and donaters!
178
-
179
- And thank you again to a16z for their generous grant.
180
-
181
- <!-- footer end -->
182
-
183
- # Original model card: Kaio Ken's SuperHOT 8K
184
-
185
- ### SuperHOT Prototype 2 w/ 8K Context
186
-
187
- This is a second prototype of SuperHOT, this time 30B with 8K context and no RLHF, using the same technique described in [the github blog](https://kaiokendev.github.io/til#extending-context-to-8k).
188
- Tests have shown that the model does indeed leverage the extended context at 8K.
189
-
190
- You will need to **use either the monkeypatch** or, if you are already using the monkeypatch, **change the scaling factor to 0.25 and the maximum sequence length to 8192**.
191
-
192
- #### Looking for Merged & Quantized Models?
193
- - 30B 4-bit CUDA: [tmpupload/superhot-30b-8k-4bit-safetensors](https://huggingface.co/tmpupload/superhot-30b-8k-4bit-safetensors)
194
- - 30B 4-bit CUDA 128g: [tmpupload/superhot-30b-8k-4bit-128g-safetensors](https://huggingface.co/tmpupload/superhot-30b-8k-4bit-128g-safetensors)
195
-
196
-
197
- #### Training Details
198
- I trained the LoRA with the following configuration:
199
- - 1200 samples (~400 samples over 2048 sequence length)
200
- - learning rate of 3e-4
201
- - 3 epochs
202
- - The exported modules are:
203
- - q_proj
204
- - k_proj
205
- - v_proj
206
- - o_proj
207
- - no bias
208
- - Rank = 4
209
- - Alpha = 8
210
- - no dropout
211
- - weight decay of 0.1
212
- - AdamW beta1 of 0.9 and beta2 0.99, epsilon of 1e-5
213
- - Trained on 4-bit base model
214
-
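A hedged sketch (not from the original card) of a `peft` configuration mirroring the hyperparameters above; the actual training code is not included here.

```python
# Illustrative only: rank 4, alpha 8, no dropout, no bias,
# targeting the q/k/v/o projection modules.
from peft import LoraConfig

lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
```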
215
- # Original model card: TehVenom's merge of PygmalionAI's Pygmalion 13B
216
-
217
- <h1 style="text-align: center">Pygmalion 13b</h1>
218
- <h2 style="text-align: center">A conversational LLaMA fine-tune.</h2>
219
-
220
- ## Model Details:
221
-
222
- Pygmalion 13b is a dialogue model based on Meta's LLaMA-13b.
223
-
224
- This is version 1. It has been fine-tuned using a subset of the data from Pygmalion-6B-v8-pt4,
225
- for those of you familiar with the project.
226
-
227
- The current Pygmalion-13b has been trained as a LoRA, then merged down to the base model for distribution.
228
-
229
- ## Applying the XORs
230
-
231
- This model has the XOR files pre-applied out of the box.
232
- Converted from the XOR weights from PygmalionAI's release: https://huggingface.co/PygmalionAI/pygmalion-13b
233
-
234
- ## Prompting
235
-
236
- The model was trained on the usual Pygmalion persona + chat format, so any of the usual UIs should already handle everything correctly. If you're using the model directly, this is the expected formatting:
237
-
238
- ```
239
- [CHARACTER]'s Persona: [A few sentences about the character you want the model to play]
240
- <START>
241
- [DIALOGUE HISTORY]
242
- You: [User's input message here]
243
- [CHARACTER]:
244
- ```
245
-
246
- Where `[CHARACTER]` is, as you can probably guess, the name of the character you want the model to portray, `<START>` should be used verbatim as a delimiter token to separate persona and scenario data from the dialogue, and `[DIALOGUE HISTORY]` is a sliding window of chat history so the model can have conversational context to draw from. Here's a concrete example:
247
-
248
- ```
249
- Assistant's Persona: Assistant is a highly intelligent language model trained to comply with user requests.
250
- <START>
251
- Assistant: Hello! How may I help you today?
252
- You: What is Zork?
253
- Assistant:
254
- ```
255
-
256
- Which will generate something like:
257
-
258
- ```
259
- Zork is an interactive fiction computer game created in the 1970s by Infocom, Inc., which was later acquired by Activision Blizzard. It is widely considered one of the most influential games ever made and has been credited with popularizing text-based adventure games. The original version of Zork was written in the programming language MACRO-10, but it was ported to many other platforms over the years."
260
- ```
261
-
262
- The model will automatically emit an end-of-text token (`</s>`) when it judges that the response is complete.
263
-
264
- ## Eval / Benchmark scores
265
-
266
-
267
- Current evals out of the Pygmalion-13b model: <br>
268
- <html>
269
- <head>
270
- <style>
271
- table {
272
- border:1px solid #b3adad;
273
- border-collapse:collapse;
274
- padding:5px;
275
- }
276
- table th {
277
- border:1px solid #b3adad;
278
- padding:5px;
279
- background: #f0f0f0;
280
- color: #313030;
281
- }
282
- table td {
283
- border:1px solid #b3adad;
284
- text-align:center;
285
- padding:5px;
286
- background: #ffffff;
287
- color: #313030;
288
- }
289
- </style>
290
- </head>
291
- <body>
292
- <table>
293
- <thead>
294
- <tr>
295
- <th>Model:</th>
296
- <th>Wikitext2</th>
297
- <th>Ptb-New</th>
298
- <th>C4-New</th>
299
- </tr>
300
- </thead>
301
- <tbody>
302
- <tr>
303
- <td>Pygmalion 13b - 16bit</td>
304
- <td>5.710726737976074</td>
305
- <td>23.633684158325195</td>
306
- <td>7.6324849128723145</td>
307
- </tr>
308
- </tbody>
309
- </table>
310
- </body>
311
- </html>
312
- <br>Thanks to YellowRose#1776 for the numbers.
313
- <hr>
314
-
315
- ## Other notes
316
-
317
- - When prompted correctly, the model will always start by generating a BOS token. This behavior is an accidental side-effect which we plan to address in future model versions and should not be relied upon.
318
- - The model was trained as a LoRA with a somewhat unorthodox configuration which causes errors when used with the current version of `peft`, hence we release it as a full model instead.
319
-
320
-
321
- ## Limitations and biases
322
-
323
- The intended use-case for this model is fictional conversation for entertainment purposes. Any other sort of usage is out of scope.
324
-
325
- As such, it was **not** fine-tuned to be safe and harmless: the base model _and_ this fine-tune have been trained on data known to contain profanity and texts that are lewd or otherwise offensive. It may produce socially unacceptable or undesirable text, even if the prompt itself does not include anything explicitly offensive. Outputs might often be factually wrong or misleading.
models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/config.json DELETED
@@ -1,44 +0,0 @@
1
- {
2
- "_name_or_path": "/workspace/superhot_process/pygmalion-13b/source",
3
- "architectures": [
4
- "LlamaForCausalLM"
5
- ],
6
- "badwordsids": [
7
- [
8
- 0
9
- ]
10
- ],
11
- "bos_token_id": 1,
12
- "eos_token_id": 2,
13
- "hidden_act": "silu",
14
- "hidden_size": 5120,
15
- "initializer_range": 0.02,
16
- "intermediate_size": 13824,
17
- "max_position_embeddings": 8192,
18
- "max_sequence_length": 2048,
19
- "model_type": "llama",
20
- "num_attention_heads": 40,
21
- "num_hidden_layers": 40,
22
- "pad_token_id": 0,
23
- "rms_norm_eps": 1e-06,
24
- "tie_word_embeddings": false,
25
- "torch_dtype": "float16",
26
- "transformers_version": "4.30.0.dev0",
27
- "use_cache": true,
28
- "vocab_size": 32000,
29
- "auto_map": {
30
- "AutoModel": "modelling_llama.LlamaModel",
31
- "AutoModelForCausalLM": "modelling_llama.LlamaForCausalLM",
32
- "AutoModelForSequenceClassification": "modelling_llama.LlamaForSequenceClassification"
33
- },
34
- "quantization_config": {
35
- "bits": 4,
36
- "group_size": 128,
37
- "damp_percent": 0.01,
38
- "desc_act": false,
39
- "sym": true,
40
- "true_sequential": true,
41
- "model_file_base_name": "model",
42
- "quant_method": "gptq"
43
- }
44
- }
models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/llama_rope_scaled_monkey_patch.py DELETED
@@ -1,65 +0,0 @@
1
- import torch
2
- import transformers
3
- import transformers.models.llama.modeling_llama
4
- from einops import rearrange
5
- import random
6
-
7
- # This monkey patch file is not needed if using ExLlama, or if using `trust_remote_code=True`
8
-
9
- class ScaledRotaryEmbedding(torch.nn.Module):
10
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
11
- super().__init__()
12
- inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
13
- self.register_buffer("inv_freq", inv_freq)
14
-
15
- max_position_embeddings = 8192
16
-
17
- # Build here to make `torch.jit.trace` work.
18
- self.max_seq_len_cached = max_position_embeddings
19
- t = torch.arange(
20
- self.max_seq_len_cached,
21
- device=self.inv_freq.device,
22
- dtype=self.inv_freq.dtype,
23
- )
24
-
25
- self.scale = 1 / 4
26
- t *= self.scale
27
-
28
- freqs = torch.einsum("i,j->ij", t, self.inv_freq)
29
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
30
- emb = torch.cat((freqs, freqs), dim=-1)
31
- self.register_buffer(
32
- "cos_cached", emb.cos()[None, None, :, :], persistent=False
33
- )
34
- self.register_buffer(
35
- "sin_cached", emb.sin()[None, None, :, :], persistent=False
36
- )
37
-
38
- def forward(self, x, seq_len=None):
39
- # x: [bs, num_attention_heads, seq_len, head_size]
40
- # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
41
- if seq_len > self.max_seq_len_cached:
42
- self.max_seq_len_cached = seq_len
43
- t = torch.arange(
44
- self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
45
- )
46
- t *= self.scale
47
- freqs = torch.einsum("i,j->ij", t, self.inv_freq)
48
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
49
- emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
50
- self.register_buffer(
51
- "cos_cached", emb.cos()[None, None, :, :], persistent=False
52
- )
53
- self.register_buffer(
54
- "sin_cached", emb.sin()[None, None, :, :], persistent=False
55
- )
56
- return (
57
- self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
58
- self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
59
- )
60
-
61
-
62
- def replace_llama_rope_with_scaled_rope():
63
- transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = (
64
- ScaledRotaryEmbedding
65
- )
models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/modelling_llama.py DELETED
@@ -1,894 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
- # you may not use this file except in compliance with the License.
11
- # You may obtain a copy of the License at
12
- #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
- #
15
- # Unless required by applicable law or agreed to in writing, software
16
- # distributed under the License is distributed on an "AS IS" BASIS,
17
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
- # See the License for the specific language governing permissions and
19
- # limitations under the License.
20
- """ PyTorch LLaMA model."""
21
- import math
22
- from typing import List, Optional, Tuple, Union
23
-
24
- import torch
25
- import torch.utils.checkpoint
26
- from torch import nn
27
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
-
29
- from transformers.activations import ACT2FN
30
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
31
- from transformers.modeling_utils import PreTrainedModel
32
- from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
33
- from transformers.models.llama.modeling_llama import LlamaConfig
34
-
35
- logger = logging.get_logger(__name__)
36
-
37
- _CONFIG_FOR_DOC = "LlamaConfig"
38
-
39
-
40
- # Copied from transformers.models.bart.modeling_bart._make_causal_mask
41
- def _make_causal_mask(
42
- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
43
- ):
44
- """
45
- Make causal mask used for bi-directional self-attention.
46
- """
47
- bsz, tgt_len = input_ids_shape
48
- mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
49
- mask_cond = torch.arange(mask.size(-1), device=device)
50
- mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
51
- mask = mask.to(dtype)
52
-
53
- if past_key_values_length > 0:
54
- mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
55
- return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
56
-
57
-
58
- # Copied from transformers.models.bart.modeling_bart._expand_mask
59
- def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
60
- """
61
- Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
62
- """
63
- bsz, src_len = mask.size()
64
- tgt_len = tgt_len if tgt_len is not None else src_len
65
-
66
- expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
67
-
68
- inverted_mask = 1.0 - expanded_mask
69
-
70
- return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
71
-
72
-
73
- class LlamaRMSNorm(nn.Module):
74
- def __init__(self, hidden_size, eps=1e-6):
75
- """
76
- LlamaRMSNorm is equivalent to T5LayerNorm
77
- """
78
- super().__init__()
79
- self.weight = nn.Parameter(torch.ones(hidden_size))
80
- self.variance_epsilon = eps
81
-
82
- def forward(self, hidden_states):
83
- input_dtype = hidden_states.dtype
84
- variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
85
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
86
-
87
- return (self.weight * hidden_states).to(input_dtype)
88
-
89
-
90
- class LlamaRotaryEmbedding(torch.nn.Module):
91
- def __init__(self, dim, max_position_embeddings=2048, base=10000, scale=1, device=None):
92
- super().__init__()
93
- inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
94
- self.register_buffer("inv_freq", inv_freq)
95
-
96
- # Build here to make `torch.jit.trace` work.
97
- self.max_seq_len_cached = max_position_embeddings
98
- t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
99
-
100
- self.scale = scale
101
- t *= self.scale
102
-
103
- freqs = torch.einsum("i,j->ij", t, self.inv_freq)
104
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
105
- emb = torch.cat((freqs, freqs), dim=-1)
106
- dtype = torch.get_default_dtype()
107
- self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
108
- self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
109
-
110
- def forward(self, x, seq_len=None):
111
- # x: [bs, num_attention_heads, seq_len, head_size]
112
- # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
113
- if seq_len > self.max_seq_len_cached:
114
- self.max_seq_len_cached = seq_len
115
- t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
116
- freqs = torch.einsum("i,j->ij", t, self.inv_freq)
117
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
118
- emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
119
- self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
120
- self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
121
- return (
122
- self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
123
- self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
124
- )
125
-
126
-
127
- def rotate_half(x):
128
- """Rotates half the hidden dims of the input."""
129
- x1 = x[..., : x.shape[-1] // 2]
130
- x2 = x[..., x.shape[-1] // 2 :]
131
- return torch.cat((-x2, x1), dim=-1)
132
-
133
-
134
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
135
- # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
136
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
137
- sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
138
- cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
139
- sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
140
- q_embed = (q * cos) + (rotate_half(q) * sin)
141
- k_embed = (k * cos) + (rotate_half(k) * sin)
142
- return q_embed, k_embed
143
-
144
-
145
- class LlamaMLP(nn.Module):
146
- def __init__(
147
- self,
148
- hidden_size: int,
149
- intermediate_size: int,
150
- hidden_act: str,
151
- ):
152
- super().__init__()
153
- self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
154
- self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
155
- self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
156
- self.act_fn = ACT2FN[hidden_act]
157
-
158
- def forward(self, x):
159
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
160
-
161
-
162
- class LlamaAttention(nn.Module):
163
- """Multi-headed attention from 'Attention Is All You Need' paper"""
164
-
165
- def __init__(self, config: LlamaConfig):
166
- super().__init__()
167
- self.config = config
168
- self.hidden_size = config.hidden_size
169
- self.num_heads = config.num_attention_heads
170
- self.head_dim = self.hidden_size // self.num_heads
171
- self.max_position_embeddings = config.max_position_embeddings
172
- self.position_embeddings_scale = 2048 / self.max_position_embeddings
173
-
174
- if (self.head_dim * self.num_heads) != self.hidden_size:
175
- raise ValueError(
176
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
177
- f" and `num_heads`: {self.num_heads})."
178
- )
179
- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
180
- self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
181
- self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
182
- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
183
- self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings, scale=self.position_embeddings_scale)
184
-
185
- def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
186
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
187
-
188
- def forward(
189
- self,
190
- hidden_states: torch.Tensor,
191
- attention_mask: Optional[torch.Tensor] = None,
192
- position_ids: Optional[torch.LongTensor] = None,
193
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
194
- output_attentions: bool = False,
195
- use_cache: bool = False,
196
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
197
- bsz, q_len, _ = hidden_states.size()
198
-
199
- query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
200
- key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
201
- value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
202
-
203
- kv_seq_len = key_states.shape[-2]
204
- if past_key_value is not None:
205
- kv_seq_len += past_key_value[0].shape[-2]
206
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
207
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
208
- # [bsz, nh, t, hd]
209
-
210
- if past_key_value is not None:
211
- # reuse k, v, self_attention
212
- key_states = torch.cat([past_key_value[0], key_states], dim=2)
213
- value_states = torch.cat([past_key_value[1], value_states], dim=2)
214
-
215
- past_key_value = (key_states, value_states) if use_cache else None
216
-
217
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
218
-
219
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
220
- raise ValueError(
221
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
222
- f" {attn_weights.size()}"
223
- )
224
-
225
- if attention_mask is not None:
226
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
227
- raise ValueError(
228
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
229
- )
230
- attn_weights = attn_weights + attention_mask
231
- attn_weights = torch.max(
232
- attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
233
- )
234
-
235
- # upcast attention to fp32
236
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
237
- attn_output = torch.matmul(attn_weights, value_states)
238
-
239
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
240
- raise ValueError(
241
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
242
- f" {attn_output.size()}"
243
- )
244
-
245
- attn_output = attn_output.transpose(1, 2)
246
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
247
-
248
- attn_output = self.o_proj(attn_output)
249
-
250
- if not output_attentions:
251
- attn_weights = None
252
-
253
- return attn_output, attn_weights, past_key_value
254
-
255
-
256
- class LlamaDecoderLayer(nn.Module):
257
- def __init__(self, config: LlamaConfig):
258
- super().__init__()
259
- self.hidden_size = config.hidden_size
260
- self.self_attn = LlamaAttention(config=config)
261
- self.mlp = LlamaMLP(
262
- hidden_size=self.hidden_size,
263
- intermediate_size=config.intermediate_size,
264
- hidden_act=config.hidden_act,
265
- )
266
- self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
267
- self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
268
-
269
- def forward(
270
- self,
271
- hidden_states: torch.Tensor,
272
- attention_mask: Optional[torch.Tensor] = None,
273
- position_ids: Optional[torch.LongTensor] = None,
274
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
275
- output_attentions: Optional[bool] = False,
276
- use_cache: Optional[bool] = False,
277
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
278
- """
279
- Args:
280
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
281
- attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
282
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
283
- output_attentions (`bool`, *optional*):
284
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
285
- returned tensors for more detail.
286
- use_cache (`bool`, *optional*):
287
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
288
- (see `past_key_values`).
289
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
290
- """
291
-
292
- residual = hidden_states
293
-
294
- hidden_states = self.input_layernorm(hidden_states)
295
-
296
- # Self Attention
297
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
298
- hidden_states=hidden_states,
299
- attention_mask=attention_mask,
300
- position_ids=position_ids,
301
- past_key_value=past_key_value,
302
- output_attentions=output_attentions,
303
- use_cache=use_cache,
304
- )
305
- hidden_states = residual + hidden_states
306
-
307
- # Fully Connected
308
- residual = hidden_states
309
- hidden_states = self.post_attention_layernorm(hidden_states)
310
- hidden_states = self.mlp(hidden_states)
311
- hidden_states = residual + hidden_states
312
-
313
- outputs = (hidden_states,)
314
-
315
- if output_attentions:
316
- outputs += (self_attn_weights,)
317
-
318
- if use_cache:
319
- outputs += (present_key_value,)
320
-
321
- return outputs
322
-
323
-
324
- LLAMA_START_DOCSTRING = r"""
325
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
326
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
327
- etc.)
328
-
329
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
330
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
331
- and behavior.
332
-
333
- Parameters:
334
- config ([`LlamaConfig`]):
335
- Model configuration class with all the parameters of the model. Initializing with a config file does not
336
- load the weights associated with the model, only the configuration. Check out the
337
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
338
- """
339
-
340
-
341
- @add_start_docstrings(
342
- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
343
- LLAMA_START_DOCSTRING,
344
- )
345
- class LlamaPreTrainedModel(PreTrainedModel):
346
- config_class = LlamaConfig
347
- base_model_prefix = "model"
348
- supports_gradient_checkpointing = True
349
- _no_split_modules = ["LlamaDecoderLayer"]
350
- _skip_keys_device_placement = "past_key_values"
351
- _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
352
-
353
- def _init_weights(self, module):
354
- std = self.config.initializer_range
355
- if isinstance(module, nn.Linear):
356
- module.weight.data.normal_(mean=0.0, std=std)
357
- if module.bias is not None:
358
- module.bias.data.zero_()
359
- elif isinstance(module, nn.Embedding):
360
- module.weight.data.normal_(mean=0.0, std=std)
361
- if module.padding_idx is not None:
362
- module.weight.data[module.padding_idx].zero_()
363
-
364
- def _set_gradient_checkpointing(self, module, value=False):
365
- if isinstance(module, LlamaModel):
366
- module.gradient_checkpointing = value
367
-
368
-
369
- LLAMA_INPUTS_DOCSTRING = r"""
370
- Args:
371
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
372
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
373
- it.
374
-
375
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
376
- [`PreTrainedTokenizer.__call__`] for details.
377
-
378
- [What are input IDs?](../glossary#input-ids)
379
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
380
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
381
-
382
- - 1 for tokens that are **not masked**,
383
- - 0 for tokens that are **masked**.
384
-
385
- [What are attention masks?](../glossary#attention-mask)
386
-
387
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
388
- [`PreTrainedTokenizer.__call__`] for details.
389
-
390
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
391
- `past_key_values`).
392
-
393
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
394
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
395
- information on the default strategy.
396
-
397
- - 1 indicates the head is **not masked**,
398
- - 0 indicates the head is **masked**.
399
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
400
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
401
- config.n_positions - 1]`.
402
-
403
- [What are position IDs?](../glossary#position-ids)
404
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
405
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
406
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
407
- `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
408
-
409
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
410
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
411
-
412
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
413
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
414
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
415
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
416
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
417
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
418
- model's internal embedding lookup matrix.
419
- use_cache (`bool`, *optional*):
420
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
421
- `past_key_values`).
422
- output_attentions (`bool`, *optional*):
423
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
424
- tensors for more detail.
425
- output_hidden_states (`bool`, *optional*):
426
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
427
- more detail.
428
- return_dict (`bool`, *optional*):
429
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
430
- """
431
-
432
-
433
- @add_start_docstrings(
434
- "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
435
- LLAMA_START_DOCSTRING,
436
- )
437
- class LlamaModel(LlamaPreTrainedModel):
438
- """
439
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
440
-
441
- Args:
442
- config: LlamaConfig
443
- """
444
-
445
- def __init__(self, config: LlamaConfig):
446
- super().__init__(config)
447
- self.padding_idx = config.pad_token_id
448
- self.vocab_size = config.vocab_size
449
-
450
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
451
- self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
452
- self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
453
-
454
- self.gradient_checkpointing = False
455
- # Initialize weights and apply final processing
456
- self.post_init()
457
-
458
- def get_input_embeddings(self):
459
- return self.embed_tokens
460
-
461
- def set_input_embeddings(self, value):
462
- self.embed_tokens = value
463
-
464
- # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
465
- def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
466
- # create causal mask
467
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
468
- combined_attention_mask = None
469
- if input_shape[-1] > 1:
470
- combined_attention_mask = _make_causal_mask(
471
- input_shape,
472
- inputs_embeds.dtype,
473
- device=inputs_embeds.device,
474
- past_key_values_length=past_key_values_length,
475
- )
476
-
477
- if attention_mask is not None:
478
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
479
- expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
480
- inputs_embeds.device
481
- )
482
- combined_attention_mask = (
483
- expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
484
- )
485
-
486
- return combined_attention_mask
487
-
488
- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
489
- def forward(
490
- self,
491
- input_ids: torch.LongTensor = None,
492
- attention_mask: Optional[torch.Tensor] = None,
493
- position_ids: Optional[torch.LongTensor] = None,
494
- past_key_values: Optional[List[torch.FloatTensor]] = None,
495
- inputs_embeds: Optional[torch.FloatTensor] = None,
496
- use_cache: Optional[bool] = None,
497
- output_attentions: Optional[bool] = None,
498
- output_hidden_states: Optional[bool] = None,
499
- return_dict: Optional[bool] = None,
500
- ) -> Union[Tuple, BaseModelOutputWithPast]:
501
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
502
- output_hidden_states = (
503
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
504
- )
505
- use_cache = use_cache if use_cache is not None else self.config.use_cache
506
-
507
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
508
-
509
- # retrieve input_ids and inputs_embeds
510
- if input_ids is not None and inputs_embeds is not None:
511
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
512
- elif input_ids is not None:
513
- batch_size, seq_length = input_ids.shape
514
- elif inputs_embeds is not None:
515
- batch_size, seq_length, _ = inputs_embeds.shape
516
- else:
517
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
518
-
519
- seq_length_with_past = seq_length
520
- past_key_values_length = 0
521
-
522
- if past_key_values is not None:
523
- past_key_values_length = past_key_values[0][0].shape[2]
524
- seq_length_with_past = seq_length_with_past + past_key_values_length
525
-
526
- if position_ids is None:
527
- device = input_ids.device if input_ids is not None else inputs_embeds.device
528
- position_ids = torch.arange(
529
- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
530
- )
531
- position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
532
- else:
533
- position_ids = position_ids.view(-1, seq_length).long()
534
-
535
- if inputs_embeds is None:
536
- inputs_embeds = self.embed_tokens(input_ids)
537
- # embed positions
538
- if attention_mask is None:
539
- attention_mask = torch.ones(
540
- (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
541
- )
542
- attention_mask = self._prepare_decoder_attention_mask(
543
- attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
544
- )
545
-
546
- hidden_states = inputs_embeds
547
-
548
- if self.gradient_checkpointing and self.training:
549
- if use_cache:
550
- logger.warning_once(
551
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
552
- )
553
- use_cache = False
554
-
555
- # decoder layers
556
- all_hidden_states = () if output_hidden_states else None
557
- all_self_attns = () if output_attentions else None
558
- next_decoder_cache = () if use_cache else None
559
-
560
- for idx, decoder_layer in enumerate(self.layers):
561
- if output_hidden_states:
562
- all_hidden_states += (hidden_states,)
563
-
564
- past_key_value = past_key_values[idx] if past_key_values is not None else None
565
-
566
- if self.gradient_checkpointing and self.training:
567
-
568
- def create_custom_forward(module):
569
- def custom_forward(*inputs):
570
- # None for past_key_value
571
- return module(*inputs, output_attentions, None)
572
-
573
- return custom_forward
574
-
575
- layer_outputs = torch.utils.checkpoint.checkpoint(
576
- create_custom_forward(decoder_layer),
577
- hidden_states,
578
- attention_mask,
579
- position_ids,
580
- None,
581
- )
582
- else:
583
- layer_outputs = decoder_layer(
584
- hidden_states,
585
- attention_mask=attention_mask,
586
- position_ids=position_ids,
587
- past_key_value=past_key_value,
588
- output_attentions=output_attentions,
589
- use_cache=use_cache,
590
- )
591
-
592
- hidden_states = layer_outputs[0]
593
-
594
- if use_cache:
595
- next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
596
-
597
- if output_attentions:
598
- all_self_attns += (layer_outputs[1],)
599
-
600
- hidden_states = self.norm(hidden_states)
601
-
602
- # add hidden states from the last decoder layer
603
- if output_hidden_states:
604
- all_hidden_states += (hidden_states,)
605
-
606
- next_cache = next_decoder_cache if use_cache else None
607
- if not return_dict:
608
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
609
- return BaseModelOutputWithPast(
610
- last_hidden_state=hidden_states,
611
- past_key_values=next_cache,
612
- hidden_states=all_hidden_states,
613
- attentions=all_self_attns,
614
- )
615
-
616
-
617
- class LlamaForCausalLM(LlamaPreTrainedModel):
618
- _tied_weights_keys = ["lm_head.weight"]
619
-
620
- def __init__(self, config):
621
- super().__init__(config)
622
- self.model = LlamaModel(config)
623
-
624
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
625
-
626
- # Initialize weights and apply final processing
627
- self.post_init()
628
-
629
- def get_input_embeddings(self):
630
- return self.model.embed_tokens
631
-
632
- def set_input_embeddings(self, value):
633
- self.model.embed_tokens = value
634
-
635
- def get_output_embeddings(self):
636
- return self.lm_head
637
-
638
- def set_output_embeddings(self, new_embeddings):
639
- self.lm_head = new_embeddings
640
-
641
- def set_decoder(self, decoder):
642
- self.model = decoder
643
-
644
- def get_decoder(self):
645
- return self.model
646
-
647
- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
648
- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
649
- def forward(
650
- self,
651
- input_ids: torch.LongTensor = None,
652
- attention_mask: Optional[torch.Tensor] = None,
653
- position_ids: Optional[torch.LongTensor] = None,
654
- past_key_values: Optional[List[torch.FloatTensor]] = None,
655
- inputs_embeds: Optional[torch.FloatTensor] = None,
656
- labels: Optional[torch.LongTensor] = None,
657
- use_cache: Optional[bool] = None,
658
- output_attentions: Optional[bool] = None,
659
- output_hidden_states: Optional[bool] = None,
660
- return_dict: Optional[bool] = None,
661
- ) -> Union[Tuple, CausalLMOutputWithPast]:
662
- r"""
663
- Args:
664
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
665
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
666
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
667
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
668
-
669
- Returns:
670
-
671
- Example:
672
-
673
- ```python
674
- >>> from transformers import AutoTokenizer, LlamaForCausalLM
675
-
676
- >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
677
- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
678
-
679
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
680
- >>> inputs = tokenizer(prompt, return_tensors="pt")
681
-
682
- >>> # Generate
683
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
684
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
685
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
686
- ```"""
687
-
688
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
689
- output_hidden_states = (
690
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
691
- )
692
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
693
-
694
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
695
- outputs = self.model(
696
- input_ids=input_ids,
697
- attention_mask=attention_mask,
698
- position_ids=position_ids,
699
- past_key_values=past_key_values,
700
- inputs_embeds=inputs_embeds,
701
- use_cache=use_cache,
702
- output_attentions=output_attentions,
703
- output_hidden_states=output_hidden_states,
704
- return_dict=return_dict,
705
- )
706
-
707
- hidden_states = outputs[0]
708
- logits = self.lm_head(hidden_states)
709
-
710
- loss = None
711
- if labels is not None:
712
- # Shift so that tokens < n predict n
713
- shift_logits = logits[..., :-1, :].contiguous()
714
- shift_labels = labels[..., 1:].contiguous()
715
- # Flatten the tokens
716
- loss_fct = CrossEntropyLoss()
717
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
718
- shift_labels = shift_labels.view(-1)
719
- # Enable model parallelism
720
- shift_labels = shift_labels.to(shift_logits.device)
721
- loss = loss_fct(shift_logits, shift_labels)
722
-
723
- if not return_dict:
724
- output = (logits,) + outputs[1:]
725
- return (loss,) + output if loss is not None else output
726
-
727
- return CausalLMOutputWithPast(
728
- loss=loss,
729
- logits=logits,
730
- past_key_values=outputs.past_key_values,
731
- hidden_states=outputs.hidden_states,
732
- attentions=outputs.attentions,
733
- )
734
-
735
- def prepare_inputs_for_generation(
736
- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
737
- ):
738
- if past_key_values:
739
- input_ids = input_ids[:, -1:]
740
-
741
- position_ids = kwargs.get("position_ids", None)
742
- if attention_mask is not None and position_ids is None:
743
- # create position_ids on the fly for batch generation
744
- position_ids = attention_mask.long().cumsum(-1) - 1
745
- position_ids.masked_fill_(attention_mask == 0, 1)
746
- if past_key_values:
747
- position_ids = position_ids[:, -1].unsqueeze(-1)
748
-
749
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
750
- if inputs_embeds is not None and past_key_values is None:
751
- model_inputs = {"inputs_embeds": inputs_embeds}
752
- else:
753
- model_inputs = {"input_ids": input_ids}
754
-
755
- model_inputs.update(
756
- {
757
- "position_ids": position_ids,
758
- "past_key_values": past_key_values,
759
- "use_cache": kwargs.get("use_cache"),
760
- "attention_mask": attention_mask,
761
- }
762
- )
763
- return model_inputs
764
-
765
- @staticmethod
766
- def _reorder_cache(past_key_values, beam_idx):
767
- reordered_past = ()
768
- for layer_past in past_key_values:
769
- reordered_past += (
770
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
771
- )
772
- return reordered_past
773
-
774
-
775
- @add_start_docstrings(
776
- """
777
- The LLaMa Model transformer with a sequence classification head on top (linear layer).
778
-
779
- [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
780
- (e.g. GPT-2) do.
781
-
782
- Since it does classification on the last token, it requires to know the position of the last token. If a
783
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
784
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
785
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
786
- each row of the batch).
787
- """,
788
- LLAMA_START_DOCSTRING,
789
- )
790
- class LlamaForSequenceClassification(LlamaPreTrainedModel):
791
- _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
792
-
793
- def __init__(self, config):
794
- super().__init__(config)
795
- self.num_labels = config.num_labels
796
- self.model = LlamaModel(config)
797
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
798
-
799
- # Initialize weights and apply final processing
800
- self.post_init()
801
-
802
- def get_input_embeddings(self):
803
- return self.model.embed_tokens
804
-
805
- def set_input_embeddings(self, value):
806
- self.model.embed_tokens = value
807
-
808
- @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
809
- def forward(
810
- self,
811
- input_ids: torch.LongTensor = None,
812
- attention_mask: Optional[torch.Tensor] = None,
813
- position_ids: Optional[torch.LongTensor] = None,
814
- past_key_values: Optional[List[torch.FloatTensor]] = None,
815
- inputs_embeds: Optional[torch.FloatTensor] = None,
816
- labels: Optional[torch.LongTensor] = None,
817
- use_cache: Optional[bool] = None,
818
- output_attentions: Optional[bool] = None,
819
- output_hidden_states: Optional[bool] = None,
820
- return_dict: Optional[bool] = None,
821
- ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
822
- r"""
823
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
824
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
825
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
826
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
827
- """
828
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
829
-
830
- transformer_outputs = self.model(
831
- input_ids,
832
- attention_mask=attention_mask,
833
- position_ids=position_ids,
834
- past_key_values=past_key_values,
835
- inputs_embeds=inputs_embeds,
836
- use_cache=use_cache,
837
- output_attentions=output_attentions,
838
- output_hidden_states=output_hidden_states,
839
- return_dict=return_dict,
840
- )
841
- hidden_states = transformer_outputs[0]
842
- logits = self.score(hidden_states)
843
-
844
- if input_ids is not None:
845
- batch_size = input_ids.shape[0]
846
- else:
847
- batch_size = inputs_embeds.shape[0]
848
-
849
- if self.config.pad_token_id is None and batch_size != 1:
850
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
851
- if self.config.pad_token_id is None:
852
- sequence_lengths = -1
853
- else:
854
- if input_ids is not None:
855
- sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
856
- else:
857
- sequence_lengths = -1
858
-
859
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
860
-
861
- loss = None
862
- if labels is not None:
863
- labels = labels.to(logits.device)
864
- if self.config.problem_type is None:
865
- if self.num_labels == 1:
866
- self.config.problem_type = "regression"
867
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
868
- self.config.problem_type = "single_label_classification"
869
- else:
870
- self.config.problem_type = "multi_label_classification"
871
-
872
- if self.config.problem_type == "regression":
873
- loss_fct = MSELoss()
874
- if self.num_labels == 1:
875
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
876
- else:
877
- loss = loss_fct(pooled_logits, labels)
878
- elif self.config.problem_type == "single_label_classification":
879
- loss_fct = CrossEntropyLoss()
880
- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
881
- elif self.config.problem_type == "multi_label_classification":
882
- loss_fct = BCEWithLogitsLoss()
883
- loss = loss_fct(pooled_logits, labels)
884
- if not return_dict:
885
- output = (pooled_logits,) + transformer_outputs[1:]
886
- return ((loss,) + output) if loss is not None else output
887
-
888
- return SequenceClassifierOutputWithPast(
889
- loss=loss,
890
- logits=pooled_logits,
891
- past_key_values=transformer_outputs.past_key_values,
892
- hidden_states=transformer_outputs.hidden_states,
893
- attentions=transformer_outputs.attentions,
894
- )
models/TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ/special_tokens_map.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "unk_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
- }
models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/LICENSE.txt ADDED
@@ -0,0 +1,126 @@
1
+ LLAMA 2 COMMUNITY LICENSE AGREEMENT
2
+ Llama 2 Version Release Date: July 18, 2023
3
+
4
+ "Agreement" means the terms and conditions for use, reproduction, distribution and
5
+ modification of the Llama Materials set forth herein.
6
+
7
+ "Documentation" means the specifications, manuals and documentation
8
+ accompanying Llama 2 distributed by Meta at ai.meta.com/resources/models-and-
9
+ libraries/llama-downloads/.
10
+
11
+ "Licensee" or "you" means you, or your employer or any other person or entity (if
12
+ you are entering into this Agreement on such person or entity's behalf), of the age
13
+ required under applicable laws, rules or regulations to provide legal consent and that
14
+ has legal authority to bind your employer or such other person or entity if you are
15
+ entering in this Agreement on their behalf.
16
+
17
+ "Llama 2" means the foundational large language models and software and
18
+ algorithms, including machine-learning model code, trained model weights,
19
+ inference-enabling code, training-enabling code, fine-tuning enabling code and other
20
+ elements of the foregoing distributed by Meta at ai.meta.com/resources/models-and-
21
+ libraries/llama-downloads/.
22
+
23
+ "Llama Materials" means, collectively, Meta's proprietary Llama 2 and
24
+ Documentation (and any portion thereof) made available under this Agreement.
25
+
26
+ "Meta" or "we" means Meta Platforms Ireland Limited (if you are located in or, if you
27
+ are an entity, your principal place of business is in the EEA or Switzerland) and Meta
28
+ Platforms, Inc. (if you are located outside of the EEA or Switzerland).
29
+
30
+ By clicking "I Accept" below or by using or distributing any portion or element of the
31
+ Llama Materials, you agree to be bound by this Agreement.
32
+
33
+ 1. License Rights and Redistribution.
34
+
35
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-
36
+ transferable and royalty-free limited license under Meta's intellectual property or
37
+ other rights owned by Meta embodied in the Llama Materials to use, reproduce,
38
+ distribute, copy, create derivative works of, and make modifications to the Llama
39
+ Materials.
40
+
41
+ b. Redistribution and Use.
42
+
43
+ i. If you distribute or make the Llama Materials, or any derivative works
44
+ thereof, available to a third party, you shall provide a copy of this Agreement to such
45
+ third party.
46
+ ii. If you receive Llama Materials, or any derivative works thereof, from
47
+ a Licensee as part of an integrated end user product, then Section 2 of this
48
+ Agreement will not apply to you.
49
+
50
+ iii. You must retain in all copies of the Llama Materials that you
51
+ distribute the following attribution notice within a "Notice" text file distributed as a
52
+ part of such copies: "Llama 2 is licensed under the LLAMA 2 Community License,
53
+ Copyright (c) Meta Platforms, Inc. All Rights Reserved."
54
+
55
+ iv. Your use of the Llama Materials must comply with applicable laws
56
+ and regulations (including trade compliance laws and regulations) and adhere to the
57
+ Acceptable Use Policy for the Llama Materials (available at
58
+ https://ai.meta.com/llama/use-policy), which is hereby incorporated by reference into
59
+ this Agreement.
60
+
61
+ v. You will not use the Llama Materials or any output or results of the
62
+ Llama Materials to improve any other large language model (excluding Llama 2 or
63
+ derivative works thereof).
64
+
65
+ 2. Additional Commercial Terms. If, on the Llama 2 version release date, the
66
+ monthly active users of the products or services made available by or for Licensee,
67
+ or Licensee's affiliates, is greater than 700 million monthly active users in the
68
+ preceding calendar month, you must request a license from Meta, which Meta may
69
+ grant to you in its sole discretion, and you are not authorized to exercise any of the
70
+ rights under this Agreement unless or until Meta otherwise expressly grants you
71
+ such rights.
72
+
73
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE
74
+ LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE
75
+ PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
76
+ EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY
77
+ WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR
78
+ FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE
79
+ FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING
80
+ THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR
81
+ USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
82
+
83
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE
84
+ LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT,
85
+ NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS
86
+ AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL,
87
+ CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN
88
+ IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF
89
+ ANY OF THE FOREGOING.
90
+
91
+ 5. Intellectual Property.
92
+
93
+ a. No trademark licenses are granted under this Agreement, and in
94
+ connection with the Llama Materials, neither Meta nor Licensee may use any name
95
+ or mark owned by or associated with the other or any of its affiliates, except as
96
+ required for reasonable and customary use in describing and redistributing the
97
+ Llama Materials.
98
+
99
+ b. Subject to Meta's ownership of Llama Materials and derivatives made by or
100
+ for Meta, with respect to any derivative works and modifications of the Llama
101
+ Materials that are made by you, as between you and Meta, you are and will be the
102
+ owner of such derivative works and modifications.
103
+
104
+ c. If you institute litigation or other proceedings against Meta or any entity
105
+ (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama
106
+ Materials or Llama 2 outputs or results, or any portion of any of the foregoing,
107
+ constitutes infringement of intellectual property or other rights owned or licensable
108
+ by you, then any licenses granted to you under this Agreement shall terminate as of
109
+ the date such litigation or claim is filed or instituted. You will indemnify and hold
110
+ harmless Meta from and against any claim by any third party arising out of or related
111
+ to your use or distribution of the Llama Materials.
112
+
113
+ 6. Term and Termination. The term of this Agreement will commence upon your
114
+ acceptance of this Agreement or access to the Llama Materials and will continue in
115
+ full force and effect until terminated in accordance with the terms and conditions
116
+ herein. Meta may terminate this Agreement if you are in breach of any term or
117
+ condition of this Agreement. Upon termination of this Agreement, you shall delete
118
+ and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the
119
+ termination of this Agreement.
120
+
121
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and
122
+ construed under the laws of the State of California without regard to choice of law
123
+ principles, and the UN Convention on Contracts for the International Sale of Goods
124
+ does not apply to this Agreement. The courts of California shall have exclusive
125
+ jurisdiction of any dispute arising out of this Agreement.
126
+
models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/README.md ADDED
@@ -0,0 +1,308 @@
1
+ ---
2
+ base_model: https://huggingface.co/royallab/Pygmalion-2-13b-SuperCOT2
3
+ inference: false
4
+ language:
5
+ - en
6
+ library_name: transformers
7
+ license: llama2
8
+ model_creator: royallab
9
+ model_name: Pygmalion 2 13B SuperCOT2
10
+ model_type: llama
11
+ pipeline_tag: text-generation
12
+ prompt_template: 'Below is an instruction that describes a task. Write a response
13
+ that appropriately completes the request.
14
+
15
+
16
+ ### Instruction:
17
+
18
+ {prompt}
19
+
20
+
21
+ ### Response:
22
+
23
+ '
24
+ quantized_by: TheBloke
25
+ tags:
26
+ - llama
27
+ - llama-2
28
+ ---
29
+
30
+ <!-- header start -->
31
+ <!-- 200823 -->
32
+ <div style="width: auto; margin-left: auto; margin-right: auto">
33
+ <img src="https://i.imgur.com/EBdldam.jpg" alt="TheBlokeAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
34
+ </div>
35
+ <div style="display: flex; justify-content: space-between; width: 100%;">
36
+ <div style="display: flex; flex-direction: column; align-items: flex-start;">
37
+ <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://discord.gg/theblokeai">Chat & support: TheBloke's Discord server</a></p>
38
+ </div>
39
+ <div style="display: flex; flex-direction: column; align-items: flex-end;">
40
+ <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://www.patreon.com/TheBlokeAI">Want to contribute? TheBloke's Patreon page</a></p>
41
+ </div>
42
+ </div>
43
+ <div style="text-align:center; margin-top: 0em; margin-bottom: 0em"><p style="margin-top: 0.25em; margin-bottom: 0em;">TheBloke's LLM work is generously supported by a grant from <a href="https://a16z.com">andreessen horowitz (a16z)</a></p></div>
44
+ <hr style="margin-top: 1.0em; margin-bottom: 1.0em;">
45
+ <!-- header end -->
46
+
47
+ # Pygmalion 2 13B SuperCOT2 - GPTQ
48
+ - Model creator: [royallab](https://huggingface.co/royallab)
49
+ - Original model: [Pygmalion 2 13B SuperCOT2](https://huggingface.co/royallab/Pygmalion-2-13b-SuperCOT2)
50
+
51
+ <!-- description start -->
52
+ ## Description
53
+
54
+ This repo contains GPTQ model files for [royallab's Pygmalion 2 13B SuperCOT2](https://huggingface.co/royallab/Pygmalion-2-13b-SuperCOT2).
55
+
56
+ Multiple GPTQ parameter permutations are provided; see Provided Files below for details of the options provided, their parameters, and the software used to create them.
57
+
58
+ <!-- description end -->
59
+ <!-- repositories-available start -->
60
+ ## Repositories available
61
+
62
+ * [AWQ model(s) for GPU inference.](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-AWQ)
63
+ * [GPTQ models for GPU inference, with multiple quantisation parameter options.](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ)
64
+ * [2, 3, 4, 5, 6 and 8-bit GGUF models for CPU+GPU inference](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GGUF)
65
+ * [royallab's original unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/royallab/Pygmalion-2-13b-SuperCOT2)
66
+ <!-- repositories-available end -->
67
+
68
+ <!-- prompt-template start -->
69
+ ## Prompt template: Alpaca
70
+
71
+ ```
72
+ Below is an instruction that describes a task. Write a response that appropriately completes the request.
73
+
74
+ ### Instruction:
75
+ {prompt}
76
+
77
+ ### Response:
78
+
79
+ ```
80
+
81
+ <!-- prompt-template end -->
82
+
83
+
84
+ <!-- README_GPTQ.md-provided-files start -->
85
+ ## Provided files and GPTQ parameters
86
+
87
+ Multiple quantisation parameters are provided, to allow you to choose the best one for your hardware and requirements.
88
+
89
+ Each separate quant is in a different branch. See below for instructions on fetching from different branches.
90
+
91
+ All recent GPTQ files are made with AutoGPTQ, and all files in non-main branches are made with AutoGPTQ. Files in the `main` branch which were uploaded before August 2023 were made with GPTQ-for-LLaMa.
92
+
93
+ <details>
94
+ <summary>Explanation of GPTQ parameters</summary>
95
+
96
+ - Bits: The bit size of the quantised model.
97
+ - GS: GPTQ group size. Higher numbers use less VRAM, but have lower quantisation accuracy. "None" is the lowest possible value.
98
+ - Act Order: True or False. Also known as `desc_act`. True results in better quantisation accuracy. Some GPTQ clients have had issues with models that use Act Order plus Group Size, but this is generally resolved now.
99
+ - Damp %: A GPTQ parameter that affects how samples are processed for quantisation. 0.01 is default, but 0.1 results in slightly better accuracy.
100
+ - GPTQ dataset: The dataset used for quantisation. Using a dataset more appropriate to the model's training can improve quantisation accuracy. Note that the GPTQ dataset is not the same as the dataset used to train the model - please refer to the original model repo for details of the training dataset(s).
101
+ - Sequence Length: The length of the dataset sequences used for quantisation. Ideally this is the same as the model sequence length. For some very long sequence models (16+K), a lower sequence length may have to be used. Note that a lower sequence length does not limit the sequence length of the quantised model. It only impacts the quantisation accuracy on longer inference sequences.
102
+ - ExLlama Compatibility: Whether this file can be loaded with ExLlama, which currently only supports Llama models in 4-bit.
103
+
104
+ </details>
105
+
106
+ | Branch | Bits | GS | Act Order | Damp % | GPTQ Dataset | Seq Len | Size | ExLlama | Desc |
107
+ | ------ | ---- | -- | --------- | ------ | ------------ | ------- | ---- | ------- | ---- |
108
+ | [main](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ/tree/main) | 4 | 128 | Yes | 0.1 | [wikitext](https://huggingface.co/datasets/wikitext/viewer/wikitext-2-v1/test) | 4096 | 7.26 GB | Yes | 4-bit, with Act Order and group size 128g. Uses even less VRAM than 64g, but with slightly lower accuracy. |
109
+ | [gptq-4bit-32g-actorder_True](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ/tree/gptq-4bit-32g-actorder_True) | 4 | 32 | Yes | 0.1 | [wikitext](https://huggingface.co/datasets/wikitext/viewer/wikitext-2-v1/test) | 4096 | 8.00 GB | Yes | 4-bit, with Act Order and group size 32g. Gives highest possible inference quality, with maximum VRAM usage. |
110
+ | [gptq-8bit--1g-actorder_True](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ/tree/gptq-8bit--1g-actorder_True) | 8 | None | Yes | 0.1 | [wikitext](https://huggingface.co/datasets/wikitext/viewer/wikitext-2-v1/test) | 4096 | 13.36 GB | No | 8-bit, with Act Order. No group size, to lower VRAM requirements. |
111
+ | [gptq-8bit-128g-actorder_True](https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ/tree/gptq-8bit-128g-actorder_True) | 8 | 128 | Yes | 0.1 | [wikitext](https://huggingface.co/datasets/wikitext/viewer/wikitext-2-v1/test) | 4096 | 13.65 GB | No | 8-bit, with group size 128g for higher inference quality and with Act Order for even higher accuracy. |
112
+
113
+ <!-- README_GPTQ.md-provided-files end -->
114
+
115
+ <!-- README_GPTQ.md-download-from-branches start -->
116
+ ## How to download from branches
117
+
118
+ - In text-generation-webui, you can add `:branch` to the end of the download name, eg `TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ:main`
119
+ - With Git, you can clone a branch with:
120
+ ```
121
+ git clone --single-branch --branch main https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ
122
+ ```
123
+ - In Python Transformers code, the branch is the `revision` parameter; see below.
124
+ <!-- README_GPTQ.md-download-from-branches end -->
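If you would rather script the download than use the webui or git, the `huggingface_hub` library can also fetch a specific branch via its `revision` argument. A minimal sketch, assuming `huggingface_hub` is installed (`pip3 install huggingface_hub`) and using the 4-bit 32g branch from the table above as an example:

```python
from huggingface_hub import snapshot_download

# Download a single branch (revision) of the repo to a local folder.
local_dir = snapshot_download(
    repo_id="TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ",
    revision="gptq-4bit-32g-actorder_True",  # or "main", etc. -- see the branch table above
    local_dir="Pygmalion-2-13B-SuperCOT2-GPTQ",
)
print(f"Files downloaded to: {local_dir}")
```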
125
+ <!-- README_GPTQ.md-text-generation-webui start -->
126
+ ## How to easily download and use this model in [text-generation-webui](https://github.com/oobabooga/text-generation-webui).
127
+
128
+ Please make sure you're using the latest version of [text-generation-webui](https://github.com/oobabooga/text-generation-webui).
129
+
130
+ It is strongly recommended to use the text-generation-webui one-click-installers unless you're sure you know how to make a manual install.
131
+
132
+ 1. Click the **Model tab**.
133
+ 2. Under **Download custom model or LoRA**, enter `TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ`.
134
+ - To download from a specific branch, enter for example `TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ:main`
135
+ - see Provided Files above for the list of branches for each option.
136
+ 3. Click **Download**.
137
+ 4. The model will start downloading. Once it's finished it will say "Done".
138
+ 5. In the top left, click the refresh icon next to **Model**.
139
+ 6. In the **Model** dropdown, choose the model you just downloaded: `Pygmalion-2-13B-SuperCOT2-GPTQ`
140
+ 7. The model will automatically load, and is now ready for use!
141
+ 8. If you want any custom settings, set them and then click **Save settings for this model** followed by **Reload the Model** in the top right.
142
+ * Note that you do not need to and should not set manual GPTQ parameters any more. These are set automatically from the file `quantize_config.json`.
143
+ 9. Once you're ready, click the **Text Generation tab** and enter a prompt to get started!
144
+ <!-- README_GPTQ.md-text-generation-webui end -->
145
+
146
+ <!-- README_GPTQ.md-use-from-python start -->
147
+ ## How to use this GPTQ model from Python code
148
+
149
+ ### Install the necessary packages
150
+
151
+ Requires: Transformers 4.32.0 or later, Optimum 1.12.0 or later, and AutoGPTQ 0.4.2 or later.
152
+
153
+ ```shell
154
+ pip3 install transformers>=4.32.0 optimum>=1.12.0
155
+ pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # Use cu117 if on CUDA 11.7
156
+ ```
157
+
158
+ If you have problems installing AutoGPTQ using the pre-built wheels, install it from source instead:
159
+
160
+ ```shell
161
+ pip3 uninstall -y auto-gptq
162
+ git clone https://github.com/PanQiWei/AutoGPTQ
163
+ cd AutoGPTQ
164
+ pip3 install .
165
+ ```
166
+
167
+ ### For CodeLlama models only: you must use Transformers 4.33.0 or later.
168
+
169
+ If 4.33.0 is not yet released when you read this, you will need to install Transformers from source:
170
+ ```shell
171
+ pip3 uninstall -y transformers
172
+ pip3 install git+https://github.com/huggingface/transformers.git
173
+ ```
174
+
175
+ ### You can then use the following code
176
+
177
+ ```python
178
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
179
+
180
+ model_name_or_path = "TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ"
181
+ # To use a different branch, change revision
182
+ # For example: revision="main"
183
+ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
184
+ device_map="auto",
185
+ trust_remote_code=False,
186
+ revision="main")
187
+
188
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
189
+
190
+ prompt = "Tell me about AI"
191
+ prompt_template=f'''Below is an instruction that describes a task. Write a response that appropriately completes the request.
192
+
193
+ ### Instruction:
194
+ {prompt}
195
+
196
+ ### Response:
197
+
198
+ '''
199
+
200
+ print("\n\n*** Generate:")
201
+
202
+ input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
203
+ output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
204
+ print(tokenizer.decode(output[0]))
205
+
206
+ # Inference can also be done using transformers' pipeline
207
+
208
+ print("*** Pipeline:")
209
+ pipe = pipeline(
210
+ "text-generation",
211
+ model=model,
212
+ tokenizer=tokenizer,
213
+ max_new_tokens=512,
214
+ do_sample=True,
215
+ temperature=0.7,
216
+ top_p=0.95,
217
+ top_k=40,
218
+ repetition_penalty=1.1
219
+ )
220
+
221
+ print(pipe(prompt_template)[0]['generated_text'])
222
+ ```
223
+ <!-- README_GPTQ.md-use-from-python end -->
224
+
225
+ <!-- README_GPTQ.md-compatibility start -->
226
+ ## Compatibility
227
+
228
+ The files provided are tested to work with AutoGPTQ, both via Transformers and using AutoGPTQ directly. They should also work with [Occ4m's GPTQ-for-LLaMa fork](https://github.com/0cc4m/KoboldAI).
229
+
230
+ [ExLlama](https://github.com/turboderp/exllama) is compatible with Llama models in 4-bit. Please see the Provided Files table above for per-file compatibility.
231
+
232
+ [Huggingface Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) is compatible with all GPTQ models.
233
+ <!-- README_GPTQ.md-compatibility end -->
234
+
235
+ <!-- footer start -->
236
+ <!-- 200823 -->
237
+ ## Discord
238
+
239
+ For further support, and discussions on these models and AI in general, join us at:
240
+
241
+ [TheBloke AI's Discord server](https://discord.gg/theblokeai)
242
+
243
+ ## Thanks, and how to contribute
244
+
245
+ Thanks to the [chirper.ai](https://chirper.ai) team!
246
+
247
+ Thanks to Clay from [gpus.llm-utils.org](https://gpus.llm-utils.org)!
248
+
249
+ I've had a lot of people ask if they can contribute. I enjoy providing models and helping people, and would love to be able to spend even more time doing it, as well as expanding into new projects like fine tuning/training.
250
+
251
+ If you're able and willing to contribute it will be most gratefully received and will help me to keep providing more models, and to start work on new AI projects.
252
+
253
+ Donaters will get priority support on any and all AI/LLM/model questions and requests, access to a private Discord room, plus other benefits.
254
+
255
+ * Patreon: https://patreon.com/TheBlokeAI
256
+ * Ko-Fi: https://ko-fi.com/TheBlokeAI
257
+
258
+ **Special thanks to**: Aemon Algiz.
259
+
260
+ **Patreon special mentions**: Alicia Loh, Stephen Murray, K, Ajan Kanaga, RoA, Magnesian, Deo Leter, Olakabola, Eugene Pentland, zynix, Deep Realms, Raymond Fosdick, Elijah Stavena, Iucharbius, Erik BjΓ€reholt, Luis Javier Navarrete Lozano, Nicholas, theTransient, John Detwiler, alfie_i, knownsqashed, Mano Prime, Willem Michiel, Enrico Ros, LangChain4j, OG, Michael Dempsey, Pierre Kircher, Pedro Madruga, James Bentley, Thomas Belote, Luke @flexchar, Leonard Tan, Johann-Peter Hartmann, Illia Dulskyi, Fen Risland, Chadd, S_X, Jeff Scroggin, Ken Nordquist, Sean Connelly, Artur Olbinski, Swaroop Kallakuri, Jack West, Ai Maven, David Ziegler, Russ Johnson, transmissions 11, John Villwock, Alps Aficionado, Clay Pascal, Viktor Bowallius, Subspace Studios, Rainer Wilmers, Trenton Dambrowitz, vamX, Michael Levine, 쀀ꡐ κΉ€, Brandon Frisco, Kalila, Trailburnt, Randy H, Talal Aujan, Nathan Dryer, Vadim, 阿明, ReadyPlayerEmma, Tiffany J. Kim, George Stoitzev, Spencer Kim, Jerry Meng, Gabriel Tamborski, Cory Kujawski, Jeffrey Morgan, Spiking Neurons AB, Edmond Seymore, Alexandros Triantafyllidis, Lone Striker, Cap'n Zoog, Nikolai Manek, danny, ya boyyy, Derek Yates, usrbinkat, Mandus, TL, Nathan LeClaire, subjectnull, Imad Khwaja, webtim, Raven Klaugh, Asp the Wyvern, Gabriel Puliatti, Caitlyn Gatomon, Joseph William Delisle, Jonathan Leane, Luke Pendergrass, SuperWojo, Sebastain Graf, Will Dee, Fred von Graf, Andrey, Dan Guido, Daniel P. Andersen, Nitin Borwankar, Elle, Vitor Caleffi, biorpg, jjj, NimbleBox.ai, Pieter, Matthew Berman, terasurfer, Michael Davis, Alex, Stanislav Ovsiannikov
261
+
262
+
263
+ Thank you to all my generous patrons and donaters!
264
+
265
+ And thank you again to a16z for their generous grant.
266
+
267
+ <!-- footer end -->
268
+
269
+ # Original model card: royallab's Pygmalion 2 13B SuperCOT2
270
+
271
+
272
+ # Model Card: Pygmalion-2-13b-SuperCOT2
273
+
274
+ This is a merge between:
275
+ - [Pygmalion 2 13b](https://huggingface.co/PygmalionAI/pygmalion-2-13b)
276
+ - [Ausboss's Llama2 SuperCOT2 loras](https://huggingface.co/ausboss/llama2-13b-supercot-loras2) at a weight of 1.00.
277
+
278
+ The merge was performed by a commandline version of [EzTrainer](https://github.com/CoffeeVampir3/ez-trainer) by CoffeeVampire/Blackroot via [zaraki-tools](https://github.com/zarakiquemparte/zaraki-tools) by Zaraki.
279
+
280
+ This merge differs from the previous Pyg-2-SuperCOT merge. According to AusBoss, this version was trained closer to SuperCOT llama1. The intended objective is the same, which is to make Pygmalion smarter.
281
+
282
+ The SuperCOT2 lora was merged at a weight of 1.
283
+
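For illustration only, a roughly equivalent merge can be sketched with the `peft` library: load the base Pygmalion-2 weights, apply the SuperCOT2 LoRA, and fold it into the base model at its default scaling of 1.0. This is not the exact EzTrainer/zaraki-tools procedure used by the authors, and the output directory name is an assumption:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load the fp16 base model and apply the SuperCOT2 LoRA adapter on top of it.
base = AutoModelForCausalLM.from_pretrained("PygmalionAI/pygmalion-2-13b", torch_dtype="auto")
lora = PeftModel.from_pretrained(base, "ausboss/llama2-13b-supercot-loras2")

# merge_and_unload() folds the adapter into the base weights (LoRA weight 1.0),
# returning a plain Transformers model that can be saved or quantised afterwards.
merged = lora.merge_and_unload()
merged.save_pretrained("Pygmalion-2-13b-SuperCOT2-merged")  # hypothetical output path
```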
284
+ ## Usage:
285
+
286
+ Since this is a merge between Pygmalion-2 and SuperCOT2, the following instruction formats should work:
287
+
288
+ Metharme:
289
+
290
+ ```
291
+ <|system|>This is a text adventure game. Describe the scenario to the user and give him three options to pick from on each turn.<|user|>Start!<|model|>
292
+ ```
293
+
294
+ Alpaca:
295
+
296
+ ```
297
+ ### Instruction:
298
+ Your instruction or question here.
299
+ ### Response:
300
+ ```
301
+
302
+ ## Bias, Risks, and Limitations
303
+
304
+ The model will show biases similar to those observed in niche roleplaying forums on the Internet, besides those exhibited by the base model. It is not intended for supplying factual information or advice in any form.
305
+
306
+ ## Training Details
307
+
308
+ This model is merged and can be reproduced using the tools mentioned above. Please refer to all provided links for extra model-specific details.
models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/USE_POLICY.md ADDED
@@ -0,0 +1,50 @@
1
+ # Llama 2 Acceptable Use Policy
2
+
3
+ Meta is committed to promoting safe and fair use of its tools and features, including Llama 2. If you access or use Llama 2, you agree to this Acceptable Use Policy (β€œPolicy”). The most recent copy of this policy can be found at [ai.meta.com/llama/use-policy](http://ai.meta.com/llama/use-policy).
4
+
5
+ ## Prohibited Uses
6
+ We want everyone to use Llama 2 safely and responsibly. You agree you will not use, or allow others to use, Llama 2 to:
7
+
8
+ 1. Violate the law or others’ rights, including to:
9
+ 1. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
10
+ 1. Violence or terrorism
11
+ 2. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
12
+ 3. Human trafficking, exploitation, and sexual violence
13
+ 4. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
14
+ 5. Sexual solicitation
15
+ 6. Any other criminal activity
16
+ 2. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
17
+ 3. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
18
+ 4. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
19
+ 5. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
20
+ 6. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama 2 Materials
21
+ 7. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
22
+
23
+
24
+
25
+ 2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Llama 2 related to the following:
26
+ 1. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
27
+ 2. Guns and illegal weapons (including weapon development)
28
+ 3. Illegal drugs and regulated/controlled substances
29
+ 4. Operation of critical infrastructure, transportation technologies, or heavy machinery
30
+ 5. Self-harm or harm to others, including suicide, cutting, and eating disorders
31
+ 6. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
32
+
33
+
34
+
35
+ 3. Intentionally deceive or mislead others, including use of Llama 2 related to the following:
36
+ 1. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
37
+ 2. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
38
+ 3. Generating, promoting, or further distributing spam
39
+ 4. Impersonating another individual without consent, authorization, or legal right
40
+ 5. Representing that the use of Llama 2 or outputs are human-generated
41
+ 6. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
42
+ 4. Fail to appropriately disclose to end users any known dangers of your AI system
43
+
44
+ Please report any violation of this Policy, software β€œbug,” or other problems that could lead to a violation of this Policy through one of the following means:
45
+
46
+ * Reporting issues with the model: [github.com/facebookresearch/llama](http://github.com/facebookresearch/llama)
47
+ * Reporting risky content generated by the model: [developers.facebook.com/llama_output_feedback](http://developers.facebook.com/llama_output_feedback)
48
+ * Reporting bugs and security concerns: [facebook.com/whitehat/info](http://facebook.com/whitehat/info)
49
+ * Reporting violations of the Acceptable Use Policy or unlicensed uses of Llama: [[email protected]](mailto:[email protected])
50
+
models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "_name_or_path": "royallab/Pygmalion-2-13b-SuperCOT2",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "badwordsids": "[[29961], [14352], [24630], [29962], [11759], [15974], [5519], [25473], [18899], [25901], [7110], [9341], [13531], [518], [9310], [2636], [3366], [21069], [11970], [23098], [16733], [21298], [18173], [10846], [3816], [28513], [15625], [23192], [28166], [10062], [1385], [11724], [3108], [15555], [10834], [10370], [14330], [1822], [12436], [5262], [17094], [10725], [17077], [11424], [4197], [24406], [13359], [17531], [24566], [23076], [4514], [13192], [19942], [16261], [7072], [6024], [1402], [1839], [2033], [13970], [850], [5913], [28895], [5387], [8308], [24927], [5691], [12940], [19997], [18959], [11287], [16862], [4638], [22322], [29861], [21251], [14704], [17548], [12452], [17288], [23160], [24960], [8219], [18024], [5539], [7464], [27865], [29588], [20068], [19660], [27706], [22896], [24264], [12258], [2314], [4400], [5586], [12622], [6796], [7226], [21939], [18456], [14178], [21540], [21945], [14664], [16215], [10338], [17361], [7503], [13769], [26073], [9601], [26909], [7961], [8999], [20840], [16272], [21545], [3199], [10514], [5159], [22689], [6525], [20526], [27077], [18017]]",
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 5120,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 13824,
13
+ "max_position_embeddings": 4096,
14
+ "model_type": "llama",
15
+ "num_attention_heads": 40,
16
+ "num_hidden_layers": 40,
17
+ "num_key_value_heads": 40,
18
+ "pretraining_tp": 1,
19
+ "rms_norm_eps": 1e-05,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000.0,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "float16",
24
+ "transformers_version": "4.34.0.dev0",
25
+ "use_cache": true,
26
+ "vocab_size": 32000,
27
+ "pad_token_id": 0,
28
+ "quantization_config": {
29
+ "bits": 4,
30
+ "group_size": 128,
31
+ "damp_percent": 0.1,
32
+ "desc_act": true,
33
+ "sym": true,
34
+ "true_sequential": true,
35
+ "model_name_or_path": null,
36
+ "model_file_base_name": "model",
37
+ "quant_method": "gptq"
38
+ }
39
+ }
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/generation_config.json RENAMED
@@ -2,6 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.30.0.dev0"
7
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.34.0.dev0"
 
6
  }
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/huggingface-metadata.txt RENAMED
@@ -1,6 +1,6 @@
1
- url: https://huggingface.co/TheBloke/Pygmalion-13B-SuperHOT-8K-GPTQ
2
  branch: main
3
- download date: 2023-09-08 13:41:43
4
  sha256sum:
5
- 988a7c1a954367afea66d96278d90abfbce752f027978a8bdf12524805a421a1 model.safetensors
6
  9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 tokenizer.model
 
1
+ url: https://huggingface.co/TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ
2
  branch: main
3
+ download date: 2023-09-20 03:13:55
4
  sha256sum:
5
+ 457189eea624e2822724055c5bb4d2c5fa9da26adfca7ca1b3900301f0b86a57 model.safetensors
6
  9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 tokenizer.model
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:988a7c1a954367afea66d96278d90abfbce752f027978a8bdf12524805a421a1
3
- size 7454797216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:457189eea624e2822724055c5bb4d2c5fa9da26adfca7ca1b3900301f0b86a57
3
+ size 7259435192
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/quantize_config.json RENAMED
@@ -1,9 +1,10 @@
1
  {
2
  "bits": 4,
3
  "group_size": 128,
4
- "damp_percent": 0.01,
5
- "desc_act": false,
6
  "sym": true,
7
  "true_sequential": true,
 
8
  "model_file_base_name": "model"
9
  }
 
1
  {
2
  "bits": 4,
3
  "group_size": 128,
4
+ "damp_percent": 0.1,
5
+ "desc_act": true,
6
  "sym": true,
7
  "true_sequential": true,
8
+ "model_name_or_path": null,
9
  "model_file_base_name": "model"
10
  }
models/TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "</s>",
5
+ "unk_token": "<unk>"
6
+ }
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer.json RENAMED
@@ -9,7 +9,7 @@
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": true,
13
  "special": true
14
  },
15
  {
@@ -18,7 +18,7 @@
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
- "normalized": true,
22
  "special": true
23
  },
24
  {
@@ -27,7 +27,7 @@
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
- "normalized": true,
31
  "special": true
32
  }
33
  ],
@@ -77,10 +77,16 @@
77
  "type_id": 0
78
  }
79
  },
 
 
 
 
 
 
80
  {
81
  "Sequence": {
82
  "id": "B",
83
- "type_id": 0
84
  }
85
  }
86
  ],
@@ -93260,126 +93266,126 @@
93260
  "▁livre s",
93261
  "lu b",
93262
  "l ub",
 
93263
  "▁▁ ▁▁",
93264
- "▁▁ ▁▁▁▁",
93265
- "▁▁ ▁▁▁▁▁▁▁▁",
93266
- "▁▁ ▁▁▁▁▁",
93267
  "▁▁ ▁▁▁▁▁▁",
93268
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93269
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93270
- "▁▁ ▁▁▁▁▁▁▁▁▁▁",
93271
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93272
- "▁▁ ▁▁▁",
93273
- "▁▁ ▁▁▁▁▁▁▁▁▁",
93274
- "▁▁ ▁▁▁▁▁▁▁",
93275
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93276
- "▁▁ ▁",
93277
- "▁▁▁▁ ▁▁",
93278
  "▁▁▁▁ ▁▁▁▁",
93279
- "▁▁▁▁ ▁▁▁▁▁▁▁▁",
93280
- "▁▁▁▁ ▁▁▁▁▁",
93281
- "▁▁▁▁ ▁▁▁▁▁▁",
93282
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93283
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93284
- "▁▁▁▁ ▁▁▁",
93285
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93286
- "▁▁▁▁ ▁▁▁▁▁▁▁",
93287
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93288
  "▁▁▁▁ ▁",
93289
- "▁▁▁▁▁▁▁▁ ▁▁",
93290
- "▁▁▁▁▁▁▁▁ ▁▁▁▁",
 
 
93291
  "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93292
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93293
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93294
- "▁▁▁▁▁▁▁▁ ▁▁▁",
93295
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93296
- "▁▁▁▁▁▁▁▁ ▁",
93297
- "▁▁▁▁▁ ▁▁",
93298
- "▁▁▁▁▁ ▁▁▁▁",
93299
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93300
- "▁▁▁▁▁ ▁▁▁▁▁",
93301
- "▁▁▁▁▁ ▁▁▁▁▁▁",
93302
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93303
- "▁▁▁▁▁ ▁▁▁",
93304
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93305
- "▁▁▁▁▁ ▁▁▁▁▁▁▁",
93306
  "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93307
- "▁▁▁▁▁ ▁",
93308
- "▁▁▁▁▁▁ ▁▁",
93309
- "▁▁▁▁▁▁ ▁▁▁▁",
93310
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93311
- "▁▁▁▁▁▁ ▁▁▁▁▁",
93312
- "▁▁▁▁▁▁ ▁▁▁▁▁▁",
93313
  "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93314
- "▁▁▁▁▁▁ ▁▁▁",
93315
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93316
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93317
- "▁▁▁▁▁▁ ▁",
93318
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93319
  "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93320
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93321
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93322
- "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93323
  "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93324
- "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93325
- "▁▁▁▁▁▁▁▁▁▁ ▁▁",
93326
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93327
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93328
  "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93329
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93330
- "▁▁▁▁▁▁▁▁▁▁ ▁",
93331
  "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93332
- "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93333
- "▁▁▁ ▁▁",
93334
- "▁▁▁ ▁▁▁▁",
93335
- "▁▁▁ ▁▁▁▁▁▁▁▁",
93336
- "▁▁▁ ▁▁▁▁▁",
93337
- "▁▁▁ ▁▁▁▁▁▁",
93338
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93339
  "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93340
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93341
  "▁▁▁ ▁▁▁",
 
 
 
 
 
 
 
93342
  "▁▁▁ ▁▁▁▁▁▁▁▁▁",
93343
- "▁▁▁ ▁▁▁▁▁▁▁",
93344
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93345
- "▁▁▁ ▁",
93346
- "▁▁▁▁▁▁▁▁▁ ▁▁",
93347
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93348
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93349
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93350
  "▁▁▁▁▁▁▁▁▁ ▁▁▁",
93351
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93352
- "▁▁▁▁▁▁▁▁▁ ▁",
93353
- "▁▁▁▁▁▁▁ ▁▁",
93354
- "▁▁▁▁▁▁▁ ▁▁▁▁",
93355
- "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93356
  "▁▁▁▁▁▁▁ ▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
 
93357
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
93358
  "▁▁▁▁▁▁▁ ▁▁▁",
93359
- "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
93360
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93361
- "▁▁▁▁▁▁▁ ▁",
93362
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93363
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93364
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93365
  "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93366
- "▁▁▁▁▁▁▁▁▁▁▁ ▁",
93367
- "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93368
  "▁ ▁▁",
93369
- "▁ ▁▁▁▁",
 
 
 
 
 
 
93370
  "▁ ▁▁▁▁▁▁▁▁",
93371
- "▁ ▁▁▁▁▁",
 
 
 
 
93372
  "▁ ▁▁▁▁▁▁",
93373
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93374
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93375
  "▁ ▁▁▁▁▁▁▁▁▁▁",
93376
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93377
- "▁ ▁▁▁",
93378
- "▁ ▁▁▁▁▁▁▁▁▁",
93379
- "▁ ▁▁▁▁▁▁▁",
93380
- "▁ ▁▁▁▁▁▁▁▁▁▁▁",
93381
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93382
- "▁ ▁"
 
 
 
 
 
 
 
93383
  ]
93384
  }
93385
  }
 
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": false,
13
  "special": true
14
  },
15
  {
 
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
  "special": true
23
  },
24
  {
 
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
  "special": true
32
  }
33
  ],
 
77
  "type_id": 0
78
  }
79
  },
80
+ {
81
+ "SpecialToken": {
82
+ "id": "<s>",
83
+ "type_id": 1
84
+ }
85
+ },
86
  {
87
  "Sequence": {
88
  "id": "B",
89
+ "type_id": 1
90
  }
91
  }
92
  ],
 
93266
  "▁livre s",
93267
  "lu b",
93268
  "l ub",
93269
+ "▁ ▁",
93270
  "▁▁ ▁▁",
93271
+ "▁▁▁ ▁",
93272
+ "▁ ▁▁▁",
 
93273
  "▁▁ ▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
93274
  "▁▁▁▁ ▁▁▁▁",
93275
+ "▁▁▁▁▁ ▁▁▁",
93276
+ "▁▁▁▁▁▁ ▁▁",
93277
+ "▁▁▁ ▁▁▁▁▁",
93278
+ "▁▁▁▁▁▁▁ ▁",
93279
+ "▁ ▁▁▁▁▁▁▁",
93280
+ "▁▁ ▁▁▁",
 
 
 
93281
  "▁▁▁▁ ▁",
93282
+ "▁▁▁ ▁▁",
93283
+ "▁ ▁▁▁▁",
93284
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93285
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93286
  "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93287
  "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
93288
  "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
93289
  "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
 
 
 
93290
  "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
 
 
 
 
93291
  "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
 
 
93292
  "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
 
 
 
 
 
 
 
93293
  "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93294
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93295
+ "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93296
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93297
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93298
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93299
+ "▁▁ ▁▁▁▁",
93300
+ "▁▁▁▁ ▁▁",
93301
+ "▁▁▁▁▁ ▁",
93302
  "▁▁▁ ▁▁▁",
93303
+ "▁ ▁▁▁▁▁",
93304
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁",
93305
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁",
93306
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁",
93307
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁",
93308
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁",
93309
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁",
93310
  "▁▁▁ ▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93311
  "▁▁▁▁▁▁▁▁▁ ▁▁▁",
 
 
 
 
 
93312
  "▁▁▁▁▁▁▁ ▁▁▁▁▁",
93313
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁",
93314
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁",
93315
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93316
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93317
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93318
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93319
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93320
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93321
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93322
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93323
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93324
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93325
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93326
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93327
+ "▁▁ ▁▁▁▁▁▁▁▁",
93328
+ "▁▁▁▁ ▁▁▁▁▁▁",
93329
+ "▁▁▁▁▁▁▁▁ ▁▁",
93330
+ "▁▁▁▁▁ ▁▁▁▁▁",
93331
+ "▁▁▁▁▁▁ ▁▁▁▁",
93332
+ "▁▁▁ ▁▁▁▁▁▁▁",
93333
+ "▁▁▁▁▁▁▁▁▁ ▁",
93334
  "▁▁▁▁▁▁▁ ▁▁▁",
93335
+ "▁ ▁▁▁▁▁▁▁▁▁",
93336
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93337
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93338
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93339
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93340
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93341
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93342
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93343
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93344
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93345
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93346
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
 
 
 
 
93347
  "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93348
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93349
+ "▁▁ ▁",
93350
  "▁ ▁▁",
93351
+ "▁▁ ▁▁▁▁▁▁▁",
93352
+ "▁▁▁▁ ▁▁▁▁▁",
93353
+ "▁▁▁▁▁▁▁▁ ▁",
93354
+ "▁▁▁▁▁ ▁▁▁▁",
93355
+ "▁▁▁▁▁▁ ▁▁▁",
93356
+ "▁▁▁ ▁▁▁▁▁▁",
93357
+ "▁▁▁▁▁▁▁ ▁▁",
93358
  "▁ ▁▁▁▁▁▁▁▁",
93359
+ "▁▁ ▁▁▁▁▁",
93360
+ "▁▁▁▁ ▁▁▁",
93361
+ "▁▁▁▁▁ ▁▁",
93362
+ "▁▁▁▁▁▁ ▁",
93363
+ "▁▁▁ ▁▁▁▁",
93364
  "▁ ▁▁▁▁▁▁",
93365
+ "▁▁ ▁▁▁▁▁▁▁▁▁",
93366
+ "▁▁▁▁ ▁▁▁▁▁▁▁",
93367
+ "▁▁▁▁▁▁▁▁ ▁▁▁",
93368
+ "▁▁▁▁▁ ▁▁▁▁▁▁",
93369
+ "▁▁▁▁▁▁ ▁▁▁▁▁",
93370
+ "▁▁▁▁▁▁▁▁▁▁ ▁",
93371
+ "▁▁▁ ▁▁▁▁▁▁▁▁",
93372
+ "▁▁▁▁▁▁▁▁▁ ▁▁",
93373
+ "▁▁▁▁▁▁▁ ▁▁▁▁",
93374
  "▁ ▁▁▁▁▁▁▁▁▁▁",
93375
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93376
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93377
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93378
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93379
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93380
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93381
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93382
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93383
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93384
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93385
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93386
+ "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93387
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93388
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁"
93389
  ]
93390
  }
93391
  }
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer.model RENAMED
File without changes
models/{TheBloke_Pygmalion-13B-SuperHOT-8K-GPTQ β†’ TheBloke_Pygmalion-2-13B-SuperCOT2-GPTQ}/tokenizer_config.json RENAMED
@@ -1,11 +1,9 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
  "bos_token": {
5
  "__type": "AddedToken",
6
  "content": "<s>",
7
  "lstrip": false,
8
- "normalized": true,
9
  "rstrip": false,
10
  "single_word": false
11
  },
@@ -14,20 +12,26 @@
14
  "__type": "AddedToken",
15
  "content": "</s>",
16
  "lstrip": false,
17
- "normalized": true,
18
  "rstrip": false,
19
  "single_word": false
20
  },
21
- "model_max_length": 2048,
 
22
  "pad_token": null,
 
23
  "sp_model_kwargs": {},
 
24
  "tokenizer_class": "LlamaTokenizer",
 
25
  "unk_token": {
26
  "__type": "AddedToken",
27
  "content": "<unk>",
28
  "lstrip": false,
29
- "normalized": true,
30
  "rstrip": false,
31
  "single_word": false
32
- }
 
 
33
  }
 
1
  {
 
 
2
  "bos_token": {
3
  "__type": "AddedToken",
4
  "content": "<s>",
5
  "lstrip": false,
6
+ "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
 
12
  "__type": "AddedToken",
13
  "content": "</s>",
14
  "lstrip": false,
15
+ "normalized": false,
16
  "rstrip": false,
17
  "single_word": false
18
  },
19
+ "legacy": true,
20
+ "model_max_length": 1000000000000000019884624838656,
21
  "pad_token": null,
22
+ "padding_side": "right",
23
  "sp_model_kwargs": {},
24
+ "spaces_between_special_tokens": false,
25
  "tokenizer_class": "LlamaTokenizer",
26
+ "trust_remote_code": false,
27
  "unk_token": {
28
  "__type": "AddedToken",
29
  "content": "<unk>",
30
  "lstrip": false,
31
+ "normalized": false,
32
  "rstrip": false,
33
  "single_word": false
34
+ },
35
+ "use_default_system_prompt": true,
36
+ "use_fast": true
37
  }
run.py CHANGED
@@ -1,3 +1,3 @@
1
  import os
2
- os.system('python download-model.py PygmalionAI/pygmalion-350m --branch main')
3
- os.system('python server.py --auto-devices --gpu-memory 24 --public-api')
 
1
  import os
2
+ os.system('python3 download-model.py TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ --branch main')
3
+ os.system('python3 server.py --auto-devices --gpu-memory 24 --public-api')
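For reference, the updated `run.py` simply chains two shell commands: download the GPTQ repo with text-generation-webui's `download-model.py`, then launch the API server. A slightly more robust equivalent, sketched with `subprocess` (an illustration, not part of the commit):

```python
import subprocess

# Same two steps as the new run.py, but abort with a clear error if either command fails.
subprocess.run(
    ["python3", "download-model.py", "TheBloke/Pygmalion-2-13B-SuperCOT2-GPTQ", "--branch", "main"],
    check=True,
)
subprocess.run(
    ["python3", "server.py", "--auto-devices", "--gpu-memory", "24", "--public-api"],
    check=True,
)
```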