Upload 9 files

Browse files

Files changed (9) hide show

.gitattributes +0 -16
README.md +9 -329
config.json +38 -2
handler.py +30 -0
pytorch_model.bin +3 -0
requirements.txt +1 -6
special_tokens_map.json +1 -0
tokenizer_config.json +1 -0
vocab.txt +0 -0

.gitattributes CHANGED Viewed

@@ -2,13 +2,11 @@
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
@@ -22,10 +20,8 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,15 +29,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
-codellama-7b-instruct.Q5_0.gguf filter=lfs diff=lfs merge=lfs -text

 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,336 +1,16 @@
 ---
-license: apache-2.0
-datasets:
-- PetraAI/PetraAI
 language:
-- ar
 - en
-- ch
-- zh
-metrics:
-- accuracy
-- bertscore
-- bleu
-- chrf
-- code_eval
-- brier_score
 tags:
-- chemistry
-- biology
-- finance
-- legal
-- music
-- code
-- art
-- climate
-- medical
-- text-generation-inference
 ---
-### Inference Speed
-> The result is generated using [this script](examples/benchmark/generation_speed.py), batch size of input is 1, decode strategy is beam search and enforce the model to generate 512 tokens, speed metric is tokens/s (the larger, the better).
->
-> The quantized model is loaded using the setup that can gain the fastest inference speed.
-| model         | GPU           | num_beams | fp16  | gptq-int4 |
-|---------------|---------------|-----------|-------|-----------|
-| llama-7b      | 1xA100-40G    | 1         | 18.87 | 25.53     |
-| llama-7b      | 1xA100-40G    | 4         | 68.79 | 91.30     |
-| moss-moon 16b | 1xA100-40G    | 1         | 12.48 | 15.25     |
-| moss-moon 16b | 1xA100-40G    | 4         | OOM   | 42.67     |
-| moss-moon 16b | 2xA100-40G    | 1         | 06.83 | 06.78     |
-| moss-moon 16b | 2xA100-40G    | 4         | 13.10 | 10.80     |
-| gpt-j 6b      | 1xRTX3060-12G | 1         | OOM   | 29.55     |
-| gpt-j 6b      | 1xRTX3060-12G | 4         | OOM   | 47.36     |
-### Perplexity
-For perplexity comparison, you can turn to [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#result) and [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#gptq-vs-bitsandbytes)
-## Installation
-### Quick Installation
-You can install the latest stable release of AutoGPTQ from pip with pre-built wheels compatible with PyTorch 2.0.1:
-* For CUDA 11.7: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/`
-* For CUDA 11.8: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/`
-* For RoCm 5.4.2: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm542/`
-**Warning:** These wheels are not expected to work on PyTorch nightly. Please install AutoGPTQ from source when using PyTorch nightly.
-#### disable cuda extensions
-By default, cuda extensions will be installed when `torch` and `cuda` is already installed in your machine, if you don't want to use them, using:
-```shell
-BUILD_CUDA_EXT=0 pip install auto-gptq
-```
-And to make sure `autogptq_cuda` is not ever in your virtual environment, run:
-```shell
-pip uninstall autogptq_cuda -y
-```
-#### to support triton speedup
-To integrate with `triton`, using:
-> warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
-```shell
-pip install auto-gptq[triton]
-```
-### Install from source
-<details>
-<summary>click to see details</summary>
-Clone the source code:
-```shell
-git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
-```
-Then, install from source:
-```shell
-pip install .
-```
-Like quick installation, you can also set `BUILD_CUDA_EXT=0` to disable pytorch extension building.
-Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
-To install from source for AMD GPUs supporting RoCm, please specify the `ROCM_VERSION` environment variable. The compilation can be speeded up by specifying the `PYTORCH_ROCM_ARCH` variable ([reference](https://github.com/pytorch/pytorch/blob/7b73b1e8a73a1777ebe8d2cd4487eb13da55b3ba/setup.py#L132)), for example `gfx90a` for MI200 series devices. Example:
-```
-ROCM_VERSION=5.6 pip install .
-```
-For RoCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
-</details>
-## Quick Tour
-### Quantization and Inference
-> warning: this is just a showcase of the usage of basic apis in AutoGPTQ, which uses only one sample to quantize a much small model, quality of quantized model using such little samples may not good.
-Below is an example for the simplest use of `auto_gptq` to quantize a model and inference after quantization:
-```python
-from transformers import AutoTokenizer, TextGenerationPipeline
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-import logging
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
-)
-pretrained_model_dir = "facebook/opt-125m"
-quantized_model_dir = "opt-125m-4bit"
-tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
-examples = [
-    tokenizer(
-        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
-    )
-]
-quantize_config = BaseQuantizeConfig(
-    bits=4,  # quantize model to 4-bit
-    group_size=128,  # it is recommended to set the value to 128
-    desc_act=False,  # set to False can significantly speed up inference but the perplexity may slightly bad
-)
-# load un-quantized model, by default, the model will always be loaded into CPU memory
-model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
-# quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
-model.quantize(examples)
-# save quantized model
-model.save_quantized(quantized_model_dir)
-# save quantized model using safetensors
-model.save_quantized(quantized_model_dir, use_safetensors=True)
-# push quantized model to Hugging Face Hub.
-# to use use_auth_token=True, Login first via huggingface-cli login.
-# or pass explcit token with: use_auth_token="hf_xxxxxxx"
-# (uncomment the following three lines to enable this feature)
-# repo_id = f"YourUserName/{quantized_model_dir}"
-# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
-# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
-# alternatively you can save and push at the same time
-# (uncomment the following three lines to enable this feature)
-# repo_id = f"YourUserName/{quantized_model_dir}"
-# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
-# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
-# load quantized model to the first GPU
-model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
-# download quantized model from Hugging Face Hub and load to the first GPU
-# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
-# inference with model.generate
-print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
-# or you can also use pipeline
-pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
-print(pipeline("auto-gptq is")[0]["generated_text"])
-```
-For more advanced features of model quantization, please reference to [this script](examples/quantization/quant_with_alpaca.py)
-### Customize Model
-<details>
-<summary>Below is an example to extend `auto_gptq` to support `OPT` model, as you will see, it's very easy:</summary>
-```python
-from auto_gptq.modeling import BaseGPTQForCausalLM
-class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
-    # chained attribute name of transformer layer block
-    layers_block_name = "model.decoder.layers"
-    # chained attribute names of other nn modules that in the same level as the transformer layer block
-    outside_layer_modules = [
-        "model.decoder.embed_tokens", "model.decoder.embed_positions", "model.decoder.project_out",
-        "model.decoder.project_in", "model.decoder.final_layer_norm"
-    ]
-    # chained attribute names of linear layers in transformer layer module
-    # normally, there are four sub lists, for each one the modules in it can be seen as one operation,
-    # and the order should be the order when they are truly executed, in this case (and usually in most cases),
-    # they are: attention q_k_v projection, attention output projection, MLP project input, MLP project output
-    inside_layer_modules = [
-        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
-        ["self_attn.out_proj"],
-        ["fc1"],
-        ["fc2"]
-    ]
-```
-After this, you can use `OPTGPTQForCausalLM.from_pretrained` and other methods as shown in Basic.
-</details>
-### Evaluation on Downstream Tasks
-You can use tasks defined in `auto_gptq.eval_tasks` to evaluate model's performance on specific down-stream task before and after quantization.
-The predefined tasks support all causal-language-models implemented in [🤗 transformers](https://github.com/huggingface/transformers) and in this project.
-<details>
-<summary>Below is an example to evaluate `EleutherAI/gpt-j-6b` on sequence-classification task using `cardiffnlp/tweet_sentiment_multilingual` dataset:</summary>
-```python
-from functools import partial
-import datasets
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-from auto_gptq.eval_tasks import SequenceClassificationTask
-MODEL = "EleutherAI/gpt-j-6b"
-DATASET = "cardiffnlp/tweet_sentiment_multilingual"
-TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
-ID2LABEL = {
-    0: "negative",
-    1: "neutral",
-    2: "positive"
-}
-LABELS = list(ID2LABEL.values())
-def ds_refactor_fn(samples):
-    text_data = samples["text"]
-    label_data = samples["label"]
-    new_samples = {"prompt": [], "label": []}
-    for text, label in zip(text_data, label_data):
-        prompt = TEMPLATE.format(labels=LABELS, text=text)
-        new_samples["prompt"].append(prompt)
-        new_samples["label"].append(ID2LABEL[label])
-    return new_samples
-#  model = AutoModelForCausalLM.from_pretrained(MODEL).eval().half().to("cuda:0")
-model = AutoGPTQForCausalLM.from_pretrained(MODEL, BaseQuantizeConfig())
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-task = SequenceClassificationTask(
-        model=model,
-        tokenizer=tokenizer,
-        classes=LABELS,
-        data_name_or_path=DATASET,
-        prompt_col_name="prompt",
-        label_col_name="label",
-        **{
-            "num_samples": 1000,  # how many samples will be sampled to evaluation
-            "sample_max_len": 1024,  # max tokens for each sample
-            "block_max_len": 2048,  # max tokens for each data block
-            # function to load dataset, one must only accept data_name_or_path as input
-            # and return datasets.Dataset
-            "load_fn": partial(datasets.load_dataset, name="english"),
-            # function to preprocess dataset, which is used for datasets.Dataset.map,
-            # must return Dict[str, list] with only two keys: [prompt_col_name, label_col_name]
-            "preprocess_fn": ds_refactor_fn,
-            # truncate label when sample's length exceed sample_max_len
-            "truncate_prompt": False
-        }
-    )
-# note that max_new_tokens will be automatically specified internally based on given classes
-print(task.run())
-# self-consistency
-print(
-    task.run(
-        generation_config=GenerationConfig(
-            num_beams=3,
-            num_return_sequences=3,
-            do_sample=True
-        )
-    )
-)
-```
-</details>
-## Learn More
-[tutorials](docs/tutorial) provide step-by-step guidance to integrate `auto_gptq` with your own project and some best practice principles.
-[examples](examples/README.md) provide plenty of example scripts to use `auto_gptq` in different ways.
-## Supported Models
-> you can use `model.config.model_type` to compare with the table below to check whether the model you use is supported by `auto_gptq`.
->
-> for example, model_type of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, hence they are all supported by `auto_gptq`.
-| model type                         | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt                                                                            |
-|------------------------------------|--------------|-----------|-----------|---------------|-------------------------------------------------------------------------------------------------|
-| bloom                              | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-| gpt2                               | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-| gpt_neox                           | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| gptj                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| llama                              | ✅            | ✅         | ✅         | ✅             | ✅                                                                                               |
-| moss                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| opt                                | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-| gpt_bigcode                        | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-| codegen                            | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-| falcon(RefinedWebModel/RefinedWeb) | ✅            | ✅         | ✅         | ✅             |                                                                                                 |
-## Supported Evaluation Tasks
-Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!
-## Running tests
-Tests can be run with:
-```
-pytest tests/ -s
-```
-## Acknowledgement
-- Specially thanks **Elias Frantar**, **Saleh Ashkboos**, **Torsten Hoefler** and **Dan Alistarh** for proposing **GPTQ** algorithm and open source the [code](https://github.com/IST-DASLab/gptq).
-- Specially thanks **qwopqwop200**, for code in this project that relevant to quantization are mainly referenced from [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).
-[![Star History Chart](https://api.star-history.com/svg?repos=PanQiwei/AutoGPTQ&type=Date)](https://star-history.com/#PanQiWei/AutoGPTQ&Date)

 ---
 language:
 - en
 tags:
+- text-classification
+- emotion
+- endpoints-template
+license: apache-2.0
+datasets:
+- emotion
+metrics:
+- Accuracy, F1 Score
 ---
+# Fork of [bhadresh-savani/distilbert-base-uncased-emotion](https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion)

config.json CHANGED Viewed

@@ -1,3 +1,39 @@
 {
-    "model_type": "llama"
-}

 {
+  "_name_or_path": "./",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "sadness",
+    "1": "joy",
+    "2": "love",
+    "3": "anger",
+    "4": "fear",
+    "5": "surprise"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "anger": 3,
+    "fear": 4,
+    "joy": 1,
+    "love": 2,
+    "sadness": 0,
+    "surprise": 5
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "transformers_version": "4.11.0.dev0",
+  "vocab_size": 30522
+}

handler.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from typing import Dict, List, Any
+from transformers import pipeline
+import holidays
+class EndpointHandler:
+    def __init__(self, path=""):
+        self.pipeline = pipeline("text-classification", model=path)
+        self.holidays = holidays.US()
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+         data args:
+              inputs (:obj: `str`)
+              date (:obj: `str`)
+        Return:
+              A :obj:`list` | `dict`: will be serialized and returned
+        """
+        # get inputs
+        inputs = data.pop("inputs", data)
+        # get additional date field
+        date = data.pop("date", None)
+        # check if date exists and if it is a holiday
+        if date is not None and date in self.holidays:
+            return [{"label": "happy", "score": 1}]
+        # run normal prediction
+        prediction = self.pipeline(inputs)
+        return prediction

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa7398d830fcc94f95af88d7cc3013813668cfc58a07d75a8116cfd8af75c4d
+size 267875479

requirements.txt CHANGED Viewed

@@ -1,6 +1 @@
-pandas
-ninja
-fastparquet
-torch>=2.0.1
-safetensors>=0.3.2
-sentencepiece>=0.1.97


1	+ holidays

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilbert-base-uncased"}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff