Add LoRA model and custom inference file

Files changed (8) hide show

handler.py +91 -0
model/adapter_config.json +34 -0
model/adapter_model.safetensors +3 -0
model/added_tokens.json +4 -0
model/special_tokens_map.json +24 -0
model/tokenizer.json +0 -0
model/tokenizer.model +3 -0
model/tokenizer_config.json +59 -0

handler.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import json
+import logging
+import torch
+from typing import List
+from typing import Dict, Any
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria
+import torch
+class MyStoppingCriteria(StoppingCriteria):
+    def __init__(self, target_sequence, prompt, tokenizer):
+        self.target_sequence = target_sequence
+        self.prompt = prompt
+        self.tokenizer = tokenizer
+    def __call__(self, input_ids, scores, **kwargs):
+        # Get the generated text as a string
+        generated_text = self.tokenizer.decode(input_ids[0])
+        generated_text = generated_text.replace(self.prompt, '')
+        # Check if the target sequence appears in the generated text
+        if self.target_sequence in generated_text:
+            return True  # Stop generation
+        return False  # Continue generation
+    def __len__(self):
+        return 1
+    def __iter__(self):
+        yield self
+class EndpointHandler:
+    def __init__(self, model_dir=""):
+        # load model and processor from path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+        self.model = AutoModelForCausalLM.from_pretrained(model_dir, load_in_4bit=True, device_map="auto")
+        self.template = {
+            "prompt_input": """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n""",
+            "prompt_no_input": """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n""",
+            "response_split": """### Response:"""
+        }
+        self.instruction = """Extract the start and end sequences for the categories 'personal information', 'work experience', 'education' and 'skills' from the following text in dictionary form"""
+        if torch.cuda.is_available():
+            self.device = "cuda"
+        else:
+            self.device = "cpu"
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+        """
+        Args:
+            data (:dict:):
+                The payload with the text prompt and generation parameters.
+        """
+        # process input
+        inputs = data.pop("inputs", data)
+        parameters = data.pop("parameters", None)
+        res = self.template["prompt_input"].format(
+            instruction=self.instruction, input=input
+        )
+        messages = [
+            {"role": "user", "content": res},
+        ]
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, truncation=True, add_generation_prompt=True, return_tensors="pt"
+            ).input_ids
+        input_ids = input_ids.to(self.device)
+        # pass inputs with all kwargs in data
+        if parameters is not None:
+            outputs = self.model.generate(
+                input_ids=input_ids,
+                stopping_criteria=MyStoppingCriteria("</s>", inputs, self.tokenizer),
+                **parameters)
+        else:
+            outputs = self.model.generate(
+                input_ids=input_ids, max_new_tokens=32,
+                stopping_criteria=MyStoppingCriteria("</s>", inputs, self.tokenizer)
+            )
+        # postprocess the prediction
+        prediction = self.tokenizer.decode(outputs[0][input_ids.shape[1]:]) #, skip_special_tokens=True)
+        prediction = prediction.split("</s>")[0]
+        # TODO: add processing of the LLM output
+        return [{"generated_text": prediction}]

model/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": [
+    "lm_head",
+    "embed_tokens"
+  ],
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "lm_head",
+    "embed_tokens"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_rslora": false
+}

model/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e8755b8e8b0e194e4db8d9cbcb6e7f81cbc205a286f7a99de201b60345371dc
+size 3173223624

model/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  ", 'e':": 32001,
+  "{'s': '": 32000
+}

model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "{'s': '",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32001": {
+      "content": ", 'e':",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}