Update README.md
This model is fine-tuned from LLaMA on 8 Nvidia GTX 1080 Ti GPUs and enhanced with …
### Here is how to use it with texts in HuggingFace
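The snippet below loads the tokenizer and model weights, picks the best available device (CUDA, Apple MPS, or CPU), wraps the instruction in an Alpaca-style prompt template, and decodes the model's answer with beam search by default.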
```python
import torch
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

BASE_MODEL = "Fan21/Llama-mt-lora"

# Load the tokenizer and the model weights from the Hugging Face Hub
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Prefer a CUDA GPU, then Apple Silicon (MPS), then fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
elif device == "mps":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )


def generate_prompt(instruction, input=None):
    """Wrap the instruction (and optional input) in the Alpaca-style template."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


# Run in half precision on accelerators and compile on PyTorch 2.x
if device != "cpu":
    model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    # Everything after "### Response:" is the generated answer
    return output.split("### Response:")[1].strip()
```
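Once the model is loaded, `evaluate` can be called directly. The call below is a minimal sketch; the instruction and input strings are illustrative placeholders, and the actual output will vary.

```python
# Hypothetical example call; the instruction/input text is illustrative only.
response = evaluate(
    "Summarize the following passage in one sentence.",
    input="LLaMA is a family of foundation language models released by Meta AI.",
)
print(response)
```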