---
tags:
- autotrain
- meta-llama
- meta-llama/Llama-2-7b-hf
inference: true
widget:
- text: >
    instruction: "If you are a doctor, please answer the medical questions based
    on the patient's description."

    input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last
    year. I am having some major bilateral temple pain along with numbness that
    comes and goes in my left arm/hand/fingers. I have had headaches since the
    aneurysm, but this is different. Also, my moods have been horrible for the
    past few weeks."

    response: ''
library_name: peft
pipeline_tag: text-generation
---


```python
import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization keeps the 7B base model within a single GPU's memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

hf_auth = "your-huggingface-access-token"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

# Load the quantized base model; device_map='auto' places the weights,
# so no explicit .to(device) is needed (and is not supported for 4-bit models)
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Attach the LoRA adapter weights from this repository
config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation")

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)
```
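The widget example above and the prompt used below follow an `instruction:` / `input:` / `response:` layout. A small helper like the following can assemble such prompts; `build_prompt` is a hypothetical name and the exact whitespace is an assumption based on the example, so adjust it if your prompts were formatted differently during fine-tuning.

```python
def build_prompt(instruction: str, patient_input: str) -> str:
    # Hypothetical helper (not part of this repo): assembles the
    # instruction / input / response layout used in the example prompt below.
    # The exact spacing is an assumption; match it to your training format.
    return (
        f'instruction: "{instruction}"\n\n'
        f'input: "{patient_input}"\n\n'
        'response: '
    )
```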



```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92,
) -> str:
    """Generate a completion for `prompt` and return only the newly generated text."""
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            do_sample=True,  # sampling must be enabled for temperature to take effect
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # Llama 2 has no dedicated pad token
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    # Strip the echoed prompt so only the model's answer is returned
    return decoded_output[len(prompt):]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks." \n

response:  """

# Generate a response to the example prompt
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)
```
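
If you prefer to see tokens printed as they are produced rather than all at once, `transformers.TextStreamer` can be passed to `generate`. A minimal sketch, reusing the `model`, `tokenizer`, `prompt`, and `device` defined above:

```python
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated,
# skipping the echoed prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer([prompt], return_tensors="pt", return_token_type_ids=False).to(device)
dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

with torch.autocast("cuda", dtype=dtype_to_use):
    model.generate(
        **inputs,
        streamer=streamer,
        do_sample=True,
        max_new_tokens=100,
        temperature=0.92,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
```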