---
tags:
- autotrain
- meta-llama
- meta-llama/Llama-2-7b-hf
inference: true
widget:
- text: >
    instruction: "If you are a doctor, please answer the medical questions based
    on the patient's description."

    input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last
    year. I am having some major bilateral temple pain along with numbness that
    comes and goes in my left arm/hand/fingers. I have had headaches since the
    aneurysm, but this is different. Also, my moods have been horrible for the
    past few weeks."

    response: ''
library_name: peft
pipeline_tag: text-generation
---


```python
import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization keeps the 7B base model within a single GPU's memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

hf_auth = "your-huggingface-access-token"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

# Load the quantized base model; device_map='auto' places the weights,
# so no explicit .to(device) is needed (and is not supported for 4-bit models)
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Attach the LoRA adapter weights from this repository
config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation")

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)
```
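The widget example above and the prompt used below follow an `instruction:` / `input:` / `response:` layout. A small helper like the following can assemble such prompts; `build_prompt` is a hypothetical name and the exact whitespace is an assumption based on the example, so adjust it if your prompts were formatted differently during fine-tuning.

```python
def build_prompt(instruction: str, patient_input: str) -> str:
    # Hypothetical helper (not part of this repo): assembles the
    # instruction / input / response layout used in the example prompt below.
    # The exact spacing is an assumption; match it to your training format.
    return (
        f'instruction: "{instruction}"\n\n'
        f'input: "{patient_input}"\n\n'
        'response: '
    )
```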



```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92,
) -> str:
    """Generate a completion for `prompt` and return only the newly generated text."""
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            do_sample=True,  # sampling must be enabled for temperature to take effect
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # Llama 2 has no dedicated pad token
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    # Strip the echoed prompt so only the model's answer is returned
    return decoded_output[len(prompt):]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks." \n

response:  """

# Generate a response to the example prompt
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)
```
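
If you prefer to see tokens printed as they are produced rather than all at once, `transformers.TextStreamer` can be passed to `generate`. A minimal sketch, reusing the `model`, `tokenizer`, `prompt`, and `device` defined above:

```python
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated,
# skipping the echoed prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer([prompt], return_tensors="pt", return_token_type_ids=False).to(device)
dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

with torch.autocast("cuda", dtype=dtype_to_use):
    model.generate(
        **inputs,
        streamer=streamer,
        do_sample=True,
        max_new_tokens=100,
        temperature=0.92,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
```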