---
library_name: transformers
tags:
- biology
- chemistry
- biological materials
- materials science
- engineering
- materials informatics
- scientific AI
- AI4science
- Llama-3-1
---

## Inference example

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'lamm-mit/Bioinspired-Llama-3-1-8B-128k-gamma'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = True

tokenizer = AutoTokenizer.from_pretrained(model_name)
```
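
If `flash_attention_2` is not available on your system (or your GPU lacks bfloat16 support), the model can also be loaded with PyTorch's built-in SDPA attention, optionally quantized to 4-bit via `bitsandbytes`. The settings below are an illustrative sketch, not values validated for this model:

```
from transformers import BitsAndBytesConfig

# Illustrative fallback load (assumes the bitsandbytes package is installed);
# the quantization settings are examples, not recommended values for this model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
    attn_implementation="sdpa",  # PyTorch scaled-dot-product attention instead of flash_attention_2
)
```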

#### Function to interact with the model

```
def generate_response(text_input="What is spider silk?",
                      system_prompt='',
                      num_return_sequences=1,
                      temperature=1.,  # the higher the temperature, the more creative the model becomes
                      max_new_tokens=127,
                      device='cuda',
                      add_special_tokens=False,  # tokenizer.apply_chat_template already adds <|begin_of_text|>, so keep this False
                      num_beams=1,
                      eos_token_id=[128001, 128008, 128009],
                      verbatim=False,
                      top_k=50,
                      top_p=0.9,
                      repetition_penalty=1.1,
                      messages=None,
                      ):

    if messages is None:  # avoid a mutable default argument
        messages = []

    if len(messages) == 0:  # start a new conversation
        if system_prompt != '':  # include the system prompt if provided
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": text_input})
    else:  # continue an existing conversation (make sure the previous response was appended as an assistant message)
        messages.append({"role": "user", "content": text_input})

    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text_input], add_special_tokens=add_special_tokens, return_tensors='pt').to(device)
    if verbatim:
        print(inputs)

    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=max_new_tokens,
                                 temperature=temperature,
                                 num_beams=num_beams,
                                 top_k=top_k,
                                 top_p=top_p,
                                 eos_token_id=eos_token_id,
                                 num_return_sequences=num_return_sequences,
                                 do_sample=True,
                                 repetition_penalty=repetition_penalty,
                                 )
    outputs = outputs[:, inputs["input_ids"].shape[1]:]  # strip the prompt tokens from the output
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True), messages
```

Usage:

```
res, _ = generate_response(text_input="What is collagen?",
                           system_prompt='You are a materials scientist.',
                           num_return_sequences=1,
                           temperature=1.,  # the higher the temperature, the more creative the model becomes
                           max_new_tokens=127,
                           num_beams=1,
                           top_k=50, top_p=0.9, repetition_penalty=1.1,
                           )
print(res[0])
```
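
The helper returns a list of decoded sequences, so setting `num_return_sequences` greater than 1 yields several sampled candidates from a single call. The prompt and settings below are purely illustrative:

```
# Illustrative: sample three candidate answers for the same prompt
res, _ = generate_response(text_input="Name three examples of bioinspired materials.",
                           system_prompt='You are a materials scientist.',
                           num_return_sequences=3,
                           temperature=1.,
                           max_new_tokens=127,
                           )
for i, candidate in enumerate(res):
    print(f"--- Candidate {i + 1} ---\n{candidate}\n")
```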

For multi-turn interactions, append each assistant reply to the message history and pass it back in, as in this example:

```
res, messages = generate_response(text_input="What is spider silk?", messages=[])
messages.append({"role": "assistant", "content": res[0]})  # append the reply to the message history
print(res)

res, messages = generate_response(text_input="Explain this result in detail.", messages=messages)
messages.append({"role": "assistant", "content": res[0]})  # append the reply to the message history
print(res)

res, messages = generate_response(text_input="Provide this in JSON format.", messages=messages)
messages.append({"role": "assistant", "content": res[0]})  # append the reply to the message history
print(res)
```
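
The same pattern can be wrapped in a small convenience helper that maintains the conversation history for you. This is a minimal sketch built on `generate_response` above, not part of the model's own API:

```
# Minimal sketch: run several user turns and keep the message history automatically
def chat(user_turns, system_prompt='You are a materials scientist.'):
    messages, responses = [], []
    for turn in user_turns:
        res, messages = generate_response(text_input=turn,
                                          system_prompt=system_prompt,
                                          messages=messages)
        messages.append({"role": "assistant", "content": res[0]})  # record the reply for the next turn
        responses.append(res[0])
    return responses, messages

answers, history = chat(["What is spider silk?",
                         "Explain this result in detail."])
print(answers[-1])
```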