---
library_name: transformers
tags:
- biology
- chemistry
- biological materials
- materials science
- engineering
- materials informatics
- scientific AI
- AI4science
- Llama-3-1
---

## Inference example

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'lamm-mit/Bioinspired-Llama-3-1-8B-128k-gamma'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = True

tokenizer = AutoTokenizer.from_pretrained(model_name)
```

#### Function to interact with the model

```
def generate_response(text_input="What is spider silk?",
                      system_prompt='',
                      num_return_sequences=1,
                      temperature=1.,  # higher temperature yields more varied, creative output
                      max_new_tokens=127,
                      device='cuda',
                      add_special_tokens=False,  # tokenizer.apply_chat_template already adds <|begin_of_text|>, so keep this False
                      num_beams=1,
                      eos_token_id=[128001, 128008, 128009],
                      verbatim=False,
                      top_k=50,
                      top_p=0.9,
                      repetition_penalty=1.1,
                      messages=None,
                      ):
    if not messages:  # start a new message list
        messages = []
        if system_prompt != '':  # include system prompt if provided
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": text_input})
    else:  # if messages are provided, extend them (make sure to add the previous response as an assistant message)
        messages.append({"role": "user", "content": text_input})

    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text_input],
                       add_special_tokens=add_special_tokens,
                       return_tensors='pt').to(device)
    if verbatim:
        print(inputs)

    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=max_new_tokens,
                                 temperature=temperature,
                                 num_beams=num_beams,
                                 top_k=top_k,
                                 top_p=top_p,
                                 eos_token_id=eos_token_id,
                                 num_return_sequences=num_return_sequences,
                                 do_sample=True,
                                 repetition_penalty=repetition_penalty,
                                 )

    outputs = outputs[:, inputs["input_ids"].shape[1]:]  # strip the prompt tokens from the output
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True), messages
```

Usage:

```
res, _ = generate_response(text_input="What is collagen?",
                           system_prompt='You are a materials scientist.',
                           num_return_sequences=1,
                           temperature=1.,  # higher temperature yields more varied, creative output
                           max_new_tokens=127,
                           num_beams=1,
                           top_k=50,
                           top_p=0.9,
                           repetition_penalty=1.1,
                           )
print(res[0])
```

For multi-turn interactions, append each response to the message list as an assistant message and pass the list back in:

```
res, messages = generate_response(text_input="What is spider silk?", messages=[])
messages.append({"role": "assistant", "content": res[0]})  # append the result to the message list
print(res)

res, messages = generate_response(text_input="Explain this result in detail.", messages=messages)
messages.append({"role": "assistant", "content": res[0]})  # append the result to the message list
print(res)

res, messages = generate_response(text_input="Provide this in JSON format.", messages=messages)
messages.append({"role": "assistant", "content": res[0]})  # append the result to the message list
print(res)
```
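As a convenience, the multi-turn pattern above can be wrapped in a small helper that appends the assistant reply to the history automatically. The `chat` helper below is a sketch built on `generate_response` as defined above; the function name and its keyword passthrough are illustrative, not part of the original card.

```
# Hypothetical convenience wrapper (illustrative, not part of the original card):
# calls generate_response and appends the assistant reply to the history.
def chat(text_input, messages=None, **kwargs):
    if messages is None:
        messages = []
    res, messages = generate_response(text_input=text_input, messages=messages, **kwargs)
    messages.append({"role": "assistant", "content": res[0]})
    return res[0], messages

reply, history = chat("What is spider silk?")
reply, history = chat("Explain this result in detail.", messages=history)
print(reply)
```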
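To watch tokens appear as they are generated instead of waiting for the full completion, `transformers` provides `TextStreamer`. The snippet below is a minimal sketch that calls `model.generate` directly with the same sampling settings used above; it assumes the model and tokenizer have been loaded as shown in the inference example.

```
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

messages = [{"role": "user", "content": "What is spider silk?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([prompt], add_special_tokens=False, return_tensors='pt').to(model.device)

# Tokens are printed to stdout as they are generated.
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=256,
                   do_sample=True, temperature=1., top_k=50, top_p=0.9,
                   repetition_penalty=1.1, eos_token_id=[128001, 128008, 128009])
```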
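On GPUs with limited memory, the model can optionally be loaded with 4-bit quantization through bitsandbytes. This is an alternative loading path, not part of the original card, and assumes the `bitsandbytes` package is installed; the quantization settings shown are illustrative.

```
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Optional 4-bit NF4 loading (assumes bitsandbytes is installed; illustrative settings).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
```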