from transformers import TextStreamer
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None          # auto-detect dtype (float16 or bfloat16 depending on GPU)
load_in_4bit = False  # set True to load the model with 4-bit quantization

alpaca_prompt = """Provide a helpful and informative response to the following prompt.

### Prompt:
{}

### Response:
{}"""

prompt = "What is your base model?"

# Load the merged model and its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="merged_tinyllama_base_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference mode

# Tokenize the formatted prompt, leaving the response slot empty for generation.
# Token IDs are integers, so the inputs must not be cast to a float dtype.
inputs = tokenizer(
    [alpaca_prompt.format(prompt, "")],
    return_tensors="pt",
).to("cuda")

# Stream generated tokens to stdout as they are produced
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=2000)