import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

if __name__ == "__main__":
    # Load Granite model and tokenizer
    def load_model():
        model_path = "ibm-granite/granite-3.1-8b-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")  # Place weights automatically (GPU/CPU)
        model.eval()  # Set model to evaluation mode
        return model, tokenizer

    model, tokenizer = load_model()

    # Generate responses with the model
    def respond(
        message,
        history: list[tuple[str, str]],
        system_message,
        max_tokens,
        temperature,
        top_p,
    ):
        # Prepare chat context from the system prompt and prior turns
        chat = [{"role": "system", "content": system_message}]
        for user_msg, bot_msg in history:
            if user_msg:
                chat.append({"role": "user", "content": user_msg})
            if bot_msg:
                chat.append({"role": "assistant", "content": bot_msg})
        chat.append({"role": "user", "content": message})

        # Determine device for the input tensors
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Tokenize input with the model's chat template
        chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        input_tokens = tokenizer(chat, return_tensors="pt").to(device)

        # Generate output tokens
        output_tokens = model.generate(
            **input_tokens,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,  # Sampling must be enabled for temperature/top_p to take effect
        )

        # Decode only the newly generated tokens and return the reply text
        response = tokenizer.decode(
            output_tokens[0][input_tokens["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )
        return response
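    # The block below is a minimal sketch of the Gradio wiring implied by the imports
    # and the respond() signature above: gr.ChatInterface passes (message, history)
    # plus the values of additional_inputs to the callback. The specific labels and
    # default values here are assumptions, not part of the original snippet.
    demo = gr.ChatInterface(
        respond,
        additional_inputs=[
            gr.Textbox(value="You are a helpful assistant.", label="System message"),  # assumed default prompt
            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        ],
    )
    demo.launch()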