import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model path - use the actual Hugging Face model ID or local path
MODEL_PATH = "TOOTLE/Gemma_instruct_model_gguf"  # or your local model path

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a software engineering expert and your job is to help your junior solve coding problems.

### Input:
{}

### Response:
"""


def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,  # use float16 to save memory
        device_map="auto",
        offload_folder="offload",  # folder for offloading weights that don't fit in memory
    )
    return model, tokenizer


def chatbot_response(prompt):
    # Format the user question into the Alpaca-style prompt and tokenize it
    inputs = tokenizer(
        alpaca_prompt.format(prompt),
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)
    outputs = model.generate(
        **inputs,  # pass input_ids and attention_mask together
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the "### Response:" marker
    return response.split("### Response:")[-1].strip()


# Load model and tokenizer
model, tokenizer = load_model()

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 💬 Chat with Gemma Model")
    with gr.Row():
        input_text = gr.Textbox(
            label="Ask your question:",
            placeholder="Example: Write a Python function that adds two float numbers...",
        )
        output_text = gr.Textbox(label="Model response:")
    submit_button = gr.Button("Send")
    submit_button.click(chatbot_response, inputs=input_text, outputs=output_text)

if __name__ == "__main__":
    demo.launch()
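
# --- Optional: loading GGUF weights directly (a sketch, not part of the original script) ---
# The repo name suggests this checkpoint may be stored in GGUF format. If the plain
# from_pretrained call above fails, recent transformers versions can load GGUF
# checkpoints by naming the file explicitly via gguf_file (this requires the `gguf`
# package). The filename "model.gguf" below is a placeholder assumption - check the
# repo for the actual file name:
#
#   tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, gguf_file="model.gguf")
#   model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, gguf_file="model.gguf")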