import gradio as gr
from ctransformers import AutoModelForCausalLM

# Define the model repository and file
MODEL_REPO = "TheBloke/OpenHermes-2-Mistral-7B-GGUF"
MODEL_FILE = "openhermes-2-mistral-7b.Q8_0.gguf"  # Q8_0 is near-lossless; a Q4 variant is smaller and faster on CPU

# Download and load the model
print(f"Downloading {MODEL_FILE} from {MODEL_REPO}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="mistral",
    # gpu_layers=50,  # Uncomment to offload layers to the GPU (requires a CUDA-enabled build of ctransformers)
    context_length=256,  # Small context window keeps memory use down, but limits prompt + response length
)
print("Model loaded successfully.")

# Function to generate responses
def chat_with_model(prompt):
    response = model(prompt)
    return response

# Gradio UI
iface = gr.Interface(
    fn=chat_with_model,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query..."),
    outputs="text",
    title="Mistral-7B Chatbot",
    description="Optimized chatbot using Mistral-7B GGUF with improved speed.",
)

# Run the Gradio app
if __name__ == "__main__":
    iface.launch(share=True)
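
# Optional: stream tokens instead of blocking on the full completion.
# This is a minimal sketch, assuming the ctransformers model's __call__
# accepts `stream=True` (yielding generated text piece by piece) and that
# Gradio renders a generator function's yields incrementally; the function
# name and max_new_tokens value below are illustrative choices, not part of
# the original script.
def chat_with_model_streaming(prompt):
    partial = ""
    for token in model(prompt, max_new_tokens=128, stream=True):
        partial += token
        yield partial  # Gradio updates the output box on each yield

# To use it, point the Interface at the streaming function instead:
# iface = gr.Interface(fn=chat_with_model_streaming, inputs=gr.Textbox(lines=2), outputs="text")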