import gradio as gr
from ctransformers import AutoModelForCausalLM
# Model repository and quantized weights file on the Hugging Face Hub
MODEL_REPO = "TheBloke/OpenHermes-2-Mistral-7B-GGUF"
MODEL_FILE = "openhermes-2-mistral-7b.Q8_0.gguf"  # Q8_0: near-full quality; smaller quants (e.g. Q4_K_M) run faster on CPU
# Download the weights (cached after the first run) and load the model
print(f"Downloading {MODEL_FILE} from {MODEL_REPO}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="mistral",
    # gpu_layers=50,  # offload layers to a GPU if available (requires a CUDA build of ctransformers)
    context_length=256,  # small context window keeps responses fast on CPU
)
print("Model loaded successfully.")
# Generate a completion for the user's prompt
def chat_with_model(prompt):
    # Calling the model object directly runs generation with default sampling
    # settings; note that prompt plus output must fit in the 256-token
    # context configured above.
    response = model(prompt)
    return response
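# Alternative sketch: stream tokens instead of waiting for the full reply.
# ctransformers can yield tokens with stream=True, and Gradio treats a
# generator function as a streaming output. The function name below is
# hypothetical, not part of the original app:
#
#   def chat_with_model_streaming(prompt):
#       text = ""
#       for token in model(prompt, stream=True):
#           text += token
#           yield text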
# Gradio UI
iface = gr.Interface(
    fn=chat_with_model,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query..."),
    outputs="text",
    title="Mistral-7B Chatbot",
    description="Optimized chatbot using Mistral-7B GGUF with improved speed.",
)
# Run the Gradio app
if __name__ == "__main__":
    iface.launch(share=True)
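# Usage sketch: once the app is running, it can also be queried
# programmatically with the gradio_client package. The URL is a placeholder
# for whatever launch() prints; "/predict" is the default endpoint name for
# a gr.Interface:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("What is Mistral-7B?", api_name="/predict"))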