import gradio as gr
from ctransformers import AutoModelForCausalLM

# Define the model repository and file
MODEL_REPO = "TheBloke/OpenHermes-2-Mistral-7B-GGUF"
MODEL_FILE = "openhermes-2-mistral-7b.Q8_0.gguf"  # Q8_0 is near-lossless; a Q4 variant is smaller and faster on CPU

# Download and load the model
print(f"Downloading {MODEL_FILE} from {MODEL_REPO}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="mistral",
    # gpu_layers=50,  # Uncomment to offload layers to the GPU (requires a CUDA-enabled build of ctransformers)
    context_length=256,  # Small context window keeps memory use down, but limits prompt + response length
)
print("Model loaded successfully.")

# Function to generate responses
def chat_with_model(prompt):
    response = model(prompt)
    return response

# Gradio UI
iface = gr.Interface(
    fn=chat_with_model,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query..."),
    outputs="text",
    title="Mistral-7B Chatbot",
    description="Optimized chatbot using Mistral-7B GGUF with improved speed.",
)

# Run the Gradio app
if __name__ == "__main__":
    iface.launch(share=True)
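
# Optional: stream tokens instead of blocking on the full completion.
# This is a minimal sketch, assuming the ctransformers model's __call__
# accepts `stream=True` (yielding generated text piece by piece) and that
# Gradio renders a generator function's yields incrementally; the function
# name and max_new_tokens value below are illustrative choices, not part of
# the original script.
def chat_with_model_streaming(prompt):
    partial = ""
    for token in model(prompt, max_new_tokens=128, stream=True):
        partial += token
        yield partial  # Gradio updates the output box on each yield

# To use it, point the Interface at the streaming function instead:
# iface = gr.Interface(fn=chat_with_model_streaming, inputs=gr.Textbox(lines=2), outputs="text")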