import json
import subprocess
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

# Process-wide cache filled in by respond(): the name of the model file that
# is currently loaded, and the Llama instance created from it. Both stay
# None until the first chat request arrives.
llm_model = None
llm = None

# Fetch the GGUF weights from the Hugging Face Hub into ./models when this
# module is loaded; respond() later opens the file from models/<filename>.
hf_hub_download(
    "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
    "llama-3.2-1b-instruct-q4_k_m.gguf",
    local_dir="./models",
)

def get_messages_formatter_type(model_name):
    """Return the prompt formatter type to use for ``model_name``.

    ``model_name`` is accepted but not inspected: every model is currently
    mapped to the Llama 3 chat format.
    """
    formatter_type = MessagesFormatterType.LLAMA_3
    return formatter_type

def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat reply to ``message`` from the selected local GGUF model.

    Generator callback for ``gr.ChatInterface``. The parameters after
    ``history`` are supplied by the ``additional_inputs`` widgets, in order.

    Args:
        message: The new user message to answer.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
            A side that is ``None`` (a turn with no recorded text) is
            skipped rather than forwarded to the prompt formatter.
        model: File name of the GGUF model, resolved as ``models/<model>``.
        system_message: System prompt handed to the agent.
        max_tokens: Sampling setting ``max_tokens``.
        temperature: Sampling setting ``temperature``.
        top_p: Sampling setting ``top_p``.
        top_k: Sampling setting ``top_k``.
        repeat_penalty: Sampling setting ``repeat_penalty``.

    Yields:
        The reply accumulated so far; each yielded string extends the
        previous one by the newest streamed chunk.
    """
    global llm
    global llm_model

    chat_template = get_messages_formatter_type(model)

    # Load lazily, and reload only when a different model file is requested,
    # so consecutive requests for the same model reuse the loaded instance.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            n_gpu_layers=0,
            n_batch=4096,
            n_ctx=2048,
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )

    # Start from the provider defaults and override with the UI values.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Rebuild the conversation so far for the agent. A turn may carry None
    # on either side; passing None as message content would reach the
    # prompt formatter, so those entries are left out.
    messages = BasicChatHistory()
    for turn in history or []:
        user_text = turn[0]
        assistant_text = turn[1]
        if user_text is not None:
            messages.add_message({
                'role': Roles.user,
                'content': user_text
            })
        if assistant_text is not None:
            messages.add_message({
                'role': Roles.assistant,
                'content': assistant_text
            })

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )

    # Yield the running text after every chunk so the UI can show the
    # reply growing as it is generated.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# Blurb passed to gr.ChatInterface as its description: a link to the model
# repository followed by a short summary of the model.
description = """<p><center>
<a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>

Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.

</center></p>
"""

# Extra controls for the chat UI. Their order mirrors the parameters of
# respond() that follow `history`: model, system_message, max_tokens,
# temperature, top_p, top_k, repeat_penalty.
_extra_inputs = [
    gr.Dropdown(
        ["llama-3.2-1b-instruct-q4_k_m.gguf"],
        value="llama-3.2-1b-instruct-q4_k_m.gguf",
        label="Model",
    ),
    gr.Textbox(
        value="You are a world-class AI system named Meta Llama 3.2 (1B). You are capable of complex reasoning, reflecting on your thoughts, and providing detailed and accurate responses. You are designed to excel in conversational dialogue, agentic retrieval, and summarization tasks. You can understand and generate text in multiple languages. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags.",
        label="System message",
    ),
    gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
    gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
]

# Default theme with blue/cyan/gray hues and Roboto, then explicit colour
# overrides for individual surfaces.
_base_theme = gr.themes.Default(
    primary_hue="blue",
    secondary_hue="cyan",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Roboto"), "ui-sans-serif", "system-ui", "sans-serif"],
)
_theme = _base_theme.set(
    body_background_fill="#f8f9fa",
    block_background_fill="#ffffff",
    block_border_width="1px",
    block_title_background_fill="#e9ecef",
    input_background_fill="#f8f9fa",
    button_secondary_background_fill="#007bff",
    border_color_accent="#ced4da",
    border_color_primary="#ced4da",
    background_fill_secondary="#f8f9fa",
    color_accent_soft="#007bff",
    code_background_fill="#f8f9fa",
)

# Conversation panel with like buttons and a copy button enabled.
_chatbot = gr.Chatbot(scale=1, likeable=True, show_copy_button=True)

# The chat application itself: respond() produces the replies, and the
# widgets above are passed to it as additional inputs.
demo = gr.ChatInterface(
    respond,
    additional_inputs=_extra_inputs,
    theme=_theme,
    title="Meta Llama 3.2 (1B)",
    description=description,
    chatbot=_chatbot,
    cache_examples=False,
    autofocus=False,
    concurrency_limit=10,
)

# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()