import torch
import gradio as gr
from peft import AutoPeftModelForCausalLM
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co./docs/huggingface_hub/v0.22.2/en/guides/inference
"""

model_name = "tz3r0n4r/SmolLM2-FT-Race-v0.002.76"
# client = InferenceClient("tz3r0n4r/SmolLM2-FT-Race-v0.002.76")

# Load the PEFT adapter together with its base model.
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some causal-LM tokenizers ship without a pad token; fall back to EOS so that
# padding and the `pad_token_id` passed to `generate` below are well defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]
    # Replay earlier turns so the model sees the whole conversation,
    # not just the latest message.
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,  # try 0.8-0.9 to reduce repetition
        top_p=top_p,
        num_beams=3,              # beam search (combined here with sampling)
        no_repeat_ngram_size=3,   # block verbatim 3-gram repetition
        repetition_penalty=1.2,   # penalize repeated content
        length_penalty=1.0,       # neutral length preference (beam search only)
        early_stopping=True,      # stop once all beams have finished
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back.
    generated = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return generated


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""

SYSTEM_PROMPT = """
You are a skilled assistant specialized in reading comprehension and question generation.
For each passage, generate ONE meaningful multiple-choice question that tests deep understanding.

REQUIRED FORMAT:
Question: [Clear, specific question]
Options:
A) [Distinct option 1]
B) [Distinct option 2]
C) [Distinct option 3]
D) [Distinct option 4]
Answer: [A/B/C/D]

RULES:
1. ALWAYS include Question:, Options:, and Answer: markers
2. NEVER repeat options
3. Make all options similar in length
4. Base everything on passage content only
5. Complete all responses fully
"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
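
# --- Optional local smoke test (hypothetical sketch, not part of the Space) ---
# Sanity-checks `respond` from the command line without launching the Gradio UI.
# The passage below is a made-up placeholder; substitute any paragraph you want
# a question generated for. To use it, comment out `demo.launch()` above and
# uncomment this block.
#
# sample_passage = (
#     "Photosynthesis is the process by which plants convert sunlight, water, "
#     "and carbon dioxide into glucose and oxygen. It takes place primarily in "
#     "the chloroplasts of leaf cells."
# )
# print(respond(
#     sample_passage,
#     history=[],
#     system_message=SYSTEM_PROMPT,
#     max_tokens=256,
#     temperature=0.7,
#     top_p=0.95,
# ))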