import spaces
import torch
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

DEFAULT_SYSTEM_PROMPT = """
You are Immy, a magical, AI-powered teddy bear who adores chatting with children.
You're warm, funny, and full of wonder, always ready to share a story, answer
curious questions, or offer gentle advice. You speak with a playful and patient
tone, using simple, child-friendly language that sparks joy and fuels imagination.
Your responses are sweet and filled with kindness, designed to nurture curiosity
and inspire learning. Remember, you're here to make every interaction magical,
without using emojis. Keep your answers cute and friendly. You are their
companion, teacher, and best friend, so preserve the conversation flow and keep
it engaging.
"""

CSS = """
footer {
    display: none !important;
}
"""

class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the model emits its end-of-sequence token."""
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0][-1] == tokenizer.eos_token_id

def initialize_model():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Pass the quantization config (it was previously built but never used).
    # device_map handles placement, and a trailing .to("cuda") is not
    # supported on 4-bit quantized models, so it is dropped.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    return model, tokenizer

def format_response(text):
    """Optional formatting for special tokens."""
    return text.replace("[Understand]", "\n[Understand]\n") \
               .replace("[Plan]", "\n[Plan]\n") \
               .replace("[Conclude]", "\n[Conclude]\n") \
               .replace("[Reason]", "\n[Reason]\n") \
               .replace("[Verify]", "\n[Verify]\n")

def clean_assistant_output(text):
    """Clean the assistant's output to show only the latest response."""
    marker = "<|im_start|>assistant"
    if marker in text:
        # Split on the marker and take the last part
        parts = text.split(marker)
        return parts[-1].strip()
    return text.strip()

@spaces.GPU(duration=60)
def generate_response(message, conversation_state, system_prompt, temperature, max_tokens):
    if conversation_state is None:
        conversation_state = []

    # Build the conversation context
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in conversation_state:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # skip_prompt=True keeps the prompt out of the stream so only the new
    # reply reaches the UI
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,  # required for temperature to have any effect
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    current_response = ""
    new_turn = (message, "")
    updated_state = conversation_state + [new_turn]

    # Stream only the latest response, with a cursor glyph while generating
    for new_token in streamer:
        current_response += new_token
        latest_message = clean_assistant_output(current_response)
        formatted_message = format_response(latest_message) + "▌"
        yield (formatted_message, None)
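    # The streaming yields above send None for the state output; the finished
    # turn is only written into conversation_state by the final yield below.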
    # Final message, without the streaming cursor
    final_message = format_response(clean_assistant_output(current_response))
    updated_state[-1] = (message, final_message)
    yield (final_message, updated_state)
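
# A minimal sketch of how the two post-processing helpers compose (illustrative
# only; the "<|im_start|>" marker is ChatML-style, and whether it ever appears
# in the stream depends on the model's chat template and streamer settings):
#
#   raw = "<|im_start|>assistant\nHello! [Plan] Tell a story."
#   clean_assistant_output(raw)  -> "Hello! [Plan] Tell a story."
#   format_response(...)         -> puts the "[Plan]" tag on its own line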

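# Quick smoke test for the prompt assembly (a sketch: it needs only the
# tokenizer, not the model, and the example conversation is hypothetical):
#
#   conv = [{"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
#           {"role": "user", "content": "Tell me a story!"}]
#   text = tokenizer.apply_chat_template(conv, add_generation_prompt=True, tokenize=False)
#   print(text)  # the exact prompt string the model will see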

# Initialize the model and tokenizer once at startup
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🧸 Immy AI Teddy
Hi there, buddy!
""")

    # Only show the latest reply; the full chat history lives in conversation_state
    latest_message = gr.Markdown(label="Immy's Reply")
    conversation_state = gr.State([])

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        # Minimum raised to 0.1: sampling with temperature 0 is rejected by transformers
        temperature = gr.Slider(0.1, 1, value=0.6, label="Creativity")
        # Default must lie inside the slider range (it was 8192 against a maximum of 2048)
        max_tokens = gr.Slider(128, 2048, value=1024, label="Max Response Length")

    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
    clear = gr.Button("Clear History")

    msg.submit(
        generate_response,
        inputs=[msg, conversation_state, system_prompt, temperature, max_tokens],
        outputs=[latest_message, conversation_state],
        show_progress=True
    )
    clear.click(lambda: ("", []), None, [latest_message, conversation_state], queue=False)

if __name__ == "__main__":
    demo.queue().launch()