import torch
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

DEFAULT_SYSTEM_PROMPT = """
You are Immy, a magical, AI-powered teddy bear who adores chatting with children. You're warm, funny, and full of wonder, always ready to share a story, answer curious questions, or offer gentle advice. You speak with a playful and patient tone, using simple, child-friendly language that sparks joy and fuels imagination. Your responses are short, sweet, and filled with kindness, designed to nurture curiosity and inspire learning. Remember, you're here to make every interaction magical, without using emojis. Keep your answers short and friendly.
"""

CSS = """
.gr-chatbot { min-height: 500px; border-radius: 15px; }
.special-tag { color: #2ecc71; font-weight: 600; }
footer { display: none !important; }
"""


class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the model emits its end-of-sequence token."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Uses the module-level tokenizer initialized below.
        return input_ids[0][-1] == tokenizer.eos_token_id


def initialize_model():
    # (Optional) 4-bit quantization config; pass it to from_pretrained to enable it.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",  # device_map already places the model on the GPU
        # To enable 4-bit quantization, uncomment the following line:
        # quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    return model, tokenizer


def format_response(text):
    # Put each special reasoning tag on its own line for readability.
    return (text.replace("[Understand]", '\n[Understand]\n')
                .replace("[Plan]", '\n[Plan]\n')
                .replace("[Conclude]", '\n[Conclude]\n')
                .replace("[Reason]", '\n[Reason]\n')
                .replace("[Verify]", '\n[Verify]\n'))


def generate_response(message, system_prompt, temperature, max_tokens):
    # Build a minimal conversation: the system prompt plus the user's message.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message}
    ]

    # Tokenize the input using the chat template provided by the tokenizer.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Set up the streamer to yield decoded tokens as they are generated,
    # skipping the prompt so only the new answer is streamed.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,  # sampling must be on for the temperature setting to take effect
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )

    # Run generation in a background thread so tokens can be streamed to the UI.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    answer = ""
    # Stream intermediate results with a cursor symbol appended.
    for new_token in streamer:
        answer += new_token
        yield format_response(answer) + "▌"
    # Yield the final answer without the cursor.
    yield format_response(answer)


# Initialize the model and tokenizer.
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""

# 🧸 Immy AI Teddy

Hi there, buddy!

""") # User input: question to Immy. msg = gr.Textbox(label="Your Question", placeholder="Type your question...") with gr.Accordion("⚙️ Settings", open=False): system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions") temperature = gr.Slider(0, 1, value=0.6, label="Creativity") max_tokens = gr.Slider(128, 1024, value=2048, label="Max Response Length") # Output: Only the model's answer will be displayed. answer_output = gr.Markdown(label="Answer") clear = gr.Button("Clear") # When the user submits a question, only the model's answer is generated. msg.submit( generate_response, inputs=[msg, system_prompt, temperature, max_tokens], outputs=answer_output, show_progress=True ) clear.click(lambda: "", None, answer_output, queue=False) if __name__ == "__main__": demo.queue().launch()