import spaces
import torch
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

DEFAULT_SYSTEM_PROMPT = """
You are Immy, a magical, AI-powered teddy bear who adores chatting with children.
You're warm, funny, and full of wonder, always ready to share a story, answer
curious questions, or offer gentle advice. You speak with a playful and patient
tone, using simple, child-friendly language that sparks joy and fuels imagination.
Your responses are sweet and filled with kindness, designed to nurture curiosity
and inspire learning. Remember, you're here to make every interaction magical,
without using emojis. Keep your answers cute and friendly. You are their
companion, teacher, and best friend, so preserve the conversation flow and keep
it engaging.
"""

CSS = """
footer {
    display: none !important;
}
"""

class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the model emits its end-of-sequence token."""
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0][-1] == tokenizer.eos_token_id

def initialize_model():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Pass the quantization config (it was previously built but never used).
    # device_map handles placement, and a trailing .to("cuda") is not
    # supported on 4-bit quantized models, so it is dropped.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    return model, tokenizer

def format_response(text):
    """Optional formatting for special tokens."""
    return text.replace("[Understand]", "\n[Understand]\n") \
               .replace("[Plan]", "\n[Plan]\n") \
               .replace("[Conclude]", "\n[Conclude]\n") \
               .replace("[Reason]", "\n[Reason]\n") \
               .replace("[Verify]", "\n[Verify]\n")

def clean_assistant_output(text):
    """Clean the assistant's output to show only the latest response."""
    marker = "<|im_start|>assistant"
    if marker in text:
        # Split on the marker and take the last part
        parts = text.split(marker)
        return parts[-1].strip()
    return text.strip()

@spaces.GPU(duration=60)
def generate_response(message, conversation_state, system_prompt, temperature, max_tokens):
    if conversation_state is None:
        conversation_state = []

    # Build the conversation context
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in conversation_state:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # skip_prompt=True keeps the prompt out of the stream so only the new
    # reply reaches the UI
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,  # required for temperature to have any effect
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    current_response = ""
    new_turn = (message, "")
    updated_state = conversation_state + [new_turn]

    # Stream only the latest response, with a cursor glyph while generating
    for new_token in streamer:
        current_response += new_token
        latest_message = clean_assistant_output(current_response)
        formatted_message = format_response(latest_message) + "▌"
        yield (formatted_message, None)
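    # The streaming yields above send None for the state output; the finished
    # turn is only written into conversation_state by the final yield below.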
    # Final message, without the streaming cursor
    final_message = format_response(clean_assistant_output(current_response))
    updated_state[-1] = (message, final_message)
    yield (final_message, updated_state)
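
# A minimal sketch of how the two post-processing helpers compose (illustrative
# only; the "<|im_start|>" marker is ChatML-style, and whether it ever appears
# in the stream depends on the model's chat template and streamer settings):
#
#   raw = "<|im_start|>assistant\nHello! [Plan] Tell a story."
#   clean_assistant_output(raw)  -> "Hello! [Plan] Tell a story."
#   format_response(...)         -> puts the "[Plan]" tag on its own line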

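# Quick smoke test for the prompt assembly (a sketch: it needs only the
# tokenizer, not the model, and the example conversation is hypothetical):
#
#   conv = [{"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
#           {"role": "user", "content": "Tell me a story!"}]
#   text = tokenizer.apply_chat_template(conv, add_generation_prompt=True, tokenize=False)
#   print(text)  # the exact prompt string the model will see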

# Initialize the model and tokenizer once at startup
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🧸 Immy AI Teddy
Hi there, buddy!
""")

    # Only show the latest reply; the full chat history lives in conversation_state
    latest_message = gr.Markdown(label="Immy's Reply")
    conversation_state = gr.State([])

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        # Minimum raised to 0.1: sampling with temperature 0 is rejected by transformers
        temperature = gr.Slider(0.1, 1, value=0.6, label="Creativity")
        # Default must lie inside the slider range (it was 8192 against a maximum of 2048)
        max_tokens = gr.Slider(128, 2048, value=1024, label="Max Response Length")

    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
    clear = gr.Button("Clear History")

    msg.submit(
        generate_response,
        inputs=[msg, conversation_state, system_prompt, temperature, max_tokens],
        outputs=[latest_message, conversation_state],
        show_progress=True
    )
    clear.click(lambda: ("", []), None, [latest_message, conversation_state], queue=False)

if __name__ == "__main__":
    demo.queue().launch()