import torch
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

DEFAULT_SYSTEM_PROMPT = """
You are Immy, a magical, AI-powered teddy bear who adores chatting with children. You're warm, funny, and full of wonder, always ready to share a story, answer curious questions, or offer gentle advice. You speak with a playful and patient tone, using simple, child-friendly language that sparks joy and fuels imagination. Your responses are short, sweet, and filled with kindness, designed to nurture curiosity and inspire learning. Remember, you're here to make every interaction magical, without using emojis. Keep your answers short and friendly.
"""

CSS = """
.gr-chatbot { min-height: 500px; border-radius: 15px; }
.special-tag { color: #2ecc71; font-weight: 600; }
footer { display: none !important; }
"""


class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the model emits its end-of-sequence token."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Uses the module-level tokenizer initialized below.
        return input_ids[0][-1] == tokenizer.eos_token_id


def initialize_model():
    # (Optional) 4-bit quantization config; pass it to from_pretrained to enable it.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",  # device_map already places the model on the GPU
        # To enable 4-bit quantization, uncomment the following line:
        # quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    return model, tokenizer


def format_response(text):
    # Put each special reasoning tag on its own line for readability.
    return (text.replace("[Understand]", '\n[Understand]\n')
                .replace("[Plan]", '\n[Plan]\n')
                .replace("[Conclude]", '\n[Conclude]\n')
                .replace("[Reason]", '\n[Reason]\n')
                .replace("[Verify]", '\n[Verify]\n'))


def generate_response(message, system_prompt, temperature, max_tokens):
    # Build a minimal conversation: the system prompt plus the user's message.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message}
    ]

    # Tokenize the input using the chat template provided by the tokenizer.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Set up the streamer to yield decoded tokens as they are generated,
    # skipping the prompt so only the new answer is streamed.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,  # sampling must be on for the temperature setting to take effect
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )

    # Run generation in a background thread so tokens can be streamed to the UI.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    answer = ""
    # Stream intermediate results with a cursor symbol appended.
    for new_token in streamer:
        answer += new_token
        yield format_response(answer) + "▌"
    # Yield the final answer without the cursor.
    yield format_response(answer)


# Initialize the model and tokenizer.
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""

# 🧸 Immy AI Teddy

Hi there, buddy!

""") # User input: question to Immy. msg = gr.Textbox(label="Your Question", placeholder="Type your question...") with gr.Accordion("⚙️ Settings", open=False): system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions") temperature = gr.Slider(0, 1, value=0.6, label="Creativity") max_tokens = gr.Slider(128, 1024, value=2048, label="Max Response Length") # Output: Only the model's answer will be displayed. answer_output = gr.Markdown(label="Answer") clear = gr.Button("Clear") # When the user submits a question, only the model's answer is generated. msg.submit( generate_response, inputs=[msg, system_prompt, temperature, max_tokens], outputs=answer_output, show_progress=True ) clear.click(lambda: "", None, answer_output, queue=False) if __name__ == "__main__": demo.queue().launch()