Spaces: Running on Zero

Update app.py
Daemontatox committed

app.py CHANGED
@@ -1,5 +1,4 @@
 import torch
-import spaces
 import gradio as gr
 from threading import Thread
 from transformers import (
@@ -20,13 +19,8 @@ You speak with a playful and patient tone, using simple, child-friendly language
 Your responses are short, sweet, and filled with kindness, designed to nurture curiosity and inspire learning.
 Remember, you’re here to make every interaction magical—without using emojis.
 Keep your answers short and friendly.
-
-
-
-
 """
 
-
 CSS = """
 .gr-chatbot { min-height: 500px; border-radius: 15px; }
 .special-tag { color: #2ecc71; font-weight: 600; }
@@ -38,6 +32,7 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
+    # (Optional) Enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -51,6 +46,7 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
+        # If you want to enable 4-bit quantization, uncomment the following line:
         # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
@@ -59,30 +55,29 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Tokenize input
+    # Apply formatting to special tokens if needed
+    return (text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n')
+            .replace("[Plan]", '\n<strong class="special-tag">[Plan]</strong>\n')
+            .replace("[Conclude]", '\n<strong class="special-tag">[Conclude]</strong>\n')
+            .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n')
+            .replace("[Verify]", '\n<strong class="special-tag">[Verify]</strong>\n'))
+
+# Generator function: Gradio streams each yielded value straight to the output component.
+def generate_response(message, system_prompt, temperature, max_tokens):
+    # Create a minimal conversation with only the system prompt and the user's message.
+    conversation = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": message}
+    ]
+
+    # Tokenize input using the chat template provided by the tokenizer
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to(model.device)
 
-    #
+    # Set up the streamer to yield tokens as they are generated.
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
@@ -92,49 +87,47 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )
 
-    # Start generation thread
+    # Start generation in a separate thread.
     Thread(target=model.generate, kwargs=generate_kwargs).start()
 
-
-
-    new_history = chat_history + [(message, "")]
-
-    # Stream response
+    answer = ""
+    # Stream and yield intermediate results with a cursor symbol.
     for new_token in streamer:
-
-
-
-
-
-    # Final update without cursor
-    new_history[-1] = (message, format_response(partial_message))
-    yield new_history
+        answer += new_token
+        yield format_response(answer) + "▌"
+    # Yield the final answer without the cursor.
+    yield format_response(answer)
 
+# Initialize the model and tokenizer
 model, tokenizer = initialize_model()
 
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    <h1 align="center"
-    <p align="center">
+    <h1 align="center">🧸 Immy Ai Teddy</h1>
+    <p align="center">Hi there, buddy!</p>
     """)
 
-
+    # User input: question to Immy.
     msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
 
     with gr.Accordion("⚙️ Settings", open=False):
         system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
         temperature = gr.Slider(0, 1, value=0.6, label="Creativity")
         max_tokens = gr.Slider(128, 1024, value=2048, label="Max Response Length")
-
-    clear = gr.Button("Clear History")
 
+    # Output: Only the model's answer will be displayed.
+    answer_output = gr.Markdown(label="Answer")
+
+    clear = gr.Button("Clear")
+
+    # When the user submits a question, only the model's answer is generated.
     msg.submit(
         generate_response,
-        [msg,
-
+        inputs=[msg, system_prompt, temperature, max_tokens],
+        outputs=answer_output,
         show_progress=True
     )
-    clear.click(lambda:
+    clear.click(lambda: "", None, answer_output, queue=False)
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
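
Note on the quantization path left in the diff: initialize_model still builds a BitsAndBytesConfig, but the quantization_config argument stays commented out in from_pretrained, so the model loads in plain bfloat16. Below is a minimal sketch of what the enabled 4-bit load would look like, assuming the bitsandbytes package is installed; the model id is a placeholder standing in for the MODEL_ID defined elsewhere in app.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "some-org/some-model"  # placeholder; app.py defines its own MODEL_ID

# 4-bit weight quantization with bfloat16 compute, mirroring the config in the diff.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",                        # as in app.py; "auto" also works
    quantization_config=quantization_config,  # uncommented, so quantization is applied
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)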
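
For reference, a minimal sketch of the streaming pattern the rewritten generate_response relies on: model.generate runs in a background thread and writes into a TextIteratorStreamer, while a generator function yields the growing answer so Gradio can render each partial result. The "gpt2" id is only a placeholder, and plain tokenization stands in for the app's apply_chat_template call.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "gpt2"  # placeholder model; the app uses its own MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

def stream_reply(prompt, max_new_tokens=64):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    # skip_prompt=True keeps the echoed prompt out of the streamed text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Generation runs in a background thread; the streamer yields decoded text chunks.
    Thread(
        target=model.generate,
        kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens),
    ).start()
    answer = ""
    for chunk in streamer:
        answer += chunk
        yield answer  # in the app, each yield updates the Gradio output component

if __name__ == "__main__":
    for partial in stream_reply("Once upon a time"):
        print(partial)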