Spaces: Running on Zero
Daemontatox committed on
Update app.py

app.py CHANGED
@@ -32,7 +32,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # (Optional) Enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -46,7 +45,7 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # If you want to enable 4-bit quantization, uncomment the
+        # If you want to enable 4-bit quantization, uncomment the next line:
         # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
@@ -55,14 +54,13 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    # Apply formatting to special tokens if needed
     return (text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n')
             .replace("[Plan]", '\n<strong class="special-tag">[Plan]</strong>\n')
             .replace("[Conclude]", '\n<strong class="special-tag">[Conclude]</strong>\n')
             .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n')
             .replace("[Verify]", '\n<strong class="special-tag">[Verify]</strong>\n'))
 
-@
+@gr.sync # Use gr.sync instead of gradio.sync
 def generate_response(message, system_prompt, temperature, max_tokens):
     # Create a minimal conversation with only the system prompt and the user's message.
     conversation = [
@@ -70,7 +68,7 @@ def generate_response(message, system_prompt, temperature, max_tokens):
         {"role": "user", "content": message}
     ]
 
-    # Tokenize input using the chat template provided by the tokenizer
+    # Tokenize input using the chat template provided by the tokenizer.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,