Daemontatox committed (verified)
Commit: d6bec85
Parent(s): 83f478c

Update app.py

Files changed (1):
  1. app.py (+3, -5)
app.py CHANGED
@@ -32,7 +32,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # (Optional) Enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -46,7 +45,7 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # If you want to enable 4-bit quantization, uncomment the following line:
+        # If you want to enable 4-bit quantization, uncomment the next line:
         # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
@@ -55,14 +54,13 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    # Apply formatting to special tokens if needed
     return (text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n')
                 .replace("[Plan]", '\n<strong class="special-tag">[Plan]</strong>\n')
                 .replace("[Conclude]", '\n<strong class="special-tag">[Conclude]</strong>\n')
                 .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n')
                 .replace("[Verify]", '\n<strong class="special-tag">[Verify]</strong>\n'))
 
-@gradio.sync # Ensures compatibility with the async streaming interface.
+@gr.sync # Use gr.sync instead of gradio.sync
 def generate_response(message, system_prompt, temperature, max_tokens):
     # Create a minimal conversation with only the system prompt and the user's message.
     conversation = [
@@ -70,7 +68,7 @@ def generate_response(message, system_prompt, temperature, max_tokens):
         {"role": "user", "content": message}
     ]
 
-    # Tokenize input using the chat template provided by the tokenizer
+    # Tokenize input using the chat template provided by the tokenizer.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
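
For reference, the 4-bit path that the updated comment points to would simply pass the BitsAndBytesConfig already built in initialize_model() through to from_pretrained. The sketch below assumes the same imports and MODEL_ID as app.py; the nf4 quant type and double-quantization flag are common settings and an assumption here, since the diff cuts the config off after bnb_4bit_compute_dtype.

# Sketch only: what initialize_model() would look like with the commented-out
# 4-bit path enabled. "nf4" and double quantization are assumptions, not shown
# in the diff; MODEL_ID is a placeholder for the value defined elsewhere in app.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "..."  # placeholder

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",         # assumption: not visible in the diff
    bnb_4bit_use_double_quant=True,    # assumption: not visible in the diff
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    quantization_config=quantization_config,  # uncommented relative to the diff
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)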
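
The apply_chat_template call in the last hunk is truncated; a typical continuation of generate_response that streams tokens back to the Gradio UI is sketched below. Everything after add_generation_prompt=True (return_tensors, the TextIteratorStreamer wiring, and the sampling arguments) is an assumption and is not taken from this commit.

# Sketch only: one common way to finish generate_response() with streamed output.
# Arguments beyond add_generation_prompt are assumptions; model, tokenizer and
# format_response come from the surrounding app.py.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response(message, system_prompt, temperature, max_tokens):
    # Minimal conversation: only the system prompt and the user's message.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message}
    ]

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",   # assumption: cut off in the diff
    ).to(model.device)

    # Stream tokens from generate() running in a background thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield format_response(partial)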