Spaces: Running on Zero
Daemontatox committed on
Update app.py

app.py CHANGED
@@ -32,7 +32,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # (Optional) Enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -46,7 +45,7 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # If you want to enable 4-bit quantization, uncomment the
+        # If you want to enable 4-bit quantization, uncomment the next line:
         # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
@@ -55,14 +54,13 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    # Apply formatting to special tokens if needed
     return (text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n')
             .replace("[Plan]", '\n<strong class="special-tag">[Plan]</strong>\n')
             .replace("[Conclude]", '\n<strong class="special-tag">[Conclude]</strong>\n')
             .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n')
             .replace("[Verify]", '\n<strong class="special-tag">[Verify]</strong>\n'))
 
-@
+@gr.sync # Use gr.sync instead of gradio.sync
 def generate_response(message, system_prompt, temperature, max_tokens):
     # Create a minimal conversation with only the system prompt and the user's message.
     conversation = [
@@ -70,7 +68,7 @@ def generate_response(message, system_prompt, temperature, max_tokens):
         {"role": "user", "content": message}
     ]
 
-    # Tokenize input using the chat template provided by the tokenizer
+    # Tokenize input using the chat template provided by the tokenizer.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,