ffreemt committed
Commit 3cae1b6 · Parent(s): f0fb4eb
Update buff enabled

Files changed:
- .gitignore +1 -0
- app.py +39 -26
- run-app.sh +1 -0
.gitignore
CHANGED
@@ -9,3 +9,4 @@ models
 .ruff_cache
 run-nodemon.sh
 app-.py
+nodemon.json
app.py
CHANGED
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import os
 import time
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass
 from pathlib import Path
 from types import SimpleNamespace
 
@@ -39,9 +39,9 @@ URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main
 
 url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
 url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin" # 7.37G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" # 7.87G
 url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin" # 7.37G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" #
 
 prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
@@ -50,9 +50,6 @@ prompt_template="""Below is an instruction that describes a task. Write a respon
 ### Response:
 """
 
-prompt_template_qa = """Question: {question}
-Answer: Let's work this out in a step by step way to be sure we have the right answer."""
-
 prompt_template = """System: You are a helpful,
 respectful and honest assistant. Always answer as
 helpfully as possible, while being safe. Your answers
@@ -67,9 +64,17 @@ information.
 User: {prompt}
 Assistant: """
 
-
+prompt_template = """Question: {question}
+Answer: Let's work this out in a step by step way to be sure we have the right answer."""
 
-
+_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
+stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
+
+try:
+    model_loc, file_size = dl_hf_model(url)
+except Exception as exc_:
+    logger.error(exc_)
+    raise SystemExit(1) from exc_
 
 logger.debug(f"{model_loc} {file_size}GB")
 
@@ -85,7 +90,7 @@ logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
 LLM = None
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
-    model_type="llama",
+    model_type="llama",
     threads=cpu_count,
 )
 
@@ -100,7 +105,7 @@ except Exception:
 
 ns = SimpleNamespace(
     response="",
-    generator=[],
+    generator=(_ for _ in []),
 )
 
 
@@ -115,17 +120,17 @@ class GenerationConfig:
     reset: bool = False
     stream: bool = True
    threads: int = cpu_count
-    stop: list[str] = field(default_factory=lambda: [stop_string])
+    # stop: list[str] = field(default_factory=lambda: [stop_string])
 
 
 def generate(
-
-    llm
+    question: str,
+    llm=LLM,
     generation_config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
     # if not user_prompt.strip():
-    _ = prompt_template.format(
+    _ = prompt_template.format(question=question)
     print(_)
     return llm(
         _,
@@ -210,13 +215,13 @@ def predict(prompt, bot):
         for word in generator:
             # record first response time
             if flag:
-
+                fisrt_arr = f"{time.time() - then:.1f}s"
+                logger.debug(f"\t 1st arrival: {fisrt_arr}")
                 flag = 0
-
-            print(word, flush=True) # vertical stream
+            print(word, end="", flush=True)
+            # print(word, flush=True) # vertical stream
             response += word
-            ns.response = response
-            buff.update(value=response)
+            ns.response = f"({fisrt_arr}){response}"
         print("")
         logger.debug(f"{response=}")
     except Exception as exc:
@@ -229,7 +234,7 @@ def predict(prompt, bot):
         f"{atime.duration/(len(prompt) + len(response)):.1f}s/char)" # type: ignore
     )
 
-    bot.append([prompt, f"{response} {_}"])
+    bot.append([prompt, f"{response} \n{_}"])
 
     return prompt, bot
 
@@ -247,9 +252,9 @@ def predict_api(prompt):
         max_new_tokens=512, # adjust as needed
         seed=42,
         reset=False, # reset history (cache)
-        stream=True,
+        stream=True,
         threads=cpu_count,
-        stop=prompt_prefix[1:2],
+        # stop=prompt_prefix[1:2],
     )
 
     generator = generate(
@@ -274,6 +279,10 @@ def predict_api(prompt):
     return response
 
 
+def update_buff():
+    return ns.response
+
+
 css = """
 .importantButton {
     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
@@ -320,8 +329,9 @@ examples = [
     ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
 ]
 
+logger.info("start block")
+
 with gr.Blocks(
-    # title="mpt-30b-chat-ggml",
     title=f"{Path(model_loc).name}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
@@ -343,7 +353,7 @@ with gr.Blocks(
 
     # chatbot = gr.Chatbot().style(height=700)  # 500
     chatbot = gr.Chatbot(height=500)
-    buff = gr.Textbox(show_label=False, visible=
+    buff = gr.Textbox(show_label=False, visible=True)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
@@ -359,12 +369,13 @@ with gr.Blocks(
     with gr.Row(visible=False):
        with gr.Accordion("Advanced Options:", open=False):
            with gr.Row():
-                with gr.Column(scale=2):
+                with gr.Column(scale=2, container=False):
                     system = gr.Textbox(
                         label="System Prompt",
                         value=prompt_template,
                         show_label=False,
-                    ).style(container=False)
+                        # ).style(container=False)
+                    )
                 with gr.Column():
                     with gr.Row():
                         change = gr.Button("Change System Prompt")
@@ -445,6 +456,8 @@ with gr.Blocks(
         api_name="api",
     )
 
+    block.load(update_buff, [], buff, every=1)
+
 # concurrency_count=5, max_size=20
 # max_size=36, concurrency_count=14
 block.queue(concurrency_count=5, max_size=20).launch(debug=True)
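Two of the lines added near the top of app.py derive a stop string from whatever prompt_template holds at that point. Traced by hand against the new Question/Answer template (a standalone sketch, not code from the Space), the derivation resolves to "Question:", which is the value the now commented-out stop field in GenerationConfig would receive:

# Standalone trace of the stop-string derivation added in this commit.
prompt_template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""

# Keep only the non-blank lines of the template.
_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
# -> ['Question: {question}', "Answer: Let's work this out ..."]

# Take the text before the first colon on each line, re-append the colon,
# then pick the second-to-last entry.
stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
print(stop_string)  # Question: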
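The commit message points at the buff wiring, which amounts to a simple polling pattern: predict() streams each token into ns.response, the buff textbox is now created with visible=True, and block.load(update_buff, [], buff, every=1) reads the shared value once per second so the partial answer is shown while generation is still running. A minimal self-contained sketch of the same pattern (a fake word generator stands in for the real ctransformers model, and it assumes a gradio 3.x build where Blocks.load accepts every=):

import time

import gradio as gr

ns = {"response": ""}  # plays the role of the SimpleNamespace in app.py


def fake_generate(prompt: str):
    # Hypothetical stand-in for the streaming LLM generator.
    for word in f"you said: {prompt}".split():
        time.sleep(0.5)
        yield word + " "


def predict(prompt, bot):
    response = ""
    for word in fake_generate(prompt):
        response += word
        ns["response"] = response  # side channel read by the poller below
    bot = bot + [[prompt, response]]
    return "", bot


def update_buff():
    return ns["response"]


with gr.Blocks() as block:
    chatbot = gr.Chatbot(height=300)
    buff = gr.Textbox(show_label=False, visible=True)  # live partial response
    msg = gr.Textbox()
    msg.submit(predict, [msg, chatbot], [msg, chatbot])
    # Poll ns["response"] every second and push it into the visible textbox.
    block.load(update_buff, [], buff, every=1)

block.queue(concurrency_count=5, max_size=20).launch()

Polling through ns sidesteps the removed buff.update(value=response) call inside the generation loop, which is presumably why predict() now only writes to ns.response.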
run-app.sh
ADDED
@@ -0,0 +1 @@
+nodemon -w . -x python app.py