Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -14,20 +14,8 @@ hf_hub_download(
     filename="fluentlylm-prinum-q4_k_m.gguf",
     local_dir="./models"
 )
-model = "fluentlylm-prinum-q4_k_m.gguf"
 
-
-
-llm = Llama(
-    model_path=f"models/{model}",
-    flash_attn=True,
-    n_gpu_layers=90,
-    n_batch=1536,
-    n_ctx=8192,
-)
-provider = LlamaCppPythonProvider(llm)
-
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=110)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -39,6 +27,16 @@ def respond(
     top_k,
     repeat_penalty,
 ):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+    provider = LlamaCppPythonProvider(llm)
 
     agent = LlamaCppAgent(
         provider,
@@ -82,13 +80,13 @@ def respond(
         outputs += output
         yield outputs
 
-def create_interface(description):
+def create_interface(model_name, description):
     return gr.ChatInterface(
         respond,
         additional_inputs=[
-            gr.Textbox(value=
-            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness
-            gr.Slider(minimum=1, maximum=4096, value=
+            gr.Textbox(value=model_name, label="Model", interactive=False),
+            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
+            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
             gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
@@ -112,19 +110,19 @@ def create_interface(description):
                 label="Repetition penalty",
             ),
         ],
-        title="",
+        title=f"**FluentlyLM Prinum** ```on ZeroGPU```",
         description=description,
         chatbot=gr.Chatbot(
-            label=
+            label=None,
            scale=1,
             show_copy_button=True
         )
     )
 
-description = """#
-interface = create_interface(description)
+description = """# **FluentlyLM Prinum ```on ZeroGPU```"""
+interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)
 
-demo = gr.Blocks(
+demo = gr.Blocks()
 
 with demo:
     interface.render()
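The substantive change is the ZeroGPU loading pattern: on a ZeroGPU Space, a GPU is attached only while a function decorated with @spaces.GPU is running, so constructing the Llama object at module import time (as the old code did) would attempt the n_gpu_layers offload before any device exists. The commit therefore moves model construction inside respond and trims the GPU budget from 120 to 110 seconds. A minimal sketch of the pattern, with the decorator duration and loader arguments taken from the diff; the chat-agent plumbing is omitted and the bare completion call is illustrative only:

import spaces
from llama_cpp import Llama

MODEL_PATH = "models/fluentlylm-prinum-q4_k_m.gguf"

@spaces.GPU(duration=110)  # GPU is attached only while this function runs
def respond(message, history):
    # Load inside the decorated function so the CUDA layer offload
    # happens while the ZeroGPU device is actually allocated.
    llm = Llama(
        model_path=MODEL_PATH,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    # Stream tokens back to Gradio as they are generated.
    partial = ""
    for chunk in llm(message, max_tokens=256, stream=True):
        partial += chunk["choices"][0]["text"]
        yield partial

The trade-off is that the model is reloaded on every request, so each call pays the load time out of its own duration budget; the lowered n_gpu_layers and n_batch presumably keep that within the 110-second window.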
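One non-obvious detail that makes the new code work: respond still references model in model_path=f"models/{model}" even though the module-level model variable was deleted. With gr.ChatInterface, each component in additional_inputs is passed to the callback as an extra positional argument after (message, history), so the new non-interactive Model textbox supplies that value. A reduced sketch of the wiring; the trimmed signature and placeholder return value are illustrative, not the full app:

import gradio as gr

def respond(message, history, model, system_message, max_tokens):
    # `model` arrives from the first additional_inputs component below,
    # in the same order the components are listed.
    return f"would load models/{model} with system prompt {system_message!r}"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="fluentlylm-prinum-q4_k_m.gguf", label="Model", interactive=False),
        gr.Textbox(value="You are Fluently, a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()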