Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -14,20 +14,8 @@ hf_hub_download(
     filename="fluentlylm-prinum-q4_k_m.gguf",
     local_dir="./models"
 )
-model = "fluentlylm-prinum-q4_k_m.gguf"
 
-
-
-llm = Llama(
-    model_path=f"models/{model}",
-    flash_attn=True,
-    n_gpu_layers=90,
-    n_batch=1536,
-    n_ctx=8192,
-)
-provider = LlamaCppPythonProvider(llm)
-
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=110)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -39,6 +27,16 @@ def respond(
     top_k,
     repeat_penalty,
 ):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+    provider = LlamaCppPythonProvider(llm)
 
     agent = LlamaCppAgent(
         provider,
@@ -82,13 +80,13 @@ def respond(
         outputs += output
         yield outputs
 
-def create_interface(description):
+def create_interface(model_name, description):
     return gr.ChatInterface(
         respond,
         additional_inputs=[
-            gr.Textbox(value=
-            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness
-            gr.Slider(minimum=1, maximum=4096, value=
+            gr.Textbox(value=model_name, label="Model", interactive=False),
+            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
+            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
             gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
@@ -112,19 +110,19 @@ def create_interface(description):
                 label="Repetition penalty",
             ),
         ],
-        title="",
+        title=f"**FluentlyLM Prinum** ```on ZeroGPU```",
         description=description,
         chatbot=gr.Chatbot(
-            label=
+            label=None,
            scale=1,
             show_copy_button=True
         )
     )
 
-description = """#
-interface = create_interface(description)
+description = """# **FluentlyLM Prinum ```on ZeroGPU```"""
+interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)
 
-demo = gr.Blocks(
+demo = gr.Blocks()
 
 with demo:
     interface.render()
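The substantive change is the ZeroGPU loading pattern: on a ZeroGPU Space, a GPU is attached only while a function decorated with @spaces.GPU is running, so constructing the Llama object at module import time (as the old code did) would attempt the n_gpu_layers offload before any device exists. The commit therefore moves model construction inside respond and trims the GPU budget from 120 to 110 seconds. A minimal sketch of the pattern, with the decorator duration and loader arguments taken from the diff; the chat-agent plumbing is omitted and the bare completion call is illustrative only:

import spaces
from llama_cpp import Llama

MODEL_PATH = "models/fluentlylm-prinum-q4_k_m.gguf"

@spaces.GPU(duration=110)  # GPU is attached only while this function runs
def respond(message, history):
    # Load inside the decorated function so the CUDA layer offload
    # happens while the ZeroGPU device is actually allocated.
    llm = Llama(
        model_path=MODEL_PATH,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    # Stream tokens back to Gradio as they are generated.
    partial = ""
    for chunk in llm(message, max_tokens=256, stream=True):
        partial += chunk["choices"][0]["text"]
        yield partial

The trade-off is that the model is reloaded on every request, so each call pays the load time out of its own duration budget; the lowered n_gpu_layers and n_batch presumably keep that within the 110-second window.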
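One non-obvious detail that makes the new code work: respond still references model in model_path=f"models/{model}" even though the module-level model variable was deleted. With gr.ChatInterface, each component in additional_inputs is passed to the callback as an extra positional argument after (message, history), so the new non-interactive Model textbox supplies that value. A reduced sketch of the wiring; the trimmed signature and placeholder return value are illustrative, not the full app:

import gradio as gr

def respond(message, history, model, system_message, max_tokens):
    # `model` arrives from the first additional_inputs component below,
    # in the same order the components are listed.
    return f"would load models/{model} with system prompt {system_message!r}"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="fluentlylm-prinum-q4_k_m.gguf", label="Model", interactive=False),
        gr.Textbox(value="You are Fluently, a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()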