ehristoforu committed on
Commit b096473 · verified · 1 Parent(s): 407f2d8

Update app.py

Files changed (1): app.py (+20 -22)
app.py CHANGED
@@ -14,20 +14,8 @@ hf_hub_download(
     filename="fluentlylm-prinum-q4_k_m.gguf",
     local_dir="./models"
 )
-model = "fluentlylm-prinum-q4_k_m.gguf"
 
-chat_template = MessagesFormatterType.GEMMA_2
-
-llm = Llama(
-    model_path=f"models/{model}",
-    flash_attn=True,
-    n_gpu_layers=90,
-    n_batch=1536,
-    n_ctx=8192,
-)
-provider = LlamaCppPythonProvider(llm)
-
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=110)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -39,6 +27,16 @@ def respond(
     top_k,
     repeat_penalty,
 ):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+    provider = LlamaCppPythonProvider(llm)
 
     agent = LlamaCppAgent(
         provider,
@@ -82,13 +80,13 @@ def respond(
         outputs += output
         yield outputs
 
-def create_interface(description):
+def create_interface(model_name, description):
     return gr.ChatInterface(
         respond,
         additional_inputs=[
-            gr.Textbox(value=model, label="Model", interactive=False),
-            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness.", label="System message"),
-            gr.Slider(minimum=1, maximum=4096, value=1536, step=8, label="Max tokens"),
+            gr.Textbox(value=model_name, label="Model", interactive=False),
+            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
+            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
             gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
@@ -112,19 +110,19 @@ def create_interface(description):
                 label="Repetition penalty",
             ),
         ],
-        title="",
+        title=f"**FluentlyLM Prinum** ```on ZeroGPU```",
         description=description,
         chatbot=gr.Chatbot(
-            label="Good afternoon.",
+            label=None,
             scale=1,
             show_copy_button=True
         )
     )
 
-description = """# /**FluentlyLM Prinum** ```on ZeroGPU```"""
-interface = create_interface(description)
+description = """# **FluentlyLM Prinum ```on ZeroGPU```"""
+interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)
 
-demo = gr.Blocks(theme='allenai/gradio-theme', title="FluentlyLM Prinum – Demo")
+demo = gr.Blocks()
 
 with demo:
     interface.render()
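
For orientation, here is a minimal sketch of how app.py is laid out after this commit. The imports, the repo_id, and the body of respond() are assumptions reconstructed from the unchanged context lines and the usual llama-cpp-agent streaming template, not the file's verbatim contents. Note that the commit deletes the global model = "fluentlylm-prinum-q4_k_m.gguf" assignment while respond() still formats f"models/{model}", which would raise NameError on the first request; the sketch reintroduces that name as MODEL_FILE so it runs.

```python
# Minimal sketch of app.py after commit b096473 -- not the verbatim file.
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

# The commit removes the global `model` but respond() still reads
# f"models/{model}"; a module-level constant restores that name.
MODEL_FILE = "fluentlylm-prinum-q4_k_m.gguf"

hf_hub_download(
    repo_id="fluently-lm/FluentlyLM-Prinum-GGUF",  # hypothetical; the real repo_id sits above the diffed hunk
    filename=MODEL_FILE,
    local_dir="./models",
)

@spaces.GPU(duration=110)  # ZeroGPU window trimmed from 120 s to 110 s
def respond(message, history, system_message, max_tokens,
            temperature, top_p, top_k, repeat_penalty):
    # Post-commit, the model is built per request, inside the GPU window,
    # instead of once at import time.
    llm = Llama(
        model_path=f"models/{MODEL_FILE}",
        flash_attn=True,
        n_gpu_layers=81,   # was 90 before the commit
        n_batch=1024,      # was 1536
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
    )
    # Sampling settings mirror the sliders exposed by the Gradio UI.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_p = top_p
    settings.top_k = top_k
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    outputs = ""
    # Chat-history wiring (BasicChatHistory) omitted for brevity.
    for output in agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
        print_output=False,
    ):
        outputs += output
        yield outputs
```

Building the Llama inside the @spaces.GPU-decorated function means the weights are reloaded on every request, but it keeps all CUDA work inside the window ZeroGPU grants the call, which is the usual reason for this refactor on Spaces.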