Llama-3.2-1b-CPU

Running

App Files Files Community

KingNish committed on Sep 26

Commit

4fde3b2

•

1 Parent(s): 7ebbc54

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -38

app.py CHANGED Viewed

@@ -9,8 +9,10 @@ from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
 from huggingface_hub import hf_hub_download
 llm = None
 llm_model = None
 # Download the new model
 hf_hub_download(
@@ -22,6 +24,31 @@ hf_hub_download(
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -34,29 +61,13 @@ def respond(
     repeat_penalty,
 ):
     global llm
-    global llm_model
     chat_template = get_messages_formatter_type(model)
-    if llm is None or llm_model != model:
-        llm = Llama(
-            model_path=f"models/{model}",
-            n_gpu_layers=0,  # Adjust based on your GPU
-            n_batch=32398,     # Adjust based on your RAM
-            n_ctx=512,      # Adjust based on your RAM and desired context length
-        )
-        llm_model = model
-    provider = LlamaCppPythonProvider(llm)
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template,
-        debug_output=True
-    )
-    settings = provider.get_provider_default_settings()
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
@@ -118,30 +129,20 @@ demo = gr.ChatInterface(
             value="llama-3.2-1b-instruct-q4_k_m.gguf",
             label="Model"
         ),
-        gr.TextArea(value="""You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta. Your capabilities include:
-1. Complex reasoning and problem-solving
-2. Multilingual understanding and generation
-3. Creative and analytical writing
-4. Code understanding and generation
-5. Task decomposition and step-by-step guidance
-6. Summarization and information extraction
-Always strive for accuracy, clarity, and helpfulness in your responses. If you're unsure about something, express your uncertainty. Use the following format for your responses:
-""", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=2.0,
-            value=0.95,
             step=0.05,
             label="Top-p",
         ),
         gr.Slider(
             minimum=0,
             maximum=100,
-            value=40,
             step=1,
             label="Top-k",
         ),
@@ -181,11 +182,11 @@ Always strive for accuracy, clarity, and helpfulness in your responses. If you'r
         ["Explain the difference between machine learning and deep learning."],
         ["Summarize the key points of climate change and its global impact."],
         ["Explain quantum computing to a 10-year-old."],
-        ["Design a step-by-step meal plan for someone trying to lose weight and build muscle."]
     ],
     cache_examples=False,
-    autofocus=False,
-    concurrency_limit=None
 )
 if __name__ == "__main__":

 import gradio as gr
 from huggingface_hub import hf_hub_download
+# Global variables to store the model and agent
 llm = None
 llm_model = None
+agent = None
 # Download the new model
 hf_hub_download(
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
+def load_model(model_path):
+    """Return the shared ``Llama`` instance for ``model_path``, loading it on demand.
+    The instance lives in the module-level ``llm``; ``llm_model`` records the
+    path it was loaded from. A new instance is created only when nothing is
+    loaded yet or a different path is requested.
+    """
+    global llm, llm_model
+    already_loaded = llm is not None and llm_model == model_path
+    if already_loaded:
+        return llm
+    llm = Llama(
+        model_path=model_path,
+        n_gpu_layers=0,  # Adjust based on your GPU
+        n_batch=32398,  # Adjust based on your RAM
+        n_ctx=512,  # Adjust based on your RAM and desired context length
+    )
+    llm_model = model_path
+    return llm
+def load_agent(llm, system_message, chat_template):
+    """Return the shared ``LlamaCppAgent``, rebuilding it when its inputs change.
+    The agent is cached in the module-level ``agent``. It is rebuilt when no
+    agent exists yet, or when ``llm``, ``system_message`` or ``chat_template``
+    differs from the values the cached agent was built with. Without that
+    check, a model or system-message change made after the first call would
+    be ignored and the stale agent returned.
+    Args:
+        llm: model object wrapped in a ``LlamaCppPythonProvider``.
+        system_message: text passed to the agent as ``system_prompt``.
+        chat_template: value passed as ``predefined_messages_formatter_type``.
+    Returns:
+        The cached or newly built agent.
+    """
+    global agent, _agent_inputs
+    # _agent_inputs is created lazily, so it may not exist on the first call.
+    built_with = globals().get("_agent_inputs")
+    is_stale = (
+        agent is None
+        or built_with is None
+        or built_with[0] is not llm  # a reloaded model is a different object
+        or built_with[1:] != (system_message, chat_template)
+    )
+    if is_stale:
+        provider = LlamaCppPythonProvider(llm)
+        agent = LlamaCppAgent(
+            provider,
+            system_prompt=system_message,
+            predefined_messages_formatter_type=chat_template,
+            debug_output=True,
+        )
+        _agent_inputs = (llm, system_message, chat_template)
+    return agent
 def respond(
     message,
     history: list[tuple[str, str]],
     repeat_penalty,
 ):
     global llm
+    global agent
     chat_template = get_messages_formatter_type(model)
+    llm = load_model(f"models/{model}")
+    agent = load_agent(llm, system_message, chat_template)
+    settings = agent.provider.get_provider_default_settings()
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
             value="llama-3.2-1b-instruct-q4_k_m.gguf",
             label="Model"
         ),
+        gr.TextArea(value="""You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta.""", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max tokens"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=2.0,
+            value=0.9,
             step=0.05,
             label="Top-p",
         ),
         gr.Slider(
             minimum=0,
             maximum=100,
+            value=1,
             step=1,
             label="Top-k",
         ),
         ["Explain the difference between machine learning and deep learning."],
         ["Summarize the key points of climate change and its global impact."],
         ["Explain quantum computing to a 10-year-old."],
+        ["Design a step-by-Step Meal Plan for Weight Loss and Muscle Gain."],
     ],
     cache_examples=False,
+    autofocus=False,
+    concurrency_limit=None
 )
 if __name__ == "__main__":