Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import subprocess
+import time
 from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
@@ -40,9 +41,9 @@ def respond(
     if llm is None or llm_model != model:
         llm = Llama(
             model_path=f"models/{model}",
-            n_gpu_layers=
-            n_batch=
-            n_ctx=
+            n_gpu_layers=4,  # Adjust based on your GPU
+            n_batch=64000,  # Adjust based on your RAM
+            n_ctx=1024,  # Adjust based on your RAM and desired context length
         )
         llm_model = model

@@ -77,6 +78,9 @@ def respond(
     messages.add_message(user)
     messages.add_message(assistant)

+    start_time = time.time()
+    token_count = 0
+
     stream = agent.get_chat_response(
         message,
         llm_sampling_settings=settings,
@@ -88,8 +92,15 @@
     outputs = ""
     for output in stream:
         outputs += output
+        token_count += len(output.split())
         yield outputs

+    end_time = time.time()
+    latency = end_time - start_time
+    speed = token_count / (end_time - start_time)
+    print(f"Latency: {latency} seconds")
+    print(f"Speed: {speed} tokens/second")
+
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
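A note on the measurement this commit adds: `len(output.split())` counts whitespace-separated words, not model tokens, so the reported tokens/second will understate the true rate with a subword tokenizer. Below is a minimal standalone sketch of the same timing pattern, assuming llama-cpp-python is installed and a GGUF file exists at the hypothetical path shown, using the model's own tokenizer for an exact count:

import time

from llama_cpp import Llama

# Hypothetical local model path; substitute any GGUF file you have.
llm = Llama(model_path="models/llama-3.2-1b-instruct-q4_k_m.gguf", n_ctx=1024)

start_time = time.time()
outputs = ""
for chunk in llm("Explain GGUF in one sentence.", max_tokens=64, stream=True):
    # Each streamed chunk carries a text fragment of the completion.
    outputs += chunk["choices"][0]["text"]

# Count tokens with the model's tokenizer instead of whitespace splitting.
token_count = len(llm.tokenize(outputs.encode("utf-8"), add_bos=False))
elapsed = time.time() - start_time
print(f"Latency: {elapsed:.2f} seconds")
print(f"Speed: {token_count / elapsed:.1f} tokens/second")

Note that in the generator version in the diff, the elapsed time also includes whatever the consumer does between yields, so the printed tokens/second reflects end-to-end throughput rather than pure model decoding speed.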