Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import subprocess
+import time
 from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
@@ -40,9 +41,9 @@ def respond(
     if llm is None or llm_model != model:
         llm = Llama(
             model_path=f"models/{model}",
-            n_gpu_layers=
-            n_batch=
-            n_ctx=
+            n_gpu_layers=4,  # Adjust based on your GPU
+            n_batch=64000,  # Adjust based on your RAM
+            n_ctx=1024,  # Adjust based on your RAM and desired context length
         )
         llm_model = model

@@ -77,6 +78,9 @@ def respond(
     messages.add_message(user)
     messages.add_message(assistant)

+    start_time = time.time()
+    token_count = 0
+
     stream = agent.get_chat_response(
         message,
         llm_sampling_settings=settings,
@@ -88,8 +92,15 @@
     outputs = ""
     for output in stream:
         outputs += output
+        token_count += len(output.split())
         yield outputs

+    end_time = time.time()
+    latency = end_time - start_time
+    speed = token_count / (end_time - start_time)
+    print(f"Latency: {latency} seconds")
+    print(f"Speed: {speed} tokens/second")
+
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
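A note on the measurement this commit adds: `len(output.split())` counts whitespace-separated words, not model tokens, so the reported tokens/second will understate the true rate with a subword tokenizer. Below is a minimal standalone sketch of the same timing pattern, assuming llama-cpp-python is installed and a GGUF file exists at the hypothetical path shown, using the model's own tokenizer for an exact count:

import time

from llama_cpp import Llama

# Hypothetical local model path; substitute any GGUF file you have.
llm = Llama(model_path="models/llama-3.2-1b-instruct-q4_k_m.gguf", n_ctx=1024)

start_time = time.time()
outputs = ""
for chunk in llm("Explain GGUF in one sentence.", max_tokens=64, stream=True):
    # Each streamed chunk carries a text fragment of the completion.
    outputs += chunk["choices"][0]["text"]

# Count tokens with the model's tokenizer instead of whitespace splitting.
token_count = len(llm.tokenize(outputs.encode("utf-8"), add_bos=False))
elapsed = time.time() - start_time
print(f"Latency: {elapsed:.2f} seconds")
print(f"Speed: {token_count / elapsed:.1f} tokens/second")

Note that in the generator version in the diff, the elapsed time also includes whatever the consumer does between yields, so the printed tokens/second reflects end-to-end throughput rather than pure model decoding speed.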