How to run this code locally?
Hi there!
I find this Space amazing!
I'd like to run this code, but using my local GGUF models.
What should be modified in app.py to achieve this?
I've put the models, in GGUF format, in the models directory.
I tried this:
import spaces
import logging
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.llm_output_settings import (
    LlmStructuredOutputSettings,
    LlmStructuredOutputType,
)
from llama_cpp_agent.tools import WebSearchTool
from llama_cpp_agent.prompt_templates import web_search_system_prompt, research_system_prompt
from ui import css, PLACEHOLDER
from utils import CitingSources
from settings import get_context_by_model, get_messages_formatter_type
# hf_hub_download(
#     repo_id="bartowski/Mistral-7B-Instruct-v0.3-GGUF",
#     filename="Mistral-7B-Instruct-v0.3-Q4_K_M.gguf",
#     local_dir="./models"
# )
# hf_hub_download(
#     repo_id="bartowski/Meta-Llama-3-8B-Instruct-GGUF",
#     filename="Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
#     local_dir="./models"
# )
# hf_hub_download(
#     repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF",
#     filename="mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
#     local_dir="./models"
# )
examples = [
    ["latest news about Yann LeCun"],
    ["Latest news site:github.blog"],
    ["Where I can find best hotel in Galapagos, Ecuador intitle:hotel"],
    ["filetype:pdf intitle:python"]
]
def write_message_to_user():
    """
    Let you write a message to the user.
    """
    return "Please write the message to the user."
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    model='Mistral-7B-Instruct-v0.3-Q4_K_M.gguf',
    system_message='Helpful assistant',
    max_tokens=2048,
    temperature=0.45,
    top_p=0.95,
    top_k=40,
    repeat_penalty=1.1,
):
    if "Mistral" in model:
        model = 'Mistral-7B-Instruct-v0.3-Q4_K_M.gguf'
    elif "Mixtral" in model:
        model = 'mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf'
    else:
        model = 'Meta-Llama-3-8B-Instruct-Q4_K_M.gguf'
    yield model
    chat_template = get_messages_formatter_type(model)
    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_gpu_layers=10,
        n_batch=1024,
        n_ctx=get_context_by_model(model),
    )
    provider = LlamaCppPythonProvider(llm)
    logging.info(f"Loaded chat examples: {chat_template}")
    search_tool = WebSearchTool(
        llm_provider=provider,
        message_formatter_type=chat_template,
        max_tokens_search_results=12000,
        max_tokens_per_summary=2048,
    )
    web_search_agent = LlamaCppAgent(
        provider,
        system_prompt=web_search_system_prompt,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )
    answer_agent = LlamaCppAgent(
        provider,
        system_prompt=research_system_prompt,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )
    settings = provider.get_provider_default_settings()
    settings.stream = False
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    output_settings = LlmStructuredOutputSettings.from_functions(
        [search_tool.get_tool()]
    )
    messages = BasicChatHistory()
    for msn in history:
        user = {"role": Roles.user, "content": msn[0]}
        assistant = {"role": Roles.assistant, "content": msn[1]}
        messages.add_message(user)
        messages.add_message(assistant)
    result = web_search_agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        structured_output_settings=output_settings,
        add_message_to_chat_history=False,
        add_response_to_chat_history=False,
        print_output=False,
    )
    outputs = ""
    settings.stream = True
    response_text = answer_agent.get_chat_response(
        f"Write a detailed and complete research document that fulfills the following user request: '{message}', based on the information from the web below.\n\n"
        + result[0]["return_value"],
        role=Roles.tool,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )
    for text in response_text:
        outputs += text
        yield outputs
    output_settings = LlmStructuredOutputSettings.from_pydantic_models(
        [CitingSources], LlmStructuredOutputType.object_instance
    )
    citing_sources = answer_agent.get_chat_response(
        "Cite the sources you used in your response.",
        role=Roles.tool,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=False,
        structured_output_settings=output_settings,
        print_output=False,
    )
    outputs += "\n\nSources:\n"
    outputs += "\n".join(citing_sources.sources)
    yield outputs
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown(
            [
                'Mistral 7B Instruct v0.3',
                'Mixtral 8x7b Instruct v0.1',
                'Llama 3 8B Instruct'
            ],
            value="Mistral 7B Instruct v0.3",
            label="Model"
        ),
        gr.Textbox(value=web_search_system_prompt, label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.45, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
    ],
    theme=gr.themes.Soft(
        primary_hue="green",
        secondary_hue="lime",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]
    ).set(
        body_background_fill_dark="#0c0505",
        block_background_fill_dark="#0c0505",
        block_border_width="1px",
        block_title_background_fill_dark="#1b0f0f",
        input_background_fill_dark="#140b0b",
        button_secondary_background_fill_dark="#140b0b",
        border_color_accent_dark="#1b0f0f",
        border_color_primary_dark="#1b0f0f",
        background_fill_secondary_dark="#0c0505",
        color_accent_soft_dark="transparent",
        code_background_fill_dark="#140b0b"
    ),
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    examples=examples,
    description="Llama-cpp-agent: Chat Web Search DDG Agent",
    analytics_enabled=False,
    chatbot=gr.Chatbot(
        scale=1,
        placeholder=PLACEHOLDER,
        show_copy_button=True
    )
)
if __name__ == "__main__":
    demo.launch()
But I get this error:
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/queueing.py", line 580, in process_events
response = await route_utils.call_process_api(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/route_utils.py", line 276, in call_process_api
output = await app.get_blocks().process_api(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/blocks.py", line 1928, in process_api
result = await self.call_function(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/blocks.py", line 1526, in call_function
prediction = await utils.async_iteration(iterator)
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 657, in async_iteration
return await iterator.__anext__()
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 783, in asyncgen_wrapper
response = await iterator.__anext__()
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/chat_interface.py", line 608, in _stream_fn
async for response in generator:
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 650, in __anext__
return await anyio.to_thread.run_sync(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
return await future
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
result = context.run(func, *args)
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 633, in run_sync_iterator_async
return next(iterator)
File "/whatever/ddg-web-search-chat/app.py", line 73, in respond
llm = Llama(
File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/llama_cpp/llama.py", line 279, in __init__
self.n_batch = min(n_ctx, n_batch) # ???
TypeError: '<' not supported between instances of 'int' and 'NoneType'
Hello!
Glad you like it!
Remove:
import spaces
@spaces.GPU(duration=120)
Those need a GPU. If you don't have one, you also need to adjust:
n_batch=1024,  # use 512
n_gpu_layers=10,
Your error looks like self.n_batch = min(n_ctx, n_batch) is getting a value that is not an int.
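For example, the Llama(...) call on a machine without a GPU could look roughly like this (just a sketch, tune the numbers to your hardware):

llm = Llama(
    model_path=f"models/{model}",
    n_gpu_layers=0,   # no GPU offload; keep a small number only if you have some VRAM
    n_batch=512,      # smaller batch size for CPU
    n_ctx=get_context_by_model(model),
)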
Thank you!
I made the modifications you proposed, but I still get the same error: TypeError: '<' not supported between instances of 'int' and 'NoneType'.
> Your error looks like self.n_batch = min(n_ctx, n_batch) is getting a value that is not an int.
Yes, I'm currently searching the web, and it looks like this is the problem.
But I don't know why!
What part of the code could pass a NoneType instead of an int?
I thought that if the code can run on a Space, it should also run on my computer in a Python venv.
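Since n_batch=1024 is a literal int in my code, the None can only be n_ctx, which comes from get_context_by_model(model). A quick check I can try (just a sketch, using the exact filename that respond() builds):

from settings import get_context_by_model

# If this prints None, Llama() receives n_ctx=None and
# min(n_ctx, n_batch) raises exactly the TypeError above.
print(get_context_by_model("Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"))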
Yeah, it should run. I would create a local repo on GitHub, but I currently don't have a GPU, which is why I use the humble HF Spaces, hehe.