"""Gradio chat demo that streams replies from a local Llama 3.2 1B GGUF model.

The model is downloaded from the Hugging Face Hub on first launch, loaded with
llama-cpp-python via LangChain's ``LlamaCpp`` wrapper, and served through a
``gr.ChatInterface`` with a pirate-persona system prompt.
"""

import gradio as gr
import langchain  # noqa: F401 -- kept from original; not referenced directly
import langchain_community  # noqa: F401 -- kept from original; not referenced directly
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_community.llms import LlamaCpp
from llama_cpp import Llama

# Local path where the quantized model file will live after download.
space_model_path = "./model/llama-3.2-1b-instruct-q8_0.gguf"
# Hub repo and file name of the Q8_0 GGUF quantization.
model_path = "hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF"
file_name = "llama-3.2-1b-instruct-q8_0.gguf"

# Fetch the model into ./model; from_pretrained reuses an existing local copy,
# so repeated launches do not re-download.
Llama.from_pretrained(repo_id=model_path, filename=file_name, local_dir="./model")

system_message = "You are a helpful assistant who acts like a pirate."

llm = LlamaCpp(
    model_path=space_model_path,
    temperature=0.8,
    max_tokens=250,
    top_p=0.6,
    verbose=True,
)


def stream_response(message, history):
    """Yield progressively longer partial replies for Gradio streaming.

    Args:
        message: The new user message (str, may be None on an empty submit).
        history: Prior turns as (human, ai) pairs -- Gradio's "tuples"
            chat-history format.

    Yields:
        str: The assistant reply accumulated so far, once per streamed chunk,
        so the UI updates token by token.
    """
    print(f"Input: {message}. History: {history}\n")

    # Rebuild the full conversation as LangChain messages, system prompt first.
    history_langchain_format = [SystemMessage(content=system_message)]
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    if message is not None:
        history_langchain_format.append(HumanMessage(content=message))

    # Stream tokens and re-yield the growing string -- Gradio replaces the
    # displayed message with each yielded value.
    partial_message = ""
    for response in llm.stream(history_langchain_format):
        partial_message += response
        yield partial_message


demo_interface = gr.ChatInterface(
    stream_response,
    textbox=gr.Textbox(
        placeholder="Send to the LLM...",
        container=False,
        autoscroll=True,
        scale=7,
    ),
)

# Guard the launch so importing this module (e.g. for testing) does not start
# the web server; running the script directly behaves as before.
if __name__ == "__main__":
    demo_interface.launch(share=False, debug=True)