"""Gradio chat front-end for a locally managed Ollama server.

On import this module downloads the Ollama binary to ``~/ollama`` if it is
not already there. The chat box doubles as a tiny command line: messages
starting with ``/`` are treated as commands (``/init``, ``/pull <model>``,
``/list``, ``/bye``); everything else is sent to the model through the
Ollama async client and streamed back.
"""

import asyncio
import os
import signal
import subprocess
import threading
import time

import gradio as gr
import ollama
from ollama import AsyncClient

OLLAMA = os.path.expanduser("~/ollama")
OLLAMA_DOWNLOAD_URL = "https://ollama.com/download/ollama-linux-amd64"
# Seconds to wait after spawning `ollama serve` before talking to it.
SERVE_STARTUP_DELAY = 10

# Popen handle of the running `ollama serve` (set by the service thread).
process = None
# Thread that spawned `ollama serve` and is blocked waiting on it.
OLLAMA_SERVICE_THREAD = None

if not os.path.exists(OLLAMA):
    # List-form argv: no shell involved, and the target path is the already
    # expanded OLLAMA. check=True surfaces a failed download here instead of
    # as a confusing chmod error on a missing file.
    subprocess.run(
        ["curl", "-L", OLLAMA_DOWNLOAD_URL, "-o", OLLAMA],
        check=True,
    )
    os.chmod(OLLAMA, 0o755)


def ollama_service_thread():
    """Spawn ``ollama serve`` and block until it exits.

    Intended as a ``threading.Thread`` target. The Popen handle is published
    in the module-level ``process`` so ``terminate`` can signal it.
    """
    global process
    # start_new_session=True puts the server in its own session/process
    # group (so terminate() can killpg it) without preexec_fn, which the
    # subprocess docs flag as unsafe when the parent has multiple threads.
    process = subprocess.Popen([OLLAMA, "serve"], start_new_session=True)
    process.wait()


def terminate():
    """Stop the Ollama server (if running) and reset the module state."""
    global process, OLLAMA_SERVICE_THREAD
    if process is not None and process.poll() is None:
        try:
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        except ProcessLookupError:
            # The server exited between the poll() check and the kill.
            pass
    if OLLAMA_SERVICE_THREAD:
        OLLAMA_SERVICE_THREAD.join()
    process = None
    OLLAMA_SERVICE_THREAD = None
    print("Ollama service stopped.")


# Uncomment and modify the model to what you want locally
# model = "moondream"
# model = os.environ.get("MODEL")
# subprocess.run(f"~/ollama pull {model}", shell=True)

client = AsyncClient(host='http://localhost:11434', timeout=120)

HF_TOKEN = os.environ.get("HF_TOKEN", None)

# NOTE(review): this value is passed to gr.HTML but contains no markup;
# any tags that once wrapped the title are not present here -- confirm.
TITLE = "\n\nOllama Chat\n\n"

DESCRIPTION = """

Feel free to test models with ollama.
First run please type /init to launch process.
Type /pull model_name to pull model.

"""

# Empty until /init has completed; stream_chat refuses to chat while falsy.
INIT_SIGN = ""


def launch():
    """Start the background thread that runs ``ollama serve``.

    Does nothing if a service thread is already alive, so repeated calls do
    not spawn a second server or overwrite the handle of the first one.
    """
    global OLLAMA_SERVICE_THREAD
    if OLLAMA_SERVICE_THREAD is not None and OLLAMA_SERVICE_THREAD.is_alive():
        return
    OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
    OLLAMA_SERVICE_THREAD.start()


def init():
    """Handle ``/init``: start the server, wait for it, mark init as done."""
    global INIT_SIGN
    launch()
    print("Giving ollama serve a moment")
    time.sleep(SERVE_STARTUP_DELAY)
    INIT_SIGN = "FINISHED"


def ollama_func(command):
    """Run a slash command typed into the chat box.

    Args:
        command: Raw message text, e.g. ``"/pull qwen2:0.5b"``. The first
            whitespace-separated word selects the command; the remainder
            (stripped) is its argument.

    Returns:
        A short status string to show in the chat.
    """
    # split(maxsplit=1) tolerates repeated spaces and arguments containing
    # spaces; a plain two-way unpack of split(" ") raised ValueError there.
    parts = command.split(maxsplit=1)
    name = parts[0] if parts else ""
    argument = parts[1].strip() if len(parts) > 1 else ""

    if name == "/pull" and not argument:
        return "Usage: /pull model_name"

    handlers = {
        "/init": init,
        "/pull": lambda: ollama.pull(argument),
        "/list": ollama.list,
        "/bye": terminate,
    }
    handler = handlers.get(name)
    if handler is None:
        return "No supported command."
    handler()
    return "Running..."


async def stream_chat(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
    """Chat handler for ``gr.ChatInterface``; yields the reply as it grows.

    Args:
        message: The new user message, or a slash command.
        history: Previous turns, iterated as ``(prompt, answer)`` pairs.
        model: Ollama model name to chat with.
        temperature: Passed as the ``temperature`` option.
        max_new_tokens: Passed as the ``num_predict`` option.
        top_p: Passed as the ``top_p`` option.
        top_k: Passed as the ``top_k`` option.
        penalty: Passed as the ``repeat_penalty`` option.

    Yields:
        For a slash command, the command's status string. Otherwise either
        a prompt to initialize Ollama, or the accumulated response text
        after each streamed chunk.
    """
    print(f"message: {message}")
    conversation = []
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
        ])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")

    if message.startswith("/"):
        # Commands block (a 10 s sleep for /init, a download for /pull), so
        # run them in a worker thread to keep the event loop responsive.
        loop = asyncio.get_running_loop()
        yield await loop.run_in_executor(None, ollama_func, message)
        return

    if not INIT_SIGN:
        yield "Please initialize Ollama"
        return

    if not process:
        # Server was stopped (e.g. via /bye) after init; bring it back up.
        launch()
        print("Giving ollama serve a moment")
        # Non-blocking wait: time.sleep here would stall the event loop.
        await asyncio.sleep(SERVE_STARTUP_DELAY)

    buffer = ""
    async for part in await client.chat(
        model=model,
        stream=True,
        messages=conversation,
        keep_alive="60s",
        options={
            'num_predict': max_new_tokens,
            'temperature': temperature,
            'top_p': top_p,
            'top_k': top_k,
            'repeat_penalty': penalty,
            'low_vram': True,
        },
    ):
        buffer += part['message']['content']
        yield buffer


chatbot = gr.Chatbot(height=600, placeholder=DESCRIPTION)

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="qwen2:0.5b",
                label="Model",
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.8,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=2048,
                step=1,
                value=1024,
                label="Max New Tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.8,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                step=0.1,
                value=1.0,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=[
            ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
            ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
            ["Tell me a random fun fact about the Roman Empire."],
            ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
        ],
        cache_examples=False,
    )


if __name__ == "__main__":
    demo.launch()