Spaces: Runtime error
Radamés Ajna
committed on
Commit • eee4d14
0 Parent(s):
Duplicate from radames/llama-cpp-python-cuda-gradio
- .gitattributes +35 -0
- Dockerfile +45 -0
- README.md +11 -0
- app.py +70 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,45 @@
+ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+FROM nvidia/cuda:${CUDA_IMAGE}
+
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST 0.0.0.0
+
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y git build-essential \
+    python3 python3-pip gcc wget \
+    ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev \
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+COPY . .
+
+# Set build-related env vars
+ENV CUDA_DOCKER_ARCH=all
+ENV LLAMA_CUBLAS=1
+
+# Install dependencies
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings gradio huggingface_hub hf_transfer
+
+# Install llama-cpp-python (built with CUDA)
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+
+WORKDIR $HOME/app
+
+# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+COPY --chown=user . $HOME/app
+
+CMD ["python3", "app.py"]
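The ENV LLAMA_CUBLAS=1 setting and the CMAKE_ARGS="-DLLAMA_CUBLAS=on" flag build llama-cpp-python against cuBLAS so inference can be offloaded to the GPU. Below is a minimal sketch of one way to exercise that CUDA build from inside the resulting image; the model path and n_gpu_layers value are illustrative assumptions, not part of this commit.

# Sketch only: assumes the image built from the Dockerfile above and a GGML model
# file already present at a local path (path and layer count are hypothetical).
from llama_cpp import Llama

llm = Llama(
    model_path="/home/user/app/WizardLM-7B-uncensored.ggmlv3.q4_0.bin",
    n_ctx=2048,
    n_gpu_layers=32,  # offload transformer layers to the GPU; effective only with the cuBLAS build
)
print(llm("Q: What does cuBLAS accelerate? A:", max_tokens=32)["choices"][0]["text"])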
README.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: Llama Cpp Python Cuda
+emoji: 🏆
+colorFrom: pink
+colorTo: indigo
+sdk: docker
+pinned: false
+duplicated_from: radames/llama-cpp-python-cuda-gradio
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,70 @@
+import gradio as gr
+import copy
+import time
+import ctypes  # to call the C API directly
+import llama_cpp
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download  # load model files from the Hugging Face Hub
+
+
+llm = Llama(
+    model_path=hf_hub_download(
+        repo_id="TheBloke/WizardLM-7B-uncensored-GGML",
+        filename="WizardLM-7B-uncensored.ggmlv3.q4_0.bin",
+    ),
+    n_ctx=2048,
+)  # download the model from the Hub; n_ctx=2048 for a larger context window
+
+history = []
+
+pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
+
+
+def generate_text(input_text, history):
+
+    temp = ""
+    if history == []:
+        input_text_with_history = (
+            f"SYSTEM:{pre_prompt}"
+            + "\n"
+            + f"USER: {input_text} "
+            + "\n"
+            + " ASSISTANT:"
+        )
+    else:
+        input_text_with_history = f"{history[-1][1]}" + "\n"
+        input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
+
+    output = llm(
+        input_text_with_history,
+        max_tokens=1024,
+        stop=[
+            "<|prompter|>",
+            "<|endoftext|>",
+            "<|endoftext|> \n",
+            "ASSISTANT:",
+            "USER:",
+            "SYSTEM:",
+        ],
+        stream=True,
+    )
+    for out in output:
+        stream = copy.deepcopy(out)
+        temp += stream["choices"][0]["text"]
+        yield temp
+
+    history = ["init", input_text_with_history]
+
+
+demo = gr.ChatInterface(
+    generate_text,
+    title="llama-cpp-python on GPU",
+    description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to implement",
+    examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
+    cache_examples=True,
+    retry_btn=None,
+    undo_btn="Delete Previous",
+    clear_btn="Clear",
+)
+demo.queue(concurrency_count=1, max_size=5)
+demo.launch()
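For reference, here is a minimal sketch of how the streaming call inside generate_text behaves when consumed directly, assuming the same llm instance created in app.py; the prompt string below is an illustrative example, not part of the commit.

# Sketch only: `llm` is assumed to be the Llama instance constructed in app.py.
prompt = "SYSTEM: The user and the AI are having a conversation :\nUSER: Hello\n ASSISTANT:"

partial = ""
for chunk in llm(prompt, max_tokens=64, stop=["USER:", "SYSTEM:"], stream=True):
    partial += chunk["choices"][0]["text"]  # each streamed chunk carries a small text delta
    print(partial)  # cumulative text, mirroring the `yield temp` that ChatInterface renders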