# syntax=docker/dockerfile:1
# CPU-only vLLM image: Ubuntu 22.04 + gcc-12 toolchain, static oneDNN,
# pinned IPEX dev wheel, vLLM built from the build context as a non-root user.

FROM ubuntu:22.04 AS base

# Must be declared as an ARG before it can be interpolated into ENV below;
# set at build time with: --build-arg VLLM_CPU_DISABLE_AVX512=true
ARG VLLM_CPU_DISABLE_AVX512=""

# Cache/runtime locations live under the build user's home — the build and
# runtime run as "vllmuser" (created below), which cannot write to /root.
ENV CCACHE_DIR=/home/vllmuser/.cache/ccache \
    CMAKE_CXX_COMPILER_LAUNCHER=ccache \
    PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu \
    LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" \
    TRANSFORMERS_CACHE="/home/vllmuser/.cache/huggingface/hub" \
    VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

# OS toolchain and runtime libraries (sorted). --no-install-recommends keeps
# the image smaller; apt lists are removed in the same layer that created them.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y && \
    apt-get install -y --no-install-recommends \
        ccache \
        cmake \
        curl \
        ffmpeg \
        g++-12 \
        gcc-12 \
        git \
        libgl1 \
        libnuma-dev \
        libsm6 \
        libtcmalloc-minimal4 \
        libxext6 \
        numactl \
        python3 \
        python3-pip \
        vim \
        wget && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 \
        --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
    rm -rf /var/lib/apt/lists/*

# intel-openmp provides the libiomp5.so referenced by LD_PRELOAD above.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp

# Build and install oneDNN (static, inference-only, matmul primitive) while we
# are still root: "cmake --target install" writes to /usr/local.
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git /opt/oneDNN
RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B /opt/oneDNN/build -S /opt/oneDNN -G "Unix Makefiles" \
        -DONEDNN_LIBRARY_TYPE=STATIC \
        -DONEDNN_BUILD_DOC=OFF \
        -DONEDNN_BUILD_EXAMPLES=OFF \
        -DONEDNN_BUILD_TESTS=OFF \
        -DONEDNN_BUILD_GRAPH=OFF \
        -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
        -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build /opt/oneDNN/build --target install --config Release

# Drop to an unprivileged user for the Python/vLLM build and at runtime.
# NOTE(review): the first useradd on ubuntu:22.04 gets uid/gid 1000 — the
# uid/gid on the cache mounts below assume that; confirm if the base changes.
RUN useradd -ms /bin/bash vllmuser
USER vllmuser
WORKDIR /home/vllmuser

# User-level pip installs console scripts (the "vllm" entry point used by CMD)
# into ~/.local/bin, which is not on the default PATH.
ENV PATH="/home/vllmuser/.local/bin:${PATH}"

# Pinned IPEX CPU development wheel (cp310 matches Ubuntu 22.04's python3).
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

RUN git clone https://github.com/vllm-project/vllm.git
WORKDIR /home/vllmuser/vllm

# Requirement manifests first so the dependency layers stay cached when only
# application source changes. --chown keeps the tree writable by vllmuser.
COPY --chown=vllmuser:vllmuser requirements-build.txt requirements-build.txt
COPY --chown=vllmuser:vllmuser requirements-common.txt requirements-common.txt
COPY --chown=vllmuser:vllmuser requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/home/vllmuser/.cache/pip,uid=1000,gid=1000 \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt
RUN --mount=type=cache,target=/home/vllmuser/.cache/pip,uid=1000,gid=1000 \
    pip install -r requirements-cpu.txt

# Full source last (the most frequently changing layer). Without --chown the
# tree would be root-owned and setup.py could not write build/ or dist/.
COPY --chown=vllmuser:vllmuser ./ /home/vllmuser/vllm/

# NOTE(review): -march=native ties the wheel to the build host's CPU and
# -ffast-math relaxes IEEE FP semantics — deliberate performance choices here,
# but they make the resulting image non-portable and non-reproducible.
ENV CFLAGS="-O3 -march=native -ffast-math -fopenmp" \
    CXXFLAGS="-O3 -march=native -ffast-math -fopenmp"
RUN --mount=type=cache,target=/home/vllmuser/.cache/pip,uid=1000,gid=1000 \
    --mount=type=cache,target=/home/vllmuser/.cache/ccache,uid=1000,gid=1000 \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist

# Convenience symlinks in $HOME. Test the *destination* before linking — the
# original tested the source directory, so links were created only when the
# source was missing (always dangling) and never when it existed.
RUN for d in tests examples benchmarks; do \
        [ -e /home/vllmuser/$d ] || ln -s /home/vllmuser/vllm/$d /home/vllmuser/$d; \
    done

RUN --mount=type=cache,target=/home/vllmuser/.cache/pip,uid=1000,gid=1000 \
    pip install "ray[serve]"

# Documentation only; does not publish ports.
EXPOSE 8000 7860

# Exec form: vllm is PID 1 and receives SIGTERM directly from `docker stop`.
# (The original "bash -c '... & wait'" hid the server behind a shell.)
CMD ["vllm", "serve", "llava-hf/llava-interleave-qwen-0.5b-hf", "--port", "7860"]