diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 4e89bb3057c5..9270b48c54d4 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
 RUN microdnf install -y \
     which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
     libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
+    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \
     microdnf clean all
 
 # Python Installation
@@ -136,6 +136,71 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     mkdir -p /tmp/hf-xet/dist && \
     cp dist/*.whl /tmp/hf-xet/dist/
 
+# Build numba
+FROM python-install AS numba-builder
+
+ARG MAX_JOBS
+ARG NUMBA_VERSION=0.61.2
+
+WORKDIR /tmp
+
+# Clone all required dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+    microdnf install ninja-build gcc gcc-c++ -y && \
+    git clone --recursive https://github.com/llvm/llvm-project.git -b llvmorg-15.0.7 && \
+    git clone --recursive https://github.com/numba/llvmlite.git -b v0.44.0 && \
+    git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
+    cd llvm-project && mkdir build && cd build && \
+    uv pip install 'cmake<4' setuptools numpy && \
+    export PREFIX=/usr/local && CMAKE_ARGS="${CMAKE_ARGS} -DLLVM_ENABLE_PROJECTS=lld;libunwind;compiler-rt" \
+    CFLAGS="$(echo $CFLAGS | sed 's/-fno-plt //g')" \
+    CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fno-plt //g')" \
+    CMAKE_ARGS="${CMAKE_ARGS} -DFFI_INCLUDE_DIR=$PREFIX/include" \
+    CMAKE_ARGS="${CMAKE_ARGS} -DFFI_LIBRARY_DIR=$PREFIX/lib" \
+    cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_LIBRARY_PATH="${PREFIX}" \
+        -DLLVM_ENABLE_LIBEDIT=OFF \
+        -DLLVM_ENABLE_LIBXML2=OFF \
+        -DLLVM_ENABLE_RTTI=ON \
+        -DLLVM_ENABLE_TERMINFO=OFF \
+        -DLLVM_INCLUDE_BENCHMARKS=OFF \
+        -DLLVM_INCLUDE_DOCS=OFF \
+        -DLLVM_INCLUDE_EXAMPLES=OFF \
+        -DLLVM_INCLUDE_GO_TESTS=OFF \
+        -DLLVM_INCLUDE_TESTS=OFF \
+        -DLLVM_INCLUDE_UTILS=ON \
+        -DLLVM_INSTALL_UTILS=ON \
+        -DLLVM_UTILS_INSTALL_DIR=libexec/llvm \
+        -DLLVM_BUILD_LLVM_DYLIB=OFF \
+        -DLLVM_LINK_LLVM_DYLIB=OFF \
+        -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly \
+        -DLLVM_ENABLE_FFI=ON \
+        -DLLVM_ENABLE_Z3_SOLVER=OFF \
+        -DLLVM_OPTIMIZED_TABLEGEN=ON \
+        -DCMAKE_POLICY_DEFAULT_CMP0111=NEW \
+        -DCOMPILER_RT_BUILD_BUILTINS=ON \
+        -DCOMPILER_RT_BUILTINS_HIDE_SYMBOLS=OFF \
+        -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
+        -DCOMPILER_RT_BUILD_CRT=OFF \
+        -DCOMPILER_RT_BUILD_MEMPROF=OFF \
+        -DCOMPILER_RT_BUILD_PROFILE=OFF \
+        -DCOMPILER_RT_BUILD_SANITIZERS=OFF \
+        -DCOMPILER_RT_BUILD_XRAY=OFF \
+        -DCOMPILER_RT_BUILD_GWP_ASAN=OFF \
+        -DCOMPILER_RT_BUILD_ORC=OFF \
+        -DCOMPILER_RT_INCLUDE_TESTS=OFF \
+        ${CMAKE_ARGS} -GNinja ../llvm \
+
+    && ninja install . && \
+    # build llvmlite
+    cd ../../llvmlite && python setup.py bdist_wheel && \
+    cd ../numba && \
+    if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
+        sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
+    fi && python setup.py bdist_wheel
+
+
 # Final build stage
 FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
@@ -163,23 +228,30 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
     --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
+    --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
+    --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
     sed -i '/^torch/d' requirements/build.txt && \
-    ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
-    VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
-    HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
-    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
+    ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
+    VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
+    HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
+    LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
+    NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
     uv pip install -v \
         $ARROW_WHL_FILE \
         $VISION_WHL_FILE \
        $HF_XET_WHL_FILE \
         $TORCH_WHL_FILE \
+        $LLVM_WHL_FILE \
+        $NUMBA_WHL_FILE \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \
-        -r requirements/cpu.txt
+        -r requirements/cpu.txt
+
 
 # Build and install vllm
 RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
+    VLLM_TARGET_DEVICE=cpu VLLM_CPU_MOE_PREPACK=0 python setup.py bdist_wheel && \
     uv pip install "$(echo dist/*.whl)[tensorizer]"
 
 # setup non-root user for vllm
@@ -196,4 +268,3 @@ WORKDIR /home/vllm
 
 # Set the default entrypoint
 ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
-
diff --git a/requirements/common.txt b/requirements/common.txt
index 1a8fea0dd7d9..6bc71df24f0e 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -20,7 +20,8 @@ prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
-outlines_core == 0.2.10
+outlines_core == 0.2.10 ; platform_machine != "s390x"
+outlines == 0.1.11 ; platform_machine == "s390x"
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 6860275acab6..f4b95b72898c 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -1,8 +1,8 @@
 # Common dependencies
 -r common.txt
 
-numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61.2; python_version > '3.9'
+numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
 
 # Dependencies for CPUs
 packaging>=24.2
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 6fc894827c4a..679905aed9ec 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1076,12 +1076,13 @@ def create_engine_config(
         # Set default arguments for V0 or V1 Engine.
         if use_v1:
             self._set_default_args_v1(usage_context, model_config)
-            # Disable chunked prefill for POWER (ppc64le)/ARM CPUs in V1
+            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
             if current_platform.is_cpu(
             ) and current_platform.get_cpu_architecture() in (
-                    CpuArchEnum.POWERPC, CpuArchEnum.ARM):
+                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
                 logger.info(
-                    "Chunked prefill is not supported for ARM and POWER CPUs; "
+                    "Chunked prefill is not supported for ARM, POWER, "
+                    "and S390X CPUs; "
                     "disabling it for V1 backend.")
                 self.enable_chunked_prefill = False
         else:
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index fe258f76b9d7..c748595a7153 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -332,5 +332,6 @@ def default_v1(cls, model_config) -> bool:
         supplied model configuration.
         """
         arch = cls.get_cpu_architecture()
-        return (cls.supports_v1(model_config) and arch
-                in (CpuArchEnum.X86, CpuArchEnum.POWERPC, CpuArchEnum.ARM))
+        return (cls.supports_v1(model_config)
+                and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC,
+                             CpuArchEnum.ARM, CpuArchEnum.S390X))
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 4017f1ca7eec..40334375b83a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -81,6 +81,7 @@ class CpuArchEnum(enum.Enum):
     X86 = enum.auto()
     ARM = enum.auto()
     POWERPC = enum.auto()
+    S390X = enum.auto()
     OTHER = enum.auto()
     UNKNOWN = enum.auto()
 
@@ -377,6 +378,8 @@ def get_cpu_architecture(cls) -> CpuArchEnum:
             return CpuArchEnum.ARM
         elif machine.startswith("ppc"):
             return CpuArchEnum.POWERPC
+        elif machine == "s390x":
+            return CpuArchEnum.S390X
 
         return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
 
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index f83d6804840e..be78597926e0 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -43,8 +43,9 @@ def init_device(self):
         # Setup OpenMP threads affinity.
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
         if omp_cpuids == "auto" and platform.system() == "Linux":
-            if current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC:
-                # For POWERPC SMT-8/4/2
+            cpu_arch = current_platform.get_cpu_architecture()
+            if cpu_arch in (CpuArchEnum.POWERPC, CpuArchEnum.S390X):
+                # For S390X/POWERPC SMT-8/4/2
                 self.local_omp_cpuid = self._get_autobind_cpu_ids(
                     lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4])
             elif current_platform.get_cpu_architecture() == CpuArchEnum.X86:
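
Note (editorial addition, not part of the patch): the requirements changes
above rely on PEP 508 environment markers so that s390x installs outlines
instead of outlines_core and skips the pinned numba wheels (numba is built
from source in the numba-builder stage instead). Below is a minimal sketch of
how such markers evaluate, assuming the `packaging` library is available; the
environment dict is a hypothetical stand-in for a real s390x host:

    from packaging.markers import Marker

    # Markers as written in requirements/common.txt and requirements/cpu.txt.
    outlines_core = Marker('platform_machine != "s390x"')
    outlines = Marker('platform_machine == "s390x"')
    numba_pin = Marker("python_version > '3.9' and platform_machine != 's390x'")

    # Simulate dependency resolution on an s390x host running Python 3.12.
    env = {"platform_machine": "s390x", "python_version": "3.12"}
    assert not outlines_core.evaluate(env)  # outlines_core == 0.2.10 skipped
    assert outlines.evaluate(env)           # outlines == 0.1.11 installed
    assert not numba_pin.evaluate(env)      # pinned numba skipped on s390x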