From 3a2a4e23905e93e6c9d16943d391c575072c6b8f Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Mon, 4 May 2026 15:50:20 +0200 Subject: [PATCH 1/4] fix: install RIXL wheel in final stage of Dockerfile.rocm Signed-off-by: simondanielsson --- docker/Dockerfile.rocm | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 0ed12f11da94..353d3e7aebac 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -517,6 +517,22 @@ COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker +# Install RIXL wheel +RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ + uv pip install --system /rixl_install/*.whl + +# RIXL/NixlConnector runtime dependencies (RDMA userspace libraries) +RUN apt-get update -q -y && apt-get install -q -y \ + librdmacm1 \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + && rm -rf /var/lib/apt/lists/* + +# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc +# See: https://github.com/ROCm/rocm-libraries/issues/6266 +ENV HSA_ENABLE_IPC_MODE_LEGACY=1 + ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time From cd4a3209eab1d943d492d36f97e24b379f4e77c1 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Mon, 4 May 2026 16:05:22 +0200 Subject: [PATCH 2/4] fix: add --no-install-recommends and move install before COPYs Signed-off-by: simondanielsson --- docker/Dockerfile.rocm | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 353d3e7aebac..b8ca0392deec 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -512,23 +512,23 @@ ARG BASE_IMAGE ARG NIC_BACKEND ARG AINIC_VERSION -# Copy over the benchmark scripts as well -COPY --from=export_vllm
/benchmarks ${COMMON_WORKDIR}/vllm/benchmarks -COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker - -# Install RIXL wheel -RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ - uv pip install --system /rixl_install/*.whl - # RIXL/NixlConnector runtime dependencies (RDMA userspace libraries) -RUN apt-get update -q -y && apt-get install -q -y \ +RUN apt-get update -q && apt-get install -q -y --no-install-recommends \ librdmacm1 \ libibverbs1 \ ibverbs-providers \ ibverbs-utils \ && rm -rf /var/lib/apt/lists/* +# Install RIXL wheel +RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ + uv pip install --system /rixl_install/*.whl + +# Copy over the benchmark scripts as well +COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks +COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker + # Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc # See: https://github.com/ROCm/rocm-libraries/issues/6266 ENV HSA_ENABLE_IPC_MODE_LEGACY=1 From 8d9b6ecac2eb82ab375f1d8eb8575cec8d07a852 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Mon, 4 May 2026 20:05:03 +0200 Subject: [PATCH 3/4] docs: update nixl documentation with path to correct image Signed-off-by: simondanielsson --- docs/features/nixl_connector_usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index ea668615153c..e7ee3a64008c 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -13,7 +13,7 @@ Install the NIXL library: `uv pip install nixl`, as a quick start on Nvidia plat - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions - The specified required NIXL version can be found 
in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files -For ROCm platform, the [base ROCm docker file](../../docker/Dockerfile.rocm_base) includes RIXL and ucx already. +For ROCm platform, the [ROCm docker file](../../docker/Dockerfile.rocm) includes RIXL and ucx already. - Refer to [RIXL official repository](https://github.com/rocm/rixl) for more information - The supportive libraries for RIXL can be found in [requirements/kv_connectors_rocm.txt](../../requirements/kv_connectors_rocm.txt) From c94ffdf5f607662c74c323e41fb2dbe0d637fe27 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Tue, 5 May 2026 18:49:08 +0200 Subject: [PATCH 4/4] fix: remove redundant RDMA runtime deps and move RIXL installation to earlier Signed-off-by: simondanielsson --- docker/Dockerfile.rocm | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index b8ca0392deec..92ac17bcd7e1 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -507,23 +507,15 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && pip uninstall -y vllm \ && uv pip install --system *.whl +# Install RIXL wheel +RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ + uv pip install --system /rixl_install/*.whl + ARG COMMON_WORKDIR ARG BASE_IMAGE ARG NIC_BACKEND ARG AINIC_VERSION -# RIXL/NixlConnector runtime dependencies (RDMA userspace libraries) -RUN apt-get update -q && apt-get install -q -y --no-install-recommends \ - librdmacm1 \ - libibverbs1 \ - ibverbs-providers \ - ibverbs-utils \ - && rm -rf /var/lib/apt/lists/* - -# Install RIXL wheel -RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ - uv pip install --system /rixl_install/*.whl - # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples
${COMMON_WORKDIR}/vllm/examples