From a63939c8f0339d55702fb268610b47f4e9b2ea9b Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 02:49:31 +0000 Subject: [PATCH 1/6] [AMD CI] upgrade cache-dit to 1.3.0 in install script The AMD CI base image ships cache-dit==1.1.8, but multimodal_gen requires >=1.2.0 (uses cache_dit.parallelism.ParallelismBackend.AUTO, added in 1.2.0) and pyproject.toml pins ==1.3.0. The previous 'pip install cache-dit' (no --upgrade, no version) was a no-op against the preinstalled 1.1.8, which caused every multimodal-gen-test-{1,2}-gpu-amd[-rocm720] job to fail with: RuntimeError: cache-dit>=1.2.0 is required for --cache-dit-config. AttributeError: ParallelismBackend.AUTO Pin to 1.3.0 (matching python/pyproject.toml) and force --upgrade so pip replaces the image's stale 1.1.8. Co-authored-by: Bingxu Chen --- scripts/ci/amd/amd_ci_install_dependency.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/ci/amd/amd_ci_install_dependency.sh b/scripts/ci/amd/amd_ci_install_dependency.sh index 2ce9bf8b487e..aae3a9c5b775 100755 --- a/scripts/ci/amd/amd_ci_install_dependency.sh +++ b/scripts/ci/amd/amd_ci_install_dependency.sh @@ -168,8 +168,11 @@ EOF docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet] docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest - # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204) - docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed" + # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204). + # Pin to match python/pyproject.toml; --upgrade is required because the AMD CI + # image ships cache-dit==1.1.8, which lacks the `cache_dit.parallelism` module + # and `ParallelismBackend.AUTO` used by multimodal_gen/runtime/cache/cache_dit_integration.py. 
+ docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade 'cache-dit==1.3.0' || echo "cache-dit installation failed" # Install accelerate for distributed training and inference support docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed" From fba6ac9f2903f78720ce82cd15ec92ba26be086a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 02:54:50 +0000 Subject: [PATCH 2/6] [AMD CI] trim verbose cache-dit install comments Co-authored-by: Bingxu Chen --- scripts/ci/amd/amd_ci_install_dependency.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/ci/amd/amd_ci_install_dependency.sh b/scripts/ci/amd/amd_ci_install_dependency.sh index aae3a9c5b775..0f12879494be 100755 --- a/scripts/ci/amd/amd_ci_install_dependency.sh +++ b/scripts/ci/amd/amd_ci_install_dependency.sh @@ -168,10 +168,7 @@ EOF docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet] docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest - # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204). - # Pin to match python/pyproject.toml; --upgrade is required because the AMD CI - # image ships cache-dit==1.1.8, which lacks the `cache_dit.parallelism` module - # and `ParallelismBackend.AUTO` used by multimodal_gen/runtime/cache/cache_dit_integration.py. 
+ # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204) docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade 'cache-dit==1.3.0' || echo "cache-dit installation failed" # Install accelerate for distributed training and inference support From 5a27b0e6bce4f9ffd5014e52fe1b78d570c41815 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 02:54:50 +0000 Subject: [PATCH 3/6] [AMD] pin cache-dit==1.3.0 in rocm.Dockerfile Match the install-script change so newly built ROCm images already ship cache-dit==1.3.0, instead of relying on the CI install script to upgrade the stale 1.1.8 from the base image at every job start. Co-authored-by: Bingxu Chen --- docker/rocm.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index f19f11e57f6a..a5e75253988f 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -258,7 +258,8 @@ RUN pip install IPython \ && pip install orjson \ && pip install python-multipart \ && pip install torchao==0.9.0 \ - && pip install pybind11 + && pip install pybind11 \ + && pip install cache-dit==1.3.0 RUN pip uninstall -y sgl_kernel sglang RUN git clone ${SGL_REPO} \ From 723fdd3f2d4f357325c11329abe6a2ef45425b4d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 05:09:52 +0000 Subject: [PATCH 4/6] Revert "[AMD] pin cache-dit==1.3.0 in rocm.Dockerfile" This reverts the previous Dockerfile change after review pointed out it had no effect on the final image: the next RUN block (rocm.Dockerfile:281) installs 'python[srt_hip,diffusion_hip]', and the diffusion_hip extra in python/pyproject_other.toml pins cache-dit==1.1.8, so pip downgrades the 1.3.0 we just installed. The real fix is to bump the pin in the toml files (next two commits). 
Co-authored-by: Bingxu Chen --- docker/rocm.Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index a5e75253988f..f19f11e57f6a 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -258,8 +258,7 @@ RUN pip install IPython \ && pip install orjson \ && pip install python-multipart \ && pip install torchao==0.9.0 \ - && pip install pybind11 \ - && pip install cache-dit==1.3.0 + && pip install pybind11 RUN pip uninstall -y sgl_kernel sglang RUN git clone ${SGL_REPO} \ From 4a50a07824cd9e734d47fe2509621a17df938707 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 05:10:00 +0000 Subject: [PATCH 5/6] [AMD] bump cache-dit 1.1.8 -> 1.3.0 in pyproject_other.toml diffusion_hip This is the real source of the cache-dit==1.1.8 in AMD CI images: the ROCm Dockerfile renames pyproject_other.toml -> pyproject.toml and installs the diffusion_hip extra, which used to pin 1.1.8. Bumping to 1.3.0 aligns with python/pyproject.toml and fixes R10 (multimodal-gen-test-{1,2}-gpu-amd[-rocm720]) at the source. Co-authored-by: Bingxu Chen --- python/pyproject_other.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml index d6f40474f26f..ad6f12e33053 100755 --- a/python/pyproject_other.toml +++ b/python/pyproject_other.toml @@ -103,7 +103,7 @@ diffusion_hip = [ "st_attn==0.0.7", "vsa==0.0.4", "runai_model_streamer>=0.15.5", - "cache-dit==1.1.8", + "cache-dit==1.3.0", ] # For Intel Gaudi(device : hpu) follow the installation guide From 2774de0f8b00d4c2f9e96abb4abc9011139bf194 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 11 May 2026 05:10:00 +0000 Subject: [PATCH 6/6] [AMD] bump cache-dit 1.1.8 -> 1.3.0 in amd-sglang wheel diffusion_hip Same fix in the amd-sglang wheel pyproject so wheel-built images stay in sync with python/pyproject.toml and python/pyproject_other.toml. 
Co-authored-by: Bingxu Chen --- 3rdparty/amd/wheel/sglang/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/amd/wheel/sglang/pyproject.toml b/3rdparty/amd/wheel/sglang/pyproject.toml index d04c3f3bb96c..5ebdc64816c6 100644 --- a/3rdparty/amd/wheel/sglang/pyproject.toml +++ b/3rdparty/amd/wheel/sglang/pyproject.toml @@ -110,7 +110,7 @@ diffusion_hip = [ "st_attn==0.0.7", "vsa==0.0.4", "runai_model_streamer>=0.15.5", - "cache-dit==1.1.8", + "cache-dit==1.3.0", "addict", ]