From f5d8dd14239bd16341bd4c760a84698586f6edac Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 12 Mar 2026 23:54:48 -0500
Subject: [PATCH 1/4] [CI] Add persistent cache mounts for all CI test
 downloads and media URLs

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       | 19 ++++++++
 tests/conftest.py                             |  8 ++--
 .../openai/test_token_in_token_out.py         |  7 +--
 tests/evals/gpt_oss/test_gpqa_correctness.py  |  4 +-
 tests/evals/gsm8k/gsm8k_eval.py               |  4 +-
 .../prithvi_io_processor/prithvi_processor.py | 24 ++++++++--
 .../test_extraction.py                        |  2 +-
 vllm/envs.py                                  |  5 ++
 vllm/multimodal/media/connector.py            | 47 +++++++++++++++++++
 9 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 8895771f0a40..c0e566cbe1f4 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -382,6 +382,18 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
+MODELSCOPE_CACHE="$(realpath ~)/modelscope"
+mkdir -p "${MODELSCOPE_CACHE}"
+MODELSCOPE_MOUNT="/root/.cache/modelscope"
+
+VLLM_TEST_CACHE="$(realpath ~)/vllm-test-cache"
+mkdir -p "${VLLM_TEST_CACHE}"
+VLLM_TEST_CACHE_MOUNT="/root/.cache/vllm-test-cache"
+
+VLLM_CACHE="$(realpath ~)/vllm-cache"
+mkdir -p "${VLLM_CACHE}"
+VLLM_CACHE_MOUNT="/root/.cache/vllm"
+
 # ---- Command source selection ----
 # Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
 # Fall back to $* for backward compatibility, but warn that inner
@@ -493,7 +505,14 @@ else
     -e AWS_ACCESS_KEY_ID \
     -e AWS_SECRET_ACCESS_KEY \
     -v "${HF_CACHE}:${HF_MOUNT}" \
+    -v "${MODELSCOPE_CACHE}:${MODELSCOPE_MOUNT}" \
+    -v "${VLLM_TEST_CACHE}:${VLLM_TEST_CACHE_MOUNT}" \
+    -v "${VLLM_CACHE}:${VLLM_CACHE_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
+    -e "MODELSCOPE_CACHE=${MODELSCOPE_MOUNT}" \
+    -e "VLLM_TEST_CACHE=${VLLM_TEST_CACHE_MOUNT}" \
+    -e "VLLM_CACHE_ROOT=${VLLM_CACHE_MOUNT}" \
+    -e "VLLM_MEDIA_CACHE=${VLLM_CACHE_MOUNT}/media_cache" \
     -e "PYTHONPATH=${MYPYTHONPATH}" \
     --name "${container_name}" \
     "${image_name}" \
diff --git a/tests/conftest.py b/tests/conftest.py
index 719bfa5ed1f0..a0f8e24816ef 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1288,10 +1288,10 @@ def num_gpus_available():
     return current_platform.device_count()
 
 
-temp_dir = tempfile.gettempdir()
-_dummy_opt_path = os.path.join(temp_dir, "dummy_opt")
-_dummy_llava_path = os.path.join(temp_dir, "dummy_llava")
-_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding")
+_test_cache_dir = os.environ.get("VLLM_TEST_CACHE", tempfile.gettempdir())
+_dummy_opt_path = os.path.join(_test_cache_dir, "dummy_opt")
+_dummy_llava_path = os.path.join(_test_cache_dir, "dummy_llava")
+_dummy_gemma2_embedding_path = os.path.join(_test_cache_dir, "dummy_gemma2_embedding")
 
 
 @pytest.fixture
diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py
index c7f8abe27e6e..d3b40655ee25 100644
--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@@ -1,9 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
-import tempfile
-
 import pytest
 
 from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
@@ -12,7 +9,7 @@
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "Qwen/Qwen3-0.6B"
-MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
+MODEL_PATH = MODEL_NAME
 
 
 @pytest.fixture(scope="module")
@@ -21,7 +18,7 @@ def server():
     MODEL_PATH = download_weights_from_hf(
         MODEL_NAME,
         allow_patterns=["*"],
-        cache_dir=MODEL_PATH,
+        cache_dir=None,
         ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"],
     )
     args = [
diff --git a/tests/evals/gpt_oss/test_gpqa_correctness.py b/tests/evals/gpt_oss/test_gpqa_correctness.py
index 63188ec40767..3fb6cb9ded3d 100644
--- a/tests/evals/gpt_oss/test_gpqa_correctness.py
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@@ -23,7 +23,9 @@
 TOL = 0.05  # Absolute tolerance for accuracy comparison
 
 # Path to tiktoken encoding files
-TIKTOKEN_DATA_DIR = Path(__file__).parent / "data"
+TIKTOKEN_DATA_DIR = (
+    Path(os.environ.get("VLLM_TEST_CACHE", str(Path(__file__).parent))) / "tiktoken"
+)
 
 # Tiktoken encoding files to download
 TIKTOKEN_FILES = {
diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py
index 647c149ef5fd..db9bbcb96e8a 100644
--- a/tests/evals/gsm8k/gsm8k_eval.py
+++ b/tests/evals/gsm8k/gsm8k_eval.py
@@ -25,7 +25,9 @@
 def download_and_cache_file(url: str, filename: str | None = None) -> str:
     """Download and cache a file from a URL."""
     if filename is None:
-        filename = os.path.join("/tmp", url.split("/")[-1])
+        cache_dir = os.environ.get("VLLM_TEST_CACHE", "/tmp")
+        os.makedirs(os.path.join(cache_dir, "gsm8k"), exist_ok=True)
+        filename = os.path.join(cache_dir, "gsm8k", url.split("/")[-1])
 
     if os.path.exists(filename):
         return filename
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index b22239fcc267..ba4c2c6c8c8b 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -3,6 +3,7 @@
 
 import base64
 import datetime
+import hashlib
 import os
 import tempfile
 import urllib.request
@@ -113,11 +114,24 @@ def read_geotiff(
 
         write_to_file = file_data
     elif file_path is not None and path_type == "url":
-        resp = urllib.request.urlopen(file_path)
-        # with tempfile.NamedTemporaryFile() as tmpfile:
-        #     tmpfile.write(resp.read())
-        #     path = tmpfile.name
-        write_to_file = resp.read()
+        # Cache URL downloads to avoid re-downloading in CI
+        cache_base = os.environ.get("VLLM_TEST_CACHE")
+        if cache_base:
+            cache_dir = os.path.join(cache_base, "prithvi")
+            os.makedirs(cache_dir, exist_ok=True)
+            url_hash = hashlib.sha256(file_path.encode()).hexdigest()[:16]
+            ext = os.path.splitext(file_path)[1] or ".tiff"
+            cached_path = os.path.join(cache_dir, f"{url_hash}{ext}")
+            if os.path.exists(cached_path):
+                path = cached_path
+            else:
+                resp = urllib.request.urlopen(file_path)
+                with open(cached_path, "wb") as f:
+                    f.write(resp.read())
+                path = cached_path
+        else:
+            resp = urllib.request.urlopen(file_path)
+            write_to_file = resp.read()
     elif file_path is not None and path_type == "path":
         path = file_path
     elif file_path is not None and path_type == "b64_json":
diff --git a/tests/v1/kv_connector/extract_hidden_states_integration/test_extraction.py b/tests/v1/kv_connector/extract_hidden_states_integration/test_extraction.py
index 6a8c64152fec..695e9ea9c00c 100644
--- a/tests/v1/kv_connector/extract_hidden_states_integration/test_extraction.py
+++ b/tests/v1/kv_connector/extract_hidden_states_integration/test_extraction.py
@@ -63,7 +63,7 @@ def predictable_llama_config_path(tmp_path_factory):
     # Create a simple tokenizer
     tokenizer = LlamaTokenizerFast.from_pretrained(
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        cache_dir=os.path.expanduser("~/.cache/huggingface"),
+        cache_dir=os.environ.get("HF_HOME", None),
     )
     tokenizer.save_pretrained(config_dir)
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 3b7312a4f378..8facacde4105 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -63,6 +63,7 @@
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MEDIA_CACHE: str = ""
     VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
     VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
@@ -759,6 +760,9 @@ def _get_or_set_default() -> str:
     "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
         os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
     ),
+    # Directory for caching media downloads (images, video, audio fetched
+    # from URLs during inference). Empty string disables caching.
+    "VLLM_MEDIA_CACHE": lambda: os.getenv("VLLM_MEDIA_CACHE", ""),
     # Whether to allow HTTP redirects when fetching from media URLs.
     # Default to True
     "VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
@@ -1750,6 +1754,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_IMAGE_FETCH_TIMEOUT",
         "VLLM_VIDEO_FETCH_TIMEOUT",
         "VLLM_AUDIO_FETCH_TIMEOUT",
+        "VLLM_MEDIA_CACHE",
         "VLLM_MEDIA_URL_ALLOW_REDIRECTS",
         "VLLM_MEDIA_LOADING_THREAD_COUNT",
         "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB",
diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 80aaa2a8293e..4bb66aa35333 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -3,6 +3,8 @@
 
 import asyncio
 import atexit
+import hashlib
+import os
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, TypeVar
@@ -16,6 +18,7 @@
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
+from vllm.logger import init_logger
 from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioEmbeddingMediaIO, AudioMediaIO
@@ -23,6 +26,8 @@
 from .image import ImageEmbeddingMediaIO, ImageMediaIO
 from .video import VideoMediaIO
 
+logger = init_logger(__name__)
+
 _M = TypeVar("_M")
 
 global_thread_pool = ThreadPoolExecutor(
@@ -116,6 +121,34 @@ def __init__(
             allowed_media_domains = []
         self.allowed_media_domains = allowed_media_domains
 
+        # Media download cache directory (opt-in via VLLM_MEDIA_CACHE)
+        self._media_cache_dir: str | None = None
+        media_cache = envs.VLLM_MEDIA_CACHE
+        if media_cache:
+            self._media_cache_dir = media_cache
+            os.makedirs(media_cache, exist_ok=True)
+
+    def _get_cached_bytes(self, url: str) -> bytes | None:
+        """Return cached bytes for a URL, or None if not cached."""
+        if not self._media_cache_dir:
+            return None
+        cache_path = self._media_cache_path(url)
+        if cache_path.exists():
+            return cache_path.read_bytes()
+        return None
+
+    def _put_cached_bytes(self, url: str, data: bytes) -> None:
+        """Store downloaded bytes in the cache."""
+        if not self._media_cache_dir:
+            return
+        cache_path = self._media_cache_path(url)
+        cache_path.write_bytes(data)
+
+    def _media_cache_path(self, url: str) -> Path:
+        url_hash = hashlib.sha256(url.encode()).hexdigest()[:20]
+        ext = Path(url.split("?")[0]).suffix or ""
+        return Path(self._media_cache_dir) / f"{url_hash}{ext}"  # type: ignore[arg-type]
+
     def _load_data_url(
         self,
         url_spec: Url,
@@ -178,6 +211,10 @@ def load_from_url(
         if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
+            cached = self._get_cached_bytes(url)
+            if cached is not None:
+                return media_io.load_bytes(cached)
+
             connection = self.connection
             data = connection.get_bytes(
                 url_spec.url,
@@ -185,6 +222,7 @@ def load_from_url(
                 allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
             )
 
+            self._put_cached_bytes(url, data)
             return media_io.load_bytes(data)
 
         if url_spec.scheme == "data":
@@ -209,12 +247,21 @@ async def load_from_url_async(
         if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
+            cached = self._get_cached_bytes(url)
+            if cached is not None:
+                future = loop.run_in_executor(
+                    global_thread_pool, media_io.load_bytes, cached
+                )
+                return await future
+
             connection = self.connection
             data = await connection.async_get_bytes(
                 url_spec.url,
                 timeout=fetch_timeout,
                 allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
             )
+
+            self._put_cached_bytes(url, data)
             future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
             return await future
 

From e4d3e6cba3b68d1e9c30bfb1536688be19e02c96 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 13 Mar 2026 00:02:36 -0500
Subject: [PATCH 2/4] [CI] Write cached downloads atomically via temp file
 and rename

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .../prithvi_io_processor/prithvi_processor.py      | 14 ++++++++++++--
 vllm/multimodal/media/connector.py                 | 14 +++++++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index ba4c2c6c8c8b..80528fe8ea4e 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -126,8 +126,18 @@ def read_geotiff(
                 path = cached_path
             else:
                 resp = urllib.request.urlopen(file_path)
-                with open(cached_path, "wb") as f:
-                    f.write(resp.read())
+                # Write to a temporary file and atomically rename
+                # to prevent race conditions with parallel tests.
+                with tempfile.NamedTemporaryFile(
+                    mode="wb", dir=cache_dir, delete=False
+                ) as tmp_file:
+                    tmp_file.write(resp.read())
+                    tmp_path = tmp_file.name
+                try:
+                    os.rename(tmp_path, cached_path)
+                except OSError:
+                    # Another process may have already written the file.
+                    os.remove(tmp_path)
                 path = cached_path
         else:
             resp = urllib.request.urlopen(file_path)
diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 4bb66aa35333..93c805b19604 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -5,6 +5,7 @@
 import atexit
 import hashlib
 import os
+import tempfile
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, TypeVar
@@ -142,7 +143,18 @@ def _put_cached_bytes(self, url: str, data: bytes) -> None:
         if not self._media_cache_dir:
             return
         cache_path = self._media_cache_path(url)
-        cache_path.write_bytes(data)
+        # Write to a temporary file and atomically rename to prevent
+        # race conditions when multiple processes cache the same URL.
+        with tempfile.NamedTemporaryFile(
+            mode="wb", dir=self._media_cache_dir, delete=False
+        ) as tmp_file:
+            tmp_file.write(data)
+            tmp_path = tmp_file.name
+        try:
+            os.rename(tmp_path, str(cache_path))
+        except OSError:
+            # Another process may have already written the file.
+            os.remove(tmp_path)
 
     def _media_cache_path(self, url: str) -> Path:
         url_hash = hashlib.sha256(url.encode()).hexdigest()[:20]

From a54c8edbeeca7fae7e57560204438e89385c30d4 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sun, 15 Mar 2026 15:18:13 -0500
Subject: [PATCH 3/4] [ROCm][CI] Split out media cache changes into separate PR

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       |  1 -
 vllm/envs.py                                  |  5 --
 vllm/multimodal/media/connector.py            | 59 -------------------
 3 files changed, 65 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index c83f2a34713c..723523e81f2d 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -526,7 +526,6 @@ else
     -e "MODELSCOPE_CACHE=${MODELSCOPE_MOUNT}" \
     -e "VLLM_TEST_CACHE=${VLLM_TEST_CACHE_MOUNT}" \
     -e "VLLM_CACHE_ROOT=${VLLM_CACHE_MOUNT}" \
-    -e "VLLM_MEDIA_CACHE=${VLLM_CACHE_MOUNT}/media_cache" \
     -e "PYTHONPATH=${MYPYTHONPATH}" \
     --name "${container_name}" \
     "${image_name}" \
diff --git a/vllm/envs.py b/vllm/envs.py
index 641affdccf65..d310e9e1307d 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -63,7 +63,6 @@
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
-    VLLM_MEDIA_CACHE: str = ""
     VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
     VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
@@ -761,9 +760,6 @@ def _get_or_set_default() -> str:
     "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
         os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
     ),
-    # Directory for caching media downloads (images, video, audio fetched
-    # from URLs during inference). Empty string disables caching.
-    "VLLM_MEDIA_CACHE": lambda: os.getenv("VLLM_MEDIA_CACHE", ""),
     # Whether to allow HTTP redirects when fetching from media URLs.
     # Default to True
     "VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
@@ -1759,7 +1755,6 @@ def compile_factors() -> dict[str, object]:
         "VLLM_IMAGE_FETCH_TIMEOUT",
         "VLLM_VIDEO_FETCH_TIMEOUT",
         "VLLM_AUDIO_FETCH_TIMEOUT",
-        "VLLM_MEDIA_CACHE",
         "VLLM_MEDIA_URL_ALLOW_REDIRECTS",
         "VLLM_MEDIA_LOADING_THREAD_COUNT",
         "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB",
diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 93c805b19604..80aaa2a8293e 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -3,9 +3,6 @@
 
 import asyncio
 import atexit
-import hashlib
-import os
-import tempfile
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, TypeVar
@@ -19,7 +16,6 @@
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
-from vllm.logger import init_logger
 from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioEmbeddingMediaIO, AudioMediaIO
@@ -27,8 +23,6 @@
 from .image import ImageEmbeddingMediaIO, ImageMediaIO
 from .video import VideoMediaIO
 
-logger = init_logger(__name__)
-
 _M = TypeVar("_M")
 
 global_thread_pool = ThreadPoolExecutor(
@@ -122,45 +116,6 @@ def __init__(
             allowed_media_domains = []
         self.allowed_media_domains = allowed_media_domains
 
-        # Media download cache directory (opt-in via VLLM_MEDIA_CACHE)
-        self._media_cache_dir: str | None = None
-        media_cache = envs.VLLM_MEDIA_CACHE
-        if media_cache:
-            self._media_cache_dir = media_cache
-            os.makedirs(media_cache, exist_ok=True)
-
-    def _get_cached_bytes(self, url: str) -> bytes | None:
-        """Return cached bytes for a URL, or None if not cached."""
-        if not self._media_cache_dir:
-            return None
-        cache_path = self._media_cache_path(url)
-        if cache_path.exists():
-            return cache_path.read_bytes()
-        return None
-
-    def _put_cached_bytes(self, url: str, data: bytes) -> None:
-        """Store downloaded bytes in the cache."""
-        if not self._media_cache_dir:
-            return
-        cache_path = self._media_cache_path(url)
-        # Write to a temporary file and atomically rename to prevent
-        # race conditions when multiple processes cache the same URL.
-        with tempfile.NamedTemporaryFile(
-            mode="wb", dir=self._media_cache_dir, delete=False
-        ) as tmp_file:
-            tmp_file.write(data)
-            tmp_path = tmp_file.name
-        try:
-            os.rename(tmp_path, str(cache_path))
-        except OSError:
-            # Another process may have already written the file.
-            os.remove(tmp_path)
-
-    def _media_cache_path(self, url: str) -> Path:
-        url_hash = hashlib.sha256(url.encode()).hexdigest()[:20]
-        ext = Path(url.split("?")[0]).suffix or ""
-        return Path(self._media_cache_dir) / f"{url_hash}{ext}"  # type: ignore[arg-type]
-
     def _load_data_url(
         self,
         url_spec: Url,
@@ -223,10 +178,6 @@ def load_from_url(
         if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
-            cached = self._get_cached_bytes(url)
-            if cached is not None:
-                return media_io.load_bytes(cached)
-
             connection = self.connection
             data = connection.get_bytes(
                 url_spec.url,
@@ -234,7 +185,6 @@ def load_from_url(
                 allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
             )
 
-            self._put_cached_bytes(url, data)
             return media_io.load_bytes(data)
 
         if url_spec.scheme == "data":
@@ -259,21 +209,12 @@ async def load_from_url_async(
         if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
-            cached = self._get_cached_bytes(url)
-            if cached is not None:
-                future = loop.run_in_executor(
-                    global_thread_pool, media_io.load_bytes, cached
-                )
-                return await future
-
             connection = self.connection
             data = await connection.async_get_bytes(
                 url_spec.url,
                 timeout=fetch_timeout,
                 allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
             )
-
-            self._put_cached_bytes(url, data)
             future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
             return await future
 

From 8e2c7151cd16454d7373989dbe4ea861aed1b18f Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sun, 15 Mar 2026 15:21:06 -0500
Subject: [PATCH 4/4] [ROCm][CI] Restore VLLM_MEDIA_CACHE env in run-amd-test.sh

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/scripts/hardware_ci/run-amd-test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 723523e81f2d..c83f2a34713c 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -526,6 +526,7 @@ else
     -e "MODELSCOPE_CACHE=${MODELSCOPE_MOUNT}" \
     -e "VLLM_TEST_CACHE=${VLLM_TEST_CACHE_MOUNT}" \
     -e "VLLM_CACHE_ROOT=${VLLM_CACHE_MOUNT}" \
+    -e "VLLM_MEDIA_CACHE=${VLLM_CACHE_MOUNT}/media_cache" \
     -e "PYTHONPATH=${MYPYTHONPATH}" \
     --name "${container_name}" \
     "${image_name}" \