Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/basic_correctness/test_basic_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def test_models(
[
("facebook/opt-125m", "ray", "", "L4", {}),
("facebook/opt-125m", "mp", "", "L4", {}),
("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
("facebook/opt-125m", "ray", "", "A100", {}),
Expand Down
293 changes: 8 additions & 285 deletions tests/distributed/test_shm_broadcast.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import multiprocessing
import random
import threading
import time
from unittest import mock

import multiprocess as mp
import numpy as np
import pytest
import torch.distributed as dist

from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
Expand All @@ -25,14 +22,7 @@ def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
return [np.random.randint(1, 100, i) for i in sizes]


def distributed_run(fn, world_size, timeout=60):
"""Run a function in multiple processes with proper error handling.

Args:
fn: Function to run in each process
world_size: Number of processes to spawn
timeout: Maximum time in seconds to wait for processes (default: 60)
"""
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
for i in range(number_of_processes):
Expand All @@ -43,45 +33,19 @@ def distributed_run(fn, world_size, timeout=60):
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env,))
p = multiprocessing.Process(target=fn, args=(env,))
processes.append(p)
p.start()

# Monitor processes and fail fast if any process fails
start_time = time.time()
failed_processes = []

# Wait for all processes, checking for failures
while time.time() - start_time < timeout:
all_done = True
for i, p in enumerate(processes):
if p.is_alive():
all_done = False
elif p.exitcode != 0:
# Process failed
failed_processes.append((i, p.exitcode))
break

if failed_processes or all_done:
break
time.sleep(0.1) # Check every 100ms
for p in processes:
p.join()

# Check for timeout if no failures detected yet
for i, p in enumerate(processes):
if p.is_alive():
p.kill()
p.join()

# Report failures
if failed_processes:
error_msg = "Distributed test failed:\n"
for rank, status in failed_processes:
error_msg += f" Rank {rank}: Exit code {status}\n"
raise AssertionError(error_msg)
for p in processes:
assert p.exitcode == 0


def worker_fn_wrapper(fn):
# `mp.Process` cannot accept environment variables directly
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapped_fn(env):
Expand Down Expand Up @@ -151,244 +115,3 @@ def worker_fn():

def test_shm_broadcast():
distributed_run(worker_fn, 4)


@worker_fn_wrapper
def worker_fn_test_shutdown_busy():
    """Verify shutdown() wakes a reader blocked in a busy-spin dequeue.

    Readers are forced into busy mode by making the busy-loop window
    effectively infinite; a helper thread then calls shutdown() and the
    blocked dequeue must raise a "cancelled" RuntimeError instead of
    timing out.
    """
    rank = dist.get_rank()
    writer_rank = 2
    message_queue = MessageQueue.create_from_process_group(
        dist.group.WORLD, 40 * 1024, 2, writer_rank
    )

    if not message_queue._is_writer:
        # Put into busy mode: a huge busy-loop window keeps readers
        # spinning instead of falling back to idle waiting.
        message_queue._spin_condition.busy_loop_s = 9999

    shutdown_event = threading.Event()

    def shutdown_thread(mq, shutdown_event):
        shutdown_event.wait()
        mq.shutdown()

    # Keep a handle so the helper can be joined before the final assert;
    # the original started it anonymously and never joined, leaving a
    # thread that could outlive the test.
    shutdown_worker = threading.Thread(
        target=shutdown_thread, args=(message_queue, shutdown_event)
    )
    shutdown_worker.start()

    # No message has been enqueued, so a short dequeue times out normally.
    with pytest.raises(TimeoutError):
        message_queue.dequeue(timeout=0.01)

    shutdown_event.set()

    # After shutdown is triggered, a blocked dequeue is cancelled.
    with pytest.raises(RuntimeError, match="cancelled"):
        message_queue.dequeue(timeout=1)

    # Join so shutdown() has definitely completed before checking the flag
    # and no helper thread leaks past the test.
    shutdown_worker.join()

    assert message_queue.shutting_down

    print(f"torch distributed passed the test! Rank {rank}")
    dist.barrier()


def test_message_queue_shutdown_busy(caplog_vllm):
    """Run the busy-mode shutdown scenario across four ranks."""
    world_size = 4
    distributed_run(worker_fn_test_shutdown_busy, world_size)
    print(caplog_vllm.text)


@worker_fn_wrapper
def worker_fn_test_shutdown_idle():
    """Verify shutdown() wakes a reader blocked in an idle-mode dequeue.

    Readers are forced into idle mode (last_read = 0); a helper thread then
    calls shutdown() and the blocked dequeue must raise a "cancelled"
    RuntimeError instead of timing out.
    """
    rank = dist.get_rank()
    writer_rank = 2
    message_queue = MessageQueue.create_from_process_group(
        dist.group.WORLD, 40 * 1024, 2, writer_rank
    )

    if not message_queue._is_writer:
        # Put into idle mode so the dequeue blocks in the idle wait path.
        message_queue._spin_condition.last_read = 0

    shutdown_event = threading.Event()

    def shutdown_thread(mq, shutdown_event):
        shutdown_event.wait()
        mq.shutdown()

    # Keep a handle so the helper can be joined before the final assert;
    # the original started it anonymously and never joined, leaving a
    # thread that could outlive the test.
    shutdown_worker = threading.Thread(
        target=shutdown_thread, args=(message_queue, shutdown_event)
    )
    shutdown_worker.start()

    # No message has been enqueued, so a short dequeue times out normally.
    with pytest.raises(TimeoutError):
        message_queue.dequeue(timeout=0.01)

    shutdown_event.set()

    # After shutdown is triggered, a blocked dequeue is cancelled.
    with pytest.raises(RuntimeError, match="cancelled"):
        message_queue.dequeue(timeout=1)

    # Join so shutdown() has definitely completed before checking the flag
    # and no helper thread leaks past the test.
    shutdown_worker.join()

    assert message_queue.shutting_down

    print(f"torch distributed passed the test! Rank {rank}")
    dist.barrier()


def test_message_queue_shutdown_idle():
    """Run the idle-mode shutdown scenario across four ranks."""
    world_size = 4
    distributed_run(worker_fn_test_shutdown_idle, world_size)


@worker_fn_wrapper
def worker_fn_test_idle_to_busy():
    """Check that an idle reader wakes for a message and enters busy spinning.

    A reader is forced into idle mode (last_read = 0), then the writer sends
    two messages. The spin condition's wait() is wrapped so its call count
    can be asserted: one call per dequeue while idle, many calls once the
    reader has re-entered the busy-spin loop after receiving a message.
    """
    rank = dist.get_rank()
    writer_rank = 2
    message_queue = MessageQueue.create_from_process_group(
        dist.group.WORLD, 40 * 1024, 2, writer_rank
    )

    message1 = "hello world"
    message2 = np.random.randint(1, 100, 100)
    # Wrap wait() (wraps= keeps real behavior) purely to count invocations.
    with mock.patch.object(
        message_queue._spin_condition, "wait", wraps=message_queue._spin_condition.wait
    ) as wrapped_wait:
        if not message_queue._is_writer:
            # Put into idle mode
            message_queue._spin_condition.last_read = 0

            # no messages, so expect a TimeoutError
            with pytest.raises(TimeoutError):
                message_queue.dequeue(timeout=0.01)
            # wait should only be called once while idle
            assert wrapped_wait.call_count == 1

            # sync with the writer and wait for message1
            dist.barrier()
            recv_message = message_queue.dequeue(timeout=5)
            assert recv_message == message1
            # second call to wait, with a message read, this puts in a busy spin
            assert wrapped_wait.call_count == 2

            # sync with the writer and wait for message2
            dist.barrier()
            recv_message = message_queue.dequeue(timeout=1)
            assert np.array_equal(recv_message, message2)
            # in busy mode, we expect wait to have been called multiple times
            assert wrapped_wait.call_count > 3
        else:
            # writer writes two messages in sync with the reader
            dist.barrier()
            # sleep delays the send to ensure reader enters the read loop
            time.sleep(0.1)
            message_queue.enqueue(message1)

            dist.barrier()
            time.sleep(0.1)
            message_queue.enqueue(message2)

    message_queue.shutdown()
    assert message_queue.shutting_down
    print(f"torch distributed passed the test! Rank {rank}")


def test_message_queue_idle_wake():
    """Exercise the idle -> busy reader wake-up path on four ranks."""
    world_size = 4
    distributed_run(worker_fn_test_idle_to_busy, world_size)


@worker_fn_wrapper
def worker_fn_test_busy_to_idle():
    """Check that a busy-spinning reader falls back to idle waiting.

    Readers start in busy mode (a huge busy_loop_s window). After the first
    message is received the window is shrunk to zero, so the next blocking
    dequeue must drop into idle mode, where wait() is invoked exactly once
    per dequeue instead of repeatedly while spinning.
    """
    rank = dist.get_rank()
    writer_rank = 2
    message_queue = MessageQueue.create_from_process_group(
        dist.group.WORLD, 40 * 1024, 2, writer_rank
    )

    message1 = 12345
    message2 = list(range(3))
    # Wrap wait() (wraps= keeps real behavior) purely to count invocations.
    with mock.patch.object(
        message_queue._spin_condition, "wait", wraps=message_queue._spin_condition.wait
    ) as wrapped_wait:
        if not message_queue._is_writer:
            # Put into busy mode
            message_queue._spin_condition.busy_loop_s = 9999

            # sync with the writer and wait for message1
            dist.barrier()
            recv_message = message_queue.dequeue(timeout=1)
            assert recv_message == message1
            # in busy mode, we expect wait to have been called many times
            assert wrapped_wait.call_count > 1

            # simulate busy loop ending
            message_queue._spin_condition.busy_loop_s = 0
            # ensure we enter idle mode, then record call count
            with pytest.raises(TimeoutError):
                message_queue.dequeue(timeout=0.01)
            call_count = wrapped_wait.call_count

            # sync with the writer and wait for message2
            dist.barrier()
            recv_message = message_queue.dequeue(timeout=1)
            assert recv_message == message2

            # call to wait after idle should only happen once
            assert wrapped_wait.call_count == call_count + 1
        else:
            # writer writes two messages in sync with the reader
            dist.barrier()
            # sleep delays the send to ensure reader enters the read loop
            time.sleep(0.1)
            message_queue.enqueue(message1)

            dist.barrier()
            time.sleep(0.1)
            message_queue.enqueue(message2)

    message_queue.shutdown()
    assert message_queue.shutting_down
    print(f"torch distributed passed the test! Rank {rank}")


def test_message_queue_busy_to_idle():
    """Exercise the busy -> idle reader fallback path on four ranks."""
    world_size = 4
    distributed_run(worker_fn_test_busy_to_idle, world_size)


def test_warning_logs(caplog_vllm):
    """
    Test that warning logs are emitted at VLLM_RINGBUFFER_WARNING_INTERVAL intervals
    when indefinite=False, and are not emitted when indefinite=True.
    """

    # Patch the warning log interval to every 1 ms during reads
    with mock.patch(
        "vllm.distributed.device_communicators.shm_broadcast.VLLM_RINGBUFFER_WARNING_INTERVAL",
        new=0.001,  # 1 ms
    ):
        writer = MessageQueue(
            n_reader=1,
            n_local_reader=1,
            max_chunk_bytes=1024 * 1024,  # 1MB chunks
            max_chunks=10,
        )
        reader = MessageQueue.create_from_handle(writer.export_handle(), rank=0)
        try:
            writer.wait_until_ready()
            reader.wait_until_ready()

            # We should have at least one warning log here
            # "0 seconds" expected due to rounding of 1ms test interval
            with pytest.raises(TimeoutError):
                reader.dequeue(timeout=0.01, indefinite=False)
            assert any(
                "No available shared memory broadcast block found in 0 seconds"
                in record.message
                for record in caplog_vllm.records
            )
            caplog_vllm.clear()

            # We should have no warnings this time
            with pytest.raises(TimeoutError):
                reader.dequeue(timeout=0.01, indefinite=True)
            assert all(
                "No available shared memory broadcast block found in 0 seconds"
                not in record.message
                for record in caplog_vllm.records
            )
        finally:
            # Clean up even when an assertion above fails, so the shared
            # memory segments are always released (the original leaked them
            # on test failure).
            writer.shutdown()
            reader.shutdown()
Loading