Merged

Changes from 73 commits (80 commits total)

Commits
5650b95
Merge pull request #1 from vllm-project/main
sroy745 May 29, 2024
8f36146
Merge branch 'vllm-project:main' into main
sroy745 Jun 3, 2024
9e75057
Merge branch 'vllm-project:main' into main
sroy745 Jun 3, 2024
db2c679
Merge branch 'vllm-project:main' into main
sroy745 Jun 7, 2024
8d7512c
Merge branch 'vllm-project:main' into main
sroy745 Jun 10, 2024
1473f74
Merge branch 'vllm-project:main' into main
sroy745 Jun 12, 2024
4013e1a
Merge branch 'vllm-project:main' into main
sroy745 Jun 14, 2024
2dbdd78
Merge branch 'vllm-project:main' into main
sroy745 Jun 17, 2024
b3575e9
Merge branch 'vllm-project:main' into main
sroy745 Jun 20, 2024
94b0d43
Merge branch 'vllm-project:main' into main
sroy745 Jun 24, 2024
fa8fedf
Merge branch 'vllm-project:main' into main
sroy745 Jun 27, 2024
6ed96b4
Merge branch 'vllm-project:main' into main
sroy745 Jun 27, 2024
b71c533
Merge branch 'vllm-project:main' into main
sroy745 Jun 28, 2024
57babef
Merge branch 'vllm-project:main' into main
sroy745 Jun 29, 2024
4b19bac
Merge branch 'vllm-project:main' into main
sroy745 Jul 1, 2024
eb7a1c4
Merge branch 'vllm-project:main' into main
sroy745 Jul 6, 2024
7e2c87e
Merge branch 'vllm-project:main' into main
sroy745 Jul 10, 2024
6212d5f
Merge branch 'vllm-project:main' into main
sroy745 Jul 15, 2024
5491438
Merge branch 'vllm-project:main' into main
sroy745 Jul 17, 2024
68e080a
Merge branch 'vllm-project:main' into main
sroy745 Jul 31, 2024
55e4332
Merge branch 'vllm-project:main' into main
sroy745 Aug 13, 2024
532eb48
Merge branch 'vllm-project:main' into main
sroy745 Aug 22, 2024
7cea056
Merge branch 'vllm-project:main' into main
sroy745 Aug 22, 2024
185e056
Merge branch 'vllm-project:main' into main
sroy745 Aug 24, 2024
e2be95f
Merge branch 'vllm-project:main' into main
sroy745 Aug 27, 2024
2ed5473
Merge branch 'vllm-project:main' into main
sroy745 Aug 28, 2024
efa4714
Merge branch 'vllm-project:main' into main
sroy745 Aug 29, 2024
fb87d34
Merge branch 'vllm-project:main' into main
sroy745 Aug 29, 2024
5419e49
Merge branch 'vllm-project:main' into main
sroy745 Aug 31, 2024
9ba12f8
Merge branch 'vllm-project:main' into main
sroy745 Sep 2, 2024
25cef3d
Merge branch 'vllm-project:main' into main
sroy745 Sep 3, 2024
9d4cd09
Merge branch 'vllm-project:main' into main
sroy745 Sep 4, 2024
c48cacb
Merge branch 'vllm-project:main' into main
sroy745 Sep 5, 2024
c42c399
Merge branch 'vllm-project:main' into main
sroy745 Sep 7, 2024
3d13e43
Merge branch 'vllm-project:main' into main
sroy745 Sep 9, 2024
7479775
Merge branch 'vllm-project:main' into main
sroy745 Sep 11, 2024
df9b966
Merge branch 'vllm-project:main' into main
sroy745 Sep 17, 2024
9a7ed92
Merge branch 'vllm-project:main' into main
sroy745 Sep 17, 2024
118e838
Merge branch 'vllm-project:main' into main
sroy745 Sep 19, 2024
e640c69
Merge branch 'vllm-project:main' into main
sroy745 Sep 20, 2024
89fb6cd
Merge branch 'vllm-project:main' into main
sroy745 Sep 23, 2024
5d886cc
Merge branch 'vllm-project:main' into main
sroy745 Sep 24, 2024
56f2065
Merge branch 'vllm-project:main' into main
sroy745 Sep 24, 2024
28e103e
Merge branch 'vllm-project:main' into main
sroy745 Sep 25, 2024
2fc1490
Merge branch 'vllm-project:main' into main
sroy745 Sep 25, 2024
8805750
Merge branch 'vllm-project:main' into main
sroy745 Sep 26, 2024
b30e5af
Merge branch 'vllm-project:main' into main
sroy745 Sep 28, 2024
92322f1
Merge branch 'vllm-project:main' into main
sroy745 Sep 30, 2024
85e9001
Merge branch 'vllm-project:main' into main
sroy745 Oct 1, 2024
cd4ff89
Merge branch 'vllm-project:main' into main
sroy745 Oct 1, 2024
0dd96ed
Merge branch 'vllm-project:main' into main
sroy745 Oct 1, 2024
9d4d969
Merge branch 'vllm-project:main' into main
sroy745 Oct 3, 2024
7d223b5
Merge branch 'vllm-project:main' into main
sroy745 Oct 5, 2024
f327d91
Merge branch 'vllm-project:main' into main
sroy745 Oct 5, 2024
b5adf28
Merge branch 'vllm-project:main' into main
sroy745 Oct 6, 2024
caf0d12
Merge branch 'vllm-project:main' into main
sroy745 Oct 7, 2024
28e77b1
Merge branch 'vllm-project:main' into main
sroy745 Oct 8, 2024
21b9a98
Add an environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 wh…
sroy745 Oct 8, 2024
d5b2966
Setting the pytest args for some more test
sroy745 Oct 8, 2024
ddd7ec7
Fix tests
sroy745 Oct 8, 2024
1c0105a
Fix buildkite
sroy745 Oct 8, 2024
a9c9e0e
Fix buildkite
sroy745 Oct 8, 2024
7357e56
Fix
sroy745 Oct 8, 2024
39da78e
Fix benchmarks
sroy745 Oct 8, 2024
2ac9e8d
Fix chunked prefill test
sroy745 Oct 8, 2024
e90a9f7
Fix tests
sroy745 Oct 8, 2024
8fef1c8
Fix test comments
sroy745 Oct 8, 2024
2ccd1a0
Fix test comment
sroy745 Oct 8, 2024
db7e46d
Merge branch 'vllm-project:main' into main
sroy745 Oct 9, 2024
dc472dd
Merge remote-tracking branch 'origin/main' into sroy-deprecate-blk-mgr-1
sroy745 Oct 9, 2024
114321a
Dummy
sroy745 Oct 9, 2024
2d4abb7
Format
sroy745 Oct 9, 2024
11903b3
Merge branch 'main' into sroy-deprecate-blk-mgr-1
DarkLight1337 Oct 9, 2024
534fd3f
Define function
sroy745 Oct 9, 2024
d3abe6a
Merge branch 'sroy-deprecate-blk-mgr-1' of https://github.com/sroy745…
sroy745 Oct 10, 2024
7f9002b
set auto to true
sroy745 Oct 10, 2024
9356c1e
Set auto
sroy745 Oct 10, 2024
c515030
Fix buildkite ignore
sroy745 Oct 10, 2024
4e2bb9c
Changes to buildkite for Distributed Tests (2 GPUs)
sroy745 Oct 10, 2024
2364e71
Another attempt to fix Distributed Tests (2 GPUs)
sroy745 Oct 10, 2024
16 changes: 11 additions & 5 deletions .buildkite/test-pipeline.yaml
@@ -77,8 +77,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
mirror_hardwares: [amd]
@@ -88,7 +88,11 @@ steps:
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_chunked_prefill_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py

- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
@@ -185,7 +189,8 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py

- label: Samplers Test # 36min
source_file_dependencies:
@@ -209,7 +214,8 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py

- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
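The pipeline gating above can be reproduced locally by exporting the variable before invoking pytest. A minimal Python sketch, assuming the command is run from the tests/ working directory used by CI and using core/test_scheduler.py purely as an example target:

# Minimal sketch: run one of the gated suites locally with the
# deprecation override enabled. The test path is illustrative.
import os
import subprocess

env = dict(os.environ, VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1="1")
subprocess.run(
    ["pytest", "-v", "-s", "core/test_scheduler.py"],
    env=env,
    check=True,  # raise CalledProcessError if the suite fails
)
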
4 changes: 3 additions & 1 deletion benchmarks/benchmark_latency.py
@@ -221,7 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager)
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
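Pointing the argparse default at EngineArgs.use_v2_block_manager keeps the benchmark flag in sync with the engine's own default rather than hard-coding False. A self-contained sketch of the pattern; StubEngineArgs is a stand-in for illustration, not vLLM's real EngineArgs:

# Sketch of the "default follows the engine" pattern. StubEngineArgs is a
# made-up stand-in for vllm.engine.arg_utils.EngineArgs.
import argparse
from dataclasses import dataclass


@dataclass
class StubEngineArgs:
    use_v2_block_manager: bool = False  # assumed engine-side default


parser = argparse.ArgumentParser()
parser.add_argument("--use-v2-block-manager",
                    action="store_true",
                    default=StubEngineArgs.use_v2_block_manager)
args = parser.parse_args([])
# If the engine-side default ever changes, the benchmark flag's default
# follows automatically, with no edit to this argparse declaration.
print(args.use_v2_block_manager)
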
2 changes: 2 additions & 0 deletions benchmarks/benchmark_prefix_caching.py
@@ -33,6 +33,7 @@
from transformers import PreTrainedTokenizerBase

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

try:
@@ -177,6 +178,7 @@ def main(args):
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager,
help='Use BlockSpaceManagerV2')
parser.add_argument('--num-prompts',
type=int,
1 change: 1 addition & 0 deletions benchmarks/benchmark_throughput.py
@@ -473,6 +473,7 @@ def main(args: argparse.Namespace):
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
default=EngineArgs.use_v2_block_manager,
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
12 changes: 12 additions & 0 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -11,6 +11,8 @@

import pytest

import vllm.envs as envs

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

@@ -20,6 +22,16 @@
]


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/basic_correctness/test_chunked_prefill.py`" #noqa
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
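Because the fixture above is declared with autouse=False, the check only runs for tests that request it by name. A sketch of how a test might opt in; the test name and body are illustrative and not part of this change:

# Illustrative only: opting into the module-scoped
# check_deprecated_block_manager fixture defined above.
def test_v1_block_manager_path(check_deprecated_block_manager):
    # At this point the fixture has already asserted that
    # VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 is set, so the body can
    # safely exercise BlockSpaceManagerV1-specific behaviour.
    assert True
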
11 changes: 11 additions & 0 deletions tests/core/block/e2e/test_correctness.py
@@ -2,11 +2,22 @@

import pytest

import vllm.envs as envs
from vllm import SamplingParams

from .conftest import get_token_ids_from_llm_generator


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/core/block/e2e/test_correctness.py`" #noqa
)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
11 changes: 11 additions & 0 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -3,6 +3,7 @@

import pytest

import vllm.envs as envs
from vllm import LLM, SamplingParams

from .conftest import get_text_from_llm_generator
@@ -12,6 +13,16 @@
BLOCK_SIZE = 16


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/core/block/e2e/test_correctness_sliding_window.py`" #noqa
)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
11 changes: 11 additions & 0 deletions tests/core/test_chunked_prefill_scheduler.py
@@ -3,6 +3,7 @@

import pytest # noqa

import vllm.envs as envs
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler
@@ -27,6 +28,16 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/core/test_chunked_prefill_scheduler.py`" #noqa
)


@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_simple(use_v2_block_manager: bool):
"""Verify basic scheduling works."""
11 changes: 11 additions & 0 deletions tests/core/test_scheduler.py
@@ -6,6 +6,7 @@
import pytest
from torch import Use # noqa

import vllm.envs as envs
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget
@@ -17,6 +18,16 @@
schedule_and_update_computed_tokens)


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/core/test_scheduler.py`" #noqa
)


@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
block_size = 4
11 changes: 11 additions & 0 deletions tests/prefix_caching/test_prefix_caching.py
@@ -6,6 +6,7 @@

import pytest

import vllm.envs as envs
from tests.kernels.utils import override_backend_env_variable
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
@@ -18,6 +19,16 @@
]


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/prefix_caching/test_prefix_caching.py`" #noqa
)


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
11 changes: 11 additions & 0 deletions tests/spec_decode/e2e/test_compatibility.py
@@ -1,10 +1,21 @@
import pytest

import vllm.envs as envs
from vllm import SamplingParams

from .conftest import get_output_from_llm_generator


@pytest.fixture(scope="module", autouse=False)
def check_deprecated_block_manager():
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
"To allow the use of deprecated BlockSpaceManagerV1, set the "
"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
"You can run the tests with: "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest tests/spec_decode/e2e/test_compatibility.py`" #noqa
)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
12 changes: 12 additions & 0 deletions vllm/config.py
@@ -1037,6 +1037,18 @@ def _verify_args(self) -> None:
f"({self.num_scheduler_steps}) must be greater than or "
"equal to 1.")

if (not self.use_v2_block_manager
        and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1):
raise ValueError(
"The use of BlockSpaceManagerV1 is deprecated and will "
"be removed in a future release. Please switch to "
"BlockSpaceManagerV2 by setting --use-v2-block-manager to "
"True. If you wish to suppress this error temporarily, "
"you can set the environment variable "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1`. If your use "
"case is not supported in BlockSpaceManagerV2, please "
"file an issue with detailed information.")

@property
def is_multi_step(self) -> bool:
return self.num_scheduler_steps > 1
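Stripped of the surrounding SchedulerConfig machinery, the new check reduces to an opt-in guard on an environment variable. A standalone sketch follows; verify_block_manager_choice is an invented name for illustration, not a function in vLLM:

# Standalone sketch of the deprecation gate added to SchedulerConfig.
import os


def verify_block_manager_choice(use_v2_block_manager: bool) -> None:
    allow_v1 = os.environ.get(
        "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0") == "1"
    if not use_v2_block_manager and not allow_v1:
        raise ValueError(
            "BlockSpaceManagerV1 is deprecated; pass --use-v2-block-manager "
            "or set VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 to override.")


verify_block_manager_choice(use_v2_block_manager=True)    # passes
# verify_block_manager_choice(use_v2_block_manager=False)  # raises ValueError
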
6 changes: 6 additions & 0 deletions vllm/envs.py
@@ -64,6 +64,7 @@
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False


def get_default_cache_root():
@@ -434,6 +435,11 @@ def get_default_config_root():
# and trust the driver's peer-to-peer capability report.
"VLLM_SKIP_P2P_CHECK":
lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",

# If set, allows the use of the deprecated block manager V1
"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
) == "1",
}

# end-env-vars-definition
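The new entry follows the existing vllm/envs.py convention: each variable maps to a lambda that parses os.environ lazily, so envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is evaluated as a bool at access time. A reduced sketch of that lookup pattern; the module-level __getattr__ here is a simplification of what the real module does:

# Reduced sketch of the envs.py lookup pattern (simplified).
import os

environment_variables = {
    # If set to "1", allows the deprecated block manager V1 to be used.
    "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
    lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0") == "1",
}


def __getattr__(name: str):
    # Module-level __getattr__ (PEP 562) resolves attribute access such as
    # envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 at read time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")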