Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/scripts/hardware_ci/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ function cpu_tests() {
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# VLLM_USE_V1=0 pytest -x -s -v \
# pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"

# Run multi-lora tests
Expand Down
3 changes: 1 addition & 2 deletions examples/offline_inference/mlpspeculator.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto, we should delete the example if it doesn't work on V1

Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.

Note that still not support `v1`:
VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
Note that this example is out of date and not supported in vLLM v1.
"""

import gc
Expand Down
2 changes: 0 additions & 2 deletions examples/offline_inference/qwen2_5_omni/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \

# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q use_audio_in_video

# Multiple audios
VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q multi_audios
```
Expand Down
7 changes: 1 addition & 6 deletions examples/offline_inference/qwen2_5_omni/only_thinker.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this example requires V0, it seems we should delete it rather than let it linger. cc @ywang96

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from typing import NamedTuple

import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
Expand Down Expand Up @@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
assert not envs.VLLM_USE_V1, (
"V1 does not support use_audio_in_video. "
"Please launch this example with "
"`VLLM_USE_V1=0`."
)

return QueryResult(
inputs={
"prompt": prompt,
Expand Down
43 changes: 12 additions & 31 deletions examples/others/lmcache/cpu_offload_lmcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from vllm.engine.arg_utils import EngineArgs


def setup_environment_variables(vllm_version: str):
def setup_environment_variables():
# LMCache-related environment variables
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
Expand All @@ -47,34 +47,23 @@ def setup_environment_variables(vllm_version: str):
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
if vllm_version == "v0":
os.environ["VLLM_USE_V1"] = "0"


@contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
def build_llm_with_lmcache(lmcache_connector: str, model: str):
ktc = KVTransferConfig(
kv_connector=lmcache_connector,
kv_role="kv_both",
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
if vllm_version == "v0":
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enable_chunked_prefill=True, # Only in v0
)
else:
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
)
llm_args = EngineArgs(
model=model,
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
)

llm = LLM(**asdict(llm_args))
try:
Expand Down Expand Up @@ -116,18 +105,10 @@ def parse_args():


def main():
args = parse_args()

if args.version == "v0":
lmcache_connector = "LMCacheConnector"
model = "mistralai/Mistral-7B-Instruct-v0.2"
else:
lmcache_connector = "LMCacheConnectorV1"
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"

setup_environment_variables(args.version)

with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
lmcache_connector = "LMCacheConnectorV1"
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables()
with build_llm_with_lmcache(lmcache_connector, model) as llm:
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000
Expand Down
3 changes: 0 additions & 3 deletions tests/entrypoints/openai/test_orca_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ def monkeypatch_module():

@pytest.fixture(scope="module", params=[True])
def server(request, monkeypatch_module):
use_v1 = request.param
monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")

args = [
"--dtype",
"bfloat16",
Expand Down
13 changes: 0 additions & 13 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_DISABLED_KERNELS: list[str] = []
VLLM_DISABLE_PYNCCL: bool = False
VLLM_USE_V1: bool = True
VLLM_ROCM_USE_AITER: bool = False
VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
VLLM_ROCM_USE_AITER_LINEAR: bool = True
Expand Down Expand Up @@ -874,8 +873,6 @@ def get_vllm_port() -> int | None:
"VLLM_DISABLE_PYNCCL": lambda: (
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
),
# If set, use the V1 code path.
"VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
# Disable aiter ops unless specifically enabled.
# Acts as a parent switch to enable the rest of the other operations.
"VLLM_ROCM_USE_AITER": lambda: (
Expand Down Expand Up @@ -1510,16 +1507,6 @@ def is_set(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def set_vllm_use_v1(use_v1: bool):
if is_set("VLLM_USE_V1"):
raise ValueError(
"Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
"explicitly by the user. Please raise this as a Github "
"Issue and explicitly set VLLM_USE_V1=0 or 1."
)
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"


def compute_hash() -> str:
"""
WARNING: Whenever a new key is added to this environment
Expand Down
1 change: 0 additions & 1 deletion vllm/usage/usage_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_PP_LAYER_PARTITION",
"VLLM_USE_TRITON_AWQ",
"VLLM_USE_V1",
"VLLM_ENABLE_V1_MULTIPROCESSING",
]

Expand Down
Loading