diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py index 5434f8b6a4e4..20cd26bdddf5 100644 --- a/benchmarks/benchmark_block_pool.py +++ b/benchmarks/benchmark_block_pool.py @@ -5,7 +5,7 @@ from benchmark_utils import TimeCollector from tabulate import tabulate -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.core.block_pool import BlockPool diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 6e0f3b51c9d2..f64fd09bab9f 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -46,7 +46,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def test_long_document_qa(llm=None, sampling_params=None, prompts=None): diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index 626b150ee4ce..dedb564fffac 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -19,7 +19,7 @@ VllmConfig, ) from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index d7dc0e991c4d..146c268a6b7f 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -37,7 +37,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser try: from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 769f52dbab6e..a35db0063b0a 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -11,7 +11,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Select a equi-probable random priority diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 539ab2ed0a4d..55001cf3722a 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -51,7 +51,7 @@ from backend_request_func import get_tokenizer try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 22fc2678fd1c..67fccdf4fd07 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -15,7 +15,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 2deebf3ddb7a..f7325ddd2cbb 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -18,7 +18,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( w8a8_triton_block_scaled_mm, ) -from vllm.utils import FlexibleArgumentParser, cdiv +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.math_utils import cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index d33b84fc3601..7792cfd03b0e 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index 7662655b5efa..66268b71b3de 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -10,7 +10,7 @@ from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE batch_size_range = [1, 16, 32, 64, 128] diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 66b44c27d6ee..6bcb17983795 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -28,7 +28,7 @@ from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser parser = FlexibleArgumentParser( description="Benchmark BitBLAS int4 on a specific target." diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 726a2a371d10..7982cbb1422c 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index b419b2fa0e3e..027f67ad4db6 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index df06a940e6d4..b414efa6e330 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -39,7 +39,7 @@ ) from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 14330ae6f03c..d525bd5faacf 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -13,7 +13,7 @@ fused_experts, fused_topk, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = [ "nm-testing/Mixtral-8x7B-Instruct-v0.1", diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index bcfa64c3f425..6fa5c248670e 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 39338f338761..bf1512268fe0 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -25,7 +25,7 @@ from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_TP_SIZES = [1] diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index e1d5239f5cc9..8787724d77cf 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -33,7 +33,7 @@ quantize_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 34cc45e94d76..12ca9214b1f9 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -44,7 +44,7 @@ sort_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 9298d3b58dfb..bc6cf83bc21f 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,7 +22,7 @@ from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 459eafa6d907..efa5a7386027 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -17,7 +17,7 @@ ) from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index b9147361708f..cb848d2bf579 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 1b1e71adeec4..46ab2a5fe5e9 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -9,7 +9,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 61427a77b4e3..3c2ac9128947 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -7,7 +7,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index e0ff09d4b397..0d3aef0c630b 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -9,7 +9,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 29f1b2ccdcf6..12f17ea575d9 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -12,7 +12,7 @@ ) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random_flash, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 24869c91a8d7..29ef6409bb16 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -8,7 +8,7 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index f7cdc25794ca..29ce18234dfa 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -8,7 +8,7 @@ import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = torch.float8_e4m3fn diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 7993354475fc..2a25d0374811 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -8,7 +8,7 @@ import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = torch.float8_e4m3fn diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 602fad181074..ab54f81985bc 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -18,7 +18,7 @@ ) from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 9a4da0ef5a85..6964a3d3e082 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -11,7 +11,7 @@ import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser if __name__ == "__main__": parser = FlexibleArgumentParser( diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 0957a9c65f06..178599952d5c 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -5,7 +5,7 @@ import pstats from vllm import LLM, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index c4eed2037781..53d69bbdbdc7 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -18,7 +18,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.lora.request import LoRARequest -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] question_per_audio_count = { diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index 9e7036fea613..c42b00730fe4 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_parser(): diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index dc3bc399ca8a..b72ddde1fb55 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 158836728bee..eeb7137ff7ba 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 6a41ef4d84bb..9650dcfe967b 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_parser(): diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py index aa173cf96f5b..e9508568655d 100644 --- a/examples/offline_inference/basic/reward.py +++ b/examples/offline_inference/basic/reward.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index c9ca7a8bf06b..cbca50eb5efa 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 4a1b0c40604b..c1d6c6db53df 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -13,7 +13,7 @@ from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index d7f2a1633113..d9215255a808 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -8,7 +8,7 @@ import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_test_prompts() -> list[tuple[str, SamplingParams]]: diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index cc78c0cbbf7c..52c2363c8987 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -25,7 +25,7 @@ import dataclasses from vllm import LLM, EngineArgs, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py index 33a63deee91b..b117b0bd5fbe 100644 --- a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py index 6871bcfccf1b..6544df852303 100644 --- a/examples/offline_inference/pooling/embed_matryoshka_fy.py +++ b/examples/offline_inference/pooling/embed_matryoshka_fy.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs, PoolingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py index 8b8892117d37..fa7d1c3ba216 100644 --- a/examples/offline_inference/pooling/multi_vector_retrieval.py +++ b/examples/offline_inference/pooling/multi_vector_retrieval.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index f18742fac0d5..b2dffdd6c5ee 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -5,7 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index dfcbd8c8d360..3b127e4fd29d 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -13,7 +13,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 62effd5c8b62..6fbe1303f431 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -13,7 +13,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.multimodal.image import convert_image_mode -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class QueryResult(NamedTuple): diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 41d7a3492320..e25f46b126e6 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -30,7 +30,7 @@ from vllm import LLM, EngineArgs from vllm.model_executor.model_loader import ShardedStateLoader -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index af65b6d38e02..f5f6e28b5fd9 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -9,7 +9,7 @@ from vllm.v1.metrics.reader import Counter, Vector try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 7668b10916ac..c1ea95f8d064 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -22,7 +22,7 @@ from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest from vllm.multimodal.image import convert_image_mode -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index b9115121a946..5cb47c15038e 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -18,7 +18,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.lora.request import LoRARequest from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser QUESTION = "What is the content of each image?" IMAGE_URLS = [ diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index cf4695c2545f..63d85d5d9eef 100644 --- a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -18,7 +18,7 @@ from vllm import LLM, EngineArgs from vllm.entrypoints.score_utils import ScoreMultiModalParam from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser ROOT_DIR = Path(__file__).parent.parent.parent EXAMPLES_DIR = ROOT_DIR / "examples" diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 5d515fbfb671..9fa600ff458d 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -26,7 +26,7 @@ from openai import OpenAI from utils import get_first_model -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 2601c9eff971..3644a03b32ed 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -16,7 +16,7 @@ tensorize_vllm_model, tensorizer_kwargs_arg, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = logging.getLogger() diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index bcee0eb3d6fa..472b1487ef44 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -22,7 +22,7 @@ optional_type, parse_type, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser @pytest.mark.parametrize( diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 0b9d171aa481..b5d71c20bb4e 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -7,7 +7,7 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.serving_models import LoRAModulePath -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from ...utils import VLLM_PATH diff --git a/tests/kernels/attention/test_deepgemm_attention.py b/tests/kernels/attention/test_deepgemm_attention.py index 74a5d8117962..e2ae3b833b20 100644 --- a/tests/kernels/attention/test_deepgemm_attention.py +++ b/tests/kernels/attention/test_deepgemm_attention.py @@ -6,7 +6,6 @@ import torch from vllm.platforms import current_platform -from vllm.utils import cdiv from vllm.utils.deep_gemm import ( _ceil_to_ue8m0, calc_diff, @@ -16,6 +15,7 @@ get_paged_mqa_logits_metadata, ) from vllm.utils.import_utils import has_deep_gemm +from vllm.utils.math_utils import cdiv def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor: diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 00f06da5a47b..79981009c9db 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -10,7 +10,7 @@ get_nvfp4_global_scale, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up if not current_platform.is_device_capability(100): pytest.skip( diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 44f3e42e8714..e1a7e50c2b56 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -7,7 +7,7 @@ import vllm._custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv def ref_mla( diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 01ba0951b825..04085fe5fa0f 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -5,7 +5,7 @@ import torch from vllm.attention.ops.triton_decode_attention import decode_attention_fwd -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv @pytest.mark.parametrize("B", [3, 5]) diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 4c60241bdb01..1c10cb3b2c69 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -13,8 +13,8 @@ from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv from vllm.utils.deep_gemm import per_block_cast_to_fp8 +from vllm.utils.math_utils import cdiv @pytest.mark.parametrize( diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index d4a79a7eff75..dfd317bcf72f 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -27,7 +27,7 @@ triton_kernel_moe_forward, ) from vllm.model_executor.layers.utils import shuffle_weight -from vllm.utils import round_up +from vllm.utils.math_utils import round_up def deshuffle(w: torch.Tensor): diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index bde0478d9c18..8975f00bd4c6 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -13,7 +13,7 @@ moe_align_block_size, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_EXPERTS = [32, 160, 256, 257] diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index ac7f3fc5e6f0..a2de64974b35 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index e665c636fa26..0f0ed3326d15 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -45,7 +45,7 @@ TopKWeightAndReduceDelegate, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 8b3bebb391f2..92e78ec2396d 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -8,7 +8,7 @@ persistent_masked_m_silu_mul_quant, ) from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv fp8_dtype = torch.float8_e4m3fn diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 65ce4073ad5b..c7e6c4240e85 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -16,8 +16,8 @@ ) from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input -from vllm.utils import round_up from vllm.utils.deep_gemm import per_block_cast_to_fp8 +from vllm.utils.math_utils import round_up def triton_moe( diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 34ce91585520..830d43569e98 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -6,7 +6,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 835c067e2f72..de595b0a34e4 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -13,7 +13,7 @@ from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv MNK_FACTORS = [ (1, 256, 128), diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 12f7fc66d17b..351cff246d61 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -18,7 +18,7 @@ from vllm.attention.backends.registry import _Backend from vllm.config import ModelConfig from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 81fd6433b0c8..1a256a6e192a 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -22,7 +22,7 @@ from vllm.attention.backends.registry import _Backend from vllm.attention.ops.flashmla import is_flashmla_dense_supported from vllm.config.vllm import set_current_vllm_config -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 25de65a56b37..02324d2aca6e 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -23,7 +23,7 @@ from vllm import _custom_ops as ops from vllm.attention.ops import flashmla from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 943402e429b6..cf632f146989 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -8,7 +8,7 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def test_prefix_caching_from_cli(): diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index a40a66308a66..4b89c28f0ca6 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -16,7 +16,7 @@ from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, subclass_attention_backend, diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/attention/ops/pallas_kv_cache_update.py index d0d836cc6aa5..51214b02271a 100644 --- a/vllm/attention/ops/pallas_kv_cache_update.py +++ b/vllm/attention/ops/pallas_kv_cache_update.py @@ -7,7 +7,7 @@ from jax.experimental import pallas as pl from jax.experimental.pallas import tpu as pltpu -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv def _kv_cache_update_kernel( diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 5c1ce68dde1b..bcd1e2cd5644 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -6,7 +6,7 @@ from vllm.attention.ops.paged_attn import PagedAttention from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv FP8_DTYPE = current_platform.fp8_dtype() diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index eb8cd64c34ba..55e24bd5d9d3 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -58,7 +58,7 @@ librosa = PlaceholderModule("librosa") try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index aaf19e6d4235..4f44faece75e 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -19,7 +19,7 @@ kStaticTensorScale, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .fx_utils import is_func diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c0ea84b6e4e8..617c464cff25 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -82,7 +82,8 @@ maybe_override_with_speculators, ) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import FlexibleArgumentParser, is_in_ray_actor +from vllm.utils import is_in_ray_actor +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip from vllm.v1.sample.logits_processor import LogitsProcessor diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 53dab90f45f7..184cc47ceb83 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -26,7 +26,8 @@ from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit +from vllm.utils import random_uuid, set_ulimit +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 7a1d24776009..2ff98577c363 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -9,7 +9,7 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py index e47dce0a401a..ad943a63de9d 100644 --- a/vllm/entrypoints/cli/collect_env.py +++ b/vllm/entrypoints/cli/collect_env.py @@ -8,7 +8,7 @@ from vllm.entrypoints.cli.types import CLISubcommand if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 213a46603622..a3e73eb7a4c9 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -20,7 +20,7 @@ def main(): import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser CMD_MODULES = [ vllm.entrypoints.cli.openai, diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index a27c6fe6618a..99a8759c84f4 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -13,7 +13,7 @@ from vllm.entrypoints.cli.types import CLISubcommand if TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py index 4b18ceb5215f..64d1bec1f1ff 100644 --- a/vllm/entrypoints/cli/run_batch.py +++ b/vllm/entrypoints/cli/run_batch.py @@ -11,7 +11,7 @@ from vllm.logger import init_logger if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py index f4eeb5b3c2e1..f22b844b4ddf 100644 --- a/vllm/entrypoints/cli/types.py +++ b/vllm/entrypoints/cli/types.py @@ -5,7 +5,7 @@ import typing if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 99d6cbaa86b8..1a775d3d6809 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -29,7 +29,7 @@ from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index da036e30ba7e..4caccf88fd7d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -32,7 +32,8 @@ from vllm.entrypoints.openai.serving_score import ServingScores from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import random_uuid +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index ec5fb3b56b7f..088bb679fef4 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -31,7 +31,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.transformers_utils.tokenizers import MistralTokenizer -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py index 307d0859c24e..89352d12beef 100644 --- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py +++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py @@ -21,7 +21,7 @@ from einops import rearrange from vllm.triton_utils import tl, triton -from vllm.utils import cdiv, next_power_of_2 +from vllm.utils.math_utils import cdiv, next_power_of_2 from .utils import input_guard diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 200212dfb42b..5403d4e62f85 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,9 +14,9 @@ OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.utils import cdiv from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_triton_kernels +from vllm.utils.math_utils import cdiv logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py index 85294f6aea6e..6cca95412327 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -10,8 +10,8 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens from vllm.triton_utils import tl, triton -from vllm.utils import round_up from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout +from vllm.utils.math_utils import round_up def expert_num_tokens_round_up_and_sum( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index a5c5c115f36c..13866a5c5bf4 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -12,7 +12,7 @@ TopKWeightAndReduceDelegate, ) from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input -from vllm.utils import round_up +from vllm.utils.math_utils import round_up from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, dbo_enabled, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 71393f4f6c27..c144aa23e46e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -55,9 +55,9 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import cdiv, round_up from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_deep_ep, has_pplx +from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import current_stream, direct_register_custom_op from vllm.v1.worker.ubatching import dbo_current_ubatch_id diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 8514b63556ae..3b5916f8ccaf 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -16,7 +16,7 @@ count_expert_num_tokens, disable_inplace, ) -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, dbo_enabled, diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index f4d8a86c058a..7f6155997264 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -5,7 +5,7 @@ from vllm import _custom_ops as ops from vllm.triton_utils import triton -from vllm.utils import round_up +from vllm.utils.math_utils import round_up def moe_align_block_size( diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 0e77fa54cd50..2766a2c2249f 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -15,7 +15,7 @@ _validate_scale_shape, moe_kernel_quantize_input, ) -from vllm.utils import cdiv, round_up +from vllm.utils.math_utils import cdiv, round_up logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 0627ea50d821..1f946d67a8f5 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -23,8 +23,8 @@ mxfp8_e4m3_quantize, ) from vllm.triton_utils import tl, triton -from vllm.utils import cdiv from vllm.utils.flashinfer import flashinfer_fp4_quantize +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import is_torch_equal_or_newer diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 2890a2c6d702..06b4f9271b41 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -26,7 +26,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.import_utils import PlaceholderModule if TYPE_CHECKING: diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index d4367be1c785..d2f9f1b0b5c0 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -6,7 +6,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry -from vllm.utils import cdiv, round_up +from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 846c8e7669be..44f6824b5212 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -32,7 +32,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.sequence import IntermediateTensors -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e86fc23c7d36..069078850217 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import ( is_pin_memory_available, is_uva_available, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 098e9058f529..1fb3aba9b1f7 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -22,7 +22,7 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = object diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 38da04102c44..9cedea346192 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -6,42 +6,56 @@ import enum import getpass import inspect -import json import multiprocessing import os import signal import sys import tempfile -import textwrap import threading import traceback import uuid import warnings import weakref -from argparse import ( - Action, - ArgumentDefaultsHelpFormatter, - ArgumentParser, - ArgumentTypeError, - RawDescriptionHelpFormatter, - _ArgumentGroup, -) -from collections import defaultdict from collections.abc import Callable -from functools import partial, wraps +from functools import cache, partial, wraps from typing import TYPE_CHECKING, Any, TypeVar import cloudpickle import psutil -import regex as re import torch -import yaml import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger from vllm.ray.lazy_utils import is_in_ray_actor + +# Import utilities from specialized modules for backward compatibility +from vllm.utils.argparse_utils import ( + FlexibleArgumentParser, + SortedHelpFormatter, + StoreBoolean, +) +from vllm.utils.math_utils import ( + cdiv, + next_power_of_2, + prev_power_of_2, + round_down, + round_up, +) from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized +__all__ = [ + # Argparse utilities + "FlexibleArgumentParser", + "SortedHelpFormatter", + "StoreBoolean", + # Math utilities + "cdiv", + "next_power_of_2", + "prev_power_of_2", + "round_down", + "round_up", +] + _DEPRECATED_MAPPINGS = { "cprofile": "profiling", "cprofile_context": "profiling", @@ -139,31 +153,31 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -def cdiv(a: int, b: int) -> int: - """Ceiling division.""" - return -(a // -b) - - -def next_power_of_2(n) -> int: - """The next power of 2 (inclusive)""" - if n < 1: - return 1 - return 1 << (n - 1).bit_length() - +def update_environment_variables(envs: dict[str, str]): + for k, v in envs.items(): + if k in os.environ and os.environ[k] != v: + logger.warning( + "Overwriting environment variable %s from '%s' to '%s'", + k, + os.environ[k], + v, + ) + os.environ[k] = v -def prev_power_of_2(n: int) -> int: - """The previous power of 2 (inclusive)""" - if n <= 0: - return 0 - return 1 << (n.bit_length() - 1) +@cache +def is_pin_memory_available() -> bool: + from vllm.platforms import current_platform -def round_up(x: int, y: int) -> int: - return ((x + y - 1) // y) * y + return current_platform.is_pin_memory_available() -def round_down(x: int, y: int) -> int: - return (x // y) * y +@cache +def is_uva_available() -> bool: + """Check if Unified Virtual Addressing (UVA) is available.""" + # UVA requires pinned memory. + # TODO: Add more requirements for UVA if needed. + return is_pin_memory_available() # TODO: This function can be removed if transformer_modules classes are @@ -214,488 +228,6 @@ def weak_bound(*args, **kwargs) -> None: return weak_bound -class StoreBoolean(Action): - def __call__(self, parser, namespace, values, option_string=None): - if values.lower() == "true": - setattr(namespace, self.dest, True) - elif values.lower() == "false": - setattr(namespace, self.dest, False) - else: - raise ValueError( - f"Invalid boolean value: {values}. Expected 'true' or 'false'." - ) - - -class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter): - """SortedHelpFormatter that sorts arguments by their option strings.""" - - def _split_lines(self, text, width): - """ - 1. Sentences split across lines have their single newlines removed. - 2. Paragraphs and explicit newlines are split into separate lines. - 3. Each line is wrapped to the specified width (width of terminal). - """ - # The patterns also include whitespace after the newline - single_newline = re.compile(r"(? to the front, e,g: - # [Before] - # vllm serve -tp 2 --model --enforce-eager --port 8001 - # [After] - # vllm serve -tp 2 --enforce-eager --port 8001 - args = [ - "serve", - model_tag, - *args[1:model_idx], - *args[rest_start_idx:], - ] - print("args", args) - except StopIteration: - pass - - if "--config" in args: - args = self._pull_args_from_config(args) - - def repl(match: re.Match) -> str: - """Replaces underscores with dashes in the matched string.""" - return match.group(0).replace("_", "-") - - # Everything between the first -- and the first . - pattern = re.compile(r"(?<=--)[^\.]*") - - # Convert underscores to dashes and vice versa in argument names - processed_args = list[str]() - for i, arg in enumerate(args): - if arg.startswith("--help="): - FlexibleArgumentParser._search_keyword = arg.split("=", 1)[-1].lower() - processed_args.append("--help") - elif arg.startswith("--"): - if "=" in arg: - key, value = arg.split("=", 1) - key = pattern.sub(repl, key, count=1) - processed_args.append(f"{key}={value}") - else: - key = pattern.sub(repl, arg, count=1) - processed_args.append(key) - elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": - # allow -O flag to be used without space, e.g. -O3 or -Odecode - # -O.<...> handled later - # also handle -O= here - mode = arg[3:] if arg[2] == "=" else arg[2:] - processed_args.append(f"-O.mode={mode}") - elif ( - arg == "-O" - and i + 1 < len(args) - and args[i + 1] in {"0", "1", "2", "3"} - ): - # Convert -O to -O.mode - processed_args.append("-O.mode") - else: - processed_args.append(arg) - - def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]: - """Creates a nested dictionary from a list of keys and a value. - - For example, `keys = ["a", "b", "c"]` and `value = 1` will create: - `{"a": {"b": {"c": 1}}}` - """ - nested_dict: Any = value - for key in reversed(keys): - nested_dict = {key: nested_dict} - return nested_dict - - def recursive_dict_update( - original: dict[str, Any], - update: dict[str, Any], - ) -> set[str]: - """Recursively updates a dictionary with another dictionary. - Returns a set of duplicate keys that were overwritten. - """ - duplicates = set[str]() - for k, v in update.items(): - if isinstance(v, dict) and isinstance(original.get(k), dict): - nested_duplicates = recursive_dict_update(original[k], v) - duplicates |= {f"{k}.{d}" for d in nested_duplicates} - elif isinstance(v, list) and isinstance(original.get(k), list): - original[k] += v - else: - if k in original: - duplicates.add(k) - original[k] = v - return duplicates - - delete = set[int]() - dict_args = defaultdict[str, dict[str, Any]](dict) - duplicates = set[str]() - for i, processed_arg in enumerate(processed_args): - if i in delete: # skip if value from previous arg - continue - - if processed_arg.startswith("-") and "." in processed_arg: - if "=" in processed_arg: - processed_arg, value_str = processed_arg.split("=", 1) - if "." not in processed_arg: - # False positive, '.' was only in the value - continue - else: - value_str = processed_args[i + 1] - delete.add(i + 1) - - if processed_arg.endswith("+"): - processed_arg = processed_arg[:-1] - value_str = json.dumps(list(value_str.split(","))) - - key, *keys = processed_arg.split(".") - try: - value = json.loads(value_str) - except json.decoder.JSONDecodeError: - value = value_str - - # Merge all values with the same key into a single dict - arg_dict = create_nested_dict(keys, value) - arg_duplicates = recursive_dict_update(dict_args[key], arg_dict) - duplicates |= {f"{key}.{d}" for d in arg_duplicates} - delete.add(i) - # Filter out the dict args we set to None - processed_args = [a for i, a in enumerate(processed_args) if i not in delete] - if duplicates: - logger.warning("Found duplicate keys %s", ", ".join(duplicates)) - - # Add the dict args back as if they were originally passed as JSON - for dict_arg, dict_value in dict_args.items(): - processed_args.append(dict_arg) - processed_args.append(json.dumps(dict_value)) - - return super().parse_args(processed_args, namespace) - - def check_port(self, value): - try: - value = int(value) - except ValueError: - msg = "Port must be an integer" - raise ArgumentTypeError(msg) from None - - if not (1024 <= value <= 65535): - raise ArgumentTypeError("Port must be between 1024 and 65535") - - return value - - def _pull_args_from_config(self, args: list[str]) -> list[str]: - """Method to pull arguments specified in the config file - into the command-line args variable. - - The arguments in config file will be inserted between - the argument list. - - example: - ```yaml - port: 12323 - tensor-parallel-size: 4 - ``` - ```python - $: vllm {serve,chat,complete} "facebook/opt-12B" \ - --config config.yaml -tp 2 - $: args = [ - "serve,chat,complete", - "facebook/opt-12B", - '--config', 'config.yaml', - '-tp', '2' - ] - $: args = [ - "serve,chat,complete", - "facebook/opt-12B", - '--port', '12323', - '--tensor-parallel-size', '4', - '-tp', '2' - ] - ``` - - Please note how the config args are inserted after the sub command. - this way the order of priorities is maintained when these are args - parsed by super(). - """ - assert args.count("--config") <= 1, "More than one config file specified!" - - index = args.index("--config") - if index == len(args) - 1: - raise ValueError( - "No config file specified! \ - Please check your command-line arguments." - ) - - file_path = args[index + 1] - - config_args = self.load_config_file(file_path) - - # 0th index might be the sub command {serve,chat,complete,...} - # optionally followed by model_tag (only for serve) - # followed by config args - # followed by rest of cli args. - # maintaining this order will enforce the precedence - # of cli > config > defaults - if args[0].startswith("-"): - # No sub command (e.g., api_server entry point) - args = config_args + args[0:index] + args[index + 2 :] - elif args[0] == "serve": - model_in_cli = len(args) > 1 and not args[1].startswith("-") - model_in_config = any(arg == "--model" for arg in config_args) - - if not model_in_cli and not model_in_config: - raise ValueError( - "No model specified! Please specify model either " - "as a positional argument or in a config file." - ) - - if model_in_cli: - # Model specified as positional arg, keep CLI version - args = ( - [args[0]] - + [args[1]] - + config_args - + args[2:index] - + args[index + 2 :] - ) - else: - # No model in CLI, use config if available - args = [args[0]] + config_args + args[1:index] + args[index + 2 :] - else: - args = [args[0]] + config_args + args[1:index] + args[index + 2 :] - - return args - - def load_config_file(self, file_path: str) -> list[str]: - """Loads a yaml file and returns the key value pairs as a - flattened list with argparse like pattern - ```yaml - port: 12323 - tensor-parallel-size: 4 - ``` - returns: - processed_args: list[str] = [ - '--port': '12323', - '--tensor-parallel-size': '4' - ] - """ - extension: str = file_path.split(".")[-1] - if extension not in ("yaml", "yml"): - raise ValueError( - "Config file must be of a yaml/yml type.\ - %s supplied", - extension, - ) - - # only expecting a flat dictionary of atomic types - processed_args: list[str] = [] - - config: dict[str, int | str] = {} - try: - with open(file_path) as config_file: - config = yaml.safe_load(config_file) - except Exception as ex: - logger.error( - "Unable to read the config file at %s. \ - Make sure path is correct", - file_path, - ) - raise ex - - store_boolean_arguments = [ - action.dest for action in self._actions if isinstance(action, StoreBoolean) - ] - - for key, value in config.items(): - if isinstance(value, bool) and key not in store_boolean_arguments: - if value: - processed_args.append("--" + key) - elif isinstance(value, list): - if value: - processed_args.append("--" + key) - for item in value: - processed_args.append(str(item)) - else: - processed_args.append("--" + key) - processed_args.append(str(value)) - - return processed_args - - class AtomicCounter: """An atomic, thread-safe counter""" diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py new file mode 100644 index 000000000000..0007c72f1e38 --- /dev/null +++ b/vllm/utils/argparse_utils.py @@ -0,0 +1,507 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Argument parsing utilities for vLLM.""" + +import json +import sys +import textwrap +from argparse import ( + Action, + ArgumentDefaultsHelpFormatter, + ArgumentParser, + ArgumentTypeError, + RawDescriptionHelpFormatter, + _ArgumentGroup, +) +from collections import defaultdict +from typing import TYPE_CHECKING, Any + +import regex as re +import yaml + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from argparse import Namespace +else: + Namespace = object + +logger = init_logger(__name__) + + +class StoreBoolean(Action): + def __call__(self, parser, namespace, values, option_string=None): + if values.lower() == "true": + setattr(namespace, self.dest, True) + elif values.lower() == "false": + setattr(namespace, self.dest, False) + else: + raise ValueError( + f"Invalid boolean value: {values}. Expected 'true' or 'false'." + ) + + +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter): + """SortedHelpFormatter that sorts arguments by their option strings.""" + + def _split_lines(self, text, width): + """ + 1. Sentences split across lines have their single newlines removed. + 2. Paragraphs and explicit newlines are split into separate lines. + 3. Each line is wrapped to the specified width (width of terminal). + """ + # The patterns also include whitespace after the newline + single_newline = re.compile(r"(? to the front, e,g: + # [Before] + # vllm serve -tp 2 --model --enforce-eager --port 8001 + # [After] + # vllm serve -tp 2 --enforce-eager --port 8001 + args = [ + "serve", + model_tag, + *args[1:model_idx], + *args[rest_start_idx:], + ] + except StopIteration: + pass + + if "--config" in args: + args = self._pull_args_from_config(args) + + def repl(match: re.Match) -> str: + """Replaces underscores with dashes in the matched string.""" + return match.group(0).replace("_", "-") + + # Everything between the first -- and the first . + pattern = re.compile(r"(?<=--)[^\.]*") + + # Convert underscores to dashes and vice versa in argument names + processed_args = list[str]() + for i, arg in enumerate(args): + if arg.startswith("--help="): + FlexibleArgumentParser._search_keyword = arg.split("=", 1)[-1].lower() + processed_args.append("--help") + elif arg.startswith("--"): + if "=" in arg: + key, value = arg.split("=", 1) + key = pattern.sub(repl, key, count=1) + processed_args.append(f"{key}={value}") + else: + key = pattern.sub(repl, arg, count=1) + processed_args.append(key) + elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": + # allow -O flag to be used without space, e.g. -O3 or -Odecode + # -O.<...> handled later + # also handle -O= here + mode = arg[3:] if arg[2] == "=" else arg[2:] + processed_args.append(f"-O.mode={mode}") + elif ( + arg == "-O" + and i + 1 < len(args) + and args[i + 1] in {"0", "1", "2", "3"} + ): + # Convert -O to -O.mode + processed_args.append("-O.mode") + else: + processed_args.append(arg) + + def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]: + """Creates a nested dictionary from a list of keys and a value. + + For example, `keys = ["a", "b", "c"]` and `value = 1` will create: + `{"a": {"b": {"c": 1}}}` + """ + nested_dict: Any = value + for key in reversed(keys): + nested_dict = {key: nested_dict} + return nested_dict + + def recursive_dict_update( + original: dict[str, Any], + update: dict[str, Any], + ) -> set[str]: + """Recursively updates a dictionary with another dictionary. + Returns a set of duplicate keys that were overwritten. + """ + duplicates = set[str]() + for k, v in update.items(): + if isinstance(v, dict) and isinstance(original.get(k), dict): + nested_duplicates = recursive_dict_update(original[k], v) + duplicates |= {f"{k}.{d}" for d in nested_duplicates} + elif isinstance(v, list) and isinstance(original.get(k), list): + original[k] += v + else: + if k in original: + duplicates.add(k) + original[k] = v + return duplicates + + delete = set[int]() + dict_args = defaultdict[str, dict[str, Any]](dict) + duplicates = set[str]() + for i, processed_arg in enumerate(processed_args): + if i in delete: # skip if value from previous arg + continue + + if processed_arg.startswith("-") and "." in processed_arg: + if "=" in processed_arg: + processed_arg, value_str = processed_arg.split("=", 1) + if "." not in processed_arg: + # False positive, '.' was only in the value + continue + else: + value_str = processed_args[i + 1] + delete.add(i + 1) + + if processed_arg.endswith("+"): + processed_arg = processed_arg[:-1] + value_str = json.dumps(list(value_str.split(","))) + + key, *keys = processed_arg.split(".") + try: + value = json.loads(value_str) + except json.decoder.JSONDecodeError: + value = value_str + + # Merge all values with the same key into a single dict + arg_dict = create_nested_dict(keys, value) + arg_duplicates = recursive_dict_update(dict_args[key], arg_dict) + duplicates |= {f"{key}.{d}" for d in arg_duplicates} + delete.add(i) + # Filter out the dict args we set to None + processed_args = [a for i, a in enumerate(processed_args) if i not in delete] + if duplicates: + logger.warning("Found duplicate keys %s", ", ".join(duplicates)) + + # Add the dict args back as if they were originally passed as JSON + for dict_arg, dict_value in dict_args.items(): + processed_args.append(dict_arg) + processed_args.append(json.dumps(dict_value)) + + return super().parse_args(processed_args, namespace) + + def check_port(self, value): + try: + value = int(value) + except ValueError: + msg = "Port must be an integer" + raise ArgumentTypeError(msg) from None + + if not (1024 <= value <= 65535): + raise ArgumentTypeError("Port must be between 1024 and 65535") + + return value + + def _pull_args_from_config(self, args: list[str]) -> list[str]: + """Method to pull arguments specified in the config file + into the command-line args variable. + + The arguments in config file will be inserted between + the argument list. + + example: + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + ```python + $: vllm {serve,chat,complete} "facebook/opt-12B" \ + --config config.yaml -tp 2 + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--config', 'config.yaml', + '-tp', '2' + ] + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--port', '12323', + '--tensor-parallel-size', '4', + '-tp', '2' + ] + ``` + + Please note how the config args are inserted after the sub command. + this way the order of priorities is maintained when these are args + parsed by super(). + """ + assert args.count("--config") <= 1, "More than one config file specified!" + + index = args.index("--config") + if index == len(args) - 1: + raise ValueError( + "No config file specified! \ + Please check your command-line arguments." + ) + + file_path = args[index + 1] + + config_args = self.load_config_file(file_path) + + # 0th index might be the sub command {serve,chat,complete,...} + # optionally followed by model_tag (only for serve) + # followed by config args + # followed by rest of cli args. + # maintaining this order will enforce the precedence + # of cli > config > defaults + if args[0].startswith("-"): + # No sub command (e.g., api_server entry point) + args = config_args + args[0:index] + args[index + 2 :] + elif args[0] == "serve": + model_in_cli = len(args) > 1 and not args[1].startswith("-") + model_in_config = any(arg == "--model" for arg in config_args) + + if not model_in_cli and not model_in_config: + raise ValueError( + "No model specified! Please specify model either " + "as a positional argument or in a config file." + ) + + if model_in_cli: + # Model specified as positional arg, keep CLI version + args = ( + [args[0]] + + [args[1]] + + config_args + + args[2:index] + + args[index + 2 :] + ) + else: + # No model in CLI, use config if available + args = [args[0]] + config_args + args[1:index] + args[index + 2 :] + else: + args = [args[0]] + config_args + args[1:index] + args[index + 2 :] + + return args + + def load_config_file(self, file_path: str) -> list[str]: + """Loads a yaml file and returns the key value pairs as a + flattened list with argparse like pattern + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + returns: + processed_args: list[str] = [ + '--port': '12323', + '--tensor-parallel-size': '4' + ] + """ + extension: str = file_path.split(".")[-1] + if extension not in ("yaml", "yml"): + raise ValueError( + f"Config file must be of a yaml/yml type. {extension} supplied" + ) + + # only expecting a flat dictionary of atomic types + processed_args: list[str] = [] + + config: dict[str, int | str] = {} + try: + with open(file_path) as config_file: + config = yaml.safe_load(config_file) + except Exception as ex: + logger.error( + "Unable to read the config file at %s. Check path correctness", + file_path, + ) + raise ex + + store_boolean_arguments = [ + action.dest for action in self._actions if isinstance(action, StoreBoolean) + ] + + for key, value in config.items(): + if isinstance(value, bool) and key not in store_boolean_arguments: + if value: + processed_args.append("--" + key) + elif isinstance(value, list): + if value: + processed_args.append("--" + key) + for item in value: + processed_args.append(str(item)) + else: + processed_args.append("--" + key) + processed_args.append(str(value)) + + return processed_args diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 2e8cd302b0f5..a928cce09011 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -16,8 +16,8 @@ import vllm.envs as envs from vllm.logger import logger from vllm.platforms import current_platform -from vllm.utils import cdiv from vllm.utils.import_utils import has_deep_gemm +from vllm.utils.math_utils import cdiv @functools.cache diff --git a/vllm/utils/math_utils.py b/vllm/utils/math_utils.py new file mode 100644 index 000000000000..bdfa5fd2cbcb --- /dev/null +++ b/vllm/utils/math_utils.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Math utility functions for vLLM.""" + + +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def next_power_of_2(n: int) -> int: + """The next power of 2 (inclusive)""" + if n < 1: + return 1 + return 1 << (n - 1).bit_length() + + +def prev_power_of_2(n: int) -> int: + """The previous power of 2 (inclusive)""" + if n <= 0: + return 0 + return 1 << (n.bit_length() - 1) + + +def round_up(x: int, y: int) -> int: + """Round up x to the nearest multiple of y.""" + return ((x + y - 1) // y) * y + + +def round_down(x: int, y: int) -> int: + """Round down x to the nearest multiple of y.""" + return (x // y) * y diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 720fbd2c15c5..1eac94940e78 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 029293d2f6dd..e71d4ca4629d 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -34,12 +34,12 @@ ) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -from vllm.utils import cdiv from vllm.utils.flashinfer import ( can_use_trtllm_attention, flashinfer_disable_q_quantization, use_trtllm_attention, ) +from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.attention.backends.utils import ( AttentionCGSupport, diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index ffea14ec63f8..e12cc581dd1a 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 7ca8501a8a6f..f9d2426eaf63 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( PAD_SLOT_ID, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index b920fd929e85..0ec157300419 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -220,8 +220,8 @@ UnquantizedLinearMethod, ) from vllm.platforms import current_platform -from vllm.utils import cdiv, round_down from vllm.utils.flashinfer import has_nvidia_artifactory +from vllm.utils.math_utils import cdiv, round_down from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 141436e66c32..bf8e4d5a6289 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -22,7 +22,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl from vllm.v1.attention.backends.utils import ( AttentionCGSupport, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index d935c02243bd..962cad927e6d 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import AttentionLayer from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd from vllm.config import VllmConfig -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 28085cb1424b..40a551787796 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -13,7 +13,7 @@ ) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv, next_power_of_2 +from vllm.utils.math_utils import cdiv, next_power_of_2 logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index cb5855548098..a0d354df06ca 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -21,7 +21,7 @@ from typing_extensions import runtime_checkable from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionImpl diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 584904daea8b..6e026215d402 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -12,8 +12,8 @@ from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv from vllm.utils.hashing import sha256_cbor +from vllm.utils.math_utils import cdiv from vllm.utils.mem_constants import GiB_bytes from vllm.v1.kv_cache_interface import ( ChunkedLocalAttentionSpec, diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 586034182686..6699fb9818cb 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -5,7 +5,7 @@ from collections import defaultdict from collections.abc import Sequence -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.kv_cache_interface import ( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 62faf590b23f..fd0a9b395e5f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -29,10 +29,11 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_value from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv +from vllm.utils import Device from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list from vllm.utils.func_utils import deprecate_kwargs +from vllm.utils.math_utils import cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 392519f8fa9a..0f564fdb3b08 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -10,7 +10,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import get_dtype_size logger = init_logger(__name__) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 9bf06d51609f..e041015e56e9 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -6,7 +6,7 @@ from vllm.distributed import get_dcp_group from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer logger = init_logger(__name__) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 31429fe699a2..6759fe630e62 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -70,12 +70,11 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import ( - cdiv, check_use_alibi, length_from_prompt_token_ids_or_embeds, - round_up, ) from vllm.utils.jsontree import json_map_leaves +from vllm.utils.math_utils import cdiv, round_up from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import DeviceMemoryProfiler from vllm.utils.platform_utils import is_pin_memory_available diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 18b857a64136..ce769e8575ff 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -53,7 +53,8 @@ from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask -from vllm.utils import LayerBlockType, cdiv, prev_power_of_2 +from vllm.utils import LayerBlockType +from vllm.utils.math_utils import cdiv, prev_power_of_2 from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.attention.backends.pallas import ( TPU_STR_DTYPE_TO_TORCH_DTYPE, diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index fae1f8e37b0c..f1885f9b34a1 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -25,7 +25,7 @@ from vllm.platforms import current_platform from vllm.platforms.tpu import USE_TPU_INFERENCE from vllm.tasks import SupportedTask -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec