Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/benchmark_block_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from benchmark_utils import TimeCollector
from tabulate import tabulate

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_long_document_qa_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_ngram_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_prefix_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

try:
from vllm.transformers_utils.tokenizer import get_tokenizer
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_prioritization.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


# Select a equi-probable random priority
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_serving_structured_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
from backend_request_func import get_tokenizer

try:
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/cutlass_benchmarks/sparse_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.math_utils import cdiv

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/bench_per_token_quant_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

batch_size_range = [1, 16, 32, 64, 128]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_bitblas.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_cutlass_fp4_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_cutlass_moe_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_device_communicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = init_logger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
fused_experts,
fused_topk,
)
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1",
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_machete.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
quantize_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_marlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
sort_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

FP8_DTYPE = current_platform.fp8_dtype()

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_moe_permute_unpermute.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

FP8_DTYPE = current_platform.fp8_dtype()

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_mrope.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_paged_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_reshape_and_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_reshape_and_cache_flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random_flash,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_rope.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def benchmark_rope_kernels_multi_lora(
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_trtllm_decode_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import flashinfer
import torch

from vllm.utils import round_up
from vllm.utils.math_utils import round_up

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_trtllm_prefill_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import flashinfer
import torch

from vllm.utils import round_up
from vllm.utils.math_utils import round_up

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_w8a8_block_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

mp.set_start_method("spawn", force=True)

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/graph_machete_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement

from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

if __name__ == "__main__":
parser = FlexibleArgumentParser(
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/overheads/benchmark_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pstats

from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/audio_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_parser():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_parser():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/encoder_decoder_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/llm_engine_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import argparse

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_test_prompts() -> list[tuple[str, SamplingParams]]:
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/load_sharded_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import dataclasses

from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/pooling/embed_matryoshka_fy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from argparse import Namespace

from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser


def parse_args():
Expand Down
Loading