Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/benchmark_ngram_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def invoke_main() -> None:
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501
)
parser.add_argument(
"--num-iteration",
type=int,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmark_serving_structured_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",
Expand Down
6 changes: 3 additions & 3 deletions csrc/cutlass_extensions/vllm_cutlass_library_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ class MixedInputKernelScheduleType(enum.Enum):
] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501
},
}
2 changes: 1 addition & 1 deletion examples/offline_inference/vision_language_pooling.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,9 @@ async def forward_request(self, url, data, use_chunked=True):
async with session.post(
url=url, json=data, headers=headers
) as response:
if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501
if 200 <= response.status < 300 or 400 <= response.status < 500:
if use_chunked:
async for chunk_bytes in response.content.iter_chunked( # noqa: E501
async for chunk_bytes in response.content.iter_chunked(
1024
):
yield chunk_bytes
Expand Down
46 changes: 0 additions & 46 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,52 +56,6 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# TEMPORARY! These ignores will be fixed forward
## Line length violations
"csrc/cutlass_extensions/vllm_cutlass_library_extension.py" = ["E501"]
"tests/compile/piecewise/test_simple.py" = ["E501"]
"tests/compile/piecewise/test_toy_llama.py" = ["E501", "B023"]
"tests/entrypoints/conftest.py" = ["E501"]
"tests/entrypoints/openai/test_audio.py" = ["E501"]
"tests/entrypoints/openai/test_chat.py" = ["E501"]
"tests/entrypoints/openai/test_chat_template.py" = ["E501"]
"tests/entrypoints/openai/test_chat_with_tool_reasoning.py" = ["E501"]
"tests/entrypoints/openai/test_completion_with_function_calling.py" = ["E501"]
"tests/entrypoints/openai/test_video.py" = ["E501"]
"tests/entrypoints/openai/test_vision.py" = ["E501"]
"tests/entrypoints/test_chat_utils.py" = ["E501"]
"tests/kernels/moe/modular_kernel_tools/common.py" = ["E501"]
"tests/models/language/generation/test_gemma.py" = ["E501"]
"tests/models/language/generation/test_mistral.py" = ["E501"]
"tests/models/multimodal/generation/test_ultravox.py" = ["E501"]
"tests/models/multimodal/generation/test_voxtral.py" = ["E501"]
"tests/models/multimodal/generation/vlm_utils/custom_inputs.py" = ["E501"]
"tests/tool_use/test_tool_choice_required.py" = ["E501"]
"tests/v1/attention/utils.py" = ["E501"]
"tests/v1/entrypoints/openai/responses/test_image.py" = ["E501"]
"tests/v1/kv_connector/nixl_integration/test_accuracy.py" = ["E501"]
"tests/v1/kv_connector/unit/test_offloading_connector.py" = ["E501"]
"tests/v1/logits_processors/test_custom_offline.py" = ["E501"]
"vllm/attention/ops/pallas_kv_cache_update.py" = ["E501"]
"vllm/compilation/collective_fusion.py" = ["E501"]
"vllm/compilation/wrapper.py" = ["E501"]
"vllm/config/vllm.py" = ["E501"]
"vllm/distributed/device_communicators/all2all.py" = ["E501"]
"vllm/entrypoints/openai/protocol.py" = ["E501"]
"vllm/lora/layers/vocal_parallel_embedding.py" = ["E501"]
"vllm/model_executor/model_loader/bitsandbytes_loader.py" = ["E501"]
"vllm/model_executor/models/bailing_moe.py" = ["E501"]
"vllm/model_executor/models/hyperclovax_vision.py" = ["E501"]
"vllm/model_executor/models/llama4_eagle.py" = ["E501"]
"vllm/model_executor/models/longcat_flash_mtp.py" = ["E501"]
"vllm/model_executor/models/phi4mm.py" = ["E501"]
"vllm/model_executor/models/qwen3_next.py" = ["E501"]
"vllm/model_executor/layers/quantization/ptpc_fp8.py" = ["E501"]
"vllm/v1/attention/backends/mla/common.py" = ["E501"]
"vllm/v1/engine/utils.py" = ["E501"]
"vllm/v1/utils.py" = ["E501"]
"vllm/v1/worker/gpu_model_runner.py" = ["E501"]
# End of temporary ignores

[tool.ruff.lint]
select = [
Expand Down
28 changes: 17 additions & 11 deletions tests/compile/piecewise/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,14 @@ def test_simple_piecewise_compile(use_inductor):
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
expected_num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
expected_num_piecewise_capturable_graphs_seen=3,
# num_piecewise_capturable_graphs_seen
expected_num_backend_compilations=3,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
expected_num_cudagraph_captured=6,
)


Expand All @@ -147,14 +151,16 @@ def test_simple_inductor_graph_partition(splitting_ops):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

_run_simple_model(
# inductor graph partition automatically resets splitting_ops
# to be an empty list
# Inductor graph partition automatically resets splitting_ops to an empty list
splitting_ops=splitting_ops,
use_inductor_graph_partition=True,
use_inductor=True,
expected_num_piecewise_graphs_seen=1, # since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1, # since not splitting at fx graph level
expected_num_backend_compilations=1, # since not splitting at fx graph level
expected_num_cudagraph_captured=6, # inductor graph partition still captures 6
# graph, same as fx graph partition.
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_piecewise_capturable_graphs_seen=1,
# Since not splitting at fx graph level
expected_num_backend_compilations=1,
# Inductor graph partition still captures 6 graph, same as fx graph partition
expected_num_cudagraph_captured=6,
)
14 changes: 9 additions & 5 deletions tests/compile/piecewise/test_toy_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,11 +367,14 @@ def test_toy_llama(use_inductor: bool):
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}

with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model
# One graph for the model
num_graphs_seen=1,
num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1,
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
# num_piecewise_capturable_graphs_seen
num_backend_compilations=1,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2,
**kwargs,
):
outputs.append(
Expand Down Expand Up @@ -478,9 +481,10 @@ def benchmark():
# it is fine here, because we only use the lambda function once.
runtime = do_bench(
lambda: graphs[b][0]( # noqa
input_ids[:b], positions[:b]
input_ids[:b], # noqa
positions[:b], # noqa
)
) # noqa
)
piecewise_cudagraph_time[b] = runtime
else:
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_functionalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
# check if the functionalization pass is applied
for op in model.ops_in_model(do_fusion):
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

# make sure the ops were all de-functionalized
found = dict()
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_fusion_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def test_attention_quant_pattern(
elif quant_key.dtype == FP4_DTYPE:
assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, (
"Attention should have output_block_scale after FP4 fusion"
) # noqa: E501
)

# Check that results are close
torch.testing.assert_close(result_unfused, result_fused_1, atol=1e-2, rtol=1e-2)
4 changes: 2 additions & 2 deletions tests/compile/test_sequence_parallelism.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def ops_in_model(self):
):
# If fusion happens, the fused op is the one
# we check for (de)functionalization
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] # noqa: E501
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else:
# If no fusion, the original ops are checked
return [
Expand Down Expand Up @@ -322,7 +322,7 @@ def sequence_parallelism_pass_on_test_model(
# check if the functionalization pass is applied
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None

# make sure the ops were all de-functionalized
found = dict()
Expand Down
8 changes: 4 additions & 4 deletions tests/distributed/test_pipeline_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def iter_params(self, model_id: str):
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
Expand Down Expand Up @@ -138,7 +138,7 @@ def iter_params(self, model_id: str):
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
Expand All @@ -151,13 +151,13 @@ def iter_params(self, model_id: str):
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(
multi_node_only=True, load_format="dummy"
), # noqa: E501
),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"),
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
Expand Down
3 changes: 2 additions & 1 deletion tests/entrypoints/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def sample_complex_json_schema():
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-z]{1,10}$", # Combining length and pattern restrictions
# Combining length and pattern restrictions
"pattern": "^[a-z]{1,10}$",
},
},
},
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ async def test_single_chat_session_audio_base64encoded(
{
"type": "audio_url",
"audio_url": {
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's happening in this audio?"},
Expand Down
13 changes: 7 additions & 6 deletions tests/entrypoints/openai/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,17 +835,18 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI):

@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
content = [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
]
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "what is 1+1? please provide the result without any other text.",
}
],
"content": content,
}
],
temperature=0,
Expand Down
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_chat_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def test_load_chat_template():
assert (
template_content
== """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
) # noqa: E501
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
)


def test_no_load_chat_template_filelike():
Expand Down
10 changes: 6 additions & 4 deletions tests/entrypoints/openai/test_chat_with_tool_reasoning.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,13 @@ async def client(server):
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
Expand All @@ -69,7 +70,8 @@ async def client(server):
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
"content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
},
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'Vienna'",
"description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"unit": {
"type": "string",
Expand Down Expand Up @@ -85,12 +87,14 @@
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. 'Vienna'",
"description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. 'Austria'",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"days": {
"type": "integer",
Expand Down
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ async def test_single_chat_session_video_base64encoded(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
Expand Down Expand Up @@ -238,7 +238,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
Expand Down
Loading