Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ repos:
entry: tools/pre_commit/shellcheck.sh
language: script
types: [shell]
exclude: '^(\.buildkite/scripts/run-multi-node-test\.sh|tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test\.sh)$'
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/pre_commit/png-lint.sh
Expand Down
2 changes: 1 addition & 1 deletion .shellcheckrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# SC2155 (warning): Declare and assign separately to avoid masking return values.
# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
#
disable=SC1091,SC2004,SC2129,SC2155,SC2164
disable=SC1091,SC2004,SC2129,SC2155,SC2164,SC2089,SC2090,SC2086,SC2046,SC2048,SC2206
9 changes: 5 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0/12.1 (e.g. RTX 50x0, GB10)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}")
# marlin arches for other files
cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")

Expand Down Expand Up @@ -781,6 +781,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
target_compile_definitions(${VLLM_EXT_NAME} PRIVATE ENABLE_DSV3_FUSED_A_GEMM)
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
Expand Down Expand Up @@ -1049,8 +1050,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0/12.1 (e.g. RTX 50x0, GB10)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}")
# moe marlin arches for other files
cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_OTHER_ARCHS)
Expand Down
4 changes: 3 additions & 1 deletion csrc/torch_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

// Quantization ops
#ifndef USE_ROCM
#ifdef ENABLE_DSV3_FUSED_A_GEMM
// DeepSeek V3 fused A GEMM (SM 9.0+, bf16 only, 1-16 tokens).
ops.def(
"dsv3_fused_a_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
// conditionally compiled so impl registration is in source file
#endif

// Quantized GEMM for AWQ.
ops.def(
Expand Down
26 changes: 25 additions & 1 deletion tests/evals/gsm8k/gsm8k_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,16 @@ async def call_vllm_api(
stop: list[str] | None = None,
url: str | None = None,
seed: int | None = None,
model: str = "gpt-oss-120b",
api_key: str | None = None,
) -> tuple[str, int]:
"""Call vLLM's OpenAI-compatible completions endpoint.

Returns:
Tuple of (response_text, completion_tokens)
"""
data = {
"model": model,
"prompt": prompt,
"temperature": temperature,
"max_tokens": max_tokens,
Expand All @@ -98,8 +101,14 @@ async def call_vllm_api(
if seed is not None:
data["seed"] = seed

headers = {}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"

try:
async with session.post(f"{url}/v1/completions", json=data) as response:
async with session.post(
f"{url}/v1/completions", json=data, headers=headers
) as response:
response.raise_for_status()
result = await response.json()
text = result["choices"][0]["text"]
Expand Down Expand Up @@ -177,6 +186,8 @@ def evaluate_gsm8k(
port: int = 8000,
temperature: float = 0.0,
seed: int | None = 42,
model: str = "gpt-oss-120b",
api_key: str | None = None,
) -> dict[str, float | int]:
"""
Evaluate GSM8K accuracy using vLLM serve endpoint.
Expand All @@ -200,6 +211,8 @@ async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]:
stop=["Question", "Assistant:", "<|separator|>"],
url=base_url,
seed=seed,
model=model,
api_key=api_key,
)
states[i] = answer
output_tokens[i] = tokens
Expand Down Expand Up @@ -281,6 +294,15 @@ def main() -> None:
"--seed", type=int, default=42, help="Random seed for reproducibility"
)
parser.add_argument("--save-results", type=str, help="Save results to JSON file")
parser.add_argument(
"--model", type=str, default="gpt-oss-120b", help="Model name to query"
)
parser.add_argument(
"--api-key",
type=str,
default=os.environ.get("VLLM_API_KEY"),
help="API key for vLLM server (defaults to $VLLM_API_KEY)",
)

args = parser.parse_args()

Expand All @@ -292,6 +314,8 @@ def main() -> None:
port=args.port,
temperature=args.temperature,
seed=args.seed,
model=args.model,
api_key=args.api_key,
)

# Print results to terminal
Expand Down
28 changes: 25 additions & 3 deletions vllm/_custom_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,11 @@ def _ggml_moe_a8_vec_fake(

# cutlass
def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
    """Return whether the compiled CUTLASS FP4 scaled-mm kernels support
    the given device.

    Args:
        cuda_device_capability: CUDA compute capability value forwarded
            to the compiled ``_C`` op (encoding defined by the extension).

    Returns:
        True if the extension reports FP4 support for this capability;
        False when the op was not compiled into this build.
    """
    try:
        return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
    except AttributeError:
        # The custom op is absent when vLLM was built without CUTLASS FP4
        # support; degrade gracefully instead of crashing at query time.
        logger.warning("CUTLASS FP4 ops not available - was vLLM built correctly?")
        return False


def cutlass_scaled_fp4_mm(
Expand All @@ -789,11 +793,21 @@ def cutlass_scaled_fp4_mm(


def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
    """Return whether the compiled CUTLASS FP8 scaled-mm kernels support
    the given device.

    Args:
        cuda_device_capability: CUDA compute capability value forwarded
            to the compiled ``_C`` op (encoding defined by the extension).

    Returns:
        True if the extension reports FP8 support for this capability;
        False when the op was not compiled into this build.
    """
    try:
        return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
    except AttributeError:
        # Op missing from the compiled extension; report unsupported.
        logger.warning("CUTLASS FP8 ops not available - was vLLM built correctly?")
        return False


def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
    """Return whether the compiled CUTLASS block-FP8 scaled-mm kernels
    support the given device.

    Args:
        cuda_device_capability: CUDA compute capability value forwarded
            to the compiled ``_C`` op (encoding defined by the extension).

    Returns:
        True if the extension reports block-FP8 support for this
        capability; False when the op was not compiled into this build.
    """
    try:
        return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
    except AttributeError:
        # Op missing from the compiled extension; report unsupported.
        logger.warning(
            "CUTLASS block FP8 ops not available - was vLLM built correctly?"
        )
        return False


def cutlass_scaled_mm(
Expand Down Expand Up @@ -876,6 +890,14 @@ def cutlass_scaled_mm_azp(
return out.view(*target_shape)


def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
    """Return whether the compiled CUTLASS sparse scaled-mm kernels
    support the given device.

    Args:
        cuda_device_capability: CUDA compute capability value forwarded
            to the compiled ``_C`` op (encoding defined by the extension).

    Returns:
        True if the extension reports sparse scaled-mm support for this
        capability; False when the op was not compiled into this build.
    """
    try:
        return torch.ops._C.cutlass_sparse_scaled_mm_supported(cuda_device_capability)
    except AttributeError:
        # Op missing from the compiled extension; report unsupported.
        logger.warning("CUTLASS sparse ops not available - was vLLM built correctly?")
        return False


def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
if cuda_device_capability < 90 or cuda_device_capability >= 110:
return False
Expand Down
8 changes: 5 additions & 3 deletions vllm/compilation/passes/fusion/matcher_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@
# Register fused-quantization pattern ops only when running on CUDA AND the
# corresponding custom op was actually compiled into the _C extension; the
# hasattr guards keep this module importable on builds without the kernels.
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
    QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.out  # noqa: E501

if current_platform.is_cuda() and hasattr(torch.ops._C, "per_token_group_fp8_quant"):
    QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501
    QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default  # noqa: E501

# None when this build lacks the fused silu_and_mul kernel; consumers are
# expected to check for None before enabling matching against it.
SILU_MUL_OP = (
    torch.ops._C.silu_and_mul.default if hasattr(torch.ops._C, "silu_and_mul") else None
)


class MatcherCustomOp(ABC):
Expand Down Expand Up @@ -448,7 +450,7 @@ def inputs(self) -> list[torch.Tensor]:
class MatcherSiluAndMul(MatcherCustomOp):
def __init__(self, enabled: bool | None = None) -> None:
    # When the caller does not force a value, default to the op's own
    # enablement state — but only if the fused silu_and_mul kernel is
    # actually present in this build (SILU_MUL_OP is None otherwise).
    if enabled is None:
        enabled = SiluAndMul.enabled() and SILU_MUL_OP is not None
    super().__init__(enabled)

def inputs(self) -> list[torch.Tensor]:
Expand Down
11 changes: 10 additions & 1 deletion vllm/entrypoints/openai/chat_completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import partial_json_parser
import regex as re
from fastapi import Request
from openai_harmony import HarmonyError
from partial_json_parser.core.options import Allow

from vllm.engine.protocol import EngineClient
Expand Down Expand Up @@ -708,7 +709,15 @@ async def chat_completion_stream_generator(
# Track accumulated content per token with their state
token_states: list[TokenState] = []
for token_id in output.token_ids:
harmony_parser.process(token_id)
try:
harmony_parser.process(token_id)
except HarmonyError as e:
logger.warning(
"HarmonyError in stream generator, "
"returning partial result: %s",
e,
)
break
token_delta = harmony_parser.last_content_delta or ""
token_states.append(
TokenState(
Expand Down
34 changes: 30 additions & 4 deletions vllm/entrypoints/serve/render/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,23 +386,49 @@ def _make_request_with_harmony(
assert not self.supports_code_interpreter
if (reasoning_effort := request.reasoning_effort) == "none":
raise ValueError(f"Harmony does not support {reasoning_effort=}")

# Extract client-provided system message content so it can be
# passed as structured instructions rather than appended as a raw
# system message (which the Harmony parser cannot handle).
non_system_messages = []
system_instructions_parts: list[str] = []
for msg in request.messages:
msg_dict = (
msg if isinstance(msg, dict) else msg.model_dump(exclude_none=True)
)
if msg_dict.get("role") == "system":
content = msg_dict.get("content") or ""
if isinstance(content, list):
content = "".join(
c.get("text", "")
for c in content
if isinstance(c, dict) and c.get("type") == "text"
)
if content:
system_instructions_parts.append(content)
else:
non_system_messages.append(msg)
instructions = "\n".join(system_instructions_parts) or None

sys_msg = get_system_message(
reasoning_effort=reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=should_include_tools,
instructions=instructions,
)
messages.append(sys_msg)

# Add developer message.
if request.tools:
if request.tools or instructions:
dev_msg = get_developer_message(
tools=request.tools if should_include_tools else None # type: ignore[arg-type]
instructions=instructions,
tools=request.tools if should_include_tools else None, # type: ignore[arg-type]
)
messages.append(dev_msg)

# Add user message.
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
# Add user message (system messages already extracted above).
messages.extend(parse_chat_inputs_to_harmony_messages(non_system_messages))

# Render prompt token ids.
prompt_token_ids = render_for_completion(messages)
Expand Down
Loading
Loading