diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 1ef29f36cec0..f25eae2400c5 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -73,3 +73,29 @@ steps: num_devices: 2 commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt + +- label: GPQA Eval (GPT-OSS) (H100) + timeout_in_minutes: 120 + device: h100 + optional: true + num_devices: 2 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt + +- label: GPQA Eval (GPT-OSS) (B200) + timeout_in_minutes: 120 + device: b200 + optional: true + num_devices: 2 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 5c5a9dbcbb69..69390cd6d373 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -153,33 +153,6 @@ steps: - pytest -v -s transformers_utils - pytest -v -s config -- label: GPT-OSS Eval (H100) - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - device: h100 - optional: true - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: GPT-OSS Eval (B200) - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - device: b200 - optional: true - 
source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - - label: Batch Invariance (H100) timeout_in_minutes: 25 device: h100 diff --git a/tests/evals/gpt_oss/README.md b/tests/evals/gpt_oss/README.md new file mode 100644 index 000000000000..98c0098bbd28 --- /dev/null +++ b/tests/evals/gpt_oss/README.md @@ -0,0 +1,49 @@ +# GPQA Evaluation using GPT-OSS + +This directory contains GPQA evaluation tests using the GPT-OSS evaluation package and vLLM server. + +## Usage + +### Run tests with pytest (like buildkite) + +```bash +# H200 +pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \ + --config-list-file=configs/models-h200.txt + +# B200 +pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \ + --config-list-file=configs/models-b200.txt +``` + +## Configuration Format + +Model configs in `configs/` directory use this YAML format: + +```yaml +model_name: "openai/gpt-oss-20b" +metric_threshold: 0.568 # Minimum expected accuracy +reasoning_effort: "low" # Reasoning effort level (default: "low") +server_args: "--tensor-parallel-size 2" # Server arguments +startup_max_wait_seconds: 1800 # Max wait for server startup (default: 1800) +env: # Environment variables (optional) + SOME_VAR: "value" +``` + +The `server_args` field accepts any arguments that can be passed to `vllm serve`. + +The `env` field accepts a dictionary of environment variables to set for the server process. + +## Adding New Models + +1. Create a new YAML config file in the `configs/` directory +2. 
Add the filename to the appropriate `models-*.txt` file + +## Tiktoken Encoding Files + +The tiktoken encoding files required by the vLLM server are automatically downloaded from OpenAI's public blob storage on first run: + +- `cl100k_base.tiktoken` +- `o200k_base.tiktoken` + +Files are cached in the `data/` directory. The `TIKTOKEN_ENCODINGS_BASE` environment variable is automatically set to point to this directory when running evaluations. diff --git a/tests/evals/gpt_oss/configs/gpt-oss-20b-baseline.yaml b/tests/evals/gpt_oss/configs/gpt-oss-20b-baseline.yaml new file mode 100644 index 000000000000..1df1cc93e47c --- /dev/null +++ b/tests/evals/gpt_oss/configs/gpt-oss-20b-baseline.yaml @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +model_name: "openai/gpt-oss-20b" +metric_threshold: 0.568 +reasoning_effort: "low" +server_args: "--tensor-parallel-size 2" diff --git a/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-bf16.yaml b/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-bf16.yaml new file mode 100644 index 000000000000..952f7e870357 --- /dev/null +++ b/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-bf16.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +model_name: "openai/gpt-oss-20b" +metric_threshold: 0.568 +reasoning_effort: "low" +server_args: "--tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: "1" diff --git a/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-mxfp8-cutlass.yaml b/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-mxfp8-cutlass.yaml new file mode 100644 index 000000000000..23ec14819ef4 --- /dev/null +++ b/tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-mxfp8-cutlass.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +model_name: 
"openai/gpt-oss-20b" +metric_threshold: 0.568 +reasoning_effort: "low" +server_args: "--tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: "1" diff --git a/tests/evals/gpt_oss/configs/gpt-oss-20b-marlin.yaml b/tests/evals/gpt_oss/configs/gpt-oss-20b-marlin.yaml new file mode 100644 index 000000000000..97e97fd19a6b --- /dev/null +++ b/tests/evals/gpt_oss/configs/gpt-oss-20b-marlin.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +model_name: "openai/gpt-oss-20b" +metric_threshold: 0.568 +reasoning_effort: "low" +server_args: "--tensor-parallel-size 2" +env: + VLLM_MXFP4_USE_MARLIN: "1" diff --git a/tests/evals/gpt_oss/configs/gpt-oss-20b-sm100-fi-mxfp4-mxfp8-trtllm.yaml b/tests/evals/gpt_oss/configs/gpt-oss-20b-sm100-fi-mxfp4-mxfp8-trtllm.yaml new file mode 100644 index 000000000000..4cea743490f7 --- /dev/null +++ b/tests/evals/gpt_oss/configs/gpt-oss-20b-sm100-fi-mxfp4-mxfp8-trtllm.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +model_name: "openai/gpt-oss-20b" +metric_threshold: 0.568 +reasoning_effort: "low" +server_args: "--tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: "1" diff --git a/tests/evals/gpt_oss/configs/models-b200.txt b/tests/evals/gpt_oss/configs/models-b200.txt new file mode 100644 index 000000000000..8519109e192a --- /dev/null +++ b/tests/evals/gpt_oss/configs/models-b200.txt @@ -0,0 +1,5 @@ +# B200 model configurations for GPQA evaluation +# Tests different environment variable combinations +gpt-oss-20b-flashinfer-mxfp4-bf16.yaml +gpt-oss-20b-flashinfer-mxfp4-mxfp8-cutlass.yaml +gpt-oss-20b-sm100-fi-mxfp4-mxfp8-trtllm.yaml \ No newline at end of file diff --git a/tests/evals/gpt_oss/configs/models-h100.txt b/tests/evals/gpt_oss/configs/models-h100.txt new file mode 100644 index 000000000000..9577bac5f1d4 --- /dev/null +++ 
b/tests/evals/gpt_oss/configs/models-h100.txt @@ -0,0 +1,5 @@ +# H100 model configurations for GPQA evaluation +# Tests different environment variable combinations +gpt-oss-20b-baseline.yaml +gpt-oss-20b-flashinfer-mxfp4-bf16.yaml +gpt-oss-20b-marlin.yaml diff --git a/tests/evals/gpt_oss/conftest.py b/tests/evals/gpt_oss/conftest.py index 2f140ae2c8e9..d35dec4831a3 100644 --- a/tests/evals/gpt_oss/conftest.py +++ b/tests/evals/gpt_oss/conftest.py @@ -4,13 +4,61 @@ Pytest configuration for GPT-OSS evaluation tests. """ +from pathlib import Path + def pytest_addoption(parser): - """Add command line options for pytest.""" - parser.addoption("--model", action="store", help="Model name to evaluate") - parser.addoption( - "--metric", action="store", type=float, help="Expected metric threshold" - ) + """Add custom command line options.""" parser.addoption( - "--server-args", action="store", default="", help="Additional server arguments" + "--config-list-file", + required=True, + help="File containing list of config files to test", ) + + +def pytest_generate_tests(metafunc): + """Generate test parameters from config files.""" + if "config_filename" in metafunc.fixturenames: + config_list_file = metafunc.config.getoption("--config-list-file") + + # Handle both relative and absolute paths + config_list_path = Path(config_list_file) + if not config_list_path.is_absolute(): + # If relative, try relative to test directory first + test_dir_path = Path(__file__).parent / config_list_file + if test_dir_path.exists(): + config_list_path = test_dir_path + else: + # Try relative to current working directory + config_list_path = Path.cwd() / config_list_file + + print(f"Looking for config list at: {config_list_path}") + + config_files = [] + if config_list_path.exists(): + # Determine config directory (same directory as the list file) + config_dir = config_list_path.parent + + with open(config_list_path) as f: + for line in f: + line = line.strip() + if line and not 
line.startswith("#"): + config_path = config_dir / line + print(f"Checking config file: {config_path}") + if config_path.exists(): + config_files.append(config_path) + print(f" Found: {config_path}") + else: + print(f" Missing: {config_path}") + else: + print(f"Config list file not found: {config_list_path}") + + # Generate test parameters + if config_files: + metafunc.parametrize( + "config_filename", + config_files, + ids=[config_file.stem for config_file in config_files], + ) + else: + print("No config files found, test will be skipped") diff --git a/tests/evals/gpt_oss/test_gpqa_correctness.py b/tests/evals/gpt_oss/test_gpqa_correctness.py index 151deaa059f0..63188ec40767 100644 --- a/tests/evals/gpt_oss/test_gpqa_correctness.py +++ b/tests/evals/gpt_oss/test_gpqa_correctness.py @@ -5,22 +5,48 @@ Usage: pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py \ - --model openai/gpt-oss-20b \ - --metric 0.58 \ - --server-args "--tensor-parallel-size 2" + --config-list-file=configs/models-h100.txt """ +import os +import shlex import subprocess import sys +import urllib.request +from pathlib import Path import regex as re +import yaml from tests.utils import RemoteOpenAIServer TOL = 0.05 # Absolute tolerance for accuracy comparison +# Path to tiktoken encoding files +TIKTOKEN_DATA_DIR = Path(__file__).parent / "data" -def run_gpqa_eval(model_name: str, base_url: str) -> float: +# Tiktoken encoding files to download +TIKTOKEN_FILES = { + "cl100k_base.tiktoken": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", + "o200k_base.tiktoken": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", +} + + +def ensure_tiktoken_files(): + """Download tiktoken encoding files if they don't exist.""" + TIKTOKEN_DATA_DIR.mkdir(parents=True, exist_ok=True) + + for filename, url in TIKTOKEN_FILES.items(): + filepath = TIKTOKEN_DATA_DIR / filename + if not filepath.exists(): + print(f"Downloading {filename} from {url}...") + 
urllib.request.urlretrieve(url, filepath) + print(f" Downloaded to {filepath}") + else: + print(f" {filename} already exists.") + + +def run_gpqa_eval(model_name: str, base_url: str, reasoning_effort: str) -> float: """Run GPQA evaluation using the gpt-oss evaluation package.""" # Build the command to run the evaluation @@ -33,7 +59,7 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float: "--model", model_name, "--reasoning-effort", - "low", + reasoning_effort, "--base-url", base_url, "--n-threads", @@ -41,16 +67,29 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float: ] try: + # Set up environment for the evaluation subprocess + # Inherit current environment and add required variables + eval_env = os.environ.copy() + eval_env["OPENAI_API_KEY"] = "dummy" + # Run the evaluation result = subprocess.run( cmd, text=True, capture_output=True, timeout=1800, # 30 minute timeout - env={"OPENAI_API_KEY": "dummy"}, + env=eval_env, ) - print("Evaluation process output:\n", result.stdout) + print("Evaluation process stdout:\n", result.stdout) + print("Evaluation process stderr:\n", result.stderr) + print(f"Evaluation process return code: {result.returncode}") + + if result.returncode != 0: + raise RuntimeError( + f"Evaluation failed with exit code {result.returncode}:\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) # Parse the output to extract the score match = re.search(r"'metric':\s*([\d.]+)", result.stdout) @@ -64,47 +103,62 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float: except subprocess.TimeoutExpired as e: raise RuntimeError("Evaluation timed out") from e - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"Evaluation failed with exit code {e.returncode}:\n" - f"stdout: {e.stdout}\nstderr: {e.stderr}" - ) from e -def test_gpqa_correctness(request): - """Test GPQA correctness for GPT-OSS model.""" +def test_gpqa_correctness(config_filename): + """Test GPQA correctness for a given model configuration.""" + # 
Ensure tiktoken files are downloaded + ensure_tiktoken_files() + + # Verify tiktoken files exist + for filename in TIKTOKEN_FILES: + filepath = TIKTOKEN_DATA_DIR / filename + assert filepath.exists(), f"Tiktoken file not found: {filepath}" - # Get command line arguments - model_name = request.config.getoption("--model") - expected_metric = request.config.getoption("--metric") - server_args_str = request.config.getoption("--server-args") + eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) - # Parse server arguments - server_args = [] - if server_args_str: - server_args = server_args_str.split() + # Parse server arguments from config (use shlex to handle quoted strings) + server_args_str = eval_config.get("server_args", "") + server_args = shlex.split(server_args_str) if server_args_str else [] # Add standard server arguments server_args.extend( [ "--trust-remote-code", + "--enforce-eager", + "--disable-uvicorn-access-log", ] ) - print(f"Starting GPQA evaluation for model: {model_name}") - print(f"Expected metric threshold: {expected_metric}") + # Build server environment with tiktoken path and any config-specified vars + server_env = {"TIKTOKEN_ENCODINGS_BASE": str(TIKTOKEN_DATA_DIR)} + if eval_config.get("env"): + server_env.update(eval_config["env"]) + + reasoning_effort = eval_config.get("reasoning_effort", "low") + + print(f"Starting GPQA evaluation for model: {eval_config['model_name']}") + print(f"Expected metric threshold: {eval_config['metric_threshold']}") + print(f"Reasoning effort: {reasoning_effort}") print(f"Server args: {' '.join(server_args)}") + print(f"Server environment variables: {server_env}") # Launch server and run evaluation with RemoteOpenAIServer( - model_name, server_args, max_wait_seconds=1800 + eval_config["model_name"], + server_args, + env_dict=server_env, + max_wait_seconds=eval_config.get("startup_max_wait_seconds", 1800), ) as remote_server: base_url = remote_server.url_for("v1") print(f"Server started at: 
{base_url}") - measured_metric = run_gpqa_eval(model_name, base_url) + measured_metric = run_gpqa_eval( + eval_config["model_name"], base_url, reasoning_effort + ) + expected_metric = eval_config["metric_threshold"] - print(f"GPQA Results for {model_name}:") + print(f"GPQA Results for {eval_config['model_name']}:") print(f" Measured metric: {measured_metric:.4f}") print(f" Expected metric: {expected_metric:.4f}") print(f" Tolerance: {TOL:.4f}") @@ -115,4 +169,4 @@ def test_gpqa_correctness(request): f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}" ) - print(f"✅ GPQA test passed for {model_name}") + print(f"GPQA test passed for {eval_config['model_name']}") diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b6b8a17aea63..22e71d39101f 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -242,6 +242,10 @@ def __post_init__(self): def quant_dtype(self) -> torch.dtype | str | None: return self._a1.dtype + @property + def weight_quant_dtype(self) -> torch.dtype | str | None: + return self._w1.dtype + @property def is_quantized(self) -> bool: return self.quant_dtype is not None diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 4ec76ee9820c..b9566a3a921a 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -4,6 +4,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( @@ -18,6 +19,8 @@ kFp8Dynamic128Sym, kFp8Static128BlockSym, kFp8StaticTensorSym, + kMxfp4Static, + kMxfp8Dynamic, kNvfp4Dynamic, 
kNvfp4Static, ) @@ -64,10 +67,18 @@ def __init__( quant_config: FusedMoEQuantConfig, ): super().__init__(moe_config, quant_config) - assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn, None), ( - "Only nvfp4, fp8, bfloat16 and" + + assert quant_config.weight_quant_dtype in ( + "mxfp4", + "nvfp4", + torch.float8_e4m3fn, + None, + ), ( + "Only mxfp4, nvfp4, fp8, bfloat16 and" " float16 quantization are currently supported." ) + self.device = moe_config.device + self.num_experts = moe_config.num_local_experts self.ep_rank = moe_config.moe_parallel_config.ep_rank self.ep_size = moe_config.moe_parallel_config.ep_size self.tp_rank = moe_config.moe_parallel_config.tp_rank @@ -78,6 +89,28 @@ def __init__( # - pass per-block weight scales to the kernel # - skip input activation quantization (kernel applies scaling) self.use_deepseek_fp8_block_scale = quant_config.is_block_quantized + self.max_capture_size = ( + get_current_vllm_config().compilation_config.max_cudagraph_capture_size + ) + + if quant_config.weight_quant_dtype == "mxfp4": + # This value is used specifically for gpt-oss, + # Need to revisit this for other models + self.gemm1_alpha = torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32, device=self.device + ) + self.gemm1_beta = torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32, device=self.device + ) + self.gemm1_clamp_limit = torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32, device=self.device + ) + if quant_config.quant_dtype == "mxfp8": + self.fake_input_scale = torch.ones( + self.num_experts, + device=self.device, + dtype=torch.float32, + ) @property def expects_unquantized_inputs(self) -> bool: @@ -119,20 +152,33 @@ def _supports_quant_scheme( ] and p.has_device_capability(90) ) - # fp8 block-scale on 9.0 + # fp8 block-scale, wmxfp4a16 on 9.0 or ( - scheme == (kFp8Static128BlockSym, kFp8Dynamic128Sym) + scheme + in [ + (kMxfp4Static, None), + (kFp8Static128BlockSym, kFp8Dynamic128Sym), + ] and 
p.is_device_capability(90) ) - # nvfp4 on 10.0+ + # nvfp4, wmxfp4amxfp8 on 10.0+ or ( - scheme == (kNvfp4Static, kNvfp4Dynamic) and p.has_device_capability(100) + scheme + in [ + (kMxfp4Static, kMxfp8Dynamic), + (kNvfp4Static, kNvfp4Dynamic), + ] + and p.has_device_capability(100) ) ) @staticmethod def _supports_activation(activation: MoEActivation) -> bool: - return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] + return activation in [ + MoEActivation.SILU, + MoEActivation.RELU2_NO_MUL, + MoEActivation.SWIGLUOAI, + ] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -216,12 +262,23 @@ def apply( activation_str_to_value_map = { MoEActivation.SILU: ActivationType.Swiglu, # This is the default + MoEActivation.SWIGLUOAI: ActivationType.Swiglu, # gpt-oss alias MoEActivation.RELU2_NO_MUL: ActivationType.Relu2, } assert activation in activation_str_to_value_map, ( f"{activation=} missing from {activation_str_to_value_map.keys()=}" ) + quant_scales = None + fc1_expert_weights = None + fc2_expert_weights = None + fc1_expert_biases = None + fc2_expert_biases = None + swiglu_alpha = None + swiglu_beta = None + swiglu_limit = None + use_mxfp8_act_scaling = False + use_w4_group_scaling = False # Select quantization metadata based on FP8 format/path if ( self.quant_dtype == torch.float8_e4m3fn @@ -256,6 +313,43 @@ def apply( # FlashInfer API requires weight to be long for nvfp4 fc1_expert_weights = w1.view(torch.long) fc2_expert_weights = w2.view(torch.long) + elif self.weight_quant_dtype == "mxfp4": + assert self.w1_scale is not None and self.w2_scale is not None + assert w1.is_contiguous() and w2.is_contiguous() + assert self.gemm1_alpha is not None + assert self.gemm1_beta is not None + assert self.gemm1_clamp_limit is not None + assert topk_ids.is_contiguous() + + fc1_expert_biases = self.w1_bias + fc2_expert_biases = self.w2_bias + swiglu_alpha = self.gemm1_alpha + swiglu_beta = self.gemm1_beta + 
swiglu_limit = self.gemm1_clamp_limit + + if self.quant_dtype == "mxfp8": + assert self.fake_input_scale is not None + fc1_expert_weights = w1.view(torch.long) + fc2_expert_weights = w2.view(torch.long) + + quant_scales = [ + self.w1_scale.view(torch.int32), + self.fake_input_scale, + self.w2_scale.view(torch.int32), + self.fake_input_scale, + ] + use_mxfp8_act_scaling = True + else: + assert hidden_states.dtype == torch.bfloat16 + fc1_expert_weights = w1 + fc2_expert_weights = w2 + quant_scales = [ + self.w1_scale, + self.w2_scale, + ] + a1q_scale = None + use_w4_group_scaling = True + elif self.use_deepseek_fp8_block_scale: # FP8 block-scale path: provide block-scale weights, omit a1q_scale quant_scales = [ @@ -277,6 +371,12 @@ def apply( token_final_scales=topk_weights, fc1_expert_weights=fc1_expert_weights, fc2_expert_weights=fc2_expert_weights, + fc1_expert_biases=fc1_expert_biases, + fc2_expert_biases=fc2_expert_biases, + swiglu_alpha=swiglu_alpha, + swiglu_beta=swiglu_beta, + swiglu_limit=swiglu_limit, + output=output, output_dtype=self.out_dtype, quant_scales=quant_scales, input_sf=a1q_scale, @@ -284,10 +384,12 @@ def apply( tp_rank=self.tp_rank, ep_size=self.ep_size, ep_rank=self.ep_rank, - output=output, activation_type=activation_str_to_value_map[activation], # Informs FlashInfer to use the block-scale decoding path when True use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, + use_mxfp8_act_scaling=use_mxfp8_act_scaling, + use_w4_group_scaling=use_w4_group_scaling, + tune_max_num_tokens=max(self.max_capture_size, 1), ) def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None: diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index b4ceaa379f01..c2c0e809d700 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -564,9 +564,13 @@ def _supports_parallel_config(moe_parallel_config: 
FusedMoEParallelConfig) -> bo # @property - def quant_dtype(self) -> torch.dtype | None: + def quant_dtype(self) -> torch.dtype | str | None: return self.quant_config.quant_dtype + @property + def weight_quant_dtype(self) -> torch.dtype | str | None: + return self.quant_config.weight_quant_dtype + @property def block_shape(self) -> list[int] | None: return self.quant_config.block_shape diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 61e06fa603d6..2bd4cd79e031 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -25,15 +25,20 @@ def __init__( self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig, - gemm1_alpha, - gemm1_beta, - gemm1_clamp_limit, max_capture_size, ): super().__init__(moe_config, quant_config) - self.gemm1_alpha = gemm1_alpha - self.gemm1_beta = gemm1_beta - self.gemm1_clamp_limit = gemm1_clamp_limit + self.device = torch.cuda.current_device() + self.num_experts = moe_config.num_local_experts + self.gemm1_alpha = torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32, device=self.device + ) + self.gemm1_beta = torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32, device=self.device + ) + self.gemm1_clamp_limit = torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32, device=self.device + ) self.max_capture_size = max_capture_size @staticmethod diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index ad32abf582c1..019e408c1959 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -195,11 +195,12 @@ def _mxfp8_e4m3_quantize( A_scale: torch.Tensor | None, per_act_token_quant: bool, block_shape: list[int] | None = None, + is_sf_swizzled_layout: bool = False, ) -> tuple[torch.Tensor, torch.Tensor]: assert A_scale is None assert not per_act_token_quant assert 
block_shape is None - return mxfp8_e4m3_quantize(A) + return mxfp8_e4m3_quantize(A, is_sf_swizzled_layout) def _mxfp6_e3m2_quantize( @@ -275,7 +276,13 @@ def moe_kernel_quantize_input( elif quant_dtype == "mxfp8": # TODO: `quant_dtype == "mxfp8"` is ambiguous, # should be fp8_e4m3. OCP MX also defines `fp8_e5m2`. - return _mxfp8_e4m3_quantize(A, A_scale, per_act_token_quant, block_shape) + return _mxfp8_e4m3_quantize( + A, + A_scale, + per_act_token_quant, + block_shape, + is_sf_swizzled_layout=is_fp4_scale_swizzled, + ) elif quant_dtype == "mxfp6_e3m2": return _mxfp6_e3m2_quantize(A, A_scale, per_act_token_quant, block_shape) elif quant_dtype == "mxfp6_e2m3": diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 492963855e8f..d81f0f80d2e7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -256,6 +256,7 @@ def __init__(self, moe: FusedMoEConfig): "Please check your environment and try again." 
) + self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {} + # Initialized in process_weights_after_loading for CUTLASS/SM90 backends self.moe_mk: mk.FusedMoEModularKernel | None = None def create_weights( @@ -648,19 +649,6 @@ def swap_every_two_rows(x, axis=-1): self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 ): - layer.gemm1_alpha = Parameter( - torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(), - requires_grad=False, - ) - layer.gemm1_beta = Parameter( - torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(), - requires_grad=False, - ) - layer.gemm1_clamp_limit = Parameter( - torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(), - requires_grad=False, - ) - sf_block_size = 32 # mxfp4 block size # Common shape assertions @@ -772,6 +760,30 @@ def _interleave_mxfp4_cutlass_sm90(w): layer.w2_weight_scale = torch.nn.Parameter( w2_scales_interleaved, requires_grad=False ) + + # These two kernels go through the `flashinfer_cutlass_fused_moe` path + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, + ) + + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + assert self.moe_quant_config is not None + prepare_finalize = maybe_make_prepare_finalize( + moe=self.moe, + quant_config=self.moe_quant_config, + routing_tables=layer._maybe_init_expert_routing_tables(), + allow_new_interface=True, + ) + assert prepare_finalize is not None + + self.moe_mk = mk.FusedMoEModularKernel( + prepare_finalize, + FlashInferExperts( + moe_config=self.moe, + quant_config=self.moe_quant_config, + ), + shared_experts=None, + ) elif self.mxfp4_backend == Mxfp4Backend.TRITON: from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig @@ -847,7 +859,10 @@ def get_fused_moe_quant_config( w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, ) - elif self.mxfp4_backend in 
[Mxfp4Backend.SM100_FI_MXFP4_BF16]: + elif self.mxfp4_backend in [ + Mxfp4Backend.SM100_FI_MXFP4_BF16, + Mxfp4Backend.SM90_FI_MXFP4_BF16, + ]: return mxfp4_w4a16_moe_quant_config( w1_bias=layer.w13_bias, w2_bias=layer.w2_bias, @@ -897,9 +912,6 @@ def select_gemm_impl( ): # B200 code-path kwargs = { - "gemm1_alpha": layer.gemm1_alpha, - "gemm1_beta": layer.gemm1_beta, - "gemm1_clamp_limit": layer.gemm1_clamp_limit, # TODO(bnell): part of quant_config "max_capture_size": self.max_capture_size, } @@ -935,20 +947,6 @@ def apply( if layer.enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") - if self.mxfp4_backend == Mxfp4Backend.MARLIN: - assert self.moe_mk is not None - - return self.moe_mk( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=layer.activation, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) assert _can_support_mxfp4( layer.use_grouped_topk, layer.topk_group, @@ -967,69 +965,23 @@ def apply( assert ( self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + or self.mxfp4_backend == Mxfp4Backend.MARLIN ) - from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe - - # Backend-specific preparation - if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: - from flashinfer import mxfp8_quantize - - x_quant, x_scale = mxfp8_quantize(x, True, 32) - - fake_input_scale = torch.ones(self.num_experts, device=x.device) - quant_scales = [ - layer.w13_weight_scale.contiguous().view(torch.int32), - fake_input_scale, - layer.w2_weight_scale.contiguous().view(torch.int32), - fake_input_scale, - ] - - fi_input = x_quant - extra_kwargs = dict( - use_mxfp8_act_scaling=True, - input_sf=x_scale, - fc1_expert_weights=layer.w13_weight.contiguous().view(torch.long), - 
fc2_expert_weights=layer.w2_weight.contiguous().view(torch.long), - ) - elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: - assert x.dtype == torch.bfloat16 - - quant_scales = [ - layer.w13_weight_scale, - layer.w2_weight_scale, - ] - - fi_input = x - extra_kwargs = dict( - use_w4_group_scaling=True, - fc1_expert_weights=layer.w13_weight, - fc2_expert_weights=layer.w2_weight, - ) - output = torch.empty_like(x, dtype=torch.bfloat16) - - flashinfer_cutlass_fused_moe( - input=fi_input, - token_selected_experts=topk_ids.to(torch.int).contiguous(), - token_final_scales=topk_weights, - output_dtype=torch.bfloat16, - output=output, - quant_scales=quant_scales, - fc1_expert_biases=layer.w13_bias, - fc2_expert_biases=layer.w2_bias, - swiglu_alpha=layer.gemm1_alpha, - swiglu_beta=layer.gemm1_beta, - swiglu_limit=layer.gemm1_clamp_limit, - tp_size=self.moe.tp_size, - tp_rank=self.moe.tp_rank, - ep_size=self.moe.ep_size, - ep_rank=self.moe.ep_rank, - tune_max_num_tokens=max(self.max_capture_size, 1), - **extra_kwargs, + assert self.moe_mk is not None + return self.moe_mk( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, + shared_experts_input=shared_experts_input, ) - return output - def apply_monolithic( self, layer: FusedMoE, diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index e42868e4176b..12a1799d157c 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -19,6 +19,7 @@ FP8_DTYPE = current_platform.fp8_dtype() FP4_DTYPE = torch.uint8 +MXFP_SCALE_DTYPE = torch.uint8 def get_fp8_min_max() -> tuple[float, float]: @@ -151,6 +152,18 @@ def __str__(self): 
kDynamic64Scale = ScaleDesc(torch.float32, False, GroupShape(1, 64)) kFp8Dynamic64Sym = QuantKey(FP8_DTYPE, kDynamic64Scale, symmetric=True) +# TODO (zyongye): Convert all the torch.dtype to scale_dtype +# Changing that requires changing torch compile fused AR+Quant Quant key +# to avoid assertion error +kMxfp4DynamicGroupScale = ScaleDesc(MXFP_SCALE_DTYPE, False, GroupShape(1, 32)) +kMxfp4Dynamic = QuantKey(FP4_DTYPE, scale=kMxfp4DynamicGroupScale, symmetric=True) + +kMxfp8DynamicGroupScale = ScaleDesc(MXFP_SCALE_DTYPE, False, GroupShape(1, 32)) +kMxfp8Dynamic = QuantKey(FP8_DTYPE, scale=kMxfp8DynamicGroupScale, symmetric=True) + +kMxfp4StaticGroupScale = ScaleDesc(MXFP_SCALE_DTYPE, True, GroupShape(1, 32)) +kMxfp4Static = QuantKey(FP4_DTYPE, scale=kMxfp4StaticGroupScale, symmetric=True) + # Normalize the group_shape to the full extent for any dims that are -1 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape):