Skip to content

[Perf] SM90 cutlass fp8 mm supports odd M by swap_ab, 180~290% kernel performance improvement#44572

Open
yewentao256 wants to merge 2 commits into
mainfrom
wentao-fp8-scaled-mm-oddM
Open

[Perf] SM90 cutlass fp8 mm supports odd M by swap_ab, 180~290% kernel performance improvement#44572
yewentao256 wants to merge 2 commits into
mainfrom
wentao-fp8-scaled-mm-oddM

Conversation

@yewentao256
Copy link
Copy Markdown
Member

Purpose

A follow-up to #43706: odd-M support is now implemented directly at the kernel level.

The swap_ab logic is taken from the current SM100/SM120 implementation.

Test

Accuracy is covered by the unit tests.

Perf:

import time
import statistics

import torch
from vllm import _custom_ops as ops
from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
    CutlassFp8BlockScaledMMKernel,
)

# Check for CUDA *before* touching the device: selecting a device on a
# machine without CUDA raises first, so a check placed afterwards is never
# reached. An explicit raise is used because `assert` is stripped under -O.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is required to run this benchmark")
torch.cuda.set_device(0)

# Build the kernel object without running its __init__, then attach a
# minimal stand-in config exposing only `out_dtype`.
kernel = object.__new__(CutlassFp8BlockScaledMMKernel)
kernel.config = type("Config", (), {"out_dtype": torch.bfloat16})()


def make_inputs(m, n, k):
    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)
    b = torch.randn((n, k), device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)
    as_ = torch.randn((m, k // 128), device="cuda", dtype=torch.float32)
    bs = torch.randn((n // 128, k // 128), device="cuda", dtype=torch.float32)
    return a, b, as_, bs


def old_padded_dispatch(a, b, as_, bs):
    """Baseline path: call ``ops.cutlass_scaled_mm`` with M padded to a multiple of 4.

    When ``a`` already has a row count divisible by 4 the operands are passed
    through unchanged. Otherwise ``a`` is copied into a zero-filled buffer and
    ``as_`` into a one-filled buffer, both with the row count rounded up to
    the next multiple of 4, and the extra output rows are sliced off.

    Args:
        a: Activation matrix; its first dimension is M.
        b: Weight matrix, passed to the op transposed.
        as_: Scales for ``a``; padded along its first dimension together
            with ``a``.
        bs: Scales for ``b``, passed to the op transposed.

    Returns:
        The first M rows of the op's bfloat16 output.
    """
    rows = a.shape[0]
    remainder = rows % 4

    if remainder == 0:
        # Already aligned: no padding buffers are needed.
        return ops.cutlass_scaled_mm(
            a, b.T, scale_a=as_, scale_b=bs.T, out_dtype=torch.bfloat16
        )

    rows_padded = rows + 4 - remainder

    # Zero rows for the activations, unit scales for the padding rows.
    a_padded = torch.zeros(
        (rows_padded, a.shape[1]), device=a.device, dtype=a.dtype
    )
    a_padded[:rows].copy_(a)

    scale_padded = torch.ones(
        (rows_padded, as_.shape[1]), device=as_.device, dtype=as_.dtype
    )
    scale_padded[:rows].copy_(as_)

    result = ops.cutlass_scaled_mm(
        a_padded,
        b.T,
        scale_a=scale_padded,
        scale_b=bs.T,
        out_dtype=torch.bfloat16,
    )
    # Drop the rows that only exist because of padding.
    return result[:rows].contiguous()


def new_native_dispatch(a, b, as_, bs):
    """New path: forward the operands unchanged to the module-level kernel.

    Calls ``kernel.apply_block_scaled_mm`` with ``a``, ``b``, ``as_`` and
    ``bs`` in that order and returns its result; no padding is done here.
    """
    result = kernel.apply_block_scaled_mm(a, b, as_, bs)
    return result


def bench(fn, args, warmup=100, iters=1000, wall_iters=20):
    """Time ``fn(*args)`` with CUDA events and with the host clock.

    Args:
        fn: Callable under test.
        args: Positional arguments unpacked into every call of ``fn``.
        warmup: Number of untimed calls made before measuring.
        iters: Number of back-to-back calls enclosed by the CUDA events.
        wall_iters: Number of individually synchronized host-timed calls.

    Returns:
        A tuple ``(gpu_us, wall_us)``: the event-measured time per call and
        the median host-measured time per call, both in microseconds.
    """
    # Untimed warm-up, then drain the device before measuring.
    for _ in range(warmup):
        fn(*args)
    torch.cuda.synchronize()

    # Event-based timing over the whole batch of calls.
    ev_begin = torch.cuda.Event(enable_timing=True)
    ev_finish = torch.cuda.Event(enable_timing=True)
    ev_begin.record()
    for _ in range(iters):
        fn(*args)
    ev_finish.record()
    torch.cuda.synchronize()
    elapsed_ms = ev_begin.elapsed_time(ev_finish)
    gpu_us = elapsed_ms * 1000 / iters

    def _one_wall_sample_us():
        # Synchronize on both sides so the sample covers exactly one call.
        torch.cuda.synchronize()
        begin = time.perf_counter()
        fn(*args)
        torch.cuda.synchronize()
        return (time.perf_counter() - begin) * 1e6

    wall_samples = [_one_wall_sample_us() for _ in range(wall_iters)]

    return gpu_us, statistics.median(wall_samples)


print("GPU:", torch.cuda.get_device_name(0))
print("odd M: old padded path vs new native path")
print("m n k | old_gpu new_gpu speedup |")

# (M, N, K) problem sizes to benchmark.
SHAPES = (
    (1, 4096, 7168),
    (2, 4096, 7168),
    (3, 4096, 7168),
    (5, 4096, 7168),
    (7, 4096, 7168),
    (33, 4096, 7168),
    (1, 7168, 16384),
    (5, 7168, 16384),
)

for m, n, k in SHAPES:
    inputs = make_inputs(m, n, k)

    # Run each path once and take the largest element-wise gap between them.
    # NOTE: max_diff is computed but not reported in the table below.
    padded_out = old_padded_dispatch(*inputs)
    native_out = new_native_dispatch(*inputs)
    max_diff = (padded_out - native_out).abs().max().item()

    # Only the event-based GPU timings are printed; wall times are discarded.
    old_gpu, _old_wall = bench(old_padded_dispatch, inputs)
    new_gpu, _new_wall = bench(new_native_dispatch, inputs)

    row = (
        f"{m:2d} {n:5d} {k:5d} | "
        f"{old_gpu:8.2f} {new_gpu:8.2f} {old_gpu/new_gpu:5.2f}x | "
    )
    print(row)

Running this script gives:

 1  4096  7168 |    49.86    17.31  2.88x | 
 2  4096  7168 |    49.35    17.15  2.88x | 
 3  4096  7168 |    50.44    17.72  2.85x | 
 5  4096  7168 |    50.42    17.54  2.87x | 
 7  4096  7168 |    50.53    18.42  2.74x | 
33  4096  7168 |    50.58    17.72  2.85x | 
 1  7168 16384 |    65.49    35.10  1.87x | 
 5  7168 16384 |    70.74    34.84  2.03x | 

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Copy link
Copy Markdown

@claude claude Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude Code Review

This repository is configured for manual code reviews. Comment @claude review to trigger a review and subscribe this PR to future pushes, or @claude review once for a one-time review.

Tip: disable this comment in your organization's Code Review settings.

@yewentao256 yewentao256 added the ready ONLY add when PR is ready to merge/full CI is needed label Jun 4, 2026
@mergify mergify Bot added the nvidia label Jun 4, 2026
Copy link
Copy Markdown
Member

@mgoin mgoin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank god we finally can get rid of this hack, great work!!

@github-project-automation github-project-automation Bot moved this to Ready in NVIDIA Jun 5, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

nvidia ready ONLY add when PR is ready to merge/full CI is needed

Projects

Status: Ready

Development

Successfully merging this pull request may close these issues.

2 participants