Changes from 148 commits · 160 commits total
21d7d67
Functionalized patterns in prep for utility
ProExpertProg Sep 6, 2025
f3b4cf1
TEMP Mostly working
ProExpertProg Sep 9, 2025
cdad3c0
TEMP: fixed rmsnorm issue (TODO assert dtypes in fused norm_quant ker…
ProExpertProg Sep 12, 2025
8e4a56f
rms works fully now, had to remove more conversions (and add them in …
ProExpertProg Sep 16, 2025
e151e6d
quant works except (torch,torch)
ProExpertProg Sep 16, 2025
14fdc8b
quant with fix for pure torch, broke others
ProExpertProg Sep 18, 2025
05a65f3
ALL WORKS
ProExpertProg Sep 18, 2025
e6b394e
Add TODO
ProExpertProg Sep 20, 2025
d96913a
Cleanup test_fusion.py, added extra layer of rms/quant
ProExpertProg Sep 25, 2025
b172747
Functionalize attn+quant patterns
ProExpertProg Sep 25, 2025
1ae80c6
Move global vllm_config to pass manager
ProExpertProg Sep 25, 2025
77835fd
Attention fusion works with custom ops
ProExpertProg Sep 25, 2025
1277999
Remove V0 attn fusion test
ProExpertProg Sep 25, 2025
d843a67
Add triton attn test to attn+quant fusion
ProExpertProg Sep 26, 2025
cdd1529
Flat product for better test names/visibility
ProExpertProg Sep 26, 2025
141a37e
Fix rmsnorm
ProExpertProg Sep 26, 2025
c6d6c3b
Refactor E2E attn fusion test
ProExpertProg Sep 26, 2025
490ac86
Add TP=2 test (untested)
ProExpertProg Sep 26, 2025
d0b1b56
improve tests by adding more cases
ProExpertProg Sep 26, 2025
47b4688
TEMP working on caplog
ProExpertProg Sep 27, 2025
ae7f56f
Temp MP workaround P2
ProExpertProg Sep 30, 2025
eb899a4
Temp MP workaround P3
ProExpertProg Sep 30, 2025
a2aa978
Test for caplog utils
ProExpertProg Oct 1, 2025
21a9f9f
Fixed tests, passing with 2.8, 2.9 tbd
ProExpertProg Oct 2, 2025
66a35a9
Update tests/compile/backend.py
ProExpertProg Oct 2, 2025
7eb1364
Update csrc/layernorm_kernels.cu
ProExpertProg Oct 2, 2025
5fef180
clean up fullgraph tests
ProExpertProg Oct 2, 2025
db479ae
TEMP allreduce fusion
ProExpertProg Oct 2, 2025
54189a9
allreduce fusion working (custom ops on)
ProExpertProg Oct 3, 2025
b7f52bf
allreduce fusion working with/without custom ops (except fp4)
ProExpertProg Oct 3, 2025
d09a278
allreduce fusion working with/without custom ops (with fp4)
ProExpertProg Oct 3, 2025
c8675ff
log depyf folder, fix context for TestBackend, fix pattern dump
ProExpertProg Oct 3, 2025
d3f95fe
fullgraph allreduce test update requirements
ProExpertProg Oct 3, 2025
4dbfcf7
Move e2e tests to new file, add to test pipeline
ProExpertProg Oct 3, 2025
31d0127
Add e2e fusions to fullgraph test (should work with Triton backend), …
ProExpertProg Oct 3, 2025
c653d24
Fix spelling, precommit
ProExpertProg Oct 4, 2025
1756f67
add back fp4
ProExpertProg Oct 4, 2025
5619bc3
clean up e2e tests
ProExpertProg Oct 10, 2025
32989d8
add pattern for final allreduce in model
ProExpertProg Oct 10, 2025
46ee626
add more comprehensive testing for quantfp8 (-rmsnorm+-quant still fa…
ProExpertProg Oct 10, 2025
a1c7fdb
add more comprehensive testing for allreduce-rmsnorm, fix fp4 (-rmsno…
ProExpertProg Oct 10, 2025
c3264d8
Fix partial match rmsnorm+quant, fix allreduce+rmsnorm match
ProExpertProg Oct 10, 2025
095277c
Simplify matcher utils by using RMSNorm.forward_static
ProExpertProg Oct 10, 2025
52f78ce
Add allreduce test to 2-gpu test
ProExpertProg Oct 11, 2025
1b1a63e
Fix e2e allreduce fusion test
ProExpertProg Oct 11, 2025
0d6e550
fix func test
ProExpertProg Oct 12, 2025
26892df
fix pass manager test
ProExpertProg Oct 12, 2025
3547b87
fix sequence parallelism test
ProExpertProg Oct 12, 2025
af1ffa7
PR review
ProExpertProg Oct 15, 2025
97b3ff2
Merge remote-tracking branch 'upstream/main' into luka/custom-op-matc…
ProExpertProg Oct 15, 2025
b5f89e5
Cleanup test_full_graph.py
ProExpertProg Oct 15, 2025
f6429e4
Cleanup test_fusion_attn.py
ProExpertProg Oct 15, 2025
8a363d3
Slight improvement for E2E fusion
ProExpertProg Oct 15, 2025
12a7c6d
Tests & docs for flat_product
ProExpertProg Oct 15, 2025
db16ee1
Merge branch 'main' into luka/custom-op-matching-2
ProExpertProg Oct 15, 2025
8ffb474
Remove/fix TODOs
ProExpertProg Oct 15, 2025
2a6299c
Fix e2e test patterns
ProExpertProg Oct 15, 2025
465ce58
Update tests/compile/test_fusion.py
ProExpertProg Oct 15, 2025
bb0254a
Merge branch 'main' into luka/custom-op-matching-2
ProExpertProg Oct 15, 2025
bcd95b5
Fix func test
ProExpertProg Oct 15, 2025
db2b1c7
Smaller model for e2e fusion test
ProExpertProg Oct 15, 2025
a3ebf0a
fix fp8 quant tests
ProExpertProg Oct 15, 2025
3943257
Restore original torch.Parameter behavior in RMSNorm
ProExpertProg Oct 15, 2025
532cbcf
Add comment to test_logger
ProExpertProg Oct 15, 2025
7e6f5b3
add flat_product example
ProExpertProg Oct 15, 2025
24f1298
PR comments: cleanup fusion passes, & matching
ProExpertProg Oct 15, 2025
de7405b
PR comments: add _custom_op suffix
ProExpertProg Oct 15, 2025
6253d5b
Add e2e to L40 distributed, move tests to start of B200 distributed
ProExpertProg Oct 15, 2025
876ef22
Fix tests, PR feedback
ProExpertProg Oct 15, 2025
e99a759
Break up B200 tests, move allreduce to H200
ProExpertProg Oct 15, 2025
a226864
Merge branch 'main' into luka/custom-op-matching-2
ProExpertProg Oct 16, 2025
ae581e1
Fix attention fusion test numerics
ProExpertProg Oct 16, 2025
c03b29b
Remove inductor graph partition from unit test (included in e2e tests)
ProExpertProg Oct 16, 2025
d2e0489
Relax tolerance for L40 fusion test
ProExpertProg Oct 16, 2025
65ef5fd
Merge branch 'main' into luka/custom-op-matching-2
ProExpertProg Oct 16, 2025
d4fe977
Fix NamedTuple
ProExpertProg Oct 16, 2025
6319e39
Update test durations
ProExpertProg Oct 16, 2025
e34d36d
More tweaking of precision
ProExpertProg Oct 16, 2025
f72ee43
Split original pr
ilmarkov Sep 4, 2025
c4c0215
Update bench
ilmarkov Sep 5, 2025
309d79e
Update threshold configuration
ilmarkov Sep 8, 2025
afcfd73
Move all_reduce from custom op in fused_moe
ilmarkov Sep 8, 2025
0248dcd
Linter fixes
ilmarkov Oct 16, 2025
18e4771
Upd
ilmarkov Oct 16, 2025
1debd8e
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Oct 21, 2025
9516d2b
Upd after review
ilmarkov Oct 21, 2025
b789044
Update fused_moe
ilmarkov Oct 27, 2025
4001935
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Oct 27, 2025
6077616
Address comments
ilmarkov Nov 2, 2025
afc8af8
Remove bench_compile
ilmarkov Nov 2, 2025
c3af2af
Split PR. Second part. Compile ranges
ilmarkov Sep 4, 2025
0cbb065
Remove general shape graph
ilmarkov Sep 4, 2025
d5392f5
Add test to test pipeline
ilmarkov Sep 5, 2025
027c9eb
Fix pre-commit
ilmarkov Sep 9, 2025
b2992d3
Upd
ilmarkov Oct 16, 2025
3499384
Upd config
ilmarkov Oct 16, 2025
5336ee6
Fix
ilmarkov Oct 16, 2025
4958474
Priotitize compile_sizes
ilmarkov Oct 17, 2025
04306ed
Fix inductor config
ilmarkov Oct 28, 2025
9dc4eea
Laith's fix
ilmarkov Nov 3, 2025
2c63f0b
Upd
ilmarkov Nov 4, 2025
67f7ae1
Update config
ilmarkov Nov 4, 2025
8b8d01d
Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond…
ilmarkov Nov 4, 2025
fcebc21
Add caching
ilmarkov Nov 4, 2025
65151bc
Address comments
ilmarkov Nov 5, 2025
1f7afdb
Add debug log
ilmarkov Nov 5, 2025
8da1585
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Nov 5, 2025
df22202
Update benchmark
ilmarkov Nov 5, 2025
a21de2b
Fix
ilmarkov Nov 5, 2025
45f4093
Update bench and constants
ilmarkov Nov 5, 2025
c26e056
Rename in benchmark
ilmarkov Nov 5, 2025
1bee5a6
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Nov 5, 2025
bcc0cc0
Add max_token_num to object
ilmarkov Nov 5, 2025
43b163c
Add test
ilmarkov Nov 5, 2025
71c6b72
Update comments
ilmarkov Nov 6, 2025
ada24e6
Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond…
ilmarkov Nov 6, 2025
6766e4f
Update fakify for compile sizes
ilmarkov Nov 5, 2025
af87d7a
Linter fix
ilmarkov Nov 6, 2025
56273da
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Nov 6, 2025
459f71c
Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond…
ilmarkov Nov 6, 2025
2785e4d
Minor updates
ilmarkov Nov 7, 2025
1f83a66
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ProExpertProg Nov 7, 2025
b4c1b1d
Address the review
ilmarkov Nov 10, 2025
ab33605
Merge branch 'main' into imarkov/fused_allreduce_torch_native
robertgshaw2-redhat Nov 10, 2025
3fac39b
Merge branch 'main' into imarkov/fused_allreduce_torch_native
ilmarkov Nov 10, 2025
b0a3884
Fix SP
ilmarkov Nov 10, 2025
a3e7bdc
Merge branch 'imarkov/fused_allreduce_torch_native' into imarkov/cond…
ilmarkov Nov 10, 2025
a810969
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 11, 2025
319abd5
Remove dynamic shape
ilmarkov Nov 12, 2025
d168de0
Make ranges inclusive-inclusive
ilmarkov Nov 13, 2025
b65e752
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 14, 2025
af10400
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 18, 2025
6c05919
Add test for inductor cache hits
ilmarkov Nov 19, 2025
03637e7
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 19, 2025
3f72483
Address comments
ilmarkov Nov 19, 2025
9b00ebc
Address comments
ilmarkov Nov 20, 2025
8a40ac6
Update test
ilmarkov Nov 20, 2025
ef05682
Address comments
ilmarkov Nov 20, 2025
63af962
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 20, 2025
7647089
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 21, 2025
ee89388
Update test utils
ilmarkov Nov 21, 2025
925e87d
Fix pre-commit after merge
ilmarkov Nov 21, 2025
809e170
Fix tests
ilmarkov Nov 21, 2025
e07c939
Add fixture instead of decorator
ilmarkov Nov 21, 2025
f4db45c
Fix re-used compilation config
ilmarkov Nov 23, 2025
97a8d58
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 23, 2025
4f280ce
Fix e2e
ilmarkov Nov 23, 2025
f714957
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 23, 2025
b27f89d
Fix e2e adapt to number of compile ranges
ilmarkov Nov 24, 2025
eedc70e
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 25, 2025
cc8f2f8
Slight fix of test
ilmarkov Nov 25, 2025
d1dd4db
Fix tests after refactor
ilmarkov Nov 25, 2025
a2b67a4
Simplify
ilmarkov Nov 25, 2025
0776364
Address comments
ilmarkov Nov 26, 2025
42bf355
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov Nov 26, 2025
ca832fc
Merge remote-tracking branch 'upstream/main' into imarkov/conditional…
ProExpertProg Dec 2, 2025
ba90b9e
Only warm up model if mode=VLLM_COMPILE
ProExpertProg Dec 2, 2025
771203f
Fix capture-sizes
ProExpertProg Dec 2, 2025
0e0eab9
Fix doc range
ProExpertProg Dec 2, 2025
3d2c36b
pre-commit
ProExpertProg Dec 2, 2025
4 changes: 3 additions & 1 deletion tests/compile/distributed/test_fusions_e2e.py
@@ -298,7 +298,9 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
         r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
         log_holder.text,
     )
-    assert len(log_matches) == 2, log_holder.text
+    # 2 for 2 compile ranges
+    # (global compile range is split due to enable_fi_allreduce_fusion)
+    assert len(log_matches) == 4, log_holder.text

     assert int(log_matches[0]) == matches.attention_fusion
     assert int(log_matches[1]) == matches.attention_fusion
168 changes: 168 additions & 0 deletions tests/compile/test_compile_ranges.py
@@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import torch
from torch import fx as fx
from torch import nn

# This import automatically registers `torch.ops.silly.attention`
import tests.compile.silly_attention # noqa
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.inductor_pass import (
    InductorPass,
    get_pass_context,
)
from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
)
from vllm.config.compilation import CompilationConfig, CompilationMode
from vllm.config.scheduler import SchedulerConfig
from vllm.config.utils import Range
from vllm.forward_context import set_forward_context

BATCH_SIZE = 64
MLP_SIZE = 128


@support_torch_compile
class TestModel(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + x
        attn_output = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, attn_output)
        x = attn_output
        x = x * 3
        return x


@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module, batch_sizes: list[int]):
    with set_forward_context({}, vllm_config=vllm_config):
        model(torch.randn(BATCH_SIZE, MLP_SIZE))
        for batch_size in batch_sizes:
            model(torch.randn(batch_size, MLP_SIZE))


class PostGradRangeChecker(InductorPass):
    def __init__(self, ranges: list[Range]):
        self.ranges = ranges
        self.num_calls = 0

    def __call__(self, graph: fx.Graph):
        compile_range = get_pass_context().compile_range
        assert compile_range in self.ranges, (
            f"Compile range {compile_range} not in {self.ranges}"
        )
        self.num_calls += 1

    def uuid(self) -> str:
        state: dict[str, Any] = {}
        return InductorPass.hash_dict(state)


def test_compile_ranges(use_fresh_inductor_cache):
    post_grad_range_checker = PostGradRangeChecker(
[Review thread]
Collaborator: How come this works without disabling the vllm cache?
Contributor (author): Probably the clean inductor cache allows us to avoid cache hits of the vLLM cache.
Collaborator: Hmm ok
        [
            Range(start=1, end=8),
            Range(start=16, end=16),
            Range(start=9, end=32),
            Range(start=64, end=64),
            Range(start=33, end=8192),
        ]
    )
    torch.set_default_device("cuda")
    vllm_config = VllmConfig(
        scheduler_config=SchedulerConfig(
            max_num_batched_tokens=8192,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            compile_ranges_split_points=[8, 32],
            compile_sizes=[16, 64, 128],
[Review thread]
laithsakka (Nov 25, 2025): nit: I wonder if we should now call these specialize sizes?
Contributor (author): compile_specialize_sizes? So that it is symmetrical to compile_ranges.
Collaborator: Yeah, we can do that in a follow-up.
            inductor_compile_config={
                "post_grad_custom_post_pass": post_grad_range_checker,
            },
        ),
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()
        # Number of compilations: one per compile range (3) + 2 compile sizes
        batch_sizes = [1, 4, 16, 24, 48, 64, 8192]

        with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=5,
        ):
            run_model(vllm_config, model, batch_sizes)
        assert post_grad_range_checker.num_calls == 5


def test_compile_config_get_compile_ranges():
    compilation_config = CompilationConfig(
        compile_ranges_split_points=[8, 32],
    )
    VllmConfig(
        scheduler_config=SchedulerConfig(
            max_num_batched_tokens=8192,
        ),
        compilation_config=compilation_config,
    )
    assert compilation_config.get_compile_ranges() == [
        Range(start=1, end=8),
        Range(start=9, end=32),
        Range(start=33, end=8192),
    ]


def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
    # To force multiple compilations, we disable the vLLM compile cache
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    post_grad_range_checker = PostGradRangeChecker(
        ranges=[
            Range(start=1, end=8),
            Range(start=9, end=8192),
        ]
    )
    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=8192,
    )
    torch.set_default_device("cuda")

    def create_vllm_config():
        return VllmConfig(
            scheduler_config=scheduler_config,
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                compile_ranges_split_points=[8],
                inductor_compile_config={
                    "post_grad_custom_post_pass": post_grad_range_checker,
                },
            ),
        )

    vllm_config_1 = create_vllm_config()
    with set_current_vllm_config(vllm_config_1):
        model1 = TestModel(vllm_config=vllm_config_1, prefix="").eval()
        batch_sizes = [1, 16]
        run_model(vllm_config_1, model1, batch_sizes)
        assert post_grad_range_checker.num_calls == 2

    post_grad_range_checker.num_calls = 0
    # Create a new vllm config with the new pass context
    vllm_config_2 = create_vllm_config()
    with set_current_vllm_config(vllm_config_2):
        model2 = TestModel(vllm_config=vllm_config_2, prefix="").eval()
        batch_sizes = [4, 32]
        run_model(vllm_config_2, model2, batch_sizes)
        # Check that the inductor cache is hit, so the pass
        # should not be called again
        assert post_grad_range_checker.num_calls == 0
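The mapping from compile_ranges_split_points to inclusive compile ranges that test_compile_config_get_compile_ranges asserts can be sketched in plain Python. The helper and the simplified Range dataclass below are illustrative stand-ins, not vLLM APIs; the real type lives in vllm.config.utils and the real logic is CompilationConfig.get_compile_ranges().

# Illustrative sketch only, not part of this PR: how split points are expected
# to expand into inclusive-inclusive compile ranges, mirroring the assertions
# in test_compile_config_get_compile_ranges above. `Range` here is a simplified
# stand-in for vllm.config.utils.Range.
from dataclasses import dataclass


@dataclass
class Range:
    start: int  # inclusive
    end: int  # inclusive


def split_points_to_ranges(split_points: list[int], max_num_batched_tokens: int) -> list[Range]:
    """Expand sorted split points into ranges covering [1, max_num_batched_tokens]."""
    ranges: list[Range] = []
    start = 1
    for point in sorted(split_points):
        ranges.append(Range(start=start, end=point))
        start = point + 1
    ranges.append(Range(start=start, end=max_num_batched_tokens))
    return ranges


# Split points [8, 32] with max_num_batched_tokens=8192 yield
# [Range(1, 8), Range(9, 32), Range(33, 8192)], matching the test above.
print(split_points_to_ranges([8, 32], 8192))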
27 changes: 27 additions & 0 deletions tests/conftest.py
@@ -66,6 +66,14 @@
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

try:
    from torch._inductor.utils import fresh_cache

    torch_inductor_fresh_cache_available = True
except ImportError:
    torch_inductor_fresh_cache_available = False


logger = init_logger(__name__)

_TEST_DIR = os.path.dirname(__file__)
@@ -1424,3 +1432,22 @@ def disable_deepgemm_ue8m0(monkeypatch):
    # Clear cache so the next time it is used it is processed with the
    # default VLLM_USE_DEEP_GEMM_E8M0 setting.
    is_deep_gemm_e8m0_used.cache_clear()


@pytest.fixture
def use_fresh_inductor_cache():
"""
Use a fresh inductor cache for the test.
This is useful to ensure that the test is not affected by the
previous test calls.
"""
if not torch_inductor_fresh_cache_available:
print(
"torch._inductor.utils.fresh_cache is not available, "
"the test will not use fresh inductor cache."
)
yield
return

with fresh_cache():
yield
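The fixture is consumed through pytest's standard dependency injection, exactly as test_compile_ranges and test_inductor_cache_compile_ranges do above. A minimal usage sketch (the test name and body here are hypothetical):

# Hypothetical example: naming the fixture as a parameter is enough to run the
# test under torch._inductor.utils.fresh_cache() when it is available, so any
# Inductor compilation triggered here cannot hit artifacts from earlier tests.
def test_my_compile_pass(use_fresh_inductor_cache):
    ...  # code that triggers torch.compile / Inductor goes here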