From e79bd135306f1eeefcfdfa88dba5b26f8958eb44 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Thu, 12 Mar 2026 21:41:48 -0700
Subject: [PATCH 01/27] [Kernel] Fuse temperature scaling + softmax into single
 Triton kernel for sampling

- Add fused_temperature_softmax Triton kernel (2-pass online softmax)
- Replace div_ + softmax with fused kernel in sampler standard path
- Add correctness tests and benchmark script
- Reduces kernel launches from 2 to 1 and memory passes from 6 to 3
---
 .../bench_fused_temperature_softmax.py        |  99 ++++++++++
 python/sglang/srt/layers/fused_sampling.py    | 181 ++++++++++++++++++
 python/sglang/srt/layers/sampler.py           |  22 ++-
 .../test_fused_temperature_softmax.py         | 160 ++++++++++++++++
 4 files changed, 457 insertions(+), 5 deletions(-)
 create mode 100644 benchmark/kernels/bench_fused_temperature_softmax.py
 create mode 100644 python/sglang/srt/layers/fused_sampling.py
 create mode 100644 test/registered/sampling/test_fused_temperature_softmax.py

diff --git a/benchmark/kernels/bench_fused_temperature_softmax.py b/benchmark/kernels/bench_fused_temperature_softmax.py
new file mode 100644
index 000000000000..1896849e09b2
--- /dev/null
+++ b/benchmark/kernels/bench_fused_temperature_softmax.py
@@ -0,0 +1,99 @@
+"""Benchmark: fused_temperature_softmax vs separate div_ + softmax.
+
+Measures wall-clock time with torch.cuda.Event timing, 200 iterations
+after 50 warmup. Reports per-call latency and speedup.
+"""
+
+import argparse
+
+import torch
+
+
+def benchmark_fn(fn, warmup=50, iters=200):
+    """Time a zero-arg callable using CUDA events."""
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn()
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end) / iters * 1000  # microseconds
+
+
+def reference_temperature_softmax(logits, temperatures):
+    """Original two-kernel path."""
+    logits.div_(temperatures)
+    logits[:] = torch.softmax(logits, dim=-1)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--warmup", type=int, default=50)
+    parser.add_argument("--iters", type=int, default=200)
+    args = parser.parse_args()
+
+    from sglang.srt.layers.fused_sampling import (
+        fused_temperature_softmax,
+        fused_temperature_softmax_inplace,
+    )
+
+    configs = [
+        # (batch_size, vocab_size, dtype)
+        (1, 32000, torch.bfloat16),
+        (1, 128256, torch.bfloat16),
+        (32, 32000, torch.bfloat16),
+        (32, 128256, torch.bfloat16),
+        (128, 32000, torch.bfloat16),
+        (128, 128256, torch.bfloat16),
+        (512, 32000, torch.bfloat16),
+        (512, 128256, torch.bfloat16),
+    ]
+
+    print(f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  {'original (us)':>14}  "
+          f"{'fused (us)':>11}  {'inplace (us)':>13}  {'speedup':>8}  {'speedup_ip':>11}")
+    print("-" * 100)
+
+    for bs, vocab, dtype in configs:
+        temps = (torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1)
+
+        # --- Original ---
+        logits_orig = torch.randn(bs, vocab, dtype=dtype, device="cuda")
+
+        def run_original():
+            l = logits_orig.clone()
+            l.div_(temps)
+            l[:] = torch.softmax(l, dim=-1)
+
+        t_orig = benchmark_fn(run_original, args.warmup, args.iters)
+
+        # --- Fused (out-of-place) ---
+        logits_fused = torch.randn(bs, vocab, dtype=dtype, device="cuda")
+
+        def run_fused():
+            fused_temperature_softmax(logits_fused, temps)
+
+        t_fused = benchmark_fn(run_fused, args.warmup, args.iters)
+
+        # --- Fused (in-place) ---
+        logits_ip = torch.randn(bs, vocab, dtype=dtype, device="cuda")
+
+        def run_inplace():
+            fused_temperature_softmax_inplace(logits_ip, temps)
+
+        t_ip = benchmark_fn(run_inplace, args.warmup, args.iters)
+
+        speedup = t_orig / t_fused
+        speedup_ip = t_orig / t_ip
+        print(
+            f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  {t_orig:>14.1f}  "
+            f"{t_fused:>11.1f}  {t_ip:>13.1f}  {speedup:>7.2f}x  {speedup_ip:>10.2f}x"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
new file mode 100644
index 000000000000..ee5018dd5c06
--- /dev/null
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -0,0 +1,181 @@
+"""Fused Triton kernels for the sampling pipeline.
+
+Fuses temperature scaling + softmax into a single kernel to reduce
+kernel launch overhead and global memory traffic during decode.
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _fused_temperature_softmax_kernel(
+    logits_ptr,
+    temperatures_ptr,
+    output_ptr,
+    vocab_size,
+    logits_stride,
+    output_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+
+    temp = tl.load(temperatures_ptr + row_idx)
+    inv_temp = 1.0 / temp
+
+    logits_row = logits_ptr + row_idx * logits_stride
+    output_row = output_ptr + row_idx * output_stride
+
+    # Pass 1: online softmax -- find max and accumulate sum(exp) in one sweep.
+    # Using the numerically stable online algorithm:
+    #   m_new  = max(m_old, block_max)
+    #   d_new  = d_old * exp(m_old - m_new) + sum(exp(x - m_new))
+    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], value=0.0, dtype=tl.float32)
+
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
+        x = (x * inv_temp).to(tl.float32)
+
+        block_max = tl.max(x, axis=0)
+        new_max = tl.maximum(running_max, block_max)
+
+        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
+            tl.exp(x - new_max), axis=0
+        )
+        running_max = new_max
+
+    log_sum = tl.log(running_sum)
+
+    # Pass 2: normalize and write probabilities.
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
+        x = (x * inv_temp).to(tl.float32)
+
+        prob = tl.exp(x - running_max - log_sum)
+        tl.store(output_row + offsets, prob, mask=mask)
+
+
+def fused_temperature_softmax(
+    logits: torch.Tensor,
+    temperatures: torch.Tensor,
+) -> torch.Tensor:
+    """Fused temperature scaling + softmax in a single Triton kernel.
+
+    Replaces the two-kernel sequence:
+        logits.div_(temperatures)
+        probs = torch.softmax(logits, dim=-1)
+
+    Args:
+        logits: Raw logits of shape ``(batch_size, vocab_size)``.
+        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
+
+    Returns:
+        Probability tensor of shape ``(batch_size, vocab_size)`` in float32.
+    """
+    batch_size, vocab_size = logits.shape
+    if batch_size == 0:
+        return torch.empty(0, vocab_size, dtype=torch.float32, device=logits.device)
+
+    output = torch.empty(
+        batch_size, vocab_size, dtype=torch.float32, device=logits.device
+    )
+    temperatures_flat = temperatures.view(-1)
+
+    BLOCK_SIZE = min(triton.next_power_of_2(vocab_size), 4096)
+
+    grid = (batch_size,)
+    _fused_temperature_softmax_kernel[grid](
+        logits,
+        temperatures_flat,
+        output,
+        vocab_size,
+        logits.stride(0),
+        output.stride(0),
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return output
+
+
+@triton.jit
+def _fused_temperature_softmax_inplace_kernel(
+    logits_ptr,
+    temperatures_ptr,
+    vocab_size,
+    stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+
+    temp = tl.load(temperatures_ptr + row_idx)
+    inv_temp = 1.0 / temp
+
+    row_start = logits_ptr + row_idx * stride
+
+    # Pass 1: online max + sum
+    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], value=0.0, dtype=tl.float32)
+
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
+        x = (x * inv_temp).to(tl.float32)
+
+        block_max = tl.max(x, axis=0)
+        new_max = tl.maximum(running_max, block_max)
+
+        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
+            tl.exp(x - new_max), axis=0
+        )
+        running_max = new_max
+
+    log_sum = tl.log(running_sum)
+
+    # Pass 2: normalize in-place
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
+        x = (x * inv_temp).to(tl.float32)
+
+        prob = tl.exp(x - running_max - log_sum)
+        tl.store(row_start + offsets, prob, mask=mask)
+
+
+def fused_temperature_softmax_inplace(
+    logits: torch.Tensor,
+    temperatures: torch.Tensor,
+) -> None:
+    """In-place fused temperature scaling + softmax.
+
+    After this call, ``logits`` contains probabilities (in the original dtype).
+    This matches the original code pattern:
+        logits.div_(temperatures)
+        logits[:] = torch.softmax(logits, dim=-1)
+
+    Args:
+        logits: Raw logits of shape ``(batch_size, vocab_size)``. Modified in-place.
+        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
+    """
+    batch_size, vocab_size = logits.shape
+    if batch_size == 0:
+        return
+
+    temperatures_flat = temperatures.view(-1)
+
+    BLOCK_SIZE = min(triton.next_power_of_2(vocab_size), 4096)
+
+    grid = (batch_size,)
+    _fused_temperature_softmax_inplace_kernel[grid](
+        logits,
+        temperatures_flat,
+        vocab_size,
+        logits.stride(0),
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index a4c7c7db037e..c9e115186a6c 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -27,6 +27,15 @@
         top_k_renorm_prob,
         top_p_renorm_prob,
     )
+
+_use_fused_sampling = False
+if is_cuda():
+    try:
+        from sglang.srt.layers.fused_sampling import fused_temperature_softmax
+
+        _use_fused_sampling = True
+    except ImportError:
+        pass
 if is_npu():
     import torch_npu
 
@@ -152,11 +161,14 @@ def forward(
                     logprobs = logprobs_via_logsoftmax_kernel
             else:
                 # Standard path: do softmax and sample from probs.
-                logits.div_(sampling_info.temperatures)
-
-                # In-place op to save memory
-                logits[:] = torch.softmax(logits, dim=-1)
-                probs = logits
+                if _use_fused_sampling:
+                    probs = fused_temperature_softmax(
+                        logits, sampling_info.temperatures
+                    )
+                else:
+                    logits.div_(sampling_info.temperatures)
+                    logits[:] = torch.softmax(logits, dim=-1)
+                    probs = logits
 
                 batch_next_token_ids = self._sample_from_probs(
                     probs, sampling_info, positions, simple_sampling_case
diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
new file mode 100644
index 000000000000..b993e435c6f8
--- /dev/null
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -0,0 +1,160 @@
+"""Correctness tests for fused_temperature_softmax Triton kernel.
+
+Compares the fused kernel output against the reference PyTorch implementation
+(logits.div_(temperatures) followed by torch.softmax) across a range of batch
+sizes, vocab sizes, dtypes, and temperature values.
+"""
+
+import unittest
+
+import torch
+
+from sglang.srt.layers.fused_sampling import (
+    fused_temperature_softmax,
+    fused_temperature_softmax_inplace,
+)
+from sglang.srt.utils import get_device
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_cuda_ci(est_time=15, suite="stage-b-test-small-1-gpu")
+
+
+def reference_temperature_softmax(logits, temperatures):
+    """Reference implementation: div + softmax (separate kernels)."""
+    logits = logits.clone()
+    logits.div_(temperatures)
+    return torch.softmax(logits, dim=-1).float()
+
+
+class TestFusedTemperatureSoftmax(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.set_default_device(get_device())
+        torch.manual_seed(42)
+
+    def _check_close(self, fused, ref, atol=1e-5, rtol=1e-5):
+        """Assert outputs are close and both are valid probability distributions."""
+        self.assertEqual(fused.shape, ref.shape)
+        # Valid probabilities: non-negative, sum to ~1
+        self.assertTrue(
+            (fused >= 0).all(), f"Negative probabilities in fused output"
+        )
+        row_sums = fused.sum(dim=-1)
+        torch.testing.assert_close(
+            row_sums,
+            torch.ones_like(row_sums),
+            atol=1e-4,
+            rtol=1e-4,
+        )
+        torch.testing.assert_close(fused, ref, atol=atol, rtol=rtol)
+
+    # --- out-of-place kernel ---
+
+    def test_basic(self):
+        logits = torch.randn(4, 1024, dtype=torch.bfloat16)
+        temps = torch.tensor([0.7, 1.0, 1.5, 2.0], dtype=torch.float32).view(-1, 1)
+        ref = reference_temperature_softmax(logits, temps)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+
+    def test_large_vocab(self):
+        logits = torch.randn(8, 128256, dtype=torch.bfloat16)
+        temps = torch.full((8, 1), 0.6, dtype=torch.float32)
+        ref = reference_temperature_softmax(logits, temps)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+
+    def test_batch_sizes(self):
+        for bs in [1, 2, 16, 64, 128, 512]:
+            logits = torch.randn(bs, 32000, dtype=torch.bfloat16)
+            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
+            ref = reference_temperature_softmax(logits, temps)
+            fused = fused_temperature_softmax(logits, temps)
+            self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+
+    def test_temperature_one(self):
+        """Temperature=1.0 should be equivalent to plain softmax."""
+        logits = torch.randn(16, 32000, dtype=torch.bfloat16)
+        temps = torch.ones(16, 1, dtype=torch.float32)
+        ref = torch.softmax(logits.float(), dim=-1)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+
+    def test_very_low_temperature(self):
+        """Very low temperature should produce near-one-hot distribution."""
+        logits = torch.randn(4, 1024, dtype=torch.bfloat16)
+        temps = torch.full((4, 1), 0.01, dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        # Max probability should be very close to 1.0
+        max_probs = fused.max(dim=-1).values
+        self.assertTrue((max_probs > 0.99).all())
+
+    def test_very_high_temperature(self):
+        """Very high temperature should produce near-uniform distribution."""
+        logits = torch.randn(4, 1024, dtype=torch.bfloat16)
+        temps = torch.full((4, 1), 100.0, dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        uniform = 1.0 / 1024
+        self.assertTrue(
+            (fused - uniform).abs().max() < 0.01,
+            "High temperature should produce near-uniform distribution",
+        )
+
+    def test_fp16_input(self):
+        logits = torch.randn(8, 32000, dtype=torch.float16)
+        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
+        ref = reference_temperature_softmax(logits, temps)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-3, rtol=1e-2)
+
+    def test_fp32_input(self):
+        logits = torch.randn(8, 32000, dtype=torch.float32)
+        temps = torch.rand(8, 1, dtype=torch.float32) + 0.5
+        ref = reference_temperature_softmax(logits, temps)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-5, rtol=1e-5)
+
+    def test_mixed_temperatures(self):
+        """Each row has a different temperature."""
+        logits = torch.randn(8, 32000, dtype=torch.bfloat16)
+        temps = torch.tensor(
+            [0.1, 0.5, 0.7, 1.0, 1.2, 1.5, 2.0, 5.0], dtype=torch.float32
+        ).view(-1, 1)
+        ref = reference_temperature_softmax(logits, temps)
+        fused = fused_temperature_softmax(logits, temps)
+        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+
+    def test_empty_batch(self):
+        logits = torch.randn(0, 32000, dtype=torch.bfloat16)
+        temps = torch.ones(0, 1, dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        self.assertEqual(fused.shape, (0, 32000))
+
+    # --- in-place kernel ---
+
+    def test_inplace_basic(self):
+        logits = torch.randn(8, 32000, dtype=torch.float32)
+        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
+        ref = reference_temperature_softmax(logits, temps)
+        fused_temperature_softmax_inplace(logits, temps)
+        # In-place writes back to logits in the original dtype
+        self._check_close(logits.float(), ref, atol=1e-5, rtol=1e-5)
+
+    def test_inplace_bf16(self):
+        logits = torch.randn(8, 32000, dtype=torch.bfloat16)
+        temps = torch.rand(8, 1, dtype=torch.float32) + 0.5
+        ref = reference_temperature_softmax(logits, temps)
+        fused_temperature_softmax_inplace(logits, temps)
+        self._check_close(logits.float(), ref, atol=2e-3, rtol=2e-3)
+
+    def test_inplace_large_vocab(self):
+        logits = torch.randn(4, 128256, dtype=torch.bfloat16)
+        temps = torch.full((4, 1), 0.8, dtype=torch.float32)
+        ref = reference_temperature_softmax(logits, temps)
+        fused_temperature_softmax_inplace(logits, temps)
+        self._check_close(logits.float(), ref, atol=2e-3, rtol=2e-3)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 7a5928aa4fabc249ec7d793e1d267c05666a6825 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Thu, 12 Mar 2026 21:55:45 -0700
Subject: [PATCH 02/27] Add AutoTune

---
 .../bench_fused_temperature_softmax.py        |  8 +++--
 python/sglang/srt/layers/fused_sampling.py    | 29 ++++++++++++-------
 python/sglang/srt/layers/sampler.py           |  6 ++--
 .../test_fused_temperature_softmax.py         |  4 +--
 4 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/benchmark/kernels/bench_fused_temperature_softmax.py b/benchmark/kernels/bench_fused_temperature_softmax.py
index 1896849e09b2..df524a97d1a9 100644
--- a/benchmark/kernels/bench_fused_temperature_softmax.py
+++ b/benchmark/kernels/bench_fused_temperature_softmax.py
@@ -54,12 +54,14 @@ def main():
         (512, 128256, torch.bfloat16),
     ]
 
-    print(f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  {'original (us)':>14}  "
-          f"{'fused (us)':>11}  {'inplace (us)':>13}  {'speedup':>8}  {'speedup_ip':>11}")
+    print(
+        f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  {'original (us)':>14}  "
+        f"{'fused (us)':>11}  {'inplace (us)':>13}  {'speedup':>8}  {'speedup_ip':>11}"
+    )
     print("-" * 100)
 
     for bs, vocab, dtype in configs:
-        temps = (torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1)
+        temps = torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1
 
         # --- Original ---
         logits_orig = torch.randn(bs, vocab, dtype=dtype, device="cuda")
diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index ee5018dd5c06..fa0904f8587c 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -8,7 +8,22 @@
 import triton
 import triton.language as tl
 
-
+_AUTOTUNE_CONFIGS = [
+    triton.Config({"BLOCK_SIZE": 1024}, num_warps=4),
+    triton.Config({"BLOCK_SIZE": 1024}, num_warps=8),
+    triton.Config({"BLOCK_SIZE": 2048}, num_warps=8),
+    triton.Config({"BLOCK_SIZE": 2048}, num_warps=16),
+    triton.Config({"BLOCK_SIZE": 4096}, num_warps=8),
+    triton.Config({"BLOCK_SIZE": 4096}, num_warps=16),
+    triton.Config({"BLOCK_SIZE": 8192}, num_warps=16),
+    triton.Config({"BLOCK_SIZE": 8192}, num_warps=32),
+    triton.Config({"BLOCK_SIZE": 16384}, num_warps=16),
+    triton.Config({"BLOCK_SIZE": 16384}, num_warps=32),
+    triton.Config({"BLOCK_SIZE": 32768}, num_warps=32),
+]
+
+
+@triton.autotune(configs=_AUTOTUNE_CONFIGS, key=["vocab_size"])
 @triton.jit
 def _fused_temperature_softmax_kernel(
     logits_ptr,
@@ -27,10 +42,7 @@ def _fused_temperature_softmax_kernel(
     logits_row = logits_ptr + row_idx * logits_stride
     output_row = output_ptr + row_idx * output_stride
 
-    # Pass 1: online softmax -- find max and accumulate sum(exp) in one sweep.
-    # Using the numerically stable online algorithm:
-    #   m_new  = max(m_old, block_max)
-    #   d_new  = d_old * exp(m_old - m_new) + sum(exp(x - m_new))
+    # Pass 1: online softmax — find max and accumulate sum(exp) in one sweep.
     running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     running_sum = tl.full([], value=0.0, dtype=tl.float32)
 
@@ -87,8 +99,6 @@ def fused_temperature_softmax(
     )
     temperatures_flat = temperatures.view(-1)
 
-    BLOCK_SIZE = min(triton.next_power_of_2(vocab_size), 4096)
-
     grid = (batch_size,)
     _fused_temperature_softmax_kernel[grid](
         logits,
@@ -97,11 +107,11 @@ def fused_temperature_softmax(
         vocab_size,
         logits.stride(0),
         output.stride(0),
-        BLOCK_SIZE=BLOCK_SIZE,
     )
     return output
 
 
+@triton.autotune(configs=_AUTOTUNE_CONFIGS, key=["vocab_size"])
 @triton.jit
 def _fused_temperature_softmax_inplace_kernel(
     logits_ptr,
@@ -169,13 +179,10 @@ def fused_temperature_softmax_inplace(
 
     temperatures_flat = temperatures.view(-1)
 
-    BLOCK_SIZE = min(triton.next_power_of_2(vocab_size), 4096)
-
     grid = (batch_size,)
     _fused_temperature_softmax_inplace_kernel[grid](
         logits,
         temperatures_flat,
         vocab_size,
         logits.stride(0),
-        BLOCK_SIZE=BLOCK_SIZE,
     )
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index c9e115186a6c..f63a22a02651 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -31,7 +31,7 @@
 _use_fused_sampling = False
 if is_cuda():
     try:
-        from sglang.srt.layers.fused_sampling import fused_temperature_softmax
+        from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
 
         _use_fused_sampling = True
     except ImportError:
@@ -162,13 +162,13 @@ def forward(
             else:
                 # Standard path: do softmax and sample from probs.
                 if _use_fused_sampling:
-                    probs = fused_temperature_softmax(
+                    fused_temperature_softmax_inplace(
                         logits, sampling_info.temperatures
                     )
                 else:
                     logits.div_(sampling_info.temperatures)
                     logits[:] = torch.softmax(logits, dim=-1)
-                    probs = logits
+                probs = logits
 
                 batch_next_token_ids = self._sample_from_probs(
                     probs, sampling_info, positions, simple_sampling_case
diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index b993e435c6f8..a2ff6f5bb503 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -37,9 +37,7 @@ def _check_close(self, fused, ref, atol=1e-5, rtol=1e-5):
         """Assert outputs are close and both are valid probability distributions."""
         self.assertEqual(fused.shape, ref.shape)
         # Valid probabilities: non-negative, sum to ~1
-        self.assertTrue(
-            (fused >= 0).all(), f"Negative probabilities in fused output"
-        )
+        self.assertTrue((fused >= 0).all(), f"Negative probabilities in fused output")
         row_sums = fused.sum(dim=-1)
         torch.testing.assert_close(
             row_sums,

From dcd11ad4ff1ff29807f3e85a69babb6f2c3f76d7 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Thu, 12 Mar 2026 22:03:34 -0700
Subject: [PATCH 03/27] Add Dispatcher

---
 python/sglang/srt/layers/fused_sampling.py | 224 +++++++++++++++------
 1 file changed, 161 insertions(+), 63 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index fa0904f8587c..e462cddb1235 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -2,30 +2,108 @@
 
 Fuses temperature scaling + softmax into a single kernel to reduce
 kernel launch overhead and global memory traffic during decode.
+
+Two kernel variants:
+  - Single-pass: vocab fits in one tile (1 read + 1 write). Used when
+    next_power_of_2(vocab) <= 32768.
+  - Multi-pass: 2-pass online softmax with autotune (2 reads + 1 write).
+    Used for large vocabs (e.g. 128K+).
 """
 
 import torch
 import triton
 import triton.language as tl
 
-_AUTOTUNE_CONFIGS = [
-    triton.Config({"BLOCK_SIZE": 1024}, num_warps=4),
-    triton.Config({"BLOCK_SIZE": 1024}, num_warps=8),
+_MAX_SINGLE_PASS_BLOCK = 32768
+
+# ---------------------------------------------------------------------------
+# Single-pass kernel: entire vocab fits in one BLOCK_SIZE tile.
+# Data stays in registers — only 1 global memory read + 1 write.
+# ---------------------------------------------------------------------------
+
+
+@triton.jit
+def _single_pass_temperature_softmax_kernel(
+    logits_ptr,
+    temperatures_ptr,
+    output_ptr,
+    vocab_size,
+    logits_stride,
+    output_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    temp = tl.load(temperatures_ptr + row_idx)
+    inv_temp = 1.0 / temp
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < vocab_size
+
+    x = tl.load(
+        logits_ptr + row_idx * logits_stride + offsets,
+        mask=mask,
+        other=float("-inf"),
+    )
+    x = (x * inv_temp).to(tl.float32)
+
+    x_max = tl.max(x, axis=0)
+    exp_x = tl.exp(x - x_max)
+    prob = exp_x / tl.sum(exp_x, axis=0)
+
+    tl.store(output_ptr + row_idx * output_stride + offsets, prob, mask=mask)
+
+
+@triton.jit
+def _single_pass_temperature_softmax_inplace_kernel(
+    logits_ptr,
+    temperatures_ptr,
+    vocab_size,
+    stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0)
+    temp = tl.load(temperatures_ptr + row_idx)
+    inv_temp = 1.0 / temp
+
+    row_start = logits_ptr + row_idx * stride
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < vocab_size
+
+    x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
+    x = (x * inv_temp).to(tl.float32)
+
+    x_max = tl.max(x, axis=0)
+    exp_x = tl.exp(x - x_max)
+    prob = exp_x / tl.sum(exp_x, axis=0)
+
+    tl.store(row_start + offsets, prob, mask=mask)
+
+
+# ---------------------------------------------------------------------------
+# Multi-pass kernel: vocab too large for one tile.
+# 2-pass online softmax with autotune over (BLOCK_SIZE, num_warps).
+# ---------------------------------------------------------------------------
+
+_MULTI_PASS_AUTOTUNE_CONFIGS = [
     triton.Config({"BLOCK_SIZE": 2048}, num_warps=8),
     triton.Config({"BLOCK_SIZE": 2048}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 4096}, num_warps=8),
     triton.Config({"BLOCK_SIZE": 4096}, num_warps=16),
+    triton.Config({"BLOCK_SIZE": 4096}, num_warps=16, num_stages=4),
     triton.Config({"BLOCK_SIZE": 8192}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 8192}, num_warps=32),
+    triton.Config({"BLOCK_SIZE": 8192}, num_warps=32, num_stages=4),
     triton.Config({"BLOCK_SIZE": 16384}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 16384}, num_warps=32),
+    triton.Config({"BLOCK_SIZE": 16384}, num_warps=32, num_stages=4),
     triton.Config({"BLOCK_SIZE": 32768}, num_warps=32),
+    triton.Config({"BLOCK_SIZE": 32768}, num_warps=32, num_stages=4),
 ]
 
 
-@triton.autotune(configs=_AUTOTUNE_CONFIGS, key=["vocab_size"])
+@triton.autotune(configs=_MULTI_PASS_AUTOTUNE_CONFIGS, key=["vocab_size"])
 @triton.jit
-def _fused_temperature_softmax_kernel(
+def _multi_pass_temperature_softmax_kernel(
     logits_ptr,
     temperatures_ptr,
     output_ptr,
@@ -35,14 +113,13 @@ def _fused_temperature_softmax_kernel(
     BLOCK_SIZE: tl.constexpr,
 ):
     row_idx = tl.program_id(0)
-
     temp = tl.load(temperatures_ptr + row_idx)
     inv_temp = 1.0 / temp
 
     logits_row = logits_ptr + row_idx * logits_stride
     output_row = output_ptr + row_idx * output_stride
 
-    # Pass 1: online softmax — find max and accumulate sum(exp) in one sweep.
+    # Pass 1: online softmax — find max and accumulate sum(exp).
     running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     running_sum = tl.full([], value=0.0, dtype=tl.float32)
 
@@ -54,7 +131,6 @@ def _fused_temperature_softmax_kernel(
 
         block_max = tl.max(x, axis=0)
         new_max = tl.maximum(running_max, block_max)
-
         running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
             tl.exp(x - new_max), axis=0
         )
@@ -73,47 +149,9 @@ def _fused_temperature_softmax_kernel(
         tl.store(output_row + offsets, prob, mask=mask)
 
 
-def fused_temperature_softmax(
-    logits: torch.Tensor,
-    temperatures: torch.Tensor,
-) -> torch.Tensor:
-    """Fused temperature scaling + softmax in a single Triton kernel.
-
-    Replaces the two-kernel sequence:
-        logits.div_(temperatures)
-        probs = torch.softmax(logits, dim=-1)
-
-    Args:
-        logits: Raw logits of shape ``(batch_size, vocab_size)``.
-        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
-
-    Returns:
-        Probability tensor of shape ``(batch_size, vocab_size)`` in float32.
-    """
-    batch_size, vocab_size = logits.shape
-    if batch_size == 0:
-        return torch.empty(0, vocab_size, dtype=torch.float32, device=logits.device)
-
-    output = torch.empty(
-        batch_size, vocab_size, dtype=torch.float32, device=logits.device
-    )
-    temperatures_flat = temperatures.view(-1)
-
-    grid = (batch_size,)
-    _fused_temperature_softmax_kernel[grid](
-        logits,
-        temperatures_flat,
-        output,
-        vocab_size,
-        logits.stride(0),
-        output.stride(0),
-    )
-    return output
-
-
-@triton.autotune(configs=_AUTOTUNE_CONFIGS, key=["vocab_size"])
+@triton.autotune(configs=_MULTI_PASS_AUTOTUNE_CONFIGS, key=["vocab_size"])
 @triton.jit
-def _fused_temperature_softmax_inplace_kernel(
+def _multi_pass_temperature_softmax_inplace_kernel(
     logits_ptr,
     temperatures_ptr,
     vocab_size,
@@ -121,13 +159,11 @@ def _fused_temperature_softmax_inplace_kernel(
     BLOCK_SIZE: tl.constexpr,
 ):
     row_idx = tl.program_id(0)
-
     temp = tl.load(temperatures_ptr + row_idx)
     inv_temp = 1.0 / temp
 
     row_start = logits_ptr + row_idx * stride
 
-    # Pass 1: online max + sum
     running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     running_sum = tl.full([], value=0.0, dtype=tl.float32)
 
@@ -139,7 +175,6 @@ def _fused_temperature_softmax_inplace_kernel(
 
         block_max = tl.max(x, axis=0)
         new_max = tl.maximum(running_max, block_max)
-
         running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
             tl.exp(x - new_max), axis=0
         )
@@ -147,7 +182,6 @@ def _fused_temperature_softmax_inplace_kernel(
 
     log_sum = tl.log(running_sum)
 
-    # Pass 2: normalize in-place
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
@@ -158,16 +192,69 @@ def _fused_temperature_softmax_inplace_kernel(
         tl.store(row_start + offsets, prob, mask=mask)
 
 
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def _single_pass_num_warps(block_size: int) -> int:
+    return max(4, min(32, block_size // 256))
+
+
+def fused_temperature_softmax(
+    logits: torch.Tensor,
+    temperatures: torch.Tensor,
+) -> torch.Tensor:
+    """Fused temperature scaling + softmax in a single Triton kernel.
+
+    Args:
+        logits: Raw logits of shape ``(batch_size, vocab_size)``.
+        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
+
+    Returns:
+        Probability tensor of shape ``(batch_size, vocab_size)`` in float32.
+    """
+    batch_size, vocab_size = logits.shape
+    if batch_size == 0:
+        return torch.empty(0, vocab_size, dtype=torch.float32, device=logits.device)
+
+    output = torch.empty(
+        batch_size, vocab_size, dtype=torch.float32, device=logits.device
+    )
+    temperatures_flat = temperatures.view(-1)
+    grid = (batch_size,)
+
+    block_size = triton.next_power_of_2(vocab_size)
+    if block_size <= _MAX_SINGLE_PASS_BLOCK:
+        _single_pass_temperature_softmax_kernel[grid](
+            logits,
+            temperatures_flat,
+            output,
+            vocab_size,
+            logits.stride(0),
+            output.stride(0),
+            BLOCK_SIZE=block_size,
+            num_warps=_single_pass_num_warps(block_size),
+        )
+    else:
+        _multi_pass_temperature_softmax_kernel[grid](
+            logits,
+            temperatures_flat,
+            output,
+            vocab_size,
+            logits.stride(0),
+            output.stride(0),
+        )
+    return output
+
+
 def fused_temperature_softmax_inplace(
     logits: torch.Tensor,
     temperatures: torch.Tensor,
 ) -> None:
     """In-place fused temperature scaling + softmax.
 
-    After this call, ``logits`` contains probabilities (in the original dtype).
-    This matches the original code pattern:
-        logits.div_(temperatures)
-        logits[:] = torch.softmax(logits, dim=-1)
+    After this call, ``logits`` contains probabilities.
 
     Args:
         logits: Raw logits of shape ``(batch_size, vocab_size)``. Modified in-place.
@@ -178,11 +265,22 @@ def fused_temperature_softmax_inplace(
         return
 
     temperatures_flat = temperatures.view(-1)
-
     grid = (batch_size,)
-    _fused_temperature_softmax_inplace_kernel[grid](
-        logits,
-        temperatures_flat,
-        vocab_size,
-        logits.stride(0),
-    )
+
+    block_size = triton.next_power_of_2(vocab_size)
+    if block_size <= _MAX_SINGLE_PASS_BLOCK:
+        _single_pass_temperature_softmax_inplace_kernel[grid](
+            logits,
+            temperatures_flat,
+            vocab_size,
+            logits.stride(0),
+            BLOCK_SIZE=block_size,
+            num_warps=_single_pass_num_warps(block_size),
+        )
+    else:
+        _multi_pass_temperature_softmax_inplace_kernel[grid](
+            logits,
+            temperatures_flat,
+            vocab_size,
+            logits.stride(0),
+        )

From 97a84bd882475347b3b1db2c3e27ba6e26695220 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Thu, 26 Mar 2026 06:07:01 -0700
Subject: [PATCH 04/27] Fix Errors

---
 .../bench_fused_temperature_softmax.py        | 29 +++++++------------
 python/sglang/srt/layers/fused_sampling.py    | 11 ++++---
 python/sglang/srt/layers/sampler.py           |  7 ++---
 3 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/benchmark/kernels/bench_fused_temperature_softmax.py b/benchmark/kernels/bench_fused_temperature_softmax.py
index df524a97d1a9..7c230c020ae3 100644
--- a/benchmark/kernels/bench_fused_temperature_softmax.py
+++ b/benchmark/kernels/bench_fused_temperature_softmax.py
@@ -25,12 +25,6 @@ def benchmark_fn(fn, warmup=50, iters=200):
     return start.elapsed_time(end) / iters * 1000  # microseconds
 
 
-def reference_temperature_softmax(logits, temperatures):
-    """Original two-kernel path."""
-    logits.div_(temperatures)
-    logits[:] = torch.softmax(logits, dim=-1)
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--warmup", type=int, default=50)
@@ -62,30 +56,27 @@ def main():
 
     for bs, vocab, dtype in configs:
         temps = torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1
+        # Pre-allocate a source tensor; each run clones from it for fair comparison.
+        logits_src = torch.randn(bs, vocab, dtype=dtype, device="cuda")
 
         # --- Original ---
-        logits_orig = torch.randn(bs, vocab, dtype=dtype, device="cuda")
-
-        def run_original():
-            l = logits_orig.clone()
-            l.div_(temps)
+        def run_original(src=logits_src, t=temps):
+            l = src.clone()
+            l.div_(t)
             l[:] = torch.softmax(l, dim=-1)
 
         t_orig = benchmark_fn(run_original, args.warmup, args.iters)
 
         # --- Fused (out-of-place) ---
-        logits_fused = torch.randn(bs, vocab, dtype=dtype, device="cuda")
-
-        def run_fused():
-            fused_temperature_softmax(logits_fused, temps)
+        def run_fused(src=logits_src, t=temps):
+            fused_temperature_softmax(src.clone(), t)
 
         t_fused = benchmark_fn(run_fused, args.warmup, args.iters)
 
         # --- Fused (in-place) ---
-        logits_ip = torch.randn(bs, vocab, dtype=dtype, device="cuda")
-
-        def run_inplace():
-            fused_temperature_softmax_inplace(logits_ip, temps)
+        def run_inplace(src=logits_src, t=temps):
+            l = src.clone()
+            fused_temperature_softmax_inplace(l, t)
 
         t_ip = benchmark_fn(run_inplace, args.warmup, args.iters)
 
diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index e462cddb1235..7cc089e83808 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -119,7 +119,6 @@ def _multi_pass_temperature_softmax_kernel(
     logits_row = logits_ptr + row_idx * logits_stride
     output_row = output_ptr + row_idx * output_stride
 
-    # Pass 1: online softmax — find max and accumulate sum(exp).
     running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     running_sum = tl.full([], value=0.0, dtype=tl.float32)
 
@@ -138,7 +137,6 @@ def _multi_pass_temperature_softmax_kernel(
 
     log_sum = tl.log(running_sum)
 
-    # Pass 2: normalize and write probabilities.
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
@@ -218,10 +216,13 @@ def fused_temperature_softmax(
     if batch_size == 0:
         return torch.empty(0, vocab_size, dtype=torch.float32, device=logits.device)
 
+    if not logits.is_contiguous():
+        logits = logits.contiguous()
+
     output = torch.empty(
         batch_size, vocab_size, dtype=torch.float32, device=logits.device
     )
-    temperatures_flat = temperatures.view(-1)
+    temperatures_flat = temperatures.contiguous().view(-1)
     grid = (batch_size,)
 
     block_size = triton.next_power_of_2(vocab_size)
@@ -264,7 +265,9 @@ def fused_temperature_softmax_inplace(
     if batch_size == 0:
         return
 
-    temperatures_flat = temperatures.view(-1)
+    assert logits.is_contiguous(), "logits must be contiguous for in-place kernel"
+
+    temperatures_flat = temperatures.contiguous().view(-1)
     grid = (batch_size,)
 
     block_size = triton.next_power_of_2(vocab_size)
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index f63a22a02651..348f9a35eae1 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -30,12 +30,9 @@
 
 _use_fused_sampling = False
 if is_cuda():
-    try:
-        from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
+    from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
 
-        _use_fused_sampling = True
-    except ImportError:
-        pass
+    _use_fused_sampling = True
 if is_npu():
     import torch_npu
 

From 8d73f493ea1aa0fa82f804bf002266b874777aae Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Mon, 30 Mar 2026 04:58:54 -0700
Subject: [PATCH 05/27] Add Autotune and Refactor Minor changes

---
 benchmark/kernels/bench_compare_softmax.py    | 107 +++++++++++++
 .../bench_fused_temperature_softmax.py        |  49 +++---
 python/sglang/srt/layers/fused_sampling.py    | 145 +++++++++++-------
 python/sglang/srt/layers/sampler.py           |   3 +-
 .../sglang/srt/model_executor/model_runner.py |  18 ++-
 .../test_fused_temperature_softmax.py         | 122 ++++++++++++++-
 6 files changed, 354 insertions(+), 90 deletions(-)
 create mode 100644 benchmark/kernels/bench_compare_softmax.py

diff --git a/benchmark/kernels/bench_compare_softmax.py b/benchmark/kernels/bench_compare_softmax.py
new file mode 100644
index 000000000000..e81f65ab1657
--- /dev/null
+++ b/benchmark/kernels/bench_compare_softmax.py
@@ -0,0 +1,107 @@
+"""Benchmark: Triton fused kernel vs flashinfer.sampling.softmax vs PyTorch baseline.
+
+Fair comparison: all variants clone logits each iteration to avoid
+measuring on already-softmaxed data.
+Uses torch.cuda.Event timing, 200 iterations after 50 warmup.
+"""
+
+import argparse
+import torch
+
+
+def benchmark_fn(fn, warmup=50, iters=200):
+    """Time a zero-arg callable using CUDA events. Returns microseconds."""
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn()
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end) / iters * 1000  # ms -> us
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--warmup", type=int, default=50)
+    parser.add_argument("--iters", type=int, default=200)
+    args = parser.parse_args()
+
+    from flashinfer.sampling import softmax as flashinfer_softmax
+    from sglang.srt.layers.fused_sampling import (
+        fused_temperature_softmax,
+        fused_temperature_softmax_inplace,
+    )
+
+    configs = [
+        # (batch_size, vocab_size, dtype)
+        (1, 32000, torch.bfloat16),
+        (1, 128256, torch.bfloat16),
+        (32, 32000, torch.bfloat16),
+        (32, 128256, torch.bfloat16),
+        (128, 32000, torch.bfloat16),
+        (128, 128256, torch.bfloat16),
+        (512, 32000, torch.bfloat16),
+        (512, 128256, torch.bfloat16),
+    ]
+
+    header = (
+        f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  "
+        f"{'pytorch(us)':>12}  {'triton(us)':>11}  {'triton_ip(us)':>14}  "
+        f"{'flashinfer(us)':>15}  "
+        f"{'tri/py':>7}  {'fi/py':>7}  {'tri/fi':>7}"
+    )
+    print(header)
+    print("-" * len(header))
+
+    for bs, vocab, dtype in configs:
+        temps = torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1
+        temps_flat = temps.view(-1)
+        logits_base = torch.randn(bs, vocab, dtype=dtype, device="cuda")
+
+        # --- PyTorch baseline (clone each iter for fairness) ---
+        def run_pytorch(logits_base=logits_base, temps=temps):
+            l = logits_base.clone()
+            l.div_(temps)
+            l[:] = torch.softmax(l, dim=-1)
+
+        t_pytorch = benchmark_fn(run_pytorch, args.warmup, args.iters)
+
+        # --- Triton out-of-place (clone each iter) ---
+        def run_triton(logits_base=logits_base, temps=temps):
+            l = logits_base.clone()
+            fused_temperature_softmax(l, temps)
+
+        t_triton = benchmark_fn(run_triton, args.warmup, args.iters)
+
+        # --- Triton in-place (clone each iter) ---
+        def run_triton_ip(logits_base=logits_base, temps=temps):
+            l = logits_base.clone()
+            fused_temperature_softmax_inplace(l, temps)
+
+        t_triton_ip = benchmark_fn(run_triton_ip, args.warmup, args.iters)
+
+        # --- flashinfer (clone each iter) ---
+        def run_flashinfer(logits_base=logits_base, temps_flat=temps_flat):
+            l = logits_base.clone()
+            flashinfer_softmax(l, temperature=temps_flat)
+
+        t_fi = benchmark_fn(run_flashinfer, args.warmup, args.iters)
+
+        speedup_triton = t_pytorch / t_triton
+        speedup_fi = t_pytorch / t_fi
+        triton_vs_fi = t_fi / t_triton  # >1 means triton faster
+
+        print(
+            f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  "
+            f"{t_pytorch:>12.1f}  {t_triton:>11.1f}  {t_triton_ip:>14.1f}  "
+            f"{t_fi:>15.1f}  "
+            f"{speedup_triton:>6.2f}x  {speedup_fi:>6.2f}x  {triton_vs_fi:>6.2f}x"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/bench_fused_temperature_softmax.py b/benchmark/kernels/bench_fused_temperature_softmax.py
index 7c230c020ae3..264821afd2e2 100644
--- a/benchmark/kernels/bench_fused_temperature_softmax.py
+++ b/benchmark/kernels/bench_fused_temperature_softmax.py
@@ -1,8 +1,4 @@
-"""Benchmark: fused_temperature_softmax vs separate div_ + softmax.
-
-Measures wall-clock time with torch.cuda.Event timing, 200 iterations
-after 50 warmup. Reports per-call latency and speedup.
-"""
+"""Benchmark: fused_temperature_softmax vs div_+softmax vs flashinfer.sampling.softmax."""
 
 import argparse
 
@@ -31,6 +27,8 @@ def main():
     parser.add_argument("--iters", type=int, default=200)
     args = parser.parse_args()
 
+    from flashinfer.sampling import softmax as flashinfer_softmax
+
     from sglang.srt.layers.fused_sampling import (
         fused_temperature_softmax,
         fused_temperature_softmax_inplace,
@@ -48,43 +46,52 @@ def main():
         (512, 128256, torch.bfloat16),
     ]
 
-    print(
-        f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  {'original (us)':>14}  "
-        f"{'fused (us)':>11}  {'inplace (us)':>13}  {'speedup':>8}  {'speedup_ip':>11}"
+    header = (
+        f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  "
+        f"{'baseline (us)':>14}  {'triton (us)':>12}  {'inplace (us)':>13}  {'flashinfer (us)':>16}  "
+        f"{'tri/base':>9}  {'fi/base':>8}"
     )
-    print("-" * 100)
+    print(header)
+    print("-" * len(header))
 
     for bs, vocab, dtype in configs:
         temps = torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1
-        # Pre-allocate a source tensor; each run clones from it for fair comparison.
+        temps_1d = temps.view(-1)
         logits_src = torch.randn(bs, vocab, dtype=dtype, device="cuda")
 
-        # --- Original ---
-        def run_original(src=logits_src, t=temps):
+        # --- Baseline: div_ + softmax ---
+        def run_baseline(src=logits_src, t=temps):
             l = src.clone()
             l.div_(t)
             l[:] = torch.softmax(l, dim=-1)
 
-        t_orig = benchmark_fn(run_original, args.warmup, args.iters)
+        t_base = benchmark_fn(run_baseline, args.warmup, args.iters)
 
-        # --- Fused (out-of-place) ---
-        def run_fused(src=logits_src, t=temps):
+        # --- Triton fused (out-of-place) ---
+        def run_triton(src=logits_src, t=temps):
             fused_temperature_softmax(src.clone(), t)
 
-        t_fused = benchmark_fn(run_fused, args.warmup, args.iters)
+        t_triton = benchmark_fn(run_triton, args.warmup, args.iters)
 
-        # --- Fused (in-place) ---
+        # --- Triton fused (in-place) ---
         def run_inplace(src=logits_src, t=temps):
             l = src.clone()
             fused_temperature_softmax_inplace(l, t)
 
         t_ip = benchmark_fn(run_inplace, args.warmup, args.iters)
 
-        speedup = t_orig / t_fused
-        speedup_ip = t_orig / t_ip
+        # --- FlashInfer softmax with temperature ---
+        def run_flashinfer(src=logits_src, t=temps_1d):
+            flashinfer_softmax(src, temperature=t)
+
+        t_fi = benchmark_fn(run_flashinfer, args.warmup, args.iters)
+
+        sp_triton = t_base / t_triton
+        sp_fi = t_base / t_fi
         print(
-            f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  {t_orig:>14.1f}  "
-            f"{t_fused:>11.1f}  {t_ip:>13.1f}  {speedup:>7.2f}x  {speedup_ip:>10.2f}x"
+            f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  "
+            f"{t_base:>14.1f}  {t_triton:>12.1f}  {t_ip:>13.1f}  {t_fi:>16.1f}  "
+            f"{sp_triton:>8.2f}x  {sp_fi:>7.2f}x"
         )
 
 
diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 7cc089e83808..c78649d1db45 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -10,10 +10,14 @@
     Used for large vocabs (e.g. 128K+).
 """
 
+import logging
+
 import torch
 import triton
 import triton.language as tl
 
+logger = logging.getLogger(__name__)
+
 _MAX_SINGLE_PASS_BLOCK = 32768
 
 # ---------------------------------------------------------------------------
@@ -199,19 +203,63 @@ def _single_pass_num_warps(block_size: int) -> int:
     return max(4, min(32, block_size // 256))
 
 
+def _dispatch_kernel(
+    logits: torch.Tensor,
+    temperatures_flat: torch.Tensor,
+    vocab_size: int,
+    batch_size: int,
+    output: torch.Tensor = None,
+) -> None:
+    """Dispatch to single-pass or multi-pass kernel. output=None means in-place."""
+    grid = (batch_size,)
+    block_size = triton.next_power_of_2(vocab_size)
+    inplace = output is None
+
+    if block_size <= _MAX_SINGLE_PASS_BLOCK:
+        if inplace:
+            _single_pass_temperature_softmax_inplace_kernel[grid](
+                logits,
+                temperatures_flat,
+                vocab_size,
+                logits.stride(0),
+                BLOCK_SIZE=block_size,
+                num_warps=_single_pass_num_warps(block_size),
+            )
+        else:
+            _single_pass_temperature_softmax_kernel[grid](
+                logits,
+                temperatures_flat,
+                output,
+                vocab_size,
+                logits.stride(0),
+                output.stride(0),
+                BLOCK_SIZE=block_size,
+                num_warps=_single_pass_num_warps(block_size),
+            )
+    else:
+        if inplace:
+            _multi_pass_temperature_softmax_inplace_kernel[grid](
+                logits,
+                temperatures_flat,
+                vocab_size,
+                logits.stride(0),
+            )
+        else:
+            _multi_pass_temperature_softmax_kernel[grid](
+                logits,
+                temperatures_flat,
+                output,
+                vocab_size,
+                logits.stride(0),
+                output.stride(0),
+            )
+
+
 def fused_temperature_softmax(
     logits: torch.Tensor,
     temperatures: torch.Tensor,
 ) -> torch.Tensor:
-    """Fused temperature scaling + softmax in a single Triton kernel.
-
-    Args:
-        logits: Raw logits of shape ``(batch_size, vocab_size)``.
-        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
-
-    Returns:
-        Probability tensor of shape ``(batch_size, vocab_size)`` in float32.
-    """
+    """Fused temperature scaling + softmax. Returns float32 probabilities."""
     batch_size, vocab_size = logits.shape
     if batch_size == 0:
         return torch.empty(0, vocab_size, dtype=torch.float32, device=logits.device)
@@ -223,29 +271,7 @@ def fused_temperature_softmax(
         batch_size, vocab_size, dtype=torch.float32, device=logits.device
     )
     temperatures_flat = temperatures.contiguous().view(-1)
-    grid = (batch_size,)
-
-    block_size = triton.next_power_of_2(vocab_size)
-    if block_size <= _MAX_SINGLE_PASS_BLOCK:
-        _single_pass_temperature_softmax_kernel[grid](
-            logits,
-            temperatures_flat,
-            output,
-            vocab_size,
-            logits.stride(0),
-            output.stride(0),
-            BLOCK_SIZE=block_size,
-            num_warps=_single_pass_num_warps(block_size),
-        )
-    else:
-        _multi_pass_temperature_softmax_kernel[grid](
-            logits,
-            temperatures_flat,
-            output,
-            vocab_size,
-            logits.stride(0),
-            output.stride(0),
-        )
+    _dispatch_kernel(logits, temperatures_flat, vocab_size, batch_size, output)
     return output
 
 
@@ -253,14 +279,7 @@ def fused_temperature_softmax_inplace(
     logits: torch.Tensor,
     temperatures: torch.Tensor,
 ) -> None:
-    """In-place fused temperature scaling + softmax.
-
-    After this call, ``logits`` contains probabilities.
-
-    Args:
-        logits: Raw logits of shape ``(batch_size, vocab_size)``. Modified in-place.
-        temperatures: Per-request temperatures of shape ``(batch_size, 1)``.
-    """
+    """In-place fused temperature scaling + softmax. Overwrites logits with probabilities."""
     batch_size, vocab_size = logits.shape
     if batch_size == 0:
         return
@@ -268,22 +287,34 @@ def fused_temperature_softmax_inplace(
     assert logits.is_contiguous(), "logits must be contiguous for in-place kernel"
 
     temperatures_flat = temperatures.contiguous().view(-1)
-    grid = (batch_size,)
+    _dispatch_kernel(logits, temperatures_flat, vocab_size, batch_size)
+
+
+def warmup_fused_temperature_softmax(
+    vocab_size: int,
+    device: torch.device = None,
+) -> None:
+    """Pre-compile and autotune kernels at startup so first request has no latency spike."""
+    if device is None:
+        device = torch.cuda.current_device()
 
     block_size = triton.next_power_of_2(vocab_size)
-    if block_size <= _MAX_SINGLE_PASS_BLOCK:
-        _single_pass_temperature_softmax_inplace_kernel[grid](
-            logits,
-            temperatures_flat,
-            vocab_size,
-            logits.stride(0),
-            BLOCK_SIZE=block_size,
-            num_warps=_single_pass_num_warps(block_size),
-        )
-    else:
-        _multi_pass_temperature_softmax_inplace_kernel[grid](
-            logits,
-            temperatures_flat,
-            vocab_size,
-            logits.stride(0),
-        )
+    is_multi_pass = block_size > _MAX_SINGLE_PASS_BLOCK
+    label = "multi-pass autotune" if is_multi_pass else "single-pass JIT"
+    logger.info(
+        "Warming up fused_temperature_softmax (%s, vocab_size=%d) ...",
+        label,
+        vocab_size,
+    )
+
+    # Small dummy tensors — only 1 row needed to trigger compile + autotune.
+    dummy_logits = torch.randn(1, vocab_size, dtype=torch.bfloat16, device=device)
+    dummy_temps = torch.ones(1, 1, dtype=torch.float32, device=device)
+
+    # Trigger out-of-place kernel
+    fused_temperature_softmax(dummy_logits, dummy_temps)
+    # Trigger in-place kernel
+    fused_temperature_softmax_inplace(dummy_logits.clone(), dummy_temps)
+    torch.cuda.synchronize(device)
+
+    logger.info("fused_temperature_softmax warmup done (vocab_size=%d).", vocab_size)
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 348f9a35eae1..a140698e77af 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -18,6 +18,7 @@
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils.common import crash_on_warnings, get_bool_env_var, is_cuda, is_npu
 
+_use_fused_sampling = False
 if is_cuda():
     from flashinfer.sampling import (
         min_p_sampling_from_probs,
@@ -28,8 +29,6 @@
         top_p_renorm_prob,
     )
 
-_use_fused_sampling = False
-if is_cuda():
     from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
 
     _use_fused_sampling = True
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 37d578f2b737..7f0b008c5978 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1812,16 +1812,26 @@ def init_double_sparsity_channel_config(self, selected_channel):
             )
 
     def kernel_warmup(self):
-        """
-        Warmup and tune kernels before cuda graph capture.
-        Currently only doing FlashInfer autotune.
-        """
+        """Warmup and tune kernels before cuda graph capture."""
         if self.device != "cuda":
             return
 
         if self._should_run_flashinfer_autotune():
             self._flashinfer_autotune()
 
+        self._warmup_fused_sampling()
+
+    def _warmup_fused_sampling(self):
+        """Pre-compile and autotune fused sampling Triton kernels."""
+        from sglang.srt.layers.fused_sampling import (
+            warmup_fused_temperature_softmax,
+        )
+
+        warmup_fused_temperature_softmax(
+            vocab_size=self.model_config.vocab_size,
+            device=self.device,
+        )
+
     def _should_run_flashinfer_autotune(self) -> bool:
         """Check if flashinfer autotune should be run."""
         if self.server_args.disable_flashinfer_autotune:
diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index a2ff6f5bb503..956d67c2cefe 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -1,13 +1,9 @@
-"""Correctness tests for fused_temperature_softmax Triton kernel.
-
-Compares the fused kernel output against the reference PyTorch implementation
-(logits.div_(temperatures) followed by torch.softmax) across a range of batch
-sizes, vocab sizes, dtypes, and temperature values.
-"""
+"""Correctness tests for fused_temperature_softmax Triton kernel."""
 
 import unittest
 
 import torch
+from flashinfer.sampling import softmax as flashinfer_softmax
 
 from sglang.srt.layers.fused_sampling import (
     fused_temperature_softmax,
@@ -153,6 +149,120 @@ def test_inplace_large_vocab(self):
         fused_temperature_softmax_inplace(logits, temps)
         self._check_close(logits.float(), ref, atol=2e-3, rtol=2e-3)
 
+    # --- exact known-value correctness ---
+
+    def test_known_uniform_logits(self):
+        """Identical logits must produce uniform distribution regardless of temperature."""
+        logits = torch.zeros(2, 5, dtype=torch.float32)
+        temps = torch.tensor([0.5, 2.0], dtype=torch.float32).view(-1, 1)
+        fused = fused_temperature_softmax(logits, temps)
+        expected = torch.full((2, 5), 0.2, dtype=torch.float32, device=fused.device)
+        torch.testing.assert_close(fused, expected, atol=1e-6, rtol=1e-6)
+
+    def test_known_softmax_values(self):
+        """Verify against hand-computed softmax(logits / T)."""
+        logits = torch.tensor([[1.0, 2.0, 3.0]], dtype=torch.float32)
+        temps = torch.tensor([[1.0]], dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        # softmax([1,2,3]) = exp([1,2,3]) / sum(exp([1,2,3]))
+        e = torch.exp(logits)
+        expected = (e / e.sum(dim=-1, keepdim=True)).to(fused.device)
+        torch.testing.assert_close(fused, expected, atol=1e-6, rtol=1e-6)
+
+    def test_known_softmax_with_temperature(self):
+        """Verify softmax([1,2,3] / 0.5) against hand computation."""
+        logits = torch.tensor([[1.0, 2.0, 3.0]], dtype=torch.float32)
+        temps = torch.tensor([[0.5]], dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        scaled = logits / 0.5
+        e = torch.exp(scaled)
+        expected = (e / e.sum(dim=-1, keepdim=True)).to(fused.device)
+        torch.testing.assert_close(fused, expected, atol=1e-6, rtol=1e-6)
+
+    # --- argmax preservation ---
+
+    def test_argmax_preserved(self):
+        """argmax must be invariant to temperature for finite T > 0."""
+        logits = torch.randn(64, 32000, dtype=torch.bfloat16)
+        original_argmax = logits.float().argmax(dim=-1)
+        for t_val in [0.1, 0.5, 1.0, 2.0, 10.0]:
+            temps = torch.full((64, 1), t_val, dtype=torch.float32)
+            fused = fused_temperature_softmax(logits, temps)
+            fused_argmax = fused.argmax(dim=-1)
+            self.assertTrue(
+                (original_argmax == fused_argmax).all(),
+                f"argmax changed at temperature={t_val}",
+            )
+
+    # --- numerical stability ---
+
+    def test_large_logits_no_nan(self):
+        """Extreme logit magnitudes must not produce NaN or Inf."""
+        logits = torch.tensor(
+            [[1e6, -1e6, 0.0], [1e4, 1e4 + 1, 1e4 - 1]], dtype=torch.float32
+        )
+        temps = torch.tensor([[1.0], [0.01]], dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        self.assertFalse(torch.isnan(fused).any(), "NaN in output")
+        self.assertFalse(torch.isinf(fused).any(), "Inf in output")
+        row_sums = fused.sum(dim=-1)
+        torch.testing.assert_close(
+            row_sums,
+            torch.ones_like(row_sums),
+            atol=1e-4,
+            rtol=1e-4,
+        )
+
+    def test_large_logits_inplace_no_nan(self):
+        """In-place variant: extreme logits must not produce NaN or Inf."""
+        logits = torch.tensor(
+            [[1e6, -1e6, 0.0], [1e4, 1e4 + 1, 1e4 - 1]], dtype=torch.float32
+        )
+        temps = torch.tensor([[1.0], [0.01]], dtype=torch.float32)
+        fused_temperature_softmax_inplace(logits, temps)
+        self.assertFalse(torch.isnan(logits).any(), "NaN in output")
+        self.assertFalse(torch.isinf(logits).any(), "Inf in output")
+
+    # --- comparison with flashinfer.sampling.softmax ---
+
+    def test_vs_flashinfer_basic(self):
+        logits = torch.randn(4, 1024, dtype=torch.bfloat16)
+        temps = torch.tensor([0.7, 1.0, 1.5, 2.0], dtype=torch.float32).view(-1, 1)
+        fused = fused_temperature_softmax(logits, temps)
+        fi = flashinfer_softmax(logits, temperature=temps.view(-1))
+        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+
+    def test_vs_flashinfer_large_vocab(self):
+        logits = torch.randn(8, 128256, dtype=torch.bfloat16)
+        temps = torch.full((8, 1), 0.6, dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps)
+        fi = flashinfer_softmax(logits, temperature=temps.view(-1))
+        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+
+    def test_vs_flashinfer_batch_sizes(self):
+        for bs in [1, 16, 64, 128, 512]:
+            logits = torch.randn(bs, 32000, dtype=torch.bfloat16)
+            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
+            fused = fused_temperature_softmax(logits, temps)
+            fi = flashinfer_softmax(logits, temperature=temps.view(-1))
+            self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+
+    def test_vs_flashinfer_scalar_temperature(self):
+        logits = torch.randn(16, 32000, dtype=torch.bfloat16)
+        temps_2d = torch.full((16, 1), 0.8, dtype=torch.float32)
+        fused = fused_temperature_softmax(logits, temps_2d)
+        fi = flashinfer_softmax(logits, temperature=0.8)
+        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+
+    def test_vs_flashinfer_mixed_temperatures(self):
+        logits = torch.randn(8, 32000, dtype=torch.bfloat16)
+        temps = torch.tensor(
+            [0.1, 0.5, 0.7, 1.0, 1.2, 1.5, 2.0, 5.0], dtype=torch.float32
+        ).view(-1, 1)
+        fused = fused_temperature_softmax(logits, temps)
+        fi = flashinfer_softmax(logits, temperature=temps.view(-1))
+        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+
 
 if __name__ == "__main__":
     unittest.main()

From 131ce9dbc2b53938423468942d2131bb53dd17bc Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Mon, 30 Mar 2026 05:06:06 -0700
Subject: [PATCH 06/27] Fix Lint and Merge Bench

---
 benchmark/kernels/bench_compare_softmax.py    | 107 ------------------
 .../bench_fused_temperature_softmax.py        |  19 +++-
 2 files changed, 14 insertions(+), 112 deletions(-)
 delete mode 100644 benchmark/kernels/bench_compare_softmax.py

diff --git a/benchmark/kernels/bench_compare_softmax.py b/benchmark/kernels/bench_compare_softmax.py
deleted file mode 100644
index e81f65ab1657..000000000000
--- a/benchmark/kernels/bench_compare_softmax.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""Benchmark: Triton fused kernel vs flashinfer.sampling.softmax vs PyTorch baseline.
-
-Fair comparison: all variants clone logits each iteration to avoid
-measuring on already-softmaxed data.
-Uses torch.cuda.Event timing, 200 iterations after 50 warmup.
-"""
-
-import argparse
-import torch
-
-
-def benchmark_fn(fn, warmup=50, iters=200):
-    """Time a zero-arg callable using CUDA events. Returns microseconds."""
-    for _ in range(warmup):
-        fn()
-    torch.cuda.synchronize()
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    start.record()
-    for _ in range(iters):
-        fn()
-    end.record()
-    torch.cuda.synchronize()
-    return start.elapsed_time(end) / iters * 1000  # ms -> us
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--warmup", type=int, default=50)
-    parser.add_argument("--iters", type=int, default=200)
-    args = parser.parse_args()
-
-    from flashinfer.sampling import softmax as flashinfer_softmax
-    from sglang.srt.layers.fused_sampling import (
-        fused_temperature_softmax,
-        fused_temperature_softmax_inplace,
-    )
-
-    configs = [
-        # (batch_size, vocab_size, dtype)
-        (1, 32000, torch.bfloat16),
-        (1, 128256, torch.bfloat16),
-        (32, 32000, torch.bfloat16),
-        (32, 128256, torch.bfloat16),
-        (128, 32000, torch.bfloat16),
-        (128, 128256, torch.bfloat16),
-        (512, 32000, torch.bfloat16),
-        (512, 128256, torch.bfloat16),
-    ]
-
-    header = (
-        f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  "
-        f"{'pytorch(us)':>12}  {'triton(us)':>11}  {'triton_ip(us)':>14}  "
-        f"{'flashinfer(us)':>15}  "
-        f"{'tri/py':>7}  {'fi/py':>7}  {'tri/fi':>7}"
-    )
-    print(header)
-    print("-" * len(header))
-
-    for bs, vocab, dtype in configs:
-        temps = torch.rand(bs, 1, dtype=torch.float32, device="cuda") * 1.5 + 0.1
-        temps_flat = temps.view(-1)
-        logits_base = torch.randn(bs, vocab, dtype=dtype, device="cuda")
-
-        # --- PyTorch baseline (clone each iter for fairness) ---
-        def run_pytorch(logits_base=logits_base, temps=temps):
-            l = logits_base.clone()
-            l.div_(temps)
-            l[:] = torch.softmax(l, dim=-1)
-
-        t_pytorch = benchmark_fn(run_pytorch, args.warmup, args.iters)
-
-        # --- Triton out-of-place (clone each iter) ---
-        def run_triton(logits_base=logits_base, temps=temps):
-            l = logits_base.clone()
-            fused_temperature_softmax(l, temps)
-
-        t_triton = benchmark_fn(run_triton, args.warmup, args.iters)
-
-        # --- Triton in-place (clone each iter) ---
-        def run_triton_ip(logits_base=logits_base, temps=temps):
-            l = logits_base.clone()
-            fused_temperature_softmax_inplace(l, temps)
-
-        t_triton_ip = benchmark_fn(run_triton_ip, args.warmup, args.iters)
-
-        # --- flashinfer (clone each iter) ---
-        def run_flashinfer(logits_base=logits_base, temps_flat=temps_flat):
-            l = logits_base.clone()
-            flashinfer_softmax(l, temperature=temps_flat)
-
-        t_fi = benchmark_fn(run_flashinfer, args.warmup, args.iters)
-
-        speedup_triton = t_pytorch / t_triton
-        speedup_fi = t_pytorch / t_fi
-        triton_vs_fi = t_fi / t_triton  # >1 means triton faster
-
-        print(
-            f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  "
-            f"{t_pytorch:>12.1f}  {t_triton:>11.1f}  {t_triton_ip:>14.1f}  "
-            f"{t_fi:>15.1f}  "
-            f"{speedup_triton:>6.2f}x  {speedup_fi:>6.2f}x  {triton_vs_fi:>6.2f}x"
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/kernels/bench_fused_temperature_softmax.py b/benchmark/kernels/bench_fused_temperature_softmax.py
index 264821afd2e2..fc624b721ecf 100644
--- a/benchmark/kernels/bench_fused_temperature_softmax.py
+++ b/benchmark/kernels/bench_fused_temperature_softmax.py
@@ -1,4 +1,11 @@
-"""Benchmark: fused_temperature_softmax vs div_+softmax vs flashinfer.sampling.softmax."""
+"""Benchmark: fused_temperature_softmax vs separate div_ + softmax vs flashinfer.sampling.softmax.
+
+Each path clones logits every iteration so timing is not skewed by in-place reuse.
+Uses torch.cuda.Event timing; default 50 warmup, 200 timed iterations.
+
+Columns tri/base and fi/base are speedup vs PyTorch baseline; tri/fi is t_flashinfer/t_triton
+(>1 means Triton is faster).
+"""
 
 import argparse
 
@@ -49,7 +56,7 @@ def main():
     header = (
         f"{'bs':>5}  {'vocab':>7}  {'dtype':>8}  "
         f"{'baseline (us)':>14}  {'triton (us)':>12}  {'inplace (us)':>13}  {'flashinfer (us)':>16}  "
-        f"{'tri/base':>9}  {'fi/base':>8}"
+        f"{'tri/base':>9}  {'fi/base':>8}  {'tri/fi':>7}"
     )
     print(header)
     print("-" * len(header))
@@ -80,18 +87,20 @@ def run_inplace(src=logits_src, t=temps):
 
         t_ip = benchmark_fn(run_inplace, args.warmup, args.iters)
 
-        # --- FlashInfer softmax with temperature ---
+        # --- FlashInfer (clone each iter, same as other paths) ---
         def run_flashinfer(src=logits_src, t=temps_1d):
-            flashinfer_softmax(src, temperature=t)
+            l = src.clone()
+            flashinfer_softmax(l, temperature=t)
 
         t_fi = benchmark_fn(run_flashinfer, args.warmup, args.iters)
 
         sp_triton = t_base / t_triton
         sp_fi = t_base / t_fi
+        tri_vs_fi = t_fi / t_triton
         print(
             f"{bs:>5}  {vocab:>7}  {str(dtype):>8}  "
             f"{t_base:>14.1f}  {t_triton:>12.1f}  {t_ip:>13.1f}  {t_fi:>16.1f}  "
-            f"{sp_triton:>8.2f}x  {sp_fi:>7.2f}x"
+            f"{sp_triton:>8.2f}x  {sp_fi:>7.2f}x  {tri_vs_fi:>6.2f}x"
         )
 
 

From e333bc54c6d56b73767bac58b3341c08f81d24bd Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Mon, 30 Mar 2026 13:24:36 -0700
Subject: [PATCH 07/27] Fix AMD CI Crash

---
 python/sglang/srt/model_executor/model_runner.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 7f0b008c5978..2ad6dac11169 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1812,7 +1812,10 @@ def init_double_sparsity_channel_config(self, selected_channel):
             )
 
     def kernel_warmup(self):
-        """Warmup and tune kernels before cuda graph capture."""
+        """
+        Warmup and tune kernels before cuda graph capture.
+        Currently only doing FlashInfer autotune.
+        """
         if self.device != "cuda":
             return
 
@@ -1823,6 +1826,9 @@ def kernel_warmup(self):
 
     def _warmup_fused_sampling(self):
         """Pre-compile and autotune fused sampling Triton kernels."""
+        if _is_hip:
+            return
+
         from sglang.srt.layers.fused_sampling import (
             warmup_fused_temperature_softmax,
         )

From fa120a886966cff28e3f617b92a01d9ffa27fa3e Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 06:08:30 -0700
Subject: [PATCH 08/27] Change Dtype

---
 python/sglang/srt/layers/fused_sampling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index c78649d1db45..6519a1171b98 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -308,7 +308,7 @@ def warmup_fused_temperature_softmax(
     )
 
     # Small dummy tensors — only 1 row needed to trigger compile + autotune.
-    dummy_logits = torch.randn(1, vocab_size, dtype=torch.bfloat16, device=device)
+    dummy_logits = torch.randn(1, vocab_size, dtype=torch.float32, device=device)
     dummy_temps = torch.ones(1, 1, dtype=torch.float32, device=device)
 
     # Trigger out-of-place kernel

From cc84c884468b7c0a82f40f3c239705da7b376635 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 12:24:16 -0700
Subject: [PATCH 09/27] Remove Trition Autotune at In-Place Kernel

---
 python/sglang/srt/layers/fused_sampling.py | 45 +++++++++++++++++++---
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 6519a1171b98..e6692f1a409a 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -151,7 +151,6 @@ def _multi_pass_temperature_softmax_kernel(
         tl.store(output_row + offsets, prob, mask=mask)
 
 
-@triton.autotune(configs=_MULTI_PASS_AUTOTUNE_CONFIGS, key=["vocab_size"])
 @triton.jit
 def _multi_pass_temperature_softmax_inplace_kernel(
     logits_ptr,
@@ -198,11 +197,23 @@ def _multi_pass_temperature_softmax_inplace_kernel(
 # Public API
 # ---------------------------------------------------------------------------
 
+_DEFAULT_MULTI_PASS_CONFIG = {"BLOCK_SIZE": 4096, "num_warps": 16}
+
+# Populated by warmup from the out-of-place kernel's autotune result.
+_multi_pass_inplace_config: dict | None = None
+
 
 def _single_pass_num_warps(block_size: int) -> int:
     return max(4, min(32, block_size // 256))
 
 
+def _get_multi_pass_inplace_config() -> dict:
+    """Return the launch config for the multi-pass in-place kernel."""
+    if _multi_pass_inplace_config is not None:
+        return _multi_pass_inplace_config
+    return _DEFAULT_MULTI_PASS_CONFIG
+
+
 def _dispatch_kernel(
     logits: torch.Tensor,
     temperatures_flat: torch.Tensor,
@@ -238,11 +249,13 @@ def _dispatch_kernel(
             )
     else:
         if inplace:
+            cfg = _get_multi_pass_inplace_config()
             _multi_pass_temperature_softmax_inplace_kernel[grid](
                 logits,
                 temperatures_flat,
                 vocab_size,
                 logits.stride(0),
+                **cfg,
             )
         else:
             _multi_pass_temperature_softmax_kernel[grid](
@@ -294,7 +307,14 @@ def warmup_fused_temperature_softmax(
     vocab_size: int,
     device: torch.device = None,
 ) -> None:
-    """Pre-compile and autotune kernels at startup so first request has no latency spike."""
+    """Pre-compile and autotune kernels at startup so first request has no latency spike.
+
+    For multi-pass kernels the out-of-place variant is autotuned (safe — separate
+    input/output buffers), and its winning config is reused for the in-place
+    variant so that no autotune ever runs on a live logits buffer.
+    """
+    global _multi_pass_inplace_config
+
     if device is None:
         device = torch.cuda.current_device()
 
@@ -307,13 +327,28 @@ def warmup_fused_temperature_softmax(
         vocab_size,
     )
 
-    # Small dummy tensors — only 1 row needed to trigger compile + autotune.
     dummy_logits = torch.randn(1, vocab_size, dtype=torch.float32, device=device)
     dummy_temps = torch.ones(1, 1, dtype=torch.float32, device=device)
 
-    # Trigger out-of-place kernel
+    # 1. Out-of-place kernel: autotune runs here (safe, separate buffers).
     fused_temperature_softmax(dummy_logits, dummy_temps)
-    # Trigger in-place kernel
+
+    # 2. Propagate best config to the in-place kernel (no autotune needed).
+    if is_multi_pass:
+        best = _multi_pass_temperature_softmax_kernel.best_config
+        _multi_pass_inplace_config = {
+            "BLOCK_SIZE": best.kwargs["BLOCK_SIZE"],
+            "num_warps": best.num_warps,
+            "num_stages": best.num_stages,
+        }
+        logger.info(
+            "Multi-pass autotune result: BLOCK_SIZE=%d, num_warps=%d, num_stages=%d",
+            _multi_pass_inplace_config["BLOCK_SIZE"],
+            _multi_pass_inplace_config["num_warps"],
+            _multi_pass_inplace_config["num_stages"],
+        )
+
+    # 3. In-place kernel: JIT compile only (uses the config from step 2).
     fused_temperature_softmax_inplace(dummy_logits.clone(), dummy_temps)
     torch.cuda.synchronize(device)
 

From f356ce2684a73d38f085cc80939cd70ea8097160 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 18:06:18 -0700
Subject: [PATCH 10/27] Fix Sampler Issues

---
 python/sglang/srt/layers/sampler.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index bf20cace7724..306ef022345c 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -20,12 +20,10 @@
 
 _use_fused_sampling = False
 if is_cuda():
-    from flashinfer.sampling import (
-        min_p_sampling_from_probs,
-        top_k_top_p_sampling_from_probs,
-    )
     from sgl_kernel import (
+        min_p_sampling_from_probs,
         top_k_renorm_prob,
+        top_k_top_p_sampling_from_probs,
         top_p_renorm_prob,
     )
 
@@ -335,15 +333,13 @@ def _attach_logprobs_to_output(
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums, no_copy_to_cpu=True)
+            ) = get_top_logprobs(logprobs, top_logprobs_nums)
 
         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(
-                logprobs, token_ids_logprobs, no_copy_to_cpu=True
-            )
+            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs)
 
         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=sampling_info.device),
@@ -407,7 +403,7 @@ def compute_logprobs_only(
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums, no_copy_to_cpu=True)
+            ) = get_top_logprobs(logprobs, top_logprobs_nums)
 
         # Handle token_ids logprobs if requested
         if needs_token_ids_logprobs:

From bf92dc140a290e999bcc0448ba879195f868002d Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 18:21:06 -0700
Subject: [PATCH 11/27] Remove WarmUp for Test Notebook CI

---
 python/sglang/srt/layers/sampler.py              | 14 +++++++++-----
 python/sglang/srt/model_executor/model_runner.py | 12 +-----------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 306ef022345c..bf20cace7724 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -20,10 +20,12 @@
 
 _use_fused_sampling = False
 if is_cuda():
-    from sgl_kernel import (
+    from flashinfer.sampling import (
         min_p_sampling_from_probs,
-        top_k_renorm_prob,
         top_k_top_p_sampling_from_probs,
+    )
+    from sgl_kernel import (
+        top_k_renorm_prob,
         top_p_renorm_prob,
     )
 
@@ -333,13 +335,15 @@ def _attach_logprobs_to_output(
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums)
+            ) = get_top_logprobs(logprobs, top_logprobs_nums, no_copy_to_cpu=True)
 
         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs)
+            ) = get_token_ids_logprobs(
+                logprobs, token_ids_logprobs, no_copy_to_cpu=True
+            )
 
         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=sampling_info.device),
@@ -403,7 +407,7 @@ def compute_logprobs_only(
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums)
+            ) = get_top_logprobs(logprobs, top_logprobs_nums, no_copy_to_cpu=True)
 
         # Handle token_ids logprobs if requested
         if needs_token_ids_logprobs:
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index b4f6e0a66b8e..4fafd984a77a 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1975,17 +1975,7 @@ def kernel_warmup(self):
 
     def _warmup_fused_sampling(self):
         """Pre-compile and autotune fused sampling Triton kernels."""
-        if _is_hip:
-            return
-
-        from sglang.srt.layers.fused_sampling import (
-            warmup_fused_temperature_softmax,
-        )
-
-        warmup_fused_temperature_softmax(
-            vocab_size=self.model_config.vocab_size,
-            device=self.device,
-        )
+        pass
 
     def _should_run_flashinfer_autotune(self) -> bool:
         """Check if flashinfer autotune should be run."""

From 066dbb009230f4ca67b8040dccfa0033d07789f3 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 23:25:22 -0700
Subject: [PATCH 12/27] Remove WarmUp for Test Notebook CI2

---
 python/sglang/srt/layers/sampler.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index bf20cace7724..5f6e82e795f3 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -18,7 +18,6 @@
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils.common import crash_on_warnings, get_bool_env_var, is_cuda, is_npu
 
-_use_fused_sampling = False
 if is_cuda():
     from flashinfer.sampling import (
         min_p_sampling_from_probs,
@@ -28,10 +27,6 @@
         top_k_renorm_prob,
         top_p_renorm_prob,
     )
-
-    from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
-
-    _use_fused_sampling = True
 if is_npu():
     import torch_npu
 
@@ -157,13 +152,8 @@ def forward(
                     logprobs = logprobs_via_logsoftmax_kernel
             else:
                 # Standard path: do softmax and sample from probs.
-                if _use_fused_sampling:
-                    fused_temperature_softmax_inplace(
-                        logits, sampling_info.temperatures
-                    )
-                else:
-                    logits.div_(sampling_info.temperatures)
-                    logits[:] = torch.softmax(logits, dim=-1)
+                logits.div_(sampling_info.temperatures)
+                logits[:] = torch.softmax(logits, dim=-1)
                 probs = logits
 
                 batch_next_token_ids = self._sample_from_probs(

From d5eeaca2d6896331d9d5baf34fb3ade9c3b58bc5 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Tue, 31 Mar 2026 23:54:29 -0700
Subject: [PATCH 13/27] Rollback inplace kernel

---
 python/sglang/srt/layers/fused_sampling.py    | 50 +++++++++++++------
 python/sglang/srt/layers/sampler.py           | 17 +++++--
 .../sglang/srt/model_executor/model_runner.py | 12 ++++-
 3 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index e6692f1a409a..428904f8ba6a 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -297,7 +297,11 @@ def fused_temperature_softmax_inplace(
     if batch_size == 0:
         return
 
-    assert logits.is_contiguous(), "logits must be contiguous for in-place kernel"
+    if not logits.is_contiguous():
+        work = logits.contiguous()
+        fused_temperature_softmax_inplace(work, temperatures)
+        logits.copy_(work)
+        return
 
     temperatures_flat = temperatures.contiguous().view(-1)
     _dispatch_kernel(logits, temperatures_flat, vocab_size, batch_size)
@@ -305,13 +309,17 @@ def fused_temperature_softmax_inplace(
 
 def warmup_fused_temperature_softmax(
     vocab_size: int,
-    device: torch.device = None,
+    device: torch.device | int | None = None,
+    logits_dtype: torch.dtype = torch.float32,
 ) -> None:
     """Pre-compile and autotune kernels at startup so first request has no latency spike.
 
     For multi-pass kernels the out-of-place variant is autotuned (safe — separate
     input/output buffers), and its winning config is reused for the in-place
     variant so that no autotune ever runs on a live logits buffer.
+
+    ``logits_dtype`` should match ``next_token_logits`` at inference (usually
+    ``model_config.dtype``) so Triton specializes the same way as in production.
     """
     global _multi_pass_inplace_config
 
@@ -322,12 +330,13 @@ def warmup_fused_temperature_softmax(
     is_multi_pass = block_size > _MAX_SINGLE_PASS_BLOCK
     label = "multi-pass autotune" if is_multi_pass else "single-pass JIT"
     logger.info(
-        "Warming up fused_temperature_softmax (%s, vocab_size=%d) ...",
+        "Warming up fused_temperature_softmax (%s, vocab_size=%d, logits_dtype=%s) ...",
         label,
         vocab_size,
+        logits_dtype,
     )
 
-    dummy_logits = torch.randn(1, vocab_size, dtype=torch.float32, device=device)
+    dummy_logits = torch.randn(1, vocab_size, dtype=logits_dtype, device=device)
     dummy_temps = torch.ones(1, 1, dtype=torch.float32, device=device)
 
     # 1. Out-of-place kernel: autotune runs here (safe, separate buffers).
@@ -335,18 +344,27 @@ def warmup_fused_temperature_softmax(
 
     # 2. Propagate best config to the in-place kernel (no autotune needed).
     if is_multi_pass:
-        best = _multi_pass_temperature_softmax_kernel.best_config
-        _multi_pass_inplace_config = {
-            "BLOCK_SIZE": best.kwargs["BLOCK_SIZE"],
-            "num_warps": best.num_warps,
-            "num_stages": best.num_stages,
-        }
-        logger.info(
-            "Multi-pass autotune result: BLOCK_SIZE=%d, num_warps=%d, num_stages=%d",
-            _multi_pass_inplace_config["BLOCK_SIZE"],
-            _multi_pass_inplace_config["num_warps"],
-            _multi_pass_inplace_config["num_stages"],
-        )
+        best = getattr(_multi_pass_temperature_softmax_kernel, "best_config", None)
+        if best is not None:
+            _multi_pass_inplace_config = {
+                "BLOCK_SIZE": best.kwargs["BLOCK_SIZE"],
+                "num_warps": best.num_warps,
+            }
+            if best.num_stages is not None:
+                _multi_pass_inplace_config["num_stages"] = best.num_stages
+            ns = _multi_pass_inplace_config.get("num_stages", "default")
+            logger.info(
+                "Multi-pass autotune result: BLOCK_SIZE=%d, num_warps=%d, num_stages=%s",
+                _multi_pass_inplace_config["BLOCK_SIZE"],
+                _multi_pass_inplace_config["num_warps"],
+                ns,
+            )
+        else:
+            _multi_pass_inplace_config = None
+            logger.warning(
+                "Multi-pass fused softmax: autotune did not set best_config; "
+                "using default launch config for in-place kernel."
+            )
 
     # 3. In-place kernel: JIT compile only (uses the config from step 2).
     fused_temperature_softmax_inplace(dummy_logits.clone(), dummy_temps)
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 5f6e82e795f3..784800e4bc4c 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -18,6 +18,7 @@
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils.common import crash_on_warnings, get_bool_env_var, is_cuda, is_npu
 
+_use_fused_sampling = False
 if is_cuda():
     from flashinfer.sampling import (
         min_p_sampling_from_probs,
@@ -27,6 +28,10 @@
         top_k_renorm_prob,
         top_p_renorm_prob,
     )
+
+    from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
+
+    _use_fused_sampling = True
 if is_npu():
     import torch_npu
 
@@ -152,9 +157,15 @@ def forward(
                     logprobs = logprobs_via_logsoftmax_kernel
             else:
                 # Standard path: do softmax and sample from probs.
-                logits.div_(sampling_info.temperatures)
-                logits[:] = torch.softmax(logits, dim=-1)
-                probs = logits
+                if _use_fused_sampling:
+                    fused_temperature_softmax_inplace(
+                        logits, sampling_info.temperatures
+                    )
+                    probs = logits
+                else:
+                    logits.div_(sampling_info.temperatures)
+                    logits[:] = torch.softmax(logits, dim=-1)
+                    probs = logits
 
                 batch_next_token_ids = self._sample_from_probs(
                     probs, sampling_info, positions, simple_sampling_case
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 4fafd984a77a..e36e6db2491c 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1975,7 +1975,17 @@ def kernel_warmup(self):
 
     def _warmup_fused_sampling(self):
         """Pre-compile and autotune fused sampling Triton kernels."""
-        pass
+        if _is_hip:
+            return
+        from sglang.srt.layers.fused_sampling import warmup_fused_temperature_softmax
+
+        logits_warmup_dtype = (
+            torch.float32 if self.server_args.enable_fp32_lm_head else self.dtype
+        )
+        warmup_fused_temperature_softmax(
+            self.model_config.vocab_size,
+            logits_dtype=logits_warmup_dtype,
+        )
 
     def _should_run_flashinfer_autotune(self) -> bool:
         """Check if flashinfer autotune should be run."""

From 08161e1b4b59675538b1c0942890e799327dc712 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Wed, 1 Apr 2026 08:38:21 -0700
Subject: [PATCH 14/27] sampler: use OOP fused softmax for grammar batches (fix
 structured output sampling)

---
 python/sglang/srt/layers/sampler.py | 41 ++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 784800e4bc4c..a360c7b47717 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -29,7 +29,10 @@
         top_p_renorm_prob,
     )
 
-    from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
+    from sglang.srt.layers.fused_sampling import (
+        fused_temperature_softmax,
+        fused_temperature_softmax_inplace,
+    )
 
     _use_fused_sampling = True
 if is_npu():
@@ -43,6 +46,32 @@
 _BUILT_IN_SAMPLING_BACKENDS = {"flashinfer", "pytorch", "ascend"}
 
 
+def _sampling_batch_has_active_grammar(sampling_info: SamplingBatchInfo) -> bool:
+    g = sampling_info.grammars
+    return bool(g) and any(x for x in g if x)
+
+
+def _fused_temperature_softmax_to_probs_inplace(
+    logits: torch.Tensor,
+    temperatures: torch.Tensor,
+    *,
+    use_out_of_place: bool,
+) -> None:
+    """Scale by temperature and softmax into ``logits`` (probabilities), in-place.
+
+    When ``use_out_of_place`` is True, run the fused kernel with a separate fp32
+    buffer then ``copy_`` into ``logits``. That matches the legacy path (softmax
+    result stored in ``logits.dtype`` via PyTorch) and avoids writing fp32
+    softmax outputs directly into a bf16/fp16 logits tensor — which can skew
+    probabilities enough that xgrammar-guided decoding samples invalid tokens.
+    """
+    if use_out_of_place:
+        probs = fused_temperature_softmax(logits, temperatures)
+        logits.copy_(probs)
+    else:
+        fused_temperature_softmax_inplace(logits, temperatures)
+
+
 class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
@@ -158,8 +187,14 @@ def forward(
             else:
                 # Standard path: do softmax and sample from probs.
                 if _use_fused_sampling:
-                    fused_temperature_softmax_inplace(
-                        logits, sampling_info.temperatures
+                    # Structured output / xgrammar: keep numerics aligned with the
+                    # reference logits.div_; softmax(logits) path (probs in logits dtype).
+                    _fused_temperature_softmax_to_probs_inplace(
+                        logits,
+                        sampling_info.temperatures,
+                        use_out_of_place=_sampling_batch_has_active_grammar(
+                            sampling_info
+                        ),
                     )
                     probs = logits
                 else:

From f30d3d854c85d9f5648fd81a7b0f7209cf940ce3 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Wed, 1 Apr 2026 10:13:13 -0700
Subject: [PATCH 15/27] Change to 3-Pass Kernel

---
 python/sglang/srt/layers/fused_sampling.py | 66 +++++++++++-----------
 python/sglang/srt/layers/sampler.py        | 41 +-------------
 2 files changed, 35 insertions(+), 72 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 428904f8ba6a..55852c2f0f34 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -38,7 +38,6 @@ def _single_pass_temperature_softmax_kernel(
 ):
     row_idx = tl.program_id(0)
     temp = tl.load(temperatures_ptr + row_idx)
-    inv_temp = 1.0 / temp
 
     offsets = tl.arange(0, BLOCK_SIZE)
     mask = offsets < vocab_size
@@ -48,7 +47,7 @@ def _single_pass_temperature_softmax_kernel(
         mask=mask,
         other=float("-inf"),
     )
-    x = (x * inv_temp).to(tl.float32)
+    x = (x / temp).to(tl.float32)
 
     x_max = tl.max(x, axis=0)
     exp_x = tl.exp(x - x_max)
@@ -67,14 +66,13 @@ def _single_pass_temperature_softmax_inplace_kernel(
 ):
     row_idx = tl.program_id(0)
     temp = tl.load(temperatures_ptr + row_idx)
-    inv_temp = 1.0 / temp
 
     row_start = logits_ptr + row_idx * stride
     offsets = tl.arange(0, BLOCK_SIZE)
     mask = offsets < vocab_size
 
     x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-    x = (x * inv_temp).to(tl.float32)
+    x = (x / temp).to(tl.float32)
 
     x_max = tl.max(x, axis=0)
     exp_x = tl.exp(x - x_max)
@@ -118,36 +116,36 @@ def _multi_pass_temperature_softmax_kernel(
 ):
     row_idx = tl.program_id(0)
     temp = tl.load(temperatures_ptr + row_idx)
-    inv_temp = 1.0 / temp
 
     logits_row = logits_ptr + row_idx * logits_stride
     output_row = output_ptr + row_idx * output_stride
 
-    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
-    running_sum = tl.full([], value=0.0, dtype=tl.float32)
-
+    # Pass 1: find global max (matches PyTorch's first reduction pass)
+    global_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
-        x = (x * inv_temp).to(tl.float32)
-
-        block_max = tl.max(x, axis=0)
-        new_max = tl.maximum(running_max, block_max)
-        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
-            tl.exp(x - new_max), axis=0
-        )
-        running_max = new_max
+        x = (x / temp).to(tl.float32)
+        global_max = tl.maximum(global_max, tl.max(x, axis=0))
 
-    log_sum = tl.log(running_sum)
+    # Pass 2: compute sum of exp(x - max) (matches PyTorch's second pass)
+    sum_exp = tl.full([], value=0.0, dtype=tl.float32)
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
+        x = (x / temp).to(tl.float32)
+        sum_exp += tl.sum(tl.exp(x - global_max), axis=0)
 
+    # Pass 3: normalize (matches PyTorch's exp(x-max)/sum)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
-        x = (x * inv_temp).to(tl.float32)
+        x = (x / temp).to(tl.float32)
 
-        prob = tl.exp(x - running_max - log_sum)
+        prob = tl.exp(x - global_max) / sum_exp
         tl.store(output_row + offsets, prob, mask=mask)
 
 
@@ -161,35 +159,35 @@ def _multi_pass_temperature_softmax_inplace_kernel(
 ):
     row_idx = tl.program_id(0)
     temp = tl.load(temperatures_ptr + row_idx)
-    inv_temp = 1.0 / temp
 
     row_start = logits_ptr + row_idx * stride
 
-    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
-    running_sum = tl.full([], value=0.0, dtype=tl.float32)
-
+    # Pass 1: find global max (matches PyTorch's first reduction pass)
+    global_max = tl.full([], value=float("-inf"), dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-        x = (x * inv_temp).to(tl.float32)
-
-        block_max = tl.max(x, axis=0)
-        new_max = tl.maximum(running_max, block_max)
-        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
-            tl.exp(x - new_max), axis=0
-        )
-        running_max = new_max
+        x = (x / temp).to(tl.float32)
+        global_max = tl.maximum(global_max, tl.max(x, axis=0))
 
-    log_sum = tl.log(running_sum)
+    # Pass 2: compute sum of exp(x - max) (matches PyTorch's second pass)
+    sum_exp = tl.full([], value=0.0, dtype=tl.float32)
+    for start in range(0, vocab_size, BLOCK_SIZE):
+        offsets = start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < vocab_size
+        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
+        x = (x / temp).to(tl.float32)
+        sum_exp += tl.sum(tl.exp(x - global_max), axis=0)
 
+    # Pass 3: normalize (matches PyTorch's exp(x-max)/sum)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-        x = (x * inv_temp).to(tl.float32)
+        x = (x / temp).to(tl.float32)
 
-        prob = tl.exp(x - running_max - log_sum)
+        prob = tl.exp(x - global_max) / sum_exp
         tl.store(row_start + offsets, prob, mask=mask)
 
 
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index a360c7b47717..784800e4bc4c 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -29,10 +29,7 @@
         top_p_renorm_prob,
     )
 
-    from sglang.srt.layers.fused_sampling import (
-        fused_temperature_softmax,
-        fused_temperature_softmax_inplace,
-    )
+    from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
 
     _use_fused_sampling = True
 if is_npu():
@@ -46,32 +43,6 @@
 _BUILT_IN_SAMPLING_BACKENDS = {"flashinfer", "pytorch", "ascend"}
 
 
-def _sampling_batch_has_active_grammar(sampling_info: SamplingBatchInfo) -> bool:
-    g = sampling_info.grammars
-    return bool(g) and any(x for x in g if x)
-
-
-def _fused_temperature_softmax_to_probs_inplace(
-    logits: torch.Tensor,
-    temperatures: torch.Tensor,
-    *,
-    use_out_of_place: bool,
-) -> None:
-    """Scale by temperature and softmax into ``logits`` (probabilities), in-place.
-
-    When ``use_out_of_place`` is True, run the fused kernel with a separate fp32
-    buffer then ``copy_`` into ``logits``. That matches the legacy path (softmax
-    result stored in ``logits.dtype`` via PyTorch) and avoids writing fp32
-    softmax outputs directly into a bf16/fp16 logits tensor — which can skew
-    probabilities enough that xgrammar-guided decoding samples invalid tokens.
-    """
-    if use_out_of_place:
-        probs = fused_temperature_softmax(logits, temperatures)
-        logits.copy_(probs)
-    else:
-        fused_temperature_softmax_inplace(logits, temperatures)
-
-
 class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
@@ -187,14 +158,8 @@ def forward(
             else:
                 # Standard path: do softmax and sample from probs.
                 if _use_fused_sampling:
-                    # Structured output / xgrammar: keep numerics aligned with the
-                    # reference logits.div_; softmax(logits) path (probs in logits dtype).
-                    _fused_temperature_softmax_to_probs_inplace(
-                        logits,
-                        sampling_info.temperatures,
-                        use_out_of_place=_sampling_batch_has_active_grammar(
-                            sampling_info
-                        ),
+                    fused_temperature_softmax_inplace(
+                        logits, sampling_info.temperatures
                     )
                     probs = logits
                 else:

From e4e89546a8c2e5f3cb8f039b87b19ff0d81bc6e9 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Wed, 1 Apr 2026 10:36:12 -0700
Subject: [PATCH 16/27] Add Hybrid Logic

---
 python/sglang/srt/layers/sampler.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 784800e4bc4c..4196787820f4 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -32,6 +32,11 @@
     from sglang.srt.layers.fused_sampling import fused_temperature_softmax_inplace
 
     _use_fused_sampling = True
+
+# Batch size threshold for fused Triton kernel vs PyTorch softmax.
+# Below this threshold, PyTorch's native div+softmax is faster.
+# At and above this threshold, the fused Triton kernel wins.
+_FUSED_SAMPLING_BATCH_THRESHOLD = 128
 if is_npu():
     import torch_npu
 
@@ -157,7 +162,12 @@ def forward(
                     logprobs = logprobs_via_logsoftmax_kernel
             else:
                 # Standard path: do softmax and sample from probs.
-                if _use_fused_sampling:
+                # Use fused Triton kernel for large batches where it excels;
+                # fall back to PyTorch for small batches where launch overhead dominates.
+                if (
+                    _use_fused_sampling
+                    and logits.shape[0] >= _FUSED_SAMPLING_BATCH_THRESHOLD
+                ):
                     fused_temperature_softmax_inplace(
                         logits, sampling_info.temperatures
                     )

From a06721da75fc091a408b0e60dcfc87af5c1462b3 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 09:42:20 -0700
Subject: [PATCH 17/27] Add TestCoverage More detailed

---
 .../test_fused_temperature_softmax.py         | 130 +++++++++++-------
 1 file changed, 83 insertions(+), 47 deletions(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index 956d67c2cefe..6803393f0e34 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -1,4 +1,17 @@
-"""Correctness tests for fused_temperature_softmax Triton kernel."""
+"""Correctness tests for fused_temperature_softmax Triton kernel.
+
+Two reference implementations are used:
+
+  1. fp32 reference — logits.float()/temp then softmax in fp32.
+     The Triton kernel also promotes to fp32 internally, so this reference
+     shares the same precision path and can be checked with tight tolerance.
+     This proves **kernel correctness**.
+
+  2. Native-dtype reference — logits.div_(temp) in the original dtype then
+     softmax.  The in-place div_ truncates intermediates to bf16/fp16, so
+     a looser tolerance is needed.  This covers the **dtype truncation gap**
+     between the fused kernel and the existing PyTorch sampling path.
+"""
 
 import unittest
 
@@ -15,9 +28,23 @@
 
 register_cuda_ci(est_time=15, suite="stage-b-test-small-1-gpu")
 
+# Tolerance table — chosen per dtype to cover known rounding gaps.
+# fp32 ref is tight (kernel also runs in fp32 internally).
+# Native-dtype ref is looser (div_ truncates to bf16/fp16 before softmax).
+_TOL = {
+    torch.bfloat16: {"fp32_ref": (1e-5, 1e-5), "native_ref": (5e-4, 2e-2)},
+    torch.float16: {"fp32_ref": (1e-5, 1e-5), "native_ref": (1e-3, 1e-2)},
+    torch.float32: {"fp32_ref": (1e-5, 1e-5), "native_ref": (1e-5, 1e-5)},
+}
+
 
-def reference_temperature_softmax(logits, temperatures):
-    """Reference implementation: div + softmax (separate kernels)."""
+def reference_fp32(logits, temperatures):
+    """fp32 reference: promotes to fp32 first, matching the kernel's internal precision."""
+    return torch.softmax(logits.float() / temperatures.float(), dim=-1)
+
+
+def reference_native(logits, temperatures):
+    """Native-dtype reference: div_ in original dtype, matching the existing PyTorch path."""
     logits = logits.clone()
     logits.div_(temperatures)
     return torch.softmax(logits, dim=-1).float()
@@ -32,55 +59,56 @@ def setUpClass(cls):
     def _check_close(self, fused, ref, atol=1e-5, rtol=1e-5):
         """Assert outputs are close and both are valid probability distributions."""
         self.assertEqual(fused.shape, ref.shape)
-        # Valid probabilities: non-negative, sum to ~1
-        self.assertTrue((fused >= 0).all(), f"Negative probabilities in fused output")
+        self.assertTrue((fused >= 0).all(), "Negative probabilities in fused output")
         row_sums = fused.sum(dim=-1)
         torch.testing.assert_close(
-            row_sums,
-            torch.ones_like(row_sums),
-            atol=1e-4,
-            rtol=1e-4,
+            row_sums, torch.ones_like(row_sums), atol=1e-4, rtol=1e-4
         )
         torch.testing.assert_close(fused, ref, atol=atol, rtol=rtol)
 
-    # --- out-of-place kernel ---
+    def _check_both_refs(self, logits, temps, fused, dtype):
+        """Check fused output against both fp32 and native-dtype references."""
+        tol = _TOL[dtype]
+        ref_f32 = reference_fp32(logits, temps)
+        ref_nat = reference_native(logits, temps)
+        self._check_close(fused, ref_f32, *tol["fp32_ref"])
+        self._check_close(fused, ref_nat, *tol["native_ref"])
+
+    # ------------------------------------------------------------------
+    # Out-of-place kernel
+    # ------------------------------------------------------------------
 
     def test_basic(self):
         logits = torch.randn(4, 1024, dtype=torch.bfloat16)
         temps = torch.tensor([0.7, 1.0, 1.5, 2.0], dtype=torch.float32).view(-1, 1)
-        ref = reference_temperature_softmax(logits, temps)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+        self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
     def test_large_vocab(self):
         logits = torch.randn(8, 128256, dtype=torch.bfloat16)
         temps = torch.full((8, 1), 0.6, dtype=torch.float32)
-        ref = reference_temperature_softmax(logits, temps)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+        self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
     def test_batch_sizes(self):
         for bs in [1, 2, 16, 64, 128, 512]:
             logits = torch.randn(bs, 32000, dtype=torch.bfloat16)
             temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
-            ref = reference_temperature_softmax(logits, temps)
             fused = fused_temperature_softmax(logits, temps)
-            self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+            self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
     def test_temperature_one(self):
         """Temperature=1.0 should be equivalent to plain softmax."""
         logits = torch.randn(16, 32000, dtype=torch.bfloat16)
         temps = torch.ones(16, 1, dtype=torch.float32)
-        ref = torch.softmax(logits.float(), dim=-1)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+        self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
     def test_very_low_temperature(self):
         """Very low temperature should produce near-one-hot distribution."""
         logits = torch.randn(4, 1024, dtype=torch.bfloat16)
         temps = torch.full((4, 1), 0.01, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
-        # Max probability should be very close to 1.0
         max_probs = fused.max(dim=-1).values
         self.assertTrue((max_probs > 0.99).all())
 
@@ -98,16 +126,14 @@ def test_very_high_temperature(self):
     def test_fp16_input(self):
         logits = torch.randn(8, 32000, dtype=torch.float16)
         temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
-        ref = reference_temperature_softmax(logits, temps)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-3, rtol=1e-2)
+        self._check_both_refs(logits, temps, fused, torch.float16)
 
     def test_fp32_input(self):
         logits = torch.randn(8, 32000, dtype=torch.float32)
         temps = torch.rand(8, 1, dtype=torch.float32) + 0.5
-        ref = reference_temperature_softmax(logits, temps)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-5, rtol=1e-5)
+        self._check_both_refs(logits, temps, fused, torch.float32)
 
     def test_mixed_temperatures(self):
         """Each row has a different temperature."""
@@ -115,9 +141,8 @@ def test_mixed_temperatures(self):
         temps = torch.tensor(
             [0.1, 0.5, 0.7, 1.0, 1.2, 1.5, 2.0, 5.0], dtype=torch.float32
         ).view(-1, 1)
-        ref = reference_temperature_softmax(logits, temps)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_close(fused, ref, atol=1e-4, rtol=1e-3)
+        self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
     def test_empty_batch(self):
         logits = torch.randn(0, 32000, dtype=torch.bfloat16)
@@ -125,31 +150,40 @@ def test_empty_batch(self):
         fused = fused_temperature_softmax(logits, temps)
         self.assertEqual(fused.shape, (0, 32000))
 
-    # --- in-place kernel ---
+    # ------------------------------------------------------------------
+    # In-place kernel
+    # ------------------------------------------------------------------
 
     def test_inplace_basic(self):
         logits = torch.randn(8, 32000, dtype=torch.float32)
         temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
-        ref = reference_temperature_softmax(logits, temps)
+        ref = reference_fp32(logits, temps)
         fused_temperature_softmax_inplace(logits, temps)
-        # In-place writes back to logits in the original dtype
-        self._check_close(logits.float(), ref, atol=1e-5, rtol=1e-5)
+        self._check_close(logits.float(), ref, *_TOL[torch.float32]["fp32_ref"])
 
     def test_inplace_bf16(self):
         logits = torch.randn(8, 32000, dtype=torch.bfloat16)
         temps = torch.rand(8, 1, dtype=torch.float32) + 0.5
-        ref = reference_temperature_softmax(logits, temps)
+        ref_f32 = reference_fp32(logits, temps)
+        ref_nat = reference_native(logits, temps)
         fused_temperature_softmax_inplace(logits, temps)
-        self._check_close(logits.float(), ref, atol=2e-3, rtol=2e-3)
+        # In-place stores fp32 probabilities into bf16 buffer, adding another
+        # truncation step. Use native-dtype tolerance for both references.
+        tol_nat = _TOL[torch.bfloat16]["native_ref"]
+        self._check_close(logits.float(), ref_f32, *tol_nat)
+        self._check_close(logits.float(), ref_nat, *tol_nat)
 
     def test_inplace_large_vocab(self):
         logits = torch.randn(4, 128256, dtype=torch.bfloat16)
         temps = torch.full((4, 1), 0.8, dtype=torch.float32)
-        ref = reference_temperature_softmax(logits, temps)
+        ref_f32 = reference_fp32(logits, temps)
         fused_temperature_softmax_inplace(logits, temps)
-        self._check_close(logits.float(), ref, atol=2e-3, rtol=2e-3)
+        tol_nat = _TOL[torch.bfloat16]["native_ref"]
+        self._check_close(logits.float(), ref_f32, *tol_nat)
 
-    # --- exact known-value correctness ---
+    # ------------------------------------------------------------------
+    # Exact known-value correctness (fp32 only — no dtype truncation)
+    # ------------------------------------------------------------------
 
     def test_known_uniform_logits(self):
         """Identical logits must produce uniform distribution regardless of temperature."""
@@ -164,7 +198,6 @@ def test_known_softmax_values(self):
         logits = torch.tensor([[1.0, 2.0, 3.0]], dtype=torch.float32)
         temps = torch.tensor([[1.0]], dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
-        # softmax([1,2,3]) = exp([1,2,3]) / sum(exp([1,2,3]))
         e = torch.exp(logits)
         expected = (e / e.sum(dim=-1, keepdim=True)).to(fused.device)
         torch.testing.assert_close(fused, expected, atol=1e-6, rtol=1e-6)
@@ -179,7 +212,9 @@ def test_known_softmax_with_temperature(self):
         expected = (e / e.sum(dim=-1, keepdim=True)).to(fused.device)
         torch.testing.assert_close(fused, expected, atol=1e-6, rtol=1e-6)
 
-    # --- argmax preservation ---
+    # ------------------------------------------------------------------
+    # Argmax preservation
+    # ------------------------------------------------------------------
 
     def test_argmax_preserved(self):
         """argmax must be invariant to temperature for finite T > 0."""
@@ -194,7 +229,9 @@ def test_argmax_preserved(self):
                 f"argmax changed at temperature={t_val}",
             )
 
-    # --- numerical stability ---
+    # ------------------------------------------------------------------
+    # Numerical stability
+    # ------------------------------------------------------------------
 
     def test_large_logits_no_nan(self):
         """Extreme logit magnitudes must not produce NaN or Inf."""
@@ -207,10 +244,7 @@ def test_large_logits_no_nan(self):
         self.assertFalse(torch.isinf(fused).any(), "Inf in output")
         row_sums = fused.sum(dim=-1)
         torch.testing.assert_close(
-            row_sums,
-            torch.ones_like(row_sums),
-            atol=1e-4,
-            rtol=1e-4,
+            row_sums, torch.ones_like(row_sums), atol=1e-4, rtol=1e-4
         )
 
     def test_large_logits_inplace_no_nan(self):
@@ -223,21 +257,23 @@ def test_large_logits_inplace_no_nan(self):
         self.assertFalse(torch.isnan(logits).any(), "NaN in output")
         self.assertFalse(torch.isinf(logits).any(), "Inf in output")
 
-    # --- comparison with flashinfer.sampling.softmax ---
+    # ------------------------------------------------------------------
+    # Comparison with flashinfer.sampling.softmax
+    # ------------------------------------------------------------------
 
     def test_vs_flashinfer_basic(self):
         logits = torch.randn(4, 1024, dtype=torch.bfloat16)
         temps = torch.tensor([0.7, 1.0, 1.5, 2.0], dtype=torch.float32).view(-1, 1)
         fused = fused_temperature_softmax(logits, temps)
         fi = flashinfer_softmax(logits, temperature=temps.view(-1))
-        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+        self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
 
     def test_vs_flashinfer_large_vocab(self):
         logits = torch.randn(8, 128256, dtype=torch.bfloat16)
         temps = torch.full((8, 1), 0.6, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
         fi = flashinfer_softmax(logits, temperature=temps.view(-1))
-        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+        self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
 
     def test_vs_flashinfer_batch_sizes(self):
         for bs in [1, 16, 64, 128, 512]:
@@ -245,14 +281,14 @@ def test_vs_flashinfer_batch_sizes(self):
             temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
             fused = fused_temperature_softmax(logits, temps)
             fi = flashinfer_softmax(logits, temperature=temps.view(-1))
-            self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+            self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
 
     def test_vs_flashinfer_scalar_temperature(self):
         logits = torch.randn(16, 32000, dtype=torch.bfloat16)
         temps_2d = torch.full((16, 1), 0.8, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps_2d)
         fi = flashinfer_softmax(logits, temperature=0.8)
-        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+        self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
 
     def test_vs_flashinfer_mixed_temperatures(self):
         logits = torch.randn(8, 32000, dtype=torch.bfloat16)
@@ -261,7 +297,7 @@ def test_vs_flashinfer_mixed_temperatures(self):
         ).view(-1, 1)
         fused = fused_temperature_softmax(logits, temps)
         fi = flashinfer_softmax(logits, temperature=temps.view(-1))
-        self._check_close(fused, fi, atol=1e-4, rtol=1e-3)
+        self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
 
 
 if __name__ == "__main__":

From 97518a4f22da547d3974750e9071821f4e9db933 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 10:05:09 -0700
Subject: [PATCH 18/27] Tolerance Update

---
 test/registered/sampling/test_fused_temperature_softmax.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index 6803393f0e34..2e1e91f60ec6 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -32,7 +32,7 @@
 # fp32 ref is tight (kernel also runs in fp32 internally).
 # Native-dtype ref is looser (div_ truncates to bf16/fp16 before softmax).
 _TOL = {
-    torch.bfloat16: {"fp32_ref": (1e-5, 1e-5), "native_ref": (5e-4, 2e-2)},
+    torch.bfloat16: {"fp32_ref": (1e-5, 1e-5), "native_ref": (2e-2, 1e-1)},
     torch.float16: {"fp32_ref": (1e-5, 1e-5), "native_ref": (1e-3, 1e-2)},
     torch.float32: {"fp32_ref": (1e-5, 1e-5), "native_ref": (1e-5, 1e-5)},
 }
@@ -62,7 +62,7 @@ def _check_close(self, fused, ref, atol=1e-5, rtol=1e-5):
         self.assertTrue((fused >= 0).all(), "Negative probabilities in fused output")
         row_sums = fused.sum(dim=-1)
         torch.testing.assert_close(
-            row_sums, torch.ones_like(row_sums), atol=1e-4, rtol=1e-4
+            row_sums, torch.ones_like(row_sums), atol=1e-3, rtol=1e-3
         )
         torch.testing.assert_close(fused, ref, atol=atol, rtol=rtol)
 

From 3c8d733d89116fbd54bbbca691a10160b46bdaab Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 10:09:10 -0700
Subject: [PATCH 19/27] Tolerance Updates

---
 test/registered/sampling/test_fused_temperature_softmax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index 2e1e91f60ec6..ee5699e5d67f 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -106,7 +106,7 @@ def test_temperature_one(self):
 
     def test_very_low_temperature(self):
         """Very low temperature should produce near-one-hot distribution."""
-        logits = torch.randn(4, 1024, dtype=torch.bfloat16)
+        logits = torch.randn(4, 1024, dtype=torch.float32)
         temps = torch.full((4, 1), 0.01, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
         max_probs = fused.max(dim=-1).values

From 8148e921f1668f4f1f956de4f7ce9c6243823f82 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 10:11:42 -0700
Subject: [PATCH 20/27] Sample Size 100

---
 test/registered/sampling/test_fused_temperature_softmax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index ee5699e5d67f..d0ce619b93b4 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -106,7 +106,7 @@ def test_temperature_one(self):
 
     def test_very_low_temperature(self):
         """Very low temperature should produce near-one-hot distribution."""
-        logits = torch.randn(4, 1024, dtype=torch.float32)
+        logits = torch.randn(4, 100, dtype=torch.float32)
         temps = torch.full((4, 1), 0.01, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
         max_probs = fused.max(dim=-1).values

From 932069bf91f17131b808e4e9aa62d2fc78e1c941 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 10:18:42 -0700
Subject: [PATCH 21/27] Modify Future Flasky Tests

---
 .../test_fused_temperature_softmax.py         | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index d0ce619b93b4..02f193e815fe 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -93,7 +93,7 @@ def test_large_vocab(self):
     def test_batch_sizes(self):
         for bs in [1, 2, 16, 64, 128, 512]:
             logits = torch.randn(bs, 32000, dtype=torch.bfloat16)
-            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
+            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.5
             fused = fused_temperature_softmax(logits, temps)
             self._check_both_refs(logits, temps, fused, torch.bfloat16)
 
@@ -106,7 +106,9 @@ def test_temperature_one(self):
 
     def test_very_low_temperature(self):
         """Very low temperature should produce near-one-hot distribution."""
-        logits = torch.randn(4, 100, dtype=torch.float32)
+        logits = torch.zeros(4, 1024, dtype=torch.float32)
+        for i in range(4):
+            logits[i, i * 100] = 5.0
         temps = torch.full((4, 1), 0.01, dtype=torch.float32)
         fused = fused_temperature_softmax(logits, temps)
         max_probs = fused.max(dim=-1).values
@@ -125,7 +127,7 @@ def test_very_high_temperature(self):
 
     def test_fp16_input(self):
         logits = torch.randn(8, 32000, dtype=torch.float16)
-        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
+        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.5
         fused = fused_temperature_softmax(logits, temps)
         self._check_both_refs(logits, temps, fused, torch.float16)
 
@@ -142,7 +144,15 @@ def test_mixed_temperatures(self):
             [0.1, 0.5, 0.7, 1.0, 1.2, 1.5, 2.0, 5.0], dtype=torch.float32
         ).view(-1, 1)
         fused = fused_temperature_softmax(logits, temps)
-        self._check_both_refs(logits, temps, fused, torch.bfloat16)
+        tol = _TOL[torch.bfloat16]
+        ref_f32 = reference_fp32(logits, temps)
+        self._check_close(fused, ref_f32, *tol["fp32_ref"])
+        # Native-dtype reference only for moderate temperatures (>= 0.5).
+        # Below 0.5, bf16 div_ truncation is amplified by exp(), making the
+        # native path diverge far from the (more accurate) fp32 fused kernel.
+        moderate = temps.view(-1) >= 0.5
+        ref_nat = reference_native(logits, temps)
+        self._check_close(fused[moderate], ref_nat[moderate], *tol["native_ref"])
 
     def test_empty_batch(self):
         logits = torch.randn(0, 32000, dtype=torch.bfloat16)
@@ -156,7 +166,7 @@ def test_empty_batch(self):
 
     def test_inplace_basic(self):
         logits = torch.randn(8, 32000, dtype=torch.float32)
-        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.1
+        temps = torch.rand(8, 1, dtype=torch.float32) * 1.5 + 0.5
         ref = reference_fp32(logits, temps)
         fused_temperature_softmax_inplace(logits, temps)
         self._check_close(logits.float(), ref, *_TOL[torch.float32]["fp32_ref"])
@@ -278,7 +288,7 @@ def test_vs_flashinfer_large_vocab(self):
     def test_vs_flashinfer_batch_sizes(self):
         for bs in [1, 16, 64, 128, 512]:
             logits = torch.randn(bs, 32000, dtype=torch.bfloat16)
-            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.1
+            temps = torch.rand(bs, 1, dtype=torch.float32) * 1.5 + 0.5
             fused = fused_temperature_softmax(logits, temps)
             fi = flashinfer_softmax(logits, temperature=temps.view(-1))
             self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
@@ -297,7 +307,8 @@ def test_vs_flashinfer_mixed_temperatures(self):
         ).view(-1, 1)
         fused = fused_temperature_softmax(logits, temps)
         fi = flashinfer_softmax(logits, temperature=temps.view(-1))
-        self._check_close(fused, fi, *_TOL[torch.bfloat16]["native_ref"])
+        moderate = temps.view(-1) >= 0.5
+        self._check_close(fused[moderate], fi[moderate], *_TOL[torch.bfloat16]["native_ref"])
 
 
 if __name__ == "__main__":

From 9c560d5cb6254b2df0a8ad94f7ed928d5f225cbd Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 13:07:45 -0700
Subject: [PATCH 22/27] Remove DocString

---
 .../sampling/test_fused_temperature_softmax.py    | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index 02f193e815fe..d311aced79d8 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -1,18 +1,3 @@
-"""Correctness tests for fused_temperature_softmax Triton kernel.
-
-Two reference implementations are used:
-
-  1. fp32 reference — logits.float()/temp then softmax in fp32.
-     The Triton kernel also promotes to fp32 internally, so this reference
-     shares the same precision path and can be checked with tight tolerance.
-     This proves **kernel correctness**.
-
-  2. Native-dtype reference — logits.div_(temp) in the original dtype then
-     softmax.  The in-place div_ truncates intermediates to bf16/fp16, so
-     a looser tolerance is needed.  This covers the **dtype truncation gap**
-     between the fused kernel and the existing PyTorch sampling path.
-"""
-
 import unittest
 
 import torch

From 661a1de0290d9d545d783246cb236dfa868d8165 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 13:15:41 -0700
Subject: [PATCH 23/27] Fix Lint

---
 test/registered/sampling/test_fused_temperature_softmax.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index d311aced79d8..d98fd4a0eb17 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -293,7 +293,9 @@ def test_vs_flashinfer_mixed_temperatures(self):
         fused = fused_temperature_softmax(logits, temps)
         fi = flashinfer_softmax(logits, temperature=temps.view(-1))
         moderate = temps.view(-1) >= 0.5
-        self._check_close(fused[moderate], fi[moderate], *_TOL[torch.bfloat16]["native_ref"])
+        self._check_close(
+            fused[moderate], fi[moderate], *_TOL[torch.bfloat16]["native_ref"]
+        )
 
 
 if __name__ == "__main__":

From c709eb446a18d6f595e1346760ef9905ae8b66ca Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 13:33:03 -0700
Subject: [PATCH 24/27] Modify DocString

---
 python/sglang/srt/layers/fused_sampling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 55852c2f0f34..0108c27774d3 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -6,7 +6,7 @@
 Two kernel variants:
   - Single-pass: vocab fits in one tile (1 read + 1 write). Used when
     next_power_of_2(vocab) <= 32768.
-  - Multi-pass: 2-pass online softmax with autotune (2 reads + 1 write).
+  - Multi-pass: 3-pass softmax with autotune (3 reads + 1 write).
     Used for large vocabs (e.g. 128K+).
 """
 
@@ -83,7 +83,7 @@ def _single_pass_temperature_softmax_inplace_kernel(
 
 # ---------------------------------------------------------------------------
 # Multi-pass kernel: vocab too large for one tile.
-# 2-pass online softmax with autotune over (BLOCK_SIZE, num_warps).
+# 3-pass softmax with autotune over (BLOCK_SIZE, num_warps).
 # ---------------------------------------------------------------------------
 
 _MULTI_PASS_AUTOTUNE_CONFIGS = [

From aaf61aafb1e06d001caea4943e7e4235ef611219 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 17:59:59 -0700
Subject: [PATCH 25/27] Try online softmax

---
 python/sglang/srt/layers/fused_sampling.py    | 105 ++++++++++--------
 .../sglang/srt/model_executor/model_runner.py |   1 +
 2 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 0108c27774d3..3d57ceb8d265 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -6,8 +6,9 @@
 Two kernel variants:
   - Single-pass: vocab fits in one tile (1 read + 1 write). Used when
     next_power_of_2(vocab) <= 32768.
-  - Multi-pass: 3-pass softmax with autotune (3 reads + 1 write).
-    Used for large vocabs (e.g. 128K+).
+  - Multi-pass: online 2-pass softmax with autotune (2 reads + 1 write).
+    Pass 1 fuses max + sum via the Milakov-Gimelshein correction.
+    Pass 2 normalizes and writes.  Used for large vocabs (e.g. 128K+).
 """
 
 import logging
@@ -83,7 +84,10 @@ def _single_pass_temperature_softmax_inplace_kernel(
 
 # ---------------------------------------------------------------------------
 # Multi-pass kernel: vocab too large for one tile.
-# 3-pass softmax with autotune over (BLOCK_SIZE, num_warps).
+# Online 2-pass softmax (Milakov-Gimelshein) with autotune over
+# (BLOCK_SIZE, num_warps).  No num_stages > 1: the online correction
+# term creates a tight loop-carried dependency that conflicts with
+# Triton's software pipelining.
 # ---------------------------------------------------------------------------
 
 _MULTI_PASS_AUTOTUNE_CONFIGS = [
@@ -91,15 +95,11 @@ def _single_pass_temperature_softmax_inplace_kernel(
     triton.Config({"BLOCK_SIZE": 2048}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 4096}, num_warps=8),
     triton.Config({"BLOCK_SIZE": 4096}, num_warps=16),
-    triton.Config({"BLOCK_SIZE": 4096}, num_warps=16, num_stages=4),
     triton.Config({"BLOCK_SIZE": 8192}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 8192}, num_warps=32),
-    triton.Config({"BLOCK_SIZE": 8192}, num_warps=32, num_stages=4),
     triton.Config({"BLOCK_SIZE": 16384}, num_warps=16),
     triton.Config({"BLOCK_SIZE": 16384}, num_warps=32),
-    triton.Config({"BLOCK_SIZE": 16384}, num_warps=32, num_stages=4),
     triton.Config({"BLOCK_SIZE": 32768}, num_warps=32),
-    triton.Config({"BLOCK_SIZE": 32768}, num_warps=32, num_stages=4),
 ]
 
 
@@ -120,32 +120,28 @@ def _multi_pass_temperature_softmax_kernel(
     logits_row = logits_ptr + row_idx * logits_stride
     output_row = output_ptr + row_idx * output_stride
 
-    # Pass 1: find global max (matches PyTorch's first reduction pass)
-    global_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    # Pass 1: online max + sum (Milakov-Gimelshein)
+    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], value=0.0, dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
         x = (x / temp).to(tl.float32)
-        global_max = tl.maximum(global_max, tl.max(x, axis=0))
-
-    # Pass 2: compute sum of exp(x - max) (matches PyTorch's second pass)
-    sum_exp = tl.full([], value=0.0, dtype=tl.float32)
-    for start in range(0, vocab_size, BLOCK_SIZE):
-        offsets = start + tl.arange(0, BLOCK_SIZE)
-        mask = offsets < vocab_size
-        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
-        sum_exp += tl.sum(tl.exp(x - global_max), axis=0)
-
-    # Pass 3: normalize (matches PyTorch's exp(x-max)/sum)
+        tile_max = tl.max(x, axis=0)
+        new_max = tl.maximum(running_max, tile_max)
+        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
+            tl.exp(x - new_max), axis=0
+        )
+        running_max = new_max
+
+    # Pass 2: normalize and write
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
         x = (x / temp).to(tl.float32)
-
-        prob = tl.exp(x - global_max) / sum_exp
+        prob = tl.exp(x - running_max) / running_sum
         tl.store(output_row + offsets, prob, mask=mask)
 
 
@@ -162,32 +158,28 @@ def _multi_pass_temperature_softmax_inplace_kernel(
 
     row_start = logits_ptr + row_idx * stride
 
-    # Pass 1: find global max (matches PyTorch's first reduction pass)
-    global_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    # Pass 1: online max + sum (Milakov-Gimelshein)
+    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], value=0.0, dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
         x = (x / temp).to(tl.float32)
-        global_max = tl.maximum(global_max, tl.max(x, axis=0))
-
-    # Pass 2: compute sum of exp(x - max) (matches PyTorch's second pass)
-    sum_exp = tl.full([], value=0.0, dtype=tl.float32)
-    for start in range(0, vocab_size, BLOCK_SIZE):
-        offsets = start + tl.arange(0, BLOCK_SIZE)
-        mask = offsets < vocab_size
-        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
-        sum_exp += tl.sum(tl.exp(x - global_max), axis=0)
-
-    # Pass 3: normalize (matches PyTorch's exp(x-max)/sum)
+        tile_max = tl.max(x, axis=0)
+        new_max = tl.maximum(running_max, tile_max)
+        running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
+            tl.exp(x - new_max), axis=0
+        )
+        running_max = new_max
+
+    # Pass 2: normalize and write in-place
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
         x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
         x = (x / temp).to(tl.float32)
-
-        prob = tl.exp(x - global_max) / sum_exp
+        prob = tl.exp(x - running_max) / running_sum
         tl.store(row_start + offsets, prob, mask=mask)
 
 
@@ -309,6 +301,7 @@ def warmup_fused_temperature_softmax(
     vocab_size: int,
     device: torch.device | int | None = None,
     logits_dtype: torch.dtype = torch.float32,
+    tp_group=None,
 ) -> None:
     """Pre-compile and autotune kernels at startup so first request has no latency spike.
 
@@ -316,6 +309,10 @@ def warmup_fused_temperature_softmax(
     input/output buffers), and its winning config is reused for the in-place
     variant so that no autotune ever runs on a live logits buffer.
 
+    When running under tensor parallelism, the autotune result is broadcast from
+    rank 0 so that all ranks use the same kernel config, guaranteeing bitwise-equal
+    sampling results across ranks.
+
     ``logits_dtype`` should match ``next_token_logits`` at inference (usually
     ``model_config.dtype``) so Triton specializes the same way as in production.
     """
@@ -348,14 +345,10 @@ def warmup_fused_temperature_softmax(
                 "BLOCK_SIZE": best.kwargs["BLOCK_SIZE"],
                 "num_warps": best.num_warps,
             }
-            if best.num_stages is not None:
-                _multi_pass_inplace_config["num_stages"] = best.num_stages
-            ns = _multi_pass_inplace_config.get("num_stages", "default")
             logger.info(
-                "Multi-pass autotune result: BLOCK_SIZE=%d, num_warps=%d, num_stages=%s",
+                "Multi-pass autotune result: BLOCK_SIZE=%d, num_warps=%d",
                 _multi_pass_inplace_config["BLOCK_SIZE"],
                 _multi_pass_inplace_config["num_warps"],
-                ns,
             )
         else:
             _multi_pass_inplace_config = None
@@ -364,8 +357,32 @@ def warmup_fused_temperature_softmax(
                 "using default launch config for in-place kernel."
             )
 
+        _broadcast_multi_pass_config(tp_group)
+
     # 3. In-place kernel: JIT compile only (uses the config from step 2).
     fused_temperature_softmax_inplace(dummy_logits.clone(), dummy_temps)
     torch.cuda.synchronize(device)
 
     logger.info("fused_temperature_softmax warmup done (vocab_size=%d).", vocab_size)
+
+
+def _broadcast_multi_pass_config(tp_group=None) -> None:
+    """Broadcast the multi-pass autotune config from TP rank 0 to all ranks.
+
+    Ensures every rank uses the identical kernel launch config, which is
+    required for bitwise-equal sampling results under tensor parallelism.
+    """
+    global _multi_pass_inplace_config
+
+    if tp_group is None or tp_group.world_size <= 1:
+        return
+
+    cfg = tp_group.broadcast_object(_multi_pass_inplace_config, src=0)
+    _multi_pass_inplace_config = cfg
+
+    logger.info(
+        "Multi-pass config after TP broadcast: %s (rank %d/%d)",
+        _multi_pass_inplace_config,
+        tp_group.rank_in_group,
+        tp_group.world_size,
+    )
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 201b544ab336..ba35666a156f 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -2074,6 +2074,7 @@ def _warmup_fused_sampling(self):
         warmup_fused_temperature_softmax(
             self.model_config.vocab_size,
             logits_dtype=logits_warmup_dtype,
+            tp_group=self.tp_group,
         )
 
     def _should_run_flashinfer_autotune(self) -> bool:

From e12fbb4d2b10415134bbb44c6cf35554b164a131 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 18:20:33 -0700
Subject: [PATCH 26/27] Modify Threshold and rename SUITE Name

---
 python/sglang/srt/layers/sampler.py                        | 2 +-
 test/registered/sampling/test_fused_temperature_softmax.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 4196787820f4..afcaab18ff0e 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -36,7 +36,7 @@
 # Batch size threshold for fused Triton kernel vs PyTorch softmax.
 # Below this threshold, PyTorch's native div+softmax is faster.
 # At and above this threshold, the fused Triton kernel wins.
-_FUSED_SAMPLING_BATCH_THRESHOLD = 128
+_FUSED_SAMPLING_BATCH_THRESHOLD = 32
 if is_npu():
     import torch_npu
 
diff --git a/test/registered/sampling/test_fused_temperature_softmax.py b/test/registered/sampling/test_fused_temperature_softmax.py
index d98fd4a0eb17..952584d7db65 100644
--- a/test/registered/sampling/test_fused_temperature_softmax.py
+++ b/test/registered/sampling/test_fused_temperature_softmax.py
@@ -11,7 +11,7 @@
 from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import CustomTestCase
 
-register_cuda_ci(est_time=15, suite="stage-b-test-small-1-gpu")
+register_cuda_ci(est_time=15, suite="stage-b-test-1-gpu-small")
 
 # Tolerance table — chosen per dtype to cover known rounding gaps.
 # fp32 ref is tight (kernel also runs in fp32 internally).

From afc7193a1d1ed16f7637ffd3276f60b0e08e1575 Mon Sep 17 00:00:00 2001
From: godmook <cmoh4135@naver.com>
Date: Sun, 5 Apr 2026 18:40:50 -0700
Subject: [PATCH 27/27] Refactoring tiny formula

---
 python/sglang/srt/layers/fused_sampling.py | 36 ++++++++++------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/python/sglang/srt/layers/fused_sampling.py b/python/sglang/srt/layers/fused_sampling.py
index 3d57ceb8d265..07150489ea30 100644
--- a/python/sglang/srt/layers/fused_sampling.py
+++ b/python/sglang/srt/layers/fused_sampling.py
@@ -44,11 +44,9 @@ def _single_pass_temperature_softmax_kernel(
     mask = offsets < vocab_size
 
     x = tl.load(
-        logits_ptr + row_idx * logits_stride + offsets,
-        mask=mask,
-        other=float("-inf"),
-    )
-    x = (x / temp).to(tl.float32)
+        logits_ptr + row_idx * logits_stride + offsets, mask=mask, other=float("-inf")
+    ).to(tl.float32)
+    x = x / temp
 
     x_max = tl.max(x, axis=0)
     exp_x = tl.exp(x - x_max)
@@ -72,8 +70,8 @@ def _single_pass_temperature_softmax_inplace_kernel(
     offsets = tl.arange(0, BLOCK_SIZE)
     mask = offsets < vocab_size
 
-    x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-    x = (x / temp).to(tl.float32)
+    x = tl.load(row_start + offsets, mask=mask, other=float("-inf")).to(tl.float32)
+    x = x / temp
 
     x_max = tl.max(x, axis=0)
     exp_x = tl.exp(x - x_max)
@@ -121,13 +119,13 @@ def _multi_pass_temperature_softmax_kernel(
     output_row = output_ptr + row_idx * output_stride
 
     # Pass 1: online max + sum (Milakov-Gimelshein)
-    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
-    running_sum = tl.full([], value=0.0, dtype=tl.float32)
+    running_max = tl.full([], float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], 0.0, dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
-        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
+        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf")).to(tl.float32)
+        x = x / temp
         tile_max = tl.max(x, axis=0)
         new_max = tl.maximum(running_max, tile_max)
         running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
@@ -139,8 +137,8 @@ def _multi_pass_temperature_softmax_kernel(
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
-        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
+        x = tl.load(logits_row + offsets, mask=mask, other=float("-inf")).to(tl.float32)
+        x = x / temp
         prob = tl.exp(x - running_max) / running_sum
         tl.store(output_row + offsets, prob, mask=mask)
 
@@ -159,13 +157,13 @@ def _multi_pass_temperature_softmax_inplace_kernel(
     row_start = logits_ptr + row_idx * stride
 
     # Pass 1: online max + sum (Milakov-Gimelshein)
-    running_max = tl.full([], value=float("-inf"), dtype=tl.float32)
-    running_sum = tl.full([], value=0.0, dtype=tl.float32)
+    running_max = tl.full([], float("-inf"), dtype=tl.float32)
+    running_sum = tl.full([], 0.0, dtype=tl.float32)
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
-        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
+        x = tl.load(row_start + offsets, mask=mask, other=float("-inf")).to(tl.float32)
+        x = x / temp
         tile_max = tl.max(x, axis=0)
         new_max = tl.maximum(running_max, tile_max)
         running_sum = running_sum * tl.exp(running_max - new_max) + tl.sum(
@@ -177,8 +175,8 @@ def _multi_pass_temperature_softmax_inplace_kernel(
     for start in range(0, vocab_size, BLOCK_SIZE):
         offsets = start + tl.arange(0, BLOCK_SIZE)
         mask = offsets < vocab_size
-        x = tl.load(row_start + offsets, mask=mask, other=float("-inf"))
-        x = (x / temp).to(tl.float32)
+        x = tl.load(row_start + offsets, mask=mask, other=float("-inf")).to(tl.float32)
+        x = x / temp
         prob = tl.exp(x - running_max) / running_sum
         tl.store(row_start + offsets, prob, mask=mask)