Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
89cb865
feat: SM120 (Blackwell Desktop) support for DeepSeek-V4 inference
AliceChenyy May 8, 2026
76acfc7
Add torch fallback for CompressorPrefillPlan.generate when tvm_ffi is…
rahulvijayaraghavan Apr 25, 2026
3135bb7
DeepSeek V4 - add triton and torch fallback for hc_split_sinkhorn
polisettyvarma May 13, 2026
937f734
Add triton fallback for fused_rope
rahulvijayaraghavan Apr 25, 2026
10cde88
Support MXFP4 in MoE using fused_moe triton
polisettyvarma May 13, 2026
3c36042
nsa_indexer: pure-torch Hadamard fallback for XPU
rahulvijayaraghavan Apr 27, 2026
80dd3a1
Add torch/triton fallback for compressor kernels
rahulvijayaraghavan Apr 30, 2026
3b6c421
Add torch fallback for fused norm rope jit kernel
rahulvijayaraghavan Apr 30, 2026
5417aff
support SGLANG_FP8_PAGED_MQA_LOGITS_TRITON also
polisettyvarma May 13, 2026
8bedc6d
xpu: chunk SM120 sparse decode fallback to bound peak memory and clam…
rahulvijayaraghavan May 2, 2026
ace49de
topk: drop dead allocations in fused_topk_torch_native
rahulvijayaraghavan May 2, 2026
420c87b
indexer: remove L0 sync hot spots in topk_transform_512_pytorch_vecto…
rahulvijayaraghavan May 2, 2026
471cf46
flash_mla_sm120_fallback: hoist SGLANG_SM120_SPARSE_CHUNK_MIB to impo…
rahulvijayaraghavan May 2, 2026
3367b60
jit_kernel/deepseek_v4: remove L0 host-sync hot spots in torch fallbacks
rahulvijayaraghavan May 2, 2026
7baf67e
optimize _torch_plan_compress_prefill memory usage
polisettyvarma May 14, 2026
2718d5a
Add sqrtsoftplus support to fused_topk_torch_native
rahulvijayaraghavan Apr 30, 2026
f767fff
Make tilelang import optional in deepseek_v4_rope
rahulvijayaraghavan Apr 24, 2026
221b574
changes required to pass flashmla metadata and flashmla xpu support
polisettyvarma May 14, 2026
bdca710
add pytorch profiling support for XPU in bench offline throughput
polisettyvarma May 14, 2026
d923977
fix bug in torch compress plan prefill due to conflict resolving inco…
polisettyvarma May 15, 2026
bf3ed74
remove _torch_plan_compress_prefill implementation
polisettyvarma May 15, 2026
9cc22ad
remove _triton_compress_forward implementation
polisettyvarma May 15, 2026
08ecb3c
remove unnecessary changes
polisettyvarma May 15, 2026
e0a2029
remove _hc_split_sinkhorn_torch implementation
polisettyvarma May 15, 2026
7aa7d3a
set default env for XPU
polisettyvarma May 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions python/sglang/bench_offline_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import os
import random
import time
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

import numpy as np

Expand Down Expand Up @@ -53,6 +53,9 @@ class BenchArgs:
extra_request_body: Optional[str] = None
apply_chat_template: bool = False
profile: bool = False
profile_activities: Tuple[str] = ("CPU", "GPU")
profile_start_step: Optional[int] = None
profile_steps: Optional[int] = None
skip_warmup: bool = False
do_not_exit: bool = False
prompt_suffix: str = ""
Expand Down Expand Up @@ -169,6 +172,26 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Use Torch Profiler. The endpoint must be launched with "
"SGLANG_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--profile-activities",
type=str,
nargs="+",
default=["CPU", "GPU"],
choices=["CPU", "GPU", "CUDA_PROFILER", "XPU"],
help="Profiler activities: CPU, GPU, XPU, CUDA_PROFILER. If CPU/GPU/XPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.",
)
parser.add_argument(
"--profile-start-step",
type=int,
default=None,
help="Decode step at which to start profiling (0-indexed). If not specified, defaults to output_len // 2.",
)
parser.add_argument(
"--profile-steps",
type=int,
default=None,
help="Number of decode steps to profile starting from profile-start-step. If not specified, profiles only one step.",
)
parser.add_argument(
"--skip-warmup",
action="store_true",
Expand Down Expand Up @@ -210,6 +233,9 @@ def throughput_test_once(
ignore_eos: bool,
extra_request_body: Dict,
profile: bool,
profile_activities=None,
profile_start_step=None,
profile_steps=None,
return_logprob: bool = False,
logprob_start_len: int = -1,
):
Expand Down Expand Up @@ -241,7 +267,7 @@ def throughput_test_once(
"SGLANG_TORCH_PROFILER_DIR" in os.environ
), "Please set SGLANG_TORCH_PROFILER_DIR."
os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
backend.start_profile()
backend.start_profile(start_step=profile_start_step, num_steps=profile_steps, activities=profile_activities)

st = time.perf_counter()
gen_out = backend.generate(
Expand All @@ -255,8 +281,9 @@ def throughput_test_once(
if profile:
dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
known_files = set(os.listdir(dir))
backend.stop_profile()
monitor_trace_file(known_files, dir)
if not profile_steps:
backend.stop_profile()
monitor_trace_file(known_files, dir)

if backend_name == "runtime":
gen_out = json.loads(gen_out)
Expand Down Expand Up @@ -455,6 +482,9 @@ def throughput_test(
ignore_eos=not bench_args.disable_ignore_eos,
extra_request_body=extra_request_body,
profile=bench_args.profile,
profile_activities=bench_args.profile_activities,
profile_start_step=bench_args.profile_start_step,
profile_steps=bench_args.profile_steps,
return_logprob=bench_args.return_logprob,
logprob_start_len=bench_args.logprob_start_len,
)
Expand Down
Loading
Loading