diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml index 3f83f40b6eff..f4d4561c9fe4 100644 --- a/.github/workflows/nightly-test-nvidia.yml +++ b/.github/workflows/nightly-test-nvidia.yml @@ -25,6 +25,7 @@ on: - 'nightly-test-multimodal-server-2-gpu' - 'nightly-test-perf-4-gpu-b200' - 'nightly-test-perf-8-gpu-b200' + - 'nightly-test-kernel-1-gpu-h100' workflow_call: inputs: ref: @@ -76,6 +77,42 @@ jobs: - uses: ./.github/actions/upload-cuda-coredumps if: always() + # JIT kernel full unit tests (expanded parameter ranges via SGLANG_JIT_KERNEL_RUN_FULL_TESTS) + nightly-test-kernel-1-gpu-h100: + if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-kernel-1-gpu-h100') + runs-on: 1-gpu-h100 + timeout-minutes: 240 + env: + # Full jit_kernel test grids (see sglang.jit_kernel.utils.should_run_full_tests) + SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1" + # Match pr-test-jit-kernel workflow for consistent JIT warmup behavior + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true + # Allow maintenance bypass on default branch (same semantics as PR JIT workflow) + SGLANG_PR_TEST_BYPASS_MAINTENANCE_ON_MAIN: ${{ github.ref == 'refs/heads/main' && 'true' || 'false' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - uses: ./.github/actions/check-maintenance + with: + github-token: ${{ github.token }} + + - name: Install dependencies + timeout-minutes: 20 + run: | + bash scripts/ci/cuda/ci_install_dependency.sh + + - name: Run jit kernel nightly suite + timeout-minutes: 60 + run: | + cd test + python3 run_suite.py --hw cuda --suite nightly-kernel-1-gpu --nightly --continue-on-error + + - uses: ./.github/actions/upload-cuda-coredumps + if: always() + # General tests - 4 GPU H100 nightly-test-general-4-gpu-h100: if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-4-gpu-h100') diff --git a/.github/workflows/pr-test-jit-kernel.yml b/.github/workflows/pr-test-jit-kernel.yml index bfcf2b600d5e..6e56278572a0 100644 --- a/.github/workflows/pr-test-jit-kernel.yml +++ b/.github/workflows/pr-test-jit-kernel.yml @@ -56,36 +56,8 @@ jobs: - name: Run test timeout-minutes: 30 run: | - cd python/sglang/jit_kernel - pytest tests/ - - jit-kernel-unit-test-nightly: - if: | - github.event_name == 'schedule' && - inputs.jit_kernel == 'true' - runs-on: 1-gpu-h100 - timeout-minutes: 240 - env: - SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1" - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }} - - - uses: ./.github/actions/check-maintenance - with: - github-token: ${{ github.token }} - - - name: Install dependencies - timeout-minutes: 20 - run: | - bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run full nightly test - timeout-minutes: 60 - run: | - cd python/sglang/jit_kernel - pytest tests/ + cd test/ + python3 run_suite.py --hw cuda --suite stage-b-kernel-unit-1-gpu-large jit-kernel-benchmark-test: if: | @@ -111,23 +83,5 @@ jobs: - name: Run benchmark tests timeout-minutes: 45 run: | - cd python/sglang/jit_kernel/benchmark - echo "Running jit-kernel benchmark tests in CI mode..." - - failures=() - - for bench_file in bench_*.py; do - echo "Testing $bench_file..." - if ! timeout 120 python3 "$bench_file"; then - failures+=("$bench_file") - fi - echo "Completed $bench_file" - echo "---" - done - - if [ ${#failures[@]} -ne 0 ]; then - echo "The following benchmark tests failed: ${failures[*]}" - exit 1 - fi - - echo "All jit-kernel benchmark tests completed successfully!" + cd test/ + python3 run_suite.py --hw cuda --suite stage-b-kernel-benchmark-1-gpu-large diff --git a/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py b/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py index 6a3fcbc93ee6..81422a597f4c 100644 --- a/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py +++ b/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py @@ -6,8 +6,11 @@ from sglang.jit_kernel.awq_dequantize import awq_dequantize as jit_awq_dequantize from sglang.jit_kernel.benchmark.utils import run_benchmark +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") + try: from sgl_kernel import awq_dequantize as aot_awq_dequantize diff --git a/python/sglang/jit_kernel/benchmark/bench_clamp_position.py b/python/sglang/jit_kernel/benchmark/bench_clamp_position.py index 08fa92660082..52082c64b9c4 100644 --- a/python/sglang/jit_kernel/benchmark/bench_clamp_position.py +++ b/python/sglang/jit_kernel/benchmark/bench_clamp_position.py @@ -11,6 +11,9 @@ ) from sglang.jit_kernel.clamp_position import clamp_position_cuda from sglang.srt.utils import get_compiler_backend +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large") SIZE_LIST = get_benchmark_range( full_range=[2**n for n in range(4, 16)], diff --git a/python/sglang/jit_kernel/benchmark/bench_concat_mla.py b/python/sglang/jit_kernel/benchmark/bench_concat_mla.py index 7f2b542a0c9b..8129b7db1c11 100644 --- a/python/sglang/jit_kernel/benchmark/bench_concat_mla.py +++ b/python/sglang/jit_kernel/benchmark/bench_concat_mla.py @@ -9,8 +9,11 @@ from sglang.jit_kernel.benchmark.utils import run_benchmark from sglang.jit_kernel.concat_mla import concat_mla_absorb_q as jit_absorb_q from sglang.jit_kernel.concat_mla import concat_mla_k as jit_k +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large") + IS_CI = is_in_ci() NUM_LOCAL_HEADS = 128 diff --git a/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py b/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py index 82d5166dfb5e..4f36f1a48276 100644 --- a/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py +++ b/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py @@ -22,6 +22,13 @@ import torch.distributed as dist from sglang.jit_kernel.benchmark.utils import is_in_ci +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci( + est_time=120, + suite="stage-b-kernel-benchmark-1-gpu-large", + disabled="requires multi-GPU, self-skips in CI", +) DTYPE_MAP = { "float16": torch.float16, diff --git a/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py b/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py index 0d7b1ec23505..a842be84b72b 100644 --- a/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py +++ b/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py @@ -7,8 +7,11 @@ from sglang.jit_kernel.benchmark.utils import run_benchmark from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large") + IS_CI = is_in_ci() diff --git a/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py b/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py index 24971231c832..ae9ce7ff8cbb 100644 --- a/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py +++ b/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py @@ -15,8 +15,11 @@ ScaleResidualLayerNormScaleShift, ScaleResidualRMSNormScaleShift, ) +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=17, suite="stage-b-kernel-benchmark-1-gpu-large") + if is_in_ci(): B_RANGE, S_RANGE, D_RANGE = [1], [128], [1024] else: diff --git a/python/sglang/jit_kernel/benchmark/bench_hadamard.py b/python/sglang/jit_kernel/benchmark/bench_hadamard.py index 61fc37e9b163..3da9ec484c6d 100644 --- a/python/sglang/jit_kernel/benchmark/bench_hadamard.py +++ b/python/sglang/jit_kernel/benchmark/bench_hadamard.py @@ -14,6 +14,9 @@ run_benchmark, ) from sglang.jit_kernel.hadamard import hadamard_transform +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") # AOT kernel: might not be available in all environments. # This is used for performance baseline comparison. diff --git a/python/sglang/jit_kernel/benchmark/bench_hicache.py b/python/sglang/jit_kernel/benchmark/bench_hicache.py index c825a7f85801..d0e4a31aaec9 100644 --- a/python/sglang/jit_kernel/benchmark/bench_hicache.py +++ b/python/sglang/jit_kernel/benchmark/bench_hicache.py @@ -31,6 +31,9 @@ transfer_hicache_all_layer, transfer_hicache_one_layer, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=29, suite="stage-b-kernel-benchmark-1-gpu-large") DISABLE_TORCH = os.environ.get("DISABLE_TORCH", "0") == "1" PAGE_SIZE = 1 diff --git a/python/sglang/jit_kernel/benchmark/bench_norm.py b/python/sglang/jit_kernel/benchmark/bench_norm.py index e58d9f5f9ff4..d046ecf2d2a8 100644 --- a/python/sglang/jit_kernel/benchmark/bench_norm.py +++ b/python/sglang/jit_kernel/benchmark/bench_norm.py @@ -9,8 +9,11 @@ from sglang.jit_kernel.benchmark.utils import run_benchmark from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") + IS_CI = is_in_ci() DTYPE = torch.bfloat16 diff --git a/python/sglang/jit_kernel/benchmark/bench_norm_impls.py b/python/sglang/jit_kernel/benchmark/bench_norm_impls.py index 9642635b1baa..7ef8ac832d95 100644 --- a/python/sglang/jit_kernel/benchmark/bench_norm_impls.py +++ b/python/sglang/jit_kernel/benchmark/bench_norm_impls.py @@ -21,8 +21,15 @@ from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm from sglang.jit_kernel.utils import KERNEL_PATH +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci( + est_time=120, + suite="stage-b-kernel-benchmark-1-gpu-large", + disabled="self-skips in CI, standalone tool", +) + os.environ.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1") REPO_ROOT = KERNEL_PATH.parents[2] diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py index 3ae2e5ccaed8..98da03e72cac 100644 --- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py +++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py @@ -13,6 +13,9 @@ scaled_fp4_quant, ) from sglang.srt.utils import is_sm100_supported +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") FLOAT4_E2M1_MAX = 6.0 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py index e1a4eb7f555d..2be07d3d60f0 100644 --- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py +++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py @@ -8,6 +8,9 @@ from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark from sglang.jit_kernel.nvfp4 import scaled_fp4_quant from sglang.srt.utils import is_sm100_supported +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") FLOAT4_E2M1_MAX = 6.0 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py index 80d3b4f04950..f7af1e7c5100 100644 --- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py +++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py @@ -8,6 +8,9 @@ from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm, scaled_fp4_quant from sglang.srt.utils import is_sm100_supported +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") FLOAT4_E2M1_MAX = 6.0 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max diff --git a/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py b/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py index 9549526d59fe..a061639b8017 100644 --- a/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py +++ b/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py @@ -6,6 +6,9 @@ from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark from sglang.jit_kernel.per_tensor_quant_fp8 import per_tensor_quant_fp8 +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") try: from vllm import _custom_ops as ops diff --git a/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py b/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py index b4515df0a360..a5a3c392b0df 100644 --- a/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py +++ b/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py @@ -17,8 +17,11 @@ ) from sglang.srt.utils import is_hip from sglang.srt.utils.bench_utils import bench_kineto +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large") + IS_CI = is_in_ci() _is_hip = is_hip() diff --git a/python/sglang/jit_kernel/benchmark/bench_qknorm.py b/python/sglang/jit_kernel/benchmark/bench_qknorm.py index d38fe0b539dd..e5458385cd7d 100644 --- a/python/sglang/jit_kernel/benchmark/bench_qknorm.py +++ b/python/sglang/jit_kernel/benchmark/bench_qknorm.py @@ -13,6 +13,9 @@ ) from sglang.jit_kernel.norm import fused_inplace_qknorm from sglang.srt.utils import get_current_device_stream_fast +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=10, suite="stage-b-kernel-benchmark-1-gpu-large") alt_stream = torch.cuda.Stream() diff --git a/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py b/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py index cbcff1031413..9bd05f7fc10a 100644 --- a/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py +++ b/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py @@ -9,8 +9,11 @@ from sglang.jit_kernel.benchmark.utils import run_benchmark from sglang.jit_kernel.norm import fused_inplace_qknorm_across_heads from sglang.srt.utils import get_current_device_stream_fast +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=12, suite="stage-b-kernel-benchmark-1-gpu-large") + IS_CI = is_in_ci() alt_stream = torch.cuda.Stream() diff --git a/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py b/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py index 42552a805b5b..c4713d56ae1b 100644 --- a/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py +++ b/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py @@ -9,8 +9,11 @@ fuse_layernorm_scale_shift_gate_select01_kernel, fuse_residual_layernorm_scale_shift_gate_select01_kernel, ) +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large") + if is_in_ci(): B_RANGE, S_RANGE, D_RANGE = [1], [128], [3072] else: diff --git a/python/sglang/jit_kernel/benchmark/bench_renorm.py b/python/sglang/jit_kernel/benchmark/bench_renorm.py index 20cc9ae71a6a..cd4ab36b4326 100644 --- a/python/sglang/jit_kernel/benchmark/bench_renorm.py +++ b/python/sglang/jit_kernel/benchmark/bench_renorm.py @@ -6,8 +6,11 @@ import triton.testing from sglang.jit_kernel.benchmark.utils import run_benchmark_no_cudagraph +from sglang.test.ci.ci_register import register_cuda_ci from sglang.utils import is_in_ci +register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large") + def torch_top_k_renorm_probs(probs, top_k): """Vectorized PyTorch implementation of top-k renormalization.""" diff --git a/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py b/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py index 31430442ce67..f56c8df2a981 100644 --- a/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py +++ b/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py @@ -11,6 +11,9 @@ ) from sglang.jit_kernel.resolve_future_token_ids import resolve_future_token_ids_cuda from sglang.srt.utils import get_compiler_backend +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=10, suite="stage-b-kernel-benchmark-1-gpu-large") SIZE_LIST = get_benchmark_range( full_range=[2**n for n in range(4, 16)], # 16 … 32K elements diff --git a/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py b/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py index b55a2c06be8b..779b8ad7e207 100644 --- a/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py +++ b/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py @@ -13,6 +13,9 @@ run_benchmark, ) from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=21, suite="stage-b-kernel-benchmark-1-gpu-large") def sglang_aot_rmsnorm( diff --git a/python/sglang/jit_kernel/benchmark/bench_rope.py b/python/sglang/jit_kernel/benchmark/bench_rope.py index 2ca71d4ca456..afe591185b60 100644 --- a/python/sglang/jit_kernel/benchmark/bench_rope.py +++ b/python/sglang/jit_kernel/benchmark/bench_rope.py @@ -10,6 +10,9 @@ get_benchmark_range, run_benchmark, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large") MAX_SEQ_LEN = 131072 ROPE_BASE = 10000.0 diff --git a/python/sglang/jit_kernel/benchmark/bench_store_cache.py b/python/sglang/jit_kernel/benchmark/bench_store_cache.py index 700274772977..f1399ff0efe9 100644 --- a/python/sglang/jit_kernel/benchmark/bench_store_cache.py +++ b/python/sglang/jit_kernel/benchmark/bench_store_cache.py @@ -12,6 +12,9 @@ get_benchmark_range, ) from sglang.jit_kernel.kvcache import store_cache +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=9, suite="stage-b-kernel-benchmark-1-gpu-large") def sglang_jit_store_cache( diff --git a/python/sglang/jit_kernel/norm.py b/python/sglang/jit_kernel/norm.py index 3366f3871a67..4aef33c20d8b 100644 --- a/python/sglang/jit_kernel/norm.py +++ b/python/sglang/jit_kernel/norm.py @@ -6,9 +6,6 @@ import torch from sglang.jit_kernel.debug_utils import maybe_wrap_jit_kernel_debug - -logger = logging.getLogger(__name__) - from sglang.jit_kernel.utils import ( cache_once, is_arch_support_pdl, @@ -20,6 +17,9 @@ from tvm_ffi.module import Module +logger = logging.getLogger(__name__) + + @cache_once def _jit_qknorm_module(head_dim: int, dtype: torch.dtype) -> Module: args = make_cpp_args(head_dim, is_arch_support_pdl(), dtype) diff --git a/python/sglang/jit_kernel/tests/test_add_constant.py b/python/sglang/jit_kernel/tests/test_add_constant.py index 8379c18b1d7c..cad9ac3abd97 100644 --- a/python/sglang/jit_kernel/tests/test_add_constant.py +++ b/python/sglang/jit_kernel/tests/test_add_constant.py @@ -4,6 +4,10 @@ import torch from sglang.jit_kernel.add_constant import add_constant +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=45, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=180, suite="nightly-kernel-1-gpu", nightly=True) @pytest.mark.parametrize("size", [1, 2, 127, 128, 1024, 1025]) diff --git a/python/sglang/jit_kernel/tests/test_awq_dequantize.py b/python/sglang/jit_kernel/tests/test_awq_dequantize.py index d90fafa14970..debd97729621 100644 --- a/python/sglang/jit_kernel/tests/test_awq_dequantize.py +++ b/python/sglang/jit_kernel/tests/test_awq_dequantize.py @@ -5,6 +5,10 @@ import torch from sglang.jit_kernel.awq_dequantize import awq_dequantize as jit_awq_dequantize +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=9, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) try: from sgl_kernel import awq_dequantize as aot_awq_dequantize diff --git a/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py b/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py index a7ff242df3b3..2a0f9354ecd9 100644 --- a/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py +++ b/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py @@ -9,6 +9,10 @@ awq_marlin_moe_repack as jit_awq_marlin_moe_repack, ) from sglang.srt.layers.quantization.utils import pack_cols, quantize_weights +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def _has_aot_awq_marlin_moe_repack() -> bool: diff --git a/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py b/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py index a3ee38808951..35c23922dc8c 100644 --- a/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py +++ b/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py @@ -9,8 +9,12 @@ awq_marlin_repack as jit_awq_marlin_repack, ) from sglang.srt.layers.quantization.utils import pack_cols, quantize_weights +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_marlin_utils import get_weight_perm, marlin_weights +register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def _has_aot_awq_marlin_repack() -> bool: return hasattr(torch.ops.sgl_kernel, "awq_marlin_repack") and hasattr( diff --git a/python/sglang/jit_kernel/tests/test_clamp_position.py b/python/sglang/jit_kernel/tests/test_clamp_position.py index 3fdc8c426562..cb3ec6ce595f 100644 --- a/python/sglang/jit_kernel/tests/test_clamp_position.py +++ b/python/sglang/jit_kernel/tests/test_clamp_position.py @@ -4,6 +4,10 @@ import torch from sglang.jit_kernel.clamp_position import clamp_position_cuda +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=12, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def _reference_clamp_position(seq_lens): diff --git a/python/sglang/jit_kernel/tests/test_concat_mla.py b/python/sglang/jit_kernel/tests/test_concat_mla.py index 45f35e2488f4..9ecfb654cac8 100644 --- a/python/sglang/jit_kernel/tests/test_concat_mla.py +++ b/python/sglang/jit_kernel/tests/test_concat_mla.py @@ -5,6 +5,11 @@ import torch import triton +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=17, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def torch_concat_mla_k( k: torch.Tensor, k_nope: torch.Tensor, k_rope: torch.Tensor diff --git a/python/sglang/jit_kernel/tests/test_custom_all_reduce.py b/python/sglang/jit_kernel/tests/test_custom_all_reduce.py index e1f08dbb9863..bf0cfa38d775 100644 --- a/python/sglang/jit_kernel/tests/test_custom_all_reduce.py +++ b/python/sglang/jit_kernel/tests/test_custom_all_reduce.py @@ -18,6 +18,7 @@ import logging import os import subprocess +import sys from typing import Optional import pytest @@ -30,6 +31,19 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_v2 import ( CustomAllReduceV2, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci( + est_time=120, + suite="stage-b-kernel-unit-1-gpu-large", + disabled="requires multi-GPU distributed setup", +) +register_cuda_ci( + est_time=120, + suite="nightly-kernel-1-gpu", + nightly=True, + disabled="requires multi-GPU distributed setup", +) # --------------------------------------------------------------------------- # Test parameters (shared between test class and worker) @@ -224,4 +238,7 @@ def worker_main() -> None: if __name__ == "__main__": - worker_main() + if "LOCAL_RANK" in os.environ: + worker_main() + else: + sys.exit(pytest.main([__file__, "-v", "-s"])) diff --git a/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py b/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py index fc7a8e19b00c..83e47024c7a6 100644 --- a/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py +++ b/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py @@ -6,6 +6,8 @@ import pytest import torch +from sglang.test.ci.ci_register import register_cuda_ci + try: import cuda.bindings.driver as cuda_driver import cutlass # noqa: F401 @@ -27,6 +29,9 @@ except ImportError: TRITON_AVAILABLE = False +register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def run_triton_kernel(A_log, dt_bias, q, k, v, a, b, initial_state, indices, scale): return fused_sigmoid_gating_delta_rule_update( diff --git a/python/sglang/jit_kernel/tests/test_flash_attention_4.py b/python/sglang/jit_kernel/tests/test_flash_attention_4.py index 9785a2ec1cf0..e1453b8f2323 100644 --- a/python/sglang/jit_kernel/tests/test_flash_attention_4.py +++ b/python/sglang/jit_kernel/tests/test_flash_attention_4.py @@ -12,6 +12,10 @@ from einops import rearrange, repeat from sglang.jit_kernel.flash_attention_v4 import flash_attn_varlen_func +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=120, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=900, suite="nightly-kernel-1-gpu", nightly=True) # Skip this test on Hopper machine skip_condition = torch.cuda.get_device_capability() < (10, 0) diff --git a/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py b/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py index a16b2650c240..49636c4f5444 100644 --- a/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py +++ b/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py @@ -5,6 +5,10 @@ import torch from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def sglang_jit_fused_add_rmsnorm( diff --git a/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py b/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py index cc0f05904c34..f0fb78c1f60e 100644 --- a/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py +++ b/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py @@ -14,6 +14,11 @@ import pytest import torch +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=100, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=400, suite="nightly-kernel-1-gpu", nightly=True) + # ============================================================================= # Helper Functions # ============================================================================= diff --git a/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py b/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py index c54fcfd5a1a9..42c3371ff890 100644 --- a/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py +++ b/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py @@ -10,6 +10,10 @@ fused_norm_scale_shift, fused_scale_residual_norm_scale_shift, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) DEVICE = "cuda" SHAPE_MAP = { diff --git a/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py b/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py index 4f38c558d0e9..5766f3168b5a 100644 --- a/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py +++ b/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py @@ -22,6 +22,8 @@ import pytest import torch +from sglang.test.ci.ci_register import register_cuda_ci + try: from sglang.jit_kernel.fused_store_index_cache import ( can_use_nsa_fused_store, @@ -46,6 +48,9 @@ except ImportError: _is_fp8_fnuz = False +register_cuda_ci(est_time=24, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + PAGE_SIZE = 64 HEAD_DIM = 128 FP8_E4M3_MAX = 448.0 diff --git a/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py b/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py index 1545f3e6b3e7..08db51289b25 100644 --- a/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py +++ b/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py @@ -11,6 +11,8 @@ import pytest import torch +from sglang.test.ci.ci_register import register_cuda_ci + try: from sglang.srt.layers.attention.fla.fused_gdn_gating import fused_gdn_gating from sglang.srt.layers.attention.fla.fused_recurrent import ( @@ -24,6 +26,9 @@ except ImportError: KERNELS_AVAILABLE = False +register_cuda_ci(est_time=6, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def _make_tensors(N, T, H, HV, K, V, device="cuda", seed=2025): """Create input tensors for GDN target_verify.""" diff --git a/python/sglang/jit_kernel/tests/test_gptq_marlin.py b/python/sglang/jit_kernel/tests/test_gptq_marlin.py index ff2b2c13c29c..a811ee654d78 100644 --- a/python/sglang/jit_kernel/tests/test_gptq_marlin.py +++ b/python/sglang/jit_kernel/tests/test_gptq_marlin.py @@ -6,8 +6,12 @@ from sglang.jit_kernel.gptq_marlin import gptq_marlin_gemm from sglang.srt.layers.quantization.marlin_utils import marlin_make_workspace +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize +register_cuda_ci(est_time=13, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + MNK_FACTORS = [ (1, 1, 1), (1, 4, 8), diff --git a/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py b/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py index 71ab5aafa9f4..4bcbc0bf3a4a 100644 --- a/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py +++ b/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py @@ -10,8 +10,12 @@ pack_rows, sort_weights, ) +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_marlin_utils import get_weight_perm, marlin_weights +register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 256] diff --git a/python/sglang/jit_kernel/tests/test_hadamard_jit.py b/python/sglang/jit_kernel/tests/test_hadamard_jit.py index daaa7addbae8..cc03a01aec88 100644 --- a/python/sglang/jit_kernel/tests/test_hadamard_jit.py +++ b/python/sglang/jit_kernel/tests/test_hadamard_jit.py @@ -14,6 +14,10 @@ hadamard_transform_28n, hadamard_transform_40n, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=128, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=512, suite="nightly-kernel-1-gpu", nightly=True) # Exact M×N Hadamard matrices (±1 entries) copied from # python/sglang/jit_kernel/csrc/fast-hadamard-transform/code_gen.py. diff --git a/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py b/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py index 415d2bf5a5f2..48688e5da033 100644 --- a/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py +++ b/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py @@ -11,7 +11,8 @@ from sglang.jit_kernel.moe_lora_align import moe_lora_align_block_size from sglang.test.ci.ci_register import register_cuda_ci -register_cuda_ci(est_time=80, suite="stage-b-test-1-gpu-large") +register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def round_up(x, base): diff --git a/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py b/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py index 90cb99a6f1df..5100aac5ceed 100644 --- a/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py +++ b/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py @@ -7,8 +7,12 @@ from sglang.jit_kernel.moe_wna16_marlin import moe_wna16_marlin_gemm from sglang.srt.layers.moe.fused_moe_triton import moe_align_block_size +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize +register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def _has_aot_moe_wna16_marlin_gemm() -> bool: return hasattr(torch.ops.sgl_kernel, "moe_wna16_marlin_gemm") and hasattr( diff --git a/python/sglang/jit_kernel/tests/test_norm_jit.py b/python/sglang/jit_kernel/tests/test_norm_jit.py index 1f8b41e6fbd1..ebd0d3034cd9 100644 --- a/python/sglang/jit_kernel/tests/test_norm_jit.py +++ b/python/sglang/jit_kernel/tests/test_norm_jit.py @@ -5,6 +5,11 @@ import pytest import torch +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=125, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=500, suite="nightly-kernel-1-gpu", nightly=True) + # JIT rmsnorm: fp16/bf16 only # - Warp norm path (one warp per token): hidden_size in {64, 128, 256} # - CTA norm path (multi-warp per token): hidden_size is a multiple of 256, > 256, and <=8192 diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py b/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py index 6eb809d3385f..864636050098 100644 --- a/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py +++ b/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py @@ -1,3 +1,5 @@ +import sys + import pytest import torch @@ -6,6 +8,10 @@ scaled_fp4_experts_quant, scaled_fp4_quant, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) FLOAT4_E2M1_MAX = 6.0 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max @@ -125,3 +131,7 @@ def test_nvfp4_blockwise_moe_grouped_mm(dtype: torch.dtype) -> None: ref[start:end] = torch.matmul(a[start:end], b[i].t()) torch.testing.assert_close(out, ref, atol=1e-1, rtol=1e-1) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v", "-s"])) diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py b/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py index 9a76cd8009d7..21c14d5b1c07 100644 --- a/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py +++ b/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py @@ -1,7 +1,13 @@ +import sys + import pytest import torch from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm, scaled_fp4_quant +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def _nvfp4_supported() -> bool: @@ -140,3 +146,7 @@ def test_nvfp4_gemm(dtype: torch.dtype, shape: tuple[int, int, int]) -> None: ) torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v", "-s"])) diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_quant.py b/python/sglang/jit_kernel/tests/test_nvfp4_quant.py index eb7816408b60..5b7dfbd0f2e5 100644 --- a/python/sglang/jit_kernel/tests/test_nvfp4_quant.py +++ b/python/sglang/jit_kernel/tests/test_nvfp4_quant.py @@ -1,3 +1,5 @@ +import sys + import pytest import torch @@ -12,6 +14,11 @@ except Exception: _sgl_silu_and_mul = None +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + def _nvfp4_supported() -> bool: return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0) @@ -212,3 +219,7 @@ def test_silu_and_mul_quantize_to_fp4_grouped(shape: tuple[int, int, int]) -> No scale_ref = recover_swizzled_scales(ref_output_scales[i], m, k) scale_ans = recover_swizzled_scales(output_scales[i], m, k) torch.testing.assert_close(scale_ref[: mask[i]], scale_ans[: mask[i]]) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v", "-s"])) diff --git a/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py b/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py index ecc79d67b75b..cac76d03a783 100644 --- a/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py +++ b/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py @@ -6,6 +6,10 @@ import torch from sglang.jit_kernel.per_tensor_quant_fp8 import per_tensor_quant_fp8 +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) try: from sglang.srt.utils import is_hip diff --git a/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py b/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py index dde8ccc6d7f9..8b0452d7d0ec 100644 --- a/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py +++ b/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py @@ -23,6 +23,10 @@ from sglang.srt.layers.quantization.fp8_kernel import ( per_token_group_quant_8bit as triton_per_token_group_quant_8bit, ) +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) configs = list( itertools.product( diff --git a/python/sglang/jit_kernel/tests/test_pos_enc.py b/python/sglang/jit_kernel/tests/test_pos_enc.py index 2bfee5619c9b..ad09fd706416 100644 --- a/python/sglang/jit_kernel/tests/test_pos_enc.py +++ b/python/sglang/jit_kernel/tests/test_pos_enc.py @@ -8,6 +8,10 @@ import triton.language as tl from sglang.jit_kernel.rope import rotary_embedding +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=18, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) @triton.jit diff --git a/python/sglang/jit_kernel/tests/test_qknorm.py b/python/sglang/jit_kernel/tests/test_qknorm.py index 5bd16b4f30b3..4b85b5c52bb6 100644 --- a/python/sglang/jit_kernel/tests/test_qknorm.py +++ b/python/sglang/jit_kernel/tests/test_qknorm.py @@ -6,6 +6,10 @@ import triton from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=37, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=148, suite="nightly-kernel-1-gpu", nightly=True) def sglang_aot_qknorm( diff --git a/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py b/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py index bf9e4991faef..81c8a1ead5bd 100644 --- a/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py +++ b/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py @@ -6,6 +6,10 @@ import triton from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=15, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def sglang_jit_qknorm_across_heads( diff --git a/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py b/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py index f536964f57e3..dce8ce947311 100644 --- a/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py +++ b/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py @@ -10,6 +10,10 @@ fuse_residual_layernorm_scale_shift_gate_select01_kernel, ) from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=15, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) DEVICE = "cuda" DTYPES = get_ci_test_range( diff --git a/python/sglang/jit_kernel/tests/test_renorm.py b/python/sglang/jit_kernel/tests/test_renorm.py index 870b46db7752..4def31326749 100644 --- a/python/sglang/jit_kernel/tests/test_renorm.py +++ b/python/sglang/jit_kernel/tests/test_renorm.py @@ -7,6 +7,11 @@ import sgl_kernel import torch +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=6, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) + @pytest.mark.parametrize("batch_size", [1, 99, 989]) @pytest.mark.parametrize("vocab_size", [111, 32000, 128256]) diff --git a/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py b/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py index ad3225720006..cfb8597f6a3e 100644 --- a/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py +++ b/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py @@ -4,6 +4,10 @@ import torch from sglang.jit_kernel.resolve_future_token_ids import resolve_future_token_ids_cuda +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=9, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def _reference_resolve(input_ids, future_map): diff --git a/python/sglang/jit_kernel/tests/test_rmsnorm.py b/python/sglang/jit_kernel/tests/test_rmsnorm.py index 138fece2183e..ac31a792747d 100644 --- a/python/sglang/jit_kernel/tests/test_rmsnorm.py +++ b/python/sglang/jit_kernel/tests/test_rmsnorm.py @@ -6,6 +6,10 @@ import triton from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=18, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) def sglang_jit_rmsnorm(input: torch.Tensor, weight: torch.Tensor) -> None: diff --git a/python/sglang/jit_kernel/tests/test_rope.py b/python/sglang/jit_kernel/tests/test_rope.py index abc482f5c28f..62a601f653a4 100644 --- a/python/sglang/jit_kernel/tests/test_rope.py +++ b/python/sglang/jit_kernel/tests/test_rope.py @@ -5,6 +5,10 @@ import triton from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=64, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=256, suite="nightly-kernel-1-gpu", nightly=True) DEVICE = "cuda" DTYPE = torch.bfloat16 diff --git a/python/sglang/jit_kernel/tests/test_store_cache.py b/python/sglang/jit_kernel/tests/test_store_cache.py index cd6887042bb9..278781ca2420 100644 --- a/python/sglang/jit_kernel/tests/test_store_cache.py +++ b/python/sglang/jit_kernel/tests/test_store_cache.py @@ -6,6 +6,10 @@ from sglang.jit_kernel.kvcache import can_use_store_cache, store_cache from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) BS_LIST = [2**n for n in range(0, 15)] BS_LIST += [x + 1 + i for i, x in enumerate(BS_LIST)] diff --git a/python/sglang/jit_kernel/tests/test_timestep_embedding.py b/python/sglang/jit_kernel/tests/test_timestep_embedding.py index bc900291ab57..1ec5912d4778 100644 --- a/python/sglang/jit_kernel/tests/test_timestep_embedding.py +++ b/python/sglang/jit_kernel/tests/test_timestep_embedding.py @@ -14,6 +14,10 @@ timestep_embedding as timestep_embedding_cuda, ) from sglang.jit_kernel.utils import get_ci_test_range +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large") +register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True) CORRECTNESS_BATCH_SIZES = get_ci_test_range( [1, 2, 8, 128, 256, 512, 1536, 2048, 4096, 11008, 16384], diff --git a/scripts/version_branch_to_tag.sh b/scripts/version_branch_to_tag.sh deleted file mode 100755 index 9f587fb0b541..000000000000 --- a/scripts/version_branch_to_tag.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -set -euxo pipefail - -# This script is used for release. -# It tags all remote branches starting with 'v' with the same name as the branch, -# deletes the corresponding branches from the remote, and pushes the tags to the remote repository. - -git fetch origin --prune - -# List all branches starting with 'v' -branches=$(git branch -r | grep 'origin/v' | sed 's/origin\///') - -# Loop through each branch -for branch in $branches; do - echo "Processing branch: $branch" - - # Get the commit hash for the branch - commit_hash=$(git rev-parse origin/$branch) - - # Create a tag with the same name as the branch using the commit hash - git tag $branch $commit_hash - - # Delete the branch from the remote - git push origin --delete $branch -done - -# Push all tags to the remote repository -git push --tags - -echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository." diff --git a/test/run_suite.py b/test/run_suite.py index 93fcaaede101..75c79ba09de0 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -41,6 +41,8 @@ "stage-b-test-1-gpu-large", "stage-b-test-2-gpu-large", "stage-b-test-4-gpu-b200", + "stage-b-kernel-unit-1-gpu-large", + "stage-b-kernel-benchmark-1-gpu-large", "stage-c-test-4-gpu-h100", "stage-c-test-4-gpu-b200", "stage-c-test-4-gpu-gb200", @@ -73,6 +75,7 @@ "nightly-8-gpu-h200-basic", # Basic tests for large models on H200 "nightly-8-gpu-b200-basic", # Basic tests for large models on B200 "nightly-8-gpu-common", # Common tests that run on both H200 and B200 + "nightly-kernel-1-gpu", # Eval and perf suites (2-gpu) "nightly-eval-text-2-gpu", "nightly-eval-vlm-2-gpu", @@ -170,9 +173,11 @@ def run_a_suite(args): auto_partition_id = args.auto_partition_id auto_partition_size = args.auto_partition_size - # All tests (per-commit and nightly) are now in registered/ # Use absolute paths so the script works from any working directory script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.dirname(script_dir) + + # Registered tests under test/registered/ files = [ f for f in glob.glob( @@ -180,7 +185,13 @@ def run_a_suite(args): ) if not f.endswith("/conftest.py") and not f.endswith("/__init__.py") ] - # Strict: all registered files must have proper registration + + # JIT kernel tests and benchmarks (live alongside kernel source) + jit_kernel_dir = os.path.join(repo_root, "python", "sglang", "jit_kernel") + files += glob.glob(os.path.join(jit_kernel_dir, "tests", "test_*.py")) + files += glob.glob(os.path.join(jit_kernel_dir, "benchmark", "bench_*.py")) + + # Strict: all discovered files must have proper registration sanity_check = True all_tests = collect_tests(files, sanity_check=sanity_check)