diff --git a/.github/workflows/nightly-test-nvidia.yml b/.github/workflows/nightly-test-nvidia.yml
index 3f83f40b6eff..f4d4561c9fe4 100644
--- a/.github/workflows/nightly-test-nvidia.yml
+++ b/.github/workflows/nightly-test-nvidia.yml
@@ -25,6 +25,7 @@ on:
           - 'nightly-test-multimodal-server-2-gpu'
           - 'nightly-test-perf-4-gpu-b200'
           - 'nightly-test-perf-8-gpu-b200'
+          - 'nightly-test-kernel-1-gpu-h100'
   workflow_call:
     inputs:
       ref:
@@ -76,6 +77,42 @@ jobs:
       - uses: ./.github/actions/upload-cuda-coredumps
         if: always()
 
+  # JIT kernel full unit tests (expanded parameter ranges via SGLANG_JIT_KERNEL_RUN_FULL_TESTS)
+  nightly-test-kernel-1-gpu-h100:
+    if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-kernel-1-gpu-h100')
+    runs-on: 1-gpu-h100
+    timeout-minutes: 240
+    env:
+      # Full jit_kernel test grids (see sglang.jit_kernel.utils.should_run_full_tests)
+      SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1"
+      # Match pr-test-jit-kernel workflow for consistent JIT warmup behavior (quoted: env values are strings)
+      SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "true"
+      # Allow maintenance bypass on default branch (same semantics as PR JIT workflow)
+      SGLANG_PR_TEST_BYPASS_MAINTENANCE_ON_MAIN: ${{ github.ref == 'refs/heads/main' && 'true' || 'false' }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - uses: ./.github/actions/check-maintenance
+        with:
+          github-token: ${{ github.token }}
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        run: |
+          bash scripts/ci/cuda/ci_install_dependency.sh
+
+      - name: Run jit kernel nightly suite
+        timeout-minutes: 180  # registered nightly est_time values total ~1h+; keep headroom under the 240-min job cap
+        run: |
+          cd test
+          python3 run_suite.py --hw cuda --suite nightly-kernel-1-gpu --nightly --continue-on-error
+
+      - uses: ./.github/actions/upload-cuda-coredumps
+        if: always()
+
   # General tests - 4 GPU H100
   nightly-test-general-4-gpu-h100:
     if: github.repository == 'sgl-project/sglang' && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-general-4-gpu-h100')
diff --git a/.github/workflows/pr-test-jit-kernel.yml b/.github/workflows/pr-test-jit-kernel.yml
index bfcf2b600d5e..6e56278572a0 100644
--- a/.github/workflows/pr-test-jit-kernel.yml
+++ b/.github/workflows/pr-test-jit-kernel.yml
@@ -56,36 +56,8 @@ jobs:
       - name: Run test
         timeout-minutes: 30
         run: |
-          cd python/sglang/jit_kernel
-          pytest tests/
-
-  jit-kernel-unit-test-nightly:
-    if: |
-      github.event_name == 'schedule' &&
-      inputs.jit_kernel == 'true'
-    runs-on: 1-gpu-h100
-    timeout-minutes: 240
-    env:
-      SGLANG_JIT_KERNEL_RUN_FULL_TESTS: "1"
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.git_ref || github.sha }}
-
-      - uses: ./.github/actions/check-maintenance
-        with:
-          github-token: ${{ github.token }}
-
-      - name: Install dependencies
-        timeout-minutes: 20
-        run: |
-          bash scripts/ci/cuda/ci_install_dependency.sh
-
-      - name: Run full nightly test
-        timeout-minutes: 60
-        run: |
-          cd python/sglang/jit_kernel
-          pytest tests/
+          cd test/
+          python3 run_suite.py --hw cuda --suite stage-b-kernel-unit-1-gpu-large
 
   jit-kernel-benchmark-test:
     if: |
@@ -111,23 +83,5 @@ jobs:
       - name: Run benchmark tests
         timeout-minutes: 45
         run: |
-          cd python/sglang/jit_kernel/benchmark
-          echo "Running jit-kernel benchmark tests in CI mode..."
-
-          failures=()
-
-          for bench_file in bench_*.py; do
-            echo "Testing $bench_file..."
-            if ! timeout 120 python3 "$bench_file"; then
-              failures+=("$bench_file")
-            fi
-            echo "Completed $bench_file"
-            echo "---"
-          done
-
-          if [ ${#failures[@]} -ne 0 ]; then
-            echo "The following benchmark tests failed: ${failures[*]}"
-            exit 1
-          fi
-
-          echo "All jit-kernel benchmark tests completed successfully!"
+          cd test/
+          python3 run_suite.py --hw cuda --suite stage-b-kernel-benchmark-1-gpu-large
diff --git a/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py b/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py
index 6a3fcbc93ee6..81422a597f4c 100644
--- a/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py
+++ b/python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py
@@ -6,8 +6,11 @@
 
 from sglang.jit_kernel.awq_dequantize import awq_dequantize as jit_awq_dequantize
 from sglang.jit_kernel.benchmark.utils import run_benchmark
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 try:
     from sgl_kernel import awq_dequantize as aot_awq_dequantize
 
diff --git a/python/sglang/jit_kernel/benchmark/bench_clamp_position.py b/python/sglang/jit_kernel/benchmark/bench_clamp_position.py
index 08fa92660082..52082c64b9c4 100644
--- a/python/sglang/jit_kernel/benchmark/bench_clamp_position.py
+++ b/python/sglang/jit_kernel/benchmark/bench_clamp_position.py
@@ -11,6 +11,9 @@
 )
 from sglang.jit_kernel.clamp_position import clamp_position_cuda
 from sglang.srt.utils import get_compiler_backend
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 SIZE_LIST = get_benchmark_range(
     full_range=[2**n for n in range(4, 16)],
diff --git a/python/sglang/jit_kernel/benchmark/bench_concat_mla.py b/python/sglang/jit_kernel/benchmark/bench_concat_mla.py
index 7f2b542a0c9b..8129b7db1c11 100644
--- a/python/sglang/jit_kernel/benchmark/bench_concat_mla.py
+++ b/python/sglang/jit_kernel/benchmark/bench_concat_mla.py
@@ -9,8 +9,11 @@
 from sglang.jit_kernel.benchmark.utils import run_benchmark
 from sglang.jit_kernel.concat_mla import concat_mla_absorb_q as jit_absorb_q
 from sglang.jit_kernel.concat_mla import concat_mla_k as jit_k
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 IS_CI = is_in_ci()
 
 NUM_LOCAL_HEADS = 128
diff --git a/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py b/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py
index 82d5166dfb5e..4f36f1a48276 100644
--- a/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py
+++ b/python/sglang/jit_kernel/benchmark/bench_custom_all_reduce.py
@@ -22,6 +22,13 @@
 import torch.distributed as dist
 
 from sglang.jit_kernel.benchmark.utils import is_in_ci
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(
+    est_time=120,
+    suite="stage-b-kernel-benchmark-1-gpu-large",
+    disabled="requires multi-GPU, self-skips in CI",
+)
 
 DTYPE_MAP = {
     "float16": torch.float16,
diff --git a/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py b/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py
index 0d7b1ec23505..a842be84b72b 100644
--- a/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py
@@ -7,8 +7,11 @@
 
 from sglang.jit_kernel.benchmark.utils import run_benchmark
 from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 IS_CI = is_in_ci()
 
 
diff --git a/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py b/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py
index 24971231c832..ae9ce7ff8cbb 100644
--- a/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py
+++ b/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py
@@ -15,8 +15,11 @@
     ScaleResidualLayerNormScaleShift,
     ScaleResidualRMSNormScaleShift,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=17, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 if is_in_ci():
     B_RANGE, S_RANGE, D_RANGE = [1], [128], [1024]
 else:
diff --git a/python/sglang/jit_kernel/benchmark/bench_hadamard.py b/python/sglang/jit_kernel/benchmark/bench_hadamard.py
index 61fc37e9b163..3da9ec484c6d 100644
--- a/python/sglang/jit_kernel/benchmark/bench_hadamard.py
+++ b/python/sglang/jit_kernel/benchmark/bench_hadamard.py
@@ -14,6 +14,9 @@
     run_benchmark,
 )
 from sglang.jit_kernel.hadamard import hadamard_transform
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 # AOT kernel: might not be available in all environments.
 # This is used for performance baseline comparison.
diff --git a/python/sglang/jit_kernel/benchmark/bench_hicache.py b/python/sglang/jit_kernel/benchmark/bench_hicache.py
index c825a7f85801..d0e4a31aaec9 100644
--- a/python/sglang/jit_kernel/benchmark/bench_hicache.py
+++ b/python/sglang/jit_kernel/benchmark/bench_hicache.py
@@ -31,6 +31,9 @@
     transfer_hicache_all_layer,
     transfer_hicache_one_layer,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=29, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 DISABLE_TORCH = os.environ.get("DISABLE_TORCH", "0") == "1"
 PAGE_SIZE = 1
diff --git a/python/sglang/jit_kernel/benchmark/bench_norm.py b/python/sglang/jit_kernel/benchmark/bench_norm.py
index e58d9f5f9ff4..d046ecf2d2a8 100644
--- a/python/sglang/jit_kernel/benchmark/bench_norm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_norm.py
@@ -9,8 +9,11 @@
 from sglang.jit_kernel.benchmark.utils import run_benchmark
 from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm
 from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 IS_CI = is_in_ci()
 
 DTYPE = torch.bfloat16
diff --git a/python/sglang/jit_kernel/benchmark/bench_norm_impls.py b/python/sglang/jit_kernel/benchmark/bench_norm_impls.py
index 9642635b1baa..7ef8ac832d95 100644
--- a/python/sglang/jit_kernel/benchmark/bench_norm_impls.py
+++ b/python/sglang/jit_kernel/benchmark/bench_norm_impls.py
@@ -21,8 +21,15 @@
 from sglang.jit_kernel.norm import fused_add_rmsnorm as jit_fused_add_rmsnorm
 from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm
 from sglang.jit_kernel.utils import KERNEL_PATH
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(
+    est_time=120,
+    suite="stage-b-kernel-benchmark-1-gpu-large",
+    disabled="self-skips in CI, standalone tool",
+)
+
 os.environ.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1")
 
 REPO_ROOT = KERNEL_PATH.parents[2]
diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py
index 3ae2e5ccaed8..98da03e72cac 100644
--- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py
+++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py
@@ -13,6 +13,9 @@
     scaled_fp4_quant,
 )
 from sglang.srt.utils import is_sm100_supported
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py
index e1a4eb7f555d..2be07d3d60f0 100644
--- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py
+++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py
@@ -8,6 +8,9 @@
 from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark
 from sglang.jit_kernel.nvfp4 import scaled_fp4_quant
 from sglang.srt.utils import is_sm100_supported
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
diff --git a/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py b/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py
index 80d3b4f04950..f7af1e7c5100 100644
--- a/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py
@@ -8,6 +8,9 @@
 from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark
 from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from sglang.srt.utils import is_sm100_supported
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
diff --git a/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py b/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py
index 9549526d59fe..a061639b8017 100644
--- a/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py
+++ b/python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py
@@ -6,6 +6,9 @@
 
 from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark
 from sglang.jit_kernel.per_tensor_quant_fp8 import per_tensor_quant_fp8
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 try:
     from vllm import _custom_ops as ops
diff --git a/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py b/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py
index b4515df0a360..a5a3c392b0df 100644
--- a/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py
+++ b/python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py
@@ -17,8 +17,11 @@
 )
 from sglang.srt.utils import is_hip
 from sglang.srt.utils.bench_utils import bench_kineto
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 IS_CI = is_in_ci()
 
 _is_hip = is_hip()
diff --git a/python/sglang/jit_kernel/benchmark/bench_qknorm.py b/python/sglang/jit_kernel/benchmark/bench_qknorm.py
index d38fe0b539dd..e5458385cd7d 100644
--- a/python/sglang/jit_kernel/benchmark/bench_qknorm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_qknorm.py
@@ -13,6 +13,9 @@
 )
 from sglang.jit_kernel.norm import fused_inplace_qknorm
 from sglang.srt.utils import get_current_device_stream_fast
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=10, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 alt_stream = torch.cuda.Stream()
 
diff --git a/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py b/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py
index cbcff1031413..9bd05f7fc10a 100644
--- a/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py
+++ b/python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py
@@ -9,8 +9,11 @@
 from sglang.jit_kernel.benchmark.utils import run_benchmark
 from sglang.jit_kernel.norm import fused_inplace_qknorm_across_heads
 from sglang.srt.utils import get_current_device_stream_fast
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=12, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 IS_CI = is_in_ci()
 
 alt_stream = torch.cuda.Stream()
diff --git a/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py b/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py
index 42552a805b5b..c4713d56ae1b 100644
--- a/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py
+++ b/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py
@@ -9,8 +9,11 @@
     fuse_layernorm_scale_shift_gate_select01_kernel,
     fuse_residual_layernorm_scale_shift_gate_select01_kernel,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=13, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 if is_in_ci():
     B_RANGE, S_RANGE, D_RANGE = [1], [128], [3072]
 else:
diff --git a/python/sglang/jit_kernel/benchmark/bench_renorm.py b/python/sglang/jit_kernel/benchmark/bench_renorm.py
index 20cc9ae71a6a..cd4ab36b4326 100644
--- a/python/sglang/jit_kernel/benchmark/bench_renorm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_renorm.py
@@ -6,8 +6,11 @@
 import triton.testing
 
 from sglang.jit_kernel.benchmark.utils import run_benchmark_no_cudagraph
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.utils import is_in_ci
 
+register_cuda_ci(est_time=5, suite="stage-b-kernel-benchmark-1-gpu-large")
+
 
 def torch_top_k_renorm_probs(probs, top_k):
     """Vectorized PyTorch implementation of top-k renormalization."""
diff --git a/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py b/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py
index 31430442ce67..f56c8df2a981 100644
--- a/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py
+++ b/python/sglang/jit_kernel/benchmark/bench_resolve_future_token_ids.py
@@ -11,6 +11,9 @@
 )
 from sglang.jit_kernel.resolve_future_token_ids import resolve_future_token_ids_cuda
 from sglang.srt.utils import get_compiler_backend
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=10, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 SIZE_LIST = get_benchmark_range(
     full_range=[2**n for n in range(4, 16)],  # 16 … 32K elements
diff --git a/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py b/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py
index b55a2c06be8b..779b8ad7e207 100644
--- a/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py
+++ b/python/sglang/jit_kernel/benchmark/bench_rmsnorm.py
@@ -13,6 +13,9 @@
     run_benchmark,
 )
 from sglang.jit_kernel.norm import rmsnorm as jit_rmsnorm
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=21, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 
 def sglang_aot_rmsnorm(
diff --git a/python/sglang/jit_kernel/benchmark/bench_rope.py b/python/sglang/jit_kernel/benchmark/bench_rope.py
index 2ca71d4ca456..afe591185b60 100644
--- a/python/sglang/jit_kernel/benchmark/bench_rope.py
+++ b/python/sglang/jit_kernel/benchmark/bench_rope.py
@@ -10,6 +10,9 @@
     get_benchmark_range,
     run_benchmark,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=6, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 MAX_SEQ_LEN = 131072
 ROPE_BASE = 10000.0
diff --git a/python/sglang/jit_kernel/benchmark/bench_store_cache.py b/python/sglang/jit_kernel/benchmark/bench_store_cache.py
index 700274772977..f1399ff0efe9 100644
--- a/python/sglang/jit_kernel/benchmark/bench_store_cache.py
+++ b/python/sglang/jit_kernel/benchmark/bench_store_cache.py
@@ -12,6 +12,9 @@
     get_benchmark_range,
 )
 from sglang.jit_kernel.kvcache import store_cache
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=9, suite="stage-b-kernel-benchmark-1-gpu-large")
 
 
 def sglang_jit_store_cache(
diff --git a/python/sglang/jit_kernel/norm.py b/python/sglang/jit_kernel/norm.py
index 3366f3871a67..4aef33c20d8b 100644
--- a/python/sglang/jit_kernel/norm.py
+++ b/python/sglang/jit_kernel/norm.py
@@ -6,9 +6,6 @@
 import torch
 
 from sglang.jit_kernel.debug_utils import maybe_wrap_jit_kernel_debug
-
-logger = logging.getLogger(__name__)
-
 from sglang.jit_kernel.utils import (
     cache_once,
     is_arch_support_pdl,
@@ -20,6 +17,9 @@
     from tvm_ffi.module import Module
 
 
+logger = logging.getLogger(__name__)
+
+
 @cache_once
 def _jit_qknorm_module(head_dim: int, dtype: torch.dtype) -> Module:
     args = make_cpp_args(head_dim, is_arch_support_pdl(), dtype)
diff --git a/python/sglang/jit_kernel/tests/test_add_constant.py b/python/sglang/jit_kernel/tests/test_add_constant.py
index 8379c18b1d7c..cad9ac3abd97 100644
--- a/python/sglang/jit_kernel/tests/test_add_constant.py
+++ b/python/sglang/jit_kernel/tests/test_add_constant.py
@@ -4,6 +4,10 @@
 import torch
 
 from sglang.jit_kernel.add_constant import add_constant
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=45, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=180, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 @pytest.mark.parametrize("size", [1, 2, 127, 128, 1024, 1025])
diff --git a/python/sglang/jit_kernel/tests/test_awq_dequantize.py b/python/sglang/jit_kernel/tests/test_awq_dequantize.py
index d90fafa14970..debd97729621 100644
--- a/python/sglang/jit_kernel/tests/test_awq_dequantize.py
+++ b/python/sglang/jit_kernel/tests/test_awq_dequantize.py
@@ -5,6 +5,10 @@
 import torch
 
 from sglang.jit_kernel.awq_dequantize import awq_dequantize as jit_awq_dequantize
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=9, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 try:
     from sgl_kernel import awq_dequantize as aot_awq_dequantize
diff --git a/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py b/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py
index a7ff242df3b3..2a0f9354ecd9 100644
--- a/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py
+++ b/python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py
@@ -9,6 +9,10 @@
     awq_marlin_moe_repack as jit_awq_marlin_moe_repack,
 )
 from sglang.srt.layers.quantization.utils import pack_cols, quantize_weights
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def _has_aot_awq_marlin_moe_repack() -> bool:
diff --git a/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py b/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py
index a3ee38808951..35c23922dc8c 100644
--- a/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py
+++ b/python/sglang/jit_kernel/tests/test_awq_marlin_repack.py
@@ -9,8 +9,12 @@
     awq_marlin_repack as jit_awq_marlin_repack,
 )
 from sglang.srt.layers.quantization.utils import pack_cols, quantize_weights
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_marlin_utils import get_weight_perm, marlin_weights
 
+register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def _has_aot_awq_marlin_repack() -> bool:
     return hasattr(torch.ops.sgl_kernel, "awq_marlin_repack") and hasattr(
diff --git a/python/sglang/jit_kernel/tests/test_clamp_position.py b/python/sglang/jit_kernel/tests/test_clamp_position.py
index 3fdc8c426562..cb3ec6ce595f 100644
--- a/python/sglang/jit_kernel/tests/test_clamp_position.py
+++ b/python/sglang/jit_kernel/tests/test_clamp_position.py
@@ -4,6 +4,10 @@
 import torch
 
 from sglang.jit_kernel.clamp_position import clamp_position_cuda
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=12, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def _reference_clamp_position(seq_lens):
diff --git a/python/sglang/jit_kernel/tests/test_concat_mla.py b/python/sglang/jit_kernel/tests/test_concat_mla.py
index 45f35e2488f4..9ecfb654cac8 100644
--- a/python/sglang/jit_kernel/tests/test_concat_mla.py
+++ b/python/sglang/jit_kernel/tests/test_concat_mla.py
@@ -5,6 +5,11 @@
 import torch
 import triton
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=17, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def torch_concat_mla_k(
     k: torch.Tensor, k_nope: torch.Tensor, k_rope: torch.Tensor
diff --git a/python/sglang/jit_kernel/tests/test_custom_all_reduce.py b/python/sglang/jit_kernel/tests/test_custom_all_reduce.py
index e1f08dbb9863..bf0cfa38d775 100644
--- a/python/sglang/jit_kernel/tests/test_custom_all_reduce.py
+++ b/python/sglang/jit_kernel/tests/test_custom_all_reduce.py
@@ -18,6 +18,7 @@
 import logging
 import os
 import subprocess
+import sys
 from typing import Optional
 
 import pytest
@@ -30,6 +31,19 @@
 from sglang.srt.distributed.device_communicators.custom_all_reduce_v2 import (
     CustomAllReduceV2,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(
+    est_time=120,
+    suite="stage-b-kernel-unit-1-gpu-large",
+    disabled="requires multi-GPU distributed setup",
+)
+register_cuda_ci(
+    est_time=120,
+    suite="nightly-kernel-1-gpu",
+    nightly=True,
+    disabled="requires multi-GPU distributed setup",
+)
 
 # ---------------------------------------------------------------------------
 # Test parameters (shared between test class and worker)
@@ -224,4 +238,7 @@ def worker_main() -> None:
 
 
 if __name__ == "__main__":
-    worker_main()
+    if "LOCAL_RANK" in os.environ:  # env marks a worker process (TODO confirm)
+        worker_main()
+    else:  # direct invocation: run this file's tests under pytest
+        sys.exit(pytest.main([__file__, "-v", "-s"]))
diff --git a/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py b/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py
index fc7a8e19b00c..83e47024c7a6 100644
--- a/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py
+++ b/python/sglang/jit_kernel/tests/test_cutedsl_gdn.py
@@ -6,6 +6,8 @@
 import pytest
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
 try:
     import cuda.bindings.driver as cuda_driver
     import cutlass  # noqa: F401
@@ -27,6 +29,9 @@
 except ImportError:
     TRITON_AVAILABLE = False
 
+register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def run_triton_kernel(A_log, dt_bias, q, k, v, a, b, initial_state, indices, scale):
     return fused_sigmoid_gating_delta_rule_update(
diff --git a/python/sglang/jit_kernel/tests/test_flash_attention_4.py b/python/sglang/jit_kernel/tests/test_flash_attention_4.py
index 9785a2ec1cf0..e1453b8f2323 100644
--- a/python/sglang/jit_kernel/tests/test_flash_attention_4.py
+++ b/python/sglang/jit_kernel/tests/test_flash_attention_4.py
@@ -12,6 +12,10 @@
 from einops import rearrange, repeat
 
 from sglang.jit_kernel.flash_attention_v4 import flash_attn_varlen_func
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=120, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=900, suite="nightly-kernel-1-gpu", nightly=True)
 
 # Skip this test on Hopper machine
 skip_condition = torch.cuda.get_device_capability() < (10, 0)
diff --git a/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py b/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py
index a16b2650c240..49636c4f5444 100644
--- a/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py
+++ b/python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py
@@ -5,6 +5,10 @@
 import torch
 
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def sglang_jit_fused_add_rmsnorm(
diff --git a/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py b/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py
index cc0f05904c34..f0fb78c1f60e 100644
--- a/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py
+++ b/python/sglang/jit_kernel/tests/test_fused_metadata_copy.py
@@ -14,6 +14,11 @@
 import pytest
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=100, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=400, suite="nightly-kernel-1-gpu", nightly=True)
+
 # =============================================================================
 # Helper Functions
 # =============================================================================
diff --git a/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py b/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py
index c54fcfd5a1a9..42c3371ff890 100644
--- a/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py
+++ b/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py
@@ -10,6 +10,10 @@
     fused_norm_scale_shift,
     fused_scale_residual_norm_scale_shift,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 DEVICE = "cuda"
 SHAPE_MAP = {
diff --git a/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py b/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py
index 4f38c558d0e9..5766f3168b5a 100644
--- a/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py
+++ b/python/sglang/jit_kernel/tests/test_fused_store_index_cache.py
@@ -22,6 +22,8 @@
 import pytest
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
 try:
     from sglang.jit_kernel.fused_store_index_cache import (
         can_use_nsa_fused_store,
@@ -46,6 +48,9 @@
 except ImportError:
     _is_fp8_fnuz = False
 
+register_cuda_ci(est_time=24, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 PAGE_SIZE = 64
 HEAD_DIM = 128
 FP8_E4M3_MAX = 448.0
diff --git a/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py b/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py
index 1545f3e6b3e7..08db51289b25 100644
--- a/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py
+++ b/python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py
@@ -11,6 +11,8 @@
 import pytest
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
 try:
     from sglang.srt.layers.attention.fla.fused_gdn_gating import fused_gdn_gating
     from sglang.srt.layers.attention.fla.fused_recurrent import (
@@ -24,6 +26,9 @@
 except ImportError:
     KERNELS_AVAILABLE = False
 
+register_cuda_ci(est_time=6, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def _make_tensors(N, T, H, HV, K, V, device="cuda", seed=2025):
     """Create input tensors for GDN target_verify."""
diff --git a/python/sglang/jit_kernel/tests/test_gptq_marlin.py b/python/sglang/jit_kernel/tests/test_gptq_marlin.py
index ff2b2c13c29c..a811ee654d78 100644
--- a/python/sglang/jit_kernel/tests/test_gptq_marlin.py
+++ b/python/sglang/jit_kernel/tests/test_gptq_marlin.py
@@ -6,8 +6,12 @@
 
 from sglang.jit_kernel.gptq_marlin import gptq_marlin_gemm
 from sglang.srt.layers.quantization.marlin_utils import marlin_make_workspace
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
+register_cuda_ci(est_time=13, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 MNK_FACTORS = [
     (1, 1, 1),
     (1, 4, 8),
diff --git a/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py b/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py
index 71ab5aafa9f4..4bcbc0bf3a4a 100644
--- a/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py
+++ b/python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py
@@ -10,8 +10,12 @@
     pack_rows,
     sort_weights,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_marlin_utils import get_weight_perm, marlin_weights
 
+register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 MARLIN_K_CHUNKS = [128]
 MARLIN_N_CHUNKS = [64, 256]
 
diff --git a/python/sglang/jit_kernel/tests/test_hadamard_jit.py b/python/sglang/jit_kernel/tests/test_hadamard_jit.py
index daaa7addbae8..cc03a01aec88 100644
--- a/python/sglang/jit_kernel/tests/test_hadamard_jit.py
+++ b/python/sglang/jit_kernel/tests/test_hadamard_jit.py
@@ -14,6 +14,10 @@
     hadamard_transform_28n,
     hadamard_transform_40n,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=128, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=512, suite="nightly-kernel-1-gpu", nightly=True)
 
 # Exact M×N Hadamard matrices (±1 entries) copied from
 # python/sglang/jit_kernel/csrc/fast-hadamard-transform/code_gen.py.
diff --git a/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py b/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py
index 415d2bf5a5f2..48688e5da033 100644
--- a/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py
+++ b/python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py
@@ -11,7 +11,8 @@
 from sglang.jit_kernel.moe_lora_align import moe_lora_align_block_size
 from sglang.test.ci.ci_register import register_cuda_ci
 
-register_cuda_ci(est_time=80, suite="stage-b-test-1-gpu-large")
+register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def round_up(x, base):
diff --git a/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py b/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py
index 90cb99a6f1df..5100aac5ceed 100644
--- a/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py
+++ b/python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py
@@ -7,8 +7,12 @@
 
 from sglang.jit_kernel.moe_wna16_marlin import moe_wna16_marlin_gemm
 from sglang.srt.layers.moe.fused_moe_triton import moe_align_block_size
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
+register_cuda_ci(est_time=10, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def _has_aot_moe_wna16_marlin_gemm() -> bool:
     return hasattr(torch.ops.sgl_kernel, "moe_wna16_marlin_gemm") and hasattr(
diff --git a/python/sglang/jit_kernel/tests/test_norm_jit.py b/python/sglang/jit_kernel/tests/test_norm_jit.py
index 1f8b41e6fbd1..ebd0d3034cd9 100644
--- a/python/sglang/jit_kernel/tests/test_norm_jit.py
+++ b/python/sglang/jit_kernel/tests/test_norm_jit.py
@@ -5,6 +5,11 @@
 import pytest
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=125, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=500, suite="nightly-kernel-1-gpu", nightly=True)
+
 # JIT rmsnorm: fp16/bf16 only
 # - Warp norm path (one warp per token):  hidden_size in {64, 128, 256}
 # - CTA norm path (multi-warp per token): hidden_size is a multiple of 256, > 256, and <=8192
diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py b/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py
index 6eb809d3385f..864636050098 100644
--- a/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py
+++ b/python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 import torch
 
@@ -6,6 +8,10 @@
     scaled_fp4_experts_quant,
     scaled_fp4_quant,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
@@ -125,3 +131,7 @@ def test_nvfp4_blockwise_moe_grouped_mm(dtype: torch.dtype) -> None:
         ref[start:end] = torch.matmul(a[start:end], b[i].t())
 
     torch.testing.assert_close(out, ref, atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v", "-s"]))
diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py b/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py
index 9a76cd8009d7..21c14d5b1c07 100644
--- a/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py
+++ b/python/sglang/jit_kernel/tests/test_nvfp4_gemm.py
@@ -1,7 +1,13 @@
+import sys
+
 import pytest
 import torch
 
 from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm, scaled_fp4_quant
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def _nvfp4_supported() -> bool:
@@ -140,3 +146,7 @@ def test_nvfp4_gemm(dtype: torch.dtype, shape: tuple[int, int, int]) -> None:
     )
 
     torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v", "-s"]))
diff --git a/python/sglang/jit_kernel/tests/test_nvfp4_quant.py b/python/sglang/jit_kernel/tests/test_nvfp4_quant.py
index eb7816408b60..5b7dfbd0f2e5 100644
--- a/python/sglang/jit_kernel/tests/test_nvfp4_quant.py
+++ b/python/sglang/jit_kernel/tests/test_nvfp4_quant.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 import torch
 
@@ -12,6 +14,11 @@
 except Exception:
     _sgl_silu_and_mul = None
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=5, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 def _nvfp4_supported() -> bool:
     return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0)
@@ -212,3 +219,7 @@ def test_silu_and_mul_quantize_to_fp4_grouped(shape: tuple[int, int, int]) -> No
         scale_ref = recover_swizzled_scales(ref_output_scales[i], m, k)
         scale_ans = recover_swizzled_scales(output_scales[i], m, k)
         torch.testing.assert_close(scale_ref[: mask[i]], scale_ans[: mask[i]])
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v", "-s"]))
diff --git a/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py b/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py
index ecc79d67b75b..cac76d03a783 100644
--- a/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py
+++ b/python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py
@@ -6,6 +6,10 @@
 import torch
 
 from sglang.jit_kernel.per_tensor_quant_fp8 import per_tensor_quant_fp8
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 try:
     from sglang.srt.utils import is_hip
diff --git a/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py b/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py
index dde8ccc6d7f9..8b0452d7d0ec 100644
--- a/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py
+++ b/python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py
@@ -23,6 +23,10 @@
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_8bit as triton_per_token_group_quant_8bit,
 )
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 configs = list(
     itertools.product(
diff --git a/python/sglang/jit_kernel/tests/test_pos_enc.py b/python/sglang/jit_kernel/tests/test_pos_enc.py
index 2bfee5619c9b..ad09fd706416 100644
--- a/python/sglang/jit_kernel/tests/test_pos_enc.py
+++ b/python/sglang/jit_kernel/tests/test_pos_enc.py
@@ -8,6 +8,10 @@
 import triton.language as tl
 
 from sglang.jit_kernel.rope import rotary_embedding
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=18, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 @triton.jit
diff --git a/python/sglang/jit_kernel/tests/test_qknorm.py b/python/sglang/jit_kernel/tests/test_qknorm.py
index 5bd16b4f30b3..4b85b5c52bb6 100644
--- a/python/sglang/jit_kernel/tests/test_qknorm.py
+++ b/python/sglang/jit_kernel/tests/test_qknorm.py
@@ -6,6 +6,10 @@
 import triton
 
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=37, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=148, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def sglang_aot_qknorm(
diff --git a/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py b/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py
index bf9e4991faef..81c8a1ead5bd 100644
--- a/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py
+++ b/python/sglang/jit_kernel/tests/test_qknorm_across_heads.py
@@ -6,6 +6,10 @@
 import triton
 
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=15, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def sglang_jit_qknorm_across_heads(
diff --git a/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py b/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py
index f536964f57e3..dce8ce947311 100644
--- a/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py
+++ b/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py
@@ -10,6 +10,10 @@
     fuse_residual_layernorm_scale_shift_gate_select01_kernel,
 )
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=15, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 DEVICE = "cuda"
 DTYPES = get_ci_test_range(
diff --git a/python/sglang/jit_kernel/tests/test_renorm.py b/python/sglang/jit_kernel/tests/test_renorm.py
index 870b46db7752..4def31326749 100644
--- a/python/sglang/jit_kernel/tests/test_renorm.py
+++ b/python/sglang/jit_kernel/tests/test_renorm.py
@@ -7,6 +7,11 @@
 import sgl_kernel
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=6, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
+
 
 @pytest.mark.parametrize("batch_size", [1, 99, 989])
 @pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
diff --git a/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py b/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py
index ad3225720006..cfb8597f6a3e 100644
--- a/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py
+++ b/python/sglang/jit_kernel/tests/test_resolve_future_token_ids.py
@@ -4,6 +4,10 @@
 import torch
 
 from sglang.jit_kernel.resolve_future_token_ids import resolve_future_token_ids_cuda
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=9, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def _reference_resolve(input_ids, future_map):
diff --git a/python/sglang/jit_kernel/tests/test_rmsnorm.py b/python/sglang/jit_kernel/tests/test_rmsnorm.py
index 138fece2183e..ac31a792747d 100644
--- a/python/sglang/jit_kernel/tests/test_rmsnorm.py
+++ b/python/sglang/jit_kernel/tests/test_rmsnorm.py
@@ -6,6 +6,10 @@
 import triton
 
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=18, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 
 def sglang_jit_rmsnorm(input: torch.Tensor, weight: torch.Tensor) -> None:
diff --git a/python/sglang/jit_kernel/tests/test_rope.py b/python/sglang/jit_kernel/tests/test_rope.py
index abc482f5c28f..62a601f653a4 100644
--- a/python/sglang/jit_kernel/tests/test_rope.py
+++ b/python/sglang/jit_kernel/tests/test_rope.py
@@ -5,6 +5,10 @@
 import triton
 
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=64, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=256, suite="nightly-kernel-1-gpu", nightly=True)
 
 DEVICE = "cuda"
 DTYPE = torch.bfloat16
diff --git a/python/sglang/jit_kernel/tests/test_store_cache.py b/python/sglang/jit_kernel/tests/test_store_cache.py
index cd6887042bb9..278781ca2420 100644
--- a/python/sglang/jit_kernel/tests/test_store_cache.py
+++ b/python/sglang/jit_kernel/tests/test_store_cache.py
@@ -6,6 +6,10 @@
 
 from sglang.jit_kernel.kvcache import can_use_store_cache, store_cache
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=28, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 BS_LIST = [2**n for n in range(0, 15)]
 BS_LIST += [x + 1 + i for i, x in enumerate(BS_LIST)]
diff --git a/python/sglang/jit_kernel/tests/test_timestep_embedding.py b/python/sglang/jit_kernel/tests/test_timestep_embedding.py
index bc900291ab57..1ec5912d4778 100644
--- a/python/sglang/jit_kernel/tests/test_timestep_embedding.py
+++ b/python/sglang/jit_kernel/tests/test_timestep_embedding.py
@@ -14,6 +14,10 @@
     timestep_embedding as timestep_embedding_cuda,
 )
 from sglang.jit_kernel.utils import get_ci_test_range
+from sglang.test.ci.ci_register import register_cuda_ci
+
+register_cuda_ci(est_time=16, suite="stage-b-kernel-unit-1-gpu-large")
+register_cuda_ci(est_time=120, suite="nightly-kernel-1-gpu", nightly=True)
 
 CORRECTNESS_BATCH_SIZES = get_ci_test_range(
     [1, 2, 8, 128, 256, 512, 1536, 2048, 4096, 11008, 16384],
diff --git a/scripts/version_branch_to_tag.sh b/scripts/version_branch_to_tag.sh
deleted file mode 100755
index 9f587fb0b541..000000000000
--- a/scripts/version_branch_to_tag.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -euxo pipefail
-
-# This script is used for release.
-# It tags all remote branches starting with 'v' with the same name as the branch,
-# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
-
-git fetch origin --prune
-
-# List all branches starting with 'v'
-branches=$(git branch -r | grep 'origin/v' | sed 's/origin\///')
-
-# Loop through each branch
-for branch in $branches; do
-    echo "Processing branch: $branch"
-
-    # Get the commit hash for the branch
-    commit_hash=$(git rev-parse origin/$branch)
-
-    # Create a tag with the same name as the branch using the commit hash
-    git tag $branch $commit_hash
-
-    # Delete the branch from the remote
-    git push origin --delete $branch
-done
-
-# Push all tags to the remote repository
-git push --tags
-
-echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository."
diff --git a/test/run_suite.py b/test/run_suite.py
index 93fcaaede101..75c79ba09de0 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -41,6 +41,8 @@
         "stage-b-test-1-gpu-large",
         "stage-b-test-2-gpu-large",
         "stage-b-test-4-gpu-b200",
+        "stage-b-kernel-unit-1-gpu-large",
+        "stage-b-kernel-benchmark-1-gpu-large",
         "stage-c-test-4-gpu-h100",
         "stage-c-test-4-gpu-b200",
         "stage-c-test-4-gpu-gb200",
@@ -73,6 +75,7 @@
         "nightly-8-gpu-h200-basic",  # Basic tests for large models on H200
         "nightly-8-gpu-b200-basic",  # Basic tests for large models on B200
         "nightly-8-gpu-common",  # Common tests that run on both H200 and B200
+        "nightly-kernel-1-gpu",
         # Eval and perf suites (2-gpu)
         "nightly-eval-text-2-gpu",
         "nightly-eval-vlm-2-gpu",
@@ -170,9 +173,11 @@ def run_a_suite(args):
     auto_partition_id = args.auto_partition_id
     auto_partition_size = args.auto_partition_size
 
-    # All tests (per-commit and nightly) are now in registered/
     # Use absolute paths so the script works from any working directory
     script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(script_dir)
+
+    # Registered tests under test/registered/
     files = [
         f
         for f in glob.glob(
@@ -180,7 +185,13 @@ def run_a_suite(args):
         )
         if not f.endswith("/conftest.py") and not f.endswith("/__init__.py")
     ]
-    # Strict: all registered files must have proper registration
+
+    # JIT kernel tests and benchmarks (live alongside kernel source)
+    jit_kernel_dir = os.path.join(repo_root, "python", "sglang", "jit_kernel")
+    files += glob.glob(os.path.join(jit_kernel_dir, "tests", "test_*.py"))
+    files += glob.glob(os.path.join(jit_kernel_dir, "benchmark", "bench_*.py"))
+
+    # Strict: all discovered files must have proper registration
     sanity_check = True
 
     all_tests = collect_tests(files, sanity_check=sanity_check)