diff --git a/.claude/skills/write-sglang-test/SKILL.md b/.claude/skills/write-sglang-test/SKILL.md
index c2d3e587b3f5..0a24772fe48a 100644
--- a/.claude/skills/write-sglang-test/SKILL.md
+++ b/.claude/skills/write-sglang-test/SKILL.md
@@ -11,14 +11,14 @@ description: Guide for writing SGLang CI/UT tests. Covers CustomTestCase, CI reg
 
 1. **Always use `CustomTestCase`** — never raw `unittest.TestCase`. It ensures `tearDownClass` runs even when `setUpClass` fails, preventing resource leaks in CI.
 2. **`tearDownClass` must be defensive** — use `hasattr`/null checks before accessing resources (e.g. `cls.process`) that `setUpClass` may not have finished allocating.
-3. **Place tests in `test/registered/<category>/`** — except JIT kernel tests and benchmarks, which live in `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/`
+3. **Place tests in `test/registered/<category>/`** — except JIT kernel tests and benchmarks, which live in `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/` (nested subfolders such as `diffusion/` are allowed).
 4. **Reuse server fixtures** — inherit from `DefaultServerBase` or write `setUpClass`/`tearDownClass` with `popen_launch_server`
 5. **Prefer mock over real server** — when testing logic that doesn't need a server / engine launch (middleware, request routing, config validation, argument parsing), use `unittest.mock.patch` / `MagicMock` and place tests in `test/registered/unit/`. Only launch a real server when the test genuinely needs inference results or server lifecycle behavior.
 
 JIT kernel exception:
 - If the task is adding or updating code under `python/sglang/jit_kernel/`, prefer the `add-jit-kernel` skill first.
-- JIT kernel correctness tests use `python/sglang/jit_kernel/tests/test_*.py`.
-- JIT kernel benchmarks use `python/sglang/jit_kernel/benchmark/bench_*.py`.
+- JIT kernel correctness tests use `python/sglang/jit_kernel/tests/**/test_*.py`.
+- JIT kernel benchmarks use `python/sglang/jit_kernel/benchmark/**/bench_*.py`.
 - Those files are still executed by `test/run_suite.py`, but through dedicated kernel suites rather than `test/registered/`.
 
 ---
diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml
index 710bd6834705..99847a19fe50 100644
--- a/.github/workflows/pr-test-amd-rocm720.yml
+++ b/.github/workflows/pr-test-amd-rocm720.yml
@@ -156,6 +156,8 @@ jobs:
               - "python/sglang/multimodal_gen/**"
               - "python/sglang/cli/**"
               - "python/sglang/jit_kernel/diffusion/**"
+              - "python/sglang/jit_kernel/tests/diffusion/**"
+              - "python/sglang/jit_kernel/benchmark/diffusion/**"
               - "python/pyproject_rocm.toml"
               - "python/pyproject_other.toml"
 
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 98a47f05bfa1..3c778f4b933c 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -154,6 +154,8 @@ jobs:
               - "python/sglang/multimodal_gen/**"
               - "python/sglang/cli/**"
               - "python/sglang/jit_kernel/diffusion/**"
+              - "python/sglang/jit_kernel/tests/diffusion/**"
+              - "python/sglang/jit_kernel/benchmark/diffusion/**"
               - "python/pyproject_rocm.toml"
               - "python/pyproject_other.toml"
 
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 3180adb32379..fe47cf71dc07 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -130,7 +130,9 @@ jobs:
               - ".github/workflows/pr-test-multimodal-gen.yml"
               - "python/pyproject.toml"
               - "python/sglang/multimodal_gen/**/*.!(md|ipynb)"
-              - "python/sglang/jit_kernel/**"
+              - "python/sglang/jit_kernel/diffusion/**"
+              - "python/sglang/jit_kernel/tests/diffusion/**"
+              - "python/sglang/jit_kernel/benchmark/diffusion/**"
               - "python/sglang/cli/**"
             jit_kernel:
               - ".github/workflows/pr-test.yml"
@@ -197,8 +199,8 @@ jobs:
             echo "jit_kernel=false" >> $GITHUB_OUTPUT
           fi
 
-          # Check for multimodal_gen changes
-          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/multimodal_gen/|python/sglang/cli/|python/pyproject\.toml|\.github/workflows/pr-test\.yml|\.github/workflows/pr-test-multimodal-gen\.yml)"; then
+          # Check for multimodal_gen changes, including diffusion-specific jit_kernel coverage
+          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/multimodal_gen/|python/sglang/cli/|python/sglang/jit_kernel/diffusion/|python/sglang/jit_kernel/tests/diffusion/|python/sglang/jit_kernel/benchmark/diffusion/|python/pyproject\.toml|\.github/workflows/pr-test\.yml|\.github/workflows/pr-test-multimodal-gen\.yml)"; then
             echo "multimodal_gen=true" >> $GITHUB_OUTPUT
             echo "Detected multimodal_gen changes"
           else
diff --git a/python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py b/python/sglang/jit_kernel/benchmark/diffusion/bench_fused_norm_scale_shift.py
similarity index 100%
rename from python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py
rename to python/sglang/jit_kernel/benchmark/diffusion/bench_fused_norm_scale_shift.py
diff --git a/python/sglang/jit_kernel/benchmark/bench_norm_impls.py b/python/sglang/jit_kernel/benchmark/diffusion/bench_norm_impls.py
similarity index 100%
rename from python/sglang/jit_kernel/benchmark/bench_norm_impls.py
rename to python/sglang/jit_kernel/benchmark/diffusion/bench_norm_impls.py
diff --git a/python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py b/python/sglang/jit_kernel/benchmark/diffusion/bench_qwen_image_modulation.py
similarity index 100%
rename from python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py
rename to python/sglang/jit_kernel/benchmark/diffusion/bench_qwen_image_modulation.py
diff --git a/python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py b/python/sglang/jit_kernel/tests/diffusion/test_fused_norm_scale_shift.py
similarity index 100%
rename from python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py
rename to python/sglang/jit_kernel/tests/diffusion/test_fused_norm_scale_shift.py
diff --git a/python/sglang/jit_kernel/tests/test_qwen_image_modulation.py b/python/sglang/jit_kernel/tests/diffusion/test_qwen_image_modulation.py
similarity index 100%
rename from python/sglang/jit_kernel/tests/test_qwen_image_modulation.py
rename to python/sglang/jit_kernel/tests/diffusion/test_qwen_image_modulation.py
diff --git a/test/README.md b/test/README.md
index 26e00c45d2a3..d299f77403ad 100644
--- a/test/README.md
+++ b/test/README.md
@@ -81,7 +81,7 @@ Here is an illustration
 
 
 ## Folder organization
-- `registered`: The registered test files. They are run in CI. Most tests should live in this folder. The main exception is JIT kernel coverage, which lives under `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/`.
+- `registered`: The registered test files. They are run in CI. Most tests should live in this folder. The main exception is JIT kernel coverage, which lives under `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/` (including nested subfolders such as `diffusion/`).
 - `manual`: Test files that CI does not run; you run them manually. Typically, these are temporary tests, deprecated tests, or tests that are not suitable for CI—such as those that take too long or require special setup. We would still like to keep some files here for anyone who wants to run them locally.
 - `run_suite.py`: The launch script to run a test suite. It scans `test/registered/` and also the JIT kernel test / benchmark directories.
 - Other: utility scripts and metadata folders. The `srt` folder holds our legacy CI setup and should be deprecated as soon as possible.
@@ -142,7 +142,7 @@ python test/run_suite.py --hw cuda --suite stage-b-test-1-gpu-small \
 ## CI Registry System
 
 CI-discovered tests use a registry-based CI system for flexible backend and schedule configuration.
-This includes files under `test/registered/` and, for JIT kernels, files under `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/`.
+This includes files under `test/registered/` and, for JIT kernels, files under `python/sglang/jit_kernel/tests/` and `python/sglang/jit_kernel/benchmark/` (searched recursively, so nested subfolders are included).
 For every CI-discovered file you add, you need to register it in a suite and provide an estimated execution time in seconds.
 
 ### Registration Functions
@@ -180,8 +180,8 @@ register_cuda_ci(est_time=80, suite="stage-b-test-1-gpu-small", disabled="flaky
 
 JIT kernel files are discovered by `test/run_suite.py`, but they do not live under `test/registered/`:
 
-- Correctness tests: `python/sglang/jit_kernel/tests/test_*.py`
-- Benchmarks: `python/sglang/jit_kernel/benchmark/bench_*.py`
+- Correctness tests: `python/sglang/jit_kernel/tests/**/test_*.py`
+- Benchmarks: `python/sglang/jit_kernel/benchmark/**/bench_*.py`
 
 Use dedicated kernel suites:
 
diff --git a/test/run_suite.py b/test/run_suite.py
index b3e4b95dbea8..97b39ee2ea71 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -192,8 +192,12 @@ def run_a_suite(args):
 
     # JIT kernel tests and benchmarks (live alongside kernel source)
     jit_kernel_dir = os.path.join(repo_root, "python", "sglang", "jit_kernel")
-    files += glob.glob(os.path.join(jit_kernel_dir, "tests", "test_*.py"))
-    files += glob.glob(os.path.join(jit_kernel_dir, "benchmark", "bench_*.py"))
+    files += glob.glob(
+        os.path.join(jit_kernel_dir, "tests", "**", "test_*.py"), recursive=True
+    )
+    files += glob.glob(
+        os.path.join(jit_kernel_dir, "benchmark", "**", "bench_*.py"), recursive=True
+    )
 
     # Strict: all discovered files must have proper registration
     sanity_check = True