diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 9f7b8140b6bd..418260e1e399 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -425,7 +425,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        partition: [0, 1]
+        partition: [0, 1, 2]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -446,7 +446,44 @@ jobs:
         timeout-minutes: 30
         run: |
           cd test/
-          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2
+          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 3
+
+  stage-b-test-2-gpu:  # NOTE(review): job id omits "small" but the suite run below is stage-b-test-small-2-gpu - confirm naming
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]  # their results are re-checked in the if: below, which starts with always()
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-2-gpu') ||
+        (
+          !inputs.target_stage &&
+          (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    runs-on: 2-gpu-runner  # runner label; presumably a machine with 2 GPUs - verify
+    env:
+      RUNNER_LABELS: 2-gpu-runner  # NOTE(review): consumer of this variable is not visible here; kept equal to runs-on
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'  # skipped unless check-changes set sgl_kernel to 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true  # place files from all matching artifacts directly in path, no per-artifact subdirectories
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Install dependencies
+        run: |
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30  # the step is failed if the suite runs longer than 30 minutes
+        run: |
+          cd test/
+          python3 run_suite.py --hw cuda --suite stage-b-test-small-2-gpu
 
   multimodal-gen-test-1-gpu:
     needs: [check-changes, call-gate, sgl-kernel-build-wheels]
@@ -1326,6 +1363,7 @@ jobs:
 
         stage-a-test-1,
         stage-b-test-small-1-gpu,
+        stage-b-test-2-gpu,
         quantization-test,
         unit-test-backend-1-gpu,
         unit-test-backend-2-gpu,
diff --git a/python/sglang/test/ci/__init__.py b/python/sglang/test/ci/__init__.py
new file mode 100644
index 000000000000..595f4b79d0d0
--- /dev/null
+++ b/python/sglang/test/ci/__init__.py
@@ -0,0 +1 @@
+"""CI utilities for SGLang test infrastructure."""
diff --git a/test/lora_utils.py b/python/sglang/test/lora_utils.py
similarity index 100%
rename from test/lora_utils.py
rename to python/sglang/test/lora_utils.py
diff --git a/scripts/ci/slash_command_handler.py b/scripts/ci/slash_command_handler.py
index fa3bb8557ada..f43505d0f97e 100644
--- a/scripts/ci/slash_command_handler.py
+++ b/scripts/ci/slash_command_handler.py
@@ -144,6 +144,7 @@ def handle_rerun_stage(
     nvidia_stages = [
         "stage-a-test-1",
         "stage-b-test-small-1-gpu",
+        "stage-b-test-2-gpu",
         "multimodal-gen-test-1-gpu",
         "multimodal-gen-test-2-gpu",
         "quantization-test",
diff --git a/test/srt/lora/test_lora.py b/test/registered/lora/test_lora.py
similarity index 87%
rename from test/srt/lora/test_lora.py
rename to test/registered/lora/test_lora.py
index 3dc08d2f16f4..6d5c2b340547 100644
--- a/test/srt/lora/test_lora.py
+++ b/test/registered/lora/test_lora.py
@@ -14,22 +14,18 @@
 
 import multiprocessing as mp
 import os
-import sys
 import unittest
-from pathlib import Path
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
-
-from lora_utils import (
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import (
     ALL_OTHER_MULTI_LORA_MODELS,
     CI_MULTI_LORA_MODELS,
     run_lora_multiple_batch_on_model_cases,
 )
-
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
+register_cuda_ci(est_time=82, suite="stage-b-test-small-1-gpu")
+
 
 class TestLoRA(CustomTestCase):
     def test_ci_lora_models(self):
diff --git a/test/srt/lora/test_lora_backend.py b/test/registered/lora/test_lora_backend.py
similarity index 91%
rename from test/srt/lora/test_lora_backend.py
rename to test/registered/lora/test_lora_backend.py
index bf9d5b75ca32..e8f9134f656e 100644
--- a/test/srt/lora/test_lora_backend.py
+++ b/test/registered/lora/test_lora_backend.py
@@ -14,16 +14,11 @@
 
 import multiprocessing as mp
 import os
-import sys
 import unittest
-from pathlib import Path
 from typing import List
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
-
-from lora_utils import (
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import (
     ALL_OTHER_LORA_MODELS,
     BACKENDS,
     CI_LORA_MODELS,
@@ -32,9 +27,10 @@
     LoRAModelCase,
     run_lora_test_one_by_one,
 )
-
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
+register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu")
+
 
 class TestLoRABackend(CustomTestCase):
 
diff --git a/test/srt/lora/test_lora_eviction.py b/test/registered/lora/test_lora_eviction.py
similarity index 98%
rename from test/srt/lora/test_lora_eviction.py
rename to test/registered/lora/test_lora_eviction.py
index 78cdd8282fe0..7d9fb6f2e5aa 100644
--- a/test/srt/lora/test_lora_eviction.py
+++ b/test/registered/lora/test_lora_eviction.py
@@ -19,9 +19,12 @@
 
 import torch
 
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.runners import SRTRunner
 from sglang.test.test_utils import CustomTestCase
 
+register_cuda_ci(est_time=224, suite="stage-b-test-small-1-gpu")
+
 PROMPTS = [
     "AI is a field of computer science focused on",
     """
diff --git a/test/nightly/test_lora_eviction_policy.py b/test/registered/lora/test_lora_eviction_policy.py
similarity index 100%
rename from test/nightly/test_lora_eviction_policy.py
rename to test/registered/lora/test_lora_eviction_policy.py
diff --git a/test/srt/lora/test_lora_hf_sgl_logprob_diff.py b/test/registered/lora/test_lora_hf_sgl_logprob_diff.py
similarity index 96%
rename from test/srt/lora/test_lora_hf_sgl_logprob_diff.py
rename to test/registered/lora/test_lora_hf_sgl_logprob_diff.py
index b0975fa5d666..10c03e68a814 100644
--- a/test/srt/lora/test_lora_hf_sgl_logprob_diff.py
+++ b/test/registered/lora/test_lora_hf_sgl_logprob_diff.py
@@ -28,28 +28,24 @@
 """
 
 import multiprocessing as mp
-import os
-import sys
 import unittest
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import torch
 
-# Add sglang to path if needed
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
-
 from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.runners import HFRunner, SRTRunner
 
-register_cuda_ci(est_time=300, suite="nightly-1-gpu", nightly=True)
-
-from sglang.test.test_utils import (
-    DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
-    CustomTestCase,
-    is_in_ci,
+register_cuda_ci(
+    est_time=300,
+    suite="nightly-1-gpu",
+    nightly=True,
+    disabled="Temporarily disabled, will be fixed later",
 )
 
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, CustomTestCase
+
 # Test configuration constants
 LORA_BACKEND = "triton"
 DISABLE_CUDA_GRAPH = False
@@ -510,10 +506,6 @@ def test_lora_logprob_comparison_basic(self):
         """
         Basic test comparing HF and SGLang LoRA logprobs with small model.
         """
-        # Use a smaller model and shorter prompts for CI
-        if is_in_ci():
-            self.skipTest("Skipping in CI environment - requires large models")
-
         model_path = "meta-llama/Llama-2-7b-hf"
         lora_paths = ["yushengsu/sglang_lora_logprob_diff_without_tuning"]
         prompts = DEFAULT_TEST_PROMPTS[:2]  # Use fewer prompts for faster testing
@@ -529,9 +521,6 @@ def test_lora_logprob_comparison_full(self):
         """
         Full test comparing HF and SGLang LoRA logprobs with all prompts.
         """
-        if is_in_ci():
-            self.skipTest("Skipping in CI environment - requires large models")
-
         model_path = "meta-llama/Llama-2-7b-hf"
         lora_paths = ["yushengsu/sglang_lora_logprob_diff_without_tuning"]
         prompts = DEFAULT_TEST_PROMPTS
diff --git a/test/nightly/test_lora_openai_api.py b/test/registered/lora/test_lora_openai_api.py
similarity index 100%
rename from test/nightly/test_lora_openai_api.py
rename to test/registered/lora/test_lora_openai_api.py
diff --git a/test/nightly/test_lora_openai_compatible.py b/test/registered/lora/test_lora_openai_compatible.py
similarity index 100%
rename from test/nightly/test_lora_openai_compatible.py
rename to test/registered/lora/test_lora_openai_compatible.py
diff --git a/test/nightly/test_lora_qwen3.py b/test/registered/lora/test_lora_qwen3.py
similarity index 88%
rename from test/nightly/test_lora_qwen3.py
rename to test/registered/lora/test_lora_qwen3.py
index 39d5a755efcf..da6999f422c8 100644
--- a/test/nightly/test_lora_qwen3.py
+++ b/test/registered/lora/test_lora_qwen3.py
@@ -13,22 +13,15 @@
 # ==============================================================================
 
 import multiprocessing as mp
-import sys
 import unittest
-from pathlib import Path
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-
-from lora_utils import (
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import (
     LoRAAdaptor,
     LoRAModelCase,
     run_lora_multiple_batch_on_model_cases,
 )
 
-from sglang.test.ci.ci_register import register_cuda_ci
-
 register_cuda_ci(est_time=97, suite="nightly-1-gpu", nightly=True)
 
 from sglang.test.test_utils import CustomTestCase
diff --git a/test/nightly/test_lora_radix_cache.py b/test/registered/lora/test_lora_radix_cache.py
similarity index 90%
rename from test/nightly/test_lora_radix_cache.py
rename to test/registered/lora/test_lora_radix_cache.py
index fee9bfce1d15..84ffd2cceda6 100644
--- a/test/nightly/test_lora_radix_cache.py
+++ b/test/registered/lora/test_lora_radix_cache.py
@@ -13,19 +13,12 @@
 # ==============================================================================
 
 import multiprocessing as mp
-import sys
 import unittest
-from pathlib import Path
 
 import torch
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-
-from lora_utils import CI_MULTI_LORA_MODELS, run_lora_test_one_by_one
-
 from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import CI_MULTI_LORA_MODELS, run_lora_test_one_by_one
 
 register_cuda_ci(est_time=200, suite="nightly-1-gpu", nightly=True)
 
diff --git a/test/srt/lora/test_lora_tp.py b/test/registered/lora/test_lora_tp.py
similarity index 91%
rename from test/srt/lora/test_lora_tp.py
rename to test/registered/lora/test_lora_tp.py
index 8e7b23f62dbf..c90979c9a3ac 100644
--- a/test/srt/lora/test_lora_tp.py
+++ b/test/registered/lora/test_lora_tp.py
@@ -14,16 +14,11 @@
 
 import multiprocessing as mp
 import os
-import sys
 import unittest
-from pathlib import Path
 from typing import List
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
-
-from lora_utils import (
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import (
     ALL_OTHER_LORA_MODELS,
     CI_LORA_MODELS,
     DEFAULT_PROMPTS,
@@ -31,9 +26,10 @@
     LoRAModelCase,
     run_lora_test_one_by_one,
 )
-
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
+register_cuda_ci(est_time=116, suite="stage-b-test-small-2-gpu")
+
 
 class TestLoRATP(CustomTestCase):
 
diff --git a/test/srt/lora/test_lora_update.py b/test/registered/lora/test_lora_update.py
similarity index 99%
rename from test/srt/lora/test_lora_update.py
rename to test/registered/lora/test_lora_update.py
index 9c3f0855033b..957c1d7cefcc 100644
--- a/test/srt/lora/test_lora_update.py
+++ b/test/registered/lora/test_lora_update.py
@@ -23,6 +23,7 @@
 import torch
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.runners import SRTRunner
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -33,6 +34,8 @@
     popen_launch_server,
 )
 
+register_cuda_ci(est_time=451, suite="stage-b-test-small-1-gpu")
+
 PROMPTS = [
     "SGL is a",
     "AI is a field of computer science focused on",
diff --git a/test/srt/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py
similarity index 90%
rename from test/srt/lora/test_multi_lora_backend.py
rename to test/registered/lora/test_multi_lora_backend.py
index 84def4813a9d..58dfdc16ca36 100644
--- a/test/srt/lora/test_multi_lora_backend.py
+++ b/test/registered/lora/test_multi_lora_backend.py
@@ -14,22 +14,18 @@
 
 import multiprocessing as mp
 import os
-import sys
 import unittest
-from pathlib import Path
 
-# Add test directory to path for lora_utils import
-# TODO: can be removed after migration
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
-
-from lora_utils import (
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.lora_utils import (
     ALL_OTHER_MULTI_LORA_MODELS,
     CI_MULTI_LORA_MODELS,
     run_lora_multiple_batch_on_model_cases,
 )
-
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
+register_cuda_ci(est_time=60, suite="stage-b-test-small-1-gpu")
+
 # All prompts are used at once in a batch.
 PROMPTS = [
     "AI is a field of computer science focused on",
diff --git a/test/run_suite.py b/test/run_suite.py
index 32a5fd744cd6..ed521752dd9c 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -19,7 +19,11 @@
 PER_COMMIT_SUITES = {
     HWBackend.CPU: ["default"],
     HWBackend.AMD: ["stage-a-test-1"],
-    HWBackend.CUDA: ["stage-a-test-1", "stage-b-test-small-1-gpu"],
+    HWBackend.CUDA: [
+        "stage-a-test-1",
+        "stage-b-test-small-1-gpu",
+        "stage-b-test-small-2-gpu",
+    ],
     HWBackend.NPU: [],
 }
 
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 91c224e89459..81b5d3ed5a60 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -13,11 +13,6 @@
         TestFile("layers/attention/mamba/test_causal_conv1d.py", 25),
         TestFile("layers/attention/mamba/test_mamba_ssm.py", 7),
         TestFile("layers/attention/mamba/test_mamba_ssm_ssd.py", 13),
-        TestFile("lora/test_lora.py", 82),
-        TestFile("lora/test_lora_eviction.py", 224),
-        TestFile("lora/test_lora_update.py", 451),
-        TestFile("lora/test_lora_backend.py", 200),
-        TestFile("lora/test_multi_lora_backend.py", 60),
         TestFile("models/test_compressed_tensors_models.py", 42),
         TestFile("models/test_cross_encoder_models.py", 100),
         TestFile("models/test_embedding_models.py", 73),
@@ -133,7 +128,6 @@
         TestFile("hicache/test_hicache_storage_file_backend.py", 200),
         TestFile("hicache/test_hicache_storage_mooncake_backend.py", 300),
         TestFile("layers/attention/mamba/test_mamba2_mixer.py", 50),
-        TestFile("lora/test_lora_tp.py", 116),
         TestFile("models/test_glm4_moe_models.py", 100),
         TestFile("models/test_kimi_linear_models.py", 90),
         TestFile("rl/test_update_weights_from_distributed.py", 103),
@@ -201,10 +195,8 @@
         TestFile("test_quantization.py", 185),
         TestFile("test_gguf.py", 96),
     ],
-    # Nightly test suites have been moved to test/run_suite_nightly.py
     "__not_in_ci__": [
         TestFile("test_release_memory_occupation.py", 200),  # Temporarily disabled
-        TestFile("lora/test_lora_hf_sgl_logprob_diff.py"),  # Nightly test
         TestFile("models/test_dummy_grok_models.py"),
         TestFile(
             "rl/test_update_weights_from_disk.py"
@@ -228,12 +220,10 @@
         # TestFile("hicache/test_hicache.py", 116), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/12575
         # TestFile("hicache/test_hicache_mla.py", 127), # Disabled temporarily,  # Temporarily disabled, see https://github.com/sgl-project/sglang/issues/12574
         # TestFile("hicache/test_hicache_storage.py", 127), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/12575
-        TestFile("lora/test_lora.py", 665),
+        # TODO: LoRA tests moved to test/registered/lora/; test_lora.py, test_lora_eviction.py and test_multi_lora_backend.py must be re-registered for AMD there.
         # TestFile("lora/test_lora_backend.py", 99), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
         # TestFile("lora/test_lora_cuda_graph.py", 250), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
-        TestFile("lora/test_lora_eviction.py", 240),
         # TestFile("lora/test_lora_qwen3.py", 97), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
-        TestFile("lora/test_multi_lora_backend.py", 60),
         TestFile("models/test_compressed_tensors_models.py", 42),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 132),
@@ -308,7 +298,7 @@
         TestFile("test_mla.py", 242),
     ],
     "per-commit-2-gpu-amd": [
-        # TestFile("lora/test_lora_tp.py", 116), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
+        # TestFile("lora/test_lora_tp.py", 116), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107 (file moved to test/registered/lora/test_lora_tp.py)
         TestFile("rl/test_update_weights_from_distributed.py", 103),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_load_weights_from_remote_instance.py", 72),