diff --git a/docs_new/docs/basic_usage/openai_api_completions.ipynb b/docs_new/docs/basic_usage/openai_api_completions.ipynb index ffa576ae52c5..8d417dab465d 100644 --- a/docs_new/docs/basic_usage/openai_api_completions.ipynb +++ b/docs_new/docs/basic_usage/openai_api_completions.ipynb @@ -374,7 +374,7 @@ "source": [ "#### Returning Routed Experts (MoE Models)\n", "\n", - "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`." + "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. By default this returns `[0, seqlen - 1)`, the full available sequence, because RL workflows need routed experts for every token. Set `routed_experts_start_len` in `extra_body` to an absolute prefix length to return only `[routed_experts_start_len, seqlen - 1)`. For example, in multi-turn RL rollouts, routed experts for tokens from previous turns have already been collected, so setting this value avoids unnecessary transfers that cause bottlenecks." ] }, { @@ -468,7 +468,7 @@ "source": [ "#### Returning Routed Experts (MoE Models)\n", "\n", - "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`." + "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. By default this returns `[0, seqlen - 1)`, the full available sequence, because RL workflows need routed experts for every token. Set `routed_experts_start_len` in `extra_body` to an absolute prefix length to return only `[routed_experts_start_len, seqlen - 1)`. For example, in multi-turn RL rollouts, routed experts for tokens from previous turns have already been collected, so setting this value avoids unnecessary transfers that cause bottlenecks." ] }, { diff --git a/docs_new/docs/basic_usage/openai_api_completions.mdx b/docs_new/docs/basic_usage/openai_api_completions.mdx index c463fcca5ac7..0cc08ad8ad96 100644 --- a/docs_new/docs/basic_usage/openai_api_completions.mdx +++ b/docs_new/docs/basic_usage/openai_api_completions.mdx @@ -342,7 +342,7 @@ for chunk in stream: #### Returning Routed Experts (MoE Models) -For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. 
+For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. By default this returns `[0, seqlen - 1)`, the full available sequence, because RL workflows need routed experts for every token. Set `routed_experts_start_len` in `extra_body` to an absolute prefix length to return only `[routed_experts_start_len, seqlen - 1)`. For example, in multi-turn RL rollouts, routed experts for tokens from previous turns have already been collected, so setting this value avoids unnecessary transfers that cause bottlenecks. ```python Example # Example with logit_bias parameter for completions API @@ -406,7 +406,7 @@ print_highlight(f"Response: {response}") #### Returning Routed Experts (MoE Models) -For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. +For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. By default this returns `[0, seqlen - 1)`, the full available sequence, because RL workflows need routed experts for every token. Set `routed_experts_start_len` in `extra_body` to an absolute prefix length to return only `[routed_experts_start_len, seqlen - 1)`. For example, in multi-turn RL rollouts, routed experts for tokens from previous turns have already been collected, so setting this value avoids unnecessary transfers that cause bottlenecks. ## Structured Outputs (JSON, Regex, EBNF) diff --git a/docs_new/docs/basic_usage/sampling_params.mdx b/docs_new/docs/basic_usage/sampling_params.mdx index 4b271a229c9c..1e49a7783d75 100644 --- a/docs_new/docs/basic_usage/sampling_params.mdx +++ b/docs_new/docs/basic_usage/sampling_params.mdx @@ -107,7 +107,12 @@ The `/generate` endpoint accepts the following parameters in JSON format. For de return_routed_experts `bool = False` - Whether to return routed experts for MoE models. Requires `--enable-return-routed-experts` server flag. Returns base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. + Whether to return routed experts for MoE models. Requires `--enable-return-routed-experts` server flag. With the default `routed_experts_start_len=0`, returns the full available sequence `[0, seqlen - 1)` because RL workflows need routed experts for every token. The result is base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. + + + routed_experts_start_len + `int = 0` + If `return_routed_experts` is set, the absolute start position of the returned routed-experts rows. `0` preserves the default full sequence; set it to an accumulated prefix length to return only `[routed_experts_start_len, seqlen - 1)`. 
For example, in multi-turn RL rollouts, routed experts for tokens from previous turns have already been collected, so setting this value avoids unnecessary transfers that cause bottlenecks. Must be in `[0, prompt_tokens]`. diff --git a/python/sglang/srt/disaggregation/encode_receiver.py b/python/sglang/srt/disaggregation/encode_receiver.py index 1a5028ef9dbd..754d9ad18406 100644 --- a/python/sglang/srt/disaggregation/encode_receiver.py +++ b/python/sglang/srt/disaggregation/encode_receiver.py @@ -973,6 +973,7 @@ def create_req(self, recv_req: TokenizedGenerateReqInput): require_reasoning=recv_req.require_reasoning, return_hidden_states=recv_req.return_hidden_states, return_routed_experts=recv_req.return_routed_experts, + routed_experts_start_len=recv_req.routed_experts_start_len, eos_token_ids=self.scheduler.model_config.hf_eos_token_id, bootstrap_host=recv_req.bootstrap_host, bootstrap_port=recv_req.bootstrap_port, diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 7f44c88d85c0..a94f9d7ed26d 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -333,6 +333,7 @@ def generate( custom_logit_processor: Optional[Union[List[str], str]] = None, return_hidden_states: bool = False, return_routed_experts: bool = False, + routed_experts_start_len: int = 0, stream: bool = False, bootstrap_host: Optional[Union[List[str], str]] = None, bootstrap_port: Optional[Union[List[int], int]] = None, @@ -369,6 +370,7 @@ def generate( custom_logit_processor=custom_logit_processor, return_hidden_states=return_hidden_states, return_routed_experts=return_routed_experts, + routed_experts_start_len=routed_experts_start_len, stream=stream, bootstrap_host=bootstrap_host, bootstrap_port=bootstrap_port, @@ -423,6 +425,7 @@ async def async_generate( custom_logit_processor: Optional[Union[List[str], str]] = None, return_hidden_states: bool = False, return_routed_experts: bool = False, + routed_experts_start_len: int = 0, stream: bool = False, bootstrap_host: Optional[Union[List[str], str]] = None, bootstrap_port: Optional[Union[List[int], int]] = None, @@ -458,6 +461,7 @@ async def async_generate( lora_path=lora_path, return_hidden_states=return_hidden_states, return_routed_experts=return_routed_experts, + routed_experts_start_len=routed_experts_start_len, stream=stream, custom_logit_processor=custom_logit_processor, bootstrap_host=bootstrap_host, diff --git a/python/sglang/srt/entrypoints/openai/protocol.py b/python/sglang/srt/entrypoints/openai/protocol.py index a19744243f09..58a137b19e5e 100644 --- a/python/sglang/srt/entrypoints/openai/protocol.py +++ b/python/sglang/srt/entrypoints/openai/protocol.py @@ -285,6 +285,7 @@ class CompletionRequest(BaseModel): user: Optional[str] = None return_hidden_states: bool = False return_routed_experts: bool = False + routed_experts_start_len: int = 0 return_cached_tokens_details: bool = False # Extra parameters for SRT backend only and will be ignored by OpenAI models. 
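Reviewer note (illustrative only, not part of the patch): a minimal client-side sketch of how the fields documented above could be consumed over the OpenAI-compatible completions endpoint. The server URL, model name, and the `NUM_LAYERS` / `TOP_K` values are placeholder assumptions; the `sgl_ext.routed_experts` field and the flattened int32 layout follow the documentation text in this diff.

```python
# Illustrative sketch -- not part of this diff. Assumes a server launched with
# --enable-return-routed-experts serving an MoE model; the URL, model name,
# NUM_LAYERS, and TOP_K below are placeholders, not values from the patch.
import base64

import numpy as np
import requests

NUM_LAYERS = 48  # assumed layer count of the served MoE model
TOP_K = 8        # assumed number of experts routed per token

resp = requests.post(
    "http://localhost:30000/v1/completions",  # assumed server address
    json={
        "model": "my-moe-model",               # placeholder model name
        "prompt": "Tell me a fact about cats.",
        "max_tokens": 8,
        "return_routed_experts": True,
        # 0 (the default) returns rows for the full sequence [0, seqlen - 1).
        "routed_experts_start_len": 0,
    },
    timeout=120,
)
choice = resp.json()["choices"][0]

# Per the docs above: base64-encoded int32 expert IDs, flattened, with
# logical shape [num_tokens, num_layers, top_k].
raw = base64.b64decode(choice["sgl_ext"]["routed_experts"])
experts = np.frombuffer(raw, dtype=np.int32).reshape(-1, NUM_LAYERS, TOP_K)
print(experts.shape)  # (seqlen - 1 - routed_experts_start_len, NUM_LAYERS, TOP_K)
```

With the OpenAI Python client, the same two fields would be passed through `extra_body` rather than the top-level JSON body.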
@@ -632,6 +633,7 @@ class ChatCompletionRequest(BaseModel): parallel_tool_calls: bool = True return_hidden_states: bool = False return_routed_experts: bool = False + routed_experts_start_len: int = 0 return_cached_tokens_details: bool = False reasoning_effort: Optional[Literal["none", "low", "medium", "high", "max"]] = Field( default=None, diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py index 7bf381d629e4..26293fc27dcc 100644 --- a/python/sglang/srt/entrypoints/openai/serving_chat.py +++ b/python/sglang/srt/entrypoints/openai/serving_chat.py @@ -437,6 +437,7 @@ def _convert_to_internal_request( disagg_prefill_dp_rank=request.disagg_prefill_dp_rank, return_hidden_states=request.return_hidden_states, return_routed_experts=request.return_routed_experts, + routed_experts_start_len=request.routed_experts_start_len, rid=request.rid, extra_key=self._compute_extra_key(request), require_reasoning=self._get_reasoning_from_request(request), diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py index 8598620c4f75..eba04f463b1f 100644 --- a/python/sglang/srt/entrypoints/openai/serving_completions.py +++ b/python/sglang/srt/entrypoints/openai/serving_completions.py @@ -123,6 +123,7 @@ def _convert_to_internal_request( disagg_prefill_dp_rank=request.disagg_prefill_dp_rank, return_hidden_states=request.return_hidden_states, return_routed_experts=request.return_routed_experts, + routed_experts_start_len=request.routed_experts_start_len, rid=request.rid, extra_key=self._compute_extra_key(request), priority=request.priority, diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 1722f75d6af3..33bfc1bc6051 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -174,7 +174,9 @@ class GenerateReqInput(BaseReq): # Whether to return captured routed experts return_routed_experts: bool = False return_indexer_topk: bool = False - # The start location in the prompt for returning routed experts. + # Absolute start position for returned routings; response covers + # `[routed_experts_start_len, seqlen - 1)`. Must be in [0, prompt_tokens]. + # 0 = full sequence. routed_experts_start_len: int = 0 # The modalities of the image data [image, multi-images, video] @@ -654,6 +656,7 @@ def __getitem__(self, i): else self.return_hidden_states ), return_routed_experts=self.return_routed_experts, + routed_experts_start_len=self.routed_experts_start_len, return_indexer_topk=self.return_indexer_topk, modalities=self.modalities[i] if self.modalities else None, session_params=self.session_params, @@ -730,7 +733,7 @@ class TokenizedGenerateReqInput(BaseReq): # Whether to return captured routed experts return_routed_experts: bool = False - # The start location in the prompt for returning routed experts. + # See GenerateReqInput.routed_experts_start_len. 
routed_experts_start_len: int = 0 return_indexer_topk: bool = False diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 2450d49fb2a0..99f2744ee763 100755 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -599,6 +599,7 @@ def __init__( require_reasoning: bool = False, return_hidden_states: bool = False, return_routed_experts: bool = False, + routed_experts_start_len: int = 0, return_indexer_topk: bool = False, eos_token_ids: Optional[Set[int]] = None, bootstrap_host: Optional[str] = None, @@ -818,6 +819,7 @@ def __init__( # capture routed experts self.return_routed_experts = return_routed_experts + self.routed_experts_start_len = routed_experts_start_len self.routed_experts: Optional[torch.Tensor] = ( None # cpu tensor: shape (seqlen, topk) ) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 6e662e77baac..8f4dfe996afb 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2001,6 +2001,7 @@ def handle_generate_request( require_reasoning=recv_req.require_reasoning, return_hidden_states=recv_req.return_hidden_states, return_routed_experts=recv_req.return_routed_experts, + routed_experts_start_len=recv_req.routed_experts_start_len, return_indexer_topk=recv_req.return_indexer_topk, eos_token_ids=self.model_config.hf_eos_token_id, bootstrap_host=recv_req.bootstrap_host, @@ -2158,6 +2159,27 @@ def handle_generate_request( self._add_request_to_queue(req) return + if recv_req.return_routed_experts: + error_msg = None + if recv_req.routed_experts_start_len < 0: + error_msg = ( + f"{recv_req.routed_experts_start_len=} is lower than 0. " + "Please use a non-negative routed_experts_start_len." + ) + + if recv_req.routed_experts_start_len > len(req.origin_input_ids): + error_msg = ( + f"{recv_req.routed_experts_start_len=} is higher than the " + f"number of input tokens {len(req.origin_input_ids)=}. Please " + f"use a smaller routed_experts_start_len." + ) + + if error_msg is not None: + req.routed_experts_start_len = 0 + req.set_finish_with_abort(error_msg) + self._add_request_to_queue(req) + return + added_to_grammar_queue = self.grammar_manager.process_req_with_grammar(req) if not added_to_grammar_queue: self._add_request_to_queue(req) diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index d572039e6fb2..997073ab8ca4 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -108,16 +108,48 @@ def process_batch_result_prebuilt(self: Scheduler, batch: ScheduleBatch): self.token_to_kv_pool_allocator.free_group_end() def maybe_collect_routed_experts(self: Scheduler, req: Req): - """Collect routed experts for a finished request.""" + """Collect routed experts for a finished request. + + Returns immediately if `return_routed_experts` was not set on the + request, so non-opted-in reqs don't pay the host-gather cost. + + Honors the caller's absolute start so the response covers + `[start_len, seqlen - 1)`. The default start_len is 0, which returns + the full sequence. + + Logs a soft warning if the resulting tensor's row count differs from + the expected `seqlen - 1 - start_len`, to catch silent regressions. 
+ """ + if not req.return_routed_experts: + return capturer = get_global_experts_capturer() if capturer is None: return + start_len = req.routed_experts_start_len req.routed_experts = capturer.get_topk( req_pool_idx=req.req_pool_idx, seqlen=req.seqlen, req_to_token_pool=self.req_to_token_pool, + start_len=start_len, ) + expected_rows = max(0, req.seqlen - 1 - start_len) + if ( + req.routed_experts is not None + and req.routed_experts.shape[0] != expected_rows + ): + logger.warning( + "routed_experts row-count mismatch for req %s: got %d, " + "expected %d (seqlen=%d, cached_tokens=%d, start_len=%s). " + "This indicates a silent bug.", + req.rid, + req.routed_experts.shape[0], + expected_rows, + req.seqlen, + req.cached_tokens, + req.routed_experts_start_len, + ) + def maybe_collect_indexer_topk(self: Scheduler, req: Req): capturer = get_global_indexer_capturer() if capturer is None: diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 1eea93d2eb65..0c368367bdf9 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -1018,6 +1018,7 @@ def _create_tokenized_object( require_reasoning=obj.require_reasoning, return_hidden_states=obj.return_hidden_states, return_routed_experts=obj.return_routed_experts, + routed_experts_start_len=obj.routed_experts_start_len, return_indexer_topk=obj.return_indexer_topk, routed_dp_rank=obj.routed_dp_rank, disagg_prefill_dp_rank=obj.disagg_prefill_dp_rank, diff --git a/python/sglang/srt/session/session_controller.py b/python/sglang/srt/session/session_controller.py index ce98514b8568..1915c370a540 100644 --- a/python/sglang/srt/session/session_controller.py +++ b/python/sglang/srt/session/session_controller.py @@ -233,6 +233,7 @@ def create_req( require_reasoning=req.require_reasoning, return_hidden_states=req.return_hidden_states, return_routed_experts=req.return_routed_experts, + routed_experts_start_len=req.routed_experts_start_len, priority=req.priority, routing_key=req.routing_key, extra_key=req.extra_key, diff --git a/python/sglang/srt/state_capturer/base.py b/python/sglang/srt/state_capturer/base.py index 8620804e7b6f..3931c431573b 100644 --- a/python/sglang/srt/state_capturer/base.py +++ b/python/sglang/srt/state_capturer/base.py @@ -147,10 +147,16 @@ def get_topk( req_pool_idx: int, seqlen: int, req_to_token_pool: ReqToTokenPool, + start_len: int = 0, ) -> torch.Tensor: - cache_pool_idx = req_to_token_pool.req_to_token[req_pool_idx][ - : seqlen - 1 - ].cpu() + if start_len < 0: + raise ValueError(f"{start_len=} must be non-negative") + start_len = min(start_len, seqlen - 1) + cache_pool_idx = ( + req_to_token_pool.req_to_token[req_pool_idx][start_len : seqlen - 1] + .cpu() + .clone() + ) return self.host_cache.buffer[cache_pool_idx] def on_forward_end( diff --git a/test/registered/rl/test_return_routed_experts.py b/test/registered/rl/test_return_routed_experts.py index 8caa66795c5e..33643207a052 100644 --- a/test/registered/rl/test_return_routed_experts.py +++ b/test/registered/rl/test_return_routed_experts.py @@ -5,6 +5,8 @@ from typing import List import aiohttp +import numpy as np +import requests import torch from torch.nn.utils.rnn import pad_sequence @@ -15,6 +17,7 @@ from sglang.srt.utils import kill_process_tree from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import ( + DEFAULT_ENABLE_ROUTED_EXPERTS_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, 
CustomTestCase, @@ -32,6 +35,9 @@ SHAREGPT_FILENAME = "ShareGPT_V3_unfiltered_cleaned_split.json" logger = logging.getLogger(__name__) +_QWEN3_30B_A3B_NUM_LAYERS = 48 +_QWEN3_30B_A3B_TOPK = 8 + class TestReturnRoutedExperts(CustomTestCase): """End-to-end check that --enable-return-routed-experts stays correct @@ -263,5 +269,195 @@ def compare_baseline_w_reference(baseline, reference): return num_total_mismatches +class TestRoutedExpertsStartLen(CustomTestCase): + """Verify the `routed_experts_start_len` parameter: + + - default (0) returns the full sequence + - explicit start_len crops the response and the cropped tail matches + the corresponding tail of the full response + """ + + MAX_NEW_TOKENS = 8 + + @classmethod + def setUpClass(cls): + cls.process = popen_launch_server( + DEFAULT_ENABLE_ROUTED_EXPERTS_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--enable-return-routed-experts", + "--enable-deterministic-inference", + "--tp", + 2, + ], + ) + + @classmethod + def tearDownClass(cls): + if hasattr(cls, "process") and cls.process: + kill_process_tree(cls.process.pid) + + def _send(self, payload: dict) -> dict: + resp = requests.post( + f"{DEFAULT_URL_FOR_TEST}/generate", json=payload, timeout=120 + ) + return resp + + def _build_payload(self, **extra) -> dict: + payload = { + "text": "User: Tell me a fact about cats.\nAssistant:", + "sampling_params": { + "temperature": 0, + "max_new_tokens": self.MAX_NEW_TOKENS, + "ignore_eos": True, + }, + "return_routed_experts": True, + } + payload.update(extra) + return payload + + def _routed_experts(self, resp_json: dict): + return extract_routed_experts_from_meta_info(resp_json).reshape( + -1, _QWEN3_30B_A3B_NUM_LAYERS, _QWEN3_30B_A3B_TOPK + ) + + def _seqlen(self, resp_json: dict) -> int: + meta = resp_json["meta_info"] + return meta["prompt_tokens"] + meta["completion_tokens"] + + def test_start_len_zero_is_default(self): + """Omitting the field must match `routed_experts_start_len=0`, + which returns the full sequence (start_len=0).""" + resp_default = self._send(self._build_payload()).json() + resp_zero = self._send(self._build_payload(routed_experts_start_len=0)).json() + + rows_default = self._routed_experts(resp_default) + rows_zero = self._routed_experts(resp_zero) + + seqlen_default = self._seqlen(resp_default) + seqlen_zero = self._seqlen(resp_zero) + self.assertEqual(seqlen_default, seqlen_zero) + self.assertEqual(rows_default.shape[0], seqlen_default - 1) + self.assertEqual(rows_zero.shape[0], seqlen_zero - 1) + self.assertTrue( + np.array_equal(rows_default, rows_zero), + "default and explicit 0 must produce identical routed experts", + ) + + def test_start_len_controls_row_count(self): + """`routed_experts_start_len=N` must return `seqlen - 1 - N` rows + and the returned tail must match the corresponding tail of the + full sequence (start_len omitted).""" + full_resp = self._send(self._build_payload()).json() + full_rows = self._routed_experts(full_resp) + seqlen = self._seqlen(full_resp) + self.assertEqual(full_rows.shape[0], seqlen - 1) + + start_len = max(1, full_resp["meta_info"]["prompt_tokens"] // 2) + + cropped_resp = self._send( + self._build_payload(routed_experts_start_len=start_len) + ).json() + cropped_rows = self._routed_experts(cropped_resp) + cropped_seqlen = self._seqlen(cropped_resp) + + self.assertEqual(seqlen, cropped_seqlen) + expected_rows = seqlen - 1 - start_len + self.assertEqual( + cropped_rows.shape[0], + expected_rows, + f"expected 
{expected_rows} rows, got {cropped_rows.shape[0]}", + ) + self.assertTrue( + np.array_equal(full_rows[start_len:], cropped_rows), + "cropped routed experts must match the tail of the full sequence", + ) + + def test_start_len_exceeds_prompt_tokens_aborts(self): + """`routed_experts_start_len > prompt_tokens` must abort the request: + the caller cannot meaningfully reference positions that don't exist + in the prompt yet.""" + baseline = self._send(self._build_payload()).json() + prompt_tokens = baseline["meta_info"]["prompt_tokens"] + + ok = self._send(self._build_payload(routed_experts_start_len=prompt_tokens)) + self.assertEqual( + ok.status_code, + 200, + f"start_len=={prompt_tokens} should pass, got {ok.text}", + ) + + too_big = self._send( + self._build_payload(routed_experts_start_len=prompt_tokens + 1) + ) + self._assert_aborted(too_big, "is higher than the number of input tokens") + + def test_start_len_with_cache_hit(self): + """`start_len` must allow the radix prefix to extend past it. The + first request seeds the cache; the second sends the same prompt + with `start_len` somewhere inside the prompt. We verify: + + - meta_info.cached_tokens > start_len (would be impossible if a + cap forced the prefix match to <= start_len), + - the response row count still equals `seqlen - 1 - start_len`. + """ + cache_salt = "cache-hit-test" + first = self._send(self._build_payload(extra_key=cache_salt)).json() + self.assertEqual( + first["meta_info"].get("cached_tokens", 0), + 0, + "first request must be a cold miss", + ) + + prompt_tokens = first["meta_info"]["prompt_tokens"] + start_len = max(1, prompt_tokens // 2) + second = self._send( + self._build_payload( + extra_key=cache_salt, + routed_experts_start_len=start_len, + ) + ).json() + + cached = second["meta_info"].get("cached_tokens", 0) + self.assertGreater( + cached, + start_len, + f"expected radix prefix past start_len={start_len}, " + f"got cached_tokens={cached} (cap not removed?)", + ) + + rows = self._routed_experts(second) + expected = self._seqlen(second) - 1 - start_len + self.assertEqual( + rows.shape[0], + expected, + f"expected {expected} rows, got {rows.shape[0]}", + ) + + def _assert_aborted(self, resp, expected_substring: str): + """Assert a request was aborted with `expected_substring` in the + error message.""" + if resp.status_code == 200: + body = resp.json() + meta = body.get("meta_info", {}) + finish_reason = meta.get("finish_reason") or {} + message = ( + str(finish_reason.get("message", "")) + + " " + + str(body.get("text", "")) + + " " + + str(body.get("error", "")) + ) + self.assertIn( + expected_substring, + message, + f"expected abort with '{expected_substring}', got body={body}", + ) + else: + self.assertGreaterEqual(resp.status_code, 400) + self.assertIn(expected_substring, resp.text) + + if __name__ == "__main__": unittest.main()
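Closing reviewer note (illustrative only, not part of the patch): a sketch of the multi-turn RL rollout pattern that motivates `routed_experts_start_len`, written against the native `/generate` endpoint. The URL, the `NUM_LAYERS` / `TOP_K` values, and the exact key for routed experts inside `meta_info` are assumptions; the bookkeeping follows the `[routed_experts_start_len, seqlen - 1)` semantics introduced by this diff.

```python
# Illustrative sketch -- not part of this diff. URL, NUM_LAYERS, TOP_K, and the
# meta_info key for routed experts are assumptions made for this example.
import base64

import numpy as np
import requests

URL = "http://localhost:30000/generate"  # assumed native endpoint
NUM_LAYERS, TOP_K = 48, 8                # assumed values for the served MoE model

collected = []       # routed-expert rows gathered across turns
collected_rows = 0   # absolute prefix length already covered, i.e. the next start_len

conversation = "User: Hi.\nAssistant:"
for _ in range(3):
    resp = requests.post(
        URL,
        json={
            "text": conversation,
            "sampling_params": {"temperature": 0, "max_new_tokens": 16},
            "return_routed_experts": True,
            # Only rows [collected_rows, seqlen - 1) come back, so routings
            # already gathered in earlier turns are not transferred again.
            "routed_experts_start_len": collected_rows,
        },
        timeout=120,
    ).json()

    # Assumed response layout: base64 int32 payload under meta_info.
    raw = base64.b64decode(resp["meta_info"]["routed_experts"])
    rows = np.frombuffer(raw, dtype=np.int32).reshape(-1, NUM_LAYERS, TOP_K)
    collected.append(rows)
    collected_rows += rows.shape[0]  # now equals seqlen - 1 for this turn

    conversation += resp["text"] + "\nUser: Tell me more.\nAssistant:"

all_rows = np.concatenate(collected)  # full-sequence routings, stitched turn by turn
print(all_rows.shape)                 # (final seqlen - 1, NUM_LAYERS, TOP_K)
```

In a real rollout loop the conversation would be carried as token IDs rather than re-tokenized text, so the accumulated prefix length lines up exactly with the server-side sequence and the radix prefix.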