Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
b7ba59d
init
eitanturok Mar 5, 2026
e7f629c
fix from #19964
eitanturok Mar 5, 2026
cfa1e57
Update python/sglang/multimodal_gen/runtime/cache/teacache.py
eitanturok Mar 5, 2026
23c7ee4
don't need params here
eitanturok Mar 5, 2026
a82e797
Merge branch 'sgl-project:main' into teacache-refactor
eitanturok Mar 5, 2026
3ed92f8
Merge branch 'main' into teacache-refactor
eitanturok Mar 7, 2026
57689f5
update
eitanturok Mar 7, 2026
520a101
update
eitanturok Mar 7, 2026
bf765a2
Merge branch 'main' into teacache-refactor
eitanturok Mar 9, 2026
6c70b43
update
eitanturok Mar 9, 2026
eb0e774
Merge branch 'teacache-refactor' of https://github.com/eitanturok/sgl…
eitanturok Mar 9, 2026
61825d5
precommit + start, end skipping steps
eitanturok Mar 10, 2026
47c4b66
update assert
eitanturok Mar 10, 2026
4bafebc
Merge branch 'main' into teacache-refactor
eitanturok Mar 10, 2026
037d347
remove icecream
eitanturok Mar 10, 2026
b16a1ef
better docstring
eitanturok Mar 10, 2026
99904e2
inherit from diffusioncache
eitanturok Mar 10, 2026
98b0bd5
better docs
eitanturok Mar 10, 2026
f0518b5
track cnt in state
eitanturok Mar 10, 2026
b035ba8
teacache takes in modulated_input
eitanturok Mar 10, 2026
8230b23
better docs, comments
eitanturok Mar 10, 2026
1d1ef72
no enable_cache
eitanturok Mar 10, 2026
728d89a
teacache for hunyuanvideo
eitanturok Mar 10, 2026
853607f
better docs
eitanturok Mar 10, 2026
bebdb92
better docs
eitanturok Mar 10, 2026
0d687e9
better docs
eitanturok Mar 10, 2026
c9ae288
Merge branch 'main' into teacache-refactor
eitanturok Mar 11, 2026
ba322aa
Merge branch 'main' into teacache-refactor
eitanturok Mar 11, 2026
3430d0f
Merge branch 'sgl-project:main' into teacache-refactor
eitanturok Mar 12, 2026
1da6ed0
fix start_skip > end_skip
eitanturok Mar 12, 2026
cc64799
Merge branch 'main' into teacache-refactor
eitanturok Mar 12, 2026
069b532
update perf
eitanturok Mar 12, 2026
062c1e8
Merge branch 'main' into teacache-refactor
eitanturok Mar 12, 2026
462bd5e
Merge branch 'main' into teacache-refactor
eitanturok Mar 12, 2026
207d2e7
Merge branch 'sgl-project:main' into teacache-refactor
eitanturok Mar 13, 2026
28e38ed
state.step starts at 0
eitanturok Mar 13, 2026
1c23e44
rm comment
eitanturok Mar 13, 2026
667623c
Merge branch 'sgl-project:main' into teacache-refactor
eitanturok Mar 15, 2026
cb5f2b5
Merge branch 'sgl-project:main' into teacache-refactor
eitanturok Mar 16, 2026
4f1f83d
Merge branch 'main' into teacache-refactor
eitanturok Mar 28, 2026
cdabdab
update
eitanturok Mar 28, 2026
1cdd5f4
Merge branch 'main' into teacache-refactor
eitanturok Apr 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/diffusion/performance/cache/teacache.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ TeaCache is configured via `TeaCacheParams` in the sampling parameters:
from sglang.multimodal_gen.configs.sample.teacache import TeaCacheParams

params = TeaCacheParams(
teacache_thresh=0.1, # Threshold for accumulated L1 distance
rel_l1_thresh=0.1, # Threshold for accumulated L1 distance
coefficients=[1.0, 0.0, 0.0], # Polynomial coefficients for L1 rescaling
)
```
Expand All @@ -59,7 +59,7 @@ params = TeaCacheParams(

| Parameter | Type | Description |
|-----------|------|-------------|
| `teacache_thresh` | float | Threshold for accumulated L1 distance. Lower = more caching, faster but potentially lower quality |
| `rel_l1_thresh` | float | Threshold for accumulated L1 distance. Lower = more caching, faster but potentially lower quality |
| `coefficients` | list[float] | Polynomial coefficients for L1 rescaling. Model-specific tuning |

### Model-Specific Configurations
Expand All @@ -73,7 +73,7 @@ TeaCache is built into the following model families:
| Model Family | CFG Cache Separation | Notes |
|--------------|---------------------|-------|
| Wan (wan2.1, wan2.2) | Yes | Full support |
| Hunyuan (HunyuanVideo) | Yes | To be supported |
| Hunyuan (HunyuanVideo) | Yes | Full support |
| Z-Image | Yes | To be supported |
| Flux | No | To be supported |
| Qwen | No | To be supported |
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/configs/sample/hunyuan.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class HunyuanSamplingParams(SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.15,
rel_l1_thresh=0.15,
# from https://github.com/ali-vilab/TeaCache/blob/7c10efc4702c6b619f47805f7abe4a7a08085aa0/TeaCache4HunyuanVideo/teacache_sample_video.py#L222
coefficients=[
7.33226126e02,
Expand Down
13 changes: 9 additions & 4 deletions python/sglang/multimodal_gen/configs/sample/sampling_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,8 @@ class SamplingParams:

# TeaCache parameters
enable_teacache: bool = False
teacache_params: Any = (
None # TeaCacheParams or WanTeaCacheParams, set by model-specific subclass
)
cache_params: Any | None = None
calibrate_cache: bool = False

# Profiling
profile: bool = False
Expand Down Expand Up @@ -615,6 +614,12 @@ def add_argument(*name_or_flags, **kwargs):
"--enable-teacache",
action="store_true",
)
parser.add_argument(
"--calibrate-cache",
action="store_true",
default=SamplingParams.calibrate_cache,
help="Run in calibration mode: collect magnitude ratio statistics instead of skipping steps.",
)

# profiling
add_argument(
Expand Down Expand Up @@ -971,4 +976,4 @@ def n_tokens(self) -> int:

@dataclass
class CacheParams:
cache_type: str = "none"
pass
4 changes: 2 additions & 2 deletions python/sglang/multimodal_gen/configs/sample/teacache.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TeaCacheParams(CacheParams):
Attributes:
cache_type: (`str`, defaults to `teacache`):
A string labeling these parameters as belonging to teacache.
teacache_thresh (`float`, defaults to `0.0`):
rel_l1_thresh (`float`, defaults to `0.0`):
Threshold for accumulated relative L1 distance. When below this threshold, the
forward pass is skipped. Recommended values: 0.25 for ~1.5x speedup, 0.4 for ~1.8x,
0.6 for ~2.0x.
Expand Down Expand Up @@ -48,7 +48,7 @@ class TeaCacheParams(CacheParams):
"""

cache_type: str = "teacache"
teacache_thresh: float = 0.0
rel_l1_thresh: float = 0.0
start_skipping: int | float = 5
end_skipping: int | float = -1
coefficients: list[float] = field(default_factory=list)
Expand Down
8 changes: 4 additions & 4 deletions python/sglang/multimodal_gen/configs/sample/wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class WanT2V_1_3B_SamplingParams(SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.08,
rel_l1_thresh=0.08,
use_ret_steps=True,
coefficients_callback=_wan_1_3b_coefficients,
start_skipping=5,
Expand Down Expand Up @@ -102,7 +102,7 @@ class WanT2V_14B_SamplingParams(SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.20,
rel_l1_thresh=0.20,
use_ret_steps=False,
coefficients_callback=_wan_14b_coefficients,
start_skipping=1,
Expand All @@ -128,7 +128,7 @@ class WanI2V_14B_480P_SamplingParam(WanT2V_1_3B_SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.26,
rel_l1_thresh=0.26,
use_ret_steps=True,
coefficients_callback=_wan_14b_coefficients,
start_skipping=5,
Expand Down Expand Up @@ -156,7 +156,7 @@ class WanI2V_14B_720P_SamplingParam(WanT2V_14B_SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.3,
rel_l1_thresh=0.3,
use_ret_steps=True,
coefficients_callback=_wan_14b_coefficients,
start_skipping=5,
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/configs/sample/zimage.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ZImageTurboSamplingParams(SamplingParams):

teacache_params: TeaCacheParams = field(
default_factory=lambda: TeaCacheParams(
teacache_thresh=0.15,
rel_l1_thresh=0.15,
coefficients=[
7.33226126e02,
-4.01131952e02,
Expand Down
12 changes: 9 additions & 3 deletions python/sglang/multimodal_gen/runtime/cache/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,24 @@

"""

from sglang.multimodal_gen.runtime.cache.base import DiffusionCache
from sglang.multimodal_gen.runtime.cache.cache_dit_integration import (
CacheDitConfig,
enable_cache_on_dual_transformer,
enable_cache_on_transformer,
get_scm_mask,
)
from sglang.multimodal_gen.runtime.cache.teacache import TeaCacheContext, TeaCacheMixin
from sglang.multimodal_gen.runtime.cache.teacache import (
TeaCacheState,
TeaCacheStrategy,
)

__all__ = [
# Base
"DiffusionCache",
# TeaCache (always available)
"TeaCacheContext",
"TeaCacheMixin",
"TeaCacheState",
"TeaCacheStrategy",
# cache-dit integration (lazy-loaded, requires cache-dit package)
"CacheDitConfig",
"enable_cache_on_transformer",
Expand Down
66 changes: 66 additions & 0 deletions python/sglang/multimodal_gen/runtime/cache/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod

import torch


class DiffusionCache(ABC):
    """Abstract interface for timestep-level caching in diffusion models.

    A concrete subclass decides when a denoising step's computation may be
    skipped, stores the outputs of full forward passes, and reconstructs an
    approximation of the output from the stored data on skipped steps.
    """

    @abstractmethod
    def maybe_reset(self, **kwargs) -> None:
        """Reset internal cache state before a new generation sequence.

        Args:
            **kwargs: Strategy-specific parameters that may be helpful.
        """

    @abstractmethod
    def should_skip(self, **kwargs) -> bool:
        """Decide whether the current timestep's computation can be skipped.

        Args:
            **kwargs: Strategy-specific parameters that may be helpful.

        Returns:
            bool: True when the timestep should be skipped, False otherwise.
        """

    @abstractmethod
    def write(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        **kwargs,
    ) -> None:
        """Store the result of a full forward pass in the cache.

        Args:
            hidden_states: Output produced by the transformer blocks.
            original_hidden_states: Input as it was before the transformer blocks.
            **kwargs: Strategy-specific parameters that may be helpful.
        """

    @abstractmethod
    def read(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        """Approximate a forward pass from cached data (reads the cache).

        Args:
            hidden_states: Current input/intermediate hidden states.
            **kwargs: Strategy-specific parameters for retrieval.

        Returns:
            torch.Tensor: Approximated output of the skipped forward pass.
        """

    def calibrate(self, **kwargs) -> None:
        """Optional calibration hook to learn cache thresholds or values.

        The default implementation is a no-op; subclasses override as needed.

        Args:
            **kwargs: Strategy-specific parameters that may be helpful.
        """
Loading
Loading