vllm-project · hsliuustc0106 · Feb 26, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
@@ -214,8 +214,6 @@ class Qwen3TTSTokenizerV1EncoderConfig(PretrainedConfig):
         output_dim (`int`, *optional*, defaults to 3584):
             Output feature dimension produced by the encoder head (before/after projection, implementation-dependent).
 
-        grad_checkpointing (`bool`, *optional*, defaults to `False`):
-            Whether to enable gradient checkpointing to reduce memory usage during training.
         enable_mp (`bool`, *optional*, defaults to `False`):
             Whether to enable model parallel features (implementation-dependent).
         audio_sequence_parallel (`bool`, *optional*, defaults to `False`):
@@ -246,7 +244,6 @@ def __init__(
         n_layer=32,
         n_window=100,
         output_dim=3584,
-        grad_checkpointing=False,
         enable_mp=False,
         audio_sequence_parallel=False,
         audio_vq_type="GRVQ",
@@ -265,7 +262,6 @@ def __init__(
         self.n_layer = n_layer
         self.n_window = n_window
         self.output_dim = output_dim
-        self.grad_checkpointing = grad_checkpointing
         self.enable_mp = enable_mp
         self.audio_sequence_parallel = audio_sequence_parallel
         self.audio_vq_type = audio_vq_type

@@ -1297,8 +1297,6 @@ def __init__(self, config: Qwen3TTSTokenizerV1EncoderConfig):
             n_layer=config.n_layer,
             n_window=config.n_window,
             output_dim=config.output_dim,
-            grad_checkpointing=config.grad_checkpointing,
-            enable_mp=config.enable_mp,
             audio_sequence_parallel=config.audio_sequence_parallel,
             audio_vq_type=config.audio_vq_type,
             audio_vq_layers=config.audio_vq_layers,

@@ -196,8 +196,6 @@ def __init__(
         n_layer: int,
         n_window: int = 1500,
         output_dim: int = 512,
-        grad_checkpointing: bool = False,
-        enable_mp: bool = False,
         audio_sequence_parallel: bool = False,
         audio_vq_layers: int = -1,
         audio_vq_type: str = "NULL",
@@ -219,8 +217,6 @@ def __init__(
             n_layer,
             n_window,
             output_dim,
-            grad_checkpointing,
-            enable_mp,
             audio_sequence_parallel,
         )
 

@@ -23,19 +23,7 @@
 import torch.nn.functional as F
 from torch import Tensor, nn
 
-try:
-    from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func
-except ImportError:
-    try:
-        from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_varlen_func
-    except ImportError:
-        print(
-            "\n********\nWarning: flash-attn is not installed. "
-            "Will only run the manual PyTorch version. "
-            "Please install flash-attn for faster inference.\n********\n "
-        )
-        flash_attn_varlen_func = None
-
+from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func
 
 N_FFT = 400
 HOP_LENGTH = 160
@@ -152,15 +140,15 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class MultiHeadAttention(nn.Module):
-    def __init__(self, n_state: int, n_head: int):
+    def __init__(self, n_state: int, n_head: int, use_flash_attention: bool = True):
         super().__init__()
         self.n_head = n_head
         self.query = Linear(n_state, n_state)
         self.key = Linear(n_state, n_state, bias=False)
         self.value = Linear(n_state, n_state)
         self.out = Linear(n_state, n_state)
 
-        self.use_flash_attention = True
+        self.use_flash_attention = use_flash_attention and HAS_FLASH_ATTN
 
     def forward(
         self,
@@ -171,15 +159,8 @@ def forward(
         k = self.key(x)
         v = self.value(x)
 
-        if self.use_flash_attention:
-            if flash_attn_varlen_func is None:
-                x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
-            else:
-                if q.dtype not in [torch.float16, torch.bfloat16]:
-                    x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
-                    self.use_flash_attention = False
-                else:
-                    x = self.qkv_flash_attention(q, k, v, cu_seqlens=cu_seqlens)
+        if self.use_flash_attention and q.dtype in [torch.float16, torch.bfloat16]:
+            x = self.qkv_flash_attention(q, k, v, cu_seqlens=cu_seqlens)
         else:
             x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
 
@@ -249,7 +230,7 @@ def qkv_attention_manual(self, q: Tensor, k: Tensor, v: Tensor, cu_seqlens: Tens
 
 
 class ResidualAttentionBlock(nn.Module):
-    def __init__(self, n_state: int, n_head: int, enable_mp: bool = False, sequence_parallel: bool = False):
+    def __init__(self, n_state: int, n_head: int, sequence_parallel: bool = False):
         super().__init__()
         n_mlp = n_state * 4
         self.attn_ln = nn.LayerNorm(n_state)
@@ -274,8 +255,6 @@ def __init__(
         n_layer: int,
         n_window: int = 1500,
         output_dim: int = 512,
-        grad_checkpointing: bool = False,
-        enable_mp: bool = False,
         audio_sequence_parallel: bool = False,
     ):
         super().__init__()
@@ -286,10 +265,7 @@ def __init__(
         self.n_mels = n_mels
 
         self.blocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(n_state, n_head, enable_mp=enable_mp, sequence_parallel=audio_sequence_parallel)
-                for _ in range(n_layer)
-            ]
+            [ResidualAttentionBlock(n_state, n_head, sequence_parallel=audio_sequence_parallel) for _ in range(n_layer)]
         )
         self.ln_post = nn.LayerNorm(n_state)
         self.avg_pooler = nn.AvgPool1d(2, stride=2)
@@ -299,8 +275,6 @@ def __init__(
         self.audio_bos_eos_token = nn.Embedding(2, output_dim)
 
         self.output_dim = output_dim
-        self.grad_checkpointing = grad_checkpointing
-        self.enable_mp = enable_mp
         self.n_head = n_head
         self.n_state = n_state
         self.n_window = n_window
@@ -309,13 +283,6 @@ def __init__(
 
         self.tp_world_size = 1
 
-        self.set_audio_sync()
-
-    def set_audio_sync(self):
-        for name, param in self.named_parameters():
-            if not name.startswith("blocks"):
-                setattr(param, "audio_sync", True)
-
     def forward(
         self, x_list: list[Tensor], audio_mellens: list[int], audio_aftercnnlens: list[int], audio_seqlens: list[int]
     ):
@@ -377,9 +344,3 @@ def forward(
         output[end_ids] = self.audio_bos_eos_token.weight[1].to(x.dtype)
         output[audio_tokens_mask] = x
         return output
-
-    def lock(self, layers: int):
-        self.conv1.requires_grad_(False)
-        self.conv2.requires_grad_(False)
-        for i in range(min(layers, len(self.blocks))):
-            self.blocks[i].requires_grad_(False)