Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,6 @@ class Qwen3TTSTokenizerV1EncoderConfig(PretrainedConfig):
output_dim (`int`, *optional*, defaults to 3584):
Output feature dimension produced by the encoder head (before/after projection, implementation-dependent).

grad_checkpointing (`bool`, *optional*, defaults to `False`):
Whether to enable gradient checkpointing to reduce memory usage during training.
enable_mp (`bool`, *optional*, defaults to `False`):
Whether to enable model parallel features (implementation-dependent).
audio_sequence_parallel (`bool`, *optional*, defaults to `False`):
Expand Down Expand Up @@ -246,7 +244,6 @@ def __init__(
n_layer=32,
n_window=100,
output_dim=3584,
grad_checkpointing=False,
enable_mp=False,
audio_sequence_parallel=False,
audio_vq_type="GRVQ",
Expand All @@ -265,7 +262,6 @@ def __init__(
self.n_layer = n_layer
self.n_window = n_window
self.output_dim = output_dim
self.grad_checkpointing = grad_checkpointing
self.enable_mp = enable_mp
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

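`enable_mp` was removed from the model code, but it is still defined in this config class (the docstring entry, the `__init__` parameter, and the `self.enable_mp` assignment). Consider removing it here as well, the same way you did for `grad_checkpointing`.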

self.audio_sequence_parallel = audio_sequence_parallel
self.audio_vq_type = audio_vq_type
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1297,8 +1297,6 @@ def __init__(self, config: Qwen3TTSTokenizerV1EncoderConfig):
n_layer=config.n_layer,
n_window=config.n_window,
output_dim=config.output_dim,
grad_checkpointing=config.grad_checkpointing,
enable_mp=config.enable_mp,
audio_sequence_parallel=config.audio_sequence_parallel,
audio_vq_type=config.audio_vq_type,
audio_vq_layers=config.audio_vq_layers,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,6 @@ def __init__(
n_layer: int,
n_window: int = 1500,
output_dim: int = 512,
grad_checkpointing: bool = False,
enable_mp: bool = False,
audio_sequence_parallel: bool = False,
audio_vq_layers: int = -1,
audio_vq_type: str = "NULL",
Expand All @@ -219,8 +217,6 @@ def __init__(
n_layer,
n_window,
output_dim,
grad_checkpointing,
enable_mp,
audio_sequence_parallel,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,7 @@
import torch.nn.functional as F
from torch import Tensor, nn

try:
from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func
except ImportError:
try:
from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_varlen_func
except ImportError:
print(
"\n********\nWarning: flash-attn is not installed. "
"Will only run the manual PyTorch version. "
"Please install flash-attn for faster inference.\n********\n "
)
flash_attn_varlen_func = None

from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func
Comment thread
yuanheng-zhao marked this conversation as resolved.

N_FFT = 400
HOP_LENGTH = 160
Expand Down Expand Up @@ -152,15 +140,15 @@ def forward(self, x: Tensor) -> Tensor:


class MultiHeadAttention(nn.Module):
def __init__(self, n_state: int, n_head: int):
def __init__(self, n_state: int, n_head: int, use_flash_attention: bool = True):
super().__init__()
self.n_head = n_head
self.query = Linear(n_state, n_state)
self.key = Linear(n_state, n_state, bias=False)
self.value = Linear(n_state, n_state)
self.out = Linear(n_state, n_state)

self.use_flash_attention = True
self.use_flash_attention = use_flash_attention and HAS_FLASH_ATTN

def forward(
self,
Expand All @@ -171,15 +159,8 @@ def forward(
k = self.key(x)
v = self.value(x)

if self.use_flash_attention:
if flash_attn_varlen_func is None:
x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
else:
if q.dtype not in [torch.float16, torch.bfloat16]:
x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
self.use_flash_attention = False
else:
x = self.qkv_flash_attention(q, k, v, cu_seqlens=cu_seqlens)
if self.use_flash_attention and q.dtype in [torch.float16, torch.bfloat16]:
x = self.qkv_flash_attention(q, k, v, cu_seqlens=cu_seqlens)
else:
x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)

Expand Down Expand Up @@ -249,7 +230,7 @@ def qkv_attention_manual(self, q: Tensor, k: Tensor, v: Tensor, cu_seqlens: Tens


class ResidualAttentionBlock(nn.Module):
def __init__(self, n_state: int, n_head: int, enable_mp: bool = False, sequence_parallel: bool = False):
def __init__(self, n_state: int, n_head: int, sequence_parallel: bool = False):
super().__init__()
n_mlp = n_state * 4
self.attn_ln = nn.LayerNorm(n_state)
Expand All @@ -274,8 +255,6 @@ def __init__(
n_layer: int,
n_window: int = 1500,
output_dim: int = 512,
grad_checkpointing: bool = False,
enable_mp: bool = False,
audio_sequence_parallel: bool = False,
):
super().__init__()
Expand All @@ -286,10 +265,7 @@ def __init__(
self.n_mels = n_mels

self.blocks = nn.ModuleList(
[
ResidualAttentionBlock(n_state, n_head, enable_mp=enable_mp, sequence_parallel=audio_sequence_parallel)
for _ in range(n_layer)
]
[ResidualAttentionBlock(n_state, n_head, sequence_parallel=audio_sequence_parallel) for _ in range(n_layer)]
)
self.ln_post = nn.LayerNorm(n_state)
self.avg_pooler = nn.AvgPool1d(2, stride=2)
Expand All @@ -299,8 +275,6 @@ def __init__(
self.audio_bos_eos_token = nn.Embedding(2, output_dim)

self.output_dim = output_dim
self.grad_checkpointing = grad_checkpointing
self.enable_mp = enable_mp
self.n_head = n_head
self.n_state = n_state
self.n_window = n_window
Expand All @@ -309,13 +283,6 @@ def __init__(

self.tp_world_size = 1

self.set_audio_sync()

def set_audio_sync(self):
for name, param in self.named_parameters():
if not name.startswith("blocks"):
setattr(param, "audio_sync", True)

def forward(
self, x_list: list[Tensor], audio_mellens: list[int], audio_aftercnnlens: list[int], audio_seqlens: list[int]
):
Expand Down Expand Up @@ -377,9 +344,3 @@ def forward(
output[end_ids] = self.audio_bos_eos_token.weight[1].to(x.dtype)
output[audio_tokens_mask] = x
return output

def lock(self, layers: int):
self.conv1.requires_grad_(False)
self.conv2.requires_grad_(False)
for i in range(min(layers, len(self.blocks))):
self.blocks[i].requires_grad_(False)