diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index da3889d31a7d..767f49ab105f 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -218,7 +218,7 @@ def __init__(self,
                  act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "",
-                 use_data_parallel: bool = False):
+                 disable_tp: bool = False):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
             input_size=in_features,
@@ -226,14 +226,14 @@ def __init__(self,
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
-            disable_tp=use_data_parallel)
+            disable_tp=disable_tp)
 
         self.down_proj = RowParallelLinear(hidden_features,
                                            in_features,
                                            bias=bias,
                                            quant_config=quant_config,
                                            prefix=f"{prefix}.down_proj",
-                                           disable_tp=use_data_parallel)
+                                           disable_tp=disable_tp)
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -271,13 +271,13 @@ def __init__(
         projection_size: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        disable_tp: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
         use_upstream_fa: bool = False,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
-        self.tp_size = (1 if use_data_parallel else
+        self.tp_size = (1 if disable_tp else
                         parallel_state.get_tensor_model_parallel_world_size())
         self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(
@@ -293,13 +293,13 @@ def __init__(
                                      bias=True,
                                      quant_config=quant_config,
                                      prefix=f"{prefix}.qkv",
-                                     disable_tp=use_data_parallel)
+                                     disable_tp=disable_tp)
 
         self.proj = RowParallelLinear(input_size=projection_size,
                                       output_size=embed_dim,
                                       quant_config=quant_config,
                                       prefix=f"{prefix}.proj",
-                                      disable_tp=use_data_parallel)
+                                      disable_tp=disable_tp)
         self.attn_backend = attn_backend
         self.use_upstream_fa = use_upstream_fa
         self.is_flash_attn_backend = self.attn_backend in {
@@ -425,7 +425,7 @@ def __init__(
         norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        disable_tp: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
         use_upstream_fa: bool = False,
     ) -> None:
@@ -434,22 +434,21 @@ def __init__(
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.norm1 = norm_layer(dim)
         self.norm2 = norm_layer(dim)
-        self.attn = Qwen2_5_VisionAttention(
-            embed_dim=dim,
-            num_heads=num_heads,
-            projection_size=dim,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_data_parallel=use_data_parallel,
-            attn_backend=attn_backend,
-            use_upstream_fa=use_upstream_fa)
+        self.attn = Qwen2_5_VisionAttention(embed_dim=dim,
+                                            num_heads=num_heads,
+                                            projection_size=dim,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.attn",
+                                            disable_tp=disable_tp,
+                                            attn_backend=attn_backend,
+                                            use_upstream_fa=use_upstream_fa)
         self.mlp = Qwen2_5_VisionMLP(dim,
                                      mlp_hidden_dim,
                                      act_fn=act_fn,
                                      bias=True,
                                      quant_config=quant_config,
                                      prefix=f"{prefix}.mlp",
-                                     use_data_parallel=use_data_parallel)
+                                     disable_tp=disable_tp)
 
     def forward(
         self,
@@ -640,7 +639,7 @@ def __init__(
                 norm_layer=norm_layer,
                 quant_config=quant_config,
                 prefix=f"{prefix}.blocks.{layer_idx}",
-                use_data_parallel=use_data_parallel,
+                disable_tp=use_data_parallel,
                 attn_backend=self.attn_backend,
                 use_upstream_fa=use_upstream_fa) for layer_idx in range(depth)
         ])
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 00de89811cc7..75e5054c5ec2 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -128,7 +128,7 @@ def __init__(self,
                  act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "",
-                 use_data_parallel: bool = False):
+                 disable_tp: bool = False):
         super().__init__()
         self.linear_fc1 = ColumnParallelLinear(in_features,
                                                hidden_features,
@@ -136,14 +136,14 @@ def __init__(self,
                                                quant_config=quant_config,
                                                return_bias=False,
                                                prefix=f"{prefix}.linear_fc1",
-                                               disable_tp=use_data_parallel)
+                                               disable_tp=disable_tp)
         self.linear_fc2 = RowParallelLinear(hidden_features,
                                             in_features,
                                             bias=bias,
                                             quant_config=quant_config,
                                             return_bias=False,
                                             prefix=f"{prefix}.linear_fc2",
-                                            disable_tp=use_data_parallel)
+                                            disable_tp=disable_tp)
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -162,7 +162,7 @@ def __init__(
         norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        disable_tp: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
         use_upstream_fa: bool = False,
     ) -> None:
@@ -171,22 +171,21 @@ def __init__(
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.norm1 = norm_layer(dim)
         self.norm2 = norm_layer(dim)
-        self.attn = Qwen2_5_VisionAttention(
-            embed_dim=dim,
-            num_heads=num_heads,
-            projection_size=dim,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_data_parallel=use_data_parallel,
-            attn_backend=attn_backend,
-            use_upstream_fa=use_upstream_fa)
+        self.attn = Qwen2_5_VisionAttention(embed_dim=dim,
+                                            num_heads=num_heads,
+                                            projection_size=dim,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.attn",
+                                            disable_tp=disable_tp,
+                                            attn_backend=attn_backend,
+                                            use_upstream_fa=use_upstream_fa)
         self.mlp = Qwen3_VisionMLP(dim,
                                    mlp_hidden_dim,
                                    act_fn=act_fn,
                                    bias=True,
                                    quant_config=quant_config,
                                    prefix=f"{prefix}.mlp",
-                                   use_data_parallel=use_data_parallel)
+                                   disable_tp=disable_tp)
 
     def forward(
         self,
@@ -217,7 +216,7 @@ def __init__(
         use_postshuffle_norm: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        disable_tp: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = context_dim * (spatial_merge_size**2)
@@ -234,14 +233,14 @@ def __init__(
                                                bias=True,
                                                quant_config=quant_config,
                                                prefix=f"{prefix}.linear_fc1",
-                                               disable_tp=use_data_parallel)
+                                               disable_tp=disable_tp)
         self.act_fn = nn.GELU()
         self.linear_fc2 = RowParallelLinear(self.hidden_size,
                                             d_model,
                                             bias=True,
                                             quant_config=quant_config,
                                             prefix=f"{prefix}.linear_fc2",
-                                            disable_tp=use_data_parallel)
+                                            disable_tp=disable_tp)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.use_postshuffle_norm:
@@ -263,7 +262,7 @@ def __init__(
         norm_eps: float = 1e-6,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        use_data_parallel: bool = False,
+        disable_tp: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = vision_config.hidden_size
@@ -274,7 +273,7 @@ def __init__(
         self.spatial_merge_unit = self.spatial_merge_size**2
         self.temporal_patch_size = vision_config.temporal_patch_size
         self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes
-        self.use_data_parallel = use_data_parallel
+        self.disable_tp = disable_tp
         self.num_grid_per_side = int(self.num_position_embeddings**0.5)
 
         # NOTE: This is used for creating empty tensor for all_gather for
@@ -303,7 +302,7 @@ def __init__(
             spatial_merge_size=self.spatial_merge_size,
             quant_config=quant_config,
             prefix=f"{prefix}.merger",
-            use_data_parallel=use_data_parallel,
+            disable_tp=disable_tp,
         )
 
         self.deepstack_merger_list = nn.ModuleList([
@@ -315,7 +314,7 @@ def __init__(
                 norm_layer=norm_layer,
                 quant_config=quant_config,
                 prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
-                use_data_parallel=use_data_parallel)
+                disable_tp=disable_tp)
             for layer_idx in range(len(self.deepstack_visual_indexes))
         ])
 
@@ -344,7 +343,7 @@ def __init__(
                 norm_layer=norm_layer,
                 quant_config=quant_config,
                 prefix=f"{prefix}.blocks.{layer_idx}",
-                use_data_parallel=use_data_parallel,
+                disable_tp=disable_tp,
                 attn_backend=self.attn_backend,
                 use_upstream_fa=use_upstream_fa)
             for layer_idx in range(vision_config.depth)
@@ -1134,7 +1133,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "visual"),
-            use_data_parallel=self.use_data_parallel,
+            disable_tp=self.use_data_parallel,
         )
 
         self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config,
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index 1ed053eb2e96..1af50773ed81 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -314,10 +314,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
+        parallel_config = vllm_config.parallel_config
 
         self.config = config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
+        self.sequence_parallel = parallel_config.use_sequence_parallel_moe
 
         if not multimodal_config.get_limit_per_prompt("image") and \
             not multimodal_config.get_limit_per_prompt("video"):
@@ -328,7 +330,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "visual"),
-            use_data_parallel=self.use_data_parallel,
+            disable_tp=self.use_data_parallel or self.sequence_parallel,
         )
 
         self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config,