diff --git a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py index 5cfaac9e4418..b99ed353747e 100644 --- a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py +++ b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @@ -641,10 +641,8 @@ def forward_extend( dtype=torch.bool, device=forward_batch.input_ids.device, ) - conv_states_to_use = conv_states.clone() else: has_initial_states = forward_batch.extend_prefix_lens > 0 - conv_states_to_use = conv_states if is_target_verify: batch_size = seq_len // forward_batch.spec_info.draft_token_num @@ -656,7 +654,7 @@ def forward_extend( ) mixed_qkv_processed = causal_conv1d_update( mixed_qkv_reshaped, - conv_states_to_use, + conv_states, conv_weights, bias, activation, @@ -675,7 +673,7 @@ def forward_extend( conv_weights, bias, activation=activation, - conv_states=conv_states_to_use, + conv_states=conv_states, has_initial_state=has_initial_states, cache_indices=cache_indices, query_start_loc=query_start_loc,