diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 1c52f0c7bb5..718a4f9076c 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -38,12 +38,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull lmsysorg/sglang:v0.4.5.post2-rocm630
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            lmsysorg/sglang:v0.4.5.post2-rocm630
 
       - name: Install dependencies
         run: |
@@ -82,12 +82,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull lmsysorg/sglang:v0.4.5.post2-rocm630
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            lmsysorg/sglang:v0.4.5.post2-rocm630
 
       - name: Install dependencies
         run: |
@@ -120,12 +120,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull lmsysorg/sglang:v0.4.5.post2-rocm630
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            lmsysorg/sglang:v0.4.5.post2-rocm630
 
       - name: Install dependencies
         run: |
@@ -149,7 +149,7 @@ jobs:
   finish:
     if: always()
     needs: [
-      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd
+      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd
     ]
     runs-on: ubuntu-latest
     steps:
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index 95ae14b7246..e3a99655a0e 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -665,6 +665,7 @@ def forward_native(
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
+        dtype = query.dtype
         query_rot = query[..., : self.rotary_dim]
         key_rot = key[..., : self.rotary_dim]
         if self.rotary_dim < self.head_size:
@@ -695,7 +696,7 @@ def forward_native(
         else:
             query = query_rot
             key = key_rot
-        return query, key
+        return query.to(dtype), key.to(dtype)
 
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 3daeab95c08..a9cb3c5b91c 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -682,10 +682,6 @@ def forward_absorb(
         forward_batch: ForwardBatch,
         zero_allocator: BumpAllocator,
     ) -> torch.Tensor:
-        q_len = hidden_states.shape[0]
-        q_input = hidden_states.new_empty(
-            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
-        )
         if self.q_lora_rank is not None:
             q = self.q_a_proj(hidden_states)[0]
             q = self.q_a_layernorm(q)
@@ -729,20 +725,20 @@ def forward_absorb(
             )
         else:
             q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
-        q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
+
+        q_nope_out = q_nope_out.transpose(0, 1)
 
         latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
-        v_input = latent_cache[..., : self.kv_lora_rank]
-        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
-        k_input = latent_cache.unsqueeze(1)
-        k_input[..., : self.kv_lora_rank] = v_input
-        k_pe = k_input[..., self.kv_lora_rank :]
+        k_nope = latent_cache[..., : self.kv_lora_rank]
+        k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+        k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)
 
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
-        q_input[..., self.kv_lora_rank :] = q_pe
-        k_input[..., self.kv_lora_rank :] = k_pe
 
-        attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
+        q = torch.cat([q_nope_out, q_pe], dim=-1)
+        k = torch.cat([k_nope, k_pe], dim=-1)
+
+        attn_output = self.attn_mqa(q, k, k_nope, forward_batch)
         attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
 
         if self.use_deep_gemm_bmm:
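
For context on the deepseek_v2.py hunks: the refactor drops the preallocated `q_input`/`k_input` buffers that were filled by slice assignment and instead builds the inputs to `attn_mqa` with `torch.cat`, passing the normalized latent (`k_nope`) as the value tensor. Below is a minimal standalone sketch of the resulting tensor layout, not taken from the repository: the sizes are made up, and `kv_a_layernorm`, `rotary_emb`, and `attn_mqa` are skipped; only the slicing and concatenation mirror the diff.

```python
import torch

# Made-up example sizes; the real values come from the DeepSeek-V2 config.
num_tokens, num_local_heads = 4, 2
kv_lora_rank, qk_rope_head_dim = 8, 4

# Stand-ins for tensors the real forward_absorb produces upstream of the hunk:
# q_nope_out is the bmm(w_kc) output after transpose, q_pe the rotary query part.
q_nope_out = torch.randn(num_tokens, num_local_heads, kv_lora_rank)
q_pe = torch.randn(num_tokens, num_local_heads, qk_rope_head_dim)
latent_cache = torch.randn(num_tokens, kv_lora_rank + qk_rope_head_dim)

# Refactored construction: slice the latent cache, then concatenate
# (kv_a_layernorm and rotary_emb are omitted here for brevity).
k_nope = latent_cache[..., :kv_lora_rank].unsqueeze(1)  # (tokens, 1, kv_lora_rank)
k_pe = latent_cache[..., kv_lora_rank:].unsqueeze(1)    # (tokens, 1, qk_rope_head_dim)

q = torch.cat([q_nope_out, q_pe], dim=-1)  # (tokens, heads, kv_lora_rank + qk_rope_head_dim)
k = torch.cat([k_nope, k_pe], dim=-1)      # (tokens, 1,     kv_lora_rank + qk_rope_head_dim)

assert q.shape == (num_tokens, num_local_heads, kv_lora_rank + qk_rope_head_dim)
assert k.shape == (num_tokens, 1, kv_lora_rank + qk_rope_head_dim)
# In the diff, k_nope also serves as the value tensor passed to attn_mqa.
```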