152 changes: 147 additions & 5 deletions .github/workflows/pr-test-amd.yml
@@ -55,17 +55,51 @@ jobs:
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .

docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/

- name: Evaluate Accuracy
timeout-minutes: 20
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py

accuracy-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-mi300-gpu-2
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup docker
run: |
# Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
if [ -f "/etc/podinfo/gha-render-devices" ]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
ghcr.io/saienduri/sglang-aiter-v0.1.1:428

- name: Install dependencies
run: |
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"

docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .

- name: Evaluate accuracy (TP=2)
timeout-minutes: 20
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_moe_eval_accuracy_large.py

mla-test-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
@@ -104,6 +138,113 @@ jobs:
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py

performance-test-1-gpu-part-1-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-mi300-gpu-1
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup docker
run: |
# Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
if [ -f "/etc/podinfo/gha-render-devices" ]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
ghcr.io/saienduri/sglang-aiter-v0.1.1:428

- name: Install dependencies
run: |
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"

docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .

- name: Benchmark single latency
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

- name: Benchmark online latency
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

- name: Benchmark offline throughput
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

- name: Benchmark offline throughput (Non-streaming, small batch size)
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

- name: Benchmark online latency (EAGLE)
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

performance-test-1-gpu-part-2-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-mi300-gpu-1
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Setup docker
run: |
# Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
if [ -f "/etc/podinfo/gha-render-devices" ]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
ghcr.io/saienduri/sglang-aiter-v0.1.1:428

- name: Install dependencies
run: |
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"

docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .

- name: Benchmark offline throughput (w/o RadixAttention)
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

- name: Benchmark offline throughput (w/ Triton)
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

- name: Benchmark offline throughput (w/ FP8)
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

bench-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
@@ -169,7 +310,8 @@ jobs:
finish:
if: always()
needs: [
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd
]
runs-on: ubuntu-latest
steps:
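All four new AMD jobs repeat the same `Setup docker` and `Install dependencies` steps verbatim. A composite action could factor that out; below is a minimal sketch, assuming a hypothetical `.github/actions/setup-amd-ci/action.yml` (the path and action name are invented for illustration and are not part of this PR):

```yaml
# .github/actions/setup-amd-ci/action.yml (hypothetical composite action, not in this PR)
name: setup-amd-ci
description: Start the ROCm sglang container and install test dependencies
runs:
  using: composite
  steps:
    - name: Setup docker
      shell: bash
      run: |
        # Ensure GPU isolation if the pod is part of a kubernetes setup with DEVICE_FLAG.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
        docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
          -v "$GITHUB_WORKSPACE":/sglang-checkout --ipc=host --group-add video \
          --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
          -w /sglang-checkout --name ci_sglang \
          ghcr.io/saienduri/sglang-aiter-v0.1.1:428
    - name: Install dependencies
      shell: bash
      run: |
        docker exec ci_sglang pip install --upgrade pip
        docker exec ci_sglang pip uninstall sgl-kernel -y || true
        docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
        docker exec ci_sglang pip install -e "python[dev_hip]"
        docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
        docker exec -w /human-eval ci_sglang pip install -e .
```

Each job would then reduce to `- uses: actions/checkout@v4`, `- uses: ./.github/actions/setup-amd-ci`, and its own benchmark steps. Note that composite-action steps must declare `shell:` explicitly, and `HF_TOKEN` would still need to be supplied through the job's `env`.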
30 changes: 24 additions & 6 deletions test/srt/test_bench_serving.py
@@ -29,7 +29,10 @@ def test_offline_throughput_default(self):
f"### test_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3800)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3800)

def test_offline_throughput_non_stream_small_batch_size(self):
res = run_bench_serving(
@@ -64,7 +67,10 @@ def test_offline_throughput_without_radix_cache(self):
f"### test_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3800)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3800)

def test_offline_throughput_without_chunked_prefill(self):
res = run_bench_serving(
@@ -99,7 +105,10 @@ def test_offline_throughput_with_triton_attention_backend(self):
f"### test_offline_throughput_with_triton_attention_backend\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3700)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3700)

def test_offline_throughput_default_fp8(self):
res = run_bench_serving(
@@ -114,7 +123,10 @@ def test_offline_throughput_default_fp8(self):
f"### test_offline_throughput_default_fp8\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 4300)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 4000)
else:
self.assertGreater(res["output_throughput"], 4300)

def test_online_latency_default(self):
res = run_bench_serving(
@@ -130,7 +142,10 @@ def test_online_latency_default(self):
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
)
self.assertLess(res["median_e2e_latency_ms"], 11000)
self.assertLess(res["median_ttft_ms"], 86)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertLess(res["median_ttft_ms"], 115)
else:
self.assertLess(res["median_ttft_ms"], 86)
self.assertLess(res["median_itl_ms"], 10)

def test_online_latency_eagle(self):
@@ -165,7 +180,10 @@ def test_online_latency_eagle(self):
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length: {res["accept_length"]:.2f} \n'
)
self.assertLess(res["median_e2e_latency_ms"], 900)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertLess(res["median_e2e_latency_ms"], 1450)
else:
self.assertLess(res["median_e2e_latency_ms"], 900)
self.assertGreater(res["accept_length"], 3.0)

def test_moe_offline_throughput_default(self):
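With this change the `SGLANG_AMD_CI` environment check appears in six separate assertions across the file (the visible hunks do not show a matching `import os`, which the module presumably already has). A small helper could keep each NVIDIA/AMD threshold pair in one place; this is a sketch of a possible follow-up refactor, not what the PR does:

```python
import os


def perf_threshold(default: float, amd: float) -> float:
    """Pick the benchmark bound for the current CI platform.

    SGLANG_AMD_CI=1 is set by the AMD (MI300) workflow jobs, so the looser
    AMD bound applies there; otherwise the original (NVIDIA) bound is used.
    """
    return amd if os.getenv("SGLANG_AMD_CI") == "1" else default


# Example usage inside TestBenchServing.test_offline_throughput_default:
#     self.assertGreater(res["output_throughput"], perf_threshold(3800, 3500))
```

Keeping both numbers in a single call would make it harder for the two platforms' bounds to drift apart unnoticed.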