diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 52499eeaab7..75f2fcbd28f 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -290,12 +290,46 @@ jobs:
         run: |
           bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
+      - name: Run CustomAllReduce test
+        timeout-minutes: 10
+        run: |
+          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+
+  unit-test-sgl-kernel-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
+
   finish:
     if: always()
     needs: [
       accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
       accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
-      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd
+      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd,
+      unit-test-sgl-kernel-amd
     ]
     runs-on: ubuntu-latest
     steps:
diff --git a/scripts/amd_ci_install_dependency.sh b/scripts/amd_ci_install_dependency.sh
index 24ab5b77a8c..3c8061351b3 100755
--- a/scripts/amd_ci_install_dependency.sh
+++ b/scripts/amd_ci_install_dependency.sh
@@ -19,3 +19,4 @@ mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpubli
 docker cp ./dummy-grok ci_sglang:/
 
 docker exec ci_sglang pip install huggingface_hub[hf_xet]
+docker exec ci_sglang pip install pytest
diff --git a/test/srt/test_custom_allreduce.py b/test/srt/test_custom_allreduce.py
index 38600aeabd8..462ac578e0e 100644
--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -56,22 +56,30 @@ def multi_process_parallel(
 
 
 class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512B...32MB
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
     @classmethod
     def setUpClass(cls):
-        random.seed(42)
-        # 512B to 32MB
-        cls.test_sizes = [512, 4096, 32768, 262144, 2097152, 16777216, 33554432]
-        cls.world_sizes = [2, 4, 6, 8]
-        cls.test_loop = 10
+        random.seed(42)  # keep the deterministic seed
 
     def test_graph_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.graph_allreduce)
 
     def test_eager_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.eager_allreduce)
@@ -102,9 +110,9 @@ def graph_allreduce(self, world_size, rank, distributed_init_port):
         torch.cuda.synchronize()
         del data
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     with graph_capture() as graph_capture_context:
                         # use integers so result matches NCCL exactly
                         inp1 = torch.randint(
@@ -151,9 +159,9 @@ def eager_allreduce(self, world_size, rank, distributed_init_port):
         initialize_model_parallel(tensor_model_parallel_size=world_size)
         group = get_tensor_model_parallel_group().device_group
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     inp1 = torch.randint(
                         1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                     )
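
A minimal local-reproduction sketch, assuming the ci_sglang container has already been started via scripts/amd_ci_start_container.sh and provisioned via scripts/amd_ci_install_dependency.sh (which now installs pytest); the commands mirror the new CI steps above:

    # sgl-kernel unit tests, as run by the new unit-test-sgl-kernel-amd job
    docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
    docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
    docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py

    # CustomAllReduce test across all eight GPUs, as run by the 8-GPU backend job
    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce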