Turn off autotune for scaled mm for fp8 dynamic quant in torchao #2176

Workflow file for this run

.github/workflows/pr-test.yml at 9f72cf9

	# PR Test workflow: runs the unit-test, performance-test, and accuracy-test
	# jobs defined below, followed by a final job that depends on all of them.
	name: PR Test

	# Triggers: pushes and pull requests against main that touch
	# python/sglang/** or test/**, plus manual dispatch.
	on:
	  push:
	    branches: [ main ]
	    paths:
	      - "python/sglang/**"
	      - "test/**"
	  pull_request:
	    branches: [ main ]
	    paths:
	      - "python/sglang/**"
	      - "test/**"
	  workflow_dispatch:

	# One concurrency group per ref; a newer run in the same group cancels the
	# run that is still in progress.
	concurrency:
	  group: pr-test-${{ github.ref }}
	  cancel-in-progress: true

	# Every test job uses the same guard: it runs when the repository is
	# sgl-project/sglang or when the triggering event is a pull request.
	jobs:
	  # Runs the "minimal" suite from test/lang.
	  unit-test-frontend:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Run test
	        timeout-minutes: 10
	        run: |
	          cd test/lang
	          python3 run_suite.py --suite minimal

	  # Backend parts 1-4 each run the "minimal" suite from test/srt with a
	  # different --range-begin/--range-end pair (0-5, 5-14, 14-21, 21-).
	  # NOTE(review): the exact slice semantics are defined by run_suite.py,
	  # which is not visible here; confirm the ranges cover the whole suite.
	  unit-test-backend-part-1:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Run test
	        timeout-minutes: 25
	        run: |
	          cd test/srt
	          python3 run_suite.py --suite minimal --range-begin 0 --range-end 5

	  unit-test-backend-part-2:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Run test
	        timeout-minutes: 25
	        run: |
	          cd test/srt
	          python3 run_suite.py --suite minimal --range-begin 5 --range-end 14

	  unit-test-backend-part-3:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Run test
	        timeout-minutes: 25
	        run: |
	          cd test/srt
	          python3 run_suite.py --suite minimal --range-begin 14 --range-end 21

	  # Last backend slice: only --range-begin is passed, no --range-end.
	  unit-test-backend-part-4:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Run test
	        timeout-minutes: 25
	        run: |
	          cd test/srt
	          python3 run_suite.py --suite minimal --range-begin 21

	  # Benchmarks run as individual unittest cases from test/srt; each step has
	  # its own 10-minute timeout.
	  performance-test-1-gpu-part-1:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Benchmark single latency
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_latency.TestBenchLatency.test_default

	      - name: Benchmark online latency
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

	      - name: Benchmark offline throughput
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

	      - name: Benchmark offline throughput (Non-streaming, small batch size)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

	  # Second group of offline-throughput benchmarks on the 1-gpu-runner.
	  performance-test-1-gpu-part-2:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Benchmark offline throughput (w/o RadixAttention)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

	      - name: Benchmark offline throughput (w/ Triton)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

	      - name: Benchmark offline throughput (w/ FP8)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

	  # MoE benchmarks (step names mark them TP=2) on the 2-gpu-runner.
	  performance-test-2-gpu:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 2-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	      - name: Benchmark offline throughput (TP=2)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

	      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

	      - name: Benchmark single latency (TP=2)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default

	  # Accuracy evaluation on the 1-gpu-runner. The install step also clones
	  # merrymercy/human-eval and installs it in editable mode.
	  accuracy-test-1-gpu:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 1-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	          git clone https://github.com/merrymercy/human-eval.git
	          cd human-eval
	          pip install -e .

	      - name: Evaluate accuracy
	        timeout-minutes: 20
	        run: |
	          cd test/srt
	          python3 test_eval_accuracy_large.py

	  # Accuracy evaluation on the 2-gpu-runner: MoE, MLA / DP-attention, and
	  # data-parallelism test scripts, with the same human-eval install step.
	  accuracy-test-2-gpu:
	    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
	    runs-on: 2-gpu-runner
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v3

	      - name: Install dependencies
	        run: |
	          bash scripts/ci_install_dependency.sh

	          git clone https://github.com/merrymercy/human-eval.git
	          cd human-eval
	          pip install -e .

	      - name: Evaluate accuracy (TP=2)
	        timeout-minutes: 20
	        run: |
	          cd test/srt
	          python3 test_moe_eval_accuracy_large.py

	      - name: Evaluate MLA accuracy (TP=2)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 test_mla.py
	          python3 test_mla_fp8.py
	          python3 test_dp_attention.py

	      - name: Evaluate data parallelism accuracy (DP=2)
	        timeout-minutes: 10
	        run: |
	          cd test/srt
	          python3 test_data_parallelism.py

	  # Final job: lists every test job in `needs`, so it only starts after all
	  # of them; its single step just prints a message.
	  finish:
	    needs:
	      - unit-test-frontend
	      - unit-test-backend-part-1
	      - unit-test-backend-part-2
	      - unit-test-backend-part-3
	      - unit-test-backend-part-4
	      - performance-test-1-gpu-part-1
	      - performance-test-1-gpu-part-2
	      - performance-test-2-gpu
	      - accuracy-test-1-gpu
	      - accuracy-test-2-gpu
	    runs-on: ubuntu-latest
	    steps:
	      - name: Finish
	        run: echo "This is an empty step to ensure that all jobs are completed."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Turn off autotune for scaled mm for fp8 dynamic quant in torchao #2176

Workflow file

Turn off autotune for scaled mm for fp8 dynamic quant in torchao #2176

Jobs

Run details

Workflow file for this run