Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .codespellrc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[codespell]
ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS
ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, ather
skip = *.json,*.jsonl,*.patch,*.txt
28 changes: 26 additions & 2 deletions .github/workflows/nightly-test-amd-rocm720.yml
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2
# 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2
nightly-8-gpu-qwen35-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35-rocm720,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -653,6 +653,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test ROCm 7.2 (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU GLM-5 (Accuracy) ROCm 7.2
nightly-8-gpu-glm5-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5-rocm720,'))
Expand Down Expand Up @@ -1219,7 +1231,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2
# MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined) ROCm 7.2
nightly-8-gpu-mi35x-qwen35-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35-rocm720,'))
runs-on: linux-mi35x-gpu-8
Expand Down Expand Up @@ -1252,6 +1264,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test MI35x ROCm 7.2 (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-mi35x-glm5-rocm720:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5-rocm720,'))
runs-on: linux-mi35x-gpu-8
Expand Down
28 changes: 26 additions & 2 deletions .github/workflows/nightly-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# 8-GPU Qwen 3.5 (Accuracy)
# 8-GPU Qwen 3.5 (Accuracy + Performance combined)
nightly-8-gpu-qwen35:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-qwen35,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -656,6 +656,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-glm5:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-glm5,'))
runs-on: linux-mi325-8gpu-sglang
Expand Down Expand Up @@ -1224,7 +1236,7 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

# MI35x 8-GPU Qwen 3.5 (Accuracy)
# MI35x 8-GPU Qwen 3.5 (Accuracy + Performance combined)
nightly-8-gpu-mi35x-qwen35:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-qwen35,'))
runs-on: linux-mi35x-gpu-8
Expand Down Expand Up @@ -1257,6 +1269,18 @@ jobs:
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

- name: Performance Test MI35x (8-GPU Qwen 3.5 FP8)
timeout-minutes: 120
continue-on-error: true
run: |
> github_summary.md # Clear summary file
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
-e SGLANG_USE_AITER=1 \
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-qwen35-fp8 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
exit ${TEST_EXIT_CODE:-0}

nightly-8-gpu-mi35x-glm5:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (!(inputs.job_filter || inputs.job_select) || (inputs.job_filter || inputs.job_select) == 'all' || contains(format(',{0},', inputs.job_filter || inputs.job_select), ',nightly-8-gpu-mi35x-glm5,'))
runs-on: linux-mi35x-gpu-8
Expand Down
2 changes: 2 additions & 0 deletions benchmark/asr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ This benchmark evaluates the performance and accuracy (Word Error Rate - WER) of

- `openai/whisper-large-v3`
- `openai/whisper-large-v3-turbo`
- `Qwen/Qwen3-ASR-1.7B`
- `Qwen/Qwen3-ASR-0.6B`

## Setup

Expand Down
4 changes: 4 additions & 0 deletions benchmark/kernels/fused_moe_triton/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def get_model_config(
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = getattr(config, "moe_latent_size", None) or hidden_size
elif architecture == "Gemma4ForConditionalGeneration":
E = config.num_experts // ep_size
topk = config.top_k_experts
intermediate_size = config.moe_intermediate_size
else:
# Default: Mixtral
E = config.num_local_experts // ep_size
Expand Down
151 changes: 151 additions & 0 deletions benchmark/mmlu/bench_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
Usage:
python3 bench_hf.py --model-path meta-llama/Llama-2-7b-hf --data-dir data --ntrain 5
"""

import argparse
import json
import os
import time

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

choices = ["A", "B", "C", "D"]


def format_subject(subject):
    """Turn an underscore-separated subject id into space-prefixed words.

    Every underscore-delimited piece is emitted with one leading space, so
    the result always starts with a space (e.g. "a_b" -> " a b").
    """
    return "".join(" " + word for word in subject.split("_"))


def format_example(df, idx, include_answer=True):
    """Render row ``idx`` of ``df`` as one multiple-choice prompt entry.

    Column 0 supplies the question text, the following ``df.shape[1] - 2``
    columns supply the options (labelled from the module-level ``choices``),
    and the last column supplies the answer. The answer and a trailing blank
    line are appended only when ``include_answer`` is true; otherwise the
    text ends right after "Answer:".
    """
    option_count = df.shape[1] - 2
    option_lines = "".join(
        f"\n{choices[pos]}. {df.iloc[idx, pos + 1]}" for pos in range(option_count)
    )
    text = df.iloc[idx, 0] + option_lines + "\nAnswer:"
    if not include_answer:
        return text
    return text + f" {df.iloc[idx, option_count + 1]}\n\n"


def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot prefix for ``subject`` from rows of ``train_df``.

    The prefix is an instruction header naming the subject followed by the
    first ``k`` rows of ``train_df`` formatted with their answers. Passing
    ``k == -1`` uses every row of ``train_df``.
    """
    shot_count = train_df.shape[0] if k == -1 else k
    header = "The following are multiple choice questions (with answers) about{}.\n\n".format(
        format_subject(subject)
    )
    return header + "".join(
        format_example(train_df, row) for row in range(shot_count)
    )


@torch.no_grad()
def main(args):
    """Run the MMLU benchmark against a Hugging Face causal LM.

    Loads the tokenizer and model named by ``args.model_path``, then for each
    of the first ``args.nsub`` subjects found under ``<data_dir>/test`` builds
    a few-shot prompt from ``<data_dir>/dev`` and greedily generates a single
    token per test question. The first non-whitespace character of the
    generated text is compared with the last column of the test row.

    Prints per-subject accuracy/latency and the overall totals; when
    ``args.output`` is set, appends one JSON line with the summary.

    Args:
        args: Namespace with ``model_path``, ``data_dir``, ``ntrain``,
            ``nsub`` and ``output`` attributes.

    Raises:
        ValueError: if no subject is selected (no ``*_test.csv`` files in
            the test directory, or ``nsub`` selects none).
    """
    print(f"Loading model: {args.model_path}")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    ).eval()

    test_dir = os.path.join(args.data_dir, "test")
    subjects = sorted(
        f.split("_test.csv")[0] for f in os.listdir(test_dir) if "_test.csv" in f
    )
    selected_subjects = subjects[: args.nsub]
    if not selected_subjects:
        # Without this guard the run only fails at the very end, inside
        # np.concatenate, with a message unrelated to the real cause.
        raise ValueError(
            f"No MMLU subjects selected: found {len(subjects)} '*_test.csv' "
            f"file(s) in {test_dir!r} with --nsub={args.nsub}"
        )

    # Few-shot prefixes longer than this many tokens get examples dropped.
    max_few_shot_tokens = 1536

    all_cors = []
    num_requests = 0
    total_latency = 0

    for subject in tqdm(selected_subjects):
        dev_df = pd.read_csv(
            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
        )[: args.ntrain]
        test_df = pd.read_csv(
            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
        )

        # Clamp to the rows actually available: gen_prompt indexes rows
        # 0..k-1, so an --ntrain larger than the dev set would otherwise
        # raise IndexError.
        k = min(args.ntrain, dev_df.shape[0])
        few_shot_examples = gen_prompt(dev_df, subject, k)
        while len(tokenizer.encode(few_shot_examples)) > max_few_shot_tokens:
            k -= 1
            if k < 0:
                break
            few_shot_examples = gen_prompt(dev_df, subject, k)

        preds = []
        labels = []
        tic = time.perf_counter()

        for i in range(test_df.shape[0]):
            prompt_end = format_example(test_df, i, include_answer=False)
            prompt = few_shot_examples + prompt_end

            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output_ids = model.generate(
                input_ids,
                max_new_tokens=1,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

            # Decode only the newly generated token(s), not the echoed prompt.
            output_str = tokenizer.decode(
                output_ids[0][input_ids.shape[-1] :], skip_special_tokens=True
            )
            answer = output_str.strip()
            preds.append(answer[0] if answer else "")
            labels.append(test_df.iloc[i, test_df.shape[1] - 1])

        latency = time.perf_counter() - tic
        total_latency += latency

        cors = [pred == label for pred, label in zip(preds, labels)]
        all_cors.append(cors)
        num_requests += len(test_df)

        print(
            f"Subject: {subject}, Accuracy: {np.mean(cors):.3f}, Latency: {latency:.3f}s"
        )

    # Mean over all questions, so subjects with more questions weigh more.
    weighted_acc = np.mean(np.concatenate(all_cors))
    print(f"Total Latency: {total_latency:.3f}s")
    print(f"Average Accuracy: {weighted_acc:.3f}")

    if args.output:
        # Append mode: each run adds one JSON line to the output file.
        with open(args.output, "a") as fout:
            value = {
                "task": "mmlu",
                "backend": "hf",
                "model": args.model_path,
                "latency": round(total_latency, 3),
                "accuracy": round(weighted_acc, 3),
                "num_requests": num_requests,
                "other": {
                    "nsub": args.nsub,
                    "ntrain": args.ntrain,
                },
            }
            fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, and run.
    parser = argparse.ArgumentParser()
    for flag, options in (
        ("--model-path", dict(type=str, required=True)),
        ("--ntrain", dict(type=int, default=5)),
        ("--data-dir", dict(type=str, default="data")),
        ("--nsub", dict(type=int, default=60)),
        ("--output", dict(type=str, help="Output file path")),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)
Loading
Loading