From 494b949eb048bdb3f8319f9c7fc149a40876ee50 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 04:28:43 +0000 Subject: [PATCH 1/5] upd mmlu test --- scripts/benchmarks/mmlu/bench.sh | 4 ++-- src/srtctl/backends/sglang.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/benchmarks/mmlu/bench.sh b/scripts/benchmarks/mmlu/bench.sh index fd15f982..0a92f624 100644 --- a/scripts/benchmarks/mmlu/bench.sh +++ b/scripts/benchmarks/mmlu/bench.sh @@ -13,8 +13,8 @@ n_prefill=$1 n_decode=$2 prefill_gpus=$3 decode_gpus=$4 -num_examples=${5:-198} # Default: 198 -max_tokens=${6:-512} # Default: 512 +num_examples=${5:-200} # Default: 200 +max_tokens=${6:-2048} # Default: 2048 repeat=${7:-8} # Default: 8 num_threads=${8:-512} # Default: 512 diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index 9013361e..b45425bd 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -262,6 +262,12 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None) concurrency_str = str(concurrencies) parsable_config = f"{isl} {osl} {concurrency_str} {req_rate}" + elif bench_type == "mmlu": + num_examples = benchmark_config.get("num_examples", 200) + max_tokens = benchmark_config.get("max_tokens", 2048) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 512) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" # Config directory should point to where deepep_config.json lives # This is typically the configs/ directory in the yaml-config repo From 09313f8860dac5c1198fe109857364e2728dadbb Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 07:15:31 +0000 Subject: [PATCH 2/5] upd mmlu bench --- src/srtctl/core/schema.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 66939855..5e24a82b 100644 --- a/src/srtctl/core/schema.py +++ 
b/src/srtctl/core/schema.py @@ -165,6 +165,12 @@ class BenchmarkConfig(BaseModel): ) req_rate: Optional[str] = Field("inf", description="Request rate") + # Accuracy benchmark arguments + num_examples: Optional[int] = Field(None, description="Number of examples") + max_tokens: Optional[int] = Field(None, description="Maximum output tokens") + repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark") + num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark") + class ProfilingType(str, Enum): """Supported profiling types.""" From d1abf66b9cc0d49c18bb69f4f363c956d412bd35 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 07:37:23 +0000 Subject: [PATCH 3/5] upd gpqa and longbench --- scripts/benchmarks/gpqa/bench.sh | 6 +++--- src/srtctl/backends/sglang.py | 13 +++++++++++++ src/srtctl/core/schema.py | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/gpqa/bench.sh b/scripts/benchmarks/gpqa/bench.sh index 70c5a86b..b1b33fb3 100755 --- a/scripts/benchmarks/gpqa/bench.sh +++ b/scripts/benchmarks/gpqa/bench.sh @@ -14,10 +14,10 @@ n_decode=$2 prefill_gpus=$3 decode_gpus=$4 num_examples=${5:-198} # Default: 198 -max_tokens=${6:-512} # Default: 512 +max_tokens=${6:-32768} # Default: 32768 repeat=${7:-8} # Default: 8 -num_threads=${8:-512} # Default: 512 -thinking_mode=${9:-deepseek-r1} # Default: deepseek-r1 +num_threads=${8:-128} # Default: 128 +# Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}" diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index b45425bd..5538c439 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -268,6 +268,19 @@ def generate_slurm_script(self, config_path: Path 
= None, timestamp: str = None) repeat = benchmark_config.get("repeat", 8) num_threads = benchmark_config.get("num_threads", 512) parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "gpqa": + num_examples = benchmark_config.get("num_examples", 198) + max_tokens = benchmark_config.get("max_tokens", 32768) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 128) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "longbenchv2": + num_examples = benchmark_config.get("num_examples", None) + max_tokens = benchmark_config.get("max_tokens", 16384) + max_context_length = benchmark_config.get("max_context_length", 128000) + num_threads = benchmark_config.get("num_threads", 16) + categories = benchmark_config.get("categories", None) + parsable_config = f"{num_examples} {max_tokens} {max_context_length} {num_threads} {categories}" # Config directory should point to where deepep_config.json lives # This is typically the configs/ directory in the yaml-config repo diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 5e24a82b..66b97cbe 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -170,6 +170,8 @@ class BenchmarkConfig(BaseModel): max_tokens: Optional[int] = Field(None, description="Maximum output tokens") repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark") num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark") + max_context_length: Optional[int] = Field(None, description="Maximum context length for LongBench-v2 accuracy benchmark") + categories: Optional[list[str]] = Field(None, description="Comma-separated list of categories to evaluate for LongBench-v2 (None for all)") class ProfilingType(str, Enum): From 5a096ea1065d975bd1fbec982ff41bb57115ccd9 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 
07:57:56 +0000 Subject: [PATCH 4/5] upd docs --- docs/accuracy.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 docs/accuracy.md diff --git a/docs/accuracy.md b/docs/accuracy.md new file mode 100644 index 00000000..91989698 --- /dev/null +++ b/docs/accuracy.md @@ -0,0 +1,57 @@ +# Accuracy Benchmark + +In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`. + +**Note that the `context-length` argument in the config yaml needs to be larger than the `max_tokens` argument of the accuracy benchmark.** + +  +## MMLU + +For the MMLU dataset, the benchmark section in the yaml file can be modified in the following way: +```yaml +benchmark: + type: "mmlu" + num_examples: 200 # Number of examples to run + max_tokens: 2048 # Max number of output tokens + repeat: 8 # Number of repetitions + num_threads: 512 # Number of parallel threads for running benchmark +``` + +Then launch the script as usual: +```bash +srtctl apply -f config.yaml +``` + +After benchmarking finishes, the `benchmark.out` will contain the accuracy results: +``` +==================== +Repeat: 8, mean: 0.812 +Scores: ['0.790', '0.820', '0.800', '0.820', '0.820', '0.790', '0.820', '0.840'] +==================== +Writing report to /tmp/mmlu_deepseek-ai_DeepSeek-R1.html +{'other': np.float64(0.9), 'other:std': np.float64(0.30000000000000004), 'score:std': np.float64(0.36660605559646725), 'stem': np.float64(0.8095238095238095), 'stem:std': np.float64(0.392676726249301), 'humanities': np.float64(0.7428571428571429), 'humanities:std': np.float64(0.4370588154508102), 'social_sciences': np.float64(0.9583333333333334), 'social_sciences:std': np.float64(0.19982631347136331), 'score': np.float64(0.84)} +Writing results to /tmp/mmlu_deepseek-ai_DeepSeek-R1.json +Total latency: 465.618 s +Score: 0.840 +Results saved to: 
/logs/accuracy/mmlu_deepseek-ai_DeepSeek-R1.json +MMLU evaluation complete +``` + + +## GPQA +For the GPQA dataset, the benchmark section in the yaml file can be modified in the following way: +```yaml +benchmark: + type: "gpqa" + num_examples: 198 # Number of examples to run + max_tokens: 65536 # We need a larger output token number for GPQA + repeat: 8 # Number of repetitions + num_threads: 128 # Number of parallel threads for running benchmark +``` +The `context-length` argument here should be set to a value larger than `max_tokens`. + + +## LongBench-V2 +To be updated + + From 9c085be05baf8a7d47131cca2613afb78f714415 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 08:26:50 +0000 Subject: [PATCH 5/5] fix --- scripts/benchmarks/gpqa/bench.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/gpqa/bench.sh b/scripts/benchmarks/gpqa/bench.sh index b1b33fb3..6c0da42a 100755 --- a/scripts/benchmarks/gpqa/bench.sh +++ b/scripts/benchmarks/gpqa/bench.sh @@ -19,7 +19,7 @@ repeat=${7:-8} # Default: 8 num_threads=${8:-128} # Default: 128 # Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs -echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}" +echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}" # Source utilities for wait_for_model source /scripts/utils/benchmark_utils.sh @@ -49,8 +49,7 @@ python3 -m sglang.test.run_eval \ --num-examples ${num_examples} \ --max-tokens ${max_tokens} \ --repeat ${repeat} \ - --num-threads ${num_threads} \ - --thinking-mode ${thinking_mode} + --num-threads ${num_threads} # Copy the result file from /tmp to our logs directory # The result file is named gpqa_{model_name}.json