diff --git a/docs/accuracy.md b/docs/accuracy.md new file mode 100644 index 00000000..91989698 --- /dev/null +++ b/docs/accuracy.md @@ -0,0 +1,57 @@ +# Accuracy Benchmark + +In srt-slurm, users can run different accuracy benchmarks by setting the `benchmark` section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa`, and `longbenchv2`. + +**Note that the `context-length` argument in the config yaml needs to be larger than the `max_tokens` argument of the accuracy benchmark.** + + +## MMLU + +For the MMLU dataset, the `benchmark` section in the yaml file can be configured as follows: +```yaml +benchmark: + type: "mmlu" + num_examples: 200 # Number of examples to run + max_tokens: 2048 # Max number of output tokens + repeat: 8 # Number of repetitions + num_threads: 512 # Number of parallel threads for running benchmark +``` + +Then launch the script as usual: +```bash +srtctl apply -f config.yaml +``` + +After the benchmark finishes, `benchmark.out` will contain the accuracy results: +``` +==================== +Repeat: 8, mean: 0.812 +Scores: ['0.790', '0.820', '0.800', '0.820', '0.820', '0.790', '0.820', '0.840'] +==================== +Writing report to /tmp/mmlu_deepseek-ai_DeepSeek-R1.html +{'other': np.float64(0.9), 'other:std': np.float64(0.30000000000000004), 'score:std': np.float64(0.36660605559646725), 'stem': np.float64(0.8095238095238095), 'stem:std': np.float64(0.392676726249301), 'humanities': np.float64(0.7428571428571429), 'humanities:std': np.float64(0.4370588154508102), 'social_sciences': np.float64(0.9583333333333334), 'social_sciences:std': np.float64(0.19982631347136331), 'score': np.float64(0.84)} +Writing results to /tmp/mmlu_deepseek-ai_DeepSeek-R1.json +Total latency: 465.618 s +Score: 0.840 +Results saved to: /logs/accuracy/mmlu_deepseek-ai_DeepSeek-R1.json +MMLU evaluation complete +``` + + +## GPQA +For the GPQA dataset, the `benchmark` section in the yaml file can be configured as follows: +```yaml +benchmark: + type: "gpqa" + 
num_examples: 198 # Number of examples to run + max_tokens: 65536 # We need a larger output token number for GPQA + repeat: 8 # Number of repetitions + num_threads: 128 # Number of parallel threads for running benchmark +``` +The `context-length` argument here should be set to a value larger than `max_tokens`. + + +## LongBench-V2 +To be updated + + diff --git a/scripts/benchmarks/gpqa/bench.sh b/scripts/benchmarks/gpqa/bench.sh index 70c5a86b..6c0da42a 100755 --- a/scripts/benchmarks/gpqa/bench.sh +++ b/scripts/benchmarks/gpqa/bench.sh @@ -14,12 +14,12 @@ n_decode=$2 prefill_gpus=$3 decode_gpus=$4 num_examples=${5:-198} # Default: 198 -max_tokens=${6:-512} # Default: 512 +max_tokens=${6:-32768} # Default: 32768 repeat=${7:-8} # Default: 8 -num_threads=${8:-512} # Default: 512 -thinking_mode=${9:-deepseek-r1} # Default: deepseek-r1 +num_threads=${8:-128} # Default: 128 +# Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs -echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}" +echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}" # Source utilities for wait_for_model source /scripts/utils/benchmark_utils.sh @@ -49,8 +49,7 @@ python3 -m sglang.test.run_eval \ --num-examples ${num_examples} \ --max-tokens ${max_tokens} \ --repeat ${repeat} \ - --num-threads ${num_threads} \ - --thinking-mode ${thinking_mode} + --num-threads ${num_threads} # Copy the result file from /tmp to our logs directory # The result file is named gpqa_{model_name}.json diff --git a/scripts/benchmarks/mmlu/bench.sh b/scripts/benchmarks/mmlu/bench.sh index fd15f982..0a92f624 100644 --- a/scripts/benchmarks/mmlu/bench.sh +++ b/scripts/benchmarks/mmlu/bench.sh @@ -13,8 +13,8 @@ n_prefill=$1 n_decode=$2 prefill_gpus=$3 decode_gpus=$4 -num_examples=${5:-198} # Default: 198 
-max_tokens=${6:-512} # Default: 512 +num_examples=${5:-200} # Default: 200 +max_tokens=${6:-2048} # Default: 2048 repeat=${7:-8} # Default: 8 num_threads=${8:-512} # Default: 512 diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index 9013361e..5538c439 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -262,6 +262,25 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None) concurrency_str = str(concurrencies) parsable_config = f"{isl} {osl} {concurrency_str} {req_rate}" + elif bench_type == "mmlu": + num_examples = benchmark_config.get("num_examples", 200) + max_tokens = benchmark_config.get("max_tokens", 2048) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 512) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "gpqa": + num_examples = benchmark_config.get("num_examples", 198) + max_tokens = benchmark_config.get("max_tokens", 32768) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 128) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "longbenchv2": + num_examples = benchmark_config.get("num_examples", None) + max_tokens = benchmark_config.get("max_tokens", 16384) + max_context_length = benchmark_config.get("max_context_length", 128000) + num_threads = benchmark_config.get("num_threads", 16) + categories = benchmark_config.get("categories", None) + parsable_config = f"{num_examples} {max_tokens} {max_context_length} {num_threads} {categories}" # Config directory should point to where deepep_config.json lives # This is typically the configs/ directory in the yaml-config repo diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 66939855..66b97cbe 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -165,6 +165,14 @@ class BenchmarkConfig(BaseModel): ) 
req_rate: Optional[str] = Field("inf", description="Request rate") + # Accuracy benchmark arguments + num_examples: Optional[int] = Field(None, description="Number of examples") + max_tokens: Optional[int] = Field(None, description="Maximum output tokens") + repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark") + num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark") + max_context_length: Optional[int] = Field(None, description="Maximum context length for LongBench-v2 accuracy benchmark") + categories: Optional[list[str]] = Field(None, description="Comma-separated list of categories to evaluate for LongBench-v2 (None for all)") + class ProfilingType(str, Enum): """Supported profiling types."""