This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
57 changes: 57 additions & 0 deletions docs/accuracy.md
@@ -0,0 +1,57 @@
# Accuracy Benchmark

In srt-slurm, users can run different accuracy benchmarks by setting the `benchmark` section in the config YAML file. Supported benchmarks are `mmlu`, `gpqa`, and `longbenchv2`.

**Note that the `context-length` argument in the config YAML must be larger than the `max_tokens` argument of the accuracy benchmark.**
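For instance, the two settings might be paired in one config as sketched below; the exact location of the `context-length` key depends on your config layout, so treat this excerpt as an assumption, not the definitive schema:

```yaml
# Hypothetical excerpt: context-length must exceed the benchmark's max_tokens
server:
  context-length: 8192   # assumed key location; adjust to your config schema

benchmark:
  type: "mmlu"
  max_tokens: 2048       # must stay below context-length
```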


## MMLU

For the MMLU dataset, the `benchmark` section in the YAML file can be modified in the following way:
```yaml
benchmark:
  type: "mmlu"
  num_examples: 200   # Number of examples to run
  max_tokens: 2048    # Max number of output tokens
  repeat: 8           # Number of repetitions
  num_threads: 512    # Number of parallel threads for running the benchmark
```

Then launch the script as usual:
```bash
srtctl apply -f config.yaml
```

After the benchmark finishes, `benchmark.out` will contain the accuracy results:
```
====================
Repeat: 8, mean: 0.812
Scores: ['0.790', '0.820', '0.800', '0.820', '0.820', '0.790', '0.820', '0.840']
====================
Writing report to /tmp/mmlu_deepseek-ai_DeepSeek-R1.html
{'other': np.float64(0.9), 'other:std': np.float64(0.30000000000000004), 'score:std': np.float64(0.36660605559646725), 'stem': np.float64(0.8095238095238095), 'stem:std': np.float64(0.392676726249301), 'humanities': np.float64(0.7428571428571429), 'humanities:std': np.float64(0.4370588154508102), 'social_sciences': np.float64(0.9583333333333334), 'social_sciences:std': np.float64(0.19982631347136331), 'score': np.float64(0.84)}
Writing results to /tmp/mmlu_deepseek-ai_DeepSeek-R1.json
Total latency: 465.618 s
Score: 0.840
Results saved to: /logs/accuracy/mmlu_deepseek-ai_DeepSeek-R1.json
MMLU evaluation complete
```
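The `mean` in the summary above is a plain average over the per-repeat scores. As a sketch, it can be reproduced from the sample scores printed above:

```python
# Per-repeat MMLU scores, copied from the sample benchmark.out above
scores = [0.790, 0.820, 0.800, 0.820, 0.820, 0.790, 0.820, 0.840]

mean = sum(scores) / len(scores)
# Population standard deviation across repeats, for a sense of run-to-run spread
std = (sum((s - mean) ** 2 for s in scores) / len(scores)) ** 0.5

print(f"Repeat: {len(scores)}, mean: {mean:.3f}")  # prints: Repeat: 8, mean: 0.812
```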


## GPQA
For the GPQA dataset, the `benchmark` section in the YAML file can be modified in the following way:
```yaml
benchmark:
  type: "gpqa"
  num_examples: 198   # Number of examples to run
  max_tokens: 65536   # GPQA needs a larger output token budget
  repeat: 8           # Number of repetitions
  num_threads: 128    # Number of parallel threads for running the benchmark
```
As noted above, the `context-length` argument must be set to a value larger than `max_tokens` (65536 here).


## LongBench-V2
To be updated
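This section is not yet written. As a provisional sketch based on the defaults this PR adds in `src/srtctl/backends/sglang.py` and the new `BenchmarkConfig` fields, the section might eventually describe a config like the following (treat the values as assumptions until the section is finalized):

```yaml
benchmark:
  type: "longbenchv2"
  max_tokens: 16384          # Backend default output token budget
  max_context_length: 128000 # Backend default maximum context length
  num_threads: 16            # Backend default number of parallel threads
  # num_examples and categories default to null (all examples / all categories)
```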


11 changes: 5 additions & 6 deletions scripts/benchmarks/gpqa/bench.sh
@@ -14,12 +14,12 @@ n_decode=$2
prefill_gpus=$3
decode_gpus=$4
num_examples=${5:-198} # Default: 198
max_tokens=${6:-512} # Default: 512
max_tokens=${6:-32768} # Default: 32768
⚠️ Potential issue | 🟡 Minor

Add comments explaining the rationale for default parameter values.

The --thinking-mode removal is properly documented, but the script lacks explanatory comments for the new defaults:

  • max_tokens=32768 — clarify why this value was chosen for DeepSeek-R1 reasoning on GPQA
  • num_threads=128 — clarify the impact on sglang request parallelization

These values are reasonable for GPQA evaluation with a reasoning model, but documenting the rationale helps future maintainers understand the performance/quality trade-offs.


repeat=${7:-8} # Default: 8
num_threads=${8:-512} # Default: 512
thinking_mode=${9:-deepseek-r1} # Default: deepseek-r1
num_threads=${8:-128} # Default: 128
# Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs
coderabbitai[bot] marked this conversation as resolved.

echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}"
echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}"

# Source utilities for wait_for_model
source /scripts/utils/benchmark_utils.sh
@@ -49,8 +49,7 @@ python3 -m sglang.test.run_eval \
--num-examples ${num_examples} \
--max-tokens ${max_tokens} \
--repeat ${repeat} \
--num-threads ${num_threads} \
--thinking-mode ${thinking_mode}
--num-threads ${num_threads}

# Copy the result file from /tmp to our logs directory
# The result file is named gpqa_{model_name}.json
4 changes: 2 additions & 2 deletions scripts/benchmarks/mmlu/bench.sh
@@ -13,8 +13,8 @@ n_prefill=$1
n_decode=$2
prefill_gpus=$3
decode_gpus=$4
num_examples=${5:-198} # Default: 198
max_tokens=${6:-512} # Default: 512
num_examples=${5:-200} # Default: 200
max_tokens=${6:-2048} # Default: 2048
repeat=${7:-8} # Default: 8
num_threads=${8:-512} # Default: 512

19 changes: 19 additions & 0 deletions src/srtctl/backends/sglang.py
@@ -262,6 +262,25 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None)
concurrency_str = str(concurrencies)

parsable_config = f"{isl} {osl} {concurrency_str} {req_rate}"
elif bench_type == "mmlu":
num_examples = benchmark_config.get("num_examples", 200)
max_tokens = benchmark_config.get("max_tokens", 2048)
repeat = benchmark_config.get("repeat", 8)
num_threads = benchmark_config.get("num_threads", 512)
parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}"
elif bench_type == "gpqa":
num_examples = benchmark_config.get("num_examples", 198)
max_tokens = benchmark_config.get("max_tokens", 32768)
repeat = benchmark_config.get("repeat", 8)
num_threads = benchmark_config.get("num_threads", 128)
parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}"
elif bench_type == "longbenchv2":
num_examples = benchmark_config.get("num_examples", None)
max_tokens = benchmark_config.get("max_tokens", 16384)
max_context_length = benchmark_config.get("max_context_length", 128000)
num_threads = benchmark_config.get("num_threads", 16)
categories = benchmark_config.get("categories", None)
parsable_config = f"{num_examples} {max_tokens} {max_context_length} {num_threads} {categories}"

# Config directory should point to where deepep_config.json lives
# This is typically the configs/ directory in the yaml-config repo
8 changes: 8 additions & 0 deletions src/srtctl/core/schema.py
@@ -165,6 +165,14 @@ class BenchmarkConfig(BaseModel):
)
req_rate: Optional[str] = Field("inf", description="Request rate")

# Accuracy benchmark arguments
num_examples: Optional[int] = Field(None, description="Number of examples")
max_tokens: Optional[int] = Field(None, description="Maximum output tokens")
repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark")
num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark")
max_context_length: Optional[int] = Field(None, description="Maximum context length for LongBench-v2 accuracy benchmark")
categories: Optional[list[str]] = Field(None, description="List of categories to evaluate for LongBench-v2 (None for all)")


class ProfilingType(str, Enum):
"""Supported profiling types."""