From 494b949eb048bdb3f8319f9c7fc149a40876ee50 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 04:28:43 +0000 Subject: [PATCH 1/5] upd mmlu test --- scripts/benchmarks/mmlu/bench.sh | 4 ++-- src/srtctl/backends/sglang.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/benchmarks/mmlu/bench.sh b/scripts/benchmarks/mmlu/bench.sh index fd15f982..0a92f624 100644 --- a/scripts/benchmarks/mmlu/bench.sh +++ b/scripts/benchmarks/mmlu/bench.sh @@ -13,8 +13,8 @@ n_prefill=$1 n_decode=$2 prefill_gpus=$3 decode_gpus=$4 -num_examples=${5:-198} # Default: 198 -max_tokens=${6:-512} # Default: 512 +num_examples=${5:-200} # Default: 200 +max_tokens=${6:-2048} # Default: 2048 repeat=${7:-8} # Default: 8 num_threads=${8:-512} # Default: 512 diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index 9013361e..b45425bd 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -262,6 +262,12 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None) concurrency_str = str(concurrencies) parsable_config = f"{isl} {osl} {concurrency_str} {req_rate}" + elif bench_type == "mmlu": + num_examples = benchmark_config.get("num_examples", 200) + max_tokens = benchmark_config.get("max_tokens", 2048) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 512) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" # Config directory should point to where deepep_config.json lives # This is typically the configs/ directory in the yaml-config repo From 09313f8860dac5c1198fe109857364e2728dadbb Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 07:15:31 +0000 Subject: [PATCH 2/5] upd mmlu bench --- src/srtctl/core/schema.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 66939855..5e24a82b 100644 --- a/src/srtctl/core/schema.py +++ 
b/src/srtctl/core/schema.py @@ -165,6 +165,12 @@ class BenchmarkConfig(BaseModel): ) req_rate: Optional[str] = Field("inf", description="Request rate") + # Accuracy benchmark arguments + num_examples: Optional[int] = Field(None, description="Number of examples") + max_tokens: Optional[int] = Field(None, description="Maximum output tokens") + repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark") + num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark") + class ProfilingType(str, Enum): """Supported profiling types.""" From d1abf66b9cc0d49c18bb69f4f363c956d412bd35 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 07:37:23 +0000 Subject: [PATCH 3/5] upd gpqa and longbench --- scripts/benchmarks/gpqa/bench.sh | 6 +++--- src/srtctl/backends/sglang.py | 13 +++++++++++++ src/srtctl/core/schema.py | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/gpqa/bench.sh b/scripts/benchmarks/gpqa/bench.sh index 70c5a86b..b1b33fb3 100755 --- a/scripts/benchmarks/gpqa/bench.sh +++ b/scripts/benchmarks/gpqa/bench.sh @@ -14,10 +14,10 @@ n_decode=$2 prefill_gpus=$3 decode_gpus=$4 num_examples=${5:-198} # Default: 198 -max_tokens=${6:-512} # Default: 512 +max_tokens=${6:-32768} # Default: 32768 repeat=${7:-8} # Default: 8 -num_threads=${8:-512} # Default: 512 -thinking_mode=${9:-deepseek-r1} # Default: deepseek-r1 +num_threads=${8:-128} # Default: 128 +# Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}" diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py index b45425bd..5538c439 100644 --- a/src/srtctl/backends/sglang.py +++ b/src/srtctl/backends/sglang.py @@ -268,6 +268,19 @@ def generate_slurm_script(self, config_path: Path 
= None, timestamp: str = None) repeat = benchmark_config.get("repeat", 8) num_threads = benchmark_config.get("num_threads", 512) parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "gpqa": + num_examples = benchmark_config.get("num_examples", 198) + max_tokens = benchmark_config.get("max_tokens", 32768) + repeat = benchmark_config.get("repeat", 8) + num_threads = benchmark_config.get("num_threads", 128) + parsable_config = f"{num_examples} {max_tokens} {repeat} {num_threads}" + elif bench_type == "longbenchv2": + num_examples = benchmark_config.get("num_examples", None) + max_tokens = benchmark_config.get("max_tokens", 16384) + max_context_length = benchmark_config.get("max_context_length", 128000) + num_threads = benchmark_config.get("num_threads", 16) + categories = benchmark_config.get("categories", None) + parsable_config = f"{num_examples} {max_tokens} {max_context_length} {num_threads} {categories}" # Config directory should point to where deepep_config.json lives # This is typically the configs/ directory in the yaml-config repo diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index 5e24a82b..66b97cbe 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -170,6 +170,8 @@ class BenchmarkConfig(BaseModel): max_tokens: Optional[int] = Field(None, description="Maximum output tokens") repeat: Optional[int] = Field(None, description="Number of times to repeat the benchmark") num_threads: Optional[int] = Field(None, description="Number of running threads for accuracy benchmark") + max_context_length: Optional[int] = Field(None, description="Maximum context length for LongBench-v2 accuracy benchmark") + categories: Optional[list[str]] = Field(None, description="Comma-separated list of categories to evaluate for LongBench-v2 (None for all)") class ProfilingType(str, Enum): From 5a096ea1065d975bd1fbec982ff41bb57115ccd9 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 
07:57:56 +0000 Subject: [PATCH 4/5] upd docs --- docs/accuracy.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 docs/accuracy.md diff --git a/docs/accuracy.md b/docs/accuracy.md new file mode 100644 index 00000000..91989698 --- /dev/null +++ b/docs/accuracy.md @@ -0,0 +1,57 @@ +# Accuracy Benchmark + +In srt-slurm, users can run different accuracy benchmarks by setting the benchmark section in the config yaml file. Supported benchmarks include `mmlu`, `gpqa` and `longbenchv2`. + +**Note that the `context-length` argument in the config yaml needs to be larger than the `max_tokens` argument of the accuracy benchmark.** + +  +## MMLU + +For the MMLU dataset, the benchmark section in the yaml file can be modified in the following way: +```yaml +benchmark: + type: "mmlu" + num_examples: 200 # Number of examples to run + max_tokens: 2048 # Max number of output tokens + repeat: 8 # Number of repetitions + num_threads: 512 # Number of parallel threads for running benchmark +``` + +Then launch the script as usual: +```bash +srtctl apply -f config.yaml +``` + +After benchmarking finishes, the `benchmark.out` will contain the accuracy results: +``` +==================== +Repeat: 8, mean: 0.812 +Scores: ['0.790', '0.820', '0.800', '0.820', '0.820', '0.790', '0.820', '0.840'] +==================== +Writing report to /tmp/mmlu_deepseek-ai_DeepSeek-R1.html +{'other': np.float64(0.9), 'other:std': np.float64(0.30000000000000004), 'score:std': np.float64(0.36660605559646725), 'stem': np.float64(0.8095238095238095), 'stem:std': np.float64(0.392676726249301), 'humanities': np.float64(0.7428571428571429), 'humanities:std': np.float64(0.4370588154508102), 'social_sciences': np.float64(0.9583333333333334), 'social_sciences:std': np.float64(0.19982631347136331), 'score': np.float64(0.84)} +Writing results to /tmp/mmlu_deepseek-ai_DeepSeek-R1.json +Total latency: 465.618 s +Score: 0.840 +Results saved to: 
/logs/accuracy/mmlu_deepseek-ai_DeepSeek-R1.json +MMLU evaluation complete +``` + + +## GPQA +For the GPQA dataset, the benchmark section in the yaml file can be modified in the following way: +```yaml +benchmark: + type: "gpqa" + num_examples: 198 # Number of examples to run + max_tokens: 65536 # We need a larger output token number for GPQA + repeat: 8 # Number of repetitions + num_threads: 128 # Number of parallel threads for running benchmark +``` +The `context-length` argument here should be set to a value larger than `max_tokens`. + + +## LongBench-V2 +To be updated + + From 9c085be05baf8a7d47131cca2613afb78f714415 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 19 Dec 2025 08:26:50 +0000 Subject: [PATCH 5/5] fix --- scripts/benchmarks/gpqa/bench.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/gpqa/bench.sh b/scripts/benchmarks/gpqa/bench.sh index b1b33fb3..6c0da42a 100755 --- a/scripts/benchmarks/gpqa/bench.sh +++ b/scripts/benchmarks/gpqa/bench.sh @@ -19,7 +19,7 @@ repeat=${7:-8} # Default: 8 num_threads=${8:-128} # Default: 128 # Note: --thinking-mode removed because dynamo frontend doesn't support chat_template_kwargs -echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}; thinking-mode=${thinking_mode}" +echo "GPQA Benchmark Config: num_examples=${num_examples}; max_tokens=${max_tokens}; repeat=${repeat}; num_threads=${num_threads}" # Source utilities for wait_for_model source /scripts/utils/benchmark_utils.sh @@ -49,8 +49,7 @@ python3 -m sglang.test.run_eval \ --num-examples ${num_examples} \ --max-tokens ${max_tokens} \ --repeat ${repeat} \ - --num-threads ${num_threads} \ - --thinking-mode ${thinking_mode} + --num-threads ${num_threads} # Copy the result file from /tmp to our logs directory # The result file is named gpqa_{model_name}.json