ishandhanani · YAMY1234 · Mar 5, 2026 · Feb 27, 2026
diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py
@@ -82,7 +82,7 @@ def build_command(
         # Tokenizer path: HF model ID or container mount path
         tokenizer_path = str(runtime.model_path) if runtime.is_hf_model else "/model"
 
-        return [
+        cmd = [
             "bash",
             self.script_path,
             endpoint,
@@ -96,4 +96,6 @@ def build_command(
             str(total_gpus),
             str(prefill_gpus),
             str(decode_gpus),
+            str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8",
         ]
+        return cmd
diff --git a/src/srtctl/benchmarks/scripts/gpqa/bench.sh b/src/srtctl/benchmarks/scripts/gpqa/bench.sh
@@ -13,9 +13,14 @@ MAX_TOKENS=${3:-32768}
 REPEAT=${4:-8}
 NUM_THREADS=${5:-128}
 
-MODEL_NAME="deepseek-ai/DeepSeek-R1"
+# Auto-detect model name from /v1/models (first listed id, probe capped at 10s); fall back to default
+MODEL_NAME=$(curl -sf --max-time 10 "${ENDPOINT}/v1/models" 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['id'])" 2>/dev/null || echo "")
+if [ -z "${MODEL_NAME}" ]; then
+    MODEL_NAME="deepseek-ai/DeepSeek-R1"
+    echo "Warning: Could not auto-detect model name, using default: ${MODEL_NAME}"
+fi
 
-echo "GPQA Config: endpoint=${ENDPOINT}; num_examples=${NUM_EXAMPLES}; max_tokens=${MAX_TOKENS}; repeat=${REPEAT}; num_threads=${NUM_THREADS}"
+echo "GPQA Config: endpoint=${ENDPOINT}; model=${MODEL_NAME}; num_examples=${NUM_EXAMPLES}; max_tokens=${MAX_TOKENS}; repeat=${REPEAT}; num_threads=${NUM_THREADS}"
 
 # Create results directory
 result_dir="/logs/accuracy"

diff --git a/src/srtctl/cli/mixins/postprocess_stage.py b/src/srtctl/cli/mixins/postprocess_stage.py
@@ -194,7 +194,7 @@ def _extract_benchmark_results(self) -> dict[str, Any] | None:
         # Fallback to raw output for legacy/failed rollups
         benchmark_out = self.runtime.log_dir / "benchmark.out"
         if benchmark_out.exists():
-            return {"benchmark_type": "unknown", "raw_output": benchmark_out.read_text()}
+            return {"benchmark_type": "unknown", "raw_output": benchmark_out.read_text(errors="replace")}
 
         return None
 

diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
@@ -534,6 +534,7 @@ class BenchmarkConfig:
     mooncake_workload: str | None = None  # "mooncake", "conversation", "synthetic", "toolagent"
     ttft_threshold_ms: int | None = None  # Goodput TTFT threshold in ms (default: 2000)
     itl_threshold_ms: int | None = None  # Goodput ITL threshold in ms (default: 25)
+    random_range_ratio: float | None = None  # Random input/output length range ratio (default: 0.8)
 
     def get_concurrency_list(self) -> list[int]:
         if self.concurrencies is None: