From 022ee9fcbccc8ead14c8f56d2f3db4bc3830ca91 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Thu, 4 Jun 2026 19:39:17 +0200 Subject: [PATCH] scripts(dflash): switch default bench target to Q8_0 + --target flag Per Markus 2026-06-04: DFlash quality measurement should use a Q8_0 target rather than Q4_K_M, since Q4_K_M introduces enough target-side quantization noise to confound DFlash's own accept-rate signal. Q8_0 fits in 38 GB total, well within titan A100 80 GB. * Default `TARGET` is now `gemma-4-31B-it-Q8_0.gguf`. Override via `--target PATH` or `DFLASH_BENCH_TARGET` env var. * Also added `DFLASH_BENCH_DRAFTER_DIR` env var for consistency. * Comment block documents VRAM math for Q4_K_M / Q8_0 / BF16 targets so future runs can pick the right card. --- scripts/bench-dflash.sh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/scripts/bench-dflash.sh b/scripts/bench-dflash.sh index 27efd8e5ef7..f0bc345d824 100755 --- a/scripts/bench-dflash.sh +++ b/scripts/bench-dflash.sh @@ -6,11 +6,19 @@ # pair runs N times so variance is visible (DFlash bench has ±2-3pp # run-to-run variance even at temp=0 / fixed seed). # -# VRAM requirement: ~22 GB free (target Q4_K_M ~18 GB + drafter ~1-3 GB + -# compute). Coordinate centurion-llm scale-down before running. +# VRAM requirement (target + ~1-3 GB drafter + compute): +# - Q4_K_M target ~18 GB → ~22 GB total (fits on a single 24 GB card) +# - Q8_0 target ~33 GB → ~38 GB total (titan A100 80 GB only) +# - BF16 target ~62 GB → ~67 GB total (titan A100 80 GB only) +# Coordinate centurion-llm scale-down before running on shared hardware. # # Usage: -# scripts/bench-dflash.sh [--quants Q4,Q6,Q8,BF16] [--runs 3] [--ctx 4096] +# scripts/bench-dflash.sh [--target PATH] [--quants Q4,Q6,Q8,BF16] [--runs 3] [--ctx 4096] +# +# Default target is gemma-4-31B-it-Q8_0.gguf — the higher-quality reference +# preferred for DFlash quality measurement (Markus 2026-06-04). 
For VRAM- +# constrained local runs, override with --target gemma-4-31B-it-Q4_K_M.gguf +# (or set DFLASH_BENCH_TARGET in the env). # # Output goes to /tmp/dflash-bench-<timestamp>.md with a markdown summary # table at the bottom. @@ -25,8 +33,8 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" BIN="$ROOT/build-cuda/bin/llama-speculative-simple" -TARGET="$ROOT/models/gemma-4-31B-it-Q4_K_M.gguf" -DRAFTER_DIR="$ROOT/models/dflash-gemma4-31b-gguf" +TARGET="${DFLASH_BENCH_TARGET:-$ROOT/models/gemma-4-31B-it-Q8_0.gguf}" +DRAFTER_DIR="${DFLASH_BENCH_DRAFTER_DIR:-$ROOT/models/dflash-gemma4-31b-gguf}" TS=$(date +%Y%m%d-%H%M%S) OUT="/tmp/dflash-bench-$TS.md" @@ -36,6 +44,7 @@ CTX=4096 while (( $# )); do case "$1" in + --target) TARGET="$2"; shift 2 ;; --quants) QUANTS="$2"; shift 2 ;; --runs) RUNS="$2"; shift 2 ;; --ctx) CTX="$2"; shift 2 ;;