diff --git a/tools/hygiene/check-tick-history-shard-schema.sh b/tools/hygiene/check-tick-history-shard-schema.sh index 82f20c34..ab1553ae 100755 --- a/tools/hygiene/check-tick-history-shard-schema.sh +++ b/tools/hygiene/check-tick-history-shard-schema.sh @@ -18,36 +18,46 @@ # requires schema uniformity on main; this check is the # mechanism that preserves it. # -# What this checks: -# 1. Shard file exists at the canonical path -# docs/hygiene-history/ticks/YYYY/MM/DD/<HHMMZ>.md -# (or the extended HHMMSSZ-<hash>.md form per the schema's -# high-concurrency option). +# Usage: +# tools/hygiene/check-tick-history-shard-schema.sh +# — full-tree audit; scans every shard under +# docs/hygiene-history/ticks/. Default mode used by +# manual runs and full-tree audits. +# tools/hygiene/check-tick-history-shard-schema.sh --files PATH... +# — restricted audit; scans only the listed shard files. +# Shape that pre-push hooks and per-PR CI jobs want, so +# they can run only on changed shards instead of failing +# on the 5 known-stale shards documented below. Each +# path must be a real file ending in .md under the shard +# directory; non-shard paths are silently skipped (so the +# caller can pass a broader file list, e.g. all changed +# files in a PR diff). +# +# What this checks (per shard): +# 1. Filename matches HHMMZ.md or HHMMSSZ-<hash>.md per the +# schema in docs/hygiene-history/ticks/README.md. # 2. First non-empty line is a 6-column markdown table row -# starting with `| YYYY-MM-DDTHH:MM:SSZ |` — exactly the ISO -# timestamp, no parenthetical, no extra prose, no leading -# whitespace beyond the standard `| `. -# 3. The timestamp inside col1 matches the filename's `HHMMZ` -# — i.e. a shard at `2026/04/30/2304Z.md` must carry a col1 -# timestamp of `2026-04-30T23:04:??Z` (any second). +# starting with `| YYYY-MM-DDTHH:MM(:SS)?Z |` — exactly the +# ISO timestamp, no parenthetical, no extra prose, no +# leading whitespace beyond the standard `| `. 
Both the +# with-seconds and no-seconds forms are valid ISO-8601 UTC. +# 3. The col1 timestamp's date + HH:MM matches the filename's +# path date and HHMM. # -# What this does NOT do: -# - Does NOT validate body content (cols 4-6). The body is -# intentionally free-form prose. -# - Does NOT enforce that col2 = `` or col3 = -# `` strictly. The schema's lower columns -# have drifted in practice (col3 commonly carries a commit -# SHA instead of the cron sentinel); enforcing that would -# be its own clean-up effort. -# - Does NOT detect the prefab pattern (col1 timestamp -# significantly ahead of commit-author time). That requires -# git-log access which isn't available pre-push for the +# What this does NOT check: +# - Body content (cols 4-6) — intentionally free-form prose. +# - Strict col2/col3 enforcement — the lower columns have +# drifted in practice (col3 commonly carries a commit SHA +# instead of the cron sentinel); enforcing that would be +# its own clean-up effort. +# - The prefab pattern (col1 timestamp ≫ commit-author time) +# — requires git-log access not available pre-push for the # current commit. See # `memory/feedback_tick_history_prefabricated_shards_codex_finding_audit_trail_integrity_2026_04_30.md` # for the deferred check. # # Exit codes: -# 0 — all shards valid +# 0 — all checked shards valid # 1 — one or more violations found (details on stderr) # 2 — invocation error (script bug or missing inputs) # @@ -69,16 +79,25 @@ # `memory/feedback_tick_history_prefabricated_shards_codex_finding_audit_trail_integrity_2026_04_30.md` # — fixing col1 mechanically would launder the body-level # prefab claim. The check is therefore landed in DORMANT -# mode (not yet wired into CI); a future cleanup PR resolves -# the prefab-vs-schema decision before the check goes -# binding. +# mode (full-tree); the --files mode IS safe to wire into +# pre-push immediately because it only checks the caller's +# stated set, not the full tree. 
set -euo pipefail ROOT="${REPO_ROOT:-$(git rev-parse --show-toplevel 2>/dev/null || echo .)}" SHARD_DIR="$ROOT/docs/hygiene-history/ticks" -if [ ! -d "$SHARD_DIR" ]; then +# Argument parsing. +files_mode=0 +files=() +if [ $# -gt 0 ] && [ "$1" = "--files" ]; then + files_mode=1 + shift + files=("$@") +fi + +if [ "$files_mode" -eq 0 ] && [ ! -d "$SHARD_DIR" ]; then echo "error: $SHARD_DIR does not exist" >&2 exit 2 fi @@ -86,9 +105,17 @@ fi violations=0 total=0 -# Find every shard file (skip README.md and any schema/* docs). -while IFS= read -r -d '' shard; do +# Per-shard validator. Echoes VIOLATION lines on stderr and +# returns 0 if the shard is fine, 1 if a violation was found. +# Shellcheck note: this function uses early-return semantics +# instead of `continue` because it's invoked outside the find +# loop too (in --files mode). +scan_one() { + local shard="$1" total=$((total + 1)) + local base path_rel parts yyyy rest_a mm rest_b dd hhmm hh mm_of_hour + local first_line line ts ts_yyyy ts_mm ts_dd ts_hh ts_min + base="$(basename "$shard" .md)" path_rel="${shard#"$ROOT/"}" @@ -100,21 +127,26 @@ while IFS= read -r -d '' shard; do rest_b="${rest_a#*/}" dd="${rest_b%%/*}" - # Pull the HHMM from the filename (handle both HHMMZ and - # HHMMSSZ-<hash> forms). + # Filename HHMM extraction. Per docs/hygiene-history/ticks/README.md + # the accepted forms are: + # - `HHMMZ.md` — bare four-digit form (with optional disambiguator) + # - `HHMMSSZ-<hash>.md` — six-digit-with-hash form (the recommended + # high-concurrency form; the hash suffix is REQUIRED, not optional — + # bare `HHMMSSZ.md` would weaken the collision-avoidance rule the + # hash exists for) + # Codex P2 review on PR #977 caught the earlier optional-hash regex. 
if [[ "$base" =~ ^([0-9]{4})Z(-[0-9a-f]+)?$ ]]; then hhmm="${BASH_REMATCH[1]}" - elif [[ "$base" =~ ^([0-9]{4})([0-9]{2})Z(-[0-9a-f]+)?$ ]]; then + elif [[ "$base" =~ ^([0-9]{4})([0-9]{2})Z-[0-9a-f]+$ ]]; then hhmm="${BASH_REMATCH[1]}" else echo "VIOLATION: $path_rel — filename does not match HHMMZ.md or HHMMSSZ-.md schema" >&2 - violations=$((violations + 1)) - continue + return 1 fi hh="${hhmm:0:2}" mm_of_hour="${hhmm:2:2}" - # Read the first non-empty line. + # First non-empty line. first_line="" while IFS= read -r line; do if [ -n "${line// }" ]; then @@ -125,16 +157,29 @@ while IFS= read -r -d '' shard; do if [ -z "$first_line" ]; then echo "VIOLATION: $path_rel — file is empty or whitespace-only" >&2 - violations=$((violations + 1)) - continue + return 1 + fi + + # Schema rule: row must be a 6-column markdown table — col1 + # = ISO-8601 UTC timestamp, then 5 more columns (model id, + # cron sentinel, body, PR ref, observation) per + # docs/hygiene-history/ticks/README.md. Codex P2 review on + # PR #977 caught that the col1-only check accepted rows + # like `| | a |` with too few columns. The 6-column + # enforcement runs first; the col1 regex only fires if the + # column count is right. + pipe_count=$(awk -F'|' '{print NF-1}' <<< "$first_line") + # 6 columns => 7 pipes (one before col1, one between each pair, one + # after col6). Only this lower bound is enforced; rows with more pipes pass. + if [ "$pipe_count" -lt 7 ]; then + echo "VIOLATION: $path_rel — first row has $pipe_count pipe characters; schema requires 6 columns (7 pipes including the trailing one)" >&2 + echo " got: $(echo "$first_line" | head -c 120)" >&2 + return 1 fi - # Schema rule: first cell must be `| YYYY-MM-DDTHH:MM(:SS)?Z |` - # with no extra content before the next column boundary. Both - # the with-seconds form (`...T23:04:00Z`) and the no-seconds - # form (`...T23:04Z`) are valid ISO-8601 UTC; the schema - # in docs/hygiene-history/ticks/README.md does not pick a side. 
- # Capture the timestamp and verify it matches the path. + # Schema rule: col1 must be `| YYYY-MM-DDTHH:MM(:SS)?Z |` + # exactly, with no parenthetical or extra prose. Both ISO + # forms are valid UTC; the schema doesn't pick a side. if [[ "$first_line" =~ ^\|\ ([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z)\ \|\ ]]; then ts="${BASH_REMATCH[1]}" ts_yyyy="${ts:0:4}" @@ -145,25 +190,68 @@ while IFS= read -r -d '' shard; do if [ "$ts_yyyy" != "$yyyy" ] || [ "$ts_mm" != "$mm" ] || [ "$ts_dd" != "$dd" ]; then echo "VIOLATION: $path_rel — col1 timestamp $ts does not match path date $yyyy-$mm-$dd" >&2 - violations=$((violations + 1)) - continue + return 1 fi if [ "$ts_hh" != "$hh" ] || [ "$ts_min" != "$mm_of_hour" ]; then echo "VIOLATION: $path_rel — col1 timestamp ${ts_hh}:${ts_min} does not match filename ${hh}:${mm_of_hour}" >&2 - violations=$((violations + 1)) - continue + return 1 fi else - echo "VIOLATION: $path_rel — col1 must be exactly '| YYYY-MM-DDTHH:MM:SSZ | ...' (no parenthetical, no extra prose)" >&2 + echo "VIOLATION: $path_rel — col1 must be exactly '| YYYY-MM-DDTHH:MM(:SS)?Z | ...' (no parenthetical, no extra prose)" >&2 echo " got: $(echo "$first_line" | head -c 120)" >&2 - violations=$((violations + 1)) - continue + return 1 fi -done < <(find "$SHARD_DIR" -type f -name '*.md' \ - ! -name 'README.md' \ - -print0) + return 0 +} + +if [ "$files_mode" -eq 1 ]; then + # --files mode: scan only the listed paths. Skip non-shard + # paths silently so callers can pass a broader file list + # (e.g. all changed files from `git diff --name-only`). + for f in "${files[@]}"; do + case "$f" in + docs/hygiene-history/ticks/*/*.md) + # Resolve to absolute path so scan_one's $ROOT prefix + # stripping works. 
Bash case `*` matches `/` so this + # glob covers the YYYY/MM/DD/.md depth — verified + # via test on PR #977 (Copilot reported a P0 here based + # on misreading bash case glob semantics; closed form-2 + # because `*` in case patterns is greedy across `/`, + # confirmed by running the script against real shard + # paths like docs/hygiene-history/ticks/2026/04/30/2018Z.md + # which match correctly). + abs="$ROOT/$f" + # Per the script header's "silently skipped" contract, + # missing or non-shard paths emit no diagnostic. Codex + # P2 review on PR #977 caught the earlier "skipped (not + # a file)" stderr message that contradicted the contract. + if [ ! -f "$abs" ]; then + continue + fi + if [ "$(basename "$f")" = "README.md" ]; then + continue + fi + if ! scan_one "$abs"; then + violations=$((violations + 1)) + fi + ;; + *) + # Not a shard path; silently skip. + ;; + esac + done +else + # Default mode: full-tree audit. + while IFS= read -r -d '' shard; do + if ! scan_one "$shard"; then + violations=$((violations + 1)) + fi + done < <(find "$SHARD_DIR" -type f -name '*.md' \ + ! -name 'README.md' \ + -print0) +fi echo "checked $total shard files; $violations violations" >&2