Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 104 additions & 2 deletions tools/hygiene/sort-tick-history-canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,29 @@

import argparse
import re
import subprocess
import sys
from pathlib import Path


def repo_root() -> Path | None:
"""Resolve the repo root via `git rev-parse --show-toplevel`.

Mirrors sibling hygiene scripts so that running this tool from a
subdirectory still resolves the default `--file` correctly. Returns
None if not in a git repo (caller falls back to CWD)."""
try:
out = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
capture_output=True,
text=True,
check=True,
)
return Path(out.stdout.strip())
except (subprocess.CalledProcessError, FileNotFoundError):
return None


def find_separator_line(lines: list[str]) -> int | None:
"""Return the line index of the markdown table separator (last
occurrence — the file has a sample schema row earlier in prose)."""
Expand Down Expand Up @@ -95,17 +114,80 @@ def sort_canonical(text: str) -> tuple[str, dict]:
header = lines[: sep_idx + 1]
data = lines[sep_idx + 1 :]

# File-line offset for converting post-separator indices into
# 1-based file line numbers in error diagnostics. Reviewer P2:
# 0-based post-separator indices were confusing; the user wants
# to grep / open-at-line, not arithmetic in their head.
sep_file_line = sep_idx + 1 # 1-based line number of separator

data_rows: list[tuple[str, int, str]] = []
unmatched_table_rows: list[tuple[int, str]] = []
for original_index, line in enumerate(data):
if not line.strip():
continue
ts = get_timestamp(line)
if ts:
data_rows.append((ts, original_index, line))
Comment thread
AceHack marked this conversation as resolved.
elif line.lstrip().startswith("|"):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Detect unmatched rows without stripping indentation

`line.lstrip().startswith("|")` treats any indented line beginning with `|` as a malformed table row, so valid trailing markdown content (for example, an indented code block or a fenced-content line that starts with `|`) now triggers the unmatched-row ValueError and blocks sorting. This guard was introduced with the fail-fast change and makes the tool reject files it should preserve. The unmatched-row check should match only actual table-row prefixes (e.g., unindented rows or markdown-valid table indentation) rather than stripping all leading whitespace.

Useful? React with 👍 / 👎.

# Looks like a table row but no timestamp matched —
# schema drift or malformed row. Refuse to silently drop;
# caller decides whether to fail or skip after seeing
# the count.
unmatched_table_rows.append((original_index, line))

rows_in = len(data_rows)
original_order = [line for _, _, line in data_rows]

# Trailing non-row content after the table — anything that isn't
# blank and isn't a table row must be preserved (Codex P2 finding:
# naive header+rows reconstruction would lose trailing prose).
# Find the index of the last table-shaped line (matched OR
# unmatched); everything after that index is trailing content.
trailing_lines: list[str] = []
table_indices = sorted(
[idx for _, idx, _ in data_rows]
+ [idx for idx, _ in unmatched_table_rows]
)
if table_indices:
last_table_idx = table_indices[-1]
# Trailing = lines AFTER the last table-row. Strip leading
# blank-line separator(s) so the reconstructed file gets a
# single canonical blank between table-end and prose-start.
trailing_candidate = data[last_table_idx + 1 :]
first_non_blank = next(
(i for i, line in enumerate(trailing_candidate) if line.strip()),
len(trailing_candidate),
)
if first_non_blank < len(trailing_candidate):
trailing_lines = trailing_candidate[first_non_blank:]

# P0 guard: if the data region has table-shaped lines but zero
# match the timestamp regex, the schema has drifted and the
# naive write-back would wipe the table. Refuse.
if rows_in == 0 and unmatched_table_rows:
first_orig = unmatched_table_rows[0][0]
raise ValueError(
f"schema drift: {len(unmatched_table_rows)} table-shaped row(s) "
f"found but ZERO matched the ISO-8601 timestamp regex. "
f"Refusing to write — would wipe tick-history. "
f"First unmatched row at file-line "
f"{sep_file_line + 1 + first_orig}: "
f"{unmatched_table_rows[0][1][:120]}"
)

# P1 guard: any unmatched table row is a discipline violation;
# silently dropping rows is exactly the failure mode this script
# is supposed to prevent. Surface and refuse.
if unmatched_table_rows:
first = unmatched_table_rows[0]
raise ValueError(
f"refusing to drop {len(unmatched_table_rows)} unmatched "
f"table row(s); per Otto-229 (append-only discipline) the "
f"sort tool must not silently lose rows. "
f"First unmatched at file-line "
f"{sep_file_line + 1 + first[0]}: {first[1][:120]}"
)
Comment thread
AceHack marked this conversation as resolved.

# Stable sort by (timestamp, original_index) so ties preserve
# input order. ISO-8601 strings sort lex == chronological.
data_rows.sort(key=lambda x: (x[0], x[1]))
Expand All @@ -118,7 +200,17 @@ def sort_canonical(text: str) -> tuple[str, dict]:
seen.add(line)
unique_rows.append(line)

new_text = "\n".join(header) + "\n" + "\n".join(unique_rows) + "\n"
# Reconstruct: header + sorted-rows + (blank-line separator +
# trailing prose if any). Without trailing-content preservation
# the naive header + rows reconstruction would silently drop any
# post-table prose paragraph (Codex P2 finding). The blank-line
# separator between table-end and trailing-content is required by
# CommonMark to terminate the table and start a new block.
parts = ["\n".join(header), "\n".join(unique_rows)]
if trailing_lines:
parts.append("") # explicit blank-line separator
parts.append("\n".join(trailing_lines))
new_text = "\n".join(parts) + "\n"
rows_out = len(unique_rows)
reordered = unique_rows != original_order[: len(unique_rows)] or rows_out != rows_in

Expand All @@ -127,6 +219,7 @@ def sort_canonical(text: str) -> tuple[str, dict]:
"rows_out": rows_out,
"duplicates_removed": rows_in - rows_out,
"reordered": reordered,
"trailing_lines_preserved": len(trailing_lines),
}


Expand All @@ -142,11 +235,20 @@ def main(argv: list[str] | None = None) -> int:
parser.add_argument(
"--file",
default="docs/hygiene-history/loop-tick-history.md",
help="Path to tick-history file (relative to repo root)",
help="Path to tick-history file (relative paths resolve to repo "
"root via 'git rev-parse --show-toplevel'; if not in a git "
"checkout, falls back to current working directory)",
)
args = parser.parse_args(argv)

# Resolve --file relative to repo root when it is a relative path,
# so the tool works the same whether invoked from repo root or any
# subdirectory. Sibling hygiene scripts share this convention.
p = Path(args.file)
if not p.is_absolute():
root = repo_root()
if root is not None:
p = root / args.file
if not p.exists():
print(f"ERROR: file not found: {p}", file=sys.stderr)
return 1
Expand Down
Loading