unslothai · danielhanchen · May 14, 2026 · May 14, 2026 · gemini-code-assist · May 14, 2026
@@ -0,0 +1,245 @@
+# Unsloth - 2x faster, 60% less VRAM LLM training and finetuning
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+
+"""Deterministic comment / docstring-only verifier.
+
+Compares a list of changed files between two git refs and reports whether
+each diff is strictly comments / docstrings (Python) or comments
+(YAML / GitHub Actions). Useful for gating a "comment trim" /
+"docstring refactor" PR against accidental code drift.
+
+Per .py file: parse both revs into AST, strip module / class / function
+docstrings, then compare ast.unparse output. Pure Python comments are
+discarded by the parser by construction, so any post-strip diff is real
+code. Per .yml / .yaml file: yaml.safe_load both sides and compare the
+parsed Python objects; if they differ, strip shell comments from every
+multi-line string (in practice ``run: |`` script bodies) and compare
+again. Exit code 0 = all OK, 1 = at least one file has a real
+(non-comment) diff or an error.
+
+Usage:
+    python scripts/verify_comment_only_diff.py [--base REF] [--head REF] path ...
+
+Defaults: --base origin/main, --head HEAD. Paths are repo-relative.
+
+Example:
+    git diff --name-only origin/main..HEAD \\
+      | xargs python scripts/verify_comment_only_diff.py --base origin/main
+"""
+from __future__ import annotations
+
+import argparse
+import ast
+import difflib
+import subprocess
+import sys
+from typing import Any
+
+import yaml
+
+
+def _git_show(rev: str, path: str) -> str:
+    """Return the contents of ``path`` as stored at git revision ``rev``.
+
+    Runs ``git show REV:PATH`` and decodes its stdout as UTF-8 instead of
+    the locale's preferred encoding, so the result does not depend on the
+    machine the check runs on. git's stderr is discarded.
+
+    Raises:
+        subprocess.CalledProcessError: git exited with a non-zero status.
+        UnicodeDecodeError: the blob is not valid UTF-8.
+    """
+    return subprocess.check_output(
+        ["git", "show", f"{rev}:{path}"],
+        text = True,
+        encoding = "utf-8",
+        stderr = subprocess.DEVNULL,
+    )
+
+
+def _strip_docstrings(tree: ast.AST) -> ast.AST:
+    """Drop the leading string-literal statement (the docstring) from every
+    Module / FunctionDef / AsyncFunctionDef / ClassDef in ``tree``.
+
+    The tree is modified in place and also returned. A body left empty by
+    the removal is replaced with a single ``pass`` so ast.unparse still
+    produces valid code."""
+    docstring_owners = (
+        ast.Module,
+        ast.FunctionDef,
+        ast.AsyncFunctionDef,
+        ast.ClassDef,
+    )
+    for node in ast.walk(tree):
+        if not isinstance(node, docstring_owners):
+            continue
+        stmts = getattr(node, "body", None)
+        if not stmts:
+            continue
+        head = stmts[0]
+        is_docstring = (
+            isinstance(head, ast.Expr)
+            and isinstance(head.value, ast.Constant)
+            and isinstance(head.value.value, str)
+        )
+        if not is_docstring:
+            continue
+        # Keep everything after the docstring; fall back to ``pass``.
+        node.body = stmts[1:] or [ast.Pass()]
+    return tree
+
+
+def _normalize_py(src: str) -> str:
+    """Return ``src`` re-rendered by ast.unparse with docstrings removed.
+
+    Parsing drops comments, so the result reflects code only. Raises
+    SyntaxError (from ast.parse) when ``src`` is not valid Python."""
+    return ast.unparse(_strip_docstrings(ast.parse(src)))
+
+
+def _strip_shell_comments(s: str) -> str:
+    """Strip pure-comment lines from a shell snippet, then collapse runs of
+    blank lines.
+
+    Only lines whose first non-blank character is ``#`` are removed.
+    Trailing inline comments are deliberately left alone: telling a real
+    comment from a ``#`` inside a quoted string or an expansion needs a
+    shell parser, and a wrong guess would erase code and hide a real
+    change from the check.
+
+    Args:
+        s: the script text (may be empty).
+
+    Returns:
+        The remaining lines joined with ``\\n``, with consecutive blank
+        lines reduced to one and leading / trailing whitespace stripped.
+    """
+    norm = []
+    prev_blank = False
+    for line in s.splitlines():
+        if line.lstrip().startswith("#"):
+            continue
+        if line.strip() == "":
+            # A removed comment line must not leave two blank lines behind,
+            # so blank-run tracking spans the skipped comment lines.
+            if prev_blank:
+                continue
+            prev_blank = True
+        else:
+            prev_blank = False
+        norm.append(line)
+    return "\n".join(norm).strip()
+
+
+def _normalize_yaml_run_strings(obj: Any) -> Any:
+    """Return a copy of the parsed YAML object in which every string that
+    contains a newline (e.g. a ``run: |`` script body) has been passed
+    through _strip_shell_comments. Dicts and lists are rebuilt
+    recursively; every other value is returned unchanged."""
+    if isinstance(obj, str):
+        # Single-line strings are left alone; only multi-line bodies are
+        # treated as scripts.
+        return _strip_shell_comments(obj) if "\n" in obj else obj
+    if isinstance(obj, list):
+        return [_normalize_yaml_run_strings(item) for item in obj]
+    if isinstance(obj, dict):
+        return {
+            key: _normalize_yaml_run_strings(value)
+            for key, value in obj.items()
+        }
+    return obj
+
+
+def _walk_yaml_diff(b: Any, a: Any, prefix: str = "") -> None:
+    """Print a path-keyed summary of every structural / scalar difference
+    between ``b`` (before) and ``a`` (after).
+
+    Args:
+        b: value parsed from the base revision.
+        a: value parsed from the head revision.
+        prefix: path of the current node; ``""`` denotes the root, which
+            is printed as ``/``.
+
+    A type mismatch is reported once and not descended into. For lists,
+    elements present on only one side are reported individually instead
+    of being silently dropped by pairing the two lists up.
+    """
+    if type(b) is not type(a):
+        print(
+            f"     type-diff at {prefix or '/'}: "
+            f"{type(b).__name__} -> {type(a).__name__}",
+        )
+        return
+    if isinstance(b, dict):
+        # Keys may have mixed types, so order them by their string form.
+        keys = sorted((set(b.keys()) | set(a.keys())), key = lambda x: str(x))
+        for k in keys:
+            if k not in b:
+                print(f"     added key {prefix}/{k}")
+            elif k not in a:
+                print(f"     removed key {prefix}/{k}")
+            else:
+                _walk_yaml_diff(b[k], a[k], f"{prefix}/{k}")
+    elif isinstance(b, list):
+        if len(b) != len(a):
+            print(f"     list len at {prefix or '/'}: {len(b)} -> {len(a)}")
+        for i, (bi, ai) in enumerate(zip(b, a)):
+            _walk_yaml_diff(bi, ai, f"{prefix}[{i}]")
+        # zip stops at the shorter list; report the unpaired tail.
+        common = min(len(b), len(a))
+        for i in range(common, len(a)):
+            print(f"     added element at {prefix}[{i}]")
+        for i in range(common, len(b)):
+            print(f"     removed element at {prefix}[{i}]")
+    elif b != a:
+        bs = repr(b)[:300]
+        as_ = repr(a)[:300]
+        print(f"     scalar at {prefix or '/'}:")
+        print(f"       before: {bs}")
+        print(f"       after:  {as_}")
+
+
+def _verify_python(path: str, before: str, after: str) -> bool:
+    """Check that two revisions of a Python file differ only in comments
+    and docstrings.
+
+    Both sources are normalised with _normalize_py and compared as text.
+    Prints an ``OK`` / ``FAIL`` line for ``path``; on a mismatch, also
+    prints at most the first 40 lines of a unified diff of the normalised
+    sources. Returns True when they match, False on a mismatch or when
+    either side fails to parse."""
+    try:
+        lhs, rhs = _normalize_py(before), _normalize_py(after)
+    except SyntaxError as exc:
+        print(f"FAIL {path}: SyntaxError parsing -- {exc}")
+        return False
+
+    if lhs != rhs:
+        print(f"FAIL {path}: AST differs after docstring strip:")
+        delta = difflib.unified_diff(
+            lhs.splitlines(),
+            rhs.splitlines(),
+            fromfile = f"{path}@before",
+            tofile = f"{path}@after",
+            n = 2,
+        )
+        for shown, line in enumerate(delta):
+            if shown == 40:
+                break
+            print(f"     {line}")
+        return False
+
+    print(f"OK   {path}  (AST identical after docstring strip)")
+    return True
+
+
+def _yaml_strict_equal(b: Any, a: Any) -> bool:
+    """Deep equality for parsed YAML values that also requires equal types.
+
+    Plain ``==`` treats ``True``, ``1`` and ``1.0`` as equal, so a change
+    such as ``true`` -> ``1`` would otherwise look identical. Dict keys
+    are matched with normal dict semantics; only values are type-checked.
+    """
+    if type(b) is not type(a):
+        return False
+    if isinstance(b, dict):
+        return b.keys() == a.keys() and all(
+            _yaml_strict_equal(v, a[k]) for k, v in b.items()
+        )
+    if isinstance(b, list):
+        return len(b) == len(a) and all(
+            _yaml_strict_equal(x, y) for x, y in zip(b, a)
+        )
+    return b == a
+
+
+def _verify_yaml(path: str, before: str, after: str) -> bool:
+    """Check that two revisions of a YAML file differ only in comments.
+
+    Both sides are parsed with yaml.safe_load (which drops YAML comments)
+    and compared type-strictly. If they differ, multi-line strings are
+    normalised with _normalize_yaml_run_strings and compared again.
+
+    Prints an ``OK`` / ``FAIL`` line for ``path``; on a mismatch, also
+    prints the differences found by _walk_yaml_diff. Returns True when the
+    documents match, False on a mismatch or a YAML parse error.
+    """
+    try:
+        raw_before = yaml.safe_load(before)
+        raw_after = yaml.safe_load(after)
+    except yaml.YAMLError as exc:
+        print(f"FAIL {path}: YAML parse error -- {exc}")
+        return False
+    if _yaml_strict_equal(raw_before, raw_after):
+        print(f"OK   {path}  (YAML parsed object identical)")
+        return True
+    norm_before = _normalize_yaml_run_strings(raw_before)
+    norm_after = _normalize_yaml_run_strings(raw_after)
+    if _yaml_strict_equal(norm_before, norm_after):
+        print(
+            f"OK   {path}  (YAML parsed object identical after "
+            f"stripping shell comments from run: bodies)",
+        )
+        return True
+    print(
+        f"FAIL {path}: YAML parsed objects still differ after stripping "
+        f"shell comments from `run:` bodies.",
+    )
+    _walk_yaml_diff(norm_before, norm_after)
+    return False
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point.
+
+    Args:
+        argv: argument list to parse; ``None`` makes argparse read
+            ``sys.argv``.
+
+    Returns:
+        0 when every checked path passes, 1 when at least one path has a
+        non-comment diff or could not be read at both refs.
+
+    Paths that are neither ``.py`` nor ``.yml`` / ``.yaml`` are reported
+    with a ``NOTE`` line and do not affect the return code.
+    """
+    parser = argparse.ArgumentParser(
+        description = "Verify each path's diff between BASE and HEAD is "
+        "strictly comments / docstrings.",
+    )
+    parser.add_argument("--base", default = "origin/main", help = "base git ref")
+    parser.add_argument("--head", default = "HEAD", help = "head git ref")
+    parser.add_argument("paths", nargs = "+", help = "repo-relative paths")
+    args = parser.parse_args(argv)
+
+    rc = 0
+    print(f"Comparing {len(args.paths)} files: {args.base} vs {args.head}\n")
+    for path in args.paths:
+        # Pick the verifier first so unsupported (possibly binary) files
+        # are never fetched or decoded.
+        if path.endswith(".py"):
+            verify = _verify_python
+        elif path.endswith((".yml", ".yaml")):
+            verify = _verify_yaml
+        else:
+            print(f"NOTE {path}: not .py or .yaml -- skipped automated check.")
+            continue
+
+        try:
+            before = _git_show(args.base, path)
+            after = _git_show(args.head, path)
+        except (subprocess.CalledProcessError, UnicodeDecodeError) as exc:
+            # A file missing at either ref (added / deleted / renamed) or
+            # an undecodable one is not a comment-only change: fail closed
+            # and keep checking the remaining paths.
+            print(f"FAIL {path}: cannot read at both refs -- {exc}")
+            rc = 1
+            continue
+
+        if not verify(path, before, after):
+            rc = 1
+
+    return rc
+
+
+# Script entry point: use main()'s return code (0 = all OK, 1 = failure)
+# as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())