diff --git a/scripts/verify_comment_only_diff.py b/scripts/verify_comment_only_diff.py new file mode 100644 index 0000000000..068d3244df --- /dev/null +++ b/scripts/verify_comment_only_diff.py @@ -0,0 +1,245 @@ +# Unsloth - 2x faster, 60% less VRAM LLM training and finetuning +# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. + +"""Deterministic comment / docstring-only verifier. + +Compares a list of changed files between two git refs and reports whether +each diff is strictly comments / docstrings (Python) or comments +(YAML / GitHub Actions). Useful for gating a "comment trim" / +"docstring refactor" PR against accidental code drift. + +Per .py file: parse both revs into AST, strip module / class / function +docstrings, then compare ast.unparse output. Pure Python comments are +discarded by the parser by construction, so any post-strip diff is real +code. Per .yml file: yaml.safe_load both sides and compare the parsed +Python object; if scalar values differ, also strip shell comments inside +``run: |`` block bodies before comparing. Exit code 0 = all OK, 1 = at +least one file has a real (non-comment) diff or an error. + +Usage: + python scripts/verify_comment_only_diff.py [--base REF] [--head REF] path ... + +Defaults: --base origin/main, --head HEAD. Paths are repo-relative. + +Example: + git diff --name-only origin/main..HEAD \\ + | xargs python scripts/verify_comment_only_diff.py --base origin/main +""" +from __future__ import annotations + +import argparse +import ast +import difflib +import subprocess +import sys +from typing import Any + +import yaml + + +def _git_show(rev: str, path: str) -> str: + return subprocess.check_output( + ["git", "show", f"{rev}:{path}"], text = True, stderr = subprocess.DEVNULL, + ) + + +def _strip_docstrings(tree: ast.AST) -> ast.AST: + """Remove every string-literal docstring (Module / FunctionDef / + AsyncFunctionDef / ClassDef). Empty body becomes ``pass`` so + ast.unparse stays valid.""" + for node in ast.walk(tree): + if isinstance( + node, + (ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef), + ): + body = getattr(node, "body", None) + if not body: + continue + first = body[0] + if ( + isinstance(first, ast.Expr) + and isinstance(first.value, ast.Constant) + and isinstance(first.value.value, str) + ): + node.body = body[1:] + if not node.body: + node.body = [ast.Pass()] + return tree + + +def _normalize_py(src: str) -> str: + tree = ast.parse(src) + tree = _strip_docstrings(tree) + return ast.unparse(tree) + + +def _strip_shell_comments(s: str) -> str: + """Strip pure-comment lines and inline trailing comments from a shell + snippet, then collapse runs of blank lines. Heuristic only: leaves a + line untouched if it has an odd quote count (open string).""" + out = [] + for line in s.splitlines(): + stripped = line.lstrip() + if stripped.startswith("#"): + continue + has_single = line.count("'") % 2 == 0 + has_double = line.count('"') % 2 == 0 + if has_single and has_double: + idx = line.find(" #") + if idx >= 0: + line = line[:idx].rstrip() + out.append(line) + norm = [] + prev_blank = False + for line in out: + if line.strip() == "": + if prev_blank: + continue + prev_blank = True + else: + prev_blank = False + norm.append(line) + return "\n".join(norm).strip() + + +def _normalize_yaml_run_strings(obj: Any) -> Any: + """Walk the parsed YAML object; for any multi-line string (i.e. a + ``run: |`` script body), strip shell comments. Returns a normalised + copy.""" + if isinstance(obj, dict): + return {k: _normalize_yaml_run_strings(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_normalize_yaml_run_strings(x) for x in obj] + if isinstance(obj, str) and "\n" in obj: + return _strip_shell_comments(obj) + return obj + + +def _walk_yaml_diff(b: Any, a: Any, prefix: str = "") -> None: + """Print a path-keyed summary of the first structural / scalar diff.""" + if type(b) is not type(a): + print( + f" type-diff at {prefix or '/'}: " + f"{type(b).__name__} -> {type(a).__name__}", + ) + return + if isinstance(b, dict): + keys = sorted((set(b.keys()) | set(a.keys())), key = lambda x: str(x)) + for k in keys: + if k not in b: + print(f" added key {prefix}/{k}") + elif k not in a: + print(f" removed key {prefix}/{k}") + else: + _walk_yaml_diff(b[k], a[k], f"{prefix}/{k}") + elif isinstance(b, list): + if len(b) != len(a): + print( + f" list len at {prefix or '/'}: " + f"{len(b)} -> {len(a)}", + ) + for i, (bi, ai) in enumerate(zip(b, a)): + _walk_yaml_diff(bi, ai, f"{prefix}[{i}]") + elif b != a: + bs = repr(b)[:300] + as_ = repr(a)[:300] + print(f" scalar at {prefix or '/'}:") + print(f" before: {bs}") + print(f" after: {as_}") + + +def _verify_python(path: str, before: str, after: str) -> bool: + try: + norm_before = _normalize_py(before) + norm_after = _normalize_py(after) + except SyntaxError as exc: + print(f"FAIL {path}: SyntaxError parsing -- {exc}") + return False + if norm_before == norm_after: + print(f"OK {path} (AST identical after docstring strip)") + return True + diff = list( + difflib.unified_diff( + norm_before.splitlines(), + norm_after.splitlines(), + fromfile = f"{path}@before", + tofile = f"{path}@after", + n = 2, + ) + ) + print(f"FAIL {path}: AST differs after docstring strip:") + for line in diff[:40]: + print(f" {line}") + return False + + +def _verify_yaml(path: str, before: str, after: str) -> bool: + try: + raw_before = yaml.safe_load(before) + raw_after = yaml.safe_load(after) + except yaml.YAMLError as exc: + print(f"FAIL {path}: YAML parse error -- {exc}") + return False + if raw_before == raw_after: + print(f"OK {path} (YAML parsed object identical)") + return True + norm_before = _normalize_yaml_run_strings(raw_before) + norm_after = _normalize_yaml_run_strings(raw_after) + if norm_before == norm_after: + print( + f"OK {path} (YAML parsed object identical after " + f"stripping shell comments from run: bodies)", + ) + return True + print( + f"FAIL {path}: YAML parsed objects still differ after stripping " + f"shell comments from `run:` bodies.", + ) + _walk_yaml_diff(norm_before, norm_after) + return False + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description = "Verify each path's diff between BASE and HEAD is " + "strictly comments / docstrings.", + ) + parser.add_argument("--base", default = "origin/main", help = "base git ref") + parser.add_argument("--head", default = "HEAD", help = "head git ref") + parser.add_argument("paths", nargs = "+", help = "repo-relative paths") + args = parser.parse_args(argv) + + rc = 0 + print(f"Comparing {len(args.paths)} files: {args.base} vs {args.head}\n") + for path in args.paths: + try: + before = _git_show(args.base, path) + after = _git_show(args.head, path) + except subprocess.CalledProcessError as exc: + print(f"SKIP {path}: {exc}") + continue + + if path.endswith(".py"): + if not _verify_python(path, before, after): + rc = 1 + elif path.endswith((".yml", ".yaml")): + if not _verify_yaml(path, before, after): + rc = 1 + else: + print(f"NOTE {path}: not .py or .yaml -- skipped automated check.") + + return rc + + +if __name__ == "__main__": + sys.exit(main())