diff --git a/.github/scripts/check-vendored-grammar-updates.py b/.github/scripts/check-vendored-grammar-updates.py
new file mode 100644
index 0000000000..0d91a1e3e2
--- /dev/null
+++ b/.github/scripts/check-vendored-grammar-updates.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""Monitor upstream updates for GitNexus's VENDORED tree-sitter grammars.
+
+The npm-dependency grammars (tree-sitter-c, -python, -rust, ...) are tracked by
+Dependabot — when upstream publishes a new version, a PR shows up. The VENDORED
+grammars (gitnexus/vendor/tree-sitter-*) are off the dependency graph: GitNexus
+ships its own snapshot, so nothing tells us when their upstream moves. This
+closes that blind spot — it gives the vendored grammars the same daily
+"you're behind upstream" visibility Dependabot gives the npm ones.
+
+Per upstream source:
+  - npm-published grammars (swift, kotlin): compare the vendored package.json
+    `version` against the npm registry's latest.
+  - GitHub-only grammars (dart, proto): compare the vendored snapshot against the
+    upstream default branch — by recorded commit (parsed from `_vendoredBy`) when
+    available, else by byte-comparing the vendored `src/parser.c`.
+
+Companion to check-tree-sitter-upgrade-readiness.py, which tracks a different
+thing (tree-sitter RUNTIME 0.25 upgrade readiness / ABI), not version drift.
+
+Outputs Markdown to stdout. Exit 0 = no vendored grammar is known to be behind
+(grammars that could not be checked are flagged in the report but do not fail
+the run). Exit 1 = at least one is behind (the workflow uses this to open/update
+a tracking issue).
+
+Stdlib only — runs on any vanilla runner.
+"""
+
+from __future__ import annotations
+
+import http.client
+import json
+import os
+import pathlib
+import re
+import urllib.error
+import urllib.parse
+import urllib.request
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parents[2]
+VENDOR_DIR = REPO_ROOT / "gitnexus" / "vendor"
+
+# Upstream source of truth per vendored grammar. Co-located here (mirrors the
+# GRAMMARS table in check-tree-sitter-upgrade-readiness.py) so a new vendored
+# grammar is monitored by adding one row. `source` selects the drift strategy.
+UPSTREAM: dict[str, dict[str, str]] = {
+    "tree-sitter-swift": {"source": "npm", "name": "tree-sitter-swift"},
+    "tree-sitter-kotlin": {"source": "npm", "name": "tree-sitter-kotlin"},
+    "tree-sitter-dart": {
+        "source": "github",
+        "repo": "UserNobody14/tree-sitter-dart",
+        "branch": "master",
+    },
+    "tree-sitter-proto": {
+        "source": "github",
+        "repo": "coder3101/tree-sitter-proto",
+        "branch": "main",
+    },
+}
+
+_GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
+
+# A full 40-hex-digit git commit id, standing alone as a word.
+_SHA_RE = re.compile(r"\b([0-9a-f]{40})\b", re.IGNORECASE)
+
+
+# ── Fetch helpers (stdlib only) ─────────────────────────────────────────────
+
+
+def _json_object(text: str | None) -> dict | None:
+    """Parse `text` as JSON and return it only when it is a JSON object.
+
+    Returns None for empty input, malformed JSON, or a non-object payload
+    (list, string, number), so callers can call `.get()` on the result safely.
+    """
+    if not text:
+        return None
+    try:
+        data = json.loads(text)
+    except json.JSONDecodeError:
+        return None
+    return data if isinstance(data, dict) else None
+
+
+def _fetch(url: str, *, accept: str = "application/json", timeout: int = 10) -> str | None:
+    """GET `url` and return the body decoded as UTF-8, or None on any failure.
+
+    The GitHub token (when set) is attached only for GitHub-owned hosts, so it
+    is never sent to the npm registry. Failures are deliberately swallowed:
+    callers turn None into an `unknown` row instead of aborting the report.
+    """
+    headers = {"Accept": accept, "User-Agent": "gitnexus-vendored-monitor"}
+    try:
+        host = urllib.parse.urlparse(url).hostname or ""
+    except ValueError:
+        host = ""
+    is_github = host == "api.github.com" or host.endswith(".githubusercontent.com")
+    if _GITHUB_TOKEN and is_github:
+        headers["Authorization"] = f"Bearer {_GITHUB_TOKEN}"
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.read().decode("utf-8", errors="ignore")
+    except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError):
+        # URLError/HTTPError: connect and HTTP-status failures. OSError and
+        # HTTPException: timeouts, resets and truncated bodies raised while
+        # reading. ValueError: malformed URL. An uncaught error here would exit
+        # non-zero, which the workflow would misread as "grammar behind".
+        return None
+
+
+def npm_latest(pkg: str) -> str | None:
+    """Return the `latest` dist-tag version of npm package `pkg`, or None."""
+    data = _json_object(_fetch(f"https://registry.npmjs.org/{urllib.parse.quote(pkg)}/latest"))
+    version = data.get("version") if data else None
+    return version if isinstance(version, str) and version else None
+
+
+def github_head_sha(repo: str, branch: str) -> str | None:
+    """Return the commit SHA at the tip of `branch` in GitHub `repo`, or None."""
+    data = _json_object(_fetch(f"https://api.github.com/repos/{repo}/commits/{branch}"))
+    sha = data.get("sha") if data else None
+    return sha if isinstance(sha, str) and sha else None
+
+
+def github_raw(repo: str, branch: str, path: str) -> str | None:
+    """Return the text of `path` at the tip of `branch` in `repo`, or None."""
+    return _fetch(
+        f"https://raw.githubusercontent.com/{repo}/{branch}/{path}",
+        accept="text/plain",
+    )
+
+
+# ── Version comparison ──────────────────────────────────────────────────────
+
+
+def semver_tuple(v: str) -> tuple[int, ...]:
+    """Return the first three numeric components of `v`, zero-padded to three.
+
+    Padding makes "1.2" and "1.2.0" compare equal instead of reporting a
+    phantom update. Returns () when `v` contains no digits. Pre-release and
+    build suffixes are not ordered: only the leading numbers are compared.
+    """
+    nums = [int(n) for n in re.findall(r"\d+", v or "")[:3]]
+    if not nums:
+        return ()
+    return tuple(nums + [0] * (3 - len(nums)))
+
+
+# ── Per-grammar drift ───────────────────────────────────────────────────────
+
+
+def _vendored_package(name: str) -> dict | None:
+    """Return the parsed package.json of vendored grammar `name`, or None."""
+    try:
+        text = (VENDOR_DIR / name / "package.json").read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        # Missing, unreadable, or not a regular file.
+        return None
+    return _json_object(text)
+
+
+def vendored_version(name: str) -> str | None:
+    """Return the non-empty `version` string of the vendored package, or None."""
+    pkg = _vendored_package(name)
+    version = pkg.get("version") if pkg else None
+    return version if isinstance(version, str) and version else None
+
+
+def vendored_commit(name: str) -> str | None:
+    """Return the upstream commit SHA recorded in `_vendoredBy`, or None."""
+    pkg = _vendored_package(name)
+    by = pkg.get("_vendoredBy", "") if pkg else ""
+    if not isinstance(by, str):
+        # Tolerate a structured `_vendoredBy` (object/list): scan its JSON form.
+        by = json.dumps(by)
+    m = _SHA_RE.search(by)
+    # Lower-case so the comparison with the GitHub API's SHA is case-blind.
+    return m.group(1).lower() if m else None
+
+
+def check_npm(name: str, cfg: dict) -> dict:
+    """Compare the vendored `version` of `name` with npm's latest release."""
+    current = vendored_version(name)
+    latest = npm_latest(cfg["name"])
+    if latest is None:
+        return {"name": name, "state": "unknown", "detail": "npm registry fetch failed"}
+    if current is None:
+        # Without a local version there is nothing to compare; reporting
+        # "current" here would hide a broken vendor snapshot.
+        return {"name": name, "state": "unknown", "detail": "vendored package.json has no readable version"}
+    behind = semver_tuple(latest) > semver_tuple(current)
+    return {
+        "name": name,
+        "state": "behind" if behind else "current",
+        "current": current,
+        "latest": latest,
+        "source": f"npm:{cfg['name']}",
+    }
+
+
+def check_github(name: str, cfg: dict) -> dict:
+    """Compare the vendored snapshot of `name` with its upstream branch tip.
+
+    Uses the commit recorded in `_vendoredBy` when there is one; otherwise falls
+    back to comparing the vendored `src/parser.c` with the upstream file.
+    """
+    repo, branch = cfg["repo"], cfg["branch"]
+    head = github_head_sha(repo, branch)
+    if head is None:
+        return {"name": name, "state": "unknown", "detail": f"GitHub fetch failed ({repo})"}
+    pinned = vendored_commit(name)
+    if pinned:
+        behind = pinned != head.lower()
+        return {
+            "name": name,
+            "state": "behind" if behind else "current",
+            "current": pinned[:12],
+            "latest": head[:12],
+            "source": f"github:{repo}@{branch}",
+        }
+    # No recorded commit — fall back to byte-comparing the vendored parser.c.
+    local = VENDOR_DIR / name / "src" / "parser.c"
+    if not local.is_file():
+        return {
+            "name": name,
+            "state": "unknown",
+            "detail": "no recorded commit and no vendored src/parser.c to compare",
+        }
+    upstream = github_raw(repo, branch, "src/parser.c")
+    if upstream is None:
+        return {"name": name, "state": "unknown", "detail": f"could not fetch upstream parser.c ({repo})"}
+    # Normalise line endings so a CRLF checkout does not read as drift.
+    same = local.read_text(encoding="utf-8", errors="ignore").replace("\r\n", "\n") == upstream.replace(
+        "\r\n", "\n"
+    )
+    return {
+        "name": name,
+        "state": "current" if same else "behind",
+        "current": "vendored parser.c",
+        "latest": f"{repo}@{branch} HEAD ({head[:12]})",
+        "source": f"github:{repo}@{branch} (parser.c byte-compare)",
+    }
+
+
+# `source` value in UPSTREAM → function that computes that grammar's drift row.
+_CHECKERS = {"npm": check_npm, "github": check_github}
+
+
+def main() -> int:
+    """Print the Markdown drift report; return 1 if any grammar is behind, else 0."""
+    vendored: list[str] = []
+    if VENDOR_DIR.is_dir():
+        vendored = sorted(
+            d.name for d in VENDOR_DIR.iterdir() if d.is_dir() and d.name.startswith("tree-sitter-")
+        )
+
+    results: list[dict] = []
+    for name in vendored:
+        cfg = UPSTREAM.get(name)
+        if cfg is None:
+            results.append(
+                {"name": name, "state": "unconfigured", "detail": "add an UPSTREAM entry to monitor this grammar"}
+            )
+            continue
+        checker = _CHECKERS.get(cfg.get("source", ""))
+        if checker is None:
+            results.append(
+                {"name": name, "state": "unconfigured", "detail": "UPSTREAM entry has an unsupported source"}
+            )
+            continue
+        results.append(checker(name, cfg))
+
+    behind = [r for r in results if r["state"] == "behind"]
+    unchecked = [r for r in results if r["state"] in ("unknown", "unconfigured")]
+
+    out: list[str] = ["# Vendored tree-sitter grammar updates", ""]
+    if not vendored:
+        out.append("No vendored grammars found under `gitnexus/vendor/`.")
+        print("\n".join(out))
+        return 0
+
+    if behind:
+        out.append(
+            f"**{len(behind)} vendored grammar(s) behind upstream.** "
+            "Refresh the vendor snapshot and re-run the `build-tree-sitter-prebuilds` workflow."
+        )
+    elif unchecked:
+        # Do not claim "all current" when some grammars were never compared.
+        out.append(f"**No vendored grammar is known to be behind; {len(unchecked)} could not be checked.**")
+    else:
+        out.append("**All vendored grammars are current with upstream.**")
+    out.append("")
+    out.append("| Grammar | Vendored | Upstream latest | Source | Status |")
+    out.append("|---|---|---|---|---|")
+    for r in results:
+        if r["state"] in ("unknown", "unconfigured"):
+            out.append(f"| `{r['name']}` | — | — | — | ⚠️ {r.get('detail', r['state'])} |")
+        else:
+            icon = "🔴 behind" if r["state"] == "behind" else "✅ current"
+            out.append(f"| `{r['name']}` | `{r['current']}` | `{r['latest']}` | {r['source']} | {icon} |")
+    out.append("")
+    out.append("_npm grammars are tracked by Dependabot; this covers the vendored ones. "
+               "Companion: the tree-sitter 0.25 runtime upgrade readiness tracker._")
+    print("\n".join(out))
+
+    # Exit 1 only on real drift — `unknown` (e.g. a transient fetch failure) is
+    # surfaced in the table but is never reported as a grammar being behind.
+    return 1 if behind else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/.github/workflows/vendored-grammar-updates.yml b/.github/workflows/vendored-grammar-updates.yml
new file mode 100644
index 0000000000..c7dadee2dd
--- /dev/null
+++ b/.github/workflows/vendored-grammar-updates.yml
@@ -0,0 +1,173 @@
+name: Vendored grammar updates
+
+# Gives GitNexus's VENDORED tree-sitter grammars (gitnexus/vendor/tree-sitter-*)
+# the same daily "you're behind upstream" visibility Dependabot gives the
+# npm-dependency grammars. The vendored grammars are off the dependency graph,
+# so Dependabot can't see them; this checks each against its upstream (npm
+# latest for swift/kotlin, GitHub default-branch drift for dart/proto) and
+# opens/updates a tracking issue when one falls behind.
+#
+# Logic + per-grammar upstream coords: .github/scripts/check-vendored-grammar-updates.py
+# Companion: tree-sitter-upgrade-readiness.yml tracks the tree-sitter RUNTIME
+# upgrade (ABI), a different concern from vendored version drift.
+#
+# Concurrency convention: see CONTRIBUTING.md → "GitHub Actions — Concurrency Convention".
+
+on:
+  schedule:
+    # Daily at 09:05 UTC — just after the upgrade-readiness tracker (09:00) and
+    # Dependabot's daily run, so all dependency signals surface together.
+    - cron: '5 9 * * *'
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/scripts/check-vendored-grammar-updates.py'
+      - '.github/workflows/vendored-grammar-updates.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+permissions:
+  contents: read
+
+jobs:
+  check:
+    name: Check vendored grammar drift
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      # Needed to open/update/close the tracking issue on scheduled runs.
+      issues: write
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Check vendored grammar drift
+        id: check
+        shell: bash
+        env:
+          # Raises the GitHub API rate limit (60 → 5000/hr) for the drift fetches.
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +e
+          python3 .github/scripts/check-vendored-grammar-updates.py > drift-report.md
+          code=$?
+          set -e
+          echo "exit_code=$code" >> "$GITHUB_OUTPUT"
+          {
+            echo 'report<<DRIFT_REPORT_EOF'
+            cat drift-report.md
+            echo 'DRIFT_REPORT_EOF'
+          } >> "$GITHUB_OUTPUT"
+          echo "=== Report ==="
+          cat drift-report.md
+
+      # On PR runs the script just self-validates; drift is informational.
+      - name: Annotate PR when a vendored grammar is behind
+        if: github.event_name == 'pull_request' && steps.check.outputs.exit_code != '0'
+        run: echo "::warning::A vendored tree-sitter grammar is behind upstream. See the job output."
+
+      - name: Upsert tracking issue on scheduled drift
+        if: github.event_name == 'schedule' && steps.check.outputs.exit_code != '0'
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        env:
+          REPORT: ${{ steps.check.outputs.report }}
+        with:
+          script: |
+            const title = 'Vendored tree-sitter grammar updates available';
+            const report = process.env.REPORT;
+            const body =
+              report +
+              '\n\nGenerated daily by `.github/workflows/vendored-grammar-updates.yml`. ' +
+              'Closes automatically when every vendored grammar is current with upstream.';
+
+            // One status per grammar, parsed from the report's last table cell.
+            const parseRows = (md) => {
+              const map = {};
+              for (const m of (md || '').matchAll(/\| `(tree-sitter-[^`]+)` \|.*\| ([^|]+) \|$/gm)) {
+                map[m[1]] = m[2].trim();
+              }
+              return map;
+            };
+
+            const { data: open } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              state: 'open',
+              labels: 'tree-sitter-drift',
+              per_page: 20,
+            });
+            const existing = open.find((i) => i.title === title);
+
+            if (existing) {
+              const oldRows = parseRows(existing.body);
+              const newRows = parseRows(report);
+              const changes = [];
+              for (const [name, status] of Object.entries(newRows)) {
+                if (oldRows[name] && oldRows[name] !== status) {
+                  changes.push(`\`${name}\`: ${oldRows[name]} → ${status}`);
+                }
+              }
+              const behind = Object.entries(newRows)
+                .filter(([, s]) => s.includes('behind'))
+                .map(([n]) => n);
+              let comment = `Behind upstream: ${behind.length ? behind.map((n) => `\`${n}\``).join(', ') : 'none'}.`;
+              comment += changes.length
+                ? '\n\nChanges since last run:\n' + changes.map((c) => `- ${c}`).join('\n')
+                : ' No changes from the previous run.';
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: existing.number,
+                body: comment,
+              });
+              await github.rest.issues.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: existing.number,
+                body,
+              });
+              core.info(`Updated issue #${existing.number}`);
+            } else {
+              const { data: created } = await github.rest.issues.create({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                title,
+                body,
+                labels: ['tree-sitter-drift', 'dependencies'],
+              });
+              core.info(`Opened issue #${created.number}`);
+            }
+
+      - name: Close tracking issue when all vendored grammars are verified current
+        if: ${{ github.event_name == 'schedule' && steps.check.outputs.exit_code == '0' && !contains(steps.check.outputs.report, '| — | — | — |') }} # an unchecked (⚠️) row keeps the issue open
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const title = 'Vendored tree-sitter grammar updates available';
+            const { data: open } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              state: 'open',
+              labels: 'tree-sitter-drift',
+              per_page: 20,
+            });
+            const existing = open.find((i) => i.title === title);
+            if (existing) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: existing.number,
+                body: 'All vendored tree-sitter grammars are now current with upstream. Closing automatically.',
+              });
+              await github.rest.issues.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: existing.number,
+                state: 'closed',
+              });
+              core.info(`Closed issue #${existing.number}`);
+            }