From b3686a052799f379b9e033c5f397c2b0f46529b1 Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Tue, 3 Mar 2026 18:43:53 -0800 Subject: [PATCH 1/3] feat: restructure codeowner_overrides.json for PR auto-labeling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructure codeowner_overrides.json from a flat dict to a grouped array where each group has `labels` (for PR auto-labeling) and `owners` (path to owner mappings). All existing path→owner mappings are preserved. Update codeowner_analyzer.py to parse the new format via a standalone `load_overrides()` function, and simplify CODEOWNERS generation to a single alphabetically-sorted pass over the union of computed and override paths. Override owners are prepended on exact matches for a wider owner list; override-only paths (including directories) are now emitted too. Add pr-auto-label.yml workflow that reads the grouped overrides, matches changed files by path prefix, and additively applies labels. Uses pull_request_target for safe write access on external PRs. Skips PRs older than 21 days. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/pr-auto-label.yml | 64 +++++++++ scripts/codeowner_analyzer.py | 201 +++++++++++----------------- scripts/codeowner_overrides.json | 68 ++++++---- 3 files changed, 187 insertions(+), 146 deletions(-) create mode 100644 .github/workflows/pr-auto-label.yml diff --git a/.github/workflows/pr-auto-label.yml b/.github/workflows/pr-auto-label.yml new file mode 100644 index 0000000000..8b1587a7bd --- /dev/null +++ b/.github/workflows/pr-auto-label.yml @@ -0,0 +1,64 @@ +name: PR Auto-Label + +on: + pull_request_target: + types: [opened, synchronize] + +permissions: + contents: read + pull-requests: write + +jobs: + label: + runs-on: ubuntu-latest + steps: + - name: Skip PRs older than 21 days + env: + PR_CREATED: ${{ github.event.pull_request.created_at }} + run: | + created=$(date -d "$PR_CREATED" +%s) + cutoff=$(date -d '21 days ago' +%s) + if [ "$created" -lt "$cutoff" ]; then + echo "PR older than 21 days, skipping" + exit 1 + fi + + - name: Sparse checkout overrides file from base branch + uses: actions/checkout@v4 + with: + sparse-checkout: scripts/codeowner_overrides.json + sparse-checkout-cone-mode: false + + - name: Apply labels based on changed files + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + # Fetch changed files in the PR + changed_files=$(gh api "repos/${{ github.repository }}/pulls/${PR_NUMBER}/files" \ + --paginate --jq '.[].filename') + + # Read override groups and apply matching labels + jq -c '.[]' scripts/codeowner_overrides.json | while read -r group; do + labels=$(echo "$group" | jq -r '.labels[]') + prefixes=$(echo "$group" | jq -r '.owners | keys[]') + + match=false + for prefix in $prefixes; do + for file in $changed_files; do + case "$file" in + "$prefix"*) match=true; break ;; + esac + done + if $match; then break; fi + done + + if $match; then + for label in $labels; do + echo "Adding label: $label" + gh pr edit "$PR_NUMBER" \ + --repo "${{ github.repository }}" \ + --add-label "$label" || true + done + fi + done diff --git a/scripts/codeowner_analyzer.py b/scripts/codeowner_analyzer.py index acae27f8b2..6a31562120 100644 --- a/scripts/codeowner_analyzer.py +++ b/scripts/codeowner_analyzer.py @@ -18,6 +18,31 @@ import re +def load_overrides(filename: str) -> Dict[str, List[str]]: + """Load codeowner overrides from a grouped JSON array file. + + Returns a flat dict mapping path prefixes to lists of owners. + """ + with open(filename, "r") as f: + raw = json.load(f) + + if not isinstance(raw, list): + raise ValueError("Overrides file must contain a JSON array") + + overrides: Dict[str, List[str]] = {} + for group in raw: + if not isinstance(group, dict) or "owners" not in group: + raise ValueError("Each group must be an object with an 'owners' key") + for path, users in group["owners"].items(): + if not isinstance(users, list) or not all( + isinstance(u, str) for u in users + ): + raise ValueError(f"Override for '{path}' must be a list of strings") + overrides[path.lstrip("./").rstrip("/")] = users + + return overrides + + class CodeOwnersAnalyzer: def __init__( self, @@ -524,62 +549,67 @@ def _normalize_usernames(self, users: List[str]) -> List[str]: """Normalize usernames to have @ prefix.""" return [f"@{u}" if not u.startswith("@") else u for u in users] + def _extract_computed_usernames(self, data: Dict[str, Any]) -> List[str]: + """Extract deduplicated GitHub usernames from computed ownership data.""" + usernames: List[str] = [] + seen_lower: set = set() + if not data["owners"]: + return usernames + top_owners = [ + o for o in data["owners"][: self.top_n_owners] if o["ownership_score"] > 0.1 + ] + for owner in top_owners: + gh = self.get_github_username(owner["author"]) + name = f"@{gh}" if gh else owner["author"].split("<")[1].rstrip(">") + if name.lower() not in seen_lower: + usernames.append(name) + seen_lower.add(name.lower()) + return usernames + def _merge_owners_with_overrides( self, module: str, computed_usernames: List[str] ) -> List[str]: - """Merge manual overrides with computed owners. - - Override users are prepended to the list (indicating primary ownership), - and duplicates are removed. Override users are always included, even if - that exceeds top_n_owners. Only computed owners are subject to truncation. + """Merge manual overrides with computed owners for an exact-match path. - Note: File-level overrides are handled separately in generate_codeowners_file(), - since there's no computed ownership for individual files. - - Args: - module: The module path (e.g., "flashinfer/fused_moe") - computed_usernames: List of GitHub usernames from git history analysis - (with @ prefix, e.g., ["@alice", "@bob"]) - - Returns: - Merged list of usernames with overrides first (all overrides preserved) + Override users are prepended; computed users fill remaining slots up to + top_n_owners. The result is wider than either list alone. """ - if not self.owner_overrides: - return computed_usernames[: self.top_n_owners] - - # Skip file-level overrides (handled separately) - if self._is_file_path(module): - return computed_usernames[: self.top_n_owners] - - # Get override users for this module (without @ prefix in the config) override_users = self.owner_overrides.get(module, []) if not override_users: return computed_usernames[: self.top_n_owners] - # Normalize override users to have @ prefix - override_usernames = self._normalize_usernames(override_users) - - # Build merged list: overrides first, then computed (excluding duplicates) - # Override users are always preserved; only computed users are truncated - # Use case-insensitive comparison since GitHub usernames are case-insensitive - merged = list(override_usernames) + merged = self._normalize_usernames(override_users) merged_lower = {u.lower() for u in merged} - num_overrides = len(merged) - remaining_slots = max(0, self.top_n_owners - num_overrides) + remaining = max(0, self.top_n_owners - len(merged)) for username in computed_usernames: + if remaining <= 0: + break if username.lower() not in merged_lower: - if remaining_slots > 0: - merged.append(username) - merged_lower.add(username.lower()) - remaining_slots -= 1 + merged.append(username) + merged_lower.add(username.lower()) + remaining -= 1 return merged + def _format_codeowners_path(self, path: str) -> str: + """Format a path for CODEOWNERS (append / for directories).""" + return path if self._is_file_path(path) else f"{path}/" + def generate_codeowners_file( self, results: Dict[str, Any], output_file: str = "CODEOWNERS" ) -> None: - """Generate a CODEOWNERS file based on analysis.""" + """Generate a CODEOWNERS file from computed results and overrides. + + All paths (computed and override-only) are emitted in alphabetical + order. For exact-match paths, override owners are prepended to the + computed list so the final owner set is wider. Alphabetical ordering + naturally puts parent dirs before children, which matters because + CODEOWNERS uses last-match-wins. + """ + override_paths = set(self.owner_overrides) if self.owner_overrides else set() + all_paths = sorted(set(results) | override_paths) + with open(output_file, "w") as f: f.write("# Code Owners File\n") f.write("# Generated automatically from git history analysis\n") @@ -589,58 +619,16 @@ def generate_codeowners_file( f.write("# Manual overrides applied from overrides file\n") f.write("\n") - # Write directory entries (computed + merged overrides) - for module, data in results.items(): - # Extract GitHub usernames from computed owners - # Use case-insensitive deduplication since the same user may appear - # multiple times with different email addresses - computed_usernames = [] - seen_usernames_lower = set() - if data["owners"]: - # Take top N owners or those with ownership score > 0.1 - top_owners = [ - owner - for owner in data["owners"][: self.top_n_owners] - if owner["ownership_score"] > 0.1 - ] - - for owner in top_owners: - github_username = self.get_github_username(owner["author"]) - if github_username: - username = f"@{github_username}" - else: - # Fallback to email if no GitHub username found - username = owner["author"].split("<")[1].rstrip(">") - - # Skip duplicates (case-insensitive) - if username.lower() not in seen_usernames_lower: - computed_usernames.append(username) - seen_usernames_lower.add(username.lower()) - - # Merge with overrides - final_usernames = self._merge_owners_with_overrides( - module, computed_usernames - ) - - if final_usernames: - owners_list = " ".join(final_usernames) - f.write(f"{module}/ {owners_list}\n") + for path in all_paths: + if path in results: + computed = self._extract_computed_usernames(results[path]) + final = self._merge_owners_with_overrides(path, computed) + else: + # Override-only path (no computed match) + final = self._normalize_usernames(self.owner_overrides[path]) - # Write file-level overrides LAST (CODEOWNERS uses last-match-wins) - # This ensures file-specific overrides take precedence over directory patterns - if self.owner_overrides: - file_overrides = [ - (path, users) - for path, users in self.owner_overrides.items() - if self._is_file_path(path) - ] - if file_overrides: - f.write( - "\n# File-level overrides (must come last for precedence)\n" - ) - for path, users in sorted(file_overrides): - usernames = self._normalize_usernames(users) - f.write(f"{path} {' '.join(usernames)}\n") + if final: + f.write(f"{self._format_codeowners_path(path)} {' '.join(final)}\n") def print_detailed_report(self, results: Dict[str, Any]) -> None: """Print a detailed ownership report.""" @@ -777,46 +765,15 @@ def main() -> int: owner_overrides = None if args.overrides_file: try: - with open(args.overrides_file, "r") as f: - owner_overrides = json.load(f) - if not isinstance(owner_overrides, dict): - print( - "Error: Overrides file must contain a JSON object (dict)", - file=sys.stderr, - ) - return 1 - # Validate structure: all values should be lists of strings - for path, users in owner_overrides.items(): - if not isinstance(users, list): - print( - f"Error: Override for '{path}' must be a list of usernames", - file=sys.stderr, - ) - return 1 - for user in users: - if not isinstance(user, str): - print( - f"Error: Override users for '{path}' must be strings", - file=sys.stderr, - ) - return 1 - - # Normalize paths: strip leading "./" and trailing "/" - owner_overrides = { - path.lstrip("./").rstrip("/"): users - for path, users in owner_overrides.items() - } + owner_overrides = load_overrides(args.overrides_file) except FileNotFoundError: print( f"Error: Overrides file not found: {args.overrides_file}", file=sys.stderr, ) return 1 - except json.JSONDecodeError as e: - print(f"Error: Invalid JSON in overrides file: {e}", file=sys.stderr) - return 1 - except Exception as e: - print(f"Error reading overrides file: {e}", file=sys.stderr) + except (json.JSONDecodeError, ValueError) as e: + print(f"Error in overrides file: {e}", file=sys.stderr) return 1 try: diff --git a/scripts/codeowner_overrides.json b/scripts/codeowner_overrides.json index 9816d5e4dd..9493c4b17d 100644 --- a/scripts/codeowner_overrides.json +++ b/scripts/codeowner_overrides.json @@ -1,24 +1,44 @@ -{ - "scripts/codeowner_overrides.json": ["@ci-users"], - - "flashinfer/attention.py": ["@flashinfer-ai/attention-owners"], - "include/flashinfer/attention": ["@flashinfer-ai/attention-owners"], - "csrc/fmha_v2": ["@flashinfer-ai/attention-owners"], - "csrc/xqa": ["@flashinfer-ai/attention-owners"], - "tests/attention": ["@flashinfer-ai/attention-owners"], - - "flashinfer/gemm": ["@flashinfer-ai/gemm-owners"], - "include/flashinfer/gemm": ["@flashinfer-ai/gemm-owners"], - "tests/gemm": ["@flashinfer-ai/gemm-owners"], - - "flashinfer/fused_moe": ["@flashinfer-ai/moe-owners"], - "csrc/fused_moe": ["@flashinfer-ai/moe-owners"], - "tests/moe": ["@flashinfer-ai/moe-owners"], - - "flashinfer/comm": ["@flashinfer-ai/comm-owners"], - "include/flashinfer/comm": ["@flashinfer-ai/comm-owners"], - "tests/comm": ["@flashinfer-ai/comm-owners"], - - "flashinfer/norm.py": ["@flashinfer-ai/misc-op-owners"], - "tests/norm": ["@flashinfer-ai/misc-op-owners"] -} +[ + { + "labels": ["op: attention"], + "owners": { + "flashinfer/attention.py": ["@flashinfer-ai/attention-owners"], + "include/flashinfer/attention": ["@flashinfer-ai/attention-owners"], + "csrc/fmha_v2": ["@flashinfer-ai/attention-owners"], + "csrc/xqa": ["@flashinfer-ai/attention-owners"], + "tests/attention": ["@flashinfer-ai/attention-owners"] + } + }, + { + "labels": ["op: gemm"], + "owners": { + "flashinfer/gemm": ["@flashinfer-ai/gemm-owners"], + "include/flashinfer/gemm": ["@flashinfer-ai/gemm-owners"], + "tests/gemm": ["@flashinfer-ai/gemm-owners"] + } + }, + { + "labels": ["op: moe"], + "owners": { + "flashinfer/fused_moe": ["@flashinfer-ai/moe-owners"], + "csrc/fused_moe": ["@flashinfer-ai/moe-owners"], + "tests/moe": ["@flashinfer-ai/moe-owners"] + } + }, + { + "labels": ["op: comm"], + "owners": { + "flashinfer/comm": ["@flashinfer-ai/comm-owners"], + "include/flashinfer/comm": ["@flashinfer-ai/comm-owners"], + "tests/comm": ["@flashinfer-ai/comm-owners"] + } + }, + { + "labels": ["op: misc"], + "owners": { + "scripts/codeowner_overrides.json": ["@ci-users"], + "flashinfer/norm.py": ["@flashinfer-ai/misc-op-owners"], + "tests/norm": ["@flashinfer-ai/misc-op-owners"] + } + } +] From b22fa5fc649b3dc160cb40d80fcc479ea8c725e5 Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Tue, 3 Mar 2026 19:10:06 -0800 Subject: [PATCH 2/3] feat: add --labels flag to filter tests by PR labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --labels option to test_utils.sh parse_args() accepting comma-separated PR labels (e.g., "op: attention,op: gemm"). Implement gh_label_filter() which reads codeowner_overrides.json to determine which test prefixes are categorized under which labels. Four-step logic: (1) no labels → run all, (2) label match → run, (3) categorized but no match → skip, (4) uncategorized → always run. Wire the filter into task_run_unit_tests.sh find_test_files() so discovered test files are filtered before execution. Co-Authored-By: Claude Opus 4.6 --- scripts/task_run_unit_tests.sh | 8 +++- scripts/test_utils.sh | 85 +++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/scripts/task_run_unit_tests.sh b/scripts/task_run_unit_tests.sh index 303bb00d3a..47cbe24860 100755 --- a/scripts/task_run_unit_tests.sh +++ b/scripts/task_run_unit_tests.sh @@ -50,7 +50,13 @@ find_test_files() { done if [ "$exclude_file" = false ]; then - TEST_FILES="$TEST_FILES $test_file" + # Apply label-based filtering if --labels was provided + local skip_reason + if skip_reason=$(gh_label_filter "$test_file"); then + TEST_FILES="$TEST_FILES $test_file" + else + echo " Skipping $test_file: $skip_reason" + fi fi done diff --git a/scripts/test_utils.sh b/scripts/test_utils.sh index 68106e8c98..6308f31c35 100755 --- a/scripts/test_utils.sh +++ b/scripts/test_utils.sh @@ -36,8 +36,9 @@ EXIT_CODE=0 parse_args() { DRY_RUN=false SANITY_TEST=false - for arg in "$@"; do - case $arg in + LABELS="" + while [ $# -gt 0 ]; do + case $1 in --dry-run) DRY_RUN=true ;; @@ -50,10 +51,90 @@ parse_args() { SANITY_TEST=true fi ;; + --labels) + shift + LABELS="$1" + ;; esac + shift + done +} + +# Label filter state (populated once by _init_label_filter) +_LABEL_FILTER_INITIALIZED=false +_CATEGORIZED_PREFIXES="" # all tests/* prefixes in the JSON +_INCLUDED_PREFIXES="" # tests/* prefixes whose group labels match LABELS + +_init_label_filter() { + if [ "$_LABEL_FILTER_INITIALIZED" = true ]; then + return + fi + _LABEL_FILTER_INITIALIZED=true + + local overrides_file="${SCRIPT_DIR}/codeowner_overrides.json" + if [ ! -f "$overrides_file" ]; then + echo "Warning: $overrides_file not found, label filter disabled" >&2 + LABELS="" + return + fi + + # All test prefixes mentioned in the JSON + _CATEGORIZED_PREFIXES=$(jq -r '.[].owners | keys[] | select(startswith("tests/"))' "$overrides_file") + + # Test prefixes whose group labels intersect with the requested labels + IFS=',' read -ra _label_array <<< "$LABELS" + for _req_label in "${_label_array[@]}"; do + _req_label=$(echo "$_req_label" | xargs) # trim whitespace + local matches + matches=$(jq -r --arg label "$_req_label" \ + '.[] | select(.labels[] == $label) | .owners | keys[] | select(startswith("tests/"))' \ + "$overrides_file") + if [ -n "$matches" ]; then + _INCLUDED_PREFIXES="${_INCLUDED_PREFIXES}${_INCLUDED_PREFIXES:+ +}${matches}" + fi done } +# Filter a test file by PR labels. +# Returns 0 (include) or 1 (skip). On skip, prints a brief reason to stdout. +# +# Four-step logic: +# 1. No labels specified → include (run everything) +# 2. File matches included label → include (label hit) +# 3. File is categorized but not included → skip (label mismatch) +# 4. File is uncategorized → include (always run) +gh_label_filter() { + local test_file="$1" + + # Step 1: no labels specified — include everything + if [ -z "$LABELS" ]; then + return 0 + fi + + _init_label_filter + + # Step 2: file matches a prefix whose group label is in LABELS — include + while IFS= read -r prefix; do + [ -z "$prefix" ] && continue + if [[ "$test_file" == "$prefix"* ]]; then + return 0 + fi + done <<< "$_INCLUDED_PREFIXES" + + # Step 3: file is categorized in the JSON but no label matched — skip + while IFS= read -r prefix; do + [ -z "$prefix" ] && continue + if [[ "$test_file" == "$prefix"* ]]; then + echo "no matching label (have: $LABELS)" + return 1 + fi + done <<< "$_CATEGORIZED_PREFIXES" + + # Step 4: file not mentioned in any group — uncategorized, always run + return 0 +} + # Print test mode banner print_test_mode_banner() { if [ "$DRY_RUN" = "true" ]; then From 1489253ba96be172dbc5ab8e77137fe11d5a84ca Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Tue, 3 Mar 2026 19:13:45 -0800 Subject: [PATCH 3/3] fix: address code review feedback on PR #2683 - Sanitize paths and usernames against newline injection in CODEOWNERS output (high severity) - Validate group['owners'] is a dict, not just present (medium) - Use readarray + quoted array expansion in pr-auto-label.yml to handle labels with spaces (e.g., "op: attention") Co-Authored-By: Claude Opus 4.6 --- .github/workflows/pr-auto-label.yml | 8 ++++---- scripts/codeowner_analyzer.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-auto-label.yml b/.github/workflows/pr-auto-label.yml index 8b1587a7bd..907ab392fb 100644 --- a/.github/workflows/pr-auto-label.yml +++ b/.github/workflows/pr-auto-label.yml @@ -40,11 +40,11 @@ jobs: # Read override groups and apply matching labels jq -c '.[]' scripts/codeowner_overrides.json | while read -r group; do - labels=$(echo "$group" | jq -r '.labels[]') - prefixes=$(echo "$group" | jq -r '.owners | keys[]') + readarray -t labels < <(echo "$group" | jq -r '.labels[]') + readarray -t prefixes < <(echo "$group" | jq -r '.owners | keys[]') match=false - for prefix in $prefixes; do + for prefix in "${prefixes[@]}"; do for file in $changed_files; do case "$file" in "$prefix"*) match=true; break ;; @@ -54,7 +54,7 @@ jobs: done if $match; then - for label in $labels; do + for label in "${labels[@]}"; do echo "Adding label: $label" gh pr edit "$PR_NUMBER" \ --repo "${{ github.repository }}" \ diff --git a/scripts/codeowner_analyzer.py b/scripts/codeowner_analyzer.py index 6a31562120..935d9bffaa 100644 --- a/scripts/codeowner_analyzer.py +++ b/scripts/codeowner_analyzer.py @@ -31,8 +31,8 @@ def load_overrides(filename: str) -> Dict[str, List[str]]: overrides: Dict[str, List[str]] = {} for group in raw: - if not isinstance(group, dict) or "owners" not in group: - raise ValueError("Each group must be an object with an 'owners' key") + if not isinstance(group, dict) or not isinstance(group.get("owners"), dict): + raise ValueError("Each group must be an object with an 'owners' dict") for path, users in group["owners"].items(): if not isinstance(users, list) or not all( isinstance(u, str) for u in users @@ -628,7 +628,14 @@ def generate_codeowners_file( final = self._normalize_usernames(self.owner_overrides[path]) if final: - f.write(f"{self._format_codeowners_path(path)} {' '.join(final)}\n") + # Sanitize against newline injection + safe_path = ( + self._format_codeowners_path(path) + .replace("\n", "") + .replace("\r", "") + ) + safe_owners = [u.replace("\n", "").replace("\r", "") for u in final] + f.write(f"{safe_path} {' '.join(safe_owners)}\n") def print_detailed_report(self, results: Dict[str, Any]) -> None: """Print a detailed ownership report."""