# File: .github/workflows/detect-duplicate-issues.yml
#
# Runs the duplicate-issue detector for every issue that is opened, reopened,
# or edited. The detector script is configured entirely through the `env`
# block of the "Run duplicate Issue detection" step.
name: Detect Duplicate Issues

on:
  issues:
    types: [opened, reopened, edited]

# Edits can arrive in bursts; only the newest run for a given issue is useful,
# so a newer run replaces any run still in progress for the same issue.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.issue.number }}
  cancel-in-progress: true

jobs:
  detect-duplicate:
    runs-on: ubuntu-latest
    # Bound the job so a stalled download or API call cannot hold a runner
    # until the platform-wide default timeout.
    timeout-minutes: 30
    permissions:
      issues: write
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        # `regex` is imported directly by detect_duplicate_issues.py, so it is
        # installed explicitly instead of relying on another package pulling
        # it in transitively.
        run: pip install requests numpy scikit-learn sentence-transformers regex

      - name: Compute cache key epoch (weekly)
        id: cache-epoch
        # ISO year + ISO week number, e.g. 2025-W07: the primary cache key
        # changes once per week.
        run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT"

      - name: Restore issue embedding cache
        id: restore-issue-cache
        uses: actions/cache/restore@v4
        with:
          path: .github/workflows/.dup_issue_cache/embeddings
          key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
          # Fall back to the most recent cache from an earlier week.
          restore-keys: |
            dup-issue-emb-${{ github.repository }}-

      - name: Run duplicate Issue detection
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          REPO: ${{ github.repository }}
          DRY_RUN: 0
          USE_SENTENCE_TRANSFORMERS: 1
          ISSUE_CANDIDATE_STATE: all
          MAX_CANDIDATES: 500
          TITLE_COMPARE_TOP_N: 25
          TOP_K: 5
          SIMILARITY_THRESHOLD: 0.82
          TEXT_WEIGHT: 0.8
          TITLE_WEIGHT: 0.2
          AUTO_LABEL: 1
          DUPLICATE_LABEL: possible-duplicate
          ISSUE_EMBED_CACHE_DIR: .github/workflows/.dup_issue_cache/embeddings
          ISSUE_EMBED_CACHE_WRITE: 1
        run: python .github/workflows/scripts/detect_duplicate_issues.py

      - name: Save issue embedding cache
        # Only save when the exact weekly key was not restored; a failure to
        # save must not fail the job.
        if: ${{ steps.restore-issue-cache.outputs.cache-hit != 'true' }}
        continue-on-error: true
        uses: actions/cache/save@v4
        with:
          path: .github/workflows/.dup_issue_cache/embeddings
          key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
# File: .github/workflows/detect-duplicate-prs.yml
#
# Runs the duplicate-PR detector whenever a pull request is opened, reopened,
# updated with new commits, edited, or marked ready for review.
#
# NOTE(review): on `pull_request` events raised from forks the GITHUB_TOKEN is
# read-only, so the comment upsert performed by the script cannot succeed for
# those PRs - confirm whether that limitation is acceptable.
name: Detect Duplicate PRs

on:
  pull_request:
    types: [opened, reopened, synchronize, edited, ready_for_review]

# Pushes and edits can arrive in bursts; only the newest run for a given PR is
# useful, so a newer run replaces any run still in progress for the same PR.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  detect-duplicate:
    runs-on: ubuntu-latest
    # Bound the job so a stalled download or API call cannot hold a runner
    # until the platform-wide default timeout.
    timeout-minutes: 30
    permissions:
      pull-requests: write
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: pip install requests numpy scikit-learn sentence-transformers

      - name: Compute cache key epoch (weekly)
        id: cache-epoch
        # ISO year + ISO week number, e.g. 2025-W07: the primary cache key
        # changes once per week.
        run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT"

      - name: Restore PR file cache
        id: restore-pr-cache
        uses: actions/cache/restore@v4
        with:
          path: .github/workflows/.dup_pr_cache/files
          key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
          # Fall back to the most recent cache from an earlier week.
          restore-keys: |
            dup-pr-files-${{ github.repository }}-

      - name: Run duplicate PR detection
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO: ${{ github.repository }}
          DRY_RUN: 0
          PR_FILE_CACHE_DIR: .github/workflows/.dup_pr_cache/files
          PR_FILE_CACHE_WRITE: 1
          PREFETCH_CANDIDATE_FILES: 1
        run: python .github/workflows/scripts/detect_duplicate_prs.py

      - name: Save PR file cache
        # Only PRs whose head branch lives in this repository save the cache,
        # and only when the exact weekly key was not restored. A failure to
        # save must not fail the job.
        if: ${{ github.event.pull_request.head.repo.full_name == github.repository && steps.restore-pr-cache.outputs.cache-hit != 'true' }}
        continue-on-error: true
        uses: actions/cache/save@v4
        with:
          path: .github/workflows/.dup_pr_cache/files
          key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Detect duplicate issues using two-stage similarity:
1) semantic text similarity (title + body)
2) blended score with title token overlap

Workflow behavior:
- Upsert one bot comment identified by marker.
- Remove stale bot comment and duplicate label when no matches remain.

Local debug example:
GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 \
ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings \
.venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py
"""

import json
import os
from pathlib import Path

import numpy as np
import regex as re
import requests
from sklearn.feature_extraction.text import HashingVectorizer

# sentence-transformers is optional: when it is disabled, cannot be imported,
# or its model cannot be loaded, embeddings come from the hashing vectorizer.
USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in {
    "1",
    "true",
    "yes",
}
try:
    if USE_SENTENCE_TRANSFORMERS:
        from sentence_transformers import SentenceTransformer
    else:
        SentenceTransformer = None
except Exception:
    SentenceTransformer = None

model = None
if SentenceTransformer is not None:
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    except Exception as exc:
        # Loading can fail for reasons unrelated to this script (for example a
        # failed model download); degrade to the hashing fallback the same way
        # a failed import does instead of aborting the run.
        print(
            f"Could not load sentence-transformers model ({exc}); "
            "falling back to hashing vectorizer."
        )
        model = None

hashing_vectorizer = HashingVectorizer(
    n_features=2048,
    alternate_sign=False,
    norm="l2",
)

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
ISSUE_NUMBER = int(os.environ["ISSUE_NUMBER"])
REPO = os.environ["REPO"]
# Defaults to dry-run so that a local invocation never writes to GitHub
# unless DRY_RUN is explicitly turned off.
DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"}

HEADERS = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

# Seconds to wait for any single GitHub API call; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = float(os.getenv("HTTP_TIMEOUT", "30"))

SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.82"))
TOP_K = int(os.getenv("TOP_K", "5"))
MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500"))
ISSUE_CANDIDATE_STATE = os.getenv("ISSUE_CANDIDATE_STATE", "all")
TITLE_COMPARE_TOP_N = int(os.getenv("TITLE_COMPARE_TOP_N", "25"))
TEXT_WEIGHT = float(os.getenv("TEXT_WEIGHT", "0.8"))
TITLE_WEIGHT = float(os.getenv("TITLE_WEIGHT", "0.2"))
# Hidden HTML comment embedded in the bot comment so later runs can find it.
# It must be non-empty: an empty marker is a substring of every comment body,
# which would make the script edit or delete comments written by other users.
COMMENT_MARKER = "<!-- duplicate-issue-detector -->"
DUPLICATE_LABEL = os.getenv("DUPLICATE_LABEL", "possible-duplicate")
AUTO_LABEL = os.getenv("AUTO_LABEL", "1").lower() in {"1", "true", "yes"}
ISSUE_EMBED_CACHE_DIR = os.getenv("ISSUE_EMBED_CACHE_DIR", "")
ISSUE_EMBED_CACHE_WRITE = os.getenv("ISSUE_EMBED_CACHE_WRITE", "1").lower() in {
    "1",
    "true",
    "yes",
}
# Both values are stored in every cache entry; an entry written with a
# different mode or feature version is treated as a miss.
EMBEDDING_MODE = "sentence-transformers" if model is not None else "hashing-vectorizer"
FEATURE_VERSION = "v2-template-cleaned"
RUN_STATS = {
    "api_requests": 0,
    "candidates_fetched": 0,
    "text_scored": 0,
    "final_scored": 0,
    "embed_cache_hits": 0,
    "embed_cache_misses": 0,
    "embed_cache_writes": 0,
}
# Normalised "##" section titles whose whole section is removed from the body
# before embedding.
DROP_BODY_SECTION_HEADERS = {
    "your current environment",
    "environment",
    "before submitting",
    "checklist",
}
DROP_BODY_LINE_PATTERNS = [
    # Markdown task-list items such as "- [ ] ..." / "- [x] ...".
    re.compile(r"^\s*-\s*\[[ xX]\]\s*"),
    # Lines that contain only whitespace.
    # NOTE(review): if this was meant to drop HTML template comments as well,
    # the pattern needs to be extended - confirm the intent.
    re.compile(r"^\s*\s*$"),
]
# Lower-case phrases that indicate a fenced block is an environment dump.
ENV_BLOCK_HINTS = {
    "collecting environment information",
    "system info",
    "pytorch info",
    "python environment",
    "cuda / gpu info",
    "cpu info",
    "the output of python collect_env.py",
}


def gh_get(url, params=None):
    """GET *url* from the GitHub API and return the decoded JSON body.

    Raises ``requests.HTTPError`` for a non-success status.
    """
    RUN_STATS["api_requests"] += 1
    r = requests.get(url, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    return r.json()


def gh_post(url, payload):
    """POST *payload* as JSON to *url*; raise ``requests.HTTPError`` on failure."""
    RUN_STATS["api_requests"] += 1
    r = requests.post(url, headers=HEADERS, json=payload, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()


def gh_patch(url, payload):
    """PATCH *url* with *payload* as JSON; raise ``requests.HTTPError`` on failure."""
    RUN_STATS["api_requests"] += 1
    r = requests.patch(url, headers=HEADERS, json=payload, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()


def gh_delete(url, ignore_not_found=False):
    """DELETE *url*; raise ``requests.HTTPError`` on failure.

    With ``ignore_not_found=True`` a 404 response is treated as success, which
    makes deleting an already-absent resource a no-op.
    """
    RUN_STATS["api_requests"] += 1
    r = requests.delete(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if ignore_not_found and r.status_code == 404:
        return
    r.raise_for_status()
def _header_name(line: str) -> str:
    """Return the lower-cased header text of *line* without '#' marks or a trailing ':'."""
    stripped = line.strip().lstrip("#").strip().lower()
    return stripped.rstrip(":")


def _should_drop_code_block(lines: list[str]) -> bool:
    """Decide whether a fenced block looks like an environment dump.

    A block is dropped when it mentions at least two ``ENV_BLOCK_HINTS``
    phrases, or when it is long (60+ lines) and mentions at least one.
    """
    if not lines:
        return False
    block_text = "\n".join(lines).lower()
    hint_hits = sum(1 for hint in ENV_BLOCK_HINTS if hint in block_text)
    if hint_hits >= 2:
        return True
    return len(lines) >= 60 and hint_hits >= 1


def _fence_end(lines: list[str], start: int):
    """Return the index just past the fence closing the block opened at *start*.

    Returns ``None`` when the fence opened at ``lines[start]`` is never closed.
    """
    for idx in range(start + 1, len(lines)):
        if lines[idx].strip().startswith("```"):
            return idx + 1
    return None


def clean_issue_body(body: str) -> str:
    """Strip template boilerplate from an issue body before embedding.

    Removes every "##" header line, the whole content of sections listed in
    ``DROP_BODY_SECTION_HEADERS``, fenced blocks that look like environment
    dumps, and lines matching ``DROP_BODY_LINE_PATTERNS``. Returns the
    remaining lines joined with newlines and stripped.
    """
    lines = body.replace("\r\n", "\n").split("\n")
    output: list[str] = []
    i = 0
    skip_until_next_header = False
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        if stripped.startswith("```"):
            end = _fence_end(lines, i)
            if skip_until_next_header:
                # Jump over a well-formed fence as a unit so that a "##" line
                # inside it (e.g. in pasted tool output) is not mistaken for
                # the next section header. An unterminated fence is skipped
                # line by line so that later sections are still seen.
                i = end if end is not None else i + 1
                continue
            if end is None:
                end = len(lines)
            block = lines[i:end]
            if not _should_drop_code_block(block):
                output.extend(block)
            i = end
            continue
        if stripped.startswith("##"):
            # Header lines themselves are never kept; they only toggle skipping.
            skip_until_next_header = _header_name(line) in DROP_BODY_SECTION_HEADERS
            i += 1
            continue
        if skip_until_next_header:
            i += 1
            continue
        if any(p.search(line) for p in DROP_BODY_LINE_PATTERNS):
            i += 1
            continue
        output.append(line)
        i += 1
    return "\n".join(output).strip()


def build_issue_text(issue):
    """Build the text that is embedded for *issue*: title plus cleaned body.

    Only the first 1000 characters of the cleaned body are used. A missing or
    ``None`` title/body is treated as an empty string.
    """
    raw_body = issue.get("body") or ""
    cleaned_body = clean_issue_body(raw_body)
    title = issue.get("title") or ""
    return f"Title: {title}\nBody: {cleaned_body[:1000]}"


def title_tokens(issue):
    """Return the lower-cased ``[a-z0-9_]+`` tokens of the issue title, in order."""
    title = (issue.get("title") or "").lower()
    return re.findall(r"[a-z0-9_]+", title)


def get_embedding(text: str):
    """Embed *text* with the sentence-transformers model, or the hashing fallback."""
    if model is not None:
        return np.asarray(model.encode(text), dtype=float)
    return hashing_vectorizer.transform([text]).toarray()[0]


def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a float.

    A small epsilon in the denominator makes an all-zero vector yield 0.0
    instead of a division by zero.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def jaccard_similarity(a, b):
    """Return |A & B| / |A | B| for the sets built from *a* and *b*.

    Returns 0.0 when either input is empty.
    """
    sa = set(a)
    sb = set(b)
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)
def post_comment(issue_number, body):
    """Create a new comment with *body* on the issue (printed only in dry-run)."""
    if DRY_RUN:
        print("DRY_RUN enabled: skip posting issue comment.")
        print(body)
        return
    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
    gh_post(url, {"body": body})


def patch_comment(comment_id, body):
    """Replace the text of comment *comment_id* with *body* (printed only in dry-run)."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip updating comment {comment_id}.")
        print(body)
        return
    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
    gh_patch(url, {"body": body})


def delete_comment(comment_id):
    """Delete comment *comment_id* (no-op in dry-run)."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip deleting comment {comment_id}.")
        return
    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
    gh_delete(url)


def add_label(issue_number, label):
    """Add *label* to the issue (no-op in dry-run)."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip adding label {label}.")
        return
    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels"
    gh_post(url, {"labels": [label]})


def _label_url(issue_number, label):
    """Return the API URL addressing *label* on the issue.

    The label is part of the URL path, so it is percent-encoded; otherwise a
    label containing a space, '/', '?' or '#' would address the wrong resource.
    """
    from urllib.parse import quote

    encoded = quote(str(label), safe="")
    return f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels/{encoded}"


def remove_label(issue_number, label):
    """Remove *label* from the issue; a label that is not present is ignored."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip removing label {label}.")
        return
    gh_delete(_label_url(issue_number, label), ignore_not_found=True)


def find_existing_bot_comment(issue_number):
    """Return the id of the first comment containing ``COMMENT_MARKER``, else ``None``.

    Comments are read page by page (100 per page) until a short or empty page.
    """
    if not COMMENT_MARKER:
        # An empty marker is a substring of every body; matching on it would
        # hand back some other user's comment to be overwritten or deleted.
        return None
    page = 1
    while True:
        comments = gh_get(
            f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments",
            params={"per_page": 100, "page": page},
        )
        if not comments:
            return None
        for comment in comments:
            if COMMENT_MARKER in (comment.get("body") or ""):
                return comment["id"]
        if len(comments) < 100:
            return None
        page += 1


def print_run_stats():
    """Print the ``RUN_STATS`` counters on a single line."""
    print(
        "Stats: "
        f"api_requests={RUN_STATS['api_requests']} "
        f"candidates_fetched={RUN_STATS['candidates_fetched']} "
        f"text_scored={RUN_STATS['text_scored']} "
        f"final_scored={RUN_STATS['final_scored']} "
        f"embed_cache_hits={RUN_STATS['embed_cache_hits']} "
        f"embed_cache_misses={RUN_STATS['embed_cache_misses']} "
        f"embed_cache_writes={RUN_STATS['embed_cache_writes']}"
    )
def _read_cached_features(cache_file, updated_at):
    """Return ``(embedding, title_tokens)`` from *cache_file*, or ``None``.

    An entry is only used when it was written with the current
    ``FEATURE_VERSION`` and ``EMBEDDING_MODE`` and for the same *updated_at*
    value. Missing, unreadable, or malformed files yield ``None``.
    """
    try:
        payload = json.loads(cache_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    if not isinstance(payload, dict):
        return None
    embedding = payload.get("embedding")
    title_tok = payload.get("title_tokens")
    is_current = (
        payload.get("feature_version") == FEATURE_VERSION
        and payload.get("mode") == EMBEDDING_MODE
        and payload.get("updated_at", "") == updated_at
    )
    # An empty embedding cannot be compared with a real one, so reject it here.
    if not (is_current and isinstance(embedding, list) and embedding):
        return None
    if not isinstance(title_tok, list):
        return None
    try:
        return np.asarray(embedding, dtype=float), title_tok
    except (TypeError, ValueError):
        return None


def get_issue_features(issue):
    """Return ``(embedding, title_tokens)`` for *issue*, using the disk cache.

    The cache is best-effort: when ``ISSUE_EMBED_CACHE_DIR`` is unset, cannot
    be created, or holds no valid entry for this issue, the features are
    computed from the issue itself. New entries are written only when
    ``ISSUE_EMBED_CACHE_WRITE`` is enabled. Hit, miss, and write counters in
    ``RUN_STATS`` are updated.
    """
    cache_file = None
    issue_number = issue.get("number")
    updated_at = issue.get("updated_at") or ""
    if ISSUE_EMBED_CACHE_DIR and issue_number is not None:
        cache_dir = Path(ISSUE_EMBED_CACHE_DIR)
        try:
            cache_dir.mkdir(parents=True, exist_ok=True)
            cache_file = cache_dir / f"{issue_number}.json"
        except OSError as exc:
            # An unusable cache directory only disables caching for this call.
            print(f"Embedding cache directory unavailable ({exc}); caching skipped.")
    if cache_file is not None:
        cached = _read_cached_features(cache_file, updated_at)
        if cached is not None:
            RUN_STATS["embed_cache_hits"] += 1
            return cached
    RUN_STATS["embed_cache_misses"] += 1
    text = build_issue_text(issue)
    emb = get_embedding(text)
    title_tok = title_tokens(issue)
    if cache_file is not None and ISSUE_EMBED_CACHE_WRITE:
        entry = {
            "feature_version": FEATURE_VERSION,
            "mode": EMBEDDING_MODE,
            "updated_at": updated_at,
            "embedding": emb.tolist(),
            "title_tokens": title_tok,
        }
        try:
            cache_file.write_text(json.dumps(entry), encoding="utf-8")
            RUN_STATS["embed_cache_writes"] += 1
        except (OSError, TypeError, ValueError) as exc:
            # A failed write must not fail the detection itself.
            print(f"Could not write embedding cache entry {cache_file}: {exc}")
    return emb, title_tok
def _escape_table_cell(text):
    """Make *text* safe to place inside one Markdown table cell.

    Backslashes and pipes are escaped and line breaks become spaces, so a
    title containing '|' or a newline cannot split or terminate the row.
    """
    cell = str(text or "").replace("\\", "\\\\").replace("|", "\\|")
    return cell.replace("\r", " ").replace("\n", " ")


def _fetch_candidates():
    """Return up to ``MAX_CANDIDATES`` issues, most recently updated first.

    Pull requests returned by the issues endpoint and the issue under
    analysis itself are left out.
    """
    candidates = []
    page = 1
    while len(candidates) < MAX_CANDIDATES:
        issues = gh_get(
            f"https://api.github.com/repos/{REPO}/issues",
            params={
                "state": ISSUE_CANDIDATE_STATE,
                "per_page": 50,
                "page": page,
                "sort": "updated",
                "direction": "desc",
            },
        )
        if not issues:
            break
        for issue in issues:
            if "pull_request" in issue or issue["number"] == ISSUE_NUMBER:
                continue
            candidates.append(issue)
            if len(candidates) >= MAX_CANDIDATES:
                break
        page += 1
        if len(issues) < 50:
            # A short page is the last page.
            break
    return candidates


def _rank_candidates(current_emb, current_title_tokens, candidates):
    """Score *candidates* against the current issue and return the matches.

    Stage 1 ranks every candidate by embedding cosine similarity. Stage 2
    blends that score with title-token Jaccard overlap for the best
    ``TITLE_COMPARE_TOP_N`` candidates. Returns at most ``TOP_K`` tuples
    ``(final_sim, issue, text_sim, tok_sim)`` whose blended score reaches
    ``SIMILARITY_THRESHOLD``, best first.
    """
    text_results = []
    candidate_title_tokens = {}
    for issue in candidates:
        emb, title_tok = get_issue_features(issue)
        candidate_title_tokens[issue["number"]] = title_tok
        text_sim = cosine_similarity(current_emb, emb)
        RUN_STATS["text_scored"] += 1
        text_results.append((text_sim, issue))

    text_results.sort(key=lambda x: -x[0])

    blended_results = []
    for text_sim, issue in text_results[:TITLE_COMPARE_TOP_N]:
        tok_sim = jaccard_similarity(
            current_title_tokens, candidate_title_tokens.get(issue["number"], [])
        )
        final_sim = TEXT_WEIGHT * text_sim + TITLE_WEIGHT * tok_sim
        RUN_STATS["final_scored"] += 1
        blended_results.append((final_sim, issue, text_sim, tok_sim))

    blended_results.sort(key=lambda x: -x[0])
    return [
        result for result in blended_results[:TOP_K] if result[0] >= SIMILARITY_THRESHOLD
    ]


def _build_comment_body(top_results):
    """Render the bot comment (marker, intro, and one table row per match)."""
    lines = [
        COMMENT_MARKER,
        "## 🔍 Potentially Related Issues\n",
        (
            f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this "
            "issue:\n"
        ),
        (
            "If this is intentional and complementary work, feel free to ignore "
            "this notice.\n"
        ),
        "| Match Score | Desc Similarity | Title Overlap | Issue # | State | Title |",
        "|---|---|---|---|---|---|",
    ]
    for sim, issue, text_sim, tok_sim in top_results:
        state_icon = "🟢" if issue["state"] == "open" else "🔴"
        title = _escape_table_cell(issue["title"])
        row = (
            f"| {sim:.0%} | {text_sim:.0%} | {tok_sim:.0%} | "
            f"#{issue['number']} | {state_icon} {issue['state']} | "
            f"[{title}]({issue['html_url']}) |"
        )
        lines.append(row)
    lines.append(
        "\n> 🤖 Auto-detected by similarity signals (title/body/title-tokens)."
    )
    lines.append(
        "This is a soft hint only. Please review manually to determine whether "
        "these are related work or true duplicates."
    )
    return "\n".join(lines)


def main():
    """Detect issues similar to ``ISSUE_NUMBER`` and sync the bot comment/label.

    With matches: the label is added (when ``AUTO_LABEL``) and the bot comment
    is created or updated. Without matches: a stale bot comment and the label
    are removed.
    """
    current_issue = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}")
    if "pull_request" in current_issue:
        print("This is a PR, skipping.")
        return

    current_emb, current_title_tokens = get_issue_features(current_issue)

    candidates = _fetch_candidates()
    RUN_STATS["candidates_fetched"] = len(candidates)

    top_results = _rank_candidates(current_emb, current_title_tokens, candidates)

    existing_comment_id = find_existing_bot_comment(ISSUE_NUMBER)
    if not top_results:
        if existing_comment_id is not None:
            delete_comment(existing_comment_id)
            print("Deleted stale duplicate checker comment.")
        if AUTO_LABEL:
            remove_label(ISSUE_NUMBER, DUPLICATE_LABEL)
        print("No highly similar issues found.")
        print_run_stats()
        return

    if AUTO_LABEL:
        add_label(ISSUE_NUMBER, DUPLICATE_LABEL)

    body = _build_comment_body(top_results)

    if existing_comment_id is not None:
        patch_comment(existing_comment_id, body)
        print(f"Updated comment with {len(top_results)} similar issues.")
    else:
        post_comment(ISSUE_NUMBER, body)
        print(f"Posted comment with {len(top_results)} similar issues.")
    print_run_stats()


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Detect duplicate PRs using text similarity + file overlap.

Workflow overview:
1. Load current PR metadata, text, and changed files.
2. Fetch PR candidates in the state given by PR_CANDIDATE_STATE (default:
   all), excluding the current PR.
3. Use text similarity for first-pass candidate ranking.
4. Compute blended text+file similarity on top candidates.
5. Upsert one bot comment; remove stale comment when no matches remain.

Local debug example:
GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 \
PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files \
.venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py
"""

import json
import os
from pathlib import Path

import numpy as np
import requests
from sklearn.feature_extraction.text import HashingVectorizer

# sentence-transformers is optional: when it is disabled, cannot be imported,
# or its model cannot be loaded, embeddings come from the hashing vectorizer.
USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in {
    "1",
    "true",
    "yes",
}
try:
    if USE_SENTENCE_TRANSFORMERS:
        from sentence_transformers import SentenceTransformer
    else:
        SentenceTransformer = None
except Exception:
    SentenceTransformer = None

model = None
if SentenceTransformer is not None:
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    except Exception as exc:
        # Loading can fail for reasons unrelated to this script (for example a
        # failed model download); degrade to the hashing fallback the same way
        # a failed import does instead of aborting the run.
        print(
            f"Could not load sentence-transformers model ({exc}); "
            "falling back to hashing vectorizer."
        )
        model = None

hashing_vectorizer = HashingVectorizer(
    n_features=2048,
    alternate_sign=False,
    norm="l2",
)


GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
PR_NUMBER = int(os.environ["PR_NUMBER"])
REPO = os.environ["REPO"]
# Defaults to dry-run so that a local invocation never writes to GitHub
# unless DRY_RUN is explicitly turned off.
DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"}

HEADERS = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

# Seconds to wait for a GitHub API read; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = float(os.getenv("HTTP_TIMEOUT", "30"))

SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.75"))
# TOP_K and the two weights can be overridden through the environment, like
# the other tunables; the defaults are the previously fixed values.
TOP_K = int(os.getenv("TOP_K", "5"))
MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500"))
PR_CANDIDATE_STATE = os.getenv("PR_CANDIDATE_STATE", "all")
FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20"))
PREFETCH_CANDIDATE_FILES = os.getenv("PREFETCH_CANDIDATE_FILES", "1").lower() in {
    "1",
    "true",
    "yes",
}
TEXT_WEIGHT = float(os.getenv("TEXT_WEIGHT", "0.75"))
FILE_WEIGHT = float(os.getenv("FILE_WEIGHT", "0.25"))
# Hidden HTML comment embedded in the bot comment so later runs can find it.
# It must be non-empty: an empty marker is a substring of every comment body,
# which would make the script edit or delete comments written by other users.
COMMENT_MARKER = "<!-- duplicate-pr-detector -->"
PR_FILE_CACHE_DIR = os.getenv("PR_FILE_CACHE_DIR", "")
PR_FILE_CACHE_WRITE = os.getenv("PR_FILE_CACHE_WRITE", "1").lower() in {
    "1",
    "true",
    "yes",
}
RUN_STATS = {
    "api_requests": 0,
    "file_cache_hits": 0,
    "file_cache_misses": 0,
    "file_cache_writes": 0,
}


def gh_get(url, params=None):
    """GET *url* from the GitHub API and return the decoded JSON body.

    Raises ``requests.HTTPError`` for a non-success status.
    """
    RUN_STATS["api_requests"] += 1
    r = requests.get(url, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    return r.json()
# Seconds to wait for a GitHub write call made by the comment helpers below.
_WRITE_TIMEOUT_SECONDS = 30


def _load_cached_files(cache_file):
    """Return the cached filename list in *cache_file*, or ``None`` if unusable."""
    try:
        cached_files = json.loads(cache_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    return cached_files if isinstance(cached_files, list) else None


def get_pr_files(pr_number):
    """Return the filenames changed by PR *pr_number*.

    Candidate PRs are served from the on-disk cache when an entry exists. The
    PR under analysis (``PR_NUMBER``) is always re-fetched, because its cached
    list may predate the commits that triggered this run. All pages of the
    files endpoint are read. On an API failure a warning is printed and an
    empty list is returned, so one failing candidate does not abort the run.
    """
    cache_file = None
    if PR_FILE_CACHE_DIR:
        cache_dir = Path(PR_FILE_CACHE_DIR)
        try:
            cache_dir.mkdir(parents=True, exist_ok=True)
            cache_file = cache_dir / f"{pr_number}.json"
        except OSError as exc:
            # An unusable cache directory only disables caching for this call.
            print(f"PR file cache directory unavailable ({exc}); caching skipped.")
    if cache_file is not None and pr_number != PR_NUMBER:
        cached_files = _load_cached_files(cache_file)
        if cached_files is not None:
            RUN_STATS["file_cache_hits"] += 1
            return cached_files
    RUN_STATS["file_cache_misses"] += 1
    url = f"https://api.github.com/repos/{REPO}/pulls/{pr_number}/files"
    filenames = []
    page = 1
    try:
        while True:
            files = gh_get(url, params={"per_page": 100, "page": page})
            filenames.extend(f["filename"] for f in files)
            if len(files) < 100:
                # A short (or empty) page is the last page.
                break
            page += 1
    except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
        print(f"WARNING: could not list files of PR #{pr_number}: {exc}")
        return []
    if cache_file is not None and PR_FILE_CACHE_WRITE:
        try:
            cache_file.write_text(json.dumps(filenames), encoding="utf-8")
            RUN_STATS["file_cache_writes"] += 1
        except OSError as exc:
            # A failed write must not fail the detection itself.
            print(f"Could not write PR file cache entry {cache_file}: {exc}")
    return filenames


def build_pr_text(pr):
    """Build the text that is embedded for *pr*: title plus the first 800 body chars.

    A missing or ``None`` title/body is treated as an empty string.
    """
    parts = [
        f"Title: {pr.get('title') or ''}",
        f"Body: {(pr.get('body') or '')[:800]}",
    ]
    return "\n".join(parts)


def get_embedding(text: str):
    """Embed *text* with the sentence-transformers model, or the hashing fallback."""
    if model is not None:
        return np.asarray(model.encode(text), dtype=float)
    return hashing_vectorizer.transform([text]).toarray()[0]


def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a float.

    A small epsilon in the denominator makes an all-zero vector yield 0.0
    instead of a division by zero.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def jaccard_similarity(a, b):
    """Return |A & B| / |A | B| for the sets built from *a* and *b*.

    Returns 0.0 when either input is empty.
    """
    sa = set(a)
    sb = set(b)
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)


def _send_write(method, url, action, payload=None):
    """Send one write request to GitHub and report a failure status.

    The request is counted in ``RUN_STATS``. A non-success response prints a
    warning instead of raising, so that a token without write access does not
    turn the hint-only check into a failed job.
    """
    RUN_STATS["api_requests"] += 1
    response = requests.request(
        method, url, headers=HEADERS, json=payload, timeout=_WRITE_TIMEOUT_SECONDS
    )
    if not response.ok:
        print(f"WARNING: could not {action}: HTTP {response.status_code}")


def post_comment(issue_number, body):
    """Create a new comment with *body* on the PR (printed only in dry-run)."""
    if DRY_RUN:
        print("DRY_RUN enabled: skip posting PR comment.")
        print(body)
        return
    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
    _send_write("POST", url, "post PR comment", {"body": body})


def patch_comment(comment_id, body):
    """Replace the text of comment *comment_id* with *body* (printed only in dry-run)."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip updating comment {comment_id}.")
        print(body)
        return
    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
    _send_write("PATCH", url, f"update comment {comment_id}", {"body": body})


def delete_comment(comment_id):
    """Delete comment *comment_id* (no-op in dry-run)."""
    if DRY_RUN:
        print(f"DRY_RUN enabled: skip deleting comment {comment_id}.")
        return
    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
    _send_write("DELETE", url, f"delete comment {comment_id}")


def find_existing_bot_comment(issue_number):
    """Return the id of the first comment containing ``COMMENT_MARKER``, else ``None``.

    Comments are read page by page (100 per page) until a short or empty page.
    """
    if not COMMENT_MARKER:
        # An empty marker is a substring of every body; matching on it would
        # hand back some other user's comment to be overwritten or deleted.
        return None
    page = 1
    while True:
        comments = gh_get(
            f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments",
            params={"per_page": 100, "page": page},
        )
        if not comments:
            return None
        for comment in comments:
            if COMMENT_MARKER in (comment.get("body") or ""):
                return comment["id"]
        if len(comments) < 100:
            return None
        page += 1
def _escape_table_cell(text):
    """Make *text* safe to place inside one Markdown table cell.

    Backslashes and pipes are escaped and line breaks become spaces, so a
    title containing '|' or a newline cannot split or terminate the row.
    """
    cell = str(text or "").replace("\\", "\\\\").replace("|", "\\|")
    return cell.replace("\r", " ").replace("\n", " ")


def _fetch_candidate_prs():
    """Return up to ``MAX_CANDIDATES`` PRs, most recently updated first.

    The PR under analysis itself is left out.
    """
    history_prs = []
    page = 1
    while len(history_prs) < MAX_CANDIDATES:
        prs = gh_get(
            f"https://api.github.com/repos/{REPO}/pulls",
            params={
                "state": PR_CANDIDATE_STATE,
                "per_page": 50,
                "page": page,
                "sort": "updated",
                "direction": "desc",
            },
        )
        if not prs:
            break
        for pr in prs:
            if pr["number"] != PR_NUMBER:
                history_prs.append(pr)
            if len(history_prs) >= MAX_CANDIDATES:
                break
        page += 1
        if len(prs) < 50:
            # A short page is the last page.
            break
    return history_prs


def _build_comment_body(top_results):
    """Render the bot comment (marker, intro, and one table row per match)."""
    lines = [
        COMMENT_MARKER,
        "## 🔍 Potentially Related PRs\n",
        (
            f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, "
            "and could overlap in intent or implementation:\n"
        ),
        (
            "If this is intentional and complementary work, feel free to ignore "
            "this notice.\n"
        ),
        "| Match Score | Desc Similarity | Files Overlap | PR # | State | Title |",
        "|---|---|---|---|---|---|",
    ]
    for sim, pr, text_sim, file_sim in top_results:
        # Open, merged, and closed-without-merge PRs get distinct icons.
        state_icon = (
            "🟢" if pr["state"] == "open" else ("🟣" if pr.get("merged_at") else "🔴")
        )
        title = _escape_table_cell(pr["title"])
        row = (
            f"| {sim:.0%} | {text_sim:.0%} | {file_sim:.0%} | "
            f"#{pr['number']} | {state_icon} {pr['state']} | "
            f"[{title}]({pr['html_url']}) |"
        )
        lines.append(row)
    lines.append("\n> 🤖 Auto-detected by similarity signals (title/body/files).")
    lines.append(
        "This is a soft hint only. Please review manually to determine whether "
        "these are related work or true duplicates."
    )
    return "\n".join(lines)


def main():
    """Detect PRs similar to ``PR_NUMBER`` and sync the bot comment.

    With matches the bot comment is created or updated; without matches a
    stale bot comment is removed.
    """

    def print_run_stats():
        print(
            "Stats: "
            f"api_requests={RUN_STATS['api_requests']} "
            f"file_cache_hits={RUN_STATS['file_cache_hits']} "
            f"file_cache_misses={RUN_STATS['file_cache_misses']} "
            f"file_cache_writes={RUN_STATS['file_cache_writes']}"
        )

    # 1. Load current PR context.
    current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}")
    current_text = build_pr_text(current_pr)
    current_emb = get_embedding(current_text)
    current_files = get_pr_files(PR_NUMBER)

    # 2. Fetch PR candidates (excluding the current PR), most recently
    #    updated first.
    history_prs = _fetch_candidate_prs()

    # 3. Stage-1: rank candidates by text similarity.
    text_results = []
    for pr in history_prs:
        emb = get_embedding(build_pr_text(pr))
        text_results.append((cosine_similarity(current_emb, emb), pr))

    # Warm file cache for all candidates so different PR runs can reuse
    # a stable candidate pool across workflow executions.
    if PREFETCH_CANDIDATE_FILES:
        for pr in history_prs:
            get_pr_files(pr["number"])

    text_results.sort(key=lambda x: -x[0])
    file_candidates = text_results[:FILE_COMPARE_TOP_N]

    # 4. Stage-2: score top candidates with text + file overlap.
    results = []
    for text_sim, pr in file_candidates:
        pr_files = get_pr_files(pr["number"])
        file_sim = jaccard_similarity(current_files, pr_files)
        final_sim = TEXT_WEIGHT * text_sim + FILE_WEIGHT * file_sim
        results.append((final_sim, pr, text_sim, file_sim))

    results.sort(key=lambda x: -x[0])
    top_results = [
        result for result in results[:TOP_K] if result[0] >= SIMILARITY_THRESHOLD
    ]

    # 5. Upsert bot comment
    existing_comment_id = find_existing_bot_comment(PR_NUMBER)
    if not top_results:
        if existing_comment_id is not None:
            delete_comment(existing_comment_id)
            print("Deleted stale duplicate checker comment.")
        else:
            print("No highly similar PRs found.")
        print_run_stats()
        return

    body = _build_comment_body(top_results)
    if existing_comment_id is not None:
        patch_comment(existing_comment_id, body)
        print(f"Updated comment with {len(top_results)} similar PRs.")
    else:
        post_comment(PR_NUMBER, body)
        print(f"Posted comment with {len(top_results)} similar PRs.")
    print_run_stats()


if __name__ == "__main__":
    main()