From 69ae7560339fbe3de528a97bf3a15d961e4c9545 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Mon, 13 Apr 2026 15:15:04 +0800 Subject: [PATCH 1/6] test AI de-dup CI Signed-off-by: Peter Pan --- .github/workflows/detect-duplicate-issues.yml | 29 +++ .github/workflows/detect-duplicate-prs.yml | 29 +++ .../scripts/detect_duplicate_issues.py | 149 ++++++++++++++ .../workflows/scripts/detect_duplicate_prs.py | 190 ++++++++++++++++++ 4 files changed, 397 insertions(+) create mode 100644 .github/workflows/detect-duplicate-issues.yml create mode 100644 .github/workflows/detect-duplicate-prs.yml create mode 100644 .github/workflows/scripts/detect_duplicate_issues.py create mode 100644 .github/workflows/scripts/detect_duplicate_prs.py diff --git a/.github/workflows/detect-duplicate-issues.yml b/.github/workflows/detect-duplicate-issues.yml new file mode 100644 index 000000000000..d4a3ba32eb8f --- /dev/null +++ b/.github/workflows/detect-duplicate-issues.yml @@ -0,0 +1,29 @@ +name: Detect Duplicate Issues + +on: + issues: + types: [opened, reopened] + +jobs: + detect-duplicate: + runs-on: ubuntu-latest + permissions: + issues: write + contents: read + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install requests numpy scikit-learn sentence-transformers + + - name: Run duplicate Issue detection + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + REPO: ${{ github.repository }} + run: python .github/workflows/scripts/detect_duplicate_issues.py diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml new file mode 100644 index 000000000000..2d739656f4c3 --- /dev/null +++ b/.github/workflows/detect-duplicate-prs.yml @@ -0,0 +1,29 @@ +name: Detect Duplicate PRs + +on: + pull_request: + types: [opened, reopened] + +jobs: + detect-duplicate: + runs-on: ubuntu-latest + permissions: + pull-requests: write + contents: read + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install requests numpy scikit-learn sentence-transformers + + - name: Run duplicate PR detection + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} + run: python .github/workflows/scripts/detect_duplicate_prs.py diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py new file mode 100644 index 000000000000..339d10270a52 --- /dev/null +++ b/.github/workflows/scripts/detect_duplicate_issues.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Detect duplicate Issues using OpenAI embeddings + cosine similarity. +Compares: title + body keywords. +""" + +import os + +import numpy as np +import requests + +# 替换 get_embedding 函数,使用 sentence-transformers(免费) +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer("all-MiniLM-L6-v2") + + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "") +ISSUE_NUMBER = int(os.environ["ISSUE_NUMBER"]) +REPO = os.environ["REPO"] +DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"} + +HEADERS = {"Accept": "application/vnd.github+json"} +if GITHUB_TOKEN: + HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" + +SIMILARITY_THRESHOLD = 0.88 +TOP_K = 5 +MAX_HISTORY = 300 + + +def gh_get(url, params=None): + r = requests.get(url, headers=HEADERS, params=params) + r.raise_for_status() + return r.json() + + +def build_issue_text(issue): + return f"Title: {issue.get('title', '')}\nBody: {(issue.get('body') or '')[:1000]}" + + +def get_embedding(text: str): + return model.encode(text) + + +def cosine_similarity(a, b): + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)) + + +def post_comment(issue_number, body): + if DRY_RUN: + print("DRY_RUN enabled: skip posting issue comment.") + print(body) + return + url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments" + requests.post(url, headers=HEADERS, json={"body": body}) + + +def add_label(issue_number, label): + if DRY_RUN: + print(f"DRY_RUN enabled: skip adding label {label}.") + return + url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels" + requests.post(url, headers=HEADERS, json={"labels": [label]}) + + +def main(): + # 1. 获取当前 Issue + current = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}") + # 跳过 PR(GitHub API 中 PR 也会出现在 issues 接口) + if "pull_request" in current: + print("This is a PR, skipping.") + return + + current_text = build_issue_text(current) + current_emb = get_embedding(current_text) + + # 2. 拉取历史 Issues(open + closed) + history_issues = [] + for state in ["open", "closed"]: + page = 1 + while len(history_issues) < MAX_HISTORY: + issues = gh_get( + f"https://api.github.com/repos/{REPO}/issues", + params={ + "state": state, + "per_page": 50, + "page": page, + "sort": "updated", + "direction": "desc", + }, + ) + if not issues: + break + for issue in issues: + # 跳过 PR 和自身 + if "pull_request" not in issue and issue["number"] != ISSUE_NUMBER: + history_issues.append(issue) + page += 1 + if len(issues) < 50: + break + if len(history_issues) >= MAX_HISTORY: + break + + history_issues = history_issues[:MAX_HISTORY] + + # 3. 计算相似度 + results = [] + for issue in history_issues: + text = build_issue_text(issue) + emb = get_embedding(text) + sim = cosine_similarity(current_emb, emb) + results.append((sim, issue)) + + results.sort(key=lambda x: -x[0]) + top_results = [ + (sim, i) for sim, i in results[:TOP_K] if sim >= SIMILARITY_THRESHOLD + ] + + if not top_results: + print("No highly similar issues found.") + return + + # 4. 自动打 label + 发评论 + add_label(ISSUE_NUMBER, "possible-duplicate") + + lines = [ + "## 🔍 Possible Duplicate Issue Detected\n", + "The following existing issues appear highly similar:\n", + "| Similarity | Issue | State | Title |", + "|---|---|---|---|", + ] + for sim, issue in top_results: + state_icon = "🟢" if issue["state"] == "open" else "🔴" + row = ( + f"| {sim:.0%} | #{issue['number']} | {state_icon} {issue['state']} | " + f"[{issue['title']}]({issue['html_url']}) |" + ) + lines.append(row) + lines.append( + "\n> 🤖 Auto-detected by duplicate issue checker. A maintainer will verify." + ) + post_comment(ISSUE_NUMBER, "\n".join(lines)) + print(f"Posted comment with {len(top_results)} similar issues.") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py new file mode 100644 index 000000000000..75d1a5e6ebb8 --- /dev/null +++ b/.github/workflows/scripts/detect_duplicate_prs.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Detect duplicate PRs using text similarity + file overlap. +""" + +import os + +import numpy as np +import requests +from sklearn.feature_extraction.text import HashingVectorizer + +USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in { + "1", + "true", + "yes", +} +try: + if USE_SENTENCE_TRANSFORMERS: + from sentence_transformers import SentenceTransformer + else: + SentenceTransformer = None +except Exception: + SentenceTransformer = None + +model = None +if SentenceTransformer is not None: + model = SentenceTransformer("all-MiniLM-L6-v2") + +hashing_vectorizer = HashingVectorizer( + n_features=2048, + alternate_sign=False, + norm="l2", +) + + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "") +PR_NUMBER = int(os.environ["PR_NUMBER"]) +REPO = os.environ["REPO"] +DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"} + +HEADERS = {"Accept": "application/vnd.github+json"} +if GITHUB_TOKEN: + HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" + +SIMILARITY_THRESHOLD = 0.82 +TOP_K = 5 +MAX_OPEN_CANDIDATES = int(os.getenv("MAX_OPEN_CANDIDATES", "120")) +FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20")) +TEXT_WEIGHT = 0.75 +FILE_WEIGHT = 0.25 + + +def gh_get(url, params=None): + r = requests.get(url, headers=HEADERS, params=params) + r.raise_for_status() + return r.json() + + +def get_pr_files(pr_number): + url = f"https://api.github.com/repos/{REPO}/pulls/{pr_number}/files" + try: + files = gh_get(url, params={"per_page": 100}) + return [f["filename"] for f in files] + except Exception: + return [] + + +def build_pr_text(pr): + parts = [ + f"Title: {pr.get('title', '')}", + f"Body: {(pr.get('body') or '')[:800]}", + ] + return "\n".join(parts) + + +def get_embedding(text: str): + if model is not None: + return np.asarray(model.encode(text), dtype=float) + return hashing_vectorizer.transform([text]).toarray()[0] + + +def cosine_similarity(a, b): + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)) + + +def jaccard_similarity(a, b): + sa = set(a) + sb = set(b) + if not sa or not sb: + return 0.0 + return len(sa & sb) / len(sa | sb) + + +def post_comment(issue_number, body): + if DRY_RUN: + print("DRY_RUN enabled: skip posting PR comment.") + print(body) + return + url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments" + requests.post(url, headers=HEADERS, json={"body": body}) + + +def main(): + # 1. 获取当前 PR 信息 + current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}") + current_text = build_pr_text(current_pr) + current_emb = get_embedding(current_text) + current_files = get_pr_files(PR_NUMBER) + + # 2. 拉取 open PR(排除自身,限制候选规模) + history_prs = [] + page = 1 + while len(history_prs) < MAX_OPEN_CANDIDATES: + prs = gh_get( + f"https://api.github.com/repos/{REPO}/pulls", + params={ + "state": "open", + "per_page": 50, + "page": page, + "sort": "updated", + "direction": "desc", + }, + ) + if not prs: + break + for pr in prs: + if pr["number"] != PR_NUMBER: + history_prs.append(pr) + if len(history_prs) >= MAX_OPEN_CANDIDATES: + break + page += 1 + if len(prs) < 50: + break + + # 3. 第一阶段:文本相似度筛候选 + text_results = [] + for pr in history_prs: + text = build_pr_text(pr) + emb = get_embedding(text) + text_sim = cosine_similarity(current_emb, emb) + text_results.append((text_sim, pr)) + + text_results.sort(key=lambda x: -x[0]) + file_candidates = text_results[:FILE_COMPARE_TOP_N] + + # 4. 第二阶段:仅对Top候选拉取文件并融合得分 + results = [] + for text_sim, pr in file_candidates: + pr_files = get_pr_files(pr["number"]) + file_sim = jaccard_similarity(current_files, pr_files) + final_sim = TEXT_WEIGHT * text_sim + FILE_WEIGHT * file_sim + results.append((final_sim, pr, text_sim, file_sim)) + + results.sort(key=lambda x: -x[0]) + top_results = [ + (sim, pr, text_sim, file_sim) + for sim, pr, text_sim, file_sim in results[:TOP_K] + if sim >= SIMILARITY_THRESHOLD + ] + + # 5. 发评论 + if not top_results: + print("No highly similar PRs found.") + return + + lines = [ + "## 🔍 Potential Duplicate PRs Detected\n", + "The following open PRs appear similar to this one:\n", + "| Score | Text | Files | PR | State | Title |", + "|---|---|---|---|---|---|", + ] + for sim, pr, text_sim, file_sim in top_results: + state_icon = ( + "🟢" if pr["state"] == "open" else ("🟣" if pr.get("merged_at") else "🔴") + ) + row = ( + f"| {sim:.0%} | {text_sim:.0%} | {file_sim:.0%} | " + f"#{pr['number']} | {state_icon} {pr['state']} | " + f"[{pr['title']}]({pr['html_url']}) |" + ) + lines.append(row) + lines.append("\n> 🤖 Auto-detected by duplicate PR checker.") + lines.append("Please review to avoid redundant work.") + post_comment(PR_NUMBER, "\n".join(lines)) + print(f"Posted comment with {len(top_results)} similar PRs.") + + +if __name__ == "__main__": + main() From fdee28f2aaaa058f3ff6a007630001b8eb5177e3 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Mon, 13 Apr 2026 17:36:24 +0800 Subject: [PATCH 2/6] refine refine Signed-off-by: Peter Pan --- .github/workflows/detect-duplicate-prs.yml | 3 +- .../workflows/scripts/detect_duplicate_prs.py | 70 ++++++++++++++++--- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml index 2d739656f4c3..2f04cff71d27 100644 --- a/.github/workflows/detect-duplicate-prs.yml +++ b/.github/workflows/detect-duplicate-prs.yml @@ -2,7 +2,7 @@ name: Detect Duplicate PRs on: pull_request: - types: [opened, reopened] + types: [opened, reopened, synchronize, edited, ready_for_review] jobs: detect-duplicate: @@ -27,3 +27,4 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} REPO: ${{ github.repository }} run: python .github/workflows/scripts/detect_duplicate_prs.py + diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py index 75d1a5e6ebb8..906fc72e4455 100644 --- a/.github/workflows/scripts/detect_duplicate_prs.py +++ b/.github/workflows/scripts/detect_duplicate_prs.py @@ -2,6 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Detect duplicate PRs using text similarity + file overlap. + +Workflow overview: +1. Load current PR metadata, text, and changed files. +2. Fetch open PR candidates (excluding the current PR). +3. Use text similarity for first-pass candidate ranking. +4. Compute blended text+file similarity on top candidates. +5. Upsert one bot comment; remove stale comment when no matches remain. """ import os @@ -49,6 +56,7 @@ FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20")) TEXT_WEIGHT = 0.75 FILE_WEIGHT = 0.25 +COMMENT_MARKER = "" def gh_get(url, params=None): @@ -101,14 +109,48 @@ def post_comment(issue_number, body): requests.post(url, headers=HEADERS, json={"body": body}) +def patch_comment(comment_id, body): + if DRY_RUN: + print(f"DRY_RUN enabled: skip updating comment {comment_id}.") + print(body) + return + url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}" + requests.patch(url, headers=HEADERS, json={"body": body}) + + +def delete_comment(comment_id): + if DRY_RUN: + print(f"DRY_RUN enabled: skip deleting comment {comment_id}.") + return + url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}" + requests.delete(url, headers=HEADERS) + + +def find_existing_bot_comment(issue_number): + page = 1 + while True: + comments = gh_get( + f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments", + params={"per_page": 100, "page": page}, + ) + if not comments: + return None + for comment in comments: + if COMMENT_MARKER in (comment.get("body") or ""): + return comment["id"] + if len(comments) < 100: + return None + page += 1 + + def main(): - # 1. 获取当前 PR 信息 + # 1. Load current PR context. current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}") current_text = build_pr_text(current_pr) current_emb = get_embedding(current_text) current_files = get_pr_files(PR_NUMBER) - # 2. 拉取 open PR(排除自身,限制候选规模) + # 2. Fetch open PR candidates (exclude current PR). history_prs = [] page = 1 while len(history_prs) < MAX_OPEN_CANDIDATES: @@ -133,7 +175,7 @@ def main(): if len(prs) < 50: break - # 3. 第一阶段:文本相似度筛候选 + # 3. Stage-1: rank candidates by text similarity. text_results = [] for pr in history_prs: text = build_pr_text(pr) @@ -144,7 +186,7 @@ def main(): text_results.sort(key=lambda x: -x[0]) file_candidates = text_results[:FILE_COMPARE_TOP_N] - # 4. 第二阶段:仅对Top候选拉取文件并融合得分 + # 4. Stage-2: score top candidates with text + file overlap. results = [] for text_sim, pr in file_candidates: pr_files = get_pr_files(pr["number"]) @@ -159,12 +201,18 @@ def main(): if sim >= SIMILARITY_THRESHOLD ] - # 5. 发评论 + # 5. Upsert bot comment + existing_comment_id = find_existing_bot_comment(PR_NUMBER) if not top_results: - print("No highly similar PRs found.") + if existing_comment_id is not None: + delete_comment(existing_comment_id) + print("Deleted stale duplicate checker comment.") + else: + print("No highly similar PRs found.") return lines = [ + COMMENT_MARKER, "## 🔍 Potential Duplicate PRs Detected\n", "The following open PRs appear similar to this one:\n", "| Score | Text | Files | PR | State | Title |", @@ -182,9 +230,15 @@ def main(): lines.append(row) lines.append("\n> 🤖 Auto-detected by duplicate PR checker.") lines.append("Please review to avoid redundant work.") - post_comment(PR_NUMBER, "\n".join(lines)) - print(f"Posted comment with {len(top_results)} similar PRs.") + body = "\n".join(lines) + if existing_comment_id is not None: + patch_comment(existing_comment_id, body) + print(f"Updated comment with {len(top_results)} similar PRs.") + else: + post_comment(PR_NUMBER, body) + print(f"Posted comment with {len(top_results)} similar PRs.") if __name__ == "__main__": main() + From 5e779eef4f8a5e9f133f621128540b9c45e2d040 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Tue, 14 Apr 2026 12:08:17 +0800 Subject: [PATCH 3/6] adding closed PR + change threshold Signed-off-by: Peter Pan --- .../workflows/scripts/detect_duplicate_prs.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py index 906fc72e4455..86e9358126e3 100644 --- a/.github/workflows/scripts/detect_duplicate_prs.py +++ b/.github/workflows/scripts/detect_duplicate_prs.py @@ -50,9 +50,12 @@ if GITHUB_TOKEN: HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" -SIMILARITY_THRESHOLD = 0.82 +SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.75")) TOP_K = 5 -MAX_OPEN_CANDIDATES = int(os.getenv("MAX_OPEN_CANDIDATES", "120")) +MAX_CANDIDATES = int( + os.getenv("MAX_CANDIDATES", os.getenv("MAX_OPEN_CANDIDATES", "120")) +) +PR_CANDIDATE_STATE = os.getenv("PR_CANDIDATE_STATE", "all") FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20")) TEXT_WEIGHT = 0.75 FILE_WEIGHT = 0.25 @@ -150,14 +153,14 @@ def main(): current_emb = get_embedding(current_text) current_files = get_pr_files(PR_NUMBER) - # 2. Fetch open PR candidates (exclude current PR). + # 2. Fetch PR candidates (exclude current PR). history_prs = [] page = 1 - while len(history_prs) < MAX_OPEN_CANDIDATES: + while len(history_prs) < MAX_CANDIDATES: prs = gh_get( f"https://api.github.com/repos/{REPO}/pulls", params={ - "state": "open", + "state": PR_CANDIDATE_STATE, "per_page": 50, "page": page, "sort": "updated", @@ -169,7 +172,7 @@ def main(): for pr in prs: if pr["number"] != PR_NUMBER: history_prs.append(pr) - if len(history_prs) >= MAX_OPEN_CANDIDATES: + if len(history_prs) >= MAX_CANDIDATES: break page += 1 if len(prs) < 50: @@ -213,9 +216,10 @@ def main(): lines = [ COMMENT_MARKER, - "## 🔍 Potential Duplicate PRs Detected\n", - "The following open PRs appear similar to this one:\n", - "| Score | Text | Files | PR | State | Title |", + "## 🔍 Potentially Related PRs\n", + f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, and could overlap in intent or implementation:\n", + "If this is intentional and complementary work, feel free to ignore this notice.\n", + "| Match Score | Desc Similarity | Files Overlap | PR # | State | Title |", "|---|---|---|---|---|---|", ] for sim, pr, text_sim, file_sim in top_results: @@ -228,8 +232,10 @@ def main(): f"[{pr['title']}]({pr['html_url']}) |" ) lines.append(row) - lines.append("\n> 🤖 Auto-detected by duplicate PR checker.") - lines.append("Please review to avoid redundant work.") + lines.append("\n> 🤖 Auto-detected by similarity signals (title/body/files).") + lines.append( + "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates." + ) body = "\n".join(lines) if existing_comment_id is not None: patch_comment(existing_comment_id, body) @@ -241,4 +247,3 @@ def main(): if __name__ == "__main__": main() - From 97f0f31d0d85f791b7ccd632dc46973ce24918b7 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Tue, 14 Apr 2026 19:04:41 +0800 Subject: [PATCH 4/6] Improve duplicate issue/PR detection workflows and caching Signed-off-by: Peter Pan --- .github/workflows/detect-duplicate-issues.yml | 37 +- .github/workflows/detect-duplicate-prs.yml | 25 + .../scripts/detect_duplicate_issues.py | 437 +++++++++++++++--- .../workflows/scripts/detect_duplicate_prs.py | 71 ++- 4 files changed, 493 insertions(+), 77 deletions(-) diff --git a/.github/workflows/detect-duplicate-issues.yml b/.github/workflows/detect-duplicate-issues.yml index d4a3ba32eb8f..fe926245d2c4 100644 --- a/.github/workflows/detect-duplicate-issues.yml +++ b/.github/workflows/detect-duplicate-issues.yml @@ -2,7 +2,7 @@ name: Detect Duplicate Issues on: issues: - types: [opened, reopened] + types: [opened, reopened, edited] jobs: detect-duplicate: @@ -17,13 +17,48 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.11' + cache: 'pip' - name: Install dependencies run: pip install requests numpy scikit-learn sentence-transformers + - name: Compute cache key epoch (weekly) + id: cache-epoch + run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT" + + - name: Restore issue embedding cache + id: restore-issue-cache + uses: actions/cache/restore@v4 + with: + path: .github/workflows/.dup_issue_cache/embeddings + key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }} + restore-keys: | + dup-issue-emb-${{ github.repository }}- + - name: Run duplicate Issue detection env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ISSUE_NUMBER: ${{ github.event.issue.number }} REPO: ${{ github.repository }} + DRY_RUN: 0 + USE_SENTENCE_TRANSFORMERS: 1 + ISSUE_CANDIDATE_STATE: all + MAX_CANDIDATES: 500 + TITLE_COMPARE_TOP_N: 25 + TOP_K: 5 + SIMILARITY_THRESHOLD: 0.82 + TEXT_WEIGHT: 0.8 + TITLE_WEIGHT: 0.2 + AUTO_LABEL: 1 + DUPLICATE_LABEL: possible-duplicate + ISSUE_EMBED_CACHE_DIR: .github/workflows/.dup_issue_cache/embeddings + ISSUE_EMBED_CACHE_WRITE: 1 run: python .github/workflows/scripts/detect_duplicate_issues.py + + - name: Save issue embedding cache + if: ${{ steps.restore-issue-cache.outputs.cache-hit != 'true' }} + continue-on-error: true + uses: actions/cache/save@v4 + with: + path: .github/workflows/.dup_issue_cache/embeddings + key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }} diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml index 2f04cff71d27..ce47eac543fa 100644 --- a/.github/workflows/detect-duplicate-prs.yml +++ b/.github/workflows/detect-duplicate-prs.yml @@ -17,14 +17,39 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.11' + cache: 'pip' - name: Install dependencies run: pip install requests numpy scikit-learn sentence-transformers + - name: Compute cache key epoch (weekly) + id: cache-epoch + run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT" + + - name: Restore PR file cache + id: restore-pr-cache + uses: actions/cache/restore@v4 + with: + path: .github/workflows/.dup_pr_cache/files + key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }} + restore-keys: | + dup-pr-files-${{ github.repository }}- + - name: Run duplicate PR detection env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} REPO: ${{ github.repository }} + DRY_RUN: 0 + PR_FILE_CACHE_DIR: .github/workflows/.dup_pr_cache/files + PR_FILE_CACHE_WRITE: 1 + PREFETCH_CANDIDATE_FILES: 1 run: python .github/workflows/scripts/detect_duplicate_prs.py + - name: Save PR file cache + if: ${{ github.event.pull_request.head.repo.full_name == github.repository && steps.restore-pr-cache.outputs.cache-hit != 'true' }} + continue-on-error: true + uses: actions/cache/save@v4 + with: + path: .github/workflows/.dup_pr_cache/files + key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }} diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py index 339d10270a52..a8177220a164 100644 --- a/.github/workflows/scripts/detect_duplicate_issues.py +++ b/.github/workflows/scripts/detect_duplicate_issues.py @@ -1,60 +1,240 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Detect duplicate Issues using OpenAI embeddings + cosine similarity. -Compares: title + body keywords. +Detect duplicate issues using two-stage similarity: +1) semantic text similarity (title + body) +2) blended score with title token overlap + +Workflow behavior: +- Upsert one bot comment identified by marker. +- Remove stale bot comment and duplicate label when no matches remain. + +Local debug example: +GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings .venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py """ +import json import os +import re +from pathlib import Path import numpy as np import requests - -# 替换 get_embedding 函数,使用 sentence-transformers(免费) -from sentence_transformers import SentenceTransformer - -model = SentenceTransformer("all-MiniLM-L6-v2") - +from sklearn.feature_extraction.text import HashingVectorizer + +USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in { + "1", + "true", + "yes", +} +try: + if USE_SENTENCE_TRANSFORMERS: + from sentence_transformers import SentenceTransformer + else: + SentenceTransformer = None +except Exception: + SentenceTransformer = None + +model = None +if SentenceTransformer is not None: + model = SentenceTransformer("all-MiniLM-L6-v2") + +hashing_vectorizer = HashingVectorizer( + n_features=2048, + alternate_sign=False, + norm="l2", +) GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "") ISSUE_NUMBER = int(os.environ["ISSUE_NUMBER"]) REPO = os.environ["REPO"] -DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"} +DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"} HEADERS = {"Accept": "application/vnd.github+json"} if GITHUB_TOKEN: HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}" -SIMILARITY_THRESHOLD = 0.88 -TOP_K = 5 -MAX_HISTORY = 300 +SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.82")) +TOP_K = int(os.getenv("TOP_K", "5")) +MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500")) +ISSUE_CANDIDATE_STATE = os.getenv("ISSUE_CANDIDATE_STATE", "all") +TITLE_COMPARE_TOP_N = int(os.getenv("TITLE_COMPARE_TOP_N", "25")) +TEXT_WEIGHT = float(os.getenv("TEXT_WEIGHT", "0.8")) +TITLE_WEIGHT = float(os.getenv("TITLE_WEIGHT", "0.2")) +COMMENT_MARKER = "" +DUPLICATE_LABEL = os.getenv("DUPLICATE_LABEL", "possible-duplicate") +AUTO_LABEL = os.getenv("AUTO_LABEL", "1").lower() in {"1", "true", "yes"} +ISSUE_EMBED_CACHE_DIR = os.getenv("ISSUE_EMBED_CACHE_DIR", "") +ISSUE_EMBED_CACHE_WRITE = os.getenv("ISSUE_EMBED_CACHE_WRITE", "1").lower() in { + "1", + "true", + "yes", +} +EMBEDDING_MODE = "sentence-transformers" if model is not None else "hashing-vectorizer" +FEATURE_VERSION = "v2-template-cleaned" +RUN_STATS = { + "api_requests": 0, + "candidates_fetched": 0, + "text_scored": 0, + "final_scored": 0, + "embed_cache_hits": 0, + "embed_cache_misses": 0, + "embed_cache_writes": 0, +} +DROP_BODY_SECTION_HEADERS = { + "your current environment", + "environment", + "before submitting", + "checklist", +} +DROP_BODY_LINE_PATTERNS = [ + re.compile(r"^\s*-\s*\[[ xX]\]\s*"), + re.compile(r"^\s*\s*$"), +] +ENV_BLOCK_HINTS = { + "collecting environment information", + "system info", + "pytorch info", + "python environment", + "cuda / gpu info", + "cpu info", + "the output of python collect_env.py", +} def gh_get(url, params=None): + RUN_STATS["api_requests"] += 1 r = requests.get(url, headers=HEADERS, params=params) r.raise_for_status() return r.json() +def gh_post(url, payload): + RUN_STATS["api_requests"] += 1 + r = requests.post(url, headers=HEADERS, json=payload) + r.raise_for_status() + + +def gh_patch(url, payload): + RUN_STATS["api_requests"] += 1 + r = requests.patch(url, headers=HEADERS, json=payload) + r.raise_for_status() + + +def gh_delete(url, ignore_not_found=False): + RUN_STATS["api_requests"] += 1 + r = requests.delete(url, headers=HEADERS) + if ignore_not_found and r.status_code == 404: + return + r.raise_for_status() + + +def _header_name(line: str) -> str: + stripped = line.strip().lstrip("#").strip().lower() + return stripped.rstrip(":") + + +def _should_drop_code_block(lines: list[str]) -> bool: + if not lines: + return False + block_text = "\n".join(lines).lower() + hint_hits = sum(1 for hint in ENV_BLOCK_HINTS if hint in block_text) + if hint_hits >= 2: + return True + if len(lines) >= 60 and hint_hits >= 1: + return True + return False + + +def clean_issue_body(body: str) -> str: + lines = body.replace("\r\n", "\n").split("\n") + output: list[str] = [] + i = 0 + skip_until_next_header = False + while i < len(lines): + line = lines[i] + header = _header_name(line) + if line.strip().startswith("##"): + skip_until_next_header = header in DROP_BODY_SECTION_HEADERS + i += 1 + continue + if skip_until_next_header: + i += 1 + continue + if line.strip().startswith("```"): + block = [line] + i += 1 + while i < len(lines): + block.append(lines[i]) + if lines[i].strip().startswith("```"): + i += 1 + break + i += 1 + if not _should_drop_code_block(block): + output.extend(block) + continue + if any(p.search(line) for p in DROP_BODY_LINE_PATTERNS): + i += 1 + continue + output.append(line) + i += 1 + cleaned = "\n".join(output).strip() + return cleaned + + def build_issue_text(issue): - return f"Title: {issue.get('title', '')}\nBody: {(issue.get('body') or '')[:1000]}" + raw_body = issue.get("body") or "" + cleaned_body = clean_issue_body(raw_body) + return f"Title: {issue.get('title', '')}\nBody: {cleaned_body[:1000]}" + + +def title_tokens(issue): + title = (issue.get("title") or "").lower() + return re.findall(r"[a-z0-9_]+", title) def get_embedding(text: str): - return model.encode(text) + if model is not None: + return np.asarray(model.encode(text), dtype=float) + return hashing_vectorizer.transform([text]).toarray()[0] def cosine_similarity(a, b): return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)) +def jaccard_similarity(a, b): + sa = set(a) + sb = set(b) + if not sa or not sb: + return 0.0 + return len(sa & sb) / len(sa | sb) + + def post_comment(issue_number, body): if DRY_RUN: print("DRY_RUN enabled: skip posting issue comment.") print(body) return url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments" - requests.post(url, headers=HEADERS, json={"body": body}) + gh_post(url, {"body": body}) + + +def patch_comment(comment_id, body): + if DRY_RUN: + print(f"DRY_RUN enabled: skip updating comment {comment_id}.") + print(body) + return + url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}" + gh_patch(url, {"body": body}) + + +def delete_comment(comment_id): + if DRY_RUN: + print(f"DRY_RUN enabled: skip deleting comment {comment_id}.") + return + url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}" + gh_delete(url) def add_label(issue_number, label): @@ -62,87 +242,204 @@ def add_label(issue_number, label): print(f"DRY_RUN enabled: skip adding label {label}.") return url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels" - requests.post(url, headers=HEADERS, json={"labels": [label]}) + gh_post(url, {"labels": [label]}) + + +def remove_label(issue_number, label): + if DRY_RUN: + print(f"DRY_RUN enabled: skip removing label {label}.") + return + url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels/{label}" + gh_delete(url, ignore_not_found=True) + + +def find_existing_bot_comment(issue_number): + page = 1 + while True: + comments = gh_get( + f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments", + params={"per_page": 100, "page": page}, + ) + if not comments: + return None + for comment in comments: + if COMMENT_MARKER in (comment.get("body") or ""): + return comment["id"] + if len(comments) < 100: + return None + page += 1 + + +def print_run_stats(): + print( + "Stats: " + f"api_requests={RUN_STATS['api_requests']} " + f"candidates_fetched={RUN_STATS['candidates_fetched']} " + f"text_scored={RUN_STATS['text_scored']} " + f"final_scored={RUN_STATS['final_scored']} " + f"embed_cache_hits={RUN_STATS['embed_cache_hits']} " + f"embed_cache_misses={RUN_STATS['embed_cache_misses']} " + f"embed_cache_writes={RUN_STATS['embed_cache_writes']}" + ) + + +def get_issue_features(issue): + cache_file = None + issue_number = issue.get("number") + updated_at = issue.get("updated_at") or "" + if ISSUE_EMBED_CACHE_DIR and issue_number is not None: + cache_dir = Path(ISSUE_EMBED_CACHE_DIR) + cache_dir.mkdir(parents=True, exist_ok=True) + cache_file = cache_dir / f"{issue_number}.json" + if cache_file.exists(): + try: + payload = json.loads(cache_file.read_text(encoding="utf-8")) + embedding = payload.get("embedding") + title_tok = payload.get("title_tokens") + if ( + payload.get("feature_version") == FEATURE_VERSION + and payload.get("mode") == EMBEDDING_MODE + and payload.get("updated_at", "") == updated_at + and isinstance(embedding, list) + and isinstance(title_tok, list) + ): + RUN_STATS["embed_cache_hits"] += 1 + return np.asarray(embedding, dtype=float), title_tok + except Exception: + pass + RUN_STATS["embed_cache_misses"] += 1 + text = build_issue_text(issue) + emb = get_embedding(text) + title_tok = title_tokens(issue) + if cache_file is not None and ISSUE_EMBED_CACHE_WRITE: + try: + cache_file.write_text( + json.dumps( + { + "feature_version": FEATURE_VERSION, + "mode": EMBEDDING_MODE, + "updated_at": updated_at, + "embedding": emb.tolist(), + "title_tokens": title_tok, + } + ), + encoding="utf-8", + ) + RUN_STATS["embed_cache_writes"] += 1 + except Exception: + pass + return emb, title_tok def main(): - # 1. 获取当前 Issue - current = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}") - # 跳过 PR(GitHub API 中 PR 也会出现在 issues 接口) - if "pull_request" in current: + current_issue = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}") + if "pull_request" in current_issue: print("This is a PR, skipping.") return - current_text = build_issue_text(current) - current_emb = get_embedding(current_text) - - # 2. 拉取历史 Issues(open + closed) - history_issues = [] - for state in ["open", "closed"]: - page = 1 - while len(history_issues) < MAX_HISTORY: - issues = gh_get( - f"https://api.github.com/repos/{REPO}/issues", - params={ - "state": state, - "per_page": 50, - "page": page, - "sort": "updated", - "direction": "desc", - }, - ) - if not issues: - break - for issue in issues: - # 跳过 PR 和自身 - if "pull_request" not in issue and issue["number"] != ISSUE_NUMBER: - history_issues.append(issue) - page += 1 - if len(issues) < 50: + current_emb, current_title_tokens = get_issue_features(current_issue) + + candidates = [] + page = 1 + while len(candidates) < MAX_CANDIDATES: + issues = gh_get( + f"https://api.github.com/repos/{REPO}/issues", + params={ + "state": ISSUE_CANDIDATE_STATE, + "per_page": 50, + "page": page, + "sort": "updated", + "direction": "desc", + }, + ) + if not issues: + break + for issue in issues: + if "pull_request" in issue: + continue + if issue["number"] == ISSUE_NUMBER: + continue + candidates.append(issue) + if len(candidates) >= MAX_CANDIDATES: break - if len(history_issues) >= MAX_HISTORY: + page += 1 + if len(issues) < 50: break + RUN_STATS["candidates_fetched"] = len(candidates) + + text_results = [] + candidate_title_tokens = {} + for issue in candidates: + emb, title_tok = get_issue_features(issue) + candidate_title_tokens[issue["number"]] = title_tok + text_sim = cosine_similarity(current_emb, emb) + RUN_STATS["text_scored"] += 1 + text_results.append((text_sim, issue)) + + text_results.sort(key=lambda x: -x[0]) + title_candidates = text_results[:TITLE_COMPARE_TOP_N] + + blended_results = [] + for text_sim, issue in title_candidates: + tok_sim = jaccard_similarity( + current_title_tokens, candidate_title_tokens.get(issue["number"], []) + ) + final_sim = TEXT_WEIGHT * text_sim + TITLE_WEIGHT * tok_sim + RUN_STATS["final_scored"] += 1 + blended_results.append((final_sim, issue, text_sim, tok_sim)) - history_issues = history_issues[:MAX_HISTORY] - - # 3. 计算相似度 - results = [] - for issue in history_issues: - text = build_issue_text(issue) - emb = get_embedding(text) - sim = cosine_similarity(current_emb, emb) - results.append((sim, issue)) - - results.sort(key=lambda x: -x[0]) + blended_results.sort(key=lambda x: -x[0]) top_results = [ - (sim, i) for sim, i in results[:TOP_K] if sim >= SIMILARITY_THRESHOLD + (sim, issue, text_sim, tok_sim) + for sim, issue, text_sim, tok_sim in blended_results[:TOP_K] + if sim >= SIMILARITY_THRESHOLD ] + existing_comment_id = find_existing_bot_comment(ISSUE_NUMBER) if not top_results: + if existing_comment_id is not None: + delete_comment(existing_comment_id) + print("Deleted stale duplicate checker comment.") + if AUTO_LABEL: + remove_label(ISSUE_NUMBER, DUPLICATE_LABEL) print("No highly similar issues found.") + print_run_stats() return - # 4. 自动打 label + 发评论 - add_label(ISSUE_NUMBER, "possible-duplicate") + if AUTO_LABEL: + add_label(ISSUE_NUMBER, DUPLICATE_LABEL) lines = [ - "## 🔍 Possible Duplicate Issue Detected\n", - "The following existing issues appear highly similar:\n", - "| Similarity | Issue | State | Title |", - "|---|---|---|---|", + COMMENT_MARKER, + "## 🔍 Potentially Related Issues\n", + f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this issue:\n", + "If this is intentional and complementary work, feel free to ignore this notice.\n", + "| Match Score | Desc Similarity | Title Overlap | Issue # | State | Title |", + "|---|---|---|---|---|---|", ] - for sim, issue in top_results: + for sim, issue, text_sim, tok_sim in top_results: state_icon = "🟢" if issue["state"] == "open" else "🔴" row = ( - f"| {sim:.0%} | #{issue['number']} | {state_icon} {issue['state']} | " + f"| {sim:.0%} | {text_sim:.0%} | {tok_sim:.0%} | " + f"#{issue['number']} | {state_icon} {issue['state']} | " f"[{issue['title']}]({issue['html_url']}) |" ) lines.append(row) lines.append( - "\n> 🤖 Auto-detected by duplicate issue checker. A maintainer will verify." + "\n> 🤖 Auto-detected by similarity signals (title/body/title-tokens)." + ) + lines.append( + "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates." ) - post_comment(ISSUE_NUMBER, "\n".join(lines)) - print(f"Posted comment with {len(top_results)} similar issues.") + body = "\n".join(lines) + + if existing_comment_id is not None: + patch_comment(existing_comment_id, body) + print(f"Updated comment with {len(top_results)} similar issues.") + else: + post_comment(ISSUE_NUMBER, body) + print(f"Posted comment with {len(top_results)} similar issues.") + print_run_stats() if __name__ == "__main__": diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py index 86e9358126e3..85dd26c85c08 100644 --- a/.github/workflows/scripts/detect_duplicate_prs.py +++ b/.github/workflows/scripts/detect_duplicate_prs.py @@ -9,9 +9,14 @@ 3. Use text similarity for first-pass candidate ranking. 4. Compute blended text+file similarity on top candidates. 5. Upsert one bot comment; remove stale comment when no matches remain. + +Local debug example: +GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files .venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py """ +import json import os +from pathlib import Path import numpy as np import requests @@ -44,7 +49,7 @@ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "") PR_NUMBER = int(os.environ["PR_NUMBER"]) REPO = os.environ["REPO"] -DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"} +DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"} HEADERS = {"Accept": "application/vnd.github+json"} if GITHUB_TOKEN: @@ -52,27 +57,64 @@ SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.75")) TOP_K = 5 -MAX_CANDIDATES = int( - os.getenv("MAX_CANDIDATES", os.getenv("MAX_OPEN_CANDIDATES", "120")) -) +MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500")) PR_CANDIDATE_STATE = os.getenv("PR_CANDIDATE_STATE", "all") FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20")) +PREFETCH_CANDIDATE_FILES = os.getenv("PREFETCH_CANDIDATE_FILES", "1").lower() in { + "1", + "true", + "yes", +} TEXT_WEIGHT = 0.75 FILE_WEIGHT = 0.25 COMMENT_MARKER = "" +PR_FILE_CACHE_DIR = os.getenv("PR_FILE_CACHE_DIR", "") +PR_FILE_CACHE_WRITE = os.getenv("PR_FILE_CACHE_WRITE", "1").lower() in { + "1", + "true", + "yes", +} +RUN_STATS = { + "api_requests": 0, + "file_cache_hits": 0, + "file_cache_misses": 0, + "file_cache_writes": 0, +} def gh_get(url, params=None): + RUN_STATS["api_requests"] += 1 r = requests.get(url, headers=HEADERS, params=params) r.raise_for_status() return r.json() def get_pr_files(pr_number): + cache_file = None + if PR_FILE_CACHE_DIR: + cache_dir = Path(PR_FILE_CACHE_DIR) + cache_dir.mkdir(parents=True, exist_ok=True) + cache_file = cache_dir / f"{pr_number}.json" + if cache_file.exists(): + try: + cached_files = json.loads(cache_file.read_text(encoding="utf-8")) + if isinstance(cached_files, list): + RUN_STATS["file_cache_hits"] += 1 + return cached_files + except Exception: + pass + RUN_STATS["file_cache_misses"] += 1 url = f"https://api.github.com/repos/{REPO}/pulls/{pr_number}/files" try: files = gh_get(url, params={"per_page": 100}) - return [f["filename"] for f in files] + filenames = [f["filename"] for f in files] + if cache_file is not None and PR_FILE_CACHE_WRITE: + try: + cache_file.write_text(json.dumps(filenames), encoding="utf-8") + RUN_STATS["file_cache_writes"] += 1 + except Exception: + pass + return filenames except Exception: return [] @@ -147,13 +189,22 @@ def find_existing_bot_comment(issue_number): def main(): + def print_run_stats(): + print( + "Stats: " + f"api_requests={RUN_STATS['api_requests']} " + f"file_cache_hits={RUN_STATS['file_cache_hits']} " + f"file_cache_misses={RUN_STATS['file_cache_misses']} " + f"file_cache_writes={RUN_STATS['file_cache_writes']}" + ) + # 1. Load current PR context. current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}") current_text = build_pr_text(current_pr) current_emb = get_embedding(current_text) current_files = get_pr_files(PR_NUMBER) - # 2. Fetch PR candidates (exclude current PR). + # 2. Fetch PR candidates (exclude current PR). ranked by most recent updated history_prs = [] page = 1 while len(history_prs) < MAX_CANDIDATES: @@ -186,6 +237,12 @@ def main(): text_sim = cosine_similarity(current_emb, emb) text_results.append((text_sim, pr)) + # Warm file cache for all candidates so different PR runs can reuse + # a stable candidate pool across workflow executions. + if PREFETCH_CANDIDATE_FILES: + for pr in history_prs: + get_pr_files(pr["number"]) + text_results.sort(key=lambda x: -x[0]) file_candidates = text_results[:FILE_COMPARE_TOP_N] @@ -212,6 +269,7 @@ def main(): print("Deleted stale duplicate checker comment.") else: print("No highly similar PRs found.") + print_run_stats() return lines = [ @@ -243,6 +301,7 @@ def main(): else: post_comment(PR_NUMBER, body) print(f"Posted comment with {len(top_results)} similar PRs.") + print_run_stats() if __name__ == "__main__": From c4504b06d92793c47d594421eece6bd360ff4d25 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Thu, 16 Apr 2026 18:21:11 +0800 Subject: [PATCH 5/6] ci: use regex module for forbidden import check Signed-off-by: Peter Pan --- .../scripts/detect_duplicate_issues.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py index a8177220a164..43d992ce6af9 100644 --- a/.github/workflows/scripts/detect_duplicate_issues.py +++ b/.github/workflows/scripts/detect_duplicate_issues.py @@ -10,15 +10,17 @@ - Remove stale bot comment and duplicate label when no matches remain. Local debug example: -GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings .venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py +GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 \ +ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings \ +.venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py """ import json import os -import re from pathlib import Path import numpy as np +import regex as re import requests from sklearn.feature_extraction.text import HashingVectorizer @@ -141,9 +143,7 @@ def _should_drop_code_block(lines: list[str]) -> bool: hint_hits = sum(1 for hint in ENV_BLOCK_HINTS if hint in block_text) if hint_hits >= 2: return True - if len(lines) >= 60 and hint_hits >= 1: - return True - return False + return bool(len(lines) >= 60 and hint_hits >= 1) def clean_issue_body(body: str) -> str: @@ -412,8 +412,14 @@ def main(): lines = [ COMMENT_MARKER, "## 🔍 Potentially Related Issues\n", - f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this issue:\n", - "If this is intentional and complementary work, feel free to ignore this notice.\n", + ( + f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this " + "issue:\n" + ), + ( + "If this is intentional and complementary work, feel free to ignore " + "this notice.\n" + ), "| Match Score | Desc Similarity | Title Overlap | Issue # | State | Title |", "|---|---|---|---|---|---|", ] @@ -429,7 +435,8 @@ def main(): "\n> 🤖 Auto-detected by similarity signals (title/body/title-tokens)." ) lines.append( - "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates." + "This is a soft hint only. Please review manually to determine whether " + "these are related work or true duplicates." ) body = "\n".join(lines) From 243eb586bf551c768302c4c1b2103726dde1c2b4 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Thu, 16 Apr 2026 21:31:01 +0800 Subject: [PATCH 6/6] Fix E501 in duplicate PR checker script Signed-off-by: Peter Pan --- .../workflows/scripts/detect_duplicate_prs.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py index 85dd26c85c08..4db9d8492a7a 100644 --- a/.github/workflows/scripts/detect_duplicate_prs.py +++ b/.github/workflows/scripts/detect_duplicate_prs.py @@ -11,7 +11,9 @@ 5. Upsert one bot comment; remove stale comment when no matches remain. Local debug example: -GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files .venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py +GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 \ +PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files \ +.venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py """ import json @@ -275,8 +277,14 @@ def print_run_stats(): lines = [ COMMENT_MARKER, "## 🔍 Potentially Related PRs\n", - f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, and could overlap in intent or implementation:\n", - "If this is intentional and complementary work, feel free to ignore this notice.\n", + ( + f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, " + "and could overlap in intent or implementation:\n" + ), + ( + "If this is intentional and complementary work, feel free to ignore " + "this notice.\n" + ), "| Match Score | Desc Similarity | Files Overlap | PR # | State | Title |", "|---|---|---|---|---|---|", ] @@ -292,7 +300,8 @@ def print_run_stats(): lines.append(row) lines.append("\n> 🤖 Auto-detected by similarity signals (title/body/files).") lines.append( - "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates." + "This is a soft hint only. Please review manually to determine whether " + "these are related work or true duplicates." ) body = "\n".join(lines) if existing_comment_id is not None: