From 69ae7560339fbe3de528a97bf3a15d961e4c9545 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Mon, 13 Apr 2026 15:15:04 +0800
Subject: [PATCH 1/6] Add CI workflows to detect duplicate issues and PRs

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .github/workflows/detect-duplicate-issues.yml |  29 +++
 .github/workflows/detect-duplicate-prs.yml    |  29 +++
 .../scripts/detect_duplicate_issues.py        | 149 ++++++++++++++
 .../workflows/scripts/detect_duplicate_prs.py | 190 ++++++++++++++++++
 4 files changed, 397 insertions(+)
 create mode 100644 .github/workflows/detect-duplicate-issues.yml
 create mode 100644 .github/workflows/detect-duplicate-prs.yml
 create mode 100644 .github/workflows/scripts/detect_duplicate_issues.py
 create mode 100644 .github/workflows/scripts/detect_duplicate_prs.py

diff --git a/.github/workflows/detect-duplicate-issues.yml b/.github/workflows/detect-duplicate-issues.yml
new file mode 100644
index 000000000000..d4a3ba32eb8f
--- /dev/null
+++ b/.github/workflows/detect-duplicate-issues.yml
@@ -0,0 +1,29 @@
+name: Detect Duplicate Issues
+
+on:
+  issues:
+    types: [opened, reopened]
+
+jobs:
+  detect-duplicate:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install requests numpy scikit-learn sentence-transformers
+
+      - name: Run duplicate Issue detection
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          REPO: ${{ github.repository }}
+        run: python .github/workflows/scripts/detect_duplicate_issues.py
diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml
new file mode 100644
index 000000000000..2d739656f4c3
--- /dev/null
+++ b/.github/workflows/detect-duplicate-prs.yml
@@ -0,0 +1,29 @@
+name: Detect Duplicate PRs
+
+on:
+  pull_request:
+    types: [opened, reopened]
+
+jobs:
+  detect-duplicate:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install requests numpy scikit-learn sentence-transformers
+
+      - name: Run duplicate PR detection
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
+        run: python .github/workflows/scripts/detect_duplicate_prs.py
diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py
new file mode 100644
index 000000000000..339d10270a52
--- /dev/null
+++ b/.github/workflows/scripts/detect_duplicate_issues.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Detect duplicate Issues using OpenAI embeddings + cosine similarity.
+Compares: title + body keywords.
+"""
+
+import os
+
+import numpy as np
+import requests
+
+# 替换 get_embedding 函数，使用 sentence-transformers（免费）
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
+ISSUE_NUMBER = int(os.environ["ISSUE_NUMBER"])
+REPO = os.environ["REPO"]
+DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"}
+
+HEADERS = {"Accept": "application/vnd.github+json"}
+if GITHUB_TOKEN:
+    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
+
+SIMILARITY_THRESHOLD = 0.88
+TOP_K = 5
+MAX_HISTORY = 300
+
+
+def gh_get(url, params=None):
+    r = requests.get(url, headers=HEADERS, params=params)
+    r.raise_for_status()
+    return r.json()
+
+
+def build_issue_text(issue):
+    return f"Title: {issue.get('title', '')}\nBody: {(issue.get('body') or '')[:1000]}"
+
+
+def get_embedding(text: str):
+    return model.encode(text)
+
+
+def cosine_similarity(a, b):
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))
+
+
+def post_comment(issue_number, body):
+    if DRY_RUN:
+        print("DRY_RUN enabled: skip posting issue comment.")
+        print(body)
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
+    requests.post(url, headers=HEADERS, json={"body": body})
+
+
+def add_label(issue_number, label):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip adding label {label}.")
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels"
+    requests.post(url, headers=HEADERS, json={"labels": [label]})
+
+
+def main():
+    # 1. 获取当前 Issue
+    current = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}")
+    # 跳过 PR（GitHub API 中 PR 也会出现在 issues 接口）
+    if "pull_request" in current:
+        print("This is a PR, skipping.")
+        return
+
+    current_text = build_issue_text(current)
+    current_emb = get_embedding(current_text)
+
+    # 2. 拉取历史 Issues（open + closed）
+    history_issues = []
+    for state in ["open", "closed"]:
+        page = 1
+        while len(history_issues) < MAX_HISTORY:
+            issues = gh_get(
+                f"https://api.github.com/repos/{REPO}/issues",
+                params={
+                    "state": state,
+                    "per_page": 50,
+                    "page": page,
+                    "sort": "updated",
+                    "direction": "desc",
+                },
+            )
+            if not issues:
+                break
+            for issue in issues:
+                # 跳过 PR 和自身
+                if "pull_request" not in issue and issue["number"] != ISSUE_NUMBER:
+                    history_issues.append(issue)
+            page += 1
+            if len(issues) < 50:
+                break
+        if len(history_issues) >= MAX_HISTORY:
+            break
+
+    history_issues = history_issues[:MAX_HISTORY]
+
+    # 3. 计算相似度
+    results = []
+    for issue in history_issues:
+        text = build_issue_text(issue)
+        emb = get_embedding(text)
+        sim = cosine_similarity(current_emb, emb)
+        results.append((sim, issue))
+
+    results.sort(key=lambda x: -x[0])
+    top_results = [
+        (sim, i) for sim, i in results[:TOP_K] if sim >= SIMILARITY_THRESHOLD
+    ]
+
+    if not top_results:
+        print("No highly similar issues found.")
+        return
+
+    # 4. 自动打 label + 发评论
+    add_label(ISSUE_NUMBER, "possible-duplicate")
+
+    lines = [
+        "## 🔍 Possible Duplicate Issue Detected\n",
+        "The following existing issues appear highly similar:\n",
+        "| Similarity | Issue | State | Title |",
+        "|---|---|---|---|",
+    ]
+    for sim, issue in top_results:
+        state_icon = "🟢" if issue["state"] == "open" else "🔴"
+        row = (
+            f"| {sim:.0%} | #{issue['number']} | {state_icon} {issue['state']} | "
+            f"[{issue['title']}]({issue['html_url']}) |"
+        )
+        lines.append(row)
+    lines.append(
+        "\n> 🤖 Auto-detected by duplicate issue checker. A maintainer will verify."
+    )
+    post_comment(ISSUE_NUMBER, "\n".join(lines))
+    print(f"Posted comment with {len(top_results)} similar issues.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py
new file mode 100644
index 000000000000..75d1a5e6ebb8
--- /dev/null
+++ b/.github/workflows/scripts/detect_duplicate_prs.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Detect duplicate PRs using text similarity + file overlap.
+"""
+
+import os
+
+import numpy as np
+import requests
+from sklearn.feature_extraction.text import HashingVectorizer
+
+USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in {
+    "1",
+    "true",
+    "yes",
+}
+try:
+    if USE_SENTENCE_TRANSFORMERS:
+        from sentence_transformers import SentenceTransformer
+    else:
+        SentenceTransformer = None
+except Exception:
+    SentenceTransformer = None
+
+model = None
+if SentenceTransformer is not None:
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+
+hashing_vectorizer = HashingVectorizer(
+    n_features=2048,
+    alternate_sign=False,
+    norm="l2",
+)
+
+
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
+PR_NUMBER = int(os.environ["PR_NUMBER"])
+REPO = os.environ["REPO"]
+DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"}
+
+HEADERS = {"Accept": "application/vnd.github+json"}
+if GITHUB_TOKEN:
+    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
+
+SIMILARITY_THRESHOLD = 0.82
+TOP_K = 5
+MAX_OPEN_CANDIDATES = int(os.getenv("MAX_OPEN_CANDIDATES", "120"))
+FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20"))
+TEXT_WEIGHT = 0.75
+FILE_WEIGHT = 0.25
+
+
+def gh_get(url, params=None):
+    r = requests.get(url, headers=HEADERS, params=params)
+    r.raise_for_status()
+    return r.json()
+
+
+def get_pr_files(pr_number):
+    url = f"https://api.github.com/repos/{REPO}/pulls/{pr_number}/files"
+    try:
+        files = gh_get(url, params={"per_page": 100})
+        return [f["filename"] for f in files]
+    except Exception:
+        return []
+
+
+def build_pr_text(pr):
+    parts = [
+        f"Title: {pr.get('title', '')}",
+        f"Body: {(pr.get('body') or '')[:800]}",
+    ]
+    return "\n".join(parts)
+
+
+def get_embedding(text: str):
+    if model is not None:
+        return np.asarray(model.encode(text), dtype=float)
+    return hashing_vectorizer.transform([text]).toarray()[0]
+
+
+def cosine_similarity(a, b):
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))
+
+
+def jaccard_similarity(a, b):
+    sa = set(a)
+    sb = set(b)
+    if not sa or not sb:
+        return 0.0
+    return len(sa & sb) / len(sa | sb)
+
+
+def post_comment(issue_number, body):
+    if DRY_RUN:
+        print("DRY_RUN enabled: skip posting PR comment.")
+        print(body)
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
+    requests.post(url, headers=HEADERS, json={"body": body})
+
+
+def main():
+    # 1. 获取当前 PR 信息
+    current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}")
+    current_text = build_pr_text(current_pr)
+    current_emb = get_embedding(current_text)
+    current_files = get_pr_files(PR_NUMBER)
+
+    # 2. 拉取 open PR（排除自身，限制候选规模）
+    history_prs = []
+    page = 1
+    while len(history_prs) < MAX_OPEN_CANDIDATES:
+        prs = gh_get(
+            f"https://api.github.com/repos/{REPO}/pulls",
+            params={
+                "state": "open",
+                "per_page": 50,
+                "page": page,
+                "sort": "updated",
+                "direction": "desc",
+            },
+        )
+        if not prs:
+            break
+        for pr in prs:
+            if pr["number"] != PR_NUMBER:
+                history_prs.append(pr)
+                if len(history_prs) >= MAX_OPEN_CANDIDATES:
+                    break
+        page += 1
+        if len(prs) < 50:
+            break
+
+    # 3. 第一阶段：文本相似度筛候选
+    text_results = []
+    for pr in history_prs:
+        text = build_pr_text(pr)
+        emb = get_embedding(text)
+        text_sim = cosine_similarity(current_emb, emb)
+        text_results.append((text_sim, pr))
+
+    text_results.sort(key=lambda x: -x[0])
+    file_candidates = text_results[:FILE_COMPARE_TOP_N]
+
+    # 4. 第二阶段：仅对Top候选拉取文件并融合得分
+    results = []
+    for text_sim, pr in file_candidates:
+        pr_files = get_pr_files(pr["number"])
+        file_sim = jaccard_similarity(current_files, pr_files)
+        final_sim = TEXT_WEIGHT * text_sim + FILE_WEIGHT * file_sim
+        results.append((final_sim, pr, text_sim, file_sim))
+
+    results.sort(key=lambda x: -x[0])
+    top_results = [
+        (sim, pr, text_sim, file_sim)
+        for sim, pr, text_sim, file_sim in results[:TOP_K]
+        if sim >= SIMILARITY_THRESHOLD
+    ]
+
+    # 5. 发评论
+    if not top_results:
+        print("No highly similar PRs found.")
+        return
+
+    lines = [
+        "## 🔍 Potential Duplicate PRs Detected\n",
+        "The following open PRs appear similar to this one:\n",
+        "| Score | Text | Files | PR | State | Title |",
+        "|---|---|---|---|---|---|",
+    ]
+    for sim, pr, text_sim, file_sim in top_results:
+        state_icon = (
+            "🟢" if pr["state"] == "open" else ("🟣" if pr.get("merged_at") else "🔴")
+        )
+        row = (
+            f"| {sim:.0%} | {text_sim:.0%} | {file_sim:.0%} | "
+            f"#{pr['number']} | {state_icon} {pr['state']} | "
+            f"[{pr['title']}]({pr['html_url']}) |"
+        )
+        lines.append(row)
+    lines.append("\n> 🤖 Auto-detected by duplicate PR checker.")
+    lines.append("Please review to avoid redundant work.")
+    post_comment(PR_NUMBER, "\n".join(lines))
+    print(f"Posted comment with {len(top_results)} similar PRs.")
+
+
+if __name__ == "__main__":
+    main()

From fdee28f2aaaa058f3ff6a007630001b8eb5177e3 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Mon, 13 Apr 2026 17:36:24 +0800
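Subject: [PATCH 2/6] Upsert duplicate-PR comment and rerun on PR updates

Run the duplicate-PR check on synchronize, edited, and ready_for_review
events in addition to opened and reopened. Tag the bot comment with a
hidden marker so that reruns update the existing comment, or delete it
when no matches remain, instead of posting a new comment each time.
Also translate the script's inline comments to English.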

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .github/workflows/detect-duplicate-prs.yml    |  3 +-
 .../workflows/scripts/detect_duplicate_prs.py | 70 ++++++++++++++++---
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml
index 2d739656f4c3..2f04cff71d27 100644
--- a/.github/workflows/detect-duplicate-prs.yml
+++ b/.github/workflows/detect-duplicate-prs.yml
@@ -2,7 +2,7 @@ name: Detect Duplicate PRs
 
 on:
   pull_request:
-    types: [opened, reopened]
+    types: [opened, reopened, synchronize, edited, ready_for_review]
 
 jobs:
   detect-duplicate:
@@ -27,3 +27,4 @@ jobs:
           PR_NUMBER: ${{ github.event.pull_request.number }}
           REPO: ${{ github.repository }}
         run: python .github/workflows/scripts/detect_duplicate_prs.py
+
diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py
index 75d1a5e6ebb8..906fc72e4455 100644
--- a/.github/workflows/scripts/detect_duplicate_prs.py
+++ b/.github/workflows/scripts/detect_duplicate_prs.py
@@ -2,6 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Detect duplicate PRs using text similarity + file overlap.
+
+Workflow overview:
+1. Load current PR metadata, text, and changed files.
+2. Fetch open PR candidates (excluding the current PR).
+3. Use text similarity for first-pass candidate ranking.
+4. Compute blended text+file similarity on top candidates.
+5. Upsert one bot comment; remove stale comment when no matches remain.
 """
 
 import os
@@ -49,6 +56,7 @@
 FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20"))
 TEXT_WEIGHT = 0.75
 FILE_WEIGHT = 0.25
+COMMENT_MARKER = "<!-- duplicate-pr-checker -->"
 
 
 def gh_get(url, params=None):
@@ -101,14 +109,48 @@ def post_comment(issue_number, body):
     requests.post(url, headers=HEADERS, json={"body": body})
 
 
+def patch_comment(comment_id, body):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip updating comment {comment_id}.")
+        print(body)
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
+    requests.patch(url, headers=HEADERS, json={"body": body})
+
+
+def delete_comment(comment_id):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip deleting comment {comment_id}.")
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
+    requests.delete(url, headers=HEADERS)
+
+
+def find_existing_bot_comment(issue_number):
+    page = 1
+    while True:
+        comments = gh_get(
+            f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments",
+            params={"per_page": 100, "page": page},
+        )
+        if not comments:
+            return None
+        for comment in comments:
+            if COMMENT_MARKER in (comment.get("body") or ""):
+                return comment["id"]
+        if len(comments) < 100:
+            return None
+        page += 1
+
+
 def main():
-    # 1. 获取当前 PR 信息
+    # 1. Load current PR context.
     current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}")
     current_text = build_pr_text(current_pr)
     current_emb = get_embedding(current_text)
     current_files = get_pr_files(PR_NUMBER)
 
-    # 2. 拉取 open PR（排除自身，限制候选规模）
+    # 2. Fetch open PR candidates (exclude current PR).
     history_prs = []
     page = 1
     while len(history_prs) < MAX_OPEN_CANDIDATES:
@@ -133,7 +175,7 @@ def main():
         if len(prs) < 50:
             break
 
-    # 3. 第一阶段：文本相似度筛候选
+    # 3. Stage-1: rank candidates by text similarity.
     text_results = []
     for pr in history_prs:
         text = build_pr_text(pr)
@@ -144,7 +186,7 @@ def main():
     text_results.sort(key=lambda x: -x[0])
     file_candidates = text_results[:FILE_COMPARE_TOP_N]
 
-    # 4. 第二阶段：仅对Top候选拉取文件并融合得分
+    # 4. Stage-2: score top candidates with text + file overlap.
     results = []
     for text_sim, pr in file_candidates:
         pr_files = get_pr_files(pr["number"])
@@ -159,12 +201,18 @@ def main():
         if sim >= SIMILARITY_THRESHOLD
     ]
 
-    # 5. 发评论
+    # 5. Upsert bot comment
+    existing_comment_id = find_existing_bot_comment(PR_NUMBER)
     if not top_results:
-        print("No highly similar PRs found.")
+        if existing_comment_id is not None:
+            delete_comment(existing_comment_id)
+            print("Deleted stale duplicate checker comment.")
+        else:
+            print("No highly similar PRs found.")
         return
 
     lines = [
+        COMMENT_MARKER,
         "## 🔍 Potential Duplicate PRs Detected\n",
         "The following open PRs appear similar to this one:\n",
         "| Score | Text | Files | PR | State | Title |",
@@ -182,9 +230,15 @@ def main():
         lines.append(row)
     lines.append("\n> 🤖 Auto-detected by duplicate PR checker.")
     lines.append("Please review to avoid redundant work.")
-    post_comment(PR_NUMBER, "\n".join(lines))
-    print(f"Posted comment with {len(top_results)} similar PRs.")
+    body = "\n".join(lines)
+    if existing_comment_id is not None:
+        patch_comment(existing_comment_id, body)
+        print(f"Updated comment with {len(top_results)} similar PRs.")
+    else:
+        post_comment(PR_NUMBER, body)
+        print(f"Posted comment with {len(top_results)} similar PRs.")
 
 
 if __name__ == "__main__":
     main()
+

From 5e779eef4f8a5e9f133f621128540b9c45e2d040 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Tue, 14 Apr 2026 12:08:17 +0800
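Subject: [PATCH 3/6] Include closed PRs as candidates and lower threshold

Make the candidate PR state configurable via PR_CANDIDATE_STATE
(default: all) and rename MAX_OPEN_CANDIDATES to MAX_CANDIDATES, keeping
the old variable as a fallback. SIMILARITY_THRESHOLD is now read from
the environment with a lower default of 0.75, and the comment wording is
softened to describe related PRs rather than confirmed duplicates.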

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .../workflows/scripts/detect_duplicate_prs.py | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py
index 906fc72e4455..86e9358126e3 100644
--- a/.github/workflows/scripts/detect_duplicate_prs.py
+++ b/.github/workflows/scripts/detect_duplicate_prs.py
@@ -50,9 +50,12 @@
 if GITHUB_TOKEN:
     HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
 
-SIMILARITY_THRESHOLD = 0.82
+SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.75"))
 TOP_K = 5
-MAX_OPEN_CANDIDATES = int(os.getenv("MAX_OPEN_CANDIDATES", "120"))
+MAX_CANDIDATES = int(
+    os.getenv("MAX_CANDIDATES", os.getenv("MAX_OPEN_CANDIDATES", "120"))
+)
+PR_CANDIDATE_STATE = os.getenv("PR_CANDIDATE_STATE", "all")
 FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20"))
 TEXT_WEIGHT = 0.75
 FILE_WEIGHT = 0.25
@@ -150,14 +153,14 @@ def main():
     current_emb = get_embedding(current_text)
     current_files = get_pr_files(PR_NUMBER)
 
-    # 2. Fetch open PR candidates (exclude current PR).
+    # 2. Fetch PR candidates (exclude current PR).
     history_prs = []
     page = 1
-    while len(history_prs) < MAX_OPEN_CANDIDATES:
+    while len(history_prs) < MAX_CANDIDATES:
         prs = gh_get(
             f"https://api.github.com/repos/{REPO}/pulls",
             params={
-                "state": "open",
+                "state": PR_CANDIDATE_STATE,
                 "per_page": 50,
                 "page": page,
                 "sort": "updated",
@@ -169,7 +172,7 @@ def main():
         for pr in prs:
             if pr["number"] != PR_NUMBER:
                 history_prs.append(pr)
-                if len(history_prs) >= MAX_OPEN_CANDIDATES:
+                if len(history_prs) >= MAX_CANDIDATES:
                     break
         page += 1
         if len(prs) < 50:
@@ -213,9 +216,10 @@ def main():
 
     lines = [
         COMMENT_MARKER,
-        "## 🔍 Potential Duplicate PRs Detected\n",
-        "The following open PRs appear similar to this one:\n",
-        "| Score | Text | Files | PR | State | Title |",
+        "## 🔍 Potentially Related PRs\n",
+        f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, and could overlap in intent or implementation:\n",
+        "If this is intentional and complementary work, feel free to ignore this notice.\n",
+        "| Match Score | Desc Similarity | Files Overlap | PR # | State | Title |",
         "|---|---|---|---|---|---|",
     ]
     for sim, pr, text_sim, file_sim in top_results:
@@ -228,8 +232,10 @@ def main():
             f"[{pr['title']}]({pr['html_url']}) |"
         )
         lines.append(row)
-    lines.append("\n> 🤖 Auto-detected by duplicate PR checker.")
-    lines.append("Please review to avoid redundant work.")
+    lines.append("\n> 🤖 Auto-detected by similarity signals (title/body/files).")
+    lines.append(
+        "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates."
+    )
     body = "\n".join(lines)
     if existing_comment_id is not None:
         patch_comment(existing_comment_id, body)
@@ -241,4 +247,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-

From 97f0f31d0d85f791b7ccd632dc46973ce24918b7 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Tue, 14 Apr 2026 19:04:41 +0800
Subject: [PATCH 4/6] Improve duplicate issue/PR detection workflows and
 caching

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .github/workflows/detect-duplicate-issues.yml |  37 +-
 .github/workflows/detect-duplicate-prs.yml    |  25 +
 .../scripts/detect_duplicate_issues.py        | 437 +++++++++++++++---
 .../workflows/scripts/detect_duplicate_prs.py |  71 ++-
 4 files changed, 493 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/detect-duplicate-issues.yml b/.github/workflows/detect-duplicate-issues.yml
index d4a3ba32eb8f..fe926245d2c4 100644
--- a/.github/workflows/detect-duplicate-issues.yml
+++ b/.github/workflows/detect-duplicate-issues.yml
@@ -2,7 +2,7 @@ name: Detect Duplicate Issues
 
 on:
   issues:
-    types: [opened, reopened]
+    types: [opened, reopened, edited]
 
 jobs:
   detect-duplicate:
@@ -17,13 +17,48 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
+          cache: 'pip'
 
       - name: Install dependencies
         run: pip install requests numpy scikit-learn sentence-transformers
 
+      - name: Compute cache key epoch (weekly)
+        id: cache-epoch
+        run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT"
+
+      - name: Restore issue embedding cache
+        id: restore-issue-cache
+        uses: actions/cache/restore@v4
+        with:
+          path: .github/workflows/.dup_issue_cache/embeddings
+          key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
+          restore-keys: |
+            dup-issue-emb-${{ github.repository }}-
+
       - name: Run duplicate Issue detection
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           ISSUE_NUMBER: ${{ github.event.issue.number }}
           REPO: ${{ github.repository }}
+          DRY_RUN: 0
+          USE_SENTENCE_TRANSFORMERS: 1
+          ISSUE_CANDIDATE_STATE: all
+          MAX_CANDIDATES: 500
+          TITLE_COMPARE_TOP_N: 25
+          TOP_K: 5
+          SIMILARITY_THRESHOLD: 0.82
+          TEXT_WEIGHT: 0.8
+          TITLE_WEIGHT: 0.2
+          AUTO_LABEL: 1
+          DUPLICATE_LABEL: possible-duplicate
+          ISSUE_EMBED_CACHE_DIR: .github/workflows/.dup_issue_cache/embeddings
+          ISSUE_EMBED_CACHE_WRITE: 1
         run: python .github/workflows/scripts/detect_duplicate_issues.py
+
+      - name: Save issue embedding cache
+        if: ${{ steps.restore-issue-cache.outputs.cache-hit != 'true' }}
+        continue-on-error: true
+        uses: actions/cache/save@v4
+        with:
+          path: .github/workflows/.dup_issue_cache/embeddings
+          key: dup-issue-emb-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
diff --git a/.github/workflows/detect-duplicate-prs.yml b/.github/workflows/detect-duplicate-prs.yml
index 2f04cff71d27..ce47eac543fa 100644
--- a/.github/workflows/detect-duplicate-prs.yml
+++ b/.github/workflows/detect-duplicate-prs.yml
@@ -17,14 +17,39 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
+          cache: 'pip'
 
       - name: Install dependencies
         run: pip install requests numpy scikit-learn sentence-transformers
 
+      - name: Compute cache key epoch (weekly)
+        id: cache-epoch
+        run: echo "week=$(date +%G-W%V)" >> "$GITHUB_OUTPUT"
+
+      - name: Restore PR file cache
+        id: restore-pr-cache
+        uses: actions/cache/restore@v4
+        with:
+          path: .github/workflows/.dup_pr_cache/files
+          key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
+          restore-keys: |
+            dup-pr-files-${{ github.repository }}-
+
       - name: Run duplicate PR detection
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
           REPO: ${{ github.repository }}
+          DRY_RUN: 0
+          PR_FILE_CACHE_DIR: .github/workflows/.dup_pr_cache/files
+          PR_FILE_CACHE_WRITE: 1
+          PREFETCH_CANDIDATE_FILES: 1
         run: python .github/workflows/scripts/detect_duplicate_prs.py
 
+      - name: Save PR file cache
+        if: ${{ github.event.pull_request.head.repo.full_name == github.repository && steps.restore-pr-cache.outputs.cache-hit != 'true' }}
+        continue-on-error: true
+        uses: actions/cache/save@v4
+        with:
+          path: .github/workflows/.dup_pr_cache/files
+          key: dup-pr-files-${{ github.repository }}-${{ steps.cache-epoch.outputs.week }}
diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py
index 339d10270a52..a8177220a164 100644
--- a/.github/workflows/scripts/detect_duplicate_issues.py
+++ b/.github/workflows/scripts/detect_duplicate_issues.py
@@ -1,60 +1,240 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-Detect duplicate Issues using OpenAI embeddings + cosine similarity.
-Compares: title + body keywords.
+Detect duplicate issues using two-stage similarity:
+1) semantic text similarity (title + body)
+2) blended score with title token overlap
+
+Workflow behavior:
+- Upsert one bot comment identified by marker.
+- Remove stale bot comment and duplicate label when no matches remain.
+
+Local debug example:
+GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings .venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py
 """
 
+import json
 import os
+import re
+from pathlib import Path
 
 import numpy as np
 import requests
-
-# 替换 get_embedding 函数，使用 sentence-transformers（免费）
-from sentence_transformers import SentenceTransformer
-
-model = SentenceTransformer("all-MiniLM-L6-v2")
-
+from sklearn.feature_extraction.text import HashingVectorizer
+
+USE_SENTENCE_TRANSFORMERS = os.getenv("USE_SENTENCE_TRANSFORMERS", "1").lower() in {
+    "1",
+    "true",
+    "yes",
+}
+try:
+    if USE_SENTENCE_TRANSFORMERS:
+        from sentence_transformers import SentenceTransformer
+    else:
+        SentenceTransformer = None
+except Exception:
+    SentenceTransformer = None
+
+model = None
+if SentenceTransformer is not None:
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+
+hashing_vectorizer = HashingVectorizer(
+    n_features=2048,
+    alternate_sign=False,
+    norm="l2",
+)
 
 GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
 ISSUE_NUMBER = int(os.environ["ISSUE_NUMBER"])
 REPO = os.environ["REPO"]
-DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"}
+DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"}
 
 HEADERS = {"Accept": "application/vnd.github+json"}
 if GITHUB_TOKEN:
     HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"
 
-SIMILARITY_THRESHOLD = 0.88
-TOP_K = 5
-MAX_HISTORY = 300
+SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.82"))
+TOP_K = int(os.getenv("TOP_K", "5"))
+MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500"))
+ISSUE_CANDIDATE_STATE = os.getenv("ISSUE_CANDIDATE_STATE", "all")
+TITLE_COMPARE_TOP_N = int(os.getenv("TITLE_COMPARE_TOP_N", "25"))
+TEXT_WEIGHT = float(os.getenv("TEXT_WEIGHT", "0.8"))
+TITLE_WEIGHT = float(os.getenv("TITLE_WEIGHT", "0.2"))
+COMMENT_MARKER = "<!-- duplicate-issue-checker -->"
+DUPLICATE_LABEL = os.getenv("DUPLICATE_LABEL", "possible-duplicate")
+AUTO_LABEL = os.getenv("AUTO_LABEL", "1").lower() in {"1", "true", "yes"}
+ISSUE_EMBED_CACHE_DIR = os.getenv("ISSUE_EMBED_CACHE_DIR", "")
+ISSUE_EMBED_CACHE_WRITE = os.getenv("ISSUE_EMBED_CACHE_WRITE", "1").lower() in {
+    "1",
+    "true",
+    "yes",
+}
+EMBEDDING_MODE = "sentence-transformers" if model is not None else "hashing-vectorizer"
+FEATURE_VERSION = "v2-template-cleaned"
+RUN_STATS = {
+    "api_requests": 0,
+    "candidates_fetched": 0,
+    "text_scored": 0,
+    "final_scored": 0,
+    "embed_cache_hits": 0,
+    "embed_cache_misses": 0,
+    "embed_cache_writes": 0,
+}
+DROP_BODY_SECTION_HEADERS = {
+    "your current environment",
+    "environment",
+    "before submitting",
+    "checklist",
+}
+DROP_BODY_LINE_PATTERNS = [
+    re.compile(r"^\s*-\s*\[[ xX]\]\s*"),
+    re.compile(r"^\s*<!--.*-->\s*$"),
+]
+ENV_BLOCK_HINTS = {
+    "collecting environment information",
+    "system info",
+    "pytorch info",
+    "python environment",
+    "cuda / gpu info",
+    "cpu info",
+    "the output of python collect_env.py",
+}
 
 
 def gh_get(url, params=None):
+    RUN_STATS["api_requests"] += 1
     r = requests.get(url, headers=HEADERS, params=params)
     r.raise_for_status()
     return r.json()
 
 
+def gh_post(url, payload):
+    RUN_STATS["api_requests"] += 1
+    r = requests.post(url, headers=HEADERS, json=payload)
+    r.raise_for_status()
+
+
+def gh_patch(url, payload):
+    RUN_STATS["api_requests"] += 1
+    r = requests.patch(url, headers=HEADERS, json=payload)
+    r.raise_for_status()
+
+
+def gh_delete(url, ignore_not_found=False):
+    RUN_STATS["api_requests"] += 1
+    r = requests.delete(url, headers=HEADERS)
+    if ignore_not_found and r.status_code == 404:
+        return
+    r.raise_for_status()
+
+
+def _header_name(line: str) -> str:
+    stripped = line.strip().lstrip("#").strip().lower()
+    return stripped.rstrip(":")
+
+
+def _should_drop_code_block(lines: list[str]) -> bool:
+    if not lines:
+        return False
+    block_text = "\n".join(lines).lower()
+    hint_hits = sum(1 for hint in ENV_BLOCK_HINTS if hint in block_text)
+    if hint_hits >= 2:
+        return True
+    if len(lines) >= 60 and hint_hits >= 1:
+        return True
+    return False
+
+
+def clean_issue_body(body: str) -> str:
+    lines = body.replace("\r\n", "\n").split("\n")
+    output: list[str] = []
+    i = 0
+    skip_until_next_header = False
+    while i < len(lines):
+        line = lines[i]
+        header = _header_name(line)
+        if line.strip().startswith("##"):
+            skip_until_next_header = header in DROP_BODY_SECTION_HEADERS
+            i += 1
+            continue
+        if skip_until_next_header:
+            i += 1
+            continue
+        if line.strip().startswith("```"):
+            block = [line]
+            i += 1
+            while i < len(lines):
+                block.append(lines[i])
+                if lines[i].strip().startswith("```"):
+                    i += 1
+                    break
+                i += 1
+            if not _should_drop_code_block(block):
+                output.extend(block)
+            continue
+        if any(p.search(line) for p in DROP_BODY_LINE_PATTERNS):
+            i += 1
+            continue
+        output.append(line)
+        i += 1
+    cleaned = "\n".join(output).strip()
+    return cleaned
+
+
 def build_issue_text(issue):
-    return f"Title: {issue.get('title', '')}\nBody: {(issue.get('body') or '')[:1000]}"
+    raw_body = issue.get("body") or ""
+    cleaned_body = clean_issue_body(raw_body)
+    return f"Title: {issue.get('title', '')}\nBody: {cleaned_body[:1000]}"
+
+
+def title_tokens(issue):
+    title = (issue.get("title") or "").lower()
+    return re.findall(r"[a-z0-9_]+", title)
 
 
 def get_embedding(text: str):
-    return model.encode(text)
+    if model is not None:
+        return np.asarray(model.encode(text), dtype=float)
+    return hashing_vectorizer.transform([text]).toarray()[0]
 
 
 def cosine_similarity(a, b):
     return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))
 
 
+def jaccard_similarity(a, b):
+    sa = set(a)
+    sb = set(b)
+    if not sa or not sb:
+        return 0.0
+    return len(sa & sb) / len(sa | sb)
+
+
 def post_comment(issue_number, body):
     if DRY_RUN:
         print("DRY_RUN enabled: skip posting issue comment.")
         print(body)
         return
     url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments"
-    requests.post(url, headers=HEADERS, json={"body": body})
+    gh_post(url, {"body": body})
+
+
+def patch_comment(comment_id, body):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip updating comment {comment_id}.")
+        print(body)
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
+    gh_patch(url, {"body": body})
+
+
+def delete_comment(comment_id):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip deleting comment {comment_id}.")
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/comments/{comment_id}"
+    gh_delete(url)
 
 
 def add_label(issue_number, label):
@@ -62,87 +242,204 @@ def add_label(issue_number, label):
         print(f"DRY_RUN enabled: skip adding label {label}.")
         return
     url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels"
-    requests.post(url, headers=HEADERS, json={"labels": [label]})
+    gh_post(url, {"labels": [label]})
+
+
+def remove_label(issue_number, label):
+    if DRY_RUN:
+        print(f"DRY_RUN enabled: skip removing label {label}.")
+        return
+    url = f"https://api.github.com/repos/{REPO}/issues/{issue_number}/labels/{label}"
+    gh_delete(url, ignore_not_found=True)
+
+
+def find_existing_bot_comment(issue_number):
+    page = 1
+    while True:
+        comments = gh_get(
+            f"https://api.github.com/repos/{REPO}/issues/{issue_number}/comments",
+            params={"per_page": 100, "page": page},
+        )
+        if not comments:
+            return None
+        for comment in comments:
+            if COMMENT_MARKER in (comment.get("body") or ""):
+                return comment["id"]
+        if len(comments) < 100:
+            return None
+        page += 1
+
+
+def print_run_stats():
+    print(
+        "Stats: "
+        f"api_requests={RUN_STATS['api_requests']} "
+        f"candidates_fetched={RUN_STATS['candidates_fetched']} "
+        f"text_scored={RUN_STATS['text_scored']} "
+        f"final_scored={RUN_STATS['final_scored']} "
+        f"embed_cache_hits={RUN_STATS['embed_cache_hits']} "
+        f"embed_cache_misses={RUN_STATS['embed_cache_misses']} "
+        f"embed_cache_writes={RUN_STATS['embed_cache_writes']}"
+    )
+
+
+def get_issue_features(issue):
+    cache_file = None
+    issue_number = issue.get("number")
+    updated_at = issue.get("updated_at") or ""
+    if ISSUE_EMBED_CACHE_DIR and issue_number is not None:
+        cache_dir = Path(ISSUE_EMBED_CACHE_DIR)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_file = cache_dir / f"{issue_number}.json"
+        if cache_file.exists():
+            try:
+                payload = json.loads(cache_file.read_text(encoding="utf-8"))
+                embedding = payload.get("embedding")
+                title_tok = payload.get("title_tokens")
+                if (
+                    payload.get("feature_version") == FEATURE_VERSION
+                    and payload.get("mode") == EMBEDDING_MODE
+                    and payload.get("updated_at", "") == updated_at
+                    and isinstance(embedding, list)
+                    and isinstance(title_tok, list)
+                ):
+                    RUN_STATS["embed_cache_hits"] += 1
+                    return np.asarray(embedding, dtype=float), title_tok
+            except Exception:
+                pass
+    RUN_STATS["embed_cache_misses"] += 1
+    text = build_issue_text(issue)
+    emb = get_embedding(text)
+    title_tok = title_tokens(issue)
+    if cache_file is not None and ISSUE_EMBED_CACHE_WRITE:
+        try:
+            cache_file.write_text(
+                json.dumps(
+                    {
+                        "feature_version": FEATURE_VERSION,
+                        "mode": EMBEDDING_MODE,
+                        "updated_at": updated_at,
+                        "embedding": emb.tolist(),
+                        "title_tokens": title_tok,
+                    }
+                ),
+                encoding="utf-8",
+            )
+            RUN_STATS["embed_cache_writes"] += 1
+        except Exception:
+            pass
+    return emb, title_tok
 
 
 def main():
-    # 1. 获取当前 Issue
-    current = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}")
-    # 跳过 PR（GitHub API 中 PR 也会出现在 issues 接口）
-    if "pull_request" in current:
+    current_issue = gh_get(f"https://api.github.com/repos/{REPO}/issues/{ISSUE_NUMBER}")
+    if "pull_request" in current_issue:
         print("This is a PR, skipping.")
         return
 
-    current_text = build_issue_text(current)
-    current_emb = get_embedding(current_text)
-
-    # 2. 拉取历史 Issues（open + closed）
-    history_issues = []
-    for state in ["open", "closed"]:
-        page = 1
-        while len(history_issues) < MAX_HISTORY:
-            issues = gh_get(
-                f"https://api.github.com/repos/{REPO}/issues",
-                params={
-                    "state": state,
-                    "per_page": 50,
-                    "page": page,
-                    "sort": "updated",
-                    "direction": "desc",
-                },
-            )
-            if not issues:
-                break
-            for issue in issues:
-                # 跳过 PR 和自身
-                if "pull_request" not in issue and issue["number"] != ISSUE_NUMBER:
-                    history_issues.append(issue)
-            page += 1
-            if len(issues) < 50:
+    current_emb, current_title_tokens = get_issue_features(current_issue)
+
+    candidates = []
+    page = 1
+    while len(candidates) < MAX_CANDIDATES:
+        issues = gh_get(
+            f"https://api.github.com/repos/{REPO}/issues",
+            params={
+                "state": ISSUE_CANDIDATE_STATE,
+                "per_page": 50,
+                "page": page,
+                "sort": "updated",
+                "direction": "desc",
+            },
+        )
+        if not issues:
+            break
+        for issue in issues:
+            if "pull_request" in issue:
+                continue
+            if issue["number"] == ISSUE_NUMBER:
+                continue
+            candidates.append(issue)
+            if len(candidates) >= MAX_CANDIDATES:
                 break
-        if len(history_issues) >= MAX_HISTORY:
+        page += 1
+        if len(issues) < 50:
             break
+    RUN_STATS["candidates_fetched"] = len(candidates)
+
+    text_results = []
+    candidate_title_tokens = {}
+    for issue in candidates:
+        emb, title_tok = get_issue_features(issue)
+        candidate_title_tokens[issue["number"]] = title_tok
+        text_sim = cosine_similarity(current_emb, emb)
+        RUN_STATS["text_scored"] += 1
+        text_results.append((text_sim, issue))
+
+    text_results.sort(key=lambda x: -x[0])
+    title_candidates = text_results[:TITLE_COMPARE_TOP_N]
+
+    blended_results = []
+    for text_sim, issue in title_candidates:
+        tok_sim = jaccard_similarity(
+            current_title_tokens, candidate_title_tokens.get(issue["number"], [])
+        )
+        final_sim = TEXT_WEIGHT * text_sim + TITLE_WEIGHT * tok_sim
+        RUN_STATS["final_scored"] += 1
+        blended_results.append((final_sim, issue, text_sim, tok_sim))
 
-    history_issues = history_issues[:MAX_HISTORY]
-
-    # 3. 计算相似度
-    results = []
-    for issue in history_issues:
-        text = build_issue_text(issue)
-        emb = get_embedding(text)
-        sim = cosine_similarity(current_emb, emb)
-        results.append((sim, issue))
-
-    results.sort(key=lambda x: -x[0])
+    blended_results.sort(key=lambda x: -x[0])
     top_results = [
-        (sim, i) for sim, i in results[:TOP_K] if sim >= SIMILARITY_THRESHOLD
+        (sim, issue, text_sim, tok_sim)
+        for sim, issue, text_sim, tok_sim in blended_results[:TOP_K]
+        if sim >= SIMILARITY_THRESHOLD
     ]
 
+    existing_comment_id = find_existing_bot_comment(ISSUE_NUMBER)
     if not top_results:
+        if existing_comment_id is not None:
+            delete_comment(existing_comment_id)
+            print("Deleted stale duplicate checker comment.")
+        if AUTO_LABEL:
+            remove_label(ISSUE_NUMBER, DUPLICATE_LABEL)
         print("No highly similar issues found.")
+        print_run_stats()
         return
 
-    # 4. 自动打 label + 发评论
-    add_label(ISSUE_NUMBER, "possible-duplicate")
+    if AUTO_LABEL:
+        add_label(ISSUE_NUMBER, DUPLICATE_LABEL)
 
     lines = [
-        "## 🔍 Possible Duplicate Issue Detected\n",
-        "The following existing issues appear highly similar:\n",
-        "| Similarity | Issue | State | Title |",
-        "|---|---|---|---|",
+        COMMENT_MARKER,
+        "## 🔍 Potentially Related Issues\n",
+        f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this issue:\n",
+        "If this is intentional and complementary work, feel free to ignore this notice.\n",
+        "| Match Score | Desc Similarity | Title Overlap | Issue # | State | Title |",
+        "|---|---|---|---|---|---|",
     ]
-    for sim, issue in top_results:
+    for sim, issue, text_sim, tok_sim in top_results:
         state_icon = "🟢" if issue["state"] == "open" else "🔴"
         row = (
-            f"| {sim:.0%} | #{issue['number']} | {state_icon} {issue['state']} | "
+            f"| {sim:.0%} | {text_sim:.0%} | {tok_sim:.0%} | "
+            f"#{issue['number']} | {state_icon} {issue['state']} | "
             f"[{issue['title']}]({issue['html_url']}) |"
         )
         lines.append(row)
     lines.append(
-        "\n> 🤖 Auto-detected by duplicate issue checker. A maintainer will verify."
+        "\n> 🤖 Auto-detected by similarity signals (title/body/title-tokens)."
+    )
+    lines.append(
+        "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates."
     )
-    post_comment(ISSUE_NUMBER, "\n".join(lines))
-    print(f"Posted comment with {len(top_results)} similar issues.")
+    body = "\n".join(lines)
+
+    if existing_comment_id is not None:
+        patch_comment(existing_comment_id, body)
+        print(f"Updated comment with {len(top_results)} similar issues.")
+    else:
+        post_comment(ISSUE_NUMBER, body)
+        print(f"Posted comment with {len(top_results)} similar issues.")
+    print_run_stats()
 
 
 if __name__ == "__main__":
diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py
index 86e9358126e3..85dd26c85c08 100644
--- a/.github/workflows/scripts/detect_duplicate_prs.py
+++ b/.github/workflows/scripts/detect_duplicate_prs.py
@@ -9,9 +9,14 @@
 3. Use text similarity for first-pass candidate ranking.
 4. Compute blended text+file similarity on top candidates.
 5. Upsert one bot comment; remove stale comment when no matches remain.
+
+Local debug example:
+GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files .venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py
 """
 
+import json
 import os
+from pathlib import Path
 
 import numpy as np
 import requests
@@ -44,7 +49,7 @@
 GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
 PR_NUMBER = int(os.environ["PR_NUMBER"])
 REPO = os.environ["REPO"]
-DRY_RUN = os.getenv("DRY_RUN", "0").lower() in {"1", "true", "yes"}
+DRY_RUN = os.getenv("DRY_RUN", "1").lower() in {"1", "true", "yes"}
 
 HEADERS = {"Accept": "application/vnd.github+json"}
 if GITHUB_TOKEN:
@@ -52,27 +57,64 @@
 
 SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.75"))
 TOP_K = 5
-MAX_CANDIDATES = int(
-    os.getenv("MAX_CANDIDATES", os.getenv("MAX_OPEN_CANDIDATES", "120"))
-)
+MAX_CANDIDATES = int(os.getenv("MAX_CANDIDATES", "500"))
 PR_CANDIDATE_STATE = os.getenv("PR_CANDIDATE_STATE", "all")
 FILE_COMPARE_TOP_N = int(os.getenv("FILE_COMPARE_TOP_N", "20"))
+PREFETCH_CANDIDATE_FILES = os.getenv("PREFETCH_CANDIDATE_FILES", "1").lower() in {
+    "1",
+    "true",
+    "yes",
+}
 TEXT_WEIGHT = 0.75
 FILE_WEIGHT = 0.25
 COMMENT_MARKER = "<!-- duplicate-pr-checker -->"
+PR_FILE_CACHE_DIR = os.getenv("PR_FILE_CACHE_DIR", "")
+PR_FILE_CACHE_WRITE = os.getenv("PR_FILE_CACHE_WRITE", "1").lower() in {
+    "1",
+    "true",
+    "yes",
+}
+RUN_STATS = {
+    "api_requests": 0,
+    "file_cache_hits": 0,
+    "file_cache_misses": 0,
+    "file_cache_writes": 0,
+}
 
 
 def gh_get(url, params=None):
+    RUN_STATS["api_requests"] += 1
     r = requests.get(url, headers=HEADERS, params=params)
     r.raise_for_status()
     return r.json()
 
 
 def get_pr_files(pr_number):
+    cache_file = None
+    if PR_FILE_CACHE_DIR:
+        cache_dir = Path(PR_FILE_CACHE_DIR)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_file = cache_dir / f"{pr_number}.json"
+        if cache_file.exists():
+            try:
+                cached_files = json.loads(cache_file.read_text(encoding="utf-8"))
+                if isinstance(cached_files, list):
+                    RUN_STATS["file_cache_hits"] += 1
+                    return cached_files
+            except Exception:
+                pass
+    RUN_STATS["file_cache_misses"] += 1
     url = f"https://api.github.com/repos/{REPO}/pulls/{pr_number}/files"
     try:
         files = gh_get(url, params={"per_page": 100})
-        return [f["filename"] for f in files]
+        filenames = [f["filename"] for f in files]
+        if cache_file is not None and PR_FILE_CACHE_WRITE:
+            try:
+                cache_file.write_text(json.dumps(filenames), encoding="utf-8")
+                RUN_STATS["file_cache_writes"] += 1
+            except Exception:
+                pass
+        return filenames
     except Exception:
         return []
 
@@ -147,13 +189,22 @@ def find_existing_bot_comment(issue_number):
 
 
 def main():
+    def print_run_stats():
+        print(
+            "Stats: "
+            f"api_requests={RUN_STATS['api_requests']} "
+            f"file_cache_hits={RUN_STATS['file_cache_hits']} "
+            f"file_cache_misses={RUN_STATS['file_cache_misses']} "
+            f"file_cache_writes={RUN_STATS['file_cache_writes']}"
+        )
+
     # 1. Load current PR context.
     current_pr = gh_get(f"https://api.github.com/repos/{REPO}/pulls/{PR_NUMBER}")
     current_text = build_pr_text(current_pr)
     current_emb = get_embedding(current_text)
     current_files = get_pr_files(PR_NUMBER)
 
-    # 2. Fetch PR candidates (exclude current PR).
+    # 2. Fetch PR candidates (excluding the current PR), most recently updated first.
     history_prs = []
     page = 1
     while len(history_prs) < MAX_CANDIDATES:
@@ -186,6 +237,12 @@ def main():
         text_sim = cosine_similarity(current_emb, emb)
         text_results.append((text_sim, pr))
 
+    # Warm file cache for all candidates so different PR runs can reuse
+    # a stable candidate pool across workflow executions.
+    if PREFETCH_CANDIDATE_FILES:
+        for pr in history_prs:
+            get_pr_files(pr["number"])
+
     text_results.sort(key=lambda x: -x[0])
     file_candidates = text_results[:FILE_COMPARE_TOP_N]
 
@@ -212,6 +269,7 @@ def main():
             print("Deleted stale duplicate checker comment.")
         else:
             print("No highly similar PRs found.")
+        print_run_stats()
         return
 
     lines = [
@@ -243,6 +301,7 @@ def main():
     else:
         post_comment(PR_NUMBER, body)
         print(f"Posted comment with {len(top_results)} similar PRs.")
+    print_run_stats()
 
 
 if __name__ == "__main__":

From c4504b06d92793c47d594421eece6bd360ff4d25 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Thu, 16 Apr 2026 18:21:11 +0800
Subject: [PATCH 5/6] ci: use regex module for forbidden import check

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .../scripts/detect_duplicate_issues.py        | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/scripts/detect_duplicate_issues.py b/.github/workflows/scripts/detect_duplicate_issues.py
index a8177220a164..43d992ce6af9 100644
--- a/.github/workflows/scripts/detect_duplicate_issues.py
+++ b/.github/workflows/scripts/detect_duplicate_issues.py
@@ -10,15 +10,17 @@
 - Remove stale bot comment and duplicate label when no matches remain.
 
 Local debug example:
-GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings .venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py
+GITHUB_TOKEN="$(gh auth token)" ISSUE_NUMBER=39774 REPO=vllm-project/vllm DRY_RUN=1 \
+ISSUE_EMBED_CACHE_DIR=.github/workflows/.dup_issue_cache/embeddings \
+.venv/bin/python .github/workflows/scripts/detect_duplicate_issues.py
 """
 
 import json
 import os
-import re
 from pathlib import Path
 
 import numpy as np
+import regex as re
 import requests
 from sklearn.feature_extraction.text import HashingVectorizer
 
@@ -141,9 +143,7 @@ def _should_drop_code_block(lines: list[str]) -> bool:
     hint_hits = sum(1 for hint in ENV_BLOCK_HINTS if hint in block_text)
     if hint_hits >= 2:
         return True
-    if len(lines) >= 60 and hint_hits >= 1:
-        return True
-    return False
+    return bool(len(lines) >= 60 and hint_hits >= 1)
 
 
 def clean_issue_body(body: str) -> str:
@@ -412,8 +412,14 @@ def main():
     lines = [
         COMMENT_MARKER,
         "## 🔍 Potentially Related Issues\n",
-        f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this issue:\n",
-        "If this is intentional and complementary work, feel free to ignore this notice.\n",
+        (
+            f"The following {ISSUE_CANDIDATE_STATE} issues may be related to this "
+            "issue:\n"
+        ),
+        (
+            "If this is intentional and complementary work, feel free to ignore "
+            "this notice.\n"
+        ),
         "| Match Score | Desc Similarity | Title Overlap | Issue # | State | Title |",
         "|---|---|---|---|---|---|",
     ]
@@ -429,7 +435,8 @@ def main():
         "\n> 🤖 Auto-detected by similarity signals (title/body/title-tokens)."
     )
     lines.append(
-        "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates."
+        "This is a soft hint only. Please review manually to determine whether "
+        "these are related work or true duplicates."
     )
     body = "\n".join(lines)
 

From 243eb586bf551c768302c4c1b2103726dde1c2b4 Mon Sep 17 00:00:00 2001
From: Peter Pan <Peter.Pan@daocloud.io>
Date: Thu, 16 Apr 2026 21:31:01 +0800
Subject: [PATCH 6/6] Fix E501 in duplicate PR checker script

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
---
 .../workflows/scripts/detect_duplicate_prs.py   | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/scripts/detect_duplicate_prs.py b/.github/workflows/scripts/detect_duplicate_prs.py
index 85dd26c85c08..4db9d8492a7a 100644
--- a/.github/workflows/scripts/detect_duplicate_prs.py
+++ b/.github/workflows/scripts/detect_duplicate_prs.py
@@ -11,7 +11,9 @@
 5. Upsert one bot comment; remove stale comment when no matches remain.
 
 Local debug example:
-GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files .venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py
+GITHUB_TOKEN="$(gh auth token)" PR_NUMBER=61456 REPO=vllm-project/vllm DRY_RUN=1 \
+PR_FILE_CACHE_DIR=.github/workflows/.dup_pr_cache/files \
+.venv/bin/python .github/workflows/scripts/detect_duplicate_prs.py
 """
 
 import json
@@ -275,8 +277,14 @@ def print_run_stats():
     lines = [
         COMMENT_MARKER,
         "## 🔍 Potentially Related PRs\n",
-        f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, and could overlap in intent or implementation:\n",
-        "If this is intentional and complementary work, feel free to ignore this notice.\n",
+        (
+            f"The following {PR_CANDIDATE_STATE} PRs may be related to this PR, "
+            "and could overlap in intent or implementation:\n"
+        ),
+        (
+            "If this is intentional and complementary work, feel free to ignore "
+            "this notice.\n"
+        ),
         "| Match Score | Desc Similarity | Files Overlap | PR # | State | Title |",
         "|---|---|---|---|---|---|",
     ]
@@ -292,7 +300,8 @@ def print_run_stats():
         lines.append(row)
     lines.append("\n> 🤖 Auto-detected by similarity signals (title/body/files).")
     lines.append(
-        "This is a soft hint only. Please review manually to determine whether these are related work or true duplicates."
+        "This is a soft hint only. Please review manually to determine whether "
+        "these are related work or true duplicates."
     )
     body = "\n".join(lines)
     if existing_comment_id is not None: