From 24314ad79d3307b0de7c443b279128716cfcb873 Mon Sep 17 00:00:00 2001 From: Lior Date: Thu, 28 May 2026 07:26:34 -0400 Subject: [PATCH 1/2] feat(B-0914.6): proximity-agent substrate-engineering substrate de-duplication (canonical-form + Jaccard-similarity clustering); 19 tests pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Google co-scientist proximity agent (Nature 2026): maps ideas into high-dimensional space + groups similar variants to prevent wasting compute on substantively-identical proposals. Generalized to TS-side substrate with two de-dup mechanisms. What this adds: - ProximityFeedback discriminated union + ProximityResult Result-shape - Cluster with representative + members + canonicalForm - clusterByCanonical(corpus, canonicalFn) — deterministic dedup - jaccardSimilarity(tokensA, tokensB) — Jaccard coefficient - defaultTokenize(text) — lowercase + stop-word filter - clusterBySimilarity(context) — greedy clustering by Jaccard threshold - uniqueRepresentatives(result) — drop duplicates convenience Tests (19; all pass): - clusterByCanonical groups same-canonical items - first-seen is representative (pre-sort by score for top-ranked rep) - empty corpus → EmptyCorpus - all unique → N clusters of size 1 - jaccardSimilarity edge cases (identical / disjoint / partial / empty) - defaultTokenize lowercase + stop-word filter - clusterBySimilarity threshold catches near-duplicates - High threshold keeps all distinct; low threshold clusters aggressively - Invalid threshold → InvalidThreshold - uniqueRepresentatives extracts rep-only list - Compose with evolution substrate: pre-sort by score → rep is best - ProximityFeedback exhaustive switch Composes with substrate: - B-0914.6 backlog row - B-0914.5 PR #5767 evolution (de-dup Survivor list before mash) - B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles) - verify-existing-substrate-before-authoring rule (proximity IS substrate-inventory at runtime 
scope) - grep-substrate-anchors-before-razor-as-metaphysical rule (substrate- anchor check at runtime scope) - additive-not-zero-sum + monad-propagation + asymmetric-authorship Real semantic embeddings (TF-IDF / sentence-BERT) deferred; current PoC handles structural dedup case (substrate-engineering work often produces variants that differ only in serialization order, key casing, attribute ordering — canonical-form normalization catches these without embeddings). Co-Authored-By: Claude Opus 4.7 --- tools/workflow-engine/proximity.test.ts | 222 ++++++++++++++++++++++ tools/workflow-engine/proximity.ts | 240 ++++++++++++++++++++++++ 2 files changed, 462 insertions(+) create mode 100644 tools/workflow-engine/proximity.test.ts create mode 100644 tools/workflow-engine/proximity.ts diff --git a/tools/workflow-engine/proximity.test.ts b/tools/workflow-engine/proximity.test.ts new file mode 100644 index 0000000000..9761115cf1 --- /dev/null +++ b/tools/workflow-engine/proximity.test.ts @@ -0,0 +1,222 @@ +/** + * tools/workflow-engine/proximity.test.ts + * + * B-0914.6 — invariant tests for proximity-dedup substrate. 
+ */ + +import { describe, expect, it } from "bun:test"; +import { + clusterByCanonical, + clusterBySimilarity, + defaultTokenize, + jaccardSimilarity, + uniqueRepresentatives, +} from "./proximity"; + +interface Hypothesis { + mechanism: string; + drugCandidate: string; + evidence: number; +} + +describe("B-0914.6 proximity-dedup substrate", () => { + it("clusterByCanonical groups items with same canonical form", () => { + const corpus: Hypothesis[] = [ + { mechanism: "ER-stress", drugCandidate: "cur-6", evidence: 0.8 }, + { mechanism: "ER-Stress", drugCandidate: "Cur-6", evidence: 0.7 }, // case-only diff + { mechanism: "kinase", drugCandidate: "drug-x", evidence: 0.6 }, + ]; + const result = clusterByCanonical(corpus, (h) => + `${h.mechanism.toLowerCase()}|${h.drugCandidate.toLowerCase()}`, + ); + expect(result.ok).toBe(true); + if (!result.ok) return; + expect(result.uniqueCount).toBe(2); + expect(result.clusters[0]!.members.length).toBe(2); // ER-stress + ER-Stress + expect(result.clusters[1]!.members.length).toBe(1); // kinase + }); + + it("clusterByCanonical: first-seen is representative", () => { + const corpus: Hypothesis[] = [ + { mechanism: "x", drugCandidate: "a", evidence: 0.5 }, // first + { mechanism: "x", drugCandidate: "a", evidence: 0.9 }, // duplicate + ]; + const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`); + expect(result.ok).toBe(true); + if (!result.ok) return; + expect(result.clusters[0]!.representative.evidence).toBe(0.5); // first-seen + }); + + it("clusterByCanonical empty corpus → EmptyCorpus", () => { + const result = clusterByCanonical([], (h) => h.mechanism); + expect(result.ok).toBe(false); + if (result.ok) return; + expect(result.feedback.kind).toBe("EmptyCorpus"); + }); + + it("clusterByCanonical: all unique returns N clusters of size 1", () => { + const corpus: Hypothesis[] = [ + { mechanism: "a", drugCandidate: "1", evidence: 0.1 }, + { mechanism: "b", drugCandidate: "2", evidence: 0.2 }, + { 
mechanism: "c", drugCandidate: "3", evidence: 0.3 }, + ]; + const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`); + expect(result.ok).toBe(true); + if (!result.ok) return; + expect(result.uniqueCount).toBe(3); + for (const c of result.clusters) { + expect(c.members.length).toBe(1); + } + }); + + it("jaccardSimilarity: identical token sets = 1.0", () => { + const a = new Set(["alpha", "beta", "gamma"]); + const b = new Set(["alpha", "beta", "gamma"]); + expect(jaccardSimilarity(a, b)).toBe(1.0); + }); + + it("jaccardSimilarity: disjoint token sets = 0.0", () => { + const a = new Set(["alpha", "beta"]); + const b = new Set(["gamma", "delta"]); + expect(jaccardSimilarity(a, b)).toBe(0.0); + }); + + it("jaccardSimilarity: partial overlap returns intersection/union", () => { + const a = new Set(["alpha", "beta", "gamma"]); + const b = new Set(["beta", "gamma", "delta"]); + // intersection = {beta, gamma} (size 2) + // union = {alpha, beta, gamma, delta} (size 4) + // jaccard = 2/4 = 0.5 + expect(jaccardSimilarity(a, b)).toBe(0.5); + }); + + it("jaccardSimilarity: both empty returns 1.0", () => { + expect(jaccardSimilarity(new Set(), new Set())).toBe(1.0); + }); + + it("jaccardSimilarity: one empty returns 0.0", () => { + expect(jaccardSimilarity(new Set(["x"]), new Set())).toBe(0.0); + expect(jaccardSimilarity(new Set(), new Set(["x"]))).toBe(0.0); + }); + + it("defaultTokenize: lowercase + filter stop words + filter single-char", () => { + const tokens = defaultTokenize("The quick BROWN fox jumps over the lazy dog"); + expect(tokens.has("quick")).toBe(true); + expect(tokens.has("brown")).toBe(true); + expect(tokens.has("the")).toBe(false); // stop word + expect(tokens.has("over")).toBe(true); + }); + + it("clusterBySimilarity: threshold catches near-duplicates", () => { + const corpus = [ + "ER-stress inhibition via cur-6 targeting unfolded protein response", + "ER-stress inhibition via cur-6 mechanism", // shares 'ER-stress', 
'inhibition', 'via', 'cur-6' + "kinase inhibition via drug-x targeting cellular signaling", + ]; + const result = clusterBySimilarity({ + corpus, + extractTokens: defaultTokenize, + threshold: 0.4, // moderate threshold + }); + expect(result.ok).toBe(true); + if (!result.ok) return; + // First 2 should cluster; third is distinct + expect(result.uniqueCount).toBe(2); + }); + + it("clusterBySimilarity: high threshold keeps all distinct", () => { + const corpus = ["alpha beta", "alpha beta gamma", "alpha beta delta"]; + const result = clusterBySimilarity({ + corpus, + extractTokens: defaultTokenize, + threshold: 1.0, // unanimous-only + }); + expect(result.ok).toBe(true); + if (!result.ok) return; + expect(result.uniqueCount).toBe(3); + }); + + it("clusterBySimilarity: low threshold clusters aggressively", () => { + const corpus = ["alpha beta gamma", "alpha", "beta", "gamma"]; + const result = clusterBySimilarity({ + corpus, + extractTokens: defaultTokenize, + threshold: 0.2, + }); + expect(result.ok).toBe(true); + if (!result.ok) return; + // First item has all 3; others share 1 of 3 with first → 1/3 ≈ 0.33 > 0.2 → cluster + expect(result.uniqueCount).toBe(1); + }); + + it("clusterBySimilarity: invalid threshold → InvalidThreshold", () => { + const result = clusterBySimilarity({ + corpus: ["x"], + extractTokens: defaultTokenize, + threshold: 1.5, + }); + expect(result.ok).toBe(false); + if (result.ok) return; + expect(result.feedback.kind).toBe("InvalidThreshold"); + }); + + it("clusterBySimilarity: empty corpus → EmptyCorpus", () => { + const result = clusterBySimilarity({ + corpus: [], + extractTokens: defaultTokenize, + threshold: 0.5, + }); + expect(result.ok).toBe(false); + if (result.ok) return; + expect(result.feedback.kind).toBe("EmptyCorpus"); + }); + + it("uniqueRepresentatives extracts representative-only list", () => { + const corpus: Hypothesis[] = [ + { mechanism: "x", drugCandidate: "a", evidence: 0.5 }, + { mechanism: "x", drugCandidate: "a", 
evidence: 0.9 }, + { mechanism: "y", drugCandidate: "b", evidence: 0.7 }, + ]; + const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`); + const reps = uniqueRepresentatives(result); + expect(reps.length).toBe(2); + expect(reps[0]!.mechanism).toBe("x"); + expect(reps[1]!.mechanism).toBe("y"); + }); + + it("uniqueRepresentatives returns empty for failed result", () => { + const result = clusterByCanonical([], (h) => h.mechanism); + expect(uniqueRepresentatives(result).length).toBe(0); + }); + + it("compose with evolution substrate: pre-sort by score → representative is best", () => { + // Simulates: TrueSkill ranks survivors → proximity dedups → take representatives + const survivors = [ + { id: "h3", canonical: "ER-stress", skill: 30 }, + { id: "h1", canonical: "ER-stress", skill: 20 }, + { id: "h2", canonical: "kinase", skill: 25 }, + ]; + // Pre-sort by skill descending (highest-rank first) + const sorted = [...survivors].sort((a, b) => b.skill - a.skill); + const result = clusterByCanonical(sorted, (s) => s.canonical); + expect(result.ok).toBe(true); + if (!result.ok) return; + // h3 (skill 30) was first-seen for "ER-stress" cluster + const erStressCluster = result.clusters.find((c) => c.canonicalForm === "ER-stress"); + expect(erStressCluster?.representative.id).toBe("h3"); + }); + + it("ProximityFeedback exhaustive switch (compile-time check)", () => { + type R = ReturnType>; + const acknowledge = (r: R): string => { + if (r.ok) return "ok"; + switch (r.feedback.kind) { + case "EmptyCorpus": + case "InvalidThreshold": + return r.feedback.kind; + } + }; + const r = clusterByCanonical([], (h) => h.mechanism); + expect(acknowledge(r)).toBe("EmptyCorpus"); + }); +}); diff --git a/tools/workflow-engine/proximity.ts b/tools/workflow-engine/proximity.ts new file mode 100644 index 0000000000..55d83784b6 --- /dev/null +++ b/tools/workflow-engine/proximity.ts @@ -0,0 +1,240 @@ +/** + * tools/workflow-engine/proximity.ts + * + * B-0914.6 — 
proximity agent substrate for substrate-engineering + * substrate de-duplication. + * + * Per Google co-scientist proximity agent (Nature 2026): maps ideas + * into high-dimensional space + groups similar variants to detect + * when generation produces near-duplicate hypotheses. Prevents wasting + * compute on substantively-identical proposals. + * + * This substrate ships TWO de-duplication mechanisms: + * - canonical-form normalization (deterministic; no embedding model) + * - similarity-by-shared-tokens (lightweight; no external dependency) + * + * Real semantic embeddings (TF-IDF / sentence-BERT / etc.) deferred to + * substrate-engineering work after operator-substrate-direction; current + * PoC handles the structural dedup case (substrate-engineering work + * often produces variants that differ only in serialization order, key + * casing, attribute ordering). + * + * Composes with: + * - B-0914.6 backlog row (proximity-dedup extension target) + * - B-0914.5 PR #5767 evolution substrate (Survivor de-dup before mash) + * - B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles) + * - .claude/rules/verify-existing-substrate-before-authoring (proximity + * IS substrate-inventory at runtime scope) + * - .claude/rules/grep-substrate-anchors-before-razor-as-metaphysical + * (verify substrate anchors before razor-flagging; proximity-dedup + * IS the substrate-anchor check at run-time scope) + * - .claude/rules/additive-not-zero-sum (substrate compounds; don't + * mint parallel substrate-engineering substrate) + * - .claude/rules/monad-propagation-pattern (Result) + * - .claude/rules/asymmetric-authorship (substrate-entity authors + * proximity verdict via TFeedback) + */ + +/** + * Proximity feedback per asymmetric-authorship + monad-propagation rules. + */ +export type ProximityFeedback = + | { kind: "EmptyCorpus" } + | { kind: "InvalidThreshold"; threshold: number }; + +/** + * Result-shape per monad-propagation rule. 
+ */ +export type ProximityResult = + | { ok: true; clusters: ReadonlyArray>; uniqueCount: number } + | { ok: false; feedback: ProximityFeedback }; + +/** + * Cluster of near-duplicate substrate items. + * + * The `representative` is the canonical form chosen to represent the + * cluster (substrate-honest: the highest-quality member by caller's + * substrate-engineering criterion). The `members` includes the + * representative + all near-duplicates clustered with it. + */ +export interface Cluster { + readonly representative: T; + readonly members: ReadonlyArray; + readonly canonicalForm: string; // the canonical-form key clustering uses +} + +/** + * Canonical-form normalization function — caller supplies how to map + * a substrate item to its canonical string form. Items with the SAME + * canonical form are clustered together. + * + * Example: for a hypothesis with `{mechanism, drugCandidate, evidence}`, + * canonical form might be `"${mechanism.toLowerCase()}|${drugCandidate.toLowerCase()}"` + * (ignores evidence; case-insensitive; ignores attribute order). + */ +export type CanonicalFn = (item: T) => string; + +/** + * Cluster items by canonical-form normalization. + * + * Items with the same canonical form go into the same cluster. + * The first item in each cluster (by input order) is the representative. + * Caller can override representative selection by pre-sorting input + * (e.g., by TrueSkill conservativeSkill descending → top-ranked + * representative). + * + * Pure function; no side effects; composable via Result.bind. 
+ */ +export function clusterByCanonical( + corpus: ReadonlyArray, + canonicalFn: CanonicalFn, +): ProximityResult { + if (corpus.length === 0) { + return { ok: false, feedback: { kind: "EmptyCorpus" } }; + } + + const byCanonical = new Map(); + const repByCanonical = new Map(); + + for (const item of corpus) { + const canonical = canonicalFn(item); + const existing = byCanonical.get(canonical); + if (existing) { + existing.push(item); + } else { + byCanonical.set(canonical, [item]); + repByCanonical.set(canonical, item); // first-seen is representative + } + } + + const clusters: Cluster[] = []; + for (const [canonical, members] of byCanonical.entries()) { + clusters.push({ + representative: repByCanonical.get(canonical)!, + members, + canonicalForm: canonical, + }); + } + + return { + ok: true, + clusters, + uniqueCount: clusters.length, + }; +} + +/** + * Token-based similarity: Jaccard coefficient on shared tokens. + * + * Returns value in [0, 1]: + * 1.0 = identical token sets + * 0.0 = no shared tokens + * + * Useful for comparing two substrate items where canonical-form + * normalization is too strict (need fuzzy matching). + */ +export function jaccardSimilarity( + tokensA: ReadonlySet, + tokensB: ReadonlySet, +): number { + if (tokensA.size === 0 && tokensB.size === 0) return 1.0; + if (tokensA.size === 0 || tokensB.size === 0) return 0.0; + const intersection = new Set(); + for (const t of tokensA) { + if (tokensB.has(t)) intersection.add(t); + } + const unionSize = tokensA.size + tokensB.size - intersection.size; + return intersection.size / unionSize; +} + +/** + * Token extraction: simple word-splitting + lowercase + filter stop words. + * + * Caller can supply custom tokenizer for domain-specific tokenization + * (medical terminology, code identifiers, etc.). 
+ */ +export function defaultTokenize(text: string): Set { + const stopWords = new Set([ + "a", "an", "the", "is", "are", "of", "in", "on", "at", "to", "for", + "with", "by", "as", "and", "or", "but", "if", "then", "this", "that", + "these", "those", "it", "its", "be", "been", "was", "were", + ]); + const tokens = new Set(); + const words = text.toLowerCase().match(/[a-z0-9]+/g) ?? []; + for (const w of words) { + if (!stopWords.has(w) && w.length >= 2) { + tokens.add(w); + } + } + return tokens; +} + +/** + * Cluster items by Jaccard similarity threshold. + * + * Items with similarity >= threshold are clustered together. Uses + * greedy clustering: for each item, find existing cluster with highest + * similarity to representative; if >= threshold, add to that cluster; + * else start new cluster with this item as representative. + * + * O(N * K) where K = number of clusters formed (worst case O(N²)). + */ +export interface SimilarityClusterContext { + readonly corpus: ReadonlyArray; + readonly extractTokens: (item: T) => Set; + readonly threshold: number; // Jaccard threshold in (0, 1] +} + +export function clusterBySimilarity( + context: SimilarityClusterContext, +): ProximityResult { + if (context.corpus.length === 0) { + return { ok: false, feedback: { kind: "EmptyCorpus" } }; + } + if (context.threshold <= 0 || context.threshold > 1 || !Number.isFinite(context.threshold)) { + return { ok: false, feedback: { kind: "InvalidThreshold", threshold: context.threshold } }; + } + + const clusterData: Array<{ rep: T; repTokens: Set; members: T[] }> = []; + + for (const item of context.corpus) { + const itemTokens = context.extractTokens(item); + let bestClusterIdx = -1; + let bestSimilarity = 0; + for (let i = 0; i < clusterData.length; i++) { + const sim = jaccardSimilarity(itemTokens, clusterData[i]!.repTokens); + if (sim > bestSimilarity) { + bestSimilarity = sim; + bestClusterIdx = i; + } + } + if (bestClusterIdx >= 0 && bestSimilarity >= context.threshold) { + 
clusterData[bestClusterIdx]!.members.push(item); + } else { + clusterData.push({ rep: item, repTokens: itemTokens, members: [item] }); + } + } + + const clusters: Cluster[] = clusterData.map((c) => ({ + representative: c.rep, + members: c.members, + canonicalForm: `[similarity:${context.threshold}]:${[...c.repTokens].sort().join(",")}`, + })); + + return { + ok: true, + clusters, + uniqueCount: clusters.length, + }; +} + +/** + * Convenience: extract representatives only (drop duplicates). + * + * Substrate-honest substrate-engineering: when dedup is the goal, + * this is the canonical "give me the unique items" form. + */ +export function uniqueRepresentatives(result: ProximityResult): ReadonlyArray { + if (!result.ok) return []; + return result.clusters.map((c) => c.representative); +} From 96c2182c95bc090347575747b147de28b8b7a8ad Mon Sep 17 00:00:00 2001 From: Lior Date: Thu, 28 May 2026 08:12:08 -0400 Subject: [PATCH 2/2] fix(PR #5772): clarify B-0914 subtask reference + document Cluster.canonicalForm semantic divergence (Copilot threads) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two threads from Copilot on tools/workflow-engine/proximity.ts: 1. Docblock cross-reference "B-0914.6 backlog row" was misleading — the seven .N subtasks (.1-.7) are sections within the parent B-0914 row file, NOT separate B-0914.N row files. Reworded to "B-0914 subtask .6" with explicit parent-row pointer + cross-reference clarification for subtasks .5 and .2 as well. 2. Cluster.canonicalForm field semantically divergent between clusterByCanonical (real canonical-form string from CanonicalFn) and clusterBySimilarity (synthesized "[similarity:]:" label). Added interface docblock that documents the divergence explicitly + names the discriminator (`[similarity:` prefix) callers can use + notes future-substrate rename path. Non-breaking: same field name + same type + same behavior; only docblock expanded. 
Composes with asymmetric-authorship + monad-propagation rules unchanged. Autonomous-loop tick 2026-05-28T12:08Z resolution of PR #5772 BLOCKED gate (unresolved Copilot threads only blocker; required checks all green). Co-Authored-By: Claude --- tools/workflow-engine/proximity.ts | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tools/workflow-engine/proximity.ts b/tools/workflow-engine/proximity.ts index 55d83784b6..2c9d7a1519 100644 --- a/tools/workflow-engine/proximity.ts +++ b/tools/workflow-engine/proximity.ts @@ -20,9 +20,14 @@ * casing, attribute ordering). * * Composes with: - * - B-0914.6 backlog row (proximity-dedup extension target) - * - B-0914.5 PR #5767 evolution substrate (Survivor de-dup before mash) - * - B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles) + * - B-0914 subtask .6 (parent row `B-0914-co-scientist-plus-robin-...` + * §"### B-0914.6 — Proximity-agent for substrate-engineering substrate + * de-duplication"; the seven .N subtasks are sections within the + * parent row, NOT separate B-0914.N row files) + * - B-0914 subtask .5 (PR #5767 evolution substrate — Survivor de-dup + * before mash) + * - B-0914 subtask .2 (PR #5769 closed-loop — de-dup pre-CI-dispatch + * saves cycles) * - .claude/rules/verify-existing-substrate-before-authoring (proximity * IS substrate-inventory at runtime scope) * - .claude/rules/grep-substrate-anchors-before-razor-as-metaphysical @@ -56,11 +61,27 @@ export type ProximityResult = * cluster (substrate-honest: the highest-quality member by caller's * substrate-engineering criterion). The `members` includes the * representative + all near-duplicates clustered with it. + * + * The `canonicalForm` field is the cluster-identity key. 
Its CONTENT + * depends on which clustering function produced the cluster: + * - `clusterByCanonical` — the actual canonical-form string the + * caller's `CanonicalFn` returned for all members of the cluster + * (substrate-honest: this IS the canonical form) + * - `clusterBySimilarity` — a synthesized cluster-identity label of + * the shape `[similarity:<threshold>]:<sorted-comma-joined-tokens>` derived + * from the representative's tokens (substrate-honest: NOT a real + * canonical form; serves as a stable cluster-identity key only) + * + * Callers needing to distinguish the two semantics check whether the + * field starts with `[similarity:` — that prefix marks similarity-clustered + * output. Future-substrate may rename to `clusterKey` + add a discriminator + * field; current shape preserves the substrate-engineering name while + * documenting the divergence. */ export interface Cluster { readonly representative: T; readonly members: ReadonlyArray; readonly canonicalForm: string; // cluster-identity key; see interface docblock for content semantics per producer } /**