From 24314ad79d3307b0de7c443b279128716cfcb873 Mon Sep 17 00:00:00 2001
From: Lior <lior@zeta.dev>
Date: Thu, 28 May 2026 07:26:34 -0400
Subject: [PATCH 1/2] feat(B-0914.6): proximity-agent substrate-engineering
 substrate de-duplication (canonical-form + Jaccard-similarity clustering); 19
 tests pass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the Google co-scientist proximity agent (Nature 2026), which maps ideas
into a high-dimensional space and groups similar variants so that compute is
not wasted on substantively-identical proposals. This patch generalizes the
idea to the TS-side substrate with two de-dup mechanisms; neither uses
embeddings (see the deferral note at the end of this message).

What this adds:
- ProximityFeedback discriminated union + ProximityResult<T> Result-shape
- Cluster<T> with representative + members + canonicalForm
- clusterByCanonical<T>(corpus, canonicalFn) — deterministic dedup
- jaccardSimilarity(tokensA, tokensB) — Jaccard coefficient
- defaultTokenize(text) — lowercase, split on non-alphanumerics, drop stop
  words and single-character tokens
- clusterBySimilarity<T>(context) — greedy clustering by Jaccard threshold
- uniqueRepresentatives<T>(result) — drop duplicates convenience

Tests (19; all pass):
- clusterByCanonical groups same-canonical items
- first-seen is representative (pre-sort by score for top-ranked rep)
- empty corpus → EmptyCorpus
- all unique → N clusters of size 1
- jaccardSimilarity edge cases (identical / disjoint / partial / empty)
- defaultTokenize lowercase + stop-word filter
- clusterBySimilarity threshold catches near-duplicates
- High threshold keeps all distinct; low threshold clusters aggressively
- Invalid threshold → InvalidThreshold
- uniqueRepresentatives extracts rep-only list
- Compose with evolution substrate: pre-sort by score → rep is best
- ProximityFeedback exhaustive switch

Composes with substrate:
- B-0914.6 backlog row
- B-0914.5 PR #5767 evolution (de-dup Survivor list before mash)
- B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles)
- verify-existing-substrate-before-authoring rule (proximity IS
  substrate-inventory at runtime scope)
- grep-substrate-anchors-before-razor-as-metaphysical rule (substrate-
  anchor check at runtime scope)
- additive-not-zero-sum + monad-propagation + asymmetric-authorship

Real semantic embeddings (TF-IDF / sentence-BERT) deferred; current PoC
handles structural dedup case (substrate-engineering work often produces
variants that differ only in serialization order, key casing, attribute
ordering — canonical-form normalization catches these without embeddings).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tools/workflow-engine/proximity.test.ts | 222 ++++++++++++++++++++++
 tools/workflow-engine/proximity.ts      | 240 ++++++++++++++++++++++++
 2 files changed, 462 insertions(+)
 create mode 100644 tools/workflow-engine/proximity.test.ts
 create mode 100644 tools/workflow-engine/proximity.ts
diff --git a/tools/workflow-engine/proximity.test.ts b/tools/workflow-engine/proximity.test.ts
new file mode 100644
index 0000000000..9761115cf1
--- /dev/null
+++ b/tools/workflow-engine/proximity.test.ts
@@ -0,0 +1,222 @@
+/**
+ * tools/workflow-engine/proximity.test.ts
+ *
+ * B-0914.6 — invariant tests for proximity-dedup substrate.
+ */
+
+import { describe, expect, it } from "bun:test";
+import {
+  clusterByCanonical,
+  clusterBySimilarity,
+  defaultTokenize,
+  jaccardSimilarity,
+  uniqueRepresentatives,
+} from "./proximity";
+
+interface Hypothesis {  // test fixture; every canonicalFn in this file ignores `evidence`
+  mechanism: string;
+  drugCandidate: string;
+  evidence: number;  // only read by the first-seen test, to tell two duplicates apart
+}
+
+describe("B-0914.6 proximity-dedup substrate", () => {
+  it("clusterByCanonical groups items with same canonical form", () => {
+    const corpus: Hypothesis[] = [
+      { mechanism: "ER-stress", drugCandidate: "cur-6", evidence: 0.8 },
+      { mechanism: "ER-Stress", drugCandidate: "Cur-6", evidence: 0.7 },  // case-only diff
+      { mechanism: "kinase", drugCandidate: "drug-x", evidence: 0.6 },
+    ];
+    const result = clusterByCanonical(corpus, (h) =>
+      `${h.mechanism.toLowerCase()}|${h.drugCandidate.toLowerCase()}`,
+    );
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    expect(result.uniqueCount).toBe(2);
+    expect(result.clusters[0]!.members.length).toBe(2);  // ER-stress + ER-Stress
+    expect(result.clusters[1]!.members.length).toBe(1);  // kinase
+  });
+
+  it("clusterByCanonical: first-seen is representative", () => {
+    const corpus: Hypothesis[] = [
+      { mechanism: "x", drugCandidate: "a", evidence: 0.5 },  // first
+      { mechanism: "x", drugCandidate: "a", evidence: 0.9 },  // duplicate
+    ];
+    const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`);
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    expect(result.clusters[0]!.representative.evidence).toBe(0.5);  // first-seen
+  });
+
+  it("clusterByCanonical empty corpus → EmptyCorpus", () => {
+    const result = clusterByCanonical<Hypothesis>([], (h) => h.mechanism);
+    expect(result.ok).toBe(false);
+    if (result.ok) return;
+    expect(result.feedback.kind).toBe("EmptyCorpus");
+  });
+
+  it("clusterByCanonical: all unique returns N clusters of size 1", () => {
+    const corpus: Hypothesis[] = [
+      { mechanism: "a", drugCandidate: "1", evidence: 0.1 },
+      { mechanism: "b", drugCandidate: "2", evidence: 0.2 },
+      { mechanism: "c", drugCandidate: "3", evidence: 0.3 },
+    ];
+    const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`);
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    expect(result.uniqueCount).toBe(3);
+    for (const c of result.clusters) {
+      expect(c.members.length).toBe(1);
+    }
+  });
+
+  it("jaccardSimilarity: identical token sets = 1.0", () => {
+    const a = new Set(["alpha", "beta", "gamma"]);
+    const b = new Set(["alpha", "beta", "gamma"]);
+    expect(jaccardSimilarity(a, b)).toBe(1.0);
+  });
+
+  it("jaccardSimilarity: disjoint token sets = 0.0", () => {
+    const a = new Set(["alpha", "beta"]);
+    const b = new Set(["gamma", "delta"]);
+    expect(jaccardSimilarity(a, b)).toBe(0.0);
+  });
+
+  it("jaccardSimilarity: partial overlap returns intersection/union", () => {
+    const a = new Set(["alpha", "beta", "gamma"]);
+    const b = new Set(["beta", "gamma", "delta"]);
+    // intersection = {beta, gamma} (size 2)
+    // union = {alpha, beta, gamma, delta} (size 4)
+    // jaccard = 2/4 = 0.5
+    expect(jaccardSimilarity(a, b)).toBe(0.5);
+  });
+
+  it("jaccardSimilarity: both empty returns 1.0", () => {
+    expect(jaccardSimilarity(new Set(), new Set())).toBe(1.0);
+  });
+
+  it("jaccardSimilarity: one empty returns 0.0", () => {
+    expect(jaccardSimilarity(new Set(["x"]), new Set())).toBe(0.0);
+    expect(jaccardSimilarity(new Set(), new Set(["x"]))).toBe(0.0);
+  });
+
+  it("defaultTokenize: lowercase + filter stop words + filter single-char", () => {
+    const tokens = defaultTokenize("The quick BROWN fox jumps over the lazy dog");  // NOTE(review): no single-char word in this input, so that filter is not exercised
+    expect(tokens.has("quick")).toBe(true);
+    expect(tokens.has("brown")).toBe(true);  // "BROWN" was lowercased
+    expect(tokens.has("the")).toBe(false);  // stop word
+    expect(tokens.has("over")).toBe(true);  // "over" is not in the stop-word list
+  });
+
+  it("clusterBySimilarity: threshold catches near-duplicates", () => {
+    const corpus = [
+      "ER-stress inhibition via cur-6 targeting unfolded protein response",
+      "ER-stress inhibition via cur-6 mechanism",  // tokenizer splits on "-" and drops "6": shares er, stress, inhibition, via, cur → 5/10 = 0.5
+      "kinase inhibition via drug-x targeting cellular signaling",  // shares inhibition, via, targeting with the first item → 3/13 ≈ 0.23
+    ];
+    const result = clusterBySimilarity({
+      corpus,
+      extractTokens: defaultTokenize,
+      threshold: 0.4,  // moderate threshold
+    });
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    // First 2 cluster (0.5 >= 0.4); third stays separate (≈0.23 < 0.4)
+    expect(result.uniqueCount).toBe(2);
+  });
+
+  it("clusterBySimilarity: high threshold keeps all distinct", () => {
+    const corpus = ["alpha beta", "alpha beta gamma", "alpha beta delta"];  // all overlap, but no two token sets are identical
+    const result = clusterBySimilarity({
+      corpus,
+      extractTokens: defaultTokenize,
+      threshold: 1.0,  // only identical token sets can merge at 1.0
+    });
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    expect(result.uniqueCount).toBe(3);
+  });
+
+  it("clusterBySimilarity: low threshold clusters aggressively", () => {
+    const corpus = ["alpha beta gamma", "alpha", "beta", "gamma"];
+    const result = clusterBySimilarity({
+      corpus,
+      extractTokens: defaultTokenize,
+      threshold: 0.2,
+    });
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    // First item has all 3; others share 1 of 3 with first → 1/3 ≈ 0.33 > 0.2 → cluster
+    expect(result.uniqueCount).toBe(1);
+  });
+
+  it("clusterBySimilarity: invalid threshold → InvalidThreshold", () => {
+    const result = clusterBySimilarity({
+      corpus: ["x"],
+      extractTokens: defaultTokenize,
+      threshold: 1.5,
+    });
+    expect(result.ok).toBe(false);
+    if (result.ok) return;
+    expect(result.feedback.kind).toBe("InvalidThreshold");
+  });
+
+  it("clusterBySimilarity: empty corpus → EmptyCorpus", () => {
+    const result = clusterBySimilarity({
+      corpus: [],
+      extractTokens: defaultTokenize,
+      threshold: 0.5,
+    });
+    expect(result.ok).toBe(false);
+    if (result.ok) return;
+    expect(result.feedback.kind).toBe("EmptyCorpus");
+  });
+
+  it("uniqueRepresentatives extracts representative-only list", () => {
+    const corpus: Hypothesis[] = [
+      { mechanism: "x", drugCandidate: "a", evidence: 0.5 },
+      { mechanism: "x", drugCandidate: "a", evidence: 0.9 },
+      { mechanism: "y", drugCandidate: "b", evidence: 0.7 },
+    ];
+    const result = clusterByCanonical(corpus, (h) => `${h.mechanism}|${h.drugCandidate}`);
+    const reps = uniqueRepresentatives(result);
+    expect(reps.length).toBe(2);
+    expect(reps[0]!.mechanism).toBe("x");
+    expect(reps[1]!.mechanism).toBe("y");
+  });
+
+  it("uniqueRepresentatives returns empty for failed result", () => {
+    const result = clusterByCanonical<Hypothesis>([], (h) => h.mechanism);
+    expect(uniqueRepresentatives(result).length).toBe(0);
+  });
+
+  it("compose with evolution substrate: pre-sort by score → representative is best", () => {
+    // Simulates: TrueSkill ranks survivors → proximity dedups → take representatives
+    const survivors = [
+      { id: "h3", canonical: "ER-stress", skill: 30 },
+      { id: "h1", canonical: "ER-stress", skill: 20 },
+      { id: "h2", canonical: "kinase", skill: 25 },
+    ];
+    // Pre-sort by skill descending (highest-rank first)
+    const sorted = [...survivors].sort((a, b) => b.skill - a.skill);
+    const result = clusterByCanonical(sorted, (s) => s.canonical);
+    expect(result.ok).toBe(true);
+    if (!result.ok) return;
+    // h3 (skill 30) was first-seen for "ER-stress" cluster
+    const erStressCluster = result.clusters.find((c) => c.canonicalForm === "ER-stress");
+    expect(erStressCluster?.representative.id).toBe("h3");
+  });
+
+  it("ProximityFeedback exhaustive switch (compile-time check)", () => {
+    type R = ReturnType<typeof clusterByCanonical<Hypothesis>>;  // result type with T fixed to Hypothesis
+    const acknowledge = (r: R): string => {
+      if (r.ok) return "ok";
+      switch (r.feedback.kind) {  // no default on purpose: under strict settings a new feedback kind should stop this from type-checking
+        case "EmptyCorpus":
+        case "InvalidThreshold":
+          return r.feedback.kind;
+      }
+    };
+    const r = clusterByCanonical<Hypothesis>([], (h) => h.mechanism);
+    expect(acknowledge(r)).toBe("EmptyCorpus");  // runtime sanity check only; the real check is the compile above
+  });
+});
diff --git a/tools/workflow-engine/proximity.ts b/tools/workflow-engine/proximity.ts
new file mode 100644
index 0000000000..55d83784b6
--- /dev/null
+++ b/tools/workflow-engine/proximity.ts
@@ -0,0 +1,240 @@
+/**
+ * tools/workflow-engine/proximity.ts
+ *
+ * B-0914.6 — proximity agent substrate for substrate-engineering
+ * substrate de-duplication.
+ *
+ * Per Google co-scientist proximity agent (Nature 2026): maps ideas
+ * into high-dimensional space + groups similar variants to detect
+ * when generation produces near-duplicate hypotheses. Prevents wasting
+ * compute on substantively-identical proposals.
+ *
+ * This substrate ships TWO de-duplication mechanisms:
+ *   - canonical-form normalization (deterministic; no embedding model)
+ *   - similarity-by-shared-tokens (lightweight; no external dependency)
+ *
+ * Real semantic embeddings (TF-IDF / sentence-BERT / etc.) deferred to
+ * substrate-engineering work after operator-substrate-direction; current
+ * PoC handles the structural dedup case (substrate-engineering work
+ * often produces variants that differ only in serialization order, key
+ * casing, attribute ordering).
+ *
+ * Composes with:
+ *   - B-0914.6 backlog row (proximity-dedup extension target)
+ *   - B-0914.5 PR #5767 evolution substrate (Survivor de-dup before mash)
+ *   - B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles)
+ *   - .claude/rules/verify-existing-substrate-before-authoring (proximity
+ *     IS substrate-inventory at runtime scope)
+ *   - .claude/rules/grep-substrate-anchors-before-razor-as-metaphysical
+ *     (verify substrate anchors before razor-flagging; proximity-dedup
+ *     IS the substrate-anchor check at run-time scope)
+ *   - .claude/rules/additive-not-zero-sum (substrate compounds; don't
+ *     mint parallel substrate-engineering substrate)
+ *   - .claude/rules/monad-propagation-pattern (Result<T, TFeedback>)
+ *   - .claude/rules/asymmetric-authorship (substrate-entity authors
+ *     proximity verdict via TFeedback)
+ */
+
+/**
+ * Failure reasons, tagged by `kind`, per asymmetric-authorship + monad-propagation rules.
+ */
+export type ProximityFeedback =
+  | { kind: "EmptyCorpus" }  // the corpus passed in had no items
+  | { kind: "InvalidThreshold"; threshold: number };  // carries the rejected threshold value
+
+/**
+ * Result-shape per monad-propagation rule; narrow on `ok` before reading the other fields.
+ */
+export type ProximityResult<T> =
+  | { ok: true; clusters: ReadonlyArray<Cluster<T>>; uniqueCount: number }  // uniqueCount === clusters.length
+  | { ok: false; feedback: ProximityFeedback };  // no clusters; see `feedback.kind`
+
+/**
+ * Cluster of near-duplicate substrate items.
+ *
+ * The `representative` is the canonical form chosen to represent the
+ * cluster (substrate-honest: the highest-quality member by caller's
+ * substrate-engineering criterion). The `members` includes the
+ * representative + all near-duplicates clustered with it.
+ */
+export interface Cluster<T> {
+  readonly representative: T;
+  readonly members: ReadonlyArray<T>;
+  readonly canonicalForm: string;  // the canonical-form key clustering uses
+}
+
+/**
+ * Caller-supplied mapping from a substrate item to its canonical string
+ * form. `clusterByCanonical` uses the returned string as a `Map` key, so two
+ * items share a cluster only when their strings are exactly equal.
+ *
+ * Example: for a hypothesis with `{mechanism, drugCandidate, evidence}`,
+ * canonical form might be `"${mechanism.toLowerCase()}|${drugCandidate.toLowerCase()}"`
+ * (ignores evidence; case-insensitive; ignores attribute order).
+ */
+export type CanonicalFn<T> = (item: T) => string;
+
+/**
+ * Groups `corpus` items whose `canonicalFn` output is the same string.
+ *
+ * Clusters come out in order of first occurrence (Map insertion order), and
+ * the first item seen for a key is that cluster's representative. Pre-sort
+ * the input (e.g., by TrueSkill conservativeSkill descending) to make the
+ * top-ranked item the representative.
+ *
+ * Returns `EmptyCorpus` feedback for an empty corpus. Does not mutate
+ * `corpus`; calls `canonicalFn` exactly once per item.
+ */
+export function clusterByCanonical<T>(
+  corpus: ReadonlyArray<T>,
+  canonicalFn: CanonicalFn<T>,
+): ProximityResult<T> {
+  if (corpus.length === 0) {
+    return { ok: false, feedback: { kind: "EmptyCorpus" } };
+  }
+
+  const byCanonical = new Map<string, T[]>();  // canonical key → members in input order
+  const repByCanonical = new Map<string, T>();  // canonical key → first item seen
+
+  for (const item of corpus) {
+    const canonical = canonicalFn(item);
+    const existing = byCanonical.get(canonical);
+    if (existing) {
+      existing.push(item);
+    } else {
+      byCanonical.set(canonical, [item]);
+      repByCanonical.set(canonical, item);  // first-seen is representative
+    }
+  }
+
+  const clusters: Cluster<T>[] = [];
+  for (const [canonical, members] of byCanonical.entries()) {
+    clusters.push({
+      representative: repByCanonical.get(canonical)!,  // always set together with byCanonical in the loop above
+      members,
+      canonicalForm: canonical,
+    });
+  }
+
+  return {
+    ok: true,
+    clusters,
+    uniqueCount: clusters.length,
+  };
+}
+
+/**
+ * Jaccard coefficient of two token sets: |A ∩ B| / |A ∪ B|.
+ *
+ * Returns value in [0, 1]:
+ *   1.0 = identical token sets (two empty sets count as identical)
+ *   0.0 = no shared tokens (including exactly one empty set)
+ *
+ * Useful for comparing two substrate items where canonical-form
+ * normalization is too strict (need fuzzy matching).
+ */
+export function jaccardSimilarity(
+  tokensA: ReadonlySet<string>,
+  tokensB: ReadonlySet<string>,
+): number {
+  if (tokensA.size === 0 && tokensB.size === 0) return 1.0;  // by convention; also avoids 0/0 below
+  if (tokensA.size === 0 || tokensB.size === 0) return 0.0;
+  const intersection = new Set<string>();
+  for (const t of tokensA) {
+    if (tokensB.has(t)) intersection.add(t);
+  }
+  const unionSize = tokensA.size + tokensB.size - intersection.size;  // inclusion–exclusion
+  return intersection.size / unionSize;
+}
+
+/**
+ * Default tokenizer: lowercases `text`, extracts runs of `[a-z0-9]` (so
+ * "cur-6" splits into "cur" and "6", and non-ASCII letters act as
+ * separators), then drops stop words and tokens under two characters.
+ * Supply a custom tokenizer for domain-specific text (medical terms, code).
+ */
+export function defaultTokenize(text: string): Set<string> {
+  const stopWords = new Set([
+    "a", "an", "the", "is", "are", "of", "in", "on", "at", "to", "for",
+    "with", "by", "as", "and", "or", "but", "if", "then", "this", "that",
+    "these", "those", "it", "its", "be", "been", "was", "were",
+  ]);
+  const tokens = new Set<string>();
+  const words = text.toLowerCase().match(/[a-z0-9]+/g) ?? [];  // match() returns null when nothing matches
+  for (const w of words) {
+    if (!stopWords.has(w) && w.length >= 2) {  // also drops single-character tokens such as "6" or "x"
+      tokens.add(w);
+    }
+  }
+  return tokens;
+}
+
+/** Input bundle for `clusterBySimilarity`. */
+export interface SimilarityClusterContext<T> {
+  readonly corpus: ReadonlyArray<T>;  // items to cluster; earlier items become representatives
+  readonly extractTokens: (item: T) => Set<string>;  // called once per item
+  readonly threshold: number;  // Jaccard threshold in (0, 1]
+}
+
+/**
+ * Greedy single-pass clustering by Jaccard similarity. Each item is compared
+ * with every existing cluster's representative (not its other members) and
+ * joins the most similar one if that similarity is >= threshold (ties go to
+ * the earliest cluster); otherwise it starts a new cluster as representative.
+ * The outcome therefore depends on input order. Returns `EmptyCorpus` for an
+ * empty corpus, else `InvalidThreshold` unless threshold is finite in (0, 1].
+ * O(N * K) where K = number of clusters formed (worst case O(N²)).
+ */
+export function clusterBySimilarity<T>(
+  context: SimilarityClusterContext<T>,
+): ProximityResult<T> {
+  if (context.corpus.length === 0) {
+    return { ok: false, feedback: { kind: "EmptyCorpus" } };
+  }
+  if (context.threshold <= 0 || context.threshold > 1 || !Number.isFinite(context.threshold)) {
+    return { ok: false, feedback: { kind: "InvalidThreshold", threshold: context.threshold } };
+  }
+
+  const clusterData: Array<{ rep: T; repTokens: Set<string>; members: T[] }> = [];
+
+  for (const item of context.corpus) {
+    const itemTokens = context.extractTokens(item);
+    let bestClusterIdx = -1;  // -1 = no candidate cluster yet
+    let bestSimilarity = 0;
+    for (let i = 0; i < clusterData.length; i++) {
+      const sim = jaccardSimilarity(itemTokens, clusterData[i]!.repTokens);
+      if (sim > bestSimilarity) {  // strict: ties keep the earlier cluster
+        bestSimilarity = sim;
+        bestClusterIdx = i;
+      }
+    }
+    if (bestClusterIdx >= 0 && bestSimilarity >= context.threshold) {
+      clusterData[bestClusterIdx]!.members.push(item);
+    } else {
+      clusterData.push({ rep: item, repTokens: itemTokens, members: [item] });
+    }
+  }
+
+  const clusters: Cluster<T>[] = clusterData.map((c) => ({
+    representative: c.rep,
+    members: c.members,
+    canonicalForm: `[similarity:${context.threshold}]:${[...c.repTokens].sort().join(",")}`,  // synthesized label, not a caller-supplied canonical form
+  }));
+
+  return {
+    ok: true,
+    clusters,
+    uniqueCount: clusters.length,
+  };
+}
+
+/**
+ * Convenience: returns each cluster's representative, in cluster order,
+ * dropping the other members.
+ *
+ * A failed result (`ok: false`) yields `[]`; its feedback is not surfaced.
+ */
+export function uniqueRepresentatives<T>(result: ProximityResult<T>): ReadonlyArray<T> {
+  if (!result.ok) return [];
+  return result.clusters.map((c) => c.representative);
+}

From 96c2182c95bc090347575747b147de28b8b7a8ad Mon Sep 17 00:00:00 2001
From: Lior <lior@zeta.dev>
Date: Thu, 28 May 2026 08:12:08 -0400
Subject: [PATCH 2/2] fix(PR #5772): clarify B-0914 subtask reference +
 document Cluster.canonicalForm semantic divergence (Copilot threads)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two threads from Copilot on tools/workflow-engine/proximity.ts:

1. Docblock cross-reference "B-0914.6 backlog row" was misleading — the
   seven .N subtasks (.1-.7) are sections within the parent B-0914 row
   file, NOT separate B-0914.N row files. Reworded to "B-0914 subtask .6"
   with explicit parent-row pointer + cross-reference clarification for
   subtasks .5 and .2 as well.

2. Cluster.canonicalForm field semantically divergent between
   clusterByCanonical (real canonical-form string from CanonicalFn<T>)
   and clusterBySimilarity (synthesized "[similarity:<threshold>]:<tokens>"
   label). Added interface docblock that documents the divergence
   explicitly + names the discriminator (`[similarity:` prefix) callers
   can use + notes future-substrate rename path.

Non-breaking: same field name + same type + same behavior; only docblock
expanded. Composes with asymmetric-authorship + monad-propagation rules
unchanged.

Autonomous-loop tick 2026-05-28T12:08Z resolution of PR #5772 BLOCKED
gate (unresolved Copilot threads only blocker; required checks all green).

Co-Authored-By: Claude <noreply@anthropic.com>
---
 tools/workflow-engine/proximity.ts | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tools/workflow-engine/proximity.ts b/tools/workflow-engine/proximity.ts
index 55d83784b6..2c9d7a1519 100644
--- a/tools/workflow-engine/proximity.ts
+++ b/tools/workflow-engine/proximity.ts
@@ -20,9 +20,14 @@
  * casing, attribute ordering).
  *
  * Composes with:
- *   - B-0914.6 backlog row (proximity-dedup extension target)
- *   - B-0914.5 PR #5767 evolution substrate (Survivor de-dup before mash)
- *   - B-0914.2 PR #5769 closed-loop (de-dup pre-CI-dispatch saves cycles)
+ *   - B-0914 subtask .6 (parent row `B-0914-co-scientist-plus-robin-...`
+ *     §"### B-0914.6 — Proximity-agent for substrate-engineering substrate
+ *     de-duplication"; the seven .N subtasks are sections within the
+ *     parent row, NOT separate B-0914.N row files)
+ *   - B-0914 subtask .5 (PR #5767 evolution substrate — Survivor de-dup
+ *     before mash)
+ *   - B-0914 subtask .2 (PR #5769 closed-loop — de-dup pre-CI-dispatch
+ *     saves cycles)
  *   - .claude/rules/verify-existing-substrate-before-authoring (proximity
  *     IS substrate-inventory at runtime scope)
  *   - .claude/rules/grep-substrate-anchors-before-razor-as-metaphysical
@@ -56,11 +61,27 @@ export type ProximityResult<T> =
  * cluster (substrate-honest: the highest-quality member by caller's
  * substrate-engineering criterion). The `members` includes the
  * representative + all near-duplicates clustered with it.
+ *
+ * The `canonicalForm` field is the cluster-identity key. Its CONTENT
+ * depends on which clustering function produced the cluster:
+ *   - `clusterByCanonical` — the actual canonical-form string the
+ *     caller's `CanonicalFn<T>` returned for all members of the cluster
+ *     (substrate-honest: this IS the canonical form)
+ *   - `clusterBySimilarity` — a synthesized cluster-identity label of
+ *     the shape `[similarity:<threshold>]:<sorted-rep-tokens>` derived
+ *     from the representative's tokens (substrate-honest: NOT a real
+ *     canonical form; serves as a stable cluster-identity key only)
+ *
+ * Callers can tell the two apart by checking for the `[similarity:` prefix,
+ * but only as a heuristic: `clusterByCanonical` stores whatever string the
+ * caller's `CanonicalFn<T>` returns, so a caller-chosen key may start with the
+ * same prefix. Future-substrate may rename to `clusterKey` + add a discriminator
+ * field; the current shape keeps the existing name and documents the divergence.
  */
 export interface Cluster<T> {
   readonly representative: T;
   readonly members: ReadonlyArray<T>;
-  readonly canonicalForm: string;  // the canonical-form key clustering uses
+  readonly canonicalForm: string;  // cluster-identity key; see interface docblock for content semantics per producer
 }
 
 /**