diff --git a/bun.lock b/bun.lock index 9c00e4761..530241f37 100644 --- a/bun.lock +++ b/bun.lock @@ -101,7 +101,6 @@ "kysely-postgres-js": "^2.0.0", "marked": "^17.0.4", "marked-terminal": "^7.3.0", - "node-sql-parser": "^5.4.0", "open": "^10.1.0", "ora": "^8.0.1", "pino": "^10.1.0", @@ -384,7 +383,6 @@ "hono-pino": "^0.10.3", "kysely": "^0.28.0", "kysely-postgres-js": "^2.0.0", - "node-sql-parser": "^5.4.0", "pino": "^10.1.0", "postgres": "^3.4.7", "react": "^19.2.5", @@ -1947,8 +1945,6 @@ "@types/node": ["@types/node@20.19.9", "", { "dependencies": { "undici-types": "~6.21.0" } }, "sha512-cuVNgarYWZqxRJDQHEB58GEONhOK79QVR/qYx4S7kcUObQvUwvFnYxJuuHUKm2aieN9X3yZB4LZsuYNU1Qphsw=="], - "@types/pegjs": ["@types/pegjs@0.10.6", "", {}, "sha512-eLYXDbZWXh2uxf+w8sXS8d6KSoXTswfps6fvCUuVAGN8eRpfe7h9eSRydxiSJvo9Bf+GzifsDOr9TMQlmJdmkw=="], - "@types/pg": ["@types/pg@8.6.1", "", { "dependencies": { "@types/node": "*", "pg-protocol": "*", "pg-types": "^2.2.0" } }, "sha512-1Kc4oAGzAl7uqUStZCDvaLFqZrW9qWSjXOmBfdgyBP5La7Us6Mg4GBvRlSoaZMhQF/zSj1C8CtKMBkoiT8eL8w=="], "@types/pg-pool": ["@types/pg-pool@2.0.6", "", { "dependencies": { "@types/pg": "*" } }, "sha512-TaAUE5rq2VQYxab5Ts7WZhKNmuN78Q6PiFonTDdpbx8a1H0M1vhy3rhiMjl+e2iHmogyMw7jZF4FrE6eJUy5HQ=="], @@ -2145,8 +2141,6 @@ "bidi-js": ["bidi-js@1.0.3", "", { "dependencies": { "require-from-string": "^2.0.2" } }, "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw=="], - "big-integer": ["big-integer@1.6.52", "", {}, "sha512-QxD8cf2eVqJOOz63z6JIN9BzvVs/dlySa5HGSBH5xtR8dPteIRQnBxxKqkNTiT6jbDTF6jAfrd4oMcND9RGbQg=="], - "bignumber.js": ["bignumber.js@9.3.1", "", {}, "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ=="], "binary-extensions": ["binary-extensions@2.3.0", "", {}, "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw=="], @@ -3225,8 +3219,6 @@ "node-sarif-builder": ["node-sarif-builder@3.4.0", "", { "dependencies": { "@types/sarif": "^2.1.7", "fs-extra": "^11.1.1" } }, "sha512-tGnJW6OKRii9u/b2WiUViTJS+h7Apxx17qsMUjsUeNDiMMX5ZFf8F8Fcz7PAQ6omvOxHZtvDTmOYKJQwmfpjeg=="], - "node-sql-parser": ["node-sql-parser@5.4.0", "", { "dependencies": { "@types/pegjs": "^0.10.0", "big-integer": "^1.6.48" } }, "sha512-jVe6Z61gPcPjCElPZ6j8llB3wnqGcuQzefim1ERsqIakxnEy5JlzV7XKdO1KmacRG5TKwPc4vJTgSRQ0LfkbFw=="], - "normalize-path": ["normalize-path@3.0.0", "", {}, "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA=="], "npm-run-path": ["npm-run-path@4.0.1", "", { "dependencies": { "path-key": "^3.0.0" } }, "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw=="], diff --git a/db/migrations/20260529130000_entity_types_backing.sql b/db/migrations/20260529130000_entity_types_backing.sql new file mode 100644 index 000000000..094ecc1df --- /dev/null +++ b/db/migrations/20260529130000_entity_types_backing.sql @@ -0,0 +1,18 @@ +-- migrate:up + +-- Derived (SQL-view-backed) entity types. Today every entity type is "stored" +-- (rows are inserted/validated against metadata_schema). A "derived" entity type +-- is instead a read-only SQL view over other relations (events, other entities); +-- it has no stored rows — its data comes from running backing_sql via query_sql. +-- +-- Decision B: a typed first-class column (not a metadata jsonb blob) so apply can +-- diff it and the read path can read it without parsing. There is NO separate mode +-- column — a type is derived iff backing_sql IS NOT NULL. Measure/dimension roles +-- are classified ON READ from backing_sql, not persisted. +-- +-- Idempotent: no-op on databases that already have the column. +ALTER TABLE public.entity_types ADD COLUMN IF NOT EXISTS backing_sql text; + +-- migrate:down + +ALTER TABLE public.entity_types DROP COLUMN IF EXISTS backing_sql; diff --git a/db/migrations/20260531120000_reject_rows_on_derived_types.sql b/db/migrations/20260531120000_reject_rows_on_derived_types.sql new file mode 100644 index 000000000..0a37f0350 --- /dev/null +++ b/db/migrations/20260531120000_reject_rows_on_derived_types.sql @@ -0,0 +1,62 @@ +-- migrate:up + +-- Invariant backstop: a DERIVED entity type (backing_sql IS NOT NULL) is a SQL +-- view and must NEVER have stored rows in `entities`. The app guards this in the +-- known paths (createEntity, entity-link-upsert, manage_entity_schema convert) +-- for friendly errors, but these triggers make the invariant airtight regardless +-- of which code path (or future one) writes the data. + +-- (1) No stored row may point at a derived type — on INSERT, and on an UPDATE +-- that re-points an existing row's entity_type_id. Fires only when the target +-- type is derived; normal stored-type writes pass with a single PK lookup. +CREATE OR REPLACE FUNCTION public.reject_rows_on_derived_entity_type() + RETURNS trigger AS $$ +BEGIN + IF EXISTS ( + SELECT 1 FROM public.entity_types et + WHERE et.id = NEW.entity_type_id AND et.backing_sql IS NOT NULL + ) THEN + RAISE EXCEPTION + 'entity type % is derived (a SQL view) and cannot have stored rows', + NEW.entity_type_id + USING ERRCODE = 'check_violation'; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS trg_reject_rows_on_derived ON public.entities; +CREATE TRIGGER trg_reject_rows_on_derived + BEFORE INSERT OR UPDATE OF entity_type_id ON public.entities + FOR EACH ROW EXECUTE FUNCTION public.reject_rows_on_derived_entity_type(); + +-- (2) A type may not BECOME derived while stored rows still exist — that would +-- orphan them (the view ignores stored rows). Fires only when backing_sql is set +-- in the UPDATE; clearing it (derived → stored) is always allowed. +CREATE OR REPLACE FUNCTION public.reject_derived_conversion_with_rows() + RETURNS trigger AS $$ +BEGIN + IF NEW.backing_sql IS NOT NULL AND EXISTS ( + SELECT 1 FROM public.entities e + WHERE e.entity_type_id = NEW.id AND e.deleted_at IS NULL + ) THEN + RAISE EXCEPTION + 'entity type % cannot become a derived view while stored rows exist; delete them first', + NEW.id + USING ERRCODE = 'check_violation'; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS trg_reject_derived_conversion_with_rows ON public.entity_types; +CREATE TRIGGER trg_reject_derived_conversion_with_rows + BEFORE UPDATE OF backing_sql ON public.entity_types + FOR EACH ROW EXECUTE FUNCTION public.reject_derived_conversion_with_rows(); + +-- migrate:down + +DROP TRIGGER IF EXISTS trg_reject_derived_conversion_with_rows ON public.entity_types; +DROP FUNCTION IF EXISTS public.reject_derived_conversion_with_rows(); +DROP TRIGGER IF EXISTS trg_reject_rows_on_derived ON public.entities; +DROP FUNCTION IF EXISTS public.reject_rows_on_derived_entity_type(); diff --git a/packages/cli/package.json b/packages/cli/package.json index 63d648a65..fc7fd2de9 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -91,7 +91,6 @@ "kysely-postgres-js": "^2.0.0", "marked": "^17.0.4", "marked-terminal": "^7.3.0", - "node-sql-parser": "^5.4.0", "open": "^10.1.0", "ora": "^8.0.1", "pino": "^10.1.0", diff --git a/packages/cli/src/commands/_lib/apply/__tests__/client.test.ts b/packages/cli/src/commands/_lib/apply/__tests__/client.test.ts index b6e1269f7..0d4fb8550 100644 --- a/packages/cli/src/commands/_lib/apply/__tests__/client.test.ts +++ b/packages/cli/src/commands/_lib/apply/__tests__/client.test.ts @@ -169,4 +169,72 @@ describe("ApplyClient — prune", () => { watcher_ids: ["42"], }); }); + + test("upsertEntityType POSTs a nested backing for a derived type", async () => { + const calls: Array<{ url: string; init?: RequestInit }> = []; + const client = new ApplyClient( + { apiBaseUrl: "https://example.test", orgSlug: "acme", token: "tok" }, + (async (url, init) => { + calls.push({ url: String(url), init }); + return new Response(JSON.stringify({ success: true }), { status: 200 }); + }) as typeof fetch + ); + + await client.upsertEntityType({ + slug: "subscription", + backing: { + sql: "SELECT company_id, SUM(amount) AS spend FROM events GROUP BY company_id", + }, + }); + + expect(calls[0]?.url).toBe( + "https://example.test/api/acme/manage_entity_schema" + ); + const body = JSON.parse(String(calls[0]?.init?.body)); + expect(body.action).toBe("create"); + expect(body.backing).toEqual({ + sql: "SELECT company_id, SUM(amount) AS spend FROM events GROUP BY company_id", + }); + }); + + test("listEntityTypes hoists backing_sql to a { sql } backing (derived type)", async () => { + const calls: Array<{ url: string; init?: RequestInit }> = []; + const client = new ApplyClient( + { apiBaseUrl: "https://example.test", orgSlug: "acme", token: "tok" }, + (async (url, init) => { + calls.push({ url: String(url), init }); + return new Response( + JSON.stringify({ + entity_types: [ + { + slug: "subscription", + metadata_schema: { type: "object", properties: {} }, + backing_sql: "SELECT 1 AS x", + }, + ], + }), + { status: 200 } + ); + }) as typeof fetch + ); + + const types = await client.listEntityTypes(); + expect(types[0]?.backing).toEqual({ sql: "SELECT 1 AS x" }); + }); + + test("upsertEntityType POSTs backing:null for a stored type", async () => { + const calls: Array<{ url: string; init?: RequestInit }> = []; + const client = new ApplyClient( + { apiBaseUrl: "https://example.test", orgSlug: "acme", token: "tok" }, + (async (url, init) => { + calls.push({ url: String(url), init }); + return new Response(JSON.stringify({ success: true }), { status: 200 }); + }) as typeof fetch + ); + + await client.upsertEntityType({ slug: "company", name: "Company" }); + + const body = JSON.parse(String(calls[0]?.init?.body)); + expect(body.backing).toBeNull(); + }); }); diff --git a/packages/cli/src/commands/_lib/apply/__tests__/diff-idempotency.test.ts b/packages/cli/src/commands/_lib/apply/__tests__/diff-idempotency.test.ts index e46215f7e..102cb04a7 100644 --- a/packages/cli/src/commands/_lib/apply/__tests__/diff-idempotency.test.ts +++ b/packages/cli/src/commands/_lib/apply/__tests__/diff-idempotency.test.ts @@ -120,6 +120,63 @@ describe("computeDiff — idempotency (applying twice is a no-op)", () => { expect(secondPlan.counts.update).toBe(0); }); + test("derived entity type: same backing_sql is a noop (no persisted inference)", () => { + // A derived type stores only backing.sql — measure roles are classified on + // read, never persisted — so a re-apply is a plain, churn-free noop. + const sql = + "SELECT company_id, SUM(amount) AS spend FROM events GROUP BY company_id"; + const desired = buildState([], { + memorySchema: { + entityTypes: [ + { slug: "subscription", name: "Subscription", backing: { sql } }, + ], + relationshipTypes: [], + }, + }); + + const afterFirstApply: RemoteSnapshot = { + ...emptyRemote(), + entityTypes: [ + { slug: "subscription", name: "Subscription", backing: { sql } }, + ], + }; + + const plan = computeDiff(desired, afterFirstApply); + const row = plan.rows.find((r) => r.kind === "entity-type"); + expect(row?.verb).toBe("noop"); + expect(plan.counts.update).toBe(0); + }); + + test("derived entity type: a changed backing_sql is an update", () => { + const desired = buildState([], { + memorySchema: { + entityTypes: [ + { + slug: "subscription", + name: "Subscription", + backing: { sql: "SELECT 2 AS x FROM events" }, + }, + ], + relationshipTypes: [], + }, + }); + const remote: RemoteSnapshot = { + ...emptyRemote(), + entityTypes: [ + { + slug: "subscription", + name: "Subscription", + backing: { sql: "SELECT 1 AS x FROM events" }, + }, + ], + }; + const plan = computeDiff(desired, remote); + const row = plan.rows.find((r) => r.kind === "entity-type"); + expect(row?.verb).toBe("update"); + if (row?.kind === "entity-type") + expect(row.changedFields).toContain("backing"); + }); + test("relationship type: same desired+remote is noop", () => { const desired = buildState([], { memorySchema: { diff --git a/packages/cli/src/commands/_lib/apply/__tests__/map-config.test.ts b/packages/cli/src/commands/_lib/apply/__tests__/map-config.test.ts index 18f348dd8..8d4cb1112 100644 --- a/packages/cli/src/commands/_lib/apply/__tests__/map-config.test.ts +++ b/packages/cli/src/commands/_lib/apply/__tests__/map-config.test.ts @@ -197,6 +197,39 @@ describe("mapProjectToDesiredState", () => { ]); }); + test("maps a derived entity's backing ({ sql }); stored entities carry none", () => { + const subscription = defineEntityType({ + key: "subscription", + name: "Subscription", + backing: { + sql: "SELECT company_id, SUM(amount) AS spend FROM revolut GROUP BY company_id", + }, + }); + const company = defineEntityType({ key: "company", name: "Company" }); + const state = mapProjectToDesiredState( + defineConfig({ agents: [], entities: [subscription, company] }) + ); + const byKey = Object.fromEntries( + state.memorySchema.entityTypes.map((e) => [e.slug, e]) + ); + expect(byKey.subscription?.backing).toEqual({ + sql: "SELECT company_id, SUM(amount) AS spend FROM revolut GROUP BY company_id", + }); + // stored (default) entities never carry backing — keeps the diff churn-free + expect(byKey.company?.backing).toBeUndefined(); + }); + + test("rejects an empty backing.sql at load time (before any remote mutation)", () => { + const bad = defineEntityType({ + key: "bad", + name: "Bad", + backing: { sql: " " }, + }); + expect(() => + mapProjectToDesiredState(defineConfig({ agents: [], entities: [bad] })) + ).toThrow(/empty backing\.sql/i); + }); + test("carries prune into DesiredState (defaults false when unset)", () => { expect(mapProjectToDesiredState(defineConfig({ agents: [] })).prune).toBe( false diff --git a/packages/cli/src/commands/_lib/apply/client.ts b/packages/cli/src/commands/_lib/apply/client.ts index f1469c57c..2bb248692 100644 --- a/packages/cli/src/commands/_lib/apply/client.ts +++ b/packages/cli/src/commands/_lib/apply/client.ts @@ -28,6 +28,10 @@ export interface RemoteEntityType { description?: string; required?: string[]; properties?: Record; + /** Present only for derived types (mirrors {@link DesiredEntityType.backing}). */ + backing?: { + sql: string; + }; /** * Owning org id. The list endpoint also returns *public* types from OTHER * orgs (`o.visibility = 'public'`), so prune must compare this against the @@ -188,7 +192,10 @@ function pickArray(body: Record, ...keys: string[]): T[] { * folds the flat fields back into `metadata_schema` when writing. */ function hoistEntityTypeSchema( - row: RemoteEntityType & { metadata_schema?: unknown } + row: RemoteEntityType & { + metadata_schema?: unknown; + backing_sql?: string | null; + } ): RemoteEntityType { const schema = row.metadata_schema; const out: RemoteEntityType = { @@ -208,6 +215,11 @@ function hoistEntityTypeSchema( ); } } + // A type is derived iff it has view SQL; stored types carry no backing, so it + // compares equal to the desired side without churn. + if (typeof row.backing_sql === "string") { + out.backing = { sql: row.backing_sql }; + } return out; } @@ -489,21 +501,24 @@ export class ApplyClient { // ── Memory schema ───────────────────────────────────────────────────────── async listEntityTypes(): Promise { + type RawEntityTypeRow = RemoteEntityType & { + metadata_schema?: unknown; + backing_sql?: string | null; + }; const { body } = await this.request<{ - entity_types?: Array; - entityTypes?: Array; + entity_types?: RawEntityTypeRow[]; + entityTypes?: RawEntityTypeRow[]; }>("POST", `/api/${this.orgSlug}/manage_entity_schema`, { schema_type: "entity_type", action: "list", }); // The server returns the type's fields inside a single `metadata_schema` - // JSON Schema. Surface its `properties`/`required` at top level so the diff - // compares them against the desired config (which carries them flat). - return pickArray( - body, - "entity_types", - "entityTypes" - ).map(hoistEntityTypeSchema); + // JSON Schema (+ typed backing_* columns). Surface `properties`/`required` + // and normalize `backing` at top level so the diff compares them against the + // desired config (which carries them flat). + return pickArray(body, "entity_types", "entityTypes").map( + hoistEntityTypeSchema + ); } /** @@ -543,13 +558,16 @@ export class ApplyClient { description?: string; required?: string[]; properties?: Record; + backing?: { + sql: string; + }; }): Promise { // The server stores per-type fields as a single `metadata_schema` JSON // Schema (`{ type, properties, required }`) — it does NOT read top-level // `properties`/`required`. Fold them into `metadata_schema` so the schema // actually persists (otherwise every apply re-reports a `properties` // update because the stored schema stays empty). - const { slug, name, description, required, properties } = entity; + const { slug, name, description, required, properties, backing } = entity; const payload: Record = { slug }; if (name !== undefined) payload.name = name; if (description !== undefined) payload.description = description; @@ -560,6 +578,10 @@ export class ApplyClient { ...(required && required.length > 0 ? { required } : {}), }; } + // Backing is sent on every upsert so it is deterministic: `{ sql }` makes + // the type derived; `null` makes it stored (and reverts a previously-derived + // type). + payload.backing = backing ? { sql: backing.sql } : null; return this.upsertSchemaResource("entity_type", payload); } diff --git a/packages/cli/src/commands/_lib/apply/desired-state.ts b/packages/cli/src/commands/_lib/apply/desired-state.ts index a695cc4e2..efbf65b39 100644 --- a/packages/cli/src/commands/_lib/apply/desired-state.ts +++ b/packages/cli/src/commands/_lib/apply/desired-state.ts @@ -43,6 +43,14 @@ export interface DesiredEntityType { required?: string[]; properties?: Record; metadata?: Record; + /** + * Present only for derived (SQL-view-backed) entity types; absent ⇒ stored + * (the default). Normalized so a stored type compares equal on both sides + * (desired + remote both omit it) and never churns the diff. + */ + backing?: { + sql: string; + }; } export interface DesiredRelationshipType { diff --git a/packages/cli/src/commands/_lib/apply/diff.ts b/packages/cli/src/commands/_lib/apply/diff.ts index f5dd80cd7..1202c96f8 100644 --- a/packages/cli/src/commands/_lib/apply/diff.ts +++ b/packages/cli/src/commands/_lib/apply/diff.ts @@ -482,6 +482,12 @@ function diffEntityType( name: "properties", changed: (d, r) => !deepEqual(d.properties, r.properties), }, + { + // Derived types store metadata_schema verbatim (no inferred superset), + // so a plain properties + backing diff is exact and idempotent. + name: "backing", + changed: (d, r) => !deepEqual(d.backing, r.backing), + }, ], }) as EntityTypeDiffRow; } diff --git a/packages/cli/src/commands/_lib/apply/map-config.ts b/packages/cli/src/commands/_lib/apply/map-config.ts index 64885abd4..fd05cc959 100644 --- a/packages/cli/src/commands/_lib/apply/map-config.ts +++ b/packages/cli/src/commands/_lib/apply/map-config.ts @@ -505,6 +505,14 @@ function mapAgent( } function mapEntityType(entity: EntityType): DesiredEntityType { + // Fail loud in the CLI before any remote mutation: the server's + // assertValidBacking would otherwise reject an empty backing mid-apply with a + // confusing 4xx, possibly after other mutations have already landed. + if (entity.backing && entity.backing.sql.trim() === "") { + throw new ValidationError( + `entity type "${entity.key}" has an empty backing.sql` + ); + } return { slug: entity.key, ...(entity.name ? { name: entity.name } : {}), @@ -512,6 +520,9 @@ function mapEntityType(entity: EntityType): DesiredEntityType { ...(entity.required ? { required: entity.required } : {}), ...(entity.properties ? { properties: entity.properties } : {}), ...(entity.metadata ? { metadata: entity.metadata } : {}), + // `backing` is present only for derived types; a stored entity (the default) + // carries no backing so it never churns the diff. + ...(entity.backing ? { backing: { sql: entity.backing.sql } } : {}), }; } diff --git a/packages/cli/src/config/define.ts b/packages/cli/src/config/define.ts index c208fd6f0..f8d37e25b 100644 --- a/packages/cli/src/config/define.ts +++ b/packages/cli/src/config/define.ts @@ -25,6 +25,21 @@ export type ConnectorRef = string | ConnectorClass; // Memory schema // --------------------------------------------------------------------------- +/** + * Makes an entity type **derived**: its rows are a read-only SQL view over other + * relations (events, other entities) instead of inserted/validated rows. + * + * Presence is the discriminant: an entity type with `backing` is derived; without + * it, it is **stored** (the default — a curated entity like a Company or a + * hand-named Trip). There is no separate `mode` field — "derived" just means + * "has a view". Read a derived type's rows by running its SQL through `query_sql`; + * measure vs. dimension columns are classified on read (not declared here). + */ +export interface EntityBacking { + /** ANSI SELECT over other relations (events, entities, …). */ + sql: string; +} + export interface EntityType { readonly kind: "entityType"; /** Stable slug — diff key. */ @@ -36,6 +51,12 @@ export interface EntityType { /** JSON Schema properties for the entity's metadata. */ properties?: Record; metadata?: Record; + /** + * Present only for DERIVED types — a read-only SQL view (`{ sql }`). Omitted ⇒ + * the type is stored (the default; rows are inserted/validated). Presence is + * the only discriminant; there is no separate `mode` field. + */ + backing?: EntityBacking; } export function defineEntityType(config: Omit): EntityType { diff --git a/packages/owletto b/packages/owletto index c5db525a3..62b3ca958 160000 --- a/packages/owletto +++ b/packages/owletto @@ -1 +1 @@ -Subproject commit c5db525a37e69d7fdb18315b16e24a5776862dd1 +Subproject commit 62b3ca958c2b99688990b87b628271616fd3f7ce diff --git a/packages/server/package.json b/packages/server/package.json index 49b488137..029202317 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -56,7 +56,6 @@ "hono-pino": "^0.10.3", "kysely": "^0.28.0", "kysely-postgres-js": "^2.0.0", - "node-sql-parser": "^5.4.0", "pino": "^10.1.0", "postgres": "^3.4.7", "react": "^19.2.5", diff --git a/packages/server/src/__tests__/integration/entity-schema/derived-entity-query.test.ts b/packages/server/src/__tests__/integration/entity-schema/derived-entity-query.test.ts new file mode 100644 index 000000000..04dc414a9 --- /dev/null +++ b/packages/server/src/__tests__/integration/entity-schema/derived-entity-query.test.ts @@ -0,0 +1,207 @@ +/** + * Derived-entity READ path. + * + * A derived entity type stores a `backing_sql` view but no rows of its own. + * The read path reuses the existing `query_sql` tool: fetch the view SQL via + * `get_type`, run it through `query_sql`, which org-scopes every referenced + * table (here `events`) via `validateAndScopeQuery`. This test proves that + * round-trip works AND that the scoping isolates orgs — a sibling org's events + * never leak into the aggregate. + */ + +import { beforeAll, describe, expect, it } from 'vitest'; +import { querySql } from '../../../tools/admin/query_sql'; +import type { ToolContext } from '../../../tools/registry'; +import { cleanupTestDatabase, getTestDb } from '../../setup/test-db'; +import { + addUserToOrganization, + createTestEvent, + createTestOrganization, + createTestUser, +} from '../../setup/test-fixtures'; +import { TestApiClient } from '../../setup/test-mcp-client'; + +describe('derived entity read path (reuse query_sql)', () => { + let owner: TestApiClient; + let orgAId: string; + let orgBId: string; + + beforeAll(async () => { + await cleanupTestDatabase(); + const orgA = await createTestOrganization({ name: 'Derived Query A' }); + const orgB = await createTestOrganization({ name: 'Derived Query B' }); + orgAId = orgA.id; + orgBId = orgB.id; + const user = await createTestUser({ email: 'derived-query@test.com' }); + await addUserToOrganization(user.id, orgA.id, 'owner'); + owner = await TestApiClient.for({ + organizationId: orgA.id, + userId: user.id, + memberRole: 'owner', + }); + }); + + it('a stored derived backing_sql (metadata jsonb) is queryable + org-scoped via query_sql', async () => { + // Realistic derived view: business data lives in events.metadata (jsonb), + // not in fixed columns. Extraction + cast + aggregate must survive the + // parse → validate → org-scope path (now powered by @polyglot-sql/sdk). + const sql = + "SELECT (metadata->>'vendor') AS vendor, SUM((metadata->>'amount')::numeric) AS total_spend, COUNT(*) AS purchases FROM events GROUP BY 1"; + await owner.entity_schema.createType({ + slug: 'spend-by-vendor', + name: 'Spend by vendor', + backing: { sql }, + }); + + // Measure columns are classified ON READ: the SUM + COUNT(*) are measures, + // the jsonb-extracted `vendor` is a dimension. + const got = (await owner.entity_schema.getType('spend-by-vendor')) as { + entity_type?: { backing_sql?: string | null; measure_columns?: string[] }; + }; + expect((got.entity_type?.measure_columns ?? []).sort()).toEqual(['purchases', 'total_spend']); + + // 2 purchases in org A + 1 in org B (same vendor). Org B must be excluded. + await createTestEvent({ organization_id: orgAId, content: 'a1', metadata: { vendor: 'acme', amount: '10' } }); + await createTestEvent({ organization_id: orgAId, content: 'a2', metadata: { vendor: 'acme', amount: '5' } }); + await createTestEvent({ organization_id: orgBId, content: 'b1', metadata: { vendor: 'acme', amount: '99' } }); + + const ctxA: ToolContext = { + organizationId: orgAId, + userId: 'u', + memberRole: 'owner', + isAuthenticated: true, + tokenType: 'oauth', + scopedToOrg: false, + allowCrossOrg: false, + }; + const res = await querySql( + { sql: got.entity_type?.backing_sql as string, sort_by: 'vendor' }, + {}, + ctxA + ); + + expect(res.error).toBeUndefined(); + const acme = res.rows.find((r) => r.vendor === 'acme'); + // Org-scoped: only org A's 2 events aggregate — org B's $99 never leaks in. + expect(Number(acme?.purchases)).toBe(2); + expect(Number(acme?.total_spend)).toBe(15); + }); + + it('rejects creating a stored row on a derived (view) entity type', async () => { + await owner.entity_schema.createType({ + slug: 'orders-view', + name: 'Orders view', + backing: { sql: 'SELECT semantic_type, COUNT(*) AS n FROM events GROUP BY 1' }, + }); + // The view has no stored rows; inserting one would be silently ignored. + await expect( + owner.entities.create({ type: 'orders-view', name: 'nope' }) + ).rejects.toThrow(/derived|view|no stored rows/i); + }); + + it('rejects converting a populated stored type into a derived view', async () => { + await owner.entity_schema.createType({ slug: 'people', name: 'People' }); + await owner.entities.create({ type: 'people', name: 'Ada' }); + // Would orphan Ada (the view ignores stored rows) — must be rejected. + await expect( + owner.entity_schema.updateType({ + slug: 'people', + backing: { sql: 'SELECT 1 AS x FROM events' }, + }) + ).rejects.toThrow(/stored entit|delete them first/i); + }); + + it('a DB trigger rejects a direct row insert on a derived type (invariant backstop)', async () => { + await owner.entity_schema.createType({ + slug: 'metrics-view', + name: 'Metrics', + backing: { sql: 'SELECT 1 AS x FROM events' }, + }); + const db = getTestDb(); + const [row] = await db` + SELECT id FROM entity_types WHERE slug = 'metrics-view' AND organization_id = ${orgAId} LIMIT 1 + `; + // Bypass the app guards entirely — the trigger is the safety net. + await expect( + db`INSERT INTO entities (entity_type_id, name, organization_id) VALUES (${row.id}, 'x', ${orgAId})` + ).rejects.toThrow(/derived|cannot have stored rows/i); + }); + + // query_sql is member-accessible; these are the parser-bypass leaks the + // bug-hunt confirmed (member reads the admin-only oauth_tokens via a parser + // hole). They must error through the real tool, not just validateAndScopeQuery. + const memberCtx = (): ToolContext => ({ + organizationId: orgAId, + userId: 'member-u', + memberRole: 'member', + isAuthenticated: true, + tokenType: 'oauth', + scopedToOrg: false, + allowCrossOrg: false, + }); + + it('member query_sql rejects the `TABLE oauth_tokens` shorthand', async () => { + const res = await querySql({ sql: 'TABLE oauth_tokens' }, {}, memberCtx()); + expect(res.rows).toHaveLength(0); + expect(res.error).toMatch(/SELECT \/ WITH/i); + }); + + it('member query_sql blocks oauth_tokens nested in a CASE subquery', async () => { + const res = await querySql( + { + sql: "SELECT id, (CASE WHEN true THEN (SELECT access_token FROM oauth_tokens LIMIT 1) ELSE 'x' END) AS leak FROM entities", + }, + {}, + memberCtx() + ); + expect(res.rows).toHaveLength(0); + expect(res.error).toMatch(/admin access/i); + }); + + it('DB trigger rejects re-pointing a stored row at a derived type (UPDATE)', async () => { + await owner.entity_schema.createType({ slug: 'animals', name: 'Animals' }); + await owner.entity_schema.createType({ + slug: 'animal-counts', + name: 'Animal counts', + backing: { sql: 'SELECT 1 AS x FROM events' }, + }); + await owner.entities.create({ type: 'animals', name: 'Cat' }); + const db = getTestDb(); + const [cat] = await db` + SELECT id FROM entities WHERE name = 'Cat' AND organization_id = ${orgAId} LIMIT 1 + `; + const [derived] = await db` + SELECT id FROM entity_types WHERE slug = 'animal-counts' AND organization_id = ${orgAId} LIMIT 1 + `; + // Re-pointing the row's entity_type_id to the derived type would orphan it. + await expect( + db`UPDATE entities SET entity_type_id = ${derived.id} WHERE id = ${cat.id}` + ).rejects.toThrow(/derived|cannot have stored rows/i); + }); + + it('DB trigger rejects setting backing_sql on a populated stored type (UPDATE)', async () => { + await owner.entity_schema.createType({ slug: 'plants', name: 'Plants' }); + await owner.entities.create({ type: 'plants', name: 'Fern' }); + const db = getTestDb(); + // Direct DB convert-to-derived while rows exist must be rejected. + await expect( + db`UPDATE entity_types SET backing_sql = 'SELECT 1 AS x FROM events' + WHERE slug = 'plants' AND organization_id = ${orgAId}` + ).rejects.toThrow(/derived view while stored rows exist|delete them first/i); + }); + + it('allows converting to derived when all rows are soft-deleted (matches app live-row count)', async () => { + await owner.entity_schema.createType({ slug: 'fish', name: 'Fish' }); + await owner.entities.create({ type: 'fish', name: 'Nemo' }); + const db = getTestDb(); + // Soft-delete the only row — the app's convert-guard counts WHERE deleted_at + // IS NULL, so conversion is allowed; the DB trigger must agree (not block on + // a tombstoned row). + await db`UPDATE entities e SET deleted_at = NOW() + FROM entity_types et + WHERE e.entity_type_id = et.id AND et.slug = 'fish' AND e.organization_id = ${orgAId}`; + await expect( + owner.entity_schema.updateType({ slug: 'fish', backing: { sql: 'SELECT 1 AS x FROM events' } }) + ).resolves.toBeDefined(); + }); +}); diff --git a/packages/server/src/__tests__/integration/entity-schema/entity-types.test.ts b/packages/server/src/__tests__/integration/entity-schema/entity-types.test.ts index 2d821abbc..4d5f29e3a 100644 --- a/packages/server/src/__tests__/integration/entity-schema/entity-types.test.ts +++ b/packages/server/src/__tests__/integration/entity-schema/entity-types.test.ts @@ -78,6 +78,71 @@ describe('entity schema CRUD', () => { expect(slugs).toContain('lst-asset'); await owner.entity_schema.deleteType('lst-asset'); }); + + it('round-trips a derived backing (sql) and reverts to stored', async () => { + type Got = { + entity_type?: { + backing_sql?: string | null; + measure_columns?: string[]; + metadata_schema?: { properties?: Record> } | null; + }; + }; + + await owner.entity_schema.createType({ + slug: 'spend-by-vendor', + name: 'Spend by vendor', + backing: { + sql: 'SELECT company_id, currency, SUM(amount) AS total_spend, COUNT(DISTINCT u) AS users FROM events GROUP BY company_id, currency', + }, + }); + const created = (await owner.entity_schema.getType('spend-by-vendor')) as Got; + expect(created.entity_type?.backing_sql).toContain('SUM(amount)'); + // Measure columns are classified ON READ (not persisted into metadata_schema). + expect((created.entity_type?.measure_columns ?? []).sort()).toEqual([ + 'total_spend', + 'users', + ]); + // No inferred annotations are persisted — metadata_schema stays as authored. + const props = created.entity_type?.metadata_schema?.properties ?? {}; + expect(props.total_spend).toBeUndefined(); + + // update the view sql → backing_sql changes, measure_columns recompute + await owner.entity_schema.updateType({ + slug: 'spend-by-vendor', + backing: { sql: 'SELECT company_id, AVG(amount) AS avg_spend FROM events GROUP BY company_id' }, + }); + const updated = (await owner.entity_schema.getType('spend-by-vendor')) as Got; + expect(updated.entity_type?.backing_sql).toContain('AVG(amount)'); + expect(updated.entity_type?.measure_columns).toEqual(['avg_spend']); + + // revert to stored: backing = null clears the view; no measure_columns. + await owner.entity_schema.updateType({ slug: 'spend-by-vendor', backing: null }); + const reverted = (await owner.entity_schema.getType('spend-by-vendor')) as Got; + expect(reverted.entity_type?.backing_sql ?? null).toBeNull(); + expect(reverted.entity_type?.measure_columns ?? []).toEqual([]); + + await owner.entity_schema.deleteType('spend-by-vendor'); + }); + + it('a stored type carries no backing_sql', async () => { + await owner.entity_schema.createType({ slug: 'plain-thing', name: 'Plain' }); + const got = (await owner.entity_schema.getType('plain-thing')) as { + entity_type?: { backing_sql?: string | null }; + }; + expect(got.entity_type?.backing_sql ?? null).toBeNull(); + await owner.entity_schema.deleteType('plain-thing'); + }); + + it('rejects an empty / whitespace backing.sql (no corrupt derived type)', async () => { + // TypeBox minLength isn't enforced for this tool, so the handler guards. + await expect( + owner.entity_schema.createType({ + slug: 'blank-view', + name: 'Blank', + backing: { sql: ' ' }, + }) + ).rejects.toThrow(/backing\.sql cannot be empty/i); + }); }); describe('relationship_type', () => { diff --git a/packages/server/src/__tests__/integration/mcp/member-write-access.test.ts b/packages/server/src/__tests__/integration/mcp/member-write-access.test.ts index 5bd096f47..a294c8d56 100644 --- a/packages/server/src/__tests__/integration/mcp/member-write-access.test.ts +++ b/packages/server/src/__tests__/integration/mcp/member-write-access.test.ts @@ -171,7 +171,7 @@ describe('MCP member write access', () => { }); const beforeBody = await beforeResponse.json(); const beforeNames = beforeBody.result.tools.map((tool: any) => tool.name); - // Admin/owner sessions see `query_sql` (admin-tier read). + // query_sql is read-tier — visible to members and admins alike. expect(beforeNames).toContain('query_sql'); expect(beforeNames).toContain('run_sdk'); @@ -193,8 +193,10 @@ describe('MCP member write access', () => { const afterBody = await afterResponse.json(); const afterTools = afterBody.result.tools.map((tool: any) => tool.name); expect(afterTools).toContain('save_memory'); - // Member tier loses admin-only `query_sql`. - expect(afterTools).not.toContain('query_sql'); + // query_sql is read-tier, so it stays visible after a downgrade; the + // auth/identity tables it can reach stay admin-only via the per-query + // restriction (covered in scoped-query-schema-rejection.test.ts). + expect(afterTools).toContain('query_sql'); }); it('applies member removal immediately to existing MCP sessions', async () => { diff --git a/packages/server/src/__tests__/integration/mcp/metric-series-column-safety.test.ts b/packages/server/src/__tests__/integration/mcp/metric-series-column-safety.test.ts new file mode 100644 index 000000000..6c9e98471 --- /dev/null +++ b/packages/server/src/__tests__/integration/mcp/metric-series-column-safety.test.ts @@ -0,0 +1,73 @@ +/** + * metric_series column safety. + * + * metric_series is read-tier (members may chart their org's operational data), + * so its org-scoping CTEs MUST emit the safe column allowlist — never `SELECT *`. + * Regression: it previously omitted `safeColumns`, so a member running + * `SELECT * FROM connections` got the withheld `credentials` column (OAuth + * tokens). This proves the allowlist is now applied. + */ + +import { beforeAll, describe, expect, it } from 'vitest'; +import { metricSeries } from '../../../tools/admin/metric_series'; +import type { ToolContext } from '../../../tools/registry'; +import { cleanupTestDatabase, getTestDb } from '../../setup/test-db'; +import { + createTestConnection, + createTestEvent, + createTestOrganization, +} from '../../setup/test-fixtures'; + +describe('metric_series — safe column allowlist (member)', () => { + let orgId: string; + + beforeAll(async () => { + await cleanupTestDatabase(); + const org = await createTestOrganization({ name: 'Metric Column Safety' }); + orgId = org.id; + const conn = await createTestConnection({ + organization_id: orgId, + connector_key: 'slack', + }); + // Stash a secret in the withheld column so a leak would be unmistakable. + const db = getTestDb(); + await db`UPDATE connections SET credentials = ${db.json({ access_token: 'SECRET-TOKEN' })} WHERE id = ${conn.id}`; + }); + + const memberCtx = (): ToolContext => ({ + organizationId: orgId, + userId: 'metric-member', + memberRole: 'member', + isAuthenticated: true, + tokenType: 'oauth', + scopedToOrg: false, + allowCrossOrg: false, + }); + + it('does not expose the credentials column via `SELECT * FROM connections`', async () => { + const res = await metricSeries({ sql: 'SELECT * FROM connections' }, {}, memberCtx()); + expect(res.columns).not.toContain('credentials'); + // and the secret value never appears anywhere in the rows + expect(JSON.stringify(res.rows)).not.toContain('SECRET-TOKEN'); + // sanity: an allowlisted column IS present (the scope worked, not just empty) + expect(res.columns).toContain('connector_key'); + }); + + it('refuses a data-modifying CTE (read-only transaction)', async () => { + // A data-modifying WITH CTE passes the SELECT/WITH guard, so the read-only + // transaction is what must stop it. Without it, this read-tier endpoint + // could DELETE/UPDATE/INSERT. + const db = getTestDb(); + const ev = await createTestEvent({ organization_id: orgId, content: 'keep-me' }); + await expect( + metricSeries( + { sql: 'WITH x AS (DELETE FROM events RETURNING id) SELECT count(*) AS n FROM x' }, + {}, + memberCtx() + ) + ).rejects.toThrow(/read-only|read only|cannot execute/i); + // the row is untouched + const [row] = await db`SELECT id FROM events WHERE id = ${ev.id}`; + expect(row?.id).toBeDefined(); + }); +}); diff --git a/packages/server/src/__tests__/unit/scoped-query-schema-rejection.test.ts b/packages/server/src/__tests__/unit/scoped-query-schema-rejection.test.ts new file mode 100644 index 000000000..54199fe1d --- /dev/null +++ b/packages/server/src/__tests__/unit/scoped-query-schema-rejection.test.ts @@ -0,0 +1,304 @@ +/** + * Cross-org scoping guard: validateAndScopeQuery MUST reject schema-qualified + * table references (`public.connections`, `pg_catalog.*`, …) anywhere in the + * query. + * + * Why this is security-critical: org-scoping shadows UNQUALIFIED table names + * with org-filtered CTEs. A schema-qualified reference resolves to the real + * base table and bypasses the CTE → reads every org's rows. Regression guard: + * the @polyglot-sql/sdk migration's first cut used `ast.getTables`, which only + * returns the first FROM table — a schema-qualified table in a JOIN or subquery + * slipped past and leaked. These reproducers were RED then; the raw-AST + * recursion in extractTableRefs makes them GREEN. + */ + +import { describe, expect, it } from 'bun:test'; +import { + isReadQuery, + stripLeadingComments, + validateAndScopeQuery, +} from '../../utils/execute-data-sources'; +import { ADMIN_ONLY_QUERYABLE_TABLES, SAFE_COLUMN_DEFS } from '../../utils/table-schema'; + +const scope = (sql: string) => + validateAndScopeQuery(sql, 'org_test', { safeColumns: SAFE_COLUMN_DEFS }); + +// A non-admin caller passes the auth/identity tables as restricted. +const scopeAsMember = (sql: string) => + validateAndScopeQuery(sql, 'org_test', { + safeColumns: SAFE_COLUMN_DEFS, + restrictedTables: ADMIN_ONLY_QUERYABLE_TABLES, + }); + +describe('validateAndScopeQuery — schema-qualified table rejection', () => { + const leaks: Array<[string, string]> = [ + ['top-level', 'SELECT cc.payload_text FROM public.connections cc'], + ['join 2nd table', 'SELECT * FROM entities e JOIN public.connections c ON c.id = e.id'], + [ + 'multi-join', + 'SELECT * FROM entities e JOIN public.connections c ON c.id = e.id JOIN public.events x ON x.id = e.id', + ], + [ + 'subquery IN join', + 'SELECT id FROM entities WHERE id IN (SELECT id FROM entities e2 JOIN public.connections c ON true)', + ], + [ + 'subquery EXISTS', + 'SELECT id FROM entities e WHERE EXISTS (SELECT 1 FROM public.connections c WHERE c.id = e.id)', + ], + ['UNION branch', 'SELECT id FROM entities UNION SELECT id FROM public.connections'], + ['CTE body', 'WITH x AS (SELECT id FROM public.connections) SELECT * FROM x'], + ['three-part name', 'SELECT * FROM entities e JOIN mydb.public.connections c ON true'], + ]; + + for (const [label, sql] of leaks) { + it(`rejects a schema-qualified ref (${label})`, () => { + expect(() => scope(sql)).toThrow(/schema-qualified/i); + }); + } + + it('catches a deeply-nested schema-qualified ref (no recursion depth fail-open)', () => { + let sql = 'SELECT id FROM public.events'; + for (let i = 0; i < 60; i++) sql = `SELECT id FROM (${sql}) AS _n${i}`; + expect(() => scope(sql)).toThrow(/schema-qualified/i); + }); + + it('rejects multiple statements (the org-scoping CTEs only wrap the first)', () => { + // The 2nd statement would otherwise run unscoped against public.oauth_tokens. + expect(() => + scope('SELECT id FROM events; SELECT id FROM public.oauth_tokens') + ).toThrow(); + }); + + const clean: Array<[string, string]> = [ + ['plain', 'SELECT * FROM entities WHERE id > 0'], + ['unqualified join', 'SELECT * FROM events ev JOIN entities en ON en.id = ANY(ev.entity_ids)'], + [ + 'jsonb aggregate view', + "SELECT (metadata->>'vendor') AS v, SUM((metadata->>'amount')::numeric) AS s, COUNT(*) AS n FROM events GROUP BY 1", + ], + ['cte join', 'WITH c AS (SELECT id FROM events) SELECT * FROM c JOIN entities e ON e.id = c.id'], + ['union', 'SELECT id FROM entities UNION SELECT id FROM events'], + ]; + + for (const [label, sql] of clean) { + it(`scopes a clean query without error (${label})`, () => { + const out = scope(sql); + // every referenced base table is wrapped in an org-scoped CTE + expect(out.sql).toContain('organization_id'); + expect(out.params[0]).toBe('org_test'); + }); + } +}); + +describe('validateAndScopeQuery — member table restriction (auth/identity admin-only)', () => { + const blocked: Array<[string, string]> = [ + ['oauth_tokens', 'SELECT * FROM oauth_tokens'], + ['oauth_clients', 'SELECT * FROM oauth_clients'], + ['user roster', 'SELECT * FROM "user"'], + ['joined oauth_tokens', 'SELECT * FROM entities e JOIN oauth_tokens t ON t.id = e.id'], + ]; + for (const [label, sql] of blocked) { + it(`blocks a non-admin from ${label}`, () => { + expect(() => scopeAsMember(sql)).toThrow(/admin access/i); + }); + } + + const allowed: Array<[string, string]> = [ + ['events', 'SELECT * FROM events'], + ['entities', 'SELECT * FROM entities'], + ['connections', 'SELECT * FROM connections'], + ['feeds', 'SELECT * FROM feeds'], + ]; + for (const [label, sql] of allowed) { + it(`allows a non-admin to query ${label}`, () => { + const out = scopeAsMember(sql); + expect(out.sql).toContain('organization_id'); + }); + } + + it('allows an admin (no restriction) to query oauth_tokens', () => { + const out = scope('SELECT * FROM oauth_tokens'); + expect(out.sql).toContain('organization_id'); + }); +}); + +/** + * Parser-bypass regressions. These were RED after the @polyglot-sql/sdk swap + * (confirmed by the adversarial bug-hunt), because the member-accessible scoping + * relied on `ast.getTableNames`, which (a) yields nothing for the `TABLE ` + * shorthand (mis-parsed as a column) and (b) does not descend into subqueries + * nested inside an expression (CASE / scalar). Both let a non-admin read + * oauth_tokens — unscoped AND past the admin gate. Now GREEN via the SELECT/WITH + * prefix guard + the complete (union) table walk + the CTE-collision guard. + */ +describe('validateAndScopeQuery — parser-bypass regressions', () => { + it('rejects the PostgreSQL `TABLE ` shorthand (member)', () => { + expect(() => scopeAsMember('TABLE oauth_tokens')).toThrow(/SELECT \/ WITH/i); + }); + + it('rejects `TABLE ` even for an admin (not a SELECT)', () => { + expect(() => scope('TABLE events')).toThrow(/SELECT \/ WITH/i); + }); + + it('blocks oauth_tokens nested in a CASE-expression subquery (member)', () => { + const sql = + "SELECT id, (CASE WHEN true THEN (SELECT token FROM oauth_tokens LIMIT 1) ELSE 'x' END) AS leak FROM entities"; + expect(() => scopeAsMember(sql)).toThrow(/admin access/i); + }); + + it('blocks oauth_tokens nested in a scalar SELECT-list subquery (member)', () => { + const sql = 'SELECT id, (SELECT count(*) FROM oauth_tokens) AS n FROM entities'; + expect(() => scopeAsMember(sql)).toThrow(/admin access/i); + }); + + it('still allows the same shapes for an admin (no restriction)', () => { + const sql = 'SELECT id, (SELECT count(*) FROM oauth_tokens) AS n FROM entities'; + const out = scope(sql); + expect(out.sql).toContain('organization_id'); + }); + + it('SCOPES a non-admin table nested in a scalar subquery (no cross-org leak)', () => { + // events nested in a SELECT-list subquery is invisible to getTableNames; the + // complete walk catches it so it gets its own org-scoped CTE rather than + // reading every org's events. + const out = scopeAsMember('SELECT id, (SELECT count(*) FROM events) AS n FROM entities'); + expect(out.sql).toMatch(/"events"\s+AS\s+\(/i); + expect(out.sql).toMatch(/"entities"\s+AS\s+\(/i); + }); + + it('rejects a CTE whose name shadows a real table (fail-closed)', () => { + // The parser cannot tell, by lexical scope, whether `events` in the CTE body + // is the CTE or the base table — so we forbid the ambiguity outright. + expect(() => scope('WITH events AS (SELECT 1 AS x) SELECT * FROM events')).toThrow( + /collides|reserved/i + ); + expect(() => + scope('WITH events AS (SELECT id FROM events) SELECT * FROM events') + ).toThrow(/collides|reserved/i); + }); + + it('rejects a CTE whose name shadows an admin-only table (fail-closed)', () => { + expect(() => + scope('WITH oauth_tokens AS (SELECT 1 AS x) SELECT * FROM oauth_tokens') + ).toThrow(/collides|reserved/i); + }); + + it('still allows an ordinary (non-colliding) CTE name', () => { + const out = scopeAsMember('WITH recent AS (SELECT id FROM events) SELECT * FROM recent'); + expect(out.sql).toMatch(/"events"\s+AS\s+\(/i); + }); +}); + +/** + * Completeness invariant — the security property the whole table-extraction + * exists to uphold: an admin-only table referenced ANYWHERE in a member's query + * must be rejected, in EVERY syntactic position. This is the parser-independent + * guarantee a DB role would otherwise provide, asserted instead where we can run + * it in CI. It is the canary for a future parser blind spot: add a position the + * extractor can't see and one of these flips RED before the leak can ship. + * + * Each builder embeds the table with NO column references (SELECT 1 / count(*) / + * ON true), so the schema validator can't reject for an unrelated reason and + * mask a missed-table hole — the only thing that should reject is the admin gate. + */ +describe('validateAndScopeQuery — admin-table completeness invariant', () => { + const positions: Array<[string, (t: string) => string]> = [ + ['top-level FROM', (t) => `SELECT * FROM ${t}`], + ['JOIN', (t) => `SELECT * FROM entities e JOIN ${t} x ON true`], + ['WHERE EXISTS subquery', (t) => `SELECT id FROM entities WHERE EXISTS (SELECT 1 FROM ${t})`], + [ + 'scalar SELECT-list subquery', + (t) => `SELECT id, (SELECT count(*) FROM ${t}) AS n FROM entities`, + ], + [ + 'CASE-nested subquery', + (t) => + `SELECT id, (CASE WHEN true THEN (SELECT count(*) FROM ${t}) ELSE 0 END) AS n FROM entities`, + ], + ['CTE body', (t) => `WITH src AS (SELECT 1 AS c FROM ${t}) SELECT * FROM src`], + ['UNION branch', (t) => `SELECT id FROM entities UNION SELECT 1 FROM ${t}`], + [ + 'doubly-nested subquery', + (t) => `SELECT 1 FROM (SELECT 1 AS c FROM (SELECT 1 AS c FROM ${t}) AS a) AS b`, + ], + ]; + + // SQL-safe identifiers for each restricted table ("user" is reserved). + const adminTables = [...ADMIN_ONLY_QUERYABLE_TABLES].map((t) => (t === 'user' ? '"user"' : t)); + + for (const [label, build] of positions) { + for (const t of adminTables) { + it(`blocks ${t} in ${label} (member)`, () => { + expect(() => scopeAsMember(build(t))).toThrow(/admin access/i); + }); + } + // Inverse: the same shape with an allowed table must NOT be over-blocked — + // it scopes cleanly (every base table wrapped in an org-scoped CTE). + it(`does not over-block a clean ${label}`, () => { + const out = scopeAsMember(build('events')); + expect(out.sql).toContain('organization_id'); + }); + } +}); + +/** + * Edge cases the review surfaced once query_sql became member-reachable (both + * pre-existing in buildScopedQuery / validateTableQuery, latent while the tool + * was admin-only). The output-alias relaxation must NOT relax real unknown + * columns or excluded columns — those stay rejected. + */ +describe('validateAndScopeQuery — ORDER BY/GROUP BY alias + leading-comment WITH', () => { + it('accepts ORDER BY on an output alias', () => { + const out = scopeAsMember('SELECT COUNT(*) AS n FROM events ORDER BY n'); + expect(out.sql).toContain('organization_id'); + }); + + it('accepts GROUP BY on an output alias', () => { + const out = scopeAsMember('SELECT semantic_type AS st, COUNT(*) AS c FROM events GROUP BY st'); + expect(out.sql).toContain('organization_id'); + }); + + it('still rejects ORDER BY on a genuinely unknown column', () => { + expect(() => scopeAsMember('SELECT id FROM events ORDER BY nonsense_col')).toThrow( + /unknown column/i + ); + }); + + it('still rejects an excluded column even when aliased', () => { + // `credentials` is withheld from connections; aliasing it must not sneak it in. + expect(() => scopeAsMember('SELECT credentials AS x FROM connections')).toThrow( + /unknown column/i + ); + }); + + it('emits a single WITH for a leading-comment WITH query (valid SQL)', () => { + const out = scopeAsMember('-- note\nWITH recent AS (SELECT id FROM events) SELECT * FROM recent'); + expect((out.sql.match(/\bWITH\b/gi) || []).length).toBe(1); + expect(out.sql).toMatch(/"events"\s+AS\s+\(/i); + }); +}); + +/** + * The read-query prefix guard strips leading comments with a LINEAR scan, not a + * nested-quantifier regex (which backtracks catastrophically — a ReDoS, since + * this runs on member-supplied SQL). Guards against reintroducing that. + */ +describe('isReadQuery / stripLeadingComments (ReDoS-safe)', () => { + it('strips leading line + block comments before the SELECT/WITH check', () => { + expect(isReadQuery('-- note\nSELECT 1')).toBe(true); + expect(isReadQuery('/* a */ /* b */ WITH x AS (SELECT 1) SELECT * FROM x')).toBe(true); + expect(isReadQuery(' \n SELECT 1')).toBe(true); + expect(isReadQuery('DELETE FROM events')).toBe(false); + expect(isReadQuery('TABLE events')).toBe(false); + expect(stripLeadingComments('-- x\n/* y */ SELECT 1')).toBe('SELECT 1'); + // unterminated comment → consumes to EOF (no match, fail-closed) + expect(isReadQuery('/* unclosed SELECT 1')).toBe(false); + }); + + it('handles pathological unclosed-comment input in linear time (no catastrophic backtracking)', () => { + const start = Date.now(); + expect(isReadQuery(`${'/*'.repeat(100_000)} SELECT 1`)).toBe(false); + expect(Date.now() - start).toBeLessThan(1000); // a backtracking regex would hang + }); +}); diff --git a/packages/server/src/auth/__tests__/tool-access.test.ts b/packages/server/src/auth/__tests__/tool-access.test.ts index 1437e06ad..3365af3f4 100644 --- a/packages/server/src/auth/__tests__/tool-access.test.ts +++ b/packages/server/src/auth/__tests__/tool-access.test.ts @@ -20,8 +20,8 @@ import { } from '../tool-access'; describe('requiresOwnerAdmin', () => { - it('should require admin for query_sql despite being read-only', () => { - expect(requiresOwnerAdmin('query_sql', {}, true)).toBe(true); + it('treats query_sql as read-tier — members may query; sensitive tables gated per-query', () => { + expect(requiresOwnerAdmin('query_sql', {}, true)).toBe(false); }); it('should require admin for destructive manage_entity actions only', () => { @@ -367,21 +367,20 @@ describe('checkToolAccess', () => { } ); - it('keeps admin-only tools restricted for members', () => { - // query_sql is the canonical admin-only tool on the post-PR-2 surface. + it('lets members run query_sql (read-tier; auth/identity tables gated per-query, not at the tool gate)', () => { expect(() => checkToolAccess( 'query_sql', { sql: 'SELECT 1', sort_by: 'id' }, - { - ...baseAuth, - memberRole: 'member', - scopes: ['mcp:admin'], - } + { ...baseAuth, memberRole: 'member', scopes: ['mcp:read'] } ) - ).toThrow( - 'This action requires admin or owner access. Ask an organization owner to grant elevated access.' - ); + ).not.toThrow(); + }); + + it('keeps genuinely admin-only actions admin-tier (manage_entity_schema create)', () => { + // Opening query_sql to read-tier must not have widened the real admin + // actions — these go through SDK wrappers / action-router, gated by policy. + expect(requiresOwnerAdmin('manage_entity_schema', { action: 'create' }, false)).toBe(true); }); }); diff --git a/packages/server/src/auth/tool-access.ts b/packages/server/src/auth/tool-access.ts index d9bd8be52..13e3059df 100644 --- a/packages/server/src/auth/tool-access.ts +++ b/packages/server/src/auth/tool-access.ts @@ -150,10 +150,10 @@ export function requiresOwnerAdmin( args: unknown, readOnlyHint: boolean ): boolean { - // query_sql is intentionally owner/admin only despite being read-only — - // it can read across the whole org's data, including audit/event tables. - if (toolName === 'query_sql') return true; - + // query_sql / metric_series are read-tier (members may query their org's + // operational data). The auth/identity tables (oauth_tokens, oauth_clients, + // user) stay admin-only via ADMIN_ONLY_QUERYABLE_TABLES, enforced per-query in + // those handlers — not by gating the whole tool to admins. if (actionMatches(OWNER_ADMIN_ACTIONS, toolName, args)) return true; const hasExplicitPolicy = toolName in OWNER_ADMIN_ACTIONS || toolName in MEMBER_WRITE_ACTIONS; diff --git a/packages/server/src/sandbox/namespaces/entity-schema.ts b/packages/server/src/sandbox/namespaces/entity-schema.ts index 16410a48a..b2cf484fb 100644 --- a/packages/server/src/sandbox/namespaces/entity-schema.ts +++ b/packages/server/src/sandbox/namespaces/entity-schema.ts @@ -33,6 +33,8 @@ export interface EntitySchemaNamespace { color?: string; metadata_schema?: Record; event_kinds?: Record; + /** Make the type derived (a SQL view); `null`/omit ⇒ a stored type. */ + backing?: { sql: string } | null; }): Promise; updateType(input: { slug: string; @@ -42,6 +44,8 @@ export interface EntitySchemaNamespace { color?: string; metadata_schema?: Record; event_kinds?: Record; + /** Set/clear the derived view; omit to leave backing unchanged. */ + backing?: { sql: string } | null; }): Promise; deleteType(slug: string): Promise; auditType(slug: string): Promise; diff --git a/packages/server/src/tools/admin/__tests__/query_sql.test.ts b/packages/server/src/tools/admin/__tests__/query_sql.test.ts index 7245712d6..3134d3dda 100644 --- a/packages/server/src/tools/admin/__tests__/query_sql.test.ts +++ b/packages/server/src/tools/admin/__tests__/query_sql.test.ts @@ -35,13 +35,13 @@ describe('querySql input validation', () => { expect(result.total_count).toBe(0); }); - it('requires an explicit sort_by column before building SQL', async () => { + it('rejects an invalid sort_by column name (sort_by is optional, but must be a bare identifier when given)', async () => { const result = await querySql( - { sql: 'select * from events' } as unknown as QuerySqlArgs, + { sql: 'select * from events', sort_by: 'a; DROP' } as unknown as QuerySqlArgs, {}, ctx ); - expect(result.error).toBe('sort_by (string column name) is required.'); + expect(result.error).toBe('Invalid sort_by column name: a; DROP'); }); }); diff --git a/packages/server/src/tools/admin/manage_entity.ts b/packages/server/src/tools/admin/manage_entity.ts index 2a55c0a87..2e1d3dd30 100644 --- a/packages/server/src/tools/admin/manage_entity.ts +++ b/packages/server/src/tools/admin/manage_entity.ts @@ -454,6 +454,9 @@ async function handleCreate( throw new Error('name is required for create action'); } + // (Derived-type rejection lives in createEntity — the single chokepoint that + // also resolves public-catalog types.) + // Validate metadata against entity type's JSON schema (if defined) if (args.metadata && Object.keys(args.metadata).length > 0) { const validation = await validateEntityMetadata(args.entity_type, args.metadata, ctx); diff --git a/packages/server/src/tools/admin/manage_entity_schema.ts b/packages/server/src/tools/admin/manage_entity_schema.ts index b4a0cb08a..d0c9fdb5b 100644 --- a/packages/server/src/tools/admin/manage_entity_schema.ts +++ b/packages/server/src/tools/admin/manage_entity_schema.ts @@ -11,6 +11,7 @@ import { type Static, Type } from '@sinclair/typebox'; import type { AutoCreateWhenRule } from '@lobu/connector-sdk'; import { type DbClient, getDb } from '../../db/client'; +import { measureColumns } from '../../utils/infer-measures'; import type { Env } from '../../index'; import logger from '../../utils/logger'; import { compileRulesMetadata, ruleHashFor } from '../../identity/rules'; @@ -40,6 +41,14 @@ const AutoCreateWhenRuleInputSchema = Type.Object( { additionalProperties: false } ); +/** Derived-entity backing: a read-only SQL view. */ +const BackingInputSchema = Type.Object( + { + sql: Type.String({ minLength: 1, description: 'ANSI SELECT defining the view' }), + }, + { additionalProperties: false } +); + export const ManageEntitySchemaSchema = Type.Object({ schema_type: Type.Union([Type.Literal('entity_type'), Type.Literal('relationship_type')], { description: 'Whether to manage entity types or relationship types', @@ -98,6 +107,12 @@ export const ManageEntitySchemaSchema = Type.Object({ } ) ), + backing: Type.Optional( + Type.Union([Type.Null(), BackingInputSchema], { + description: + "[entity_type: create/update] Makes the type DERIVED — a read-only SQL view. `{ sql }` to set; `null` to clear (revert to a stored type); omit to leave unchanged. Read a derived type's rows by running its `backing_sql` (returned by `get`) through `query_sql`, which org-scopes the view; `get` also returns `measure_columns` (the view's aggregate columns, classified on read).", + }) + ), // Relationship type fields is_symmetric: Type.Optional( @@ -157,6 +172,7 @@ interface EntityTypeRow { color?: string | null; metadata_schema?: Record | null; event_kinds?: Record | null; + backing_sql?: string | null; is_system: boolean; created_by?: string | null; organization_id?: string | null; @@ -165,6 +181,8 @@ interface EntityTypeRow { updated_at: Date; entity_count?: number; current_view_template_version_id?: number | null; + /** Derived types only — the view's aggregate columns, classified on read. */ + measure_columns?: string[]; } interface AuditEntry { @@ -264,10 +282,11 @@ export async function manageEntitySchema( // ============================================ const ENTITY_TYPE_COLUMNS = - 'id, slug, name, description, icon, color, metadata_schema, event_kinds, created_by, organization_id, created_at, updated_at, current_view_template_version_id'; + 'id, slug, name, description, icon, color, metadata_schema, event_kinds, backing_sql, created_by, organization_id, created_at, updated_at, current_view_template_version_id'; const ENTITY_TYPE_COLUMNS_WITH_ORG = `et.id, et.slug, et.name, et.description, et.icon, et.color, - et.metadata_schema, et.event_kinds, et.created_by, et.organization_id, + et.metadata_schema, et.event_kinds, et.backing_sql, + et.created_by, et.organization_id, et.created_at, et.updated_at, et.current_view_template_version_id, o.slug AS organization_slug`; @@ -314,6 +333,18 @@ function validateEntityMetadataSchemaDisplayConfig( } } +/** + * Reject an empty/whitespace `backing.sql`. TypeBox's `minLength: 1` is not + * enforced for this tool (it isn't in VALIDATED_TOOLS), so without this guard a + * caller could persist a "derived" type whose view is blank — unqueryable and + * with no inferable measures. `backing: null` (revert to stored) is fine. + */ +function assertValidBacking(backing: ManageEntitySchemaArgs['backing']): void { + if (backing && typeof backing.sql === 'string' && backing.sql.trim() === '') { + throw new Error('backing.sql cannot be empty'); + } +} + async function getEntityCountsByType(organizationId: string): Promise> { const sql = getDb(); const rows = await sql` @@ -454,6 +485,8 @@ async function etHandleGet( const [resolved] = await resolveUsernames([rows[0] as Record], 'created_by'); const mapped = mapRowToEntityType(resolved); mapped.entity_count = await getEntityCountForType(Number(mapped.id), ctx.organizationId); + // Classify the view's measure columns on read (never persisted). + if (mapped.backing_sql) mapped.measure_columns = measureColumns(mapped.backing_sql); return { schema_type: 'entity_type', action: 'get', entity_type: mapped }; } @@ -492,7 +525,10 @@ async function etHandleCreate( } validateEntityMetadataSchemaDisplayConfig(args.metadata_schema); + assertValidBacking(args.backing); + // metadata_schema is stored as the author sent it — measure/dimension roles for + // a derived type are classified ON READ (see etHandleGet), never persisted. const metadataSchema = args.metadata_schema ? sql.json(args.metadata_schema) : null; const eventKinds = args.event_kinds ? sql.json(args.event_kinds) : null; @@ -500,6 +536,7 @@ async function etHandleCreate( INSERT INTO entity_types ( slug, name, description, icon, color, metadata_schema, event_kinds, + backing_sql, organization_id, created_by, created_at, updated_at ) VALUES ( @@ -510,6 +547,7 @@ async function etHandleCreate( ${args.color ?? null}, ${metadataSchema}, ${eventKinds}, + ${args.backing?.sql ?? null}, ${ctx.organizationId}, ${ctx.userId}, current_timestamp, @@ -556,18 +594,30 @@ async function etHandleUpdate( const current = existing[0]; const beforePayload = { ...current } as Record; - const hasMetadataSchema = args.metadata_schema !== undefined; - if (hasMetadataSchema) { + if (args.metadata_schema !== undefined) { validateEntityMetadataSchemaDisplayConfig(args.metadata_schema); } - const metadataSchemaJson = hasMetadataSchema - ? args.metadata_schema - ? sql.json(args.metadata_schema) - : null - : null; + assertValidBacking(args.backing); + // Converting a populated stored type to a derived (view-backed) type would + // orphan its existing rows (the view ignores them). Reject it. + if (args.backing?.sql) { + const existingCount = await getEntityCountForType(Number(current.id), ctx.organizationId); + if (existingCount > 0) { + throw new Error( + `Cannot make entity type '${args.slug}' derived: ${existingCount} stored ${existingCount === 1 ? 'entity exists' : 'entities exist'}. Delete them first.` + ); + } + } + // metadata_schema is stored verbatim (measure roles are classified on read). + const hasMetadataSchema = args.metadata_schema !== undefined; + const metadataSchemaJson = args.metadata_schema ? sql.json(args.metadata_schema) : null; const hasEventKinds = args.event_kinds !== undefined; const eventKindsJson = hasEventKinds && args.event_kinds ? sql.json(args.event_kinds) : null; + // Backing is set as a unit: callers send `backing` (an object makes the type + // derived, null reverts it to stored) or omit it to leave backing unchanged. + const hasBacking = args.backing !== undefined; + await sql` UPDATE entity_types SET name = COALESCE(${args.name ?? null}, name), @@ -582,6 +632,10 @@ async function etHandleUpdate( WHEN ${hasEventKinds} THEN ${eventKindsJson} ELSE event_kinds END, + backing_sql = CASE + WHEN ${hasBacking} THEN ${args.backing?.sql ?? null}::text + ELSE backing_sql + END, updated_by = ${ctx.userId}, updated_at = current_timestamp WHERE id = ${current.id} diff --git a/packages/server/src/tools/admin/metric_series.ts b/packages/server/src/tools/admin/metric_series.ts index 2cee5f6fb..a713b9ded 100644 --- a/packages/server/src/tools/admin/metric_series.ts +++ b/packages/server/src/tools/admin/metric_series.ts @@ -21,7 +21,8 @@ import { type Static, Type } from '@sinclair/typebox'; import { getDb } from '../../db/client'; -import { validateAndScopeQuery } from '../../utils/execute-data-sources'; +import { isReadQuery, validateAndScopeQuery } from '../../utils/execute-data-sources'; +import { ADMIN_ONLY_QUERYABLE_TABLES, SAFE_COLUMN_DEFS } from '../../utils/table-schema'; import { ToolUserError } from '../../utils/errors'; import logger from '../../utils/logger'; import type { ToolContext } from '../registry'; @@ -43,11 +44,6 @@ const STATEMENT_TIMEOUT_MS = 5000; // megabytes back to the client. const MAX_ROWS = 2000; -// Only SELECT or WITH … SELECT are valid metric queries. Catches DML/DDL -// prefixes (INSERT/UPDATE/DELETE/TRUNCATE/COPY/CREATE/DROP/ALTER/GRANT/ -// REVOKE/VACUUM/ANALYZE/EXPLAIN/SET/RESET/LOCK/CALL/DO/…) before they hit -// the heavier AST validator. -const SELECT_OR_WITH = /^\s*(?:--[^\n]*\n\s*|\/\*[\s\S]*?\*\/\s*)*(SELECT|WITH)\b/i; export interface MetricSeriesResult { columns: string[]; @@ -64,11 +60,19 @@ export async function metricSeries( throw new Error('metric_series: caller must be scoped to an organization'); } - if (!SELECT_OR_WITH.test(args.sql)) { + if (!isReadQuery(args.sql)) { throw new ToolUserError('metric_series: only SELECT or WITH … SELECT queries are accepted'); } - const { sql: scopedSql, params } = validateAndScopeQuery(args.sql, orgId); + // Members may chart their org's operational data; the auth/identity tables + // (oauth_tokens, oauth_clients, user) stay admin-only. + const isAdmin = ctx.memberRole === 'owner' || ctx.memberRole === 'admin'; + const { sql: scopedSql, params } = validateAndScopeQuery(args.sql, orgId, { + // Emit the safe column allowlist (not SELECT *) so a member charting e.g. + // `connections` can't pull credential columns the allowlist withholds. + safeColumns: SAFE_COLUMN_DEFS, + restrictedTables: isAdmin ? undefined : ADMIN_ONLY_QUERYABLE_TABLES, + }); const db = getDb(); // Run inside a transaction so SET LOCAL applies for the user query only. @@ -77,6 +81,10 @@ export async function metricSeries( let rows: Record[]; try { rows = (await db.begin(async (tx) => { + // READ ONLY first: a data-modifying CTE (`WITH x AS (DELETE … RETURNING …) + // SELECT …`) passes the SELECT/WITH guard, so the DB must refuse the write. + // Mirrors query_sql; without it this read-tier endpoint could mutate. + await tx.unsafe('SET TRANSACTION READ ONLY'); await tx.unsafe(`SET LOCAL statement_timeout = ${STATEMENT_TIMEOUT_MS}`); return tx.unsafe(scopedSql, params as unknown[]); })) as Record[]; diff --git a/packages/server/src/tools/admin/query_sql.ts b/packages/server/src/tools/admin/query_sql.ts index d5f781cca..8ad6500d4 100644 --- a/packages/server/src/tools/admin/query_sql.ts +++ b/packages/server/src/tools/admin/query_sql.ts @@ -11,14 +11,14 @@ import { getDb } from '../../db/client'; import { validateAndScopeQuery } from '../../utils/execute-data-sources'; import logger from '../../utils/logger'; import { raceAbort } from '../../utils/race-abort'; -import { SAFE_COLUMN_DEFS } from '../../utils/table-schema'; +import { ADMIN_ONLY_QUERYABLE_TABLES, SAFE_COLUMN_DEFS } from '../../utils/table-schema'; import { getCachedMembershipRole, getCachedOrgBySlug } from '../../workspace/multi-tenant'; import type { ToolContext } from '../registry'; export const QuerySqlSchema = Type.Object({ sql: Type.String({ description: - 'Base SELECT query. Table references are auto-scoped to your organization. Do NOT include ORDER BY, LIMIT, or OFFSET — they are added automatically.', + 'Base SELECT query. Table references are auto-scoped to your organization. It is wrapped as a subquery, so ORDER BY / LIMIT / window functions inside it are fine; pagination + sort are added on the outside via sort_by/limit/offset.', }), org_slug: Type.Optional( Type.String({ @@ -26,9 +26,11 @@ export const QuerySqlSchema = Type.Object({ 'Optional. Only honored on the unscoped `/mcp` endpoint with OAuth auth. Rejected for PAT auth, browser-session auth, and scoped `/mcp/{slug}` connections — re-connect to the target workspace instead.', }) ), - sort_by: Type.String({ - description: 'Column name to sort by.', - }), + sort_by: Type.Optional( + Type.String({ + description: 'Column name to sort by. Omit to return rows unordered (e.g. a view whose columns you don\'t know upfront).', + }) + ), sort_order: Type.Optional( Type.Union([Type.Literal('asc'), Type.Literal('desc')], { description: 'Sort direction. Default: asc.', @@ -68,7 +70,6 @@ interface QuerySqlResult { error?: string; } -const TRAILING_CLAUSES = /\b(ORDER\s+BY|LIMIT|OFFSET)\b/i; const COLUMN_NAME_RE = /^[a-zA-Z_]\w*$/; const PG_OID_TYPE_MAP: Record = { @@ -121,21 +122,16 @@ export async function querySql( if (typeof args.sql !== 'string') { return errorResult('sql (string) is required.', startTime); } - if (typeof args.sort_by !== 'string' || args.sort_by.length === 0) { - return errorResult('sort_by (string column name) is required.', startTime); - } const baseSql = args.sql.trim(); if (!baseSql) return errorResult('SQL query is required.', startTime); - if (TRAILING_CLAUSES.test(baseSql)) { - return errorResult( - 'Do not include ORDER BY, LIMIT, or OFFSET in your SQL — they are added automatically.', - startTime - ); - } + // The base query is wrapped as `SELECT * FROM () _t [ORDER BY …] LIMIT …`, + // so an ORDER BY / LIMIT / window inside the caller's SQL is valid (it sits in + // the subquery). A derived view's backing_sql commonly has `OVER (ORDER BY …)`. - if (!COLUMN_NAME_RE.test(args.sort_by)) { + // sort_by is optional: omit it for a view whose columns aren't known upfront. + if (args.sort_by !== undefined && !COLUMN_NAME_RE.test(args.sort_by)) { return errorResult(`Invalid sort_by column name: ${args.sort_by}`, startTime); } @@ -144,6 +140,9 @@ export async function querySql( // cross-org. The single source of truth is `ctx.allowCrossOrg`, which is // computed from `tokenType === 'oauth' && !scopedToOrg`. let targetOrgId = ctx.organizationId; + // Members may query their own org's operational tables; the auth/identity + // tables stay admin-only (enforced via restrictedTables below). + let callerIsAdmin = ctx.memberRole === 'owner' || ctx.memberRole === 'admin'; if (args.org_slug) { if (!ctx.allowCrossOrg) { if (ctx.scopedToOrg) { @@ -171,16 +170,22 @@ export async function querySql( startTime ); } - // `query_sql` is admin-tier (it can read audit/event tables); the cross- - // org hop must re-validate that constraint against the *target* org's - // role, not just the caller's role in the bound org. - if (role !== 'owner' && role !== 'admin') { - return errorResult( - `Cross-org query_sql requires owner or admin access in '${args.org_slug}'.`, - startTime - ); - } targetOrgId = targetOrg.id; + // Reaching into ANOTHER workspace stays owner/admin-only. Passing your OWN + // org slug is just an explicit form of the default and stays read-tier — + // don't reject a member or silently escalate them to admin. Either way the + // role is re-validated against the *target* org, not the bound-org role. + if (targetOrg.id !== ctx.organizationId) { + if (role !== 'owner' && role !== 'admin') { + return errorResult( + `Cross-org query_sql requires owner or admin access in '${args.org_slug}'.`, + startTime + ); + } + callerIsAdmin = true; // cross-org already required owner/admin in the target + } else { + callerIsAdmin = role === 'owner' || role === 'admin'; + } } // Validate, parse, and org-scope the query @@ -189,6 +194,7 @@ export async function querySql( try { const scoped = validateAndScopeQuery(baseSql, targetOrgId, { safeColumns: SAFE_COLUMN_DEFS, + restrictedTables: callerIsAdmin ? undefined : ADMIN_ONLY_QUERYABLE_TABLES, }); scopedSql = scoped.sql; params = scoped.params; @@ -227,7 +233,8 @@ export async function querySql( const offset = Math.max(0, Math.trunc(rawOffset)); const countSql = `SELECT count(*)::int AS c FROM (${scopedSql}) AS _t ${searchWhere}`; - const dataSql = `SELECT * FROM (${scopedSql}) AS _t ${searchWhere} ORDER BY "${args.sort_by}" ${sortOrder} LIMIT ${limit} OFFSET ${offset}`; + const orderBy = args.sort_by ? `ORDER BY "${args.sort_by}" ${sortOrder}` : ''; + const dataSql = `SELECT * FROM (${scopedSql}) AS _t ${searchWhere} ${orderBy} LIMIT ${limit} OFFSET ${offset}`; try { const sql = getDb(); diff --git a/packages/server/src/tools/registry.ts b/packages/server/src/tools/registry.ts index c2777792f..088242d9d 100644 --- a/packages/server/src/tools/registry.ts +++ b/packages/server/src/tools/registry.ts @@ -152,7 +152,7 @@ const TOOLS: ToolDefinition[] = [ { name: 'query_sql', description: - 'Run a paginated, sortable, searchable read-only SQL query. Table references auto-scope to the bound org. Do NOT include ORDER BY/LIMIT/OFFSET or positional parameters. Optional `org_slug` (OAuth on /mcp only) redirects the query to a different member org; rejected on /mcp/{slug} and on PAT auth.', + 'Run a paginated, sortable, searchable read-only SQL query. Table references auto-scope to the bound org. The query is wrapped as a subquery, so inner ORDER BY / LIMIT / window functions are fine; pagination + sort come from the sort_by/limit/offset args. Do NOT use positional parameters ($1, $2, …). Optional `org_slug` (OAuth on /mcp only) redirects the query to a different member org; rejected on /mcp/{slug} and on PAT auth.', inputSchema: QuerySqlSchema, annotations: READ_ONLY, handler: querySql, diff --git a/packages/server/src/utils/__tests__/infer-measures.test.ts b/packages/server/src/utils/__tests__/infer-measures.test.ts new file mode 100644 index 000000000..ffccf9350 --- /dev/null +++ b/packages/server/src/utils/__tests__/infer-measures.test.ts @@ -0,0 +1,85 @@ +import { describe, expect, it } from 'vitest'; +import { inferColumns, measureColumns } from '../infer-measures'; + +describe('inferColumns', () => { + it('classifies aggregates / computed columns as measures, plain columns as dimensions', () => { + const cols = inferColumns( + `SELECT company_id, currency, + SUM(amount) AS total, + COUNT(*) AS n, + COUNT(DISTINCT u) AS users, + AVG(x) AS avgx, + MAX(d) AS last_d, + num / den AS rate + FROM events GROUP BY company_id, currency` + ); + const role = Object.fromEntries(cols.map((c) => [c.name, c.role])); + expect(role.company_id).toBe('dimension'); + expect(role.currency).toBe('dimension'); + expect(role.total).toBe('measure'); + expect(role.n).toBe('measure'); + expect(role.users).toBe('measure'); // COUNT(DISTINCT …) + expect(role.avgx).toBe('measure'); + expect(role.last_d).toBe('measure'); // MAX + expect(role.rate).toBe('measure'); // a / b + }); + + it('treats generic aggregates (bool_or, jsonb_agg, percentile … WITHIN GROUP) as measures', () => { + // Regression: these report as polyglot's generic `aggregate_function` / + // `within_group` node types and were previously misclassified as dimensions. + const role = Object.fromEntries( + inferColumns( + `SELECT g, + bool_or(flag) AS any_flag, + jsonb_agg(x) AS xs, + percentile_cont(0.5) WITHIN GROUP (ORDER BY v) AS median_v + FROM events GROUP BY g` + ).map((c) => [c.name, c.role]) + ); + expect(role.g).toBe('dimension'); + expect(role.any_flag).toBe('measure'); + expect(role.xs).toBe('measure'); + expect(role.median_v).toBe('measure'); + }); + + it('sees through casts/parens to the underlying aggregate', () => { + // Regression: a cast wraps the aggregate node, so checking only the top + // node type reported `COUNT(*)::int` / `SUM(x)::numeric` as dimensions. + const role = Object.fromEntries( + inferColumns( + `SELECT g, + COUNT(*)::int AS n, + SUM(amount)::numeric AS total, + (AVG(x))::float8 AS avgx, + (metadata->>'amount')::numeric AS amt + FROM events GROUP BY g` + ).map((c) => [c.name, c.role]) + ); + expect(role.g).toBe('dimension'); + expect(role.n).toBe('measure'); + expect(role.total).toBe('measure'); + expect(role.avgx).toBe('measure'); + // a cast of a NON-aggregate stays a dimension (no false positive) + expect(role.amt).toBe('dimension'); + }); + + it('returns [] for non-SELECT roots and genuinely unparseable SQL', () => { + // SELECT * → handled (star projection, no named columns) + expect(inferColumns('SELECT * FROM events')).toEqual([]); + // These parse SUCCESSFULLY to a non-`select` root, so they hit the + // non-select guard (not the parse-failure branch): + expect(inferColumns('this is not sql')).toEqual([]); // parses as a `not` node + expect(inferColumns('DROP TABLE x')).toEqual([]); // parses as `drop_table` + // These FAIL to parse → exercise the `!res.success` fail-open branch: + expect(inferColumns('SELECT (')).toEqual([]); + expect(inferColumns('SELECT FROM')).toEqual([]); + }); +}); + +describe('measureColumns', () => { + it('returns just the measure column names', () => { + expect( + measureColumns('SELECT company_id, SUM(amount) AS spend, COUNT(*) AS n FROM events GROUP BY 1').sort() + ).toEqual(['n', 'spend']); + }); +}); diff --git a/packages/server/src/utils/entity-link-upsert.ts b/packages/server/src/utils/entity-link-upsert.ts index e5a7a6bed..08e57fa86 100644 --- a/packages/server/src/utils/entity-link-upsert.ts +++ b/packages/server/src/utils/entity-link-upsert.ts @@ -215,8 +215,8 @@ async function createEntityWithIdentities(params: { // Resolve entity_type slug → entity_types(id). Same schema search path as // createEntity: try the entity's own org first, then any visibility='public' // catalog. First match wins. See createEntity for the slug-poisoning caveat. - const typeRow = await sql<{ id: number }>` - SELECT et.id + const typeRow = await sql<{ id: number; backing_sql: string | null }>` + SELECT et.id, et.backing_sql FROM entity_types et LEFT JOIN organization o ON o.id = et.organization_id WHERE et.slug = ${params.entityType} @@ -235,6 +235,16 @@ async function createEntityWithIdentities(params: { ); return null; } + // Derived (view-backed) types have no stored rows — skip auto-create (the + // view ignores any row this would insert). Mirrors createEntity's guard for + // this separate connector/link insert path. + if (typeRow[0].backing_sql) { + logger.warn( + { entityType: params.entityType, orgId: params.orgId }, + 'entity auto-create skipped: entity type is derived (a SQL view)' + ); + return null; + } const entityTypeId = typeRow[0].id; // Try a few slug variants to defuse improbable random collisions. diff --git a/packages/server/src/utils/entity-management.ts b/packages/server/src/utils/entity-management.ts index 45839d448..d9e9597c8 100644 --- a/packages/server/src/utils/entity-management.ts +++ b/packages/server/src/utils/entity-management.ts @@ -246,8 +246,8 @@ export async function createEntity( // visibility flips to admins; long-term the right fix is either an // explicit `is_catalog` flag on `organization` or per-agent `uses_catalog` // declarations narrowing the search scope. - const typeRow = await sql<{ id: number }>` - SELECT et.id + const typeRow = await sql<{ id: number; backing_sql: string | null }>` + SELECT et.id, et.backing_sql FROM entity_types et LEFT JOIN organization o ON o.id = et.organization_id WHERE et.slug = ${data.entity_type} @@ -265,6 +265,15 @@ export async function createEntity( 400 ); } + // A derived (view-backed) type has no stored rows — its data is its backing_sql + // view. Reject inserts here (the single chokepoint; covers tenant + public + // catalog types) so a row the view ignores can't be orphaned. + if (typeRow[0].backing_sql) { + throw new ToolUserError( + `Entity type '${data.entity_type}' is derived (a SQL view) and has no stored rows. Edit its backing view instead of creating entities.`, + 400 + ); + } const entityTypeId = typeRow[0].id; // Generate slug from name if not provided diff --git a/packages/server/src/utils/execute-data-sources.ts b/packages/server/src/utils/execute-data-sources.ts index 79160c14d..2045d0fba 100644 --- a/packages/server/src/utils/execute-data-sources.ts +++ b/packages/server/src/utils/execute-data-sources.ts @@ -8,21 +8,18 @@ * - Any other name → treated as an entity_type slug, filtered from entities * * Security: - * - SQL parsed via node-sql-parser to extract ALL table references + * - SQL parsed via @polyglot-sql/sdk to extract ALL table references * - Schema-qualified references (e.g. public.user) rejected outright * - Every table ref gets a CTE with org-scoping baked in * - READ ONLY transaction + timeout via sql.begin() * - FORBIDDEN_OPS regex as additional safeguard */ -import { createRequire } from 'node:module'; - -const _require = createRequire(import.meta.url); -const { Parser } = _require('node-sql-parser'); - +import { Dialect, ast, parse as parseSql } from '@polyglot-sql/sdk'; import type { DbClient } from '../db/client'; import logger from './logger'; import { + ADMIN_ONLY_QUERYABLE_TABLES, buildColumnList, type ColumnDef, QUERYABLE_TABLE_NAMES, @@ -49,58 +46,233 @@ const FORBIDDEN_OPS = /\b(COPY|IMPORT|PRAGMA|CALL)\b/i; const MAX_ROWS = 1000; const QUERY_TIMEOUT_MS = 5000; -const sqlParser = new Parser(); +type SqlNode = ast.Expression; -// ============================================ -// SQL Parsing -// ============================================ +/** Strip {{...}} template placeholders to a literal so the parser doesn't choke. */ +function stripPlaceholders(sql: string): string { + return sql.replace(/\{\{\w+(?:\.\w+)?\}\}/g, '0'); +} + +/** Parse to the top-level statement node, or undefined when the SQL won't parse. */ +function parseRoot(sql: string): SqlNode | undefined { + const res = parseSql(stripPlaceholders(sql), Dialect.PostgreSQL); + if (!res.success || !res.ast) return undefined; + return (Array.isArray(res.ast) ? res.ast[0] : res.ast) as SqlNode | undefined; +} + +/** Pull a bare identifier string out of polyglot's `{ name, quoted }` shapes. */ +function identName(value: unknown): string | null { + if (typeof value === 'string') return value; + if (value && typeof value === 'object') { + const o = value as Record; + if (typeof o.name === 'string') return o.name; + if (o.name && typeof o.name === 'object') return identName(o.name); + if (o.this) return identName(o.this); + } + return null; +} /** - * Extract all table references from a SQL query. - * Rejects schema-qualified references (e.g. public.users, pg_catalog.pg_roles). - * Filters out user-defined CTE names so they aren't treated as virtual tables. + * Collect every schema-qualified table reference (`schema.table`) in the parsed + * tree by recursing the RAW AST node graph. + * + * Security-critical: org-scoping shadows UNQUALIFIED table names with CTEs, so a + * schema-qualified ref (`public.connections`, `pg_catalog.*`) bypasses scoping + * and reads every org's rows. polyglot's `getTables`/`walk`/`findByType` only + * surface the FIRST `FROM` table — they do NOT descend into JOINs or + * sub-selects — so a qualified table in a join or subquery would slip past a + * node-enumeration check. A raw recursion over the node graph is the only + * reliable way to see them all. A polyglot table-ref node is shaped + * `{ name, schema, catalog, ... }`; `schema` is null when unqualified. + * + * Iterative (stack) traversal, NOT recursion: a recursion depth-cap would + * fail OPEN — a deeply-nested `public.oauth_tokens` past the cap would slip + * past and bypass scoping. The `seen` set bounds the walk on cyclic graphs. */ -function extractTableRefs(query: string): string[] { - // Replace {{...}} placeholders with a literal so the parser doesn't choke - const forParsing = query.replace(/\{\{\w+(?:\.\w+)?\}\}/g, '0'); +function collectSchemaQualifiedTables(root: unknown): string[] { + const seen = new Set(); + const hits: string[] = []; + const stack: unknown[] = [root]; + while (stack.length > 0) { + const node = stack.pop(); + if (!node || typeof node !== 'object') continue; + if (seen.has(node as object)) continue; + seen.add(node as object); + if (Array.isArray(node)) { + for (const child of node) stack.push(child); + continue; + } + const obj = node as Record; + if (obj.schema != null && obj.name != null && Object.hasOwn(obj, 'catalog')) { + hits.push(`${identName(obj.schema) ?? '?'}.${identName(obj.name) ?? '?'}`); + } + for (const key of Object.keys(obj)) stack.push(obj[key]); + } + return hits; +} - const tableList = sqlParser.tableList(forParsing, { database: 'PostgreSql' }); +/** + * Every table-ref name anywhere in the tree (lowercased) via the same raw walk. + * Security-critical: `ast.getTableNames` does NOT descend into subqueries nested + * inside an expression (e.g. `(CASE WHEN … THEN (SELECT … FROM oauth_tokens) …)`), + * so a table hidden there would be neither scoped nor admin-gated. This walk + * reaches them. Includes CTE-reference names (filtered out by the caller). + */ +function collectAllTableNames(root: unknown): string[] { + const seen = new Set(); + const names: string[] = []; + const stack: unknown[] = [root]; + while (stack.length > 0) { + const node = stack.pop(); + if (!node || typeof node !== 'object') continue; + if (seen.has(node as object)) continue; + seen.add(node as object); + if (Array.isArray(node)) { + for (const child of node) stack.push(child); + continue; + } + const obj = node as Record; + if (obj.name != null && Object.hasOwn(obj, 'catalog')) { + const n = identName(obj.name); + if (n) names.push(n.toLowerCase()); + } + for (const key of Object.keys(obj)) stack.push(obj[key]); + } + return names; +} - // Extract user-defined CTE names to exclude - const userCteNames = new Set(); - try { - const ast = sqlParser.astify(forParsing, { database: 'PostgreSql' }); - const astObj = (Array.isArray(ast) ? ast[0] : ast) as unknown as Record; - const withClause = astObj?.with as Array<{ name: { value: string } }> | undefined; - if (withClause) { - for (const cte of withClause) { - if (cte.name?.value) userCteNames.add(cte.name.value.toLowerCase()); +/** + * Names defined in every WITH clause in the tree (lowercased), incl. nested + * WITHs. A CTE name is a local alias, NOT a base table — it must be excluded + * from the scoping list (we'd otherwise inject a conflicting CTE) and from the + * admin gate (a `WITH events AS …` would otherwise be treated as the base table). + */ +function collectCteNames(root: unknown): Set { + const seen = new Set(); + const names = new Set(); + const stack: unknown[] = [root]; + while (stack.length > 0) { + const node = stack.pop(); + if (!node || typeof node !== 'object') continue; + if (seen.has(node as object)) continue; + seen.add(node as object); + if (Array.isArray(node)) { + for (const child of node) stack.push(child); + continue; + } + const obj = node as Record; + if (Array.isArray(obj.ctes)) { + for (const cte of obj.ctes) { + const alias = identName((cte as Record)?.alias); + if (alias) names.add(alias.toLowerCase()); } } - } catch { - // If AST parsing fails, we still have tableList — just can't filter CTEs + for (const key of Object.keys(obj)) stack.push(obj[key]); } + return names; +} - const tables = new Set(); - for (const ref of tableList) { - // Format: "operation::schema::table" - const parts = ref.split('::'); - const schema = parts[1]; - const table = parts[2]; - - if (schema && schema !== 'null' && schema !== '') { - throw new Error(`Schema-qualified table references are not allowed: ${schema}.${table}`); +/** + * Strip leading whitespace + SQL comments (line `--…` and block `/* … *​/`) with + * a single linear scan. Deliberately NOT a regex: a comment-stripping regex with + * nested quantifiers (`(?:--…|/*…*​/)*`) backtracks catastrophically on crafted + * input (e.g. many unclosed `/*`) — a ReDoS DoS, since this runs on member SQL. + */ +export function stripLeadingComments(sql: string): string { + const n = sql.length; + let i = 0; + for (;;) { + while (i < n && /\s/.test(sql[i])) i++; + if (sql.startsWith('--', i)) { + const nl = sql.indexOf('\n', i); + if (nl === -1) return ''; + i = nl + 1; + } else if (sql.startsWith('/*', i)) { + const end = sql.indexOf('*/', i); + if (end === -1) return ''; + i = end + 2; + } else { + return sql.slice(i); } + } +} - if (table) { - const lower = table.toLowerCase(); - if (!userCteNames.has(lower)) { - tables.add(lower); - } +// A read query is SELECT or WITH … SELECT, after any leading comments. Rejects +// DML/DDL prefixes AND PostgreSQL's `TABLE ` shorthand (≡ SELECT * FROM +// ) — polyglot mis-parses the latter as a column, so it yields no table +// refs and would otherwise pass through unscoped. +export function isReadQuery(sql: string): boolean { + return /^(SELECT|WITH)\b/i.test(stripLeadingComments(sql)); +} + +// ============================================ +// SQL Parsing +// ============================================ + +/** + * Extract the COMPLETE set of base-table references a query reads, lowercased — + * the list that must each be wrapped in an org-scoping CTE. + * + * Why this is more than `ast.getTableNames`: the @polyglot-sql/sdk migration's + * scoping relied on `getTableNames`, which has TWO blind spots that each leak + * (found by the adversarial bug-hunt): + * 1. It does not descend into subqueries nested inside an EXPRESSION — e.g. + * `(CASE WHEN … THEN (SELECT … FROM oauth_tokens) END)` or a scalar + * `SELECT (SELECT … FROM events) …`. Such a table was left unscoped. + * 2. PostgreSQL's `TABLE ` shorthand mis-parses as a column, yielding no + * refs at all (handled by the SELECT/WITH guard in validateAndScopeQuery). + * + * Strategy here: + * - Reject multiple statements and schema-qualified refs (both bypass scoping). + * - Build the ref set from a raw AST walk (`collectAllTableNames`), which is a + * strict SUPERSET of `ast.getTableNames` (verified across diverse shapes) and + * additionally reaches expression-nested subqueries getTableNames misses. The + * completeness-invariant test suite guards this against parser changes. + * - Exclude CTE names (local aliases, not base tables). + * - FAIL-CLOSED collision guard: reject a query whose CTE name shadows a real + * or admin table. The parser cannot tell, by lexical scope, whether `events` + * in `WITH events AS (SELECT … FROM events)` is the CTE or the base table — + * so we forbid the ambiguity rather than risk an unscoped base-table read. + */ +function extractTableRefs(query: string): string[] { + const res = parseSql(stripPlaceholders(query), Dialect.PostgreSQL); + if (!res.success || !res.ast) { + throw new Error('Could not parse SQL query for table extraction'); + } + // Reject multiple statements: org-scoping CTEs only wrap the FIRST statement, + // so a trailing `; SELECT … FROM public.oauth_tokens` would run unscoped. + const statements = Array.isArray(res.ast) ? res.ast : [res.ast]; + if (statements.length > 1) { + throw new Error('Multiple SQL statements are not allowed; provide a single query.'); + } + const root = statements[0] as SqlNode; + + // Reject schema-qualified references anywhere in the tree (joins, subqueries, + // CTE bodies, UNION branches) — they bypass the org-scoping CTEs. + const qualified = collectSchemaQualifiedTables(root); + if (qualified.length > 0) { + throw new Error( + `Schema-qualified table references are not allowed: ${[...new Set(qualified)].join(', ')}` + ); + } + + const cteNames = collectCteNames(root); + // A CTE may not shadow a real/admin table name — see fail-closed note above. + for (const cte of cteNames) { + if (QUERYABLE_TABLE_NAMES.has(cte) || ADMIN_ONLY_QUERYABLE_TABLES.has(cte)) { + throw new Error( + `CTE name '${cte}' collides with a reserved table name; rename the CTE.` + ); } } - return Array.from(tables); + // Every base-table ref via the raw walk (superset of getTableNames, incl. + // expression-nested), minus CTE names (local aliases, not base tables). + const refs = new Set(); + for (const n of collectAllTableNames(root)) { + if (!cteNames.has(n)) refs.add(n); + } + return Array.from(refs); } // ============================================ @@ -113,7 +285,7 @@ function extractTableRefs(query: string): string[] { * Validation pipeline: * 1. validateTableQuery() — @polyglot-sql/sdk parses the SQL and checks * all table/column references against the allowlisted schema - * 2. extractTableRefs() — node-sql-parser AST extracts table names + * 2. extractTableRefs() — @polyglot-sql/sdk AST extracts table names * 3. buildScopedQuery() — wraps each table reference in an org-scoped CTE * * Throws on any validation failure. @@ -121,26 +293,55 @@ function extractTableRefs(query: string): string[] { export function validateAndScopeQuery( rawSql: string, organizationId: string, - options?: { safeColumns?: Map } + options?: { + safeColumns?: Map; + /** + * Tables the caller may NOT reference (rejected even though they're in the + * global allowlist). Used to keep auth/identity tables (oauth_tokens, + * oauth_clients, user) admin-only when a non-admin runs query_sql / + * metric_series. Omit for admin / server-internal callers (full access). + */ + restrictedTables?: ReadonlySet; + } ): { sql: string; params: unknown[] } { const trimmed = rawSql.trim(); if (!trimmed) { throw new Error('SQL query is required'); } + // Must be a read query. Rejects DML/DDL AND PostgreSQL's `TABLE ` + // shorthand (≡ `SELECT * FROM `) — polyglot mis-parses `TABLE` as a + // column, so it would yield zero table refs and pass through UNSCOPED and + // past the admin-table gate. The gate below is fail-closed regardless, but + // rejecting the shorthand outright keeps the contract obvious. + if (!isReadQuery(trimmed)) { + throw new Error('Only SELECT / WITH queries are allowed.'); + } + // Schema-level validation via SQL parser (rejects unknown tables/columns, mutations, etc.) const validation = validateTableQuery(trimmed); if (!validation.valid) { throw new Error(validation.errors.join('; ')); } - // AST-based table extraction + // COMPLETE table extraction (union of getTableNames + raw walk, CTE names + // excluded). Drives the unknown-table check, the admin gate, AND org-scoping, + // so an expression-nested table is caught by all three. const tableRefs = extractTableRefs(trimmed); const unknown = tableRefs.filter((t) => !QUERYABLE_TABLE_NAMES.has(t)); if (unknown.length > 0) { throw new Error(`Unknown table(s): ${unknown.join(', ')}`); } + if (options?.restrictedTables) { + const blocked = tableRefs.filter((t) => options.restrictedTables?.has(t)); + if (blocked.length > 0) { + throw new Error( + `Table(s) require admin access: ${[...new Set(blocked)].join(', ')}` + ); + } + } + return buildScopedQuery(trimmed, tableRefs, { organizationId }, options); } @@ -371,12 +572,17 @@ function buildScopedQuery( if (ctes.length === 0) return { sql: processedQuery, params }; const cteStr = `WITH ${ctes.join(',\n')}`; - const trimmed = processedQuery.trim(); - - // If user query starts with WITH, merge CTEs (no duplicate WITH keyword) - const finalSql = /^WITH\b/i.test(trimmed) - ? `${cteStr},\n${trimmed.replace(/^WITH\s+/i, '')}` - : `${cteStr}\n${trimmed}`; + // Strip leading line/block comments before deciding how to merge: a + // `-- note\nWITH x AS (…) …` query is still a WITH, and prepending a second + // WITH keyword would emit invalid SQL (`WITH … \n -- note \n WITH x …`). + // Comments are cosmetic, so dropping the leading ones is safe. + const body = stripLeadingComments(processedQuery).trim(); + + // If the user query is itself a WITH, splice our CTEs in front of its CTE list + // (single WITH keyword); otherwise prepend our WITH block. + const finalSql = /^WITH\b/i.test(body) + ? `${cteStr},\n${body.replace(/^WITH\s+/i, '')}` + : `${cteStr}\n${body}`; return { sql: finalSql, params }; } @@ -404,32 +610,26 @@ function buildScopedQuery( */ export function queryProjectsIdColumn(query: string): boolean { try { - const forParsing = query.trim().replace(/\{\{\w+(?:\.\w+)?\}\}/g, '0'); - const ast = sqlParser.astify(forParsing, { database: 'PostgreSql' }); - const stmt = (Array.isArray(ast) ? ast[0] : ast) as Record | undefined; - const columns = stmt?.columns; - if (!Array.isArray(columns)) { - // `SELECT *` is sometimes represented as a non-array; treat unknown - // shapes as "has id" so we never block a save we can't analyze. - return true; - } - for (const col of columns as Array>) { - const as = col.as; - if (typeof as === 'string' && as.toLowerCase() === 'id') return true; - - const expr = col.expr as Record | undefined; - if (!expr || expr.type !== 'column_ref') continue; - - const column = expr.column; + const root = parseRoot(query); + // Treat any shape we can't analyze (parse failure, non-SELECT) as "has id" + // so we never block a save on a parser edge case. + if (!root || ast.getExprType(root) !== 'select') return true; + const projection = (ast.getExprData(root) as { expressions?: unknown[] }).expressions; + if (!Array.isArray(projection)) return true; + + for (const item of projection as SqlNode[]) { + const itemType = ast.getExprType(item); // Star projection: `*` or `alias.*` - if (column === '*') return true; - // Bare column reference: { expr: { value: 'id' } } - const name = - typeof column === 'string' - ? column - : ((column as Record)?.expr as Record | undefined) - ?.value; - if (typeof name === 'string' && name.toLowerCase() === 'id') return true; + if (itemType === 'star' || ast.isStar?.(item)) return true; + if (itemType === 'alias') { + const d = ast.getExprData(item) as Record; + if (identName(d.alias)?.toLowerCase() === 'id') return true; // ... AS id + const inner = (d.this ?? d.expr) as SqlNode | undefined; + if (inner && (ast.getExprType(inner) === 'star' || ast.isStar?.(inner))) return true; + } else if (itemType === 'column') { + if (identName((ast.getExprData(item) as Record).name)?.toLowerCase() === 'id') + return true; // bare `id` + } } return false; } catch { @@ -452,8 +652,8 @@ export function validateDataSourceQuery(name: string, query: string, parse = fal } if (parse) { try { - const forParsing = trimmed.replace(/\{\{\w+(?:\.\w+)?\}\}/g, '0'); - sqlParser.astify(forParsing, { database: 'PostgreSql' }); + const res = parseSql(stripPlaceholders(trimmed), Dialect.PostgreSQL); + if (!res.success) throw new Error(res.error ?? 'could not parse query'); extractTableRefs(trimmed); } catch (err) { throw new Error(`Data source '${name}': ${err instanceof Error ? err.message : String(err)}`); diff --git a/packages/server/src/utils/infer-measures.ts b/packages/server/src/utils/infer-measures.ts new file mode 100644 index 000000000..c4dda4bc2 --- /dev/null +++ b/packages/server/src/utils/infer-measures.ts @@ -0,0 +1,149 @@ +/** + * Classify a derived entity view's output columns as measures or dimensions — + * computed ON READ, never persisted. + * + * A derived entity type is a SQL view; each output column is either a MEASURE + * (an aggregate / computed numeric) or a DIMENSION (a plain grouping column). + * The UI uses this to tag/right-align measure columns. We do NOT write these + * roles into metadata_schema — persisting an inferred superset only created + * apply-diff churn; the single consumer (the UI) reads them at query time. + * + * Parsing uses @polyglot-sql/sdk — the same engine `validateAndScopeQuery` uses. + * The SDK auto-initialises on (ESM) import, so the synchronous `parse` works + * without an explicit init step. + */ +import { Dialect, ast, parse } from '@polyglot-sql/sdk'; + +type Node = ast.Expression; + +export interface DerivedColumn { + name: string; + role: 'measure' | 'dimension'; +} + +// A projection column is a MEASURE when its expression is an aggregate or a +// computed numeric (ratio). polyglot reports canonical aggregates by name +// (sum/count/…) and everything else as the generic `aggregate_function` / +// `within_group` (e.g. bool_or, jsonb_agg, percentile_cont … WITHIN GROUP); +// `div` covers ratios. A bare column (or a cast of one) is a dimension — but a +// cast of an aggregate (`COUNT(*)::int`, `SUM(x)::numeric`) is still a measure, +// so we peel cast/paren wrappers before classifying (see unwrapWrappers). +const MEASURE_EXPR_TYPES = new Set([ + 'sum', + 'count', + 'avg', + 'min', + 'max', + 'median', + 'mode', + 'approx_distinct', + 'approx_count_distinct', + 'count_if', + 'sum_if', + 'group_concat', + 'string_agg', + 'list_agg', + 'array_agg', + 'stddev', + 'variance', + 'first', + 'last', + 'any_value', + // polyglot's generic aggregate node types + 'aggregate_function', + 'within_group', + // computed numeric (ratios) + 'div', +]); + +/** + * Peel `cast` / `paren` wrappers off a projection value so an aggregate hidden + * under them (`COUNT(*)::int`, `(SUM(x))::numeric`) is still classified by its + * underlying type. Stops at the first non-wrapper node. Bounded to guard against + * a pathological/cyclic tree. + */ +function unwrapWrappers(node: Node): Node { + let cur = node; + for (let i = 0; i < 24; i++) { + let type: string; + try { + type = ast.getExprType(cur); + } catch { + return cur; + } + if (type !== 'cast' && type !== 'paren') return cur; + const inner = (ast.getExprData(cur) as Record).this; + if (!inner || typeof inner !== 'object') return cur; + cur = inner as Node; + } + return cur; +} + +/** Pull a bare identifier string out of polyglot's `{ name, quoted }` shapes. */ +function identName(value: unknown): string | null { + if (typeof value === 'string') return value; + if (value && typeof value === 'object') { + const o = value as Record; + if (typeof o.name === 'string') return o.name; + if (o.name && typeof o.name === 'object') return identName(o.name); + if (o.this) return identName(o.this); + } + return null; +} + +/** + * Classify each projection column of a derived entity's view SELECT. + * Returns `[]` if the SQL can't be parsed or isn't a simple projection. + */ +export function inferColumns(sql: string): DerivedColumn[] { + // Strip {{...}} template placeholders so the parser doesn't choke. + const forParsing = sql.trim().replace(/\{\{\w+(?:\.\w+)?\}\}/g, '0'); + + let root: Node | undefined; + try { + const res = parse(forParsing, Dialect.PostgreSQL); + if (!res.success || !res.ast) return []; + root = (Array.isArray(res.ast) ? res.ast[0] : res.ast) as Node | undefined; + } catch { + return []; + } + if (!root || ast.getExprType(root) !== 'select') return []; + + const projection = (ast.getExprData(root) as { expressions?: unknown[] }).expressions; + if (!Array.isArray(projection)) return []; + + const out: DerivedColumn[] = []; + for (const item of projection as Node[]) { + const itemType = ast.getExprType(item); + + let nameSrc: unknown; + let valueExpr: Node = item; + if (itemType === 'alias') { + const d = ast.getExprData(item) as Record; + nameSrc = d.alias; + valueExpr = (d.this ?? d.expr ?? item) as Node; + } else if (itemType === 'column') { + nameSrc = (ast.getExprData(item) as Record).name; + } else { + continue; // star / literal / unnamed projection + } + + const name = identName(nameSrc); + if (!name || name === '*') continue; + + out.push({ + name, + role: MEASURE_EXPR_TYPES.has(ast.getExprType(unwrapWrappers(valueExpr))) + ? 'measure' + : 'dimension', + }); + } + return out; +} + +/** Names of the view's measure columns (computed on read; never persisted). */ +export function measureColumns(sql: string): string[] { + return inferColumns(sql) + .filter((c) => c.role === 'measure') + .map((c) => c.name); +} diff --git a/packages/server/src/utils/table-schema.ts b/packages/server/src/utils/table-schema.ts index 6c566cb09..5002034e0 100644 --- a/packages/server/src/utils/table-schema.ts +++ b/packages/server/src/utils/table-schema.ts @@ -9,7 +9,7 @@ * is bypassed */ -import { validateWithSchema } from '@polyglot-sql/sdk'; +import { Dialect, ast, parse, validateWithSchema } from '@polyglot-sql/sdk'; export interface ColumnDef { name: string; @@ -348,6 +348,20 @@ export const QUERYABLE_SCHEMA = { export const QUERYABLE_TABLE_NAMES = new Set(QUERYABLE_SCHEMA.tables.map((t) => t.name)); +/** + * Queryable tables that stay OWNER/ADMIN-only even when query_sql / metric_series + * are member-accessible: the auth + identity tables. Members can read the + * org's operational data (entities, events, connections, feeds, watchers, …) but + * not enumerate every OAuth token/app or the full user roster. Secret columns + * (credentials, client_secret, token_hash, email, phone) are already excluded + * from the schema above; this is the table-level guard on top of that. + */ +export const ADMIN_ONLY_QUERYABLE_TABLES: ReadonlySet = new Set([ + 'oauth_tokens', + 'oauth_clients', + 'user', +]); + /** table name → column definitions for use in CTE SELECT. * Columns with `expr` are derived from JSONB and need special handling. */ export const SAFE_COLUMN_DEFS = new Map( @@ -371,11 +385,54 @@ export function buildColumnList(defs: ColumnDef[], alias?: string): string { * Validate that SQL only references allowed tables and columns. * E200 = unknown table, E201 = unknown column. */ +/** + * Output-column aliases of the query's top-level SELECT. `ORDER BY`/`GROUP BY` + * may reference these (valid SQL), but the schema validator treats them as + * unknown columns — so we use this to suppress those false positives. + * Best-effort: empty on parse failure. Only matches the alias NAME, never the + * underlying expression, so an excluded column referenced as `SELECT credentials + * AS x` is still rejected (the `credentials` reference itself fails). + */ +function outputAliases(sql: string): Set { + const out = new Set(); + try { + const res = parse(sql, Dialect.PostgreSQL); + if (!res.success || !res.ast) return out; + const root = Array.isArray(res.ast) ? res.ast[0] : res.ast; + const data = ast.getExprData(root) as { expressions?: unknown[] }; + for (const item of data.expressions ?? []) { + try { + if (ast.getExprType(item as never) !== 'alias') continue; + const a = (ast.getExprData(item as never) as { alias?: unknown }).alias; + const name = + typeof a === 'string' + ? a + : ((a as { name?: string; value?: string })?.name ?? + (a as { value?: string })?.value); + if (typeof name === 'string' && name) out.add(name.toLowerCase()); + } catch { + // skip malformed projection item + } + } + } catch { + // unparseable → no aliases; validator errors stand + } + return out; +} + export function validateTableQuery(sql: string): { valid: boolean; errors: string[] } { const result = validateWithSchema(sql, QUERYABLE_SCHEMA, 'postgresql', { checkReferences: true }); - const errors = (result.errors ?? []).filter( - (e: { code: string }) => e.code === 'E200' || e.code === 'E201' - ); + const aliases = outputAliases(sql); + const errors = (result.errors ?? []).filter((e: { code: string; message: string }) => { + // E201 = unknown column. A reference to the query's OWN output alias (e.g. + // `SELECT count(*) AS n … ORDER BY n`) is valid SQL, not an unknown column. + if (e.code === 'E201') { + const m = /Unknown column '([^']+)'/i.exec(e.message ?? ''); + if (m && aliases.has(m[1].toLowerCase())) return false; + return true; + } + return e.code === 'E200'; // unknown table — always a real error + }); return { valid: errors.length === 0, errors: errors.map((e: { message: string }) => e.message),