-
Notifications
You must be signed in to change notification settings - Fork 0
perf(logsdb): query optimization, shared code extraction, CI improvements #219
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
6e08b77
d19a19f
bd0645b
ddf16e5
c98eece
c01a536
bd6d0d9
1966013
064cb0e
268bf2c
718b180
1c34d0b
fb7da13
fa98631
d3cfefb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,7 @@ | |
| */ | ||
|
|
||
| import type { Chunk } from "./chunk.js"; | ||
| import { readBodiesOnly, readRecords } from "./chunk.js"; | ||
| import { readRecords, readRecordsFromRaw } from "./chunk.js"; | ||
| import type { LogStore } from "./engine.js"; | ||
| import type { LogRecord, StreamId } from "./types.js"; | ||
|
|
||
|
|
@@ -147,39 +147,24 @@ export function* queryStream( | |
| } | ||
|
|
||
| if (useBodyFastPath) { | ||
| // Template-token pruning: if the chunk header carries template | ||
| // literal tokens (toks), check if any token contains the needle | ||
| // as a substring. If no template token matches AND the chunk has | ||
| // no raw-string bodies (raw strings might still match), we can | ||
| // skip ZSTD decompression entirely. | ||
| const needle = spec.bodyContains; | ||
| if (needle !== undefined && chunkPrunedByTemplateTokens(chunk, needle)) { | ||
| stats.chunksPruned++; | ||
| continue; | ||
| } | ||
| // Fast path: decode only bodies, check which match the | ||
| // substring. Only do full decode if there are body matches. | ||
| // biome-ignore lint/style/noNonNullAssertion: guarded by useBodyFastPath check above | ||
| const needle = spec.bodyContains!; | ||
| // Phase 1: Raw byte scan. Decompress and check if the needle's | ||
| // UTF-8 bytes exist anywhere in the payload. If not, no body in | ||
| // this chunk can contain the needle — skip without any string | ||
| // construction or template reconstruction. | ||
| const t0 = nowMillis(); | ||
| const bodies = readBodiesOnly(chunk, store.registry, policy); | ||
| let hasMatch = false; | ||
| for (let i = 0; i < bodies.length; i++) { | ||
| if ( | ||
| typeof bodies[i] === "string" && | ||
| needle !== undefined && | ||
| (bodies[i] as string).includes(needle) | ||
| ) { | ||
| hasMatch = true; | ||
| break; | ||
| } | ||
| } | ||
| if (!hasMatch) { | ||
| // No body in this chunk matches — skip full decode entirely | ||
| const codec = store.registry.get(chunk.header.codecName); | ||
| const raw = codec.decode(chunk.payload); | ||
| if (!rawPayloadContains(raw, needle)) { | ||
| stats.decodeMillis += nowMillis() - t0; | ||
| stats.chunksPruned++; | ||
| continue; | ||
| } | ||
| // Some bodies match — need full records for time/severity post-filtering | ||
| const records = readRecords(chunk, store.registry, policy); | ||
| // Phase 2: Raw bytes matched — do full decode and filter inline. | ||
| // We skip the body-only intermediate decode because if the chunk | ||
| // passes the raw scan, we'll need full records anyway for yielding. | ||
| const records = readRecordsFromRaw(raw, chunk.header, policy); | ||
| stats.decodeMillis += nowMillis() - t0; | ||
| for (const record of records) { | ||
| stats.recordsScanned++; | ||
|
|
@@ -279,51 +264,54 @@ function chunkPassesSeverity(chunk: Chunk, severityGte?: number): boolean { | |
| } | ||
|
|
||
| /** | ||
| * Template-token pruning for bodyContains. If the chunk header carries | ||
| * template literal tokens (TypedColumnarDrainPolicy stores these in | ||
| * codecMeta.toks), check if any token contains the needle as a | ||
| * substring. If no template token can match AND the chunk metadata | ||
| * confirms zero raw-string bodies, we can skip ZSTD decompression. | ||
| * | ||
| * SOUNDNESS: We can only prune when BOTH conditions hold: | ||
| * 1. No template literal token contains the needle | ||
| * 2. The chunk has zero raw-string bodies (rawCount === 0) | ||
| * Raw byte scan: check if the needle's UTF-8 bytes appear anywhere | ||
| * in the decompressed payload buffer. This is a sound negative filter: | ||
| * if the bytes aren't found, no body string in this chunk can contain | ||
| * the needle. False positives are possible (the needle bytes might | ||
| * appear in template dictionary metadata, slot type headers, etc.) | ||
| * but are rare and handled by the subsequent per-record check. | ||
|
Comment on lines
267
to
+273
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Edge Case: Raw byte scan false negative for JSON-escaped characters in NDJSONEven for NDJSON payloads, This is a minor edge case because Was this helpful? React with 👍 / 👎 | Reply |
||
| * | ||
| * Even when pruning, variable values (PARAM_STR slots) could still | ||
| * contain the needle — but those aren't part of template *literals*. | ||
| * The body is reconstructed as: literal + variable + literal + ... | ||
| * So if no literal contains the needle AND the needle doesn't span a | ||
| * literal/variable boundary, we'd need to also check variable columns. | ||
| * Since checking variables requires decompression anyway, template- | ||
| * token pruning is only effective for needles that MUST appear in a | ||
| * template literal (not in a variable slot) to produce a match. | ||
| * | ||
| * CONSERVATIVE: returns false (don't prune) when unsure. | ||
| * Cost: ~0.003ms for a 86KB buffer when Buffer is available (SIMD), | ||
| * ~0.01ms with manual scan. Compare to full decodeBodies at ~0.6ms. | ||
| */ | ||
| function chunkPrunedByTemplateTokens(chunk: Chunk, needle: string): boolean { | ||
| const meta = chunk.header.codecMeta as { toks?: string[]; rawCount?: number } | undefined; | ||
| if (!meta?.toks) return false; // no token data — can't prune | ||
|
|
||
| // If there are raw-string bodies, they could contain anything | ||
| if (meta.rawCount === undefined || meta.rawCount > 0) return false; | ||
| const enc = new TextEncoder(); | ||
| const needleCache = new Map<string, Uint8Array>(); | ||
| const NEEDLE_CACHE_MAX = 64; | ||
|
|
||
| // Check if any template literal token contains the needle | ||
| for (const tok of meta.toks) { | ||
| if (tok.includes(needle)) return false; // might match — don't prune | ||
| function rawPayloadContains(raw: Uint8Array, needle: string): boolean { | ||
| let needleBytes = needleCache.get(needle); | ||
| if (!needleBytes) { | ||
| needleBytes = enc.encode(needle); | ||
| if (needleCache.size >= NEEDLE_CACHE_MAX) needleCache.clear(); | ||
| needleCache.set(needle, needleBytes); | ||
| } | ||
| return uint8IndexOf(raw, needleBytes) !== -1; | ||
| } | ||
|
|
||
| // No template token contains the needle AND there are zero raw strings. | ||
| // However, the reconstructed body is: tok0 + var0 + tok1 + var1 + ... | ||
| // If the needle could span a tok/var boundary, we can't prune. | ||
| // Safe to prune only if the needle can't be split across boundaries. | ||
| // Since we don't track variable values at the header level, we can | ||
| // NOT safely prune — variable values might contain the needle. | ||
| // Template-token pruning is only sound for needles that match a | ||
| // complete template token (not a substring of a variable). | ||
| // | ||
| // DISABLED: This optimization requires bloom filters or variable- | ||
| // value token sets in the header to be sound. For now, return false. | ||
| return false; | ||
| /** Portable Uint8Array substring search (works in browser + Node). */ | ||
| function uint8IndexOf(haystack: Uint8Array, needle: Uint8Array): number { | ||
| // Use Buffer.includes when available (Node) — it's SIMD-optimized | ||
| if (typeof globalThis.Buffer !== "undefined") { | ||
| const hBuf = globalThis.Buffer.from(haystack.buffer, haystack.byteOffset, haystack.byteLength); | ||
| return hBuf.indexOf( | ||
| globalThis.Buffer.from(needle.buffer, needle.byteOffset, needle.byteLength) | ||
| ); | ||
| } | ||
| // Browser fallback: simple byte-at-a-time search | ||
| const hLen = haystack.length; | ||
| const nLen = needle.length; | ||
| if (nLen === 0) return 0; | ||
| if (nLen > hLen) return -1; | ||
| const first = needle[0] as number; | ||
| const limit = hLen - nLen; | ||
| outer: for (let i = 0; i <= limit; i++) { | ||
| if (haystack[i] !== first) continue; | ||
| for (let j = 1; j < nLen; j++) { | ||
| if (haystack[i + j] !== needle[j]) continue outer; | ||
| } | ||
| return i; | ||
| } | ||
| return -1; | ||
| } | ||
|
|
||
| /** Per-record filter — applied after chunk decode. */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import { describe, it, expect } from "vitest"; | ||
| import { defaultClassifier, TemplatedClassifier } from "../src/classify.js"; | ||
| import type { TemplateExtractor } from "../src/classify.js"; | ||
| import type { LogRecord } from "../src/types.js"; | ||
|
|
||
| function makeRecord(body: unknown): LogRecord { | ||
| return { | ||
| timeUnixNano: 1000000000n, | ||
| severityNumber: 9, | ||
| severityText: "INFO", | ||
| body: body as LogRecord["body"], | ||
| attributes: [], | ||
| }; | ||
| } | ||
|
|
||
| describe("classifyShape via defaultClassifier", () => { | ||
| it("null → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(null))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("string → freetext", () => { | ||
| expect(defaultClassifier.classify(makeRecord("hello world"))).toBe("freetext"); | ||
| }); | ||
|
|
||
| it("empty string → freetext", () => { | ||
| expect(defaultClassifier.classify(makeRecord(""))).toBe("freetext"); | ||
| }); | ||
|
|
||
| it("number → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(42))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("NaN → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(NaN))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("bigint → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(123456789012345678n))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("boolean true → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(true))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("boolean false → primitive", () => { | ||
| expect(defaultClassifier.classify(makeRecord(false))).toBe("primitive"); | ||
| }); | ||
|
|
||
| it("Uint8Array → bytes", () => { | ||
| expect(defaultClassifier.classify(makeRecord(new Uint8Array([1, 2, 3])))).toBe("bytes"); | ||
| }); | ||
|
|
||
| it("empty Uint8Array → bytes", () => { | ||
| expect(defaultClassifier.classify(makeRecord(new Uint8Array(0)))).toBe("bytes"); | ||
| }); | ||
|
|
||
| it("array → kvlist", () => { | ||
| expect(defaultClassifier.classify(makeRecord([1, 2, 3]))).toBe("kvlist"); | ||
| }); | ||
|
|
||
| it("empty array → kvlist", () => { | ||
| expect(defaultClassifier.classify(makeRecord([]))).toBe("kvlist"); | ||
| }); | ||
|
|
||
| it("plain object → kvlist", () => { | ||
| expect(defaultClassifier.classify(makeRecord({ key: "value" }))).toBe("kvlist"); | ||
| }); | ||
|
|
||
| it("empty object → kvlist", () => { | ||
| expect(defaultClassifier.classify(makeRecord({}))).toBe("kvlist"); | ||
| }); | ||
|
|
||
| it("undefined → primitive (fallback)", () => { | ||
| expect(defaultClassifier.classify(makeRecord(undefined))).toBe("primitive"); | ||
| }); | ||
| }); | ||
|
|
||
| describe("TemplatedClassifier", () => { | ||
| function makeExtractor(templates: Map<string, number>): TemplateExtractor { | ||
| return { | ||
| matchTemplate(s: string) { | ||
| const id = templates.get(s); | ||
| if (id === undefined) return undefined; | ||
| return { templateId: id, vars: [] }; | ||
| }, | ||
| matchOrAdd(s: string) { | ||
| const id = templates.get(s); | ||
| if (id !== undefined) return { templateId: id, vars: [], isNew: false }; | ||
| const newId = templates.size; | ||
| templates.set(s, newId); | ||
| return { templateId: newId, vars: [], isNew: true }; | ||
| }, | ||
| templateCount: () => templates.size, | ||
| templates: function* () { | ||
| for (const [template, id] of templates) yield { id, template }; | ||
| }, | ||
| }; | ||
| } | ||
|
|
||
| it("returns 'templated' when extractor matches string body", () => { | ||
| const ext = makeExtractor(new Map([["Connection from <*>", 0]])); | ||
| const cls = new TemplatedClassifier(ext); | ||
| expect(cls.classify(makeRecord("Connection from <*>"))).toBe("templated"); | ||
| }); | ||
|
|
||
| it("returns 'freetext' when extractor does NOT match string body", () => { | ||
| const ext = makeExtractor(new Map([["Other template", 0]])); | ||
| const cls = new TemplatedClassifier(ext); | ||
| expect(cls.classify(makeRecord("totally different text"))).toBe("freetext"); | ||
| }); | ||
|
|
||
| it("returns non-string shapes without consulting extractor", () => { | ||
| let called = false; | ||
| const ext = makeExtractor(new Map()); | ||
| const origMatch = ext.matchTemplate.bind(ext); | ||
| ext.matchTemplate = (s: string) => { | ||
| called = true; | ||
| return origMatch(s); | ||
| }; | ||
| const cls = new TemplatedClassifier(ext); | ||
|
|
||
| expect(cls.classify(makeRecord(42))).toBe("primitive"); | ||
| expect(called).toBe(false); | ||
|
|
||
| expect(cls.classify(makeRecord({ x: 1 }))).toBe("kvlist"); | ||
| expect(called).toBe(false); | ||
|
|
||
| expect(cls.classify(makeRecord(new Uint8Array(4)))).toBe("bytes"); | ||
| expect(called).toBe(false); | ||
| }); | ||
|
|
||
| it("returns 'primitive' for null without consulting extractor", () => { | ||
| const ext = makeExtractor(new Map()); | ||
| const cls = new TemplatedClassifier(ext); | ||
| expect(cls.classify(makeRecord(null))).toBe("primitive"); | ||
| }); | ||
| }); |
Uh oh!
There was an error while loading. Please reload this page.