Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions packages/o11ylogsdb/src/chunk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,25 @@ export function readRecords(
): LogRecord[] {
const codec = registry.get(chunk.header.codecName);
const raw = codec.decode(chunk.payload);
return readRecordsFromRaw(raw, chunk.header, policy);
}

/**
* Decode records from an already-decompressed payload buffer. Use this
* when the caller has already decompressed (e.g. in the query engine's
* raw-byte-scan path) to avoid double decompression.
*/
export function readRecordsFromRaw(
raw: Uint8Array,
header: ChunkHeader,
policy?: ChunkPolicy
): LogRecord[] {
if (policy?.decodePayload) {
return policy.decodePayload(raw, chunk.header.nLogs, chunk.header.codecMeta);
return policy.decodePayload(raw, header.nLogs, header.codecMeta);
}
const decoded = decodeNdjsonRecords(raw, chunk.header.nLogs);
const decoded = decodeNdjsonRecords(raw, header.nLogs);
if (policy?.postDecode) {
return policy.postDecode(decoded, chunk.header.codecMeta);
return policy.postDecode(decoded, header.codecMeta);
}
return decoded;
}
Expand Down Expand Up @@ -333,8 +346,8 @@ function toJsonable(r: LogRecord): JsonableRecord {
if (r.flags !== undefined) out.f = r.flags;
if (r.traceId) out.ti = bytesToHex(r.traceId);
if (r.spanId) out.si = bytesToHex(r.spanId);
if (r.eventName) out.e = r.eventName;
if (r.droppedAttributesCount) out.d = r.droppedAttributesCount;
if (r.eventName !== undefined) out.e = r.eventName;
if (r.droppedAttributesCount !== undefined) out.d = r.droppedAttributesCount;
return out;
}

Expand All @@ -353,8 +366,8 @@ function fromJsonable(j: JsonableRecord): LogRecord {
if (j.f !== undefined) out.flags = j.f;
if (j.ti) out.traceId = hexToBytes(j.ti);
if (j.si) out.spanId = hexToBytes(j.si);
if (j.e) out.eventName = j.e;
if (j.d) out.droppedAttributesCount = j.d;
if (j.e !== undefined) out.eventName = j.e;
if (j.d !== undefined) out.droppedAttributesCount = j.d;
return out;
}

Expand Down
1 change: 1 addition & 0 deletions packages/o11ylogsdb/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export {
deserializeChunk,
readBodiesOnly,
readRecords,
readRecordsFromRaw,
serializeChunk,
} from "./chunk.js";
export type { BodyClassifier, TemplateExtractor } from "./classify.js";
Expand Down
126 changes: 57 additions & 69 deletions packages/o11ylogsdb/src/query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
*/

import type { Chunk } from "./chunk.js";
import { readBodiesOnly, readRecords } from "./chunk.js";
import { readRecords, readRecordsFromRaw } from "./chunk.js";
import type { LogStore } from "./engine.js";
import type { LogRecord, StreamId } from "./types.js";

Expand Down Expand Up @@ -147,39 +147,24 @@ export function* queryStream(
}

if (useBodyFastPath) {
// Template-token pruning: if the chunk header carries template
// literal tokens (toks), check if any token contains the needle
// as a substring. If no template token matches AND the chunk has
// no raw-string bodies (raw strings might still match), we can
// skip ZSTD decompression entirely.
const needle = spec.bodyContains;
if (needle !== undefined && chunkPrunedByTemplateTokens(chunk, needle)) {
stats.chunksPruned++;
continue;
}
// Fast path: decode only bodies, check which match the
// substring. Only do full decode if there are body matches.
// biome-ignore lint/style/noNonNullAssertion: guarded by useBodyFastPath check above
const needle = spec.bodyContains!;
// Phase 1: Raw byte scan. Decompress and check if the needle's
// UTF-8 bytes exist anywhere in the payload. If not, no body in
// this chunk can contain the needle — skip without any string
// construction or template reconstruction.
const t0 = nowMillis();
const bodies = readBodiesOnly(chunk, store.registry, policy);
let hasMatch = false;
for (let i = 0; i < bodies.length; i++) {
if (
typeof bodies[i] === "string" &&
needle !== undefined &&
(bodies[i] as string).includes(needle)
) {
hasMatch = true;
break;
}
}
if (!hasMatch) {
// No body in this chunk matches — skip full decode entirely
const codec = store.registry.get(chunk.header.codecName);
const raw = codec.decode(chunk.payload);
if (!rawPayloadContains(raw, needle)) {
stats.decodeMillis += nowMillis() - t0;
stats.chunksPruned++;
continue;
}
// Some bodies match — need full records for time/severity post-filtering
const records = readRecords(chunk, store.registry, policy);
// Phase 2: Raw bytes matched — do full decode and filter inline.
// We skip the body-only intermediate decode because if the chunk
// passes the raw scan, we'll need full records anyway for yielding.
const records = readRecordsFromRaw(raw, chunk.header, policy);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
stats.decodeMillis += nowMillis() - t0;
for (const record of records) {
stats.recordsScanned++;
Expand Down Expand Up @@ -279,51 +264,54 @@ function chunkPassesSeverity(chunk: Chunk, severityGte?: number): boolean {
}

/**
* Template-token pruning for bodyContains. If the chunk header carries
* template literal tokens (TypedColumnarDrainPolicy stores these in
* codecMeta.toks), check if any token contains the needle as a
* substring. If no template token can match AND the chunk metadata
* confirms zero raw-string bodies, we can skip ZSTD decompression.
*
* SOUNDNESS: We can only prune when BOTH conditions hold:
* 1. No template literal token contains the needle
* 2. The chunk has zero raw-string bodies (rawCount === 0)
* Raw byte scan: check if the needle's UTF-8 bytes appear anywhere
* in the decompressed payload buffer. This is a sound negative filter:
* if the bytes aren't found, no body string in this chunk can contain
* the needle. False positives are possible (the needle bytes might
* appear in template dictionary metadata, slot type headers, etc.)
* but are rare and handled by the subsequent per-record check.
Comment on lines 267 to +273

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Edge Case: Raw byte scan false negative for JSON-escaped characters in NDJSON

Even for NDJSON payloads, rawPayloadContains can produce false negatives when the bodyContains needle includes characters that get JSON-escaped (e.g., literal newlines ``, quotes ", backslashes ``). The raw bytes in the NDJSON buffer contain the escaped form (`\n`), not the original byte (`0x0A`), so a scan for the original character won't match.

This is a minor edge case because bodyContains searches in practice are almost always for printable ASCII/UTF-8 text that doesn't require JSON escaping, but it's worth documenting the limitation.

Was this helpful? React with 👍 / 👎 | Reply gitar fix to apply this suggestion

*
* Even when pruning, variable values (PARAM_STR slots) could still
* contain the needle — but those aren't part of template *literals*.
* The body is reconstructed as: literal + variable + literal + ...
* So if no literal contains the needle AND the needle doesn't span a
* literal/variable boundary, we'd need to also check variable columns.
* Since checking variables requires decompression anyway, template-
* token pruning is only effective for needles that MUST appear in a
* template literal (not in a variable slot) to produce a match.
*
* CONSERVATIVE: returns false (don't prune) when unsure.
* Cost: ~0.003ms for a 86KB buffer when Buffer is available (SIMD),
* ~0.01ms with manual scan. Compare to full decodeBodies at ~0.6ms.
*/
function chunkPrunedByTemplateTokens(chunk: Chunk, needle: string): boolean {
const meta = chunk.header.codecMeta as { toks?: string[]; rawCount?: number } | undefined;
if (!meta?.toks) return false; // no token data — can't prune

// If there are raw-string bodies, they could contain anything
if (meta.rawCount === undefined || meta.rawCount > 0) return false;
const enc = new TextEncoder();
const needleCache = new Map<string, Uint8Array>();
const NEEDLE_CACHE_MAX = 64;

// Check if any template literal token contains the needle
for (const tok of meta.toks) {
if (tok.includes(needle)) return false; // might match — don't prune
function rawPayloadContains(raw: Uint8Array, needle: string): boolean {
let needleBytes = needleCache.get(needle);
if (!needleBytes) {
needleBytes = enc.encode(needle);
if (needleCache.size >= NEEDLE_CACHE_MAX) needleCache.clear();
needleCache.set(needle, needleBytes);
}
return uint8IndexOf(raw, needleBytes) !== -1;
}

// No template token contains the needle AND there are zero raw strings.
// However, the reconstructed body is: tok0 + var0 + tok1 + var1 + ...
// If the needle could span a tok/var boundary, we can't prune.
// Safe to prune only if the needle can't be split across boundaries.
// Since we don't track variable values at the header level, we can
// NOT safely prune — variable values might contain the needle.
// Template-token pruning is only sound for needles that match a
// complete template token (not a substring of a variable).
//
// DISABLED: This optimization requires bloom filters or variable-
// value token sets in the header to be sound. For now, return false.
return false;
/** Portable Uint8Array substring search (works in browser + Node). */
function uint8IndexOf(haystack: Uint8Array, needle: Uint8Array): number {
// Use Buffer.includes when available (Node) — it's SIMD-optimized
if (typeof globalThis.Buffer !== "undefined") {
const hBuf = globalThis.Buffer.from(haystack.buffer, haystack.byteOffset, haystack.byteLength);
return hBuf.indexOf(
globalThis.Buffer.from(needle.buffer, needle.byteOffset, needle.byteLength)
);
}
// Browser fallback: simple byte-at-a-time search
const hLen = haystack.length;
const nLen = needle.length;
if (nLen === 0) return 0;
if (nLen > hLen) return -1;
const first = needle[0] as number;
const limit = hLen - nLen;
outer: for (let i = 0; i <= limit; i++) {
if (haystack[i] !== first) continue;
for (let j = 1; j < nLen; j++) {
if (haystack[i + j] !== needle[j]) continue outer;
}
return i;
}
return -1;
}

/** Per-record filter — applied after chunk decode. */
Expand Down
137 changes: 137 additions & 0 deletions packages/o11ylogsdb/test/classify.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import { describe, it, expect } from "vitest";
import { defaultClassifier, TemplatedClassifier } from "../src/classify.js";
import type { TemplateExtractor } from "../src/classify.js";
import type { LogRecord } from "../src/types.js";

function makeRecord(body: unknown): LogRecord {
return {
timeUnixNano: 1000000000n,
severityNumber: 9,
severityText: "INFO",
body: body as LogRecord["body"],
attributes: [],
};
}

describe("classifyShape via defaultClassifier", () => {
it("null → primitive", () => {
expect(defaultClassifier.classify(makeRecord(null))).toBe("primitive");
});

it("string → freetext", () => {
expect(defaultClassifier.classify(makeRecord("hello world"))).toBe("freetext");
});

it("empty string → freetext", () => {
expect(defaultClassifier.classify(makeRecord(""))).toBe("freetext");
});

it("number → primitive", () => {
expect(defaultClassifier.classify(makeRecord(42))).toBe("primitive");
});

it("NaN → primitive", () => {
expect(defaultClassifier.classify(makeRecord(NaN))).toBe("primitive");
});

it("bigint → primitive", () => {
expect(defaultClassifier.classify(makeRecord(123456789012345678n))).toBe("primitive");
});

it("boolean true → primitive", () => {
expect(defaultClassifier.classify(makeRecord(true))).toBe("primitive");
});

it("boolean false → primitive", () => {
expect(defaultClassifier.classify(makeRecord(false))).toBe("primitive");
});

it("Uint8Array → bytes", () => {
expect(defaultClassifier.classify(makeRecord(new Uint8Array([1, 2, 3])))).toBe("bytes");
});

it("empty Uint8Array → bytes", () => {
expect(defaultClassifier.classify(makeRecord(new Uint8Array(0)))).toBe("bytes");
});

it("array → kvlist", () => {
expect(defaultClassifier.classify(makeRecord([1, 2, 3]))).toBe("kvlist");
});

it("empty array → kvlist", () => {
expect(defaultClassifier.classify(makeRecord([]))).toBe("kvlist");
});

it("plain object → kvlist", () => {
expect(defaultClassifier.classify(makeRecord({ key: "value" }))).toBe("kvlist");
});

it("empty object → kvlist", () => {
expect(defaultClassifier.classify(makeRecord({}))).toBe("kvlist");
});

it("undefined → primitive (fallback)", () => {
expect(defaultClassifier.classify(makeRecord(undefined))).toBe("primitive");
});
});

describe("TemplatedClassifier", () => {
function makeExtractor(templates: Map<string, number>): TemplateExtractor {
return {
matchTemplate(s: string) {
const id = templates.get(s);
if (id === undefined) return undefined;
return { templateId: id, vars: [] };
},
matchOrAdd(s: string) {
const id = templates.get(s);
if (id !== undefined) return { templateId: id, vars: [], isNew: false };
const newId = templates.size;
templates.set(s, newId);
return { templateId: newId, vars: [], isNew: true };
},
templateCount: () => templates.size,
templates: function* () {
for (const [template, id] of templates) yield { id, template };
},
};
}

it("returns 'templated' when extractor matches string body", () => {
const ext = makeExtractor(new Map([["Connection from <*>", 0]]));
const cls = new TemplatedClassifier(ext);
expect(cls.classify(makeRecord("Connection from <*>"))).toBe("templated");
});

it("returns 'freetext' when extractor does NOT match string body", () => {
const ext = makeExtractor(new Map([["Other template", 0]]));
const cls = new TemplatedClassifier(ext);
expect(cls.classify(makeRecord("totally different text"))).toBe("freetext");
});

it("returns non-string shapes without consulting extractor", () => {
let called = false;
const ext = makeExtractor(new Map());
const origMatch = ext.matchTemplate.bind(ext);
ext.matchTemplate = (s: string) => {
called = true;
return origMatch(s);
};
const cls = new TemplatedClassifier(ext);

expect(cls.classify(makeRecord(42))).toBe("primitive");
expect(called).toBe(false);

expect(cls.classify(makeRecord({ x: 1 }))).toBe("kvlist");
expect(called).toBe(false);

expect(cls.classify(makeRecord(new Uint8Array(4)))).toBe("bytes");
expect(called).toBe(false);
});

it("returns 'primitive' for null without consulting extractor", () => {
const ext = makeExtractor(new Map());
const cls = new TemplatedClassifier(ext);
expect(cls.classify(makeRecord(null))).toBe("primitive");
});
});
Loading
Loading