Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9b1fe20
feat(xml): add XML module with streaming parser, DOM-style parser, an…
tomas-zijdemans Jan 8, 2026
81445b5
perf(xml): native TransformStream for 20% faster streaming
tomas-zijdemans Jan 9, 2026
a5ed5bf
refactor(xml): remove deprecated async generator APIs, sync all tests
tomas-zijdemans Jan 9, 2026
749193c
perf(xml): use switch statement for named entity decoding
tomas-zijdemans Jan 12, 2026
605f0d6
perf(xml): replace object lookups with switch in entity encoding
tomas-zijdemans Jan 12, 2026
e3678c9
perf(xml): use charCodeAt for tokenizer hot path
tomas-zijdemans Jan 12, 2026
dc79433
perf(xml): switch DOM parser to character code comparisons
tomas-zijdemans Jan 12, 2026
566f083
perf(xml): add fast path for attribute value normalization
tomas-zijdemans Jan 12, 2026
9fdb310
refactor(xml): remove helper functions
tomas-zijdemans Jan 14, 2026
ec7ccf1
perf(xml): optimize switch
tomas-zijdemans Jan 14, 2026
ceb8a1a
perf(xml): cache hot variables
tomas-zijdemans Jan 14, 2026
1233275
feat(xml/unstable): add error position tracking as an option
tomas-zijdemans Jan 14, 2026
1578a99
perf(xml): introduce basic dedicated capture methods
tomas-zijdemans Jan 14, 2026
dc6da86
perf(xml): optimize CDATA capture with indexOf batch scanning
tomas-zijdemans Jan 14, 2026
23f0a7b
refactor(xml): handle comment and PI capture
tomas-zijdemans Jan 14, 2026
6fa4da1
perf(xml): XmlName Caching when streaming
tomas-zijdemans Jan 14, 2026
b9140d1
perf(xml): pending Start Element Reuse
tomas-zijdemans Jan 14, 2026
ae340f2
perf(xml): optimize name parsing, add XmlName.raw property
tomas-zijdemans Jan 14, 2026
dbd8ffe
fix tests
tomas-zijdemans Jan 14, 2026
94a37e9
feat(xml): callback based streaming core
tomas-zijdemans Jan 15, 2026
d9f917b
feat(xml): direct streaming
tomas-zijdemans Jan 15, 2026
d5b2b2a
feat(xml): use callbacks for parse
tomas-zijdemans Jan 15, 2026
6266a3b
test coverage
tomas-zijdemans Jan 15, 2026
fdd09f0
fix(xml): avoid double parseName
tomas-zijdemans Jan 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/title.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,5 @@ jobs:
ulid(/unstable)?
uuid(/unstable)?
webgpu(/unstable)?
xml(/unstable)?
yaml(/unstable)?
1 change: 1 addition & 0 deletions browser-compat.tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"./ulid",
"./uuid",
"./webgpu",
"./xml",
"./yaml"
]
}
4 changes: 3 additions & 1 deletion deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
"_tools/node_test_runner",
"http/testdata",
"fs/testdata",
"dotenv/testdata"
"dotenv/testdata",
"xml/testdata"
],
"lint": {
"rules": {
Expand Down Expand Up @@ -94,6 +95,7 @@
"./ulid",
"./uuid",
"./webgpu",
"./xml",
"./yaml"
]
}
2 changes: 1 addition & 1 deletion import_map.json
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I have no idea why this formatting is happening 😅

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"npm:/typescript": "npm:[email protected]",
"automation/": "https://raw.githubusercontent.com/denoland/automation/0.10.0/",
"graphviz": "npm:node-graphviz@^0.1.1",

"@std/assert": "jsr:@std/assert@^1.0.16",
"@std/async": "jsr:@std/async@^1.0.16",
"@std/bytes": "jsr:@std/bytes@^1.0.6",
Expand Down Expand Up @@ -46,6 +45,7 @@
"@std/ulid": "jsr:@std/ulid@^1.0.0",
"@std/uuid": "jsr:@std/uuid@^1.1.0",
"@std/webgpu": "jsr:@std/webgpu@^0.224.9",
"@std/xml": "jsr:@std/xml@^0.0.1",
"@std/yaml": "jsr:@std/yaml@^1.0.10"
}
}
67 changes: 67 additions & 0 deletions xml/_common.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2018-2026 the Deno authors. MIT license.
// This module is browser compatible.

/**
* Internal shared utilities for the XML module.
*
* @module
*/

import type { XmlName } from "./types.ts";

/**
* Line ending pattern per XML 1.0 §2.11.
* Matches `\r\n` or a standalone `\r`. The global flag lets one `replace`
* call cover every occurrence; the pattern only matches, so the caller
* supplies the `\n` replacement.
*/
export const LINE_ENDING_RE = /\r\n?/g;

/**
* Whitespace-only test per XML 1.0 §2.3.
* Uses explicit [ \t\r\n] instead of \s to match XML spec exactly:
* S ::= (#x20 | #x9 | #xD | #xA)+
*
* Anchored at both ends and quantified with `*`, so the empty string also
* matches.
*/
export const WHITESPACE_ONLY_RE = /^[ \t\r\n]*$/;

/**
* XML declaration version attribute pattern.
* Matches both single and double quoted values.
*
* Capture group 1 holds a double-quoted value, group 2 a single-quoted one;
* exactly one of them is set on a match. The value must be non-empty.
* The pattern is not anchored, so it matches anywhere in the input.
* NOTE(review): `\s` around `=` is broader than the XML `S` production.
*/
export const VERSION_RE = /version\s*=\s*(?:"([^"]+)"|'([^']+)')/;

/**
* XML declaration encoding attribute pattern.
* Matches both single and double quoted values.
*
* Capture group 1 holds a double-quoted value, group 2 a single-quoted one.
* The value must be non-empty; its content is not validated further.
*/
export const ENCODING_RE = /encoding\s*=\s*(?:"([^"]+)"|'([^']+)')/;

/**
* XML declaration standalone attribute pattern.
* Matches both single and double quoted values, restricted to "yes" or "no".
*
* Capture group 1 holds a double-quoted value, group 2 a single-quoted one.
* Any other value simply fails to match.
*/
export const STANDALONE_RE = /standalone\s*=\s*(?:"(yes|no)"|'(yes|no)')/;

/**
 * Splits a qualified XML name at its first colon.
 *
 * Names without a colon yield an object with no `prefix` key at all
 * (rather than `prefix: undefined`); `raw` always carries the input
 * unchanged.
 *
 * @example Usage
 * ```ts
 * import { parseName } from "./_common.ts";
 *
 * parseName("ns:element"); // { raw: "ns:element", prefix: "ns", local: "element" }
 * parseName("element"); // { raw: "element", local: "element" }
 * ```
 *
 * @param name The raw name string (e.g., "ns:element" or "element")
 * @returns An XmlName object with raw, local and optional prefix
 */
export function parseName(name: string): XmlName {
  const separator = name.indexOf(":");
  return separator < 0 ? { raw: name, local: name } : {
    raw: name,
    // Everything before the first colon is the prefix, the rest is local.
    prefix: name.slice(0, separator),
    local: name.slice(separator + 1),
  };
}
184 changes: 184 additions & 0 deletions xml/_entities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// Copyright 2018-2026 the Deno authors. MIT license.
// This module is browser compatible.

/**
* Internal module for XML entity encoding and decoding.
*
* @module
*/

// Hoisted regex patterns for performance

// Matches `&name;` (ASCII letters only), decimal `&#123;` and hex `&#x1F;`
// references. Capture group 1 is the text between `&` and `;`.
const ENTITY_RE = /&([a-zA-Z]+|#[0-9]+|#x[0-9a-fA-F]+);/g;
// Characters replaced by encodeEntities.
const SPECIAL_CHARS_RE = /[<>&'"]/g;
// Characters replaced by encodeAttributeValue: the five above plus tab,
// line feed and carriage return.
const ATTR_ENCODE_RE = /[<>&'"\t\n\r]/g;

/**
* Pattern to detect bare `&` not followed by a valid reference.
* Valid references are: &name; or &#digits; or &#xhexdigits;
*
* Not global, so `exec` always scans from the start of the input.
* Names here may contain digits after the first letter, whereas ENTITY_RE
* accepts letters only: `&a1;` passes this check but is not decoded.
*/
const BARE_AMPERSAND_RE = /&(?![a-zA-Z][a-zA-Z0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)/;

/**
 * Tests a code point against the XML 1.0 `Char` production (§2.2):
 *
 * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Rejected values therefore include:
 * - NULL (#x0) and the other C0 controls except tab, LF and CR
 * - the surrogate range #xD800-#xDFFF
 * - the non-characters #xFFFE and #xFFFF
 * - anything above #x10FFFF, and `NaN`
 *
 * @see {@link https://www.w3.org/TR/xml/#charsets | XML 1.0 §2.2 Characters}
 *
 * @param codePoint The numeric code point to test.
 * @returns `true` when the code point is a legal XML 1.0 character.
 */
function isValidXmlChar(codePoint: number): boolean {
  // Walk the ranges in ascending order; each guard settles one boundary.
  if (codePoint < 0x20) {
    // Below space only tab, line feed and carriage return are legal.
    return codePoint === 0x9 || codePoint === 0xA || codePoint === 0xD;
  }
  if (codePoint <= 0xD7FF) return true;
  // Gap between the first and second legal range: the surrogates.
  if (codePoint < 0xE000) return false;
  if (codePoint <= 0xFFFD) return true;
  // #xFFFE and #xFFFF fall through to this check and fail it.
  return codePoint >= 0x10000 && codePoint <= 0x10FFFF;
}

/**
* Options for entity decoding, accepted as the second argument of
* `decodeEntities`.
*/
export interface DecodeEntityOptions {
/**
* If true, throws an error on invalid bare `&` characters.
* Per XML 1.0 §3.1, `&` must be escaped as `&amp;` unless it starts
* a valid entity or character reference.
*
* When false or omitted, such `&` characters are left in the output
* unchanged.
*
* @default false
*/
readonly strict?: boolean;
}

/**
 * Replaces XML entity and character references in `text` with the
 * characters they denote.
 *
 * Supports the five predefined entities (§4.6) and decimal / hexadecimal
 * character references (§4.1) of the XML 1.0 specification. References that
 * cannot be resolved are left in the output exactly as written: unknown
 * named entities, and numeric references whose code point is not a legal
 * XML character.
 *
 * @param text The text containing XML entities to decode.
 * @param options Decoding options.
 * @returns The text with entities decoded.
 * @throws {Error} If `options.strict` is set and `text` contains an `&`
 * that does not start a syntactically valid reference.
 */
export function decodeEntities(
  text: string,
  options?: DecodeEntityOptions,
): string {
  // Without an ampersand there is nothing to decode or validate.
  if (text.indexOf("&") === -1) return text;

  if (options?.strict) {
    const bare = BARE_AMPERSAND_RE.exec(text);
    if (bare !== null) {
      throw new Error(
        `Invalid bare '&' at position ${bare.index}: ` +
          `entity references must be &name; or &#num; or &#xHex;`,
      );
    }
  }

  return text.replace(ENTITY_RE, (reference, body: string) => {
    if (body.charCodeAt(0) === 0x23 /* # */) {
      // Numeric character reference: "#x…" is hexadecimal, "#…" is decimal.
      const isHex = body.charCodeAt(1) === 0x78; /* x */
      const codePoint = isHex
        ? parseInt(body.slice(2), 16)
        : parseInt(body.slice(1), 10);
      // XML 1.0 §4.1 WFC: Legal Character - the value must match the Char
      // production; otherwise the reference is kept verbatim.
      return isValidXmlChar(codePoint)
        ? String.fromCodePoint(codePoint)
        : reference;
    }

    // Predefined named entities.
    if (body === "lt") return "<";
    if (body === "gt") return ">";
    if (body === "amp") return "&";
    if (body === "apos") return "'";
    if (body === "quot") return '"';

    // Unknown entity - pass it through untouched.
    return reference;
  });
}

/**
 * Escapes the five XML special characters (`<`, `>`, `&`, `'`, `"`) as
 * their predefined entities.
 *
 * @param text The text to encode.
 * @returns The text with special characters encoded as entities; the
 * input string itself when it contains none of them.
 */
export function encodeEntities(text: string): string {
  // Nothing to escape: hand the input back without running replace.
  const needsEncoding = /[<>&'"]/.test(text);
  if (!needsEncoding) return text;

  return text.replace(SPECIAL_CHARS_RE, (char) => {
    // Each match is a single character, so dispatch on its code unit.
    switch (char.charCodeAt(0)) {
      case 0x3C: // <
        return "&lt;";
      case 0x3E: // >
        return "&gt;";
      case 0x26: // &
        return "&amp;";
      case 0x27: // '
        return "&apos;";
      case 0x22: // "
        return "&quot;";
      default:
        return char;
    }
  });
}

/**
 * Escapes a string for use inside an XML attribute value.
 *
 * Besides the five special characters, tab, line feed and carriage return
 * are written as numeric character references, because literal whitespace
 * of that kind would be normalized per XML 1.0 §3.3.3.
 *
 * @param value The attribute value to encode.
 * @returns The encoded attribute value; the input string itself when it
 * contains nothing that needs encoding.
 */
export function encodeAttributeValue(value: string): string {
  // Nothing to escape: hand the input back without running replace.
  const needsEncoding = /[<>&'"\t\n\r]/.test(value);
  if (!needsEncoding) return value;

  return value.replace(ATTR_ENCODE_RE, (c) => {
    // Each match is a single character, so dispatch on its code unit.
    switch (c.charCodeAt(0)) {
      case 0x3C: // <
        return "&lt;";
      case 0x3E: // >
        return "&gt;";
      case 0x26: // &
        return "&amp;";
      case 0x27: // '
        return "&apos;";
      case 0x22: // "
        return "&quot;";
      case 0x09: // tab
        return "&#9;";
      case 0x0A: // line feed
        return "&#10;";
      case 0x0D: // carriage return
        return "&#13;";
      default:
        return c;
    }
  });
}
Loading
Loading