From cf7a7c1c29f6853faf79f50c97e0767da2182a52 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 25 Jun 2020 15:24:03 -0400 Subject: [PATCH] Spec update: make everything use percent-encode sets Follows https://github.com/whatwg/url/pull/518. This generally tries to make the code correspond more explicitly to the specification in a few ways. It doesn't handle non-UTF-8 encodings yet, though. Supersedes #152. --- src/URL-impl.js | 4 +- src/URLSearchParams-impl.js | 2 +- src/encoding.js | 18 +++++ src/infra.js | 2 + src/percent-encoding.js | 139 ++++++++++++++++++++++++++++++++++++ src/url-state-machine.js | 114 +++++++---------------------- src/urlencoded.js | 125 ++++++++++++-------------------- 7 files changed, 232 insertions(+), 172 deletions(-) create mode 100644 src/encoding.js create mode 100644 src/percent-encoding.js diff --git a/src/URL-impl.js b/src/URL-impl.js index 5818667..363aa26 100644 --- a/src/URL-impl.js +++ b/src/URL-impl.js @@ -46,7 +46,7 @@ exports.implementation = class URLImpl { this._query._list.splice(0); const { query } = parsedURL; if (query !== null) { - this._query._list = urlencoded.parseUrlencoded(query); + this._query._list = urlencoded.parseUrlencodedString(query); } } @@ -185,7 +185,7 @@ exports.implementation = class URLImpl { const input = v[0] === "?" ? 
v.substring(1) : v; url.query = ""; usm.basicURLParse(input, { url, stateOverride: "query" }); - this._query._list = urlencoded.parseUrlencoded(input); + this._query._list = urlencoded.parseUrlencodedString(input); } get searchParams() { diff --git a/src/URLSearchParams-impl.js b/src/URLSearchParams-impl.js index b59d677..3f10ec8 100644 --- a/src/URLSearchParams-impl.js +++ b/src/URLSearchParams-impl.js @@ -26,7 +26,7 @@ exports.implementation = class URLSearchParamsImpl { this._list.push([name, value]); } } else { - this._list = urlencoded.parseUrlencoded(init); + this._list = urlencoded.parseUrlencodedString(init); } } diff --git a/src/encoding.js b/src/encoding.js new file mode 100644 index 0000000..6a5abf4 --- /dev/null +++ b/src/encoding.js @@ -0,0 +1,18 @@ +"use strict"; +const { TextEncoder, TextDecoder } = require("util"); + +const utf8Encoder = new TextEncoder(); +const utf8Decoder = new TextDecoder("utf-8", { ignoreBOM: true }); + +function utf8Encode(string) { + return utf8Encoder.encode(string); +} + +function utf8DecodeWithoutBOM(bytes) { + return utf8Decoder.decode(bytes); +} + +module.exports = { + utf8Encode, + utf8DecodeWithoutBOM +}; diff --git a/src/infra.js b/src/infra.js index 2a712f8..4a984a3 100644 --- a/src/infra.js +++ b/src/infra.js @@ -1,5 +1,7 @@ "use strict"; +// Note that we take code points as JS numbers, not JS strings. 
+ function isASCIIDigit(c) { return c >= 0x30 && c <= 0x39; } diff --git a/src/percent-encoding.js b/src/percent-encoding.js new file mode 100644 index 0000000..587a1ba --- /dev/null +++ b/src/percent-encoding.js @@ -0,0 +1,139 @@ +"use strict"; +const { isASCIIHex } = require("./infra"); +const { utf8Encode } = require("./encoding"); + +function p(char) { + return char.codePointAt(0); +} + +// https://whatpr.org/url/518.html#percent-encode +function percentEncode(c) { + let hex = c.toString(16).toUpperCase(); + if (hex.length === 1) { + hex = "0" + hex; + } + + return "%" + hex; +} + +// https://whatpr.org/url/518.html#percent-decode +function percentDecodeBytes(input) { + const output = new Uint8Array(input.byteLength); + let outputIndex = 0; + for (let i = 0; i < input.byteLength; ++i) { + const byte = input[i]; + if (byte !== 0x25) { + output[outputIndex++] = byte; + } else if (byte === 0x25 && (!isASCIIHex(input[i + 1]) || !isASCIIHex(input[i + 2]))) { + output[outputIndex++] = byte; + } else { + const bytePoint = parseInt(String.fromCodePoint(input[i + 1], input[i + 2]), 16); + output[outputIndex++] = bytePoint; + i += 2; + } + } + + return output.slice(0, outputIndex); +} + +// https://whatpr.org/url/518.html#string-percent-decode +function percentDecodeString(input) { + const bytes = utf8Encode(input); + return percentDecodeBytes(bytes); +} + +// https://whatpr.org/url/518.html#c0-control-percent-encode-set +function isC0ControlPercentEncode(c) { + return c <= 0x1F || c > 0x7E; +} + +// https://whatpr.org/url/518.html#fragment-percent-encode-set +const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]); +function isFragmentPercentEncode(c) { + return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c); +} + +// https://whatpr.org/url/518.html#query-percent-encode-set +const extraQueryPercentEncodeSet = new Set([p(" "), p("\""), p("#"), p("<"), p(">")]); +function isQueryPercentEncode(c) { + return 
isC0ControlPercentEncode(c) || extraQueryPercentEncodeSet.has(c); +} + +// https://whatpr.org/url/518.html#special-query-percent-encode-set +function isSpecialQueryPercentEncode(c) { + return isQueryPercentEncode(c) || c === p("'"); +} + +// https://whatpr.org/url/518.html#path-percent-encode-set +const extraPathPercentEncodeSet = new Set([p("?"), p("`"), p("{"), p("}")]); +function isPathPercentEncode(c) { + return isQueryPercentEncode(c) || extraPathPercentEncodeSet.has(c); +} + +// https://whatpr.org/url/518.html#userinfo-percent-encode-set +const extraUserinfoPercentEncodeSet = + new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]); +function isUserinfoPercentEncode(c) { + return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c); +} + +// https://whatpr.org/url/518.html#application-x-www-form-urlencoded-percent-encode-set +const extraURLEncodedPercentEncodeSet = new Set([ + p("!"), p("$"), p("%"), p("&"), p("'"), + p("("), p(")"), p("+"), p(","), p("~") +]); +function isURLEncodedPercentEncode(c) { + return isUserinfoPercentEncode(c) || extraURLEncodedPercentEncodeSet.has(c); +} + +// https://whatpr.org/url/518.html#code-point-percent-encode-after-encoding +// https://whatpr.org/url/518.html#utf-8-percent-encode +// Assuming encoding is always utf-8 allows us to trim one of the logic branches. TODO: support encoding. +// The "-Internal" variant here has code points as JS strings. The external version used by other files has code points +// as JS numbers, like the rest of the codebase. +function utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate) { + const bytes = utf8Encode(codePoint); + let output = ""; + for (const byte of bytes) { + // Our percentEncodePredicate operates on bytes, not code points, so this is slightly different from the spec. 
+ if (!percentEncodePredicate(byte)) { + output += String.fromCharCode(byte); + } else { + output += percentEncode(byte); + } + } + + return output; +} + +function utf8PercentEncodeCodePoint(codePoint, percentEncodePredicate) { + return utf8PercentEncodeCodePointInternal(String.fromCodePoint(codePoint), percentEncodePredicate); +} + +// https://whatpr.org/url/518.html#string-percent-encode-after-encoding +// https://whatpr.org/url/518.html#string-utf-8-percent-encode +function utf8PercentEncodeString(input, percentEncodePredicate, spaceAsPlus = false) { + let output = ""; + for (const codePoint of input) { + if (spaceAsPlus && codePoint === " ") { + output += "+"; + } else { + output += utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate); + } + } + return output; +} + +module.exports = { + isC0ControlPercentEncode, + isFragmentPercentEncode, + isQueryPercentEncode, + isSpecialQueryPercentEncode, + isPathPercentEncode, + isUserinfoPercentEncode, + isURLEncodedPercentEncode, + percentDecodeString, + percentDecodeBytes, + utf8PercentEncodeString, + utf8PercentEncodeCodePoint +}; diff --git a/src/url-state-machine.js b/src/url-state-machine.js index 1159879..3dc247c 100644 --- a/src/url-state-machine.js +++ b/src/url-state-machine.js @@ -3,7 +3,10 @@ const punycode = require("punycode"); const tr46 = require("tr46"); const infra = require("./infra"); -const { percentEncode, percentDecode } = require("./urlencoded"); +const { utf8DecodeWithoutBOM } = require("./encoding"); +const { percentDecodeString, utf8PercentEncodeCodePoint, utf8PercentEncodeString, isC0ControlPercentEncode, + isFragmentPercentEncode, isQueryPercentEncode, isSpecialQueryPercentEncode, isPathPercentEncode, + isUserinfoPercentEncode } = require("./percent-encoding"); function p(char) { return char.codePointAt(0); @@ -21,7 +24,7 @@ const specialSchemes = { const failure = Symbol("failure"); function countSymbols(str) { - return punycode.ucs2.decode(str).length; + return 
[...str].length; } function at(input, idx) { @@ -74,48 +77,6 @@ function defaultPort(scheme) { return specialSchemes[scheme]; } -function utf8PercentEncode(c) { - const buf = Buffer.from(c); - - let str = ""; - - for (let i = 0; i < buf.length; ++i) { - str += percentEncode(buf[i]); - } - - return str; -} - -function isC0ControlPercentEncode(c) { - return c <= 0x1F || c > 0x7E; -} - -const extraUserinfoPercentEncodeSet = - new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]); -function isUserinfoPercentEncode(c) { - return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c); -} - -const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]); -function isFragmentPercentEncode(c) { - return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c); -} - -const extraPathPercentEncodeSet = new Set([p("#"), p("?"), p("{"), p("}")]); -function isPathPercentEncode(c) { - return isFragmentPercentEncode(c) || extraPathPercentEncodeSet.has(c); -} - -function percentEncodeChar(c, encodeSetPredicate) { - const cStr = String.fromCodePoint(c); - - if (encodeSetPredicate(c)) { - return utf8PercentEncode(cStr); - } - - return cStr; -} - function parseIPv4Number(input) { let R = 10; @@ -377,7 +338,7 @@ function parseHost(input, isNotSpecialArg = false) { return parseOpaqueHost(input); } - const domain = percentDecode(Buffer.from(input)).toString(); + const domain = utf8DecodeWithoutBOM(percentDecodeString(input)); const asciiDomain = domainToASCII(domain); if (asciiDomain === failure) { return failure; @@ -400,12 +361,7 @@ function parseOpaqueHost(input) { return failure; } - let output = ""; - const decoded = punycode.ucs2.decode(input); - for (let i = 0; i < decoded.length; ++i) { - output += percentEncodeChar(decoded[i], isC0ControlPercentEncode); - } - return output; + return utf8PercentEncodeString(input, isC0ControlPercentEncode); } function findLongestZeroSequence(arr) { @@ -769,7 +725,7 
@@ URLStateMachine.prototype["parse authority"] = function parseAuthority(c, cStr) this.passwordTokenSeenFlag = true; continue; } - const encodedCodePoints = percentEncodeChar(codePoint, isUserinfoPercentEncode); + const encodedCodePoints = utf8PercentEncodeCodePoint(codePoint, isUserinfoPercentEncode); if (this.passwordTokenSeenFlag) { this.url.password += encodedCodePoints; } else { @@ -1059,7 +1015,7 @@ URLStateMachine.prototype["parse path"] = function parsePath(c) { this.parseError = true; } - this.buffer += percentEncodeChar(c, isPathPercentEncode); + this.buffer += utf8PercentEncodeCodePoint(c, isPathPercentEncode); } return true; @@ -1085,45 +1041,33 @@ URLStateMachine.prototype["parse cannot-be-a-base-URL path"] = function parseCan } if (!isNaN(c)) { - this.url.path[0] += percentEncodeChar(c, isC0ControlPercentEncode); + this.url.path[0] += utf8PercentEncodeCodePoint(c, isC0ControlPercentEncode); } } return true; }; -URLStateMachine.prototype["parse query"] = function parseQuery(c, cStr) { - if (isNaN(c) || (!this.stateOverride && c === p("#"))) { - if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") { - this.encodingOverride = "utf-8"; - } - - const buffer = Buffer.from(this.buffer); // TODO: Use encoding override instead - for (let i = 0; i < buffer.length; ++i) { - if (buffer[i] < 0x21 || - buffer[i] > 0x7E || - buffer[i] === 0x22 || buffer[i] === 0x23 || buffer[i] === 0x3C || buffer[i] === 0x3E || - (buffer[i] === 0x27 && isSpecial(this.url))) { - this.url.query += percentEncode(buffer[i]); - } else { - this.url.query += String.fromCodePoint(buffer[i]); - } - } +URLStateMachine.prototype["parse query"] = function parseQuery(c) { + if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") { + this.encodingOverride = "utf-8"; + } - this.buffer = ""; - if (c === p("#")) { - this.url.fragment = ""; - this.state = "fragment"; - } - } else { + if (!this.stateOverride && c === p("#")) { + 
this.url.fragment = ""; + this.state = "fragment"; + } else if (!isNaN(c)) { // TODO: If c is not a URL code point and not "%", parse error. + if (c === p("%") && (!infra.isASCIIHex(this.input[this.pointer + 1]) || !infra.isASCIIHex(this.input[this.pointer + 2]))) { this.parseError = true; } - this.buffer += cStr; + const queryPercentEncodePredicate = isSpecial(this.url) ? isSpecialQueryPercentEncode : isQueryPercentEncode; + // TODO: use "percent-encode after encoding" passing in this.encodingOverride + this.url.query += utf8PercentEncodeCodePoint(c, queryPercentEncodePredicate); } return true; @@ -1138,7 +1082,7 @@ URLStateMachine.prototype["parse fragment"] = function parseFragment(c) { this.parseError = true; } - this.url.fragment += percentEncodeChar(c, isFragmentPercentEncode); + this.url.fragment += utf8PercentEncodeCodePoint(c, isFragmentPercentEncode); } return true; @@ -1247,19 +1191,11 @@ module.exports.basicURLParse = function (input, options) { }; module.exports.setTheUsername = function (url, username) { - url.username = ""; - const decoded = punycode.ucs2.decode(username); - for (let i = 0; i < decoded.length; ++i) { - url.username += percentEncodeChar(decoded[i], isUserinfoPercentEncode); - } + url.username = utf8PercentEncodeString(username, isUserinfoPercentEncode); }; module.exports.setThePassword = function (url, password) { - url.password = ""; - const decoded = punycode.ucs2.decode(password); - for (let i = 0; i < decoded.length; ++i) { - url.password += percentEncodeChar(decoded[i], isUserinfoPercentEncode); - } + url.password = utf8PercentEncodeString(password, isUserinfoPercentEncode); }; module.exports.serializeHost = serializeHost; diff --git a/src/urlencoded.js b/src/urlencoded.js index 593755b..be84a76 100644 --- a/src/urlencoded.js +++ b/src/urlencoded.js @@ -1,57 +1,12 @@ "use strict"; -const { isASCIIHex } = require("./infra"); +const { utf8Encode, utf8DecodeWithoutBOM } = require("./encoding"); +const { percentDecodeBytes, 
utf8PercentEncodeString, isURLEncodedPercentEncode } = require("./percent-encoding"); function p(char) { return char.codePointAt(0); } -function strictlySplitByteSequence(buf, cp) { - const list = []; - let last = 0; - let i = buf.indexOf(cp); - while (i >= 0) { - list.push(buf.slice(last, i)); - last = i + 1; - i = buf.indexOf(cp, last); - } - if (last !== buf.length) { - list.push(buf.slice(last)); - } - return list; -} - -function replaceByteInByteSequence(buf, from, to) { - let i = buf.indexOf(from); - while (i >= 0) { - buf[i] = to; - i = buf.indexOf(from, i + 1); - } - return buf; -} - -function percentEncode(c) { - let hex = c.toString(16).toUpperCase(); - if (hex.length === 1) { - hex = "0" + hex; - } - - return "%" + hex; -} - -function percentDecode(input) { - const output = Buffer.alloc(input.byteLength); - let ptr = 0; - for (let i = 0; i < input.length; ++i) { - if (input[i] !== p("%") || !isASCIIHex(input[i + 1]) || !isASCIIHex(input[i + 2])) { - output[ptr++] = input[i]; - } else { - output[ptr++] = parseInt(input.slice(i + 1, i + 3).toString(), 16); - i += 2; - } - } - return output.slice(0, ptr); -} - +// https://whatpr.org/url/518.html#concept-urlencoded-parser function parseUrlencoded(input) { const sequences = strictlySplitByteSequence(input, p("&")); const output = []; @@ -69,47 +24,39 @@ function parseUrlencoded(input) { value = bytes.slice(indexOfEqual + 1); } else { name = bytes; - value = Buffer.alloc(0); + value = new Uint8Array(0); } - name = replaceByteInByteSequence(Buffer.from(name), p("+"), p(" ")); - value = replaceByteInByteSequence(Buffer.from(value), p("+"), p(" ")); + name = replaceByteInByteSequence(name, 0x2B, 0x20); + value = replaceByteInByteSequence(value, 0x2B, 0x20); + + const nameString = utf8DecodeWithoutBOM(percentDecodeBytes(name)); + const valueString = utf8DecodeWithoutBOM(percentDecodeBytes(value)); - output.push([percentDecode(name).toString(), percentDecode(value).toString()]); + output.push([nameString, 
valueString]); } return output; } -function serializeUrlencodedByte(input) { - let output = ""; - for (const byte of input) { - if (byte === p(" ")) { - output += "+"; - } else if (byte === p("*") || - byte === p("-") || - byte === p(".") || - (byte >= p("0") && byte <= p("9")) || - (byte >= p("A") && byte <= p("Z")) || - byte === p("_") || - (byte >= p("a") && byte <= p("z"))) { - output += String.fromCodePoint(byte); - } else { - output += percentEncode(byte); - } - } - return output; +// https://whatpr.org/url/518.html#concept-urlencoded-string-parser +function parseUrlencodedString(input) { + return parseUrlencoded(utf8Encode(input)); } +// https://whatpr.org/url/518.html#concept-urlencoded-serializer function serializeUrlencoded(tuples, encodingOverride = undefined) { let encoding = "utf-8"; if (encodingOverride !== undefined) { + // TODO "get the output encoding", i.e. handle encoding labels vs. names. encoding = encodingOverride; } let output = ""; for (const [i, tuple] of tuples.entries()) { // TODO: handle encoding override - const name = serializeUrlencodedByte(Buffer.from(tuple[0])); + + const name = utf8PercentEncodeString(tuple[0], isURLEncodedPercentEncode, true); + let value = tuple[1]; if (tuple.length > 2 && tuple[2] !== undefined) { if (tuple[2] === "hidden" && name === "_charset_") { @@ -119,7 +66,9 @@ function serializeUrlencoded(tuples, encodingOverride = undefined) { value = value.name; } } - value = serializeUrlencodedByte(Buffer.from(value)); + + value = utf8PercentEncodeString(value, isURLEncodedPercentEncode, true); + if (i !== 0) { output += "&"; } @@ -128,15 +77,31 @@ function serializeUrlencoded(tuples, encodingOverride = undefined) { return output; } -module.exports = { - percentEncode, - percentDecode, +function strictlySplitByteSequence(buf, cp) { + const list = []; + let last = 0; + let i = buf.indexOf(cp); + while (i >= 0) { + list.push(buf.slice(last, i)); + last = i + 1; + i = buf.indexOf(cp, last); + } + if (last !== buf.length) 
{ + list.push(buf.slice(last)); + } + return list; +} - // application/x-www-form-urlencoded string parser - parseUrlencoded(input) { - return parseUrlencoded(Buffer.from(input)); - }, +function replaceByteInByteSequence(buf, from, to) { + let i = buf.indexOf(from); + while (i >= 0) { + buf[i] = to; + i = buf.indexOf(from, i + 1); + } + return buf; +} - // application/x-www-form-urlencoded serializer +module.exports = { + parseUrlencodedString, serializeUrlencoded };