Skip to content

Commit

Permalink
Spec update: make everything use percent-encode sets
Browse files Browse the repository at this point in the history
Follows whatwg/url#518.

This generally tries to make the code correspond more explicitly to the specification in a few ways. It doesn't handle non-UTF-8 encodings yet, though.

Supersedes #152.
  • Loading branch information
domenic authored Jun 25, 2020
1 parent 9b34f29 commit cf7a7c1
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 172 deletions.
4 changes: 2 additions & 2 deletions src/URL-impl.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ exports.implementation = class URLImpl {
this._query._list.splice(0);
const { query } = parsedURL;
if (query !== null) {
this._query._list = urlencoded.parseUrlencoded(query);
this._query._list = urlencoded.parseUrlencodedString(query);
}
}

Expand Down Expand Up @@ -185,7 +185,7 @@ exports.implementation = class URLImpl {
const input = v[0] === "?" ? v.substring(1) : v;
url.query = "";
usm.basicURLParse(input, { url, stateOverride: "query" });
this._query._list = urlencoded.parseUrlencoded(input);
this._query._list = urlencoded.parseUrlencodedString(input);
}

get searchParams() {
Expand Down
2 changes: 1 addition & 1 deletion src/URLSearchParams-impl.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ exports.implementation = class URLSearchParamsImpl {
this._list.push([name, value]);
}
} else {
this._list = urlencoded.parseUrlencoded(init);
this._list = urlencoded.parseUrlencodedString(init);
}
}

Expand Down
18 changes: 18 additions & 0 deletions src/encoding.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"use strict";
const { TextEncoder, TextDecoder } = require("util");

const utf8Encoder = new TextEncoder();
const utf8Decoder = new TextDecoder("utf-8", { ignoreBOM: true });

// Encodes `string` as UTF-8 using the shared module-level TextEncoder and returns the resulting bytes.
function utf8Encode(string) {
  const encoded = utf8Encoder.encode(string);
  return encoded;
}

// Decodes `bytes` as UTF-8 using the shared module-level TextDecoder. That decoder is constructed with
// `ignoreBOM: true`, so a leading byte order mark is kept in the decoded string instead of being stripped.
function utf8DecodeWithoutBOM(bytes) {
  const decoded = utf8Decoder.decode(bytes);
  return decoded;
}

module.exports = {
utf8Encode,
utf8DecodeWithoutBOM
};
2 changes: 2 additions & 0 deletions src/infra.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"use strict";

// Note that we take code points as JS numbers, not JS strings.

// Returns true when the code point `c` (a JS number) is in the range U+0030 (0) to U+0039 (9), inclusive.
function isASCIIDigit(c) {
  const isAtLeastZero = c >= 0x30;
  const isAtMostNine = c <= 0x39;
  return isAtLeastZero && isAtMostNine;
}
Expand Down
139 changes: 139 additions & 0 deletions src/percent-encoding.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"use strict";
const { isASCIIHex } = require("./infra");
const { utf8Encode } = require("./encoding");

// Shorthand for turning a character literal into its numeric code point (the code point at index 0 of `char`).
function p(char) {
  const firstCodePoint = char.codePointAt(0);
  return firstCodePoint;
}

// https://whatpr.org/url/518.html#percent-encode
// Serializes the byte `c` (a JS number) as "%" followed by its uppercase hexadecimal form, left-padded with "0" so
// that there are at least two hex digits.
function percentEncode(c) {
  const hexDigits = c.toString(16).toUpperCase().padStart(2, "0");
  return `%${hexDigits}`;
}

// https://whatpr.org/url/518.html#percent-decode
// Percent-decodes a byte sequence. Each "%" (0x25) that is followed by two ASCII hex digits is replaced by the single
// byte those digits denote; every other byte, including a "%" without two hex digits after it, is copied unchanged.
// Returns a new Uint8Array holding only the bytes that were written.
function percentDecodeBytes(input) {
  const inputLength = input.byteLength;
  // Decoding never grows the data, so the input length is a safe upper bound for the scratch buffer.
  const decoded = new Uint8Array(inputLength);
  let written = 0;
  let position = 0;

  while (position < inputLength) {
    const current = input[position];
    const isEscapeSequence = current === 0x25 &&
                             isASCIIHex(input[position + 1]) &&
                             isASCIIHex(input[position + 2]);

    if (isEscapeSequence) {
      const hexPair = String.fromCodePoint(input[position + 1], input[position + 2]);
      decoded[written] = parseInt(hexPair, 16);
      // Consume the "%" together with its two hex digits.
      position += 3;
    } else {
      decoded[written] = current;
      position += 1;
    }
    written += 1;
  }

  // Trim the unused tail of the scratch buffer.
  return decoded.slice(0, written);
}

// https://whatpr.org/url/518.html#string-percent-decode
// UTF-8 encodes the string `input`, then percent-decodes the resulting bytes. The result is bytes, not a string.
function percentDecodeString(input) {
  return percentDecodeBytes(utf8Encode(input));
}

// https://whatpr.org/url/518.html#c0-control-percent-encode-set
// True for values at or below 0x1F and for values above 0x7E (~); false for everything in between.
function isC0ControlPercentEncode(c) {
  const isAtOrBelowUnitSeparator = c <= 0x1F;
  const isAboveTilde = c > 0x7E;
  return isAtOrBelowUnitSeparator || isAboveTilde;
}

// https://whatpr.org/url/518.html#fragment-percent-encode-set
// Code points the fragment set adds on top of the C0 control percent-encode set.
const extraFragmentPercentEncodeSet = new Set([" ", "\"", "<", ">", "`"].map(ch => p(ch)));
function isFragmentPercentEncode(c) {
  if (isC0ControlPercentEncode(c)) {
    return true;
  }
  return extraFragmentPercentEncodeSet.has(c);
}

// https://whatpr.org/url/518.html#query-percent-encode-set
// Code points the query set adds on top of the C0 control percent-encode set.
const extraQueryPercentEncodeSet = new Set([" ", "\"", "#", "<", ">"].map(ch => p(ch)));
function isQueryPercentEncode(c) {
  if (isC0ControlPercentEncode(c)) {
    return true;
  }
  return extraQueryPercentEncodeSet.has(c);
}

// https://whatpr.org/url/518.html#special-query-percent-encode-set
// The query percent-encode set plus the single quote (').
function isSpecialQueryPercentEncode(c) {
  if (isQueryPercentEncode(c)) {
    return true;
  }
  return c === p("'");
}

// https://whatpr.org/url/518.html#path-percent-encode-set
// Code points the path set adds on top of the query percent-encode set.
const extraPathPercentEncodeSet = new Set(["?", "`", "{", "}"].map(ch => p(ch)));
function isPathPercentEncode(c) {
  if (isQueryPercentEncode(c)) {
    return true;
  }
  return extraPathPercentEncodeSet.has(c);
}

// https://whatpr.org/url/518.html#userinfo-percent-encode-set
// Code points the userinfo set adds on top of the path percent-encode set.
const extraUserinfoPercentEncodeSet = new Set(
  ["/", ":", ";", "=", "@", "[", "\\", "]", "^", "|"].map(ch => p(ch))
);
function isUserinfoPercentEncode(c) {
  if (isPathPercentEncode(c)) {
    return true;
  }
  return extraUserinfoPercentEncodeSet.has(c);
}

// https://whatpr.org/url/518.html#application-x-www-form-urlencoded-percent-encode-set
// Code points the application/x-www-form-urlencoded set adds on top of the userinfo percent-encode set.
const extraURLEncodedPercentEncodeSet = new Set(
  ["!", "$", "%", "&", "'", "(", ")", "+", ",", "~"].map(ch => p(ch))
);
function isURLEncodedPercentEncode(c) {
  if (isUserinfoPercentEncode(c)) {
    return true;
  }
  return extraURLEncodedPercentEncodeSet.has(c);
}

// https://whatpr.org/url/518.html#code-point-percent-encode-after-encoding
// https://whatpr.org/url/518.html#utf-8-percent-encode
// Only UTF-8 is handled here, which lets us drop one of the spec's logic branches. TODO: support encoding.
// This "-Internal" variant receives the code point as a JS string; the external variant used by other files receives
// it as a JS number, like the rest of the codebase.
//
// UTF-8 encodes `codePoint` and returns a string in which each resulting byte is percent-encoded when
// `percentEncodePredicate(byte)` is truthy, and emitted as the character with that char code otherwise.
function utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate) {
  let encoded = "";
  for (const octet of utf8Encode(codePoint)) {
    // The predicate is applied to individual bytes rather than to the code point, which differs slightly from the
    // spec's formulation.
    encoded += percentEncodePredicate(octet) ? percentEncode(octet) : String.fromCharCode(octet);
  }

  return encoded;
}

// Numeric-code-point entry point: converts `codePoint` (a JS number) to a string and delegates to the internal
// string-based variant with the same predicate.
function utf8PercentEncodeCodePoint(codePoint, percentEncodePredicate) {
  const codePointAsString = String.fromCodePoint(codePoint);
  return utf8PercentEncodeCodePointInternal(codePointAsString, percentEncodePredicate);
}

// https://whatpr.org/url/518.html#string-percent-encode-after-encoding
// https://whatpr.org/url/518.html#string-utf-8-percent-encode
// Percent-encodes `input` one code point at a time using `percentEncodePredicate`. When `spaceAsPlus` is truthy, a
// space is emitted as "+" instead of going through the predicate.
function utf8PercentEncodeString(input, percentEncodePredicate, spaceAsPlus = false) {
  const pieces = [];
  // Iterating a string with for-of yields whole code points, not UTF-16 code units.
  for (const codePoint of input) {
    const emitPlus = spaceAsPlus && codePoint === " ";
    pieces.push(emitPlus ? "+" : utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate));
  }
  return pieces.join("");
}

module.exports = {
isC0ControlPercentEncode,
isFragmentPercentEncode,
isQueryPercentEncode,
isSpecialQueryPercentEncode,
isPathPercentEncode,
isUserinfoPercentEncode,
isURLEncodedPercentEncode,
percentDecodeString,
percentDecodeBytes,
utf8PercentEncodeString,
utf8PercentEncodeCodePoint
};
114 changes: 25 additions & 89 deletions src/url-state-machine.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ const punycode = require("punycode");
const tr46 = require("tr46");

const infra = require("./infra");
const { percentEncode, percentDecode } = require("./urlencoded");
const { utf8DecodeWithoutBOM } = require("./encoding");
const { percentDecodeString, utf8PercentEncodeCodePoint, utf8PercentEncodeString, isC0ControlPercentEncode,
isFragmentPercentEncode, isQueryPercentEncode, isSpecialQueryPercentEncode, isPathPercentEncode,
isUserinfoPercentEncode } = require("./percent-encoding");

function p(char) {
return char.codePointAt(0);
Expand All @@ -21,7 +24,7 @@ const specialSchemes = {
const failure = Symbol("failure");

function countSymbols(str) {
return punycode.ucs2.decode(str).length;
return [...str].length;
}

function at(input, idx) {
Expand Down Expand Up @@ -74,48 +77,6 @@ function defaultPort(scheme) {
return specialSchemes[scheme];
}

function utf8PercentEncode(c) {
const buf = Buffer.from(c);

let str = "";

for (let i = 0; i < buf.length; ++i) {
str += percentEncode(buf[i]);
}

return str;
}

function isC0ControlPercentEncode(c) {
return c <= 0x1F || c > 0x7E;
}

const extraUserinfoPercentEncodeSet =
new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]);
function isUserinfoPercentEncode(c) {
return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c);
}

const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]);
function isFragmentPercentEncode(c) {
return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c);
}

const extraPathPercentEncodeSet = new Set([p("#"), p("?"), p("{"), p("}")]);
function isPathPercentEncode(c) {
return isFragmentPercentEncode(c) || extraPathPercentEncodeSet.has(c);
}

function percentEncodeChar(c, encodeSetPredicate) {
const cStr = String.fromCodePoint(c);

if (encodeSetPredicate(c)) {
return utf8PercentEncode(cStr);
}

return cStr;
}

function parseIPv4Number(input) {
let R = 10;

Expand Down Expand Up @@ -377,7 +338,7 @@ function parseHost(input, isNotSpecialArg = false) {
return parseOpaqueHost(input);
}

const domain = percentDecode(Buffer.from(input)).toString();
const domain = utf8DecodeWithoutBOM(percentDecodeString(input));
const asciiDomain = domainToASCII(domain);
if (asciiDomain === failure) {
return failure;
Expand All @@ -400,12 +361,7 @@ function parseOpaqueHost(input) {
return failure;
}

let output = "";
const decoded = punycode.ucs2.decode(input);
for (let i = 0; i < decoded.length; ++i) {
output += percentEncodeChar(decoded[i], isC0ControlPercentEncode);
}
return output;
return utf8PercentEncodeString(input, isC0ControlPercentEncode);
}

function findLongestZeroSequence(arr) {
Expand Down Expand Up @@ -769,7 +725,7 @@ URLStateMachine.prototype["parse authority"] = function parseAuthority(c, cStr)
this.passwordTokenSeenFlag = true;
continue;
}
const encodedCodePoints = percentEncodeChar(codePoint, isUserinfoPercentEncode);
const encodedCodePoints = utf8PercentEncodeCodePoint(codePoint, isUserinfoPercentEncode);
if (this.passwordTokenSeenFlag) {
this.url.password += encodedCodePoints;
} else {
Expand Down Expand Up @@ -1059,7 +1015,7 @@ URLStateMachine.prototype["parse path"] = function parsePath(c) {
this.parseError = true;
}

this.buffer += percentEncodeChar(c, isPathPercentEncode);
this.buffer += utf8PercentEncodeCodePoint(c, isPathPercentEncode);
}

return true;
Expand All @@ -1085,45 +1041,33 @@ URLStateMachine.prototype["parse cannot-be-a-base-URL path"] = function parseCan
}

if (!isNaN(c)) {
this.url.path[0] += percentEncodeChar(c, isC0ControlPercentEncode);
this.url.path[0] += utf8PercentEncodeCodePoint(c, isC0ControlPercentEncode);
}
}

return true;
};

URLStateMachine.prototype["parse query"] = function parseQuery(c, cStr) {
if (isNaN(c) || (!this.stateOverride && c === p("#"))) {
if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
this.encodingOverride = "utf-8";
}

const buffer = Buffer.from(this.buffer); // TODO: Use encoding override instead
for (let i = 0; i < buffer.length; ++i) {
if (buffer[i] < 0x21 ||
buffer[i] > 0x7E ||
buffer[i] === 0x22 || buffer[i] === 0x23 || buffer[i] === 0x3C || buffer[i] === 0x3E ||
(buffer[i] === 0x27 && isSpecial(this.url))) {
this.url.query += percentEncode(buffer[i]);
} else {
this.url.query += String.fromCodePoint(buffer[i]);
}
}
URLStateMachine.prototype["parse query"] = function parseQuery(c) {
if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
this.encodingOverride = "utf-8";
}

this.buffer = "";
if (c === p("#")) {
this.url.fragment = "";
this.state = "fragment";
}
} else {
if (!this.stateOverride & c === p("#")) {
this.url.fragment = "";
this.state = "fragment";
} else if (!isNaN(c)) {
// TODO: If c is not a URL code point and not "%", parse error.

if (c === p("%") &&
(!infra.isASCIIHex(this.input[this.pointer + 1]) ||
!infra.isASCIIHex(this.input[this.pointer + 2]))) {
this.parseError = true;
}

this.buffer += cStr;
const queryPercentEncodePredicate = isSpecial(this.url) ? isSpecialQueryPercentEncode : isQueryPercentEncode;
// TODO: use "percent-encode after encoding" passing in this.encodingOverride
this.url.query += utf8PercentEncodeCodePoint(c, queryPercentEncodePredicate);
}

return true;
Expand All @@ -1138,7 +1082,7 @@ URLStateMachine.prototype["parse fragment"] = function parseFragment(c) {
this.parseError = true;
}

this.url.fragment += percentEncodeChar(c, isFragmentPercentEncode);
this.url.fragment += utf8PercentEncodeCodePoint(c, isFragmentPercentEncode);
}

return true;
Expand Down Expand Up @@ -1247,19 +1191,11 @@ module.exports.basicURLParse = function (input, options) {
};

module.exports.setTheUsername = function (url, username) {
url.username = "";
const decoded = punycode.ucs2.decode(username);
for (let i = 0; i < decoded.length; ++i) {
url.username += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
}
url.username = utf8PercentEncodeString(username, isUserinfoPercentEncode);
};

module.exports.setThePassword = function (url, password) {
url.password = "";
const decoded = punycode.ucs2.decode(password);
for (let i = 0; i < decoded.length; ++i) {
url.password += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
}
url.password = utf8PercentEncodeString(password, isUserinfoPercentEncode);
};

module.exports.serializeHost = serializeHost;
Expand Down
Loading

0 comments on commit cf7a7c1

Please sign in to comment.