Spec update: make everything use percent-encode sets

Follows whatwg/url#518. This generally tries to make the code correspond more explicitly to the specification in a few ways. It doesn't handle non-UTF-8 encodings yet, though. Supersedes #152.
jsdom · Jun 25, 2020 · cf7a7c1 · cf7a7c1
1 parent 9b34f29
commit cf7a7c1
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 172 deletions.
diff --git a/src/URL-impl.js b/src/URL-impl.js
@@ -46,7 +46,7 @@ exports.implementation = class URLImpl {
     this._query._list.splice(0);
     const { query } = parsedURL;
     if (query !== null) {
-      this._query._list = urlencoded.parseUrlencoded(query);
+      this._query._list = urlencoded.parseUrlencodedString(query);
     }
   }
 
@@ -185,7 +185,7 @@ exports.implementation = class URLImpl {
     const input = v[0] === "?" ? v.substring(1) : v;
     url.query = "";
     usm.basicURLParse(input, { url, stateOverride: "query" });
-    this._query._list = urlencoded.parseUrlencoded(input);
+    this._query._list = urlencoded.parseUrlencodedString(input);
   }
 
   get searchParams() {

diff --git a/src/URLSearchParams-impl.js b/src/URLSearchParams-impl.js
@@ -26,7 +26,7 @@ exports.implementation = class URLSearchParamsImpl {
         this._list.push([name, value]);
       }
     } else {
-      this._list = urlencoded.parseUrlencoded(init);
+      this._list = urlencoded.parseUrlencodedString(init);
     }
   }
 

diff --git a/src/encoding.js b/src/encoding.js
@@ -0,0 +1,18 @@
+"use strict";
+const { TextEncoder, TextDecoder } = require("util");
+
+// Shared UTF-8 encoder/decoder instances, created once when the module loads.
+// With ignoreBOM: true the decoder keeps a leading byte order mark in its output instead of stripping it.
+const utf8Encoder = new TextEncoder();
+const utf8Decoder = new TextDecoder("utf-8", { ignoreBOM: true });
+
+// Encodes a JS string as UTF-8 with the shared TextEncoder and returns the resulting bytes.
+function utf8Encode(string) {
+  return utf8Encoder.encode(string);
+}
+
+// Decodes bytes as UTF-8 with the shared decoder. That decoder is created with ignoreBOM: true, so a leading byte
+// order mark is NOT stripped from the result (presumably mirroring the Encoding Standard's "UTF-8 decode without
+// BOM" algorithm; confirm against the spec).
+function utf8DecodeWithoutBOM(bytes) {
+  return utf8Decoder.decode(bytes);
+}
+
+module.exports = {
+  utf8Encode,
+  utf8DecodeWithoutBOM
+};
diff --git a/src/infra.js b/src/infra.js
@@ -1,5 +1,7 @@
 "use strict";
 
+// Note that we take code points as JS numbers, not JS strings.
+
 function isASCIIDigit(c) {
   return c >= 0x30 && c <= 0x39;
 }

diff --git a/src/percent-encoding.js b/src/percent-encoding.js
@@ -0,0 +1,139 @@
+"use strict";
+const { isASCIIHex } = require("./infra");
+const { utf8Encode } = require("./encoding");
+
+// Returns the code point, as a JS number, of the first character of the given string.
+function p(char) {
+  return char.codePointAt(0);
+}
+
+// https://whatpr.org/url/518.html#percent-encode
+// Formats the number c as "%" followed by its uppercase hexadecimal digits, zero-padded to at least two digits.
+function percentEncode(c) {
+  const digits = c.toString(16).toUpperCase();
+  const padded = digits.length === 1 ? "0" + digits : digits;
+  return "%" + padded;
+}
+
+// https://whatpr.org/url/518.html#percent-decode
+// Replaces every "%" that is followed by two ASCII hex digits with the byte those digits spell out. All other
+// bytes, including a "%" that is not followed by two hex digits, are copied through unchanged.
+function percentDecodeBytes(input) {
+  // Decoding never produces more bytes than it consumes, so the input length bounds the scratch buffer.
+  const decoded = new Uint8Array(input.byteLength);
+  let written = 0;
+  let index = 0;
+  while (index < input.byteLength) {
+    const current = input[index];
+    const isEscape = current === 0x25 && isASCIIHex(input[index + 1]) && isASCIIHex(input[index + 2]);
+    if (isEscape) {
+      const hexDigits = String.fromCodePoint(input[index + 1], input[index + 2]);
+      decoded[written++] = parseInt(hexDigits, 16);
+      index += 3;
+    } else {
+      decoded[written++] = current;
+      index += 1;
+    }
+  }
+
+  // Trim the unused tail of the scratch buffer.
+  return decoded.slice(0, written);
+}
+
+// https://whatpr.org/url/518.html#string-percent-decode
+// UTF-8 encodes the string and percent-decodes the resulting bytes. The result is bytes, not a string.
+function percentDecodeString(input) {
+  return percentDecodeBytes(utf8Encode(input));
+}
+
+// https://whatpr.org/url/518.html#c0-control-percent-encode-set
+// True for values up to and including 0x1F and for values above 0x7E; c is a JS number.
+function isC0ControlPercentEncode(c) {
+  if (c <= 0x1F) {
+    return true;
+  }
+  return c > 0x7E;
+}
+
+// https://whatpr.org/url/518.html#fragment-percent-encode-set
+// The C0 control percent-encode set plus space, ", <, > and `. c must be a JS number; the set holds code points.
+const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]);
+function isFragmentPercentEncode(c) {
+  return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#query-percent-encode-set
+// The C0 control percent-encode set plus space, ", #, < and >. c must be a JS number; the set holds code points.
+const extraQueryPercentEncodeSet = new Set([p(" "), p("\""), p("#"), p("<"), p(">")]);
+function isQueryPercentEncode(c) {
+  return isC0ControlPercentEncode(c) || extraQueryPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#special-query-percent-encode-set
+// The query percent-encode set plus the apostrophe ('). c must be a JS number.
+function isSpecialQueryPercentEncode(c) {
+  return isQueryPercentEncode(c) || c === p("'");
+}
+
+// https://whatpr.org/url/518.html#path-percent-encode-set
+// The query percent-encode set plus ?, `, { and }. c must be a JS number; the set holds code points.
+const extraPathPercentEncodeSet = new Set([p("?"), p("`"), p("{"), p("}")]);
+function isPathPercentEncode(c) {
+  return isQueryPercentEncode(c) || extraPathPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#userinfo-percent-encode-set
+// The path percent-encode set plus / : ; = @ [ \ ] ^ and |. c must be a JS number; the set holds code points.
+const extraUserinfoPercentEncodeSet =
+  new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]);
+function isUserinfoPercentEncode(c) {
+  return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#application-x-www-form-urlencoded-percent-encode-set
+// The userinfo percent-encode set plus ! $ % & ' ( ) + , and ~. c must be a JS number; the set holds code points.
+const extraURLEncodedPercentEncodeSet = new Set([
+  p("!"), p("$"), p("%"), p("&"), p("'"),
+  p("("), p(")"), p("+"), p(","), p("~")
+]);
+function isURLEncodedPercentEncode(c) {
+  return isUserinfoPercentEncode(c) || extraURLEncodedPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#code-point-percent-encode-after-encoding
+// https://whatpr.org/url/518.html#utf-8-percent-encode
+// Only UTF-8 is handled, which lets us drop one of the spec's logic branches. TODO: support other encodings.
+// This "-Internal" variant receives the code point as a JS string; the exported variant takes a JS number, like
+// the rest of the codebase.
+function utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate) {
+  // The predicate is asked about each UTF-8 byte rather than the code point, a slight departure from the spec.
+  const pieces = Array.from(utf8Encode(codePoint), byte => {
+    return percentEncodePredicate(byte) ? percentEncode(byte) : String.fromCharCode(byte);
+  });
+
+  return pieces.join("");
+}
+
+// Exported variant of utf8PercentEncodeCodePointInternal: takes the code point as a JS number and converts it to
+// a string before delegating.
+function utf8PercentEncodeCodePoint(codePoint, percentEncodePredicate) {
+  return utf8PercentEncodeCodePointInternal(String.fromCodePoint(codePoint), percentEncodePredicate);
+}
+
+// https://whatpr.org/url/518.html#string-percent-encode-after-encoding
+// https://whatpr.org/url/518.html#string-utf-8-percent-encode
+// Percent-encodes input one code point at a time. When spaceAsPlus is true, every " " becomes "+" without the
+// predicate being consulted.
+function utf8PercentEncodeString(input, percentEncodePredicate, spaceAsPlus = false) {
+  const pieces = [...input].map(codePoint => {
+    if (spaceAsPlus && codePoint === " ") {
+      return "+";
+    }
+    return utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate);
+  });
+  return pieces.join("");
+}
+
+module.exports = {
+  isC0ControlPercentEncode,
+  isFragmentPercentEncode,
+  isQueryPercentEncode,
+  isSpecialQueryPercentEncode,
+  isPathPercentEncode,
+  isUserinfoPercentEncode,
+  isURLEncodedPercentEncode,
+  percentDecodeString,
+  percentDecodeBytes,
+  utf8PercentEncodeString,
+  utf8PercentEncodeCodePoint
+};
diff --git a/src/url-state-machine.js b/src/url-state-machine.js
@@ -3,7 +3,10 @@ const punycode = require("punycode");
 const tr46 = require("tr46");
 
 const infra = require("./infra");
-const { percentEncode, percentDecode } = require("./urlencoded");
+const { utf8DecodeWithoutBOM } = require("./encoding");
+const { percentDecodeString, utf8PercentEncodeCodePoint, utf8PercentEncodeString, isC0ControlPercentEncode,
+  isFragmentPercentEncode, isQueryPercentEncode, isSpecialQueryPercentEncode, isPathPercentEncode,
+  isUserinfoPercentEncode } = require("./percent-encoding");
 
 function p(char) {
   return char.codePointAt(0);
@@ -21,7 +24,7 @@ const specialSchemes = {
 const failure = Symbol("failure");
 
 function countSymbols(str) {
-  return punycode.ucs2.decode(str).length;
+  return [...str].length; // spread iterates by code point, so a surrogate pair counts once
 }
 
 function at(input, idx) {
@@ -74,48 +77,6 @@ function defaultPort(scheme) {
   return specialSchemes[scheme];
 }
 
-function utf8PercentEncode(c) {
-  const buf = Buffer.from(c);
-
-  let str = "";
-
-  for (let i = 0; i < buf.length; ++i) {
-    str += percentEncode(buf[i]);
-  }
-
-  return str;
-}
-
-function isC0ControlPercentEncode(c) {
-  return c <= 0x1F || c > 0x7E;
-}
-
-const extraUserinfoPercentEncodeSet =
-  new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]);
-function isUserinfoPercentEncode(c) {
-  return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c);
-}
-
-const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]);
-function isFragmentPercentEncode(c) {
-  return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c);
-}
-
-const extraPathPercentEncodeSet = new Set([p("#"), p("?"), p("{"), p("}")]);
-function isPathPercentEncode(c) {
-  return isFragmentPercentEncode(c) || extraPathPercentEncodeSet.has(c);
-}
-
-function percentEncodeChar(c, encodeSetPredicate) {
-  const cStr = String.fromCodePoint(c);
-
-  if (encodeSetPredicate(c)) {
-    return utf8PercentEncode(cStr);
-  }
-
-  return cStr;
-}
-
 function parseIPv4Number(input) {
   let R = 10;
 
@@ -377,7 +338,7 @@ function parseHost(input, isNotSpecialArg = false) {
     return parseOpaqueHost(input);
   }
 
-  const domain = percentDecode(Buffer.from(input)).toString();
+  const domain = utf8DecodeWithoutBOM(percentDecodeString(input));
   const asciiDomain = domainToASCII(domain);
   if (asciiDomain === failure) {
     return failure;
@@ -400,12 +361,7 @@ function parseOpaqueHost(input) {
     return failure;
   }
 
-  let output = "";
-  const decoded = punycode.ucs2.decode(input);
-  for (let i = 0; i < decoded.length; ++i) {
-    output += percentEncodeChar(decoded[i], isC0ControlPercentEncode);
-  }
-  return output;
+  return utf8PercentEncodeString(input, isC0ControlPercentEncode);
 }
 
 function findLongestZeroSequence(arr) {
@@ -769,7 +725,7 @@ URLStateMachine.prototype["parse authority"] = function parseAuthority(c, cStr)
         this.passwordTokenSeenFlag = true;
         continue;
       }
-      const encodedCodePoints = percentEncodeChar(codePoint, isUserinfoPercentEncode);
+      const encodedCodePoints = utf8PercentEncodeCodePoint(codePoint, isUserinfoPercentEncode);
       if (this.passwordTokenSeenFlag) {
         this.url.password += encodedCodePoints;
       } else {
@@ -1059,7 +1015,7 @@ URLStateMachine.prototype["parse path"] = function parsePath(c) {
       this.parseError = true;
     }
 
-    this.buffer += percentEncodeChar(c, isPathPercentEncode);
+    this.buffer += utf8PercentEncodeCodePoint(c, isPathPercentEncode);
   }
 
   return true;
@@ -1085,45 +1041,33 @@ URLStateMachine.prototype["parse cannot-be-a-base-URL path"] = function parseCan
     }
 
     if (!isNaN(c)) {
-      this.url.path[0] += percentEncodeChar(c, isC0ControlPercentEncode);
+      this.url.path[0] += utf8PercentEncodeCodePoint(c, isC0ControlPercentEncode);
     }
   }
 
   return true;
 };
 
-URLStateMachine.prototype["parse query"] = function parseQuery(c, cStr) {
-  if (isNaN(c) || (!this.stateOverride && c === p("#"))) {
-    if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
-      this.encodingOverride = "utf-8";
-    }
-
-    const buffer = Buffer.from(this.buffer); // TODO: Use encoding override instead
-    for (let i = 0; i < buffer.length; ++i) {
-      if (buffer[i] < 0x21 ||
-          buffer[i] > 0x7E ||
-          buffer[i] === 0x22 || buffer[i] === 0x23 || buffer[i] === 0x3C || buffer[i] === 0x3E ||
-          (buffer[i] === 0x27 && isSpecial(this.url))) {
-        this.url.query += percentEncode(buffer[i]);
-      } else {
-        this.url.query += String.fromCodePoint(buffer[i]);
-      }
-    }
+URLStateMachine.prototype["parse query"] = function parseQuery(c) {
+  if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
+    this.encodingOverride = "utf-8"; // non-special, ws and wss URLs always get a UTF-8 encoding override
+  }
 
-    this.buffer = "";
-    if (c === p("#")) {
-      this.url.fragment = "";
-      this.state = "fragment";
-    }
-  } else {
+  if (!this.stateOverride && c === p("#")) { // "#" starts the fragment unless a state override is active
+    this.url.fragment = "";
+    this.state = "fragment";
+  } else if (!isNaN(c)) {
     // TODO: If c is not a URL code point and not "%", parse error.
+
     if (c === p("%") &&
       (!infra.isASCIIHex(this.input[this.pointer + 1]) ||
         !infra.isASCIIHex(this.input[this.pointer + 2]))) {
       this.parseError = true;
     }
 
-    this.buffer += cStr;
+    const queryPercentEncodePredicate = isSpecial(this.url) ? isSpecialQueryPercentEncode : isQueryPercentEncode;
+    // TODO: use "percent-encode after encoding" passing in this.encodingOverride
+    this.url.query += utf8PercentEncodeCodePoint(c, queryPercentEncodePredicate);
   }
 
   return true;
@@ -1138,7 +1082,7 @@ URLStateMachine.prototype["parse fragment"] = function parseFragment(c) {
       this.parseError = true;
     }
 
-    this.url.fragment += percentEncodeChar(c, isFragmentPercentEncode);
+    this.url.fragment += utf8PercentEncodeCodePoint(c, isFragmentPercentEncode);
   }
 
   return true;
@@ -1247,19 +1191,11 @@ module.exports.basicURLParse = function (input, options) {
 };
 
 module.exports.setTheUsername = function (url, username) {
-  url.username = "";
-  const decoded = punycode.ucs2.decode(username);
-  for (let i = 0; i < decoded.length; ++i) {
-    url.username += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
-  }
+  url.username = utf8PercentEncodeString(username, isUserinfoPercentEncode);
 };
 
 module.exports.setThePassword = function (url, password) {
-  url.password = "";
-  const decoded = punycode.ucs2.decode(password);
-  for (let i = 0; i < decoded.length; ++i) {
-    url.password += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
-  }
+  url.password = utf8PercentEncodeString(password, isUserinfoPercentEncode);
 };
 
 module.exports.serializeHost = serializeHost;