From cf7a7c1c29f6853faf79f50c97e0767da2182a52 Mon Sep 17 00:00:00 2001
From: Domenic Denicola <d@domenic.me>
Date: Thu, 25 Jun 2020 15:24:03 -0400
Subject: [PATCH] Spec update: make everything use percent-encode sets

Follows https://github.com/whatwg/url/pull/518.

This generally tries to make the code correspond more explicitly to the specification in a few ways. It doesn't handle non-UTF-8 encodings yet, though.

Supersedes #152.
---
 src/URL-impl.js             |   4 +-
 src/URLSearchParams-impl.js |   2 +-
 src/encoding.js             |  18 +++++
 src/infra.js                |   2 +
 src/percent-encoding.js     | 139 ++++++++++++++++++++++++++++++++++++
 src/url-state-machine.js    | 114 +++++++----------------------
 src/urlencoded.js           | 125 ++++++++++++--------------------
 7 files changed, 232 insertions(+), 172 deletions(-)
 create mode 100644 src/encoding.js
 create mode 100644 src/percent-encoding.js

diff --git a/src/URL-impl.js b/src/URL-impl.js
index 5818667..363aa26 100644
--- a/src/URL-impl.js
+++ b/src/URL-impl.js
@@ -46,7 +46,7 @@ exports.implementation = class URLImpl {
     this._query._list.splice(0);
     const { query } = parsedURL;
     if (query !== null) {
-      this._query._list = urlencoded.parseUrlencoded(query);
+      this._query._list = urlencoded.parseUrlencodedString(query);
     }
   }
 
@@ -185,7 +185,7 @@ exports.implementation = class URLImpl {
     const input = v[0] === "?" ? v.substring(1) : v;
     url.query = "";
     usm.basicURLParse(input, { url, stateOverride: "query" });
-    this._query._list = urlencoded.parseUrlencoded(input);
+    this._query._list = urlencoded.parseUrlencodedString(input);
   }
 
   get searchParams() {
diff --git a/src/URLSearchParams-impl.js b/src/URLSearchParams-impl.js
index b59d677..3f10ec8 100644
--- a/src/URLSearchParams-impl.js
+++ b/src/URLSearchParams-impl.js
@@ -26,7 +26,7 @@ exports.implementation = class URLSearchParamsImpl {
         this._list.push([name, value]);
       }
     } else {
-      this._list = urlencoded.parseUrlencoded(init);
+      this._list = urlencoded.parseUrlencodedString(init);
     }
   }
 
diff --git a/src/encoding.js b/src/encoding.js
new file mode 100644
index 0000000..6a5abf4
--- /dev/null
+++ b/src/encoding.js
@@ -0,0 +1,18 @@
+"use strict";
+const { TextEncoder, TextDecoder } = require("util");
+
+const utf8Encoder = new TextEncoder();
+const utf8Decoder = new TextDecoder("utf-8", { ignoreBOM: true });
+
+function utf8Encode(string) {
+  return utf8Encoder.encode(string);
+}
+
+function utf8DecodeWithoutBOM(bytes) {
+  return utf8Decoder.decode(bytes);
+}
+
+module.exports = {
+  utf8Encode,
+  utf8DecodeWithoutBOM
+};
diff --git a/src/infra.js b/src/infra.js
index 2a712f8..4a984a3 100644
--- a/src/infra.js
+++ b/src/infra.js
@@ -1,5 +1,7 @@
 "use strict";
 
+// Note that we take code points as JS numbers, not JS strings.
+
 function isASCIIDigit(c) {
   return c >= 0x30 && c <= 0x39;
 }
diff --git a/src/percent-encoding.js b/src/percent-encoding.js
new file mode 100644
index 0000000..587a1ba
--- /dev/null
+++ b/src/percent-encoding.js
@@ -0,0 +1,139 @@
+"use strict";
+const { isASCIIHex } = require("./infra");
+const { utf8Encode } = require("./encoding");
+
+function p(char) {
+  return char.codePointAt(0);
+}
+
+// https://whatpr.org/url/518.html#percent-encode
+function percentEncode(c) {
+  let hex = c.toString(16).toUpperCase();
+  if (hex.length === 1) {
+    hex = "0" + hex;
+  }
+
+  return "%" + hex;
+}
+
+// https://whatpr.org/url/518.html#percent-decode
+function percentDecodeBytes(input) {
+  const output = new Uint8Array(input.byteLength);
+  let outputIndex = 0;
+  for (let i = 0; i < input.byteLength; ++i) {
+    const byte = input[i];
+    if (byte !== 0x25) {
+      output[outputIndex++] = byte;
+    } else if (byte === 0x25 && (!isASCIIHex(input[i + 1]) || !isASCIIHex(input[i + 2]))) {
+      output[outputIndex++] = byte;
+    } else {
+      const bytePoint = parseInt(String.fromCodePoint(input[i + 1], input[i + 2]), 16);
+      output[outputIndex++] = bytePoint;
+      i += 2;
+    }
+  }
+
+  return output.slice(0, outputIndex);
+}
+
+// https://whatpr.org/url/518.html#string-percent-decode
+function percentDecodeString(input) {
+  const bytes = utf8Encode(input);
+  return percentDecodeBytes(bytes);
+}
+
+// https://whatpr.org/url/518.html#c0-control-percent-encode-set
+function isC0ControlPercentEncode(c) {
+  return c <= 0x1F || c > 0x7E;
+}
+
+// https://whatpr.org/url/518.html#fragment-percent-encode-set
+const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]);
+function isFragmentPercentEncode(c) {
+  return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#query-percent-encode-set
+const extraQueryPercentEncodeSet = new Set([p(" "), p("\""), p("#"), p("<"), p(">")]);
+function isQueryPercentEncode(c) {
+  return isC0ControlPercentEncode(c) || extraQueryPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#special-query-percent-encode-set
+function isSpecialQueryPercentEncode(c) {
+  return isQueryPercentEncode(c) || c === p("'");
+}
+
+// https://whatpr.org/url/518.html#path-percent-encode-set
+const extraPathPercentEncodeSet = new Set([p("?"), p("`"), p("{"), p("}")]);
+function isPathPercentEncode(c) {
+  return isQueryPercentEncode(c) || extraPathPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#userinfo-percent-encode-set
+const extraUserinfoPercentEncodeSet =
+  new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]);
+function isUserinfoPercentEncode(c) {
+  return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#application-x-www-form-urlencoded-percent-encode-set
+const extraURLEncodedPercentEncodeSet = new Set([
+  p("!"), p("$"), p("%"), p("&"), p("'"),
+  p("("), p(")"), p("+"), p(","), p("~")
+]);
+function isURLEncodedPercentEncode(c) {
+  return isUserinfoPercentEncode(c) || extraURLEncodedPercentEncodeSet.has(c);
+}
+
+// https://whatpr.org/url/518.html#code-point-percent-encode-after-encoding
+// https://whatpr.org/url/518.html#utf-8-percent-encode
+// Assuming encoding is always utf-8 allows us to trim one of the logic branches. TODO: support encoding.
+// The "-Internal" variant here has code points as JS strings. The external version used by other files has code points
+// as JS numbers, like the rest of the codebase.
+function utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate) {
+  const bytes = utf8Encode(codePoint);
+  let output = "";
+  for (const byte of bytes) {
+    // Our percentEncodePredicate operates on bytes, not code points, so this is slightly different from the spec.
+    if (!percentEncodePredicate(byte)) {
+      output += String.fromCharCode(byte);
+    } else {
+      output += percentEncode(byte);
+    }
+  }
+
+  return output;
+}
+
+function utf8PercentEncodeCodePoint(codePoint, percentEncodePredicate) {
+  return utf8PercentEncodeCodePointInternal(String.fromCodePoint(codePoint), percentEncodePredicate);
+}
+
+// https://whatpr.org/url/518.html#string-percent-encode-after-encoding
+// https://whatpr.org/url/518.html#string-utf-8-percent-encode
+function utf8PercentEncodeString(input, percentEncodePredicate, spaceAsPlus = false) {
+  let output = "";
+  for (const codePoint of input) {
+    if (spaceAsPlus && codePoint === " ") {
+      output += "+";
+    } else {
+      output += utf8PercentEncodeCodePointInternal(codePoint, percentEncodePredicate);
+    }
+  }
+  return output;
+}
+
+module.exports = {
+  isC0ControlPercentEncode,
+  isFragmentPercentEncode,
+  isQueryPercentEncode,
+  isSpecialQueryPercentEncode,
+  isPathPercentEncode,
+  isUserinfoPercentEncode,
+  isURLEncodedPercentEncode,
+  percentDecodeString,
+  percentDecodeBytes,
+  utf8PercentEncodeString,
+  utf8PercentEncodeCodePoint
+};
diff --git a/src/url-state-machine.js b/src/url-state-machine.js
index 1159879..3dc247c 100644
--- a/src/url-state-machine.js
+++ b/src/url-state-machine.js
@@ -3,7 +3,10 @@ const punycode = require("punycode");
 const tr46 = require("tr46");
 
 const infra = require("./infra");
-const { percentEncode, percentDecode } = require("./urlencoded");
+const { utf8DecodeWithoutBOM } = require("./encoding");
+const { percentDecodeString, utf8PercentEncodeCodePoint, utf8PercentEncodeString, isC0ControlPercentEncode,
+  isFragmentPercentEncode, isQueryPercentEncode, isSpecialQueryPercentEncode, isPathPercentEncode,
+  isUserinfoPercentEncode } = require("./percent-encoding");
 
 function p(char) {
   return char.codePointAt(0);
@@ -21,7 +24,7 @@ const specialSchemes = {
 const failure = Symbol("failure");
 
 function countSymbols(str) {
-  return punycode.ucs2.decode(str).length;
+  return [...str].length;
 }
 
 function at(input, idx) {
@@ -74,48 +77,6 @@ function defaultPort(scheme) {
   return specialSchemes[scheme];
 }
 
-function utf8PercentEncode(c) {
-  const buf = Buffer.from(c);
-
-  let str = "";
-
-  for (let i = 0; i < buf.length; ++i) {
-    str += percentEncode(buf[i]);
-  }
-
-  return str;
-}
-
-function isC0ControlPercentEncode(c) {
-  return c <= 0x1F || c > 0x7E;
-}
-
-const extraUserinfoPercentEncodeSet =
-  new Set([p("/"), p(":"), p(";"), p("="), p("@"), p("["), p("\\"), p("]"), p("^"), p("|")]);
-function isUserinfoPercentEncode(c) {
-  return isPathPercentEncode(c) || extraUserinfoPercentEncodeSet.has(c);
-}
-
-const extraFragmentPercentEncodeSet = new Set([p(" "), p("\""), p("<"), p(">"), p("`")]);
-function isFragmentPercentEncode(c) {
-  return isC0ControlPercentEncode(c) || extraFragmentPercentEncodeSet.has(c);
-}
-
-const extraPathPercentEncodeSet = new Set([p("#"), p("?"), p("{"), p("}")]);
-function isPathPercentEncode(c) {
-  return isFragmentPercentEncode(c) || extraPathPercentEncodeSet.has(c);
-}
-
-function percentEncodeChar(c, encodeSetPredicate) {
-  const cStr = String.fromCodePoint(c);
-
-  if (encodeSetPredicate(c)) {
-    return utf8PercentEncode(cStr);
-  }
-
-  return cStr;
-}
-
 function parseIPv4Number(input) {
   let R = 10;
 
@@ -377,7 +338,7 @@ function parseHost(input, isNotSpecialArg = false) {
     return parseOpaqueHost(input);
   }
 
-  const domain = percentDecode(Buffer.from(input)).toString();
+  const domain = utf8DecodeWithoutBOM(percentDecodeString(input));
   const asciiDomain = domainToASCII(domain);
   if (asciiDomain === failure) {
     return failure;
@@ -400,12 +361,7 @@ function parseOpaqueHost(input) {
     return failure;
   }
 
-  let output = "";
-  const decoded = punycode.ucs2.decode(input);
-  for (let i = 0; i < decoded.length; ++i) {
-    output += percentEncodeChar(decoded[i], isC0ControlPercentEncode);
-  }
-  return output;
+  return utf8PercentEncodeString(input, isC0ControlPercentEncode);
 }
 
 function findLongestZeroSequence(arr) {
@@ -769,7 +725,7 @@ URLStateMachine.prototype["parse authority"] = function parseAuthority(c, cStr)
         this.passwordTokenSeenFlag = true;
         continue;
       }
-      const encodedCodePoints = percentEncodeChar(codePoint, isUserinfoPercentEncode);
+      const encodedCodePoints = utf8PercentEncodeCodePoint(codePoint, isUserinfoPercentEncode);
       if (this.passwordTokenSeenFlag) {
         this.url.password += encodedCodePoints;
       } else {
@@ -1059,7 +1015,7 @@ URLStateMachine.prototype["parse path"] = function parsePath(c) {
       this.parseError = true;
     }
 
-    this.buffer += percentEncodeChar(c, isPathPercentEncode);
+    this.buffer += utf8PercentEncodeCodePoint(c, isPathPercentEncode);
   }
 
   return true;
@@ -1085,45 +1041,33 @@ URLStateMachine.prototype["parse cannot-be-a-base-URL path"] = function parseCan
     }
 
     if (!isNaN(c)) {
-      this.url.path[0] += percentEncodeChar(c, isC0ControlPercentEncode);
+      this.url.path[0] += utf8PercentEncodeCodePoint(c, isC0ControlPercentEncode);
     }
   }
 
   return true;
 };
 
-URLStateMachine.prototype["parse query"] = function parseQuery(c, cStr) {
-  if (isNaN(c) || (!this.stateOverride && c === p("#"))) {
-    if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
-      this.encodingOverride = "utf-8";
-    }
-
-    const buffer = Buffer.from(this.buffer); // TODO: Use encoding override instead
-    for (let i = 0; i < buffer.length; ++i) {
-      if (buffer[i] < 0x21 ||
-          buffer[i] > 0x7E ||
-          buffer[i] === 0x22 || buffer[i] === 0x23 || buffer[i] === 0x3C || buffer[i] === 0x3E ||
-          (buffer[i] === 0x27 && isSpecial(this.url))) {
-        this.url.query += percentEncode(buffer[i]);
-      } else {
-        this.url.query += String.fromCodePoint(buffer[i]);
-      }
-    }
+URLStateMachine.prototype["parse query"] = function parseQuery(c) {
+  if (!isSpecial(this.url) || this.url.scheme === "ws" || this.url.scheme === "wss") {
+    this.encodingOverride = "utf-8";
+  }
 
-    this.buffer = "";
-    if (c === p("#")) {
-      this.url.fragment = "";
-      this.state = "fragment";
-    }
-  } else {
+  if (!this.stateOverride && c === p("#")) {
+    this.url.fragment = "";
+    this.state = "fragment";
+  } else if (!isNaN(c)) {
     // TODO: If c is not a URL code point and not "%", parse error.
+
     if (c === p("%") &&
       (!infra.isASCIIHex(this.input[this.pointer + 1]) ||
         !infra.isASCIIHex(this.input[this.pointer + 2]))) {
       this.parseError = true;
     }
 
-    this.buffer += cStr;
+    const queryPercentEncodePredicate = isSpecial(this.url) ? isSpecialQueryPercentEncode : isQueryPercentEncode;
+    // TODO: use "percent-encode after encoding" passing in this.encodingOverride
+    this.url.query += utf8PercentEncodeCodePoint(c, queryPercentEncodePredicate);
   }
 
   return true;
@@ -1138,7 +1082,7 @@ URLStateMachine.prototype["parse fragment"] = function parseFragment(c) {
       this.parseError = true;
     }
 
-    this.url.fragment += percentEncodeChar(c, isFragmentPercentEncode);
+    this.url.fragment += utf8PercentEncodeCodePoint(c, isFragmentPercentEncode);
   }
 
   return true;
@@ -1247,19 +1191,11 @@ module.exports.basicURLParse = function (input, options) {
 };
 
 module.exports.setTheUsername = function (url, username) {
-  url.username = "";
-  const decoded = punycode.ucs2.decode(username);
-  for (let i = 0; i < decoded.length; ++i) {
-    url.username += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
-  }
+  url.username = utf8PercentEncodeString(username, isUserinfoPercentEncode);
 };
 
 module.exports.setThePassword = function (url, password) {
-  url.password = "";
-  const decoded = punycode.ucs2.decode(password);
-  for (let i = 0; i < decoded.length; ++i) {
-    url.password += percentEncodeChar(decoded[i], isUserinfoPercentEncode);
-  }
+  url.password = utf8PercentEncodeString(password, isUserinfoPercentEncode);
 };
 
 module.exports.serializeHost = serializeHost;
diff --git a/src/urlencoded.js b/src/urlencoded.js
index 593755b..be84a76 100644
--- a/src/urlencoded.js
+++ b/src/urlencoded.js
@@ -1,57 +1,12 @@
 "use strict";
-const { isASCIIHex } = require("./infra");
+const { utf8Encode, utf8DecodeWithoutBOM } = require("./encoding");
+const { percentDecodeBytes, utf8PercentEncodeString, isURLEncodedPercentEncode } = require("./percent-encoding");
 
 function p(char) {
   return char.codePointAt(0);
 }
 
-function strictlySplitByteSequence(buf, cp) {
-  const list = [];
-  let last = 0;
-  let i = buf.indexOf(cp);
-  while (i >= 0) {
-    list.push(buf.slice(last, i));
-    last = i + 1;
-    i = buf.indexOf(cp, last);
-  }
-  if (last !== buf.length) {
-    list.push(buf.slice(last));
-  }
-  return list;
-}
-
-function replaceByteInByteSequence(buf, from, to) {
-  let i = buf.indexOf(from);
-  while (i >= 0) {
-    buf[i] = to;
-    i = buf.indexOf(from, i + 1);
-  }
-  return buf;
-}
-
-function percentEncode(c) {
-  let hex = c.toString(16).toUpperCase();
-  if (hex.length === 1) {
-    hex = "0" + hex;
-  }
-
-  return "%" + hex;
-}
-
-function percentDecode(input) {
-  const output = Buffer.alloc(input.byteLength);
-  let ptr = 0;
-  for (let i = 0; i < input.length; ++i) {
-    if (input[i] !== p("%") || !isASCIIHex(input[i + 1]) || !isASCIIHex(input[i + 2])) {
-      output[ptr++] = input[i];
-    } else {
-      output[ptr++] = parseInt(input.slice(i + 1, i + 3).toString(), 16);
-      i += 2;
-    }
-  }
-  return output.slice(0, ptr);
-}
-
+// https://whatpr.org/url/518.html#concept-urlencoded-parser
 function parseUrlencoded(input) {
   const sequences = strictlySplitByteSequence(input, p("&"));
   const output = [];
@@ -69,47 +24,39 @@ function parseUrlencoded(input) {
       value = bytes.slice(indexOfEqual + 1);
     } else {
       name = bytes;
-      value = Buffer.alloc(0);
+      value = new Uint8Array(0);
     }
 
-    name = replaceByteInByteSequence(Buffer.from(name), p("+"), p(" "));
-    value = replaceByteInByteSequence(Buffer.from(value), p("+"), p(" "));
+    name = replaceByteInByteSequence(name, 0x2B, 0x20);
+    value = replaceByteInByteSequence(value, 0x2B, 0x20);
+
+    const nameString = utf8DecodeWithoutBOM(percentDecodeBytes(name));
+    const valueString = utf8DecodeWithoutBOM(percentDecodeBytes(value));
 
-    output.push([percentDecode(name).toString(), percentDecode(value).toString()]);
+    output.push([nameString, valueString]);
   }
   return output;
 }
 
-function serializeUrlencodedByte(input) {
-  let output = "";
-  for (const byte of input) {
-    if (byte === p(" ")) {
-      output += "+";
-    } else if (byte === p("*") ||
-               byte === p("-") ||
-               byte === p(".") ||
-               (byte >= p("0") && byte <= p("9")) ||
-               (byte >= p("A") && byte <= p("Z")) ||
-               byte === p("_") ||
-               (byte >= p("a") && byte <= p("z"))) {
-      output += String.fromCodePoint(byte);
-    } else {
-      output += percentEncode(byte);
-    }
-  }
-  return output;
+// https://whatpr.org/url/518.html#concept-urlencoded-string-parser
+function parseUrlencodedString(input) {
+  return parseUrlencoded(utf8Encode(input));
 }
 
+// https://whatpr.org/url/518.html#concept-urlencoded-serializer
 function serializeUrlencoded(tuples, encodingOverride = undefined) {
   let encoding = "utf-8";
   if (encodingOverride !== undefined) {
+    // TODO "get the output encoding", i.e. handle encoding labels vs. names.
     encoding = encodingOverride;
   }
 
   let output = "";
   for (const [i, tuple] of tuples.entries()) {
     // TODO: handle encoding override
-    const name = serializeUrlencodedByte(Buffer.from(tuple[0]));
+
+    const name = utf8PercentEncodeString(tuple[0], isURLEncodedPercentEncode, true);
+
     let value = tuple[1];
     if (tuple.length > 2 && tuple[2] !== undefined) {
       if (tuple[2] === "hidden" && name === "_charset_") {
@@ -119,7 +66,9 @@ function serializeUrlencoded(tuples, encodingOverride = undefined) {
         value = value.name;
       }
     }
-    value = serializeUrlencodedByte(Buffer.from(value));
+
+    value = utf8PercentEncodeString(value, isURLEncodedPercentEncode, true);
+
     if (i !== 0) {
       output += "&";
     }
@@ -128,15 +77,31 @@ function serializeUrlencoded(tuples, encodingOverride = undefined) {
   return output;
 }
 
-module.exports = {
-  percentEncode,
-  percentDecode,
+function strictlySplitByteSequence(buf, cp) {
+  const list = [];
+  let last = 0;
+  let i = buf.indexOf(cp);
+  while (i >= 0) {
+    list.push(buf.slice(last, i));
+    last = i + 1;
+    i = buf.indexOf(cp, last);
+  }
+  if (last !== buf.length) {
+    list.push(buf.slice(last));
+  }
+  return list;
+}
 
-  // application/x-www-form-urlencoded string parser
-  parseUrlencoded(input) {
-    return parseUrlencoded(Buffer.from(input));
-  },
+function replaceByteInByteSequence(buf, from, to) {
+  let i = buf.indexOf(from);
+  while (i >= 0) {
+    buf[i] = to;
+    i = buf.indexOf(from, i + 1);
+  }
+  return buf;
+}
 
-  // application/x-www-form-urlencoded serializer
+module.exports = {
+  parseUrlencodedString,
   serializeUrlencoded
 };