From 40a36b3af8181a4d112f6d54f1dbd3824f6421ef Mon Sep 17 00:00:00 2001 From: Hakan Kimeiga Date: Fri, 8 Dec 2017 14:07:14 -0800 Subject: [PATCH] url: added url fragment lookup table Percent-encoded additional characters in fragment state with new FRAGMENT_ENCODE_SET lookup table. The fragment percent-encode set includes the C0 control percent-encode set and code points U+0020, U+0022, U+003C, U+003E, and U+0060. PR-URL: https://github.com/nodejs/node/pull/17627 Fixes: https://github.com/nodejs/node/issues/17540 Reviewed-By: Timothy Gu Reviewed-By: Daijiro Wachi Reviewed-By: Ruben Bridgewater Reviewed-By: James M Snell --- doc/api/url.md | 11 +++-- src/node_url.cc | 70 +++++++++++++++++++++++++- test/fixtures/url-setter-tests.js | 46 +++++++++++++++-- test/fixtures/url-tests.js | 82 +++++++++++++++++++++++++++---- 4 files changed, 192 insertions(+), 17 deletions(-) diff --git a/doc/api/url.md b/doc/api/url.md index 40cc002f048e81..55649313b3a2e4 100644 --- a/doc/api/url.md +++ b/doc/api/url.md @@ -1112,12 +1112,15 @@ forward slash (`/`) character is encoded as `%3C`. The [WHATWG URL Standard][] uses a more selective and fine grained approach to selecting encoded characters than that used by the Legacy API. -The WHATWG algorithm defines three "percent-encode sets" that describe ranges +The WHATWG algorithm defines four "percent-encode sets" that describe ranges of characters that must be percent-encoded: * The *C0 control percent-encode set* includes code points in range U+0000 to U+001F (inclusive) and all code points greater than U+007E. +* The *fragment percent-encode set* includes the *C0 control percent-encode set* + and code points U+0020, U+0022, U+003C, U+003E, and U+0060. + * The *path percent-encode set* includes the *C0 control percent-encode set* and code points U+0020, U+0022, U+0023, U+003C, U+003E, U+003F, U+0060, U+007B, and U+007D. 
@@ -1128,9 +1131,9 @@ of characters that must be percent-encoded: The *userinfo percent-encode set* is used exclusively for username and passwords encoded within the URL. The *path percent-encode set* is used for the -path of most URLs. The *C0 control percent-encode set* is used for all -other cases, including URL fragments in particular, but also host and path -under certain specific conditions. +path of most URLs. The *fragment percent-encode set* is used for URL fragments. +The *C0 control percent-encode set* is used for host and path under certain +specific conditions, in addition to all other cases. When non-ASCII characters appear within a hostname, the hostname is encoded using the [Punycode][] algorithm. Note, however, that a hostname *may* contain diff --git a/src/node_url.cc b/src/node_url.cc index 0f7992264ecf0f..a28859d285939b 100644 --- a/src/node_url.cc +++ b/src/node_url.cc @@ -332,6 +332,74 @@ const uint8_t C0_CONTROL_ENCODE_SET[32] = { 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 }; +const uint8_t FRAGMENT_ENCODE_SET[32] = { + // 00 01 02 03 04 05 06 07 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 08 09 0A 0B 0C 0D 0E 0F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 10 11 12 13 14 15 16 17 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 18 19 1A 1B 1C 1D 1E 1F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 20 21 22 23 24 25 26 27 + 0x01 | 0x00 | 0x04 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 28 29 2A 2B 2C 2D 2E 2F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 30 31 32 33 34 35 36 37 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 38 39 3A 3B 3C 3D 3E 3F + 0x00 | 0x00 | 0x00 | 0x00 | 0x10 | 0x00 | 0x40 | 0x00, + // 40 41 42 43 44 45 46 47 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 48 49 4A 4B 4C 4D 4E 4F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 50 51 52 53 54 55 56 57 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 
| 0x00 | 0x00, + // 58 59 5A 5B 5C 5D 5E 5F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 60 61 62 63 64 65 66 67 + 0x01 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 68 69 6A 6B 6C 6D 6E 6F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 70 71 72 73 74 75 76 77 + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00, + // 78 79 7A 7B 7C 7D 7E 7F + 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x00 | 0x80, + // 80 81 82 83 84 85 86 87 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 88 89 8A 8B 8C 8D 8E 8F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 90 91 92 93 94 95 96 97 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // 98 99 9A 9B 9C 9D 9E 9F + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A0 A1 A2 A3 A4 A5 A6 A7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // A8 A9 AA AB AC AD AE AF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B0 B1 B2 B3 B4 B5 B6 B7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // B8 B9 BA BB BC BD BE BF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C0 C1 C2 C3 C4 C5 C6 C7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // C8 C9 CA CB CC CD CE CF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D0 D1 D2 D3 D4 D5 D6 D7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // D8 D9 DA DB DC DD DE DF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E0 E1 E2 E3 E4 E5 E6 E7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // E8 E9 EA EB EC ED EE EF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F0 F1 F2 F3 F4 F5 F6 F7 + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, + // F8 F9 FA FB FC FD FE FF + 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80 +}; + + const uint8_t PATH_ENCODE_SET[32] = { // 00 01 02 03 04 05 06 07 0x01 | 0x02 | 0x04 | 0x08 | 0x10 | 0x20 | 0x40 | 0x80, @@ -1896,7 +1964,7 @@ void URL::Parse(const char* 
input, case 0: break; default: - AppendOrEscape(&buffer, ch, C0_CONTROL_ENCODE_SET); + AppendOrEscape(&buffer, ch, FRAGMENT_ENCODE_SET); } break; default: diff --git a/test/fixtures/url-setter-tests.js b/test/fixtures/url-setter-tests.js index 6f769eaec7543d..289812cb7e33ff 100644 --- a/test/fixtures/url-setter-tests.js +++ b/test/fixtures/url-setter-tests.js @@ -2,7 +2,7 @@ /* The following tests are copied from WPT. Modifications to them should be upstreamed first. Refs: - https://github.com/w3c/web-platform-tests/blob/b30abaecf4/url/setters_tests.json + https://github.com/w3c/web-platform-tests/blob/ed4bb727ed/url/setters_tests.json License: http://www.w3.org/Consortium/Legal/2008/04-testsuite-copyright.html */ module.exports = @@ -1793,13 +1793,53 @@ module.exports = "hash": "" } }, + { + "href": "http://example.net", + "new_value": "#foo bar", + "expected": { + "href": "http://example.net/#foo%20bar", + "hash": "#foo%20bar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo\"bar", + "expected": { + "href": "http://example.net/#foo%22bar", + "hash": "#foo%22bar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo<bar", + "expected": { + "href": "http://example.net/#foo%3Cbar", + "hash": "#foo%3Cbar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo>bar", + "expected": { + "href": "http://example.net/#foo%3Ebar", + "hash": "#foo%3Ebar" + } + }, + { + "href": "http://example.net", + "new_value": "#foo`bar", + "expected": { + "href": "http://example.net/#foo%60bar", + "hash": "#foo%60bar" + } + }, { "comment": "Simple percent-encoding; nuls, tabs, and newlines are removed", "href": "a:/", "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", "expected": { - "href": "a:/#%01%1F !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", - "hash": "#%01%1F !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + "href": "a:/#%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "hash":
"#%01%1F%20!%22#$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_%60az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" } }, { diff --git a/test/fixtures/url-tests.js b/test/fixtures/url-tests.js index 48f77fe0774d64..745adb8d8aab43 100644 --- a/test/fixtures/url-tests.js +++ b/test/fixtures/url-tests.js @@ -2,7 +2,7 @@ /* The following tests are copied from WPT. Modifications to them should be upstreamed first. Refs: - https://github.com/w3c/web-platform-tests/blob/11757f1/url/urltestdata.json + https://github.com/w3c/web-platform-tests/blob/ed4bb727ed/url/urltestdata.json License: http://www.w3.org/Consortium/Legal/2008/04-testsuite-copyright.html */ module.exports = @@ -161,7 +161,7 @@ module.exports = { "input": "http://f:21/ b ? d # e ", "base": "http://example.org/foo/bar", - "href": "http://f:21/%20b%20?%20d%20# e", + "href": "http://f:21/%20b%20?%20d%20#%20e", "origin": "http://f:21", "protocol": "http:", "username": "", @@ -171,12 +171,12 @@ module.exports = "port": "21", "pathname": "/%20b%20", "search": "?%20d%20", - "hash": "# e" + "hash": "#%20e" }, { "input": "lolscheme:x x#x x", "base": "about:blank", - "href": "lolscheme:x x#x x", + "href": "lolscheme:x x#x%20x", "protocol": "lolscheme:", "username": "", "password": "", @@ -185,7 +185,7 @@ module.exports = "port": "", "pathname": "x x", "search": "", - "hash": "#x x" + "hash": "#x%20x" }, { "input": "http://f:/c", @@ -2268,7 +2268,7 @@ module.exports = { "input": "http://www.google.com/foo?bar=baz# »", "base": "about:blank", - "href": "http://www.google.com/foo?bar=baz# %C2%BB", + "href": "http://www.google.com/foo?bar=baz#%20%C2%BB", "origin": "http://www.google.com", "protocol": "http:", "username": "", @@ -2278,12 +2278,12 @@ module.exports = "port": "", "pathname": "/foo", "search": "?bar=baz", - "hash": "# %C2%BB" + "hash": "#%20%C2%BB" }, { "input": "data:test# »", "base": "about:blank", - "href": "data:test# %C2%BB", + "href": "data:test#%20%C2%BB", "origin": "null", "protocol": "data:", "username": "", @@ -2293,7 +2293,7 @@ 
module.exports = "port": "", "pathname": "test", "search": "", - "hash": "# %C2%BB" + "hash": "#%20%C2%BB" }, { "input": "http://www.google.com", @@ -4795,6 +4795,70 @@ module.exports = "searchParams": "qux=", "hash": "#foo%08bar" }, + { + "input": "http://foo.bar/baz?qux#foo\"bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%22bar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%22bar" + }, + { + "input": "http://foo.bar/baz?qux#foo<bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%3Cbar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%3Cbar" + }, + { + "input": "http://foo.bar/baz?qux#foo>bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%3Ebar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%3Ebar" + }, + { + "input": "http://foo.bar/baz?qux#foo`bar", + "base": "about:blank", + "href": "http://foo.bar/baz?qux#foo%60bar", + "origin": "http://foo.bar", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.bar", + "hostname": "foo.bar", + "port": "", + "pathname": "/baz", + "search": "?qux", + "searchParams": "qux=", + "hash": "#foo%60bar" + }, "# IPv4 parsing (via https://github.com/nodejs/node/pull/10317)", { "input": "http://192.168.257",