From 8bca7cd8e7e2bb2cc86ce0d687426ec2f3aeaf7d Mon Sep 17 00:00:00 2001 From: Arjen Poutsma Date: Fri, 3 May 2024 13:34:03 +0200 Subject: [PATCH] Various UrlParser improvements - Consistent use of codePointAt instead of charAt. - Fix bug in domainToAscii See gh-32513 --- .../springframework/web/util/UrlParser.java | 176 ++++++++++-------- 1 file changed, 94 insertions(+), 82 deletions(-) diff --git a/spring-web/src/main/java/org/springframework/web/util/UrlParser.java b/spring-web/src/main/java/org/springframework/web/util/UrlParser.java index 713aff6ec6d4..599d348d6aef 100644 --- a/spring-web/src/main/java/org/springframework/web/util/UrlParser.java +++ b/spring-web/src/main/java/org/springframework/web/util/UrlParser.java @@ -179,7 +179,7 @@ private UrlRecord basicUrlParser(@Nullable UrlRecord url, @Nullable State stateO void sanitizeInput(boolean removeC0ControlOrSpace) { boolean strip = true; for (int i = 0; i < this.input.length(); i++) { - char c = this.input.charAt(i); + int c = this.input.codePointAt(i); boolean isSpaceOrC0 = c == ' ' || isC0Control(c); boolean isTabOrNL = c == '\t' || isNewline(c); if ((strip && isSpaceOrC0) || isTabOrNL) { @@ -204,7 +204,7 @@ else if (isTabOrNL) { } if (removeC0ControlOrSpace) { for (int i = this.input.length() - 1; i >= 0; i--) { - char c = this.input.charAt(i); + int c = this.input.codePointAt(i); if (c == ' ' || isC0Control(c)) { if (validate()) { // If input contains any (leading or) trailing C0 control or space, invalid-URL-unit validation error. @@ -224,7 +224,7 @@ private void setState(State newState) { if (logger.isTraceEnabled()) { String c; if (this.pointer < this.input.length()) { - c = Character.toString(this.input.charAt(this.pointer)); + c = Character.toString(this.input.codePointAt(this.pointer)); } else { c = "EOF"; @@ -265,16 +265,16 @@ private static LinkedList strictSplit(String input, int delimiter) { private static String domainToAscii(String domain, boolean beStrict) { // If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.) does not produce any item that starts with an ASCII case-insensitive match for "xn--", this step is equivalent to ASCII lowercasing domain. - boolean onlyLowerCase = !beStrict; if (!beStrict && containsOnlyAscii(domain)) { int dotIdx = domain.indexOf('.'); + boolean onlyLowerCase = true; while (dotIdx != -1) { if (domain.length() - dotIdx > 4) { // ASCII case-insensitive match for "xn--" - char ch0 = domain.charAt(dotIdx + 1); - char ch1 = domain.charAt(dotIdx + 2); - char ch2 = domain.charAt(dotIdx + 3); - char ch3 = domain.charAt(dotIdx + 4); + int ch0 = domain.codePointAt(dotIdx + 1); + int ch1 = domain.codePointAt(dotIdx + 2); + int ch2 = domain.codePointAt(dotIdx + 3); + int ch3 = domain.codePointAt(dotIdx + 4); if ((ch0 == 'x' || ch0 == 'X') && (ch1 == 'n' || ch1 == 'N') && ch2 == '-' && ch3 == '_') { @@ -284,9 +284,9 @@ private static String domainToAscii(String domain, boolean beStrict) { } dotIdx = domain.indexOf('.', dotIdx + 1); } - } - if (onlyLowerCase) { - return domain.toLowerCase(Locale.ENGLISH); + if (onlyLowerCase) { + return domain.toLowerCase(Locale.ENGLISH); + } } // Let result be the result of running Unicode ToASCII (https://www.unicode.org/reports/tr46/#ToASCII) with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46] int flag = 0; @@ -392,7 +392,7 @@ private static boolean isAsciiAlpha(int ch) { private static boolean containsOnlyAsciiDigits(CharSequence string) { for (int i=0; i< string.length(); i++ ) { - char ch = string.charAt(i); + int ch = codePointAt(string, i); if (!isAsciiDigit(ch)) { return false; } @@ -400,9 +400,9 @@ private static boolean containsOnlyAsciiDigits(CharSequence string) { return true; } - private static boolean containsOnlyAscii(CharSequence string) { - for (int i=0; i< string.length(); i++ ) { - char ch = string.charAt(i); + private static boolean containsOnlyAscii(String string) { + for (int i = 0; i < string.length(); i++) { + int ch = string.codePointAt(i); if (!isAsciiCodePoint(ch)) { return false; } @@ -505,7 +505,7 @@ private void emptyBuffer() { private int remaining(int deltaPos) { int pos = this.pointer + deltaPos + 1; if (pos < this.input.length()) { - return this.input.charAt(pos); + return this.input.codePointAt(pos); } else { return EOF; @@ -571,27 +571,27 @@ private static boolean isSingleDotPathSegment(StringBuilder b) { int len = b.length(); switch (len) { case 1 -> { - char ch0 = b.charAt(0); + int ch0 = b.codePointAt(0); return ch0 == '.'; } case 2 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); return ch0 == '/' && ch1 == '.'; } case 3 -> { // ASCII case-insensitive match for "%2e". - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E'); } case 4 -> { // ASCII case-insensitive match for "/%2e". - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); - char ch3 = b.charAt(3); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); + int ch3 = b.codePointAt(3); return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E'); } default -> { @@ -607,55 +607,55 @@ private static boolean isDoubleDotPathSegment(StringBuilder b) { int len = b.length(); switch (len) { case 2 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); return ch0 == '.' && ch1 == '.'; } case 3 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); return ch0 == '/' && ch1 == '.' && ch2 == '.'; } case 4 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); - char ch3 = b.charAt(3); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); + int ch3 = b.codePointAt(3); // case-insensitive match for ".%2e" or "%2e." return (ch0 == '.' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') || (ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E') && ch3 == '.')); } case 5 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); - char ch3 = b.charAt(3); - char ch4 = b.charAt(4); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); + int ch3 = b.codePointAt(3); + int ch4 = b.codePointAt(4); // case-insensitive match for "/.%2e" or "/%2e." return ch0 == '/' && (ch1 == '.' && ch2 == '%' && ch3 == '2' && (ch4 == 'e' || ch4 == 'E') || (ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') && ch4 == '.')); } case 6 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); - char ch3 = b.charAt(3); - char ch4 = b.charAt(4); - char ch5 = b.charAt(5); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); + int ch3 = b.codePointAt(3); + int ch4 = b.codePointAt(4); + int ch5 = b.codePointAt(5); // case-insensitive match for "%2e%2e". return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E') && ch3 == '%' && ch4 == '2' && (ch5 == 'e' || ch5 == 'E'); } case 7 -> { - char ch0 = b.charAt(0); - char ch1 = b.charAt(1); - char ch2 = b.charAt(2); - char ch3 = b.charAt(3); - char ch4 = b.charAt(4); - char ch5 = b.charAt(5); - char ch6 = b.charAt(6); + int ch0 = b.codePointAt(0); + int ch1 = b.codePointAt(1); + int ch2 = b.codePointAt(2); + int ch3 = b.codePointAt(3); + int ch4 = b.codePointAt(4); + int ch5 = b.codePointAt(5); + int ch6 = b.codePointAt(6); // case-insensitive match for "/%2e%2e". return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') && ch4 == '%' && ch5 == '2' && (ch6 == 'e' || ch6 == 'E'); @@ -686,7 +686,7 @@ private static boolean isWindowsDriveLetter(CharSequence input, boolean normaliz * its first two code points are a Windows drive letter * its length is 2 or its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#). */ - private static boolean startsWithWindowsDriveLetter(CharSequence input) { + private static boolean startsWithWindowsDriveLetter(String input) { int len = input.length(); if (len < 2) { return false; @@ -698,18 +698,18 @@ private static boolean startsWithWindowsDriveLetter(CharSequence input) { return true; } else { - char ch2 = input.charAt(2); + int ch2 = input.codePointAt(2); return ch2 == '/' || ch2 == '\\' || ch2 == '?' || ch2 == '#'; } } private static boolean isWindowsDriveLetterInternal(CharSequence s, boolean normalized) { - char ch0 = s.charAt(0); + int ch0 = codePointAt(s, 0); if (!isAsciiAlpha(ch0)) { return false; } else { - char ch1 = s.charAt(1); + int ch1 = codePointAt(s, 1); if (normalized) { return ch1 == ':'; } @@ -719,6 +719,18 @@ private static boolean isWindowsDriveLetterInternal(CharSequence s, boolean norm } } + private static int codePointAt(CharSequence s, int index) { + if (s instanceof String string) { + return string.codePointAt(index); + } + else if (s instanceof StringBuilder builder) { + return builder.codePointAt(index); + } + else { + throw new IllegalStateException(); + } + } + private enum State { @@ -1500,8 +1512,8 @@ else if (p.previousState != URL_TEMPLATE && c == '{') { // If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error. else if (c == '%' && (p.pointer >= p.input.length() - 2 || - !isAsciiHexDigit(p.input.charAt(p.pointer + 1)) || - !isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) { + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) || + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) { p.validationError("Invalid URL Unit: \"" + (char) c + "\""); } } @@ -1549,8 +1561,8 @@ else if (p.previousState != URL_TEMPLATE && c == '{') { // If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error. else if (c == '%' && (p.pointer >= p.input.length() - 2 || - !isAsciiHexDigit(p.input.charAt(p.pointer + 1)) || - !isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) { + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) || + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) { p.validationError("Invalid URL Unit: \"" + (char) c + "\""); } } @@ -1612,8 +1624,8 @@ else if (c != EOF) { // If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error. else if (c == '%' && (p.pointer >= p.input.length() - 2 || - !isAsciiHexDigit(p.input.charAt(p.pointer + 1)) || - !isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) { + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) || + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) { p.validationError("Invalid URL Unit: \"" + (char) c + "\""); } } @@ -1635,8 +1647,8 @@ public void handle(int c, UrlRecord url, UrlParser p) { // If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error. else if (c == '%' && (p.pointer >= p.input.length() - 2 || - !isAsciiHexDigit(p.input.charAt(p.pointer + 1)) || - !isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) { + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) || + !isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) { p.validationError("Invalid URL Unit: \"" + (char) c + "\""); } } @@ -2079,10 +2091,10 @@ sealed interface Host permits Domain, EmptyHost, IpAddressHost, OpaqueHost { */ static Host parse(String input, boolean isOpaque, UrlParser p) { // If input starts with U+005B ([), then: - if (!input.isEmpty() && input.charAt(0) == '[') { + if (!input.isEmpty() && input.codePointAt(0) == '[') { int last = input.length() - 1; // If input does not end with U+005D (]), IPv6-unclosed validation error, return failure. - if (input.charAt(last) != ']') { + if (input.codePointAt(last) != ']') { throw new InvalidUrlException("IPv6 address is missing the closing \"]\")."); } // Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed. @@ -2102,7 +2114,7 @@ static Host parse(String input, boolean isOpaque, UrlParser p) { String asciiDomain = domainToAscii(domain, false); for (int i=0; i < asciiDomain.length(); i++) { - char ch = asciiDomain.charAt(i); + int ch = asciiDomain.codePointAt(i); // If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure. if (isForbiddenDomain(ch)) { throw new InvalidUrlException("Invalid character \"" + ch + "\" in domain \"" + input + "\""); @@ -2245,7 +2257,7 @@ private OpaqueHost(String host) { */ public static OpaqueHost parse(String input, UrlParser p) { for (int i = 0; i < input.length(); i++) { - char ch = input.charAt(i); + int ch = input.codePointAt(i); // If input contains a forbidden host code point, host-invalid-code-point validation error, return failure. if (isForbiddenHost(ch)) { throw new InvalidUrlException("An opaque host contains a forbidden host code point."); @@ -2255,7 +2267,7 @@ public static OpaqueHost parse(String input, UrlParser p) { p.validationError("Code point \"" + ch + "\" is not a URL unit."); } //If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error. - if (p.validate() && ch == '%' && (input.length() - i < 2 || !isAsciiDigit(input.charAt(i + 1)) || !isAsciiDigit(input.charAt(i + 2)))) { + if (p.validate() && ch == '%' && (input.length() - i < 2 || !isAsciiDigit(input.codePointAt(i + 1)) || !isAsciiDigit(input.codePointAt(i + 2)))) { p.validationError("Code point \"" + ch + "\" is not a URL unit."); } } @@ -2442,8 +2454,8 @@ private static ParseIpv4NumberResult parseIpv4Number(String input) { int len = input.length(); // If input contains at least two code points and the first two code points are either "0X" or "0x", then: if (len >= 2) { - char ch0 = input.charAt(0); - char ch1 = input.charAt(1); + int ch0 = input.codePointAt(0); + int ch1 = input.codePointAt(1); if (ch0 == '0' && (ch1 == 'X' || ch1 == 'x')) { // Set validationError to true. validationError = true; @@ -2535,11 +2547,11 @@ public static Ipv6Address parse(String input) { // Let pointer be a pointer for input. int pointer = 0; int inputLength = input.length(); - int c = (inputLength > 0) ? input.charAt(0) : EOF; + int c = (inputLength > 0) ? input.codePointAt(0) : EOF; // If c is U+003A (:), then: if (c == ':') { // If remaining does not start with U+003A (:), IPv6-invalid-compression validation error, return failure. - if (inputLength > 1 && input.charAt(1) != ':') { + if (inputLength > 1 && input.codePointAt(1) != ':') { throw new InvalidUrlException("IPv6 address begins with improper compression."); } // Increase pointer by 2. @@ -2548,7 +2560,7 @@ public static Ipv6Address parse(String input) { pieceIndex++; compress = pieceIndex; } - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; // While c is not the EOF code point: while (c != EOF) { // If pieceIndex is 8, IPv6-too-many-pieces validation error, return failure. @@ -2565,7 +2577,7 @@ public static Ipv6Address parse(String input) { pointer++; pieceIndex++; compress = pieceIndex; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; continue; } // Let value and length be 0. @@ -2577,7 +2589,7 @@ public static Ipv6Address parse(String input) { value = (value * 0x10) + cHex; pointer++; length++; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; } // If c is U+002E (.), then: if (c == '.') { @@ -2593,7 +2605,7 @@ public static Ipv6Address parse(String input) { } // Let numbersSeen be 0. int numbersSeen = 0; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; // While c is not the EOF code point: while (c != EOF) { // Let ipv4Piece be null. @@ -2603,7 +2615,7 @@ public static Ipv6Address parse(String input) { // If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1. if (c =='.' && numbersSeen < 4) { pointer++; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; } // Otherwise, IPv4-in-IPv6-invalid-code-point validation error, return failure. else { @@ -2637,7 +2649,7 @@ else if (ipv4Piece == 0) { } // Increase pointer by 1. pointer++; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; } // Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece. address[pieceIndex] = address[pieceIndex] * 0x100 + (ipv4Piece != null ? ipv4Piece : 0); @@ -2647,7 +2659,7 @@ else if (ipv4Piece == 0) { if (numbersSeen == 2 || numbersSeen == 4) { pieceIndex++; } - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; } // If numbersSeen is not 4, IPv4-in-IPv6-too-few-parts validation error, return failure. if (numbersSeen != 4) { @@ -2660,7 +2672,7 @@ else if (ipv4Piece == 0) { else if (c == ':') { // Increase pointer by 1. pointer++; - c = (pointer < inputLength) ? input.charAt(pointer) : EOF; + c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF; // If c is the EOF code point, IPv6-invalid-code-point validation error, return failure. if (c == EOF) { throw new InvalidUrlException("IPv6 address unexpectedly ends.");