From 8bca7cd8e7e2bb2cc86ce0d687426ec2f3aeaf7d Mon Sep 17 00:00:00 2001
From: Arjen Poutsma <arjen.poutsma@broadcom.com>
Date: Fri, 3 May 2024 13:34:03 +0200
Subject: [PATCH] Various UrlParser improvements

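- Consistently use codePointAt instead of charAt.
- Fix bug in domainToAscii: with beStrict=false, a non-ASCII domain was
  lowercased and returned without running Unicode ToASCII.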

See gh-32513
---
 .../springframework/web/util/UrlParser.java   | 176 ++++++++++--------
 1 file changed, 94 insertions(+), 82 deletions(-)
diff --git a/spring-web/src/main/java/org/springframework/web/util/UrlParser.java b/spring-web/src/main/java/org/springframework/web/util/UrlParser.java
index 713aff6ec6d4..599d348d6aef 100644
--- a/spring-web/src/main/java/org/springframework/web/util/UrlParser.java
+++ b/spring-web/src/main/java/org/springframework/web/util/UrlParser.java
@@ -179,7 +179,7 @@ private UrlRecord basicUrlParser(@Nullable UrlRecord url, @Nullable State stateO
 	void sanitizeInput(boolean removeC0ControlOrSpace) {
 		boolean strip = true;
 		for (int i = 0; i < this.input.length(); i++) {
-			char c = this.input.charAt(i);
+			int c = this.input.codePointAt(i);
 			boolean isSpaceOrC0 = c == ' ' || isC0Control(c);
 			boolean isTabOrNL = c == '\t' || isNewline(c);
 			if ((strip && isSpaceOrC0) || isTabOrNL) {
@@ -204,7 +204,7 @@ else if (isTabOrNL) {
 		}
 		if (removeC0ControlOrSpace) {
 			for (int i = this.input.length() - 1; i >= 0; i--) {
-				char c = this.input.charAt(i);
+				int c = this.input.codePointAt(i);
 				if (c == ' ' || isC0Control(c)) {
 					if (validate()) {
 						// If input contains any (leading or) trailing C0 control or space, invalid-URL-unit validation error.
@@ -224,7 +224,7 @@ private void setState(State newState) {
 		if (logger.isTraceEnabled()) {
 			String c;
 			if (this.pointer < this.input.length()) {
-				c = Character.toString(this.input.charAt(this.pointer));
+				c = Character.toString(this.input.codePointAt(this.pointer));
 			}
 			else {
 				c = "EOF";
@@ -265,16 +265,16 @@ private static LinkedList<String> strictSplit(String input, int delimiter) {
 
 	private static String domainToAscii(String domain, boolean beStrict) {
 		// If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.) does not produce any item that starts with an ASCII case-insensitive match for "xn--", this step is equivalent to ASCII lowercasing domain.
-		boolean onlyLowerCase = !beStrict;
 		if (!beStrict && containsOnlyAscii(domain)) {
 			int dotIdx = domain.indexOf('.');
+			boolean onlyLowerCase = true;
 			while (dotIdx != -1) {
 				if (domain.length() - dotIdx > 4) {
 					// ASCII case-insensitive match for "xn--"
-					char ch0 = domain.charAt(dotIdx + 1);
-					char ch1 = domain.charAt(dotIdx + 2);
-					char ch2 = domain.charAt(dotIdx + 3);
-					char ch3 = domain.charAt(dotIdx + 4);
+					int ch0 = domain.codePointAt(dotIdx + 1);
+					int ch1 = domain.codePointAt(dotIdx + 2);
+					int ch2 = domain.codePointAt(dotIdx + 3);
+					int ch3 = domain.codePointAt(dotIdx + 4);
 					if ((ch0 == 'x' || ch0 == 'X') &&
 							(ch1 == 'n' || ch1 == 'N') &&
-							ch2 == '-' && ch3 == '_') {
+							ch2 == '-' && ch3 == '-') {
@@ -284,9 +284,9 @@ private static String domainToAscii(String domain, boolean beStrict) {
 				}
 				dotIdx = domain.indexOf('.', dotIdx + 1);
 			}
-		}
-		if (onlyLowerCase) {
-			return domain.toLowerCase(Locale.ENGLISH);
+			if (onlyLowerCase) {
+				return domain.toLowerCase(Locale.ENGLISH);
+			}
 		}
 		// Let result be the result of running Unicode ToASCII (https://www.unicode.org/reports/tr46/#ToASCII) with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
 		int flag = 0;
@@ -392,7 +392,7 @@ private static boolean isAsciiAlpha(int ch) {
 
 	private static boolean containsOnlyAsciiDigits(CharSequence string) {
 		for (int i=0; i< string.length(); i++ ) {
-			char ch = string.charAt(i);
+			int ch = codePointAt(string, i);
 			if (!isAsciiDigit(ch)) {
 				return false;
 			}
@@ -400,9 +400,9 @@ private static boolean containsOnlyAsciiDigits(CharSequence string) {
 		return true;
 	}
 
-	private static boolean containsOnlyAscii(CharSequence string) {
-		for (int i=0; i< string.length(); i++ ) {
-			char ch = string.charAt(i);
+	private static boolean containsOnlyAscii(String string) {
+		for (int i = 0; i < string.length(); i++) {
+			int ch = string.codePointAt(i);
 			if (!isAsciiCodePoint(ch)) {
 				return false;
 			}
@@ -505,7 +505,7 @@ private void emptyBuffer() {
 	private int remaining(int deltaPos) {
 		int pos = this.pointer + deltaPos + 1;
 		if (pos < this.input.length()) {
-			return this.input.charAt(pos);
+			return this.input.codePointAt(pos);
 		}
 		else {
 			return EOF;
@@ -571,27 +571,27 @@ private static boolean isSingleDotPathSegment(StringBuilder b) {
 		int len = b.length();
 		switch (len) {
 			case 1 -> {
-				char ch0 = b.charAt(0);
+				int ch0 = b.codePointAt(0);
 				return ch0 == '.';
 			}
 			case 2 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
 				return ch0 == '/' && ch1 == '.';
 			}
 			case 3 -> {
 				//  ASCII case-insensitive match for "%2e".
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
 				return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E');
 			}
 			case 4 -> {
 				//  ASCII case-insensitive match for "/%2e".
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
-				char ch3 = b.charAt(3);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
+				int ch3 = b.codePointAt(3);
 				return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E');
 			}
 			default -> {
@@ -607,55 +607,55 @@ private static boolean isDoubleDotPathSegment(StringBuilder b) {
 		int len = b.length();
 		switch (len) {
 			case 2 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
 				return ch0 == '.' && ch1 == '.';
 			}
 			case 3 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
 				return ch0 == '/' && ch1 == '.' && ch2 == '.';
 			}
 			case 4 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
-				char ch3 = b.charAt(3);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
+				int ch3 = b.codePointAt(3);
 				// case-insensitive match for ".%2e" or "%2e."
 				return (ch0 == '.' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') ||
 						(ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E') && ch3 == '.'));
 			}
 			case 5 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
-				char ch3 = b.charAt(3);
-				char ch4 = b.charAt(4);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
+				int ch3 = b.codePointAt(3);
+				int ch4 = b.codePointAt(4);
 				// case-insensitive match for "/.%2e" or "/%2e."
 				return ch0 == '/' &&
 						(ch1 == '.' && ch2 == '%' && ch3 == '2' && (ch4 == 'e' || ch4 == 'E')
 								|| (ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E') && ch4 == '.'));
 			}
 			case 6 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
-				char ch3 = b.charAt(3);
-				char ch4 = b.charAt(4);
-				char ch5 = b.charAt(5);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
+				int ch3 = b.codePointAt(3);
+				int ch4 = b.codePointAt(4);
+				int ch5 = b.codePointAt(5);
 				// case-insensitive match for "%2e%2e".
 				return ch0 == '%' && ch1 == '2' && (ch2 == 'e' || ch2 == 'E')
 						&& ch3 == '%' && ch4 == '2' && (ch5 == 'e' || ch5 == 'E');
 			}
 			case 7 -> {
-				char ch0 = b.charAt(0);
-				char ch1 = b.charAt(1);
-				char ch2 = b.charAt(2);
-				char ch3 = b.charAt(3);
-				char ch4 = b.charAt(4);
-				char ch5 = b.charAt(5);
-				char ch6 = b.charAt(6);
+				int ch0 = b.codePointAt(0);
+				int ch1 = b.codePointAt(1);
+				int ch2 = b.codePointAt(2);
+				int ch3 = b.codePointAt(3);
+				int ch4 = b.codePointAt(4);
+				int ch5 = b.codePointAt(5);
+				int ch6 = b.codePointAt(6);
 				// case-insensitive match for "/%2e%2e".
 				return ch0 == '/' && ch1 == '%' && ch2 == '2' && (ch3 == 'e' || ch3 == 'E')
 						&& ch4 == '%' && ch5 == '2' && (ch6 == 'e' || ch6 == 'E');
@@ -686,7 +686,7 @@ private static boolean isWindowsDriveLetter(CharSequence input, boolean normaliz
 	 * its first two code points are a Windows drive letter
 	 * its length is 2 or its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
 	 */
-	private static boolean startsWithWindowsDriveLetter(CharSequence input) {
+	private static boolean startsWithWindowsDriveLetter(String input) {
 		int len = input.length();
 		if (len < 2) {
 			return false;
@@ -698,18 +698,18 @@ private static boolean startsWithWindowsDriveLetter(CharSequence input) {
 			return true;
 		}
 		else {
-			char ch2 = input.charAt(2);
+			int ch2 = input.codePointAt(2);
 			return ch2 == '/' || ch2 == '\\' || ch2 == '?' || ch2 == '#';
 		}
 	}
 
 	private static boolean isWindowsDriveLetterInternal(CharSequence s, boolean normalized) {
-		char ch0 = s.charAt(0);
+		int ch0 = codePointAt(s, 0);
 		if (!isAsciiAlpha(ch0)) {
 			return false;
 		}
 		else {
-			char ch1 = s.charAt(1);
+			int ch1 = codePointAt(s, 1);
 			if (normalized) {
 				return ch1 == ':';
 			}
@@ -719,6 +719,18 @@ private static boolean isWindowsDriveLetterInternal(CharSequence s, boolean norm
 		}
 	}
 
+	/**
+	 * Return the Unicode code point at {@code index} in the given sequence.
+	 * <p>Delegates to {@link Character#codePointAt(CharSequence, int)} so that any
+	 * {@code CharSequence} is supported, not only {@code String}/{@code StringBuilder}.
+	 * @param s the sequence to read from
+	 * @param index the {@code char} index at which the code point starts
+	 * @return the code point at {@code index}
+	 */
+	private static int codePointAt(CharSequence s, int index) {
+		return Character.codePointAt(s, index);
+	}
+
 
 	private enum State {
 
@@ -1500,8 +1512,8 @@ else if (p.previousState != URL_TEMPLATE && c == '{') {
 						// If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error.
 						else if (c == '%' &&
 								(p.pointer >= p.input.length() - 2 ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 1)) ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) {
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) ||
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) {
 							p.validationError("Invalid URL Unit: \"" + (char) c + "\"");
 						}
 					}
@@ -1549,8 +1561,8 @@ else if (p.previousState != URL_TEMPLATE && c == '{') {
 						// If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error.
 						else if (c == '%' &&
 								(p.pointer >= p.input.length() - 2 ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 1)) ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) {
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) ||
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) {
 							p.validationError("Invalid URL Unit: \"" + (char) c + "\"");
 						}
 					}
@@ -1612,8 +1624,8 @@ else if (c != EOF) {
 						// If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error.
 						else if (c == '%' &&
 								(p.pointer >= p.input.length() - 2 ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 1)) ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) {
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) ||
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) {
 							p.validationError("Invalid URL Unit: \"" + (char) c + "\"");
 						}
 					}
@@ -1635,8 +1647,8 @@ public void handle(int c, UrlRecord url, UrlParser p) {
 						// If c is U+0025 (%) and remaining does not start with two ASCII hex digits, invalid-URL-unit validation error.
 						else if (c == '%' &&
 								(p.pointer >= p.input.length() - 2 ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 1)) ||
-										!isAsciiHexDigit(p.input.charAt(p.pointer + 2)))) {
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 1)) ||
+										!isAsciiHexDigit(p.input.codePointAt(p.pointer + 2)))) {
 							p.validationError("Invalid URL Unit: \"" + (char) c + "\"");
 						}
 					}
@@ -2079,10 +2091,10 @@ sealed interface Host permits Domain, EmptyHost, IpAddressHost, OpaqueHost {
 		 */
 		static Host parse(String input, boolean isOpaque, UrlParser p) {
 			// If input starts with U+005B ([), then:
-			if (!input.isEmpty() && input.charAt(0) == '[') {
+			if (!input.isEmpty() && input.codePointAt(0) == '[') {
 				int last = input.length() - 1;
 				// If input does not end with U+005D (]), IPv6-unclosed validation error, return failure.
-				if (input.charAt(last) != ']') {
+				if (input.codePointAt(last) != ']') {
 					throw new InvalidUrlException("IPv6 address is missing the closing \"]\").");
 				}
 				// Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
@@ -2102,7 +2114,7 @@ static Host parse(String input, boolean isOpaque, UrlParser p) {
 			String asciiDomain = domainToAscii(domain, false);
 
 			for (int i=0; i < asciiDomain.length(); i++) {
-				char ch = asciiDomain.charAt(i);
+				int ch = asciiDomain.codePointAt(i);
 				// If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure.
 				if (isForbiddenDomain(ch)) {
-					throw new InvalidUrlException("Invalid character \"" + ch + "\" in domain \"" + input + "\"");
+					throw new InvalidUrlException("Invalid character \"" + Character.toString(ch) + "\" in domain \"" + input + "\"");
@@ -2245,7 +2257,7 @@ private OpaqueHost(String host) {
 		 */
 		public static OpaqueHost parse(String input, UrlParser p) {
 			for (int i = 0; i < input.length(); i++) {
-				char ch = input.charAt(i);
+				int ch = input.codePointAt(i);
 				// If input contains a forbidden host code point, host-invalid-code-point validation error, return failure.
 				if (isForbiddenHost(ch)) {
 					throw new InvalidUrlException("An opaque host contains a forbidden host code point.");
@@ -2255,7 +2267,7 @@ public static OpaqueHost parse(String input, UrlParser p) {
-					p.validationError("Code point \"" + ch + "\" is not a URL unit.");
+					p.validationError("Code point \"" + Character.toString(ch) + "\" is not a URL unit.");
 				}
 				//If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error.
-				if (p.validate() && ch == '%' && (input.length() - i < 2 || !isAsciiDigit(input.charAt(i + 1)) || !isAsciiDigit(input.charAt(i + 2)))) {
+				if (p.validate() && ch == '%' && (input.length() - i < 3 || !isAsciiHexDigit(input.codePointAt(i + 1)) || !isAsciiHexDigit(input.codePointAt(i + 2)))) {
-					p.validationError("Code point \"" + ch + "\" is not a URL unit.");
+					p.validationError("Code point \"" + Character.toString(ch) + "\" is not a URL unit.");
 				}
 			}
@@ -2442,8 +2454,8 @@ private static ParseIpv4NumberResult parseIpv4Number(String input) {
 			int len = input.length();
 			// If input contains at least two code points and the first two code points are either "0X" or "0x", then:
 			if (len >= 2) {
-				char ch0 = input.charAt(0);
-				char ch1 = input.charAt(1);
+				int ch0 = input.codePointAt(0);
+				int ch1 = input.codePointAt(1);
 				if (ch0 == '0' && (ch1 == 'X' || ch1 == 'x')) {
 					// Set validationError to true.
 					validationError = true;
@@ -2535,11 +2547,11 @@ public static Ipv6Address parse(String input) {
 			// Let pointer be a pointer for input.
 			int pointer = 0;
 			int inputLength = input.length();
-			int c = (inputLength > 0) ? input.charAt(0) : EOF;
+			int c = (inputLength > 0) ? input.codePointAt(0) : EOF;
 			// If c is U+003A (:), then:
 			if (c == ':') {
 				// If remaining does not start with U+003A (:), IPv6-invalid-compression validation error, return failure.
-				if (inputLength > 1 && input.charAt(1) != ':') {
+				if (inputLength < 2 || input.codePointAt(1) != ':') {
 					throw new InvalidUrlException("IPv6 address begins with improper compression.");
 				}
 				// Increase pointer by 2.
@@ -2548,7 +2560,7 @@ public static Ipv6Address parse(String input) {
 				pieceIndex++;
 				compress = pieceIndex;
 			}
-			c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+			c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 			// While c is not the EOF code point:
 			while (c != EOF) {
 				// If pieceIndex is 8, IPv6-too-many-pieces validation error, return failure.
@@ -2565,7 +2577,7 @@ public static Ipv6Address parse(String input) {
 					pointer++;
 					pieceIndex++;
 					compress = pieceIndex;
-					c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+					c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 					continue;
 				}
 				// Let value and length be 0.
@@ -2577,7 +2589,7 @@ public static Ipv6Address parse(String input) {
 					value = (value * 0x10) + cHex;
 					pointer++;
 					length++;
-					c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+					c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 				}
 				// If c is U+002E (.), then:
 				if (c == '.') {
@@ -2593,7 +2605,7 @@ public static Ipv6Address parse(String input) {
 					}
 					// Let numbersSeen be 0.
 					int numbersSeen = 0;
-					c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+					c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 					// While c is not the EOF code point:
 					while (c != EOF) {
 						// Let ipv4Piece be null.
@@ -2603,7 +2615,7 @@ public static Ipv6Address parse(String input) {
 							// If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1.
 							if (c =='.' && numbersSeen < 4) {
 								pointer++;
-								c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+								c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 							}
 							// Otherwise, IPv4-in-IPv6-invalid-code-point validation error, return failure.
 							else {
@@ -2637,7 +2649,7 @@ else if (ipv4Piece == 0) {
 							}
 							// Increase pointer by 1.
 							pointer++;
-							c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+							c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 						}
 						// Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece.
 						address[pieceIndex] = address[pieceIndex] * 0x100 + (ipv4Piece != null ? ipv4Piece : 0);
@@ -2647,7 +2659,7 @@ else if (ipv4Piece == 0) {
 						if (numbersSeen == 2 || numbersSeen == 4) {
 							pieceIndex++;
 						}
-						c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+						c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 					}
 					// If numbersSeen is not 4, IPv4-in-IPv6-too-few-parts validation error, return failure.
 					if (numbersSeen != 4) {
@@ -2660,7 +2672,7 @@ else if (ipv4Piece == 0) {
 				else if (c == ':') {
 					// Increase pointer by 1.
 					pointer++;
-					c = (pointer < inputLength) ? input.charAt(pointer) : EOF;
+					c = (pointer < inputLength) ? input.codePointAt(pointer) : EOF;
 					// If c is the EOF code point, IPv6-invalid-code-point validation error, return failure.
 					if (c == EOF) {
 						throw new InvalidUrlException("IPv6 address unexpectedly ends.");