From 9de27fa7cd823f6d26e33e48b36ef010c05f977c Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 20 Oct 2023 11:04:31 +1100 Subject: [PATCH] Only attempt to correct 8859 -> UTF-8 on response headers Fixes #2011 This allows header values to be sent as UTF-8 (as encoded by Java HTTP request writer). But that's out of spec and, depending on how the upstream server handles that, may or may not work as the user intended. Hence added a note on the docs to suggest encoding the data first. --- CHANGES | 3 + src/main/java/org/jsoup/Connection.java | 22 ++-- .../java/org/jsoup/helper/HttpConnection.java | 113 ++++++++++-------- .../org/jsoup/helper/HttpConnectionTest.java | 9 ++ 4 files changed, 86 insertions(+), 61 deletions(-) diff --git a/CHANGES b/CHANGES index e9265bd4e6..bc120729e0 100644 --- a/CHANGES +++ b/CHANGES @@ -69,6 +69,9 @@ Release 1.16.2 [PENDING] parent. + * Bugfix: in Jsoup.Connection when adding headers, the value may have been assumed to be an incorrectly decoded + ISO_8859_1 string, and re-encoded as UTF-8. The value is now left as-is. + * Change: removed previously deprecated methods Document#normalise, Element#forEach(org.jsoup.helper.Consumer<>), Node#forEach(org.jsoup.helper.Consumer<>), and the org.jsoup.helper.Consumer interface; the latter being a previously required compatibility shim prior to Android's de-sugaring support. diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 4e279a988f..f422debcb7 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -412,11 +412,11 @@ interface Base> { /** * Get the value of a header. If there is more than one header value with the same name, the headers are returned - * comma seperated, per rfc2616-sec4. + * comma separated, per rfc2616-sec4. *

- * Header names are case insensitive. + * Header names are case-insensitive. *

- * @param name name of header (case insensitive) + * @param name name of header (case-insensitive) * @return value of header, or null if not set. * @see #hasHeader(String) * @see #cookie(String) @@ -425,14 +425,16 @@ interface Base> { /** * Get the values of a header. - * @param name header name, case insensitive. + * @param name header name, case-insensitive. * @return a list of values for this header, or an empty list if not set. */ List headers(String name); /** - * Set a header. This method will overwrite any existing header with the same case insensitive name. (If there + * Set a header. This method will overwrite any existing header with the same case-insensitive name. If there * is more than one value for this header, this method will update the first matching header. + *

For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1, + * then it should be encoded first per RFC 2047.

* @param name Name of header * @param value Value of header * @return this, for chaining @@ -442,6 +444,8 @@ interface Base> { /** * Add a header. The header will be added regardless of whether a header with the same name already exists. + *

For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1, + * then it should be encoded first per RFC 2047.

* @param name Name of new header * @param value Value of new header * @return this, for chaining @@ -450,22 +454,22 @@ interface Base> { /** * Check if a header is present - * @param name name of header (case insensitive) + * @param name name of header (case-insensitive) * @return if the header is present in this request/response */ boolean hasHeader(String name); /** * Check if a header is present, with the given value - * @param name header name (case insensitive) - * @param value value (case insensitive) + * @param name header name (case-insensitive) + * @param value value (case-insensitive) * @return if the header and value pair are set in this req/res */ boolean hasHeaderWithValue(String name, String value); /** * Remove headers by name. If there is more than one header with this name, they will all be removed. - * @param name name of header to remove (case insensitive) + * @param name name of header to remove (case-insensitive) * @return this, for chaining */ T removeHeader(String name); diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index d87c9f42f0..af7a18aa29 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -448,7 +448,7 @@ public String header(String name) { } @Override - public T addHeader(String name, String value) { + public T addHeader(String name, @Nullable String value) { Validate.notEmptyParam(name, "name"); //noinspection ConstantConditions value = value == null ? 
"" : value; @@ -458,7 +458,7 @@ public T addHeader(String name, String value) { values = new ArrayList<>(); headers.put(name, values); } - values.add(fixHeaderEncoding(value)); + values.add(value); return (T) this; } @@ -469,55 +469,6 @@ public List headers(String name) { return getHeadersCaseInsensitive(name); } - private static String fixHeaderEncoding(String val) { - byte[] bytes = val.getBytes(ISO_8859_1); - if (!looksLikeUtf8(bytes)) - return val; - return new String(bytes, UTF_8); - } - - private static boolean looksLikeUtf8(byte[] input) { - int i = 0; - // BOM: - if (input.length >= 3 - && (input[0] & 0xFF) == 0xEF - && (input[1] & 0xFF) == 0xBB - && (input[2] & 0xFF) == 0xBF) { - i = 3; - } - - int end; - for (int j = input.length; i < j; ++i) { - int o = input[i]; - if ((o & 0x80) == 0) { - continue; // ASCII - } - - // UTF-8 leading: - if ((o & 0xE0) == 0xC0) { - end = i + 1; - } else if ((o & 0xF0) == 0xE0) { - end = i + 2; - } else if ((o & 0xF8) == 0xF0) { - end = i + 3; - } else { - return false; - } - - if (end >= input.length) - return false; - - while (i < end) { - i++; - o = input[i]; - if ((o & 0xC0) != 0x80) { - return false; - } - } - } - return true; - } - @Override public T header(String name, String value) { Validate.notEmptyParam(name, "name"); @@ -1162,9 +1113,67 @@ void processResponseHeaders(Map> resHeaders) { } } for (String value : values) { - addHeader(name, value); + addHeader(name, fixHeaderEncoding(value)); + } + } + } + + /** + Servers may encode response headers in UTF-8 instead of RFC defined 8859. This method attempts to detect that + and re-decode the string as UTF-8. + * @param val a header value string that may have been incorrectly decoded as 8859. + * @return a potentially re-decoded string. 
+ */ + @Nullable + private static String fixHeaderEncoding(@Nullable String val) { + if (val == null) return val; + byte[] bytes = val.getBytes(ISO_8859_1); + if (looksLikeUtf8(bytes)) + return new String(bytes, UTF_8); + else + return val; + } + + private static boolean looksLikeUtf8(byte[] input) { + int i = 0; + // BOM: + if (input.length >= 3 + && (input[0] & 0xFF) == 0xEF + && (input[1] & 0xFF) == 0xBB + && (input[2] & 0xFF) == 0xBF) { + i = 3; + } + + int end; + for (int j = input.length; i < j; ++i) { + int o = input[i]; + if ((o & 0x80) == 0) { + continue; // ASCII + } + + // UTF-8 leading: + if ((o & 0xE0) == 0xC0) { + end = i + 1; + } else if ((o & 0xF0) == 0xE0) { + end = i + 2; + } else if ((o & 0xF8) == 0xF0) { + end = i + 3; + } else { + return false; + } + + if (end >= input.length) + return false; + + while (i < end) { + i++; + o = input[i]; + if ((o & 0xC0) != 0x80) { + return false; + } } } + return true; } private @Nullable static String setOutputContentType(final Connection.Request req) { diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java index d77658809f..9444c2da80 100644 --- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java +++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java @@ -358,4 +358,13 @@ public void caseInsensitiveHeaders(Locale locale) { } assertTrue(threw); } + + @Test void setHeaderWithUnicodeValue() { + Connection connect = Jsoup.connect("https://example.com"); + String value = "/foo/我的"; + connect.header("Key", value); + + String actual = connect.request().header("Key"); + assertEquals(value, actual); + } }