diff --git a/CHANGES b/CHANGES index e9265bd4e6..bc120729e0 100644 --- a/CHANGES +++ b/CHANGES @@ -69,6 +69,9 @@ Release 1.16.2 [PENDING] parent. + * Bugfix: in Jsoup.Connection when adding headers, the value may have been assumed to be an incorrectly decoded + ISO_8859_1 string, and re-encoded as UTF-8. The value is now left as-is. + * Change: removed previously deprecated methods Document#normalise, Element#forEach(org.jsoup.helper.Consumer<>), Node#forEach(org.jsoup.helper.Consumer<>), and the org.jsoup.helper.Consumer interface; the latter being a previously required compatibility shim prior to Android's de-sugaring support. diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index 4e279a988f..f422debcb7 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -412,11 +412,11 @@ interface Base> { /** * Get the value of a header. If there is more than one header value with the same name, the headers are returned - * comma seperated, per rfc2616-sec4. + * comma separated, per rfc2616-sec4. *

- * Header names are case insensitive. + * Header names are case-insensitive. *

- * @param name name of header (case insensitive) + * @param name name of header (case-insensitive) * @return value of header, or null if not set. * @see #hasHeader(String) * @see #cookie(String) @@ -425,14 +425,16 @@ interface Base> { /** * Get the values of a header. - * @param name header name, case insensitive. + * @param name header name, case-insensitive. * @return a list of values for this header, or an empty list if not set. */ List headers(String name); /** - * Set a header. This method will overwrite any existing header with the same case insensitive name. (If there + * Set a header. This method will overwrite any existing header with the same case-insensitive name. If there * is more than one value for this header, this method will update the first matching header. + *

For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1, + * then it should be encoded first per RFC 2047.

* @param name Name of header * @param value Value of header * @return this, for chaining @@ -442,6 +444,8 @@ interface Base> { /** * Add a header. The header will be added regardless of whether a header with the same name already exists. + *

For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1, + * then it should be encoded first per RFC 2047.

* @param name Name of new header * @param value Value of new header * @return this, for chaining @@ -450,22 +454,22 @@ interface Base> { /** * Check if a header is present - * @param name name of header (case insensitive) + * @param name name of header (case-insensitive) * @return if the header is present in this request/response */ boolean hasHeader(String name); /** * Check if a header is present, with the given value - * @param name header name (case insensitive) - * @param value value (case insensitive) + * @param name header name (case-insensitive) + * @param value value (case-insensitive) * @return if the header and value pair are set in this req/res */ boolean hasHeaderWithValue(String name, String value); /** * Remove headers by name. If there is more than one header with this name, they will all be removed. - * @param name name of header to remove (case insensitive) + * @param name name of header to remove (case-insensitive) * @return this, for chaining */ T removeHeader(String name); diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index d87c9f42f0..af7a18aa29 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -448,7 +448,7 @@ public String header(String name) { } @Override - public T addHeader(String name, String value) { + public T addHeader(String name, @Nullable String value) { Validate.notEmptyParam(name, "name"); //noinspection ConstantConditions value = value == null ? "" : value; @@ -458,7 +458,7 @@ public T addHeader(String name, String value) { values = new ArrayList<>(); headers.put(name, values); } - values.add(fixHeaderEncoding(value)); + values.add(value); return (T) this; } @@ -469,55 +469,6 @@ public List headers(String name) { return getHeadersCaseInsensitive(name); } - private static String fixHeaderEncoding(String val) { - byte[] bytes = val.getBytes(ISO_8859_1); - if (!looksLikeUtf8(bytes)) - return val; - return new String(bytes, UTF_8); - } - - private static boolean looksLikeUtf8(byte[] input) { - int i = 0; - // BOM: - if (input.length >= 3 - && (input[0] & 0xFF) == 0xEF - && (input[1] & 0xFF) == 0xBB - && (input[2] & 0xFF) == 0xBF) { - i = 3; - } - - int end; - for (int j = input.length; i < j; ++i) { - int o = input[i]; - if ((o & 0x80) == 0) { - continue; // ASCII - } - - // UTF-8 leading: - if ((o & 0xE0) == 0xC0) { - end = i + 1; - } else if ((o & 0xF0) == 0xE0) { - end = i + 2; - } else if ((o & 0xF8) == 0xF0) { - end = i + 3; - } else { - return false; - } - - if (end >= input.length) - return false; - - while (i < end) { - i++; - o = input[i]; - if ((o & 0xC0) != 0x80) { - return false; - } - } - } - return true; - } - @Override public T header(String name, String value) { Validate.notEmptyParam(name, "name"); @@ -1162,9 +1113,67 @@ void processResponseHeaders(Map> resHeaders) { } } for (String value : values) { - addHeader(name, value); + addHeader(name, fixHeaderEncoding(value)); + } + } + } + + /** + Servers may encode response headers in UTF-8 instead of RFC defined 8859. This method attempts to detect that + and re-decode the string as UTF-8. + * @param val a header value string that may have been incorrectly decoded as 8859. + * @return a potentially re-decoded string. + */ + @Nullable + private static String fixHeaderEncoding(@Nullable String val) { + if (val == null) return val; + byte[] bytes = val.getBytes(ISO_8859_1); + if (looksLikeUtf8(bytes)) + return new String(bytes, UTF_8); + else + return val; + } + + private static boolean looksLikeUtf8(byte[] input) { + int i = 0; + // BOM: + if (input.length >= 3 + && (input[0] & 0xFF) == 0xEF + && (input[1] & 0xFF) == 0xBB + && (input[2] & 0xFF) == 0xBF) { + i = 3; + } + + int end; + for (int j = input.length; i < j; ++i) { + int o = input[i]; + if ((o & 0x80) == 0) { + continue; // ASCII + } + + // UTF-8 leading: + if ((o & 0xE0) == 0xC0) { + end = i + 1; + } else if ((o & 0xF0) == 0xE0) { + end = i + 2; + } else if ((o & 0xF8) == 0xF0) { + end = i + 3; + } else { + return false; + } + + if (end >= input.length) + return false; + + while (i < end) { + i++; + o = input[i]; + if ((o & 0xC0) != 0x80) { + return false; + } } } + return true; } private @Nullable static String setOutputContentType(final Connection.Request req) { diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java index d77658809f..9444c2da80 100644 --- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java +++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java @@ -358,4 +358,13 @@ public void caseInsensitiveHeaders(Locale locale) { } assertTrue(threw); } + + @Test void setHeaderWithUnicodeValue() { + Connection connect = Jsoup.connect("https://example.com"); + String value = "/foo/我的"; + connect.header("Key", value); + + String actual = connect.request().header("Key"); + assertEquals(value, actual); + } }