Skip to content

Commit

Permalink
Only attempt to correct 8559 -> UTF on response headers
Browse files Browse the repository at this point in the history
Fixes #2011

This allows header values to be sent as UTF-8 (as encoded by Java HTTP request writer). But that's out of spec and depending on how the upstream server handles that, may or may not work as the user intended. Hence added a note on the docs to suggest encoding the data first.
  • Loading branch information
jhy committed Oct 20, 2023
1 parent eff1521 commit 9de27fa
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 61 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ Release 1.16.2 [PENDING]
parent.
<https://github.com/jhy/jsoup/issues/2013>

* Bugfix: in Jsoup.Connection when adding headers, the value may have been assumed to be an incorrectly decoded
ISO_8859_1 string, and re-encoded as UTF-8. The value is now left as-is.

* Change: removed previously deprecated methods Document#normalise, Element#forEach(org.jsoup.helper.Consumer<>),
Node#forEach(org.jsoup.helper.Consumer<>), and the org.jsoup.helper.Consumer interface; the latter being a
previously required compatibility shim prior to Android's de-sugaring support.
Expand Down
22 changes: 13 additions & 9 deletions src/main/java/org/jsoup/Connection.java
Original file line number Diff line number Diff line change
Expand Up @@ -412,11 +412,11 @@ interface Base<T extends Base<T>> {

/**
* Get the value of a header. If there is more than one header value with the same name, the headers are returned
* comma seperated, per <a href="https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2">rfc2616-sec4</a>.
* comma separated, per <a href="https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2">rfc2616-sec4</a>.
* <p>
* Header names are case insensitive.
* Header names are case-insensitive.
* </p>
* @param name name of header (case insensitive)
* @param name name of header (case-insensitive)
* @return value of header, or null if not set.
* @see #hasHeader(String)
* @see #cookie(String)
Expand All @@ -425,14 +425,16 @@ interface Base<T extends Base<T>> {

/**
* Get the values of a header.
* @param name header name, case insensitive.
* @param name header name, case-insensitive.
* @return a list of values for this header, or an empty list if not set.
*/
List<String> headers(String name);

/**
* Set a header. This method will overwrite any existing header with the same case insensitive name. (If there
* Set a header. This method will overwrite any existing header with the same case-insensitive name. If there
* is more than one value for this header, this method will update the first matching header.
* <p>For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1,
* then it should be encoded first per <a href="https://www.ietf.org/rfc/rfc2047.txt">RFC 2047</a>.</p>
* @param name Name of header
* @param value Value of header
* @return this, for chaining
Expand All @@ -442,6 +444,8 @@ interface Base<T extends Base<T>> {

/**
* Add a header. The header will be added regardless of whether a header with the same name already exists.
* <p>For compatibility, if the content of the header includes text that cannot be represented by ISO-8859-1,
* then it should be encoded first per <a href="https://www.ietf.org/rfc/rfc2047.txt">RFC 2047</a>.</p>
* @param name Name of new header
* @param value Value of new header
* @return this, for chaining
Expand All @@ -450,22 +454,22 @@ interface Base<T extends Base<T>> {

/**
* Check if a header is present
* @param name name of header (case insensitive)
* @param name name of header (case-insensitive)
* @return if the header is present in this request/response
*/
boolean hasHeader(String name);

/**
* Check if a header is present, with the given value
* @param name header name (case insensitive)
* @param value value (case insensitive)
* @param name header name (case-insensitive)
* @param value value (case-insensitive)
* @return if the header and value pair are set in this req/res
*/
boolean hasHeaderWithValue(String name, String value);

/**
* Remove headers by name. If there is more than one header with this name, they will all be removed.
* @param name name of header to remove (case insensitive)
* @param name name of header to remove (case-insensitive)
* @return this, for chaining
*/
T removeHeader(String name);
Expand Down
113 changes: 61 additions & 52 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ public String header(String name) {
}

@Override
public T addHeader(String name, String value) {
public T addHeader(String name, @Nullable String value) {
Validate.notEmptyParam(name, "name");
//noinspection ConstantConditions
value = value == null ? "" : value;
Expand All @@ -458,7 +458,7 @@ public T addHeader(String name, String value) {
values = new ArrayList<>();
headers.put(name, values);
}
values.add(fixHeaderEncoding(value));
values.add(value);

return (T) this;
}
Expand All @@ -469,55 +469,6 @@ public List<String> headers(String name) {
return getHeadersCaseInsensitive(name);
}

private static String fixHeaderEncoding(String val) {
byte[] bytes = val.getBytes(ISO_8859_1);
if (!looksLikeUtf8(bytes))
return val;
return new String(bytes, UTF_8);
}

private static boolean looksLikeUtf8(byte[] input) {
int i = 0;
// BOM:
if (input.length >= 3
&& (input[0] & 0xFF) == 0xEF
&& (input[1] & 0xFF) == 0xBB
&& (input[2] & 0xFF) == 0xBF) {
i = 3;
}

int end;
for (int j = input.length; i < j; ++i) {
int o = input[i];
if ((o & 0x80) == 0) {
continue; // ASCII
}

// UTF-8 leading:
if ((o & 0xE0) == 0xC0) {
end = i + 1;
} else if ((o & 0xF0) == 0xE0) {
end = i + 2;
} else if ((o & 0xF8) == 0xF0) {
end = i + 3;
} else {
return false;
}

if (end >= input.length)
return false;

while (i < end) {
i++;
o = input[i];
if ((o & 0xC0) != 0x80) {
return false;
}
}
}
return true;
}

@Override
public T header(String name, String value) {
Validate.notEmptyParam(name, "name");
Expand Down Expand Up @@ -1162,9 +1113,67 @@ void processResponseHeaders(Map<String, List<String>> resHeaders) {
}
}
for (String value : values) {
addHeader(name, value);
addHeader(name, fixHeaderEncoding(value));
}
}
}

/**
Servers may encode response headers in UTF-8 instead of RFC defined 8859. This method attempts to detect that
and re-decode the string as UTF-8.
* @param val a header value string that may have been incorrectly decoded as 8859.
* @return a potentially re-decoded string.
*/
@Nullable
private static String fixHeaderEncoding(@Nullable String val) {
if (val == null) return val;
byte[] bytes = val.getBytes(ISO_8859_1);
if (looksLikeUtf8(bytes))
return new String(bytes, UTF_8);
else
return val;
}

private static boolean looksLikeUtf8(byte[] input) {
int i = 0;
// BOM:
if (input.length >= 3
&& (input[0] & 0xFF) == 0xEF
&& (input[1] & 0xFF) == 0xBB
&& (input[2] & 0xFF) == 0xBF) {
i = 3;
}

int end;
for (int j = input.length; i < j; ++i) {
int o = input[i];
if ((o & 0x80) == 0) {
continue; // ASCII
}

// UTF-8 leading:
if ((o & 0xE0) == 0xC0) {
end = i + 1;
} else if ((o & 0xF0) == 0xE0) {
end = i + 2;
} else if ((o & 0xF8) == 0xF0) {
end = i + 3;
} else {
return false;
}

if (end >= input.length)
return false;

while (i < end) {
i++;
o = input[i];
if ((o & 0xC0) != 0x80) {
return false;
}
}
}
return true;
}

private @Nullable static String setOutputContentType(final Connection.Request req) {
Expand Down
9 changes: 9 additions & 0 deletions src/test/java/org/jsoup/helper/HttpConnectionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -358,4 +358,13 @@ public void caseInsensitiveHeaders(Locale locale) {
}
assertTrue(threw);
}

@Test void setHeaderWithUnicodeValue() {
Connection connect = Jsoup.connect("https://example.com");
String value = "/foo/我的";
connect.header("Key", value);

String actual = connect.request().header("Key");
assertEquals(value, actual);
}
}

0 comments on commit 9de27fa

Please sign in to comment.