Skip to content

Commit

Permalink
Strip control characters from URLs when resolving absolute URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Aug 21, 2022
1 parent 985f1fe commit 4ea768d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/main/java/org/jsoup/internal/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ public static boolean isAscii(String string) {
* @throws MalformedURLException if an error occurred generating the URL
*/
public static URL resolve(URL base, String relUrl) throws MalformedURLException {
relUrl = stripControlChars(relUrl);
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
if (relUrl.startsWith("?"))
relUrl = base.getPath() + relUrl;
Expand All @@ -308,7 +309,9 @@ public static URL resolve(URL base, String relUrl) throws MalformedURLException
* @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
* @return an absolute URL if one was able to be generated, or the empty string if not
*/
public static String resolve(final String baseUrl, final String relUrl) {
public static String resolve(String baseUrl, String relUrl) {
// workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view.
baseUrl = stripControlChars(baseUrl); relUrl = stripControlChars(relUrl);
try {
URL base;
try {
Expand All @@ -327,6 +330,11 @@ public static String resolve(final String baseUrl, final String relUrl) {
}
// Matches a leading URI scheme per RFC 3986 section 3.1: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ), followed by ':'.
// The '-' is placed last in the character class so it is a literal hyphen; written as "+-." it forms a
// range from '+' to '.', which would also (incorrectly) accept ',' as a scheme character.
private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.-]*:");

// Matches runs of ASCII control characters (code points 0 - 31), to strip from a URL.
// Uses '+' rather than '*' so the pattern never matches the empty string between ordinary characters.
private static final Pattern controlChars = Pattern.compile("[\\x00-\\x1f]+");

/**
 * Removes every ASCII control character (0x00 - 0x1F) from the input. The space character (0x20) is retained.
 * @param input the string to clean; must not be null
 * @return the input with all control characters removed
 */
private static String stripControlChars(final String input) {
    return controlChars.matcher(input).replaceAll("");
}

private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = new ThreadLocal<Stack<StringBuilder>>() {
@Override
protected Stack<StringBuilder> initialValue() {
Expand Down
9 changes: 9 additions & 0 deletions src/test/java/org/jsoup/internal/StringUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,15 @@ public void join() {
assertEquals("http://example.com/b/c/g#s/../x", resolve("http://example.com/b/c/d;p?q", "g#s/../x"));
}

@Test void stripsControlCharsFromUrls() {
    // with the control characters stripped from both inputs, the relative URL should resolve to an absolute url:
    String base = "\nhttps://\texample.com/";
    String rel = "\r\nfo\to:ba\br";
    String resolved = resolve(base, rel);
    assertEquals("foo:bar", resolved);
}

@Test void allowsSpaceInUrl() {
    // a plain space in the relative path is expected to be kept in the resolved URL
    String base = "HTTPS://example.com/example/";
    String resolved = resolve(base, "../foo bar/");
    assertEquals("https://example.com/foo bar/", resolved);
}

@Test
void isAscii() {
assertTrue(StringUtil.isAscii(""));
Expand Down
18 changes: 18 additions & 0 deletions src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,24 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
assertEquals("<a rel=\"nofollow\">Link</a>", clean);
}

@Test void dropsConcealedJavascriptProtocolWhenRelativesLinksEnabled() {
    Safelist relativeFriendly = Safelist.basic().preserveRelativeLinks(true);
    String expected = "<a rel=\"nofollow\">Link</a>";

    // javascript: scheme concealed with CR, tab and LF character references
    String withControlChars = "<a href=\"&#0013;ja&Tab;va&Tab;script&#0010;:alert(1)\">Link</a>";
    assertEquals(expected, Jsoup.clean(withControlChars, "https://", relativeFriendly));

    // same scheme, with the colon written as a named character reference
    String withColonEntity = "<a href=\"ja&Tab;va&Tab;script&colon;alert(1)\">Link</a>";
    assertEquals(expected, Jsoup.clean(withColonEntity, "https://", relativeFriendly));
}

@Test void dropsConcealedJavascriptProtocolWhenRelativesLinksDisabled() {
    Safelist absoluteOnly = Safelist.basic().preserveRelativeLinks(false);
    // javascript: scheme concealed with tab and CR character references
    String concealed = "<a href=\"ja&Tab;vas&#0013;cript:alert(1)\">Link</a>";
    String cleaned = Jsoup.clean(concealed, "https://", absoluteOnly);
    // the href is expected to be dropped, leaving only the rel attribute
    assertEquals("<a rel=\"nofollow\">Link</a>", cleaned);
}

@Test public void handlesCustomProtocols() {
String html = "<img src='cid:12345' /> <img src='data:gzzt' />";
String dropped = Jsoup.clean(html, Safelist.basicWithImages());
Expand Down

0 comments on commit 4ea768d

Please sign in to comment.