fix: make sure the scraper uses the configured proxy & enhance scraping
The scraper now works with Nexus 2 and Nexus 3 repositories (potentially
others as well, but only those two were tested).
netmikey committed Nov 23, 2020
1 parent dbbeb9d commit 0e2e3fd
Showing 1 changed file with 31 additions and 7 deletions.
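Background on the proxy handling: java.net.ProxySelector's default selector resolves proxies from the standard JVM networking properties (http.proxyHost/http.proxyPort and their https.* counterparts), so the scraper now picks up whatever proxy the JVM was started with. A minimal, stand-alone sketch of that resolution step (the repository URL below is hypothetical, not part of the commit):

import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.ProxySelector;
import java.net.URI;
import java.util.List;

public class ProxyResolutionSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical repository URL; the tool itself uses its configured rootUrl.
        URI repositoryUri = new URI("https://nexus.example.com/repository/releases/");

        // The default selector honors -Dhttp.proxyHost / -Dhttp.proxyPort
        // (and the https.* equivalents) passed to the JVM.
        List<Proxy> proxies = ProxySelector.getDefault().select(repositoryUri);

        for (Proxy proxy : proxies) {
            if (proxy.address() instanceof InetSocketAddress) {
                InetSocketAddress address = (InetSocketAddress) proxy.address();
                System.out.println("Would route through proxy " + address.getHostName() + ":" + address.getPort());
            } else {
                // Proxy.Type.DIRECT has a null address: no proxy configured for this URI.
                System.out.println("No proxy configured: connecting directly");
            }
        }
    }
}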
38 changes: 31 additions & 7 deletions src/main/java/io/github/netmikey/mvncloner/mvncloner/Scraper.java
@@ -1,15 +1,20 @@
package io.github.netmikey.mvncloner.mvncloner;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.ProxySelector;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -19,6 +24,7 @@
import org.springframework.stereotype.Component;

import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
@@ -59,6 +65,16 @@ public void mirror() throws Exception {
        try (final WebClient webClient = new WebClient()) {
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            // Set proxy
            Optional<Proxy> proxy = ProxySelector.getDefault().select(new URI(rootUrl)).stream().findFirst();
            proxy.ifPresent(theProxy -> {
                InetSocketAddress proxyAddress = (InetSocketAddress) theProxy.address();
                if (proxyAddress != null) {
                    webClient.getOptions()
                        .setProxyConfig(new ProxyConfig(proxyAddress.getHostName(), proxyAddress.getPort()));
                }
            });
            // Set credentials
            Utils.setCredentials(webClient, username, password);

            LOG.info("Mirroring from " + rootUrl + " ...");
@@ -79,24 +95,32 @@ private void processIndexUrl(WebClient webClient, String pageUrl, Path mirrorPath

        List<String> recurseUrls = new ArrayList<>();

        String pageHost = new URL(pageUrl).getHost();

        List<HtmlAnchor> links = page.getAnchors();
        for (HtmlAnchor link : links) {
            String fullyQualifiedUrl = page.getFullyQualifiedUrl(link.getHrefAttribute()).toString();
            LOG.trace(" Found link: " + fullyQualifiedUrl);
            // Only consider links to artifacts or subdirectories
            if (fullyQualifiedUrl.startsWith(pageUrl)) {
            // Avoid crawling out into the open...
            if (new URL(fullyQualifiedUrl).getHost().equals(pageHost)) {
                Matcher filePatternMatcher = FILE_URL_PATTERN.matcher(fullyQualifiedUrl);
                if (filePatternMatcher.matches()) {
                    // Looks like a link to a file
                    handleFileLink(webClient, mirrorPath, filePatternMatcher);
                } else {
                    // Looks like a link to another subdirectory: recurse
                    LOG.trace(" Mark for recursion.");
                    recurseUrls.add(fullyQualifiedUrl);
                    // Only consider links to artifacts or subdirectories
                    if (fullyQualifiedUrl.startsWith(pageUrl)) {
                        // Looks like a link to another subdirectory: recurse
                        LOG.trace(" Mark for recursion.");
                        recurseUrls.add(fullyQualifiedUrl);
                    } else {
                        // Looks like a link back or to some completely other
                        // page: ignore it.
                        LOG.trace(" Ignoring this link: destination outside of scope.");
                    }
                }
            } else {
                // Looks like a link back or to some completely other page:
                // ignore it.
                // Looks like a link to some completely other page: ignore it.
                LOG.trace(" Ignoring this link: destination outside of scope.");
            }
        }
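For illustration: the reworked scoping above follows a link only when it stays on the index page's host, and recurses only when the link is additionally prefixed by the current page URL. A small stand-alone sketch of that two-step check, using hypothetical URLs:

import java.net.URL;

public class LinkScopeSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical index page of a hosted repository.
        String pageUrl = "https://nexus.example.com/repository/releases/com/acme/";
        String pageHost = new URL(pageUrl).getHost();

        String[] candidates = {
            "https://nexus.example.com/repository/releases/com/acme/app/1.0/", // subdirectory below the page: recurse
            "https://nexus.example.com/repository/releases/com/",              // link back up the tree: ignore
            "https://www.example.org/docs"                                      // different host: never crawled
        };

        for (String candidate : candidates) {
            boolean sameHost = new URL(candidate).getHost().equals(pageHost);
            boolean inScope = sameHost && candidate.startsWith(pageUrl);
            System.out.println(candidate + " -> " + (inScope ? "follow" : "ignore"));
        }
    }
}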
