Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement on GWTP Crawler #290

Merged
merged 4 commits into from
Jul 17, 2013
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,19 @@
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.inject.Provider;
import javax.inject.Singleton;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.inject.Inject;
import com.googlecode.objectify.Key;
Expand All @@ -44,18 +49,31 @@
@Singleton
public class CrawlServiceServlet extends HttpServlet {

private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
private static final long serialVersionUID = 1L;

@Override
public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
return true;
}
}

// Character encoding set on the servlet response in doGet.
private static final String CHAR_ENCODING = "UTF-8";

private static final long serialVersionUID = -6129110224710383122L;

// Optionally injected (binding annotation @HtmlUnitTimeoutMillis); keeps the default of 12000
// when no binding exists. Passed to the JavaScript engine's pumpEventLoop in renderPage.
@Inject(optional = true)
@HtmlUnitTimeoutMillis
private long timeoutMillis = 12000;
// The three fields below carry no injection annotation here, so they keep these defaults
// unless assigned elsewhere.
// Timeout handed to WebClient.waitForBackgroundJavaScript on every check in renderPage.
private long jsTimeoutMillis = 1000;
// Duration passed to page.wait(...) between two background-JavaScript checks.
private long pageWaitMillis = 200;
// Upper bound on the number of iterations of the wait loop in renderPage.
private int maxLoopChecks = 2;

// Optionally injected (binding annotation @CachedPageTimeoutSec); default is 15 * 60.
// NOTE(review): usage is not visible in this excerpt - presumably a cache lifetime in
// seconds (i.e. 15 minutes); confirm against the code that reads it.
@Inject(optional = true)
@CachedPageTimeoutSec
private long cachedPageTimeoutSec = 15 * 60;

// Logger supplied through the constructor; used for the wait-loop messages in renderPage.
private final Logger log;
// Source of the WebClient used by renderPage (one get() call per render).
private final Provider<WebClient> webClientProvider;

// Value of the @ServiceKey-annotated constructor argument.
private final String key;
Expand All @@ -64,16 +82,17 @@ public class CrawlServiceServlet extends HttpServlet {

@Inject
CrawlServiceServlet(final Provider<WebClient> webClientProvider,
final Logger log,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the `final` modifier from this parameter?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And align this parameter with the opening parenthesis `(` of the parameter list.

@ServiceKey String key,
CachedPageDao cachedPageDao) {
this.webClientProvider = webClientProvider;
this.log = log;
this.key = key;
this.cachedPageDao = cachedPageDao;
}

@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) {

PrintWriter out = null;
try {
resp.setCharacterEncoding(CHAR_ENCODING);
Expand Down Expand Up @@ -175,19 +194,42 @@ private CachedPage createPlaceholderPage(String url, Date currDate) {
* @throws IOException
* @throws MalformedURLException
*/
private StringBuilder renderPage(String url) throws IOException,
MalformedURLException {
private StringBuilder renderPage(String url) throws IOException {
WebClient webClient = webClientProvider.get();

webClient.setCssEnabled(false);
webClient.setJavaScriptTimeout(0);
webClient.setJavaScriptTimeout(0);
webClient.setThrowExceptionOnScriptError(false);
webClient.setThrowExceptionOnFailingStatusCode(false);
webClient.setJavaScriptEnabled(true);
webClient.getCache().clear();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setRedirectEnabled(false);
webClient.setAjaxController(new SyncAllAjaxController());
webClient.setCssErrorHandler(new SilentCssErrorHandler());

HtmlPage page = webClient.getPage(url);
webClient.getJavaScriptEngine().pumpEventLoop(timeoutMillis);

int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis);
int loopCount = 0;

while (waitForBackgroundJavaScript > 0 && loopCount < maxLoopChecks) {
++loopCount;
waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis);

if (waitForBackgroundJavaScript == 0) {
log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
break;
}

synchronized (page) {
log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
try {
page.wait(pageWaitMillis);
} catch (InterruptedException e) {
log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
}
}
}

StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>You are viewing a non-interactive page that is intended for the crawler. ");
Expand All @@ -197,6 +239,7 @@ private StringBuilder renderPage(String url) throws IOException,

stringBuilder.append(page.asXml());
webClient.closeAllWindows();

return stringBuilder;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ public void configureServlets() {
/**
 * Provides the HtmlUnit {@link WebClient} used to render pages, emulating Firefox 17.
 *
 * <p>Only the post-change line of the diff is kept: the earlier
 * {@code BrowserVersion.FIREFOX_3_6} return preceded this one, which made the second
 * {@code return} unreachable. {@code FIREFOX_17} needs an HtmlUnit release that defines it;
 * the build declares 2.12.
 *
 * <p>NOTE(review): the method is annotated {@code @Singleton}, so every injection point
 * presumably receives the same client instance - confirm that sharing it is intended.
 *
 * @return a new {@code WebClient} configured with {@code BrowserVersion.FIREFOX_17}
 */
@Singleton
@Provides
WebClient getWebClient() {
    return new WebClient(BrowserVersion.FIREFOX_17);
}
}
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@
<jukito.version>1.1.2</jukito.version>
<junit.version>4.11</junit.version>
<mockito.version>1.9.5</mockito.version>
<htmlunit.version>2.9</htmlunit.version>
<htmlunit.version>2.12</htmlunit.version>
<selenium.version>2.32.0</selenium.version>
<cucumber.version>1.1.3</cucumber.version>
<httpcore.version>4.2.3</httpcore.version>
Expand Down