From 0540a182934c5094b55763557ff8e21c5058ae76 Mon Sep 17 00:00:00 2001 From: imrabti Date: Mon, 15 Jul 2013 19:19:26 +0000 Subject: [PATCH 1/4] Update HTMLUnit to latest version, change Browser to FIREFOX_17. Some enhancement to CrawlServiceServlet, wait until the JavaScript engine finishes processing. --- .../server/CrawlServiceServlet.java | 61 ++++++++++++++++--- .../server/guice/CrawlServiceModule.java | 2 +- pom.xml | 2 +- 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index 7f3a4faded..21e4e23913 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -23,6 +23,8 @@ import java.util.Date; import java.util.List; import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; import javax.inject.Provider; import javax.inject.Singleton; @@ -30,7 +32,10 @@ import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; +import com.gargoylesoftware.htmlunit.SilentCssErrorHandler; import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.google.inject.Inject; import com.googlecode.objectify.Key; @@ -44,6 +49,15 @@ @Singleton public class CrawlServiceServlet extends HttpServlet { + private class SyncAllAjaxController extends NicelyResynchronizingAjaxController { + private static final long serialVersionUID = 1L; + + @Override + public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) { + return true; + } + } + private static final String CHAR_ENCODING = "UTF-8"; private static final long serialVersionUID = -6129110224710383122L; @@ -51,11 +65,15 @@ public class CrawlServiceServlet extends HttpServlet { @Inject(optional = true) @HtmlUnitTimeoutMillis private long timeoutMillis = 12000; + private long jsTimeoutMillis = 1000; + private long pageWaitMillis = 200; + private int maxLoopChecks = 2; @Inject(optional = true) @CachedPageTimeoutSec private long cachedPageTimeoutSec = 15 * 60; + private final Logger log; private final Provider webClientProvider; private final String key; @@ -64,16 +82,17 @@ public class CrawlServiceServlet extends HttpServlet { @Inject CrawlServiceServlet(final Provider webClientProvider, + final Logger log, @ServiceKey String key, CachedPageDao cachedPageDao) { this.webClientProvider = webClientProvider; + this.log = log; this.key = key; this.cachedPageDao = cachedPageDao; } @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) { - PrintWriter out = null; try { resp.setCharacterEncoding(CHAR_ENCODING); @@ -175,19 +194,42 @@ private CachedPage createPlaceholderPage(String url, Date currDate) { * @throws IOException * @throws MalformedURLException */ - private StringBuilder renderPage(String url) throws IOException, - MalformedURLException { + private StringBuilder renderPage(String url) throws IOException { WebClient webClient = webClientProvider.get(); - webClient.setCssEnabled(false); - webClient.setJavaScriptTimeout(0); - webClient.setJavaScriptTimeout(0); - webClient.setThrowExceptionOnScriptError(false); - webClient.setThrowExceptionOnFailingStatusCode(false); - webClient.setJavaScriptEnabled(true); + webClient.getCache().clear(); + webClient.getOptions().setCssEnabled(false); + webClient.getOptions().setJavaScriptEnabled(true); + webClient.getOptions().setThrowExceptionOnScriptError(false); + webClient.getOptions().setRedirectEnabled(false); + webClient.setAjaxController(new SyncAllAjaxController()); + webClient.setCssErrorHandler(new SilentCssErrorHandler()); + HtmlPage page = webClient.getPage(url); webClient.getJavaScriptEngine().pumpEventLoop(timeoutMillis); + int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis); + int loopCount = 0; + + while (waitForBackgroundJavaScript > 0 && loopCount < maxLoopChecks) { + ++loopCount; + waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis); + + if (waitForBackgroundJavaScript == 0) { + log.fine("HtmlUnit exits background javascript at loop counter " + loopCount); + break; + } + + synchronized (page) { + log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount); + try { + page.wait(pageWaitMillis); + } catch (InterruptedException e) { + log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e); + } + } + } + StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("
\n"); stringBuilder.append("

You are viewing a non-interactive page that is intended for the crawler. "); @@ -197,6 +239,7 @@ private StringBuilder renderPage(String url) throws IOException, stringBuilder.append(page.asXml()); webClient.closeAllWindows(); + return stringBuilder; } diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java index e4428a38a1..4f9778787a 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java @@ -37,6 +37,6 @@ public void configureServlets() { @Singleton @Provides WebClient getWebClient() { - return new WebClient(BrowserVersion.FIREFOX_3_6); + return new WebClient(BrowserVersion.FIREFOX_17); } } diff --git a/pom.xml b/pom.xml index 19065e90cb..c9d8cacc16 100644 --- a/pom.xml +++ b/pom.xml @@ -255,7 +255,7 @@ 1.1.2 4.11 1.9.5 - 2.9 + 2.12 2.32.0 1.1.3 4.2.3 From 4c429ebf7d6c59be9170dd38bca9fb3ccc62c39b Mon Sep 17 00:00:00 2001 From: imrabti Date: Tue, 16 Jul 2013 16:00:55 +0000 Subject: [PATCH 2/4] CR Fixes. --- .../LazyActionHandlerValidatorRegistryImpl.java | 11 ++++++----- .../server/CrawlServiceServlet.java | 15 ++++++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/gwtp-core/gwtp-dispatch-server-guice/src/main/java/com/gwtplatform/dispatch/server/guice/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java b/gwtp-core/gwtp-dispatch-server-guice/src/main/java/com/gwtplatform/dispatch/server/guice/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java index 6119e30874..5d38d8539c 100644 --- a/gwtp-core/gwtp-dispatch-server-guice/src/main/java/com/gwtplatform/dispatch/server/guice/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java +++ b/gwtp-core/gwtp-dispatch-server-guice/src/main/java/com/gwtplatform/dispatch/server/guice/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java @@ -51,9 +51,10 @@ public class LazyActionHandlerValidatorRegistryImpl implements @Inject LazyActionHandlerValidatorRegistryImpl(Injector injector) { this.injector = injector; - actionHandlerValidatorClasses = new ConcurrentHashMap>, ActionHandlerValidatorClass, ? extends Result>>(); - actionHandlerValidatorInstances = new ConcurrentHashMap>, ActionHandlerValidatorInstance>(); + actionHandlerValidatorClasses = new ConcurrentHashMap>, + ActionHandlerValidatorClass, ? extends Result>>(); + actionHandlerValidatorInstances = new ConcurrentHashMap>, + ActionHandlerValidatorInstance>(); validators = new ConcurrentHashMap, ActionValidator>(); } @@ -105,8 +106,8 @@ public , R extends Result> void removeActionHandlerValidator Class actionClass, ActionHandlerValidatorClass actionHandlerValidatorClass) { - ActionHandlerValidatorClass oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get - (actionClass); + ActionHandlerValidatorClass oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get( + actionClass); if (oldActionHandlerValidatorClass == actionHandlerValidatorClass) { actionHandlerValidatorClasses.remove(actionClass); diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index 21e4e23913..9022e9769b 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -81,10 +81,10 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) private final CachedPageDao cachedPageDao; @Inject - CrawlServiceServlet(final Provider webClientProvider, - final Logger log, - @ServiceKey String key, - CachedPageDao cachedPageDao) { + CrawlServiceServlet(Provider webClientProvider, + Logger log, + CachedPageDao cachedPageDao, + @ServiceKey String key) { this.webClientProvider = webClientProvider; this.log = log; this.key = key; @@ -147,9 +147,7 @@ private void storeFetchedPage(CachedPage cachedPage, * @param out The {@link PrintWriter} to write to, if needed. * @return {@code true} if the page needs to be fetched, {@code false} otherwise. */ - private boolean needToFetchPage(CachedPage matchingPage, - Date currDate, PrintWriter out) { - + private boolean needToFetchPage(CachedPage matchingPage, Date currDate, PrintWriter out) { if (matchingPage == null) { return true; } @@ -252,8 +250,7 @@ private StringBuilder renderPage(String url) throws IOException { * @param currDate The current date, to check for expiration. * @return The non-expired matching page if found, {@code null} otherwise. */ - private CachedPage extractMatchingPage(Map, CachedPage> deprecatedPages, - Date currDate) { + private CachedPage extractMatchingPage(Map, CachedPage> deprecatedPages, Date currDate) { CachedPage matchingPage = findMostRecentPage(deprecatedPages); // Keep the matching page only if it has not expired From 93137d7a487d5e1b165cc728f84965b7f7926853 Mon Sep 17 00:00:00 2001 From: imrabti Date: Tue, 16 Jul 2013 16:36:58 +0000 Subject: [PATCH 3/4] Fix CheckStyle. --- .../LazyActionHandlerValidatorRegistryImpl.java | 11 ++++++----- .../crawlerservice/server/CrawlServiceServlet.java | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gwtp-core/gwtp-dispatch-server-spring/src/main/java/com/gwtplatform/dispatch/server/spring/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java b/gwtp-core/gwtp-dispatch-server-spring/src/main/java/com/gwtplatform/dispatch/server/spring/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java index f700eb2fad..789e04e39d 100644 --- a/gwtp-core/gwtp-dispatch-server-spring/src/main/java/com/gwtplatform/dispatch/server/spring/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java +++ b/gwtp-core/gwtp-dispatch-server-spring/src/main/java/com/gwtplatform/dispatch/server/spring/actionhandlervalidator/LazyActionHandlerValidatorRegistryImpl.java @@ -46,9 +46,10 @@ public class LazyActionHandlerValidatorRegistryImpl implements LazyActionHandler private final Map, ActionValidator> validators; public LazyActionHandlerValidatorRegistryImpl() { - actionHandlerValidatorClasses = new ConcurrentHashMap>, ActionHandlerValidatorClass, ? extends Result>>(); - actionHandlerValidatorInstances = new ConcurrentHashMap>, ActionHandlerValidatorInstance>(); + actionHandlerValidatorClasses = new ConcurrentHashMap>, + ActionHandlerValidatorClass, ? extends Result>>(); + actionHandlerValidatorInstances = new ConcurrentHashMap>, + ActionHandlerValidatorInstance>(); validators = new ConcurrentHashMap, ActionValidator>(); } @@ -96,8 +97,8 @@ public ActionValidator findActionValidator(Class acti public , R extends Result> void removeActionHandlerValidatorClass(Class actionClass, ActionHandlerValidatorClass actionHandlerValidatorClass) { - ActionHandlerValidatorClass oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get - (actionClass); + ActionHandlerValidatorClass oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get( + actionClass); if (oldActionHandlerValidatorClass == actionHandlerValidatorClass) { actionHandlerValidatorClasses.remove(actionClass); diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index 9022e9769b..d2aa6aa2da 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.PrintWriter; -import java.net.MalformedURLException; import java.net.URLDecoder; import java.util.Date; import java.util.List; From 581eb27b6c85b4a61e442f4b9619ab1563a48747 Mon Sep 17 00:00:00 2001 From: imrabti Date: Wed, 17 Jul 2013 11:57:05 +0000 Subject: [PATCH 4/4] gwt-carstore Out Of Memory Fix. --- gwtp-carstore/pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gwtp-carstore/pom.xml b/gwtp-carstore/pom.xml index 1351139ae9..e24751f02b 100644 --- a/gwtp-carstore/pom.xml +++ b/gwtp-carstore/pom.xml @@ -86,6 +86,8 @@ com.google.appengine.tools.development.gwt.AppEngineLauncher ${gae.version} ${gae.home} + -Xss2048k -Xmx1024M -XX:MaxPermSize=512m + 2 CarStore.html