From 9cc4bf2b4c06337e34678f6500862b74245750e8 Mon Sep 17 00:00:00 2001 From: BenDol Date: Tue, 6 Oct 2015 23:15:13 +1300 Subject: [PATCH 01/11] Decouple the crawler service for use in other frameworks. This is following a Spring implementation soon, though I don't know how you guys want the package breakdown for a Spring implementation since the crawler service assumes Guice. --- ....java => AbstractCrawlServiceServlet.java} | 123 ++++++++++-------- .../crawlerservice/server/CrawledPage.java | 42 ++++++ .../crawlerservice/server/ServiceKey.java | 1 + .../server/domain/CachedPage.java | 12 +- .../server/guice/CrawlServiceModule.java | 1 - .../server/guice/CrawlServiceServlet.java | 99 ++++++++++++++ 6 files changed, 224 insertions(+), 54 deletions(-) rename gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/{CrawlServiceServlet.java => AbstractCrawlServiceServlet.java} (74%) create mode 100644 gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java create mode 100644 gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java similarity index 74% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java rename to gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java index a45b95a9b7..db9ef93021 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java @@ -26,8 +26,6 @@ import java.util.logging.Logger; import java.util.regex.Pattern; -import javax.inject.Provider; -import 
javax.inject.Singleton; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; @@ -38,16 +36,12 @@ import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.google.common.base.Strings; -import com.google.inject.Inject; -import com.googlecode.objectify.Key; -import com.gwtplatform.crawlerservice.server.domain.CachedPage; -import com.gwtplatform.crawlerservice.server.service.CachedPageDao; /** - * Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns the HTML page. + * Servlet that makes it possible to fetch an external page, + * renders it using HTMLUnit and returns the HTML page. */ -@Singleton -public class CrawlServiceServlet extends HttpServlet { +public abstract class AbstractCrawlServiceServlet extends HttpServlet { private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController { private static final long serialVersionUID = 1L; @@ -58,39 +52,27 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) } } - private static final String CHAR_ENCODING = "UTF-8"; + protected static final String CHAR_ENCODING = "UTF-8"; private static final long serialVersionUID = -6129110224710383122L; - @Inject(optional = true) - @HtmlUnitTimeoutMillis - private final long timeoutMillis = 5000; - private final long jsTimeoutMillis = 2000; - private final long pageWaitMillis = 100; - private final long maxLoopChecks = 2; + protected final Logger log; + protected final String key; - @Inject(optional = true) - @CachedPageTimeoutSec - private final long cachedPageTimeoutSec = 15 * 60; + protected AbstractCrawlServiceServlet(Logger log, String key) { + this.log = log; + this.key = key; + } - private final Logger log; - private final Provider webClientProvider; + protected abstract T createCrawledPage(); - private final String key; + protected abstract T 
getCachedPage(String url); - private final CachedPageDao cachedPageDao; + protected abstract void saveCachedPage(T crawledPage); - @Inject - protected CrawlServiceServlet( - Provider webClientProvider, - Logger log, - CachedPageDao cachedPageDao, - @ServiceKey String key) { - this.webClientProvider = webClientProvider; - this.log = log; - this.key = key; - this.cachedPageDao = cachedPageDao; - } + protected abstract void deleteCachedPage(T crawledPage); + + protected abstract WebClient getWebClient(); @Override protected void doGet(HttpServletRequest request, HttpServletResponse response) { @@ -107,14 +89,14 @@ protected void doGet(HttpServletRequest request, HttpServletResponse response) { if (!Strings.isNullOrEmpty(url)) { url = URLDecoder.decode(url, CHAR_ENCODING); - CachedPage cachedPage = cachedPageDao.get(Key.create(CachedPage.class, url)); + T crawledPage = getCachedPage(url); Date currDate = new Date(); - if (needToFetchPage(cachedPage, currDate, out)) { - cachedPage = createPlaceholderPage(url, currDate); + if (needToFetchPage(crawledPage, currDate, out)) { + crawledPage = createPlaceholderPage(url, currDate); String renderedHtml = renderPage(url); - storeFetchedPage(cachedPage, renderedHtml); + storeFetchedPage(crawledPage, renderedHtml); out.println(renderedHtml); } } @@ -146,10 +128,10 @@ private void validateKey(HttpServletRequest request) } } - private void storeFetchedPage(CachedPage cachedPage, String stringBuilder) { - cachedPage.setContent(stringBuilder); - cachedPage.setFetchInProgress(false); - cachedPageDao.put(cachedPage); + private void storeFetchedPage(T crawledPage, String stringBuilder) { + crawledPage.setContent(stringBuilder); + crawledPage.setFetchInProgress(false); + saveCachedPage(crawledPage); } /** @@ -161,15 +143,15 @@ private void storeFetchedPage(CachedPage cachedPage, String stringBuilder) { * @param out The {@link PrintWriter} to write to, if needed. 
* @return {@code true} if the page needs to be fetched, {@code false} otherwise. */ - private boolean needToFetchPage(CachedPage matchingPage, Date currDate, PrintWriter out) { - if (matchingPage == null || matchingPage.isExpired(cachedPageTimeoutSec)) { + private boolean needToFetchPage(T matchingPage, Date currDate, PrintWriter out) { + if (matchingPage == null || matchingPage.isExpired(getCachedPageTimeoutSec())) { return true; } if (matchingPage.isFetchInProgress()) { // If fetch is in progress since more than 60 seconds, we consider something went wrong and fetch again. if (currDate.getTime() > matchingPage.getFetchDate().getTime() + 60000) { - cachedPageDao.delete(matchingPage); + deleteCachedPage(matchingPage); return true; } else { out.println("FETCH_IN_PROGRESS"); @@ -188,12 +170,12 @@ private boolean needToFetchPage(CachedPage matchingPage, Date currDate, PrintWri * @param currDate The current date, to mark the page. * @return The newly created placeholder page. */ - private CachedPage createPlaceholderPage(String url, Date currDate) { - CachedPage result = new CachedPage(); + private T createPlaceholderPage(String url, Date currDate) { + T result = createCrawledPage(); result.setUrl(url); result.setFetchDate(currDate); result.setFetchInProgress(true); - cachedPageDao.put(result); + saveCachedPage(result); return result; } @@ -205,7 +187,7 @@ private CachedPage createPlaceholderPage(String url, Date currDate) { * @return The rendered page, in a {@link StringBuilder}. 
*/ private String renderPage(String url) throws IOException { - WebClient webClient = webClientProvider.get(); + WebClient webClient = getWebClient(); webClient.getCache().clear(); webClient.getOptions().setCssEnabled(false); @@ -218,12 +200,13 @@ private String renderPage(String url) throws IOException { WebRequest webRequest = new WebRequest(new URL(url), "text/html"); HtmlPage page = webClient.getPage(webRequest); - webClient.getJavaScriptEngine().pumpEventLoop(timeoutMillis); + webClient.getJavaScriptEngine().pumpEventLoop(getTimeoutMillis()); + long jsTimeoutMillis = getJsTimeoutMillis(); int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis); int loopCount = 0; - while (waitForBackgroundJavaScript > 0 && loopCount < maxLoopChecks) { + while (waitForBackgroundJavaScript > 0 && loopCount < getMaxLoopChecks()) { ++loopCount; waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis); @@ -235,7 +218,7 @@ private String renderPage(String url) throws IOException { synchronized (page) { log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount); try { - page.wait(pageWaitMillis); + page.wait(getPageWaitMillis()); } catch (InterruptedException e) { log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e); } @@ -248,4 +231,40 @@ private String renderPage(String url) throws IOException { .matcher(page.asXml().replace("", "")) .replaceAll(""); } + + /** + * The HTML Unit Timeout in milliseconds. + */ + public long getTimeoutMillis() { + return 5000; + } + + /** + * The JavaScript load timeout in milliseconds. + */ + public long getJsTimeoutMillis() { + return 2000; + } + + /** + * Max page wait time in milliseconds. + */ + public long getPageWaitMillis() { + return 100; + } + + /** + * Max loop check value. + */ + public long getMaxLoopChecks() { + return 2; + } + + /** + * Cache timeout period before {@link CrawledPage}'s are invalidated. 
+ * @return timeout period in seconds. + */ + public long getCachedPageTimeoutSec() { + return 15 * 60; + } } diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java new file mode 100644 index 0000000000..7782b1e5ce --- /dev/null +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java @@ -0,0 +1,42 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawlerservice.server; + +import java.util.Date; + +/** + * Crawled page interface. 
+ */ +public interface CrawledPage { + void setUrl(String url); + + String getUrl(); + + void setFetchDate(Date fetchDate); + + Date getFetchDate(); + + void setFetchInProgress(boolean fetchInProgress); + + boolean isFetchInProgress(); + + void setContent(String content); + + String getContent(); + + boolean isExpired(long cachedPageTimeoutSec); +} diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java index c5263134c4..89c9d7483b 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java @@ -20,6 +20,7 @@ import java.lang.annotation.Target; import com.google.inject.BindingAnnotation; +import com.gwtplatform.crawlerservice.server.guice.CrawlServiceServlet; import static java.lang.annotation.ElementType.FIELD; import static java.lang.annotation.ElementType.METHOD; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java index d5ca1cf6e1..2161329d79 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java @@ -20,50 +20,60 @@ import com.googlecode.objectify.annotation.Entity; import com.googlecode.objectify.annotation.Id; +import com.gwtplatform.crawlerservice.server.CrawledPage; /** * Stores a cached version of a page. 
*/ @Entity -public class CachedPage { +public class CachedPage implements CrawledPage { @Id private String url; private Date fetchDate; private boolean fetchInProgress; private String content; + @Override public void setUrl(String url) { this.url = url; } + @Override public String getUrl() { return url; } + @Override public void setFetchDate(Date fetchDate) { this.fetchDate = new Date(fetchDate.getTime()); } + @Override public Date getFetchDate() { return new Date(fetchDate.getTime()); } + @Override public void setFetchInProgress(boolean fetchInProgress) { this.fetchInProgress = fetchInProgress; } + @Override public boolean isFetchInProgress() { return fetchInProgress; } + @Override public void setContent(String content) { this.content = content; } + @Override public String getContent() { return content; } + @Override public boolean isExpired(long cachedPageTimeoutSec) { return new Date().getTime() > fetchDate.getTime() + cachedPageTimeoutSec * 1000; } diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java index a6c4f0f6e1..533b90863e 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java @@ -22,7 +22,6 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Provides; import com.google.inject.servlet.ServletModule; -import com.gwtplatform.crawlerservice.server.CrawlServiceServlet; public class CrawlServiceModule extends ServletModule { diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java new file mode 100644 index 0000000000..a9807f4ec9 --- 
/dev/null +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java @@ -0,0 +1,99 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawlerservice.server.guice; + +import java.util.logging.Logger; + +import javax.inject.Provider; +import javax.inject.Singleton; + +import com.gargoylesoftware.htmlunit.WebClient; +import com.google.inject.Inject; +import com.googlecode.objectify.Key; +import com.gwtplatform.crawlerservice.server.AbstractCrawlServiceServlet; +import com.gwtplatform.crawlerservice.server.CachedPageTimeoutSec; +import com.gwtplatform.crawlerservice.server.HtmlUnitTimeoutMillis; +import com.gwtplatform.crawlerservice.server.ServiceKey; +import com.gwtplatform.crawlerservice.server.domain.CachedPage; +import com.gwtplatform.crawlerservice.server.service.CachedPageDao; + +/** + * Servlet that makes it possible to fetch an external page, + * renders it using HTMLUnit and returns the HTML page. 
+ */ +@Singleton +public class CrawlServiceServlet extends AbstractCrawlServiceServlet { + + @Inject(optional = true) + @HtmlUnitTimeoutMillis + private long timeoutMillis = 5000; + + @Inject(optional = true) + @CachedPageTimeoutSec + private long cachedPageTimeoutSec = 15 * 60; + + private final Provider webClientProvider; + + private final CachedPageDao cachedPageDao; + + @Inject + protected CrawlServiceServlet( + Provider webClientProvider, + Logger log, + CachedPageDao cachedPageDao, + @ServiceKey String key) { + super(log, key); + + this.webClientProvider = webClientProvider; + this.cachedPageDao = cachedPageDao; + } + + @Override + protected CachedPage createCrawledPage() { + return new CachedPage(); + } + + @Override + protected CachedPage getCachedPage(String url) { + return cachedPageDao.get(Key.create(CachedPage.class, url)); + } + + @Override + protected void saveCachedPage(CachedPage cachedPage) { + cachedPageDao.put(cachedPage); + } + + @Override + protected void deleteCachedPage(CachedPage cachedPage) { + cachedPageDao.delete(cachedPage); + } + + @Override + protected WebClient getWebClient() { + return webClientProvider.get(); + } + + @Override + public long getCachedPageTimeoutSec() { + return cachedPageTimeoutSec; + } + + @Override + public long getTimeoutMillis() { + return timeoutMillis; + } +} From e12fddfdb0f6a25c852b02e9ea203802c0d3dab7 Mon Sep 17 00:00:00 2001 From: BenDol Date: Tue, 6 Oct 2015 23:49:14 +1300 Subject: [PATCH 02/11] Move these out of guice package. 
--- .../server/{guice => }/CrawlServiceServlet.java | 9 ++------- .../gwtplatform/crawlerservice/server/ServiceKey.java | 1 - .../crawlerservice/server/guice/CrawlServiceModule.java | 1 + 3 files changed, 3 insertions(+), 8 deletions(-) rename gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/{guice => }/CrawlServiceServlet.java (84%) diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java similarity index 84% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java rename to gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index a9807f4ec9..eeb738ccda 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawlerservice.server.guice; +package com.gwtplatform.crawlerservice.server; import java.util.logging.Logger; @@ -24,16 +24,11 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Inject; import com.googlecode.objectify.Key; -import com.gwtplatform.crawlerservice.server.AbstractCrawlServiceServlet; -import com.gwtplatform.crawlerservice.server.CachedPageTimeoutSec; -import com.gwtplatform.crawlerservice.server.HtmlUnitTimeoutMillis; -import com.gwtplatform.crawlerservice.server.ServiceKey; import com.gwtplatform.crawlerservice.server.domain.CachedPage; import com.gwtplatform.crawlerservice.server.service.CachedPageDao; /** - * Servlet that makes it possible to fetch an external page, - * renders it using HTMLUnit and returns the HTML page. + * Guice crawl servlet implementation. 
*/ @Singleton public class CrawlServiceServlet extends AbstractCrawlServiceServlet { diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java index 89c9d7483b..c5263134c4 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java @@ -20,7 +20,6 @@ import java.lang.annotation.Target; import com.google.inject.BindingAnnotation; -import com.gwtplatform.crawlerservice.server.guice.CrawlServiceServlet; import static java.lang.annotation.ElementType.FIELD; import static java.lang.annotation.ElementType.METHOD; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java index 533b90863e..a6c4f0f6e1 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java @@ -22,6 +22,7 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Provides; import com.google.inject.servlet.ServletModule; +import com.gwtplatform.crawlerservice.server.CrawlServiceServlet; public class CrawlServiceModule extends ServletModule { From 76eccca9ccb96a0b7b568d336f55ef84eb0c22be Mon Sep 17 00:00:00 2001 From: BenDol Date: Wed, 7 Oct 2015 00:39:56 +1300 Subject: [PATCH 03/11] Move AbstractCrawlServiceServlet to gwtp-crawler module. 
--- gwtp-core/gwtp-crawler/pom.xml | 5 +++++ .../server/AbstractCrawlServiceServlet.java | 13 +++++++------ .../com/gwtplatform/crawler/server/CrawlFilter.java | 2 +- .../gwtplatform/crawler}/server/CrawledPage.java | 2 +- .../crawler}/server/InvalidKeyException.java | 2 +- gwtp-core/pom.xml | 7 +++++++ gwtp-crawler-service/pom.xml | 6 ++++++ .../crawlerservice/server/CrawlServiceServlet.java | 1 + .../crawlerservice/server/ServiceKey.java | 1 - .../crawlerservice/server/domain/CachedPage.java | 2 +- 10 files changed, 30 insertions(+), 11 deletions(-) rename {gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice => gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler}/server/AbstractCrawlServiceServlet.java (98%) rename {gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice => gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler}/server/CrawledPage.java (95%) rename {gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice => gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler}/server/InvalidKeyException.java (94%) diff --git a/gwtp-core/gwtp-crawler/pom.xml b/gwtp-core/gwtp-crawler/pom.xml index 6ccb64a9b8..4649c5f65d 100644 --- a/gwtp-core/gwtp-crawler/pom.xml +++ b/gwtp-core/gwtp-crawler/pom.xml @@ -36,6 +36,11 @@ servlet-api + + net.sourceforge.htmlunit + htmlunit + + com.google.inject diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java similarity index 98% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java rename to gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java index db9ef93021..601508b2c9 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/AbstractCrawlServiceServlet.java +++ 
b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server; import java.io.IOException; import java.io.PrintWriter; @@ -35,7 +35,6 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; -import com.google.common.base.Strings; /** * Servlet that makes it possible to fetch an external page, @@ -64,16 +63,18 @@ protected AbstractCrawlServiceServlet(Logger log, String key) { this.key = key; } + protected abstract WebClient getWebClient(); + protected abstract T createCrawledPage(); + // Page cache operations + protected abstract T getCachedPage(String url); protected abstract void saveCachedPage(T crawledPage); protected abstract void deleteCachedPage(T crawledPage); - protected abstract WebClient getWebClient(); - @Override protected void doGet(HttpServletRequest request, HttpServletResponse response) { PrintWriter out = null; @@ -86,7 +87,7 @@ protected void doGet(HttpServletRequest request, HttpServletResponse response) { validateKey(request); String url = request.getParameter("url"); - if (!Strings.isNullOrEmpty(url)) { + if (url != null && !url.isEmpty()) { url = URLDecoder.decode(url, CHAR_ENCODING); T crawledPage = getCachedPage(url); @@ -117,7 +118,7 @@ private void validateKey(HttpServletRequest request) throws InvalidKeyException, UnsupportedEncodingException { String receivedKey = request.getParameter("key"); - if (Strings.isNullOrEmpty(receivedKey)) { + if (receivedKey == null || receivedKey.isEmpty()) { throw new InvalidKeyException("No service key attached to the request."); } else { String decodedKey = URLDecoder.decode(receivedKey, CHAR_ENCODING); diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java 
b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java index 306d6709a3..cd5a316563 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java @@ -152,7 +152,7 @@ public void doFilter(ServletRequest request, ServletResponse response, log.info("Crawl filter encountered escaped fragment, will open: " + pageName); String serviceRequest = serviceUrl + "?key=" + URLEncoder.encode(key, CHAR_ENCODING) - + "&url=" + URLEncoder.encode(pageName, CHAR_ENCODING); + + "&url=" + URLEncoder.encode(pageName, CHAR_ENCODING); log.info("Full service request: " + serviceRequest); diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java similarity index 95% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java rename to gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java index 7782b1e5ce..52f8d755fe 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawledPage.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java @@ -14,7 +14,7 @@ * the License. 
*/ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server; import java.util.Date; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/InvalidKeyException.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/InvalidKeyException.java similarity index 94% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/InvalidKeyException.java rename to gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/InvalidKeyException.java index ab5481dd52..37edb10691 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/InvalidKeyException.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/InvalidKeyException.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server; public class InvalidKeyException extends Exception { private static final long serialVersionUID = 1L; diff --git a/gwtp-core/pom.xml b/gwtp-core/pom.xml index 9ebdf5bcce..45f7ea3152 100644 --- a/gwtp-core/pom.xml +++ b/gwtp-core/pom.xml @@ -251,6 +251,13 @@ ${velocity.version} + + + net.sourceforge.htmlunit + htmlunit + ${htmlunit.version} + + org.jukito diff --git a/gwtp-crawler-service/pom.xml b/gwtp-crawler-service/pom.xml index 3fc0e2be92..8dfe3ed32d 100644 --- a/gwtp-crawler-service/pom.xml +++ b/gwtp-crawler-service/pom.xml @@ -80,6 +80,12 @@ + + ${project.groupId} + gwtp-crawler + ${project.version} + + javax.servlet servlet-api diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index eeb738ccda..75ea52d128 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ 
b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -24,6 +24,7 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Inject; import com.googlecode.objectify.Key; +import com.gwtplatform.crawler.server.AbstractCrawlServiceServlet; import com.gwtplatform.crawlerservice.server.domain.CachedPage; import com.gwtplatform.crawlerservice.server.service.CachedPageDao; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java index c5263134c4..fd68825395 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java @@ -32,7 +32,6 @@ *
bindConstant().annotatedWith(ServiceKey.class).to("123456");
  * 
*/ - @BindingAnnotation @Target({FIELD, PARAMETER, METHOD}) @Retention(RUNTIME) diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java index 2161329d79..65562dc256 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java @@ -20,7 +20,7 @@ import com.googlecode.objectify.annotation.Entity; import com.googlecode.objectify.annotation.Id; -import com.gwtplatform.crawlerservice.server.CrawledPage; +import com.gwtplatform.crawler.server.CrawledPage; /** * Stores a cached version of a page. From 2b2a661cbb4c8711907851d2d13957feb7ad30b9 Mon Sep 17 00:00:00 2001 From: BenDol Date: Wed, 7 Oct 2015 00:42:49 +1300 Subject: [PATCH 04/11] Checkstyle fix --- .../main/java/com/gwtplatform/crawler/server/CrawlFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java index cd5a316563..306d6709a3 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java @@ -152,7 +152,7 @@ public void doFilter(ServletRequest request, ServletResponse response, log.info("Crawl filter encountered escaped fragment, will open: " + pageName); String serviceRequest = serviceUrl + "?key=" + URLEncoder.encode(key, CHAR_ENCODING) - + "&url=" + URLEncoder.encode(pageName, CHAR_ENCODING); + + "&url=" + URLEncoder.encode(pageName, CHAR_ENCODING); log.info("Full service request: " + serviceRequest); From fb9846f3480f191573f2e4715ccc108d8c3231d5 Mon Sep 17 00:00:00 2001 From: BenDol Date: Wed, 7 
Oct 2015 23:00:21 +1300 Subject: [PATCH 05/11] Decouple more crawler logic & started Spring implementation. Currently the Spring implementation is untested (as well as the Guice changes). If someone else that is more experienced using Guice could see if there will be any issues that would be great! --- gwtp-core/gwtp-crawler-guice/pom.xml | 55 ++++++++ .../server/guice}/CachedPageTimeoutSec.java | 2 +- .../server/guice}/CrawlServiceServlet.java | 37 +----- .../server/guice/GuiceCrawlFilter.java | 38 ++++++ .../server/guice}/HtmlUnitTimeoutMillis.java | 2 +- .../crawler/server/guice}/ServiceKey.java | 3 +- .../crawler/server/guice}/ServiceUrl.java | 2 +- gwtp-core/gwtp-crawler-spring/pom.xml | 78 ++++++++++++ .../server/spring/CrawlServiceServlet.java | 119 ++++++++++++++++++ .../server/spring/SpringCrawlFilter.java | 63 ++++++++++ gwtp-core/gwtp-crawler/pom.xml | 6 - .../server/AbstractCrawlServiceServlet.java | 61 ++++++--- .../crawler/server/CrawlCacheService.java | 31 +++++ .../crawler/server/CrawlFilter.java | 20 +-- .../crawler/server/DefaultCrawledPage.java | 74 +++++++++++ gwtp-core/pom.xml | 2 + gwtp-crawler-service/pom.xml | 2 +- .../server/OfyCrawlCacheService.java | 59 +++++++++ .../crawlerservice/server/ServiceKey.java | 39 ------ .../server/guice/CrawlServiceModule.java | 6 +- 20 files changed, 587 insertions(+), 112 deletions(-) create mode 100644 gwtp-core/gwtp-crawler-guice/pom.xml rename {gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server => gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice}/CachedPageTimeoutSec.java (96%) rename {gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server => gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice}/CrawlServiceServlet.java (64%) create mode 100644 gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java rename 
{gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server => gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice}/HtmlUnitTimeoutMillis.java (96%) rename gwtp-core/{gwtp-crawler/src/main/java/com/gwtplatform/crawler/server => gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice}/ServiceKey.java (96%) rename gwtp-core/{gwtp-crawler/src/main/java/com/gwtplatform/crawler/server => gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice}/ServiceUrl.java (96%) create mode 100644 gwtp-core/gwtp-crawler-spring/pom.xml create mode 100644 gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java create mode 100644 gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java create mode 100644 gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java create mode 100644 gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java create mode 100644 gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java delete mode 100644 gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java diff --git a/gwtp-core/gwtp-crawler-guice/pom.xml b/gwtp-core/gwtp-crawler-guice/pom.xml new file mode 100644 index 0000000000..fa4f62c654 --- /dev/null +++ b/gwtp-core/gwtp-crawler-guice/pom.xml @@ -0,0 +1,55 @@ + + + 4.0.0 + + + com.gwtplatform + gwtp-core + 1.6-SNAPSHOT + + + gwtp-crawler-guice + GWTP Crawler for Guice + + + + + + src/main/java + + **/*.java + + + + + src/main/resources + + **/*.gwt.xml + + + + + + + + ${project.groupId} + gwtp-crawler + + + + javax.servlet + servlet-api + + + + net.sourceforge.htmlunit + htmlunit + + + + + com.google.inject + guice + + + diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CachedPageTimeoutSec.java 
b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java similarity index 96% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CachedPageTimeoutSec.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java index f7ff39255b..00ba5ae6e7 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CachedPageTimeoutSec.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server.guice; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java similarity index 64% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java index 75ea52d128..d5e87d3351 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java @@ -14,7 +14,7 @@ * the License. 
*/ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server.guice; import java.util.logging.Logger; @@ -23,16 +23,14 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Inject; -import com.googlecode.objectify.Key; import com.gwtplatform.crawler.server.AbstractCrawlServiceServlet; -import com.gwtplatform.crawlerservice.server.domain.CachedPage; -import com.gwtplatform.crawlerservice.server.service.CachedPageDao; +import com.gwtplatform.crawler.server.CrawlCacheService; /** - * Guice crawl servlet implementation. + * Guice Crawl Service Servlet. */ @Singleton -public class CrawlServiceServlet extends AbstractCrawlServiceServlet { +public class CrawlServiceServlet extends AbstractCrawlServiceServlet { @Inject(optional = true) @HtmlUnitTimeoutMillis @@ -44,38 +42,15 @@ public class CrawlServiceServlet extends AbstractCrawlServiceServlet private final Provider webClientProvider; - private final CachedPageDao cachedPageDao; - @Inject protected CrawlServiceServlet( Provider webClientProvider, Logger log, - CachedPageDao cachedPageDao, + CrawlCacheService crawlCacheService, @ServiceKey String key) { - super(log, key); + super(log, key, crawlCacheService); this.webClientProvider = webClientProvider; - this.cachedPageDao = cachedPageDao; - } - - @Override - protected CachedPage createCrawledPage() { - return new CachedPage(); - } - - @Override - protected CachedPage getCachedPage(String url) { - return cachedPageDao.get(Key.create(CachedPage.class, url)); - } - - @Override - protected void saveCachedPage(CachedPage cachedPage) { - cachedPageDao.put(cachedPage); - } - - @Override - protected void deleteCachedPage(CachedPage cachedPage) { - cachedPageDao.delete(cachedPage); } @Override diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java new file mode 100644 
index 0000000000..ceb9a11272 --- /dev/null +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java @@ -0,0 +1,38 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server.guice; + +import java.util.logging.Logger; + +import javax.inject.Inject; +import javax.inject.Singleton; + +import com.gwtplatform.crawler.server.CrawlFilter; + +/** + * Guice implementation for the {@link CrawlFilter}. 
+ */ +@Singleton +public final class GuiceCrawlFilter extends CrawlFilter { + + @Inject + GuiceCrawlFilter(@ServiceUrl String serviceUrl, + @ServiceKey String key, + Logger log) { + super(serviceUrl, key, log); + } +} diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/HtmlUnitTimeoutMillis.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java similarity index 96% rename from gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/HtmlUnitTimeoutMillis.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java index 39800076b0..a356948f93 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/HtmlUnitTimeoutMillis.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawlerservice.server; +package com.gwtplatform.crawler.server.guice; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java similarity index 96% rename from gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java index 53cdd762a2..e1912a3228 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java @@ -14,7 +14,7 @@ * the License. 
*/ -package com.gwtplatform.crawler.server; +package com.gwtplatform.crawler.server.guice; import java.lang.annotation.Retention; import java.lang.annotation.Target; @@ -32,7 +32,6 @@ *
bindConstant().annotatedWith(ServiceKey.class).to("123456");
  * 
*/ - @BindingAnnotation @Target({FIELD, PARAMETER, METHOD}) @Retention(RUNTIME) diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceUrl.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java similarity index 96% rename from gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceUrl.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java index 4ee6b19e3c..25f47a1ba9 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/ServiceUrl.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawler.server; +package com.gwtplatform.crawler.server.guice; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-core/gwtp-crawler-spring/pom.xml b/gwtp-core/gwtp-crawler-spring/pom.xml new file mode 100644 index 0000000000..761345669e --- /dev/null +++ b/gwtp-core/gwtp-crawler-spring/pom.xml @@ -0,0 +1,78 @@ + + + 4.0.0 + + + com.gwtplatform + gwtp-core + 1.6-SNAPSHOT + + + gwtp-crawler-spring + GWTP Crawler for Spring + + + + + + src/main/java + + **/*.java + + + + + src/main/resources + + **/*.gwt.xml + + + + + + + + ${project.groupId} + gwtp-crawler + + + + javax.servlet + servlet-api + + + + net.sourceforge.htmlunit + htmlunit + + + + org.springframework + spring-core + ${spring.version} + + + org.springframework + spring-context + ${spring.version} + + + org.springframework + spring-beans + ${spring.version} + + + org.springframework + spring-web + ${spring.version} + + + + + org.springframework + spring-test + ${spring.version} + test + + + diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java 
b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java new file mode 100644 index 0000000000..0eb78c10fa --- /dev/null +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java @@ -0,0 +1,119 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server.spring; + +import java.io.IOException; +import java.util.logging.Logger; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; +import org.springframework.web.HttpRequestHandler; +import org.springframework.web.WebApplicationInitializer; + +import com.gargoylesoftware.htmlunit.WebClient; +import com.gwtplatform.crawler.server.AbstractCrawlServiceServlet; +import com.gwtplatform.crawler.server.CrawlCacheService; +import com.gwtplatform.crawler.server.CrawledPage; + +/** + * Spring Crawl Service Servlet.
+ * Required bean dependencies are: + *
    + *
  • webClient ({@link WebClient}): HTML Unit virtual web client.
  • + *
  • crawlCacheService ({@link CrawlCacheService}): Crawled page cache service.
  • + *
  • crawlKey (String): Unique key for the crawler service.
  • + *
  • logger (Logger): Logger for the crawl service servlet.
  • + *
  • timeoutMillis (long:5000): The HTML Unit Timeout in milliseconds.
  • + *
  • cachedPageTimeoutSec (long:900): Cache timeout period before {@link CrawledPage}s are invalidated.
  • + *
+ * + * Register in web.xml like so: + *
+ * {@code
+ *  <!-- First ensure you have the ContextLoaderListener -->
+ *     
+ *         org.springframework.web.context.ContextLoaderListener
+ *     
+ *
+ *     
+ *          crawlServiceServlet
+ *          org.springframework.web.context.support.HttpRequestHandlerServlet
+ *     
+ *
+ *     
+ *         crawlServiceServlet
+ *         /*
+ *     }
+ * 
+ * or using {@link WebApplicationInitializer}: + *
+ *   // Ensure you have registered the ContextLoaderListener
+ *   servletContext.addListener(new ContextLoaderListener(context));
+ *
+ *   // Register the new servlet as a HttpRequestHandlerServlet
+ *   servletContext.addServlet("crawlServiceServlet", new HttpRequestHandlerServlet()).addMapping("/*");
+ * 
+ * + */ +@Component +public class CrawlServiceServlet extends AbstractCrawlServiceServlet implements HttpRequestHandler { + + @Value("${timeoutMillis:5000}") + private long timeoutMillis; + + @Value("${cachedPageTimeoutSec:900}") + private long cachedPageTimeoutSec; + + private final WebClient webClient; + + @Autowired + protected CrawlServiceServlet( + WebClient webClient, + CrawlCacheService crawlCacheService, + String crawlKey, + Logger logger) { + super(logger, crawlKey, crawlCacheService); + + this.webClient = webClient; + } + + @Override + public void handleRequest(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + doGet(request, response); + } + + @Override + protected WebClient getWebClient() { + return webClient; + } + + @Override + public long getCachedPageTimeoutSec() { + return cachedPageTimeoutSec; + } + + @Override + public long getTimeoutMillis() { + return timeoutMillis; + } +} diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java new file mode 100644 index 0000000000..efa241e87f --- /dev/null +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java @@ -0,0 +1,63 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.gwtplatform.crawler.server.spring; + +import java.util.logging.Logger; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; +import org.springframework.web.WebApplicationInitializer; + +import com.gwtplatform.crawler.server.CrawlFilter; + +/** + * Spring implementation for the {@link CrawlFilter}.
+ * Required bean dependencies are: + *
    + *
  • serviceUrl (String): URL for the crawler service.
  • + *
  • crawlKey (String): Unique key for the crawler service.
  • + *
  • logger (Logger): Logger for the crawl filter.
  • + *
+ * + * Register in web.xml like so: + *
+ * {@code
+ * 
+ *          crawlFilter
+ *          org.springframework.web.filter.DelegatingFilterProxy
+ *     
+ *     
+ *         crawlFilter
+ *         /*
+ *     }
+ * 
+ * or using {@link WebApplicationInitializer}: + *
+ *   servletContext.addFilter("crawlFilter", new DelegatingFilterProxy())
+ *      .addMappingForUrlPatterns(EnumSet.of(DispatcherType.REQUEST), true, "/*");
+ * 
+ * + * @author Ben Dol + */ +@Component("crawlFilter") +public final class SpringCrawlFilter extends CrawlFilter { + + @Autowired + SpringCrawlFilter(String serviceUrl, String crawlKey, Logger logger) { + super(serviceUrl, crawlKey, logger); + } +} diff --git a/gwtp-core/gwtp-crawler/pom.xml b/gwtp-core/gwtp-crawler/pom.xml index 4649c5f65d..8202f28992 100644 --- a/gwtp-core/gwtp-crawler/pom.xml +++ b/gwtp-core/gwtp-crawler/pom.xml @@ -40,11 +40,5 @@ net.sourceforge.htmlunit htmlunit
- - - - com.google.inject - guice -
diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java index 601508b2c9..fce43a9e08 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java @@ -40,7 +40,8 @@ * Servlet that makes it possible to fetch an external page, * renders it using HTMLUnit and returns the HTML page. */ -public abstract class AbstractCrawlServiceServlet extends HttpServlet { +@SuppressWarnings("unchecked") +public abstract class AbstractCrawlServiceServlet extends HttpServlet { private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController { private static final long serialVersionUID = 1L; @@ -51,6 +52,24 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) } } + static class DummyCrawlCacheService implements CrawlCacheService { + @Override + public CrawledPage createCrawledPage() { + return new DefaultCrawledPage(); + } + + @Override + public CrawledPage getCachedPage(String url) { + return null; + } + + @Override + public void saveCachedPage(CrawledPage crawledPage) { } + + @Override + public void deleteCachedPage(CrawledPage crawledPage) { } + } + protected static final String CHAR_ENCODING = "UTF-8"; private static final long serialVersionUID = -6129110224710383122L; @@ -58,23 +77,25 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) protected final Logger log; protected final String key; - protected AbstractCrawlServiceServlet(Logger log, String key) { + private final CrawlCacheService cacheService; + + public AbstractCrawlServiceServlet( + Logger log, + String key) { + this(log, key, new DummyCrawlCacheService()); + } + + public AbstractCrawlServiceServlet( + Logger log, + String 
key, + CrawlCacheService cacheService) { this.log = log; this.key = key; + this.cacheService = cacheService; } protected abstract WebClient getWebClient(); - protected abstract T createCrawledPage(); - - // Page cache operations - - protected abstract T getCachedPage(String url); - - protected abstract void saveCachedPage(T crawledPage); - - protected abstract void deleteCachedPage(T crawledPage); - @Override protected void doGet(HttpServletRequest request, HttpServletResponse response) { PrintWriter out = null; @@ -90,7 +111,7 @@ protected void doGet(HttpServletRequest request, HttpServletResponse response) { if (url != null && !url.isEmpty()) { url = URLDecoder.decode(url, CHAR_ENCODING); - T crawledPage = getCachedPage(url); + CrawledPage crawledPage = cacheService.getCachedPage(url); Date currDate = new Date(); @@ -129,10 +150,10 @@ private void validateKey(HttpServletRequest request) } } - private void storeFetchedPage(T crawledPage, String stringBuilder) { + private void storeFetchedPage(CrawledPage crawledPage, String stringBuilder) { crawledPage.setContent(stringBuilder); crawledPage.setFetchInProgress(false); - saveCachedPage(crawledPage); + cacheService.saveCachedPage(crawledPage); } /** @@ -144,7 +165,7 @@ private void storeFetchedPage(T crawledPage, String stringBuilder) { * @param out The {@link PrintWriter} to write to, if needed. * @return {@code true} if the page needs to be fetched, {@code false} otherwise. */ - private boolean needToFetchPage(T matchingPage, Date currDate, PrintWriter out) { + private boolean needToFetchPage(CrawledPage matchingPage, Date currDate, PrintWriter out) { if (matchingPage == null || matchingPage.isExpired(getCachedPageTimeoutSec())) { return true; } @@ -152,7 +173,7 @@ private boolean needToFetchPage(T matchingPage, Date currDate, PrintWriter out) if (matchingPage.isFetchInProgress()) { // If fetch is in progress since more than 60 seconds, we consider something went wrong and fetch again. 
if (currDate.getTime() > matchingPage.getFetchDate().getTime() + 60000) { - deleteCachedPage(matchingPage); + cacheService.deleteCachedPage(matchingPage); return true; } else { out.println("FETCH_IN_PROGRESS"); @@ -171,12 +192,12 @@ private boolean needToFetchPage(T matchingPage, Date currDate, PrintWriter out) * @param currDate The current date, to mark the page. * @return The newly created placeholder page. */ - private T createPlaceholderPage(String url, Date currDate) { - T result = createCrawledPage(); + private CrawledPage createPlaceholderPage(String url, Date currDate) { + CrawledPage result = cacheService.createCrawledPage(); result.setUrl(url); result.setFetchDate(currDate); result.setFetchInProgress(true); - saveCachedPage(result); + cacheService.saveCachedPage(result); return result; } diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java new file mode 100644 index 0000000000..87dcb077a7 --- /dev/null +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java @@ -0,0 +1,31 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server; + +/** + * Crawl cache service interface. 
+ */ +public interface CrawlCacheService { + + T createCrawledPage(); + + T getCachedPage(String url); + + void saveCachedPage(T crawledPage); + + void deleteCachedPage(T crawledPage); +} diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java index 306d6709a3..390bc605e6 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlFilter.java @@ -28,8 +28,6 @@ import java.net.URLEncoder; import java.util.logging.Logger; -import javax.inject.Inject; -import javax.inject.Singleton; import javax.servlet.Filter; import javax.servlet.FilterChain; import javax.servlet.FilterConfig; @@ -40,10 +38,9 @@ import javax.servlet.http.HttpServletResponse; /** - * Servlet that makes this application crawlable. + * Servlet filter that makes this application crawlable. 
*/ -@Singleton -public final class CrawlFilter implements Filter { +public class CrawlFilter implements Filter { private static final String CHAR_ENCODING = "UTF-8"; /** @@ -64,10 +61,7 @@ public final class CrawlFilter implements Filter { private final String key; private final Logger log; - @Inject - CrawlFilter(@ServiceUrl String serviceUrl, - @ServiceKey String key, - Logger log) { + protected CrawlFilter(String serviceUrl, String key, Logger log) { this.serviceUrl = serviceUrl; this.key = key; this.log = log; @@ -202,4 +196,12 @@ public void doFilter(ServletRequest request, ServletResponse response, @Override public void init(FilterConfig filterConfig) throws ServletException { } + + public String getServiceUrl() { + return serviceUrl; + } + + public String getKey() { + return key; + } } diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java new file mode 100644 index 0000000000..69a5c72d2e --- /dev/null +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java @@ -0,0 +1,74 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server; + +import java.util.Date; + +/** + * Default crawled page implementation. 
+ */ +public class DefaultCrawledPage implements CrawledPage { + private String url; + private Date fetchDate; + private boolean fetchInProgress; + private String content; + + @Override + public void setUrl(String url) { + this.url = url; + } + + @Override + public String getUrl() { + return url; + } + + @Override + public void setFetchDate(Date fetchDate) { + this.fetchDate = new Date(fetchDate.getTime()); + } + + @Override + public Date getFetchDate() { + return new Date(fetchDate.getTime()); + } + + @Override + public void setFetchInProgress(boolean fetchInProgress) { + this.fetchInProgress = fetchInProgress; + } + + @Override + public boolean isFetchInProgress() { + return fetchInProgress; + } + + @Override + public void setContent(String content) { + this.content = content; + } + + @Override + public String getContent() { + return content; + } + + @Override + public boolean isExpired(long cachedPageTimeoutSec) { + return new Date().getTime() > fetchDate.getTime() + cachedPageTimeoutSec * 1000; + } +} diff --git a/gwtp-core/pom.xml b/gwtp-core/pom.xml index 45f7ea3152..e857f612d7 100644 --- a/gwtp-core/pom.xml +++ b/gwtp-core/pom.xml @@ -29,6 +29,8 @@ gwtp-tester gwtp-processors gwtp-crawler + gwtp-crawler-guice + gwtp-crawler-spring gwtp-all diff --git a/gwtp-crawler-service/pom.xml b/gwtp-crawler-service/pom.xml index 8dfe3ed32d..9aefab6aa5 100644 --- a/gwtp-crawler-service/pom.xml +++ b/gwtp-crawler-service/pom.xml @@ -82,7 +82,7 @@ ${project.groupId} - gwtp-crawler + gwtp-crawler-guice ${project.version} diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java new file mode 100644 index 0000000000..e654d28f7c --- /dev/null +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java @@ -0,0 +1,59 @@ +/* + * Copyright 2011 ArcBees Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawlerservice.server; + +import javax.inject.Inject; +import javax.inject.Singleton; + +import com.googlecode.objectify.Key; +import com.gwtplatform.crawler.server.CrawlCacheService; +import com.gwtplatform.crawlerservice.server.domain.CachedPage; +import com.gwtplatform.crawlerservice.server.service.CachedPageDao; + +/** + * Objectify DAO Crawl Cache Service. + */ +@Singleton +public class OfyCrawlCacheService implements CrawlCacheService { + + private final CachedPageDao cachedPageDao; + + @Inject + protected OfyCrawlCacheService(CachedPageDao cachedPageDao) { + this.cachedPageDao = cachedPageDao; + } + + @Override + public CachedPage createCrawledPage() { + return new CachedPage(); + } + + @Override + public CachedPage getCachedPage(String url) { + return cachedPageDao.get(Key.create(CachedPage.class, url)); + } + + @Override + public void saveCachedPage(CachedPage cachedPage) { + cachedPageDao.put(cachedPage); + } + + @Override + public void deleteCachedPage(CachedPage cachedPage) { + cachedPageDao.delete(cachedPage); + } +} diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java deleted file mode 100644 index fd68825395..0000000000 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/ServiceKey.java +++ /dev/null 
@@ -1,39 +0,0 @@ -/* - * Copyright 2011 ArcBees Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package com.gwtplatform.crawlerservice.server; - -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -import com.google.inject.BindingAnnotation; - -import static java.lang.annotation.ElementType.FIELD; -import static java.lang.annotation.ElementType.METHOD; -import static java.lang.annotation.ElementType.PARAMETER; -import static java.lang.annotation.RetentionPolicy.RUNTIME; - -/** - * Use this annotation to bind the key that should be used when invoking - * {@link CrawlServiceServlet}. For example: - *
bindConstant().annotatedWith(ServiceKey.class).to("123456");
- * 
- */ -@BindingAnnotation -@Target({FIELD, PARAMETER, METHOD}) -@Retention(RUNTIME) -public @interface ServiceKey { -} diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java index a6c4f0f6e1..dffd936d9d 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java @@ -22,12 +22,16 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.google.inject.Provides; import com.google.inject.servlet.ServletModule; -import com.gwtplatform.crawlerservice.server.CrawlServiceServlet; +import com.gwtplatform.crawler.server.CrawlCacheService; +import com.gwtplatform.crawler.server.guice.CrawlServiceServlet; +import com.gwtplatform.crawlerservice.server.OfyCrawlCacheService; public class CrawlServiceModule extends ServletModule { @Override public void configureServlets() { + bind(CrawlCacheService.class).to(OfyCrawlCacheService.class); + serve("*").with(CrawlServiceServlet.class); } From 1a405921f852aaf796e4eee4c7eb5638373ab42c Mon Sep 17 00:00:00 2001 From: BenDol Date: Wed, 7 Oct 2015 23:44:03 +1300 Subject: [PATCH 06/11] Add DefaultCrawlCacheService and clean up. 
--- gwtp-core/gwtp-crawler-guice/pom.xml | 5 ---- gwtp-core/gwtp-crawler-spring/pom.xml | 5 ---- .../server/AbstractCrawlServiceServlet.java | 27 +++++-------------- .../server/DefaultCrawlCacheService.java | 19 +++++++++++++ gwtp-crawler-service/pom.xml | 6 ----- 5 files changed, 26 insertions(+), 36 deletions(-) create mode 100644 gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java diff --git a/gwtp-core/gwtp-crawler-guice/pom.xml b/gwtp-core/gwtp-crawler-guice/pom.xml index fa4f62c654..97cceca1a6 100644 --- a/gwtp-core/gwtp-crawler-guice/pom.xml +++ b/gwtp-core/gwtp-crawler-guice/pom.xml @@ -41,11 +41,6 @@ servlet-api
- - net.sourceforge.htmlunit - htmlunit - - com.google.inject diff --git a/gwtp-core/gwtp-crawler-spring/pom.xml b/gwtp-core/gwtp-crawler-spring/pom.xml index 761345669e..8541f4f734 100644 --- a/gwtp-core/gwtp-crawler-spring/pom.xml +++ b/gwtp-core/gwtp-crawler-spring/pom.xml @@ -41,11 +41,6 @@ servlet-api - - net.sourceforge.htmlunit - htmlunit - - org.springframework spring-core diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java index fce43a9e08..31c55c27ee 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java @@ -52,24 +52,6 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) } } - static class DummyCrawlCacheService implements CrawlCacheService { - @Override - public CrawledPage createCrawledPage() { - return new DefaultCrawledPage(); - } - - @Override - public CrawledPage getCachedPage(String url) { - return null; - } - - @Override - public void saveCachedPage(CrawledPage crawledPage) { } - - @Override - public void deleteCachedPage(CrawledPage crawledPage) { } - } - protected static final String CHAR_ENCODING = "UTF-8"; private static final long serialVersionUID = -6129110224710383122L; @@ -82,7 +64,7 @@ public void deleteCachedPage(CrawledPage crawledPage) { } public AbstractCrawlServiceServlet( Logger log, String key) { - this(log, key, new DummyCrawlCacheService()); + this(log, key, null); } public AbstractCrawlServiceServlet( @@ -91,7 +73,12 @@ public AbstractCrawlServiceServlet( CrawlCacheService cacheService) { this.log = log; this.key = key; - this.cacheService = cacheService; + + if(cacheService != null) { + this.cacheService = cacheService; + } else { + this.cacheService = new 
DefaultCrawlCacheService(); + } } protected abstract WebClient getWebClient(); diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java new file mode 100644 index 0000000000..8b2ecf9875 --- /dev/null +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java @@ -0,0 +1,19 @@ +package com.gwtplatform.crawler.server; + +public class DefaultCrawlCacheService implements CrawlCacheService { + @Override + public DefaultCrawledPage createCrawledPage() { + return new DefaultCrawledPage(); + } + + @Override + public DefaultCrawledPage getCachedPage(String url) { + return null; + } + + @Override + public void saveCachedPage(DefaultCrawledPage crawledPage) { } + + @Override + public void deleteCachedPage(DefaultCrawledPage crawledPage) { } +} diff --git a/gwtp-crawler-service/pom.xml b/gwtp-crawler-service/pom.xml index 9aefab6aa5..fd5c6dbc36 100644 --- a/gwtp-crawler-service/pom.xml +++ b/gwtp-crawler-service/pom.xml @@ -97,12 +97,6 @@ ${javax.inject.version} - - net.sourceforge.htmlunit - htmlunit - ${htmlunit.version} - - com.google.appengine appengine-api-1.0-sdk From 4741aab275fc0d7eba6e04da9df9cc91b4f58242 Mon Sep 17 00:00:00 2001 From: BenDol Date: Wed, 7 Oct 2015 23:46:28 +1300 Subject: [PATCH 07/11] Fix checkstyle issues. 
--- .../server/AbstractCrawlServiceServlet.java | 2 +- .../crawler/server/DefaultCrawlCacheService.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java index 31c55c27ee..a4836f7c2e 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/AbstractCrawlServiceServlet.java @@ -74,7 +74,7 @@ public AbstractCrawlServiceServlet( this.log = log; this.key = key; - if(cacheService != null) { + if (cacheService != null) { this.cacheService = cacheService; } else { this.cacheService = new DefaultCrawlCacheService(); diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java index 8b2ecf9875..39c8ae280f 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java @@ -1,3 +1,19 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + package com.gwtplatform.crawler.server; public class DefaultCrawlCacheService implements CrawlCacheService { From fd482985bdad860c55eccd2a2187a2dca646624b Mon Sep 17 00:00:00 2001 From: BenDol Date: Thu, 8 Oct 2015 00:53:45 +1300 Subject: [PATCH 08/11] Working Spring crawler support (filter and service base support). Separated the classes to 'filter' and 'service' sub packages. --- .../crawler/server/guice/ServiceKey.java | 1 + .../guice/{ => filter}/GuiceCrawlFilter.java | 3 +- .../server/guice/{ => filter}/ServiceUrl.java | 2 +- .../{ => service}/CachedPageTimeoutSec.java | 2 +- .../{ => service}/CrawlServiceServlet.java | 3 +- .../{ => service}/HtmlUnitTimeoutMillis.java | 2 +- .../server/spring/AbstractCrawlerModule.java | 34 +++++++++++++++ .../filter/AbstractCrawlFilterModule.java | 34 +++++++++++++++ .../{ => filter}/SpringCrawlFilter.java | 18 ++++---- .../service/AbstractCrawlServiceModule.java | 43 +++++++++++++++++++ .../{ => service}/CrawlServiceServlet.java | 17 +++++--- .../server/guice/CrawlServiceModule.java | 2 +- 12 files changed, 140 insertions(+), 21 deletions(-) rename gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/{ => filter}/GuiceCrawlFilter.java (90%) rename gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/{ => filter}/ServiceUrl.java (95%) rename gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/{ => service}/CachedPageTimeoutSec.java (96%) rename gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/{ => service}/CrawlServiceServlet.java (94%) rename gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/{ => service}/HtmlUnitTimeoutMillis.java (96%) create mode 100644 gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java create mode 100644 
gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java rename gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/{ => filter}/SpringCrawlFilter.java (78%) create mode 100644 gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java rename gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/{ => service}/CrawlServiceServlet.java (88%) diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java index e1912a3228..f0052eee21 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceKey.java @@ -20,6 +20,7 @@ import java.lang.annotation.Target; import com.google.inject.BindingAnnotation; +import com.gwtplatform.crawler.server.guice.service.CrawlServiceServlet; import static java.lang.annotation.ElementType.FIELD; import static java.lang.annotation.ElementType.METHOD; diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java similarity index 90% rename from gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java index ceb9a11272..088b448ef5 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/GuiceCrawlFilter.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java @@ 
-14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawler.server.guice; +package com.gwtplatform.crawler.server.guice.filter; import java.util.logging.Logger; @@ -22,6 +22,7 @@ import javax.inject.Singleton; import com.gwtplatform.crawler.server.CrawlFilter; +import com.gwtplatform.crawler.server.guice.ServiceKey; /** * Guice implementation for the {@link CrawlFilter}. diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/ServiceUrl.java similarity index 95% rename from gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/ServiceUrl.java index 25f47a1ba9..1f2b1676c4 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/ServiceUrl.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/ServiceUrl.java @@ -14,7 +14,7 @@ * the License. 
*/ -package com.gwtplatform.crawler.server.guice; +package com.gwtplatform.crawler.server.guice.filter; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CachedPageTimeoutSec.java similarity index 96% rename from gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CachedPageTimeoutSec.java index 00ba5ae6e7..80b7c8eb05 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CachedPageTimeoutSec.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CachedPageTimeoutSec.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawler.server.guice; +package com.gwtplatform.crawler.server.guice.service; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java similarity index 94% rename from gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java index d5e87d3351..ec6f33eeec 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/CrawlServiceServlet.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java @@ -14,7 +14,7 @@ * the License. 
*/ -package com.gwtplatform.crawler.server.guice; +package com.gwtplatform.crawler.server.guice.service; import java.util.logging.Logger; @@ -25,6 +25,7 @@ import com.google.inject.Inject; import com.gwtplatform.crawler.server.AbstractCrawlServiceServlet; import com.gwtplatform.crawler.server.CrawlCacheService; +import com.gwtplatform.crawler.server.guice.ServiceKey; /** * Guice Crawl Service Servlet. diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/HtmlUnitTimeoutMillis.java similarity index 96% rename from gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java rename to gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/HtmlUnitTimeoutMillis.java index a356948f93..ad488ae47d 100644 --- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/HtmlUnitTimeoutMillis.java +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/HtmlUnitTimeoutMillis.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawler.server.guice; +package com.gwtplatform.crawler.server.guice.service; import java.lang.annotation.Retention; import java.lang.annotation.Target; diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java new file mode 100644 index 0000000000..575c3847ff --- /dev/null +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java @@ -0,0 +1,34 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server.spring; + +import java.util.logging.Logger; + +import org.springframework.context.annotation.Bean; + +/** + * Abstract crawler module for {@link @Configuration} setup. + */ +public abstract class AbstractCrawlerModule { + @Bean + protected Logger crawlLogger() { + return Logger.getAnonymousLogger(); + } + + @Bean + protected abstract String crawlKey(); +} diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java new file mode 100644 index 0000000000..adac264efe --- /dev/null +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java @@ -0,0 +1,34 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.gwtplatform.crawler.server.spring.filter; + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.ComponentScan; + +import com.gwtplatform.crawler.server.spring.AbstractCrawlerModule; + +/** + * Abstract crawl filter module for {@link @Configuration} setup. + */ +@ComponentScan(basePackages = { + "com.gwtplatform.crawler.server.spring.filter" + }) +public abstract class AbstractCrawlFilterModule extends AbstractCrawlerModule { + + @Bean + protected abstract String serviceUrl(); +} diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java similarity index 78% rename from gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java rename to gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java index efa241e87f..1c87e570a6 100644 --- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/SpringCrawlFilter.java +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java @@ -14,7 +14,7 @@ * the License. */ -package com.gwtplatform.crawler.server.spring; +package com.gwtplatform.crawler.server.spring.filter; import java.util.logging.Logger; @@ -25,15 +25,17 @@ import com.gwtplatform.crawler.server.CrawlFilter; /** - * Spring implementation for the {@link CrawlFilter}.
+ * Spring implementation for the {@link CrawlFilter}.
* Required bean dependencies are: *
    *
  • serviceUrl (String): Url for the crawler service.
  • *
  • crawlKey (String): Unique key for the crawler service.
  • - *
  • logger (Logger): Logger for the crawl filter.
  • + *
  • crawlLogger (Logger): Logger for the crawl filter.
  • *
    - * - * Register in web.xml like so: + * Extend the {@link AbstractCrawlFilterModule} in a class annotated with + * {@link org.springframework.context.annotation.Configuration}. + *
    
+ * Then register inside web.xml like so: *
  * {@code
  * 
@@ -45,7 +47,7 @@
  *         /*
  *     }
  * 
- * or using {@link WebApplicationInitializer}: + * or instead using {@link WebApplicationInitializer}: *
  *   servletContext.addFilter("crawlFilter", new DelegatingFilterProxy())
  *      .addMappingForUrlPatterns(EnumSet.of(DispatcherType.REQUEST), true, "/*");
@@ -57,7 +59,7 @@
 public final class SpringCrawlFilter extends CrawlFilter {
 
     @Autowired
-    SpringCrawlFilter(String serviceUrl, String crawlKey, Logger logger) {
-        super(serviceUrl, crawlKey, logger);
+    SpringCrawlFilter(String serviceUrl, String crawlKey, Logger crawlLogger) {
+        super(serviceUrl, crawlKey, crawlLogger);
     }
 }
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java
new file mode 100644
index 0000000000..9f96c0d122
--- /dev/null
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2011 ArcBees Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.gwtplatform.crawler.server.spring.service;
+
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.ComponentScan;
+
+import com.gargoylesoftware.htmlunit.BrowserVersion;
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gwtplatform.crawler.server.CrawlCacheService;
+import com.gwtplatform.crawler.server.DefaultCrawlCacheService;
+
+/**
+ * Abstract crawl service module for {@link @Configuration} setup.
+ */
+@ComponentScan(basePackages = {
+        "com.gwtplatform.crawler.server.spring.service"
+        })
+public abstract class AbstractCrawlServiceModule {
+    @Bean
+    protected WebClient webClient() {
+        return new WebClient(BrowserVersion.CHROME);
+    }
+
+    @Bean
+    protected CrawlCacheService crawlCacheService() {
+        return new DefaultCrawlCacheService();
+    }
+}
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
similarity index 88%
rename from gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java
rename to gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
index 0eb78c10fa..700e4d27ef 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/CrawlServiceServlet.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
@@ -14,7 +14,7 @@
  * the License.
  */
 
-package com.gwtplatform.crawler.server.spring;
+package com.gwtplatform.crawler.server.spring.service;
 
 import java.io.IOException;
 import java.util.logging.Logger;
@@ -23,6 +23,7 @@
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
+import com.gwtplatform.crawler.server.spring.filter.AbstractCrawlFilterModule;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Component;
@@ -35,18 +36,20 @@
 import com.gwtplatform.crawler.server.CrawledPage;
 
 /**
- * Spring Crawl Service Servlet.
+ * Spring Crawl Service Servlet.
* Required bean dependencies are: *
    *
  • webClient ({@link WebClient}): HTML Unit virtual web client.
  • *
  • crawlCacheService ({@link CrawlCacheService}): Crawled page cache service.
  • *
  • crawlKey (String): Unique key for the crawler service.
  • - *
  • logger (Logger): Logger for the crawl filter.
  • + *
  • crawlLogger (Logger): Logger for the crawl filter.
  • *
  • timeoutMillis (long:5000): The HTML Unit Timeout in milliseconds.
  • *
  • cachedPageTimeoutSec (long:900): Cache timeout period before {@link CrawledPage}'s are invalidated.
  • *
    - * - * Register in web.xml like so: + * Extend the {@link AbstractCrawlServiceModule} in a class annotated with + * {@link org.springframework.context.annotation.Configuration}. + *
    
+ * Then register in web.xml like so: *
  * {@code
  *  <-- First ensure you have the ContextLoaderListener -->
@@ -90,8 +93,8 @@ protected CrawlServiceServlet(
             WebClient webClient,
             CrawlCacheService crawlCacheService,
             String crawlKey,
-            Logger logger) {
-        super(logger, crawlKey, crawlCacheService);
+            Logger crawlLogger) {
+        super(crawlLogger, crawlKey, crawlCacheService);
 
         this.webClient = webClient;
     }
diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java
index dffd936d9d..b3dc04f27c 100644
--- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java
+++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/guice/CrawlServiceModule.java
@@ -23,7 +23,7 @@
 import com.google.inject.Provides;
 import com.google.inject.servlet.ServletModule;
 import com.gwtplatform.crawler.server.CrawlCacheService;
-import com.gwtplatform.crawler.server.guice.CrawlServiceServlet;
+import com.gwtplatform.crawler.server.guice.service.CrawlServiceServlet;
 import com.gwtplatform.crawlerservice.server.OfyCrawlCacheService;
 
 public class CrawlServiceModule extends ServletModule {

From 31260a5f1ef039663e8ff1337c3b69d97c42048a Mon Sep 17 00:00:00 2001
From: BenDol 
Date: Thu, 8 Oct 2015 01:00:00 +1300
Subject: [PATCH 09/11] Fix license dates and some javadoc.

---
 .../crawler/server/guice/filter/GuiceCrawlFilter.java       | 3 ++-
 .../crawler/server/guice/service/CrawlServiceServlet.java   | 3 ++-
 .../crawler/server/spring/AbstractCrawlerModule.java        | 3 ++-
 .../server/spring/filter/AbstractCrawlFilterModule.java     | 3 ++-
 .../crawler/server/spring/filter/SpringCrawlFilter.java     | 2 +-
 .../server/spring/service/AbstractCrawlServiceModule.java   | 3 ++-
 .../crawler/server/spring/service/CrawlServiceServlet.java  | 4 ++--
 .../com/gwtplatform/crawler/server/CrawlCacheService.java   | 3 ++-
 .../java/com/gwtplatform/crawler/server/CrawledPage.java    | 4 +++-
 .../crawler/server/DefaultCrawlCacheService.java            | 6 +++++-
 .../com/gwtplatform/crawler/server/DefaultCrawledPage.java  | 3 ++-
 .../crawlerservice/server/OfyCrawlCacheService.java         | 3 ++-
 12 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java
index 088b448ef5..05621c3ec0 100644
--- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java
+++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/filter/GuiceCrawlFilter.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -26,6 +26,7 @@
 
 /**
  * Guice implementation for the {@link CrawlFilter}.
+ * @author Ben Dol
  */
 @Singleton
 public final class GuiceCrawlFilter extends CrawlFilter {
diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java
index ec6f33eeec..c5b157ff3d 100644
--- a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java
+++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/guice/service/CrawlServiceServlet.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -29,6 +29,7 @@
 
 /**
  * Guice Crawl Service Servlet.
+ * @author Ben Dol
  */
 @Singleton
 public class CrawlServiceServlet extends AbstractCrawlServiceServlet {
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java
index 575c3847ff..9317e1ef31 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/AbstractCrawlerModule.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -22,6 +22,7 @@
 
 /**
  * Abstract crawler module for {@link @Configuration} setup.
+ * @author Ben Dol
  */
 public abstract class AbstractCrawlerModule {
     @Bean
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java
index adac264efe..d0dc8053f8 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/AbstractCrawlFilterModule.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -23,6 +23,7 @@
 
 /**
  * Abstract crawl filter module for {@link @Configuration} setup.
+ * @author Ben Dol
  */
 @ComponentScan(basePackages = {
         "com.gwtplatform.crawler.server.spring.filter"
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java
index 1c87e570a6..e36420aabf 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java
index 9f96c0d122..1f36e7572b 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -26,6 +26,7 @@
 
 /**
  * Abstract crawl service module for {@link @Configuration} setup.
+ * @author Ben Dol
  */
 @ComponentScan(basePackages = {
         "com.gwtplatform.crawler.server.spring.service"
diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
index 700e4d27ef..fcc75db812 100644
--- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
+++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/CrawlServiceServlet.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 ArcBees Inc.
+ * Copyright 2015 ArcBees Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
@@ -23,7 +23,6 @@
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
-import com.gwtplatform.crawler.server.spring.filter.AbstractCrawlFilterModule;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Component;
@@ -76,6 +75,7 @@
  *   servletContext.addServlet("crawlServiceServlet", new HttpRequestHandlerServlet()).addMapping("/*");
  * 
* + * @author Ben Dol */ @Component public class CrawlServiceServlet extends AbstractCrawlServiceServlet implements HttpRequestHandler { diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java index 87dcb077a7..6552c9442b 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawlCacheService.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 ArcBees Inc. + * Copyright 2015 ArcBees Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -18,6 +18,7 @@ /** * Crawl cache service interface. + * @author Ben Dol */ public interface CrawlCacheService { diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java index 52f8d755fe..36c50913cd 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/CrawledPage.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 ArcBees Inc. + * Copyright 2015 ArcBees Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -20,6 +20,8 @@ /** * Crawled page interface. 
+ * + * @author Ben Dol */ public interface CrawledPage { void setUrl(String url); diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java index 39c8ae280f..2bab482cd1 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawlCacheService.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 ArcBees Inc. + * Copyright 2015 ArcBees Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -16,6 +16,10 @@ package com.gwtplatform.crawler.server; +/** + * Default crawl cache service implementation. + * @author Ben Dol + */ public class DefaultCrawlCacheService implements CrawlCacheService { @Override public DefaultCrawledPage createCrawledPage() { diff --git a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java index 69a5c72d2e..9eced45418 100644 --- a/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java +++ b/gwtp-core/gwtp-crawler/src/main/java/com/gwtplatform/crawler/server/DefaultCrawledPage.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 ArcBees Inc. + * Copyright 2015 ArcBees Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -20,6 +20,7 @@ /** * Default crawled page implementation. 
+ * @author Ben Dol */ public class DefaultCrawledPage implements CrawledPage { private String url; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java index e654d28f7c..e0c719947e 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/OfyCrawlCacheService.java @@ -1,5 +1,5 @@ /* - * Copyright 2011 ArcBees Inc. + * Copyright 2015 ArcBees Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -26,6 +26,7 @@ /** * Objectify DAO Crawl Cache Service. + * @author Ben Dol */ @Singleton public class OfyCrawlCacheService implements CrawlCacheService { From 28a74a82a88e14c008e0a7361f7bb93f62634708 Mon Sep 17 00:00:00 2001 From: BenDol Date: Thu, 8 Oct 2015 01:27:21 +1300 Subject: [PATCH 10/11] AbstractCrawlServiceModule should extend AbstractCrawlerModule. 
--- .../server/spring/service/AbstractCrawlServiceModule.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java index 1f36e7572b..77be5ab0e7 100644 --- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/service/AbstractCrawlServiceModule.java @@ -23,6 +23,7 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.gwtplatform.crawler.server.CrawlCacheService; import com.gwtplatform.crawler.server.DefaultCrawlCacheService; +import com.gwtplatform.crawler.server.spring.AbstractCrawlerModule; /** * Abstract crawl service module for {@link @Configuration} setup. @@ -31,7 +32,7 @@ @ComponentScan(basePackages = { "com.gwtplatform.crawler.server.spring.service" }) -public abstract class AbstractCrawlServiceModule { +public abstract class AbstractCrawlServiceModule extends AbstractCrawlerModule { @Bean protected WebClient webClient() { return new WebClient(BrowserVersion.CHROME); From a7bcce9715e5a5c1b39872868f622b40467c839e Mon Sep 17 00:00:00 2001 From: BenDol Date: Sun, 5 Jun 2016 17:10:11 +1200 Subject: [PATCH 11/11] Add previous ServiceKey class with deprecation. 
--- .../crawler/server/ServiceKey.java | 42 +++++++++++++++++++ .../spring/filter/SpringCrawlFilter.java | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java diff --git a/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java new file mode 100644 index 0000000000..17cd0eec46 --- /dev/null +++ b/gwtp-core/gwtp-crawler-guice/src/main/java/com/gwtplatform/crawler/server/ServiceKey.java @@ -0,0 +1,42 @@ +/* + * Copyright 2011 ArcBees Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.gwtplatform.crawler.server; + +import com.google.inject.BindingAnnotation; +import com.gwtplatform.crawler.server.guice.service.CrawlServiceServlet; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +/** + * Use this annotation to bind the key that should be used when invoking + * {@link CrawlServiceServlet}. For example: + *
bindConstant().annotatedWith(ServiceKey.class).to("123456");
+ * 
+ * @deprecated Please use {@link com.gwtplatform.crawler.server.guice.ServiceKey} instead. + */ +@BindingAnnotation +@Target({FIELD, PARAMETER, METHOD}) +@Retention(RUNTIME) +@Deprecated +public @interface ServiceKey { +} diff --git a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java index e36420aabf..c07a79527b 100644 --- a/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java +++ b/gwtp-core/gwtp-crawler-spring/src/main/java/com/gwtplatform/crawler/server/spring/filter/SpringCrawlFilter.java @@ -38,7 +38,7 @@ * Then register inside web.xml like so: *
  * {@code
- * <filter>
+ *     <filter>
  *          <filter-name>crawlFilter</filter-name>
  *          <filter-class>org.springframework.web.filter.DelegatingFilterProxy</filter-class>
  *