From a397ead5e6dbb8a394fb03118862b100f98ada8a Mon Sep 17 00:00:00 2001 From: Christopher Viel Date: Tue, 1 Oct 2013 15:01:55 -0400 Subject: [PATCH] Fixed various issues in Crawler Service --- .../server/CrawlServiceServlet.java | 120 ++++++++++-------- .../server/domain/CachedPage.java | 7 +- .../server/domain/DatastoreObject.java | 4 +- .../server/service/ObjectifyDao.java | 4 +- 4 files changed, 78 insertions(+), 57 deletions(-) diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java index d2aa6aa2da..a97e739bca 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/CrawlServiceServlet.java @@ -36,14 +36,14 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; +import com.google.common.base.Strings; import com.google.inject.Inject; import com.googlecode.objectify.Key; import com.gwtplatform.crawlerservice.server.domain.CachedPage; import com.gwtplatform.crawlerservice.server.service.CachedPageDao; /** - * Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns - * the HTML page. + * Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns the HTML page. */ @Singleton public class CrawlServiceServlet extends HttpServlet { @@ -94,34 +94,22 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) protected void doGet(HttpServletRequest req, HttpServletResponse resp) { PrintWriter out = null; try { - resp.setCharacterEncoding(CHAR_ENCODING); - resp.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING); + boolean keyValid = validateKey(req, resp); - out = resp.getWriter(); + if (keyValid) { + out = resp.getWriter(); - String receivedKey = URLDecoder.decode(req.getParameter("key"), CHAR_ENCODING); - if (!key.equals(receivedKey)) { - out.println("

The service key received does not match the desired key.

"); - } else { - String url = URLDecoder.decode(req.getParameter("url"), CHAR_ENCODING); - - List> keys = cachedPageDao.listKeysByProperty("url", url); - Map, CachedPage> deprecatedPages = cachedPageDao.get(keys); - - Date currDate = new Date(); - - CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate); - cachedPageDao.deleteKeys(deprecatedPages.keySet()); + String url = Strings.nullToEmpty(req.getParameter("url")); + url = URLDecoder.decode(url, CHAR_ENCODING); - if (needToFetchPage(matchingPage, currDate, out)) { - CachedPage cachedPage = createPlaceholderPage(url, currDate); - StringBuilder renderedHtml = renderPage(url); - storeFetchedPage(cachedPage, renderedHtml); - out.println(renderedHtml.toString()); + if (!url.isEmpty()) { + renderResponse(url, resp); } } } catch (IOException e) { e.printStackTrace(); + + resp.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); } finally { if (out != null) { out.close(); @@ -129,17 +117,62 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) { } } - private void storeFetchedPage(CachedPage cachedPage, - StringBuilder stringBuilder) { - cachedPage.setContent(stringBuilder.toString()); + private boolean validateKey(HttpServletRequest request, HttpServletResponse response) + throws IOException { + PrintWriter output = response.getWriter(); + String receivedKey = request.getParameter("key"); + boolean keyIsValid = false; + + if (Strings.isNullOrEmpty(receivedKey)) { + output.println("No service key attached to the request."); + } else { + String decodedKey = URLDecoder.decode(receivedKey, CHAR_ENCODING); + + if (!key.equals(decodedKey)) { + output.println("The service key received does not match the desired key."); + } else { + keyIsValid = true; + } + } + + if (!keyIsValid) { + response.setStatus(HttpServletResponse.SC_FORBIDDEN); + } + + return keyIsValid; + } + + private void renderResponse(String url, HttpServletResponse response) throws IOException { + PrintWriter out = response.getWriter(); + + response.setCharacterEncoding(CHAR_ENCODING); + response.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING); + + List> keys = cachedPageDao.listKeysByProperty("url", url); + Map, CachedPage> deprecatedPages = cachedPageDao.get(keys); + + Date currDate = new Date(); + + CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate); + cachedPageDao.deleteKeys(deprecatedPages.keySet()); + + if (needToFetchPage(matchingPage, currDate, out)) { + CachedPage cachedPage = createPlaceholderPage(url, currDate); + String renderedHtml = renderPage(url); + storeFetchedPage(cachedPage, renderedHtml); + out.println(renderedHtml); + } + } + + private void storeFetchedPage(CachedPage cachedPage, String stringBuilder) { + cachedPage.setContent(stringBuilder); cachedPage.setFetchInProgress(false); cachedPageDao.put(cachedPage); } /** - * Checks if the page {@link matchingPage} needs to be fetched. If it does not need to be fetched, - * but a fetch is already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the - * specified {@link PrintWriter}. + * Checks if the page {@link matchingPage} needs to be fetched. If it does not need to be fetched, but a fetch is + * already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the specified {@link PrintWriter}. * * @param matchingPage The matching page, can be {@code null} if no page matches. * @param currDate The current date. @@ -183,15 +216,13 @@ private CachedPage createPlaceholderPage(String url, Date currDate) { } /** - * Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered - * page is prefixed with a message indicating this is a non-interactive version. + * Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered page is prefixed + * with a message indicating this is a non-interactive version. * * @param url The URL of the page to render. * @return The rendered page, in a {@link StringBuilder}. - * @throws IOException - * @throws MalformedURLException */ - private StringBuilder renderPage(String url) throws IOException { + private String renderPage(String url) throws IOException { WebClient webClient = webClientProvider.get(); webClient.getCache().clear(); @@ -227,23 +258,14 @@ private StringBuilder renderPage(String url) throws IOException { } } - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append("
\n"); - stringBuilder.append("

You are viewing a non-interactive page that is intended for the crawler. "); - stringBuilder.append("You probably want to see this page: " + url + - "

\n"); - stringBuilder.append("
\n"); - - stringBuilder.append(page.asXml()); webClient.closeAllWindows(); - return stringBuilder; + return page.asXml(); } /** - * Checks if there is a page from {@code deprecatedPages} that is not expired. If there is - * more than one, choose the most recent. If one is found it is removed from the - * {@code deprecatedPages} list. + * Checks if there is a page from {@code deprecatedPages} that is not expired. If there is more than one, choose the + * most recent. If one is found it is removed from the {@code deprecatedPages} list. * * @param deprecatedPages The list of pages that match the URL but that are expected to be. * @param currDate The current date, to check for expiration. @@ -254,8 +276,7 @@ private CachedPage extractMatchingPage(Map, CachedPage> deprecat // Keep the matching page only if it has not expired if (matchingPage == null || - currDate.getTime() > - matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) { + currDate.getTime() > matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) { matchingPage = null; } else { deprecatedPages.remove(Key.create(CachedPage.class, matchingPage.getId())); @@ -267,8 +288,7 @@ private CachedPage extractMatchingPage(Map, CachedPage> deprecat private CachedPage findMostRecentPage(Map, CachedPage> pages) { CachedPage result = null; for (CachedPage page : pages.values()) { - if (result == null || - page.getFetchDate().after(result.getFetchDate())) { + if (result == null || page.getFetchDate().after(result.getFetchDate())) { result = page; } } diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java index 8e33cc053d..8b78d39de3 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/CachedPage.java @@ -18,12 +18,15 @@ import java.util.Date; +import com.googlecode.objectify.annotation.Entity; +import com.googlecode.objectify.annotation.Index; + /** * Stores a cached version of a page. - * - * @author Philippe Beaudoin */ +@Entity public class CachedPage extends DatastoreObject { + @Index private String url; private Date fetchDate; private boolean fetchInProgress; diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/DatastoreObject.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/DatastoreObject.java index 81996f5da0..aa6910bfb5 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/DatastoreObject.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/domain/DatastoreObject.java @@ -16,12 +16,10 @@ package com.gwtplatform.crawlerservice.server.domain; -import javax.persistence.Id; +import com.googlecode.objectify.annotation.Id; /** * The base class of any object that can be stored in the datastore. - * - * @author Philippe Beaudoin */ public class DatastoreObject { @Id diff --git a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/service/ObjectifyDao.java b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/service/ObjectifyDao.java index ad98b2ab93..167d9ee135 100644 --- a/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/service/ObjectifyDao.java +++ b/gwtp-crawler-service/src/main/java/com/gwtplatform/crawlerservice/server/service/ObjectifyDao.java @@ -22,9 +22,9 @@ import com.googlecode.objectify.Key; import com.googlecode.objectify.Objectify; -import com.googlecode.objectify.ObjectifyService; import com.googlecode.objectify.cmd.LoadType; import com.googlecode.objectify.cmd.Query; +import com.gwtplatform.crawlerservice.server.objectify.OfyService; /** * Generic DAO for use with Objectify. @@ -116,7 +116,7 @@ public List> listChildKeys(Object parent) { protected Objectify ofy() { if (lazyOfy == null) { - lazyOfy = ObjectifyService.ofy( ); + lazyOfy = OfyService.ofy(); } return lazyOfy; }