Skip to content

Commit

Permalink
Merge pull request ArcBees#326 from ArcBees/cv_crawler_service_update
Browse files Browse the repository at this point in the history
Fixed various issues in Crawler Service
  • Loading branch information
Christopher Viel committed Oct 21, 2013
2 parents d72665a + 49fba17 commit 8790831
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.googlecode.objectify.Key;
import com.gwtplatform.crawlerservice.server.domain.CachedPage;
import com.gwtplatform.crawlerservice.server.service.CachedPageDao;

/**
* Servlet that fetches an external page, renders it using HtmlUnit and returns the resulting HTML page.
*/
@Singleton
public class CrawlServiceServlet extends HttpServlet {
Expand Down Expand Up @@ -94,52 +94,85 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async)
protected void doGet(HttpServletRequest req, HttpServletResponse resp) {
PrintWriter out = null;
try {
resp.setCharacterEncoding(CHAR_ENCODING);
resp.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING);
boolean keyValid = validateKey(req, resp);

out = resp.getWriter();
if (keyValid) {
out = resp.getWriter();

String receivedKey = URLDecoder.decode(req.getParameter("key"), CHAR_ENCODING);
if (!key.equals(receivedKey)) {
out.println("<h3>The service key received does not match the desired key.</h3>");
} else {
String url = URLDecoder.decode(req.getParameter("url"), CHAR_ENCODING);

List<Key<CachedPage>> keys = cachedPageDao.listKeysByProperty("url", url);
Map<Key<CachedPage>, CachedPage> deprecatedPages = cachedPageDao.get(keys);

Date currDate = new Date();

CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate);
cachedPageDao.deleteKeys(deprecatedPages.keySet());
String url = Strings.nullToEmpty(req.getParameter("url"));
url = URLDecoder.decode(url, CHAR_ENCODING);

if (needToFetchPage(matchingPage, currDate, out)) {
CachedPage cachedPage = createPlaceholderPage(url, currDate);
StringBuilder renderedHtml = renderPage(url);
storeFetchedPage(cachedPage, renderedHtml);
out.println(renderedHtml.toString());
if (!url.isEmpty()) {
renderResponse(url, resp);
}
}
} catch (IOException e) {
e.printStackTrace();

resp.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
} finally {
if (out != null) {
out.close();
}
}
}

private void storeFetchedPage(CachedPage cachedPage,
StringBuilder stringBuilder) {
cachedPage.setContent(stringBuilder.toString());
/**
 * Validates the service key sent with the request.
 *
 * <p>When the {@code key} request parameter is missing, empty, or does not match the expected key, the response
 * status is set to {@code 403 Forbidden} and a one-line explanation is written to the response body. Nothing is
 * written, and the response writer is not requested, when the key is valid.
 *
 * @param request The incoming request; the service key is read from its {@code key} parameter.
 * @param response The response that receives the status code and the error message on failure.
 * @return {@code true} if the received key matches the expected key, {@code false} otherwise.
 * @throws IOException If the key cannot be decoded or the response writer cannot be obtained.
 */
private boolean validateKey(HttpServletRequest request, HttpServletResponse response)
        throws IOException {
    String receivedKey = request.getParameter("key");
    String errorMessage = null;

    if (Strings.isNullOrEmpty(receivedKey)) {
        errorMessage = "No service key attached to the request.";
    } else if (!key.equals(URLDecoder.decode(receivedKey, CHAR_ENCODING))) {
        errorMessage = "The service key received does not match the desired key.";
    }

    if (errorMessage == null) {
        return true;
    }

    // Set the status before writing the body: a status change is ignored once the response is committed.
    response.setStatus(HttpServletResponse.SC_FORBIDDEN);
    response.getWriter().println(errorMessage);
    return false;
}

/**
 * Writes the crawler-friendly version of {@code url} to the response.
 *
 * <p>Cached pages stored for {@code url} are loaded, the still-valid one (if any) is singled out by
 * {@code extractMatchingPage}, and every other cached entry for that URL is deleted. {@code needToFetchPage} is
 * handed the response writer and decides whether a fresh render is required; when it is, a placeholder entry is
 * created, the page is rendered, the result is stored and then printed to the response.
 *
 * @param url The decoded URL of the page to render.
 * @param response The response the page content is written to.
 * @throws IOException If the response writer cannot be obtained or rendering the page fails.
 */
private void renderResponse(String url, HttpServletResponse response) throws IOException {
    // The encoding has to be configured before the writer is requested; the servlet API ignores
    // setCharacterEncoding once getWriter has been called.
    response.setCharacterEncoding(CHAR_ENCODING);
    response.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING);

    PrintWriter out = response.getWriter();

    List<Key<CachedPage>> keys = cachedPageDao.listKeysByProperty("url", url);
    Map<Key<CachedPage>, CachedPage> deprecatedPages = cachedPageDao.get(keys);

    Date currDate = new Date();

    // extractMatchingPage takes the retained page out of the map, so only stale entries get deleted below.
    CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate);
    cachedPageDao.deleteKeys(deprecatedPages.keySet());

    if (needToFetchPage(matchingPage, currDate, out)) {
        CachedPage cachedPage = createPlaceholderPage(url, currDate);
        String renderedHtml = renderPage(url);
        storeFetchedPage(cachedPage, renderedHtml);
        out.println(renderedHtml);
    }
}

/**
* Stores freshly rendered content in {@code cachedPage}, clears its fetch-in-progress flag and saves the entry
* through {@code cachedPageDao}.
*
* @param cachedPage The cache entry to fill in and persist.
* @param stringBuilder The rendered page content; despite its name this is a plain {@link String}.
*/
private void storeFetchedPage(CachedPage cachedPage, String stringBuilder) {
cachedPage.setContent(stringBuilder);
cachedPage.setFetchInProgress(false);
cachedPageDao.put(cachedPage);
}

/**
* Checks if the page {@code matchingPage} needs to be fetched. If it does not need to be fetched, but a fetch is
* already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the specified {@link PrintWriter}.
*
* @param matchingPage The matching page, can be {@code null} if no page matches.
* @param currDate The current date.
Expand Down Expand Up @@ -183,15 +216,13 @@ private CachedPage createPlaceholderPage(String url, Date currDate) {
}

/**
* Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered
* page is prefixed with a message indicating this is a non-interactive version.
* Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered page is prefixed
* with a message indicating this is a non-interactive version.
*
* @param url The URL of the page to render.
* @return The rendered page, in a {@link StringBuilder}.
* @throws IOException
* @throws MalformedURLException
*/
private StringBuilder renderPage(String url) throws IOException {
private String renderPage(String url) throws IOException {
WebClient webClient = webClientProvider.get();

webClient.getCache().clear();
Expand Down Expand Up @@ -227,23 +258,14 @@ private StringBuilder renderPage(String url) throws IOException {
}
}

StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>You are viewing a non-interactive page that is intended for the crawler. ");
stringBuilder.append("You probably want to see this page: <a href=\"" + url + "\">" + url +
"</a></h3></center>\n");
stringBuilder.append("<hr />\n");

stringBuilder.append(page.asXml());
webClient.closeAllWindows();

return stringBuilder;
return page.asXml();
}

/**
* Checks if there is a page from {@code deprecatedPages} that is not expired. If there is more than one, choose the
* most recent. If one is found it is removed from the {@code deprecatedPages} map.
*
* @param deprecatedPages The pages that match the URL and are expected to be deprecated.
* @param currDate The current date, to check for expiration.
Expand All @@ -254,8 +276,7 @@ private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecat

// Keep the matching page only if it has not expired
if (matchingPage == null ||
currDate.getTime() >
matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) {
currDate.getTime() > matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) {
matchingPage = null;
} else {
deprecatedPages.remove(Key.create(CachedPage.class, matchingPage.getId()));
Expand All @@ -267,8 +288,7 @@ private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecat
private CachedPage findMostRecentPage(Map<Key<CachedPage>, CachedPage> pages) {
CachedPage result = null;
for (CachedPage page : pages.values()) {
if (result == null ||
page.getFetchDate().after(result.getFetchDate())) {
if (result == null || page.getFetchDate().after(result.getFetchDate())) {
result = page;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@

import java.util.Date;

import com.googlecode.objectify.annotation.Entity;
import com.googlecode.objectify.annotation.Index;

/**
* Stores a cached version of a page.
*
* @author Philippe Beaudoin
*/
@Entity
public class CachedPage extends DatastoreObject {
@Index
private String url;
private Date fetchDate;
private boolean fetchInProgress;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@

package com.gwtplatform.crawlerservice.server.domain;

import javax.persistence.Id;
import com.googlecode.objectify.annotation.Id;

/**
* The base class of any object that can be stored in the datastore.
*
* @author Philippe Beaudoin
*/
public class DatastoreObject {
@Id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@

import com.googlecode.objectify.Key;
import com.googlecode.objectify.Objectify;
import com.googlecode.objectify.ObjectifyService;
import com.googlecode.objectify.cmd.LoadType;
import com.googlecode.objectify.cmd.Query;
import com.gwtplatform.crawlerservice.server.objectify.OfyService;

/**
* Generic DAO for use with Objectify.
Expand Down Expand Up @@ -116,7 +116,7 @@ public List<Key<T>> listChildKeys(Object parent) {

/**
 * Returns the {@link Objectify} instance used by this DAO. The instance is requested from {@code OfyService} the
 * first time this method is called and reused on later calls.
 *
 * @return The cached {@link Objectify} instance.
 */
protected Objectify ofy() {
    if (lazyOfy == null) {
        lazyOfy = OfyService.ofy();
    }
    return lazyOfy;
}
Expand Down

0 comments on commit 8790831

Please sign in to comment.