Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed various issues in Crawler Service #326

Merged
merged 1 commit into from
Oct 21, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.googlecode.objectify.Key;
import com.gwtplatform.crawlerservice.server.domain.CachedPage;
import com.gwtplatform.crawlerservice.server.service.CachedPageDao;

/**
* Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns
* the HTML page.
* Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns the HTML page.
*/
@Singleton
public class CrawlServiceServlet extends HttpServlet {
Expand Down Expand Up @@ -94,52 +94,85 @@ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async)
protected void doGet(HttpServletRequest req, HttpServletResponse resp) {
PrintWriter out = null;
try {
resp.setCharacterEncoding(CHAR_ENCODING);
resp.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING);
boolean keyValid = validateKey(req, resp);

out = resp.getWriter();
if (keyValid) {
out = resp.getWriter();

String receivedKey = URLDecoder.decode(req.getParameter("key"), CHAR_ENCODING);
if (!key.equals(receivedKey)) {
out.println("<h3>The service key received does not match the desired key.</h3>");
} else {
String url = URLDecoder.decode(req.getParameter("url"), CHAR_ENCODING);

List<Key<CachedPage>> keys = cachedPageDao.listKeysByProperty("url", url);
Map<Key<CachedPage>, CachedPage> deprecatedPages = cachedPageDao.get(keys);

Date currDate = new Date();

CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate);
cachedPageDao.deleteKeys(deprecatedPages.keySet());
String url = Strings.nullToEmpty(req.getParameter("url"));
url = URLDecoder.decode(url, CHAR_ENCODING);

if (needToFetchPage(matchingPage, currDate, out)) {
CachedPage cachedPage = createPlaceholderPage(url, currDate);
StringBuilder renderedHtml = renderPage(url);
storeFetchedPage(cachedPage, renderedHtml);
out.println(renderedHtml.toString());
if (!url.isEmpty()) {
renderResponse(url, resp);
}
}
} catch (IOException e) {
e.printStackTrace();

resp.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
} finally {
if (out != null) {
out.close();
}
}
}

private void storeFetchedPage(CachedPage cachedPage,
StringBuilder stringBuilder) {
cachedPage.setContent(stringBuilder.toString());
/**
 * Verifies that the request carries the expected service key in its {@code key} parameter.
 * When the key is missing, malformed or different from the configured one, the response status
 * is set to 403 (Forbidden) and a short plain-text explanation is written to the response.
 *
 * @param request The incoming request; only its {@code key} parameter is read.
 * @param response The response that receives the status and the explanation on rejection.
 * @return {@code true} if the received key matches the configured key, {@code false} otherwise.
 * @throws IOException If the response writer cannot be obtained or {@code CHAR_ENCODING} is not
 * supported by {@link URLDecoder}.
 */
private boolean validateKey(HttpServletRequest request, HttpServletResponse response)
        throws IOException {
    PrintWriter output = response.getWriter();
    String receivedKey = request.getParameter("key");

    String rejection = null;
    if (Strings.isNullOrEmpty(receivedKey)) {
        rejection = "No service key attached to the request.";
    } else {
        String decodedKey;
        try {
            decodedKey = URLDecoder.decode(receivedKey, CHAR_ENCODING);
        } catch (IllegalArgumentException ignored) {
            // A malformed '%' escape cannot decode to the configured key; treat it as a mismatch
            // instead of letting the unchecked exception escape the servlet.
            decodedKey = null;
        }

        if (!key.equals(decodedKey)) {
            rejection = "The service key received does not match the desired key.";
        }
    }

    if (rejection == null) {
        return true;
    }

    // Set the status before writing the body so it cannot be lost if the response is committed.
    response.setStatus(HttpServletResponse.SC_FORBIDDEN);
    output.println(rejection);
    return false;
}

/**
 * Writes the rendered version of {@code url} to the response, going through the page cache.
 * All cached entries stored for the URL are loaded, the entry kept by
 * {@code extractMatchingPage} is set aside and the remaining ones are deleted. If
 * {@code needToFetchPage} reports that a fetch is required, the page is rendered, stored and
 * printed; otherwise any output is left to {@code needToFetchPage}, which receives the writer.
 *
 * @param url The URL of the page to render.
 * @param response The response the page is written to.
 * @throws IOException If the writer cannot be obtained or rendering the page fails.
 */
private void renderResponse(String url, HttpServletResponse response) throws IOException {
    // The encoding has to be configured before the writer is requested: the Servlet API ignores
    // setCharacterEncoding() once getWriter() has been called on the response.
    response.setCharacterEncoding(CHAR_ENCODING);
    response.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING);

    PrintWriter out = response.getWriter();

    List<Key<CachedPage>> keys = cachedPageDao.listKeysByProperty("url", url);
    Map<Key<CachedPage>, CachedPage> deprecatedPages = cachedPageDao.get(keys);

    Date currDate = new Date();

    // extractMatchingPage removes the entry it keeps from the map, so only the leftover
    // entries are deleted below.
    CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate);
    cachedPageDao.deleteKeys(deprecatedPages.keySet());

    if (needToFetchPage(matchingPage, currDate, out)) {
        CachedPage cachedPage = createPlaceholderPage(url, currDate);
        String renderedHtml = renderPage(url);
        storeFetchedPage(cachedPage, renderedHtml);
        out.println(renderedHtml);
    }
}

/**
 * Saves a freshly rendered page: sets the rendered content on {@code cachedPage}, clears its
 * fetch-in-progress flag and persists it through the DAO.
 *
 * @param cachedPage The cache entry to complete and store.
 * @param renderedHtml The rendered page content to store in the entry.
 */
private void storeFetchedPage(CachedPage cachedPage, String renderedHtml) {
    cachedPage.setContent(renderedHtml);
    cachedPage.setFetchInProgress(false);
    cachedPageDao.put(cachedPage);
}

/**
* Checks if the page {@link matchingPage} needs to be fetched. If it does not need to be fetched,
* but a fetch is already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the
* specified {@link PrintWriter}.
* Checks if the page {@code matchingPage} needs to be fetched. If it does not need to be fetched, but a fetch is
* already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the specified {@link PrintWriter}.
*
* @param matchingPage The matching page, can be {@code null} if no page matches.
* @param currDate The current date.
Expand Down Expand Up @@ -183,15 +216,13 @@ private CachedPage createPlaceholderPage(String url, Date currDate) {
}

/**
* Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered
* page is prefixed with a message indicating this is a non-interactive version.
* Fetches the page at {@code url} and renders it to a {@link String} containing the page as XML. The rendered page
* is returned as is, without any prefix message.
*
* @param url The URL of the page to render.
* @return The rendered page, as a {@link String}.
* @throws IOException
* @throws MalformedURLException
*/
private StringBuilder renderPage(String url) throws IOException {
private String renderPage(String url) throws IOException {
WebClient webClient = webClientProvider.get();

webClient.getCache().clear();
Expand Down Expand Up @@ -227,23 +258,14 @@ private StringBuilder renderPage(String url) throws IOException {
}
}

StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>You are viewing a non-interactive page that is intended for the crawler. ");
stringBuilder.append("You probably want to see this page: <a href=\"" + url + "\">" + url +
"</a></h3></center>\n");
stringBuilder.append("<hr />\n");

stringBuilder.append(page.asXml());
webClient.closeAllWindows();

return stringBuilder;
return page.asXml();
}

/**
* Checks if there is a page from {@code deprecatedPages} that is not expired. If there is
* more than one, choose the most recent. If one is found it is removed from the
* {@code deprecatedPages} list.
* Checks if there is a page from {@code deprecatedPages} that is not expired. If there is more than one, choose the
* most recent. If one is found it is removed from the {@code deprecatedPages} list.
*
* @param deprecatedPages The pages that match the URL and are expected to be expired.
* @param currDate The current date, to check for expiration.
Expand All @@ -254,8 +276,7 @@ private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecat

// Keep the matching page only if it has not expired
if (matchingPage == null ||
currDate.getTime() >
matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) {
currDate.getTime() > matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) {
matchingPage = null;
} else {
deprecatedPages.remove(Key.create(CachedPage.class, matchingPage.getId()));
Expand All @@ -267,8 +288,7 @@ private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecat
private CachedPage findMostRecentPage(Map<Key<CachedPage>, CachedPage> pages) {
CachedPage result = null;
for (CachedPage page : pages.values()) {
if (result == null ||
page.getFetchDate().after(result.getFetchDate())) {
if (result == null || page.getFetchDate().after(result.getFetchDate())) {
result = page;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@

import java.util.Date;

import com.googlecode.objectify.annotation.Entity;
import com.googlecode.objectify.annotation.Index;

/**
* Stores a cached version of a page.
*
* @author Philippe Beaudoin
*/
@Entity
public class CachedPage extends DatastoreObject {
@Index
private String url;
private Date fetchDate;
private boolean fetchInProgress;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@

package com.gwtplatform.crawlerservice.server.domain;

import javax.persistence.Id;
import com.googlecode.objectify.annotation.Id;

/**
* The base class of any object that can be stored in the datastore.
*
* @author Philippe Beaudoin
*/
public class DatastoreObject {
@Id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@

import com.googlecode.objectify.Key;
import com.googlecode.objectify.Objectify;
import com.googlecode.objectify.ObjectifyService;
import com.googlecode.objectify.cmd.LoadType;
import com.googlecode.objectify.cmd.Query;
import com.gwtplatform.crawlerservice.server.objectify.OfyService;

/**
* Generic DAO for use with Objectify.
Expand Down Expand Up @@ -116,7 +116,7 @@ public List<Key<T>> listChildKeys(Object parent) {

protected Objectify ofy() {
if (lazyOfy == null) {
lazyOfy = ObjectifyService.ofy( );
lazyOfy = OfyService.ofy();
}
return lazyOfy;
}
Expand Down