# Creepy

> Crawl AJAX/SPA websites and create snapshots for SEO.

## Getting Started

This plugin requires [PhantomJS](http://phantomjs.org/).

This plugin uses the [HTML5 History API](https://developer.mozilla.org/en-US/docs/Web/API/History) `pushState` method to do all navigation. This avoids reloading the page for each route, but it requires that the application support routing via the `popstate` event.

### Options

```js
var Creepy = require('./creepy');
var creepy = new Creepy(...);
```

#### rootUrl
Type: `string`

Specifies the first page from which to start the crawl.

#### initialDelay
Type: `number`
Default: `5000`

Specifies the time (in milliseconds) to wait after loading the initial page before beginning the crawl. This lets you tailor the crawler to your site and make sure everything has loaded before the crawler starts changing URLs.

#### delay
Type: `number`
Default: `5000`

Specifies the time (in milliseconds) to wait for each new route to load before the DOM is captured.

#### ignore
Type: `Array`

Specifies paths to skip while crawling. Entries can be literal paths or regular-expression strings.

#### sort
Type: `(urls: Array) => Array`

Allows you to sort and/or filter the URLs as they are collected.

### Crawl

```js
var Creepy = require('./creepy');
var creepy = new Creepy(...);

creepy.crawl(success, failure, done);
```

The success callback is called after every page load. The page information passed to the callback includes:

```js
{
  // The page URL that was processed
  url: string;

  // The HTML captured for the URL
  content: string;
}
```

The failure callback is called whenever a page fails to load. It receives:

```js
{
  // The page URL that failed to process
  url: string;
  status: 'fail';
}
```

The done callback is called when all the URLs have finished loading.

### Usage Examples

```js
var fs = require('fs');
var Creepy = require('./creepy');

// Matches URLs that do NOT contain "/posts"
var notPosts = new RegExp('^((?!/posts).)*$');
var rootUrl = 'http://my-site.com';

var creepy = new Creepy({
  rootUrl: rootUrl,
  initialDelay: 3000,
  delay: 1000,
  ignore: [
    '/admin'
  ],
  // Process all /posts routes last
  sort: function(urls) {
    urls.sort(function(a, b) {
      var testA = notPosts.test(a),
          testB = notPosts.test(b);

      if (testA && !testB) {
        return -1;
      } else if (!testA && testB) {
        return 1;
      }

      return 0;
    });

    return urls;
  }
});

creepy.crawl(
  function(page) {
    // Map the URL to a flat file name,
    // e.g. http://my-site.com/posts/1 -> pages/posts_1.html
    var path = 'pages/' +
      decodeURI(page.url.replace(rootUrl + '/', '').replace(/\/|:/g, '_')) +
      '.html';

    try {
      fs.write(path, page.content, 'w'); // PhantomJS fs module
    } catch (e) {
      console.log('Could not write ' + path + ': ' + e);
    }

    console.log('Loaded ' + page.url + ' content length = ' + page.content.length);
  },
  function(page) {
    console.log('Could not load page. URL = ' + page.url + ' status = ' + page.status);
  },
  function() {
    console.log('Done');
    phantom.exit();
  }
);
```
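Note that the example above is a PhantomJS script, not a Node script: it uses PhantomJS's `fs` module for `fs.write` and calls `phantom.exit()` when the crawl is done. Run it with the PhantomJS binary, e.g. `phantomjs crawl.js` (where `crawl.js` is whatever you named the file).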
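Since `ignore` entries can be regular-expression strings as well as literal paths, a single pattern can skip a whole family of URLs. A minimal sketch, assuming Creepy tests each entry against every discovered URL (the patterns below are illustrations, not part of Creepy's API):

```js
var Creepy = require('./creepy');

var creepy = new Creepy({
  rootUrl: 'http://my-site.com',
  ignore: [
    '/admin',            // literal path
    '/posts/\\d+/edit',  // regular-expression string: skip all post edit pages
    '\\?'                // skip any URL containing a query string
  ]
});
```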
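Because `sort` receives the full array of collected URLs and returns the array that will actually be crawled, it can also be used purely as a filter. A minimal sketch, assuming you want to drop a hypothetical `/drafts` section and crawl the remaining routes in alphabetical order:

```js
var Creepy = require('./creepy');

var creepy = new Creepy({
  rootUrl: 'http://my-site.com',
  // Drop /drafts routes entirely, then crawl the remainder alphabetically
  sort: function(urls) {
    return urls
      .filter(function(url) {
        return url.indexOf('/drafts') === -1;
      })
      .sort();
  }
});
```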