From f6674cf828854e5310f1334f3a577087601dedbf Mon Sep 17 00:00:00 2001 From: Steven Shen Date: Mon, 28 Mar 2016 12:26:55 -0700 Subject: [PATCH 1/4] Check for "text/html" content-type before attempting to get the URI data and parse. --- index.js | 59 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/index.js b/index.js index a8a04af..2b1a500 100644 --- a/index.js +++ b/index.js @@ -301,32 +301,47 @@ MetaInspector.prototype.fetch = function(){ var _this = this; var totalChunks = 0; - var r = request({uri : this.url, gzip: true, maxRedirects: this.maxRedirects, timeout: this.timeout, strictSSL: this.strictSSL}, function(error, response, body){ - if(!error && response.statusCode === 200){ - _this.document = body; - _this.parsedDocument = cheerio.load(body); - _this.response = response; + getDocHead(); + + function getDocHead() { + request.head({uri : _this.url, gzip: true, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(err, res, data) { + if(!err && res.statusCode === 200 && res.headers['content-type'] && res.headers['content-type'].indexOf('text/html') >= 0){ + getDocument(); + } else { + _this.emit("error", 'invalid data type'); + } + }); - _this.initAllProperties(); + } - _this.emit("fetch"); - } - else{ - _this.emit("error", error); - } - }); + function getDocument() { + var r = request.get({uri : _this.url, gzip: true, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(error, response, body){ + if(!error && response.statusCode === 200){ + _this.document = body; + _this.parsedDocument = cheerio.load(body); + _this.response = response; - if(_this.options.limit){ - _this.__stoppedAtLimit = false; - r.on('data', function(chunk){ - totalChunks += chunk.length; - if(totalChunks > _this.options.limit){ - if(!_this.__stoppedAtLimit) { - _this.emit("limit"); - _this.__stoppedAtLimit = true; - } - r.abort(); + _this.initAllProperties(); + + _this.emit("fetch"); + } + else{ + _this.emit("error", error); } }); + + if(_this.options.limit) { + _this.__stoppedAtLimit = false; + r.on('data', function (chunk) { + totalChunks += chunk.length; + if (totalChunks > _this.options.limit) { + if (!_this.__stoppedAtLimit) { + _this.emit("limit"); + _this.__stoppedAtLimit = true; + } + r.abort(); + } + }); + } } }; From 3f60c0d6e1e42ef8daebf5e70cb2549248c6f8c5 Mon Sep 17 00:00:00 2001 From: Steven Shen Date: Tue, 3 May 2016 12:14:58 -0700 Subject: [PATCH 2/4] #0000: Use cookie jar when making HTTP request --- index.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/index.js b/index.js index 2b1a500..2700bfd 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ var util = require('util'), request = require('request'), + jar = request.jar(), events = require('events'), cheerio = require('cheerio'), URI = require('uri-js'); @@ -304,7 +305,7 @@ MetaInspector.prototype.fetch = function(){ getDocHead(); function getDocHead() { - request.head({uri : _this.url, gzip: true, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(err, res, data) { + request.head({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(err, res, data) { if(!err && res.statusCode === 200 && res.headers['content-type'] && res.headers['content-type'].indexOf('text/html') >= 0){ getDocument(); } else { @@ -315,7 +316,7 @@ MetaInspector.prototype.fetch = function(){ } function getDocument() { - var r = request.get({uri : _this.url, gzip: true, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(error, response, body){ + var r = request.get({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(error, response, body){ if(!error && response.statusCode === 200){ _this.document = body; _this.parsedDocument = cheerio.load(body); From 003f139d6efe7b4a69c6be0dfba2e673c749e24c Mon Sep 17 00:00:00 2001 From: Steven Shen Date: Tue, 3 May 2016 12:23:22 -0700 Subject: [PATCH 3/4] #0000: Make cookie jar specific to the individual request --- index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.js b/index.js index 2700bfd..5908636 100644 --- a/index.js +++ b/index.js @@ -1,6 +1,5 @@ var util = require('util'), request = require('request'), - jar = request.jar(), events = require('events'), cheerio = require('cheerio'), URI = require('uri-js'); @@ -301,6 +300,7 @@ MetaInspector.prototype.getAbsolutePath = function(href){ MetaInspector.prototype.fetch = function(){ var _this = this; var totalChunks = 0; + var jar = request.jar(); getDocHead(); From 06cad852f3f540cefdebd7fb48fb4146079f6643 Mon Sep 17 00:00:00 2001 From: Steven Shen Date: Wed, 4 May 2016 10:26:48 -0700 Subject: [PATCH 4/4] #0000: Support passing headers via options when instantiating the module --- index.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/index.js b/index.js index 5908636..8ff802a 100644 --- a/index.js +++ b/index.js @@ -36,6 +36,8 @@ var MetaInspector = function(url, options){ this.timeout = this.options.timeout || 20000; //Timeout in ms this.strictSSL = !!this.options.strictSSL; + + this.headers = this.options.headers; }; //MetaInspector.prototype = new events.EventEmitter(); @@ -305,7 +307,7 @@ MetaInspector.prototype.fetch = function(){ getDocHead(); function getDocHead() { - request.head({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(err, res, data) { + request.head({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL, headers: _this.headers}, function(err, res, data) { if(!err && res.statusCode === 200 && res.headers['content-type'] && res.headers['content-type'].indexOf('text/html') >= 0){ getDocument(); } else { @@ -316,7 +318,7 @@ MetaInspector.prototype.fetch = function(){ } function getDocument() { - var r = request.get({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL}, function(error, response, body){ + var r = request.get({uri : _this.url, gzip: true, jar: jar, maxRedirects: _this.maxRedirects, timeout: _this.timeout, strictSSL: _this.strictSSL, headers: _this.headers}, function(error, response, body){ if(!error && response.statusCode === 200){ _this.document = body; _this.parsedDocument = cheerio.load(body);