-
Notifications
You must be signed in to change notification settings - Fork 139
/
Copy pathscraper.js
114 lines (105 loc) · 3.83 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/**
* scraper - a set of utility methods to make scraping HTML easier
*/
"use strict";
var when = require("when");
var http = require("http");
var htmlparser = require("htmlparser");
var tool = require("./tool");
var log = tool.log();
/**
* Converts the provided HTML text into a dom.
*
* @param {string} text
* @returns {Object} object representing the dom
*/
exports.parseHTML = function(text) {
var handler = new htmlparser.DefaultHandler(null, {verbose: false, ignoreWhitespace: true});
new htmlparser.Parser(handler).parseComplete(text);
return handler.dom;
}; var parseHTML = exports.parseHTML;
/**
* Returns all <table> tags contained in the provided dom as elements in an array.
*
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
* @returns {Array} an array of all tables and their associated sub trees.
*/
exports.tablesOf = function(dom) {
return htmlparser.DomUtils.getElements({tag_type: "tag", tag_name: "table"}, dom);
}
/**
* Returns all text nodes contained in the provided dom as elements in an array.
*
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
* @returns {Array} a flattened array of all text nodes.
*/
exports.textsOf = function(dom) {
return htmlparser.DomUtils.getElements({tag_type: "text"}, dom);
}; var textsOf = exports.textsOf;
/**
* Returns all <tr> tags contained in the provided dom, presumably a tree rooted with a table node.
*
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
* @returns {Array} an array of all rows and their associated sub trees.
*/
exports.rowsOf = function(dom) {
return htmlparser.DomUtils.getElements({tag_type: "tag", tag_name: "tr"}, dom);
}; var rowsOf = exports.rowsOf;
/**
* Given an html table comprised of rows having the <tr> tag, return a two-dimensional array of all cell values.
*
* @param {Object} table a parse tree obtained from calling the parseHTML function.
* @returns {Array} an array of rows, each row being an array of trimmed text values.
*/
exports.extract = function(table) {
return rowsOf(table).map(function(row) {
return textsOf(row).map(function(text) {
return text.data.trim();
});
});
}
/**
* Returns the match results of all text nodes in the provided dom, satisfying the specified regex, as elements
* in an array.
*
* @param regex a regular expression.
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
* @returns {Array} an array of regex match results.
*/
exports.matchText = function(regex, dom) {
var results = [];
function matchForRegex(data) {
var match = data.match(regex);
return match ? results.push(match) : false;
}
htmlparser.DomUtils.getElements({tag_type: "text", tag_contains: matchForRegex}, dom);
return results;
}
/**
* Performs an http GET and parses the HTML into a dom. The result is a promise for the dom.
*
* @param options same as those taken by the http.request method.
* @param [converter] a callback that takes a buffer and converts it to another format.
* @returns {promise} a promise for the parsed dom of the specified url
*/
exports.fetch = function(options, converter) {
converter = converter || function nop(buffer) { return buffer; };
var d = when.defer();
log.info("get: " + options);
http.get(options, function(response) {
var chunks = [];
response.on("data", function(chunk) {
chunks.push(chunk);
});
response.on("end", function() {
log.info("got: " + options);
var converted = converter(Buffer.concat(chunks));
var parsed = parseHTML(converted);
log.info("done: " + options);
d.resolve(parsed);
});
}).on("error", function(error) {
d.reject(error);
});
return d.promise;
}