-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
98 lines (79 loc) · 2.41 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Third-party modules: `request` fetches pages over HTTP and `cheerio`
// parses the returned HTML so it can be queried with selectors.
var request = require("request");
var cheerio = require("cheerio");

// Search results page whose result links are followed.
var url = "https://www.google.com/search?q=data+mining";

// Running tally shared by all downloads: word -> number of occurrences.
var corpus = {};

// Bookkeeping used to detect when the last outstanding download is done:
// how many result links were queued, and how many have reported back.
var totalResults = 0;
var resultsDownloaded = 0;
/**
 * Completion hook: call once for every result page that has been handled.
 *
 * Increments the module-level `resultsDownloaded` counter. While it differs
 * from `totalResults` the function returns immediately; on the call that
 * makes the two equal it converts `corpus` (word -> count) into a list,
 * sorts it by descending count and logs the twenty most frequent words.
 *
 * @returns {undefined} Nothing; the result is written with console.log.
 */
function callback() {
  resultsDownloaded++;

  // Other downloads are still outstanding; only the final one reports.
  if (resultsDownloaded !== totalResults) {
    return;
  }

  // Turn the tally into an array of { word, count } records.
  // Object.keys replaces `for (prop in corpus)`, which assigned to an
  // undeclared `prop` (implicit global; ReferenceError in strict mode).
  var words = Object.keys(corpus).map(function (word) {
    return {
      word: word,
      count: corpus[word]
    };
  });

  // Most frequent words first.
  words.sort(function (a, b) {
    return b.count - a.count;
  });

  // Finally, log the twenty most popular words.
  console.log(words.slice(0, 20));
}
// Fetch the search results page, follow every external result link, and
// add the words of each downloaded page to the shared `corpus` tally.
// `callback()` is invoked exactly once per queued link, whether the
// download succeeded or failed, so the final report is always produced.
request(url, function (error, response, body) {
  if (error) {
    console.log("Couldn't get page because of error: " + error);
    return;
  }

  // load the body of the page into Cheerio so we can traverse the DOM
  var $ = cheerio.load(body);
  var links = $(".r a");
  var hasOwn = Object.prototype.hasOwnProperty;

  links.each(function (i, link) {
    // get the href attribute of each link
    var href = $(link).attr("href");

    // an anchor without an href cannot be followed; calling .replace on
    // the missing value would throw
    if (!href) {
      return;
    }

    // strip the "/url?q=" redirect prefix and everything from the first "&"
    var pageUrl = href.replace("/url?q=", "").split("&")[0];

    // whatever still starts with "/" is a site-relative link, not a result
    if (pageUrl.charAt(0) === "/") {
      return;
    }

    // this link counts as a result, so increment results
    totalResults++;

    // download that page
    request(pageUrl, function (pageError, pageResponse, pageBody) {
      if (pageError) {
        console.log("Couldn't get page because of error: " + pageError);
        // A failed download must still be counted; otherwise
        // resultsDownloaded never reaches totalResults and the final
        // report is never printed.
        callback();
        return;
      }

      // load the page into cheerio
      var $page = cheerio.load(pageBody);

      // collapse whitespace, drop everything that is not a letter or a
      // space, and lowercase so counting is case-insensitive
      var text = $page("body").text()
        .replace(/\s+/g, " ")
        .replace(/[^a-zA-Z ]/g, "")
        .toLowerCase();

      // split on spaces for a list of all the words on that page
      text.split(" ").forEach(function (word) {
        // we don't want to include very short or long words, as they're
        // probably bad data
        if (word.length < 4 || word.length > 20) {
          return;
        }

        // Own-property check: a plain truthiness test on corpus[word]
        // also sees members inherited from Object.prototype, so the word
        // "constructor" would be incremented into NaN.
        if (hasOwn.call(corpus, word)) {
          corpus[word]++;
        } else {
          corpus[word] = 1;
        }
      });

      // and when our request is completed, call the callback to wrap up!
      callback();
    });
  });
});