-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.js
92 lines (75 loc) · 2.42 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
const Apify = require('apify');
const { log, htmlToText } = Apify.utils;
/**
 * Function called for each page to extract data from it.
 *
 * Collects the <head> meta tags, the page title, the <html> lang attribute,
 * the text of nav/header/footer landmark elements, all heading captions,
 * and the remaining main text, then pushes one record to the default dataset.
 *
 * @param {Object} context - Crawling context supplied by CheerioCrawler.
 * @param {Object} context.request - Apify Request; only `url` is read here.
 * @param {string} context.html - Raw page HTML (unused, kept for signature compatibility).
 * @param {Function} context.$ - Cheerio handle over the parsed page.
 */
const handlePageFunction = async ({ request, html, $ }) => {
    const { url } = request;
    log.info('Processing page', { url });

    // Extract all meta elements from <head>, keyed by name/property/http-equiv.
    const meta = {};
    $('head meta').each(function () {
        let name = $(this).attr('name')
            || $(this).attr('property')
            || $(this).attr('http-equiv');
        let content = $(this).attr('content');
        const charset = $(this).attr('charset');
        // <meta charset="..."> carries no name/content pair; normalize it
        // into the same { name: content } shape as the other tags.
        if (!name && charset) {
            name = 'charset';
            content = charset;
        }
        if (name) {
            meta[name] = content ? content.trim() : null;
        }
    });

    const title = ($('head title').eq(0).text() || '').trim();
    // BUG FIX: .attr('lang') returns undefined when <html> has no lang
    // attribute, so the original .trim() call threw a TypeError and failed
    // the whole request. Fall back to '' (same pattern as `title` above).
    const language = ($('html').eq(0).attr('lang') || '').trim();

    const content = {
        navs: [],
        headers: [],
        footers: [],
        captions: [],
        mainText: null,
    };

    // Capture the text of each landmark element before it is removed below.
    ['nav', 'header', 'footer'].forEach((tag) => {
        $(tag).each(function () {
            // TODO: This is inefficient, htmlToText() should accept Cheerio element too
            content[`${tag}s`].push(htmlToText($(this).html()));
        });
    });

    // Note: 'h7' is not a standard HTML heading level; the selector is kept
    // as-is for backward compatibility with pages using it as a custom tag.
    $('h1, h2, h3, h4, h5, h6, h7').each(function () {
        content.captions.push({
            tag: $(this).prop('tagName').toLowerCase(),
            value: $(this).text().trim(),
        });
    });

    // Strip landmark elements so mainText contains only the page body proper.
    ['nav', 'header', 'footer'].forEach((tag) => {
        $(tag).remove();
    });
    content.mainText = htmlToText($.root().html());

    await Apify.pushData({
        url,
        title,
        language,
        meta,
        content,
    });
};
/**
 * Actor entry point: validates input, builds the request list from the
 * provided URLs, and runs a CheerioCrawler that delegates extraction to
 * handlePageFunction.
 */
Apify.main(async () => {
    // A `urls` array in the actor input is mandatory.
    const input = await Apify.getInput();
    if (!input || !input.urls) throw new Error('Invalid input!');

    const requestList = await Apify.openRequestList('METADATA_SCRAPER', input.urls);

    // Crawler configuration; retries default to 0 unless overridden in input.
    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handlePageTimeoutSecs: 60,
        minConcurrency: 1,
        maxConcurrency: 100,
        maxRequestRetries: input.maxRequestRetries || 0,
        handleFailedRequestFunction: async ({ error, request }) => {
            log.exception(error, 'Failed to load the page, giving up', { url: request.url });
        },
    };

    const cheerioCrawler = new Apify.CheerioCrawler(crawlerOptions);
    await cheerioCrawler.run();
    log.info('Done.');
});