-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
82 lines (69 loc) · 2.27 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import { insert, getData } from './firebase.js';
import puppeteer from 'puppeteer';
import schedule from 'node-schedule';
/**
 * Collects the article cards found under one listing section of the page
 * that `page` is currently showing.
 *
 * Every field is read from inside its own `<article>` element, so an article
 * that lacks an image, category or date can no longer shift the values of
 * the articles that follow it (which happened when each field was queried
 * as a separate page-wide list and the lists were zipped by index).
 *
 * @param {object} page - Puppeteer page (only `$$eval` is used).
 * @param {string} sectionName - CSS selector of the section wrapping the
 *   `<article>` elements, e.g. `'.loop-grid'`.
 * @returns {Promise<Array<{title: string, url: (string|null),
 *   category: string[], date: (string|null), link: (string|null)}>>}
 *   One record per article that has a `.title` element. `url` is the `src`
 *   of the article's first `<img>`; missing attributes/elements yield `null`
 *   (`[]` for `category`).
 */
const getArticles = async (page, sectionName) => {
  // The callback is serialized and executed inside the browser, so it must
  // be self-contained and cannot close over anything from this module.
  return page.$$eval(`${sectionName} article`, (articles) => {
    const readAttribute = (root, selector, name) => {
      const el = root.querySelector(selector);
      return el ? el.getAttribute(name) : null;
    };

    return articles
      // Articles without a title were never emitted before; keep it that way
      // so callers can rely on `title` being a string.
      .filter((article) => article.querySelector('.title') !== null)
      .map((article) => {
        const categoryEl = article.querySelector('.cat');
        return {
          title: article.querySelector('.title').textContent,
          url: readAttribute(article, 'img', 'src'),
          category: categoryEl ? categoryEl.textContent.split(' ') : [],
          date: readAttribute(article, '.post-date', 'title'),
          link: readAttribute(article, '.title a', 'href'),
        };
      });
  });
};
/**
 * Navigates `page` to an article and returns its body as cleaned-up HTML.
 *
 * Steps, in order:
 *  1. collect the `src` of every `<img>` inside `.content .desc`;
 *  2. take the `innerHTML` of `.content .desc`;
 *  3. turn line breaks into `<br/>`, strip every `<img>` tag and every
 *     `<script>` element;
 *  4. replace `<noscript>` elements, first to last, with `<img>` tags built
 *     from the collected sources.
 *
 * @param {object} page - Puppeteer page (`goto`, `$$eval`, `$eval` are used).
 *   The page is left on `link` afterwards.
 * @param {string} link - URL of the article to open.
 * @returns {Promise<string>} The transformed HTML of the article body.
 */
const getArticleDetail = async (page, link) => {
  const LINE_BREAK = /(?:\r\n|\r|\n)/g;
  const IMG_TAG = /<img[^>]*>/g;
  const SCRIPT_ELEMENT =
    /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script\s*>/gi;
  // Deliberately not global: each call swaps only the first remaining match.
  const NOSCRIPT_ELEMENT =
    /<noscript\b[^<]*(?:(?!<\/noscript>)<[^<]*)*<\/noscript\s*>/;

  await page.goto(link);

  const images = await page.$$eval('.content .desc img', (els) =>
    els.map((el) => el.getAttribute('src'))
  );
  const rawHtml = await page.$eval('.content .desc', (el) => el.innerHTML);

  let content = rawHtml
    .replace(LINE_BREAK, '<br/>')
    .replace(IMG_TAG, '')
    .replace(SCRIPT_ELEMENT, '');

  // NOTE(review): the last collected image is never re-inserted
  // (`images.length - 1`). Kept as-is because it may be intentional (e.g. a
  // trailing non-article image) — confirm against the target markup.
  for (let i = 0; i < images.length - 1; i++) {
    const imgTag = `<img src="${images[i]}" alt="img" />`;
    // A replacer function is used so that `$&`, `$1`, `$$`, ... occurring in
    // an image URL are inserted literally instead of being interpreted as
    // replacement patterns.
    content = content.replace(NOSCRIPT_ELEMENT, () => imgTag);
  }

  return content;
};
/**
 * Runs one full scrape: opens the blog listing, gathers the articles from
 * the `.loop-grid` and `.loop-post` sections, drops those whose title starts
 * with `'EP'`, fetches each remaining article's body and hands it to
 * `insert` together with the documents returned by `getData`.
 *
 * The browser is always closed, even when navigation, scraping or `insert`
 * throws, so a failed run does not leave a Chromium process behind.
 *
 * @returns {Promise<void>} Resolves when every article has been processed;
 *   rejects with the first error raised along the way.
 */
const excute = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://abmedia.io/blog');

    // Presumably the already-stored documents, passed to `insert` for
    // de-duplication — confirm in ./firebase.js.
    const docs = await getData();

    const listed = [
      ...(await getArticles(page, '.loop-grid')),
      ...(await getArticles(page, '.loop-post')),
    ];
    const articles = listed.filter((item) => !item.title.startsWith('EP'));

    // Sequential on purpose: every detail fetch navigates the same page.
    for (const article of articles) {
      article.content = await getArticleDetail(page, article.link);
      await insert(article, docs);
    }
  } finally {
    await browser.close();
  }
};
// Six-field cron expression (seconds first): fires at second 30 of every
// minute. NOTE(review): if an hourly run at minute 30 was intended, the
// expression should be '30 * * * *' — confirm.
schedule.scheduleJob('30 * * * * *', async () => {
  // Await and catch here so a failed scrape is logged instead of surfacing
  // as an unhandled promise rejection in the scheduler process.
  try {
    await excute();
  } catch (err) {
    console.error('Scheduled scrape failed:', err);
  }
});