-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_area.js
93 lines (87 loc) · 2.45 KB
/
fetch_area.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/* eslint-env node */
/* eslint-disable @typescript-eslint/no-var-requires */
const fs = require('fs').promises;
const path = require('path');
const got = require('got');
const cheerio = require('cheerio');
// Base (scheme + host) of the site being scraped; combined with site-relative
// paths to form absolute URLs.
const domain = 'http://www.mca.gov.cn';
// Listing path crawled when the script's first CLI argument is '1980'.
const from1980 = '/article/sj/xzqh/1980/';
// Listing path crawled for any other invocation (the default).
const year2020 = '/article/sj/xzqh/2020/';
/**
 * Left-pads a digit string with zeros so it is at least two characters long.
 *
 * @param {string} s - Digits captured from a date, e.g. '5' or '12'.
 * @returns {string} `s` padded to two characters; longer strings are returned
 *   unchanged.
 */
const pad = (s) => s.padStart(2, '0');

/**
 * Extracts a `YYYY-MM-DD` date from an article title.
 *
 * Only titles that contain '行政区划代码' are considered. A full
 * year/month/day date is preferred; when the title carries only a year and a
 * month, the day defaults to `01`.
 *
 * @param {string} s - Article title text.
 * @returns {string|null} The formatted date, or `null` when the title does not
 *   contain the keyword or carries no recognisable date.
 */
function getDate(s) {
  if (!s.includes('行政区划代码')) return null;

  const fullDate = s.match(/(\d+)年(\d+)月(\d+)日/);
  if (fullDate) {
    const [, year, month, day] = fullDate;
    return `${year}-${pad(month)}-${pad(day)}`;
  }

  const yearMonth = s.match(/(\d+)年(\d+)月/);
  if (yearMonth) {
    const [, year, month] = yearMonth;
    return `${year}-${pad(month)}-01`;
  }

  // Always return null for "no date" so callers see a single sentinel value
  // instead of an implicit undefined.
  return null;
}
/**
 * Downloads a page and extracts its code-to-name table.
 *
 * Some pages are only stubs pointing at the real data page, either through the
 * first link inside `.content` or through a `window.location.href="…"` script
 * redirect. Such pages are followed recursively until a page without either
 * pointer is reached, and that page's table rows are parsed.
 *
 * @param {string} url - Absolute URL, or a reference relative to `domain`.
 * @returns {Promise<Object<string, string>>} Object keyed by the all-digit
 *   first cell of each table row, with the second cell as the value.
 */
async function getData(url) {
  // Resolve with the URL parser instead of string concatenation so relative
  // and protocol-relative references still yield a valid address.
  const pageUrl = new URL(url, domain).href;
  const text = await got(pageUrl, { retry: 3 }).text();
  const $ = cheerio.load(text);

  const content = $('.content');
  if (content.length) {
    // Prefer the first link in the content area. An anchor without an href is
    // not navigable, so fall back to the script redirect in that case.
    let target = $('a', content).first().attr('href');
    if (!target) {
      const redirect = text.match(/window\.location\.href="([^"]+)"/);
      target = redirect ? redirect[1] : undefined;
    }
    if (target) {
      // Relative targets are resolved against the page they were found on.
      return getData(new URL(target, pageUrl).href);
    }
  }

  const data = {};
  $('tr').each((i, el) => {
    // Split the row text on whitespace; the first two tokens are code and name.
    const [code, name] = $(el).text().trim().split(/\s+/);
    if (name !== undefined && /^\d+$/.test(code)) {
      data[code] = name;
    }
  });
  return data;
}
/**
 * Crawls every page of an article listing and writes one JSON file per dated
 * article into the `../data` directory next to this script.
 *
 * For each listing page, links with class `artitlelist` whose title yields a
 * date via `getDate` are fetched with `getData`. Results with fewer than 10
 * entries are skipped. The page count is read from the `totalpage = "N"`
 * assignment found in the first page's body.
 *
 * @param {string} listUrl - Absolute URL of the first listing page.
 * @returns {Promise<void>} Resolves once every listing page has been processed.
 */
async function main(listUrl) {
  // Make sure the output directory exists so writeFile cannot fail with ENOENT.
  const outDir = path.join(__dirname, '../data');
  await fs.mkdir(outDir, { recursive: true });

  let total = 0;
  let page = 1;
  do {
    // Pages after the first are addressed by appending `?<page>`.
    const url = page === 1 ? listUrl : `${listUrl}?${page}`;
    const body = await got(url, { retry: 3 }).text();
    const $ = cheerio.load(body);

    const list = [];
    $('a.artitlelist').each((i, el) => {
      const anchor = $(el);
      const date = getDate(anchor.text());
      if (date) {
        list.push({ date, link: anchor.attr('href') });
      }
    });

    // Articles are fetched one at a time, in listing order.
    for (const { date, link } of list) {
      const data = await getData(link);
      if (Object.keys(data).length < 10) {
        continue;
      }
      const file = path.join(outDir, `${date}.json`);
      await fs.writeFile(file, JSON.stringify(data, null, ' '), 'utf-8');
      console.log(date);
    }

    // The total page count is only read from the first page.
    const matched = body.match(/totalpage\s*=\s*"(\d+)"/);
    if (page === 1 && matched) {
      total = Number.parseInt(matched[1], 10);
    }
    page++;
  } while (page <= total);
}
// Script entry point: choose the listing from the first CLI argument
// ('1980' selects the historical listing, anything else the 2020 one).
const baseUrl = process.argv[2] === '1980' ? from1980 : year2020;

// Run the crawl, report completion, and exit with status 1 on any failure.
(async () => {
  try {
    await main(domain + baseUrl);
    console.log('done.');
  } catch (e) {
    console.error(e);
    process.exit(1);
  }
})();