-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler.py
44 lines (38 loc) · 1.32 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
import json
import re
import urllib
import urllib2
def _fetch_text(url):
    """Download *url* and return the response body decoded as UTF-8 text."""
    response = urllib2.urlopen(url)
    try:
        return response.read().decode('utf8')
    finally:
        # Release the connection even when read() or decode() raises.
        response.close()


def main():
    """Crawl the Tang-poems index on zh.wikisource.org and dump it as JSON.

    The index page is fetched once; every ``<li>`` entry that contains a
    link is followed, the first ``poem"><p>...</p>`` body on the linked page
    is extracted, and one record per poem is collected:

        [url, title, prefix, lines]

    * ``url``    -- absolute URL of the poem page.
    * ``title``  -- inner markup of the link with tags/whitespace stripped.
    * ``prefix`` -- text that precedes the link inside the list item
                    (presumably the author; confirm against the page markup).
    * ``lines``  -- poem body with markup removed, split on ``<br/>``.

    Progress is reported by printing the running entry count every ten
    entries.  Pages that cannot be downloaded, decoded or parsed are skipped
    after printing their URL (best effort).  The records are written to
    ``./tang300.v0.json`` as UTF-8 encoded, indented JSON, and the number of
    records is printed.

    Failure to fetch the index page itself is not handled and propagates.
    """
    index_url = 'https://zh.wikisource.org/zh-hant/%E5%94%90%E8%A9%A9%E4%B8%89%E7%99%BE%E9%A6%96'
    site_root = "https://zh.wikisource.org"

    # (text before link, href, link inner markup up to the closing </li>)
    entry_re = re.compile(r'<li>([^<]*?) <a href="(.*?)".*?>(.*?)</li>')
    # Body of the first paragraph that follows a `poem">` marker.
    poem_re = re.compile(r'poem">\s*?<p>([\W\w]*?)</p>')
    # Markup and whitespace to drop; this also turns "<br />" into "<br/>",
    # which is what the split below relies on.
    markup_re = re.compile(r'</*span.*?>|<small[\W\w]*?/small>|\s|</*a>|</*sub>')

    entries = entry_re.findall(_fetch_text(index_url))

    poems = []
    for count, (prefix, href, title_html) in enumerate(entries, 1):
        if count % 10 == 0:
            # A single parenthesised argument prints the same under the
            # Python 2 print statement this file otherwise uses.
            print(count)

        url = site_root + href
        try:
            page = _fetch_text(url)
        except Exception:
            # Best effort: report the page and move on.  Unlike a bare
            # `except:`, this lets KeyboardInterrupt/SystemExit stop the crawl.
            print(url)
            continue

        found = poem_re.search(page)
        if found is None:
            # No poem block on this page; report it like any other failure.
            print(url)
            continue

        lines = markup_re.sub('', found.group(1)).split('<br/>')
        poems.append([url, markup_re.sub('', title_html), prefix, lines])

    # Serialise before touching the file so a serialisation error cannot
    # leave a truncated output behind.
    payload = json.dumps(poems, ensure_ascii=False, indent=2)
    print(len(poems))
    with open('./tang300.v0.json', 'w') as out:
        out.write(payload.encode('utf-8'))
# Start the crawl. The call is unguarded, so it runs on import as well as
# when the file is executed as a script.
main()