-
Notifications
You must be signed in to change notification settings - Fork 0
/
zhSpider.py
61 lines (49 loc) · 1.87 KB
/
zhSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: zhSpider
Description : Crawl highly upvoted Zhihu answers for use in generating a word cloud
Author : wsm
date: 2019-01-16
-------------------------------------------------
Change Activity:
2019-01-16:
-------------------------------------------------
"""
__author__ = 'wsm'
# https://www.zhihu.com/api/v4/questions/267653585/answers?include=content,data[*].voteup_count&limit=20&offset=0
import json, os, requests
from pyquery import PyQuery
class ZhiHuSpider(object):
    """Download the answers of one Zhihu question and save them as text.

    The answers endpoint is paged: every response carries the URL of the
    next page, which is kept in ``self.url``. An empty ``self.url`` marks
    the end of the crawl. The tag-stripped text of each answer is appended
    to ``<self.path>/text/<questionid>.txt``.
    """

    def __init__(self, questionid):
        """Prepare request headers, the first page URL and the output root.

        Args:
            questionid: Zhihu question id. Usually a string, but any value
                accepted by ``str()`` (e.g. an int) works.
        """
        self.header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3551.3 Safari/537.36',
        }
        # Normalise to str so an int id cannot break URL/path concatenation.
        self.questionid = str(questionid)
        self.url = (
            'https://www.zhihu.com/api/v4/questions/'
            + self.questionid
            + '/answers?include=content,data[*].voteup_count&limit=20&offset=0'
        )
        self.path = os.path.dirname(__file__)

    def getContent(self):
        """Fetch the page at ``self.url``, store its answers, advance the URL.

        After the call ``self.url`` holds the next page URL, or ``''`` when
        the fetched page contained no answers (which makes ``run`` stop).

        Raises:
            requests.RequestException: on a network failure, a timeout or a
                non-success HTTP status.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(self.url, headers=self.header, timeout=10)
        # Fail loudly on HTTP errors instead of tripping over a missing key.
        response.raise_for_status()
        payload = json.loads(response.content.decode('utf8'))

        answers = payload.get('data')
        if not answers:
            self.url = ''
            return

        # A missing "next" link is treated as the end of the listing.
        self.url = payload.get('paging', {}).get('next', '')
        for answer in answers:
            html = answer.get('content')
            if not html:
                # No body means there is no text to extract for this answer.
                continue
            # Strip HTML tags, keeping only the text.
            self.formatContent(PyQuery(html).text())

    def formatContent(self, content):
        """Append ``content`` followed by a newline to the question's file.

        The ``text`` directory is created on demand, and the file is written
        as UTF-8 so non-ASCII text does not depend on the platform's default
        encoding.

        Args:
            content: Text of one answer.
        """
        out_dir = os.path.join(self.path, 'text')
        os.makedirs(out_dir, exist_ok=True)
        out_file = os.path.join(out_dir, str(self.questionid) + '.txt')
        with open(out_file, 'a', encoding='utf-8') as f:
            f.write(content + '\n')

    def run(self):
        """Fetch page after page until no next URL is left, then report."""
        while self.url:
            self.getContent()
        print('close')
if __name__ == '__main__':
    # Script entry point: crawl one hard-coded question.
    ZhiHuSpider('281036323').run()