-
Notifications
You must be signed in to change notification settings - Fork 0
/
youku.py
198 lines (183 loc) · 6.83 KB
/
youku.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#-*-coding:utf-8-*-
#!/usr/bin/python
from lib import config, db_sy
from pprint import pprint
import time
import sys,os
from extractor import *
# When running on Linux (e.g. from a scheduled job), switch the current
# working directory to the crawler's install path.
if config.ENV_OS == 'linux':
    os.chdir('/data/crawler/')
# Module-wide database connection.
dbconn = db_sy.getConnection()
# Interval between periodic refreshes of an anchor's videos.
update_period = config.VIDEO_UPDATE_PERIOD
# Work around garbled (mis-encoded) text when inserting into the database
# by forcing the Python 2 default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Mapping from game category to the matching catid in PHPCMS (disabled).
# game_type = {'lol':'6','Dota2':'7','dota2':'7','starcraft':'8','wow':'13','cf':'20','duanyou':'95','hearthstone':'22','minecraft':'60','overwatch':'61','pvp':'62','WorldOfTanks':'63','CS_GO':'64','cos':'65','dota':'66','warcraft':'67','zhanzheng':'68','sheji':'69','CR':'70','yuanchuang':'71','zixun':'72','zhuji':'94','huaijiu':'99','other':'14'}
def check_vid_from_db(vid):
    """Return True when a video with this ``vid`` is already stored.

    Looks the vid up in the ``video_data`` table through ``db_sy.db_select``
    and reports whether the query returned anything.

    Args:
        vid: platform-side video identifier taken from the crawled entry.

    Returns:
        bool: True if a matching row exists, False otherwise.
    """
    # NOTE(review): vid comes from crawled page data and is interpolated
    # straight into the WHERE clause. If db_sy offers parameter binding or
    # escaping, it should be used here -- confirm against db_sy.
    # NOTE(review): the lookup targets `video_data` while inserts elsewhere
    # in this file target 'video'; presumably db_insert fills both -- confirm.
    condition = "`vid` = '{0}' ".format(vid)
    return bool(db_sy.db_select(dbconn, 'video_data', condition, ['id']))
def get_videos_info(zhubo, videos):
    """Combine crawled video entries with anchor data into insertable rows.

    A video is kept only when it has a non-empty title, was published after
    the anchor's previously recorded ``v_updatetime`` and its vid is not in
    the database yet. As a side effect the anchor dict is updated in place:
    ``v_updatetime`` becomes the newest publish time seen among the videos
    that passed the title/time filter (including ones already stored), and
    ``v_next_updatetime`` is recomputed from it.

    Args:
        zhubo: anchor dict; reads 'id', 'v_category', 'v_updatetime',
            'name' and 'platform'.
        videos: iterable of dicts with the keys 'title', 'publish_time',
            'avatar', 'time', 'link', 'num' and 'vid'.

    Returns:
        tuple: ``(zhubo, rows)`` where ``rows`` is the list of new video
        dicts to insert.
    """
    rows = []
    anchor_id = zhubo['id']
    catid = zhubo['v_category']
    # Publish-time watermark from the previous crawl; only newer videos count.
    old_updatetime = zhubo['v_updatetime']
    updatetime = old_updatetime
    for video in videos:
        # Skip entries without a title or not newer than the last crawl.
        if not video['title'] or video['publish_time'] <= old_updatetime:
            continue
        publish_time = video['publish_time']
        row = {
            'title': video['title'],
            'thumb': video['avatar'],
            'v_time': video['time'],
            'link': video['link'],
            # The description cannot be obtained by the current crawl
            # method, so it is left empty.
            'description': '',
            'anchor': anchor_id,
            'catid': catid,
            'keywords': zhubo['name'],
            'publishtime': publish_time,
            'mark': video['num'],
            'vid': video['vid'],
            'platform': zhubo['platform'],
        }
        # Track the newest publish time even if the video turns out to be
        # stored already, so the watermark still moves forward.
        updatetime = max(updatetime, publish_time)
        # Skip videos whose vid already exists in the database.
        if check_vid_from_db(video['vid']):
            continue
        rows.append(row)
    # Record the newest video time and schedule the anchor's next refresh.
    zhubo['v_updatetime'] = updatetime
    zhubo['v_next_updatetime'] = get_zhubo_next_updatetime(updatetime)
    return zhubo, rows
def pro_video_list(zhubo, obj_extractor):
    """Crawl one anchor's video list, filter it and write the results to the DB.

    When ``zhubo['is_init']`` is truthy every list page of the channel is
    crawled (up to a fixed page cap) and ``v_num`` is set to the number of
    new videos. Otherwise only the first list page is fetched and ``v_num``
    is increased by the number of new videos. New videos go to the 'video'
    table and the anchor dict is written back to the 'anchor' table.

    Args:
        zhubo: anchor dict; reads 'is_init', 'url', 'v_next_updatetime',
            'v_num' and 'id' (plus whatever get_videos_info reads).
        obj_extractor: object exposing ``get_single_list(url, is_all=...)``
            which returns a ``(video_list, next_page_url)`` pair.
    """
    def get_all_list(channel_url):
        """Follow the channel's pagination and collect every listed video."""
        # Hard cap on the number of list pages requested per anchor.
        max_pages = 250
        v_all_list = []
        next_page_url = channel_url
        # Every request counts toward the cap, including pages whose list
        # came back as None, so a page that keeps failing cannot make the
        # loop run forever.
        for _ in range(max_pages):
            if next_page_url is None:
                break
            v_list, next_page_url = obj_extractor.get_single_list(
                next_page_url, is_all=True)
            if v_list is None:
                continue
            v_all_list.extend(v_list)
            print('v_all_list size :' + str(len(v_all_list)) + '\r\n')
        return v_all_list

    is_init = zhubo['is_init']
    channel_url = zhubo['url']
    # Initial crawl walks all pages; later crawls only look at the first page.
    if is_init:
        res_list = get_all_list(channel_url)
        # Filter the crawled entries down to new videos.
        print("#####IN Videos Filter,size:{0}#####\r\n".format(len(res_list)))
        zhubo, videos = get_videos_info(zhubo, res_list)
        print("#####End Videos Filter,size:{0}#####\r\n".format(len(videos)))
        # On the initial crawl the video count is simply what was found.
        zhubo['v_num'] = len(videos)
        # Persist: new rows into the video table, then the anchor record.
        db_sy.db_insert(dbconn, videos, 'video')
        db_sy.db_update(dbconn, [zhubo], 'anchor')
    else:
        next_updatetime = zhubo['v_next_updatetime']
        # Refresh when no schedule is recorded yet (0) or the anchor is due.
        if next_updatetime == 0 or is_update_zhubo(next_updatetime):
            # Pause 0.5 s between anchors.
            time.sleep(0.5)
            res_list, _ = obj_extractor.get_single_list(channel_url)
            # The extractor may hand back None instead of a list (the
            # paginated path above guards against it too); treat it as
            # "nothing found" rather than crashing on len().
            if res_list is None:
                res_list = []
            # Filter the crawled entries down to new videos.
            print("#####IN Videos Filter size:{0}#####\r\n".format(len(res_list)))
            zhubo, videos = get_videos_info(zhubo, res_list)
            print("#####End Videos Filter,size:{0}#####\r\n".format(len(videos)))
            if len(videos) != 0:
                # Add the newly found videos to the anchor's running count.
                zhubo['v_num'] = zhubo['v_num'] + len(videos)
                # Persist: new rows into the video table, then the anchor
                # record (v_num, v_updatetime).
                db_sy.db_insert(dbconn, videos, 'video')
                db_sy.db_update(dbconn, [zhubo], 'anchor')
            print(zhubo['id'] + ', size: ' + str(len(videos)) + '\r\n')
def is_update_zhubo(next_updatetime):
    """Tell whether an anchor should be refreshed right now.

    The schedule check is currently switched off to keep crawling as close
    to real time as possible, so every anchor is reported as due.

    Args:
        next_updatetime: the anchor's scheduled next refresh time; accepted
            for interface compatibility but not consulted at the moment.

    Returns:
        bool: always True.
    """
    # Disabled rule, kept for reference (update units were 24h/12h/6h/1h):
    #   abs(int(time.time()) - next_updatetime) < update_period
    return True
def get_zhubo_next_updatetime(new_updatetime):
    """Compute when an anchor should be refreshed next.

    The timestamp of the newest video decides the schedule: if that video
    is older than ``14 * update_period`` the next refresh is pushed out by
    the same amount of time the anchor has been inactive; otherwise the
    anchor is refreshed again after one ``update_period``.

    NOTE(review): the original comment described the threshold as "more
    than one week" while the code uses 14 * update_period -- confirm which
    is intended for the configured period.

    Args:
        new_updatetime: epoch timestamp (seconds) of the newest video.

    Returns:
        int: epoch timestamp (seconds) of the next scheduled refresh.

    Raises:
        Exception: if ``new_updatetime`` lies in the future.
    """
    current_time = int(time.time())
    pprint(new_updatetime)
    if new_updatetime > current_time:
        raise Exception('获取的视频更新时间异常,请检查!')
    # Non-negative thanks to the guard above.
    gap = current_time - new_updatetime
    if gap > 14 * update_period:
        # Long-inactive anchor: wait as long again as it has been silent.
        return current_time + gap
    return current_time + update_period
def get_zhubo_from_db():
    """Load the anchor channels to crawl from the ``anchor`` table.

    Selects every anchor whose ``platform_url`` is not empty and converts
    each result row (indexed in the order of the requested columns) into a
    dict with the keys 'id', 'name', 'v_category', 'url', 'v_updatetime',
    'v_next_updatetime', 'v_num', 'uid', 'platform' and 'is_init'.

    'is_init' is always False here, so callers treat every anchor as an
    incremental (non-initial) crawl.

    Returns:
        list: one dict per anchor row.
    """
    columns = ['id', 'title','video_category', 'platform_url', 'v_updatetime', 'v_next_updatetime', 'v_num', 'platform_id', 'platform']
    rows = db_sy.db_select(dbconn, 'anchor', "`platform_url` != '' ", columns)
    return [
        {
            'id': str(row[0]),
            'name': row[1],
            'v_category': row[2],
            # Stored URLs may carry surrounding whitespace.
            'url': row[3].strip(),
            'v_updatetime': row[4],
            'v_next_updatetime': row[5],
            'v_num': row[6],
            'uid': row[7],
            'platform': row[8],
            'is_init': False,
        }
        for row in rows
    ]
if __name__ == "__main__":
    # Script entry point: crawl every anchor channel read from the database.
    # The try/finally makes sure the shared connection is closed even when
    # crawling one of the anchors raises.
    try:
        # Load the anchors' channel information.
        zipindao_info = get_zhubo_from_db()
        now = time.time()
        for zhubo in zipindao_info:
            # Resolve the page extractor dynamically from the platform name:
            # "<Platform>Extractor" names both the module (pulled in by the
            # star import from `extractor`) and the class inside it, e.g. a
            # platform value of 'youku' resolves to 'YoukuExtractor'.
            str_extractor = zhubo['platform'].capitalize() + 'Extractor'
            module = globals()[str_extractor]
            obj_extractor = getattr(module, str_extractor)(zhubo)
            pro_video_list(zhubo, obj_extractor)
        # Report the elapsed crawl time.
        print("time cost : " + str(int((time.time() - now))) + " seconds")
    finally:
        # Close the database connection.
        dbconn.close()