#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2016/3/4 10:23
# @Author : Aries ([email protected])
# @Site : http://iw3c.com
# @File : download.py
# @Software: PyCharm
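"""
Multithreaded image scraper for m.wujiecao.cn: collects every list page,
extracts each item's title and detail-page URL, then downloads the image
from each detail page into datas/<category>. Pass -d <category> to choose
the category path (defaults to 'xieemanhua'). Requires Python 3.
"""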
import os
import sys
import time
import queue
import threading
import getopt
from bs4 import BeautifulSoup
import requests

# Parse the optional -d flag, which selects the category path on the site.
try:
    opts, args = getopt.getopt(sys.argv[1:], 'd:')
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(1)

BASE_DIR = 'xieemanhua'
for k, v in opts:
    if k == '-d':
        BASE_DIR = v
BASE_URL = 'http://m.wujiecao.cn'
SAVE_DIR = 'datas/' + BASE_DIR
THREAD_COUNT = 5
# Fetch the HTML of a page; returns None on any request failure.
def getHtml(url, timeout=20):
    headers = {
        'Accept-Language': 'zh-cn',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return None
# Collect the URL of every list page from the paging <select> element.
def getAllPageLists():
    print("=====start get all page lists====== %s" % time.ctime())
    html = getHtml(BASE_URL + '/' + BASE_DIR)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    select = soup.find('select', class_='paging-select')
    options = select.find_all('option')
    lists = []
    for o in options:
        lists.append(BASE_URL + '/' + BASE_DIR + '/' + o['value'])
    return lists
# Collect the title, detail-page URL, and cover image URL of every item on one list page.
def getSinglePageUrlLists(url):
    print("=====start getSinglePageUrlLists(%s) ====== %s" % (os.path.basename(url), time.ctime()))
    html = getHtml(url)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    ul = soup.find('ul', class_='pic')
    all_a = ul.find_all('a')
    datas = []
    for a in all_a:
        img = a.find('img')
        span = a.find('span', class_='bt')
        title = span.contents[0]
        data = {'url': BASE_URL + a['href'], 'title': title, 'pic': BASE_URL + img['lazysrc']}
        datas.append(data)
    return datas
# Scrape one detail page and download its image into SAVE_DIR, named after the item title.
def getDetailPage(url, title):
    print("=====start getDetailPage(%s) ====== %s" % (os.path.basename(url), time.ctime()))
    html = getHtml(url)
    if html is None:
        return False
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', id='imgString')
    img = div.find('img')
    imgUrl = img['src']
    if not imgUrl:
        return False
    response = requests.get(imgUrl, stream=True)
    if response.status_code != 200:
        return False
    image = response.content
    # makedirs creates 'datas/<BASE_DIR>' even when 'datas' itself is missing.
    os.makedirs(SAVE_DIR, exist_ok=True)
    ext = os.path.splitext(os.path.basename(imgUrl))[1]
    fileName = os.path.join(SAVE_DIR, title + ext)
    try:
        with open(fileName, 'wb') as f:
            f.write(image)
        print("=====write end====== %s" % time.ctime())
    except IOError:
        print("IO Error")
# Worker: consumes list-page URLs from pageQue and pushes per-item dicts onto detailQue.
class getLists(threading.Thread):
    def __init__(self, que, detailQue):
        threading.Thread.__init__(self)
        self.pageQue = que
        self.detailQue = detailQue

    def run(self):
        while True:
            url = self.pageQue.get()
            singlePageLists = getSinglePageUrlLists(url)
            for sl in singlePageLists:
                # queue.Queue is thread-safe for arbitrary objects,
                # so the dicts can be queued directly without a JSON round-trip.
                self.detailQue.put(sl)
            self.pageQue.task_done()
# Worker: consumes item dicts from detailQue and downloads each image.
class getDetailLists(threading.Thread):
    def __init__(self, detailQue):
        threading.Thread.__init__(self)
        self.detailQue = detailQue

    def run(self):
        while True:
            data = self.detailQue.get()
            getDetailPage(data['url'], data['title'])
            self.detailQue.task_done()
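# Two-stage producer/consumer pipeline: list-page workers fill detailQue
# while download workers drain it, so scraping and downloading overlap.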
pageQue = queue.Queue()
detailQue = queue.Queue()

if __name__ == '__main__':
    print("====start request====%s" % time.ctime())
    allLists = getAllPageLists()
    for u in allLists:
        pageQue.put(u)
    for _ in range(THREAD_COUNT):
        worker = getLists(pageQue, detailQue)
        worker.daemon = True
        worker.start()
    for _ in range(THREAD_COUNT):
        worker = getDetailLists(detailQue)
        worker.daemon = True
        worker.start()
    # join() blocks until every queued task has been marked done by a worker;
    # the daemon threads then die with the main thread.
    pageQue.join()
    detailQue.join()
    print('all DONE at:', time.ctime())
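
# Minimal usage sketch (assuming Python 3 with the `requests` and
# `beautifulsoup4` packages installed; the category name is the site's
# URL path segment):
#   pip install requests beautifulsoup4
#   python down.py -d xieemanhua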