ScraperPool.py
import threading, copy, string, random, os


class JobThread(threading.Thread):
    """Worker thread that repeatedly pulls jobs from its parent pool until none remain."""

    def __init__(self, parentPool, threadID):
        threading.Thread.__init__(self)
        self.pool = parentPool
        self.myId = threadID

    def run(self):
        while True:
            job = self.pool.getJob()
            if job is None:
                # The pool has no more jobs to hand out; this worker exits.
                break
            # Run the job and hand its return value back to the pool.
            self.pool.pushResult(job())
        return


class ThreadPool(object):
    """Generic worker pool: subclasses override getJob() to hand out callables."""

    def __init__(self, poolsize):
        self.results = []
        self.res_lock = threading.Lock()
        self.nbthreads = poolsize
        self.jobsLock = threading.Lock()
        self.finiEvent = threading.Event()

    def getJob(self):
        # The base class has no work to distribute; subclasses return a
        # zero-argument callable, or None once everything has been handed out.
        return None

    def signalFinished(self):
        # Wakes executeJobs() once the last job has been handed out.
        self.finiEvent.set()

    def pushResult(self, res):
        with self.res_lock:
            self.results.append(res)

    def executeJobs(self):
        # Start the worker threads.
        self.threads = []
        for i in range(self.nbthreads):
            t = JobThread(self, i)
            t.start()
            self.threads.append(t)
        # Wait until every job has been handed out, then let the workers drain.
        self.finiEvent.wait()
        for t in self.threads:
            t.join()
        return self.results
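

# Subclass contract (illustrative sketch, not part of the original file): a
# concrete pool only needs to override getJob() so that it returns a
# zero-argument callable while work remains, calls signalFinished() once the
# last job has been handed out, and returns None afterwards. A minimal
# hypothetical subclass might look like:
#
#     class SquarePool(ThreadPool):
#         def __init__(self, numbers, poolsize):
#             ThreadPool.__init__(self, poolsize)
#             self.numbers = numbers
#         def getJob(self):
#             with self.jobsLock:
#                 if not self.numbers:
#                     self.signalFinished()
#                     return None
#                 n = self.numbers.pop(0)
#                 if not self.numbers:
#                     self.signalFinished()
#                 return lambda: n * n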


class FetcherPool(ThreadPool):
    """Pool that feeds each URL to a fetcher callable, optionally saving files under savePath."""

    def __init__(self, fetcher, urls, poolsize, savePath=None):
        ThreadPool.__init__(self, poolsize)
        self.fetcher = fetcher
        self.urls = urls
        # savePath is expected to end with a path separator; when it is None the
        # fetcher is called with the URL alone.
        self.path = savePath

    def finished(self):
        return len(self.urls) == 0

    def random_key(self, length):
        # Random alphanumeric key used to build unique file names.
        key = ''
        for i in range(length):
            key += random.choice(string.ascii_lowercase + string.ascii_uppercase + string.digits)
        return key

    def exists(self, filename):
        # True if any file already in the save directory contains this key.
        contents = os.listdir(self.path)
        for name in contents:
            if name.find(filename) != -1:
                return True
        return False

    def getJob(self):
        with self.jobsLock:
            if len(self.urls) == 0:
                # Nothing left (or nothing to begin with); make sure waiters are released.
                self.signalFinished()
                job = None
            else:
                url = copy.deepcopy(self.urls[0])
                if self.path is not None:
                    # Pick a file name that does not collide with an existing one.
                    fileId = self.random_key(10)
                    while self.exists(fileId):
                        fileId = self.random_key(10)
                    filename = self.path + "img_" + fileId + ".jpg"
                    job = lambda: self.fetcher(url, filename)
                else:
                    job = lambda: self.fetcher(url)
                self.urls.pop(0)
                if len(self.urls) == 0:
                    # Last URL handed out: let executeJobs() stop waiting.
                    self.signalFinished()
            return job
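

# Example usage (illustrative sketch, not part of the original module):
# fetch_image and the URLs below are hypothetical placeholders; the only
# contract the pool relies on is the fetcher signature, fetcher(url) when no
# savePath is given and fetcher(url, filename) when one is.
if __name__ == "__main__":
    import urllib.request

    def fetch_image(url, filename=None):
        # Download the URL; write the bytes to filename when the pool supplies one.
        try:
            data = urllib.request.urlopen(url).read()
        except Exception as exc:
            return (url, "failed: %s" % exc)
        if filename is not None:
            with open(filename, "wb") as f:
                f.write(data)
            return (url, filename)
        return (url, len(data))

    os.makedirs("./images/", exist_ok=True)
    urls = ["http://example.com/a.jpg", "http://example.com/b.jpg"]  # placeholders
    pool = FetcherPool(fetch_image, urls, poolsize=4, savePath="./images/")
    print(pool.executeJobs())  # blocks until every URL has been fetched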