forked from zhiying8710/bookshelf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdupefilters.py
31 lines (25 loc) · 873 Bytes
/
dupefilters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# encoding: utf-8
# Created on 2014-5-9
# @author: binge
import sys
reload(sys)
sys.setdefaultencoding('utf-8') # @UndefinedVariable
import time
from scrapy.dupefilter import RFPDupeFilter
# import md5
try:
from hashlib import md5
except ImportError:
from md5 import md5
class UnFilterDupeFilter(RFPDupeFilter):
    """Dupe filter that never filters: every request is treated as new.

    Extends ``RFPDupeFilter`` (which by default drops duplicate requests)
    and overrides two methods:

    * ``request_seen`` always returns ``False``, so no request is ever
      reported as a duplicate.
    * ``request_fingerprint`` salts the hash with the current time, so the
      same URL produces a different fingerprint on successive calls (as
      long as the clock advances between them).
    """

    def request_seen(self, request):
        """Report *request* as not seen before, regardless of its content.

        Args:
            request: the request being checked; it is not inspected.

        Returns:
            Always ``False``.
        """
        return False

    def request_fingerprint(self, request):
        """Return a time-salted MD5 fingerprint for *request*.

        Args:
            request: an object exposing its URL as ``request.url``.

        Returns:
            The hex digest string of ``md5(url + timestamp)``.
        """
        # repr() keeps the full float precision; on Python 2, str() rounds
        # time.time() to 12 significant digits, so calls a few milliseconds
        # apart could produce the same fingerprint.
        salted = request.url + repr(time.time())
        # md5 hashes bytes: encode text (Python 3 str / Python 2 unicode),
        # leave an already-encoded Python 2 str untouched.
        if not isinstance(salted, bytes):
            salted = salted.encode('utf-8')
        # Return the digest string rather than the hash object itself.
        return md5(salted).hexdigest()