Issue #45: Robots source fails on site with invalid HTTPS certificate
Nekmo committed Jul 30, 2018
1 parent 538a6fd commit a1608b3
Showing 2 changed files with 20 additions and 2 deletions.
dirhunt/sources/robots.py (19 additions, 1 deletion)
@@ -1,5 +1,8 @@
 from itertools import chain
 
+import requests
+from requests import RequestException
+
 from dirhunt.sources.base import Source
 from dirhunt._compat import RobotFileParser, URLError
 
@@ -9,9 +12,24 @@ def get_url(protocol, domain, path):
     return '{protocol}://{domain}/{path}'.format(**locals())
 
 
+class DirhuntRobotFileParser(RobotFileParser):
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        try:
+            r = requests.get(self.url)
+        except RequestException:
+            pass
+        else:
+            if r.status_code in (401, 403):
+                self.disallow_all = True
+            elif r.status_code >= 400 and r.status_code < 500:
+                self.allow_all = True
+            self.parse(r.text.splitlines())
+
+
 class Robots(Source):
     def callback(self, domain, protocol='http'):
-        rp = RobotFileParser()
+        rp = DirhuntRobotFileParser()
         rp.set_url(get_url(protocol, domain, 'robots.txt'))
         try:
             rp.read()
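The override swaps urllib for requests, so a TLS failure such as an invalid certificate surfaces as a RequestException that read() swallows instead of letting it crash the robots source; a 401/403 response marks everything disallowed and other 4xx responses mark everything allowed, mirroring the standard library's read(). A minimal usage sketch, not part of this commit, with a made-up host:

    # Illustration only; the URL below is a hypothetical example.
    from dirhunt.sources.robots import DirhuntRobotFileParser

    rp = DirhuntRobotFileParser()
    rp.set_url('https://self-signed.example/robots.txt')
    rp.read()  # certificate errors raise RequestException and are ignored
    # A 401/403 response sets disallow_all, so every path is denied:
    print(rp.can_fetch('*', '/private/'))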
dirhunt/url_info.py (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ def __init__(self, sessions, url, timeout=10):
     def get_data(self):
         session = self.sessions.get_session()
         try:
-            resp = session.get(self.url.url, stream=True, timeout=self.timeout, allow_redirects=False)
+            resp = session.get(self.url.url, stream=True, verify=False, timeout=self.timeout, allow_redirects=False)
         except RequestException:
             raise RequestError
         try:
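Passing verify=False disables TLS certificate verification for this request, which is what lets url_info fetch pages served with an invalid certificate; requests will then emit an InsecureRequestWarning through urllib3 on each such call. A minimal sketch, not part of this commit, of silencing that warning:

    # Not in this commit: optionally suppress the warning that requests/urllib3
    # emit once certificate verification is disabled.
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)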
