Issue #45: Robots source fails on site with invalid HTTPS certificate
Nekmo committed Jul 30, 2018
1 parent 538a6fd commit a1608b3
Showing 2 changed files with 20 additions and 2 deletions.
dirhunt/sources/robots.py (19 additions, 1 deletion)
@@ -1,5 +1,8 @@
 from itertools import chain
 
+import requests
+from requests import RequestException
+
 from dirhunt.sources.base import Source
 from dirhunt._compat import RobotFileParser, URLError
 
@@ -9,9 +12,24 @@ def get_url(protocol, domain, path):
     return '{protocol}://{domain}/{path}'.format(**locals())
 
 
+class DirhuntRobotFileParser(RobotFileParser):
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        try:
+            r = requests.get(self.url)
+        except RequestException:
+            pass
+        else:
+            if r.status_code in (401, 403):
+                self.disallow_all = True
+            elif r.status_code >= 400 and r.status_code < 500:
+                self.allow_all = True
+            self.parse(r.text.splitlines())
+
+
 class Robots(Source):
     def callback(self, domain, protocol='http'):
-        rp = RobotFileParser()
+        rp = DirhuntRobotFileParser()
         rp.set_url(get_url(protocol, domain, 'robots.txt'))
         try:
             rp.read()
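The override swaps urllib for requests, so a TLS failure such as an invalid certificate surfaces as a RequestException that read() swallows instead of letting it crash the robots source; a 401/403 response marks everything disallowed and other 4xx responses mark everything allowed, mirroring the standard library's read(). A minimal usage sketch, not part of this commit, with a made-up host:

    # Illustration only; the URL below is a hypothetical example.
    from dirhunt.sources.robots import DirhuntRobotFileParser

    rp = DirhuntRobotFileParser()
    rp.set_url('https://self-signed.example/robots.txt')
    rp.read()  # certificate errors raise RequestException and are ignored
    # A 401/403 response sets disallow_all, so every path is denied:
    print(rp.can_fetch('*', '/private/'))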
dirhunt/url_info.py (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ def __init__(self, sessions, url, timeout=10):
     def get_data(self):
         session = self.sessions.get_session()
         try:
-            resp = session.get(self.url.url, stream=True, timeout=self.timeout, allow_redirects=False)
+            resp = session.get(self.url.url, stream=True, verify=False, timeout=self.timeout, allow_redirects=False)
         except RequestException:
             raise RequestError
         try:
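Passing verify=False disables TLS certificate verification for this request, which is what lets url_info fetch pages served with an invalid certificate; requests will then emit an InsecureRequestWarning through urllib3 on each such call. A minimal sketch, not part of this commit, of silencing that warning:

    # Not in this commit: optionally suppress the warning that requests/urllib3
    # emit once certificate verification is disabled.
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)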
