Issue #88: Search paths and subdomains in commoncrawl

Nekmo · Jan 26, 2022 · 7327d2b · 7327d2b
1 parent 30e98e4
commit 7327d2b
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 0 deletions.
diff --git a/dirhunt/sources/__init__.py b/dirhunt/sources/__init__.py
@@ -1,3 +1,4 @@
+from dirhunt.sources.commoncrawl import CommonCrawl
 from dirhunt.sources.google import Google
 from dirhunt.sources.robots import Robots
 from dirhunt.sources.virustotal import VirusTotal
@@ -6,6 +7,7 @@
     Robots,
     VirusTotal,
     Google,
+    CommonCrawl,
 ]
 
 

diff --git a/dirhunt/sources/commoncrawl.py b/dirhunt/sources/commoncrawl.py
@@ -0,0 +1,40 @@
+import json
+from json import JSONDecodeError
+
+from requests.exceptions import RequestException
+from dirhunt.sessions import Sessions
+from dirhunt.sources.base import Source
+
+
+COMMONCRAWL_URL = 'https://index.commoncrawl.org/collinfo.json'
+
+
+class CommonCrawl(Source):
+    """Source that collects URLs recorded in a Common Crawl index for a domain.
+
+    The lookup is best-effort: network failures, HTTP errors and malformed
+    payloads end the lookup quietly instead of raising to the caller.
+    """
+
+    def get_latest_craw_index(self):
+        """Return the ``cdx-api`` URL of the first listed crawl index.
+
+        The collection list is downloaded from ``COMMONCRAWL_URL`` and its
+        first entry is used (presumably the most recent crawl -- the ordering
+        is decided by the remote service, not by this code).
+
+        The method name keeps its historical spelling ("craw") so existing
+        callers continue to work.
+
+        :return: the CDX API endpoint as found in the payload, or ``None``
+            when the list cannot be fetched, is not valid JSON, is empty, or
+            its first entry has no ``cdx-api`` key.
+        """
+        session = Sessions().get_session()
+        try:
+            response = session.get(COMMONCRAWL_URL)
+            response.raise_for_status()
+            crawl_indexes = response.json()
+        except (RequestException, ValueError, JSONDecodeError):
+            return None
+        if not crawl_indexes:
+            return None
+        try:
+            return crawl_indexes[0]['cdx-api']
+        except (KeyError, IndexError, TypeError):
+            # Unexpected payload shape (e.g. a dict, or an entry without the
+            # key): treat it the same as "no index available".
+            return None
+
+    def callback(self, domain):
+        """Query the crawl index for ``*.<domain>`` and report each URL found.
+
+        Every non-empty line of the response is parsed as a JSON document and
+        its ``url`` value is passed to ``self.add_result``. Lines that are not
+        valid JSON or lack a ``url`` key are skipped.
+
+        :param domain: domain name; a ``*.`` wildcard prefix is added so that
+            subdomains are matched as well.
+        :return: ``None``. Returns early when no index is available or the
+            request fails.
+        """
+        latest_crawl_index = self.get_latest_craw_index()
+        if not latest_crawl_index:
+            return
+        session = Sessions().get_session()
+        try:
+            # NOTE(review): assumes the session is a ``requests.Session`` so
+            # the response can be used as a context manager; this guarantees
+            # the streamed connection is released even on early exit.
+            with session.get(
+                latest_crawl_index,
+                params={'url': '*.{}'.format(domain), 'output': 'json'},
+                stream=True,
+            ) as response:
+                # Error statuses carry no result lines; stop before trying to
+                # parse the error body as results.
+                response.raise_for_status()
+                for line in filter(bool, response.iter_lines()):
+                    try:
+                        url = json.loads(line)['url']
+                    except (ValueError, KeyError, TypeError):
+                        continue
+                    self.add_result(url)
+        except RequestException:
+            # Covers connection errors, HTTP errors and failures while the
+            # body is being streamed. Results added so far are kept.
+            return