Skip to content

Commit

Permalink
Issue #88: Search paths and subdomains in commoncrawl
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekmo committed Jan 26, 2022
1 parent 30e98e4 commit 7327d2b
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
2 changes: 2 additions & 0 deletions dirhunt/sources/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from dirhunt.sources.commoncrawl import CommonCrawl
from dirhunt.sources.google import Google
from dirhunt.sources.robots import Robots
from dirhunt.sources.virustotal import VirusTotal
Expand All @@ -6,6 +7,7 @@
Robots,
VirusTotal,
Google,
CommonCrawl,
]


Expand Down
40 changes: 40 additions & 0 deletions dirhunt/sources/commoncrawl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import json
from json import JSONDecodeError

from requests.exceptions import RequestException
from dirhunt.sessions import Sessions
from dirhunt.sources.base import Source


# Endpoint that returns a JSON listing of the available Common Crawl indexes.
# CommonCrawl reads the 'cdx-api' value of the first entry in that listing and
# uses it as the search API URL (the code treats that entry as the latest crawl).
COMMONCRAWL_URL = 'https://index.commoncrawl.org/collinfo.json'


class CommonCrawl(Source):
    """Source that feeds URLs recorded by Common Crawl for a domain.

    The lookup is best-effort: any network, HTTP or parsing problem makes the
    source stop quietly instead of raising into the caller.
    """

    # Seconds to wait for the server before giving up. requests applies no
    # timeout unless one is passed, so a stalled server would block forever.
    _REQUEST_TIMEOUT = 10

    def get_latest_craw_index(self):
        """Return the search API URL of the first index listed by Common Crawl.

        :return: the ``'cdx-api'`` value of the first entry of the listing at
            ``COMMONCRAWL_URL``, or ``None`` when the listing cannot be
            downloaded, is not valid JSON, is empty or lacks the expected shape.
        """
        session = Sessions().get_session()
        try:
            response = session.get(COMMONCRAWL_URL, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
            crawl_indexes = response.json()
        except (RequestException, ValueError, JSONDecodeError):
            return None
        if not crawl_indexes:
            return None
        try:
            return crawl_indexes[0]['cdx-api']
        except (KeyError, IndexError, TypeError):
            # The payload is not a list of dicts carrying a 'cdx-api' key.
            return None

    @staticmethod
    def _parse_result_url(line):
        """Return the ``'url'`` value of one JSON result line, or ``None``.

        :param line: a single non-empty line of the response (bytes or str).
        :return: the value stored under ``'url'``; ``None`` when the line is
            not valid UTF-8/JSON, is not a JSON object or has no ``'url'`` key.
        """
        try:
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            data = json.loads(line)
        except ValueError:
            # Covers both UnicodeDecodeError and JSONDecodeError.
            return None
        if not isinstance(data, dict):
            return None
        return data.get('url')

    def callback(self, domain):
        """Query the index for ``*.<domain>`` and register every URL found.

        :param domain: domain name whose subdomains and paths are searched.
        :return: ``None``; results are reported through ``self.add_result``.
        """
        latest_crawl_index = self.get_latest_craw_index()
        if not latest_crawl_index:
            return
        session = Sessions().get_session()
        try:
            response = session.get(
                latest_crawl_index,
                params={'url': '*.{}'.format(domain), 'output': 'json'},
                stream=True,
                timeout=self._REQUEST_TIMEOUT,
            )
        except RequestException:
            return
        try:
            # Without this check an HTTP error body would be parsed as if it
            # were a list of results.
            response.raise_for_status()
            for line in filter(bool, response.iter_lines()):
                url = self._parse_result_url(line)
                if url:
                    self.add_result(url)
        except RequestException:
            # HTTP error status, or the connection dropped while streaming;
            # keep whatever results were already added.
            return
        finally:
            # A streamed response holds its connection until closed.
            response.close()

0 comments on commit 7327d2b

Please sign in to comment.