Skip to content

Commit

Permalink
Issue #91: Search paths and subdomains in archive.org
Browse files Browse the repository at this point in the history
  • Loading branch information
Nekmo committed Mar 6, 2022
1 parent ec3d60b commit cc4e6c9
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
2 changes: 2 additions & 0 deletions dirhunt/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
from dirhunt.sources.robots import Robots
from dirhunt.sources.ssl import CertificateSSL
from dirhunt.sources.virustotal import VirusTotal
from dirhunt.sources.wayback import Wayback

# Source classes made available by this package. Wayback (archive.org) is
# listed last, after the previously registered sources.
# NOTE(review): the consumer of this list is not visible here; presumably it
# instantiates each class per domain and list order may matter -- confirm
# before reordering.
SOURCE_CLASSES = [
Robots,
VirusTotal,
Google,
CommonCrawl,
CertificateSSL,
Wayback,
]


Expand Down
33 changes: 33 additions & 0 deletions dirhunt/sources/wayback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json

from requests.exceptions import RequestException
from dirhunt.sessions import Sessions
from dirhunt.sources.base import Source


# Endpoint queried by Wayback.callback.
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx'
# Base query-string parameters sent with every request; the per-domain ``url``
# parameter is merged in at call time.
# NOTE(review): per the Wayback CDX server API, 'fl' selects the output fields
# (only the original URL here), 'collapse' de-duplicates rows by urlkey and
# 'limit' caps the number of rows returned -- confirm against the API docs.
WAYBACK_PARAMS = {
'fl': 'original',
'collapse': 'urlkey',
'limit': '10000'
}
# Fallback used to decode response lines when the response reports no encoding.
DEFAULT_ENCODING = 'utf-8'


class Wayback(Source):
    """Source that reports URLs archived on web.archive.org for a domain.

    The lookup is best-effort: any ``requests`` failure (connection error,
    HTTP error status, or an error while streaming the body) ends the lookup
    silently, keeping whatever results were already reported.
    """

    def callback(self, domain):
        """Query the Wayback CDX endpoint and report every non-empty line.

        The request uses ``WAYBACK_PARAMS`` plus ``url='*.<domain>'`` and is
        streamed, so lines are passed to ``self.add_result`` as they arrive
        instead of loading the whole body into memory.

        :param domain: domain name to search for in the archive.
        :return: ``None``; results are delivered through ``self.add_result``.
        """
        session = Sessions().get_session()
        try:
            response = session.get(
                WAYBACK_URL,
                params=dict(WAYBACK_PARAMS, url='*.{}'.format(domain)),
                stream=True
            )
        except RequestException:
            # Connection-level failures (DNS, refused connection, timeout...)
            # are handled like HTTP errors: this source yields nothing.
            return
        try:
            response.raise_for_status()
            # The encoding does not change while streaming; resolve it once.
            encoding = response.encoding or DEFAULT_ENCODING
            # filter(bool, ...) skips the empty lines iter_lines() may yield.
            for line in filter(bool, response.iter_lines()):
                if isinstance(line, bytes):
                    line = line.decode(encoding)
                self.add_result(line)
        except RequestException:
            # HTTP error status or a failure in the middle of the stream:
            # keep the results gathered so far and stop.
            return
        finally:
            # With stream=True the connection is only released once the body
            # is fully consumed or the response is closed explicitly.
            response.close()

0 comments on commit cc4e6c9

Please sign in to comment.