#!/usr/bin/env python3
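"""
Simple breadth-first web crawler: starting from a target URL, it requests each
in-scope page, extracts links from the src/href/action attributes of
a/link/script/img/form tags, and sorts the results into pages, static
resources, and out-of-scope URLs.
"""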
import argparse
import urllib.parse
import urllib3
import requests
import queue
import re
from bs4 import BeautifulSoup
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Crawler:
    def __init__(self, url):
        self.url = url
        parts = urllib.parse.urlparse(url)
        if not parts.scheme and not parts.netloc and parts.path:
            # A bare host like "example.com" is parsed entirely into .path,
            # so rebuild it as a proper URL with a default http:// scheme
            self.domain = parts.path
            self.url = parts._replace(scheme="http", netloc=parts.path, path="").geturl()
            self.scheme = "http"
        else:
            self.domain = parts.netloc
            self.scheme = "http" if not parts.scheme else parts.scheme

        self.user_agent = "WebCrawler/1.0"
        self.cookies = {}
        self.proxy = None

        self.queue = queue.Queue()
        self.visited = set()
        self.out_of_scope = set()
        self.resources = set()
        self.pages = set()

    def request(self, url):
        headers = {"User-Agent": self.user_agent}
        kwargs = {"verify": False, "cookies": self.cookies, "headers": headers}
        if self.proxy:
            # requests expects the proxy mapping under the "proxies" keyword
            kwargs["proxies"] = {
                "http": self.proxy,
                "https": self.proxy
            }
        print("requesting:", url)
        return requests.get(url, **kwargs)
    def start(self):
        self.queue.put(self.url)
        while not self.queue.empty():
            url = self.queue.get()
            if url in self.visited:
                continue

            self.visited.add(url)
            res = self.request(url)
            content_type = res.headers.get("Content-Type", "")
            if "text/html" not in content_type.lower():
                continue

            for link in self.collect_urls(res.text):
                parts = urllib.parse.urlparse(link)
                if parts.netloc and parts.netloc != self.domain:
                    self.out_of_scope.add(link)
                else:
                    resources_ext = ["jpg", "jpeg", "gif", "png", "css", "js", "svg", "ico"]
                    # urlparse already separates the query string, so parts.path
                    # can be checked for a resource extension directly
                    if parts.path.rsplit(".", 1)[-1].lower() in resources_ext:
                        self.resources.add(link)
                    else:
                        self.pages.add(link)
                        # Re-queue the link as an absolute in-scope URL without its fragment
                        self.queue.put(parts._replace(netloc=self.domain, scheme=self.scheme, fragment="").geturl())
    @staticmethod
    def collect_urls(page):
        if not isinstance(page, BeautifulSoup):
            page = BeautifulSoup(page, "html.parser")

        urls = set()
        attrs = ["src", "href", "action"]
        tags = ["a", "link", "script", "img", "form"]
        for tag in tags:
            for e in page.find_all(tag):
                for attr in attrs:
                    if e.has_attr(attr):
                        urls.add(e[attr])

        return urls
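

# A minimal programmatic-usage sketch (assumption: driving Crawler directly from
# Python instead of through the CLI below; the target URL and cookie value are
# placeholders):
#
#   crawler = Crawler("http://example.com")
#   crawler.cookies = {"session": "abc123"}
#   crawler.start()
#   print(sorted(crawler.pages))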

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="The target URL to scan, e.g. http://example.com:8080/dir/")
    parser.add_argument("--proxy", help="Proxy to connect through")
    parser.add_argument("--user-agent", help="User-Agent to use")
    parser.add_argument("--cookie", help="Cookies to send", action="append", default=[])
    parser.add_argument("--verbose", "-v", help="Verbose output", action="store_true")
    args = parser.parse_args()

    crawler = Crawler(args.url)
    if args.user_agent:
        crawler.user_agent = args.user_agent
    if args.proxy:
        crawler.proxy = args.proxy

    cookie_pattern = re.compile(r"^([a-zA-Z0-9.%/+_-]+)=([a-zA-Z0-9.%/+_-]*)$")
    for cookie in args.cookie:
        m = cookie_pattern.match(cookie)
        if not m:
            print("[-] Cookie does not match pattern:", cookie)
            print("[-] You might need to URL-encode it")
            exit(1)
        key, value = urllib.parse.unquote(m[1]), urllib.parse.unquote(m[2])
        crawler.cookies[key] = value

    crawler.start()
    results = {
        "Pages": crawler.pages,
        "Resources": crawler.resources,
        "Out of Scope": crawler.out_of_scope
    }
    for name, values in results.items():
        print(f"=== {name} ===")
        print("\n".join(values))