Refactored tests and TorBot app so that tests no longer need to touch #95

Merged: 4 commits, Jul 4, 2018
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,3 +7,7 @@ tests/__pycache__/
modules/__init__.py
.idea/
tests/.ropeproject/
torBot
*.pyc
tests/.pytest_cache
.pytest_cache
13 changes: 3 additions & 10 deletions modules/getemails.py
@@ -1,4 +1,5 @@
from modules.bcolors import Bcolors
from modules.net_utils import get_urls_from_page
from bs4 import BeautifulSoup


@@ -19,19 +20,11 @@ def getMails(soup):

if isinstance(soup, BeautifulSoup):

emails = []
links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if url and 'mailto' in url:
"""Split email address on"""
email_addr = url.split(':')
if len(email_addr) > 1:
emails.append(email_addr[1])
emails = get_urls_from_page(soup, email=True)

"""Pretty print output as below"""
print('')
print(b_colors.OKGREEN + 'Mails Found - ' + b_colors.ENDC + str(len(emails)))
print(b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails)))
print('-------------------------------')

return emails
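For orientation, a minimal sketch of the refactored path, assuming the repo modules are importable from the project root; the html snippet and address are illustrative, not from this PR:

```python
from bs4 import BeautifulSoup

from modules.getemails import getMails

# A hypothetical page with a single mailto link; the soup construction
# mirrors what pagereader does in the real app.
html = '<a href="mailto:admin@example.onion">contact</a>'
soup = BeautifulSoup(html, 'html.parser')

# getMails now delegates extraction to net_utils.get_urls_from_page
emails = getMails(soup)  # prints "Mails Found - 1" and a separator row
assert emails == ['admin@example.onion']
```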
92 changes: 5 additions & 87 deletions modules/getweblinks.py
@@ -1,78 +1,7 @@
import re
import requests
import tldextract

from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
from requests.exceptions import ConnectionError, HTTPError


def valid_url(url, extensions=None):
"""Checks for any valid url using regular expression matching

Matches all possible url patterns with the url that is passed and
returns True if it is a url and returns False if it is not.

Args:
url: string representing url to be checked

Returns:
bool: True if valid url format and False if not
"""
pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)"
regex = re.compile(pattern)
if not extensions:
if regex.match(url):
return True
return False

parts = tldextract.extract(url)
valid_sites = list()
for ext in extensions:
if regex.match(url) and '.'+parts.suffix in ext:
valid_sites.append(url)
return valid_sites


def valid_onion_url(url):
"""Checks for valid onion url using regular expression matching

Only matches onion urls

Args:
url: string representing url to be checked

Returns:
bool: True if valid onion url format, False if not
"""
pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)"
regex = re.compile(pattern)
if regex.match(url):
return True
return False


def is_link_alive(link):
    """Checks whether a link is alive

    Uses a head request because it uses less bandwidth than get; the timeout
    is set to 10 seconds, after which the link is declared dead.

    Args:
        link: link to be tested

    Returns:
        bool: True if the link responds successfully, False if not
    """

try:
resp = requests.head(link, timeout=10)
resp.raise_for_status()
return True
except (ConnectionError, HTTPError):
return False


def add_green(link):
@@ -98,28 +27,17 @@ def get_links(soup, ext=False, live=False):
"""
b_colors = Bcolors()
if isinstance(soup, BeautifulSoup):
websites = []

links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if ext:
if url and valid_url(url, ext):
websites.append(url)
else:
if url and valid_onion_url(url):
websites.append(url)

websites = get_urls_from_page(soup, extension=ext)
"""Pretty print output as below"""
print(''.join((b_colors.OKGREEN,
'Websites Found - ', b_colors.ENDC, str(len(websites)))))
print('------------------------------------')

if live:
for link in websites:
if is_link_alive(link):
if get_url_status(link) != 0:
coloredlink = add_green(link)
page = pagereader.read_page(link)
page = pagereader.read_first_page(link)[0]
if page is not None and page.title is not None:
print_row(coloredlink, page.title.string)
else:
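Similarly, a hedged usage sketch of the slimmed-down get_links; the page and onion address are made up, and live=True would need a reachable Tor proxy to report anything alive:

```python
from bs4 import BeautifulSoup

from modules.getweblinks import get_links

# A hypothetical page with one onion link; with the default ext=False,
# only urls passing net_utils.is_onion_url are collected.
html = '<a href="http://examplesite.onion/">hidden service</a>'
soup = BeautifulSoup(html, 'html.parser')

get_links(soup)             # prints "Websites Found - 1" plus a separator
get_links(soup, live=True)  # additionally probes each url via get_url_status
```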
73 changes: 73 additions & 0 deletions modules/net_utils.py
@@ -0,0 +1,73 @@
import re
import requests

from requests.exceptions import ConnectionError, HTTPError


def check_connection(url):
    """Returns 1 if the url is reachable, 0 if not."""
    print("Attempting to connect to {site}".format(site=url))
if get_url_status(url) != 0:
return 1

return 0


def get_url_status(url, headers=False):
"""
Uses head request because it uses less bandwith than get and timeout is
set to 10 seconds and then link is automatically declared as dead.

Args:
link: link to be tested
colors: object containing colors for link

Return:
something?: either an int or return value of the connection object's
get request if successful & zero is failure
"""
try:
if headers:
resp = requests.get(url, headers=headers)
else:
resp = requests.get(url)
resp.raise_for_status()
return resp
except (ConnectionError, HTTPError):
return 0


def is_url(url):
    pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.([a-zA-Z]+)(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0


def is_onion_url(url):
    pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.onion/(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0


def get_urls_from_page(page, email=False, extension=False):
    """Collects hrefs from a page's anchor tags: any valid url when
    extension is True, mailto addresses when email is True, and onion
    urls otherwise."""
urls = []
anchors_on_page = page.find_all('a')
for anchor_tag in anchors_on_page:
url = anchor_tag.get('href')
if extension:
if url and is_url(url) == 1:
urls.append(url)
elif email:
if url and 'mailto' in url:
email_addr = url.split(':')
if len(email_addr) > 1:
urls.append(email_addr[1])
else:
if url and is_onion_url(url) == 1:
urls.append(url)

return urls
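A quick sketch of how the new net_utils helpers behave, using illustrative urls; the sentinel convention (response object on success, 0 on failure) is what lets the call sites above drop their try/except blocks:

```python
from modules.net_utils import get_url_status, is_onion_url, is_url

assert is_url('https://www.example.com/page') == 1
assert is_onion_url('http://examplesite.onion/') == 1
assert is_onion_url('https://www.example.com/page') == 0

# Callers test `!= 0` instead of catching ConnectionError/HTTPError
response = get_url_status('https://www.example.com')
if response != 0:
    print(response.status_code)
```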
113 changes: 35 additions & 78 deletions modules/pagereader.py
@@ -1,8 +1,6 @@
import requests

from bs4 import BeautifulSoup
from modules.net_utils import get_url_status
from modules.bcolors import Bcolors
from requests.exceptions import ConnectionError, HTTPError, MissingSchema
from sys import exit


@@ -15,83 +13,42 @@ def read_first_page(site):
attempts_left = 3
err = " "
while attempts_left:
try:
if attempts_left == 3:
print(next(connection_msg(site)))
response = requests.get(site, headers=headers)
print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if attempts_left == 2:
print(next(connection_msg('https://' + site)))
response = requests.get('https://' + site, headers=headers)
print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if attempts_left == 1:
print(next(connection_msg('http://' + site)))
response = requests.get('http://' + site, headers=headers)
print("Connection successful.")
if attempts_left == 3:
print(next(connection_msg(site)))
response = get_url_status(site, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

except (HTTPError, MissingSchema, ConnectionError) as e:
attempts_left -= 1
err = e

if isinstance(err, HTTPError):
print("There has been an HTTP error after three attempts.")
exit(1)
if isinstance(err, ConnectionError):
print("Got ConnectionError after three attempts... ",
"Please check if the TOR service is running or not.")
exit(1)


def read_page(site):
headers = {'User-Agent': 'TorBot - Onion crawler | www.github.com/DedSecInside/TorBot'}
attempts_left = 3
err = " "
while attempts_left:
try:
if attempts_left == 3:
# print(next(connection_msg(site)))
response = requests.get(site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if attempts_left == 2:
# print(next(connection_msg('https://'+site)))
response = requests.get('https://' + site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if attempts_left == 1:
# print(next(connection_msg('http://'+site)))
response = requests.get('http://' + site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

except (HTTPError, MissingSchema, ConnectionError) as e:
attempts_left -= 1
err = e

if isinstance(err, HTTPError):
print("There has been an HTTP error after three attempts.")
exit(1)
if isinstance(err, ConnectionError):
print("There has been a connection error after three attempts.")
exit(1)

else:
attempts_left -= 1
continue

if attempts_left == 2:
https_url = 'https://'+site
print(next(connection_msg(https_url)))
response = get_url_status(https_url, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
else:
attempts_left -= 1
continue

if attempts_left == 1:
http_url = 'http://'+site
print(next(connection_msg(http_url)))
response = get_url_status(http_url, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
else:
attempts_left -= 1
continue

if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

def get_ip():
"""Returns users tor ip address
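One thing worth noting: the final `if not attempts_left` check sits inside `while attempts_left:`, so it can never fire; after three failures the loop simply falls through. A compact sketch of the same fallback idea with the exit restored (fetch_with_fallback is a hypothetical condensation, not the PR's code):

```python
from bs4 import BeautifulSoup

from modules.net_utils import get_url_status

def fetch_with_fallback(site, headers):
    # Try the url as given, then with https:// and http:// prefixes,
    # mirroring the three attempts_left branches above.
    for candidate in (site, 'https://' + site, 'http://' + site):
        response = get_url_status(candidate, headers)
        if response != 0:
            return BeautifulSoup(response.text, 'html.parser'), response
    raise SystemExit("There has been an error while attempting to "
                     "connect to {site}.".format(site=site))
```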
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,4 +4,4 @@ termcolor==1.1.0
requests==2.18.4
requests_mock==1.4.0
tldextract==2.2.0

yattag==1.10.0