From 82d36bba9741a1a3458ef4f238eba646d77a17f5 Mon Sep 17 00:00:00 2001 From: KingAkeem Date: Sun, 26 Aug 2018 16:14:22 -0400 Subject: [PATCH 1/9] Refactoring --- modules/getemails.py | 2 +- modules/getweblinks.py | 175 ++++++++++-------------------------- modules/net_utils.py | 62 ------------- modules/pagereader.py | 42 ++++++++- modules/utils.py | 200 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 285 insertions(+), 196 deletions(-) delete mode 100644 modules/net_utils.py create mode 100644 modules/utils.py diff --git a/modules/getemails.py b/modules/getemails.py index bd9f84ca..0ed30643 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,5 +1,5 @@ from modules.bcolors import Bcolors -from modules.net_utils import get_urls_from_page +from modules.pagereader import get_urls_from_page from bs4 import BeautifulSoup diff --git a/modules/getweblinks.py b/modules/getweblinks.py index de95425a..4db3057e 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,72 +1,64 @@ -import requests - -from requests import HTTPError, ConnectionError -from modules.net_utils import get_urls_from_page, get_url_status from bs4 import BeautifulSoup from modules.bcolors import Bcolors -from threading import Thread -from queue import Queue +from modules.utils import is_url, is_onion_url, bfs_urls, queue_tasks, display_link -def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None): +def get_urls_from_page(page_soup, email=False, extension=False): """ - Traverses links passed using Breadth First Search. You can specify stop depth - or specify a target to look for. The depth argument is used for recursion + Searches for urls on page using the anchor tag and href attribute, + also searchs for emails using 'mailto' if specified. 
Args: - links (list): list of urls to traverse - ext (string): string representing extension to use for URLs - depth (int): used for recursion - stop_depth (int): stops traversing at this depth if specified - targetLink (string): stops at this link if specified + page (bs4.BeauitulSoup): html soup to search + email (bool): flag whether to collect emails as well + extension (bool): flag whether to use additional extensions Returns: - depth (int): depth stopped at + urls (list): urls found on page """ + if not isinstance(page_soup, BeautifulSoup): + raise(Exception("First arg must be bs4.BeautifulSoup object")) + + urls = [] + anchors_on_page = page_soup.find_all('a') + for anchor_tag in anchors_on_page: + url = anchor_tag.get('href') + if extension: + if url and is_url(url) == 1: + urls.append(url) + elif email: + if url and 'mailto' in url: + email_addr = url.split(':') + if len(email_addr) > 1: + urls.append(email_addr[1]) + else: + if url and is_onion_url(url) == 1: + urls.append(url) - if depth == stop_depth: - return depth - - toVisit = list() - for link in links: - if targetLink == link and targetLink: - return depth - try: - resp = requests.get(link) - except (HTTPError, ConnectionError): - continue - soup = BeautifulSoup(resp.text, 'html.parser') - websitesToVisit = get_urls_from_page(soup, extension=ext) - for site in websitesToVisit: - toVisit.append(site) - depth += 1 - if stop_depth and targetLink: - traverse_links(toVisit, ext, depth, stop_depth, targetLink) - elif stop_depth: - traverse_links(toVisit, ext, depth, stop_depth=stop_depth) - elif targetLink: - traverse_links(toVisit, ext, depth, targetLink=targetLink) - else: - traverse_links(toVisit, ext, depth) - + return urls -def search_page(html_text, ext, stop=None): - soup = BeautifulSoup(html_text, 'html.parser') - links = get_urls_from_page(soup, extension=ext) - if stop: - traverse_links(links, ext, stop=stop) - else: - traverse_links(links, ext) +def search_page(html, ext, stop_depth=None): + """ + Takes in a pages HTML and searches the links on the page using + BFS. -def add_green(link): - colors = Bcolors() - return '\t' + colors.OKGREEN + link + colors.ENDC + Args: + html (str): HTML with links to search + add_exts (str): additional extension + stop_depth (int): The depth at which to stop + Returns: + links_found (list): links found on page and associated pages + """ + soup = BeautifulSoup(html, 'html.parser') + links = get_urls_from_page(soup, extension=ext) + if stop_depth: + links_found = bfs_urls(links, ext, stop_depth=stop_depth) + else: + links_found = bfs_urls(links, ext) -def add_red(link): - colors = Bcolors() - return '\t' + colors.On_Red + link + colors.ENDC + return links_found def get_links(soup, ext=False, live=False): @@ -96,80 +88,3 @@ def get_links(soup, ext=False, live=False): else: raise(Exception('Method parameter is not of instance BeautifulSoup')) - - -def display_link(link): - """ - Prints the status of a link based on if it can be reached using a GET - request. Link is printed with a color based on status. - Green for a reachable status code and red for not reachable. 
- - Args: - link (str): url to be printed - Returns: - None - """ - resp = get_url_status(link) - if resp != 0: - title = BeautifulSoup(resp.text, 'html.parser').title.string - coloredlink = add_green(link) - print_row(coloredlink, title) - else: - coloredlink = add_red(link) - print_row(coloredlink, "Not found") - - -def execute_tasks(q, task_func, tasks_args=tuple()): - """ - Executes tasks inside of queue using function and arguments passed - inside of threads - - Args: - q (queue.Queue): contains tasks - task_func (function): function to be executed on tasks and args - task_args (tuple): contains arguments for function - Returns: - None - """ - while True: - task = q.get() - if tasks_args: - task_func(task, tasks_args) - else: - task_func(task) - q.task_done() - - -def queue_tasks(tasks, task_func, tasks_args=tuple()): - """ - Starts threads with tasks and queue, then queues tasks and spawned threads - begin to pull tasks off queue to execute - - Args: - tasks (list): lists of values that you'd like to operate on - task_func (function): function that you would like to use - tasks_args (tuple): arguments for function - Returns: - None - """ - q = Queue(len(tasks)*2) - for _ in tasks: - if tasks_args: - if isinstance(tasks_args, tuple): - t = Thread(target=execute_tasks, args=(q, task_func, tasks_args)) - t.daemon = True - t.start() - else: - raise(Exception('Function arguments must be passed in the form of a tuple.')) - else: - t = Thread(target=execute_tasks, args=(q, task_func)) - t.daemon = True - t.start() - - for task in tasks: - q.put(task) - q.join() - - -def print_row(url, description): - print("%-80s %-30s" % (url, description)) diff --git a/modules/net_utils.py b/modules/net_utils.py deleted file mode 100644 index 09e9f43d..00000000 --- a/modules/net_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -import re -import requests - - -def get_url_status(url, headers=False): - """ - Uses head request because it uses less bandwith than get and timeout is - set to 10 seconds and then link is automatically declared as dead. 
- - Args: - link: link to be tested - colors: object containing colors for link - - Return: - something?: either an int or return value of the connection object's - get request if successful & zero is failure - """ - try: - if headers: - resp = requests.get(url, headers=headers) - else: - resp = requests.get(url) - resp.raise_for_status() - return resp - except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError): - return 0 - - -def is_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def is_onion_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def get_urls_from_page(page, email=False, extension=False): - urls = [] - anchors_on_page = page.find_all('a') - for anchor_tag in anchors_on_page: - url = anchor_tag.get('href') - if extension: - if url and is_url(url) == 1: - urls.append(url) - elif email: - if url and 'mailto' in url: - email_addr = url.split(':') - if len(email_addr) > 1: - urls.append(email_addr[1]) - else: - if url and is_onion_url(url) == 1: - urls.append(url) - - return urls diff --git a/modules/pagereader.py b/modules/pagereader.py index 99e04bc7..51242412 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,9 +1,44 @@ import sys from bs4 import BeautifulSoup -from modules.net_utils import get_url_status +from modules.utils import get_url_status from modules.bcolors import Bcolors +def display_url(url): + """ + Prints the status of a url based on if it can be reached using a GET + request. url is printed with a color based on status. + Green for a reachable status code and red for not reachable. + + Args: + url (str): url to be printed + Returns: + None + """ + resp = get_url_status(url) + if resp != 0: + title = BeautifulSoup(resp.text, 'html.parser').title.string + coloredurl = add_green(url) + print_row(coloredurl, title) + else: + coloredurl = add_red(url) + print_row(coloredurl, "Not found") + + +def print_row(url, description): + print("%-80s %-30s" % (url, description)) + + +def add_green(link): + colors = Bcolors() + return '\t' + colors.OKGREEN + link + colors.ENDC + + +def add_red(link): + colors = Bcolors() + return '\t' + colors.On_Red + link + colors.ENDC + + def connection_msg(site): yield "Attempting to connect to {site}".format(site=site) @@ -61,5 +96,6 @@ def get_ip(): page = read_first_page('https://check.torproject.org/')[0] pg = page.find('strong') ip_addr = pg.renderContents() - - return b_colors.WARNING + b_colors.BOLD + ip_addr.decode("utf-8") + b_colors.ENDC + COLOR_BEGIN = b_colors.WARNING + b_colors.BOLD + COLOR_END = b_colors.ENDC + return COLOR_BEGIN + ip_addr.decode("utf-8") + COLOR_END diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 00000000..1c8ffd55 --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,200 @@ +import re +import requests + +from bs4 import BeautifulSoup +from requests.exceptions import HTTPError, ConnectionError +from queue import Queue +from threading import Thread +from modules.getweblinks import get_urls_from_page + +""" + +ALGORITHM UTILITY FUNCTIONS + +""" + + +def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): + """ + Traverses urls passed using Breadth First Search. You can specify stop + depth or specify a target to look for. The rec_depth argument is used + for recursion. 
+ + *NOTE: This function uses a GET request for each url found, this can + be very expensive so avoid if possible try to acquire the urls to + be traversed and use bfs function. + + Args: + urls (list): urls to traverse + add_exts (str): additional extensions to use + rec_depth (int): used for recursion + stop_depth (int): stops traversing at this depth if specified + target_url (str): stops at this url if specified + + Returns: + rec_depth (int): depth stopped at + """ + + if rec_depth == stop_depth: + return rec_depth + + urls_to_visit = list() + for url in urls: + if target_url == url and target_url: + return rec_depth + try: + resp = requests.get(url) + except (HTTPError, ConnectionError): + continue + soup = BeautifulSoup(resp.text, 'html.parser') + page_urls = get_urls_from_page(soup, extension=add_exts) + for url in page_urls: + urls_to_visit.append(url) + rec_depth += 1 + if stop_depth and target_url: + bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) + elif stop_depth: + bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) + elif target_url: + bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) + else: + bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) + + +def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): + """ + Traverses nodes using Breadth First Search. You can specify stop + depth or specify a target to look for. The rec_depth argument is used + for recursion. + + Args: + nodes (list): objects to traverse + target_node (object): object being searched for + rec_depth (int): used for recursion + stop_depth (int): stops traversing at this depth if specified + + Returns: + rec_depth (int): depth stopped at + """ + + if rec_depth == stop_depth: + return rec_depth + + adjacent_nodes = list() + # Checks that nodes is a list or has a Visit method + if not isinstance(nodes, list) and not hasattr(nodes, 'Visit', False): + raise(Exception('nodes must be a list')) + + for node in nodes: + if target_node == node and target_node: + return rec_depth + node.Visit() + adjacent_nodes.append(node) + rec_depth += 1 + if target_node and not stop_depth: + bfs(adjacent_nodes, target_node, rec_depth) + elif not target_node and stop_depth: + bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) + elif target_node and stop_depth: + bfs(adjacent_nodes, target_node, rec_depth, stop_depth) + else: + bfs(adjacent_nodes, rec_depth) + + +def exec_tasks(q, task_func, tasks_args=tuple()): + """ + Executes tasks inside of queue using function and arguments passed + inside of threads + + Args: + q (queue.Queue): contains tasks + task_func (function): function to be executed on tasks and args + task_args (tuple): contains arguments for function + Returns: + None + """ + while True: + task = q.get() + if tasks_args: + task_func(task, tasks_args) + else: + task_func(task) + q.task_done() + + +def queue_tasks(tasks, task_func, tasks_args=tuple()): + """ + Starts threads with tasks and queue, then queues tasks and spawned + threads begin to pull tasks off queue to execute + + Args: + tasks (list): lists of values that you'd like to operate on + task_func (function): function that you would like to use + tasks_args (tuple): arguments for function + Returns: + None + """ + q = Queue(len(tasks)*2) + for _ in tasks: + if tasks_args: + if isinstance(tasks_args, tuple): + t = Thread(target=exec_tasks, args=(q, task_func, tasks_args)) + t.daemon = True + t.start() + else: + raise(Exception('Arguments must be in the form of a tuple.')) + else: 
+ t = Thread(target=exec_tasks, args=(q, task_func)) + t.daemon = True + t.start() + + for task in tasks: + q.put(task) + q.join() + + +""" + +Networking functions + +""" + + +def get_url_status(url, headers=False): + """ + Uses GET request to check if website exists + + *NOTE: May look into changing this to HEAD requests to improve perf + + Args: + url (str): url to be tested + + Return: + something? (int/Response object): return value of the connection + object's GET request if successful & zero upon failure + """ + try: + if headers: + resp = requests.get(url, headers=headers) + else: + resp = requests.get(url) + resp.raise_for_status() + return resp + except (ConnectionError, HTTPError): + return 0 + + +def is_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 + + +def is_onion_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 From 69b237451b68ad5ded84da9e4d8b37f31992f23e Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 07:22:16 -0400 Subject: [PATCH 2/9] Fixing PyLint --- modules/utils.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/utils.py b/modules/utils.py index 1c8ffd55..84b3f34e 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -7,12 +7,7 @@ from threading import Thread from modules.getweblinks import get_urls_from_page -""" - -ALGORITHM UTILITY FUNCTIONS - -""" - +# ALGORITHM UTILITY FUNCTIONS def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): """ @@ -153,11 +148,8 @@ def queue_tasks(tasks, task_func, tasks_args=tuple()): q.join() -""" - -Networking functions +# Networking functions -""" def get_url_status(url, headers=False): From 461545527aefc6d0dd3b2cab684c8b5facd1d65c Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 07:59:15 -0400 Subject: [PATCH 3/9] Fixing imports --- modules/getemails.py | 8 ++++---- modules/getweblinks.py | 24 ++++++++++++++++++++---- modules/utils.py | 20 ++------------------ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/modules/getemails.py b/modules/getemails.py index 0ed30643..e1299ff8 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,6 +1,7 @@ -from modules.bcolors import Bcolors -from modules.pagereader import get_urls_from_page +import modules.getweblinks as getweblinks + from bs4 import BeautifulSoup +from modules.bcolors import Bcolors def getMails(soup): @@ -19,8 +20,7 @@ def getMails(soup): b_colors = Bcolors() if isinstance(type(soup), type(BeautifulSoup)): - - emails = get_urls_from_page(soup, email=True) + emails = getweblinks.get_urls_from_page(soup, email=True) # Pretty print output as below print('') diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 4db3057e..57650f17 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,7 +1,23 @@ +import re +import modules.utils + from bs4 import BeautifulSoup from modules.bcolors import Bcolors -from modules.utils import is_url, is_onion_url, bfs_urls, queue_tasks, display_link +def is_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 + + +def is_onion_url(url): + pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" + regex = re.compile(pattern) + if regex.match(url): + return 1 + return 0 def 
get_urls_from_page(page_soup, email=False, extension=False): """ @@ -54,9 +70,9 @@ def search_page(html, ext, stop_depth=None): soup = BeautifulSoup(html, 'html.parser') links = get_urls_from_page(soup, extension=ext) if stop_depth: - links_found = bfs_urls(links, ext, stop_depth=stop_depth) + links_found = utils.bfs_urls(links, ext, stop_depth=stop_depth) else: - links_found = bfs_urls(links, ext) + links_found = utils.bfs_urls(links, ext) return links_found @@ -83,7 +99,7 @@ def get_links(soup, ext=False, live=False): print('------------------------------------') if live: - queue_tasks(websites, display_link) + utils.queue_tasks(websites, utils.display_link) return websites else: diff --git a/modules/utils.py b/modules/utils.py index 84b3f34e..d60e96a8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,11 +1,11 @@ import re import requests +import modules.getweblinks from bs4 import BeautifulSoup from requests.exceptions import HTTPError, ConnectionError from queue import Queue from threading import Thread -from modules.getweblinks import get_urls_from_page # ALGORITHM UTILITY FUNCTIONS @@ -42,7 +42,7 @@ def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): except (HTTPError, ConnectionError): continue soup = BeautifulSoup(resp.text, 'html.parser') - page_urls = get_urls_from_page(soup, extension=add_exts) + page_urls = getweblinks.get_urls_from_page(soup, extension=add_exts) for url in page_urls: urls_to_visit.append(url) rec_depth += 1 @@ -174,19 +174,3 @@ def get_url_status(url, headers=False): return resp except (ConnectionError, HTTPError): return 0 - - -def is_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 - - -def is_onion_url(url): - pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" - regex = re.compile(pattern) - if regex.match(url): - return 1 - return 0 From 28a3133780a706881d7239b304a1d4c77f385e0d Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:06:08 -0400 Subject: [PATCH 4/9] A lot more refactoring --- modules/bcolors.py | 14 ------- modules/colors.py | 51 +++++++++++++++++++++++++ modules/getemails.py | 21 +++++----- modules/getweblinks.py | 76 ++++++++++++++++++++++++------------- modules/pagereader.py | 79 +++++++++++++++++++++----------------- modules/utils.py | 82 ++++++++++++++++++++-------------------- tests/test_getemails.py | 4 +- tests/test_pagereader.py | 2 +- torBot.py | 41 ++++++++++++-------- 9 files changed, 226 insertions(+), 144 deletions(-) delete mode 100644 modules/bcolors.py create mode 100644 modules/colors.py diff --git a/modules/bcolors.py b/modules/bcolors.py deleted file mode 100644 index 78b05842..00000000 --- a/modules/bcolors.py +++ /dev/null @@ -1,14 +0,0 @@ -class Bcolors: - - def __init__(self): - self.HEADER = '\033[95m' - self.OKBLUE = '\033[94m' - self.OKGREEN = '\033[92m' - self.WARNING = '\033[93m' - self.FAIL = '\033[91m' - self.ENDC = '\033[0m' - self.BOLD = '\033[1m' - self.UNDERLINE = '\033[4m' - self.WHITE = '\033[97m' - self.On_Black = '\033[40m' - self.On_Red = '\033[41m' diff --git a/modules/colors.py b/modules/colors.py new file mode 100644 index 00000000..a7f9e5cb --- /dev/null +++ b/modules/colors.py @@ -0,0 +1,51 @@ +""" +Module containing class with colors +""" + +class Colors: + """ + Class that contains colors used for TorBot in terminal and a method + that adds colr to a string + + Attributes: + _colors (dict): A map containing all of the color codes 
needed + """ + def __init__(self): + self._colors = { + 'white': "\033[1;37m", + 'yellow': "\033[1;33m", + 'green': "\033[1;32m", + 'blue': "\033[1;34m", + 'cyan': "\033[1;36m", + 'red': "\033[1;31m", + 'magenta': "\033[1;35m", + 'black': "\033[1;30m", + 'darkwhite': "\033[0;37m", + 'darkyellow': "\033[0;33m", + 'darkgreen': "\033[0;32m", + 'darkblue': "\033[0;34m", + 'darkcyan': "\033[0;36m", + 'darkred': "\033[0;31m", + 'darkmagenta':"\033[0;35m", + 'darkblack': "\033[0;30m", + 'end': "\033[0;0m" + } + + def add(self, string, color): + """ + Method that adds color to a given string + + Args: + string (str): string to add color to + color (str): string of color to add + """ + return self.get(color) + string + self.get('end') + + def get(self, color): + """ + Method that returns the color code of the given color string + + Args: + color (str): color code to be returned + """ + return self._colors[color] diff --git a/modules/getemails.py b/modules/getemails.py index e1299ff8..150c7b3f 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,10 +1,14 @@ -import modules.getweblinks as getweblinks - +""" +Module returns emails found on webpage +""" from bs4 import BeautifulSoup -from modules.bcolors import Bcolors +import modules.getweblinks +from modules.colors import Colors + +COLOR = Colors() -def getMails(soup): +def get_mails(soup): """ Searches for tags for links then checks if link contains the substring 'mailto' indicating that it's an email. If it is determined @@ -17,17 +21,16 @@ def getMails(soup): Returns: emails: list of email IDs """ - b_colors = Bcolors() if isinstance(type(soup), type(BeautifulSoup)): - emails = getweblinks.get_urls_from_page(soup, email=True) + emails = modules.getweblinks.get_urls_from_page(soup, email=True) # Pretty print output as below print('') - print(b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails))) + success_string = 'Mails Found - ' + str(len(emails)) + print(COLOR.add(success_string, 'green')) print('-------------------------------') return emails - else: - raise ValueError('Method parameter is not of instance BeautifulSoup') + raise ValueError('Method parameter is not of instance BeautifulSoup') diff --git a/modules/getweblinks.py b/modules/getweblinks.py index 57650f17..c0c69e5d 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,10 +1,26 @@ +""" +Module used to interact with a pages urls +""" import re -import modules.utils from bs4 import BeautifulSoup -from modules.bcolors import Bcolors + +import modules.utils +import modules.pagereader + +from modules.colors import Colors + +COLOR = Colors() def is_url(url): + """ + Returns an integer representing validity of url syntax + + Args: + url (str): url to be verified + Returns + (int): integer representing if url is a valid format + """ pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)" regex = re.compile(pattern) if regex.match(url): @@ -13,6 +29,14 @@ def is_url(url): def is_onion_url(url): + """ + Returns an integer representing validity of an onion url syntax + + Args: + url (str): url to be verified + Returns + (int): integer representing if url is a valid format + """ pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)" regex = re.compile(pattern) if regex.match(url): @@ -21,19 +45,19 @@ def is_onion_url(url): def get_urls_from_page(page_soup, email=False, extension=False): """ - Searches for urls on page using the anchor tag and href attribute, - also searchs for emails using 'mailto' if specified. 
+ Searches for urls on page using the anchor tag and href attribute, + also searchs for emails using 'mailto' if specified. - Args: - page (bs4.BeauitulSoup): html soup to search - email (bool): flag whether to collect emails as well - extension (bool): flag whether to use additional extensions + Args: + page (bs4.BeauitulSoup): html soup to search + email (bool): flag whether to collect emails as well + extension (bool): flag whether to use additional extensions - Returns: - urls (list): urls found on page + Returns: + urls (list): urls found on page """ if not isinstance(page_soup, BeautifulSoup): - raise(Exception("First arg must be bs4.BeautifulSoup object")) + raise Exception("First arg must be bs4.BeautifulSoup object") urls = [] anchors_on_page = page_soup.find_all('a') @@ -70,37 +94,35 @@ def search_page(html, ext, stop_depth=None): soup = BeautifulSoup(html, 'html.parser') links = get_urls_from_page(soup, extension=ext) if stop_depth: - links_found = utils.bfs_urls(links, ext, stop_depth=stop_depth) + links_found = modules.utils.bfs_urls(links, ext, stop_depth=stop_depth) else: - links_found = utils.bfs_urls(links, ext) + links_found = modules.utils.bfs_urls(links, ext) return links_found def get_links(soup, ext=False, live=False): """ - Returns list of links listed on the webpage of the soup passed. If live - is set to true then it will also print the status of each of the links - and setting ext to an actual extension such as '.com' will allow those - extensions to be recognized as valid urls and not just '.tor'. + Returns list of links listed on the webpage of the soup passed. If live + is set to true then it will also print the status of each of the links + and setting ext to an actual extension such as '.com' will allow those + extensions to be recognized as valid urls and not just '.tor'. - Args: - soup (bs4.BeautifulSoup): webpage to be searched for links. + Args: + soup (bs4.BeautifulSoup): webpage to be searched for links. 
- Returns: - websites (list(str)): List of websites that were found + Returns: + websites (list(str)): List of websites that were found """ - b_colors = Bcolors() if isinstance(soup, BeautifulSoup): websites = get_urls_from_page(soup, extension=ext) # Pretty print output as below - print(''.join((b_colors.OKGREEN, - 'Websites Found - ', b_colors.ENDC, str(len(websites))))) + success_string = 'Websites Found - ' + str(len(websites)) + print(COLOR.add(success_string, 'green')) print('------------------------------------') if live: - utils.queue_tasks(websites, utils.display_link) + modules.utils.queue_tasks(websites, modules.pagereader.display_url) return websites - else: - raise(Exception('Method parameter is not of instance BeautifulSoup')) + raise Exception('Method parameter is not of instance BeautifulSoup') diff --git a/modules/pagereader.py b/modules/pagereader.py index 51242412..d26d2208 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,8 +1,13 @@ +""" +This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings +""" + import sys from bs4 import BeautifulSoup from modules.utils import get_url_status -from modules.bcolors import Bcolors +from modules.colors import Colors +COLOR = Colors() def display_url(url): """ @@ -18,70 +23,76 @@ def display_url(url): resp = get_url_status(url) if resp != 0: title = BeautifulSoup(resp.text, 'html.parser').title.string - coloredurl = add_green(url) + coloredurl = COLOR.add(url, 'green') print_row(coloredurl, title) else: - coloredurl = add_red(url) + coloredurl = COLOR.add(url, 'red') print_row(coloredurl, "Not found") def print_row(url, description): + """ + Prints row in specified format + """ print("%-80s %-30s" % (url, description)) -def add_green(link): - colors = Bcolors() - return '\t' + colors.OKGREEN + link + colors.ENDC - - -def add_red(link): - colors = Bcolors() - return '\t' + colors.On_Red + link + colors.ENDC - +def connection_msg(url): + """ + Generator used to yield message while waiting for response + """ + yield "Attempting to connect to {url}".format(url=url) -def connection_msg(site): - yield "Attempting to connect to {site}".format(site=site) +def read_page(url): + """ + Attempts to connect to url and returns the HTML from page -def read_first_page(site): + Args: + url (str): url of website to be read + Returns: + page (str): html from page + response (int): indicator of success + """ headers = {'User-Agent': 'XXXX-XXXXX-XXXX'} attempts_left = 3 err = " " while attempts_left: if attempts_left == 3: - response = get_url_status(site, headers) + print(next(connection_msg(url))) + response = get_url_status(url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if attempts_left == 2: - https_url = 'https://' + site + https_url = 'https://' + url print(next(connection_msg(https_url))) response = get_url_status(https_url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if attempts_left == 1: - http_url = 'http://' + site + http_url = 'http://' + url print(next(connection_msg(http_url))) response = get_url_status(http_url, headers) if response != 0: page = BeautifulSoup(response.text, 'html.parser') return page, response - else: - attempts_left -= 1 - continue + + attempts_left -= 1 + continue if not attempts_left: msg = 
''.join(("There has been an {err} while attempting to ", - "connect to {site}.")).format(err=err, site=site) + "connect to {url}.")).format(err=err, url=url) sys.exit(msg) @@ -92,10 +103,8 @@ def get_ip(): displays your IP address which we scape and return """ - b_colors = Bcolors() - page = read_first_page('https://check.torproject.org/')[0] - pg = page.find('strong') - ip_addr = pg.renderContents() - COLOR_BEGIN = b_colors.WARNING + b_colors.BOLD - COLOR_END = b_colors.ENDC - return COLOR_BEGIN + ip_addr.decode("utf-8") + COLOR_END + page = read_page('https://check.torproject.org/')[0] + ip_cont = page.find('strong') + ip_addr = ip_cont.renderContents() + ip_string = ip_addr.decode("utf-8") + return COLOR.add(ip_string, 'yellow') diff --git a/modules/utils.py b/modules/utils.py index d60e96a8..c5cee5ec 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,11 +1,13 @@ -import re -import requests -import modules.getweblinks - -from bs4 import BeautifulSoup -from requests.exceptions import HTTPError, ConnectionError +""" +Provides essential utilites for the rest of TorBot app +""" from queue import Queue from threading import Thread +from bs4 import BeautifulSoup +from requests.exceptions import HTTPError + +import requests +import modules.getweblinks # ALGORITHM UTILITY FUNCTIONS @@ -42,18 +44,19 @@ def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None): except (HTTPError, ConnectionError): continue soup = BeautifulSoup(resp.text, 'html.parser') - page_urls = getweblinks.get_urls_from_page(soup, extension=add_exts) - for url in page_urls: - urls_to_visit.append(url) + page_urls = modules.getweblinks.get_urls_from_page(soup, extension=add_exts) + for page_url in page_urls: + urls_to_visit.append(page_url) rec_depth += 1 + if stop_depth and target_url: - bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) - elif stop_depth: - bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) - elif target_url: - bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) - else: - bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) + return bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth, target_url) + if stop_depth: + return bfs_urls(urls_to_visit, add_exts, rec_depth, stop_depth=stop_depth) + if target_url: + return bfs_urls(urls_to_visit, add_exts, rec_depth, target_url=target_url) + + return bfs_urls(urls_to_visit, add_exts, rec_depth=rec_depth) def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): @@ -78,7 +81,7 @@ def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): adjacent_nodes = list() # Checks that nodes is a list or has a Visit method if not isinstance(nodes, list) and not hasattr(nodes, 'Visit', False): - raise(Exception('nodes must be a list')) + raise Exception('nodes must be a list') for node in nodes: if target_node == node and target_node: @@ -86,35 +89,36 @@ def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None): node.Visit() adjacent_nodes.append(node) rec_depth += 1 + if target_node and not stop_depth: - bfs(adjacent_nodes, target_node, rec_depth) - elif not target_node and stop_depth: - bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) - elif target_node and stop_depth: - bfs(adjacent_nodes, target_node, rec_depth, stop_depth) - else: - bfs(adjacent_nodes, rec_depth) + return bfs(adjacent_nodes, target_node, rec_depth) + if not target_node and stop_depth: + return bfs(adjacent_nodes, rec_depth=rec_depth, stop_depth=stop_depth) + if target_node and stop_depth: + 
return bfs(adjacent_nodes, target_node, rec_depth, stop_depth) + return bfs(adjacent_nodes, rec_depth) -def exec_tasks(q, task_func, tasks_args=tuple()): + +def exec_tasks(que, task_func, tasks_args=tuple()): """ Executes tasks inside of queue using function and arguments passed inside of threads Args: - q (queue.Queue): contains tasks + que (queue.Queue): contains tasks task_func (function): function to be executed on tasks and args task_args (tuple): contains arguments for function Returns: None """ while True: - task = q.get() + task = que.get() if tasks_args: task_func(task, tasks_args) else: task_func(task) - q.task_done() + que.task_done() def queue_tasks(tasks, task_func, tasks_args=tuple()): @@ -129,29 +133,27 @@ def queue_tasks(tasks, task_func, tasks_args=tuple()): Returns: None """ - q = Queue(len(tasks)*2) + que = Queue(len(tasks)*2) for _ in tasks: if tasks_args: if isinstance(tasks_args, tuple): - t = Thread(target=exec_tasks, args=(q, task_func, tasks_args)) - t.daemon = True - t.start() + thd = Thread(target=exec_tasks, args=(que, task_func, tasks_args)) + thd.daemon = True + thd.start() else: - raise(Exception('Arguments must be in the form of a tuple.')) + raise Exception('Arguments must be in the form of a tuple.') else: - t = Thread(target=exec_tasks, args=(q, task_func)) - t.daemon = True - t.start() + thd = Thread(target=exec_tasks, args=(que, task_func)) + thd.daemon = True + thd.start() for task in tasks: - q.put(task) - q.join() + que.put(task) + que.join() # Networking functions - - def get_url_status(url, headers=False): """ Uses GET request to check if website exists diff --git a/tests/test_getemails.py b/tests/test_getemails.py index 3306b957..e65a6a38 100644 --- a/tests/test_getemails.py +++ b/tests/test_getemails.py @@ -17,7 +17,7 @@ def test_get_emails_fail(): mock_html = doc.getvalue() mock_soup = BeautifulSoup(mock_html, 'html.parser') - emails = getemails.getMails(mock_soup) + emails = getemails.get_mails(mock_soup) assert emails == [] @@ -36,7 +36,7 @@ def test_get_emails(): mock_html = doc.getvalue() mock_soup = BeautifulSoup(mock_html, 'html.parser') - emails = getemails.getMails(mock_soup) + emails = getemails.get_mails(mock_soup) assert emails == test_emails diff --git a/tests/test_pagereader.py b/tests/test_pagereader.py index 2d38c0f0..0253fdb3 100644 --- a/tests/test_pagereader.py +++ b/tests/test_pagereader.py @@ -33,7 +33,7 @@ def test_read_first_page(): mock_connection.register_uri('GET', test_data[i][0], text=test_data[i][1]) - result = str(pagereader.read_first_page(test_data[i][0])[0]) + result = str(pagereader.read_page(test_data[i][0])[0]) assert result == test_data[i][1] diff --git a/torBot.py b/torBot.py index c7edbc52..42bd73ab 100644 --- a/torBot.py +++ b/torBot.py @@ -1,12 +1,16 @@ +""" +MAIN MODULE +""" import argparse import socket import socks -from modules import (bcolors, getemails, pagereader, getweblinks, updater, +from modules import (colors, getemails, pagereader, getweblinks, updater, info, savefile) # GLOBAL CONSTS LOCALHOST = "127.0.0.1" DEFPORT = 9050 +COLOR = colors.Colors() # TorBot VERSION __VERSION = "1.2" @@ -46,7 +50,7 @@ def getaddrinfo(*args): Last two arguments should be a tuple containing the address and port """ return [(socket.AF_INET, socket.SOCK_STREAM, 6, - '', (args[0], args[1]))] + '', (args[0], args[1]))] socket.getaddrinfo = getaddrinfo @@ -55,11 +59,7 @@ def header(): Prints out header ASCII art """ - b_color = bcolors.Bcolors() - D3DSEC = b_color.FAIL + " D3DSEC " + b_color.WHITE - INS1DE = 
b_color.FAIL + " INS1DE " + b_color.WHITE - - header = r""" + title = r""" __ ____ ____ __ ______ / /_/ __ \/ __ \/ /_ ____/_ __/ / __/ / / / /_/ / __ \/ __ \/ / @@ -71,15 +71,21 @@ def header(): # GitHub : https://github.com/DedsecInside/TorBot # # Help : use -h for help text # ####################################################### - {FAIL} LICENSE: GNU Public License {END}""".format( - D3DSEC=D3DSEC, INS1DE=INS1DE, FAIL=b_color.FAIL, - BOLD=b_color.BOLD, VERSION=__VERSION, END=b_color.ENDC, - On_Black=b_color.On_Black, WHITE=b_color.WHITE - ) - print(header) + {FAIL} LICENSE: GNU Public License {END}""" + + title = title.format( + FAIL=COLOR.get('red'), + VERSION=__VERSION, + END=COLOR.get('end'), + On_Black=COLOR.get('black') + ) + print(title) def get_args(): + """ + Parses user flags passed to TorBot + """ parser = argparse.ArgumentParser() parser.add_argument("-v", "--version", action="store_true", @@ -117,7 +123,10 @@ def get_args(): return parser.parse_args() -def main(conn=False): +def main(): + """ + TorBot's Core + """ args = get_args() connect(args.ip, args.port) link = args.url @@ -136,11 +145,11 @@ def main(conn=False): # additional flag can be set with -u/--url flag if args.url: print("Tor IP Address :", pagereader.get_ip()) - html_content, response = pagereader.read_first_page(link) + html_content, response = pagereader.read_page(link) print("Connection successful.") # -m/--mail if args.mail: - emails = getemails.getMails(html_content) + emails = getemails.get_mails(html_content) print(emails) if args.save: savefile.saveJson('Emails', emails) From 3409666fe9ff0fc4738713fc594317fe8d8cba41 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:09:06 -0400 Subject: [PATCH 5/9] Trying to remove cyclic import error --- modules/colors.py | 1 + modules/getemails.py | 1 + modules/getweblinks.py | 1 + modules/pagereader.py | 1 + modules/utils.py | 1 + 5 files changed, 5 insertions(+) diff --git a/modules/colors.py b/modules/colors.py index a7f9e5cb..4efd3b18 100644 --- a/modules/colors.py +++ b/modules/colors.py @@ -1,3 +1,4 @@ + """ Module containing class with colors """ diff --git a/modules/getemails.py b/modules/getemails.py index 150c7b3f..7e1d5b0b 100644 --- a/modules/getemails.py +++ b/modules/getemails.py @@ -1,3 +1,4 @@ + """ Module returns emails found on webpage """ diff --git a/modules/getweblinks.py b/modules/getweblinks.py index c0c69e5d..9748299c 100644 --- a/modules/getweblinks.py +++ b/modules/getweblinks.py @@ -1,3 +1,4 @@ + """ Module used to interact with a pages urls """ diff --git a/modules/pagereader.py b/modules/pagereader.py index d26d2208..fc78ef84 100644 --- a/modules/pagereader.py +++ b/modules/pagereader.py @@ -1,3 +1,4 @@ + """ This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings """ diff --git a/modules/utils.py b/modules/utils.py index c5cee5ec..eb3041e5 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -1,3 +1,4 @@ + """ Provides essential utilites for the rest of TorBot app """ From 9e31c338d8119b7b92bbb6d9c30627906c7c4db4 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:25:47 -0400 Subject: [PATCH 6/9] Updating requirements --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0c415d7b..47377e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,4 @@ PySocks==1.6.7 termcolor==1.1.0 requests==2.18.4 requests_mock==1.4.0 -tldextract==2.2.0 yattag==1.10.0 -python-dotenv==0.9.1 From 
f32fe1272cc7630d3c5d5ab7f1e3108b9f454d96 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:27:34 -0400 Subject: [PATCH 7/9] Updating README --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index e1c8601a..f3f35a83 100755 --- a/README.md +++ b/README.md @@ -72,8 +72,6 @@ Contributor name will be updated to the below list. :D - PySocks - termcolor - requests -- python-dotenv -- tldextract - requests_mock - yattag From b2f9967d354b0218ca6d41019d498106033b00b6 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:29:27 -0400 Subject: [PATCH 8/9] Adding pyinstaller to requirements and to install script --- install.sh | 2 ++ requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/install.sh b/install.sh index cfaaaaf1..80ea9db6 100755 --- a/install.sh +++ b/install.sh @@ -8,6 +8,8 @@ go get golang.org/x/net/html mkdir -p tmp_build mkdir -p tmp_dist +pip install pyinstaller + # Creates executable file and sends dependences to the recently created directories pyinstaller --onefile --workpath ./tmp_build --distpath ./tmp_dist torBot.py diff --git a/requirements.txt b/requirements.txt index 47377e35..a79c45d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ termcolor==1.1.0 requests==2.18.4 requests_mock==1.4.0 yattag==1.10.0 +pyinstaller==3.4.0 From 551192f6278952de414d257790d6cc045f87e9c5 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Fri, 14 Sep 2018 10:30:01 -0400 Subject: [PATCH 9/9] Updating README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f3f35a83..f3a0cbb3 100755 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Contributor name will be updated to the below list. :D ### Python Dependencies - beautifulsoup4 +- pyinstaller - PySocks - termcolor - requests
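
A minimal, self-contained sketch of the worker-queue pattern that modules/utils.py introduces with queue_tasks and exec_tasks in this series. It is illustrative only: check_url is a hypothetical stand-in for modules.pagereader.display_url, and the tasks_args handling of the real helpers is omitted for brevity.

from queue import Queue
from threading import Thread


def check_url(url):
    # Hypothetical stand-in for modules.pagereader.display_url:
    # fetch the page and report whether it responded.
    print("checking", url)


def queue_tasks(tasks, task_func):
    # One shared queue; daemon worker threads pull tasks off it and run
    # task_func on each one, mirroring exec_tasks/queue_tasks in utils.py.
    que = Queue(len(tasks) * 2)

    def worker():
        while True:
            task = que.get()
            task_func(task)
            que.task_done()

    for _ in tasks:
        Thread(target=worker, daemon=True).start()

    for task in tasks:
        que.put(task)
    que.join()  # block until every queued task has been processed


if __name__ == "__main__":
    queue_tasks(["http://example.com", "http://example.onion"], check_url)

Because the workers are daemon threads, que.join() is what keeps the main thread alive until every URL has been handled; once it returns, the process may exit and the idle workers are discarded with it.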