Add function to traverse links using Breadth First Search #111

Merged
merged 8 commits on Aug 10, 2018
147 changes: 132 additions & 15 deletions modules/getweblinks.py
@@ -1,7 +1,58 @@
import requests

from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
from threading import Thread
from queue import Queue


def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverses the links passed using Breadth First Search. You can specify a
    stop depth or a target link to look for. The depth argument is used for
    recursion.

    Args:
        links (list): list of urls to traverse
        ext (string): extension to use for URLs
        depth (int): used for recursion
        stop_depth (int): stops traversing at this depth if specified
        targetLink (string): stops at this link if specified

    Returns:
        depth (int): depth stopped at
    """

    if depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        if targetLink == link and targetLink:
Contributor:

I don't see the value of the and condition here. If targetLink == link, it will always be targetLink, right?

Member Author:

I want to make sure targetLink is not None. Since Python is a dynamic language, it's impossible to tell ahead of time what items a list may contain. If a None were to somehow get inserted, I don't want the comparison to return a false positive.
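
A small illustration of the false positive being guarded against (the values here are hypothetical, not from the PR):

# Without the truthiness check, a stray None in the links list would
# "match" a targetLink of None and end the traversal early.
links = ['http://exampleonionsitexyz.onion', None]
targetLink = None
# targetLink == link is True for the None entry;
# the added 'and targetLink' filters that case out.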

            return depth
        resp = requests.get(link)
Contributor:

What if it errors out?

Member Author:

Good catch, I didn't think about that. I'm going to put a try-except block around it and just pass on errors. If there are errors, we can assume the link isn't valid.
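
A minimal sketch of the fix described above, inside the for loop over links — assuming requests.exceptions.RequestException is the error family to catch (a sketch, not the merged code itself):

try:
    resp = requests.get(link)
except requests.exceptions.RequestException:
    # Any request error is taken to mean the link isn't valid.
    continue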

        soup = BeautifulSoup(resp.text, 'html.parser')
        websitesToVisit = get_urls_from_page(soup, extension=ext)
        for site in websitesToVisit:
            toVisit.append(site)
    depth += 1
    if stop_depth and targetLink:
        return traverse_links(toVisit, ext, depth, stop_depth, targetLink)
    elif stop_depth:
        return traverse_links(toVisit, ext, depth, stop_depth=stop_depth)
    elif targetLink:
        return traverse_links(toVisit, ext, depth, targetLink=targetLink)
    else:
        return traverse_links(toVisit, ext, depth)
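
For illustration, two hypothetical calls (the urls and extension are made up):

# Traverse at most two levels of links:
traverse_links(['http://exampleonionsitexyz.onion'], ext='.onion', stop_depth=2)

# Or traverse until a specific link turns up:
traverse_links(['http://exampleonionsitexyz.onion'], ext='.onion',
               targetLink='http://targetonionsitexyz.onion')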


def search_page(html_text, ext, stop=None):
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    if stop:
        traverse_links(links, ext, stop_depth=stop)
    else:
        traverse_links(links, ext)


def add_green(link):
@@ -16,14 +67,16 @@ def add_red(link):

def get_links(soup, ext=False, live=False):
"""
Searches through all <a ref> (hyperlinks) tags and stores them in a
list then validates if the url is formatted correctly.
Returns list of links listed on the webpage of the soup passed. If live
is set to true then it will also print the status of each of the links
and setting ext to an actual extension such as '.com' will allow those
extensions to be recognized as valid urls and not just '.tor'.

Args:
soup: BeautifulSoup instance currently being used.
soup (bs4.BeautifulSoup): webpage to be searched for links.

Returns:
websites: List of websites that were found
websites (list(str)): List of websites that were found
"""
    b_colors = Bcolors()
    if isinstance(soup, BeautifulSoup):
@@ -34,21 +87,85 @@ def get_links(soup, ext=False, live=False):
        print('------------------------------------')

        if live:
-           for link in websites:
-               if get_url_status(link) != 0:
-                   coloredlink = add_green(link)
-                   page = pagereader.read_first_page(link)[0]
-                   if page is not None and page.title is not None:
-                       print_row(coloredlink, page.title.string)
-               else:
-                   coloredlink = add_red(link)
-                   print_row(coloredlink, "Not found")
+           queue_tasks(websites, display_link)
        return websites

    else:
        raise Exception('Method parameter is not of instance BeautifulSoup')
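
For illustration, a hypothetical call (the html string is made up):

page_html = '<a href="http://exampleonionsitexyz.onion">example</a>'
soup = BeautifulSoup(page_html, 'html.parser')
websites = get_links(soup, live=True)  # prints a colored status row per link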


def display_link(link):
    """
    Prints the status of a link based on whether it can be reached using a
    GET request. The link is printed in a color based on its status:
    green for a reachable status code, red for unreachable.

    Args:
        link (str): url to be printed
    Returns:
        None
    """
    resp = get_url_status(link)
    if resp != 0:
        title = BeautifulSoup(resp.text, 'html.parser').title.string
        coloredlink = add_green(link)
        print_row(coloredlink, title)
    else:
        coloredlink = add_red(link)
        print_row(coloredlink, "Not found")


def execute_tasks(q, task_func, tasks_args=tuple()):
    """
    Executes tasks from the queue inside worker threads, using the function
    and arguments passed.

    Args:
        q (queue.Queue): contains tasks
        task_func (function): function to be executed on tasks and args
        tasks_args (tuple): contains arguments for the function
    Returns:
        None
    """
    while True:
        task = q.get()
        if tasks_args:
            task_func(task, tasks_args)
        else:
            task_func(task)
        q.task_done()


def queue_tasks(tasks, task_func, tasks_args=tuple()):
    """
    Starts worker threads bound to a queue, then queues the tasks so the
    spawned threads can pull them off and execute them.

    Args:
        tasks (list): list of values that you'd like to operate on
        task_func (function): function that you would like to use
        tasks_args (tuple): arguments for the function
    Returns:
        None
    """
    q = Queue(len(tasks)*2)
    for _ in tasks:
        if tasks_args:
            if isinstance(tasks_args, tuple):
                t = Thread(target=execute_tasks, args=(q, task_func, tasks_args))
                t.daemon = True
                t.start()
            else:
                raise Exception('Function arguments must be passed in the form of a tuple.')
        else:
            t = Thread(target=execute_tasks, args=(q, task_func))
            t.daemon = True
            t.start()

    for task in tasks:
        q.put(task)
    q.join()
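
A hedged usage sketch (the urls are made up): queue_tasks spawns one daemon worker per task, and q.join() blocks until every queued link has been processed.

websites = ['http://siteaxyz.onion', 'http://sitebxyz.onion']
queue_tasks(websites, display_link)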


def print_row(url, description):
print("%-80s %-30s" % (url, description))
8 changes: 0 additions & 8 deletions modules/net_utils.py
@@ -2,14 +2,6 @@
import requests


def check_connection(url):
    print("Attempting to connect to {site}".format(site=url))
    if get_url_status(url) != 0:
        return 1

    return 0


def get_url_status(url, headers=False):
    """
    Uses a head request because it uses less bandwidth than get, and timeout is