Add function to traverse links using Breadth First Search #111

Merged
merged 8 commits on Aug 10, 2018
147 changes: 132 additions & 15 deletions modules/getweblinks.py
@@ -1,7 +1,58 @@
import requests

from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
from threading import Thread
from queue import Queue


def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverses the links passed using Breadth First Search. You can specify a
    stop depth or a target link to look for. The depth argument is used for
    recursion.

    Args:
        links (list): list of urls to traverse
        ext (string): extension to use for URLs
        depth (int): used for recursion
        stop_depth (int): stops traversing at this depth if specified
        targetLink (string): stops at this link if specified

    Returns:
        depth (int): depth stopped at
    """

    if depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        if targetLink == link and targetLink:
Contributor:

I don't see the value of the and condition here. If targetLink == link, it will always be targetLink, right?

Member Author:

I want to make sure targetLink is not None. Since Python is a dynamic language, it's impossible to tell ahead of time what items a list may contain. If a None were to somehow get inserted, I don't want the comparison to return a false positive.
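
A small illustration of the false positive being guarded against (the values here are hypothetical, not from the PR):

# Without the truthiness check, a stray None in the links list would
# "match" a targetLink of None and end the traversal early.
links = ['http://exampleonionsitexyz.onion', None]
targetLink = None
# targetLink == link is True for the None entry;
# the added 'and targetLink' filters that case out.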

            return depth
        resp = requests.get(link)
Contributor:

What if it errors out?

Member Author:

Good catch, I didn't think about that. I'm going to put a try-except block around it and just pass on errors. If there are errors, we can assume the link isn't valid.
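
A minimal sketch of the fix described above, inside the for loop over links — assuming requests.exceptions.RequestException is the error family to catch (a sketch, not the merged code itself):

try:
    resp = requests.get(link)
except requests.exceptions.RequestException:
    # Any request error is taken to mean the link isn't valid.
    continue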

        soup = BeautifulSoup(resp.text, 'html.parser')
        websitesToVisit = get_urls_from_page(soup, extension=ext)
        for site in websitesToVisit:
            toVisit.append(site)
    depth += 1
    if stop_depth and targetLink:
        return traverse_links(toVisit, ext, depth, stop_depth, targetLink)
    elif stop_depth:
        return traverse_links(toVisit, ext, depth, stop_depth=stop_depth)
    elif targetLink:
        return traverse_links(toVisit, ext, depth, targetLink=targetLink)
    else:
        return traverse_links(toVisit, ext, depth)
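
For illustration, two hypothetical calls (the urls and extension are made up):

# Traverse at most two levels of links:
traverse_links(['http://exampleonionsitexyz.onion'], ext='.onion', stop_depth=2)

# Or traverse until a specific link turns up:
traverse_links(['http://exampleonionsitexyz.onion'], ext='.onion',
               targetLink='http://targetonionsitexyz.onion')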


def search_page(html_text, ext, stop=None):
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    if stop:
        traverse_links(links, ext, stop_depth=stop)
    else:
        traverse_links(links, ext)


def add_green(link):
@@ -16,14 +67,16 @@ def add_red(link):

def get_links(soup, ext=False, live=False):
"""
Searches through all <a ref> (hyperlinks) tags and stores them in a
list then validates if the url is formatted correctly.
Returns list of links listed on the webpage of the soup passed. If live
is set to true then it will also print the status of each of the links
and setting ext to an actual extension such as '.com' will allow those
extensions to be recognized as valid urls and not just '.tor'.

Args:
soup: BeautifulSoup instance currently being used.
soup (bs4.BeautifulSoup): webpage to be searched for links.

Returns:
websites: List of websites that were found
websites (list(str)): List of websites that were found
"""
    b_colors = Bcolors()
    if isinstance(soup, BeautifulSoup):
@@ -34,21 +87,85 @@ def get_links(soup, ext=False, live=False):
        print('------------------------------------')

        if live:
-           for link in websites:
-               if get_url_status(link) != 0:
-                   coloredlink = add_green(link)
-                   page = pagereader.read_first_page(link)[0]
-                   if page is not None and page.title is not None:
-                       print_row(coloredlink, page.title.string)
-               else:
-                   coloredlink = add_red(link)
-                   print_row(coloredlink, "Not found")
+           queue_tasks(websites, display_link)
        return websites

    else:
        raise Exception('Method parameter is not of instance BeautifulSoup')
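
For illustration, a hypothetical call (the html string is made up):

page_html = '<a href="http://exampleonionsitexyz.onion">example</a>'
soup = BeautifulSoup(page_html, 'html.parser')
websites = get_links(soup, live=True)  # prints a colored status row per link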


def display_link(link):
    """
    Prints the status of a link based on whether it can be reached using a
    GET request. The link is printed in a color based on its status:
    green for a reachable status code, red for unreachable.

    Args:
        link (str): url to be printed
    Returns:
        None
    """
    resp = get_url_status(link)
    if resp != 0:
        title = BeautifulSoup(resp.text, 'html.parser').title.string
        coloredlink = add_green(link)
        print_row(coloredlink, title)
    else:
        coloredlink = add_red(link)
        print_row(coloredlink, "Not found")


def execute_tasks(q, task_func, tasks_args=tuple()):
    """
    Executes tasks from the queue inside worker threads, using the function
    and arguments passed.

    Args:
        q (queue.Queue): contains tasks
        task_func (function): function to be executed on tasks and args
        tasks_args (tuple): contains arguments for the function
    Returns:
        None
    """
    while True:
        task = q.get()
        if tasks_args:
            task_func(task, tasks_args)
        else:
            task_func(task)
        q.task_done()


def queue_tasks(tasks, task_func, tasks_args=tuple()):
    """
    Starts worker threads bound to a queue, then queues the tasks so the
    spawned threads can pull them off and execute them.

    Args:
        tasks (list): list of values that you'd like to operate on
        task_func (function): function that you would like to use
        tasks_args (tuple): arguments for the function
    Returns:
        None
    """
    q = Queue(len(tasks)*2)
    for _ in tasks:
        if tasks_args:
            if isinstance(tasks_args, tuple):
                t = Thread(target=execute_tasks, args=(q, task_func, tasks_args))
                t.daemon = True
                t.start()
            else:
                raise Exception('Function arguments must be passed in the form of a tuple.')
        else:
            t = Thread(target=execute_tasks, args=(q, task_func))
            t.daemon = True
            t.start()

    for task in tasks:
        q.put(task)
    q.join()
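
A hedged usage sketch (the urls are made up): queue_tasks spawns one daemon worker per task, and q.join() blocks until every queued link has been processed.

websites = ['http://siteaxyz.onion', 'http://sitebxyz.onion']
queue_tasks(websites, display_link)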


def print_row(url, description):
print("%-80s %-30s" % (url, description))
8 changes: 0 additions & 8 deletions modules/net_utils.py
@@ -2,14 +2,6 @@
import requests


def check_connection(url):
    print("Attempting to connect to {site}".format(site=url))
    if get_url_status(url) != 0:
        return 1

    return 0


def get_url_status(url, headers=False):
    """
    Uses a head request because it uses less bandwidth than get, and timeout is