-
-
Notifications
You must be signed in to change notification settings - Fork 569
Add function to traverse links using Breadth First Search #111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
3735b3d
148002c
10e3cb9
08d58ce
b1586d4
eb83451
a9920c0
063a209
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,58 @@ | ||
import requests | ||
|
||
from modules.net_utils import get_urls_from_page, get_url_status | ||
from modules import pagereader | ||
from bs4 import BeautifulSoup | ||
from modules.bcolors import Bcolors | ||
from threading import Thread | ||
from queue import Queue | ||
|
||
|
||
def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverses links passed using Breadth First Search. You can specify stop
    depth or specify a target to look for. The depth argument is used for
    recursion.

    Args:
        links (list): list of urls to traverse
        ext (string): string representing extension to use for URLs
        depth (int): current traversal depth, used for recursion
        stop_depth (int): stops traversing at this depth if specified
        targetLink (string): stops at this link if specified

    Returns:
        depth (int): depth stopped at
    """
    if depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        # targetLink is checked for truthiness first so a None entry in the
        # list cannot produce a false positive when no target was requested.
        if targetLink and targetLink == link:
            return depth
        try:
            resp = requests.get(link)
        except requests.exceptions.RequestException:
            # Per review: if the request errors out, assume the link is not
            # valid and skip it instead of aborting the whole traversal.
            continue
        soup = BeautifulSoup(resp.text, 'html.parser')
        for site in get_urls_from_page(soup, extension=ext):
            toVisit.append(site)

    depth += 1
    if not toVisit:
        # Nothing left to explore; without this guard the unbounded case
        # (stop_depth=None, no target hit) would recurse forever.
        return depth
    # BUG FIX: the original called itself without `return`, so the promised
    # depth was discarded and callers always received None. Passing the
    # keywords through directly also replaces the original 4-branch dispatch.
    return traverse_links(toVisit, ext, depth, stop_depth, targetLink)
|
||
|
||
def search_page(html_text, ext, stop=None):
    """
    Extracts links from an HTML document and traverses them with BFS.

    Args:
        html_text (str): HTML document to extract starting links from
        ext (str): extension to treat as a valid url suffix
        stop (int): optional depth at which traversal halts

    Returns:
        int: the depth traversal stopped at
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    # BUG FIX: traverse_links has no `stop` keyword — the original
    # traverse_links(links, ext, stop=stop) raised TypeError. The parameter
    # is stop_depth. Also propagate the result instead of dropping it.
    if stop:
        return traverse_links(links, ext, stop_depth=stop)
    return traverse_links(links, ext)
|
||
|
||
def add_green(link): | ||
|
@@ -16,14 +67,16 @@ def add_red(link): | |
|
||
def get_links(soup, ext=False, live=False): | ||
""" | ||
Searches through all <a ref> (hyperlinks) tags and stores them in a | ||
list then validates if the url is formatted correctly. | ||
Returns list of links listed on the webpage of the soup passed. If live | ||
is set to true then it will also print the status of each of the links | ||
and setting ext to an actual extension such as '.com' will allow those | ||
extensions to be recognized as valid urls and not just '.tor'. | ||
|
||
Args: | ||
soup: BeautifulSoup instance currently being used. | ||
soup (bs4.BeautifulSoup): webpage to be searched for links. | ||
|
||
Returns: | ||
websites: List of websites that were found | ||
websites (list(str)): List of websites that were found | ||
""" | ||
b_colors = Bcolors() | ||
if isinstance(soup, BeautifulSoup): | ||
|
@@ -34,21 +87,85 @@ def get_links(soup, ext=False, live=False): | |
print('------------------------------------') | ||
|
||
if live: | ||
for link in websites: | ||
if get_url_status(link) != 0: | ||
coloredlink = add_green(link) | ||
page = pagereader.read_first_page(link)[0] | ||
if page is not None and page.title is not None: | ||
print_row(coloredlink, page.title.string) | ||
else: | ||
coloredlink = add_red(link) | ||
print_row(coloredlink, "Not found") | ||
|
||
queue_tasks(websites, display_link) | ||
return websites | ||
|
||
else: | ||
raise(Exception('Method parameter is not of instance BeautifulSoup')) | ||
|
||
|
||
def display_link(link):
    """
    Prints the status of a link based on if it can be reached using a GET
    request. Link is printed with a color based on status.
    Green for a reachable status code and red for not reachable.

    Args:
        link (str): url to be printed

    Returns:
        None
    """
    resp = get_url_status(link)
    if resp != 0:
        page = BeautifulSoup(resp.text, 'html.parser')
        # BUG FIX: a page without a <title> tag makes page.title None, so the
        # original page.title.string raised AttributeError and killed the
        # worker thread. Fall back to a placeholder instead.
        title = page.title.string if page.title else "No title"
        coloredlink = add_green(link)
        print_row(coloredlink, title)
    else:
        coloredlink = add_red(link)
        print_row(coloredlink, "Not found")
|
||
|
||
def execute_tasks(q, task_func, tasks_args=tuple()):
    """
    Worker loop: forever pull the next task off the queue, invoke task_func
    on it (passing tasks_args as a second argument when supplied), and mark
    the task done so q.join() can unblock.

    Args:
        q (queue.Queue): contains tasks
        task_func (function): function to be executed on tasks and args
        tasks_args (tuple): contains arguments for function

    Returns:
        None
    """
    while True:
        item = q.get()
        # A non-empty tasks_args tuple is forwarded whole as one extra
        # positional argument, exactly as callers of this module expect.
        call_args = (item, tasks_args) if tasks_args else (item,)
        task_func(*call_args)
        q.task_done()
|
||
|
||
def queue_tasks(tasks, task_func, tasks_args=tuple()):
    """
    Starts daemon worker threads bound to a shared queue, then queues the
    tasks; the spawned threads pull tasks off the queue and execute them.
    Blocks until every task has been processed.

    Args:
        tasks (list): lists of values that you'd like to operate on
        task_func (function): function that you would like to use
        tasks_args (tuple): arguments for function

    Raises:
        Exception: if tasks_args is supplied but is not a tuple

    Returns:
        None
    """
    # Validate once, up front — the original re-checked (and raised) inside
    # the spawn loop, and skipped validation entirely when tasks was empty.
    if tasks_args and not isinstance(tasks_args, tuple):
        raise(Exception('Function arguments must be passed in the form of a tuple.'))

    q = Queue(len(tasks)*2)
    for _ in tasks:
        # execute_tasks defaults tasks_args to an empty tuple and treats it
        # the same as "no extra args", so one spawn path covers both cases
        # (the original duplicated the Thread setup in two branches).
        t = Thread(target=execute_tasks, args=(q, task_func, tasks_args))
        t.daemon = True
        t.start()

    for task in tasks:
        q.put(task)
    q.join()  # wait until every queued task has been marked done
|
||
|
||
def print_row(url, description):
    """Print one table row: url left-justified to 80 columns, description to 30."""
    row = "%-80s %-30s" % (url, description)
    print(row)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't see the value of the `and` condition here. If `targetLink == link`, then `targetLink` will always be truthy, right?

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

I want to make sure `targetLink` is not `None`. Since Python is a dynamic language, it's impossible to tell ahead of time what items a list may contain. If a `None` were to somehow get inserted into the list, I don't want it to return a false positive.