Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add function to traverse links using Breadth First Search #111

Merged
merged 8 commits into from
Aug 10, 2018
54 changes: 54 additions & 0 deletions modules/getweblinks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,64 @@
import requests

from requests import HTTPError, ConnectionError
from modules.net_utils import get_urls_from_page, get_url_status
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
from threading import Thread
from queue import Queue


def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverses links passed using Breadth First Search. You can specify stop
    depth or specify a target to look for.

    The traversal is done level by level: every link of the current level is
    fetched, the URLs found on those pages form the next level, and the depth
    counter is incremented once per level.

    Args:
        links (list): list of urls to traverse
        ext (string): string representing extension to use for URLs
        depth (int): depth the traversal starts counting from
        stop_depth (int): stops traversing at this depth if specified
        targetLink (string): stops at this link if specified

    Returns:
        depth (int): depth stopped at (stop depth reached, target found, or
            no more links left to visit)
    """

    # Links already fetched; prevents re-requesting the same page and keeps
    # the traversal finite when pages link back to each other.
    visited = set()
    current_level = list(links)

    while current_level:
        if depth == stop_depth:
            return depth

        next_level = []
        for link in current_level:
            # Require a truthy targetLink: the list contents are not
            # guaranteed, and a stray None in `links` must not match an
            # unset (None) target and produce a false positive.
            if targetLink and targetLink == link:
                return depth
            if link in visited:
                continue
            visited.add(link)
            try:
                resp = requests.get(link)
            except (HTTPError, ConnectionError):
                # Unreachable pages are skipped; the rest of the level is
                # still traversed.
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            next_level.extend(get_urls_from_page(soup, extension=ext))

        current_level = next_level
        depth += 1

    # Ran out of links before hitting the stop depth or the target.
    return depth


def search_page(html_text, ext, stop=None):
    """
    Extracts the links found in a page and traverses them using
    traverse_links.

    Args:
        html_text (string): HTML of the page the starting links are taken from
        ext (string): extension passed on to get_urls_from_page and
            traverse_links
        stop (int): depth to stop traversing at; a falsy value means no
            depth limit is passed on

    Returns:
        the value returned by traverse_links for the extracted links
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    # traverse_links has no `stop` parameter; the limit has to be passed as
    # `stop_depth`, otherwise the call raises TypeError.
    if stop:
        return traverse_links(links, ext, stop_depth=stop)
    return traverse_links(links, ext)


def add_green(link):
    """
    Builds the display form of a link: a leading tab, the OKGREEN code from
    Bcolors, the link itself and the ENDC code from Bcolors.

    Args:
        link (string): text to wrap

    Returns:
        (string): the wrapped link
    """
    palette = Bcolors()
    prefix = '\t' + palette.OKGREEN
    return prefix + link + palette.ENDC
Expand Down