Skip to content

Commit 5ca2d21

Browse files
authored
Merge pull request #111 from KingAkeem/bfs_crawl
Add function to traverse links using Breadth First Search
2 parents d80d33a + 063a209 commit 5ca2d21

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed

modules/getweblinks.py

+54
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,64 @@
1+
import requests
2+
3+
from requests import HTTPError, ConnectionError
14
from modules.net_utils import get_urls_from_page, get_url_status
25
from bs4 import BeautifulSoup
36
from modules.bcolors import Bcolors
47
from threading import Thread
58
from queue import Queue
69

710

11+
def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverse links breadth-first, one level of the frontier per recursive call.

    You can bound the traversal with stop_depth, or stop early when a
    specific targetLink is encountered. The depth argument carries the
    current level through the recursion.

    Args:
        links (list): URLs forming the current frontier to visit
        ext (string): extension used to filter URLs found on each page
        depth (int): current recursion level (callers normally omit this)
        stop_depth (int): stop traversing once this depth is reached, if given
        targetLink (string): stop as soon as this link appears in the frontier

    Returns:
        depth (int): the depth at which traversal stopped
    """
    # Base cases: the frontier is exhausted, or the requested depth was
    # reached. Without the empty-frontier check an empty `links` with no
    # stop_depth would recurse forever (until RecursionError).
    if not links or depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        # Check the target before fetching, so a hit costs no request.
        if targetLink is not None and link == targetLink:
            return depth
        try:
            resp = requests.get(link)
        except (HTTPError, ConnectionError):
            # Unreachable pages are skipped; traversal continues best-effort.
            continue
        soup = BeautifulSoup(resp.text, 'html.parser')
        toVisit.extend(get_urls_from_page(soup, extension=ext))

    # Recurse into the next level and propagate its result back to the
    # caller — previously the recursive result was discarded, so every
    # recursing path returned None instead of the documented depth.
    # Passing stop_depth/targetLink positionally is correct whether or not
    # they are None, so no keyword-dispatch branching is needed.
    return traverse_links(toVisit, ext, depth + 1, stop_depth, targetLink)
51+
52+
53+
def search_page(html_text, ext, stop=None):
    """
    Parse an HTML document and breadth-first traverse the links found in it.

    Args:
        html_text (string): raw HTML to scan for starting links
        ext (string): extension used to filter URLs found on each page
        stop (int): optional depth at which the traversal stops

    Returns:
        depth (int): the depth at which traversal stopped
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    if stop is not None:
        # Bug fix: traverse_links has no 'stop' parameter — the keyword is
        # 'stop_depth'; the old call raised TypeError whenever stop was set.
        return traverse_links(links, ext, stop_depth=stop)
    return traverse_links(links, ext)
60+
61+
862
def add_green(link):
    """Return *link* prefixed with a tab and wrapped in ANSI green codes."""
    palette = Bcolors()
    return '\t{}{}{}'.format(palette.OKGREEN, link, palette.ENDC)

0 commit comments

Comments (0)