|
| 1 | +import requests |
| 2 | + |
| 3 | +from requests import HTTPError, ConnectionError |
1 | 4 | from modules.net_utils import get_urls_from_page, get_url_status
|
2 | 5 | from bs4 import BeautifulSoup
|
3 | 6 | from modules.bcolors import Bcolors
|
4 | 7 | from threading import Thread
|
5 | 8 | from queue import Queue
|
6 | 9 |
|
7 | 10 |
|
def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverses links passed using Breadth First Search. You can specify stop
    depth or specify a target to look for. The depth argument is used for
    recursion.

    Args:
        links (list): list of urls to traverse
        ext (string): string representing extension to use for URLs
        depth (int): current recursion depth
        stop_depth (int): stops traversing at this depth if specified
        targetLink (string): stops at this link if specified

    Returns:
        depth (int): depth stopped at
    """

    if depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        # Stop the search as soon as the target link is encountered.
        if targetLink and targetLink == link:
            return depth
        try:
            resp = requests.get(link)
        except (HTTPError, ConnectionError):
            # Skip unreachable pages; keep traversing the remaining links.
            continue
        soup = BeautifulSoup(resp.text, 'html.parser')
        toVisit.extend(get_urls_from_page(soup, extension=ext))

    # BUG FIX: the original discarded the recursive call's result, so the
    # function always returned None despite the documented return value.
    # Passing the arguments positionally also replaces the redundant
    # four-way if/elif branching — defaults make all cases identical.
    # NOTE(review): with neither stop_depth nor targetLink set this recurses
    # until RecursionError, exactly as the original did — confirm intent.
    return traverse_links(toVisit, ext, depth + 1, stop_depth, targetLink)
| 51 | + |
| 52 | + |
def search_page(html_text, ext, stop=None):
    """
    Parses the given HTML document and traverses the links found on it.

    Args:
        html_text (string): HTML document to extract starting links from
        ext (string): string representing extension to use for URLs
        stop (int): optional depth at which traversal stops

    Returns:
        depth (int): depth the traversal stopped at
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    # BUG FIX: traverse_links has no 'stop' keyword — the original call
    # raised TypeError whenever stop was given; the parameter is
    # 'stop_depth'. Also propagate the traversal result to the caller.
    if stop:
        return traverse_links(links, ext, stop_depth=stop)
    return traverse_links(links, ext)
| 60 | + |
| 61 | + |
def add_green(link):
    """Return the given link tab-indented and wrapped in green color codes."""
    palette = Bcolors()
    return ''.join(['\t', palette.OKGREEN, link, palette.ENDC])
|
|
0 commit comments