Skip to content

Commit 5ca2d21

Browse files
authored
Merge pull request #111 from KingAkeem/bfs_crawl
Add function to traverse links using Breadth First Search
2 parents d80d33a + 063a209 commit 5ca2d21

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed

modules/getweblinks.py

+54
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,64 @@
1+
import requests
2+
3+
from requests import HTTPError, ConnectionError
14
from modules.net_utils import get_urls_from_page, get_url_status
25
from bs4 import BeautifulSoup
36
from modules.bcolors import Bcolors
47
from threading import Thread
58
from queue import Queue
69

710

11+
def traverse_links(links, ext, depth=0, stop_depth=None, targetLink=None):
    """
    Traverse links breadth-first, one level of the frontier per recursive call.

    You can bound the traversal with stop_depth, or stop early when a
    specific targetLink is encountered. The depth argument carries the
    current level through the recursion.

    Args:
        links (list): URLs forming the current frontier to visit
        ext (string): extension used to filter URLs found on each page
        depth (int): current recursion level (callers normally omit this)
        stop_depth (int): stop traversing once this depth is reached, if given
        targetLink (string): stop as soon as this link appears in the frontier

    Returns:
        depth (int): the depth at which traversal stopped
    """
    # Base cases: the frontier is exhausted, or the requested depth was
    # reached. Without the empty-frontier check an empty `links` with no
    # stop_depth would recurse forever (until RecursionError).
    if not links or depth == stop_depth:
        return depth

    toVisit = list()
    for link in links:
        # Check the target before fetching, so a hit costs no request.
        if targetLink is not None and link == targetLink:
            return depth
        try:
            resp = requests.get(link)
        except (HTTPError, ConnectionError):
            # Unreachable pages are skipped; traversal continues best-effort.
            continue
        soup = BeautifulSoup(resp.text, 'html.parser')
        toVisit.extend(get_urls_from_page(soup, extension=ext))

    # Recurse into the next level and propagate its result back to the
    # caller — previously the recursive result was discarded, so every
    # recursing path returned None instead of the documented depth.
    # Passing stop_depth/targetLink positionally is correct whether or not
    # they are None, so no keyword-dispatch branching is needed.
    return traverse_links(toVisit, ext, depth + 1, stop_depth, targetLink)
51+
52+
53+
def search_page(html_text, ext, stop=None):
    """
    Parse an HTML document and breadth-first traverse the links found in it.

    Args:
        html_text (string): raw HTML to scan for starting links
        ext (string): extension used to filter URLs found on each page
        stop (int): optional depth at which the traversal stops

    Returns:
        depth (int): the depth at which traversal stopped
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    links = get_urls_from_page(soup, extension=ext)
    if stop is not None:
        # Bug fix: traverse_links has no 'stop' parameter — the keyword is
        # 'stop_depth'; the old call raised TypeError whenever stop was set.
        return traverse_links(links, ext, stop_depth=stop)
    return traverse_links(links, ext)
60+
61+
862
def add_green(link):
    """Return *link* prefixed with a tab and wrapped in ANSI green codes."""
    palette = Bcolors()
    return '\t{}{}{}'.format(palette.OKGREEN, link, palette.ENDC)

0 commit comments

Comments (0)