Refactored tests and TorBot app so that tests no longer need to touch #95

Merged: 4 commits, Jul 4, 2018
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,3 +7,7 @@ tests/__pycache__/
modules/__init__.py
.idea/
tests/.ropeproject/
torBot
*.pyc
tests/.pytest_cache
.pytest_cache
13 changes: 3 additions & 10 deletions modules/getemails.py
@@ -1,4 +1,5 @@
from modules.bcolors import Bcolors
from modules.net_utils import get_urls_from_page
from bs4 import BeautifulSoup


@@ -19,19 +20,11 @@ def getMails(soup):

if isinstance(soup, BeautifulSoup):

emails = []
links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if url and 'mailto' in url:
"""Split email address on"""
email_addr = url.split(':')
if len(email_addr) > 1:
emails.append(email_addr[1])
emails = get_urls_from_page(soup, email=True)

"""Pretty print output as below"""
print('')
print(b_colors.OKGREEN + 'Mails Found - ' + b_colors.ENDC + str(len(emails)))
print(b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails)))
print('-------------------------------')

return emails
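For orientation, a minimal sketch of the refactored path, assuming the repo modules are importable from the project root; the html snippet and address are illustrative, not from this PR:

```python
from bs4 import BeautifulSoup

from modules.getemails import getMails

# A hypothetical page with a single mailto link; the soup construction
# mirrors what pagereader does in the real app.
html = '<a href="mailto:admin@example.onion">contact</a>'
soup = BeautifulSoup(html, 'html.parser')

# getMails now delegates extraction to net_utils.get_urls_from_page
emails = getMails(soup)  # prints "Mails Found - 1" and a separator row
assert emails == ['admin@example.onion']
```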
92 changes: 5 additions & 87 deletions modules/getweblinks.py
@@ -1,78 +1,7 @@
import re
import requests
import tldextract

from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
from requests.exceptions import ConnectionError, HTTPError


def valid_url(url, extensions=None):
"""Checks for any valid url using regular expression matching

Matches all possible url patterns with the url that is passed and
returns True if it is a url and returns False if it is not.

Args:
url: string representing url to be checked

Returns:
bool: True if valid url format and False if not
"""
pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.([a-z, A-Z]+)(.*)"
regex = re.compile(pattern)
if not extensions:
if regex.match(url):
return True
return False

parts = tldextract.extract(url)
valid_sites = list()
for ext in extensions:
if regex.match(url) and '.'+parts.suffix in ext:
valid_sites.append(url)
return valid_sites


def valid_onion_url(url):
"""Checks for valid onion url using regular expression matching

Only matches onion urls

Args:
url: string representing url to be checked

Returns:
bool: True if valid onion url format, False if not
"""
pattern = r"^https?:\/\/(www\.)?([a-z,A-Z,0-9]*)\.onion/(.*)"
regex = re.compile(pattern)
if regex.match(url):
return True
return False


def is_link_alive(link):
    """Checks whether a link is alive

    Uses a head request because it uses less bandwidth than get; the timeout
    is set to 10 seconds, after which the link is declared dead.

    Args:
        link: link to be tested

    Returns:
        bool: True if the link responds successfully, False if not
    """

try:
resp = requests.head(link, timeout=10)
resp.raise_for_status()
return True
except (ConnectionError, HTTPError):
return False


def add_green(link):
@@ -98,28 +27,17 @@ def get_links(soup, ext=False, live=False):
"""
b_colors = Bcolors()
if isinstance(soup, BeautifulSoup):
websites = []

links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if ext:
if url and valid_url(url, ext):
websites.append(url)
else:
if url and valid_onion_url(url):
websites.append(url)

websites = get_urls_from_page(soup, extension=ext)
"""Pretty print output as below"""
print(''.join((b_colors.OKGREEN,
'Websites Found - ', b_colors.ENDC, str(len(websites)))))
print('------------------------------------')

if live:
for link in websites:
if is_link_alive(link):
if get_url_status(link) != 0:
coloredlink = add_green(link)
page = pagereader.read_page(link)
page = pagereader.read_first_page(link)[0]
if page is not None and page.title is not None:
print_row(coloredlink, page.title.string)
else:
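Similarly, a hedged usage sketch of the slimmed-down get_links; the page and onion address are made up, and live=True would need a reachable Tor proxy to report anything alive:

```python
from bs4 import BeautifulSoup

from modules.getweblinks import get_links

# A hypothetical page with one onion link; with the default ext=False,
# only urls passing net_utils.is_onion_url are collected.
html = '<a href="http://examplesite.onion/">hidden service</a>'
soup = BeautifulSoup(html, 'html.parser')

get_links(soup)             # prints "Websites Found - 1" plus a separator
get_links(soup, live=True)  # additionally probes each url via get_url_status
```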
73 changes: 73 additions & 0 deletions modules/net_utils.py
@@ -0,0 +1,73 @@
import re
import requests

from requests.exceptions import ConnectionError, HTTPError


def check_connection(url):
    """Returns 1 if the url is reachable, 0 if not."""
    print("Attempting to connect to {site}".format(site=url))
if get_url_status(url) != 0:
return 1

return 0


def get_url_status(url, headers=False):
"""
Uses head request because it uses less bandwith than get and timeout is
set to 10 seconds and then link is automatically declared as dead.

Args:
link: link to be tested
colors: object containing colors for link

Return:
something?: either an int or return value of the connection object's
get request if successful & zero is failure
"""
try:
if headers:
resp = requests.get(url, headers=headers)
else:
resp = requests.get(url)
resp.raise_for_status()
return resp
except (ConnectionError, HTTPError):
return 0


def is_url(url):
    pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.([a-zA-Z]+)(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0


def is_onion_url(url):
    pattern = r"^https?:\/\/(www\.)?([a-zA-Z0-9]*)\.onion/(.*)"
regex = re.compile(pattern)
if regex.match(url):
return 1
return 0


def get_urls_from_page(page, email=False, extension=False):
    """Collects hrefs from a page's anchor tags: any valid url when
    extension is True, mailto addresses when email is True, and onion
    urls otherwise."""
urls = []
anchors_on_page = page.find_all('a')
for anchor_tag in anchors_on_page:
url = anchor_tag.get('href')
if extension:
if url and is_url(url) == 1:
urls.append(url)
elif email:
if url and 'mailto' in url:
email_addr = url.split(':')
if len(email_addr) > 1:
urls.append(email_addr[1])
else:
if url and is_onion_url(url) == 1:
urls.append(url)

return urls
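A quick sketch of how the new net_utils helpers behave, using illustrative urls; the sentinel convention (response object on success, 0 on failure) is what lets the call sites above drop their try/except blocks:

```python
from modules.net_utils import get_url_status, is_onion_url, is_url

assert is_url('https://www.example.com/page') == 1
assert is_onion_url('http://examplesite.onion/') == 1
assert is_onion_url('https://www.example.com/page') == 0

# Callers test `!= 0` instead of catching ConnectionError/HTTPError
response = get_url_status('https://www.example.com')
if response != 0:
    print(response.status_code)
```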
113 changes: 35 additions & 78 deletions modules/pagereader.py
@@ -1,8 +1,6 @@
import requests

from bs4 import BeautifulSoup
from modules.net_utils import get_url_status
from modules.bcolors import Bcolors
from requests.exceptions import ConnectionError, HTTPError, MissingSchema
from sys import exit


@@ -15,83 +13,42 @@ def read_first_page(site):
attempts_left = 3
err = " "
while attempts_left:
try:
if attempts_left == 3:
print(next(connection_msg(site)))
response = requests.get(site, headers=headers)
print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if attempts_left == 2:
print(next(connection_msg('https://' + site)))
response = requests.get('https://' + site, headers=headers)
print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if attempts_left == 1:
print(next(connection_msg('http://' + site)))
response = requests.get('http://' + site, headers=headers)
print("Connection successful.")
if attempts_left == 3:
print(next(connection_msg(site)))
response = get_url_status(site, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

except (HTTPError, MissingSchema, ConnectionError) as e:
attempts_left -= 1
err = e

if isinstance(err, HTTPError):
print("There has been an HTTP error after three attempts.")
exit(1)
if isinstance(err, ConnectionError):
print("Got ConnectionError after three attempts... ",
"Please check if the TOR service is running or not.")
exit(1)


def read_page(site):
headers = {'User-Agent': 'TorBot - Onion crawler | www.github.com/DedSecInside/TorBot'}
attempts_left = 3
err = " "
while attempts_left:
try:
if attempts_left == 3:
# print(next(connection_msg(site)))
response = requests.get(site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if attempts_left == 2:
# print(next(connection_msg('https://'+site)))
response = requests.get('https://' + site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if attempts_left == 1:
# print(next(connection_msg('http://'+site)))
response = requests.get('http://' + site, headers=headers)
# print("Connection successful.")
page = BeautifulSoup(response.text, 'html.parser')
return page
if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

except (HTTPError, MissingSchema, ConnectionError) as e:
attempts_left -= 1
err = e

if isinstance(err, HTTPError):
print("There has been an HTTP error after three attempts.")
exit(1)
if isinstance(err, ConnectionError):
print("There has been a connection error after three attempts.")
exit(1)

else:
attempts_left -= 1
continue

if attempts_left == 2:
https_url = 'https://'+site
print(next(connection_msg(https_url)))
response = get_url_status(https_url, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
else:
attempts_left -= 1
continue

if attempts_left == 1:
http_url = 'http://'+site
print(next(connection_msg(http_url)))
response = get_url_status(http_url, headers)
if response != 0:
page = BeautifulSoup(response.text, 'html.parser')
return page, response
else:
attempts_left -= 1
continue

if not attempts_left:
msg = ''.join(("There has been an {err} while attempting to ",
"connect to {site}.")).format(err=err, site=site)
exit(msg)

def get_ip():
"""Returns users tor ip address
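One thing worth noting: the final `if not attempts_left` check sits inside `while attempts_left:`, so it can never fire; after three failures the loop simply falls through. A compact sketch of the same fallback idea with the exit restored (fetch_with_fallback is a hypothetical condensation, not the PR's code):

```python
from bs4 import BeautifulSoup

from modules.net_utils import get_url_status

def fetch_with_fallback(site, headers):
    # Try the url as given, then with https:// and http:// prefixes,
    # mirroring the three attempts_left branches above.
    for candidate in (site, 'https://' + site, 'http://' + site):
        response = get_url_status(candidate, headers)
        if response != 0:
            return BeautifulSoup(response.text, 'html.parser'), response
    raise SystemExit("There has been an error while attempting to "
                     "connect to {site}.".format(site=site))
```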
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,4 +4,4 @@ termcolor==1.1.0
requests==2.18.4
requests_mock==1.4.0
tldextract==2.2.0

yattag==1.10.0