Commit 6870714

Merge pull request #46 from KingAkeem/dev
Beautiful and comprehensible code base. (Documenting and Refactoring)
PSNAppz authored Jan 11, 2018
2 parents 0889945 + 9701706 commit 6870714
Showing 9 changed files with 218 additions and 158 deletions.
10 changes: 2 additions & 8 deletions README.md
@@ -101,14 +101,8 @@ Before you run the torBot make sure the following things are done properly:
 * Run tor service
 `sudo service tor start`
 
-* Set a password for tor
-`tor --hash-password "my_password" `
-
-* Give the password inside torbot.py
-`from stem.control import Controller
-with Controller.from_port(port = 9051) as controller:
-controller.authenticate("your_password_hash")
-controller.signal(Signal.NEWNYM)`
+* Make sure that your torrc is configured to SOCKS_PORT localhost:9050, which should be
+the default setting
 
 `python3 torBot.py`
 <pre>
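The torrc note above assumes Tor's SOCKS listener is on its default localhost:9050. A quick way to confirm that before launching torBot, sketched with Python's socket module (the host and port mirror the README's default; this check is not part of the commit):

    import socket

    # probe the default Tor SOCKS port the README expects to be open
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    result = sock.connect_ex(("127.0.0.1", 9050))
    sock.close()
    print("Tor SOCKS port open" if result == 0 else "Tor SOCKS port not reachable")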
50 changes: 27 additions & 23 deletions modules/getemails.py
@@ -1,36 +1,40 @@
 from modules.bcolors import Bcolors
 from bs4 import BeautifulSoup
-from modules.savefile import saveJson
 
 
-"""Get all emails from the website"""
-def getMails(soup, save=0):
+def getMails(soup):
+
+    """
+    Searches for <a href> tags for links then checks if a link contains the
+    substring 'mailto', indicating that it's an email. If it is determined
+    to be an email then the link is split and the username is appended to
+    the list
+    Args:
+        soup: BeautifulSoup instance that will be used for parsing
+    Returns:
+        emails: list of email IDs
+    """
     b_colors = Bcolors()
-    _soup_instance = BeautifulSoup
-    if isinstance(type(soup), type(_soup_instance)):
+
+    if isinstance(type(soup), type(BeautifulSoup)):
+
         emails = []
-        for link in soup.find_all('a'):
-            email_link = link.get('href')
-            if email_link is not None:
-                if 'mailto' in email_link:
-                    """Split email address on"""
-                    email_addr = email_link.split(':')
-                    emails.append(email_addr[1])
-            else:
-                pass
+        links = soup.find_all('a')
+        for ref in links:
+            url = ref.get('href')
+            if url and 'mailto' in url:
+                """Split email address on"""
+                email_addr = url.split(':')
+                emails.append(email_addr[1])
 
         """Pretty print output as below"""
         print ('')
         print (b_colors.OKGREEN+'Mails Found - '+b_colors.ENDC+str(len(emails)))
         print ('-------------------------------')
-        for mail in emails:
-            print (mail)
-        if save:
-            saveJson("Extracted-Mail-IDs", emails)
-        return ''
+
+        return emails
 
     else:
-        msg = ''.join((b_colors.FAIL,
-                       'Method parameter is not of instance BeautifulSoup',
-                       b_colors.ENDC))
-        raise(msg)
+        raise('Method parameter is not of instance BeautifulSoup')
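Since getMails now returns the list instead of only printing it, callers can use the result directly. A minimal usage sketch, assuming it runs from the TorBot repository root so the modules package resolves (the HTML snippet and address are invented for illustration):

    from bs4 import BeautifulSoup
    from modules.getemails import getMails

    # hypothetical page containing a single mailto link
    html = '<a href="mailto:alice@example.com">contact</a>'
    soup = BeautifulSoup(html, 'html.parser')
    print(getMails(soup))  # expected: ['alice@example.com']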
81 changes: 57 additions & 24 deletions modules/getweblinks.py
@@ -1,42 +1,75 @@
 import urllib.request
+import re
 from modules.bcolors import Bcolors
 from bs4 import BeautifulSoup
 
 
-def link_status(web, out_queue, index):
-    b_colors = Bcolors()
-    out_queue[index] = web + " is_live = False "
-    try:
-        urllib.request.urlopen(web)
-        out_queue[index] = web + " is_live = True "
-        print(web)
-    except urllib.error.HTTPError:
-        print(b_colors.On_Red+web+b_colors.ENDC)
+def valid_onion_url(link):
+
+    """
+    Validates onion urls using regex
+    Args:
+        link: the url to be checked
+    Returns:
+        bool: True/False based on link
+    """
+
+    pattern = r"^https?\b(://+)(.+)(.+)\bonion/(.*)"
+    re_obj = re.compile(pattern)
+    if re_obj.fullmatch(link):
+        return True
+
+    return False
+
+
+def valid_url(link):
+
+    """
+    Validates general urls using regex
+    Takes in a string which is a link and decides the validity of the url
+    using regex
+    Args:
+        link: the url to be checked
+    Returns:
+        bool: True/False based on link
+    """
+
+    pattern = r"^https?\b(://+)(.+)(.+)\b...(.*)"
+    re_obj = re.compile(pattern)
+    if re_obj.fullmatch(link):
+        return True
+
+    return False
 
 
-"""Get all onion links from the website"""
-def getLinks(soup, ext, live=0, save=0):
+def getLinks(soup):
+
+    """
+    Searches through all <a href> (hyperlink) tags and stores them in a
+    list then validates if the url is formatted correctly.
+    Args:
+        soup: BeautifulSoup instance currently being used.
+    Returns:
+        websites: List of websites that were found
+    """
+
     b_colors = Bcolors()
-    extensions = []
-    if ext:
-        for e in ext:
-            extensions.append(e)
-
     if isinstance(type(soup), type(BeautifulSoup)):
         websites = []
 
-        for link in soup.find_all('a'):
-            web_link = link.get('href')
-            if web_link and ('http' in web_link or 'https' in web_link):
-                for exten in extensions:
-                    if web_link.endswith(exten):
-                        websites.append(web_link)
-            else:
-                websites.append(web_link)
+        links = soup.find_all('a')
+        for ref in links:
+            url = ref.get('href')
+            if url and (valid_onion_url(url) or valid_url(url)):
+                websites.append(url)
+
         """Pretty print output as below"""
         print(''.join((b_colors.OKGREEN,
               'Websites Found - ', b_colors.ENDC, str(len(websites)))))
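The two validators are plain re.fullmatch checks, so they can be exercised on their own. A small demonstration of the onion pattern from the diff (the sample URLs are invented):

    import re

    pattern = r"^https?\b(://+)(.+)(.+)\bonion/(.*)"
    re_obj = re.compile(pattern)
    print(bool(re_obj.fullmatch("http://example.onion/index")))  # True
    print(bool(re_obj.fullmatch("http://example.com/page")))     # False, no onion/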
20 changes: 13 additions & 7 deletions modules/savefile.py
@@ -3,15 +3,21 @@
 
 
 def saveJson(datatype, data):
+    """
+    Creates json file and stores json
+    Args:
+        datatype: the type of the object being passed
+        data: data that is being stored with object
+    """
 
-    "function_docstring"
     timestr = time.strftime("%Y%m%d-%H%M%S")
     # Json File Creation
-    file = open("TorBoT-Export-"+datatype+timestr+".json", "a")
-    # Store data in Json format
-    output = {datatype: data}
-    # Dump output to file
-    json.dump(output, file, indent=2)
-    file.close()
+    with open("TorBoT-Export-"+datatype+timestr+".json", "x") as file:
+        # Store data in Json format
+        output = {datatype: data}
+        # Dump output to file
+        json.dump(output, file, indent=2)
 
     print("\nData will be saved with a File Name :",
           "TorBoT-Export-"+datatype+timestr+".json")
7 changes: 7 additions & 0 deletions modules/updater.py
@@ -2,6 +2,13 @@
 
 
 def updateTor():
+
+    """
+    Currently updates Tor by calling terminal commands using subprocess
+    Not a great method and will be replaced in the future.
+    """
+
     print("Checking for latest stable release")
     isGit = subprocess.Popen(
         ["git", "branch"],
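The hunk is truncated here, but the new docstring describes the approach: shell out to git via subprocess. A hypothetical sketch in that spirit, not the commit's exact code (assumes git is on PATH):

    import subprocess

    # ask git for the branch list; output on stderr means we are not
    # inside a git repository and cannot self-update this way
    process = subprocess.Popen(["git", "branch"],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    out, err = process.communicate()
    print("git repo detected" if not err else "not a git repository")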
12 changes: 4 additions & 8 deletions tests/test_getemails.py
@@ -9,7 +9,6 @@
 sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
 
 from modules import pagereader, getemails
-from io import StringIO
 from modules.bcolors import Bcolors
 
 
@@ -19,15 +18,12 @@
 class getMailsTestCase(unittest.TestCase):
 
     def setUp(self):
-        self.held, sys.stdout = sys.stdout, StringIO()
         self.b_colors = Bcolors()
 
-    def test_print_emails(self):
-        data = ''.join(("\n", self.b_colors.OKGREEN, "Mails Found - ",
-                        self.b_colors.ENDC, "1\n------------------------",
-                        "-------\n[email protected]\n"))
-        getemails.getMails(soup)
-        self.assertEqual(sys.stdout.getvalue(), data)
+    def test_getemails(self):
+        test_emails = ["[email protected]"]
+        emails = getemails.getMails(soup)
+        self.assertEqual(emails, test_emails)
 
 
 if __name__ == '__main__':
12 changes: 5 additions & 7 deletions tests/test_getweblinks.py
@@ -22,16 +22,14 @@ def setUp(self):
         self.held, sys.stdout = sys.stdout, StringIO()
         self.maxDiff = None
 
-    def test_print_links(self):
+    def test_get_links(self):
 
         data = ['http://aff.ironsocket.com/SH7L',
-                'http://aff.ironsocket.com/SH7L',
-                'http://wsrs.net/',
-                'http://cmsgear.com/',
-                'http://cmsgear.com/']
+                'http://aff.ironsocket.com/SH7L',
+                'http://wsrs.net/',
+                'http://cmsgear.com/']
 
-        ext = ['.com/']
-        result = getweblinks.getLinks(soup, ext)
+        result = getweblinks.getLinks(soup)
         self.assertEqual(result, data)
4 changes: 1 addition & 3 deletions tests/test_savetofile.py
@@ -27,10 +27,8 @@ def test_save_links(self):
         data = ['http://aff.ironsocket.com/SH7L',
                 'http://aff.ironsocket.com/SH7L',
                 'http://wsrs.net/',
-                'http://cmsgear.com/',
                 'http://cmsgear.com/']
 
-        ext = ['.com/']
-        result = getweblinks.getLinks(soup, ext, 0, 1)
+        result = getweblinks.getLinks(soup)
        self.assertEqual(result, data)
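With the assertions now comparing return values instead of captured stdout, the suite runs under plain unittest discovery. A hypothetical runner, assuming it is launched from the repository root:

    import unittest

    # discover every test module under tests/ and run it
    suite = unittest.defaultTestLoader.discover('tests')
    unittest.TextTestRunner(verbosity=2).run(suite)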

0 comments on commit 6870714
