Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A mission to produce a beautiful and comprehensible code base. (Documenting and Refactoring) #46

Merged
merged 10 commits into from
Jan 11, 2018
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,8 @@ Before you run the torBot make sure the following things are done properly:
* Run tor service
`sudo service tor start`

* Set a password for tor
`tor --hash-password "my_password" `

* Give the password inside torbot.py
`from stem.control import Controller
with Controller.from_port(port = 9051) as controller:
controller.authenticate("your_password_hash")
controller.signal(Signal.NEWNYM)`
* Make sure that your torrc is configured with SOCKS_PORT set to localhost:9050,
which should be the default setting

`python3 torBot.py`
<pre>
Expand Down
43 changes: 23 additions & 20 deletions modules/getemails.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,37 @@
from modules.savefile import saveJson


"""Get all emails from the website"""


def getMails(soup, save=0):
    """
    Searches all <a href> tags for links, then checks whether each link
    contains the substring 'mailto', indicating an email address. When it
    does, the address part after the scheme separator is appended to the
    result list.

    Args:
        soup: BeautifulSoup instance that will be used for parsing.
        save: When truthy, the extracted addresses are also written to a
            JSON file via saveJson.

    Returns:
        str: An empty string (results are printed and optionally saved).

    Raises:
        TypeError: If soup is not a BeautifulSoup instance.
    """
    b_colors = Bcolors()
    # NOTE: the previous check compared type objects
    # (isinstance(type(soup), type(BeautifulSoup))), which is always true,
    # and then raised a bare string, which itself fails in Python 3.
    # Check the instance directly and raise a proper exception instead.
    if not isinstance(soup, BeautifulSoup):
        msg = ''.join((b_colors.FAIL,
                       'Method parameter is not of instance BeautifulSoup',
                       b_colors.ENDC))
        raise TypeError(msg)

    emails = []
    for link in soup.find_all('a'):
        url = link.get('href')
        if url and 'mailto' in url:
            # Address is everything after the 'mailto:' scheme separator.
            email_addr = url.split(':')
            emails.append(email_addr[1])

    # Pretty print output as below
    print('')
    print(b_colors.OKGREEN + 'Mails Found - ' + b_colors.ENDC + str(len(emails)))
    print('-------------------------------')

    for mail in emails:
        print(mail)
    if save:
        saveJson("Extracted-Mail-IDs", emails)
    return ''
82 changes: 57 additions & 25 deletions modules/getweblinks.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,74 @@
import urllib.request
import re
from modules.bcolors import Bcolors
from bs4 import BeautifulSoup


def valid_onion_url(link):
    """
    Validates onion urls using regex.

    Args:
        link: the url to be checked

    Returns:
        bool: True/False based on link
    """
    # Accepts http/https schemes followed by a host path ending in 'onion/'.
    onion_re = re.compile(r"^https?\b(://+)(.+)(.+)\bonion/(.*)")
    return bool(onion_re.fullmatch(link))

def valid_url(link):
    """
    Validates general urls using regex.

    Args:
        link: the url to be checked

    Returns:
        bool: True/False based on link
    """
    # http/https scheme, then host text with a three-character TLD slot.
    url_re = re.compile(r"^https?\b(://+)(.+)(.+)\b...(.*)")
    return bool(url_re.fullmatch(link))


def getLinks(soup):

"""
Searches through all <a ref> (hyperlinks) tags and stores them in a
list then validates if the url is formatted correctly.

Args:
soup: BeautifulSoup instance currently being used.

Returns:
websites: List of websites that were found
"""

b_colors = Bcolors()
extensions = []
if ext:
for e in ext:
extensions.append(e)

if isinstance(type(soup), type(BeautifulSoup)):
websites = []

for link in soup.find_all('a'):
web_link = link.get('href')
if web_link and ('http' in web_link or 'https' in web_link):

for exten in extensions:
if web_link.endswith(exten):
websites.append(web_link)
else:
websites.append(web_link)
links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if url and (valid_onion_url(url) or valid_url(url)):
websites.append(url)
"""Pretty print output as below"""
print(''.join((b_colors.OKGREEN,
'Websites Found - ', b_colors.ENDC, str(len(websites)))))
Expand Down
12 changes: 5 additions & 7 deletions tests/test_getweblinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,14 @@ def setUp(self):
self.held, sys.stdout = sys.stdout, StringIO()
self.maxDiff = None

def test_get_links(self):
    """getLinks(soup) returns the expected list of urls, in order."""
    # Expected output for the fixture page loaded into `soup` in setUp.
    data = ['http://aff.ironsocket.com/SH7L',
            'http://aff.ironsocket.com/SH7L',
            'http://wsrs.net/',
            'http://cmsgear.com/']

    result = getweblinks.getLinks(soup)
    self.assertEqual(result, data)


Expand Down
4 changes: 1 addition & 3 deletions tests/test_savetofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,8 @@ def test_save_links(self):
data = ['http://aff.ironsocket.com/SH7L',
'http://aff.ironsocket.com/SH7L',
'http://wsrs.net/',
'http://cmsgear.com/',
'http://cmsgear.com/']
ext = ['.com/']
result = getweblinks.getLinks(soup, ext, 0, 1)
result = getweblinks.getLinks(soup)
self.assertEqual(result, data)


Expand Down
Loading