Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A mission to produce a beautiful and comprehensible code base. (Documenting and Refactoring) #46

Merged
merged 10 commits into from
Jan 11, 2018
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,8 @@ Before you run the torBot make sure the following things are done properly:
* Run tor service
`sudo service tor start`

* Set a password for tor
`tor --hash-password "my_password" `

* Give the password inside torbot.py
`from stem.control import Controller
with Controller.from_port(port = 9051) as controller:
controller.authenticate("your_password_hash")
controller.signal(Signal.NEWNYM)`
* Make sure that your torrc is configured with SOCKS_PORT set to localhost:9050,
which should be the default setting

`python3 torBot.py`
<pre>
Expand Down
43 changes: 23 additions & 20 deletions modules/getemails.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,37 @@
from modules.savefile import saveJson


"""Get all emails from the website"""


def getMails(soup, save=0):
    """
    Searches all <a href> tags for links, then checks whether each link
    contains the substring 'mailto', indicating an email address. When it
    does, the address part after the scheme separator is appended to the
    result list.

    Args:
        soup: BeautifulSoup instance that will be used for parsing.
        save: When truthy, the extracted addresses are also written to a
            JSON file via saveJson.

    Returns:
        str: An empty string (results are printed and optionally saved).

    Raises:
        TypeError: If soup is not a BeautifulSoup instance.
    """
    b_colors = Bcolors()
    # NOTE: the previous check compared type objects
    # (isinstance(type(soup), type(BeautifulSoup))), which is always true,
    # and then raised a bare string, which itself fails in Python 3.
    # Check the instance directly and raise a proper exception instead.
    if not isinstance(soup, BeautifulSoup):
        msg = ''.join((b_colors.FAIL,
                       'Method parameter is not of instance BeautifulSoup',
                       b_colors.ENDC))
        raise TypeError(msg)

    emails = []
    for link in soup.find_all('a'):
        url = link.get('href')
        if url and 'mailto' in url:
            # Address is everything after the 'mailto:' scheme separator.
            email_addr = url.split(':')
            emails.append(email_addr[1])

    # Pretty print output as below
    print('')
    print(b_colors.OKGREEN + 'Mails Found - ' + b_colors.ENDC + str(len(emails)))
    print('-------------------------------')

    for mail in emails:
        print(mail)
    if save:
        saveJson("Extracted-Mail-IDs", emails)
    return ''
82 changes: 57 additions & 25 deletions modules/getweblinks.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,74 @@
import urllib.request
import re
from modules.bcolors import Bcolors
from bs4 import BeautifulSoup


def valid_onion_url(link):
    """
    Validates onion urls using regex.

    Args:
        link: the url to be checked

    Returns:
        bool: True/False based on link
    """
    # Accepts http/https schemes followed by a host path ending in 'onion/'.
    onion_re = re.compile(r"^https?\b(://+)(.+)(.+)\bonion/(.*)")
    return bool(onion_re.fullmatch(link))

def valid_url(link):
    """
    Validates general urls using regex.

    Args:
        link: the url to be checked

    Returns:
        bool: True/False based on link
    """
    # http/https scheme, then host text with a three-character TLD slot.
    url_re = re.compile(r"^https?\b(://+)(.+)(.+)\b...(.*)")
    return bool(url_re.fullmatch(link))


def getLinks(soup):

"""
Searches through all <a ref> (hyperlinks) tags and stores them in a
list then validates if the url is formatted correctly.

Args:
soup: BeautifulSoup instance currently being used.

Returns:
websites: List of websites that were found
"""

b_colors = Bcolors()
extensions = []
if ext:
for e in ext:
extensions.append(e)

if isinstance(type(soup), type(BeautifulSoup)):
websites = []

for link in soup.find_all('a'):
web_link = link.get('href')
if web_link and ('http' in web_link or 'https' in web_link):

for exten in extensions:
if web_link.endswith(exten):
websites.append(web_link)
else:
websites.append(web_link)
links = soup.find_all('a')
for ref in links:
url = ref.get('href')
if url and (valid_onion_url(url) or valid_url(url)):
websites.append(url)
"""Pretty print output as below"""
print(''.join((b_colors.OKGREEN,
'Websites Found - ', b_colors.ENDC, str(len(websites)))))
Expand Down
12 changes: 5 additions & 7 deletions tests/test_getweblinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,14 @@ def setUp(self):
self.held, sys.stdout = sys.stdout, StringIO()
self.maxDiff = None

def test_get_links(self):
    """getLinks(soup) returns the expected list of urls, in order."""
    # Expected output for the fixture page loaded into `soup` in setUp.
    data = ['http://aff.ironsocket.com/SH7L',
            'http://aff.ironsocket.com/SH7L',
            'http://wsrs.net/',
            'http://cmsgear.com/']

    result = getweblinks.getLinks(soup)
    self.assertEqual(result, data)


Expand Down
4 changes: 1 addition & 3 deletions tests/test_savetofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,8 @@ def test_save_links(self):
data = ['http://aff.ironsocket.com/SH7L',
'http://aff.ironsocket.com/SH7L',
'http://wsrs.net/',
'http://cmsgear.com/',
'http://cmsgear.com/']
ext = ['.com/']
result = getweblinks.getLinks(soup, ext, 0, 1)
result = getweblinks.getLinks(soup)
self.assertEqual(result, data)


Expand Down
Loading