From 800567888dd1c7a225db6604252f4a88903273fd Mon Sep 17 00:00:00 2001
From: OSINT-TECHNOLOGIES <77023667+OSINT-TECHNOLOGIES@users.noreply.github.com>
Date: Fri, 14 Jun 2024 19:53:42 +0300
Subject: [PATCH] Added .xlsx report creation module

---
 datagather_modules/xlsx_report_creation.py | 282 +++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 datagather_modules/xlsx_report_creation.py

diff --git a/datagather_modules/xlsx_report_creation.py b/datagather_modules/xlsx_report_creation.py
new file mode 100644
index 0000000..eb15054
--- /dev/null
+++ b/datagather_modules/xlsx_report_creation.py
@@ -0,0 +1,282 @@
+import sys
+
+try:
+    from datetime import datetime
+    import os
+    import openpyxl
+    from openpyxl.styles import Font
+    from colorama import Fore, Style
+    import sqlite3
+except ImportError as e:
+    # colorama itself may be the module that failed to import, so report the error without it
+    print("Import error appeared. Reason: {}".format(e))
+    sys.exit()
+
+sys.path.append('service')
+
+import crawl_processor as cp
+import dorking_processor as dp
+import networking_processor as np
+import db_processing as db
+import files_processing as fp
+
+def create_report(short_domain, url, case_comment, report_file_type):
+    try:
+        ctime = datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss')
+        casename = short_domain.replace(".", "") + '_' + ctime + '.xlsx'
+        foldername = short_domain.replace(".", "") + '_' + ctime
+        db_casename = short_domain.replace(".", "")
+        now = datetime.now()
+        db_creation_date = str(now.year) + str(now.month) + str(now.day)
+        report_folder = "report_{}".format(foldername)
+        robots_filepath = os.path.join(report_folder, '01-robots.txt')
+        sitemap_filepath = os.path.join(report_folder, '02-sitemap.txt')
+        sitemap_links_filepath = os.path.join(report_folder, '03-sitemap_links.txt')
+        os.makedirs(report_folder, exist_ok=True)
+        wb = openpyxl.Workbook()
+        sheet_names = [
+            "GENERAL INFO",
+            "WHOIS",
+            "SOCIAL MEDIAS",
+            "SUBDOMAINS",
+            "DNS SCAN",
+            "SSL CERTIFICATE",
+            "INTERNETDB SEARCH",
+            "WEBSITE TECHNOLOGIES",
+            "SITEMAP LINKS",
+            "DORKING RESULTS"
+        ]
+        sheet = wb.active
+        sheet.title = sheet_names[0]
+        for name in sheet_names[1:]:
+            wb.create_sheet(title=name)
+        bold_font = Font(bold=True)
+
+        print(Fore.GREEN + "Started scanning domain" + Style.RESET_ALL)
+        print(Fore.GREEN + "Getting domain IP address" + Style.RESET_ALL)
+        ip = cp.ip_gather(short_domain)
+        print(Fore.GREEN + 'Gathering WHOIS information' + Style.RESET_ALL)
+        res = cp.whois_gather(short_domain)
+        print(Fore.GREEN + 'Processing e-mail gathering' + Style.RESET_ALL)
+        mails = cp.mail_gather(url)
+        print(Fore.GREEN + 'Processing subdomain gathering' + Style.RESET_ALL)
+        subdomains, subdomains_amount = cp.subdomains_gather(url, short_domain)
+        print(Fore.GREEN + 'Processing social media gathering' + Style.RESET_ALL)
+        social_medias = cp.sm_gather(url)
+        print(Fore.GREEN + 'Processing subdomain analysis' + Style.RESET_ALL)
+        subdomain_urls, subdomain_mails, subdomain_ip, sd_socials = cp.domains_reverse_research(subdomains, report_file_type)
+        print(Fore.GREEN + 'Processing SSL certificate gathering' + Style.RESET_ALL)
+        issuer, subject, notBefore, notAfter, commonName, serialNumber = np.get_ssl_certificate(short_domain)
+        print(Fore.GREEN + 'Processing MX records gathering' + Style.RESET_ALL)
+        mx_records = np.get_dns_info(short_domain)
+        print(Fore.GREEN + 'Extracting robots.txt and sitemap.xml' + Style.RESET_ALL)
+        robots_txt_result = np.get_robots_txt(short_domain, robots_filepath)
+        sitemap_xml_result = np.get_sitemap_xml(short_domain, sitemap_filepath)
+        sitemap_links_status, parsed_links = np.extract_links_from_sitemap(sitemap_links_filepath, sitemap_filepath, 'xlsx')
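+        # Remaining external lookups: website technology fingerprinting,
+        # Shodan InternetDB enumeration and Google dorking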
+        print(Fore.GREEN + 'Gathering info about website technologies' + Style.RESET_ALL)
+        web_servers, cms, programming_languages, web_frameworks, analytics, javascript_frameworks = np.get_technologies(url)
+        print(Fore.GREEN + 'Processing Shodan InternetDB search' + Style.RESET_ALL)
+        ports, hostnames, cpes, tags, vulns = np.query_internetdb(ip, report_file_type)
+        print(Fore.GREEN + 'Processing Google Dorking' + Style.RESET_ALL)
+        dorking_status, dorking_results = dp.transfer_results_to_xlsx(dp.get_dorking_query(short_domain))
+        print(Fore.GREEN + 'Processing XLSX report for {} case...'.format(short_domain) + Style.RESET_ALL)
+        # Merge the social media links found on the main page with those found on
+        # subdomains, then deduplicate every platform's link list
+        common_socials = {key: social_medias.get(key, []) + sd_socials.get(key, []) for key in set(social_medias) | set(sd_socials)}
+        for key in common_socials:
+            common_socials[key] = list(set(common_socials[key]))
+        total_socials = sum(len(values) for values in common_socials.values())
+
+        ws = wb['GENERAL INFO']
+        for row in range(1, 8):
+            ws[f"A{row}"].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'SUBDOMAINS FOUND'
+        ws['A2'] = 'SOCIAL MEDIAS FOUND'
+        ws['A3'] = 'ROBOTS EXTRACTED?'
+        ws['A4'] = 'SITEMAP.XML EXTRACTED?'
+        ws['A5'] = 'SITEMAP.XML LINKS EXTRACTED?'
+        ws['A6'] = 'DORKING STATUS'
+        ws['A7'] = 'REPORT CREATION TIME'
+        ws['B1'] = subdomains_amount
+        ws['B2'] = total_socials
+        ws['B3'] = robots_txt_result
+        ws['B4'] = sitemap_xml_result
+        ws['B5'] = sitemap_links_status
+        ws['B6'] = dorking_status
+        ws['B7'] = ctime
+
+        ws = wb['WHOIS']
+        for row in range(1, 9):
+            ws[f"A{row}"].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'SHORT DOMAIN'
+        ws['A2'] = 'URL'
+        ws['A3'] = 'IP ADDRESS'
+        ws['A4'] = 'REGISTRAR'
+        ws['A5'] = 'CREATION DATE'
+        ws['A6'] = 'EXPIRATION DATE'
+        ws['A7'] = 'NAME SERVERS'
+        ws['A8'] = 'ORGANIZATION NAME'
+        ws['B1'] = short_domain
+        ws['B2'] = url
+        ws['B3'] = ip
+        ws['B4'] = res['registrar']
+        ws['B5'] = res['creation_date']
+        ws['B6'] = res['expiration_date']
+        ws['B7'] = ', '.join(res['name_servers'])
+        ws['B8'] = res['org']
+
+        ws = wb['SOCIAL MEDIAS']
+        for col in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
+            ws[f"{col}1"].font = bold_font
+            ws.column_dimensions[col].width = 70
+        # .get() keeps report creation alive when a platform was not found at all
+        tw_links = common_socials.get('Twitter', [])
+        inst_links = common_socials.get('Instagram', [])
+        tg_links = common_socials.get('Telegram', [])
+        tt_links = common_socials.get('TikTok', [])
+        li_links = common_socials.get('LinkedIn', [])
+        vk_links = common_socials.get('VKontakte', [])
+        yt_links = common_socials.get('YouTube', [])
+        wc_links = common_socials.get('WeChat', [])
+        ok_links = common_socials.get('Odnoklassniki', [])
+        fb_links = common_socials.get('Facebook', [])
+        ws['A1'] = 'FACEBOOK'
+        ws['B1'] = 'TWITTER'
+        ws['C1'] = 'INSTAGRAM'
+        ws['D1'] = 'TELEGRAM'
+        ws['E1'] = 'TIKTOK'
+        ws['F1'] = 'LINKEDIN'
+        ws['G1'] = 'VKONTAKTE'
+        ws['H1'] = 'YOUTUBE'
+        ws['I1'] = 'ODNOKLASSNIKI'
+        ws['J1'] = 'WECHAT'
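+        # Fill the columns in the same order as the header row above:
+        # Facebook in column A through WeChat in column J, one link per row from row 2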
+        social_columns = [fb_links, tw_links, inst_links, tg_links, tt_links,
+                          li_links, vk_links, yt_links, ok_links, wc_links]
+        for col, links in zip('ABCDEFGHIJ', social_columns):
+            for i, link in enumerate(links):
+                ws[f"{col}{i + 2}"] = link
+
+        ws = wb['SUBDOMAINS']
+        for col in ['A', 'B', 'C']:
+            ws[f"{col}1"].font = bold_font
+            ws.column_dimensions[col].width = 70
+        ws['A1'] = 'FOUND SUBDOMAINS'
+        ws['B1'] = 'SUBDOMAIN IP ADDRESSES (NOT CORRELATED)'
+        ws['C1'] = 'SUBDOMAIN EMAILS (NOT CORRELATED)'
+        try:
+            for i in range(len(subdomain_urls)):
+                ws[f"A{i + 2}"] = str(subdomain_urls[i])
+            for i in range(len(subdomain_ip)):
+                ws[f"B{i + 2}"] = str(subdomain_ip[i])
+            for i in range(len(subdomain_mails)):
+                ws[f"C{i + 2}"] = str(subdomain_mails[i])
+        except Exception as e:
+            print(Fore.RED + "Error appeared while writing subdomains information to the XLSX file. Reason: {}".format(e) + Style.RESET_ALL)
+
+        ws = wb['DNS SCAN']
+        for row in range(1, 3):
+            ws[f"A{row}"].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'NAME SERVERS'
+        ws['A2'] = 'MX ADDRESSES'
+        ws['B1'] = ', '.join(res['name_servers'])
+        ws['B2'] = mx_records
+
+        ws = wb['SSL CERTIFICATE']
+        for row in range(1, 7):
+            ws[f"A{row}"].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'ISSUER'
+        ws['A2'] = 'SUBJECT'
+        ws['A3'] = 'NOT BEFORE'
+        ws['A4'] = 'NOT AFTER'
+        ws['A5'] = 'CERTIFICATE NAME'
+        ws['A6'] = 'CERTIFICATE SERIAL NUMBER'
+        ws['B1'] = issuer
+        ws['B2'] = subject
+        ws['B3'] = notBefore
+        ws['B4'] = notAfter
+        ws['B5'] = commonName
+        ws['B6'] = serialNumber
+
+        ws = wb['INTERNETDB SEARCH']
+        for row in range(1, 5):
+            ws[f"A{row}"].font = bold_font
+        ws['I1'].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'OPEN PORTS'
+        ws['A2'] = 'HOSTNAMES'
+        ws['A3'] = 'TAGS'
+        ws['A4'] = 'CPEs'
+        ws['I1'] = 'POTENTIAL VULNERABILITIES'
+        ws['B1'] = str(ports)
+        ws['B2'] = str(hostnames)
+        ws['B3'] = str(tags)
+        ws['B4'] = str(cpes)
+        for i in range(len(vulns)):
+            ws[f"I{i + 2}"] = str(vulns[i])
+
+        ws = wb['WEBSITE TECHNOLOGIES']
+        for row in range(1, 7):
+            ws[f"A{row}"].font = bold_font
+        ws.column_dimensions['A'].width = 45
+        ws.column_dimensions['B'].width = 60
+        ws['A1'] = 'WEB SERVERS'
+        ws['A2'] = 'CMS'
+        ws['A3'] = 'USED PROGRAMMING LANGUAGES'
+        ws['A4'] = 'USED WEB FRAMEWORKS'
+        ws['A5'] = 'ANALYTICS SERVICE'
+        ws['A6'] = 'USED JAVASCRIPT FRAMEWORKS'
+        ws['B1'] = str(web_servers)
+        ws['B2'] = str(cms)
+        ws['B3'] = str(programming_languages)
+        ws['B4'] = str(web_frameworks)
+        ws['B5'] = str(analytics)
+        ws['B6'] = str(javascript_frameworks)
+
+        ws = wb['SITEMAP LINKS']
+        ws.column_dimensions['A'].width = 80
+        for i in range(len(parsed_links)):
+            ws[f"A{i + 1}"] = str(parsed_links[i])
+
+        ws = wb['DORKING RESULTS']
+        ws.column_dimensions['A'].width = 80
+        for i in range(len(dorking_results)):
+            ws[f"A{i + 1}"] = str(dorking_results[i])
+
+        report_file = os.path.join(report_folder, casename)
+        wb.save(report_file)
+        print(Fore.GREEN + "Report for {} case was created at {}".format(short_domain, ctime) + Style.RESET_ALL)
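+        # Persist the report blob and the extracted artifacts (robots.txt, sitemap,
+        # sitemap links, dorking output) as a new case via db_processing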
+        robots_content, sitemap_content, sitemap_links_content, dorking_content = fp.get_db_columns(report_folder)
+        xlsx_blob = fp.get_blob(report_file)
+        db.insert_blob('XLSX', xlsx_blob, db_casename, db_creation_date, case_comment, robots_content, sitemap_content, sitemap_links_content, dorking_content)
+    except Exception as e:
+        print(Fore.RED + 'Unable to create XLSX report. Reason: {}'.format(e) + Style.RESET_ALL)
\ No newline at end of file
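
A minimal usage sketch for the new module (not part of the patch): assuming the repository root is the working directory, so that the relative 'service' path inside the module resolves, it could be driven as below. The domain, URL, case comment and 'xlsx' file-type flag are placeholder values.

    import sys
    sys.path.append('datagather_modules')

    from xlsx_report_creation import create_report

    # Placeholder arguments; any reachable domain/URL pair works the same way
    create_report('example.com', 'http://example.com', 'demo case', 'xlsx')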