Commit 4f5602b
Reworked logs into a solid journal file for all scans
OSINT-TECHNOLOGIES authored Aug 21, 2024
1 parent db2a715 commit 4f5602b
Showing 1 changed file with 32 additions and 22 deletions.
54 changes: 32 additions & 22 deletions pagesearch/pagesearch_parsers.py
@@ -4,41 +4,42 @@
 from colorama import Fore, Style
 import os
 import fitz
+import sys
+sys.path.append('service')
+from logs_processing import logging

-def extract_text_from_pdf(filename: str):
+def extract_text_from_pdf(filename: str) -> str:
     try:
-        extract_text_from_pdf_status = 'TEXT EXTRACTING FROM PDF (PAGESEARCH): OK'
+        logging.info('TEXT EXTRACTION FROM PDF (PAGESEARCH): OK')
         doc = fitz.open(filename=filename)
         text = ""
         for page in doc:
             text += page.get_text()
-        return text, extract_text_from_pdf_status
+        return text
     except Exception as e:
-        print(Fore.RED + f"Can't open some PDF file. See logs for details" + Style.RESET_ALL)
-        extract_text_from_pdf_status = f'TEXT EXTRACTING FROM PDF (PAGESEARCH): NOT OK. REASON: {e}'
-        return '', extract_text_from_pdf_status
+        print(Fore.RED + f"Can't open some PDF file. See journal for details" + Style.RESET_ALL)
+        logging.error(f'TEXT EXTRACTION FROM PDF (PAGESEARCH): ERROR. REASON: {e}')
+        pass

-def find_keywords_in_pdfs(ps_docs_path, keywords: list):
+def find_keywords_in_pdfs(ps_docs_path, keywords: list) -> dict:
     try:
-        find_keywords_in_pdfs_status = 'FINDING KEYWORDS IN PDF (PAGESEARCH): OK'
+        logging.info('KEYWORDS SEARCH IN PDF (PAGESEARCH): OK')
         pdf_files = [f for f in os.listdir(ps_docs_path) if f.lower().endswith(".pdf")]
         results = {}
         pdf_with_keywords = 0
         for pdf_file in pdf_files:
             pdf_path = os.path.join(ps_docs_path, pdf_file)
-            extracted_text, extract_text_from_pdf_status = extract_text_from_pdf(pdf_path)
+            extracted_text = extract_text_from_pdf(pdf_path)
             for keyword in keywords:
                 if keyword.lower() in extracted_text.lower():
                     if pdf_file not in results:
                         results[pdf_file] = []
                     results[pdf_file].append(keyword)
                     pdf_with_keywords += 1
-        return results, pdf_with_keywords, extract_text_from_pdf_status, find_keywords_in_pdfs_status
+        return results, pdf_with_keywords
     except Exception as e:
-        print(Fore.RED + f"Can't find keywords. See logs for details")
-        find_keywords_in_pdfs_status = f'FINDING KEYWORDS IN PDF (PAGESEARCH): NOT OK. REASON: {e}'
-        return results, pdf_with_keywords, extract_text_from_pdf_status, find_keywords_in_pdfs_status
+        print(Fore.RED + f"Can't find keywords. See journal for details")
+        logging.error(f'KEYWORDS SEARCH IN PDF (PAGESEARCH): ERROR. REASON: {e}')
+        pass

 def clean_bad_pdfs(ps_docs_path):
@@ -68,16 +69,19 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):

     for url in subdomains_list:
         try:
+            logging.info('ACCESSING SUBDOMAIN (PAGESEARCH): OK')
             response = requests.get('http://' + url)
             if response.status_code == 200:
                 accessible_subdomains += 1
                 soup = BeautifulSoup(response.content, 'html.parser')
-        except Exception:
-            print(Fore.RED + "Can't access some subdomain. See logs for details")
+        except Exception as e:
+            print(Fore.RED + "Can't access some subdomain. See journal for details")
+            logging.error(f'ACCESSING SUBDOMAIN (PAGESEARCH): ERROR. REASON: {e}')
             print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
             pass

     try:
+        logging.info('WEB RESOURCE ADDITIONAL INFO GATHERING (PAGESEARCH): OK')
         title = soup.title.string
         emails = re.findall(email_pattern, soup.text)
         total_emails.append(emails)
@@ -122,12 +126,14 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
             print(Fore.GREEN + "Found cookie: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{cookie_name}. " + Style.RESET_ALL + Fore.GREEN + "Value: " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{cookie_value}" + Style.RESET_ALL)
             cookies_counter += 1

-    except Exception:
-        print(Fore.RED + "Error while getting detailed info on web resource. See logs for details")
+    except Exception as e:
+        print(Fore.RED + "Error while getting detailed info on web resource. See journal for details")
+        logging.error(f'WEB RESOURCE ADDITIONAL INFO GATHERING (PAGESEARCH): ERROR. REASON: {e}')
         print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
         pass

     try:
+        logging.info('FILES EXTRACTION (PAGESEARCH): OK')
         links = soup.find_all('a')
         for link in links:
             href = link.get('href')
@@ -236,8 +242,9 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
                 files_counter += 1
                 print(Fore.GREEN + "File was successfully saved")
                 print(Fore.LIGHTGREEN_EX + "-------------------------------------------------")
-            except Exception:
-                print(Fore.RED + "This file can't be accessed to extract it. See logs for details")
+            except Exception as e:
+                print(Fore.RED + "This file can't be accessed to extract it. See journal for details")
+                logging.error(f'FILES EXTRACTION (PAGESEARCH): ERROR. REASON: {e}')
                 print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
                 pass

@@ -249,11 +256,13 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):
     if keywords_flag == 1:
         print(Fore.GREEN + "Searching keywords in PDF files..." + Style.RESET_ALL)
         try:
-            pdf_results, pdf_with_keywords, extract_text_from_pdf_status, find_keywords_in_pdfs_status = find_keywords_in_pdfs(ps_docs_path, keywords)
+            pdf_results, pdf_with_keywords = find_keywords_in_pdfs(ps_docs_path, keywords)
             for pdf_file, found_keywords in pdf_results.items():
                 print(Fore.GREEN + f"Keywords " + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"{', '.join(found_keywords)}" + Style.RESET_ALL + Fore.GREEN + f" found in '{pdf_file}'" + Style.RESET_ALL)
         except Exception as e:
-            print(Fore.RED + f"Can't find keywords. Reason: {e}")
+            print(Fore.RED + f"Can't find keywords. See journal for details")
+            logging.error(f'KEYWORDS SEARCH IN PDF (PAGESEARCH): ERROR. REASON: {e}')
+            pdf_with_keywords = 0
     elif keywords_flag == 0:
         print(Fore.RED + "Keywords gathering won't start because of None user input" + Style.RESET_ALL)
         print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)
@@ -269,8 +278,9 @@ def subdomains_parser(subdomains_list, report_folder, keywords, keywords_flag):

     if keywords_flag == 0:
         print(Fore.GREEN + "[+] Keywords were not gathered because of None user input")
+        return ps_emails_return
     else:
         print(Fore.GREEN + f"[+] Total {pdf_with_keywords} keywords were found in PDF files")
-        return ps_emails_return, extract_text_from_pdf_status, find_keywords_in_pdfs_status
+        return ps_emails_return
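The module this commit starts importing, `service/logs_processing.py`, is not itself part of the diff. A minimal sketch of what it could look like, assuming it simply configures the standard-library `logging` package to append every record to one journal file and then re-exports it (the journal file name and record format below are assumptions, not taken from this commit):

```python
# service/logs_processing.py -- hypothetical sketch; this module is not shown
# in the commit. It configures the stdlib logging package once to append all
# records to a single journal file, then re-exports it so callers can write:
#     from logs_processing import logging
import logging

logging.basicConfig(
    filename='journal.log',  # assumed journal file name
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
```

Under that assumption, every `logging.info(...)` and `logging.error(...)` call added above lands in the same journal file, which is what replaces the old pattern of threading per-function status strings through return values.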

