-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
63 lines (43 loc) · 1.6 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import textract
import re
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
def anonymize_document(pdf_path, isPdf=True):
    """Load a document, echo its raw text, and return the anonymized result.

    Args:
        pdf_path: Path of the document to load. Despite the name, the same
            argument is used for plain-text files when ``isPdf`` is falsy.
        isPdf: When truthy the file is loaded with ``read_pdf``; otherwise
            it is loaded with ``read_text``.

    Returns:
        Whatever ``anonymize_text`` returns for the loaded text.
    """
    load = read_pdf if isPdf else read_text
    raw_text = load(pdf_path)

    # The un-anonymized text is printed first, followed by a banner that
    # separates it from whatever the caller prints afterwards.
    print(raw_text)
    for banner_line in (
        "=====================================",
        "Anonymizing sensitive data in the PDF",
        "=====================================\n\n",
    ):
        print(banner_line)

    return anonymize_text(raw_text)
def read_pdf(file_path):
    """Extract the text of a PDF and reduce it to printable ASCII.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text as one ``str`` with NUL characters removed,
        underscores replaced by spaces, and every character outside the
        range 0x20-0x7e dropped. Line breaks and tabs fall outside that
        range, so the result is a single unbroken line.
    """
    # Read the path supplied by the caller rather than any module-level
    # variable, so the function works for every PDF it is handed.
    raw_bytes = textract.process(file_path, method="pdfminer")
    textstring = raw_bytes.decode("utf-8")

    # Remove null characters
    textstring = textstring.replace("\0", "")
    # Turn underscores into spaces (presumably fill-in blank lines in the
    # source document -- confirm against real inputs).
    textstring = textstring.replace("_", " ")
    # Remove non-printable characters
    textstring = re.sub(r"[^\x20-\x7e]", "", textstring)
    return textstring
def read_text(file_path):
    """Return the full contents of the text file at ``file_path``.

    The file is opened in text mode for reading with the platform's default
    encoding and is closed before the function returns.
    """
    with open(file_path, "r") as handle:
        return handle.read()
def anonymize_text(text):
    """Detect person names in ``text`` and anonymize them with Presidio.

    Args:
        text: The English text to scan.

    Returns:
        The value returned by ``AnonymizerEngine.anonymize`` for the
        detected ``PERSON`` entities (presumably a Presidio result object
        rather than a plain ``str`` -- confirm against the Presidio API).
    """
    # Fresh engines are created on every call, analyzer first.
    detector, redactor = AnalyzerEngine(), AnonymizerEngine()

    # Only PERSON entities are requested; nothing else is detected.
    person_hits = detector.analyze(
        text=text,
        entities=["PERSON"],
        language="en",
    )

    # Replace each detected span using the anonymizer's default behaviour.
    return redactor.anonymize(text=text, analyzer_results=person_hits)
# Example usage: relative paths of the two sample input documents.
pdf_path = "PARTNERSHIP_AGREEMENT.pdf"
txt_path = "NonCompeteText.txt"

# The plain-text sample is processed here; to run the PDF sample instead,
# call anonymize_document(pdf_path) and keep the default isPdf=True.
anonymized_text = anonymize_document(txt_path, isPdf=False)
print(anonymized_text)