Skip to content

Commit 3945fd1

Browse files
Merge pull request #1976 from blacklanternsecurity/fix-extractous
Update Extractous with new API changes
2 parents b1d8b94 + b14bad5 commit 3945fd1

File tree

1 file changed

+17
-42
lines changed

1 file changed

+17
-42
lines changed

bbot/modules/extractous.py

+17 -42
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,12 @@ async def filter_event(self, event):
8181
async def handle_event(self, event):
8282
file_path = event.data["path"]
8383
content = await self.scan.helpers.run_in_executor_mp(extract_text, file_path)
84+
if isinstance(content, tuple):
85+
error, traceback = content
86+
self.error(f"Error extracting text from {file_path}: {error}")
87+
self.trace(traceback)
88+
return
89+
8490
if content:
8591
raw_text_event = self.make_event(
8692
content,
@@ -99,49 +105,18 @@ def extract_text(file_path):
99105
:return: ASCII-encoded plaintext extracted from the document.
100106
"""
101107

102-
extractable_file_types = [
103-
".csv",
104-
".eml",
105-
".msg",
106-
".epub",
107-
".xlsx",
108-
".xls",
109-
".html",
110-
".htm",
111-
".md",
112-
".org",
113-
".odt",
114-
".pdf",
115-
".txt",
116-
".text",
117-
".log",
118-
".ppt",
119-
".pptx",
120-
".rst",
121-
".rtf",
122-
".tsv",
123-
".doc",
124-
".docx",
125-
".xml",
126-
]
127-
128-
# If the file can be extracted with extractous use its partition function or try and read it
129-
if any(file_path.lower().endswith(file_type) for file_type in extractable_file_types):
130-
try:
131-
extractor = Extractor()
132-
reader = extractor.extract_file(str(file_path))
108+
try:
109+
extractor = Extractor()
110+
reader, metadata = extractor.extract_file(str(file_path))
133111

134-
result = ""
112+
result = ""
113+
buffer = reader.read(4096)
114+
while len(buffer) > 0:
115+
result += buffer.decode("utf-8")
135116
buffer = reader.read(4096)
136-
while len(buffer) > 0:
137-
result += buffer.decode("utf-8")
138-
buffer = reader.read(4096)
139117

140-
return result.strip()
118+
return result.strip()
119+
except Exception as e:
120+
import traceback
141121

142-
except Exception:
143-
with open(file_path, "rb") as file:
144-
return file.read().decode("utf-8", errors="ignore")
145-
else:
146-
with open(file_path, "rb") as file:
147-
return file.read().decode("utf-8", errors="ignore")
122+
return (str(e), traceback.format_exc())

0 commit comments

Comments
 (0)