@@ -81,6 +81,12 @@ async def filter_event(self, event):
81
81
async def handle_event (self , event ):
82
82
file_path = event .data ["path" ]
83
83
content = await self .scan .helpers .run_in_executor_mp (extract_text , file_path )
84
+ if isinstance (content , tuple ):
85
+ error , traceback = content
86
+ self .error (f"Error extracting text from { file_path } : { error } " )
87
+ self .trace (traceback )
88
+ return
89
+
84
90
if content :
85
91
raw_text_event = self .make_event (
86
92
content ,
@@ -99,49 +105,18 @@ def extract_text(file_path):
99
105
:return: ASCII-encoded plaintext extracted from the document.
100
106
"""
101
107
102
- extractable_file_types = [
103
- ".csv" ,
104
- ".eml" ,
105
- ".msg" ,
106
- ".epub" ,
107
- ".xlsx" ,
108
- ".xls" ,
109
- ".html" ,
110
- ".htm" ,
111
- ".md" ,
112
- ".org" ,
113
- ".odt" ,
114
- ".pdf" ,
115
- ".txt" ,
116
- ".text" ,
117
- ".log" ,
118
- ".ppt" ,
119
- ".pptx" ,
120
- ".rst" ,
121
- ".rtf" ,
122
- ".tsv" ,
123
- ".doc" ,
124
- ".docx" ,
125
- ".xml" ,
126
- ]
127
-
128
- # If the file can be extracted with extractous use its partition function or try and read it
129
- if any (file_path .lower ().endswith (file_type ) for file_type in extractable_file_types ):
130
- try :
131
- extractor = Extractor ()
132
- reader = extractor .extract_file (str (file_path ))
108
+ try :
109
+ extractor = Extractor ()
110
+ reader , metadata = extractor .extract_file (str (file_path ))
133
111
134
- result = ""
112
+ result = ""
113
+ buffer = reader .read (4096 )
114
+ while len (buffer ) > 0 :
115
+ result += buffer .decode ("utf-8" )
135
116
buffer = reader .read (4096 )
136
- while len (buffer ) > 0 :
137
- result += buffer .decode ("utf-8" )
138
- buffer = reader .read (4096 )
139
117
140
- return result .strip ()
118
+ return result .strip ()
119
+ except Exception as e :
120
+ import traceback
141
121
142
- except Exception :
143
- with open (file_path , "rb" ) as file :
144
- return file .read ().decode ("utf-8" , errors = "ignore" )
145
- else :
146
- with open (file_path , "rb" ) as file :
147
- return file .read ().decode ("utf-8" , errors = "ignore" )
122
+ return (str (e ), traceback .format_exc ())
0 commit comments