diff --git a/config/dpkg/control b/config/dpkg/control
index f5e1681339..372c194106 100644
--- a/config/dpkg/control
+++ b/config/dpkg/control
@@ -17,7 +17,7 @@ Description: Data files for plaso (log2timeline)
 
 Package: python3-plaso
 Architecture: all
-Depends: plaso-data (>= ${binary:Version}), libbde-python3 (>= 20140531), libcreg-python3 (>= 20200725), libesedb-python3 (>= 20150409), libevt-python3 (>= 20191104), libevtx-python3 (>= 20141112), libewf-python3 (>= 20131210), libfsapfs-python3 (>= 20201107), libfsext-python3 (>= 20200819), libfshfs-python3 (>= 20201103), libfsntfs-python3 (>= 20200805), libfsxfs-python3 (>= 20201114), libfvde-python3 (>= 20160719), libfwnt-python3 (>= 20180117), libfwsi-python3 (>= 20150606), liblnk-python3 (>= 20150830), libluksde-python3 (>= 20200101), libmsiecf-python3 (>= 20150314), libolecf-python3 (>= 20151223), libqcow-python3 (>= 20131204), libregf-python3 (>= 20201002), libscca-python3 (>= 20190605), libsigscan-python3 (>= 20190629), libsmdev-python3 (>= 20140529), libsmraw-python3 (>= 20140612), libvhdi-python3 (>= 20131210), libvmdk-python3 (>= 20140421), libvshadow-python3 (>= 20160109), libvslvm-python3 (>= 20160109), python3-artifacts (>= 20190305), python3-bencode, python3-certifi (>= 2016.9.26), python3-cffi-backend (>= 1.9.1), python3-chardet (>= 2.0.1), python3-cryptography (>= 2.0.2), python3-dateutil (>= 1.5), python3-defusedxml (>= 0.5.0), python3-dfdatetime (>= 20200824), python3-dfvfs (>= 20201114), python3-dfwinreg (>= 20201002), python3-dtfabric (>= 20200621), python3-elasticsearch (>= 6.0), python3-future (>= 0.16.0), python3-idna (>= 2.5), python3-lz4 (>= 0.10.0), python3-pefile (>= 2018.8.8), python3-psutil (>= 5.4.3), python3-pyparsing (>= 2.3.0), python3-pytsk3 (>= 20160721), python3-redis (>= 3.4), python3-requests (>= 2.18.0), python3-six (>= 1.1.0), python3-tz, python3-urllib3 (>= 1.21.1), python3-xlsxwriter (>= 0.9.3), python3-yaml (>= 3.10), python3-yara (>= 3.4.0), python3-zmq (>= 2.1.11), ${python3:Depends}, ${misc:Depends}
+Depends: plaso-data (>= ${binary:Version}), libbde-python3 (>= 20140531), libcreg-python3 (>= 20200725), libesedb-python3 (>= 20150409), libevt-python3 (>= 20191104), libevtx-python3 (>= 20141112), libewf-python3 (>= 20131210), libfsapfs-python3 (>= 20201107), libfsext-python3 (>= 20200819), libfshfs-python3 (>= 20201103), libfsntfs-python3 (>= 20200805), libfsxfs-python3 (>= 20201114), libfvde-python3 (>= 20160719), libfwnt-python3 (>= 20180117), libfwsi-python3 (>= 20150606), liblnk-python3 (>= 20150830), libluksde-python3 (>= 20200101), libmsiecf-python3 (>= 20150314), libolecf-python3 (>= 20151223), libqcow-python3 (>= 20131204), libregf-python3 (>= 20201002), libscca-python3 (>= 20190605), libsigscan-python3 (>= 20190629), libsmdev-python3 (>= 20140529), libsmraw-python3 (>= 20140612), libvhdi-python3 (>= 20131210), libvmdk-python3 (>= 20140421), libvshadow-python3 (>= 20160109), libvslvm-python3 (>= 20160109), python3-artifacts (>= 20190305), python3-bencode, python3-certifi (>= 2016.9.26), python3-cffi-backend (>= 1.9.1), python3-chardet (>= 2.0.1), python3-cryptography (>= 2.0.2), python3-dateutil (>= 1.5), python3-defusedxml (>= 0.5.0), python3-dfdatetime (>= 20200824), python3-dfvfs (>= 20201118), python3-dfwinreg (>= 20201002), python3-dtfabric (>= 20200621), python3-elasticsearch (>= 6.0), python3-future (>= 0.16.0), python3-idna (>= 2.5), python3-lz4 (>= 0.10.0), python3-pefile (>= 2018.8.8), python3-psutil (>= 5.4.3), python3-pyparsing (>= 2.3.0), python3-pytsk3 (>= 20160721), python3-redis (>= 3.4), python3-requests (>= 2.18.0), python3-six (>= 1.1.0), python3-tz, python3-urllib3 (>= 1.21.1), python3-xlsxwriter (>= 0.9.3), python3-yaml (>= 3.10), python3-yara (>= 3.4.0), python3-zmq (>= 2.1.11), ${python3:Depends}, ${misc:Depends}
 Description: Python 3 module of plaso (log2timeline)
  Plaso (log2timeline) is a framework to create super timelines. Its
  purpose is to extract timestamps from various files found on typical
diff --git a/dependencies.ini b/dependencies.ini
index 73afbb0021..a39b85826a 100644
--- a/dependencies.ini
+++ b/dependencies.ini
@@ -56,7 +56,7 @@ version_property: __version__
 
 [dfvfs]
 dpkg_name: python3-dfvfs
-minimum_version: 20201114
+minimum_version: 20201118
 rpm_name: python3-dfvfs
 version_property: __version__
 
diff --git a/plaso/dependencies.py b/plaso/dependencies.py
index 4d655c42a5..9b9e9d02a6 100644
--- a/plaso/dependencies.py
+++ b/plaso/dependencies.py
@@ -27,7 +27,7 @@
     'dateutil': ('__version__', '1.5', None, True),
     'defusedxml': ('__version__', '0.5.0', None, True),
     'dfdatetime': ('__version__', '20200824', None, True),
-    'dfvfs': ('__version__', '20201114', None, True),
+    'dfvfs': ('__version__', '20201118', None, True),
     'dfwinreg': ('__version__', '20201002', None, True),
     'dtfabric': ('__version__', '20200621', None, True),
     'elasticsearch': ('__versionstr__', '6.0', None, False),
diff --git a/plaso/parsers/text_parser.py b/plaso/parsers/text_parser.py
index 85a1392048..e6af3c89ac 100644
--- a/plaso/parsers/text_parser.py
+++ b/plaso/parsers/text_parser.py
@@ -9,6 +9,7 @@
 from __future__ import unicode_literals
 
 import abc
+import codecs
 
 import pyparsing
 
@@ -179,6 +180,33 @@ def __init__(self):
     # TODO: self._line_structures is a work-around and this needs
     # a structural fix.
     self._line_structures = list(self.LINE_STRUCTURES)
+    self._parser_mediator = None
+
+    codecs.register_error('text_parser_handler', self._EncodingErrorHandler)
+
+  def _EncodingErrorHandler(self, exception):
+    """Encoding error handler.
+
+    Args:
+      exception (UnicodeDecodeError): exception.
+
+    Returns:
+      tuple[str, int]: replacement string and number of bytes to skip.
+
+    Raises:
+      TypeError: if exception is not of type UnicodeDecodeError.
+    """
+    if not isinstance(exception, UnicodeDecodeError):
+      raise TypeError('Unsupported exception type.')
+
+    if self._parser_mediator:
+      self._parser_mediator.ProduceExtractionWarning(
+          'error decoding 0x{0:02x} at offset: {1:d}'.format(
+              exception.object[exception.start],
+              self._current_offset + exception.start))
+
+    escaped = '\\x{0:02x}'.format(exception.object[exception.start])
+    return (escaped, exception.start + 1)
 
   def _GetValueFromStructure(self, structure, name, default_value=None):
     """Retrieves a token value from a Pyparsing structure.
@@ -281,7 +309,7 @@ def _ReadLine(self, text_file_object, max_len=None, depth=0):
 
     Raises:
       UnicodeDecodeError: if the text cannot be decoded using the specified
-          encoding.
+          encoding and the encoding errors mode is set to strict.
     """
     line = text_file_object.readline(size=max_len)
 
@@ -314,6 +342,10 @@ def ParseFileObject(self, parser_mediator, file_object):
           'Line structure undeclared, unable to proceed.')
 
     encoding = self._ENCODING or parser_mediator.codepage
+
+    # Use strict encoding error handling in the verification step so that
+    # a text parser does not generate extraction warnings for encoding errors
+    # in unsupported files.
     text_file_object = text_file.TextFile(file_object, encoding=encoding)
 
     try:
@@ -339,6 +371,12 @@ def ParseFileObject(self, parser_mediator, file_object):
     if not self.VerifyStructure(parser_mediator, line):
       raise errors.UnableToParseFile('Wrong file structure.')
 
+    self._parser_mediator = parser_mediator
+
+    text_file_object = text_file.TextFile(
+        file_object, encoding=encoding, encoding_errors='text_parser_handler')
+    line = self._ReadLine(text_file_object, max_len=self.MAX_LINE_LENGTH)
+
     consecutive_line_failures = 0
     index = None
     # Set the offset to the beginning of the file.
@@ -572,7 +610,6 @@ def ParseFileObject(self, parser_mediator, file_object):
 
     for key, structure in self.LINE_STRUCTURES:
       structure.parseWithTabs()
-    consecutive_line_failures = 0
 
     # Read every line in the text file.
     while text_reader.lines:
diff --git a/requirements.txt b/requirements.txt
index 9beff3cbbb..6e91a9667b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ chardet >= 2.0.1
 cryptography >= 2.0.2
 defusedxml >= 0.5.0
 dfdatetime >= 20200824
-dfvfs >= 20201114
+dfvfs >= 20201118
 dfwinreg >= 20201002
 dtfabric >= 20200621
 elasticsearch >= 6.0
diff --git a/setup.cfg b/setup.cfg
index b43bb413b7..5b4cf54fd0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -55,7 +55,7 @@ requires = libbde-python3 >= 20140531
            python3-dateutil >= 1.5
            python3-defusedxml >= 0.5.0
            python3-dfdatetime >= 20200824
-           python3-dfvfs >= 20201114
+           python3-dfvfs >= 20201118
            python3-dfwinreg >= 20201002
            python3-dtfabric >= 20200621
            python3-elasticsearch >= 6.0
diff --git a/tests/parsers/systemd_journal.py b/tests/parsers/systemd_journal.py
index c8ca843fba..29cbc3de7f 100644
--- a/tests/parsers/systemd_journal.py
+++ b/tests/parsers/systemd_journal.py
@@ -6,15 +6,11 @@
 
 import unittest
 
-try:
-  from plaso.parsers import systemd_journal
-except ImportError:
-  systemd_journal = None
+from plaso.parsers import systemd_journal
 
 from tests.parsers import test_lib
 
 
-@unittest.skipIf(systemd_journal is None, 'requires LZMA compression support')
 class SystemdJournalParserTest(test_lib.ParserTestCase):
   """Tests for the Systemd Journal parser."""
 
diff --git a/tests/parsers/text_parser.py b/tests/parsers/text_parser.py
index d93a0b6880..ac4a298ac5 100644
--- a/tests/parsers/text_parser.py
+++ b/tests/parsers/text_parser.py
@@ -4,15 +4,60 @@
 
 from __future__ import unicode_literals
 
+import codecs
+import io
 import unittest
 
 import pyparsing
 
+from dfvfs.file_io import file_object_io as dfvfs_file_object_io
+from dfvfs.helpers import text_file as dfvfs_text_file
 from plaso.parsers import text_parser
 
 from tests.parsers import test_lib
 
+
+class TestPyparsingSingleLineTextParser(
+    text_parser.PyparsingSingleLineTextParser):
+  """Single line PyParsing-based text parser for testing purposes."""
+
+  _ENCODING = 'utf-8'
+
+  _LINE = pyparsing.Regex('.*') + pyparsing.lineEnd()
+
+  LINE_STRUCTURES = [('line', _LINE)]
+
+  def ParseRecord(self, parser_mediator, key, structure):
+    """Parses a log record structure and produces events.
+
+    This function takes as an input a parsed pyparsing structure
+    and produces an EventObject if possible from that structure.
+
+    Args:
+      parser_mediator (ParserMediator): mediates interactions between parsers
+          and other components, such as storage and dfvfs.
+      key (str): name of the parsed structure.
+      structure (pyparsing.ParseResults): tokens from a parsed log line.
+    """
+    return
+
+  def VerifyStructure(self, parser_mediator, line):
+    """Verify the structure of the file and return boolean based on that check.
+
+    This function should read enough text from the text file to confirm
+    that the file is the correct one for this particular parser.
+
+    Args:
+      parser_mediator (ParserMediator): mediates interactions between parsers
+          and other components, such as storage and dfvfs.
+      line (str): single line from the text file.
+
+    Returns:
+      bool: True if this is the correct parser, False otherwise.
+    """
+    return True
+
 
 class PyparsingConstantsTest(test_lib.ParserTestCase):
   """Tests the PyparsingConstants text parser."""
 
@@ -54,32 +99,141 @@ def testConstantIPv4(self):
       text_parser.PyparsingConstants.IPV4_ADDRESS.parseString('34.258')
 
 
-class PyparsingSingleLineTextParserTest(unittest.TestCase):
+class PyparsingSingleLineTextParserTest(test_lib.ParserTestCase):
   """Tests for the single line PyParsing-based text parser."""
 
-  # pylint: disable=protected-access
+  # pylint: disable=attribute-defined-outside-init,protected-access
+
+  def _EncodingErrorHandler(self, exception):
+    """Encoding error handler.
+
+    Args:
+      exception (UnicodeDecodeError): exception.
+
+    Returns:
+      tuple[str, int]: replacement string and number of bytes to skip.
+
+    Raises:
+      TypeError: if exception is not of type UnicodeDecodeError.
+    """
+    if not isinstance(exception, UnicodeDecodeError):
+      raise TypeError('Unsupported exception type.')
+
+    self._encoding_errors.append(
+        (exception.start, exception.object[exception.start]))
+    escaped = '\\x{0:02x}'.format(exception.object[exception.start])
+    return (escaped, exception.start + 1)
 
   def testIsText(self):
     """Tests the _IsText function."""
-    parser = text_parser.PyparsingSingleLineTextParser()
+    test_parser = TestPyparsingSingleLineTextParser()
 
     bytes_in = b'this is My Weird ASCII and non whatever string.'
-    self.assertTrue(parser._IsText(bytes_in))
+    self.assertTrue(test_parser._IsText(bytes_in))
 
     bytes_in = 'Plaso Síar Og Raðar Þessu'
-    self.assertTrue(parser._IsText(bytes_in))
+    self.assertTrue(test_parser._IsText(bytes_in))
 
     bytes_in = b'\x01\\62LSO\xFF'
-    self.assertFalse(parser._IsText(bytes_in))
+    self.assertFalse(test_parser._IsText(bytes_in))
 
     bytes_in = b'T\x00h\x00i\x00s\x00\x20\x00'
-    self.assertTrue(parser._IsText(bytes_in))
+    self.assertTrue(test_parser._IsText(bytes_in))
 
     bytes_in = b'Ascii\x00'
-    self.assertTrue(parser._IsText(bytes_in))
+    self.assertTrue(test_parser._IsText(bytes_in))
 
     bytes_in = b'Ascii Open then...\x00\x99\x23'
-    self.assertFalse(parser._IsText(bytes_in))
+    self.assertFalse(test_parser._IsText(bytes_in))
+
+  def testReadLine(self):
+    """Tests the _ReadLine function."""
+    with io.BytesIO(b'This is another file.') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_text_file = dfvfs_text_file.TextFile(file_io, encoding='utf-8')
+      line = test_parser._ReadLine(test_text_file)
+      self.assertEqual(line, 'This is another file.')
+
+      file_io.close()
+
+    with io.BytesIO(b'This is an\xbather file.') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_text_file = dfvfs_text_file.TextFile(file_io, encoding='utf8')
+      with self.assertRaises(UnicodeDecodeError):
+        test_parser._ReadLine(test_text_file)
+
+      file_io.close()
+
+    with io.BytesIO(b'This is an\xbather file.') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_text_file = dfvfs_text_file.TextFile(
+          file_io, encoding='utf8', encoding_errors='replace')
+      line = test_parser._ReadLine(test_text_file)
+      self.assertEqual(line, 'This is an\ufffdther file.')
+
+      file_io.close()
+
+    self._encoding_errors = []
+    codecs.register_error('test_handler', self._EncodingErrorHandler)
+
+    with io.BytesIO(b'This is an\xbather file.') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_text_file = dfvfs_text_file.TextFile(
+          file_io, encoding='utf8', encoding_errors='test_handler')
+      line = test_parser._ReadLine(test_text_file)
+      self.assertEqual(line, 'This is an\\xbather file.')
+
+      file_io.close()
+
+    self.assertEqual(len(self._encoding_errors), 1)
+    self.assertEqual(self._encoding_errors[0], (10, 0xba))
+
+  def testParseFileObject(self):
+    """Tests the ParseFileObject function."""
+    storage_writer = self._CreateStorageWriter()
+    parser_mediator = self._CreateParserMediator(storage_writer)
+
+    with io.BytesIO(b'This is another file.\nWith two lines.\n') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_parser.ParseFileObject(parser_mediator, file_io)
+
+      file_io.close()
+
+    self.assertEqual(storage_writer.number_of_warnings, 0)
+    # The test parser does not generate events.
+    self.assertEqual(storage_writer.number_of_events, 0)
+
+    storage_writer = self._CreateStorageWriter()
+    parser_mediator = self._CreateParserMediator(storage_writer)
+
+    with io.BytesIO(
+        b'This is another file.\nWith tw\xba lines.\n') as file_object:
+      file_io = dfvfs_file_object_io.FileObjectIO(None, file_object=file_object)
+      file_io.open()
+
+      test_parser = TestPyparsingSingleLineTextParser()
+      test_parser.ParseFileObject(parser_mediator, file_io)
+
+      file_io.close()
+
+    self.assertEqual(storage_writer.number_of_warnings, 1)
+    # The test parser does not generate events.
+    self.assertEqual(storage_writer.number_of_events, 0)
 
 
 if __name__ == '__main__':
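For reference, a minimal standalone sketch of the codecs.register_error mechanism the new _EncodingErrorHandler relies on; it is not part of the patch, and the handler name 'escape_handler' and function EscapeDecodeError are illustrative only:

# -*- coding: utf-8 -*-
"""Sketch: escaping undecodable bytes via a registered codecs error handler."""

import codecs


def EscapeDecodeError(exception):
  """Returns an escaped replacement string and the offset to resume decoding at."""
  if not isinstance(exception, UnicodeDecodeError):
    raise TypeError('Unsupported exception type.')

  # exception.object is the original byte sequence, exception.start the offset
  # of the byte that could not be decoded.
  escaped = '\\x{0:02x}'.format(exception.object[exception.start])
  return (escaped, exception.start + 1)


codecs.register_error('escape_handler', EscapeDecodeError)

# The handler name is passed as the errors argument of decode(), exactly like
# the 'text_parser_handler' name is passed to dfvfs TextFile in the patch.
print(b'This is an\xbather file.'.decode('utf-8', errors='escape_handler'))
# Expected output: This is an\xbather file.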