Skip to content

Commit

Permalink
Changes to text parser to handle decode errors log2timeline#3301
Browse files Browse the repository at this point in the history
  • Loading branch information
joachimmetz committed Nov 18, 2020
1 parent 5f1d0f2 commit 13aa4b3
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 21 deletions.
2 changes: 1 addition & 1 deletion config/dpkg/control
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Description: Data files for plaso (log2timeline)

Package: python3-plaso
Architecture: all
Depends: plaso-data (>= ${binary:Version}), libbde-python3 (>= 20140531), libcreg-python3 (>= 20200725), libesedb-python3 (>= 20150409), libevt-python3 (>= 20191104), libevtx-python3 (>= 20141112), libewf-python3 (>= 20131210), libfsapfs-python3 (>= 20201107), libfsext-python3 (>= 20200819), libfshfs-python3 (>= 20201103), libfsntfs-python3 (>= 20200805), libfsxfs-python3 (>= 20201114), libfvde-python3 (>= 20160719), libfwnt-python3 (>= 20180117), libfwsi-python3 (>= 20150606), liblnk-python3 (>= 20150830), libluksde-python3 (>= 20200101), libmsiecf-python3 (>= 20150314), libolecf-python3 (>= 20151223), libqcow-python3 (>= 20131204), libregf-python3 (>= 20201002), libscca-python3 (>= 20190605), libsigscan-python3 (>= 20190629), libsmdev-python3 (>= 20140529), libsmraw-python3 (>= 20140612), libvhdi-python3 (>= 20131210), libvmdk-python3 (>= 20140421), libvshadow-python3 (>= 20160109), libvslvm-python3 (>= 20160109), python3-artifacts (>= 20190305), python3-bencode, python3-certifi (>= 2016.9.26), python3-cffi-backend (>= 1.9.1), python3-chardet (>= 2.0.1), python3-cryptography (>= 2.0.2), python3-dateutil (>= 1.5), python3-defusedxml (>= 0.5.0), python3-dfdatetime (>= 20200824), python3-dfvfs (>= 20201114), python3-dfwinreg (>= 20201002), python3-dtfabric (>= 20200621), python3-elasticsearch (>= 6.0), python3-future (>= 0.16.0), python3-idna (>= 2.5), python3-lz4 (>= 0.10.0), python3-pefile (>= 2018.8.8), python3-psutil (>= 5.4.3), python3-pyparsing (>= 2.3.0), python3-pytsk3 (>= 20160721), python3-redis (>= 3.4), python3-requests (>= 2.18.0), python3-six (>= 1.1.0), python3-tz, python3-urllib3 (>= 1.21.1), python3-xlsxwriter (>= 0.9.3), python3-yaml (>= 3.10), python3-yara (>= 3.4.0), python3-zmq (>= 2.1.11), ${python3:Depends}, ${misc:Depends}
Depends: plaso-data (>= ${binary:Version}), libbde-python3 (>= 20140531), libcreg-python3 (>= 20200725), libesedb-python3 (>= 20150409), libevt-python3 (>= 20191104), libevtx-python3 (>= 20141112), libewf-python3 (>= 20131210), libfsapfs-python3 (>= 20201107), libfsext-python3 (>= 20200819), libfshfs-python3 (>= 20201103), libfsntfs-python3 (>= 20200805), libfsxfs-python3 (>= 20201114), libfvde-python3 (>= 20160719), libfwnt-python3 (>= 20180117), libfwsi-python3 (>= 20150606), liblnk-python3 (>= 20150830), libluksde-python3 (>= 20200101), libmsiecf-python3 (>= 20150314), libolecf-python3 (>= 20151223), libqcow-python3 (>= 20131204), libregf-python3 (>= 20201002), libscca-python3 (>= 20190605), libsigscan-python3 (>= 20190629), libsmdev-python3 (>= 20140529), libsmraw-python3 (>= 20140612), libvhdi-python3 (>= 20131210), libvmdk-python3 (>= 20140421), libvshadow-python3 (>= 20160109), libvslvm-python3 (>= 20160109), python3-artifacts (>= 20190305), python3-bencode, python3-certifi (>= 2016.9.26), python3-cffi-backend (>= 1.9.1), python3-chardet (>= 2.0.1), python3-cryptography (>= 2.0.2), python3-dateutil (>= 1.5), python3-defusedxml (>= 0.5.0), python3-dfdatetime (>= 20200824), python3-dfvfs (>= 20201118), python3-dfwinreg (>= 20201002), python3-dtfabric (>= 20200621), python3-elasticsearch (>= 6.0), python3-future (>= 0.16.0), python3-idna (>= 2.5), python3-lz4 (>= 0.10.0), python3-pefile (>= 2018.8.8), python3-psutil (>= 5.4.3), python3-pyparsing (>= 2.3.0), python3-pytsk3 (>= 20160721), python3-redis (>= 3.4), python3-requests (>= 2.18.0), python3-six (>= 1.1.0), python3-tz, python3-urllib3 (>= 1.21.1), python3-xlsxwriter (>= 0.9.3), python3-yaml (>= 3.10), python3-yara (>= 3.4.0), python3-zmq (>= 2.1.11), ${python3:Depends}, ${misc:Depends}
Description: Python 3 module of plaso (log2timeline)
Plaso (log2timeline) is a framework to create super timelines. Its
purpose is to extract timestamps from various files found on typical
Expand Down
2 changes: 1 addition & 1 deletion dependencies.ini
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ version_property: __version__

[dfvfs]
dpkg_name: python3-dfvfs
minimum_version: 20201114
minimum_version: 20201118
rpm_name: python3-dfvfs
version_property: __version__

Expand Down
2 changes: 1 addition & 1 deletion plaso/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
'dateutil': ('__version__', '1.5', None, True),
'defusedxml': ('__version__', '0.5.0', None, True),
'dfdatetime': ('__version__', '20200824', None, True),
'dfvfs': ('__version__', '20201114', None, True),
'dfvfs': ('__version__', '20201118', None, True),
'dfwinreg': ('__version__', '20201002', None, True),
'dtfabric': ('__version__', '20200621', None, True),
'elasticsearch': ('__versionstr__', '6.0', None, False),
Expand Down
41 changes: 39 additions & 2 deletions plaso/parsers/text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import unicode_literals

import abc
import codecs

import pyparsing

Expand Down Expand Up @@ -179,6 +180,33 @@ def __init__(self):
# TODO: self._line_structures is a work-around and this needs
# a structural fix.
self._line_structures = list(self.LINE_STRUCTURES)
self._parser_mediator = None

codecs.register_error('text_parser_handler', self._EncodingErrorHandler)

def _EncodingErrorHandler(self, exception):
  """Encoding error handler.

  Replaces the byte that could not be decoded by its escaped hexadecimal
  representation, for example "\\xba", and produces an extraction warning
  when a parser mediator has been set.

  Args:
    exception [UnicodeDecodeError]: exception.

  Returns:
    tuple[str, int]: replacement string and the position in the input at
        which decoding should continue.

  Raises:
    TypeError: if exception is not of type UnicodeDecodeError.
  """
  if not isinstance(exception, UnicodeDecodeError):
    raise TypeError('Unsupported exception type.')

  byte_value = exception.object[exception.start]

  if self._parser_mediator:
    self._parser_mediator.ProduceExtractionWarning(
        'error decoding 0x{0:02x} at offset: {1:d}'.format(
            byte_value, self._current_offset + exception.start))

  # Zero-pad the value so that bytes below 0x10 are escaped as "\x0a"
  # instead of "\x a".
  escaped = '\\x{0:02x}'.format(byte_value)
  return (escaped, exception.start + 1)

def _GetValueFromStructure(self, structure, name, default_value=None):
"""Retrieves a token value from a Pyparsing structure.
Expand Down Expand Up @@ -281,7 +309,7 @@ def _ReadLine(self, text_file_object, max_len=None, depth=0):
Raises:
UnicodeDecodeError: if the text cannot be decoded using the specified
encoding.
encoding and encoding errors is set to strict.
"""
line = text_file_object.readline(size=max_len)

Expand Down Expand Up @@ -314,6 +342,10 @@ def ParseFileObject(self, parser_mediator, file_object):
'Line structure undeclared, unable to proceed.')

encoding = self._ENCODING or parser_mediator.codepage

# Use strict encoding error handling in the verification step so that
# a text parser does not generate extraction warning for encoding errors
# of unsupported files.
text_file_object = text_file.TextFile(file_object, encoding=encoding)

try:
Expand All @@ -339,6 +371,12 @@ def ParseFileObject(self, parser_mediator, file_object):
if not self.VerifyStructure(parser_mediator, line):
raise errors.UnableToParseFile('Wrong file structure.')

self._parser_mediator = parser_mediator

text_file_object = text_file.TextFile(
file_object, encoding=encoding, encoding_errors='text_parser_handler')
line = self._ReadLine(text_file_object, max_len=self.MAX_LINE_LENGTH)

consecutive_line_failures = 0
index = None
# Set the offset to the beginning of the file.
Expand Down Expand Up @@ -572,7 +610,6 @@ def ParseFileObject(self, parser_mediator, file_object):
for key, structure in self.LINE_STRUCTURES:
structure.parseWithTabs()


consecutive_line_failures = 0
# Read every line in the text file.
while text_reader.lines:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ chardet >= 2.0.1
cryptography >= 2.0.2
defusedxml >= 0.5.0
dfdatetime >= 20200824
dfvfs >= 20201114
dfvfs >= 20201118
dfwinreg >= 20201002
dtfabric >= 20200621
elasticsearch >= 6.0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ requires = libbde-python3 >= 20140531
python3-dateutil >= 1.5
python3-defusedxml >= 0.5.0
python3-dfdatetime >= 20200824
python3-dfvfs >= 20201114
python3-dfvfs >= 20201118
python3-dfwinreg >= 20201002
python3-dtfabric >= 20200621
python3-elasticsearch >= 6.0
Expand Down
6 changes: 1 addition & 5 deletions tests/parsers/systemd_journal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,11 @@

import unittest

try:
from plaso.parsers import systemd_journal
except ImportError:
systemd_journal = None
from plaso.parsers import systemd_journal

from tests.parsers import test_lib


@unittest.skipIf(systemd_journal is None, 'requires LZMA compression support')
class SystemdJournalParserTest(test_lib.ParserTestCase):
"""Tests for the Systemd Journal parser."""

Expand Down
172 changes: 163 additions & 9 deletions tests/parsers/text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,60 @@

from __future__ import unicode_literals

import codecs
import io
import unittest

import pyparsing

from dfvfs.file_io import file_object_io as dfvfs_file_object_io
from dfvfs.helpers import text_file as dfvfs_text_file
from plaso.parsers import text_parser

from tests.parsers import test_lib


class TestPyparsingSingleLineTextParser(
    text_parser.PyparsingSingleLineTextParser):
  """Minimal single line PyParsing-based text parser for testing purposes.

  The parser accepts every line during verification and does not produce
  any events.
  """

  _ENCODING = 'utf-8'

  # Matches any content up to and including the end of the line.
  _LINE = pyparsing.Regex('.*') + pyparsing.lineEnd()

  LINE_STRUCTURES = [('line', _LINE)]

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a log record structure.

    This test implementation intentionally ignores the structure and
    returns without producing events.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): tokens from a parsed log line.
    """

  def VerifyStructure(self, parser_mediator, line):
    """Verifies that the file is one this parser supports.

    This test implementation accepts every line.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      line (str): single line from the text file.

    Returns:
      bool: always True.
    """
    return True


class PyparsingConstantsTest(test_lib.ParserTestCase):
"""Tests the PyparsingConstants text parser."""

Expand Down Expand Up @@ -54,32 +99,141 @@ def testConstantIPv4(self):
text_parser.PyparsingConstants.IPV4_ADDRESS.parseString('34.258')


class PyparsingSingleLineTextParserTest(unittest.TestCase):
class PyparsingSingleLineTextParserTest(test_lib.ParserTestCase):
"""Tests for the single line PyParsing-based text parser."""

# pylint: disable=protected-access
# pylint: disable=attribute-defined-outside-init,protected-access

def _EncodingErrorHandler(self, exception):
  """Encoding error handler that records the errors it handles.

  Every handled error is appended to self._encoding_errors as a tuple of
  the error position and the value of the byte that could not be decoded.

  Args:
    exception [UnicodeDecodeError]: exception.

  Returns:
    tuple[str, int]: replacement string and the position in the input at
        which decoding should continue.

  Raises:
    TypeError: if exception is not of type UnicodeDecodeError.
  """
  if not isinstance(exception, UnicodeDecodeError):
    raise TypeError('Unsupported exception type.')

  byte_value = exception.object[exception.start]
  self._encoding_errors.append((exception.start, byte_value))

  # Zero-pad the value so that bytes below 0x10 are escaped as "\x0a"
  # instead of "\x a".
  escaped = '\\x{0:02x}'.format(byte_value)
  return (escaped, exception.start + 1)

def testIsText(self):
"""Tests the _IsText function."""
parser = text_parser.PyparsingSingleLineTextParser()
test_parser = TestPyparsingSingleLineTextParser()

bytes_in = b'this is My Weird ASCII and non whatever string.'
self.assertTrue(parser._IsText(bytes_in))
self.assertTrue(test_parser._IsText(bytes_in))

bytes_in = 'Plaso Síar Og Raðar Þessu'
self.assertTrue(parser._IsText(bytes_in))
self.assertTrue(test_parser._IsText(bytes_in))

bytes_in = b'\x01\\62LSO\xFF'
self.assertFalse(parser._IsText(bytes_in))
self.assertFalse(test_parser._IsText(bytes_in))

bytes_in = b'T\x00h\x00i\x00s\x00\x20\x00'
self.assertTrue(parser._IsText(bytes_in))
self.assertTrue(test_parser._IsText(bytes_in))

bytes_in = b'Ascii\x00'
self.assertTrue(parser._IsText(bytes_in))
self.assertTrue(test_parser._IsText(bytes_in))

bytes_in = b'Ascii Open then...\x00\x99\x23'
self.assertFalse(parser._IsText(bytes_in))
self.assertFalse(test_parser._IsText(bytes_in))

def testReadLine(self):
  """Tests the _ReadLine function."""
  # Text that decodes cleanly.
  with io.BytesIO(b'This is another file.') as byte_stream:
    file_io = dfvfs_file_object_io.FileObjectIO(
        None, file_object=byte_stream)
    file_io.open()

    parser = TestPyparsingSingleLineTextParser()
    text_file_object = dfvfs_text_file.TextFile(file_io, encoding='utf-8')
    self.assertEqual(
        parser._ReadLine(text_file_object), 'This is another file.')

    file_io.close()

  # An undecodable byte without an encoding errors argument raises.
  with io.BytesIO(b'This is an\xbather file.') as byte_stream:
    file_io = dfvfs_file_object_io.FileObjectIO(
        None, file_object=byte_stream)
    file_io.open()

    parser = TestPyparsingSingleLineTextParser()
    text_file_object = dfvfs_text_file.TextFile(file_io, encoding='utf8')
    with self.assertRaises(UnicodeDecodeError):
      parser._ReadLine(text_file_object)

    file_io.close()

  # An undecodable byte with the built-in "replace" error handler.
  with io.BytesIO(b'This is an\xbather file.') as byte_stream:
    file_io = dfvfs_file_object_io.FileObjectIO(
        None, file_object=byte_stream)
    file_io.open()

    parser = TestPyparsingSingleLineTextParser()
    text_file_object = dfvfs_text_file.TextFile(
        file_io, encoding='utf8', encoding_errors='replace')
    self.assertEqual(
        parser._ReadLine(text_file_object), 'This is an\ufffdther file.')

    file_io.close()

  # An undecodable byte with a custom error handler that records the error.
  self._encoding_errors = []
  codecs.register_error('test_handler', self._EncodingErrorHandler)

  with io.BytesIO(b'This is an\xbather file.') as byte_stream:
    file_io = dfvfs_file_object_io.FileObjectIO(
        None, file_object=byte_stream)
    file_io.open()

    parser = TestPyparsingSingleLineTextParser()
    text_file_object = dfvfs_text_file.TextFile(
        file_io, encoding='utf8', encoding_errors='test_handler')
    self.assertEqual(
        parser._ReadLine(text_file_object), 'This is an\\xbather file.')

    file_io.close()

  # Exactly one error is expected: byte 0xba at position 10.
  self.assertEqual(self._encoding_errors, [(10, 0xba)])

def testParseFileObject(self):
  """Tests the ParseFileObject function."""
  # Pairs of file content and the expected number of extraction warnings.
  # The second file contains a byte (0xba) that cannot be decoded as UTF-8.
  test_cases = [
      (b'This is another file.\nWith two lines.\n', 0),
      (b'This is another file.\nWith tw\xba lines.\n', 1)]

  for file_data, expected_number_of_warnings in test_cases:
    storage_writer = self._CreateStorageWriter()
    parser_mediator = self._CreateParserMediator(storage_writer)

    with io.BytesIO(file_data) as byte_stream:
      file_io = dfvfs_file_object_io.FileObjectIO(
          None, file_object=byte_stream)
      file_io.open()

      test_parser = TestPyparsingSingleLineTextParser()
      test_parser.ParseFileObject(parser_mediator, file_io)

      file_io.close()

    self.assertEqual(
        storage_writer.number_of_warnings, expected_number_of_warnings)
    # The test parser does not generate events.
    self.assertEqual(storage_writer.number_of_events, 0)


if __name__ == '__main__':
Expand Down

0 comments on commit 13aa4b3

Please sign in to comment.