Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 50 additions & 30 deletions pytest_splunk_addon/standard_lib/utilities/xml_event_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,48 @@
#

import re
from collections import OrderedDict


supported_headers = OrderedDict(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need an OrderedDict here? Why not just a plain dict?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to have control over which format is checked first; of course, ideally it shouldn't matter. This is a tricky part: I don't think we're using any official or recommended regexes here, and preferably we should find a better solution.

[
(
"CEF",
{
"regex": r"\s(CEF:\d\|[^\|]+\|([^\|]+)\|[^\|]+\|[^\|]+\|[^\|]+\|([^\|]+)\|(.*))",
"match_element": 1,
},
),
(
"CEF_checkpoint",
{
"regex": r"(time=\d+\|[^\|]+\|([^\|]+)\|[^\|]+\|[^\|]+\|[^\|]+\|([^\|]+)\|(.*))",
"match_element": 1,
},
),
(
"rfc5424",
{
"regex": r"(?:(\d{4}[-]\d{2}[-]\d{2}[T]\d{2}[:]\d{2}[:]\d{2}(?:\.\d{1,6})?(?:[+-]\d{2}[:]\d{2}|Z)?))\s(?:([\w][\w\d\.@-]*)|-)\s(.*)$",
"match_element": 3,
},
),
(
"rfc3164",
{
"regex": r"([A-Z][a-z][a-z]\s{1,2}\d{1,2}(?:\s\d{4})?\s\d{2}[:]\d{2}[:]\d{2})\s+([\w][\w\d\.@-]*)\s\w*:?(.*)$",
"match_element": 3,
},
),
(
"httpd",
{
"regex": r"((?:\d+(?:(?:\.|:)(?:\d+|[a-fA-F]+)?){3,8}))(?:\s(?:-|\w+))*\s\[(\d{1,2}\/\w+\/\d{4}(?:[:]\d{2}){3}(?:\.\d{1,6})?(?:\s[+-]\d{2}[:]?\d{2})?(?:Z)?)]\s(.*)$",
"match_element": 3,
},
),
]
)


def escape_char_event(event):
Expand Down Expand Up @@ -71,35 +113,13 @@ def escape_char_event(event):


def strip_syslog_header(raw_event):
    """
    Remove a supported syslog-style header from an event.

    The formats listed in ``supported_headers`` are tried in their declared
    order and the first pattern found in the event decides the result. To
    support a new header type, add an entry (``regex`` + ``match_element``)
    to ``supported_headers``; no change is needed in this function.

    Input: raw event (str)
    Returns: the capture group selected by the matching entry's
        ``match_element``, or None when no supported header is found.
    """
    # remove leading and trailing whitespace so the patterns see the bare event
    raw_event = raw_event.strip()
    for header_format in supported_headers.values():
        header_match = re.search(header_format["regex"], raw_event)
        if header_match:
            # each format declares which capture group holds the header-less event
            return header_match.group(header_format["match_element"])
    # no supported header format recognised
    return None
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,19 @@ def test_escape_char_event(escape_char, expected_output):
"Jan 11 10:25:39 host CEF:Version|Device Vendor|Device Product|Device Version|Device Event Class ID|Name|Severity|[Extension]",
"Version|Device Vendor|Device Product|Device Version|Device Event Class ID|Name|Severity|[Extension]",
),
("dummy string", None),
(
'10.0.1.1 - - [04/Jan/2021:18:37:21 +0530] "GET /tomcat.svg HTTP/1.1" 200 67795',
'"GET /tomcat.svg HTTP/1.1" 200 67795',
),
("- cisco dummy", None),
],
ids=[
"rfc5424-format",
"rfc3164-format",
"rfc3164-format-longer",
"CEF-checkpoint-foramt",
"CEF-format",
"httpd-format",
"wrong-format",
],
)
Expand Down