From 2c96d71042ea3f433a345153e8c94d128d0ae7b0 Mon Sep 17 00:00:00 2001 From: Jeffrey Everling Date: Tue, 19 Feb 2019 13:18:50 +0100 Subject: [PATCH 1/3] Adding full text regex support --- cortexutils/analyzer.py | 5 +- cortexutils/extractor.py | 114 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/cortexutils/analyzer.py b/cortexutils/analyzer.py index 88a1c45..e391d0b 100644 --- a/cortexutils/analyzer.py +++ b/cortexutils/analyzer.py @@ -52,7 +52,10 @@ def summary(self, raw): def artifacts(self, raw): # Use the regex extractor, if auto_extract setting is not False if self.auto_extract: - extractor = Extractor(ignore=self.get_data()) + try: + extractor = EnhancedExtractor(ignore=self.get_data()) + except: + extractor = Extractor(ignore=self.get_data()) return extractor.check_iterable(raw) # Return empty list diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index eb124cb..8a421c6 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -24,6 +24,8 @@ class Extractor: def __init__(self, ignore=None): self.ignore = ignore self.regex = self.__init_regex() + self.ftregex = self.__init_ft_regex() + self.asregex = self.__init_analyzer_regex() @staticmethod def __init_regex(): @@ -117,6 +119,93 @@ def __init_regex(): return regex + @staticmethod + def __init_ft_regex(): + + logging.info("Preparing full text regex statements") + + """ + Returns compiled full text regex list. + + :return: List of {type, regex} dicts + :rtype: list + """ + + #### Generic regexes + + # IPv4 + regex = [{ + 'types': ['ip'], + 'regex': re.compile(r'(?:^|\D)((?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9]))(?:\D|$)', re.MULTILINE) + }] + + # URL + regex.append({ + 'types': ['url','fqdn','domain','uri_path'], + 'regex': re.compile(r'((?:http|https):\/\/((?:(?:.*?)\.)?(.*?(?:\.\w+)+))\/?([a-zA-Z0-9\/\-\_\.\~\=\?]+\??)?)', re.MULTILINE) + }) + + # mail + regex.append({ + 'types': ['mail','domain'], + 'regex': re.compile(r'((?:[a-zA-Z0-9\/\-\_\.\+]+)@{1}([a-zA-Z0-9\-\_]+\.[a-zA-Z0-9\-\_\.]+)+)', re.MULTILINE) + }) + + ### Mail Specific regexes + + return regex + + @staticmethod + def __init_analyzer_regex(): + + logging.info("Preparing analyzer specific regex statements place holder") + + """ + Returns False when the analyzer has no analyzer specific regexes. 
+ + :return: Empty list + :rtype: list + """ + + self.empty_list = [] + + return self.empty_list + + def __findftmatch(self, value): + """Checks if the given value is contains regexes + + :param value: The value to check + :type value: str or number + :return: Data type of value, if known, else empty string + :rtype: str + """ + self.found_observables = [] + if isinstance(value, (str, unicode)): + self.regexpack = [] + self.regexpack.append(self.ftregex) + self.regexpack.append(self.asregex) + for r in self.regexpack: + matches = re.findall(r.get('regex'), value) + if len(matches) > 0: + for found_observable in matches: + if isinstance(found_observable, tuple): + i = 0 + for groups in found_observable: + self.found_observables.append({ + 'type': r.get('types')[i], + 'value': found_observable[i] + }) + i += 1 + else: + self.found_observables.append({ + 'type': r.get('types')[0], + 'value': found_observable + }) + if len(self.found_observables) > 0: + return self.found_observables + else: + return '' + def __checktype(self, value): """Checks if the given value is a known datatype @@ -167,6 +256,10 @@ def check_iterable(self, iterable): 'type': dt, 'value': iterable }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) elif isinstance(iterable, list): for item in iterable: if isinstance(item, list) or isinstance(item, dict): @@ -178,6 +271,10 @@ def check_iterable(self, iterable): 'type': dt, 'value': item }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) elif isinstance(iterable, dict): for _, item in iterable.items(): if isinstance(item, list) or isinstance(item, dict): @@ -189,7 +286,24 @@ def check_iterable(self, iterable): 'type': dt, 'value': item }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) else: raise TypeError('Not supported type.') + #Deduplicate results for a cleaner result + results = self.deduplicate(results) return results + + def deduplicate(self, list_of_objects): + dedup_list = [] + for object in list_of_objects: + present = False + for new_object in dedup_list: + if object['type'] == new_object['type'] and object['value'] == new_object['value']: + present = True + if not present: + dedup_list.append(object) + return dedup_list \ No newline at end of file From b7202f68599b0f45085016b14466942b27c15628 Mon Sep 17 00:00:00 2001 From: Jeffrey Everling Date: Tue, 19 Feb 2019 18:11:40 +0100 Subject: [PATCH 2/3] Fixed some issues --- cortexutils/analyzer.py | 7 ++----- cortexutils/extractor.py | 43 +++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/cortexutils/analyzer.py b/cortexutils/analyzer.py index e391d0b..f929157 100644 --- a/cortexutils/analyzer.py +++ b/cortexutils/analyzer.py @@ -41,7 +41,7 @@ def build_taxonomy(self, level, namespace, predicate, value): 'namespace': namespace, 'predicate': predicate, 'value': value - } + } def summary(self, raw): """Returns a summary, needed for 'short.html' template. Overwrite it for your needs! 
@@ -52,10 +52,7 @@ def summary(self, raw): def artifacts(self, raw): # Use the regex extractor, if auto_extract setting is not False if self.auto_extract: - try: - extractor = EnhancedExtractor(ignore=self.get_data()) - except: - extractor = Extractor(ignore=self.get_data()) + extractor = Extractor(ignore=self.get_data()) return extractor.check_iterable(raw) # Return empty list diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 8a421c6..53a70c2 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -25,7 +25,6 @@ def __init__(self, ignore=None): self.ignore = ignore self.regex = self.__init_regex() self.ftregex = self.__init_ft_regex() - self.asregex = self.__init_analyzer_regex() @staticmethod def __init_regex(): @@ -119,11 +118,9 @@ def __init_regex(): return regex - @staticmethod - def __init_ft_regex(): - - logging.info("Preparing full text regex statements") - + @staticmethod + def __init_ft_regex(): + """ Returns compiled full text regex list. @@ -132,34 +129,32 @@ def __init_ft_regex(): """ #### Generic regexes - + # IPv4 - regex = [{ + ftregex = [{ 'types': ['ip'], 'regex': re.compile(r'(?:^|\D)((?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9]))(?:\D|$)', re.MULTILINE) }] # URL - regex.append({ + ftregex.append({ 'types': ['url','fqdn','domain','uri_path'], 'regex': re.compile(r'((?:http|https):\/\/((?:(?:.*?)\.)?(.*?(?:\.\w+)+))\/?([a-zA-Z0-9\/\-\_\.\~\=\?]+\??)?)', re.MULTILINE) }) # mail - regex.append({ + ftregex.append({ 'types': ['mail','domain'], 'regex': re.compile(r'((?:[a-zA-Z0-9\/\-\_\.\+]+)@{1}([a-zA-Z0-9\-\_]+\.[a-zA-Z0-9\-\_\.]+)+)', re.MULTILINE) }) - + ### Mail Specific regexes - return regex + return ftregex @staticmethod def __init_analyzer_regex(): - - logging.info("Preparing analyzer specific regex statements place holder") - + """ Returns False when the analyzer has no analyzer specific regexes. 
@@ -167,9 +162,9 @@ def __init_analyzer_regex(): :rtype: list """ - self.empty_list = [] + empty_list = [] - return self.empty_list + return empty_list def __findftmatch(self, value): """Checks if the given value is contains regexes @@ -181,13 +176,11 @@ def __findftmatch(self, value): """ self.found_observables = [] if isinstance(value, (str, unicode)): - self.regexpack = [] - self.regexpack.append(self.ftregex) - self.regexpack.append(self.asregex) + self.regexpack = self.ftregex + self.asregex for r in self.regexpack: - matches = re.findall(r.get('regex'), value) - if len(matches) > 0: - for found_observable in matches: + self.hits = re.findall(r.get('regex'), value) + if len(self.hits) > 0: + for found_observable in self.hits: if isinstance(found_observable, tuple): i = 0 for groups in found_observable: @@ -272,7 +265,7 @@ def check_iterable(self, iterable): 'value': item }) #Check full text for regex matches - matches = self.__findftmatch(iterable) + matches = self.__findftmatch(item) if len(matches) > 0: results.extend(matches) elif isinstance(iterable, dict): @@ -287,7 +280,7 @@ def check_iterable(self, iterable): 'value': item }) #Check full text for regex matches - matches = self.__findftmatch(iterable) + matches = self.__findftmatch(item) if len(matches) > 0: results.extend(matches) else: From 0e40bc8d4b8199d638fd2923591ba9e85b5f32dd Mon Sep 17 00:00:00 2001 From: Jeffrey E Date: Thu, 16 Jan 2020 18:21:32 +0100 Subject: [PATCH 3/3] Removed "+ self.asregex" as mentioned --- cortexutils/extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 53a70c2..a7f90b8 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -176,7 +176,7 @@ def __findftmatch(self, value): """ self.found_observables = [] if isinstance(value, (str, unicode)): - self.regexpack = self.ftregex + self.asregex + self.regexpack = self.ftregex for r in self.regexpack: self.hits = re.findall(r.get('regex'), value) if len(self.hits) > 0: @@ -299,4 +299,4 @@ def deduplicate(self, list_of_objects): present = True if not present: dedup_list.append(object) - return dedup_list \ No newline at end of file + return dedup_list