From 2c96d71042ea3f433a345153e8c94d128d0ae7b0 Mon Sep 17 00:00:00 2001 From: Jeffrey Everling Date: Tue, 19 Feb 2019 13:18:50 +0100 Subject: [PATCH 1/3] Adding full text regex support --- cortexutils/analyzer.py | 5 +- cortexutils/extractor.py | 114 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/cortexutils/analyzer.py b/cortexutils/analyzer.py index 88a1c45..e391d0b 100644 --- a/cortexutils/analyzer.py +++ b/cortexutils/analyzer.py @@ -52,7 +52,10 @@ def summary(self, raw): def artifacts(self, raw): # Use the regex extractor, if auto_extract setting is not False if self.auto_extract: - extractor = Extractor(ignore=self.get_data()) + try: + extractor = EnhancedExtractor(ignore=self.get_data()) + except: + extractor = Extractor(ignore=self.get_data()) return extractor.check_iterable(raw) # Return empty list diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index eb124cb..8a421c6 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -24,6 +24,8 @@ class Extractor: def __init__(self, ignore=None): self.ignore = ignore self.regex = self.__init_regex() + self.ftregex = self.__init_ft_regex() + self.asregex = self.__init_analyzer_regex() @staticmethod def __init_regex(): @@ -117,6 +119,93 @@ def __init_regex(): return regex + @staticmethod + def __init_ft_regex(): + + logging.info("Preparing full text regex statements") + + """ + Returns compiled full text regex list. + + :return: List of {type, regex} dicts + :rtype: list + """ + + #### Generic regexes + + # IPv4 + regex = [{ + 'types': ['ip'], + 'regex': re.compile(r'(?:^|\D)((?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9]))(?:\D|$)', re.MULTILINE) + }] + + # URL + regex.append({ + 'types': ['url','fqdn','domain','uri_path'], + 'regex': re.compile(r'((?:http|https):\/\/((?:(?:.*?)\.)?(.*?(?:\.\w+)+))\/?([a-zA-Z0-9\/\-\_\.\~\=\?]+\??)?)', re.MULTILINE) + }) + + # mail + regex.append({ + 'types': ['mail','domain'], + 'regex': re.compile(r'((?:[a-zA-Z0-9\/\-\_\.\+]+)@{1}([a-zA-Z0-9\-\_]+\.[a-zA-Z0-9\-\_\.]+)+)', re.MULTILINE) + }) + + ### Mail Specific regexes + + return regex + + @staticmethod + def __init_analyzer_regex(): + + logging.info("Preparing analyzer specific regex statements place holder") + + """ + Returns False when the analyzer has no analyzer specific regexes. 
+ + :return: Empty list + :rtype: list + """ + + self.empty_list = [] + + return self.empty_list + + def __findftmatch(self, value): + """Checks if the given value is contains regexes + + :param value: The value to check + :type value: str or number + :return: Data type of value, if known, else empty string + :rtype: str + """ + self.found_observables = [] + if isinstance(value, (str, unicode)): + self.regexpack = [] + self.regexpack.append(self.ftregex) + self.regexpack.append(self.asregex) + for r in self.regexpack: + matches = re.findall(r.get('regex'), value) + if len(matches) > 0: + for found_observable in matches: + if isinstance(found_observable, tuple): + i = 0 + for groups in found_observable: + self.found_observables.append({ + 'type': r.get('types')[i], + 'value': found_observable[i] + }) + i += 1 + else: + self.found_observables.append({ + 'type': r.get('types')[0], + 'value': found_observable + }) + if len(self.found_observables) > 0: + return self.found_observables + else: + return '' + def __checktype(self, value): """Checks if the given value is a known datatype @@ -167,6 +256,10 @@ def check_iterable(self, iterable): 'type': dt, 'value': iterable }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) elif isinstance(iterable, list): for item in iterable: if isinstance(item, list) or isinstance(item, dict): @@ -178,6 +271,10 @@ def check_iterable(self, iterable): 'type': dt, 'value': item }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) elif isinstance(iterable, dict): for _, item in iterable.items(): if isinstance(item, list) or isinstance(item, dict): @@ -189,7 +286,24 @@ def check_iterable(self, iterable): 'type': dt, 'value': item }) + #Check full text for regex matches + matches = self.__findftmatch(iterable) + if len(matches) > 0: + results.extend(matches) else: raise TypeError('Not supported type.') + #Deduplicate results for a cleaner result + results = self.deduplicate(results) return results + + def deduplicate(self, list_of_objects): + dedup_list = [] + for object in list_of_objects: + present = False + for new_object in dedup_list: + if object['type'] == new_object['type'] and object['value'] == new_object['value']: + present = True + if not present: + dedup_list.append(object) + return dedup_list \ No newline at end of file From b7202f68599b0f45085016b14466942b27c15628 Mon Sep 17 00:00:00 2001 From: Jeffrey Everling Date: Tue, 19 Feb 2019 18:11:40 +0100 Subject: [PATCH 2/3] Fixed some issues --- cortexutils/analyzer.py | 7 ++----- cortexutils/extractor.py | 43 +++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/cortexutils/analyzer.py b/cortexutils/analyzer.py index e391d0b..f929157 100644 --- a/cortexutils/analyzer.py +++ b/cortexutils/analyzer.py @@ -41,7 +41,7 @@ def build_taxonomy(self, level, namespace, predicate, value): 'namespace': namespace, 'predicate': predicate, 'value': value - } + } def summary(self, raw): """Returns a summary, needed for 'short.html' template. Overwrite it for your needs! 
@@ -52,10 +52,7 @@ def summary(self, raw): def artifacts(self, raw): # Use the regex extractor, if auto_extract setting is not False if self.auto_extract: - try: - extractor = EnhancedExtractor(ignore=self.get_data()) - except: - extractor = Extractor(ignore=self.get_data()) + extractor = Extractor(ignore=self.get_data()) return extractor.check_iterable(raw) # Return empty list diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 8a421c6..53a70c2 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -25,7 +25,6 @@ def __init__(self, ignore=None): self.ignore = ignore self.regex = self.__init_regex() self.ftregex = self.__init_ft_regex() - self.asregex = self.__init_analyzer_regex() @staticmethod def __init_regex(): @@ -119,11 +118,9 @@ def __init_regex(): return regex - @staticmethod - def __init_ft_regex(): - - logging.info("Preparing full text regex statements") - + @staticmethod + def __init_ft_regex(): + """ Returns compiled full text regex list. @@ -132,34 +129,32 @@ def __init_ft_regex(): """ #### Generic regexes - + # IPv4 - regex = [{ + ftregex = [{ 'types': ['ip'], 'regex': re.compile(r'(?:^|\D)((?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9])\.(?:25[0-5]|2[0-4]\d|[1]\d\d|[1-9]\d|[0-9]))(?:\D|$)', re.MULTILINE) }] # URL - regex.append({ + ftregex.append({ 'types': ['url','fqdn','domain','uri_path'], 'regex': re.compile(r'((?:http|https):\/\/((?:(?:.*?)\.)?(.*?(?:\.\w+)+))\/?([a-zA-Z0-9\/\-\_\.\~\=\?]+\??)?)', re.MULTILINE) }) # mail - regex.append({ + ftregex.append({ 'types': ['mail','domain'], 'regex': re.compile(r'((?:[a-zA-Z0-9\/\-\_\.\+]+)@{1}([a-zA-Z0-9\-\_]+\.[a-zA-Z0-9\-\_\.]+)+)', re.MULTILINE) }) - + ### Mail Specific regexes - return regex + return ftregex @staticmethod def __init_analyzer_regex(): - - logging.info("Preparing analyzer specific regex statements place holder") - + """ Returns False when the analyzer has no analyzer specific regexes. 
@@ -167,9 +162,9 @@ def __init_analyzer_regex(): :rtype: list """ - self.empty_list = [] + empty_list = [] - return self.empty_list + return empty_list def __findftmatch(self, value): """Checks if the given value is contains regexes @@ -181,13 +176,11 @@ def __findftmatch(self, value): """ self.found_observables = [] if isinstance(value, (str, unicode)): - self.regexpack = [] - self.regexpack.append(self.ftregex) - self.regexpack.append(self.asregex) + self.regexpack = self.ftregex + self.asregex for r in self.regexpack: - matches = re.findall(r.get('regex'), value) - if len(matches) > 0: - for found_observable in matches: + self.hits = re.findall(r.get('regex'), value) + if len(self.hits) > 0: + for found_observable in self.hits: if isinstance(found_observable, tuple): i = 0 for groups in found_observable: @@ -272,7 +265,7 @@ def check_iterable(self, iterable): 'value': item }) #Check full text for regex matches - matches = self.__findftmatch(iterable) + matches = self.__findftmatch(item) if len(matches) > 0: results.extend(matches) elif isinstance(iterable, dict): @@ -287,7 +280,7 @@ def check_iterable(self, iterable): 'value': item }) #Check full text for regex matches - matches = self.__findftmatch(iterable) + matches = self.__findftmatch(item) if len(matches) > 0: results.extend(matches) else: From 0e40bc8d4b8199d638fd2923591ba9e85b5f32dd Mon Sep 17 00:00:00 2001 From: Jeffrey E Date: Thu, 16 Jan 2020 18:21:32 +0100 Subject: [PATCH 3/3] Removed "+ self.asregex" as mentioned --- cortexutils/extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortexutils/extractor.py b/cortexutils/extractor.py index 53a70c2..a7f90b8 100644 --- a/cortexutils/extractor.py +++ b/cortexutils/extractor.py @@ -176,7 +176,7 @@ def __findftmatch(self, value): """ self.found_observables = [] if isinstance(value, (str, unicode)): - self.regexpack = self.ftregex + self.asregex + self.regexpack = self.ftregex for r in self.regexpack: self.hits = re.findall(r.get('regex'), value) if len(self.hits) > 0: @@ -299,4 +299,4 @@ def deduplicate(self, list_of_objects): present = True if not present: dedup_list.append(object) - return dedup_list \ No newline at end of file + return dedup_list