Analyser exception handling (#69)

* start of analyser tests * analyser error handling and testing
forensic-architecture · Jun 18, 2019 · 76cf367 · 76cf367
1 parent e366b90
commit 76cf367
Show file tree

Hide file tree

Showing 12 changed files with 167 additions and 29 deletions.
diff --git a/run.py b/run.py
@@ -246,6 +246,7 @@ def __run_lib_tests():
             "python",
             "-m",
             "pytest",
+            "test",
         ]
     )
     if returncode is 1:

diff --git a/src/lib/common/analyser.py b/src/lib/common/analyser.py
@@ -6,7 +6,12 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 from lib.common.util import save_logs
-from lib.common.exceptions import ElementShouldSkipError, ElementShouldRetryError
+from lib.common.exceptions import (
+    ElementShouldSkipError,
+    ElementShouldRetryError,
+    InvalidAnalyserConfigError,
+    MTriageStorageCorruptedError,
+)
 from lib.common.mtmodule import MTModule
 
 
@@ -54,7 +59,28 @@ class Analyser(MTModule):
     DERIVED_EXT = "derived"
 
     def __init__(self, config, module, dir):
-        super().__init__(module, dir)
+        """Validate the arguments and initialise the underlying MTModule.
+
+        Args:
+            config: analyser options; must contain 'elements_in', a non-empty list.
+            module: non-empty string naming the analyser.
+            dir: string path handed to MTModule.__init__.
+
+        Raises:
+            InvalidAnalyserConfigError: if an argument fails validation, or if
+                MTModule.__init__ raises PermissionError.
+        """
+        # Validate first so that invalid arguments never reach MTModule.__init__.
+        if "elements_in" not in config:
+            raise InvalidAnalyserConfigError(
+                "The config must contain an 'elements_in' whitelist indicating the analyser's input."
+            )
+        elements_in = config["elements_in"]
+        if not isinstance(elements_in, list) or not elements_in:
+            raise InvalidAnalyserConfigError(
+                "The 'elements_in' whitelist must be a list containing at least one string"
+            )
+
+        if not isinstance(module, str) or not module:
+            raise InvalidAnalyserConfigError(
+                "You must provide a name for your analyser"
+            )
+
+        if not isinstance(dir, str):
+            raise InvalidAnalyserConfigError("You must provide a valid directory path")
+
+        try:
+            super().__init__(module, dir)
+        except PermissionError as e:
+            # Chain the cause so the underlying OS error stays in the traceback.
+            raise InvalidAnalyserConfigError(
+                "You must provide a valid directory path"
+            ) from e
+
         self.CONFIG = config
 
     @abstractmethod
@@ -69,10 +95,15 @@ def analyse_element(self, element, config):
         return NotImplemented
 
     def start_analysing(self):
-        self.__pre_analyse()
-        derived_dirs = self.__analyse()
-        self.__post_analyse(derived_dirs)
-        self.save_and_clear_logs()
+        """Run the pre-analyse, analyse and post-analyse phases, then save logs.
+
+        Raises:
+            MTriageStorageCorruptedError: if any phase raises an Exception; the
+                original error is attached as the cause.
+        """
+        # NOTE: wrapping every failure in one generic error hides the original
+        # message, so the cause is chained to keep it visible in the traceback.
+        # Consider disabling this wrapper during development.
+        try:
+            self.__pre_analyse()
+            derived_dirs = self.__analyse()
+            self.__post_analyse(derived_dirs)
+            self.save_and_clear_logs()
+        except Exception as e:
+            # `except Exception` leaves KeyboardInterrupt/SystemExit untouched.
+            # The method name is passed as the constructor's `fname` argument.
+            raise MTriageStorageCorruptedError("start_analysing") from e
 
     def pre_analyse(self, config):
         """option to set up class variables"""
@@ -237,13 +268,15 @@ def __attempt_analyse(self, attempts, element, config):
         try:
             self.analyse_element(element, config)
         except ElementShouldSkipError as e:
+            os.rmdir(element["dest"])
             self.error_logger(str(e), element)
             return
         except ElementShouldRetryError as e:
             self.error_logger(str(e), element)
             if attempts > 1:
-                return self.attempt_analyse(attempts - 1, element, config)
+                return self.__attempt_analyse(attempts - 1, element, config)
             else:
+                os.rmdir(element["dest"])
                 self.error_logger(
                     "failed after maximum retries - skipping element", element
                 )

diff --git a/src/lib/common/exceptions.py b/src/lib/common/exceptions.py
@@ -57,3 +57,10 @@ def __init__(self, fname):
             f"""The method '{fname}' does not belong to a class that inherits from MTModule. The
                         logged_phase decorator can only be applied to methods on such a class."""
         )
+
+
+class MTriageStorageCorruptedError(Exception):
+    """Raised when MTriage finds an unexpected file structure on disk.
+
+    Args:
+        fname: optional; accepted so callers may pass a name, but it is not
+            used in the message.
+    """
+
+    def __init__(self, fname=None):
+        super().__init__(
+            "MTriage encountered an unexpected file structure in selectors or analysers. Ensure you specified the correct working directory."
+        )
diff --git a/src/lib/common/mtmodule.py b/src/lib/common/mtmodule.py
@@ -61,6 +61,6 @@ def error_logger(self, msg, element=None):
     def __get_context(self, element):
+        """Build the log prefix "<NAME>: <PHASE_KEY>: ", plus "<id>: " for an element.
+
+        Args:
+            element: dict with an 'id' key, or None for module-level messages.
+
+        Returns:
+            The prefix string.
+        """
         context = f"{self.NAME}: {self.PHASE_KEY}: "
-        if element != None:
-            el_id = element["element_id"]
+        if element is not None:
+            el_id = element["id"]
             context = context + f"{el_id}: "
         return context
diff --git a/src/lib/common/selector.py b/src/lib/common/selector.py
@@ -31,7 +31,7 @@ def index(self, config):
         Should populate a dataframe with the results, keep logs, and then call:
             self.index_complete(df, logs)
 
-        REQUIRED: each result in the dataframe must contain an 'element_id' field containing
+        REQUIRED: each result in the dataframe must contain an 'id' field containing
         a unique identifier for the element.
 
         NOTE: should be a relatively light pass that designates the space to be retrieved.
@@ -79,8 +79,8 @@ def __pre_retrieve(self):
     def __retrieve(self, df):
+        """Attempt retrieval of every row in `df`.
+
+        Each row is converted to a dict, given a 'dest' path under
+        ELEMENT_DIR named after the row's 'id', and handed to
+        __attempt_retrieve with 5 as its first argument (presumably the
+        retry budget - confirm against __attempt_retrieve).
+        """
         for index, row in df.iterrows():
             element = row.to_dict()
-            element_id = row["element_id"]
+            # Keep the local name `element_id` so the builtin `id` is not shadowed.
+            element_id = row["id"]
             element["dest"] = f"{self.ELEMENT_DIR}/{element_id}"
             self.__attempt_retrieve(5, element)
 
     @MTModule.logged_phase("post-retrieve")

diff --git a/src/lib/selectors/local/main.py b/src/lib/selectors/local/main.py
@@ -51,7 +51,7 @@ def _run(self, config, output_path):
                         "name": f[0],
                         "extension": f[1],
                         "path": os.path.join(root, file),
-                        "element_id": f"{f[0]}{f[1]}",
+                        "id": f"{f[0]}{f[1]}",
                     }
                 )
                 self.logger("indexed file: " + os.path.join(root, file))

diff --git a/src/lib/selectors/youtube/main.py b/src/lib/selectors/youtube/main.py
@@ -99,14 +99,14 @@ def _add_to_csv_obj(self, csv_obj, s_res):
             desc = search_result["snippet"]["description"]
             publishedAt = search_result["snippet"]["publishedAt"]
             url = f"https://www.youtube.com/watch?v={videoId}"
-            element_id = self._id_from_url(url)
+            id = self._id_from_url(url)
             csv_obj.append(
                 {
                     "url": url,
                     "title": title.replace(",", ";"),
                     "desc": desc.replace(",", ";"),
                     "published": publishedAt[0:10],
-                    "element_id": element_id,
+                    "id": id,
                 }
             )
         return csv_obj

diff --git a/src/test/test_analyser_errors.py b/src/test/test_analyser_errors.py
@@ -0,0 +1,102 @@
+from lib.common.analyser import Analyser
+import os
+import unittest
+from lib.common.exceptions import (
+    ElementShouldRetryError,
+    ElementShouldSkipError,
+    InvalidAnalyserConfigError,
+    MTriageStorageCorruptedError,
+)
+from test.utils import (
+    TEMP_ELEMENT_DIR,
+    scaffold_empty,
+    scaffold_elementmap,
+    cleanup,
+    get_element_path,
+)
+import pandas
+
+
+class ErrorThrowingAnalyser(Analyser):
+    """Analyser stub whose behaviour is selected by the element's id."""
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        # Counts the retry errors raised so far for the "retry3" element.
+        self.retryCount = 0
+
+    def analyse_element(self, element, config):
+        """Raise the error matching the element id; do nothing for other ids.
+
+        "skip" raises ElementShouldSkipError, "retryN" always raises
+        ElementShouldRetryError, and "retry3" raises ElementShouldRetryError
+        until retryCount reaches 3.
+        """
+        el_id = element["id"]
+        if el_id == "skip":
+            raise ElementShouldSkipError("test")
+        if el_id == "retry3" and self.retryCount < 3:
+            self.retryCount += 1
+            raise ElementShouldRetryError("test")
+        if el_id == "retryN":
+            raise ElementShouldRetryError("test")
+
+
+class TestAnalyserErrors(unittest.TestCase):
+    """Error-handling tests for Analyser, driven by ErrorThrowingAnalyser."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Shared fixture: scaffold a selector directory, passing the four
+        # scenario ids as elements, and build one analyser reused by all tests.
+        cls.selname = "stub_sel"
+
+        scaffold_empty(cls.selname, elements=["skip", "retry3", "retryN", "pass"])
+        good = {"elements_in": [cls.selname]}
+
+        cls.an = ErrorThrowingAnalyser(good, "analyserErrorSelector", TEMP_ELEMENT_DIR)
+
+    @classmethod
+    def tearDownClass(cls):
+        cleanup()
+
+    def test_analyse_skip_error(self):
+        # Direct call: the skip error must surface with its formatted message.
+        with self.assertRaisesRegex(ElementShouldSkipError, "test - skipping element"):
+            self.an.analyse_element({"id": "skip"}, {})
+
+    def test_analyse_retry_error(self):
+        # Direct call: the retry error must surface with its formatted message.
+        with self.assertRaisesRegex(ElementShouldRetryError, "test - attempt retry"):
+            self.an.analyse_element({"id": "retryN"}, {})
+
+    def test_bad_init_error(self):
+        good = {"elements_in": ["selname"]}
+
+        # Missing 'elements_in' key.
+        with self.assertRaisesRegex(
+            InvalidAnalyserConfigError, "must contain an 'elements_in' whitelist"
+        ):
+            ErrorThrowingAnalyser({}, "stub", TEMP_ELEMENT_DIR)
+
+        # 'elements_in' present but empty or not a list.
+        for bad_config in ({"elements_in": []}, {"elements_in": None}):
+            with self.subTest(config=bad_config):
+                with self.assertRaisesRegex(
+                    InvalidAnalyserConfigError,
+                    "The 'elements_in' whitelist must be a list containing at least one string",
+                ):
+                    ErrorThrowingAnalyser(bad_config, "stub", TEMP_ELEMENT_DIR)
+
+        # Empty analyser name.
+        with self.assertRaisesRegex(
+            InvalidAnalyserConfigError, "You must provide a name for your analyser"
+        ):
+            ErrorThrowingAnalyser(good, "", TEMP_ELEMENT_DIR)
+
+    def test_integration(self):
+        self.assertEqual(self.an.retryCount, 0)
+        self.an.start_analysing()
+
+        # Skipped and permanently failing elements must leave no output path.
+        skip_path = get_element_path(self.selname, "skip", analyser=self.an.NAME)
+        self.assertFalse(os.path.exists(skip_path))
+
+        retryn_path = get_element_path(self.selname, "retryN", analyser=self.an.NAME)
+        self.assertFalse(os.path.exists(retryn_path))
+
+        # "retry3" succeeds after three retry errors, so its path must exist.
+        retry3_path = get_element_path(self.selname, "retry3", analyser=self.an.NAME)
+        self.assertEqual(self.an.retryCount, 3)
+        self.assertTrue(os.path.exists(retry3_path))
+
+        pass_path = get_element_path(self.selname, "pass", analyser=self.an.NAME)
+        self.assertTrue(os.path.exists(pass_path))
diff --git a/src/test/test_mtmodule.py b/src/test/test_mtmodule.py
@@ -1,4 +1,5 @@
 from abc import ABC
+from test.utils import TEMP_ELEMENT_DIR, cleanup
 from lib.common.exceptions import ImproperLoggedPhaseError
 from lib.common.mtmodule import MTModule
 import os
@@ -13,12 +14,12 @@ class EmptyMTModule(MTModule):
 class TestEmptyMTModule(unittest.TestCase):
     @classmethod
     def setUpClass(self):
+        # Runs once for the class: builds a shared EmptyMTModule named "empty"
+        # rooted at TEMP_ELEMENT_DIR (imported from test.utils).
-        self.BASE_DIR = "../tempdir"
+        self.BASE_DIR = TEMP_ELEMENT_DIR
         self.mod = EmptyMTModule("empty", self.BASE_DIR)
 
     @classmethod
     def tearDownClass(self):
+        # Delegates teardown to the shared test helper; presumably it removes
+        # the temp element directory - confirm in test/utils.py.
-        shutil.rmtree(self.BASE_DIR)
+        cleanup()
 
     def test_class_variables(self):
         self.assertEqual(self.mod.NAME, "empty")

diff --git a/src/test/test_selector.py b/src/test/test_selector.py
@@ -15,7 +15,7 @@
 class EmptySelector(Selector):
     def index(self, config):
-        if not os.path.exists(self.ELEMENT_MAP):
-            df = pd.DataFrame([{"element_id": "test"}])
-            return df
-        else:
-            return None
+        """Return a one-row frame with id "test", or None if ELEMENT_MAP exists."""
+        if os.path.exists(self.ELEMENT_MAP):
+            return None
+        return pd.DataFrame([{"id": "test"}])

diff --git a/src/test/test_selector_errors.py b/src/test/test_selector_errors.py
@@ -30,12 +30,12 @@ def index(self, config):
             return scaffold_elementmap(elements)
 
     def retrieve_element(self, element, config):
+        """Raise the error matching the element id; do nothing for other ids."""
+        el_id = element["id"]
-        if element["element_id"] == "skip":
+        if el_id == "skip":
             raise ElementShouldSkipError("test")
-        elif element["element_id"] == "retry3" and self.retryCount < 3:
+        if el_id == "retry3" and self.retryCount < 3:
             self.retryCount += 1
             raise ElementShouldRetryError("test")
-        elif element["element_id"] == "retryN":
+        if el_id == "retryN":
             raise ElementShouldRetryError("test")
-        else:
-            pass
@@ -75,11 +75,11 @@ def test_index_error(self):
 
     def test_retrieve_skip_error(self):
+        # Calls retrieve_element directly with the "skip" id and expects an
+        # ElementShouldSkipError whose message matches the regex below.
         with self.assertRaisesRegex(ElementShouldSkipError, "test - skipping element"):
-            self.retrieveErrorSelector.retrieve_element({"element_id": "skip"}, {})
+            self.retrieveErrorSelector.retrieve_element({"id": "skip"}, {})
 
     def test_retrieve_retry_error(self):
+        # Calls retrieve_element directly with the "retryN" id and expects an
+        # ElementShouldRetryError whose message matches the regex below.
         with self.assertRaisesRegex(ElementShouldRetryError, "test - attempt retry"):
-            self.retrieveErrorSelector.retrieve_element({"element_id": "retryN"}, {})
+            self.retrieveErrorSelector.retrieve_element({"id": "retryN"}, {})
 
     def test_integration(self):
         self.assertEqual(self.retrieveErrorSelector.retryCount, 0)

diff --git a/src/test/utils.py b/src/test/utils.py
@@ -7,12 +7,6 @@
 
 
 def scaffold_empty(selname, elements=[], analysers=[]):
-    print("scaffold run")
-    if not os.path.exists(TEMP_ELEMENT_DIR):
-        raise Exception(
-            "temp element dir doesn't exist - you need to instantiate it in a selector or analyser before using test methods."
-        )
-
     os.makedirs(f"{TEMP_ELEMENT_DIR}/{selname}/{Analyser.DERIVED_EXT}")
 
     for element in elements:
@@ -32,7 +26,7 @@ def get_element_path(selname, elementId, analyser=None):
 
 
 def scaffold_elementmap(elements=[]):
-    rows = list(map(lambda elid: {"element_id": elid}, elements))
+    rows = list(map(lambda elid: {"id": elid}, elements))
     return pd.DataFrame(rows)