From bef51ade45e874446582ba2d3468436d0bbc0aba Mon Sep 17 00:00:00 2001
From: Daniel Rapp
Date: Fri, 17 Apr 2020 16:06:22 -0600
Subject: [PATCH] Handle SIGQUIT on out of memory from NumPy better (#48)

* Add memory test script

* Make rbg more robust: on failure, output whatever info has been collected

* Set limit in rbg of 35000 vertices. This is arbitrary, but allows
  reasonable execution time without using too much virtual memory in my
  configuration

* Update bibliography references (not tied to this branch)
---
 bibliography.md    | 10 ++++--
 scripts/memtest.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 scripts/rbg        | 52 ++++++++++++++++++++-----------
 3 files changed, 120 insertions(+), 20 deletions(-)
 create mode 100644 scripts/memtest.py

diff --git a/bibliography.md b/bibliography.md
index 90a7929..608d674 100644
--- a/bibliography.md
+++ b/bibliography.md
@@ -1,4 +1,10 @@
 | Date encountered | Author | Title |
 |------------------|--------|-------|
-| 4/6/2020 | Stephan Dolan | Fun with Semirings |
-| 4/2/2020 | Stéfan van der Walt, Gael Varoquaux | The NumPy Array: A Structure for Efficient Numerical Computation |
\ No newline at end of file
+| 4/2/2020 | Travis Oliphant | [Guide to NumPy](https://web.mit.edu/dvp/Public/numpybook.pdf) |
+| 4/2/2020 | Stéfan van der Walt, Gael Varoquaux | [The NumPy Array: A Structure for Efficient Numerical Computation](https://arxiv.org/abs/1102.1523) |
+| 4/6/2020 | Stephan Dolan | [Fun with Semirings](http://stedolan.net/research/semirings.pdf) |
+| 4/12/2020 | Alan Cannaday | [Solving Cycling Pedigrees or "Loops" by Analyzing Birth Ranges and Parent-Child Relationships](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.368.9730&rep=rep1&type=pdf) |
+| 4/14/2020 | Leskovec, Rajaraman, Ullman | [Mining of Massive Datasets](http://infolab.stanford.edu/~ullman/mmds/book0n.pdf) |
+| 4/14/2020 | Balabit | [Scalable Sparse Matrix Multiplication in Apache Spark](https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703) |
+
+
\ No newline at end of file
diff --git a/scripts/memtest.py b/scripts/memtest.py
new file mode 100644
index 0000000..907671a
--- /dev/null
+++ b/scripts/memtest.py
@@ -0,0 +1,78 @@
+import numpy as np
+import os
+import psutil
+import sys
+import traceback
+
+PROCESS = psutil.Process(os.getpid())
+MEGA = 1024 * 1024
+
+
+def main():
+    try:
+        print_memory_usage()
+        # alloc_max_str()
+        alloc_max_array()
+    except MemoryError as error:
+        # Output expected MemoryErrors.
+        log_exception(error)
+    except Exception as exception:
+        # Output unexpected Exceptions.
+        log_exception(exception, False)
+
+
+def alloc_max_array():
+    """Allocates successively larger square arrays until allocation fails.
+    See: https://stackoverflow.com/a/15495136
+
+    :return: None
+    """
+    base = 13 * 10000
+    i = 0
+    nbytes = 0
+    while True:
+        try:
+            size = base + i * 1000
+            collection = np.ones((size, size), dtype=np.int32)
+            nbytes = collection.nbytes
+            i += 1
+            if i % 1 == 0:
+                print(f"loop: {i}; size: {size:,}; allocated: {nbytes/(1024*1024*1024):,.2f} GB")
+        except MemoryError as error:
+            # Output expected MemoryErrors.
+            log_exception(error)
+            break
+        except Exception as exception:
+            # Output unexpected Exceptions.
+            log_exception(exception, False)
+            break
+    print(f'Maximum array size: {nbytes:,}')
+    print_memory_usage()
+
+
+def log_exception(exception: BaseException, expected: bool = True):
+    """Prints the passed BaseException to the console, including traceback.
+
+    :param exception: The BaseException to output.
+    :param expected: Determines if the BaseException was expected.
+    """
+    output = "[{}] {}: {}".format('EXPECTED' if expected else 'UNEXPECTED', type(exception).__name__, exception)
+    print(output)
+    _, _, exc_traceback = sys.exc_info()
+    traceback.print_tb(exc_traceback)
+
+
+def print_memory_usage():
+    """Prints current memory usage stats.
+    See: https://stackoverflow.com/a/15495136
+
+    :return: None
+    """
+    total, available, percent, used, free = psutil.virtual_memory()[:5]
+    total, available, used, free = total / MEGA, available / MEGA, used / MEGA, free / MEGA
+    proc = PROCESS.memory_info().vms / MEGA
+    print(f'process = {proc:,.2f} total = {total:,.2f} available = {available:,.2f} used = {used:,.2f} free = {free:,.2f} percent = {percent}')
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/rbg b/scripts/rbg
index 3da8d38..02f8a44 100755
--- a/scripts/rbg
+++ b/scripts/rbg
@@ -15,6 +15,7 @@ from scipy.stats import describe
 
 
 MAX_PRACTICAL_SIZE = 1500
+MAX_ALLOWABLE_SIZE = 35000  # empirically, ~130000 is the SIGKILL threshold on my machine
 
 
 def init_argparse():
@@ -93,6 +94,8 @@ def run_analysis(args):
     col_stats = None
     time_closure = 0
     time_canonical = 0
+    vertex_info = None
+    start = time()
     if args.closure or args.canonical:
         try:
             logger.info("Computing transitive closure")
@@ -114,34 +117,37 @@ def run_analysis(args):
 
         if args.closure and writer:
             write_results(args, writer, R_star, "closure", "closure")
+
+            if args.canonical:
+                logger.info("Computing canonical ordering")
+                start = time()
+                R_canonical = avos_canonical_ordering(R_star)
+                time_canonical = time() - start
+                logger.info("  Canonical ordering complete")
+                if writer:
+                    write_results(args, writer, R_canonical.A, "canonical", "canonical ordering",
+                                  key_permutation=R_canonical.label_permutation)
         except CycleError as e:
+            if time_closure == 0:
+                time_closure = time() - start
             reader.get_vertex_key()
-            logger.info(f"  Error: cycle detected. {reader.get_vertex_key()[e.vertex]} has a path to itself.")
-            return
-
-    if args.canonical:
-        logger.info("Computing canonical ordering")
-        start = time()
-        R_canonical = avos_canonical_ordering(R_star)
-        time_canonical = time() - start
-        logger.info("  Canonical ordering complete")
-        if writer:
-            write_results(args, writer, R_canonical.A, "canonical", "canonical ordering", key_permutation=R_canonical.label_permutation)
+            vertex_info = reader.get_vertex_key()[e.vertex]
+            logger.error(f"  Error: cycle detected. {vertex_info} has a path to itself.")
 
     if args.append_analysis:
         comp_stats = describe([val for val in R_canonical.components.values()]) if R_canonical else None
         header_written = path.exists(args.append_analysis)
         with open(args.append_analysis, "a+") as file:
             if not header_written:
-                file.write("#name,hops,multi parents,vertices,edges (simple),edges (closure),time (closure),time (canonical),mean,row max,row variance,row skewness,row kurtosis,col max,col variance,col skewness,col kurtosis,components,comp min,comp max,comp mean,comp variance,comp skewness,comp kurtosis\n")
+                file.write("#name,hops,multi parents,vertices,edges (simple),edges (closure),time (closure),time (canonical),mean,row max,row variance,row skewness,row kurtosis,col max,col variance,col skewness,col kurtosis,components,comp min,comp max,comp mean,comp variance,comp skewness,comp kurtosis,cycle vertex\n")
             file.write(f"{args.basename},"  # name
                        f"{args.hops},"  # hops
                        f"{'T' if args.ingest_invalid else 'F'},"  # multiple parents allowed
                        f"{graph.shape[0]},"  # vertices
                        f"{graph.nnz},"  # edges (simple)
                        f"{np.count_nonzero(R_star) if R_star is not None else ''},"  # edges (closed)
-                       f"{time_closure},"  # time (closure)
-                       f"{time_canonical},"  # time (canonical)
+                       f"{time_closure if time_closure else ''},"  # time (closure)
+                       f"{time_canonical if time_canonical else ''},"  # time (canonical)
                        f"{row_stats.mean if row_stats is not None else ''},"  # mean
                        f"{row_stats.minmax[1] if row_stats is not None else ''},"  # row max
                        f"{row_stats.variance if row_stats is not None else ''},"  # row variance
@@ -158,12 +164,13 @@ def run_analysis(args):
                        f"{comp_stats.variance if comp_stats is not None else ''},"  # comp variance
                        f"{comp_stats.skewness if comp_stats is not None else ''},"  # comp skewness
                        f"{comp_stats.kurtosis if comp_stats is not None else ''},"  # comp kurtosis
+                       f"{vertex_info if vertex_info else ''},"  # cycle vertex
                        f"\n")
 
 
 def get_file(args, name: str, extension:str="xlsx", output=True):
-    basefile = args.basefile
-    basename = str(Path(basefile).parts[-1])
+    basefile = Path(args.basefile)
+    basename = str(basefile.parts[-1])
     args.basename = basename
     hops = args.hops
     if output:
@@ -205,11 +212,20 @@ if __name__ == '__main__':
                              args.invalid_filter, ignore_file)
     graph: rb.sparse.rb_matrix = reader.read()
-    if graph.shape[0] >= MAX_COLUMNS_EXCEL:
+    if MAX_COLUMNS_EXCEL <= graph.shape[0] <= MAX_ALLOWABLE_SIZE and args.outdir:
         logger.error(f"Trying to ingest a graph that exceeds the size excel can handle (Max: {MAX_COLUMNS_EXCEL:,}).")
-    if graph.shape[0] >= MAX_PRACTICAL_SIZE:
+        args.outdir = None
+    if MAX_PRACTICAL_SIZE <= graph.shape[0] <= MAX_ALLOWABLE_SIZE and args.outdir:
         logger.warning("This graph is on the large size. It will take a few seconds more to write the xlsx file.")
     logger.info(f"  Reading complete. There are {graph.nnz:,} edges in the graph.")
 
+    if graph.shape[0] > MAX_ALLOWABLE_SIZE:
+        # Beyond a certain size, allocating a numpy array will result in a SIGKILL. On my macbook
+        # that threshold is roughly 130,000 vertices; MAX_ALLOWABLE_SIZE is set lower. I'm uncertain
+        # how to determine the limit programmatically, but it can be measured via the memtest script.
+        logger.error(f"Unable to process more than {MAX_ALLOWABLE_SIZE:,} vertices; this graph has "
+                     f"{graph.shape[0]:,}. Skipping closure and canonical ordering.")
+        args.closure = args.canonical = False
+
     args.func(args)
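
Note on the "uncertain how to determine the limit programmatically" comment
above: a possible follow-up is a pre-flight check that compares the projected
size of the dense closure matrix against psutil's available-memory figure
before allocating, rather than a hard-coded MAX_ALLOWABLE_SIZE. The sketch
below is not part of the patch: closure_fits_in_memory and the 2x headroom
factor are hypothetical, and it assumes the closure is materialized as a
dense n x n int32 array, matching what memtest.py allocates.

    import numpy as np
    import psutil


    def closure_fits_in_memory(n_vertices: int, headroom: float = 2.0) -> bool:
        """Roughly estimate whether a dense n x n int32 matrix fits in RAM.

        headroom > 1 reserves space for temporaries created while computing
        the closure; the right factor would have to be tuned empirically.
        """
        needed = n_vertices * n_vertices * np.dtype(np.int32).itemsize
        return needed * headroom <= psutil.virtual_memory().available


    # Hypothetical use in rbg's __main__ block, replacing the hard cap:
    # if not closure_fits_in_memory(graph.shape[0]):
    #     logger.error("Closure matrix would not fit in memory; skipping analysis.")
    #     args.closure = args.canonical = False

Even with such a check, an OS that overcommits memory can still kill the
process after the allocation succeeds, which is an argument for keeping the
empirically measured cap as a backstop.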