Replace Pandas with Tabulate in summary (#95)

Replace the pandas dependency with the smaller, pure-Python tabulate package to reduce build size and download time. [ committed by @MattToast ] [ reviewed by @Spartee ]
CrayLabs · Dec 9, 2021 · b091fc1 · b091fc1
2 parents ab0207d + bd9eaf5
commit b091fc1
Show file tree

Hide file tree

Showing 15 changed files with 86 additions and 55 deletions.
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -7,7 +7,7 @@ numpy>=1.18.2
 toml>=0.10.1
 tqdm>=4.50.2
 psutil>=5.7.2
-pandas>=1.1.3
+tabulate>=0.8.9
 black>=20.8b1
 isort>=5.6.4
 pylint>=2.6.0

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 psutil>=5.7.2
 coloredlogs==10.0
-pandas>=1.1.3
+tabulate>=0.8.9
 smartredis>=0.1.1
 redis==3.5.3
 redis-py-cluster==2.1.3

diff --git a/setup.cfg b/setup.cfg
@@ -33,7 +33,7 @@ include_package_data = True
 install_requires =
     psutil>=5.7.2
     coloredlogs==10.0
-    pandas>=1.1.3
+    tabulate>=0.8.9
     smartredis>=0.1.1
     redis-py-cluster==2.1.3
     redis==3.5.3

diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py
@@ -86,7 +86,7 @@ def __init__(
         :rtype: ``Ensemble``
         """
         self.params = init_default({}, params, dict)
-        self.params_as_args = init_default({}, params_as_args, (list,str))
+        self.params_as_args = init_default({}, params_as_args, (list, str))
         self._key_prefixing_enabled = True
         self.batch_settings = init_default({}, batch_settings, BatchSettings)
         self.run_settings = init_default({}, run_settings, RunSettings)
@@ -113,9 +113,7 @@ def _initialize_entities(self, **kwargs):
                 param_names, params = self._read_model_parameters()
 
                 # Compute all combinations of model parameters and arguments
-                all_model_params = strategy(
-                    param_names, params, **kwargs
-                )
+                all_model_params = strategy(param_names, params, **kwargs)
                 if not isinstance(all_model_params, list):
                     raise UserStrategyError(strategy)
 
@@ -171,7 +169,6 @@ def _initialize_entities(self, **kwargs):
             else:
                 logger.info("Empty ensemble created for batch launch")
 
-
     def add_model(self, model):
         """Add a model to this ensemble
 
@@ -300,4 +297,4 @@ def _read_model_parameters(self):
                     "Incorrect type for ensemble parameters\n"
                     + "Must be list, int, or string."
                 )
-        return param_names, parameters
+        return param_names, parameters
diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from smartsim.error.errors import SSConfigError
+
 from ..error import EntityExistsError
 from ..utils.helpers import cat_arg_and_value, init_default
 from .entity import SmartSimEntity
@@ -119,15 +120,18 @@ def attach_generator_files(self, to_copy=None, to_symlink=None, to_configure=Non
         self.files = EntityFiles(to_configure, to_copy, to_symlink)
 
     def params_to_args(self):
-        """Convert parameters to command line arguments and update run settings.
-        """
+        """Convert parameters to command line arguments and update run settings."""
         for param in self.params_as_args:
             if not param in self.params:
-                raise SSConfigError(f"Tried to convert {param} to command line argument " +
-                                    f"for Model {self.name}, but its value was not found in model params")
+                raise SSConfigError(
+                    f"Tried to convert {param} to command line argument "
+                    + f"for Model {self.name}, but its value was not found in model params"
+                )
             if self.run_settings is None:
-                raise SSConfigError(f"Tried to configure command line parameter for Model {self.name}, " +
-                                    "but no RunSettings are set.")
+                raise SSConfigError(
+                    f"Tried to configure command line parameter for Model {self.name}, "
+                    + "but no RunSettings are set."
+                )
             self.run_settings.add_exe_args(cat_arg_and_value(param, self.params[param]))
 
     def __eq__(self, other):

diff --git a/smartsim/experiment.py b/smartsim/experiment.py
@@ -29,7 +29,7 @@
 from os import getcwd
 from pprint import pformat
 
-import pandas as pd
+from tabulate import tabulate
 from tqdm import trange
 
 from .control import Controller, Manifest
@@ -461,41 +461,46 @@ def reconnect_orchestrator(self, checkpoint):
             logger.error(e)
             raise
 
-    def summary(self):
+    def summary(self, format="github"):
         """Return a summary of the ``Experiment``
 
         The summary will show each instance that has been
         launched and completed in this ``Experiment``
 
-        :return: pandas Dataframe of ``Experiment`` history
-        :rtype: pd.DataFrame
+        :param format: the style in which the summary table is formatted,
+                       for a full list of styles see:
+                       https://github.com/astanin/python-tabulate#table-format,
+                       defaults to "github"
+        :type format: str, optional
+        :return: tabulate string of ``Experiment`` history
+        :rtype: str
         """
-        index = 0
-        df = pd.DataFrame(
-            columns=[
-                "Name",
-                "Entity-Type",
-                "JobID",
-                "RunID",
-                "Time",
-                "Status",
-                "Returncode",
-            ]
-        )
+        values = []
+        headers = [
+            "Name",
+            "Entity-Type",
+            "JobID",
+            "RunID",
+            "Time",
+            "Status",
+            "Returncode",
+        ]
+
         # TODO should this include running jobs?
         for job in self._control._jobs.completed.values():
             for run in range(job.history.runs + 1):
-                df.loc[index] = [
-                    job.entity.name,
-                    job.entity.type,
-                    job.history.jids[run],
-                    run,
-                    job.history.job_times[run],
-                    job.history.statuses[run],
-                    job.history.returns[run],
-                ]
-                index += 1
-        return df
+                values.append(
+                    [
+                        job.entity.name,
+                        job.entity.type,
+                        job.history.jids[run],
+                        run,
+                        job.history.job_times[run],
+                        job.history.statuses[run],
+                        job.history.returns[run],
+                    ]
+                )
+        return tabulate(values, headers, showindex=True, tablefmt=format)
 
     def _launch_summary(self, manifest):
         """Experiment pre-launch summary of entities that will be launched

diff --git a/tests/backends/run_sklearn_onnx.py b/tests/backends/run_sklearn_onnx.py
@@ -5,7 +5,6 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
-
 from smartredis import Client
 
 

diff --git a/tests/backends/run_tf.py b/tests/backends/run_tf.py
@@ -1,9 +1,9 @@
 import os
 
 import numpy as np
+from smartredis import Client
 from tensorflow import keras
 
-from smartredis import Client
 from smartsim.tf import freeze_model
 
 

diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py
@@ -4,7 +4,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
 from smartredis import Client
 
 

diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py
@@ -77,17 +77,19 @@ def test_summary(fileutils, wlmutils):
     assert exp.get_status(bad)[0] == constants.STATUS_FAILED
     assert exp.get_status(sleep)[0] == constants.STATUS_COMPLETED
 
-    summary_df = exp.summary()
-    print(summary_df)
-    row = summary_df.loc[0]
+    summary_str = exp.summary(format="plain")
+    print(summary_str)
 
+    rows = [s.split() for s in summary_str.split("\n")]
+    headers = ["Index"] + rows.pop(0)
+
+    row = dict(zip(headers, rows[0]))
     assert sleep.name == row["Name"]
     assert sleep.type == row["Entity-Type"]
     assert 0 == int(row["RunID"])
     assert 0 == int(row["Returncode"])
 
-    row_1 = summary_df.loc[1]
-
+    row_1 = dict(zip(headers, rows[1]))
     assert bad.name == row_1["Name"]
     assert bad.type == row_1["Entity-Type"]
     assert 0 == int(row_1["RunID"])

diff --git a/tests/test_configs/smartredis/consumer.py b/tests/test_configs/smartredis/consumer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import torch
 import torch.nn as nn
-
 from smartredis import Client
 
 if __name__ == "__main__":

diff --git a/tests/test_configs/smartredis/producer.py b/tests/test_configs/smartredis/producer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import torch
 import torch.nn as nn
-
 from smartredis import Client
 
 

diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
@@ -140,7 +140,11 @@ def test_arg_and_model_params_step():
     rs_copy = deepcopy(rs)
     rs_orig_args = rs_copy.exe_args
     ensemble = Ensemble(
-        "step", params, params_as_args=["H", "g_param"], run_settings=rs_copy, perm_strat="step"
+        "step",
+        params,
+        params_as_args=["H", "g_param"],
+        run_settings=rs_copy,
+        perm_strat="step",
     )
     assert len(ensemble) == 2
 

diff --git a/tests/test_experiment.py b/tests/test_experiment.py
@@ -99,3 +99,27 @@ def test_poll(fileutils):
     exp.start(model, block=False)
     exp.poll(interval=1)
     exp.stop(model)
+
+
+def test_summary(fileutils):
+    exp_name = "test_exp_summary"
+    exp = Experiment(exp_name)
+    test_dir = fileutils.make_test_dir(exp_name)
+    m = exp.create_model(
+        "model", path=test_dir, run_settings=RunSettings("echo", "Hello")
+    )
+    exp.start(m)
+    summary_str = exp.summary(format="plain")
+    print(summary_str)
+
+    summary_lines = summary_str.split("\n")
+    assert 2 == len(summary_lines)
+
+    headers, values = [s.split() for s in summary_lines]
+    headers = ["Index"] + headers
+
+    row = dict(zip(headers, values))
+    assert m.name == row["Name"]
+    assert m.type == row["Entity-Type"]
+    assert 0 == int(row["RunID"])
+    assert 0 == int(row["Returncode"])
diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py
@@ -22,9 +22,8 @@
 
 shouldrun = True
 try:
-    import torch
-
     import smartredis
+    import torch
 except ImportError:
     shouldrun = False