Merge pull request #483 from bouthilx/feature/lpi

Add LPI plot
Epistimio · Nov 28, 2020 · d4a06a0 · d4a06a0
2 parents d5db0c3 + 9dc425e
commit d4a06a0
Show file tree

Hide file tree

Showing 14 changed files with 3,330 additions and 416 deletions.
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -30,6 +30,7 @@ requirements:
     - pandas
     - falcon
     - gunicorn
+    - scikit-learn
 
 test:
   import:

diff --git a/docs/src/user/web_api.rst b/docs/src/user/web_api.rst
@@ -244,9 +244,9 @@ Plots
 The plot resource permits the generation and retrieval of `Plotly <https://plotly.com/>`_ plots to
 visualize your experiments and their results.
 
-.. http:get:: /plots/regret/:experiment
+.. http:get:: /plots/lpi/:experiment
 
-   Return a regret plot for the specified experiment.
+   Return an LPI (Local Parameter Importance) plot for the specified experiment.
 
    **Example response**
 
@@ -274,6 +274,21 @@ visualize your experiments and their results.
 
    :statuscode 404: When the specified experiment doesn't exist in the database.
 
+.. http:get:: /plots/regret/:experiment
+
+   Return a regret plot for the specified experiment.
+
+   **Example response**
+
+   .. sourcecode:: http
+
+      HTTP/1.1 200 OK
+      Content-Type: text/javascript
+
+   The JSON output is generated automatically according to the `Plotly.js schema reference <https://plotly.com/python/reference/index/>`_.
+
+   :statuscode 404: When the specified experiment doesn't exist in the database.
+
 
 Errors
 ------

diff --git a/examples/plotting/plotting-api.ipynb b/examples/plotting/plotting-api.ipynb
diff --git a/setup.py b/setup.py
@@ -58,7 +58,8 @@
             ]
         },
     install_requires=['PyYAML', 'pymongo>=3', 'numpy', 'scipy', 'gitpython', 'filelock',
-                      'tabulate', 'AppDirs', 'plotly', 'pandas', 'gunicorn', 'falcon'],
+                      'tabulate', 'AppDirs', 'plotly', 'pandas', 'gunicorn', 'falcon',
+                      'scikit-learn'],
     tests_require=tests_require,
     setup_requires=['setuptools', 'appdirs', 'pytest-runner'],
     extras_require=dict(test=tests_require),

diff --git a/src/orion/analysis/__init__.py b/src/orion/analysis/__init__.py
@@ -8,6 +8,7 @@
    :synopsis: Provides agnostic HPO analysis tools
 """
 
+from orion.analysis.lpi import lpi
 from orion.analysis.regret import regret
 
-__all__ = ['regret']
+__all__ = ['lpi', 'regret']
diff --git a/src/orion/analysis/lpi.py b/src/orion/analysis/lpi.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+"""
+:mod:`orion.analysis.lpi` -- Provide tools to calculate Local Parameter Importance
+==================================================================================
+
+.. module:: orion.analysis.lpi
+   :platform: Unix
+   :synopsis: Provide tools to calculate Local Parameter Importance
+"""
+import numpy
+import pandas as pd
+from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor,\
+    ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
+
+from orion.core.worker.transformer import build_required_space
+
+
+_regressors_ = {
+    'AdaBoostRegressor': AdaBoostRegressor,
+    'BaggingRegressor': BaggingRegressor,
+    'ExtraTreesRegressor': ExtraTreesRegressor,
+    'GradientBoostingRegressor': GradientBoostingRegressor,
+    'RandomForestRegressor': RandomForestRegressor,
+}
+
+
+def train_regressor(regressor_name, data, **kwargs):
+    """Train regressor model
+
+    Parameters
+    ----------
+    regressor_name: str
+        Name of the regression model to use. Can be one of
+        - AdaBoostRegressor
+        - BaggingRegressor
+        - ExtraTreesRegressor
+        - GradientBoostingRegressor
+        - RandomForestRegressor (Default)
+    data: numpy.ndarray
+        A 2-D array with one row per trial; the last column is the objective and the
+        preceding columns are the hyperparameter values used as regression inputs.
+
+    **kwargs
+        Arguments for the regressor model.
+
+    """
+    if regressor_name not in _regressors_:
+        raise ValueError(
+            f'{regressor_name} is not a supported regressor. '
+            f'Did you mean any of theses: list(_regressors_.keys())')
+
+    regressor = _regressors_[regressor_name](**kwargs)
+    return regressor.fit(data[:, :-1], data[:, -1])
+
+
+def to_numpy(trials, space):
+    """Convert trials in DataFrame to Numpy array of (params + objective)"""
+    return trials[list(space.keys()) + ['objective']].to_numpy()
+
+
+def flatten(trials_array, flattened_space):
+    """Flatten dimensions"""
+    flattened_points = numpy.array(
+        [flattened_space.transform(point[:-1]) for point in trials_array])
+
+    return numpy.concatenate((flattened_points, trials_array[:, -1:]), axis=1)
+
+
+def make_grid(point, space, model, n):
+    """Build a grid based on point.
+
+    The shape of the grid will be
+        (number of hyperparameters,
+         number of points ``n``,
+         number of hyperparameters + 1)
+
+    Last column is the objective predicted by the model for a given point.
+
+    Parameters
+    ----------
+    point: numpy.ndarray
+        Array representation of the best trial, (hyperparameters + objective)
+    space: Space object
+        Space whose dimensions are varied one at a time; the bounds of each
+        dimension are read from ``dim.interval()``. Its dimensions must be in
+        the same order as the hyperparameter columns of ``point``.
+    model: object
+        Trained regressor. Its ``predict`` method is called on the
+        hyperparameter columns of each sub-grid to fill the last column.
+    n: int
+        Number of evenly spaced points generated along each dimension
+        (``numpy.linspace(..., num=n)``).
+
+    The sub-grid ``grid[i]`` varies only hyperparameter ``i``; every other
+    hyperparameter keeps the value it has in ``point``.
+
+    """
+    grid = numpy.zeros((len(space), n, len(space) + 1))
+    for i, dim in enumerate(space.values()):
+        grid[i, :, :] = point
+        grid[i, :, i] = numpy.linspace(*dim.interval(), num=n)
+        grid[i, :, -1] = model.predict(grid[i, :, :-1])
+    return grid
+
+
+def compute_variances(grid):
+    """Compute the variance of the objective for each hyperparameter"""
+    return grid[:, :, -1].var(axis=1)
+
+
+def _lpi(point, space, model, n):
+    """Local parameter importance for each hyperparameter"""
+    grid = make_grid(point, space, model, n)
+    variances = compute_variances(grid)
+    ratios = variances / variances.sum()
+    return pd.DataFrame(data=ratios, index=space.keys(), columns=['LPI'])
+
+
+def _linear_lpi(point, space, model, n):
+    # TODO: implement the ``linear`` mode; for now this returns None
+    return
+
+
+modes = dict(
+    best=_lpi,
+    linear=_linear_lpi)
+
+
+def lpi(trials, space, mode='best', model='RandomForestRegressor', n=20, **kwargs):
+    """
+    Calculates the Local Parameter Importance for a collection of :class:`Trial`.
+
+    For more information on the metric, see original paper at
+    https://ml.informatik.uni-freiburg.de/papers/18-LION12-CAVE.pdf.
+
+    Biedenkapp, André, et al. "Cave: Configuration assessment, visualization and evaluation."
+    International Conference on Learning and Intelligent Optimization. Springer, Cham, 2018.
+
+    Parameters
+    ----------
+    trials: DataFrame
+        A dataframe of trials containing, at least, the column 'objective' and one column
+        per dimension of ``space``.
+
+    space: Space object
+        A space object from an experiment.
+
+    mode: str
+        Mode to compute the LPI.
+        - ``best``: Take the best trial found as the anchor for the LPI
+        - ``linear``: Recompute LPI for all values on a grid (not implemented yet)
+
+    model: str
+        Name of the regression model to use. Can be one of
+        - AdaBoostRegressor
+        - BaggingRegressor
+        - ExtraTreesRegressor
+        - GradientBoostingRegressor
+        - RandomForestRegressor (Default)
+
+    n: int
+        Number of points to compute the variances. Default is 20.
+
+    **kwargs
+        Arguments for the regressor model.
+
+    Returns
+    -------
+    DataFrame
+        LPI value for each parameter. If ``mode`` is ``linear``, a list of param values and
+        LPI metrics is meant to be returned in a DataFrame format (not implemented yet).
+    """
+    flattened_space = build_required_space(
+        space, type_requirement='numerical', shape_requirement='flattened')
+    if trials.empty or trials.shape[0] == 0:
+        return pd.DataFrame(
+            data=[0] * len(flattened_space),
+            index=flattened_space.keys(),
+            columns=['LPI'])
+
+    data = to_numpy(trials, space)
+    data = flatten(data, flattened_space)
+    model = train_regressor(model, data, **kwargs)
+    best_point = data[numpy.argmin(data[:, -1])]
+    results = modes[mode](best_point, flattened_space, model, n)
+    return results
diff --git a/src/orion/core/worker/transformer.py b/src/orion/core/worker/transformer.py
@@ -116,21 +116,23 @@ def reshape(space, shape_requirement):
     reshaped_space = ReshapedSpace(space)
 
     for dim_index, dim in enumerate(space.values()):
-        if numpy.prod(dim.shape) == 1:
+        if not dim.shape or numpy.prod(dim.shape) == 1:
             reshaped_space.register(
                 ReshapedDimension(
                     transformer=Identity(dim.type),
-                    original_dimension=dim
+                    original_dimension=dim,
+                    index=dim_index
                 )
             )
         else:
             for index in itertools.product(*map(range, dim.shape)):
                 key = f'{dim.name}[{",".join(map(str, index))}]'
                 reshaped_space.register(
                     ReshapedDimension(
-                        transformer=View(dim.shape, index, dim_index, dim.type),
+                        transformer=View(dim.shape, index, dim.type),
                         original_dimension=dim,
-                        name=key
+                        name=key,
+                        index=dim_index
                     )
                 )
 
@@ -233,6 +235,8 @@ def transform(self, point):
     # pylint:disable=unused-argument
     def reverse(self, transformed_point, index=None):
         """Return `transformed_point` as it is."""
+        if index is not None:
+            return transformed_point[index]
         return transformed_point
 
     def repr_format(self, what):
@@ -529,10 +533,9 @@ def reverse(self, transformed_point, index=None):
 class View(Transformer):
     """Look-up single index in a dimensions with shape > 1"""
 
-    def __init__(self, shape, index, dim_index, domain_type=None):
+    def __init__(self, shape, index, domain_type=None):
         self.shape = shape
         self.index = index
-        self.dim_index = dim_index
         self._domain_type = domain_type
 
     @property
@@ -542,7 +545,7 @@ def first(self):
 
     def transform(self, point):
         """Only return one element of the group"""
-        return point[self.dim_index][self.index]
+        return point[self.index]
 
     def reverse(self, transformed_point, index=None):
         """Only return packend point if view of first element, otherwise drop."""
@@ -679,11 +682,12 @@ def cardinality(self):
 class ReshapedDimension(TransformedDimension):
     """Duck-type `Dimension` to mimic its functionality."""
 
-    def __init__(self, transformer, original_dimension, name=None):
+    def __init__(self, transformer, original_dimension, index, name=None):
         super(ReshapedDimension, self).__init__(transformer, original_dimension)
         if name is None:
             name = original_dimension.name
         self._name = name
+        self.index = index
 
     @property
     def first(self):
@@ -692,7 +696,7 @@ def first(self):
 
     def transform(self, point):
         """Expose `Transformer.transform` interface from underlying instance."""
-        return self.transformer.transform(point)
+        return self.transformer.transform(point[self.index])
 
     def reverse(self, transformed_point, index=None):
         """Expose `Transformer.reverse` interface from underlying instance."""
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,6 +30,7 @@ requirements: @@
         - pandas
         - falcon
         - gunicorn
+        - scikit-learn
     test:
       import:
@@ Expand Down @@