Skip to content

Commit

Permalink
multivariate drift bugfix and doc update (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
nikml authored and nnansters committed Mar 22, 2022
1 parent 7562d2b commit 8cc0ec7
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/_static/butterfly-multivariate-drift.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 0 additions & 2 deletions docs/deep_dive/performance_estimation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ for a set of :math:`n` predictions according to following algorithm:
.. math::
FN_{i,j}=\begin{cases} 0,\qquad \qquad \qquad \thinspace y_{i,j}=1 \\ P(\hat{y} \neq y)_{i,j},\qquad y_{i,j}=0\end{cases}
.. math::
7. Calculate steps 2-6 for all predictions in :math:`\hat{\mathbf{p}}`
(i.e. for all :math:`j` from 1 to :math:`n`) so
that confusion matrix elements are calculated for each prediction.
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/Deep Dive Data Reconstruction with PCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
"# let's create plot with results\n",
"figure = rcerror_results.plot()\n",
"figure.show()\n",
"figure.write_image(file=\"butterfly-multivariate-drift.svg\")"
"# figure.write_image(file=\"butterfly-multivariate-drift.svg\")"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/Quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"reference.head()\n",
"# Let's use a chunk size of 5000 data points to create our drift statistics\n",
"chunk_size = 5000\n",
"data = pd.concat([reference, analysis])"
"data = pd.concat([reference, analysis], ignore_index=True)"
]
},
{
Expand Down Expand Up @@ -126,7 +126,7 @@
"outputs": [],
"source": [
"# Let's initialize the object that will perform Data Reconstruction with PCA\n",
"rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=5000)\n",
"rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=chunk_size)\n",
"# NannyML compares drift versus the full reference dataset.\n",
"rcerror_calculator.fit(reference_data=reference)\n",
"# let's see Reconstruction error statistics for all available data\n",
Expand Down Expand Up @@ -161,9 +161,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}
20 changes: 12 additions & 8 deletions nannyml/drift/data_reconstruction/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ def _fit(self, reference_data: pd.DataFrame):

# TODO: We duplicate the reference data 3 times, here. Improve to something more memory efficient?
imputed_reference_data = reference_data.copy(deep=True)
imputed_reference_data[selected_categorical_column_names] = self._imputer_categorical.fit_transform(
imputed_reference_data[selected_categorical_column_names]
)
imputed_reference_data[selected_continuous_column_names] = self._imputer_continuous.fit_transform(
imputed_reference_data[selected_continuous_column_names]
)
if len(selected_categorical_column_names) > 0:
imputed_reference_data[selected_categorical_column_names] = self._imputer_categorical.fit_transform(
imputed_reference_data[selected_categorical_column_names]
)
if len(selected_continuous_column_names) > 0:
imputed_reference_data[selected_continuous_column_names] = self._imputer_continuous.fit_transform(
imputed_reference_data[selected_continuous_column_names]
)

encoder = CountEncoder(cols=selected_categorical_column_names, normalize=True)
encoded_reference_data = imputed_reference_data.copy(deep=True)
Expand Down Expand Up @@ -251,8 +253,10 @@ def _calculate_reconstruction_error_for_data(
data = data.reset_index(drop=True)

# Impute missing values
data[selected_categorical_features] = imputer_categorical.transform(data[selected_categorical_features])
data[selected_continuous_features] = imputer_continuous.transform(data[selected_continuous_features])
if len(selected_categorical_features) > 0:
data[selected_categorical_features] = imputer_categorical.transform(data[selected_categorical_features])
if len(selected_continuous_features) > 0:
data[selected_continuous_features] = imputer_continuous.transform(data[selected_continuous_features])

data[selected_features] = encoder.transform(data[selected_features])

Expand Down
26 changes: 26 additions & 0 deletions tests/test_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,3 +539,29 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data,
}
)
pd.testing.assert_frame_equal(expected_drift, drift.data[['key', 'reconstruction_error']])


def test_data_reconstruction_drift_calculator_with_only_numeric_should_not_fail(  # noqa: D103
    sample_drift_data, sample_drift_metadata
):
    """Regression test: fitting and calculating with only continuous features must not raise.

    Guards the bugfix that skips the categorical imputer when no categorical
    features are selected.
    """
    calc = DataReconstructionDriftCalculator(sample_drift_metadata, chunk_period='W', features=['f1', 'f2'])
    ref_data = sample_drift_data.loc[sample_drift_data['partition'] == 'reference']
    calc.fit(ref_data)
    # Let any exception propagate: pytest reports it with a full traceback,
    # which is strictly more informative than catching it and calling
    # pytest.fail() with no message (the original pattern discarded the cause).
    drift = calc.calculate(data=sample_drift_data)
    assert drift is not None


def test_data_reconstruction_drift_calculator_with_only_categorical_should_not_fail(  # noqa: D103
    sample_drift_data, sample_drift_metadata
):
    """Regression test: fitting and calculating with only categorical features must not raise.

    Guards the bugfix that skips the continuous imputer when no continuous
    features are selected.
    """
    calc = DataReconstructionDriftCalculator(sample_drift_metadata, chunk_period='W', features=['f3', 'f4'])
    ref_data = sample_drift_data.loc[sample_drift_data['partition'] == 'reference']
    calc.fit(ref_data)
    # Let any exception propagate: pytest reports it with a full traceback,
    # which is strictly more informative than catching it and calling
    # pytest.fail() with no message (the original pattern discarded the cause).
    drift = calc.calculate(data=sample_drift_data)
    assert drift is not None

0 comments on commit 8cc0ec7

Please sign in to comment.