Skip to content

Commit

Permalink
multivariate drift bugfix and doc update (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
nikml authored and nnansters committed Mar 22, 2022
1 parent 7562d2b commit 8cc0ec7
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/_static/butterfly-multivariate-drift.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 0 additions & 2 deletions docs/deep_dive/performance_estimation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ for a set of :math:`n` predictions according to following algorithm:
.. math::
FN_{i,j}=\begin{cases} 0,\qquad \qquad \qquad \thinspace y_{i,j}=1 \\ P(\hat{y} \neq y)_{i,j},\qquad y_{i,j}=0\end{cases}
.. math::
7. Calculate steps 2-6 for all predictions in :math:`\hat{\mathbf{p}}`
(i.e. for all :math:`j` from 1 to :math:`n`) so
that confusion matrix elements are calculated for each prediction.
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/Deep Dive Data Reconstruction with PCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
"# let's create plot with results\n",
"figure = rcerror_results.plot()\n",
"figure.show()\n",
"figure.write_image(file=\"butterfly-multivariate-drift.svg\")"
"# figure.write_image(file=\"butterfly-multivariate-drift.svg\")"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/Quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"reference.head()\n",
"# Let's use a chunk size of 5000 data points to create our drift statistics\n",
"chunk_size = 5000\n",
"data = pd.concat([reference, analysis])"
"data = pd.concat([reference, analysis], ignore_index=True)"
]
},
{
Expand Down Expand Up @@ -126,7 +126,7 @@
"outputs": [],
"source": [
"# Let's initialize the object that will perform Data Reconstruction with PCA\n",
"rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=5000)\n",
"rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=chunk_size)\n",
"# NannyML compares drift versus the full reference dataset.\n",
"rcerror_calculator.fit(reference_data=reference)\n",
"# let's see Reconstruction error statistics for all available data\n",
Expand Down Expand Up @@ -161,9 +161,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}
20 changes: 12 additions & 8 deletions nannyml/drift/data_reconstruction/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ def _fit(self, reference_data: pd.DataFrame):

# TODO: We duplicate the reference data 3 times, here. Improve to something more memory efficient?
imputed_reference_data = reference_data.copy(deep=True)
imputed_reference_data[selected_categorical_column_names] = self._imputer_categorical.fit_transform(
imputed_reference_data[selected_categorical_column_names]
)
imputed_reference_data[selected_continuous_column_names] = self._imputer_continuous.fit_transform(
imputed_reference_data[selected_continuous_column_names]
)
if len(selected_categorical_column_names) > 0:
imputed_reference_data[selected_categorical_column_names] = self._imputer_categorical.fit_transform(
imputed_reference_data[selected_categorical_column_names]
)
if len(selected_continuous_column_names) > 0:
imputed_reference_data[selected_continuous_column_names] = self._imputer_continuous.fit_transform(
imputed_reference_data[selected_continuous_column_names]
)

encoder = CountEncoder(cols=selected_categorical_column_names, normalize=True)
encoded_reference_data = imputed_reference_data.copy(deep=True)
Expand Down Expand Up @@ -251,8 +253,10 @@ def _calculate_reconstruction_error_for_data(
data = data.reset_index(drop=True)

# Impute missing values
data[selected_categorical_features] = imputer_categorical.transform(data[selected_categorical_features])
data[selected_continuous_features] = imputer_continuous.transform(data[selected_continuous_features])
if len(selected_categorical_features) > 0:
data[selected_categorical_features] = imputer_categorical.transform(data[selected_categorical_features])
if len(selected_continuous_features) > 0:
data[selected_continuous_features] = imputer_continuous.transform(data[selected_continuous_features])

data[selected_features] = encoder.transform(data[selected_features])

Expand Down
26 changes: 26 additions & 0 deletions tests/test_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,3 +539,29 @@ def test_data_reconstruction_drift_calculator_numeric_results(sample_drift_data,
}
)
pd.testing.assert_frame_equal(expected_drift, drift.data[['key', 'reconstruction_error']])


def test_data_reconstruction_drift_calculator_with_only_numeric_should_not_fail(  # noqa: D103
    sample_drift_data, sample_drift_metadata
):
    """Regression test: fitting and calculating with only continuous features must not raise.

    Guards the bugfix that skips the categorical imputer when no categorical
    features are selected.
    """
    calc = DataReconstructionDriftCalculator(sample_drift_metadata, chunk_period='W', features=['f1', 'f2'])
    ref_data = sample_drift_data.loc[sample_drift_data['partition'] == 'reference']
    calc.fit(ref_data)
    # Let any exception propagate: pytest reports it with a full traceback,
    # which is strictly more informative than catching it and calling
    # pytest.fail() with no message (the original pattern discarded the cause).
    drift = calc.calculate(data=sample_drift_data)
    assert drift is not None


def test_data_reconstruction_drift_calculator_with_only_categorical_should_not_fail(  # noqa: D103
    sample_drift_data, sample_drift_metadata
):
    """Regression test: fitting and calculating with only categorical features must not raise.

    Guards the bugfix that skips the continuous imputer when no continuous
    features are selected.
    """
    calc = DataReconstructionDriftCalculator(sample_drift_metadata, chunk_period='W', features=['f3', 'f4'])
    ref_data = sample_drift_data.loc[sample_drift_data['partition'] == 'reference']
    calc.fit(ref_data)
    # Let any exception propagate: pytest reports it with a full traceback,
    # which is strictly more informative than catching it and calling
    # pytest.fail() with no message (the original pattern discarded the cause).
    drift = calc.calculate(data=sample_drift_data)
    assert drift is not None

0 comments on commit 8cc0ec7

Please sign in to comment.