replace Boston housing dataset with diabetes dataset since sklearn has deprecated the Boston housing dataset
interpretml · Jan 12, 2023 · df8cf74 · df8cf74
1 parent 0cb3ab8
commit df8cf74
Show file tree

Hide file tree

Showing 10 changed files with 33 additions and 39 deletions.
diff --git a/examples/python/assets/importance_notebook_global_lstat.png b/examples/python/assets/importance_notebook_global_lstat.png
diff --git a/examples/python/assets/importance_notebook_local_exp.png b/examples/python/assets/importance_notebook_local_exp.png
diff --git a/examples/python/notebooks/EBM Feature Importances.ipynb b/examples/python/notebooks/EBM Feature Importances.ipynb
@@ -37,12 +37,12 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from sklearn.datasets import load_boston\n",
+    "from sklearn.datasets import load_diabetes\n",
     "from interpret.glassbox import ExplainableBoostingRegressor\n",
     "\n",
-    "boston = load_boston()\n",
-    "df = pd.DataFrame(boston.data, columns=boston.feature_names)\n",
-    "df[\"target\"] = boston.target\n",
+    "dataset = load_diabetes()\n",
+    "df = pd.DataFrame(dataset.data, columns=dataset.feature_names)\n",
+    "df[\"target\"] = dataset.target\n",
     "\n",
     "train_cols = df.columns[0:-1]\n",
     "label = df.columns[-1]\n",
@@ -104,11 +104,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Going beyond overall term importances, because EBMs are additive models we can measure exactly how each term contributes to a prediction. Let's take a look at the graph of the term, `LSTAT`, by selecting it in the drop-down menu.\n",
+    "Going beyond overall term importances, because EBMs are additive models we can measure exactly how each term contributes to a prediction. Let's take a look at the graph of the term, `bp`, by selecting it in the drop-down menu.\n",
     "\n",
     "![Global Explanation - LSTAT](../assets/importance_notebook_global_lstat.png)\n",
     "\n",
-    "The way to interpret this is that if a new datapoint came in with `LSTAT` = 5, the model adds about +2.7 to the final prediction. However, for a different datapoint with `LSTAT` = 10, the model would now add approx. -0.47 to the prediction.\n",
+    "The way to interpret this is that if a new datapoint came in with `bp` = 0.1, the model adds about +33.1 to the final prediction. However, for a different datapoint with `bp` = 0.13, the model would now add approx. +36.7 to the prediction.\n",
     "\n",
     "To make individual predictions, the model uses each term graph as a look up table, notes the contribution per term, and sums them together with the learned intercept to make a prediction. In regression, the intercept is the mean target (label) of the training set, and each term adds or subtracts to this mean. In classification, the intercept reflects the base rate of the positive class on a log scale. The gray above and below the graph shows the confidence of the model in that region of the graph."
    ]
@@ -146,7 +146,7 @@
     "\n",
     "![Local Explanation](../assets/importance_notebook_local_exp.png)\n",
     "\n",
-    "The model prediction is 26.8. We can see that the intercept adds about +22.5, `LSTAT` adds ~+2.7, and `RAD` adds about -1.2. So far, for the top 3 contributing terms, we're at a cumulative prediction of ~+24. If we repeat this process for all the terms, we'll arrive exactly at the model prediction of 26.8."
+    "The model prediction is 188.50. We can see that the intercept adds about +151.9, `bp` subtracts about 0.02, and `age` adds about 0.04. If we repeat this process for all the terms, we'll arrive exactly at the model prediction of 188.50."
    ]
   },
   {

diff --git a/examples/python/notebooks/Explaining Blackbox Classifiers.ipynb b/examples/python/notebooks/Explaining Blackbox Classifiers.ipynb
@@ -14,7 +14,6 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from sklearn.datasets import load_boston\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
     "df = pd.read_csv(\n",
@@ -205,4 +204,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/examples/python/notebooks/Explaining Blackbox Regressors.ipynb b/examples/python/notebooks/Explaining Blackbox Regressors.ipynb
@@ -14,13 +14,13 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from sklearn.datasets import load_boston\n",
+    "from sklearn.datasets import load_diabetes\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "boston = load_boston()\n",
-    "feature_names = list(boston.feature_names)\n",
-    "df = pd.DataFrame(boston.data, columns=feature_names)\n",
-    "df[\"target\"] = boston.target\n",
+    "dataset = load_diabetes()\n",
+    "feature_names = list(dataset.feature_names)\n",
+    "df = pd.DataFrame(dataset.data, columns=feature_names)\n",
+    "df[\"target\"] = dataset.target\n",
     "# df = df.sample(frac=0.1, random_state=1)\n",
     "train_cols = df.columns[0:-1]\n",
     "label = df.columns[-1]\n",

diff --git a/examples/python/notebooks/Interpretable Regression Methods.ipynb b/examples/python/notebooks/Interpretable Regression Methods.ipynb
@@ -14,13 +14,13 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from sklearn.datasets import load_boston\n",
+    "from sklearn.datasets import load_diabetes\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "boston = load_boston()\n",
-    "feature_names = list(boston.feature_names)\n",
-    "df = pd.DataFrame(boston.data, columns=feature_names)\n",
-    "df[\"target\"] = boston.target\n",
+    "dataset = load_diabetes()\n",
+    "feature_names = list(dataset.feature_names)\n",
+    "df = pd.DataFrame(dataset.data, columns=feature_names)\n",
+    "df[\"target\"] = dataset.target\n",
     "# df = df.sample(frac=0.1, random_state=1)\n",
     "train_cols = df.columns[0:-1]\n",
     "label = df.columns[-1]\n",
@@ -234,4 +234,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/python/interpret-core/interpret/glassbox/test/test_decisiontree.py b/python/interpret-core/interpret/glassbox/test/test_decisiontree.py
@@ -2,16 +2,16 @@
 # Distributed under the MIT software license
 
 from ..decisiontree import ClassificationTree, RegressionTree
-from sklearn.datasets import load_breast_cancer, load_boston
+from sklearn.datasets import load_breast_cancer, load_diabetes
 from sklearn.tree import DecisionTreeClassifier as SKDT
 from sklearn.tree import DecisionTreeRegressor as SKRT
 import numpy as np
 
 
 def test_rt():
-    boston = load_boston()
-    X, y = boston.data, boston.target
-    feature_names = boston.feature_names
+    dataset = load_diabetes()
+    X, y = dataset.data, dataset.target
+    feature_names = dataset.feature_names
 
     sk_dt = SKRT(random_state=1, max_depth=3)
     our_dt = RegressionTree(feature_names=feature_names, random_state=1)

diff --git a/python/interpret-core/interpret/glassbox/test/test_linear.py b/python/interpret-core/interpret/glassbox/test/test_linear.py
@@ -2,16 +2,16 @@
 # Distributed under the MIT software license
 
 from ..linear import LogisticRegression, LinearRegression
-from sklearn.datasets import load_breast_cancer, load_boston
+from sklearn.datasets import load_breast_cancer, load_diabetes
 from sklearn.linear_model import LogisticRegression as SKLogistic
 from sklearn.linear_model import Lasso as SKLinear
 import numpy as np
 
 
 def test_linear_regression():
-    boston = load_boston()
-    X, y = boston.data, boston.target
-    feature_names = boston.feature_names
+    dataset = load_diabetes()
+    X, y = dataset.data, dataset.target
+    feature_names = dataset.feature_names
 
     sk_lr = SKLinear(random_state=1)
     our_lr = LinearRegression(feature_names=feature_names, random_state=1)

diff --git a/python/interpret-core/interpret/greybox/test/test_treeinterpreter.py b/python/interpret-core/interpret/greybox/test/test_treeinterpreter.py
@@ -3,7 +3,7 @@
 
 
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.datasets import load_boston
+from sklearn.datasets import load_diabetes
 from ..treeinterpreter import TreeInterpreter
 
 import pytest
@@ -17,13 +17,13 @@ def test_that_tree_works():
     # http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/
 
     # Fit tree
-    boston = load_boston()
+    dataset = load_diabetes()
     rf = RandomForestRegressor()
-    X, y = boston.data[:300], boston.target[:300]
-    feature_names = boston.feature_names
+    X, y = dataset.data[:300], dataset.target[:300]
+    feature_names = dataset.feature_names
 
-    X_new = boston.data[[300, 309]]
-    y_new = boston.target[[300, 309]]
+    X_new = dataset.data[[300, 309]]
+    y_new = dataset.target[[300, 309]]
     rf.fit(X, y)
 
     # Build expected local explanation

diff --git a/python/interpret-core/interpret/test/utils.py b/python/interpret-core/interpret/test/utils.py
@@ -95,11 +95,6 @@ def _synthetic(mode="regression"):
 
     return dataset
 
-
-def boston_regression():
-    return None
-
-
 def iris_classification():
     from sklearn.datasets import load_iris