Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 110 additions & 55 deletions pyBreakDown/explainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from collections import deque
from blist import blist
from pyBreakDown import explanation as e
from tqdm import tqdm


class Explainer:
"""
Expand All @@ -17,17 +19,19 @@ class Explainer:
Dataset feature names.
"""
def __init__(self, clf, data, colnames):
assert len(colnames) == data.shape[1] #otherwise it wouldnt make any sense
assert len(colnames) == data.shape[1] # otherwise it wouldn't make any sense
self.clf = clf
self.data = data
self.colnames = colnames

def _transform_observation (self, observation):
@staticmethod
def _transform_observation (observation):
if observation.ndim < 2:
observation = np.expand_dims(observation, axis=0)
return observation

def _get_initial_dataset(self, observation, data):
@staticmethod
def _get_initial_dataset(observation, data):
assert observation.ndim == 2 and observation.shape[0] == 1
return np.repeat(observation,repeats=data.shape[0], axis=0)

Expand Down Expand Up @@ -55,16 +59,19 @@ def explain (self, observation, direction, useIntercept = False, baseline=0):

"""
data = np.copy(self.data)
assert direction in ["up","down"]
observation = self._transform_observation(observation) #expand dims from 1D to 2D if necessary
assert direction in ["up", "down"]
observation = self._transform_observation(observation) # expand dims from 1D to 2D if necessary
assert len(self.colnames) == observation.shape[1]

if direction=="up":
exp = self._explain_up(observation, baseline, data)
if direction=="down":
exp = self._explain_down(observation, baseline, data)
if direction == "up":
exp = self._explain_up(observation, data)
else:
exp = self._explain_down(observation, data)

mean_prediction = np.mean(self.clf.predict(data))
if hasattr(self.clf, 'predict_proba'):
mean_prediction = np.mean(self.clf.predict_proba(data)[:, 0])
else:
mean_prediction = np.mean(self.clf.predict(data))

if useIntercept:
baseline = mean_prediction
Expand All @@ -77,63 +84,110 @@ def explain (self, observation, direction, useIntercept = False, baseline=0):
exp.make_final_prediction()
return exp


def _explain_up (self, observation, baseline, data):
def _explain_up(self, observation, data):
new_data = self._get_initial_dataset(observation, data)

baseline_yhat = np.mean(self.clf.predict(data))

open_variables = blist(range(0,data.shape[1]))
important_variables = deque()
important_yhats = {}

for i in range(0, data.shape[1]):
yhats = {}
yhats_diff = np.repeat(-float('inf'), data.shape[1])

for variable in open_variables:
tmp_data = np.copy(data)
tmp_data[:,variable] = new_data[:,variable]
yhats[variable] = self.clf.predict(tmp_data)
yhats_diff[variable] = abs(baseline_yhat - np.mean(yhats[variable]))

amax = np.argmax(yhats_diff)
important_variables.append(amax)
important_yhats[i] = yhats[amax]
data[:,amax] = new_data[:,amax]
open_variables.remove(amax)
if hasattr(self.clf, 'predict_proba'):
baseline_yhat = np.mean(self.clf.predict_proba(data)[:, 0])
open_variables = blist(range(0, data.shape[1]))
important_variables = deque()
important_yhats = {}
temp = np.copy(data)

for i in tqdm(range(0, data.shape[1])):
yhats = {}
yhats_diff = np.repeat([-float('inf')], data.shape[1])

for variable in open_variables:
data[:, variable] = new_data[:, variable]
yhats[variable] = self.clf.predict_proba(data)[:, 0]
yhats_diff[variable] = abs(baseline_yhat - np.mean(yhats[variable]))
data[:, variable]=temp[:, variable]

amax = np.argmax(yhats_diff)
important_variables.append(amax)
important_yhats[i] = yhats[amax]
data[:, amax] = new_data[:, amax]
open_variables.remove(amax)
else:
baseline_yhat = np.mean(self.clf.predict(data))
open_variables = blist(range(0, data.shape[1]))
important_variables = deque()
important_yhats = {}
temp = np.copy(data)

for i in tqdm(range(0, data.shape[1])):
yhats = {}
yhats_diff = np.repeat([-float('inf')], data.shape[1])

for variable in open_variables:
data[:,variable] = data[:,variable]
yhats[variable] = self.clf.predict(data)[:, 0]
yhats_diff[variable] = abs(baseline_yhat - np.mean(yhats[variable]))
data[:, variable] = temp[:, variable]

amax = np.argmax(yhats_diff)
important_variables.append(amax)
important_yhats[i] = yhats[amax]
data[:,amax] = new_data[:,amax]
open_variables.remove(amax)

var_names = np.array(self.colnames)[important_variables]
var_values = observation[0,important_variables]
var_values = observation[0, important_variables]
means = self._get_means_from_yhats(important_yhats)
means.appendleft(baseline_yhat)
contributions = np.diff(means)
return e.Explanation(var_names, var_values, contributions, e.ExplainerDirection.Up)

def _explain_down (self, observation, baseline, data):
def _explain_down (self, observation, data):
new_data = self._get_initial_dataset(observation, data)

target_yhat = self.clf.predict(observation)
if hasattr(self.clf, 'predict_proba'):
target_yhat = self.clf.predict_proba(observation)[:, 0]

open_variables = blist(range(0, data.shape[1]))
important_variables = deque()
important_yhats = {}
temp = np.copy(new_data)

for i in range(0, data.shape[1]):
yhats = {}
yhats_diff = np.repeat([float('inf')], data.shape[1])

for variable in open_variables:
new_data[:,variable] = data[:,variable]
yhats[variable] = self.clf.predict_proba(new_data)[:, 0]
yhats_diff[variable] = abs(target_yhat - np.mean(yhats[variable]))
new_data[:, variable] = temp[:, variable]

amin = np.argmin(yhats_diff)
important_variables.append(amin)
important_yhats[i] = yhats[amin]
new_data[:,amin] = data[:,amin]
open_variables.remove(amin)
else:
target_yhat = self.clf.predict(observation)[:, 0]

open_variables = blist(range(0, data.shape[1]))
important_variables = deque()
important_yhats = {}
temp = np.copy(new_data)

open_variables = blist(range(0,data.shape[1]))
important_variables = deque()
important_yhats = {}
for i in range(0, data.shape[1]):
yhats = {}
yhats_diff = np.repeat([float('inf')], data.shape[1])

for i in range(0, data.shape[1]):
yhats = {}
yhats_diff = np.repeat(float('inf'), data.shape[1])

for variable in open_variables:
tmp_data = np.copy(new_data)
tmp_data[:,variable] = data[:,variable]
yhats[variable] = self.clf.predict(tmp_data)
yhats_diff[variable] = abs(target_yhat - np.mean(yhats[variable]))
for variable in open_variables:
new_data[:, variable] = data[:, variable]
yhats[variable] = self.clf.predict(new_data)[:, 0]
yhats_diff[variable] = abs(target_yhat - np.mean(yhats[variable]))
new_data[:, variable] = temp[:, variable]

amin = np.argmin(yhats_diff)
important_variables.append(amin)
important_yhats[i] = yhats[amin]
new_data[:,amin] = data[:,amin]
open_variables.remove(amin)
amin = np.argmin(yhats_diff)
important_variables.append(amin)
important_yhats[i] = yhats[amin]
new_data[:, amin] = data[:, amin]
open_variables.remove(amin)

important_variables.reverse()
var_names = np.array(self.colnames)[important_variables]
Expand All @@ -145,5 +199,6 @@ def _explain_down (self, observation, baseline, data):

return e.Explanation(var_names, var_values, contributions, e.ExplainerDirection.Down)

def _get_means_from_yhats (self, important_yhats):
return deque([np.array(v).mean() for k,v in important_yhats.items()])
@staticmethod
def _get_means_from_yhats (important_yhats):
return deque([np.array(v).mean() for k, v in important_yhats.items()])
Loading