compas_experiment.py (forked from dylan-slack/Fooling-LIME-SHAP)
"""
The experiment MAIN for COMPAS.
* Run the file and the COMPAS experiments will complete
* This may take some time because we iterate through every instance in the test set for
both LIME and SHAP explanations take some time to compute
* The print outs can be interpreted as maps from the RANK to the rate at which the feature occurs in the rank.. e.g:
1: [('length_of_stay', 0.002592352559948153), ('unrelated_column_one', 0.9974076474400518)]
can be read as the first unrelated column occurs ~100% of the time in as the most important feature
* "Nothing shown" refers to SHAP yielding only 0 shapley values
"""
import warnings
from adversarial_models import *
from utils import *
from get_data import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import lime
import lime.lime_tabular
import shap
from copy import deepcopy
# Set up experiment parameters
params = Params("model_configurations/experiment_params.json")
np.random.seed(params.seed)
X, y, cols = get_and_preprocess_compas_data(params)
# add unrelated columns, setup
X['unrelated_column_one'] = np.random.choice([0,1],size=X.shape[0])
X['unrelated_column_two'] = np.random.choice([0,1],size=X.shape[0])
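# The two unrelated columns are i.i.d. coin flips, independent of the label and of
# every real COMPAS feature, so any importance an explainer assigns them is spurious.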
features = [c for c in X]
race_indc = features.index('race')
unrelated_indcs = features.index('unrelated_column_one')
unrelated_indcs1 = features.index('unrelated_column_two')
X = X.values
print (features)
###
## The models f and psi for COMPAS. We discriminate based on race for f and consider two RANDOMLY DRAWN features to display in psi
#
# the biased model
class racist_model_f:
    # Decision rule: classify negatively if race is black
    def predict(self, X):
        return np.array([params.negative_outcome if x[race_indc] > 0 else params.positive_outcome for x in X])

    def predict_proba(self, X):
        return one_hot_encode(self.predict(X))

    def score(self, X, y):
        return np.sum(self.predict(X) == y) / len(X)
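# Note: one_hot_encode comes from utils. A minimal sketch of the assumed behavior
# (hypothetical values for illustration; the real helper lives in utils.py):
#   one_hot_encode(np.array([0, 1])) -> [[1., 0.], [0., 1.]]
# i.e. it maps a vector of hard labels to two-column "probability" rows.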
# the display model with one unrelated feature
class innocuous_model_psi:
    # Decision rule: classify according to the randomly drawn column 'unrelated_column_one'
    def predict_proba(self, X):
        return one_hot_encode(np.array([params.negative_outcome if x[unrelated_indcs] > 0 else params.positive_outcome for x in X]))
# the display model with two unrelated features
class innocuous_model_psi_two:
    def predict_proba(self, X):
        A = np.where(X[:, unrelated_indcs] > 0, params.positive_outcome, params.negative_outcome)
        B = np.where(X[:, unrelated_indcs1] > 0, params.positive_outcome, params.negative_outcome)
        preds = np.logical_xor(A, B).astype(int)
        return one_hot_encode(preds)
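# Because psi_two XORs the two coin-flip columns, neither column alone determines the
# displayed output; a faithful explainer should split importance across both columns.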
#
##
###
def experiment_main():
    """
    Run through experiments for LIME/SHAP on COMPAS using both one and two unrelated features.
    * This may take some time given that we iterate through every point in the test set
    * We print out the rate at which features occur in the top three features
    """
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1)
    ss = StandardScaler().fit(xtrain)
    xtrain = ss.transform(xtrain)
    xtest = ss.transform(xtest)
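    # Note: the scaler is fit on the training split only, so no test-set statistics
    # leak into the standardization.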
    print ('---------------------')
    print ("Beginning LIME COMPAS Experiments....")
    print ("(These take some time to run because we have to generate explanations for every point in the test set)")
    print ('---------------------')

    # Train the adversarial model for LIME with f and psi
    adv_lime = Adversarial_Lime_Model(racist_model_f(), innocuous_model_psi()).train(xtrain, ytrain, categorical_features=[features.index('unrelated_column_one'), features.index('unrelated_column_two'), features.index('c_charge_degree_F'), features.index('c_charge_degree_M'), features.index('two_year_recid'), features.index('race'), features.index("sex_Male"), features.index("sex_Female")], feature_names=features, perturbation_multiplier=30)
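    # The adversarial ("scaffolded") model is trained to distinguish real data from
    # LIME-style perturbations: it shows the biased model f on inputs that look real
    # and the innocuous psi on inputs that look perturbed. perturbation_multiplier=30
    # appears to control how much perturbed data is generated per training point
    # (see adversarial_models.py in this repo for the exact semantics).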
    adv_explainer = lime.lime_tabular.LimeTabularExplainer(xtrain, sample_around_instance=True, feature_names=adv_lime.get_column_names(), categorical_features=[features.index('unrelated_column_one'), features.index('unrelated_column_two'), features.index('c_charge_degree_F'), features.index('c_charge_degree_M'), features.index('two_year_recid'), features.index('race'), features.index("sex_Male"), features.index("sex_Female")], discretize_continuous=False)
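    # sample_around_instance=True makes LIME draw its perturbations around each explained
    # point rather than around the mean of the training data.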
    explanations = []
    for i in range(xtest.shape[0]):
        explanations.append(adv_explainer.explain_instance(xtest[i], adv_lime.predict_proba).as_list())

    # Display Results
    print ("LIME Ranks and Pct Occurrences (1 corresponds to most important feature) for one unrelated feature:")
    print (experiment_summary(explanations, features))
    print ("Fidelity:", round(adv_lime.fidelity(xtest), 2))
    # Repeat the same thing for two features
    adv_lime = Adversarial_Lime_Model(racist_model_f(), innocuous_model_psi_two()).train(xtrain, ytrain, categorical_features=[features.index('unrelated_column_one'), features.index('unrelated_column_two'), features.index('c_charge_degree_F'), features.index('c_charge_degree_M'), features.index('two_year_recid'), features.index('race'), features.index("sex_Male"), features.index("sex_Female")], feature_names=features, perturbation_multiplier=30)
    adv_explainer = lime.lime_tabular.LimeTabularExplainer(xtrain, feature_names=adv_lime.get_column_names(), categorical_features=[features.index('unrelated_column_one'), features.index('unrelated_column_two'), features.index('c_charge_degree_F'), features.index('c_charge_degree_M'), features.index('two_year_recid'), features.index('race'), features.index("sex_Male"), features.index("sex_Female")], discretize_continuous=False)

    explanations = []
    for i in range(xtest.shape[0]):
        explanations.append(adv_explainer.explain_instance(xtest[i], adv_lime.predict_proba).as_list())

    print ("LIME Ranks and Pct Occurrences for two unrelated features:")
    print (experiment_summary(explanations, features))
    print ("Fidelity:", round(adv_lime.fidelity(xtest), 2))
    print ('---------------------')
    print ('Beginning SHAP COMPAS Experiments....')
    print ('---------------------')

    # Setup SHAP
    background_distribution = shap.kmeans(xtrain, 10)
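    # shap.kmeans summarizes xtrain into 10 weighted centroids, which KernelExplainer
    # uses as the background distribution in place of the full training set, keeping
    # the number of model evaluations manageable.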
    adv_shap = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi()).train(xtrain, ytrain, feature_names=features)
    adv_kernel_explainer = shap.KernelExplainer(adv_shap.predict, background_distribution)
    explanations = adv_kernel_explainer.shap_values(xtest)
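    # adv_shap.predict returns a single output per row, so shap_values comes back as an
    # (n_samples, n_features) array: one row of Shapley values per test instance.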
    # format for display
    formatted_explanations = []
    for exp in explanations:
        formatted_explanations.append([(features[i], exp[i]) for i in range(len(exp))])

    print ("SHAP Ranks and Pct Occurrences for one unrelated feature:")
    print (experiment_summary(formatted_explanations, features))
    print ("Fidelity:", round(adv_shap.fidelity(xtest), 2))
    background_distribution = shap.kmeans(xtrain, 10)
    adv_shap = Adversarial_Kernel_SHAP_Model(racist_model_f(), innocuous_model_psi_two()).train(xtrain, ytrain, feature_names=features)
    adv_kernel_explainer = shap.KernelExplainer(adv_shap.predict, background_distribution)
    explanations = adv_kernel_explainer.shap_values(xtest)
    # format for display
    formatted_explanations = []
    for exp in explanations:
        formatted_explanations.append([(features[i], exp[i]) for i in range(len(exp))])

    print ("SHAP Ranks and Pct Occurrences for two unrelated features:")
    print (experiment_summary(formatted_explanations, features))
    print ("Fidelity:", round(adv_shap.fidelity(xtest), 2))
    print ('---------------------')
if __name__ == "__main__":
    experiment_main()