From 99891033f2c7c2f6d3d8a51fbdeb0a9de55f781a Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 14:01:21 +0100 Subject: [PATCH 1/7] add dendrogram --- src/fairlens/plot/distr.py | 13 ++- src/fairlens/scorer.py | 199 ++++++++++++++++++++++++++++++++++--- 2 files changed, 198 insertions(+), 14 deletions(-) diff --git a/src/fairlens/plot/distr.py b/src/fairlens/plot/distr.py index c8881fb5..71489802 100644 --- a/src/fairlens/plot/distr.py +++ b/src/fairlens/plot/distr.py @@ -133,7 +133,7 @@ def attr_distr_plot( normalize: bool = False, cmap: Optional[Sequence[Tuple[float, float, float]]] = None, ax: Optional[Axes] = None, -) -> Optional[Axes]: +) -> Union[Axes, Sequence[Axes]]: """Plot the distribution of the target attribute with respect to all the unique values in the column `attr`. Args: @@ -211,6 +211,9 @@ def attr_distr_plot( fig.tight_layout() plt.subplots_adjust(hspace=0.3) + min_ylim = max_ylim = 0 + axes = [] + for i, (group, title) in enumerate(zip(groups, labels)): ax_ = fig.add_subplot(r, c, i + 1) distr_plot( @@ -226,8 +229,14 @@ def attr_distr_plot( ax=ax_, ) plt.title(title) + min_ylim = min(min_ylim, ax_.get_ylim()[0]) + max_ylim = max(max_ylim, ax_.get_ylim()[1]) + axes.append(ax_) + + for ax_ in axes: + ax_.set_ylim(min_ylim, max_ylim) - return None + return axes distr_plot( df_, diff --git a/src/fairlens/scorer.py b/src/fairlens/scorer.py index 97870be0..8f87105d 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -6,7 +6,12 @@ from itertools import combinations from typing import Mapping, Optional, Sequence, Tuple +import matplotlib.pyplot as plt +import numpy as np import pandas as pd +from matplotlib.axes import Axes +from scipy.cluster.hierarchy import dendrogram +from sklearn.cluster import AgglomerativeClustering from . import utils from .metrics.unified import stat_distance @@ -90,8 +95,8 @@ def distribution_score( Args: metric (str, optional): - Choose a metric to use. 
Defaults to automatically chosen metric depending on - the distribution of the target variable. + Choose the metric to use. If set to "auto" chooses the metric depending on + the distribution of the target variable. Defaults to "auto". method (str, optional): The method used to apply the metric to the sub-group. Can take values ["dist_to_all", dist_to_rest"] which correspond to measuring the distance @@ -121,23 +126,132 @@ def distribution_score( if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) - max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs) - df_dists = [] + # Find all combinations of sensitive attributes + combs = _all_sensitive_combs(df, sensitive_attrs, max_comb=max_comb) + + # Computes scores for each sensitive value in a data frame, for each combination of sensitive attributes + df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs] + df_dist = pd.concat(df_dists, ignore_index=True) + + return df_dist.reset_index(drop=True) + + def pairwise_score( + self, + metric: str = "auto", + p_value: bool = False, + max_comb: Optional[int] = None, + ): + """Returns a dataframe consisting of the statistical distances between each pair of sub-groups. - # Try all combinations of sensitive attributes - for k in range(1, max_comb + 1): - for sensitive_attr in combinations(sensitive_attrs, k): - df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)] - if len(df_not_nan) == 0: - continue + Args: + metric (str, optional): + Choose the metric to use. Defaults to automatically chosen metric depending on + the distribution of the target variable. + p_value (bool, optional): + Whether or not to compute a p-value for the distances. + max_comb (Optional[int], optional): + Max number of combinations of sensitive attributes to be considered. 
+ If None all combinations are considered. Defaults to 4. + """ - df_dist = _calculate_distance(df, self.target_attr, list(sensitive_attr), metric, method, p_value) - df_dists.append(df_dist) + df = self.df[self.sensitive_attrs + [self.target_attr]].copy() + sensitive_attrs = self.sensitive_attrs + # Bin continuous sensitive attributes + for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): + if distr_type.is_continuous() or distr_type.is_datetime(): + col = utils.infer_dtype(df[attr]) + df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True) + + # Convert binary attributes to 0s and 1s + if self.distr_type.is_binary(): + df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] + + if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) + + # Find all combinations of sensitive attributes + combs = _all_sensitive_combs(df, sensitive_attrs, max_comb=max_comb) + + # Computes distances between each pair of sensitive values in a data frame, + # for each combination of sensitive attributes + df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs] df_dist = pd.concat(df_dists, ignore_index=True) return df_dist.reset_index(drop=True) + def plot_dendrogram(self, metric: str = "auto", ax: Optional[Axes] = None) -> Axes: + """Hierarchically clusters the sensitive subgroups using the metric and plots + the resulting tree in a dendrogram. + + Args: + metric (str, optional): + Choose the metric to use. If set to "auto" chooses the metric depending on + the distribution of the target variable. Defaults to "auto". + ax (Optional[matplotlib.axes.Axes], optional): + ax (Optional[matplotlib.axes.Axes], optional): + An axis to plot the figure on. Set to plt.gca() if None. Defaults to None. + + Returns: + Axes: + matplotlib.axes.Axes: + The matplotlib axis containing the plot. 
+ """ + + if ax is None: + ax = plt.gca() + + df = self.df[self.sensitive_attrs + [self.target_attr]].copy() + sensitive_attrs = self.sensitive_attrs + + # Bin continuous sensitive attributes + for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): + if distr_type.is_continuous() or distr_type.is_datetime(): + col = utils.infer_dtype(df[attr]) + df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True) + + # Convert binary attributes to 0s and 1s + if self.distr_type.is_binary(): + df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] + + if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + return + + groups = [] + for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in sensitive_attrs]: + groups.extend(vs) + + dist_matrix = np.zeros((len(groups), len(groups))) + for i, g1 in enumerate(groups): + for j, g2 in enumerate(groups): + dist_matrix[i][j] = abs(stat_distance(df, self.target_attr, g1, g2, mode=metric)[0]) + + model = AgglomerativeClustering( + distance_threshold=0.1, affinity="precomputed", n_clusters=None, linkage="average", compute_full_tree=True + ) + model = model.fit(dist_matrix) + + # Create Dendogram + counts = np.zeros(model.children_.shape[0]) + n_samples = len(model.labels_) + for i, merge in enumerate(model.children_): + current_count = 0 + for child_idx in merge: + if child_idx < n_samples: + current_count += 1 # leaf node + else: + current_count += counts[child_idx - n_samples] + counts[i] = current_count + + linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float) + + # Plot the corresponding dendrogram + group_names = [list(group.values())[0][0] for group in groups] + dendrogram(linkage_matrix, labels=group_names, ax=ax) + ax.tick_params(axis="x", labelrotation=90) + + return ax + def plot_distributions( self, figsize: Optional[Tuple[int, int]] = None, @@ -303,3 +417,64 @@ def _calculate_distance( 
df_dist.drop(columns=["P-Value"], inplace=True) return df_dist + + +def _calculate_distance_pair( + df: pd.DataFrame, + target_attr: str, + sensitive_attrs: Sequence[str], + metric: str = "auto", + p_value: bool = False, +) -> pd.DataFrame: + + unique = df[sensitive_attrs].drop_duplicates() + + dist = [] + + for i_index, i in unique.iterrows(): + for j_index, j in unique.iterrows(): + if i_index == j_index: + continue + + group1 = {attr: [value] for attr, value in i.to_dict().items()} + group2 = {attr: [value] for attr, value in j.to_dict().items()} + + preds = utils.get_predicates_mult(df, [group1, group2]) + pred1, pred2 = preds[0], preds[1] + + dist_res = stat_distance(df, target_attr, pred1, pred2, mode=metric, p_value=p_value) + distance = dist_res[0] + p = dist_res[1] if p_value else 0 + + dist.append( + { + "Positive Group": ", ".join(map(str, i.to_dict().values())), + "Negative Group": ", ".join(map(str, j.to_dict().values())), + "Distance": distance, + "Positive Counts": len(df[pred1]), + "Negative Counts": len(df[pred2]), + "P-Value": p, + } + ) + + df_dist = pd.DataFrame(dist) + + if not p_value: + df_dist.drop(columns=["P-Value"], inplace=True) + + return df_dist + + +def _all_sensitive_combs(df: pd.DataFrame, sensitive_attrs: Sequence[str], max_comb: Optional[int] = None): + max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs) + + groups = [] + for k in range(1, max_comb + 1): + for sensitive_attr in combinations(sensitive_attrs, k): + df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)] + if len(df_not_nan) == 0: + continue + + groups.append(list(sensitive_attr)) + + return groups From 4bdc43a8966bc9d918cff55124424c804d0746fe Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 14:51:08 +0100 Subject: [PATCH 2/7] add threshold param --- src/fairlens/scorer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/fairlens/scorer.py 
b/src/fairlens/scorer.py index 8f87105d..6b67d60a 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -180,17 +180,18 @@ def pairwise_score( return df_dist.reset_index(drop=True) - def plot_dendrogram(self, metric: str = "auto", ax: Optional[Axes] = None) -> Axes: + def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[Axes] = None) -> Axes: """Hierarchically clusters the sensitive subgroups using the metric and plots the resulting tree in a dendrogram. Args: + threshold (float): + The linkage distance threshold, above which clusters will not be merged. metric (str, optional): Choose the metric to use. If set to "auto" chooses the metric depending on the distribution of the target variable. Defaults to "auto". ax (Optional[matplotlib.axes.Axes], optional): - ax (Optional[matplotlib.axes.Axes], optional): - An axis to plot the figure on. Set to plt.gca() if None. Defaults to None. + An axis to plot the figure on. Set to plt.gca() if None. Defaults to None. 
Returns: Axes: @@ -227,11 +228,15 @@ def plot_dendrogram(self, metric: str = "auto", ax: Optional[Axes] = None) -> Ax dist_matrix[i][j] = abs(stat_distance(df, self.target_attr, g1, g2, mode=metric)[0]) model = AgglomerativeClustering( - distance_threshold=0.1, affinity="precomputed", n_clusters=None, linkage="average", compute_full_tree=True + n_clusters=None, + distance_threshold=threshold, + affinity="precomputed", + linkage="average", + compute_full_tree=True, ) model = model.fit(dist_matrix) - # Create Dendogram + # Create Dendrogram counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) for i, merge in enumerate(model.children_): @@ -247,7 +252,7 @@ def plot_dendrogram(self, metric: str = "auto", ax: Optional[Axes] = None) -> Ax # Plot the corresponding dendrogram group_names = [list(group.values())[0][0] for group in groups] - dendrogram(linkage_matrix, labels=group_names, ax=ax) + _ = dendrogram(linkage_matrix, labels=group_names, ax=ax) ax.tick_params(axis="x", labelrotation=90) return ax From 0d6749b6f0da4fd674e4f2728a90fa5b72bc6292 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 14:54:53 +0100 Subject: [PATCH 3/7] update docstrings --- src/fairlens/scorer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fairlens/scorer.py b/src/fairlens/scorer.py index 6b67d60a..88256c6f 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -194,9 +194,8 @@ def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[A An axis to plot the figure on. Set to plt.gca() if None. Defaults to None. Returns: - Axes: - matplotlib.axes.Axes: - The matplotlib axis containing the plot. + matplotlib.axes.Axes: + The matplotlib axis containing the plot. 
""" if ax is None: From 6ced76ec8e1e543acd2b6deccf208f2a64274a56 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 15:32:48 +0100 Subject: [PATCH 4/7] add tests --- tests/test_scorer.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_scorer.py b/tests/test_scorer.py index 4b38d2de..a9aa7088 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -64,3 +64,26 @@ def test_distribution_score(): score = calculate_score(df_dist) assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum() + + +def test_pairwise_compas(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + df_dist = fscorer.pairwise_score() + + assert (df_dist["Distance"] > 0).all() + +def test_pairwise_adult(): + fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) + df_dist = fscorer.pairwise_score(metric="binomial") + + assert (df_dist["Distance"] != 0).all() + + +def test_dendrogram_compas(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + fscorer.plot_dendrogram(0.01) + + +def test_dendrogram_adult(): + fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) + fscorer.plot_dendrogram(0.01) \ No newline at end of file From ddf0f0d4617441b723bebf96f851f45309033f7a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Sep 2021 14:33:04 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_scorer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_scorer.py b/tests/test_scorer.py index a9aa7088..c258083d 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -72,6 +72,7 @@ def test_pairwise_compas(): assert (df_dist["Distance"] > 0).all() + def test_pairwise_adult(): fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) df_dist = fscorer.pairwise_score(metric="binomial") @@ -86,4 +87,4 
@@ def test_dendrogram_compas(): def test_dendrogram_adult(): fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) - fscorer.plot_dendrogram(0.01) \ No newline at end of file + fscorer.plot_dendrogram(0.01) From 47a0bb0b3a31049f627f8513e1543cc7a8726aa7 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 15:41:36 +0100 Subject: [PATCH 6/7] reduce cognitive complexity --- src/fairlens/scorer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/fairlens/scorer.py b/src/fairlens/scorer.py index 88256c6f..ee81f92f 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -239,13 +239,8 @@ def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[A counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) for i, merge in enumerate(model.children_): - current_count = 0 - for child_idx in merge: - if child_idx < n_samples: - current_count += 1 # leaf node - else: - current_count += counts[child_idx - n_samples] - counts[i] = current_count + current_counts = [1 if child_idx < n_samples else counts[child_idx - n_samples] for child_idx in merge] + counts[i] = sum(current_counts) linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float) From feba5e31d31f000a6e43b5b5abf3c6a6bfef76e8 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Fri, 17 Sep 2021 16:27:12 +0100 Subject: [PATCH 7/7] combine scoring methods --- src/fairlens/scorer.py | 78 ++++++++++-------------------------------- tests/test_scorer.py | 20 +++++++---- 2 files changed, 32 insertions(+), 66 deletions(-) diff --git a/src/fairlens/scorer.py b/src/fairlens/scorer.py index ee81f92f..6dc1b506 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -90,8 +90,8 @@ def distribution_score( p_value: bool = False, max_comb: Optional[int] = None, ) -> pd.DataFrame: - """Returns a dataframe consisting of all unique sub-groups and their statistical distance to the rest - of the population w.r.t. 
the target variable. + """Returns a dataframe consisting of all unique sub-groups and their statistical distances of + the target variable computed based on the `metric` and `method` parameters. Args: metric (str, optional): @@ -99,10 +99,10 @@ def distribution_score( the distribution of the target variable. Defaults to "auto". method (str, optional): The method used to apply the metric to the sub-group. Can take values - ["dist_to_all", dist_to_rest"] which correspond to measuring the distance - between the subgroup distribution and the overall distribution, or the - overall distribution without the subgroup, respectively. - Defaults to "dist_to_all". + ["dist_to_all", "dist_to_rest", "pairwise"] which correspond to measuring + the distance between the subgroup distribution and the overall distribution, or the + overall distribution without the subgroup, or alternatively measuring the distance + between all possible pairs of subgroups, respectively. Defaults to "dist_to_all". p_value (bool, optional): Whether or not to compute a p-value for the distances. 
max_comb (Optional[int], optional): @@ -111,7 +111,6 @@ def distribution_score( """ df = self.df[self.sensitive_attrs + [self.target_attr]].copy() - sensitive_attrs = self.sensitive_attrs # Bin continuous sensitive attributes for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): @@ -123,62 +122,21 @@ def distribution_score( if self.distr_type.is_binary(): df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] - if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) # Find all combinations of sensitive attributes - combs = _all_sensitive_combs(df, sensitive_attrs, max_comb=max_comb) + combs = _all_sensitive_combs(df, self.sensitive_attrs, max_comb=max_comb) # Computes scores for each sensitive value in a data frame, for each combination of sensitive attributes - df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs] - df_dist = pd.concat(df_dists, ignore_index=True) - - return df_dist.reset_index(drop=True) - - def pairwise_score( - self, - metric: str = "auto", - p_value: bool = False, - max_comb: Optional[int] = None, - ): - """Returns a dataframe consisting of the statistical distances between each pair of sub-groups. - - Args: - metric (str, optional): - Choose the metric to use. Defaults to automatically chosen metric depending on - the distribution of the target variable. - p_value (bool, optional): - Whether or not to compute a p-value for the distances. - max_comb (Optional[int], optional): - Max number of combinations of sensitive attributes to be considered. - If None all combinations are considered. Defaults to 4. 
- """ - - df = self.df[self.sensitive_attrs + [self.target_attr]].copy() - sensitive_attrs = self.sensitive_attrs - - # Bin continuous sensitive attributes - for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): - if distr_type.is_continuous() or distr_type.is_datetime(): - col = utils.infer_dtype(df[attr]) - df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True) - - # Convert binary attributes to 0s and 1s - if self.distr_type.is_binary(): - df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] - - if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: - return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) - - # Find all combinations of sensitive attributes - combs = _all_sensitive_combs(df, sensitive_attrs, max_comb=max_comb) + if method == "pairwise": + df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs] + else: + df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs] - # Computes distances between each pair of sensitive values in a data frame, - # for each combination of sensitive attributes - df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs] df_dist = pd.concat(df_dists, ignore_index=True) - return df_dist.reset_index(drop=True) + return df_dist def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[Axes] = None) -> Axes: """Hierarchically clusters the sensitive subgroups using the metric and plots @@ -202,7 +160,6 @@ def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[A ax = plt.gca() df = self.df[self.sensitive_attrs + [self.target_attr]].copy() - sensitive_attrs = self.sensitive_attrs # Bin continuous sensitive attributes for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): @@ -214,11 +171,11 @@ def plot_dendrogram(self, threshold: 
float, metric: str = "auto", ax: Optional[A if self.distr_type.is_binary(): df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] - if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: - return + if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + return ax groups = [] - for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in sensitive_attrs]: + for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in self.sensitive_attrs]: groups.extend(vs) dist_matrix = np.zeros((len(groups), len(groups))) @@ -359,7 +316,7 @@ def demographic_report( def calculate_score(df_dist: pd.DataFrame) -> float: - """Calculate the weighted mean pairwise statistical distance. + """Calculate the weighted mean of statistical distances. Args: df_dist (pd.DataFrame): @@ -452,6 +409,7 @@ def _calculate_distance_pair( "Distance": distance, "Positive Counts": len(df[pred1]), "Negative Counts": len(df[pred2]), + "Counts": len(df[pred1]) + len(df[pred2]), "P-Value": p, } ) diff --git a/tests/test_scorer.py b/tests/test_scorer.py index c258083d..fdd6d71e 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -58,9 +58,17 @@ def test_sensitive_attr_detection(): assert fscorer.sensitive_attrs == ["DateOfBirth", "Ethnicity", "Language", "MaritalStatus", "RawScore", "Sex"] -def test_distribution_score(): +def test_distribution_score_all(): fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) - df_dist = fscorer.distribution_score() + df_dist = fscorer.distribution_score(method="all") + score = calculate_score(df_dist) + + assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum() + + +def test_distribution_score_rest(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + df_dist = fscorer.distribution_score(method="rest") score = calculate_score(df_dist) assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum() @@ -68,23 +76,23 
@@ def test_distribution_score(): def test_pairwise_compas(): fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) - df_dist = fscorer.pairwise_score() + df_dist = fscorer.distribution_score(method="pairwise") assert (df_dist["Distance"] > 0).all() def test_pairwise_adult(): fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) - df_dist = fscorer.pairwise_score(metric="binomial") + df_dist = fscorer.distribution_score(metric="binomial", method="pairwise") assert (df_dist["Distance"] != 0).all() def test_dendrogram_compas(): fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) - fscorer.plot_dendrogram(0.01) + fscorer.plot_dendrogram(0.1) def test_dendrogram_adult(): fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) - fscorer.plot_dendrogram(0.01) + fscorer.plot_dendrogram(0.1)