diff --git a/src/fairlens/plot/distr.py b/src/fairlens/plot/distr.py index 6225affd..3d8b3695 100644 --- a/src/fairlens/plot/distr.py +++ b/src/fairlens/plot/distr.py @@ -136,7 +136,7 @@ def attr_distr_plot( normalize: bool = False, cmap: Optional[Sequence[Tuple[float, float, float]]] = None, ax: Optional[Axes] = None, -) -> Optional[Axes]: +) -> Union[Axes, Sequence[Axes]]: """Plot the distribution of the target attribute with respect to all the unique values in the column `attr`. Args: @@ -217,6 +217,9 @@ def attr_distr_plot( fig.tight_layout() plt.subplots_adjust(hspace=0.3) + min_ylim = max_ylim = 0 + axes = [] + for i, (group, title) in enumerate(zip(groups, labels)): ax_ = fig.add_subplot(r, c, i + 1) distr_plot( @@ -232,8 +235,14 @@ def attr_distr_plot( ax=ax_, ) plt.title(title) + min_ylim = min(min_ylim, ax_.get_ylim()[0]) + max_ylim = max(max_ylim, ax_.get_ylim()[1]) + axes.append(ax_) + + for ax_ in axes: + ax_.set_ylim(min_ylim, max_ylim) - return None + return axes if distr_type == "binary": _countplot(x=df_[attr], hue=df_[target_attr], palette=cmap, normalize=normalize) diff --git a/src/fairlens/scorer.py b/src/fairlens/scorer.py index ca978b3a..09513386 100644 --- a/src/fairlens/scorer.py +++ b/src/fairlens/scorer.py @@ -6,7 +6,12 @@ from itertools import combinations from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union +import matplotlib.pyplot as plt +import numpy as np import pandas as pd +from matplotlib.axes import Axes +from scipy.cluster.hierarchy import dendrogram +from sklearn.cluster import AgglomerativeClustering from . import utils from .metrics.statistics import sensitive_group_analysis @@ -86,19 +91,19 @@ def distribution_score( p_value: bool = False, max_comb: Optional[int] = None, ) -> pd.DataFrame: - """Returns a dataframe consisting of all unique sub-groups and their statistical distance to the rest - of the population w.r.t. the target variable. 
+ """Returns a dataframe consisting of all unique sub-groups and their statistical distances of + the target variable computed based on the `metric` and `method` parameters. Args: metric (str, optional): - Choose a metric to use. Defaults to automatically chosen metric depending on - the distribution of the target variable. + Choose the metric to use. If set to "auto" chooses the metric depending on + the distribution of the target variable. Defaults to "auto". method (str, optional): The method used to apply the metric to the sub-group. Can take values - ["dist_to_all", dist_to_rest"] which correspond to measuring the distance - between the subgroup distribution and the overall distribution, or the - overall distribution without the subgroup, respectively. - Defaults to "dist_to_all". + ["dist_to_all", "dist_to_rest", "pairwise"] which correspond to measuring + the distance between the subgroup distribution and the overall distribution, or the + overall distribution without the subgroup, or alternatively measuring the distance + between all possible pairs of subgroups, respectively. Defaults to "dist_to_all". p_value (bool, optional): Whether or not to compute a p-value for the distances. 
max_comb (Optional[int], optional): @@ -107,7 +112,6 @@ def distribution_score( """ df = self.df[self.sensitive_attrs + [self.target_attr]].copy() - sensitive_attrs = self.sensitive_attrs # Bin continuous sensitive attributes for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): @@ -119,25 +123,91 @@ def distribution_score( if self.distr_type.is_binary(): df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] - if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) - max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs) - df_dists = [] - - # Try all combinations of sensitive attributes - for k in range(1, max_comb + 1): - for sensitive_attr in combinations(sensitive_attrs, k): - df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)] - if len(df_not_nan) == 0: - continue + # Find all combinations of sensitive attributes + combs = _all_sensitive_combs(df, self.sensitive_attrs, max_comb=max_comb) - df_dist = _calculate_distance(df, self.target_attr, list(sensitive_attr), metric, method, p_value) - df_dists.append(df_dist) + # Computes scores for each sensitive value in a data frame, for each combination of sensitive attributes + if method == "pairwise": + df_dists = [_calculate_distance_pair(df, self.target_attr, comb, metric, p_value) for comb in combs] + else: + df_dists = [_calculate_distance(df, self.target_attr, comb, metric, method, p_value) for comb in combs] df_dist = pd.concat(df_dists, ignore_index=True) - return df_dist.reset_index(drop=True) + return df_dist + + def plot_dendrogram(self, threshold: float, metric: str = "auto", ax: Optional[Axes] = None) -> Axes: + """Hierarchically clusters the sensitive subgroups using the metric and plots + the resulting tree in a dendrogram. 
+ + Args: + threshold (float): + The linkage distance threshold, above which clusters will not be merged. + metric (str, optional): + Choose the metric to use. If set to "auto" chooses the metric depending on + the distribution of the target variable. Defaults to "auto". + ax (Optional[matplotlib.axes.Axes], optional): + An axis to plot the figure on. Set to plt.gca() if None. Defaults to None. + + Returns: + matplotlib.axes.Axes: + The matplotlib axis containing the plot. + """ + + if ax is None: + ax = plt.gca() + + df = self.df[self.sensitive_attrs + [self.target_attr]].copy() + + # Bin continuous sensitive attributes + for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): + if distr_type.is_continuous() or distr_type.is_datetime(): + col = utils.infer_dtype(df[attr]) + df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True) + + # Convert binary attributes to 0s and 1s + if self.distr_type.is_binary(): + df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] + + if len(self.sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: + return ax + + groups = [] + for vs in [[{attr: [val]} for val in df[attr].unique()] for attr in self.sensitive_attrs]: + groups.extend(vs) + + dist_matrix = np.zeros((len(groups), len(groups))) + for i, g1 in enumerate(groups): + for j, g2 in enumerate(groups): + dist_matrix[i][j] = abs(stat_distance(df, self.target_attr, g1, g2, mode=metric)[0]) + + model = AgglomerativeClustering( + n_clusters=None, + distance_threshold=threshold, + affinity="precomputed", + linkage="average", + compute_full_tree=True, + ) + model = model.fit(dist_matrix) + + # Create Dendrogram + counts = np.zeros(model.children_.shape[0]) + n_samples = len(model.labels_) + for i, merge in enumerate(model.children_): + current_counts = [1 if child_idx < n_samples else counts[child_idx - n_samples] for child_idx in merge] + counts[i] = sum(current_counts) + + linkage_matrix = 
np.column_stack([model.children_, model.distances_, counts]).astype(float) + + # Plot the corresponding dendrogram + group_names = [list(group.values())[0][0] for group in groups] + _ = dendrogram(linkage_matrix, labels=group_names, ax=ax) + ax.tick_params(axis="x", labelrotation=90) + + return ax def plot_distributions( self, @@ -301,7 +371,7 @@ def compare_group_statistics( def calculate_score(df_dist: pd.DataFrame) -> float: - """Calculate the weighted mean pairwise statistical distance. + """Calculate the weighted mean of statistical distances. Args: df_dist (pd.DataFrame): @@ -358,3 +428,65 @@ def _calculate_distance( df_dist.drop(columns=["P-Value"], inplace=True) return df_dist + + +def _calculate_distance_pair( + df: pd.DataFrame, + target_attr: str, + sensitive_attrs: Sequence[str], + metric: str = "auto", + p_value: bool = False, +) -> pd.DataFrame: + + unique = df[sensitive_attrs].drop_duplicates() + + dist = [] + + for i_index, i in unique.iterrows(): + for j_index, j in unique.iterrows(): + if i_index == j_index: + continue + + group1 = {attr: [value] for attr, value in i.to_dict().items()} + group2 = {attr: [value] for attr, value in j.to_dict().items()} + + preds = utils.get_predicates_mult(df, [group1, group2]) + pred1, pred2 = preds[0], preds[1] + + dist_res = stat_distance(df, target_attr, pred1, pred2, mode=metric, p_value=p_value) + distance = dist_res[0] + p = dist_res[1] if p_value else 0 + + dist.append( + { + "Positive Group": ", ".join(map(str, i.to_dict().values())), + "Negative Group": ", ".join(map(str, j.to_dict().values())), + "Distance": distance, + "Positive Counts": len(df[pred1]), + "Negative Counts": len(df[pred2]), + "Counts": len(df[pred1]) + len(df[pred2]), + "P-Value": p, + } + ) + + df_dist = pd.DataFrame(dist) + + if not p_value: + df_dist.drop(columns=["P-Value"], inplace=True) + + return df_dist + + +def _all_sensitive_combs(df: pd.DataFrame, sensitive_attrs: Sequence[str], max_comb: Optional[int] = None): + max_comb = 
min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs) + + groups = [] + for k in range(1, max_comb + 1): + for sensitive_attr in combinations(sensitive_attrs, k): + df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)] + if len(df_not_nan) == 0: + continue + + groups.append(list(sensitive_attr)) + + return groups diff --git a/tests/test_scorer.py b/tests/test_scorer.py index ed42b2fc..4994766e 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -59,14 +59,46 @@ def test_sensitive_attr_detection(): assert fscorer.sensitive_attrs == ["DateOfBirth", "Ethnicity", "Language", "MaritalStatus", "RawScore", "Sex"] -def test_distribution_score(): +def test_distribution_score_all(): fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) - df_dist = fscorer.distribution_score() + df_dist = fscorer.distribution_score(method="all") + score = calculate_score(df_dist) + + assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum() + + +def test_distribution_score_rest(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + df_dist = fscorer.distribution_score(method="rest") score = calculate_score(df_dist) assert score * df_dist["Counts"].sum() == (df_dist["Distance"] * df_dist["Counts"]).sum() +def test_pairwise_compas(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + df_dist = fscorer.distribution_score(method="pairwise") + + assert (df_dist["Distance"] > 0).all() + + +def test_pairwise_adult(): + fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) + df_dist = fscorer.distribution_score(metric="binomial", method="pairwise") + + assert (df_dist["Distance"] != 0).all() + + +def test_dendrogram_compas(): + fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) + fscorer.plot_dendrogram(0.1) + + +def test_dendrogram_adult(): + fscorer = FairnessScorer(dfa, "class", ["race", "sex"]) + fscorer.plot_dendrogram(0.1) + + def 
test_group_statistics_manual(): fscorer = FairnessScorer(dfc, "RawScore", ["Ethnicity", "Sex"]) df_stats = fscorer.compare_group_statistics(