synthesized-io · Hilly12 · Sep 6, 2021 · Sep 7, 2021 · Sep 9, 2021 · Sep 9, 2021
diff --git a/docs/user_guide/correlations.rst b/docs/user_guide/correlations.rst
@@ -32,8 +32,16 @@ Let's first look at how we would go about detecting correlations inside a datafr
     import fairlens as fl
 
     columns = ["gender", "random", "score"]
-    data = [["male", 10, 50], ["female", 20, 80], ["male", 20, 60], ["female", 10, 90]]
-
+    data = [
+        ["male", 10, 60],
+        ["female", 10, 80],
+        ["male", 10, 60],
+        ["female", 10, 80],
+        ["male", 9, 59],
+        ["female", 11, 80],
+        ["male", 12, 61],
+        ["female", 10, 83],
+    ]
     df = pd.DataFrame(data, columns=columns)
 
 Here the score seems to be correlated with gender, with females leaning towards somewhat higher scores.
@@ -65,7 +73,7 @@ Correlation Heatmaps
 ^^^^^^^^^^^^^^^^^^^^
 
 The :code:`plot` module allows users to generate a correlation heatmap of any dataset by simply
-passing the dataframe to the :code:`two_column_heatmap()` function, which will plot a heatmap from the
+passing the dataframe to the :code:`heatmap()` function, which will plot a heatmap from the
 matrix of the correlation coefficients computed by using the Pearson Coefficient, the Kruskal-Wallis
 Test and Cramer's V between each two of the columns (for numerical-numerical, categorical-numerical and
 categorical-categorical associations, respectively).
@@ -92,19 +100,17 @@ This will automatically choose different methods for different types of data, ho
 are configurable.
 
 .. ipython:: python
-    :okwarning:
 
     @savefig corr_heatmap_1.png
-    fl.plot.two_column_heatmap(df)
+    fl.plot.heatmap(df)
 
 
 Let's try generating a heatmap of the same dataset, but using some non-linear metrics
 for numerical-numerical and numerical-categorical associations for added precision.
 
 .. ipython:: python
-    :okwarning:
 
     from fairlens.metrics import distance_nn_correlation, distance_cn_correlation, cramers_v
 
     @savefig corr_heatmap_2.png
-    fl.plot.two_column_heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
+    fl.plot.heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
diff --git a/src/fairlens/metrics/__init__.py b/src/fairlens/metrics/__init__.py
@@ -23,6 +23,7 @@
     cramers_v,
     distance_cn_correlation,
     distance_nn_correlation,
+    pearson,
     r2_mcfadden,
     kruskal_wallis,
     kruskal_wallis_boolean,
@@ -58,6 +59,7 @@
     "cramers_v",
     "distance_cn_correlation",
     "distance_nn_correlation",
+    "pearson",
     "r2_mcfadden",
     "kruskal_wallis",
     "kruskal_wallis_boolean",

diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py
@@ -11,6 +11,9 @@
 from sklearn import linear_model
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
 
+EPSILON = 1e-6
+MIN_MEAN_SAMPLE_SIZE = 20
+
 
 def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
     """Metric that calculates the corrected Cramer's V statistic for categorical-categorical
@@ -23,43 +26,50 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
             Second categorical series to analyze.
 
     Returns:
-        float: Value of the statistic.
+        float:
+            Value of the statistic.
     """
 
-    if len(sr_a.value_counts()) == 1:
+    if sr_a.equals(sr_b):
+        return 1
+
+    confusion_matrix = pd.crosstab(sr_a, sr_b)
+    r, k = confusion_matrix.shape
+    n = confusion_matrix.to_numpy().sum()
+
+    if r < 2 or k < 2:
         return 0
-    if len(sr_b.value_counts()) == 1:
+
+    chi2 = ss.chi2_contingency(confusion_matrix, correction=(confusion_matrix.shape[0] != 2))[0]
+    phi2 = chi2 / n
+
+    phi2corr = phi2 - ((k - 1) * (r - 1)) / (n - 1)
+
+    if phi2corr <= EPSILON:
         return 0
-    else:
-        confusion_matrix = pd.crosstab(sr_a, sr_b)
 
-        if confusion_matrix.shape[0] == 2:
-            correct = False
-        else:
-            correct = True
+    rcorr = r - ((r - 1) ** 2) / (n - 1)
+    kcorr = k - ((k - 1) ** 2) / (n - 1)
 
-        chi2 = ss.chi2_contingency(confusion_matrix, correction=correct)[0]
-        n = sum(confusion_matrix.sum())
-        phi2 = chi2 / n
-        r, k = confusion_matrix.shape
-        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
-        rcorr = r - ((r - 1) ** 2) / (n - 1)
-        kcorr = k - ((k - 1) ** 2) / (n - 1)
-        return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
+    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
 
 
 def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float:
-    """Metric that calculates Pearson's correlation coefficent for numerical-numerical
+    """Calculates Pearson's correlation coefficient for numerical-numerical
     pairs of series, used in heatmap generation.
 
     Args:
-        sr_a (pd.Series): First numerical series to analyze.
-        sr_b (pd.Series): Second numerical series to analyze.
+        sr_a (pd.Series):
+            First numerical series to analyze.
+        sr_b (pd.Series):
+            Second numerical series to analyze.
 
     Returns:
-        float: Value of the coefficient.
+        float:
+            Value of the coefficient.
     """
-    return abs(sr_a.corr(sr_b))
+
+    return sr_a.corr(sr_b, method="pearson")
 
 
 def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
@@ -78,6 +88,7 @@ def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
     Returns:
         float: Value of the pseudo-R2 McFadden score.
     """
+
     x = sr_b.to_numpy().reshape(-1, 1)
     x = StandardScaler().fit_transform(x)
     y = sr_a.to_numpy()
@@ -120,16 +131,17 @@ def kruskal_wallis(sr_a: pd.Series, sr_b: pd.Series) -> float:
             p-value is the probability that the two columns are not correlated.
     """
 
-    sr_a = sr_a.astype("category").cat.codes
     groups = sr_b.groupby(sr_a)
-    arrays = [groups.get_group(category) for category in sr_a.unique()]
+    if len(groups) < 2:
+        return 0
+
+    args = [groups.get_group(category).array for category in sr_a.unique()]
 
-    args = [group.array for group in arrays]
-    try:
-        _, p_val = ss.kruskal(*args, nan_policy="omit")
-    except ValueError:
+    if np.mean([len(values) for values in args]) <= MIN_MEAN_SAMPLE_SIZE:
         return 0
 
+    _, p_val = ss.kruskal(*args, nan_policy="omit")
+
     return p_val
 
 
@@ -147,7 +159,8 @@ def kruskal_wallis_boolean(sr_a: pd.Series, sr_b: pd.Series, p_cutoff: float = 0
             The maximum admitted p-value for the distributions to be considered independent.
 
     Returns:
-        bool: Bool value representing whether or not the two series are correlated.
+        bool:
+            Bool value representing whether or not the two series are correlated.
     """
 
     sr_a = sr_a.astype("category").cat.codes
@@ -181,8 +194,6 @@ def distance_nn_correlation(sr_a: pd.Series, sr_b: pd.Series) -> float:
             The correlation coefficient.
     """
 
-    warnings.filterwarnings(action="ignore", category=UserWarning)
-
     if sr_a.size < sr_b.size:
         sr_a = sr_a.append(pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size), ignore_index=True)
     elif sr_a.size > sr_b.size:

diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py
@@ -2,9 +2,9 @@
 Collection of helper methods which can be used as to interface metrics.
 """
 
-import multiprocessing as mp
-from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union
+from typing import Any, Callable, List, Mapping, Tuple, Type, Union
 
+import numpy as np
 import pandas as pd
 
 from .. import utils
@@ -118,8 +118,6 @@ def correlation_matrix(
     num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
     cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
     cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
-    columns_x: Optional[List[str]] = None,
-    columns_y: Optional[List[str]] = None,
 ) -> pd.DataFrame:
     """This function creates a correlation matrix out of a dataframe, using a correlation metric for each
     possible type of pair of series (i.e. numerical-numerical, categorical-numerical, categorical-categorical).
@@ -135,60 +133,62 @@ def correlation_matrix(
         cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional):
             The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V
             statistic.
-        columns_x (Optional[List[str]]):
-            The column names that determine the rows of the matrix.
-        columns_y (Optional[List[str]]):
-            The column names that determine the columns of the matrix.
 
     Returns:
         pd.DataFrame:
             The correlation matrix to be used in heatmap generation.
     """
 
-    if columns_x is None:
-        columns_x = df.columns
+    df = df.copy()
 
-    if columns_y is None:
-        columns_y = df.columns
+    distr_types = [utils.infer_distr_type(df[col]) for col in df.columns]
 
-    pool = mp.Pool(mp.cpu_count())
+    for col in df.columns:
+        df[col] = utils.infer_dtype(df[col])
 
-    series_list = [
-        pd.Series(
-            pool.starmap(
-                _correlation_matrix_helper,
-                [(df[col_x], df[col_y], num_num_metric, cat_num_metric, cat_cat_metric) for col_x in columns_x],
-            ),
-            index=columns_x,
-            name=col_y,
-        )
-        for col_y in columns_y
-    ]
+        if df[col].dtype.kind == "O":
+            df[col] = pd.Series(pd.factorize(df[col], na_sentinel=-1)[0]).replace(-1, np.nan)
+
+    df = df.append(pd.DataFrame({col: [i] for i, col in enumerate(df.columns)}))
 
-    pool.close()
+    def corr(a: np.ndarray, b: np.ndarray):
+        return _correlation_matrix_helper(
+            a,
+            b,
+            distr_types=distr_types,
+            num_num_metric=num_num_metric,
+            cat_num_metric=cat_num_metric,
+            cat_cat_metric=cat_cat_metric,
+        )
 
-    return pd.concat(series_list, axis=1, keys=[series.name for series in series_list])
+    return df.corr(method=corr)
 
 
 def _correlation_matrix_helper(
-    sr_a: pd.Series,
-    sr_b: pd.Series,
+    a: np.ndarray,
+    b: np.ndarray,
+    distr_types: List[utils.DistrType],
     num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
     cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
     cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
 ) -> float:
 
-    a_type = utils.infer_distr_type(sr_a)
-    b_type = utils.infer_distr_type(sr_b)
+    a_type = distr_types[int(a[-1])]
+    b_type = distr_types[int(b[-1])]
+
+    sr_a = pd.Series(a[:-1])
+    sr_b = pd.Series(b[:-1])
+
+    df = pd.DataFrame({"a": sr_a, "b": sr_b}).dropna().reset_index()
 
     if a_type.is_continuous() and b_type.is_continuous():
-        return num_num_metric(sr_a, sr_b)
+        return num_num_metric(df["a"], df["b"])
 
     elif b_type.is_continuous():
-        return cat_num_metric(sr_a, sr_b)
+        return cat_num_metric(df["a"], df["b"])
 
     elif a_type.is_continuous():
-        return cat_num_metric(sr_b, sr_a)
+        return cat_num_metric(df["b"], df["a"])
 
     else:
-        return cat_cat_metric(sr_a, sr_b)
+        return cat_cat_metric(df["a"], df["b"])
diff --git a/src/fairlens/plot/__init__.py b/src/fairlens/plot/__init__.py
@@ -3,8 +3,8 @@
 """
 
 
+from .correlation import heatmap
 from .distr import attr_distr_plot, distr_plot, mult_distr_plot
-from .heatmap import two_column_heatmap
 from .style import reset_style, use_style
 
-__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "two_column_heatmap"]
+__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "heatmap"]
diff --git a/src/fairlens/plot/correlation.py b/src/fairlens/plot/correlation.py
@@ -0,0 +1,64 @@
+"""
+Plot correlation heatmaps for datasets.
+"""
+
+from typing import Callable, Optional, Sequence, Tuple
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from matplotlib.axes import Axes
+
+from ..metrics import correlation, unified
+
+
+def heatmap(
+    df: pd.DataFrame,
+    num_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.pearson,
+    cat_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.kruskal_wallis,
+    cat_cat_metric: Callable[[pd.Series, pd.Series], float] = correlation.cramers_v,
+    cmap: Optional[Sequence[Tuple[float, float, float]]] = None,
+    annotate: bool = False,
+) -> Axes:
+    """This function creates a correlation heatmap out of a dataframe, using user provided or default correlation
+    metrics for all possible types of pairs of series (i.e. numerical-numerical, categorical-numerical,
+    categorical-categorical).
+
+    Args:
+        df (pd.DataFrame):
+            The dataframe used for computing correlations and producing a heatmap.
+        num_num_metric (Callable[[pd.Series, pd.Series], float], optional):
+            The correlation metric used for numerical-numerical series pairs. Defaults to Pearson's correlation
+            coefficient.
+        cat_num_metric (Callable[[pd.Series, pd.Series], float], optional):
+            The correlation metric used for categorical-numerical series pairs. Defaults to Kruskal-Wallis' H Test.
+        cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional):
+            The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V
+            statistic.
+        cmap (Optional[Sequence[Tuple[float, float, float]]], optional):
+            A sequence of RGB tuples used to colour the heatmap. If None, a seaborn cubehelix palette
+            will be used. Defaults to None.
+        annotate (bool, optional):
+            Whether to annotate each cell of the heatmap with its value. Defaults to False.
+
+    Returns:
+        matplotlib.axes.Axes:
+            The matplotlib axis containing the plot.
+
+    Examples:
+        >>> df = pd.read_csv("datasets/german_credit_data.csv")
+        >>> heatmap(df)
+        >>> plt.show()
+
+        .. image:: ../../savefig/corr_heatmap_1.png
+    """
+
+    corr_matrix = unified.correlation_matrix(df, num_num_metric, cat_num_metric, cat_cat_metric)
+
+    cmap = cmap or sns.cubehelix_palette(start=0.2, rot=-0.2, dark=0.3, as_cmap=True)
+    annot = annotate or None
+
+    ax = sns.heatmap(corr_matrix, vmin=0, vmax=1, square=True, cmap=cmap, linewidth=0.5, annot=annot, fmt=".1f")
+    plt.tight_layout()
+
+    return ax