From 62823a754b81e0c54ec12b61d341447169d64856 Mon Sep 17 00:00:00 2001
From: Sami Jawhar <sami@thecybermonk.com>
Date: Mon, 14 Mar 2022 20:35:44 -0700
Subject: [PATCH] Add calculate_cng_indices

---
 factor_analyzer/factor_analyzer.py | 68 +++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/factor_analyzer/factor_analyzer.py b/factor_analyzer/factor_analyzer.py
index dcf6088..9c6ff24 100644
--- a/factor_analyzer/factor_analyzer.py
+++ b/factor_analyzer/factor_analyzer.py
@@ -8,6 +8,7 @@
 """
 
 import warnings
+from typing import Tuple
 
 import numpy as np
 import pandas as pd
@@ -15,12 +16,13 @@
 from scipy.optimize import minimize
 from scipy.stats import chi2, pearsonr
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.linear_models import LinearRegression
 from sklearn.utils import check_array
 from sklearn.utils.extmath import randomized_svd
 from sklearn.utils.validation import check_is_fitted
 
 from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator
-from .utils import corr, impute_values, partial_correlations, smc
+from .utils import corr, covariance_to_correlation, impute_values, partial_correlations, smc
 
 POSSIBLE_SVDS = ['randomized', 'lapack']
 
@@ -114,6 +116,70 @@ def calculate_bartlett_sphericity(x):
     return statistic, p_value
 
 
+def calculate_cng_indices(
+    data: np.ndarray, model: str = "components"
+) -> Tuple[int, pd.DataFrame]:
+    """Calculate the Cattel-Nelson-Gorsuch indices, which are used to determine
+    the appropriate number of factors for a factor analysis.
+
+    Direct port of nCng function from nFactors package:
+    https://rdrr.io/cran/nFactors/man/nCng.html
+
+    Parameters
+    ----------
+    data : array-like
+        The array of samples x observable for which to calculate CNG indices
+    model : str
+        "components" or "factors"
+
+    Returns
+    -------
+    num_factors : int
+        The number of components/factors to retain
+    details : pd.DataFrame
+        The eigenvalues and CNG indices of the dataset
+    """
+    data = corr(data.values)
+    if model == "factors":
+        data -= np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(data))))
+        # TODO: Should this line be here?
+        data = covariance_to_correlation(data)
+
+    values = np.sort(np.linalg.eigvals(data))[::-1]
+
+    num_variables = len(data)
+    if num_variables < 6:
+        raise ValueError("The number of variables must be at least 6")
+
+    fit_size = 3
+    cng = np.diff(
+        [
+            [
+                LinearRegression()
+                .fit(idx_values[:, np.newaxis], values[idx_values])
+                .coef_
+                for idx_values in [
+                    np.arange(idx_fit, idx_fit + fit_size),
+                    np.arange(idx_fit + fit_size, idx_fit + 2 * fit_size),
+                ]
+            ]
+            for idx_fit in range(num_variables - 2 * fit_size)
+        ],
+        axis=1,
+    ).squeeze(axis=(1, 2))
+
+    num_factors = np.nanargmax(cng) + fit_size
+
+    details = pd.DataFrame(
+        {
+            "data": values[: len(cng)],
+            "cng": cng,
+        }
+    )
+
+    return num_factors, details
+
+
 class FactorAnalyzer(BaseEstimator, TransformerMixin):
     """
     A FactorAnalyzer class, which -