From 62823a754b81e0c54ec12b61d341447169d64856 Mon Sep 17 00:00:00 2001 From: Sami Jawhar Date: Mon, 14 Mar 2022 20:35:44 -0700 Subject: [PATCH] Add calculate_cng_indices --- factor_analyzer/factor_analyzer.py | 68 +++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/factor_analyzer/factor_analyzer.py b/factor_analyzer/factor_analyzer.py index dcf6088..9c6ff24 100644 --- a/factor_analyzer/factor_analyzer.py +++ b/factor_analyzer/factor_analyzer.py @@ -8,6 +8,7 @@ """ import warnings +from typing import Tuple import numpy as np import pandas as pd @@ -15,12 +16,13 @@ from scipy.optimize import minimize from scipy.stats import chi2, pearsonr from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.linear_models import LinearRegression from sklearn.utils import check_array from sklearn.utils.extmath import randomized_svd from sklearn.utils.validation import check_is_fitted from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator -from .utils import corr, impute_values, partial_correlations, smc +from .utils import corr, covariance_to_correlation, impute_values, partial_correlations, smc POSSIBLE_SVDS = ['randomized', 'lapack'] @@ -114,6 +116,70 @@ def calculate_bartlett_sphericity(x): return statistic, p_value +def calculate_cng_indices( + data: np.ndarray, model: str = "components" +) -> Tuple[int, pd.DataFrame]: + """Calculate the Cattel-Nelson-Gorsuch indices, which are used to determine + the appropriate number of factors for a factor analysis. + + Direct port of nCng function from nFactors package: + https://rdrr.io/cran/nFactors/man/nCng.html + + Parameters + ---------- + data : array-like + The array of samples x observable for which to calculate CNG indices + model : str + "components" or "factors" + + Returns + ------- + num_factors : int + The number of components/factors to retain + details : pd.DataFrame + The eigenvalues and CNG indices of the dataset + """ + data = corr(data.values) + if model == "factors": + data -= np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(data)))) + # TODO: Should this line be here? + data = covariance_to_correlation(data) + + values = np.sort(np.linalg.eigvals(data))[::-1] + + num_variables = len(data) + if num_variables < 6: + raise ValueError("The number of variables must be at least 6") + + fit_size = 3 + cng = np.diff( + [ + [ + LinearRegression() + .fit(idx_values[:, np.newaxis], values[idx_values]) + .coef_ + for idx_values in [ + np.arange(idx_fit, idx_fit + fit_size), + np.arange(idx_fit + fit_size, idx_fit + 2 * fit_size), + ] + ] + for idx_fit in range(num_variables - 2 * fit_size) + ], + axis=1, + ).squeeze(axis=(1, 2)) + + num_factors = np.nanargmax(cng) + fit_size + + details = pd.DataFrame( + { + "data": values[: len(cng)], + "cng": cng, + } + ) + + return num_factors, details + + class FactorAnalyzer(BaseEstimator, TransformerMixin): """ A FactorAnalyzer class, which -