import math

import numpy as np
import pandas as pd
import tqdm
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def impute_missing_values(df, var_deviation_tolerance=0.97, actual_or_gaussian_residuals='actual',
                          col_floor_ceiling_dict=None, scores=False):
    '''Impute missing values while minimizing distortion of each variable's distribution,
    by fitting a bagged model on the other variables and adding resampled residuals to
    the model outputs.

    Parameters:
        df: dataframe with missing values
        var_deviation_tolerance: target percent deviation from the original variable
            distributions; determines the number of bagged estimators per column
        actual_or_gaussian_residuals: 'actual' resamples residuals from the model's actual
            residual distribution; any other value draws them from a gaussian based on the
            residuals' mean and variance
        col_floor_ceiling_dict: a dictionary mapping a variable name to a (min, max) tuple
            for variables with a finite range. Use float('inf') or float('-inf') for
            variables that are limited in only one direction
        scores: if True, also return the accuracy score of the model for each variable

    Returns:
        df: df with imputed values (object columns are returned label-encoded)
        problems: columns that failed to impute
        column_scores: accuracy scores of each imputation model on non-missing values
            (only returned when scores is True)
    '''
    df = df.copy()
    columns = df.columns
    type_dict = df.dtypes.to_dict()
    # Impute columns in order of increasing missingness.
    na_counts = df.isna().sum()
    missing_columns = list(na_counts[na_counts > 0].sort_values().index)
    column_scores = {}
    problems = []
    for col in tqdm.tqdm(missing_columns):
        try:
            percent_missing = df[col].isna().sum() / df.shape[0]
            # Number of bagged estimators needed to keep the deviation from the
            # original distribution within the tolerance.
            m = math.ceil(percent_missing / ((1 / var_deviation_tolerance) - 1))
            other_columns = [i for i in columns if i != col]
            na_index = df[df[col].isna()].index
            have_index = df.index.difference(na_index)
            # Use only predictor columns that are fully observed both in the rows
            # missing this column and in the rows that have it.
            na_have_cols = set(df.loc[na_index, other_columns].dropna(axis=1).columns)
            have_have_cols = set(df.loc[have_index, other_columns].dropna(axis=1).columns)
            both_cols = list(na_have_cols.intersection(have_have_cols))
            int_df = pd.get_dummies(df.loc[:, both_cols], drop_first=True)
            X_have = int_df.loc[have_index, :]
            y_have = df.loc[have_index, col]
            X_na = int_df.loc[na_index, :]
            if type_dict[col] == 'object':
                # Encode string labels as integers so residuals can be computed
                # numerically; the column is returned in encoded form.
                le = LabelEncoder()
                y_have = le.fit_transform(y_have)
                df.loc[have_index, col] = y_have
                rf = RandomForestClassifier()
                # `estimator` was named `base_estimator` in scikit-learn < 1.2.
                bagc = BaggingClassifier(estimator=rf, n_estimators=m)
                bagc.fit(X_have, y_have)
                column_scores[col] = bagc.score(X_have, y_have)
                resid_preds = bagc.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagc.predict(X_na)
            else:
                bagr = BaggingRegressor(n_estimators=m)
                bagr.fit(X_have, y_have)
                column_scores[col] = bagr.score(X_have, y_have)
                resid_preds = bagr.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagr.predict(X_na)
            if actual_or_gaussian_residuals == 'actual':
                # Add residuals resampled from the empirical residual distribution.
                rand_resids = np.random.choice(residuals, len(X_na), replace=True)
            else:
                # Add residuals drawn from a gaussian fit to the residuals.
                rand_resids = np.random.normal(residuals.mean(), residuals.std(), len(X_na))
            preds = preds + rand_resids
            if type_dict[col] == 'object':
                # Round perturbed outputs back to integer labels, clipped to the
                # valid encoded range so they remain decodable.
                preds = np.clip(preds.round(), 0, len(le.classes_) - 1)
            if col_floor_ceiling_dict is not None and col in col_floor_ceiling_dict:
                preds = np.clip(preds, col_floor_ceiling_dict[col][0], col_floor_ceiling_dict[col][1])
            df.loc[na_index, col] = preds
        except Exception:
            problems.append(col)
    if scores:
        return df, problems, column_scores
    return df, problems
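
# A minimal usage sketch (illustrative only: the toy DataFrame, its column names,
# and the (0, 120) floor/ceiling for 'age' are assumptions, not part of the original
# code). It imputes the two missing 'age' values from 'income' and 'city', clips them
# to a plausible range, and prints the models' in-sample scores.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'age': [23.0, 35.0, np.nan, 41.0, 29.0, np.nan, 52.0, 33.0],
        'income': [40.0, 52.0, 61.0, 55.0, 45.0, 58.0, 70.0, 49.0],
        'city': ['NY', 'LA', 'NY', 'SF', 'SF', 'LA', 'SF', 'NY'],
    })
    imputed, problems, col_scores = impute_missing_values(
        toy,
        col_floor_ceiling_dict={'age': (0, 120)},
        scores=True,
    )
    print(imputed)
    print('failed columns:', problems)
    print('in-sample scores:', col_scores)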