work on data imputation

carusyte · carusyte · commit 2dc2b387a94f · 2020-07-02T10:35:53.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ build/
 *.egg-info/
 .DS_Store
 corl/wc_data/impute/impute_run.sh
+PIP_TARGET=/
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.7.7
diff --git a/corl/wc_data/impute/MultipleImputation.py b/corl/wc_data/impute/MultipleImputation.py
@@ -0,0 +1,90 @@
+import random
+import math
+from sklearn.ensemble import BaggingClassifier
+from sklearn.ensemble import BaggingRegressor
+from sklearn.ensemble import RandomForestClassifier
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn import datasets
+import random
+import tqdm
+from sklearn.preprocessing import LabelEncoder
+
+def impute_missing_values(df, var_deviation_tolerance=0.97, actual_or_gaussian_residuals='actual',
+                          col_floor_ceiling_dict=None, scores=False):
+    '''Impute missing values while minimizing distortion of variable distribution
+    by creating a bagged model using other variables and adding residuals to output values
+
+    Columns are processed from the fewest to the most missing values. Each one is
+    predicted from the other columns that currently hold no NaN, so a column imputed
+    earlier can serve as a predictor for a later one.
+
+    Object (categorical) columns are label-encoded: on success the observed values are
+    replaced by their integer codes and the imputed values are rounded codes. The codes
+    are not decoded back, and an imputed code is only kept inside a range when
+    col_floor_ceiling_dict says so.
+
+    Parameters:
+    df: dataframe with missing values; it is copied, the caller's frame is left untouched
+    var_deviation_tolerance: number strictly between 0 and 1 that sizes the ensemble; the
+        number of bagged estimators is ceil(percent_missing / (1/var_deviation_tolerance - 1))
+    actual_or_gaussian_residuals: 'actual' draws residuals (with replacement) from the model's
+        in-sample residuals; any other value draws them from a gaussian distribution based on
+        the residuals' mean and standard deviation
+    col_floor_ceiling_dict: a dictionary with the variable name and a tuple of the min and max for
+        variables with a finite range. Use float('inf') or float('-inf') for variables that are
+        limited in only one direction
+    scores: when truthy, also return the score of the model fitted for each variable
+
+    Returns:
+    df: df with imputed values
+    problems: columns that failed to impute (they are left as they were)
+    column_scores: only when scores is truthy; score of each imputation model on the
+        non-missing rows it was trained on
+
+    Raises:
+    ValueError: if var_deviation_tolerance is not strictly between 0 and 1
+    '''
+    if not 0 < var_deviation_tolerance < 1:
+        # 1 would divide by zero and anything outside (0, 1) gives a non-positive ensemble size
+        raise ValueError('var_deviation_tolerance must be strictly between 0 and 1')
+    df = df.copy()
+    columns = list(df.columns)
+    # dtypes are captured up front because object columns get overwritten with codes below
+    type_dict = df.dtypes.to_dict()
+    na_counts = df.isna().sum()
+    missing_columns = list(na_counts[na_counts > 0].sort_values().index)
+    column_scores = {}
+    problems = []
+    for col in tqdm.tqdm(missing_columns):
+        try:
+            na_mask = df[col].isna()
+            have_mask = ~na_mask
+            percent_missing = na_mask.sum() / df.shape[0]
+            m = math.ceil(percent_missing / ((1 / var_deviation_tolerance) - 1))
+            other_columns = [i for i in columns if i != col]
+            # predictors must be complete on both the observed and the missing rows,
+            # i.e. contain no NaN at all; a list keeps the feature order deterministic
+            both_cols = [i for i in other_columns if df[i].notna().all()]
+            int_df = pd.get_dummies(df.loc[:, both_cols], drop_first=True)
+            X_have = int_df.loc[have_mask, :]
+            X_na = int_df.loc[na_mask, :]
+            y_have = df.loc[have_mask, col]
+            is_categorical = type_dict[col] == 'object'
+            if is_categorical:
+                y_have = LabelEncoder().fit_transform(y_have)
+                # the base estimator is passed positionally because its keyword name
+                # differs between scikit-learn releases
+                model = BaggingClassifier(RandomForestClassifier(), n_estimators=m)
+            else:
+                model = BaggingRegressor(n_estimators=m)
+            model.fit(X_have, y_have)
+            column_scores[col] = model.score(X_have, y_have)
+            residuals = y_have - model.predict(X_have)
+            preds = model.predict(X_na)
+            if actual_or_gaussian_residuals == 'actual':
+                rand_resids = np.random.choice(residuals, len(X_na), replace=True)
+            else:
+                rand_resids = np.random.normal(residuals.mean(), residuals.std(), len(X_na))
+            preds = preds + rand_resids
+            if is_categorical:
+                # codes are integers, so snap the noisy prediction back onto one
+                preds = preds.round()
+            if col_floor_ceiling_dict is not None and col in col_floor_ceiling_dict:
+                preds = np.clip(preds, col_floor_ceiling_dict[col][0], col_floor_ceiling_dict[col][1])
+            # write back only once everything succeeded, and through .loc so the
+            # assignment reaches df itself instead of a temporary copy
+            if is_categorical:
+                df.loc[have_mask, col] = y_have
+            df.loc[na_mask, col] = preds
+        except Exception:
+            # best effort: report the column and carry on with the remaining ones
+            problems.append(col)
+    if not scores:
+        return df, problems
+    return df, problems, column_scores
diff --git a/corl/wc_data/impute/common.py b/corl/wc_data/impute/common.py
@@ -1,11 +1,13 @@
-import pandas as pd
-import numpy as np
+import argparse
+from time import strftime
 from mysql.connector.pooling import MySQLConnectionPool
 
+def print_header(msg):
+    '''Print msg followed by a line of dashes of the same length.'''
+    print(f"{msg}\n{'-'*len(msg)}")
+
 cnxpool = None
 
-def _parseArgs():
-    parser = argparse.ArgumentParser()
+def parseArgs():
+    parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument('--db_host',
                         type=str,
                         help='database host address',
@@ -20,8 +22,10 @@ def _parseArgs():
                         default=None)
     return parser.parse_args()
 
-def _init(db_pool_size=None, db_host=None, db_port=None, db_pwd=None):
+def init(db_pool_size=None, db_host=None, db_port=None, db_pwd=None):
     global cnxpool
+    if cnxpool is not None:
+        return cnxpool
     print("{} initializing mysql connection pool...".format(
         strftime("%H:%M:%S")))
     cnxpool = MySQLConnectionPool(
@@ -35,18 +39,4 @@ def _init(db_pool_size=None, db_host=None, db_port=None, db_pwd=None):
         # ssl_ca='',
         # use_pure=True,
         connect_timeout=90000)
-
-def _impute():
-    global cnxpool
-    c = cnxpool.get_connection()
-    query = 'select code, date, amount, xrate, close, high, high_close, open, open_close, low, low_close, volume from index_d_n_lr'
-    df = pd.read_sql(query, c)
-
-if __name__ == '__main__':
-    args = _parseArgs()
-    _init(4, 
-        db_host=args.db_host,
-        db_port=args.db_port, 
-        db_pwd=args.db_pwd
-    )
-    _impute()
+    return cnxpool
diff --git a/corl/wc_data/impute/impute_tmpl.sh b/corl/wc_data/impute/impute_tmpl.sh
@@ -1,2 +1,2 @@
 #!/bin/sh
-python data_imputer.py --db_host=['replace'] --db_port=['replace'] --db_pwd=['replace']
+python with_autoimpute.py --db_host=['replace'] --db_port=['replace'] --db_pwd=['replace']
diff --git a/corl/wc_data/impute/with_autoimpute.py b/corl/wc_data/impute/with_autoimpute.py
@@ -0,0 +1,49 @@
+import pandas as pd
+import numpy as np
+from time import strftime
+from mysql.connector.pooling import MySQLConnectionPool
+from corl.wc_data.impute.common import parseArgs, init, print_header
+from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer
+
+import warnings
+# Suppress every warning for the whole process (presumably to keep the printed
+# tables readable -- NOTE(review): this also hides deprecation warnings).
+warnings.filterwarnings("ignore")
+
+# Connection pool handle; stays None until the __main__ block stores the pool returned by init().
+cnxpool = None
+
+
+def _impute():
+    '''Load index_d_n_lr, run MICE imputation on its numeric columns and print the results.
+
+    Reads the module-level cnxpool, which the __main__ block sets before calling this
+    function. For every imputed dataset produced by MiceImputer, the rows that had a
+    NaN in the original numeric subset are printed with their imputed values.
+    '''
+    query = 'select code, date, amount, xrate, close, high, high_close, open, open_close, low, low_close, volume from index_d_n_lr order by code asc, date asc'
+    c = cnxpool.get_connection()
+    try:
+        df = pd.read_sql(query, c)
+    finally:
+        # closing a pooled connection hands it back to the pool
+        c.close()
+    print_header('original table:')
+    print(df)
+    print_header('Rows having NaN:')
+    nan_df = df[df.isna().any(axis=1)]
+    print(nan_df)
+    sdf = df[['amount', 'xrate', 'close', 'high', 'high_close',
+              'open', 'open_close', 'low', 'low_close', 'volume']]
+    print_header('Subset of Table')
+    print(sdf)
+    # remember which rows were incomplete *before* imputing; afterwards the
+    # same test would no longer match anything
+    rows_with_nan = sdf.isna().any(axis=1)
+    mi = MiceImputer()
+    mi_data_full = mi.fit_transform(sdf)
+
+    for i, m in enumerate(mi_data_full):
+        # the imputer may yield (imputation number, DataFrame) pairs rather than bare frames
+        imputed = m[1] if isinstance(m, tuple) else m
+        print_header("After Imputation #{}".format(i+1))
+        imputed_filtered = imputed[rows_with_nan]
+        print(imputed_filtered)
+
+
+if __name__ == '__main__':
+    # Script entry point: read the database settings from the command line,
+    # create a pool of 4 connections, then run the imputation demo.
+    cli_args = parseArgs()
+    cnxpool = init(db_pool_size=4,
+                   db_host=cli_args.db_host,
+                   db_port=cli_args.db_port,
+                   db_pwd=cli_args.db_pwd)
+    _impute()
diff --git a/corl/wc_data/impute/with_multimp.py b/corl/wc_data/impute/with_multimp.py
@@ -0,0 +1,40 @@
+import pandas as pd
+import numpy as np
+from time import strftime
+from mysql.connector.pooling import MySQLConnectionPool
+from corl.wc_data.impute.common import parseArgs, init
+from corl.wc_data.impute.MultipleImputation import impute_missing_values
+
+
+
+def _impute():
+    '''Load index_d_n_lr, impute it with SingleImputer and print the affected rows.
+
+    Must run after init(...) has created the connection pool: init() is called again
+    here without arguments and returns that existing pool.
+
+    NOTE(review): this module imports impute_missing_values but never calls it; it may
+    have been the intended imputer here -- confirm.
+    '''
+    # imported locally because the module header does not bring these names into scope
+    from autoimpute.imputations import SingleImputer
+    from corl.wc_data.impute.common import print_header
+
+    query = 'select code, date, amount, xrate, close, high, high_close, open, open_close, low, low_close, volume from index_d_n_lr order by code asc, date asc'
+    c = init().get_connection()
+    try:
+        df = pd.read_sql(query, c)
+    finally:
+        # closing a pooled connection hands it back to the pool
+        c.close()
+    print_header('original table:')
+    print(df)
+    print_header('Rows having NaN:')
+    # remember which rows were incomplete *before* imputing; afterwards the
+    # same test would no longer match anything
+    rows_with_nan = df.isna().any(axis=1)
+    nan_df = df[rows_with_nan]
+    print(nan_df)
+    si = SingleImputer()
+    si_data_full = si.fit_transform(df)
+
+    print_header("After Imputation")
+    imputed_filtered = si_data_full[rows_with_nan]
+    print(imputed_filtered)
+
+if __name__ == '__main__':
+    # Script entry point: read the database settings from the command line,
+    # create a pool of 4 connections, then run the imputation demo.
+    args = parseArgs()
+    # keep the pool in the module-level name that _impute() declares as global;
+    # without this binding the name is never defined in this module
+    cnxpool = init(4,
+        db_host=args.db_host,
+        db_port=args.db_port,
+        db_pwd=args.db_pwd
+    )
+    _impute()
diff --git a/corl/wc_data/impute/with_tfp.py b/corl/wc_data/impute/with_tfp.py
@@ -0,0 +1,29 @@
+import tensorflow_probability as tfp
+import tensorflow as tf
+import numpy as np
+
+
+# Toy series in which NaN marks the observations to be imputed.
+time_series_with_nans = [-1., 1., np.nan, 2.4, np.nan, 5]
+# Pair the values with a boolean mask flagging the NaN positions as missing.
+observed_time_series = tfp.sts.MaskedTimeSeries(
+  time_series=time_series_with_nans,
+  is_missing=tf.math.is_nan(time_series_with_nans))
+
+
+# Build model using observed time series to set heuristic priors.
+linear_trend_model = tfp.sts.LocalLinearTrend(
+  observed_time_series=observed_time_series)
+model = tfp.sts.Sum([linear_trend_model],
+                    observed_time_series=observed_time_series)
+
+
+# Fit model to data
+parameter_samples, _ = tfp.sts.fit_with_hmc(model, observed_time_series)
+
+
+# Impute missing values
+imputed_series_distribution = tfp.sts.impute_missing_values(
+  model, observed_time_series, parameter_samples, include_observation_noise=True)
+# Report the mean and standard deviation of the imputed distribution at every time step.
+print('imputed means and stddevs: ',
+      imputed_series_distribution.mean(),
+      imputed_series_distribution.stddev())

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`#!/bin/sh`
`2`		`-python data_imputer.py --db_host=['replace'] --db_port=['replace'] --db_pwd=['replace']`
	`2`	`+python with_autoimpute.py --db_host=['replace'] --db_port=['replace'] --db_pwd=['replace']`