From 4b730b6e428859cb7e699079a4f472116cbeb457 Mon Sep 17 00:00:00 2001 From: SHUBHAM LADDHA Date: Wed, 22 May 2019 11:07:08 -0700 Subject: [PATCH] Add files via upload --- plot.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ tests.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 plot.py create mode 100644 tests.py diff --git a/plot.py b/plot.py new file mode 100644 index 000000000..e6957bbb3 --- /dev/null +++ b/plot.py @@ -0,0 +1,60 @@ +import dask +from dask import delayed +import dask.dataframe as dd +import dask.array as da +import pandas as pd + +def f1(dataframe, col): + x = dataframe.groupby(col)[col].count() + return dict(x) + +def f2(dataframe, col): + minv = dataframe[col].min() + maxv = dataframe[col].max() + #print ('min = ', minv, 'maxv = ', maxv) + #print (dataframe[col]) + dframe = dd.from_array(dataframe[col]).dropna() + h, b = da.histogram(dframe.values, range=[minv, maxv], bins=10) + return h + + +def plot(df, unique_threshold): + + #df = pd.read_csv('C:/Users/sladdha/Desktop/DataPrep/Datasets/Normal.csv') + + ls = list() + for col in df.columns: + ls.append(delayed(df[col].nunique)()) + + x, = dask.compute(ls) + y, = dask.compute(x) + result = list() + + test = [] + + for i, col in enumerate(df.columns): + if (df[col].count()==0): + continue + + + if (y[i]/df[col].count()