def f1(dataframe, col):
    """Count how many rows carry each distinct value of column ``col``.

    The frame is grouped by ``col`` and the non-null entries of that same
    column are counted per group.

    Args:
        dataframe: Frame-like object supporting ``groupby(col)[col].count()``
            (e.g. a pandas DataFrame).
        col: Name of the column to tally.

    Returns:
        dict: Maps each distinct value of ``col`` to its row count.
    """
    per_value_counts = dataframe.groupby(col)[col].count()
    return dict(per_value_counts)


def f2(dataframe, col, bins=10):
    """Build histogram counts for the non-null values of column ``col``.

    The histogram range spans the column's minimum to its maximum.

    Args:
        dataframe: Frame-like object indexable by column name.
        col: Name of the column to histogram.
            NOTE(review): presumably numeric -- confirm against callers.
        bins: Number of equal-width bins (defaults to 10, the value that
            was previously hard-coded).

    Returns:
        The counts object returned by ``dask.array.histogram``; the bin
        edges are discarded. NOTE(review): per the dask documentation this
        is a dask array that still has to be computed by the caller.
    """
    # Imported locally so this helper does not depend on module-level
    # aliases being present.
    import dask.array as da
    import dask.dataframe as dd

    column = dataframe[col]
    lower = column.min()
    upper = column.max()

    # Null entries are dropped before binning.
    values = dd.from_array(column).dropna().values
    counts, _edges = da.histogram(values, range=[lower, upper], bins=bins)
    return counts