Skip to content

Commit c3e1c2d

Browse files
committed
Adding Madelyn's code
1 parent 8ea2916 commit c3e1c2d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+16373
-60
lines changed

data/ContinuousDataSet.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#!/usr/bin/env python3
2+
3+
from graph.GraphNode import GraphNode
4+
import numpy as np
5+
6+
class ContinuousDataSet(DataSet):
    """Tabular data set in which every variable is treated as continuous.

    The values live in a 2-D NumPy array whose rows are cases and whose
    columns are variables. One ``GraphNode`` is created per column, and the
    node list, the name list and the row/column counts are kept in step with
    the array whenever rows or columns are removed or subset.

    NOTE(review): ``DataSet`` is not imported in the visible part of this
    module; it has to be in scope for this class statement to run. Confirm
    where it is defined and import it.
    """

    def __init__(self, data, var_names):
        """Wrap ``data`` and create one graph node per variable name.

        Args:
            data: 2-D NumPy array; ``data.shape`` must unpack to
                ``(rows, columns)``.
            var_names: sequence of column names, one per column of ``data``.
        """
        self.data = data
        self.variables = [GraphNode(name) for name in var_names]
        # Copy so later column edits never mutate the caller's sequence.
        self.variable_names = list(var_names)
        self.num_rows, self.num_vars = data.shape
        # NOTE(review): the original assigned an undefined name ``datatype``
        # here (NameError on every construction). Swap this marker for the
        # project's own data-type constant if one exists.
        self.datatype = 'continuous'

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _as_index_array(indices):
        """Return ``indices`` as a 1-D integer array (a scalar becomes length 1)."""
        return np.atleast_1d(np.asarray(indices, dtype=int))

    def _keep_columns(self, kept):
        """Keep only the columns in ``kept`` (in that order) and sync metadata."""
        kept = self._as_index_array(kept)
        self.data = np.take(self.data, kept, 1)
        positions = kept.tolist()
        self.variables = [self.variables[i] for i in positions]
        self.variable_names = [self.variable_names[i] for i in positions]
        self.num_vars = len(positions)

    def _drop_columns(self, dropped):
        """Remove the columns selected by ``dropped`` (anything ``np.delete`` accepts)."""
        # Deleting from an index range yields the surviving column positions,
        # so the array and the variable lists are trimmed from one source.
        self._keep_columns(np.delete(np.arange(self.data.shape[1]), dropped))

    # ------------------------------------------------------------------
    # Accessors
    # ------------------------------------------------------------------
    def get_data(self):
        """Return the underlying NumPy array."""
        return self.data

    def get_variables(self):
        """Return the list of graph nodes, one per column."""
        return self.variables

    def get_variable_names(self):
        """Return the list of column names."""
        return self.variable_names

    def get_data_type(self):
        """Return the data-type marker set at construction."""
        return self.datatype

    def get_num_rows(self):
        """Return the number of rows (cases)."""
        return self.num_rows

    def get_num_variables(self):
        """Return the number of columns (variables)."""
        return self.num_vars

    def get_column(self, variable):
        """Return the column index of ``variable``.

        Raises:
            ValueError: if ``variable`` is not one of this data set's nodes.
        """
        return self.variables.index(variable)

    def get_correlation_matrix(self):
        """Return the variable-by-variable correlation matrix."""
        # Variables are columns, so tell NumPy that rows are observations.
        return np.corrcoef(self.data, rowvar=False)

    def get_covariance_matrix(self):
        """Return the variable-by-variable covariance matrix."""
        return np.cov(self.data, rowvar=False)

    def get_double(self, row, column):
        """Return the value stored at ``(row, column)``."""
        return self.data[row, column]

    def get_int(self, row, column):
        """Return the value at ``(row, column)`` converted with ``int()``.

        Values that ``int()`` cannot convert (for example NaN) raise the
        usual ``ValueError``; nothing is mapped to ``None``.
        """
        return int(self.data[row, column])

    def get_object(self, row, column):
        """Return the value stored at ``(row, column)``."""
        return self.data[row, column]

    def get_var_by_index(self, column):
        """Return the graph node for column index ``column``."""
        return self.variables[column]

    def get_var_by_name(self, var_name):
        """Return the graph node whose column name is ``var_name``.

        Raises:
            ValueError: if no column has that name.
        """
        return self.variables[self.variable_names.index(var_name)]

    def is_continuous(self):
        """Return True: this data set is continuous."""
        return True

    def is_discrete(self):
        """Return False: this data set is not discrete."""
        return False

    def is_mixed(self):
        """Return False: this data set is not mixed."""
        return False

    # ------------------------------------------------------------------
    # Mutators
    # ------------------------------------------------------------------
    def remove_col_by_index(self, index):
        """Delete the column at ``index`` together with its variable."""
        self._drop_columns(index)

    def remove_col_by_name(self, name):
        """Delete the column called ``name`` together with its variable.

        Raises:
            ValueError: if no column has that name.
        """
        self._drop_columns(self.variable_names.index(name))

    def remove_columns(self, columns):
        """Delete every column whose index is in ``columns``."""
        self._drop_columns(columns)

    def remove_rows(self, rows):
        """Delete every row whose index is in ``rows``."""
        self.data = np.delete(self.data, rows, 0)
        self.num_rows = self.data.shape[0]

    def set_double(self, row, column, value):
        """Store ``value`` at ``(row, column)``."""
        self.data[row, column] = value

    def subset_cols_by_index(self, indices):
        """Keep only the columns in ``indices``, in the order given."""
        self._keep_columns(indices)

    def subset_cols_by_variable(self, variables):
        """Keep only the columns belonging to the given graph nodes.

        Raises:
            ValueError: if any node is not part of this data set.
        """
        self._keep_columns(
            [self.variables.index(variable) for variable in variables]
        )

    def subset_rows(self, indices):
        """Keep only the rows in ``indices``, in the order given."""
        self.data = np.take(self.data, self._as_index_array(indices), 0)
        self.num_rows = self.data.shape[0]

    def permute_rows(self):
        """Replace the data with a randomly row-shuffled copy."""
        self.data = np.random.permutation(self.data)

    def __str__(self):
        """Render the data set as tab-separated text with a header line.

        Every header name is followed by a tab (including the last one);
        data rows are tab-joined and separated by newlines with no trailing
        newline.
        """
        header = ''.join(name + '\t' for name in self.variable_names) + '\n'
        body = '\n'.join(
            '\t'.join(str(self.data[i, j]) for j in range(self.num_vars))
            for i in range(self.num_rows)
        )
        return header + body

data/DataUtils.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env python3
2+
3+
from data.DataSet import ContinuousDataSet
4+
from data.DataSet import DiscreteDataSet
5+
from data.DataSet import MixedDataSet
6+
7+
import numpy as np
8+
import pandas as pd
9+
10+
class DataUtils():
    """Loaders that read delimited text files into the project's data sets."""

    @staticmethod
    def _read_header_names(filename, delimiter=None):
        """Return the column names found on the first line of ``filename``.

        The line ending is stripped first so the last name does not carry a
        trailing newline. ``delimiter=None`` splits on runs of whitespace.
        """
        with open(filename, 'r') as fp:
            first_line = fp.readline()
        return first_line.rstrip('\r\n').split(delimiter)

    def _load_array(self, filename, kwargs, dtype):
        """Read ``filename`` with ``np.genfromtxt`` and work out column names.

        Shared by the continuous and discrete loaders, which differ only in
        the ``dtype`` they request.

        Args:
            filename: path of the text file to read.
            kwargs: the caller's keyword options (``missing``, ``header``,
                ``comments``, ``delimiter``); unknown keys are ignored.
            dtype: dtype passed through to ``np.genfromtxt``.

        Returns:
            ``(data, names)``: the parsed array and the list of column names.
        """
        missing = kwargs.get('missing', '*')
        header = bool(kwargs.get('header', True))
        comments = kwargs.get('comments', "\"")
        # None makes both genfromtxt and str.split fall back to whitespace.
        delimiter = kwargs.get('delimiter')

        data = np.genfromtxt(
            filename,
            skip_header=1 if header else 0,
            missing_values=missing,
            delimiter=delimiter,
            comments=comments,
            dtype=dtype,
        )

        if header:
            names = self._read_header_names(filename, delimiter)
        else:
            # No header line: synthesise V0, V1, ... from the column count.
            _, num_columns = data.shape
            names = ['V' + str(i) for i in range(num_columns)]

        return data, names

    def load_continuous_data(self, filename, **kwargs):
        """Load a numeric text file into a ``ContinuousDataSet``.

        Keyword Args:
            missing: marker for missing values (default ``'*'``).
            header: whether the first line holds column names (default True).
            comments: comment marker passed to ``np.genfromtxt``
                (default ``'"'``).
            delimiter: field separator; whitespace when omitted.

        Returns:
            A ``ContinuousDataSet`` built from the parsed array and names.
        """
        data, names = self._load_array(filename, kwargs, float)
        return ContinuousDataSet(data, names)

    def load_discrete_data(self, filename, **kwargs):
        """Load a text file as strings into a ``DiscreteDataSet``.

        Accepts the same keyword options as ``load_continuous_data``; values
        are read with ``dtype=str``.

        Returns:
            A ``DiscreteDataSet`` built from the parsed array and names.
        """
        data, names = self._load_array(filename, kwargs, str)
        return DiscreteDataSet(data, names)

    def load_mixed_data(self, filename, max_discrete, **kwargs):
        """Load a delimited file into a ``MixedDataSet``.

        The file is read with pandas. Cells equal to ``missing`` are set to
        ``None``, then every column with at most ``max_discrete`` distinct
        values is converted to strings.

        Args:
            filename: path of the text file to read.
            max_discrete: largest number of distinct values a column may
                have and still be treated as discrete.

        Keyword Args:
            missing: marker for missing values (default ``'*'``).
            header: whether the first line holds column names
                (default False).
            comments: comment character passed to ``pd.read_csv``
                (default ``'"'``).
            delimiter: field separator (default tab).

        Returns:
            A ``MixedDataSet`` built from the DataFrame and the names.
        """
        missing = kwargs.get('missing', '*')
        header = bool(kwargs.get('header', False))
        comments = kwargs.get('comments', "\"")
        delimiter = kwargs.get('delimiter', '\t')

        if header:
            data = pd.read_csv(filename, delimiter=delimiter, comment=comments)
            names = self._read_header_names(filename, delimiter)
        else:
            # The original built an ndarray here and then used DataFrame-only
            # attributes (.index/.columns/.iat), which cannot work; read a
            # header-less DataFrame so both branches share the code below.
            data = pd.read_csv(
                filename, delimiter=delimiter, comment=comments, header=None
            )
            names = ['V' + str(i) for i in range(len(data.columns))]

        rows = len(data.index)
        columns = len(data.columns)

        # .iat is addressed as [row, column].
        for col in range(columns):
            for row in range(rows):
                if data.iat[row, col] == missing:
                    data.iat[row, col] = None

        for label in list(data.columns):
            column = data[label]
            if len(column.unique()) <= max_discrete:
                # NOTE(review): str() also turns the None placed above into
                # the text 'None', as the original element-wise conversion
                # did; confirm that MixedDataSet expects this.
                data[label] = column.map(str)

        return MixedDataSet(data, names)

0 commit comments

Comments
 (0)