diff --git a/app/age_hist.py b/app/age_hist.py new file mode 100644 index 0000000..c7aecd9 --- /dev/null +++ b/app/age_hist.py @@ -0,0 +1,13 @@ +import numpy as np +import matplotlib.pyplot as plt + +from .load_data_3d import load_targets +from .settings import DATA_DIRECTORY + +def age_hist(): + data = load_targets() + data = data['Y'].tolist() + plt.figure() + plt.hist(data, bins=100) + plt.savefig("plots/age_hist.pdf") + plt.close() diff --git a/app/cut_brain.py b/app/cut_brain.py index 4fe6cc1..7b67416 100644 --- a/app/cut_brain.py +++ b/app/cut_brain.py @@ -4,14 +4,14 @@ def cut_brain(sample, pos): data = [] for d in sample: if pos == "lt": - z_s,z_e = 60,len(d[0,0,:]) + z_s,z_e = 60,d.shape[2] y_s,y_e = 0,60 elif pos == "mt": - z_s,z_e = 60,len(d[0,0,:]) + z_s,z_e = 60,d.shape[2] y_s,y_e = 60,110 elif pos == "rt": - z_s,z_e = 50,len(d[0,0,:]) - y_s,y_e = 110,len(d[0,:,0]) + z_s,z_e = 50,d.shape[2] + y_s,y_e = 110,d.shape[1] elif pos == "lb": z_s,z_e = 0,60 y_s,y_e = 0,60 @@ -20,7 +20,7 @@ def cut_brain(sample, pos): y_s,y_e = 60,110 elif pos == "rb": z_s,z_e = 0,50 - y_s,y_e = 110,len(d[0,:,0]) + y_s,y_e = 110,d.shape[1] area = np.array([[z_s,z_e],[y_s,y_e]]) diff --git a/app/cut_brain_test.py b/app/cut_brain_test.py new file mode 100644 index 0000000..d81a754 --- /dev/null +++ b/app/cut_brain_test.py @@ -0,0 +1,40 @@ +import numpy as np + +def cut_brain(d, pos): + if pos == "lt": + z_s,z_e = 60,d.shape[2] + y_s,y_e = 0,60 + elif pos == "mt": + z_s,z_e = 60,d.shape[2] + y_s,y_e = 60,110 + elif pos == "rt": + z_s,z_e = 50,d.shape[2] + y_s,y_e = 110,d.shape[1] + elif pos == "lb": + z_s,z_e = 0,60 + y_s,y_e = 0,60 + elif pos == "mb": + z_s,z_e = 0,60 + y_s,y_e = 60,110 + elif pos == "rb": + z_s,z_e = 0,50 + y_s,y_e = 110,d.shape[1] + + area = np.array([[z_s,z_e],[y_s,y_e]]) + + z = range(area[0][0],area[0][1]) + y = range(area[1][0],area[1][1]) + x_len = d.shape[0] + y_len = len(y) + z_len = len(z) + + d_new = np.zeros((x_len,y_len,z_len)) + for k in range(0,x_len): + z_index = 0 + for i in z: + y_index = 0 + for j in y: + d_new[k,y_index,z_index] = d[k,j,i] + y_index += 1 + z_index += 1 + return d_new diff --git a/app/feature.py b/app/feature.py index 5ee2066..9f37e84 100644 --- a/app/feature.py +++ b/app/feature.py @@ -4,7 +4,7 @@ from .normalize import normalize -def feature_ratio_mean(inputs): +def feature_ratio_mean(inputs, norm=None): inputs = get_flat_values(inputs) fs = [] @@ -17,38 +17,28 @@ def feature_ratio_mean(inputs): fs.append(ratio) - fs = normalize(fs) - return fs + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs -def feature_mean(inputs): +def feature_mean(inputs, norm=None): inputs = get_flat_values(inputs) fs = [] for i in inputs: fs.append(np.mean(i)) - fs = normalize(fs) - return fs + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs - -def feature_ratio(inputs): - inputs = get_flat_values(inputs) - - fs = [] - for i in inputs: - - low = i[i < 1100] - high = i[i >= 1100] - - ratio = np.sum(high)/np.sum(low) - - fs.append(ratio) - - fs = normalize(fs) - return fs - - -def feature_max(inputs): +def feature_max(inputs, norm=None): inputs = get_flat_values(inputs) fs = [] @@ -58,9 +48,12 @@ def feature_max(inputs): x_max = bin_edges[idx_max] fs.append(x_max) - - fs = normalize(fs) - return fs + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs def get_flat_values(inputs): diff --git a/app/feature_test.py b/app/feature_test.py new file mode 100644 index 0000000..d759037 --- /dev/null +++ b/app/feature_test.py @@ -0,0 +1,75 @@ +"""Description of this file.""" + +import numpy as np + +from .normalize import normalize +from .get_number_of_files import get_number_of_files +from .load_deviations import load_deviation +from .cut_brain_test import cut_brain +from .load_data_3d import load_sample_input + +def feature_ratio_mean(area="whole", training=True, norm=None): + fs = [] + for i in range(1,get_number_of_files(training)+1): + data = load_data(i,training) + if not area == "whole": + data = cut_brain(data,area) + data = get_flat_values(data) + mean = np.mean(data) + low = data[data < mean] + high = data[data >= mean] + + ratio = np.sum(high)/np.sum(low) + + fs.append(ratio) + + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs + +def feature_mean(area, training=True, norm=None): + fs = [] + for i in range(1,get_number_of_files(training)+1): + data = load_data(i,training) + if not area == "whole": + data = cut_brain(data,area) + data = get_flat_values(data) + fs.append(np.mean(data)) + + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs + +def feature_max(area, training=True, norm=None): + fs = [] + for i in range(1,get_number_of_files(training)+1): + data = load_data(i,training) + if not area == "whole": + data = cut_brain(data,area) + data = get_flat_values(data) + hist, bin_edges = np.histogram(data, bins='auto') + idx_max = hist.argmax() + x_max = bin_edges[idx_max] + fs.append(x_max) + + if norm == None: + fs, minmax = normalize(fs) + return fs, minmax + else: + fs = normalize(fs, norm) + return fs + +def get_flat_values(inputs): + vs = inputs.flatten() + return vs #[vs > 100] + +def load_data(ID, training=True): + #return load_deviation(ID, training) + return load_sample_input(ID, training) + diff --git a/app/get_number_of_files.py b/app/get_number_of_files.py new file mode 100644 index 0000000..84a7c2b --- /dev/null +++ b/app/get_number_of_files.py @@ -0,0 +1,14 @@ +import os +from .settings import DATA_DIRECTORY + +def get_number_of_files(training=True): + if training == True: + folder = "set_train" + else: + folder = "set_test" + DIR_IN = os.path.join( + DATA_DIRECTORY, + folder + ) + nof = len([name for name in os.listdir(DIR_IN) if os.path.isfile(os.path.join(DIR_IN, name))]) + return nof diff --git a/app/heatmap.py b/app/heatmap.py index bfaf459..edacd18 100644 --- a/app/heatmap.py +++ b/app/heatmap.py @@ -6,6 +6,7 @@ import numpy as np from .load_data import load_sample_input +from .load_deviations import load_deviation __author__ = "lfievet" @@ -20,8 +21,10 @@ def heatmap(): - sample = 10 - data = load_sample_input(sample) + sample = 20 + #data = load_sample_input(sample) + + data = load_deviation(sample) # flat_data = data.get_data().flatten() # flat_data = flat_data[flat_data > 100] @@ -38,7 +41,7 @@ def heatmap(): # return for i in range(60, 150): - heat_map = data.get_data()[:, :, i, 0] + heat_map = data[:][:][i] #.get_data()[:, :, i, 0] # print(heat_map) # print(heat_map.shape) # heatmap.reshape((176, 208)) diff --git a/app/load_data_3d.py b/app/load_data_3d.py index e7223b6..c394efc 100644 --- a/app/load_data_3d.py +++ b/app/load_data_3d.py @@ -1,31 +1,32 @@ """Loading training and testing data.""" - import os +import scipy.io import nibabel as ni import pandas as pd from .settings import DATA_DIRECTORY +from .get_number_of_files import get_number_of_files def load_samples_inputs(training=True): - DIR = os.path.join( - DATA_DIRECTORY, - "set_train" - ) - files = len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))]) - + nof = get_number_of_files(training) inputs = [] - for i in range(1, files+1): - inputs.append(load_sample_input(i)) - + for i in range(1, nof+1): + inputs.append(load_sample_input(i, training)) return inputs - def load_sample_input(id=1, training=True): + if training == True: + folder = "set_train" + files = "train" + else: + folder = "set_test" + files = "test" + file_path = os.path.join( DATA_DIRECTORY, - "set_train", - "train_{}.nii".format(id) + folder, + "{}_{}.nii".format(files,id) ) return ni.load(file_path).get_data()[:,:,:,0] diff --git a/app/load_deviations.py b/app/load_deviations.py new file mode 100644 index 0000000..fc82797 --- /dev/null +++ b/app/load_deviations.py @@ -0,0 +1,42 @@ +import os +import scipy.io +import numpy as np + +from .settings import DATA_DIRECTORY +from .mean_brain import create_deviation_set +from .get_number_of_files import get_number_of_files + +def load_deviations(training=True): + if training == True: + files = "train" + else: + files = "test" + DIR = os.path.join( + DATA_DIRECTORY, + "set_{}_deviation".format(files), + ) + if not os.path.exists(DIR): + create_deviation_set() + data = [] + for i in range(1,get_number_of_files(training)+1): + data.append(load_deviation(i,training)) + return data + +def load_deviation(ID,training=True): + if training == True: + files = "train" + else: + files = "test" + DIR = os.path.join( + DATA_DIRECTORY, + "set_{}_deviation".format(files), + ) + if not os.path.exists(DIR): + create_deviation_set() + FILE = os.path.join( + DATA_DIRECTORY, + "set_{}_deviation".format(files), + "{}_{}.mat".format(files,ID) + ) + return scipy.io.loadmat(FILE)['out'] + diff --git a/app/load_features.py b/app/load_features.py new file mode 100644 index 0000000..4c83c9a --- /dev/null +++ b/app/load_features.py @@ -0,0 +1,64 @@ +from .load_data_3d import load_targets, load_samples_inputs +from .load_deviations import load_deviations +from .cut_brain import cut_brain +from .feature import feature_mean, feature_max, feature_ratio_mean + +def load_features(norms=None): + if norms == None: + training = True + else: + training = False + + areas = ["lt","mt","rt","lb","mb","rb"] + inputs = [ + { + "area": "whole", + "val": load_deviations(training) + } + ] + for a in areas: + inputs.append( + { + "area": a, + "val": cut_brain(inputs[0]["val"], a) + } + ) + features = [ + { + "name": "mean", + "f": feature_mean + }, + { + "name": "ratio_mean", + "f": feature_ratio_mean + }, + { + "name": "max", + "f": feature_max + }, + ] + if norms == None: + norms = {} + data = load_targets() + print("plotting features") + for f in features: + for i in inputs: + feature_inputs , norms["{}_{}".format(f["name"], i["area"])] = f["f"](i["val"]) + data["{}_{}".format(f["name"], i["area"])] = feature_inputs + # plt.figure() + # plt.scatter( + # feature_inputs, + # data["Y"].tolist(), + # ) + # plt.savefig("plots/line_{}_{}.pdf".format( + # f["name"], i["area"] + # )) + # plt.close() + return data, norms + else: + data = {} + for f in features: + for i in inputs: + feature_inputs = f["f"](i["val"], norms["{}_{}".format(f["name"], i["area"])]) + data["{}_{}".format(f["name"], i["area"])] = feature_inputs + return data diff --git a/app/load_features_test.py b/app/load_features_test.py new file mode 100644 index 0000000..41385c3 --- /dev/null +++ b/app/load_features_test.py @@ -0,0 +1,48 @@ +import matplotlib.pyplot as plt + +from .load_data_3d import load_targets, load_samples_inputs +from .load_deviations import load_deviations +from .cut_brain_test import cut_brain +from .feature_test import feature_mean, feature_max, feature_ratio_mean + +def load_features(norms=None): + if norms == None: + training = True + else: + training = False + + areas = ["whole","lt","mt","rt","lb","mb","rb"] + features = [ + { + "name": "mean", + "f": feature_mean + }, + { + "name": "ratio_mean", + "f": feature_ratio_mean + }, + { + "name": "max", + "f": feature_max + }, + ] + if norms == None: + norms = {} + data = load_targets() + print("plotting features") + for f in features: + for a in areas: + feature_inputs , norms["{}_{}".format(f["name"], a)] = f["f"](a) + data["{}_{}".format(f["name"], a)] = feature_inputs + plt.figure() + plt.scatter(feature_inputs, data["Y"].tolist()) + plt.savefig("plots/line_{}_{}.pdf".format(f["name"], a)) + plt.close() + return data, norms + else: + data = {} + for f in features: + for a in areas: + feature_inputs = f["f"](a, norms["{}_{}".format(f["name"], a)]) + data["{}_{}".format(f["name"], a)] = feature_inputs + return data diff --git a/app/mean_brain.py b/app/mean_brain.py new file mode 100644 index 0000000..821d7c1 --- /dev/null +++ b/app/mean_brain.py @@ -0,0 +1,61 @@ +import os +import numpy as np +import pandas as pd +import scipy.io + +from .get_number_of_files import get_number_of_files +from .load_data_3d import load_sample_input, load_targets +from .settings import DATA_DIRECTORY + +def create_deviation_set(): + mean = mean_brain() + + files = "train" + DIR_OUT = os.path.join( + DATA_DIRECTORY, + "set_{}_deviation".format(files) + ) + if not os.path.exists(DIR_OUT): + os.makedirs(DIR_OUT) + nof = get_number_of_files(True) + for i in range(1,nof+1): + data = load_sample_input(i,True) + diff = brain_deviation(data, mean) + FILE_OUT = os.path.join( + DIR_OUT, + "{}_{}.mat".format(files,i) + ) + scipy.io.savemat(FILE_OUT, mdict={'out': diff}, oned_as='row') + + files = "test" + DIR_OUT = os.path.join( + DATA_DIRECTORY, + "set_{}_deviation".format(files) + ) + if not os.path.exists(DIR_OUT): + os.makedirs(DIR_OUT) + nof = get_number_of_files(False) + for i in range(1,nof+1): + data = load_sample_input(i,False) + diff = brain_deviation(data, mean) + FILE_OUT = os.path.join( + DIR_OUT, + "{}_{}.mat".format(files,i) + ) + scipy.io.savemat(FILE_OUT, mdict={'out': diff}, oned_as='row') + +def brain_deviation(data, mean): + return data-mean + +def mean_brain(): + targets = load_targets() + targets = targets['Y'].tolist() + nof = get_number_of_files(True) + mean = np.zeros(load_sample_input(1,True).shape) + for i in range(0,nof): + data = load_sample_input(i+1,True) + mean += np.divide(data, targets.count(targets[i])) + return np.divide(mean, len(set(targets))) + + + diff --git a/app/normalize.py b/app/normalize.py index b5ac38a..e1b6b50 100644 --- a/app/normalize.py +++ b/app/normalize.py @@ -1,11 +1,11 @@ import numpy as np -def normalize(inputs): +def normalize(inputs, minmax=None): data = np.array(inputs) - maximum = data.max() - minimum = data.min() - - out = (data-minimum)/(maximum-minimum) - - return out + if minmax == None: + minimum = data.min() + maximum = data.max() + return (data-minimum)/(maximum-minimum), [minimum,maximum] + else: + return (data-minmax[0])/(minmax[1]-minmax[0]) diff --git a/app/predictor_cut.py b/app/predictor_cut.py index 9beef5e..475cd0a 100644 --- a/app/predictor_cut.py +++ b/app/predictor_cut.py @@ -12,36 +12,63 @@ from sklearn.metrics import accuracy_score from .settings import CURRENT_DIRECTORY -from .cut_brain import cut_brain -from .feature import feature_mean, feature_max, feature_ratio_mean -from .load_data_3d import load_targets, load_samples_inputs from .squared_error import squared_error - +from .load_features_test import load_features +from .get_number_of_files import get_number_of_files def predict_cut(training=True): - cache_path = os.path.join( + cache_data_path = os.path.join( + CURRENT_DIRECTORY, + "..", + "cache", + "data.hdf" + ) + cache_norms_path = os.path.join( + CURRENT_DIRECTORY, + "..", + "cache", + "norms.hdf" + ) + cache_test_path = os.path.join( CURRENT_DIRECTORY, "..", "cache", - "trial1.hdf" + "test.hdf" ) - if os.path.exists(cache_path): + if os.path.exists(cache_data_path) and os.path.exists(cache_norms_path): print("Loading features from cache") - data = pd.read_hdf(cache_path, "table") + data = pd.read_hdf(cache_data_path, "table") + norms = pd.read_hdf(cache_norms_path, "table").to_dict() else: print("Loading features") - data = load_features() + data, norms = load_features() + print("saving data to cache") + data.to_hdf(cache_data_path, "table") + pd.DataFrame.from_dict(norms).to_hdf(cache_norms_path, "table") - print("saving data to cache") - data.to_hdf(cache_path, "table") + if os.path.exists(cache_test_path): + print("Loading test features from cache") + test_data = pd.read_hdf(cache_test_path, "table").to_dict() + else: + print("Loading test features") + test_data = load_features(norms) + print("saving test to cache") + pd.DataFrame.from_dict(test_data).to_hdf(cache_test_path, "table") - print(data) - #feature_list = data.keys().tolist() - #feature_list.remove("Y") - feature_list = ["max_lb", "max_mb", "mean_mb", "mean_rt", "mean_mt", "mean_whole", "ratio_mean_lt", - "ratio_mean_mb", "ratio_mean_mt", "ratio_mean_rt", "ratio_mean_whole"] + feature_list = ['mean_rt', 'mean_mb', 'ratio_mean_lt', 'ratio_mean_rt', 'ratio_mean_rb', 'max_rt', 'max_rb'] xs = data[feature_list].values.tolist() ys = data["Y"].values.tolist() + #print(data) + #print(test_data) + test_inputs = [] + for i in range(0,get_number_of_files(training=False)): + tmp = [] + for f in feature_list: + tmp.append(test_data[f][0][i]) + test_inputs.append(tmp) + + print(len(xs)) + print(len(test_inputs)) nn = KNeighborsRegressor( n_neighbors=3, weights="uniform", @@ -52,53 +79,14 @@ def predict_cut(training=True): predicted = cross_val_predict(nn, xs, ys, cv=5) print("Squared Error:",squared_error(ys,predicted)) + nn.fit(xs, ys) -def load_features(): - areas = ["lt","mt","rt","lb","mb","rb"] - inputs = [ - { - "area": "whole", - "val": load_samples_inputs() - } - ] - for a in areas: - inputs.append( - { - "area": a, - "val": cut_brain(inputs[0]["val"], a) - } - ) - data = load_targets() - - features = [ - { - "name": "mean", - "f": feature_mean - }, - { - "name": "ratio_mean", - "f": feature_ratio_mean - }, - { - "name": "max", - "f": feature_max - }, - ] - - print("plotting features") - for f in features: - for i in inputs: - feature_inputs = f["f"](i["val"]) - data["{}_{}".format(f["name"], i["area"])] = feature_inputs - - plt.figure() - plt.scatter( - feature_inputs, - data["Y"].tolist(), - ) - plt.savefig("plots/line_{}_{}.pdf".format( - f["name"], i["area"] - )) - plt.close() + result = nn.predict(test_inputs) + print(result) + result_path = os.path.join( + CURRENT_DIRECTORY, + "..", + "result.csv" + ) + pd.Series(result).to_csv(result_path) - return data diff --git a/main.py b/main.py index c2d715c..b0d9777 100644 --- a/main.py +++ b/main.py @@ -9,8 +9,9 @@ from app.predictor_cut import predict_cut from app.predictor_cut_iterate import predict_cut_iterate from app.heatmap_side import heatmap_side - +from app.mean_brain import create_deviation_set from app.settings import CACHE_DIRECTORY, PLOT_DIRECTORY, ITERATE_DIRECTORY +from app.age_hist import age_hist if __name__ == "__main__": if not os.path.exists(CACHE_DIRECTORY): @@ -41,4 +42,7 @@ print("Additional Argument needed for this!") else: predict_cut_iterate(num=int(args[2])) - + elif args[1] == "mean_brain": + create_deviation_set() + elif args[1] == "age_hist": + age_hist()