# Read the dataset student-mat.csv (semicolon-separated)
# and make sure the attributes used below are numerical
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

data = pd.read_csv('/content/student-mat.csv', sep=';')
print(data)

# Keep only the numerical attributes as features
x = data[['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
          'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']]
print(x)
print(x.shape)

# Choose the final grade G3 as the target
y = data['G3']
print(y)
print(y.shape)

# Apply linear regression (multiple features) without PCA
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3, random_state=0)

from sklearn.linear_model import LinearRegression
regressor_2 = LinearRegression()
regressor_2.fit(x_train, y_train)
y_pred = regressor_2.predict(x_test)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Evaluate on the held-out test split (not on the full dataset,
# which would mix training points into the error estimate)
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
print('R^2 (%)', regressor_2.score(x_test, y_test) * 100)

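# PCA is scale-sensitive, and the selected columns live on very different
# ranges (e.g. 'absences' can exceed 50 while 'Dalc' is 1-5). The steps
# below fit PCA on the raw features, as in the original; a common
# alternative, sketched here only as an aside and not used later,
# is to standardize first:
from sklearn.preprocessing import StandardScaler
x_scaled = StandardScaler().fit_transform(x)  # zero mean, unit variance per column
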
# Find the number of PCA components needed to retain 95% of the variance
from sklearn.decomposition import PCA
pca = PCA(0.95)  # keep the smallest number of components explaining 95% of the variance
pca.fit(x)
reduced = pca.transform(x)
print(reduced.shape)
# The second dimension shows that 6 components are required
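# When PCA is given a variance fraction, the fitted estimator also records
# the chosen component count directly; printing it confirms the shape above.
print(pca.n_components_)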

# Apply PCA with the 6 components found above
pca = PCA(n_components=6)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'])

print(principalDf)
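
# Principal components are uncorrelated by construction; a quick sanity
# check (a sketch, not in the original script) is that their correlation
# matrix is close to the identity. rowvar=False treats columns as variables.
print(np.corrcoef(principalComponents, rowvar=False).round(2))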

# Explained variance ratio of each component
var = pca.explained_variance_ratio_
print(var)

plt.bar(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'], var)
plt.title('Explained variance per principal component')
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.show()
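
# A cumulative view of the same ratios (a sketch, not in the original
# script) makes the 95% threshold chosen earlier easy to verify visually.
plt.figure()
plt.plot(range(1, len(var) + 1), np.cumsum(var), marker='o')
plt.axhline(0.95, linestyle='--', color='gray')  # the variance target used above
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()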

# Apply linear regression using the PCA components
x2 = principalDf
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y, test_size=1/3, random_state=0)

regressor_3 = LinearRegression()
regressor_3.fit(x2_train, y2_train)
y_pred2 = regressor_3.predict(x2_test)

# Evaluate on the same held-out split for a fair comparison with the model above
print('MSE', mean_squared_error(y2_test, y_pred2))
print('MAE', mean_absolute_error(y2_test, y_pred2))
print('R^2 (%)', regressor_3.score(x2_test, y2_test) * 100)
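
# A more stable comparison than a single split (a sketch, not part of the
# original script): 5-fold cross-validated R^2 with a fresh LinearRegression
# per fold. cross_val_score clones the estimator, so the fitted models above
# are untouched. Note the PCA components were fit on all rows, so the second
# number is only indicative.
from sklearn.model_selection import cross_val_score
print('CV R^2 without PCA:', cross_val_score(LinearRegression(), x, y, cv=5).mean())
print('CV R^2 with PCA:', cross_val_score(LinearRegression(), x2, y, cv=5).mean())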