Skip to content

Commit 777158e

Browse files
authored
Create PCA_dataset analysis.py
1 parent ca4637d commit 777158e

File tree

1 file changed

+80
-0
lines changed

1 file changed

+80
-0
lines changed

PCA_dataset analysis.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#Reading the dataset student-mat.csv
2+
#Make sure that the attributes are numerical
3+
import matplotlib.pyplot as plt
4+
import pandas as pd
5+
import numpy as np
6+
7+
# Load the semicolon-separated student dataset and echo it for inspection.
data = pd.read_csv('/content/student-mat.csv', sep=';')
print(data)

# Feature matrix: the hand-picked numeric columns used by the models below.
numeric_columns = [
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
    'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]
x = data[numeric_columns]
print(x)
print(x.shape)

# Regression target: the G3 column.
y = data['G3']
print(y)
print(y.shape)
19+
20+
# Baseline: multiple linear regression on the raw features (no PCA).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out one third of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3, random_state=0)

regressor_2 = LinearRegression()
regressor_2.fit(x_train, y_train)
y_pred = regressor_2.predict(x_test)

# Report errors on the held-out rows only. Scoring on the full dataset would
# mix in the rows the model was trained on and understate the error.
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so it is labelled as such.
print('R2 score (%)', regressor_2.score(x_test, y_test) * 100)
35+
36+
# Find the number of PCA components needed to retain 95% of the variance,
# then project the features onto exactly those components.
from sklearn.decomposition import PCA

# NOTE(review): the features are passed to PCA without standardization, so
# columns with a larger numeric range contribute more variance. Confirm this
# is intended.
pca = PCA(n_components=0.95)
reduced = pca.fit_transform(x)
print(reduced.shape)

# Derive the component count from the fit instead of hard-coding it
# (the original author's note recorded 6 for this dataset).
n_components = reduced.shape[1]
pc_labels = ['PC{}'.format(i) for i in range(1, n_components + 1)]

# Reuse the projection computed above rather than fitting PCA a second time.
principalComponents = reduced
principalDf = pd.DataFrame(data=principalComponents, columns=pc_labels)

print(principalDf)

# Fraction of the total variance explained by each retained component.
var = pca.explained_variance_ratio_
print(var)

plt.bar(pc_labels, var)
plt.title('Variance vs ' + ', '.join(pc_labels))
plt.xlabel('Principal Components')
plt.ylabel('Variance')
64+
65+
# Multiple linear regression on the PCA components instead of raw features.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# NOTE(review): principalDf comes from a PCA fitted on every row before this
# split, so the held-out rows influenced the projection. Consider fitting PCA
# on the training split only.
x2 = principalDf
# Same test_size and seed as the non-PCA model so the two runs are comparable.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2, y, test_size=1/3, random_state=0)

regressor_3 = LinearRegression()
regressor_3.fit(x2_train, y2_train)
y_pred2 = regressor_3.predict(x2_test)

# Report errors on the held-out rows only. Scoring on the full dataset would
# mix in the rows the model was trained on and understate the error.
print('MSE', mean_squared_error(y2_test, y_pred2))
print('MAE', mean_absolute_error(y2_test, y_pred2))
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so it is labelled as such.
print('R2 score (%)', regressor_3.score(x2_test, y2_test) * 100)

0 commit comments

Comments
 (0)