
Pythoncode_PCA


# This assignment is a tutorial, so have a bit of fun with it.
# If you would like to explore some additional options, give it a try.
# The goal is to provide some meaningful information to the restaurant owner.
#
# Some notes:
# Both PCA and FA provide useful summary information for multivariate data,
# but all of the original variables are needed for their calculation, so
# the big question is: can we use them to find a subset of variables to
# predict the overall score?
# Also, trying to give meaningful labels to components is really hard.
# When the variables are on different scales you need to work with the
# correlation matrix. For this assignment they are on the same scale, so
# we will work with the raw data.
# PCA only helps if the original variables are correlated; if they
# are independent, PCA will not help.
#
# The approach takes two steps:
# First, find the dimensionality of the data, that is, the
# number of original variables to be retained.
# Second, find which ones; more on this below.

# import packages for this example
import pandas as pd
import numpy as np                     # arrays and math functions
import matplotlib.pyplot as plt       # static plotting
from sklearn.decomposition import PCA, FactorAnalysis
import statsmodels.formula.api as smf  # R-like model specification
from patsy import dmatrices           # design matrices for the VIF checks
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Set some display options
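# An added aside, not part of the original assignment: a quick demonstration
# of the note above that PCA only helps when the variables are correlated.
# On independent random data the variance spreads almost evenly across the
# components, so no small subset of components summarizes the data.
# (The seed and sample size here are arbitrary choices for illustration.)
rng = np.random.default_rng(1)           # assumed seed, for reproducibility
independent = rng.normal(size=(200, 5))  # five independent variables
print(PCA().fit(independent).explained_variance_ratio_)  # each near 0.2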
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 120)

# Read in the restaurant dataset
food_df = pd.read_csv('C:/Users/Jahee Koo/Desktop/MSPA/2018_Winter_410_regression/HW03 PCA/FACTOR1.csv')

# A good step to take is to convert all variable names to lower case
food_df.columns = [s.lower() for s in food_df.columns]
print(food_df)

print('')
print('----- Summary of Input Data -----')
print('')
# show the object is a DataFrame
print('Object type: ', type(food_df))
# show number of observations in the DataFrame
print('Number of observations: ', len(food_df))
# show variable names
print('Variable names: ', food_df.columns)
# show descriptive statistics
print(food_df.describe())
# show a portion of the beginning of the DataFrame
print(food_df.head())

# look at the correlation structure
cdata = food_df.loc[:, ['overall', 'taste', 'temp', 'freshness', 'wait',
                        'clean', 'friend', 'location', 'parking', 'view']]
corr = cdata[cdata.columns].corr()
print(corr)

# Use the correlation matrix to help provide advice to the restaurant owner.
# Look at five different models and compare them.
# Which model do you think is best, and why?
# Model 1: full regression model
# Model 2: my reduced regression model with taste, wait and location
# Model 3: full PCA model
# Model 4: reduced PCA model with parking, taste and clean
# Model 5: FA model
# First find the PCA, second find the FA, then run the models.
# Compare the models and show the VIF for each model.

# PCA
print('')
print('----- Principal Component Analysis -----')
print('')
pca_features = ['taste', 'temp', 'freshness', 'wait', 'clean',
                'friend', 'location', 'parking', 'view']
pca_data = food_df.loc[:, pca_features]
pca = PCA()
pca.fit(pca_data)
print(pca_data)
np.set_printoptions(threshold=np.inf)
print(np.around(pca.components_, decimals=3))

# Note: per Everett p209, pick the variable with the largest absolute
# coefficient on each component that has not already been picked
# (one way to automate this rule is sketched after the variance plot below).
# So choose parking, taste and clean for the PCA variable-reduction model.

# show summary of the pca solution
pca_explained_variance = pca.explained_variance_ratio_
print('Proportion of variance explained:', pca_explained_variance)

# note that principal component analysis corresponds to finding the
# eigenvalues and eigenvectors of the pca_data correlation matrix
# (the PCA above works on the covariance matrix of the centered raw data,
# so these correlation-based proportions will agree only approximately)
pca_data_cormat = np.corrcoef(pca_data.T)
eigenvalues, eigenvectors = np.linalg.eig(pca_data_cormat)
print(np.around(eigenvalues, decimals=3))
print('Linear algebra demonstration: Proportion of variance explained: ',
      eigenvalues / eigenvalues.sum())
print(np.around(eigenvectors, decimals=3))

# show the plot for the principal component analysis
plt.bar(np.arange(len(pca_explained_variance)), pca_explained_variance,
        color='grey', alpha=0.5, align='center')
plt.title('PCA Proportion of Total Variance')
plt.show()
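# An added sketch, not part of the original assignment: one way to automate
# the largest-absolute-loading rule noted above. It walks the first three
# components and, for each, keeps the highest-loading variable not already
# picked. It assumes pca and pca_features as defined above.
picked = []
for component in pca.components_[:3]:
    # rank the variables on this component by absolute loading, descending
    order = np.argsort(-np.abs(component))
    for idx in order:
        if pca_features[idx] not in picked:
            picked.append(pca_features[idx])
            break
print('Variables suggested by the largest-loading rule:', picked)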
# show a scree plot
df1 = pd.DataFrame({'eigenvalues': eigenvalues})
df2 = pd.Series(np.arange(1, len(eigenvalues) + 1), name='factors')
# merge the eigenvalues with the number of factors
# (join_axes was removed from pd.concat in pandas 1.0; plain axis=1 aligns
# on the shared index and gives the same result here)
result = pd.concat([df1, df2], axis=1)
print(result)

def scat(dataframe, var):
    dataframe[var].plot()
    plt.title('Scree Plot')
    plt.xlabel('# of factors')
    plt.ylabel('Eigenvalues')

scat(result, 'eigenvalues')
plt.show()
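# An added note, not in the original assignment: a common heuristic for the
# first step (choosing the dimensionality) is the Kaiser criterion, which
# retains components whose eigenvalues exceed 1 when working from the
# correlation matrix, and pairs naturally with the scree plot above.
n_retain = int((eigenvalues > 1).sum())
print('Components retained by the eigenvalue-greater-than-one rule:', n_retain)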
# provide a partial listing of variable loadings on the principal components
# transpose for a variables-by-components listing
pca_loadings = pca.components_.T

# provide a full formatted listing of loadings for the first three components
# print loadings while rounding to three digits,
# suppressing very small numbers but not zeroes
np.set_printoptions(precision=3, suppress=True,
                    formatter={'float': '{: 0.3f}'.format})
print(pca_loadings[:, 0:3])

# compute the full set of principal components (scores)
C = pca.transform(pca_data)
print(C)

# add the first three principal component scores to pca_data
pca_data['pca1'] = C[:, 0]
pca_data['pca2'] = C[:, 1]
pca_data['pca3'] = C[:, 2]
print(pca_data)

# add the first three principal component scores to food_df
food_df['pca1'] = C[:, 0]
food_df['pca2'] = C[:, 1]
food_df['pca3'] = C[:, 2]
print(food_df)

# explore relationships between pairs of principal components,
# working with the first three components only
# (as_matrix was removed from pandas; to_numpy is the current equivalent)
pca_scores = pca_data.loc[:, ['pca1', 'pca2', 'pca3']]
pca_model_cormat = \
    np.corrcoef(pca_scores.to_numpy().transpose()).round(decimals=3)
print(pca_model_cormat)
# Looks like that worked
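# An added demonstration, not in the original assignment: each component
# score is just a weighted sum of all nine centered original variables,
# which is why every variable is needed to compute it (the big question
# noted at the top). This assumes sklearn's PCA, which centers on pca.mean_.
centered = pca_data[pca_features].to_numpy() - pca.mean_
manual_pca1 = centered @ pca.components_[0]
print('Manual pca1 matches pca.transform:', np.allclose(manual_pca1, C[:, 0]))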
# Factor Analysis
print('')
print('----- Factor Analysis (Unrotated) -----')
print('')
# assume three factors will be sufficient
# this is an unrotated orthogonal solution
# maximum likelihood estimation is employed
# for best results, set the tolerance low and the max iterations high
fa = FactorAnalysis(n_components=3, tol=1e-8, max_iter=1000000)

# fit the unrotated solution on the nine original variables only
# (pca_data now also carries the score columns added above, so exclude them)
fa.fit(pca_data[pca_features])

# retrieve the factor loadings as an array of arrays
# transpose for a variables-by-factors listing of loadings
fa_loadings = fa.components_.T
print(fa_loadings)

# show the loadings of the variables on the factors
# for the unrotated maximum likelihood solution
# print loadings while rounding to three digits,
# suppressing very small numbers but not zeroes
np.set_printoptions(precision=3, suppress=True,
                    formatter={'float': '{: 0.3f}'.format})
print(fa_loadings)

# compute the full set of factor scores
F = fa.transform(pca_data[pca_features])
print(F)

# add the factor scores to the original data frame
food_df['fa1'] = F[:, 0]
food_df['fa2'] = F[:, 1]
food_df['fa3'] = F[:, 2]
print(food_df)

# Look at five different models and compare them.
# Which model do you think is best, and why?
# Model 1: full regression model
# Model 2: my reduced regression model with taste, wait and location
# Model 3: full PCA model
# Model 4: reduced PCA model with parking, taste and clean
# Model 5: FA model

# Run the models

# Model 1: full model
regress_model_fit = smf.ols(
    formula='overall ~ taste + temp + freshness + wait + clean + friend + location + parking + view',
    data=food_df).fit()
print(regress_model_fit.summary())

# Model 2
# Note: Model 2 is a choice made from looking at the correlations; you may
# make a different selection if you like, just explain why.
regress_model_fit = smf.ols(formula='overall ~ taste + wait + location',
                            data=food_df).fit()
print(regress_model_fit.summary())

# Model 3: regress the response overall on the principal components
pca_model_fit = smf.ols(formula='overall ~ pca1 + pca2 + pca3',
                        data=food_df).fit()
print(pca_model_fit.summary())

# Model 4: the reduced PCA variable-selection model (original variables)
pca_model_fit = smf.ols(formula='overall ~ parking + taste + clean',
                        data=food_df).fit()
print(pca_model_fit.summary())

# Model 5: regress the response overall on the factor scores
fa_model_fit = smf.ols(formula='overall ~ fa1 + fa2 + fa3',
                       data=food_df).fit()
print(fa_model_fit.summary())

# Next, look at the VIF to see what the full, choice, PCA and FA models did.
# Break each model into left- and right-hand sides (y and X), then compute
# the VIF for each predictor.
y, X = dmatrices('overall ~ taste + temp + freshness + wait + clean + friend + location + parking + view',
                 data=food_df, return_type='dataframe')
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print('')
print('----- VIF for Full Regression Model -----')
print('')
print(vif)

# VIF for the choice model (matching Model 2: taste, wait and location)
y, X = dmatrices('overall ~ taste + wait + location',
                 data=food_df, return_type='dataframe')
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print('')
print('----- VIF for Choice Model -----')
print('')
print(vif)

# VIF for the PCA model
y, X = dmatrices('overall ~ pca1 + pca2 + pca3',
                 data=food_df, return_type='dataframe')
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print('')
print('----- VIF for PCA Model -----')
print('')
print(vif)

# VIF for the FA model
y, X = dmatrices('overall ~ fa1 + fa2 + fa3',
                 data=food_df, return_type='dataframe')
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print('')
print('----- VIF for FA Model -----')
print('')
print(vif)

# Which model do you like best, and why?
# For the full regression model, sum the coefficients for each
# three-variable grouping:
#   taste, temp, freshness   -> group 1
#   wait, clean, friend      -> group 2
#   location, parking, view  -> group 3
# How do you interpret this information?
# Compare with the choice model.
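# An added illustration, not in the original assignment, of what the VIF
# numbers mean. For predictor i, VIF_i = 1 / (1 - R_i^2), where R_i^2 comes
# from regressing predictor i on the other predictors; a VIF near 1 says a
# predictor shares little information with the rest, which is why the
# orthogonal PCA and FA score models should show VIFs close to 1.
# A minimal sketch for the taste predictor in the full model:
aux_fit = smf.ols(
    formula='taste ~ temp + freshness + wait + clean + friend + location + parking + view',
    data=food_df).fit()
print('VIF for taste computed by hand:', 1.0 / (1.0 - aux_fit.rsquared))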