# NRES 798 201501 Lab 3 #instal.packages("PerformanceAnalytics") library(PerformanceAnalytics) #install.packages("car") library(car) ######################################### # Correlations and Covariance ######################################### # Correlation # Use the "Soils" data set from the "car" library data(Soils) # Load the "Soils" data set names(Soils) # Column names from the data set str(Soils) # Structure of the data set summary(Soils)# Summary of the data set # Calculate the correlation between ph and N plot(Soils$pH,Soils$N, xlab = "PH",ylab="N") cor(Soils$pH,Soils$N) cor(Soils$pH,Soils$N,method=c("pearson")) cor(Soils$pH,Soils$N,method=c("kendall")) # TODO: compare correlation coefficients using different methods cor(Soils$pH,Soils$N,method=c("spearman")) # TODO: compare correlation coefficients using different methods # Calculate the covariance between ph and N # Covariance: how much two random variables change together cov(Soils$pH,Soils$N) # Correlation plots usning the chart.correlation() functin chart.Correlation(Soils[,6:7],histogram = TRUE) # TODO: what correlation coefficient is used as the default in chart.Correlation() # TODO: using chart.Correlation plot the correlatios between pH, N,Dens, P,CA, Mg,K,Na # TODO: plot the resulting figure as a large pdf (i.e. width > 10 inches) ######################################### # Evaluating residuals ######################################### # Using the Soils data set, evaluate the residuals when Na is used as a predictor of Conduc. plot(Soils$Na,Soils$Conduc) lm.conduc <- lm(Soils$Conduc ~ Soils$Na) abline(lm.conduc) conduc.resid <- residuals(lm.conduc) # Extract the residuals from the fitted line conduc.fitted <- fitted(lm.conduc) # Extract the fitted points (y axis) plot(conduc.fitted,conduc.resid) abline(h = 0, lty = 2) # TODO: Does this data apear homoscedstic or heteroscedastic? # TODO: what does the following functin do? res.plot <- function(y,x) { lm.mod <- lm(y~x) lm.resid <- residuals(lm.mod) lm.fit <- fitted(lm.mod) par(mfrow=c(2,1)) plot(x,y) abline(lm.mod) plot(lm.fit,lm.resid) abline(h = 0, lty = 2) } # TODO: Examing the homoscedastity of Conduc as predicted by pH,N,P and Ca. res.plot(Soils$Condu,Soils$Ca) x <- Soils$PH y <- Soils$Conduc # Residual Normality conduc.resid <- residuals(lm.conduc) # Extract the residuals from the fitted line conduc.fitted <- fitted(lm.conduc) # Extract the fitted points (y axis) plot(conduc.fitted,conduc.resid) abline(h = 0, lty = 2) hist(conduc.resid) # TODO: estimate the mean and standard deviation of this distribution # TODO: estimate the normal pdf from this data and create a nomral plot overlay # TODO: formmaly test whether or not these residuals are normally distributed ######################################### # Simple linear regression ######################################### # Simple linear regressions are fit and evaluated using the lm() "linear model" function # Simple linear regression of Conduc as a function of pH plot(Soils$pH,Soils$Conduc,ylab="Conductance",Xlab="pH") lm.reg1 <- lm(Soils$Conduc ~ Soils$pH) # model formulation y = b0 + b1(x) lm.reg1 <- lm(Conduc ~ pH, data=Soils) # Same model as above but formulated differently plot(Soils$pH,Soils$Conduc,ylab="Conductance",Xlab="pH") abline(lm.reg1,col="red",lwd=2) # Evaluate the lm() fitting summary(lm.reg1) # TODO: What are the coefficients that are returned? Write the deterministic component of the fitted equation in its full form. # TODO: What are the p-values associated with each coefficient testing (i.e.null and alternative H). # Evaluate the lm() output using an ANOVA table anova(lm.reg1) # TODO: What information from the ANOVA table would normally be reported in a paper. # Use the function coefficients() (coef()) to extract the fitted coefficients from the model coef(lm.reg1) # Use the function residuals() to calculate the residuals from the fitted values residuals(lm.reg1) # use the function fitted() to calculate the fitteed values based on the model fitted(lm.reg1) # use the function confint() to calcualte the confidence intervals for the model parameters confint(lm.reg1) # Plot the residuals from the analysis plot(fitted(lm.reg1),residuals(lm.reg1),ylab="Residuals",xlab="Fitted model estimates") abline(h = 0, lty = 2) # Extracting the components sum of squares SSy <- deviance(lm(Soils$Conduc ~ 1)) # Sum of squares total RSS <- deviance(lm.reg1) # Residual sum of squares SSreg <- SSy - RSS # Sum of square regression # TODO: how much varaition is being captured by the deterministc component of the model? # Regression of Conoduc as influenced by Mg lm.reg2 <- lm(Soils$Conduc ~ Soils$Mg) plot(Soils$Mg,Soils$Conduc,xlim=c(0,12),ylim=c(0,13)) abline(lm.reg2,col="red",lwd=2) lm.reg2a <- lm(Soils$Conduc ~ Soils$Mg -1) # Regression fitting with a stated intercept of zero #lm.reg2a <- lm(Soils$Conduc ~ 0 + Soils$Mg) # Alternative formulatino for zero intercept regression abline(lm.reg2a,col="blue",lwd=2) # Multiple regression # Multiple regressions using lm() are specified by including additional linear terms to the model # Multiple regression of Na and Ca against Conduc lm.mreg1 <- lm(Soils$Conduc ~ Soils$Na + Soils$Ca) # No interaction specified summary(lm.mreg1) anova(lm.mreg1) # TODO: which factor accounts for the greatest amount of variation? # TODO: does the order of independent factors in the model matter? # TODO: what impact does changing the order of variables have on the total r2 and variable SSreg components. # TODO: what impact does changing the order of variables have on the coefficient estimates # Multiple regression with interaction terms # Using lm() interaction terms can be included explicitly by including a product term or using a colon to sperate variables Nac <- Soils$Na - mean(Soils$Na) # center the Girth variable Cac <- Soils$Ca - mean(Soils$Ca) # center the Height variable Soils$NaCaint <- Nac*Cac # create the interaction term lm.mreg2 <- lm(Soils$Conduc ~ Soils$Na + Soils$Ca + Soils$NaCaint) # or lm.mreg3 <- lm(Soils$Conduc ~ Soils$Na + Soils$Ca + Soils$Na:Soils$Ca) # Using built in formula expansion (normally use) # Interactions can also be specified as lm.mreg3 <- lm(Soils$Conduc ~ Soils$Na*Soils$Ca) # TODO: create a multiple regression model that includes N, P and K, and an interaction term for N and P, for the dependent varialbe Conduc # TODO: Evaluate what the interactin term does in the model # Quadratic term in linear model # Polynomial components of linear models can be included in 2 ways sp2 <- (Soils$P)^2 lm.mreg4 <- lm(Soils$Conduc ~ Soils$P + sp2) or lm.mreg4 <- lm(Soils$Conduc ~ Soils$P + I(Soils$P^2)) summary(lm.mreg4) anova(lm.mreg4) fit1 <- fitted(lm.mreg4) plot(Soils$P,Soils$Conduc) points(Soils$P,fit1,type="p",col="red",pch=19) # TODO: fit a cubic model using Soils$P # TODO: Does a linear, quadratic or cubic model best fit this data? ######################################### # Introduction to ANOVA ######################################### # The functino lm() will calculate an ANOVA if it receives factor variables # TODO: which variables in "Soils" are chategorical factors # Variables can be designated as factors by using the function factor(), or as.factor() an1 <- lm(Soils$Conduc ~ Soils$Depth) # using lm() with chategorical factors summary(an1) anova(an1) # ANOVA in R can also be easly done using the functin aov() an2 <- aov(Soils$Conduc ~ Soils$Depth) # TODO: what does summary(an2) provide and how does this differ from a regression? # TODO: use aov() to create a two way ANOVA using Soils$Depth and Soils$Contour with no interaction # TODO: use aov() to run a two way ANOVA on the data including an interactino term