# NRES 798 201501 Lab 5
# Objectives of todays labs
  # Implement and understand ANOVA analysis (fixed factor)
  # Perform pairwise evaluations beween multiple comparisons (Tukey's test)
  # Create publication suitable figures
  # Run an ANCOVA and evaluate model complexity

# data in the package "faraway" will be analysed in today's lab
install.packages("faraway")
library(faraway)

#########################################
# ANOVA: one-way
#########################################

# Data on the time it takes blood from animials on different diets to coagulate.
# From Box, Hunter and Hunter 1978. 
data(coagulation)

# TODO: How many diet levels are examined?
# TODO: How many replicates (animals) were sampled in each level?

# TODO: Examine the data and determine if it is normaly distrubited and homoscedastic
  # TODO: Box-plot: are there potential problems with data from any of the levels?
  # TODO: Q-Q plot: what does the plot suggest about the normality, skewness and kurtosis of the data?
  # TODO: Residuals does the data appear to be homoscedastic?
    # Why are only three levels displayed
    # Hint: Use jitter() for the plotting of the fit values.  What does this do?
  # TODO: Normality test: what does the w and p-value indicate about the normality?
  # Testing for Homogeneity of variances
  # Bartlett's test can be used to test if the variance is the same for three or more samples
    # e.g. bartlett.test(coag ~ diet, data=coagulation)


# TODO: Perform an ANOVA and evaluate whether or not there are differences beween the diet levels.

# TODO: How do the coefficient estimates for each level differ if a zero intercept is included or not?
  # i.e. coag ~ diet vs. coag ~ diet -1,

# TODO: Determine which levels are statitically different from each other
  # Tukey's test (or the Honest Significant Difference test) is commonly used to where differences occur between multiple comparisons.
  
  # Method 1
  install.packages("agricolae") # package agricolae contains the function HSD.test
  library(agricolae)
  # format  HSD.test(responce,levels,DFerror,MSerror,group=TRUE)
  summary(aov(coag ~ diet,coagulation))
  HSDresults <- HSD.test(coagulation$coag,coagulation$diet,20,5.6,group = TRUE)
  str(HSDresults)     # determine the structure of the output
  HSDresults$groups   # show where the differences occur
  # TODO: how do you interpret the "groups" differences?
  HSDresults          # produce the full output
  # TODO: what is the exact p-value associated with each multiple comparison?

  # Method 2
  TukeyHSD(aov(coag ~ diet, coagulation))
  # TODO: How do you interpret the "p adj" for each pair?

# TODO: What is the best way to plot one-way ANOVA data?
  # e.g. boxplot() or bar.group() (function in "agricolae")
  bar.group(HSDresults$groups,ylim=c(0,80),density=4,border="blue")

  # TODO: create a publication quality figure using boxplot() that includes meaningfull X and Y lables, and designations for where the differences between multiple comparisons occur.
  # Hint: use "xlab", "ylab", "ylim", "cex" and importantly "mtext" and/or "text"


#########################################
# ANOVA two-way
#########################################

data(rats)
# data on the survial time of rats after consumeing 1 of 3 poisions (factor 1), and in 4 treatments (factor 2)
# TODO: what happens when the plot function is used on a formula?
  # e.g. plot(time ~ treat + poison, data=rats)

# TODO: use lm() to constuct a two-way ANOVA with an interaction term
  # Hint: the format for multiple regression also apples to multi-way ANOVA
  # TODO: how do you interpret the interaction coefficient estimates? (e.g. summary(lm()))

  # plotting interactions
  interaction.plot(rats$treat,rats$poison,rats$time)
  interaction.plot(rats$poison,rats$treat,rats$time)

# TODO: Plot the residuals of the model and evaluate if the current model is sufficient.
# TODO: How do Log and reciprocal (1/y) transformations influence the fit?

# TODO: Based on the anova(lm()) output can the model be simplified, and should it be simiplified?
  # Hint look at both anova() and summary()
# TODO: Run a "simpler" model and compare the output.

# TODO: Creat a publication grade figure that includes the interaction (interaction.plot())
  # TODO: modify the legend and use different colors for one factor and different line types for the second factor.

#########################################
# Random block design ANOVA
#########################################

data(penicillin)
# Data examining the yield of penicillin produced from 4 differet processes (treat)
# Large homogeneous batches of the raw material is difficult to produce so blocks (blends) are used.

# TODO: Plot the model to graphically examine the treatment and the blocking factor
# TODO; Does the treatment effect or the blocking effect appear to be larger?
# TODO: Run the random block ANOVA and interpret the treatment and the blocking factor.
# TODO: Use interaction.plot() to examine if and how treatment potentialy interacts with the blocking factor.
# TODO: Should an interaction term ever be included in the model.


#########################################
# Nested ANOVA
#########################################

# Fomula for nested ANOVA: y ~ a/b/c , read a, with b nested in a, with c nested in b and a

# Example data of stream DOC sampled from reaches nested wihtin streams
dat <- data.frame(DOC = rnorm(20,mean = 5, sd = 1.3),
                  reach = rep(letters[1:4], 5),
                  stream = rep(c("Douglas","Speedy","Spey","Obrian","Willow"), each = 4))

# TODO: examine how the sum of squares is calculated in a factorial vs. nested structure

# The correct model specification with the reach error nested in the stream error would look like
nmod <- aov(DOC ~ stream + Error(stream/reach), dat)
summary(nmod)

# The function lmer() will be used later to calculate F and p-values


#########################################
# ANCOVA
#########################################
# Download the Gain.txt data and place in working directory (From Crawley)
datpath <- "C:/Users/Che/UNBC_work/Courses/NRES-798/NRES-798-Labs/Gain.txt"
dat <- read.table(datpath,header=TRUE)

# Data on the weight gain of individuals as a function of Sex and Genotype with age as a potential covariate

# TODO: examine the relationship between weight and the three factors
# TODO: construct the full (maximum) model including both categorical factors and the continuous covariate
  # e.g. y ~ a*b*c

# TODO evaluate if any of the higher order interactions can be removed.