# NRES 798 201501 Lab 3

# Lab objectives
  # Ploting complex figures (review)
  # Evaluate the properties of different probability distribution
  # Examine the Central limit theorem using different base probability distributions
  # Learn how to fitting probability distributions to given data distributions
  # Learn how to test whether a distribution is normally distributed
  # calcualte correlations between independent data points

# libraries to be used in Lab 3
# install.packages("moments") # Downloads and instals the package in the local uses R library (library location can be modified)
install.packages("MASS")
install.packages("vcd")
install.packages("PerformanceAnalytics")
library(moments) # Finds the package in the library and loads the functions into the R environment
library(MASS)
library(vcd)
library(PerformanceAnalytics)

#####################################
# Ploting more complex figures
#####################################

# Saving ploted figures
data(cars)
plot(cars$speed,cars$dist)  # Plots the figure in the default device (device 1)

pdf("Figure1.pdf",width=7,height=7) # Opens a new device (pdf figure).  The full path to the figure can be given
plot(dist ~ speed, data=cars) # Plots the data to the device (same as above but using different notation)
dev.off() # Closes the device that was being writen to

# Always close devices (dev.off()) once the plotting has been completed (can be multiple commands sent to device)

# Common ploting devices (file formats): pdf, png, jpeg, tiff, svg
# pdf produces good figues but are hard to modify
# For publication figures using the svg device (Scalable Vector Graphic) is recomended
#   Once created the .svg file can be opened with a vector graphics editor and modified
#   A good vector graphics editor is Inkscape: https://inkscape.org/en/
#   In Inkscape all components of the figure can be modiifed
#   The polished figure can then be exported as a bitmap in any required resolution (e.g. 900 dpi)

# TODO: explore what the arguments bg, and qualitly do for the various devices
# Note that width and height units differ between devices

# Ploting multipel figures in a nicer framework
# Multiple plots can be specified by using the par(mfrow=c(x,y)) framework
# However a more flexible approach is to use the layout function

layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4)
layout_matrix # Four plots arranged as quadrants

layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure
layout.show(4) # display the figure structure


# Example of plotting with layout
pdf("four_plot_cars.pdf")
layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure
speed_hist <- hist(cars$speed,plot = FALSE)
dist_hist <- hist(cars$dist,plot = FALSE)
top = max(c(speed_hist$counts, dist_hist$counts))

barplot(speed_hist$counts, axes=FALSE, space=0,horiz=TRUE)
plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n") # TODO: what is this plot comand doing?
plot(cars$dist,cars$speed,xlab="Distance",ylab="Speed",cex.lab=1.5)
barplot(dist_hist$counts, axes=FALSE, space=0)
dev.off()

# TODO: Load the "iris" data into R
# TODO: Subset the data and create a 2 column by 4 row plot that shows a histogram of Sepal length, width, and Petal Length width
# for setosa and Virginica.  Make sure the x and y axis for the histrograms are the same


#####################################
# Exploring distributions
#####################################

# Distribution functions- d,p,r,q realizations: e.g. pnorm, dnorm, qnorm,rnorm
#   (p) probability distribution function (pdf)
#   (d) probability density function (cdf)
#   (q) quantile function
#   (r) random number generation 

# Example of a normal distribution
test_mean <- 5
test_sd <- 1
test_range <- seq(0,10,0.01)
norm_distribution <- dnorm(test_range,mean = test_mean, sd =test_sd)
norm_density <- pnorm(test_range,mean = test_mean, sd =test_sd)
norm_quantile <- qnorm(c(0.10,0.25,0.5,0.75,0.9),mean = test_mean, sd = test_sd)
norm_random <- rnorm(100,mean = test_mean, sd = test_sd)

par(mfrow=c(2,2))
plot(test_range,norm_distribution,type="l")
plot(test_range,norm_density,type="l")
hist(norm_random)
plot(c(0.10,0.25,0.5,0.75,0.9),norm_quantile,type="p")

# TODO: what is the relationship between pnorm and dnorm?
# TODO: how does each plot respond to increasing the sd

# TODO: for each of the distributions (Binomial, Poisson, Negative binomial, Normal, Log-normal, Weibull) 
# create a 4 by 4 matrix of plots that displays how each distribution responds to changes in the relevant arguments (parameters)


#####################################
# Central Limit Theorem
#####################################

# Examine the validity of the Central Limit Theorem using a uniform distribution
hist(runif(10000)*10,main="Uniform")  # histogram of 1000 random draws from Uniform random distribution

n <- 10000  # number of means that are sampled
nsamples <- 5  # number of draws from the underlying distribution from which to calculate the mean
means<- numeric(n) # define a vector to fill
for(i in 1:n){  # Loop 
  means[i] <- mean(runif(nsamples)*10) # Calculate the mean of a number of random draws from a distribution
}
hist(means,ylim=c(0,1600))  # Plot a histogram of the means

mean(means) # Return the mean of the means
sd(means) # Return the standard deviaation from the means

xv <- seq(0,10,0.1) # Create a sequence from zero to 10
yv <- dnorm(xv,mean=mean(means),sd=sd(means))*5000 # 
lines(xv,yv)


# TODO: Using the Uniform distribution, examine how the Central Limit Theorem responds to changes in the number of means sampled (n)
# TODO: Examing the impact of altering the number of random draws that are used to calculate the mean (nsamples).

# TODO: Create scripts that examing the Central Limit Theorem when drawing from a Poisson and Weibull distribution.
# TODO: Does sensitivity to changes in n and nsamples chage with differet distributions?


# TODO: Across what parameter space do the Binomial, POisson and Weibull distributions approximate a normal distribution

# TODO: write function that calculates the mean, variance, skew and Kurtosis of a sammple vector and returns the results as a list


#####################################
# Fitting distributions in R
#####################################

# Fitting distribution involves four steps:
# 1. Model/function choice: family of distributions
# 2. Estimating parameters
# 3. Evaluating quality of fitting
# 4. Goodness of fit statistical test
# 
# 1. Distribution selection
# Analyse data to deteremine what familes of distributions may be appropriate
# a: mean, variance, skew, kurtosis
# b: plot y values, box plot, histrogram, output of data base properties.


ynorm <- rlnorm(100,meanlog = 4,sdlog =0.4)  # creeate a distribution to test

layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4)
layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure
layout.show(4) # display the figure structure

hist(ynorm, probability = TRUE)
boxplot(ynorm)
plot(ynorm)
plot(c(0,10),c(0,10),type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n")
ymean <- round(mean(ynorm),2)
yvar <- round(var(ynorm),2)
yskewness <- round(skewness(ynorm),2)
ykurtosis <- round(kurtosis(ynorm),2)

text(1,8,paste("Mean = ",ymean,sep=""),pos=4,cex=2)
text(1,6,paste("Variance = ",yvar,sep=""),pos=4,cex=2)
text(1,4,paste("Skewness = ",yskewness,sep=""),pos=4,cex=2)
text(1,2,paste("Kurtosis = ",ykurtosis,sep=""),pos=4,cex=2)

# TODO: Convert the above code into a function that performs the calculations and prints the figure to a pdf device

# 2. Estimating parameters
# 
# Parameter estimates can be obtained in two ways
# a) Analogic method: estimating parameter values from the "sample" population and assuming these correspond to the population parameters
# e.g. 
# ymean = mean(ynorm) 
# ysd <- sd(ynorm)
# 
# b) Using maximum likelihood methods to estimate the population parameters from sample data.
#  Methods for maximu likelihood estimation are included in the package "MASS".
# The "fitdistr()" function takes the sample data vector, and a specification of the distribution as arguments and returns the mle parameter estimates.

ymle <- fitdistr(ynorm,"normal")

# TODO: what type of object does ymle return?

# TODO: Extract the mean and sd from the object as numerics

# TODO: Create random normal, log-normal, Poisson, and weibull data sets (n = 50) and estimate the parameters using Analogic and maximum likelihood methods

# 3. Evaluating quality of distribution fits
# The suitability of a fitted distribution can be evaluated graphically by ploting the distribution pdf on top of the histogram,
# or by using a Quantile-Quatile plot that compares the observed vs. theoretical quantiles

# ploting estimated distributions over histograms
# Normal distribution
hist(ynorm,main="",xlab="observed values",col="grey91",prob=TRUE)  # plot a histogram of the sample data
curve(dnorm(x,mean=ymean,sd=ysd),add=TRUE,col="red")               # Plot the pdf of the distriibution using estimated parameters

# TODO: Randomly generate Poisson ditributed data (n=150) with a mean of 3.4
# TODO: create a histogram of the created data 
# TODO: Use maximum likelihood methods to estimate the Poisson parameter
# TODO: plot the assumed Poisson pdf on top of the histogram
# TODO: Using another color, plot a fitted Gausian distribution to the same data

# Using Quantile-Quantile plots to evaluate distribution fits
# Comparing sample data to a normal distribution using QQ plots can simply be done by using the function qqnorm
qqnorm(ynorm)    # Takes the sample data and plots the sample quantiles against the theoretical pdf quantiles
qqline(ynorm,lty=2) # adds a line to the theoretical quantile plot.  Deviation of the sample data from this line identifies areas where the fit is not good.

# In order to compare other distributions the qqplot function can be used
x.wei <- rweibull(n=100,shape=2.1,scale= 1.1) # Random sample data from a weibull distribution
x.theo <- rweibull(n=1000,shape = 2.0, scale = 1.0) # Theoretical population with "known" parameter values used
qqplot(x.theo,x.wei,main="QQ-plot distr. Weibull")
abline(0,1) # a 45-degree reference line 

# 4. Goodness of fit and normality tests.
# The most common test is determining whether data is normally distributed
# A common, and powerfull, test for normality is the Shapiro-Wilk test
shapiro.test(ynorm)
# The W value is  the test statistic, if the p-value is < 0.05 we should consider rejecting the null hypothesis

# Testing the goodness of fit of discrete (count data) distributions is best done with the function goodfi() from the vcd library
# testing for good ness of fit for a log-normal distribution

gf <- goodfit(rpois(100,2),type="poisson", method = "MinChisq")
summary(gf)
plot(gf,main="Count data vs Poisson distribution")


#####################################
# Examining correlations between independent variables
#####################################

data(iris)
names(iris)

# Correlations between variables can be calculated using the function cor()
ycor <- cor(iris$Sepal.Length,iris$Petal.Length,method = c("pearson"))

# The package PerformanceAnalytics contains some easy functions for examining correlations with a data set
chart.Correlation(iris[,1:4],histogram = TRUE)


####################################
# Work
###################################

# TODO: Create a dummy data set (data.frame) that contains 1 responce variable and 4 independent variables.
  # The number of independent observations should be 100
  # Create the responce variable by randomly drawing frmo a normal distribution
  # Create the independent variables by randomly drawing from a negative exponeial, gamma, Poisson and uniform distributions
  # Save the data frame as a .csv file

# Analysis the newly created data set to determine the basic structure of the data
# i.e. Calculate moments for all variables, plot histogram, plot barplot, plot raw data, test for corrrelations between the inependent variables