# NRES 798 Lab 2
# RStudio
# http://www.rstudio.com/
# R IDE (integrated development environment)
#   Script editor, R interface (build configuration), debugging, project management, version control

# RStudio components
# 	Script editor, R command line (console), Environment (+ history), Files (+plots, packages, help)


#####################################
# Using RStudio
#####################################
# File, New Project
# File, New File, R Script


# Implicit Using the project workspace (working directory)
# Rstudio, Sessions, Set Working Directory
getwd() # Determine what the path of the current working directory
setwd("C:/Users/Che/UNBC_work/Courses/NRES-798/NRES-798-Labs/NRES-798-Lab-2") # Or can use "Set working directory" in "Sessions"
dir() # Outputs the files in the current directory

# RStudio components

# R Console (bottom left)
# Interface to R, where information is sent and received from R

# Text editor (top left)
# Formated text editing
# Scripts can be passed to the R console using "control + enter"


# Environment and history (top right)
# Global Environment
# Lists of all objects that R know about
  # Properties of the R objects are also listed here
  # Can also be accomplished with the command ls()
  # 
  # The variables in the R environment can be cleared in 2 ways
  # 1. "Clear" button in the Rstudio Enviornment window
  # 2. rm(list=ls()) # using the R command clear and applying it to all objects in the enviornment ls()
# History
# History of all comands sent to the R console

# Files, Plots, Packages, Help, Viewer (bottom right)
# Files: list of files in the specified working directory
# Plots: the default device that ploting commands are sent to
#   Displayed plots can be saved usig the "Export"drop down menu
# Packages: list of all packages that are currently loaded into the R environment
# Help: window for the called help dialog


#####################################
# Loading additional packages
#####################################
# R packges are groups of functions that perform related tasks
# New packages must first be downloaded, and then installed in "libaries" before the functions can be used
# For example the papckage "moments" contains functions for calculating the skew and kurtosis of a data set
  # Another package that contains similar functions is called "e1071"
# Packages must be loaded each session

install.packages("moments") # Downloads and instals the package in the local uses R library (library location can be modified)
library(moments) # Finds the package in the library and loads the functions into the R environment
skewness(rlnorm(1000,1,0.25))

# Packages also contain help files for all of their functions
help(skewness)
help(kurtosis)

# TODO: Calculate the Kurtosis of three log-normal random distribution that have different means and sd


#####################################
# Creating new functions
#####################################

# The generic structure for creating a new function is...
# functionName <- function(arg1, arg2..){
#	function calculations
# }

# Function to return the calculated sum of squares
sumSquares <- function(x){
  meanX <- mean(x)  # calculate mean of data
  difX <- x - meanX # calculate the difference between the mean and each data point
  sqr.difX <- difX^2 # square the difference
  sum.sqr.difX <- sum(sqr.difX) # sum the squared differences
  return(sum.sqr.difX) # return the sum of squares
}

# TODO: create a function that takes a numerical vector, calculates the median, and returns the square root of the median

# TODO: create a function that takes a 5 by 5 matrix, and a numerical scalar, and returns a vector that contains all of the matrix values that are fully divisible by the value (no remainder)

#####################################
# Initial data exploration and plotting
#####################################

# Load the data set iris
data(iris)  # load the in built data set iris
names(iris)	# list the variable names in iris
summary(iris) # provide an overview summary of the data set

str(iris) # provide more information on the structure of iris
unique(iris$Species) # Lists all of the unique levels of the factor

# plot() is the base level function for ploting data
# plot() what plot produces will depend on the type of object that it is passed
plot(iris) # plot function based on data frame object

plot(iris$Sepal.Length,iris$Sepal.Width) # plot of x vs. y

# Modify the number of plots that are shown in a figure
par(mfrow=c(2,2)) # Change the number of plots to a 2 by 2
hist(iris$Sepal.Length)	# Histogram of sepal length
hist(iris$Sepal.Width)
hist(iris$Petal.Length)
hist(iris$Petal.Width)

# Produce box plot of sepal length for each species
plot(iris$Species, iris$Sepal.Length)  # Data type determines that box plot is to be produced

boxplot(iris$Sepal.Length~iris$Species)  # Explicit framework for producing the same thing

# TODO: plot Sepal.Width vs. Petal.Width
# TODO: rename the X and Y axis to be more informative
# TODO: add a main title at the top of the figure 

#####################################
# Ploting more complex figures
#####################################

# Saving ploted figures
data(cars)
plot(cars$speed,cars$dist)  # Plots the figure in the default device (device 1)

pdf("Figure1.pdf",width=7,height=7) # Opens a new device (pdf figure).  The full path to the figure can be given
plot(dist ~ speed, data=cars) # Plots the data to the device (same as above but using different notation)
dev.off() # Closes the device that was being writen to

# Always close devices (dev.off()) once the plotting has been completed (can be multiple commands sent to device)

# Common ploting devices (file formats): pdf, png, jpeg, tiff, svg
# pdf produces good figues but are hard to modify
# For publication figures using the svg device (Scalable Vector Graphic) is recomended
#   Once created the .svg file can be opened with a vector graphics editor and modified
#   A good vector graphics editor is Inkscape: https://inkscape.org/en/
#   In Inkscape all components of the figure can be modiifed
#   The polished figure can then be exported as a bitmap in any required resolution (e.g. 900 dpi)

# TODO: write ploting scripts to plot the "cars" data using pdf, png, jpeg, tiff and svg devices
# TODO: explore what the arguments bg, and qualitly do for the various devices
# Note that width and height units differ between devices

# Ploting multipel figures in a nicer framework
# Multiple plots can be specified by using the par(mfrow=c(x,y)) framework
# However a more flexible approach is to use the layout function

layout_matrix <- matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),ncol=4,nrow=4)
layout_matrix # Four plots arranged as columns

layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4)
layout_matrix # Four plots arranged as quadrants

layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure
layout.show(4) # display the figure structure

layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure
# heights speicifies the height of each row
# width specifies the width of each column
layout.show(4) # NOTE: layout.show() can't be used when you are writing to a device other than the null device

# Example of plotting with layout
pdf("four_plot_cars.pdf")
layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure
speed_hist <- hist(cars$speed,plot = FALSE)
dist_hist <- hist(cars$dist,plot = FALSE)
top = max(c(speed_hist$counts, dist_hist$counts))

barplot(speed_hist$counts, axes=FALSE, space=0,horiz=TRUE)
plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n") # TODO: what is this plot comand doing?
plot(cars$dist,cars$speed,xlab="Distance",ylab="Speed",cex.lab=1.5)
barplot(dist_hist$counts, axes=FALSE, space=0)
dev.off()

# TODO: Load the "iris" data into R
# TODO: Subset the data and create a 2 column by 4 row plot that shows a histogram of Sepal length, width, and Petal Length width
# for setosa and Virginica.  Make sure the x and y axis for the histrograms are the same


#####################################
# Exploring distributions
#####################################

# Distribution functions- d,p,r,q realizations: e.g. pnorm, dnorm, qnorm,rnorm
#   (p) probability distribution function (pdf)
#   (d) probability density function (cdf)
#   (q) quantile function
#   (r) random number generation 

# Example of a normal distribution
test_mean <- 5
test_sd <- 1
test_range <- seq(0,10,0.01)
norm_distribution <- dnorm(test_range,mean = test_mean, sd =test_sd)
norm_density <- pnorm(test_range,mean = test_mean, sd =test_sd)
norm_quantile <- qnorm(c(0.10,0.25,0.5,0.75,0.9),mean = test_mean, sd = test_sd)
norm_random <- rnorm(100,mean = test_mean, sd = test_sd)

par(mfrow=c(2,2))
plot(test_range,norm_distribution,type="l")
plot(test_range,norm_density,type="l")
hist(norm_random)
plot(c(0.10,0.25,0.5,0.75,0.9),norm_quantile,type="p")

# TODO: what is the relationship between pnorm and dnorm?
# TODO: how does each plot respond to increasing the sd

# TODO: for each of the distributions (Binomial, Poisson, Negative binomial, Normal, Log-normal, Weibull) 
# create a 4 by 4 matrix of plots that displays how each distribution responds to changes in the relevant arguments (parameters)


#####################################
#Data read in and manipulation
#####################################

# Instal the package "xlsx"
# Package includes functions to read and write Excel spreadsheets
install.packages("xlsx")
# Packages are downloaded and saved in a local library repository

# load the package into the active R library
library(xlsx)

# Use help to find out what argruments read.xlsx() needs
help(read.xlsx)

# TODO: Open Excell and create a data set that has 5 columns and n rows
#   TODO: make sure that 2 of the columns are factors that are defined by strings
#   TOTO: make sure there are missing values included in the data set
# TODO: Save the file in the Excell format .xlsx and a .csv format
# TODO: Use the read.xlsx() function to read the file into R and assign to a data frame object
# TODO: use the read.csv() function to read in the file and assign to a second data frame object


#####################################
# Handling NA and NaN
#####################################

# Real data set often contain missing data (coded as NA (not available) or NaN (not a number) in R)
# Some functions have built in methods for excluding NA data

x <- c(1,2,NA,3)
mean(x) # returns NA
mean(x, na.rm=TRUE) # returns 2 


# For some functions you will need to remove, replace, exlude NaN entries before analysis
# Note that x[x!=NaN] does not work for removing or identifying the location of data with NaN values
# need to use function is.na() to return logical value of NA location
xv <-c(1,5,3,4,NaN,3,2,9,3,NA,8,NaN,3,6,4)  # Vector including NaN and NA entries
xm <- matrix(xv,ncol=3)  # matrix with NaN

is.na(xm) # Examine what is returned for each entry is in the matrix

# is.na() can be used to replace all NaN in a dataset
xm[is.na(xm)] <- 9999

# For many functions the method for dealing with NA data can be set uisng the argument na.action
# E.g. wiht a lineare model
x <- c(1:5,NA) # independent variable with NA values
y <- rnorm(6) # dependent varialbe
lm(y~x,na.action = na.fail) # Specifies that an error is returned if NA data present
lm(y~x,na.action = na.omit) # Specifies that NA values are to be omited from the function calculations

# It is often good practice to keep NA and NaN entries in a data frame and deal with them using the function methods

# TODO: Using the data set you previously created, determine which rows (factors) contain NA data, and replace them with the value 2015


#####################################
# Central Limit Theorem
#####################################

# Examine the validity of the Central Limit Theorem using a uniform distribution
hist(runif(10000)*10,main="Uniform")  # histogram of 1000 random draws from Uniform random distribution

n <- 10000  # number of means that are sampled
nsamples <- 5  # number of draws from the underlying distribution from which to calculate the mean
means<- numeric(n) # define a vector to fill
for(i in 1:n){  # Loop 
  means[i] <- mean(runif(nsamples)*10) # Calculate the mean of a number of random draws from a distribution
}
hist(means,ylim=c(0,1600))  # Plot a histogram of the means

mean(means) # Return the mean of the means
sd(means) # Return the standard deviaation from the means

xv <- seq(0,10,0.1) # Create a sequence from zero to 10
yv <- dnorm(xv,mean=mean(means),sd=sd(means))*5000 # 
lines(xv,yv)


# TODO: Using the Uniform distribution, examine how the Central Limit Theorem responds to changes in the number of means sampled (n)
# TODO: Examing the impact of altering the number of random draws that are used to calculate the mean (nsamples).

# TODO: Create scripts that examing the Central Limit Theorem when drawing from a Poisson and Weibull distribution.
# TODO: Does sensitivity to changes in n and nsamples chage with differet distributions?


# TODO: Across what parameter space do the Binomial, POisson and Weibull distributions approximate a normal distribution

# TODO: Write a function that compares a sample distribution to a Poisson and Weibul distribution

# TODO: write function that calculates the mean, variance, skew and Kurtosis of a sammple vector and returns the results as a list