# NRES 798 Lab 2 # RStudio # http://www.rstudio.com/ # R IDE (integrated development environment) # Script editor, R interface (build configuration), debugging, project management, version control # RStudio components # Script editor, R command line (console), Environment (+ history), Files (+plots, packages, help) ##################################### # Using RStudio ##################################### # File, New Project # File, New File, R Script # Implicit Using the project workspace (working directory) # Rstudio, Sessions, Set Working Directory getwd() # Determine what the path of the current working directory setwd("C:/Users/Che/UNBC_work/Courses/NRES-798/NRES-798-Labs/NRES-798-Lab-2") # Or can use "Set working directory" in "Sessions" dir() # Outputs the files in the current directory # RStudio components # R Console (bottom left) # Interface to R, where information is sent and received from R # Text editor (top left) # Formated text editing # Scripts can be passed to the R console using "control + enter" # Environment and history (top right) # Global Environment # Lists of all objects that R know about # Properties of the R objects are also listed here # Can also be accomplished with the command ls() # # The variables in the R environment can be cleared in 2 ways # 1. "Clear" button in the Rstudio Enviornment window # 2. rm(list=ls()) # using the R command clear and applying it to all objects in the enviornment ls() # History # History of all comands sent to the R console # Files, Plots, Packages, Help, Viewer (bottom right) # Files: list of files in the specified working directory # Plots: the default device that ploting commands are sent to # Displayed plots can be saved usig the "Export"drop down menu # Packages: list of all packages that are currently loaded into the R environment # Help: window for the called help dialog ##################################### # Loading additional packages ##################################### # R packges are groups of functions that perform related tasks # New packages must first be downloaded, and then installed in "libaries" before the functions can be used # For example the papckage "moments" contains functions for calculating the skew and kurtosis of a data set # Another package that contains similar functions is called "e1071" # Packages must be loaded each session install.packages("moments") # Downloads and instals the package in the local uses R library (library location can be modified) library(moments) # Finds the package in the library and loads the functions into the R environment skewness(rlnorm(1000,1,0.25)) # Packages also contain help files for all of their functions help(skewness) help(kurtosis) # TODO: Calculate the Kurtosis of three log-normal random distribution that have different means and sd ##################################### # Creating new functions ##################################### # The generic structure for creating a new function is... # functionName <- function(arg1, arg2..){ # function calculations # } # Function to return the calculated sum of squares sumSquares <- function(x){ meanX <- mean(x) # calculate mean of data difX <- x - meanX # calculate the difference between the mean and each data point sqr.difX <- difX^2 # square the difference sum.sqr.difX <- sum(sqr.difX) # sum the squared differences return(sum.sqr.difX) # return the sum of squares } # TODO: create a function that takes a numerical vector, calculates the median, and returns the square root of the median # TODO: create a function that takes a 5 by 5 matrix, and a numerical scalar, and returns a vector that contains all of the matrix values that are fully divisible by the value (no remainder) ##################################### # Initial data exploration and plotting ##################################### # Load the data set iris data(iris) # load the in built data set iris names(iris) # list the variable names in iris summary(iris) # provide an overview summary of the data set str(iris) # provide more information on the structure of iris unique(iris$Species) # Lists all of the unique levels of the factor # plot() is the base level function for ploting data # plot() what plot produces will depend on the type of object that it is passed plot(iris) # plot function based on data frame object plot(iris$Sepal.Length,iris$Sepal.Width) # plot of x vs. y # Modify the number of plots that are shown in a figure par(mfrow=c(2,2)) # Change the number of plots to a 2 by 2 hist(iris$Sepal.Length) # Histogram of sepal length hist(iris$Sepal.Width) hist(iris$Petal.Length) hist(iris$Petal.Width) # Produce box plot of sepal length for each species plot(iris$Species, iris$Sepal.Length) # Data type determines that box plot is to be produced boxplot(iris$Sepal.Length~iris$Species) # Explicit framework for producing the same thing # TODO: plot Sepal.Width vs. Petal.Width # TODO: rename the X and Y axis to be more informative # TODO: add a main title at the top of the figure ##################################### # Ploting more complex figures ##################################### # Saving ploted figures data(cars) plot(cars$speed,cars$dist) # Plots the figure in the default device (device 1) pdf("Figure1.pdf",width=7,height=7) # Opens a new device (pdf figure). The full path to the figure can be given plot(dist ~ speed, data=cars) # Plots the data to the device (same as above but using different notation) dev.off() # Closes the device that was being writen to # Always close devices (dev.off()) once the plotting has been completed (can be multiple commands sent to device) # Common ploting devices (file formats): pdf, png, jpeg, tiff, svg # pdf produces good figues but are hard to modify # For publication figures using the svg device (Scalable Vector Graphic) is recomended # Once created the .svg file can be opened with a vector graphics editor and modified # A good vector graphics editor is Inkscape: https://inkscape.org/en/ # In Inkscape all components of the figure can be modiifed # The polished figure can then be exported as a bitmap in any required resolution (e.g. 900 dpi) # TODO: write ploting scripts to plot the "cars" data using pdf, png, jpeg, tiff and svg devices # TODO: explore what the arguments bg, and qualitly do for the various devices # Note that width and height units differ between devices # Ploting multipel figures in a nicer framework # Multiple plots can be specified by using the par(mfrow=c(x,y)) framework # However a more flexible approach is to use the layout function layout_matrix <- matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),ncol=4,nrow=4) layout_matrix # Four plots arranged as columns layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4) layout_matrix # Four plots arranged as quadrants layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure layout.show(4) # display the figure structure layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure # heights speicifies the height of each row # width specifies the width of each column layout.show(4) # NOTE: layout.show() can't be used when you are writing to a device other than the null device # Example of plotting with layout pdf("four_plot_cars.pdf") layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure speed_hist <- hist(cars$speed,plot = FALSE) dist_hist <- hist(cars$dist,plot = FALSE) top = max(c(speed_hist$counts, dist_hist$counts)) barplot(speed_hist$counts, axes=FALSE, space=0,horiz=TRUE) plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n") # TODO: what is this plot comand doing? plot(cars$dist,cars$speed,xlab="Distance",ylab="Speed",cex.lab=1.5) barplot(dist_hist$counts, axes=FALSE, space=0) dev.off() # TODO: Load the "iris" data into R # TODO: Subset the data and create a 2 column by 4 row plot that shows a histogram of Sepal length, width, and Petal Length width # for setosa and Virginica. Make sure the x and y axis for the histrograms are the same ##################################### # Exploring distributions ##################################### # Distribution functions- d,p,r,q realizations: e.g. pnorm, dnorm, qnorm,rnorm # (p) probability distribution function (pdf) # (d) probability density function (cdf) # (q) quantile function # (r) random number generation # Example of a normal distribution test_mean <- 5 test_sd <- 1 test_range <- seq(0,10,0.01) norm_distribution <- dnorm(test_range,mean = test_mean, sd =test_sd) norm_density <- pnorm(test_range,mean = test_mean, sd =test_sd) norm_quantile <- qnorm(c(0.10,0.25,0.5,0.75,0.9),mean = test_mean, sd = test_sd) norm_random <- rnorm(100,mean = test_mean, sd = test_sd) par(mfrow=c(2,2)) plot(test_range,norm_distribution,type="l") plot(test_range,norm_density,type="l") hist(norm_random) plot(c(0.10,0.25,0.5,0.75,0.9),norm_quantile,type="p") # TODO: what is the relationship between pnorm and dnorm? # TODO: how does each plot respond to increasing the sd # TODO: for each of the distributions (Binomial, Poisson, Negative binomial, Normal, Log-normal, Weibull) # create a 4 by 4 matrix of plots that displays how each distribution responds to changes in the relevant arguments (parameters) ##################################### #Data read in and manipulation ##################################### # Instal the package "xlsx" # Package includes functions to read and write Excel spreadsheets install.packages("xlsx") # Packages are downloaded and saved in a local library repository # load the package into the active R library library(xlsx) # Use help to find out what argruments read.xlsx() needs help(read.xlsx) # TODO: Open Excell and create a data set that has 5 columns and n rows # TODO: make sure that 2 of the columns are factors that are defined by strings # TOTO: make sure there are missing values included in the data set # TODO: Save the file in the Excell format .xlsx and a .csv format # TODO: Use the read.xlsx() function to read the file into R and assign to a data frame object # TODO: use the read.csv() function to read in the file and assign to a second data frame object ##################################### # Handling NA and NaN ##################################### # Real data set often contain missing data (coded as NA (not available) or NaN (not a number) in R) # Some functions have built in methods for excluding NA data x <- c(1,2,NA,3) mean(x) # returns NA mean(x, na.rm=TRUE) # returns 2 # For some functions you will need to remove, replace, exlude NaN entries before analysis # Note that x[x!=NaN] does not work for removing or identifying the location of data with NaN values # need to use function is.na() to return logical value of NA location xv <-c(1,5,3,4,NaN,3,2,9,3,NA,8,NaN,3,6,4) # Vector including NaN and NA entries xm <- matrix(xv,ncol=3) # matrix with NaN is.na(xm) # Examine what is returned for each entry is in the matrix # is.na() can be used to replace all NaN in a dataset xm[is.na(xm)] <- 9999 # For many functions the method for dealing with NA data can be set uisng the argument na.action # E.g. wiht a lineare model x <- c(1:5,NA) # independent variable with NA values y <- rnorm(6) # dependent varialbe lm(y~x,na.action = na.fail) # Specifies that an error is returned if NA data present lm(y~x,na.action = na.omit) # Specifies that NA values are to be omited from the function calculations # It is often good practice to keep NA and NaN entries in a data frame and deal with them using the function methods # TODO: Using the data set you previously created, determine which rows (factors) contain NA data, and replace them with the value 2015 ##################################### # Central Limit Theorem ##################################### # Examine the validity of the Central Limit Theorem using a uniform distribution hist(runif(10000)*10,main="Uniform") # histogram of 1000 random draws from Uniform random distribution n <- 10000 # number of means that are sampled nsamples <- 5 # number of draws from the underlying distribution from which to calculate the mean means<- numeric(n) # define a vector to fill for(i in 1:n){ # Loop means[i] <- mean(runif(nsamples)*10) # Calculate the mean of a number of random draws from a distribution } hist(means,ylim=c(0,1600)) # Plot a histogram of the means mean(means) # Return the mean of the means sd(means) # Return the standard deviaation from the means xv <- seq(0,10,0.1) # Create a sequence from zero to 10 yv <- dnorm(xv,mean=mean(means),sd=sd(means))*5000 # lines(xv,yv) # TODO: Using the Uniform distribution, examine how the Central Limit Theorem responds to changes in the number of means sampled (n) # TODO: Examing the impact of altering the number of random draws that are used to calculate the mean (nsamples). # TODO: Create scripts that examing the Central Limit Theorem when drawing from a Poisson and Weibull distribution. # TODO: Does sensitivity to changes in n and nsamples chage with differet distributions? # TODO: Across what parameter space do the Binomial, POisson and Weibull distributions approximate a normal distribution # TODO: Write a function that compares a sample distribution to a Poisson and Weibul distribution # TODO: write function that calculates the mean, variance, skew and Kurtosis of a sammple vector and returns the results as a list