# Cran R
# http://www.r-project.org/
# R environment and base packages
# Text editor to R (Notepad ++)
# R libraries (packages): C:\Program Files\R\R-3.0.3\library
# Introduction to R: http://cran.r-project.org/doc/manuals/r-release/R-intro.pdf
# R reference card : http://cran.r-project.org/doc/contrib/Short-refcard.pdf

# R statistical environment
# Script based interface
# Everything in R is an object
# Object: data structure, attributes, methods
# R requires minimal object declaration: object type will change with content

# RStudio
# http://www.rstudio.com/
# R IDE (integrated development environment)
# Script editor, R interface (build configuration), debugging,
# project management, version control
# (Tinn-R, Eclipse (StatET))

# RStudio components
# Script editor, R command line (console), Environment (+ history),
# Files (+plots, packages, help)

#####################################
# Using RStudio
#####################################

# File, New Project
# File, New File, R Script

# Controlling the location of data and output
# Explicit pathway description
# (the first assignment is immediately overwritten by the second; it is kept
# as an example of a directory path, the second is the path of a data file)
dataPath <- "C:/Users/Che/UNBC_work/Courses/NRES-798/NRES_lab_1"
dataPath <- "H:/che/NRES_798_lab_1/test_data.csv"

# The path above is machine specific, so only read the file when it exists
if (file.exists(dataPath)) {
  dataReadIn <- read.csv(dataPath, header = TRUE)
} else {
  message("Data file not found, edit dataPath for your machine: ", dataPath)
}

# Implicit using the project workspace (working directory)
# Rstudio, Sessions, Set Working Directory
getwd() # Determine the path of the current working directory

# setwd() needs a directory, not a file: use the folder that holds the data file
dataDir <- dirname(dataPath)
if (file.exists(dataDir)) {
  setwd(dataDir) # Sets the path to the specified working directory
} else {
  message("Directory not found, working directory left unchanged: ", dataDir)
}
dir() # Outputs the files in the current directory

# TODO: change the directory to the root directory of your H drive
# TODO: list the files in your H drive root directory
# TODO: change the directory back to your NRES-798 directory (project folder)

#####################################
# Assignment in R
#####################################

# Three assignment methods ("<-","->","=")
# Be consistent and only use "<-"
a1 <- 15 # assign numeric scalar to a1
a2 <- 3
b1 <- TRUE # logical, assign b1 as true (T), no quotation
b2 <- FALSE
d <- "ecology" # assign character string to d

# Using R for simple numeric operations (calculator)
3 + 4 # direct operation
e <- 16 / 4 # output of operation assigned to object
# + - * / ^ %%
# TODO: what does %% do (e.g. 17 %% 5)

# TODO: what is produced by
# a1 + b1
# a1 + b2
# a1 / b1
# a1 / b2
# b2 / a1
# b2 / b2
# a2 + d

# The function mode() reveals the type of an object
mode(a1) # "numeric"
mode(b2) # "logical"
mode(d) # "character"

# Working directory vs. working environment
# Only objects that are brought into R's working environment are available for use
ls() # list all the objects in the working environment

# Individual objects can be removed from the environment with rm()
rm(a1) # removes a1 from the work space
# TODO: what now happens with
# e <- a1 + a2

#####################################
# vectors objects
#####################################

# vectors: a set of scalars arranged in a one-dimensional array
# vector assignment: c(), read c as concatenate or combine
v1 <- c(1, 3, 2, 5, 4)
v2 <- c(2, 3)
v3 <- c(v1, v2, v1) # vectors can also be combined
length(v1) # returns the length of the vector
length(v3)

# vector indexing: identifying and operating on vector components
v3
v3[4] # nth element of vector
v3[1:4] # elements 1 to 4 of vector
v3[c(1, 3, 5)] # specific elements of vector
v3[v3 > 4] # all elements that are greater than 4
v3[4:length(v3)] # fourth element to the end (length(v3)) of the vector
v3[v3 == 3] # all elements that equal (==) 3
v3[v3 != 3] # all elements that do not equal (!=) 3
v3[v3 != 3 & v3 > 2] # all elements that don't equal 3 and are greater than 2
v3[-2] # vector excluding element 2
v3[-c(1, 4, 6)] # vector excluding listed elements

# operating on vectors
a2
v1
nv1 <- v1 + a2
nv2 <- v2 * a2
# The next two lines deliberately combine vectors of length 5 and 2;
# R issues a warning here, which is part of the exercise
nv3 <- v1 + v2 # TODO: what is happening here?
nv4 <- v1 * v2 # TODO: what is happening here?
# altering the sequence of vectors
v4 <- c(1, 6, 5, 3, 4, 7, 2, 9, 10, 8)
v4s <- sort(v4) # sorts the vector
v4o <- order(v4) # TODO: what is returned and how does this differ from sort

# creating structured vectors
# (note: v4 is reused here and no longer holds the unsorted values above)
v4 <- rep(1, 10) # create a vector of 1's that is 10 elements long
v5 <- rep(1:5, 3) # create a vector of 1:5 replicated 3 times
v6 <- rep(1:5, each = 3) # vector of 1:5 with each number sequentially repeated 3 times
v7 <- rep(1:5, each = 3, times = 2)
v8 <- 1:20
v9 <- seq(1, 20, by = 2.34)

# factors: vectors where the elements are factors, used to code experiments
v10 <- rep(c("high", "med", "low", "veryLow"), each = 2) # create vector
v10 <- factor(v10) # transform vector in to factor

#####################################
# Matrix objects
#####################################

m1 <- matrix(1:12, nrow = 3, ncol = 4) # 3 by 4 matrix filled by column
m2 <- matrix(1:12, nrow = 3, ncol = 4, byrow = TRUE) # TODO: how does this differ?

# To determine the size of a matrix
dim(m2)

# Matrix indexing
m1
m1e1 <- m1[2, ] # Just row 2
m1e2 <- m1[, 3] # Just column 3
m1e3 <- m1[3, 1] # Just the element in row 3, column 1

# TODO: when extracting a subset of data from a matrix what is returned?
is.matrix(m1e1)
is.vector(m1e1)
is.numeric(m1e1)
# is.type() tests for type of object (is.vector(),is.array()...)
# Matrix operations
s1 <- 3
v1 <- c(1, 2, 3)
m1 <- matrix(1:12, nrow = 3, ncol = 4)

# TODO: what happens with each of these operations
m1a <- m1 + s1
m1b <- m1 * s1
m1c <- m1 + v1
m1d <- m1 * v1

# matrix transformation
m1
m2 <- t(m1)

# TODO: how does the operator * differ from %*%
m3 <- m2 %*% v1

#####################################
# List objects
#####################################

# Lists are the "everything" data objects
myList <- list(5, 4:8, c("not", "to", "hard"), m1)

# Understanding list indexing is important because many stats functions return a list
# List indexing [[]] for list element, following [] for element components
myList[2] # second element of list returned as type list
myList[[2]] # second element of list returned as vector
myList[[2]][3] # third component of second element of list
myList[[4]][2, ] # row 2 of the 4th element of the list (matrix)
myList[[3]][3]

# The structure of an object can be determined using the function str(object)
str(myList) # TODO: what does the returned information mean?

#####################################
# Data frame objects
#####################################

# Data frames equivalent to a spread sheet, and very common data storage type
# Each column of the data frame is a vector
# Each vector can be of a different mode (e.g. numeric, factor)
f1 <- rep(c("blue", "red", "green"), each = 3) # create vector of factor labels
f2 <- c(1, 5, 3, 7, 4, 8, 5, 2, 4) # create vector of data
# factor() is explicit so heatType is a factor regardless of the
# stringsAsFactors default of the R version in use
df1 <- data.frame(heatType = factor(f1), age = f2) # bind vectors into a data frame
dim(df1) # returns the size of the data frame
str(df1) # returns the structure of the data frame
names(df1) # returns the column "names" of the data frame
names(df1)[2] # specific names can be identified by including their number

# indexing data frames is like indexing a matrix
df1[2, ] # data frame components from row 2
df1[, 2] # data frame components from column 2

# indexing data frames using column names
df1$heatType # "$" operator specifies variables (column names) in the data frame
df1$heatType[3] # specifies the third element from variable heatType

# Saving data frames (and other data objects) from R-intro
# Saving all files as .csv data files is a very robust way to save data
#write.csv(x, file = "", row.names = FALSE)
write.csv(df1, file = "myDataTest.csv", row.names = FALSE)
# file = "myData.csv"
# By not including a pathway the data is saved to the set working directory
# file = "D:/Che/work/movement_data/spring2014.csv"
# Fully specifying the path can be less risky

# .csv file can then be read back into the R environment with
#read.csv(file, header = TRUE)
df2 <- read.csv("myDataTest.csv", header = TRUE)

#####################################
# Working with Data frames
#####################################

# Load the inbuilt data set "cars"
data(cars)
# TODO: determine the size of the data set
# TODO: determine the variables names

# Components of data frames can be extracted using the same framework as vectors
cars[cars$speed > 15, ] # extract only those rows where speed > 15, return all columns
cars[-c(1:12), 2] # remove the first 12 rows and return only the second column
# TODO: extract only the rows where the distance is exactly double the speed

# data frame subsets can also be extracted using the function subset
# subset(data,component to extract)
# (column names are looked up inside the data frame, so no "cars$" is needed)
cars1 <- subset(cars, speed > 19)

#####################################
# Basic data exploration
#####################################

# Load the inbuilt data set "trees"
data(trees)

# Some of the common summary statistic functions are...
# mean(), min(), max(), range(), var(), sd(), median(), quantile(,...)
# TODO: calculate the mean tree volume
# TODO: determine the minimum and maximum tree height
# TODO: determine the range of tree heights.
# TODO: calculate the sum of all tree Girths
# TODO: calculate the variance and standard deviation of tree volume

# help(): is a very powerful tool in R that provides information and examples for all functions
# e.g. help(mean)
# TODO: use help() to find out what additional arguments need to be included when using quantile().
# TODO: calculate the 25% (first) and 75% (third) quantiles for tree height
# TODO: what does the function summary(trees) return?

# The precision of data can be altered using round(), floor(), and ceiling()
round(trees$Girth, digits = 0) # Round the values to the nearest integer
# Column names are case sensitive: the column is "Volume", not "volume"
ceiling(trees$Volume) # move the data to next highest integer

#####################################
# Creating new functions
#####################################

# The generic structure for creating a new function is...
# functionName <- function(arg1, arg2..){
#   function calculations
# }

# Function to return the calculated sum of squares
# x: numeric vector of data
# returns: a single number, the sum of squared deviations of x from its mean
sumSquares <- function(x) {
  meanX <- mean(x) # calculate mean of data
  difX <- x - meanX # calculate the difference between the mean and each data point
  sqrDifX <- difX^2 # square the difference
  sumSqrDifX <- sum(sqrDifX) # sum the squared differences
  return(sumSqrDifX) # return the sum of squares
}

# TODO: create a function that takes a numerical vector, calculates the median,
#       and returns the square root of the median
# TODO: create a function that takes a 5 by 5 matrix, and a numerical scalar,
#       and returns a vector that contains all of the matrix values that are
#       fully divisible by the value (no remainder)

#####################################
# Initial data exploration and plotting
#####################################

# Load the data set iris
data(iris) # load the in built data set iris
names(iris) # list the variable names in iris
summary(iris) # provide an overview summary of the data set
str(iris) # provide more information on the structure of iris
unique(iris$Species) # Lists all of the unique levels of the factor

# plot() is the base level function for plotting data
# plot() what plot produces will depend on the type of object that it is passed
plot(iris) # plot function based on data frame object
plot(iris$Sepal.Length, iris$Sepal.Width) # plot of x vs. y

# Modify the number of plots that are shown in a figure
# par() returns the previous settings, which are kept so they can be restored
oldPar <- par(mfrow = c(2, 2)) # Change the number of plots to a 2 by 2
hist(iris$Sepal.Length) # Histogram of sepal length
hist(iris$Sepal.Width)
hist(iris$Petal.Length)
hist(iris$Petal.Width)
par(oldPar) # restore the previous layout so later plots use the full figure

# Produce box plot of sepal length for each species
plot(iris$Species, iris$Sepal.Length) # Data type determines that box plot is to be produced
boxplot(iris$Sepal.Length ~ iris$Species) # Explicit framework for producing the same thing

# TODO: plot Sepal.Width vs. Petal.Width
# TODO: rename the X and Y axis to be more informative
# TODO: add a main title at the top of the figure

# Exercises
# TODO: create a data file in excel,
#       save as a .csv file,
#       read the file into R,
#       output the size of the data frame,
#       list all of the data frame components,
#       create a new data frame that contains a subset of the original data frame,
#       save the new data frame as a .csv file