# NRES 798 201501 Lab 3 # Lab objectives # Ploting complex figures (review) # Evaluate the properties of different probability distribution # Examine the Central limit theorem using different base probability distributions # Learn how to fitting probability distributions to given data distributions # Learn how to test whether a distribution is normally distributed # calcualte correlations between independent data points # libraries to be used in Lab 3 # install.packages("moments") # Downloads and instals the package in the local uses R library (library location can be modified) install.packages("MASS") install.packages("vcd") install.packages("PerformanceAnalytics") library(moments) # Finds the package in the library and loads the functions into the R environment library(MASS) library(vcd) library(PerformanceAnalytics) ##################################### # Ploting more complex figures ##################################### # Saving ploted figures data(cars) plot(cars$speed,cars$dist) # Plots the figure in the default device (device 1) pdf("Figure1.pdf",width=7,height=7) # Opens a new device (pdf figure). The full path to the figure can be given plot(dist ~ speed, data=cars) # Plots the data to the device (same as above but using different notation) dev.off() # Closes the device that was being writen to # Always close devices (dev.off()) once the plotting has been completed (can be multiple commands sent to device) # Common ploting devices (file formats): pdf, png, jpeg, tiff, svg # pdf produces good figues but are hard to modify # For publication figures using the svg device (Scalable Vector Graphic) is recomended # Once created the .svg file can be opened with a vector graphics editor and modified # A good vector graphics editor is Inkscape: https://inkscape.org/en/ # In Inkscape all components of the figure can be modiifed # The polished figure can then be exported as a bitmap in any required resolution (e.g. 900 dpi) # TODO: explore what the arguments bg, and qualitly do for the various devices # Note that width and height units differ between devices # Ploting multipel figures in a nicer framework # Multiple plots can be specified by using the par(mfrow=c(x,y)) framework # However a more flexible approach is to use the layout function layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4) layout_matrix # Four plots arranged as quadrants layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure layout.show(4) # display the figure structure # Example of plotting with layout pdf("four_plot_cars.pdf") layout(layout_matrix, heights = c(2,2,1,1), width = c(1,1,2,2)) # Define the layout structure of the figure speed_hist <- hist(cars$speed,plot = FALSE) dist_hist <- hist(cars$dist,plot = FALSE) top = max(c(speed_hist$counts, dist_hist$counts)) barplot(speed_hist$counts, axes=FALSE, space=0,horiz=TRUE) plot(0,0,type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n") # TODO: what is this plot comand doing? plot(cars$dist,cars$speed,xlab="Distance",ylab="Speed",cex.lab=1.5) barplot(dist_hist$counts, axes=FALSE, space=0) dev.off() # TODO: Load the "iris" data into R # TODO: Subset the data and create a 2 column by 4 row plot that shows a histogram of Sepal length, width, and Petal Length width # for setosa and Virginica. Make sure the x and y axis for the histrograms are the same ##################################### # Exploring distributions ##################################### # Distribution functions- d,p,r,q realizations: e.g. pnorm, dnorm, qnorm,rnorm # (p) probability distribution function (pdf) # (d) probability density function (cdf) # (q) quantile function # (r) random number generation # Example of a normal distribution test_mean <- 5 test_sd <- 1 test_range <- seq(0,10,0.01) norm_distribution <- dnorm(test_range,mean = test_mean, sd =test_sd) norm_density <- pnorm(test_range,mean = test_mean, sd =test_sd) norm_quantile <- qnorm(c(0.10,0.25,0.5,0.75,0.9),mean = test_mean, sd = test_sd) norm_random <- rnorm(100,mean = test_mean, sd = test_sd) par(mfrow=c(2,2)) plot(test_range,norm_distribution,type="l") plot(test_range,norm_density,type="l") hist(norm_random) plot(c(0.10,0.25,0.5,0.75,0.9),norm_quantile,type="p") # TODO: what is the relationship between pnorm and dnorm? # TODO: how does each plot respond to increasing the sd # TODO: for each of the distributions (Binomial, Poisson, Negative binomial, Normal, Log-normal, Weibull) # create a 4 by 4 matrix of plots that displays how each distribution responds to changes in the relevant arguments (parameters) ##################################### # Central Limit Theorem ##################################### # Examine the validity of the Central Limit Theorem using a uniform distribution hist(runif(10000)*10,main="Uniform") # histogram of 1000 random draws from Uniform random distribution n <- 10000 # number of means that are sampled nsamples <- 5 # number of draws from the underlying distribution from which to calculate the mean means<- numeric(n) # define a vector to fill for(i in 1:n){ # Loop means[i] <- mean(runif(nsamples)*10) # Calculate the mean of a number of random draws from a distribution } hist(means,ylim=c(0,1600)) # Plot a histogram of the means mean(means) # Return the mean of the means sd(means) # Return the standard deviaation from the means xv <- seq(0,10,0.1) # Create a sequence from zero to 10 yv <- dnorm(xv,mean=mean(means),sd=sd(means))*5000 # lines(xv,yv) # TODO: Using the Uniform distribution, examine how the Central Limit Theorem responds to changes in the number of means sampled (n) # TODO: Examing the impact of altering the number of random draws that are used to calculate the mean (nsamples). # TODO: Create scripts that examing the Central Limit Theorem when drawing from a Poisson and Weibull distribution. # TODO: Does sensitivity to changes in n and nsamples chage with differet distributions? # TODO: Across what parameter space do the Binomial, POisson and Weibull distributions approximate a normal distribution # TODO: write function that calculates the mean, variance, skew and Kurtosis of a sammple vector and returns the results as a list ##################################### # Fitting distributions in R ##################################### # Fitting distribution involves four steps: # 1. Model/function choice: family of distributions # 2. Estimating parameters # 3. Evaluating quality of fitting # 4. Goodness of fit statistical test # # 1. Distribution selection # Analyse data to deteremine what familes of distributions may be appropriate # a: mean, variance, skew, kurtosis # b: plot y values, box plot, histrogram, output of data base properties. ynorm <- rlnorm(100,meanlog = 4,sdlog =0.4) # creeate a distribution to test layout_matrix <- matrix(c(rep(c(1,2),times=2,each=2),rep(c(3,4),times=2,each=2)),ncol=4) layout(layout_matrix, heights = c(1,1,1,1), width = c(1,1,1,1)) # Define the layout structure of the figure layout.show(4) # display the figure structure hist(ynorm, probability = TRUE) boxplot(ynorm) plot(ynorm) plot(c(0,10),c(0,10),type="n",xaxt="n",yaxt="n",xlab="",ylab="",bty="n") ymean <- round(mean(ynorm),2) yvar <- round(var(ynorm),2) yskewness <- round(skewness(ynorm),2) ykurtosis <- round(kurtosis(ynorm),2) text(1,8,paste("Mean = ",ymean,sep=""),pos=4,cex=2) text(1,6,paste("Variance = ",yvar,sep=""),pos=4,cex=2) text(1,4,paste("Skewness = ",yskewness,sep=""),pos=4,cex=2) text(1,2,paste("Kurtosis = ",ykurtosis,sep=""),pos=4,cex=2) # TODO: Convert the above code into a function that performs the calculations and prints the figure to a pdf device # 2. Estimating parameters # # Parameter estimates can be obtained in two ways # a) Analogic method: estimating parameter values from the "sample" population and assuming these correspond to the population parameters # e.g. # ymean = mean(ynorm) # ysd <- sd(ynorm) # # b) Using maximum likelihood methods to estimate the population parameters from sample data. # Methods for maximu likelihood estimation are included in the package "MASS". # The "fitdistr()" function takes the sample data vector, and a specification of the distribution as arguments and returns the mle parameter estimates. ymle <- fitdistr(ynorm,"normal") # TODO: what type of object does ymle return? # TODO: Extract the mean and sd from the object as numerics # TODO: Create random normal, log-normal, Poisson, and weibull data sets (n = 50) and estimate the parameters using Analogic and maximum likelihood methods # 3. Evaluating quality of distribution fits # The suitability of a fitted distribution can be evaluated graphically by ploting the distribution pdf on top of the histogram, # or by using a Quantile-Quatile plot that compares the observed vs. theoretical quantiles # ploting estimated distributions over histograms # Normal distribution hist(ynorm,main="",xlab="observed values",col="grey91",prob=TRUE) # plot a histogram of the sample data curve(dnorm(x,mean=ymean,sd=ysd),add=TRUE,col="red") # Plot the pdf of the distriibution using estimated parameters # TODO: Randomly generate Poisson ditributed data (n=150) with a mean of 3.4 # TODO: create a histogram of the created data # TODO: Use maximum likelihood methods to estimate the Poisson parameter # TODO: plot the assumed Poisson pdf on top of the histogram # TODO: Using another color, plot a fitted Gausian distribution to the same data # Using Quantile-Quantile plots to evaluate distribution fits # Comparing sample data to a normal distribution using QQ plots can simply be done by using the function qqnorm qqnorm(ynorm) # Takes the sample data and plots the sample quantiles against the theoretical pdf quantiles qqline(ynorm,lty=2) # adds a line to the theoretical quantile plot. Deviation of the sample data from this line identifies areas where the fit is not good. # In order to compare other distributions the qqplot function can be used x.wei <- rweibull(n=100,shape=2.1,scale= 1.1) # Random sample data from a weibull distribution x.theo <- rweibull(n=1000,shape = 2.0, scale = 1.0) # Theoretical population with "known" parameter values used qqplot(x.theo,x.wei,main="QQ-plot distr. Weibull") abline(0,1) # a 45-degree reference line # 4. Goodness of fit and normality tests. # The most common test is determining whether data is normally distributed # A common, and powerfull, test for normality is the Shapiro-Wilk test shapiro.test(ynorm) # The W value is the test statistic, if the p-value is < 0.05 we should consider rejecting the null hypothesis # Testing the goodness of fit of discrete (count data) distributions is best done with the function goodfi() from the vcd library # testing for good ness of fit for a log-normal distribution gf <- goodfit(rpois(100,2),type="poisson", method = "MinChisq") summary(gf) plot(gf,main="Count data vs Poisson distribution") ##################################### # Examining correlations between independent variables ##################################### data(iris) names(iris) # Correlations between variables can be calculated using the function cor() ycor <- cor(iris$Sepal.Length,iris$Petal.Length,method = c("pearson")) # The package PerformanceAnalytics contains some easy functions for examining correlations with a data set chart.Correlation(iris[,1:4],histogram = TRUE) #################################### # Work ################################### # TODO: Create a dummy data set (data.frame) that contains 1 responce variable and 4 independent variables. # The number of independent observations should be 100 # Create the responce variable by randomly drawing frmo a normal distribution # Create the independent variables by randomly drawing from a negative exponeial, gamma, Poisson and uniform distributions # Save the data frame as a .csv file # Analysis the newly created data set to determine the basic structure of the data # i.e. Calculate moments for all variables, plot histogram, plot barplot, plot raw data, test for corrrelations between the inependent variables