# Scott Merrill # Nov 25 # Data simulation rm(list = ls(all=TRUE)) par(mfrow = c(1, 1)) dataframe.new = as.dataframe(matrix) ############################################################## # categorical data #### # number.of.total.samples = 100 # Matricies have a problem with names and numbers. # Data frames do not seem to. large.woody.debris = NULL harvest.type=NULL sim.data = data.frame (large.woody.debris = large.woody.debris, harvest.type=harvest.type) sim.data # Harvest type suggests different distributions of # remaining large woody debris harvest.names = c("slash","burn","hack", "beetle colony","green logging") harvest.means = c(2,3,3,5,7) harvest.sd = c(1,1,1,1.5,2) harvest.samples = c(15,25,20,23,17) # the error above is variance within each harvest type. # also the possibility of additional error such as measurement error count = 1 for (x in 1:length(harvest.means)) { for (y in 1:harvest.samples[x]) { sim.data[count,1] = (rnorm(mean=harvest.means[x], sd=harvest.sd[x],n=1) # still want to add in measurement error + rnorm(mean = 0, sd = 1, n = 1)) sim.data[count,2] = harvest.names[x] count = count + 1 } } hist(sim.data[,1]) sim.data mod1 = glm(sim.data[,1] ~ sim.data[,2]-1) mod1 summary(mod1) ################################################################ # simple linear model ########## rm(list = ls(all = TRUE) results = numeric(1000) for (loop.test in 1:1000) { # blueberry yield is a function of mycorrhizal inoculation # yield = intercept (no fungi present) + # slope (the amount of change in yield by increasing fungi by 1%) # * %Inoculation (actual amount present) # Also, there is error in the system (ability to measure yield, # natural variability in yield within plants inoculation.treatment.names = c("natural","treated") # natural is estimated to be beta distributed with shape1 = 1,shape2 = 24 # hist(rbeta(n=10000,shape1 = 6, shape2=24)) # How does this distribution look? # treated is estimated to be beta distributed with shape1 = 4,shape2 = 16 # hist(rbeta(n=10000,shape1 = 8, shape2=2)) # How does this distribution look? # Creating a data matrix sim.data = matrix (nrow = 40, ncol = 3, data = 0, dimnames = list(1:40,c("yield","%Inoc","Error"))) colnames(sim.data) = c("yield","%Inoc","Error") rownames(sim.data) <- paste('plot', 1:length(sim.data[,1])) # column 1 is my yield, column 2 is my %inoculation, column 3 is saved for error # 40 rows based on 40 plots of data (this is only one year) # If I had three years of expected data I might make column 1 equal to 120 # 40 plots per year * 3 years = 120 plots or 120 rows) # and add a column to be used for a "year" categorical variable sim.data # first twenty rows are natural sim.data[c(1:20),2] sim.data[c(1:20),2] = rbeta(n=20,shape1 = 6, shape2=24) # second twenty rows are inoculated sim.data[c(21:40),2] sim.data[c(21:40),2] = rbeta(n=20,shape1 = 8, shape2=2) # intercept is the expected amount of yield with no mychorrhizae I estimate 20 lbs/plant intercept = 20 # slope of the mychorrhizae effect is the amount of change in yield from zero inoc to 100% inoc # mychorrhizae.slope = 3 lbs / plant (I estimte that inoculation could add ~ 15% yield to plants mychorrhizae.slope = 3 # also I need error # error in the system (ability to measure yield, natural variability in yield within plants) # mean = 0 (error could be plus or minus) # estimated to be ~ normal with with a sd = 4 which is stating that # ~95% of my plants yield between 12 and 28 lbs (without inoculation) sim.data[,3] = rnorm(mean = 0, sd = 4, n = length(sim.data[,3])) sim.data[,1] = intercept + mychorrhizae.slope*sim.data[,2] + sim.data[,3] sim.data # what do those data look like? # plot (sim.data[,2],sim.data[,1], ylab = "Yield", xlab="% Mychorrhizal Inoculation") # now the test! mod1 = lm(sim.data[,1] ~ sim.data[,2]) summary(mod1) # adding a line to the plot # abline(mod1$coef, lwd = 3, col = "cyan3") # I would loop through multiple iterations of this process (say 1000 times) # extract the p-value or f-stat # then determine fraction of time that a significant result was observed summary(mod1)[[4]][8] #provides p-value for %inoc (i.e., sim.data[,2]) results[loop.test] = summary(mod1)[[4]][8] } results sum(results<.05)/1000 # If I wanted to add in multiple years I would simulate error # to have one error value per year # to simulate good and bad years # e.g., year one is a good year with yield = yield + 2.2 for all plots; # year 2, yield = yield - 0.1 # year 3, yield = yield - 1.7 # error values for the years could be estimated using # rnorm(mean = 0, sd = ?, n = number of years) ########################################################################## # complex linear model ########## rm (list = ls(all = TRUE)) # yield is a function of nitrogen, water and heat units (degree days) # nitrogen distribution is a treatment with 4 different levels # water is from precip and is likely normal with a mean of around 20 # centimeters during the growing season, sd = 3 # degree days are derived from average daily temperature and are reasonably normal # with a mean of 1200 and a sd = 150 # yeild = intercept + function(degree days) + function(precip) # + function(nitrogen) + ERROR # yeild = intercept + beta1*degreedays + beta2*precip # + beta3*nitrogen + ERROR # first set up matrix and populate it with my # x (predictor, independent) variables # # columns 1 = yeild, 2 = applied N, 3 = N catergory, # 4 = PPT, 5 = Degree days yield = NULL Applied.N = NULL N.cat=NULL PPT = NULL DD=NULL sim.data = data.frame(yield = yield, Applied.N = Applied.N, N.cat=N.cat, PPT = PPT, DD=DD) Nitrogen.names = c("base","low","medium", "high", "validation") Nitrogen.means = c(30,80,100,120,100) Nitrogen.sd = c(2,5,6,7,6) Nitrogen.treatments = c(4,4,4,4,10) count = 1 for (x in 1:length(Nitrogen.means)) { for (y in 1:Nitrogen.treatments[x]) { sim.data[count,1] = 0 sim.data[count,2] = (rnorm(mean=Nitrogen.means[x], sd=Nitrogen.sd[x],n=1) # still want to add in measurement error + rnorm(mean = 0, sd = 2, n = 1)) sim.data[count,3] = Nitrogen.names[x] sim.data[count,4] = 0 sim.data[count,5] = 0 count = count + 1 } } hist(sim.data[c(1:16),1]) sim.data # rnorm for PPT values sim.data[,4] = rnorm(mean=20,sd=3,n = length(sim.data[,4])) # rnorm for Degree day values sim.data[,5] = rnorm(mean=1200,sd=150,n = length(sim.data[,5])) # test sim.data # yeild = intercept + function(degree days) + function(PPT) # + function(nitrogen) + ERROR # yeild = intercept + beta1*degreedays + beta2*PPT # + beta3*nitrogen + ERROR # yeild = 50 + (.05*(degreedays-1000) + 1.5*(PPT-10) # + .7*(Nitrogen-30) + ERROR #two ways to add the error # 1) directly in the formula # 2) as an error column in the matrix # directly in the formula sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) + 0.7*(sim.data[,2]-30) + rnorm(mean=0,sd=20,n=length(sim.data[,1]))) # in data.frame sim.data[,6] = rnorm(mean=0,sd=20,n=length(sim.data[,1])) sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) + 0.7*(sim.data[,2]-30) + sim.data[,6]) hist(sim.data[c(1:16),1]) # categorical data mod0 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),3] + sim.data[c(1:16),4] + sim.data[c(1:16),5])) summary(mod0) # continuous N mod1 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),2] + sim.data[c(1:16),4] + sim.data[c(1:16),5])) summary(mod1) # without weather covariates mod2 = glm(sim.data[c(1:16),1] ~ sim.data[c(1:16),2]) summary(mod2) # This gets really tricky AIC(mod0) AIC(mod1) AIC(mod2) ######################################################## # multimodel inference #### mod1$coefficients # sim.data[,7] = predictions for mod1 sim.data[,7] = (mod1$coefficients[1] + mod1$coefficients[2]*sim.data[,2] + mod1$coefficients[3]*sim.data[,4] + mod1$coefficients[4]*sim.data[,5]) sim.data[,8] = (sim.data[,1] - sim.data[,7])^2 MSE.mod1.val = sum(sim.data[c(17:26),8])/Nitrogen.treatments[5] MSE.mod1.val # sim.data[,9] = predictions for mod2 sim.data[,9] = (mod2$coefficients[1] + mod2$coefficients[2]*sim.data[,2]) sim.data[,10] = (sim.data[,1] - sim.data[,9])^2 MSE.mod2.val = sum(sim.data[c(17:26),10])/Nitrogen.treatments[5] MSE.mod2.val AIC(mod1) AIC(mod2) ##################################################################################### # simple non-linear ############## rm (list = ls(all=TRUE)) # blueberry yield is a function of mycorrhizal inoculation # yield = intercept (no fungi present) + # slope (the amount of change in yield by increasing fungi by 1%) # * %Inoculation (actual amount present) # Also, there is error in the system (ability to measure yield, # natural variability in yield within plants inoculation.treatment.names = c("natural","treated") # natural is estimated to be beta distributed with shape1 = 1,shape2 = 24 # hist(rbeta(n=10000,shape1 = 6, shape2=24)) # How does this distribution look? # treated is estimated to be beta distributed with shape1 = 4,shape2 = 16 # hist(rbeta(n=10000,shape1 = 8, shape2=2)) # How does this distribution look? # Creating a data matrix sim.data = matrix (nrow = 40, ncol = 3, data = 0, dimnames = list(1:40,c("yield","%Inoc","Error"))) colnames(sim.data) = c("yield","%Inoc","Error") rownames(sim.data) <- paste('plot', 1:length(sim.data[,1])) # column 1 is my yield, column 2 is my %inoculation, column 3 is saved for error # 40 rows based on 40 plots of data (this is only one year) # If I had three years of expected data I might make column 1 equal to 120 # 40 plots per year * 3 years = 120 plots or 120 rows) # and add a column to be used for a "year" categorical variable sim.data # first twenty rows are natural sim.data[c(1:20),2] sim.data[c(1:20),2] = rbeta(n=20,shape1 = 6, shape2=24) # second twenty rows are inoculated sim.data[c(21:40),2] sim.data[c(21:40),2] = rbeta(n=20,shape1 = 8, shape2=2) # intercept is the expected amount of yield with no mychorrhizae I estimate 20 lbs/plant intercept = 20 # Same error in y variable as in linear example sim.data[,3] = rnorm(mean = 0, sd = 4, n = length(sim.data[,3])) # what is the maximum effect that inoculation will have on a plant # what is the total saturation effect maximum.inoc.effect = 3 # estimating that 3 lbs is the maximum effect of fungal inoc # yield = 1 / (1 + exp(-%inoc) + Error # what does that look like? z = seq(-10,10,by = .1) plot(z,maximum.inoc.effect / (1 + exp(-z))) # in the above plot the half way point is at x = 0 with y = 1.5 # you need to figure out the half way point in your data # (when you want the curve to be half way towards saturation) half.way = .35 # 35% is my estimated half way point in this example halfway.index=1-half.way # to get that value to the zero point (e.g., in the latest plot) # note I am using 10 as my multiplier this changes the steepness of the curve # (how quickly it saturates), different numbers can be put in. sim.data[,1] = intercept + maximum.inoc.effect / (1 + exp(-10*(sim.data[,2]-halfway.index))) + sim.data[,3] sim.data mod1 = lm(sim.data[,1]~sim.data[,2]) summary(mod1) mod2 = nls(sim.data[,1]~alpha + beta2 / (1 + exp(-beta1*sim.data[,2])), start = list(alpha = 18, beta2 = 2.9, beta1= 10)) summary(mod2) mod3 = nls(sim.data[,1]~alpha + beta2*sim.data[,2], start = list(alpha = 18, beta2 = 2.9)) summary(mod3) ######################################################################## # non-linear a little more complex ######################### rm (list = ls(all = TRUE)) # yield is a function of nitrogen, water and heat units (degree days) # nitrogen distribution is a treatment with 4 different levels # water is from precip and is likely normal with a mean of around 20 # centimeters during the growing season, sd = 3 # degree days are derived from average daily temperature and are reasonably normal # with a mean of 1200 and a sd = 150 # yeild = intercept + function(degree days) + function(precip) # + ##SATURATING## function(nitrogen) + ERROR # yeild = intercept + beta1*degreedays + beta2*precip # + beta3*nitrogen + ERROR # first set up matrix and populate it with my # x (predictor, independent) variables # # columns 1 = yeild, 2 = applied N, 3 = N catergory, # 4 = PPT, 5 = Degree days yield = NULL Applied.N = NULL N.cat=NULL PPT = NULL DD=NULL sim.data = data.frame(yield = yield, Applied.N = Applied.N, N.cat=N.cat, PPT = PPT, DD=DD) Nitrogen.names = c("base","low","medium", "high", "validation") Nitrogen.means = c(30,80,100,120,100) Nitrogen.sd = c(2,5,6,7,6) Nitrogen.treatments = c(4,4,4,4,10) count = 1 for (x in 1:length(Nitrogen.means)) { for (y in 1:Nitrogen.treatments[x]) { sim.data[count,1] = 0 sim.data[count,2] = (rnorm(mean=Nitrogen.means[x], sd=Nitrogen.sd[x],n=1) # still want to add in measurement error + rnorm(mean = 0, sd = 2, n = 1)) sim.data[count,3] = Nitrogen.names[x] sim.data[count,4] = 0 sim.data[count,5] = 0 count = count + 1 } } hist(sim.data[c(1:16),1]) sim.data # rnorm for PPT values sim.data[,4] = rnorm(mean=20,sd=3,n = length(sim.data[,4])) # rnorm for Degree day values sim.data[,5] = rnorm(mean=1200,sd=150,n = length(sim.data[,5])) # test sim.data # yeild = intercept + function(degree days) + function(PPT) # + bizarre saturating function(nitrogen) + ERROR # yeild = intercept + beta1*degreedays + beta2*PPT # + (beta2*(1-exp(-K*sim.data[,2]))-beta3) + ERROR # yeild = 50 + (.05*(degreedays-1000) + 1.5*(PPT-10) # + (200*(1-exp(-.05*sim.data[,2]))-150) + ERROR #two ways to add the error # 1) directly in the formula # 2) as an error column in the matrix # directly in the formula sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) + (200*(1-exp(-.05*sim.data[,2]))-150) + rnorm(mean=0,sd=20,n=length(sim.data[,1]))) # in data.frame sim.data[,6] = rnorm(mean=0,sd=20,n=length(sim.data[,1])) sim.data[,1] =(50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) + (200*(1-exp(-.05*sim.data[,2]))-150) + sim.data[,6]) hist(sim.data[c(1:16),1]) # categorical data mod0 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),3] + sim.data[c(1:16),4] + sim.data[c(1:16),5])) summary(mod0) #### plot N vs yield plot(sim.data[,2],sim.data[,1], ylab = "Yield", xlab = "Nitrogen") # continuous N mod1 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),2] + sim.data[c(1:16),4] + sim.data[c(1:16),5])) summary(mod1) # without weather covariates mod2 = glm(sim.data[c(1:16),1] ~ sim.data[c(1:16),2]) summary(mod2) # non-linear model nls # for whatever reason, R doesn't like using a data frame in this case. Change to vectors yield = as.numeric(sim.data[c(1:16),1]) N = as.numeric(sim.data[c(1:16),2]) PPT = as.numeric(sim.data[c(1:16),4]) DD = as.numeric(sim.data[c(1:16),5]) # all variables with non-linear N mod3 = nls(yield~alpha + beta1*(1-exp(-K*N) + beta2*PPT + beta3*DD), start = list(K = .05, alpha = -200, beta1 = 190, beta2 = 1, beta3=.05)) summary(mod3) # Simple model with just N mod4 = nls(yield~alpha + beta1*(1-exp(-K*N)), start = list(K = .05, alpha = -100, beta1 = 190)) summary(mod4) Z = coef(mod4)[2] + coef(mod4)[3]*(1-exp(-coef(mod4)[1]*N)) plot(yield,Z) #N = seq(1:150) #Z = 200*(1-exp(-.05*N))-150 #plot(Z) #plot(N[30:120],Z[30:120])