# Scott Merrill
# Nov 25
# Data simulation

rm(list = ls(all=TRUE))
par(mfrow = c(1, 1))

dataframe.new = as.dataframe(matrix)

##############################################################
# categorical data
####


# number.of.total.samples = 100
# Matricies have a problem with names and numbers. 
#    Data frames do not seem to. 
large.woody.debris = NULL
harvest.type=NULL
sim.data = data.frame (large.woody.debris = large.woody.debris, 
	harvest.type=harvest.type)

sim.data

# Harvest type suggests different distributions of 
#	remaining large woody debris

harvest.names = c("slash","burn","hack",
		"beetle colony","green logging")
harvest.means = c(2,3,3,5,7)
harvest.sd = c(1,1,1,1.5,2)
harvest.samples = c(15,25,20,23,17)

# the error above is variance within each harvest type.
# also the possibility of additional error such as measurement error

count = 1
for (x in 1:length(harvest.means)) { 
	for (y in 1:harvest.samples[x]) {
		sim.data[count,1] = (rnorm(mean=harvest.means[x],
			sd=harvest.sd[x],n=1)
			# still want to add in measurement error
			+ rnorm(mean = 0, sd = 1, n = 1))
		sim.data[count,2] = harvest.names[x]		
		count = count + 1
	}
}
hist(sim.data[,1])
sim.data

mod1 = glm(sim.data[,1] ~ sim.data[,2]-1)
mod1
summary(mod1)

################################################################
# simple linear model
##########

rm(list = ls(all = TRUE)


results = numeric(1000)
for (loop.test in 1:1000) {

# blueberry yield is a function of mycorrhizal inoculation
# yield = intercept (no fungi present) + 
#	slope (the amount of change in yield by increasing fungi by 1%)
#	* %Inoculation (actual amount present)
#	Also, there is error in the system (ability to measure yield, 
#	natural variability in yield within plants


inoculation.treatment.names = c("natural","treated")
# natural is estimated to be beta distributed with shape1 = 1,shape2 = 24
# hist(rbeta(n=10000,shape1 = 6, shape2=24)) # How does this distribution look?

# treated is estimated to be beta distributed with shape1 = 4,shape2 = 16
# hist(rbeta(n=10000,shape1 = 8, shape2=2)) # How does this distribution look?

# Creating a data matrix
sim.data = matrix (nrow = 40, ncol = 3, data = 0, dimnames = list(1:40,c("yield","%Inoc","Error")))
colnames(sim.data) = c("yield","%Inoc","Error")
rownames(sim.data) <- paste('plot', 1:length(sim.data[,1]))
# column 1 is my yield, column 2 is my %inoculation, column 3 is saved for error
# 40 rows based on 40 plots of data (this is only one year)
# If I had three years of expected data I might make column 1 equal to 120
#	40 plots per year * 3 years = 120 plots or 120 rows)
#	and add a column to be used for a "year" categorical variable
sim.data

# first twenty rows are natural sim.data[c(1:20),2] 
sim.data[c(1:20),2] = rbeta(n=20,shape1 = 6, shape2=24)

# second twenty rows are inoculated sim.data[c(21:40),2] 
sim.data[c(21:40),2] = rbeta(n=20,shape1 = 8, shape2=2)

# intercept is the expected amount of yield with no mychorrhizae I estimate 20 lbs/plant
intercept = 20
# slope of the mychorrhizae effect is the amount of change in yield from zero inoc to 100% inoc
#	mychorrhizae.slope = 3 lbs / plant (I estimte that inoculation could add ~ 15% yield to plants
mychorrhizae.slope = 3
# also I need error
# error in the system (ability to measure yield, natural variability in yield within plants)
# mean = 0 (error could be plus or minus)
# estimated to be ~ normal with with a sd = 4 which is stating that 
#	~95% of my plants yield between 12 and 28 lbs (without inoculation)
sim.data[,3] = rnorm(mean = 0, sd = 4, n = length(sim.data[,3]))

sim.data[,1] = intercept + mychorrhizae.slope*sim.data[,2] + sim.data[,3]
sim.data

# what do those data look like? 
# plot (sim.data[,2],sim.data[,1], ylab = "Yield", xlab="% Mychorrhizal Inoculation")

# now the test!

mod1 = lm(sim.data[,1] ~ sim.data[,2])
summary(mod1)

# adding a line to the plot
# abline(mod1$coef, lwd = 3, col = "cyan3")

# I would loop through multiple iterations of this process (say 1000 times)
# 	extract the p-value or f-stat
# 	then determine fraction of time that a significant result was observed
summary(mod1)[[4]][8] #provides p-value for %inoc (i.e., sim.data[,2])

results[loop.test] = summary(mod1)[[4]][8]
}

results
sum(results<.05)/1000
# If I wanted to add in multiple years I would simulate error 
# 	to have one error value per year 
#	to simulate good and bad years
#	e.g., year one is a good year with yield = yield + 2.2 for all plots; 
#		year 2, yield = yield - 0.1
#		year 3, yield = yield - 1.7
# 	error values for the years could be estimated using 
#		rnorm(mean = 0, sd = ?, n = number of years)

##########################################################################
# complex linear model
##########

rm (list = ls(all = TRUE))

# yield is a function of nitrogen, water and heat units (degree days)
# nitrogen distribution is a treatment with 4 different levels
# water is from precip and is likely normal with a mean of around 20
#    centimeters during the growing season, sd = 3
# degree days are derived from average daily temperature and are reasonably normal
#    with a mean of 1200 and a sd = 150

# yeild = intercept + function(degree days) + function(precip)
#		+ function(nitrogen) + ERROR
# yeild = intercept + beta1*degreedays + beta2*precip 
#		+ beta3*nitrogen + ERROR

# first set up matrix and populate it with my 
#	x (predictor, independent) variables
# 	

# columns 1 = yeild, 2 = applied N, 3 = N catergory,
#		4 = PPT, 5 = Degree days
yield = NULL
Applied.N = NULL
N.cat=NULL
PPT = NULL
DD=NULL
sim.data = data.frame(yield = yield, Applied.N = Applied.N, N.cat=N.cat,
				PPT = PPT, DD=DD)

Nitrogen.names = c("base","low","medium",
		"high", "validation")
Nitrogen.means = c(30,80,100,120,100)
Nitrogen.sd = c(2,5,6,7,6)
Nitrogen.treatments = c(4,4,4,4,10)

count = 1
for (x in 1:length(Nitrogen.means)) { 
	for (y in 1:Nitrogen.treatments[x]) {
		sim.data[count,1] = 0
		sim.data[count,2] = (rnorm(mean=Nitrogen.means[x],
			sd=Nitrogen.sd[x],n=1)
			# still want to add in measurement error
			+ rnorm(mean = 0, sd = 2, n = 1))
		sim.data[count,3] = Nitrogen.names[x]
			
		sim.data[count,4] = 0
		sim.data[count,5] = 0			
		count = count + 1
	}
}
hist(sim.data[c(1:16),1])
sim.data

# rnorm for PPT values
sim.data[,4] = rnorm(mean=20,sd=3,n = length(sim.data[,4]))
# rnorm for Degree day values
sim.data[,5] = rnorm(mean=1200,sd=150,n = length(sim.data[,5]))

# test
sim.data

# yeild = intercept + function(degree days) + function(PPT)
#		+ function(nitrogen) + ERROR
# yeild = intercept + beta1*degreedays + beta2*PPT 
#		+ beta3*nitrogen + ERROR

# yeild = 50 + (.05*(degreedays-1000) + 1.5*(PPT-10) 
#		+ .7*(Nitrogen-30) + ERROR

#two ways to add the error
#	1) directly in the formula
#	2) as an error column in the matrix

# directly in the formula
sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) 
		+ 0.7*(sim.data[,2]-30) + 
		rnorm(mean=0,sd=20,n=length(sim.data[,1])))

# in data.frame
sim.data[,6] = rnorm(mean=0,sd=20,n=length(sim.data[,1]))

sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) 
		+ 0.7*(sim.data[,2]-30) + sim.data[,6])

hist(sim.data[c(1:16),1])

# categorical data
mod0 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),3] + 
		sim.data[c(1:16),4] + sim.data[c(1:16),5]))
summary(mod0)

# continuous N
mod1 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),2] + 
		sim.data[c(1:16),4] + sim.data[c(1:16),5]))
summary(mod1)

# without weather covariates
mod2 = glm(sim.data[c(1:16),1] ~ sim.data[c(1:16),2])
summary(mod2)

# This gets really tricky
AIC(mod0)
AIC(mod1)
AIC(mod2)


########################################################
# multimodel inference
####

mod1$coefficients

# sim.data[,7] = predictions for mod1
sim.data[,7] = (mod1$coefficients[1] + mod1$coefficients[2]*sim.data[,2]
		+ mod1$coefficients[3]*sim.data[,4]  
		+ mod1$coefficients[4]*sim.data[,5])

sim.data[,8] = (sim.data[,1] - sim.data[,7])^2
MSE.mod1.val = sum(sim.data[c(17:26),8])/Nitrogen.treatments[5]
MSE.mod1.val


# sim.data[,9] = predictions for mod2
sim.data[,9] = (mod2$coefficients[1] + mod2$coefficients[2]*sim.data[,2])

sim.data[,10] = (sim.data[,1] - sim.data[,9])^2
MSE.mod2.val = sum(sim.data[c(17:26),10])/Nitrogen.treatments[5]
MSE.mod2.val

AIC(mod1)
AIC(mod2)


#####################################################################################
# simple non-linear
##############
rm (list = ls(all=TRUE))


# blueberry yield is a function of mycorrhizal inoculation
# yield = intercept (no fungi present) + 
#	slope (the amount of change in yield by increasing fungi by 1%)
#	* %Inoculation (actual amount present)
#	Also, there is error in the system (ability to measure yield, 
#	natural variability in yield within plants


inoculation.treatment.names = c("natural","treated")
# natural is estimated to be beta distributed with shape1 = 1,shape2 = 24
# hist(rbeta(n=10000,shape1 = 6, shape2=24)) # How does this distribution look?

# treated is estimated to be beta distributed with shape1 = 4,shape2 = 16
# hist(rbeta(n=10000,shape1 = 8, shape2=2)) # How does this distribution look?

# Creating a data matrix
sim.data = matrix (nrow = 40, ncol = 3, data = 0, dimnames = list(1:40,c("yield","%Inoc","Error")))
colnames(sim.data) = c("yield","%Inoc","Error")
rownames(sim.data) <- paste('plot', 1:length(sim.data[,1]))
# column 1 is my yield, column 2 is my %inoculation, column 3 is saved for error
# 40 rows based on 40 plots of data (this is only one year)
# If I had three years of expected data I might make column 1 equal to 120
#	40 plots per year * 3 years = 120 plots or 120 rows)
#	and add a column to be used for a "year" categorical variable
sim.data

# first twenty rows are natural sim.data[c(1:20),2] 
sim.data[c(1:20),2] = rbeta(n=20,shape1 = 6, shape2=24)

# second twenty rows are inoculated sim.data[c(21:40),2] 
sim.data[c(21:40),2] = rbeta(n=20,shape1 = 8, shape2=2)

# intercept is the expected amount of yield with no mychorrhizae I estimate 20 lbs/plant
intercept = 20

# Same error in y variable as in linear example
sim.data[,3] = rnorm(mean = 0, sd = 4, n = length(sim.data[,3]))

# what is the maximum effect that inoculation will have on a plant 
#	what is the total saturation effect
maximum.inoc.effect = 3 # estimating that 3 lbs is the maximum effect of fungal inoc

# yield = 1 / (1 + exp(-%inoc) + Error
# what does that look like? 
z = seq(-10,10,by = .1)
plot(z,maximum.inoc.effect / (1 + exp(-z)))

# in the above plot the half way point is at x = 0 with y = 1.5
# you need to figure out the half way point in your data 
#	(when you want the curve to be half way towards saturation)
half.way = .35 # 35% is my estimated half way point in this example
halfway.index=1-half.way # to get that value to the zero point (e.g., in the latest plot) 
# note I am using 10 as my multiplier this changes the steepness of the curve
# (how quickly it saturates), different numbers can be put in. 

sim.data[,1] = intercept + maximum.inoc.effect / (1 + exp(-10*(sim.data[,2]-halfway.index))) + sim.data[,3]
sim.data

mod1 = lm(sim.data[,1]~sim.data[,2])
summary(mod1)

mod2 = nls(sim.data[,1]~alpha + beta2 / (1 + exp(-beta1*sim.data[,2])), 
	start = list(alpha = 18, beta2 = 2.9, beta1= 10))
summary(mod2)

mod3 = nls(sim.data[,1]~alpha + beta2*sim.data[,2], 
	start = list(alpha = 18, beta2 = 2.9))
summary(mod3)



########################################################################
# non-linear a little more complex
#########################

rm (list = ls(all = TRUE))

# yield is a function of nitrogen, water and heat units (degree days)
# nitrogen distribution is a treatment with 4 different levels
# water is from precip and is likely normal with a mean of around 20
#    centimeters during the growing season, sd = 3
# degree days are derived from average daily temperature and are reasonably normal
#    with a mean of 1200 and a sd = 150

# yeild = intercept + function(degree days) + function(precip)
#		+ ##SATURATING## function(nitrogen) + ERROR
# yeild = intercept + beta1*degreedays + beta2*precip 
#		+ beta3*nitrogen + ERROR

# first set up matrix and populate it with my 
#	x (predictor, independent) variables
# 	

# columns 1 = yeild, 2 = applied N, 3 = N catergory,
#		4 = PPT, 5 = Degree days
yield = NULL
Applied.N = NULL
N.cat=NULL
PPT = NULL
DD=NULL
sim.data = data.frame(yield = yield, Applied.N = Applied.N, N.cat=N.cat,
				PPT = PPT, DD=DD)

Nitrogen.names = c("base","low","medium",
		"high", "validation")
Nitrogen.means = c(30,80,100,120,100)
Nitrogen.sd = c(2,5,6,7,6)
Nitrogen.treatments = c(4,4,4,4,10)

count = 1
for (x in 1:length(Nitrogen.means)) { 
	for (y in 1:Nitrogen.treatments[x]) {
		sim.data[count,1] = 0
		sim.data[count,2] = (rnorm(mean=Nitrogen.means[x],
			sd=Nitrogen.sd[x],n=1)
			# still want to add in measurement error
			+ rnorm(mean = 0, sd = 2, n = 1))
		sim.data[count,3] = Nitrogen.names[x]
			
		sim.data[count,4] = 0
		sim.data[count,5] = 0			
		count = count + 1
	}
}
hist(sim.data[c(1:16),1])
sim.data

# rnorm for PPT values
sim.data[,4] = rnorm(mean=20,sd=3,n = length(sim.data[,4]))
# rnorm for Degree day values
sim.data[,5] = rnorm(mean=1200,sd=150,n = length(sim.data[,5]))

# test
sim.data

# yeild = intercept + function(degree days) + function(PPT)
#		+ bizarre saturating function(nitrogen) + ERROR
# yeild = intercept + beta1*degreedays + beta2*PPT 
#		+ (beta2*(1-exp(-K*sim.data[,2]))-beta3) + ERROR

# yeild = 50 + (.05*(degreedays-1000) + 1.5*(PPT-10) 
#		+ (200*(1-exp(-.05*sim.data[,2]))-150) + ERROR

#two ways to add the error
#	1) directly in the formula
#	2) as an error column in the matrix

# directly in the formula
sim.data[,1] =( 50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) 
		+ (200*(1-exp(-.05*sim.data[,2]))-150) + 
		rnorm(mean=0,sd=20,n=length(sim.data[,1])))

# in data.frame
sim.data[,6] = rnorm(mean=0,sd=20,n=length(sim.data[,1]))

sim.data[,1] =(50 + .05*(sim.data[,5]-1000) + 1.5*(sim.data[,4]-10) 
		+ (200*(1-exp(-.05*sim.data[,2]))-150) + sim.data[,6])

hist(sim.data[c(1:16),1])

# categorical data
mod0 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),3] + 
		sim.data[c(1:16),4] + sim.data[c(1:16),5]))
summary(mod0)

#### plot N vs yield
plot(sim.data[,2],sim.data[,1], ylab = "Yield", xlab = "Nitrogen")


# continuous N
mod1 = glm(sim.data[c(1:16),1] ~ (sim.data[c(1:16),2] + 
		sim.data[c(1:16),4] + sim.data[c(1:16),5]))
summary(mod1)

# without weather covariates
mod2 = glm(sim.data[c(1:16),1] ~ sim.data[c(1:16),2])
summary(mod2)


# non-linear model nls

# for whatever reason, R doesn't like using a data frame in this case. Change to vectors
yield = as.numeric(sim.data[c(1:16),1])
N = as.numeric(sim.data[c(1:16),2])
PPT = as.numeric(sim.data[c(1:16),4])
DD = as.numeric(sim.data[c(1:16),5])

# all variables with non-linear N
mod3 = nls(yield~alpha + beta1*(1-exp(-K*N) + beta2*PPT + beta3*DD),
	start = list(K = .05, alpha = -200, beta1 = 190, beta2 = 1, beta3=.05))
summary(mod3)

# Simple model with just N
mod4 = nls(yield~alpha + beta1*(1-exp(-K*N)),
	start = list(K = .05, alpha = -100, beta1 = 190))
summary(mod4)

Z = coef(mod4)[2] + coef(mod4)[3]*(1-exp(-coef(mod4)[1]*N))
plot(yield,Z)



#N = seq(1:150)
#Z =  200*(1-exp(-.05*N))-150
#plot(Z)
#plot(N[30:120],Z[30:120])