# Scott Merrill
# Oct 29
# linear and non-linear models


rm (list = ls(all = TRUE))

require(datasets)

# co2 data is monthly co2 concentration data in ppm from 1959 to 1997

data(co2)
plot(co2, main = "Mauna Loa Atmospheric CO2 Concentration")
co2
# 468 data points
length(co2)

# extract the January data from these data
jan.co2 = co2[seq(from = 1, to = 468, by = 12)]
length(jan.co2)

# check
co2
jan.co2

year = seq(1959,1997)
length(year)

plot(year,jan.co2)

mod1=lm(jan.co2~year)
summary(mod1)
abline(mod1$coefficients)
mtext("How does that fit?")

plot(mod1$residuals)
sum(mod1$residuals^2)

# Change year to 1 to 39
year = seq(from =1, to = 39)

# Doesn't change anything other than the x axis
plot(year,jan.co2)

mod1=lm(jan.co2~year)
summary(mod1)
abline(mod1$coefficients)
mtext("Doesn't change anything other than the x-axis")

#######
# Standard operating procedure is to linearize the data
######
# power transforms can be optimized using a boxcox tranformation 
# found in library (MASS)

# but I couldn't get my transformation to work so...

for (lambda in seq(from = 1.3, to = 1.5, by = .01)) {
	trans.year = year^lambda
	mod.fit=lm(jan.co2~trans.year)
	SSE = sum(mod.fit$residuals^2)
	print(c("SSE = ", SSE, "lambda = ", lambda))
}


trans.year = year^1.37

plot(trans.year,jan.co2)

mod2=lm(jan.co2~trans.year)
summary(mod2)
abline(mod2$coefficients)

plot(mod2$residuals)
sum(mod2$residuals^2)

# Interpretation of this linear model is...
plot(trans.year,jan.co2, main = "linear fit to transformed data is exceptional")
abline(mod2$coefficients)
mtext("Adjusted R-squared is psyck!", line = -1)
mtext(summary(mod2)[8],line = -2)

########
# What if you were interested in the non-linear nature of the model?
########
# I think it is exponential

mod.nls = nls(jan.co2~co2.base.value + b*year^gamma, 
	start = list(co2.base.value = 0, b = 1, gamma = 1))
summary(mod.nls)
coef(mod.nls)


# This tells us that the best fit is: 
fit.y = 315.19326 + 0.32334 *year^1.36955 
plot(year,fit.y, lwd=3, type = "l")
points(year,jan.co2, lwd=2, col = "red")

# What is the sum of squared error in this model
sum((fit.y-jan.co2)^2)

##################
# very similar fit. So why not just use the simple linear transformation? 
##############
rm(list = ls(all = TRUE))
require(datasets)
data(co2)
co2

# making a column of evenly spaced dates from 1959-1997 (actually year 1 to end of year 39)
# each month equals 1/12th of a year so jan of year 1 equals 1, february of year 1 is 1+ 1/12,
# 	march of year 1 = 1 + 2/12, on through december of year 1 = 1 + 11/12
z = seq(1,40-1/12, by = 1/12)
length(z) # Checking to confirm that my date vector is of the same length as my co2 vector

# creating and filling in a data matrix
all.co2 = matrix(nrow = 468, ncol = 2, data = 0)
all.co2[,1] = co2[seq(from = 1, to = 468, by = 1)]
all.co2[,2] = z

# Looking at the co2 data in a couple of different ways
plot(all.co2[,2],all.co2[,1], type = "l", main = "Mauna Loa Atmospheric CO2 Concentration")
plot(all.co2[,2],all.co2[,1], main = "Mauna Loa Atmospheric CO2 Concentration")


# creating a simple linear model
mod.lm.all= lm(all.co2[,1]~all.co2[,2])
# adding the model predictions to the plot
abline(mod.lm.all$coefficients, lwd = 3, col = "red")
sum(mod.lm.all$residuals^2)
####
# SSE =  3194.08
####


# creating an exponential model
mod.nls.exp = nls(all.co2[,1]~co2.base.value + b*all.co2[,2]^gamma, start = list(co2.base.value = 310, b = 1.3, gamma = 1))
summary(mod.nls.exp)

fit.line.exp=315.26545+  0.33431*all.co2[,2]^1.35882

# adding the exponential model predictions to the plot
points(all.co2[,2],fit.line.exp, type = "l", col = "blue", lwd = 3)

sum((fit.line.y - all.co2[,1])^2)
####
# SSE = 2138.026
####

mod.nls.complex = nls(all.co2[,1]~co2.base.value + b*all.co2[,2]^gamma + zeta*sin(all.co2[,2]*pi*2) , start = list(co2.base.value = 315, b = .3, gamma = 1, zeta = 7))
summary(mod.nls.complex)

fit.line.complex = 315.19481 +  0.33702*all.co2[,2]^ 1.35733 + 2.76377*sin(all.co2[,2]*pi*2)

# adding the complex non-linear model predictions to the plot
plot(all.co2[,2],fit.line.complex, type = "l", col = "green", lwd = 1)

sum((fit.line.complex - all.co2[,1])^2)

####
# SSE = 351.2988
####


# revisualizing the data as a line on the plot
points(all.co2[,2],all.co2[,1], type = "l", col = "black", lwd = 2)

# AIC values lower is better
# AIC uses a measure of fit (how much error exists)
# 	and penalizes models for increasing the number of parameters

AIC(mod.lm.all)

AIC(mod.nls.exp)

AIC(mod.nls.complex)