#Sex: # 0 = female, # 1 = male #ChestPain: # 1 = typical angina, # 2 = atypical angina, # 3 = non-anginal pain, # 4 = asymptomatic #Fbs: fasting blood sugar greater than 120 mg/dl, # 1 = TRUE, # 0 = FALSE #RestECG: resting electrocardiograph # 1 = normal # 2 = having ST-T wave abnormality # 3 = showing probable or definite left ventricular hypertrophy #ExAng # exercise induced angina # 1 = yes, # 0 = no #Oldpeak: ST depression induced by exercise relative to rest #Slope: the slope of the peak exercise ST segment # 1 = upsloping # 2 = flat # 3 = downsloping #Ca: number of major vessels (0-3) colored by fluoroscopy #Thal: thalium heart scan # 3 = normal (no cold spots) # 6 = fixed defect (cold spots during rest and exercise) # 7 = reversible defect (when cold spots only appear during exercise) #AHD: diagnosis of heart disease # 0 if less than or equal to 50% diameter narrowing # 1 if greater than 50% diameter narrowing source("http://www.uvm.edu/~rsingle/stat3880/data/scripts-3880.R") library(ISLR) library(tree) Heart <-read.csv('http://www.uvm.edu/~rsingle/stat3880/S24/data/Heart.csv') dim(Heart) #14 vars: 13 indep vars + dependent var Heart[!complete.cases(Heart),] Heart = Heart[complete.cases(Heart),] #needed for Bagging/RF #class() of variables determines how treated Heart$AHD = factor(Heart$AHD) Heart$Sex = factor(Heart$Sex) Heart$Thal = factor(Heart$Thal) Heart$ChestPain = factor(Heart$ChestPain) par(mfrow=c(2,3)) boxplot(Age~AHD, data=Heart,ylab="Age") boxplot(RestBP~AHD, data=Heart, ylab="RestBP") boxplot(Chol~AHD, data=Heart, ylab="Chol") boxplot(MaxHR~AHD, data=Heart, ylab="MaxHR") boxplot(Oldpeak~AHD, data=Heart, ylab="Oldpeak") par(mfrow=c(2,4)) mosaicplot(~AHD+Sex, data=Heart,main="") mosaicplot(~AHD+ChestPain, data=Heart, main="") mosaicplot(~AHD+Fbs, data=Heart, main="") mosaicplot(~AHD+RestECG, data=Heart, main="") mosaicplot(~AHD+ExAng, data=Heart,main="") mosaicplot(~AHD+Slope, data=Heart,main="") mosaicplot(~AHD+Thal, data=Heart,main="") mosaicplot(~AHD+Ca, data=Heart,main="") par(mfrow=c(1,1)) #Create training and test set set.seed(1) #set.seed(2) train = sample(dim(Heart)[1], dim(Heart)[1]/2) Heart.train=Heart[train,] Heart.test=Heart[-train,] #Build the tree on training data #tree.heart=tree(AHD ~., Heart.train) tree.heart=tree(AHD ~., Heart,subset=train) plot(tree.heart) text(tree.heart,pretty=0) #Get predictions on test set from full tree tree.pred=predict(tree.heart,Heart.test,type="class") tab <- table(tree.pred,Heart.test$AHD) tab (tab[1,1]+tab[2,2])/sum(tab) #% of correct predictions #Compare with a pruned tree cv.heart=cv.tree(tree.heart,FUN=prune.misclass) cv.heart plot(cv.heart$size,cv.heart$dev,type="b") prune.heart=prune.misclass(tree.heart,best=6) plot(prune.heart) text(prune.heart,pretty=0) #Get % of correct predictions for pruned tree tree.pred=predict(prune.heart,Heart.test,type="class") tab <- table(tree.pred,Heart.test$AHD) tab (tab[1,1]+tab[2,2])/sum(tab) #=============================== # Bagging and Random Forests #=============================== library(randomForest) #Bagging bag.heart = randomForest(AHD ~ ., data = Heart.train, mtry = 13, ntree = 500, importance = T) bag.pred = predict(bag.heart, Heart.test) #Get predictions on test set from bagged tree tab <- table(bag.pred,Heart.test$AHD) tab (tab[1,1]+tab[2,2])/sum(tab) #% of correct predictions importance(bag.heart) varImpPlot(bag.heart) #Random Forest rf.heart = randomForest(AHD ~ ., data = Heart.train, mtry = 4, ntree = 500, importance = T) rf.heart = randomForest(AHD ~ ., data = Heart.train, mtry = 3, ntree = 500, importance = T) rf.pred = predict(rf.heart, Heart.test) #Get predictions on test set from random forest tab <- table(rf.pred,Heart.test$AHD) tab (tab[1,1]+tab[2,2])/sum(tab) #% of correct predictions importance(rf.heart) varImpPlot(rf.heart) #=============================== #Boosting #=============================== library(gbm) #convert response to 0/1 Heart$AHD <- ifelse(Heart$AHD=="Yes", 1, 0) boost.heart=gbm(AHD~.,data=Heart[train,],distribution="bernoulli",n.trees=5000,interaction.depth=3,shrinkage=.001) summary(boost.heart, las=1) yhat.boost=predict(boost.heart, newdata=Heart[-train,], n.trees=5000) yhat.boost[1:10] #not looking like great predictions #Find optimal n.trees via cross-validation boost.heart=gbm(AHD~.,data=Heart[train,],distribution="bernoulli",n.trees=5000,interaction.depth=3,shrinkage=.001,cv.folds=5) summary(boost.heart, las=1) best.ntrees <- gbm.perf(boost.heart, method="cv", plot.it=TRUE) phat.boost=predict(boost.heart, type="response", newdata=Heart[-train,], n.trees=best.ntrees) phat.boost[1:10] yhat.boost = ifelse(phat.boost > 0.5, "Yes","No") yhat.boost[1:10] tab <- table(yhat.boost, Heart$AHD[-train]) (tab[1,1]+tab[2,2])/sum(tab) #% of correct predictions #Tree - full: [1] 0.7248322 #Tree - pruned: [1] 0.7315436 #Bagged trees: [1] 0.8120805 #Random Forest: [1] 0.8456376 #Boosting: [1] 0.8523490