Problem 1

library(rpart)          #classification and regression trees
library(partykit)       #treeplots
## Loading required package: grid
library(MASS)           #breast and pima indian data
library(ElemStatLearn)  #prostate data
library(randomForest)   #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(xgboost)        #gradient boosting 
library(caret)          #tune hyper-parameters
## Loading required package: lattice
library(rattle)         #fancy tree plots (fancyRpartPlot)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:xgboost':
## 
##     xgboost
## The following object is masked from 'package:randomForest':
## 
##     importance
library(rpart.plot)     #enhanced tree plots
library(RColorBrewer)   #color palettes
library(party)          #Alternative decision tree algorithm
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'party'
## The following objects are masked from 'package:partykit':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
attach(iris)
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
set.seed(123)
tree.iris <- rpart(Species ~ Sepal.Width + Petal.Width, data = iris)
print(tree.iris$cptable)
##     CP nsplit rel error xerror       xstd
## 1 0.50      0      1.00   1.21 0.04836666
## 2 0.44      1      0.50   0.76 0.06123180
## 3 0.01      2      0.06   0.07 0.02583280
cp1 <- tree.iris$cptable[3, "CP"]  #select the CP column by name; min() over the whole row only works by accident
prune.tree.iris <- prune(tree.iris, cp = cp1)
plot(as.party(prune.tree.iris))
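
Hard-coding row 3 of the CP table works for this tree, but a more defensive sketch selects the CP at the minimum cross-validated error instead of a fixed row:

#pick the CP whose cross-validated error (xerror) is smallest
cp.min <- tree.iris$cptable[which.min(tree.iris$cptable[, "xerror"]), "CP"]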

rparty.test <- predict(prune.tree.iris, newdata = iris, type = "class")
table(rparty.test, iris$Species)
##             
## rparty.test  setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
(50+49+45)/(50+49+45+6)
## [1] 0.96

The misclassification error is 4%.
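
The same number can be computed from the confusion matrix directly, which avoids typing the counts by hand:

conf <- table(rparty.test, iris$Species)
sum(diag(conf)) / sum(conf)  #accuracy; the misclassification rate is 1 minus this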

pairs(iris[, c("Sepal.Width", "Petal.Width")], pch = 21, bg = c("red", "green", "blue")[unclass(Species)])

Q: Build a decision tree where Species is a function of all four variables: Sepal.Width, Petal.Width, Sepal.Length, and Petal.Length. Plot the tree. Comment on the misclassification rate of this new tree.

tree.iris2 <- rpart(Species ~ ., data = iris)
cp2 <- tree.iris2$cptable[3, "CP"]
prune.tree.iris2 <- prune(tree.iris2, cp = cp2)
plot(as.party(prune.tree.iris2))

rparty.test2 <- predict(prune.tree.iris2, newdata = iris, type = "class")
table(rparty.test2, iris$Species)
##             
## rparty.test2 setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
(50+49+45)/(50+49+45+6)
## [1] 0.96

The misclassification error is again 4%, the same result we obtained with only Petal.Width and Sepal.Width.
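
To see why adding the sepal variables changes nothing, we can inspect the importance scores rpart stores on the fitted object; the splits are still driven by the petal measurements (a quick check, not part of the original question):

tree.iris2$variable.importance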

Problem 2

library(MASS)
attach(Pima.te)
pima <- Pima.te
set.seed(123) #random number generator
#create training and testing sets
ind <- sample(2, nrow(pima), replace = TRUE, prob = c(0.7, 0.3))
pima.train <- pima[ind == 1, ] #the training data set
pima.test <- pima[ind == 2, ] #the test data set
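As an aside, sample() gives only an approximate 70/30 split. caret (loaded above) offers a stratified alternative that preserves the class balance of type; the object names here are illustrative:

idx <- createDataPartition(pima$type, p = 0.7, list = FALSE)
pima.train.strat <- pima[idx, ]
pima.test.strat <- pima[-idx, ]
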
set.seed(123)
rf.pima <- randomForest(type ~ ., data = pima.train)
rf.pima
## 
## Call:
##  randomForest(formula = type ~ ., data = pima.train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 21.94%
## Confusion matrix:
##      No Yes class.error
## No  136  20   0.1282051
## Yes  32  49   0.3950617

The OOB error rate is 21.94%, which is quite high when the outcome we are predicting is a disease.
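
With seven predictors, the default mtry is floor(sqrt(7)) = 2. As a sketch, randomForest::tuneRF searches over mtry using the OOB error; the improve threshold below is an arbitrary choice:

set.seed(123)
tuneRF(pima.train[, names(pima.train) != "type"], pima.train$type,
       ntreeTry = 500, stepFactor = 2, improve = 0.01)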

plot(rf.pima)

ntree <- which.min(rf.pima$err.rate[, 1])
ntree
## [1] 150

We need 150 trees to minimize the OOB error rate.
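
The OOB error at that tree count can be read straight from the err.rate matrix (its columns are OOB plus one per class):

rf.pima$err.rate[ntree, "OOB"]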

rf.pima.2 <- randomForest(type ~ ., data = pima.train, ntree = ntree)
rf.pima.2
## 
## Call:
##  randomForest(formula = type ~ ., data = pima.train, ntree = ntree) 
##                Type of random forest: classification
##                      Number of trees: 150
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 21.52%
## Confusion matrix:
##      No Yes class.error
## No  136  20   0.1282051
## Yes  31  50   0.3827160
rf.pima.predict <- predict(rf.pima.2, newdata = pima.test, type = "response")
table(rf.pima.predict, pima.test$type)
##                
## rf.pima.predict No Yes
##             No  56  13
##             Yes 11  15
(56+15)/(56+15+11+13)
## [1] 0.7473684

The accuracy on the test set is about 74.7%.
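
Raw accuracy hides the asymmetry that matters for disease screening. caret's confusionMatrix reports sensitivity and specificity as well; treating "Yes" as the positive class is our choice here:

confusionMatrix(rf.pima.predict, pima.test$type, positive = "Yes")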

varImpPlot(rf.pima.2)
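
For the numeric importance values behind the plot, note that rattle masks importance (see the package startup messages above), so the call should be namespace-qualified:

randomForest::importance(rf.pima.2)  #mean decrease in Gini per variable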

Problem 3

library(ISLR)
data(Carseats)
attach(Carseats)
High <- with(Carseats, factor(ifelse(Sales <= 8, "No", "Yes")))  #factor, so rpart treats it as a class label
Carseats <- data.frame(Carseats, High)
set.seed(217)

#create training and testing sets
ind3 <- sample(2, nrow(Carseats), replace = TRUE, prob = c(0.7, 0.3))
carseats.train <- Carseats[ind3 == 1, ]
carseats.test <- Carseats[ind3 == 2, ]
set.seed(123)
tree.carseats <- rpart(High ~ .-Sales, data = carseats.train)
tree.carseats$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.23577236      0 1.0000000 1.0000000 0.06833647
## 2 0.12195122      1 0.7642276 0.9105691 0.06733512
## 3 0.05691057      2 0.6422764 0.8536585 0.06647365
## 4 0.04065041      3 0.5853659 0.7560976 0.06456772
## 5 0.03658537      4 0.5447154 0.7560976 0.06456772
## 6 0.03252033      6 0.4715447 0.7479675 0.06438326
## 7 0.01219512      8 0.4065041 0.7317073 0.06400203
## 8 0.01000000     10 0.3821138 0.7642276 0.06474813
cp3 <- tree.carseats$cptable[7, "CP"]
prune.tree.carseats <- prune(tree.carseats, cp = cp3)
fancyRpartPlot(prune.tree.carseats)
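
Row 7 is where xerror bottoms out, but picking it by eye is fragile. A sketch of the standard 1-SE rule, which may choose an even simpler tree than the xerror minimum:

cpt <- tree.carseats$cptable
best <- which.min(cpt[, "xerror"])
within.1se <- cpt[, "xerror"] <= cpt[best, "xerror"] + cpt[best, "xstd"]
cp.1se <- cpt[which(within.1se)[1], "CP"]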

rparty.test3 <- predict(prune.tree.carseats, newdata = carseats.test, type = "class")
table(rparty.test3, carseats.test$High)
##             
## rparty.test3 No Yes
##          No  65   9
##          Yes  5  32

The test-set accuracy is

(65+32)/(65+9+5+32)
## [1] 0.8738739

i.e. a misclassification rate of about 12.6%.
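
As a sanity check (no output shown), the same accuracy can be computed without hand-typed counts and compared against the unpruned tree:

mean(rparty.test3 == carseats.test$High)   #pruned-tree accuracy
unpruned.test <- predict(tree.carseats, newdata = carseats.test, type = "class")
mean(unpruned.test == carseats.test$High)  #unpruned, for comparison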