# Setting the working directory (adjust the path to your own machine)
setwd("/Users/mats-ksp/Dropbox/Wszystko/Teaching/TaughtCourses/DataMiningVisualization/Labs/Lab3")

# Command-line installation and loading of packages
install.packages("ISLR")
library(ISLR)
install.packages("tree")  # Only if it was not installed before
library(tree)

# Read about the tree package on the documentation site of the R Project for Statistical Computing.
# Check https://www.r-project.org/ and https://cran.rstudio.com/web/packages/tree/tree.pdf
help(tree)

############################################
# Binary tree for a regression problem
install.packages("MASS")  # Only if it was not installed before
library(MASS)
help(Boston)

set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston = tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
# In the context of a regression tree, the deviance is simply the sum of
# squared errors for the tree.
plot(tree.boston)
text(tree.boston, pretty = 0)

# Cross-validation to choose the tree size, then pruning to 5 terminal nodes
cv.boston = cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")
prune.boston = prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

# Test-set predictions and MSE for the unpruned tree
yhat = predict(tree.boston, newdata = Boston[-train, ])
boston.test = Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)
mean((yhat - boston.test)^2)

# Test-set predictions and MSE for the pruned tree
yhat2 = predict(prune.boston, newdata = Boston[-train, ])
plot(yhat2, boston.test)
abline(0, 1)
mean((yhat2 - boston.test)^2)

############################################
# Boosting
install.packages("gbm")  # First time only
library(gbm)
help(gbm)
set.seed(1)
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
# Partial dependence plots for the two most important variables
par(mfrow = c(1, 2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")

# Testing phase and the value of the mean squared error
yhat.boost = predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)

# Changing the shrinkage parameter lambda
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
                   verbose = FALSE)
yhat.boost = predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)

############################################
# Bagging and Random Forests
install.packages("randomForest")  # Only if not done before
library(randomForest)
install.packages("MASS")  # Only if not done before
library(MASS)

# Setting the training data set
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)

# Bagging: mtry = 13 means all 13 predictors are tried at each split
set.seed(1)
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, importance = TRUE)
bag.boston

# Percentage of variability explained; 11.07993 is the out-of-bag mean of
# squared residuals reported in the bag.boston printout above
ss = var(Boston$medv)
(ss - 11.07993)/ss

# Importance measures and graphs
importance(bag.boston)
help(importance)
help(varImpPlot)
varImpPlot(bag.boston)

# Testing the bagged model on the held-out data
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
# The test mean squared error
mean((yhat.bag - boston.test)^2)

# Reduced number of trees
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, ntree = 25)
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)

# Turning to random forests: mtry = 6 considers only a random subset of
# the predictors at each split, which decorrelates the trees
set.seed(1)
rf.boston = randomForest(medv ~ ., data = Boston, subset = train,
                         mtry = 6, importance = TRUE)
yhat.rf = predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
importance(rf.boston)
varImpPlot(rf.boston)
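
############################################
# Optional extra: a quick check of the comment above that, for a
# regression tree, the deviance is the sum of squared errors. This is a
# minimal sketch that recomputes the quantity by hand from the fitted
# values; it assumes tree.boston and train still exist in the session.
fitted.train = predict(tree.boston, newdata = Boston[train, ])
sum((Boston[train, "medv"] - fitted.train)^2)
# This total should equal the "Residual mean deviance" reported by
# summary(tree.boston) multiplied by its degrees of freedom.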
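
# Optional extra: instead of fixing n.trees = 5000, gbm can cross-validate
# the number of boosting iterations itself via its cv.folds argument.
# A sketch under the same training split as above; cv.folds = 5 is an
# arbitrary illustrative choice, and the refit takes noticeably longer.
set.seed(1)
boost.cv = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
               n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
               cv.folds = 5, verbose = FALSE)
best.iter = gbm.perf(boost.cv, method = "cv")  # plots the CV error curve and returns the best iteration
yhat.cv = predict(boost.cv, newdata = Boston[-train, ], n.trees = best.iter)
mean((yhat.cv - boston.test)^2)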
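
# Optional extra: the lab uses mtry = 13 (bagging) and mtry = 6 (random
# forest). A simple way to see how the test MSE depends on mtry is to loop
# over all 13 candidate values. Illustrative sketch only: it refits 13
# forests on the same split, so it takes a while to run.
mtry.mse = numeric(13)
for (m in 1:13) {
  set.seed(1)
  fit.m = randomForest(medv ~ ., data = Boston, subset = train, mtry = m)
  mtry.mse[m] = mean((predict(fit.m, newdata = Boston[-train, ]) - boston.test)^2)
}
plot(1:13, mtry.mse, type = "b", xlab = "mtry", ylab = "Test MSE")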
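
# Optional wrap-up: gather the test MSEs computed above into one named
# vector for a side-by-side comparison. This sketch assumes the whole
# script was run in order, so each yhat* object holds the predictions of
# the most recent fit of that model (e.g. yhat.boost comes from the
# shrinkage = 0.2 boosting run and yhat.bag from the ntree = 25 run).
test.mse = function(pred) mean((pred - boston.test)^2)
round(c(full.tree     = test.mse(yhat),
        pruned.tree   = test.mse(yhat2),
        boosting      = test.mse(yhat.boost),
        bagging       = test.mse(yhat.bag),
        random.forest = test.mse(yhat.rf)), 2)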