# Setting the working directory (adjust the path to your own machine)
setwd("/Users/mats-ksp/Dropbox/Wszystko/Teaching/TaughtCourses/DataMiningVisualization/Labs/Lab3")

# Command-line installation and loading of packages
install.packages("ISLR")
library(ISLR)
install.packages("tree")  # Only if it was not installed before
library(tree)

# Read about the tree package on the documentation site of the R Project for Statistical Computing.
# Check https://www.r-project.org/ and https://cran.rstudio.com/web/packages/tree/tree.pdf
help(tree)

############################################
# Binary tree for a regression problem
install.packages("MASS")  # Only if it was not installed before
library(MASS)
help(Boston)

set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston = tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
# In the context of a regression tree, the deviance is simply the sum of
# squared errors for the tree.
plot(tree.boston)
text(tree.boston, pretty = 0)

# Cross-validation to choose the tree size, then pruning to 5 terminal nodes
cv.boston = cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")
prune.boston = prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

# Test-set predictions and MSE for the unpruned tree
yhat = predict(tree.boston, newdata = Boston[-train, ])
boston.test = Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)
mean((yhat - boston.test)^2)

# Test-set predictions and MSE for the pruned tree
yhat2 = predict(prune.boston, newdata = Boston[-train, ])
plot(yhat2, boston.test)
abline(0, 1)
mean((yhat2 - boston.test)^2)

############################################
# Boosting
install.packages("gbm")  # First time only
library(gbm)
help(gbm)
set.seed(1)
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
# Partial dependence plots for the two most important variables
par(mfrow = c(1, 2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")

# Testing phase and the value of the mean squared error
yhat.boost = predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)

# Changing the shrinkage parameter lambda
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
                   verbose = FALSE)
yhat.boost = predict(boost.boston, newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)

############################################
# Bagging and Random Forests
install.packages("randomForest")  # Only if not done before
library(randomForest)
install.packages("MASS")  # Only if not done before
library(MASS)

# Setting the training data set
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)

# Bagging: mtry = 13 means all 13 predictors are tried at each split
set.seed(1)
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, importance = TRUE)
bag.boston

# Percentage of variability explained; 11.07993 is the out-of-bag mean of
# squared residuals reported in the bag.boston printout above
ss = var(Boston$medv)
(ss - 11.07993)/ss

# Importance measures and graphs
importance(bag.boston)
help(importance)
help(varImpPlot)
varImpPlot(bag.boston)

# Testing the bagged model on the held-out data
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
# The test mean squared error
mean((yhat.bag - boston.test)^2)

# Reduced number of trees
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, ntree = 25)
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)

# Turning to random forests: mtry = 6 considers only a random subset of
# the predictors at each split, which decorrelates the trees
set.seed(1)
rf.boston = randomForest(medv ~ ., data = Boston, subset = train,
                         mtry = 6, importance = TRUE)
yhat.rf = predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
importance(rf.boston)
varImpPlot(rf.boston)
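
############################################
# Optional extra: a quick check of the comment above that, for a
# regression tree, the deviance is the sum of squared errors. This is a
# minimal sketch that recomputes the quantity by hand from the fitted
# values; it assumes tree.boston and train still exist in the session.
fitted.train = predict(tree.boston, newdata = Boston[train, ])
sum((Boston[train, "medv"] - fitted.train)^2)
# This total should equal the "Residual mean deviance" reported by
# summary(tree.boston) multiplied by its degrees of freedom.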
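
# Optional extra: instead of fixing n.trees = 5000, gbm can cross-validate
# the number of boosting iterations itself via its cv.folds argument.
# A sketch under the same training split as above; cv.folds = 5 is an
# arbitrary illustrative choice, and the refit takes noticeably longer.
set.seed(1)
boost.cv = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
               n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
               cv.folds = 5, verbose = FALSE)
best.iter = gbm.perf(boost.cv, method = "cv")  # plots the CV error curve and returns the best iteration
yhat.cv = predict(boost.cv, newdata = Boston[-train, ], n.trees = best.iter)
mean((yhat.cv - boston.test)^2)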
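
# Optional extra: the lab uses mtry = 13 (bagging) and mtry = 6 (random
# forest). A simple way to see how the test MSE depends on mtry is to loop
# over all 13 candidate values. Illustrative sketch only: it refits 13
# forests on the same split, so it takes a while to run.
mtry.mse = numeric(13)
for (m in 1:13) {
  set.seed(1)
  fit.m = randomForest(medv ~ ., data = Boston, subset = train, mtry = m)
  mtry.mse[m] = mean((predict(fit.m, newdata = Boston[-train, ]) - boston.test)^2)
}
plot(1:13, mtry.mse, type = "b", xlab = "mtry", ylab = "Test MSE")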
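
# Optional wrap-up: gather the test MSEs computed above into one named
# vector for a side-by-side comparison. This sketch assumes the whole
# script was run in order, so each yhat* object holds the predictions of
# the most recent fit of that model (e.g. yhat.boost comes from the
# shrinkage = 0.2 boosting run and yhat.bag from the ntree = 25 run).
test.mse = function(pred) mean((pred - boston.test)^2)
round(c(full.tree     = test.mse(yhat),
        pruned.tree   = test.mse(yhat2),
        boosting      = test.mse(yhat.boost),
        bagging       = test.mse(yhat.bag),
        random.forest = test.mse(yhat.rf)), 2)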