#####
# Setting the working directory
setwd("/home/podgorsk/Dropbox/Wszystko/Teaching/TaughtCourses/DataMiningVisualization/Labs/Lab2")
####
# Non-linear Modeling

# Splines
library(splines)  # 'splines' ships with base R, so no install.packages() call is needed

# Command-line based installation and activation of packages
install.packages("ISLR")
library(ISLR)
attach(Wage)
dim(Wage)
Wage[1,]
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
dim(bs(age,knots=c(25,40,60)))
dim(bs(age,df=6))
attr(bs(age,df=6),"knots")  # These are the knots chosen by the B-spline method
fit2=lm(wage~ns(age,df=4),data=Wage)  # Natural cubic spline
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid,pred2$fit,col="red",lwd=2)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)  # Degrees of freedom chosen by cross-validation
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

#######
# GAMs
install.packages("gam")
library(gam)
help(gam)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
summary(gam.m3)
par(mfrow=c(1,3))
plot(gam.m3,se=TRUE,col="blue")
preds=predict(gam.m3,newdata=Wage)
dev.off()
plot(1:length(preds),wage-preds)  # Residuals of the GAM fit

######
# Classification tree
attach(Carseats)   # Makes the variables of the data set under study directly accessible
help(Carseats)     # Some information about the data
summary(Carseats)  # Some descriptive statistics on the data
sapply(Carseats,class)
str(Carseats)
High=as.factor(ifelse(Sales<=8,"No","Yes"))  # Classification variable; as.factor() is needed
                                             # since R >= 4.0 no longer converts strings to factors
Carseats2=data.frame(Carseats,High)          # Adding this variable to our set
sapply(Carseats2,class)
str(Carseats2)
Carseats2$High
install.packages("tree")  # Only if it was not installed before
library(tree)
# Read about the tree package on the R documentation site of the R Project for Statistical Computing:
# check `https://www.r-project.org/' and `https://cran.rstudio.com/web/packages/tree/tree.pdf'
help(tree)
# Building a binary tree for classification
tree.carseats=tree(High~.-Sales,Carseats2)
tree.carseats2=tree(High~CompPrice+Income+Advertising+Population+Price+ShelveLoc+Age+Education+Urban+US,Carseats2)
summary(tree.carseats)
summary(tree.carseats2)
# Note the two different ways of specifying the explanatory variables Xi.
# Experiment with the data to see how to choose the variables used in the analysis effectively.
# First, the tree itself
plot(tree.carseats,type="uniform")
# Then the labels
text(tree.carseats,pretty=0)
# You can play with a few settings to make 'pretty' really pretty, as in:
plot(tree.carseats,type="uniform",col="dark blue")
text(tree.carseats,pretty=0,col="dark red",cex=0.5)
plot(tree.carseats,type="uniform",col="dark blue")
text(tree.carseats,pretty=0,col="dark red",cex=0.5,adj=c(.5,-5))
# The argument pretty=0 instructs R to include the category names for any qualitative
# predictors, rather than simply displaying a letter for each category.
# adj shifts the location of the text in x and y.
Carseats2[2,]
# In the case of a classification tree, the argument type="class" instructs R to return
# the actual class prediction.
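# A minimal sketch of type="class" applied to the whole training set (using the
# tree.carseats and Carseats2 objects built above): a confusion matrix of the
# fitted classes and the resulting training misclassification rate.
train.pred=predict(tree.carseats,Carseats2,type="class")
table(train.pred,Carseats2$High)  # confusion matrix on the training data
mean(train.pred!=Carseats2$High)  # training misclassification rate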
predict(tree.carseats,Carseats2[2,],type="class")
predict(tree.carseats,Carseats2[2,])  # Without type="class" you get the class probabilities
predict(tree.carseats,Carseats2[10,],type="class")
predict(tree.carseats,Carseats2[50,],type="class")

# Predicting on fresh data
a=Carseats2[1,]
str(a)
a[1,1:6]=c(5,98,60,5,80,100)
a[1,8:9]=c(35,6)
a[1,7]="Bad"
a[1,10:12]=c("No","Yes","No")
# Note that for factors a[1,...] does not work the same way as a[...], so one should
# assign factor and numerical values separately.
predict(tree.carseats,a,type="class")

# Testing vs. training error
set.seed(2)  # So that you draw the same sample each time; setting the seed is not truly needed
train=sample(1:nrow(Carseats2),200)  # Recall what the function "sample()" is doing. What is "nrow()"?
Carseats2.test=Carseats2[-train,]    # Note how you exclude the training data
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats2,subset=train)
tree.pred=predict(tree.carseats,Carseats2.test,type="class")
table(tree.pred,High.test)
# Do not run 'set.seed(2)' for the bootstrap study of the prediction rate.

# Cross-validation and pruning
help(prune.misclass)
prune.carseats=prune.misclass(tree.carseats,best=3)  # Compare with the help result
plot(prune.carseats)
text(prune.carseats,pretty=0)
set.seed(3)  # To fix the random sample used in the CV process; not needed in general
help(cv.tree)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
# Note that the misclassification error is used as the loss function guiding the
# cross-validation process.
names(cv.carseats)
cv.carseats
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
# What would you do next to complete the data mining of the above dataset?
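# One reasonable next step (a sketch, not the only answer): prune the tree to the
# size that minimizes the cross-validated deviance found above, then re-check the
# error on the held-out test set.
best.size=cv.carseats$size[which.min(cv.carseats$dev)]
prune.best=prune.misclass(tree.carseats,best=best.size)
prune.pred=predict(prune.best,Carseats2.test,type="class")
table(prune.pred,High.test)
mean(prune.pred!=High.test)  # test misclassification rate of the pruned tree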
######
# Regression tree for volatility vs. return
# Example of statistical analysis vs. data mining: volatility smile and leverage effect

# Reading the data frame from a file
VD=read.csv('VolatilityData.csv')
VD=VD[,c(2,3)]
#pdf('VD.pdf')
plot(VD,pch='.')
#dev.off()
pdf('RFit.pdf')
rf=lm(VD$Volatility~VD$Shocks)
summary(rf)
plot(VD,pch='.')
abline(rf,col='red')
dev.off()
VD$Volatility
n=nrow(VD)  # as.matrix() is not needed to count the rows of a data frame

#####
install.packages("tree")  # Only if it was not installed before
library(tree)             # See the notes on the tree package in the classification section above
help(tree)
tree.volatility=tree(Volatility~Shocks,VD,mindev=0.0003,subset=1:floor(n/2))
summary(tree.volatility)
pdf('rich_tree.pdf')
plot(tree.volatility)
text(tree.volatility,adj=c(0.5,5.5),cex=0.35)
dev.off()
yhat=predict(tree.volatility,newdata=VD)
sqrt(mean((VD$Volatility-yhat)^2))
pdf('overfit_tree.pdf')
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
dev.off()

# Validation phase
# Validation: selecting k, the pruning parameter for the original tree to be pruned
validate_ind=(floor(n/2)+1):floor(3*n/4)
VVD=VD[validate_ind,]
yhat2=predict(tree.volatility,newdata=VVD)
sqrt(mean((VVD$Volatility-yhat2)^2))
pt=prune.tree(tree.volatility)
K=length(pt$k)
stds=vector('numeric',K-3)
for(j in 2:(K-2)) {
  pts=prune.tree(tree.volatility,k=pt$k[j])
  yhat2=predict(pts,VVD)
  stds[j-1]=sqrt(mean((VVD$Volatility-yhat2)^2))
}
pdf('SDS.pdf')
plot(stds)
dev.off()
plot(stds[75:80])
j=80  # optimal
pts=prune.tree(tree.volatility,k=pt$k[j])
plot(pts)
text(pts,adj=c(0.5,0.5),cex=1)
yhat2=predict(pts,newdata=VD)
pdf("RegTree.pdf")
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
points(VD$Shocks,yhat2,col='red')
dev.off()

# Testing phase
test_ind=(floor(3*n/4)+1):n
TVD=VD[test_ind,]
yhat3=predict(pts,TVD)
sqrt(mean((TVD$Volatility-yhat3)^2))

# Regressing against a quadratic function
X=cbind(VD$Shocks,VD$Shocks^2)
rf1=lm(VD$Volatility~X)
a=rf1$coefficients
xx=seq(-3,3,by=0.01)
pdf('AllFits.pdf')
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
points(VD$Shocks,yhat2,col='red')
lines(xx,a[1]+a[2]*xx+a[3]*xx^2,col='green',lwd=4)
dev.off()
summary(rf1)
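# A tidier alternative (a sketch): fitting the quadratic with poly() lets predict()
# handle fresh data, so its test RMSE can be compared directly with the pruned
# tree's. Here the quadratic is fit on the same training half that the tree used.
rf2=lm(Volatility~poly(Shocks,2,raw=TRUE),data=VD[1:floor(n/2),])
yhat4=predict(rf2,newdata=TVD)
sqrt(mean((TVD$Volatility-yhat4)^2))  # compare with the tree's test RMSE above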