#####
# Setting the working directory
setwd("/home/podgorsk/Dropbox/Wszystko/Teaching/TaughtCourses/DataMiningVisualization/Labs/Lab2")
####
# Non-linear Modeling

# Splines
library(splines)  # 'splines' ships with base R, so no install.packages() call is needed

# Command-line based installation and activation of packages
install.packages("ISLR")
library(ISLR)
attach(Wage)
dim(Wage)
Wage[1,]
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
dim(bs(age,knots=c(25,40,60)))
dim(bs(age,df=6))
attr(bs(age,df=6),"knots")  # These are the knots chosen by the B-spline method
fit2=lm(wage~ns(age,df=4),data=Wage)  # Natural cubic spline
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid,pred2$fit,col="red",lwd=2)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)  # Degrees of freedom chosen by cross-validation
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

#######
# GAMs
install.packages("gam")
library(gam)
help(gam)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
summary(gam.m3)
par(mfrow=c(1,3))
plot(gam.m3,se=TRUE,col="blue")
preds=predict(gam.m3,newdata=Wage)
dev.off()
plot(1:length(preds),wage-preds)  # Residuals of the GAM fit

######
# Classification tree
attach(Carseats)   # Makes the variables of the data set under study directly accessible
help(Carseats)     # Some information about the data
summary(Carseats)  # Some descriptive statistics on the data
sapply(Carseats,class)
str(Carseats)
High=as.factor(ifelse(Sales<=8,"No","Yes"))  # Classification variable; as.factor() is needed
                                             # since R >= 4.0 no longer converts strings to factors
Carseats2=data.frame(Carseats,High)          # Adding this variable to our set
sapply(Carseats2,class)
str(Carseats2)
Carseats2$High
install.packages("tree")  # Only if it was not installed before
library(tree)
# Read about the tree package on the R documentation site of the R Project for Statistical Computing:
# check `https://www.r-project.org/' and `https://cran.rstudio.com/web/packages/tree/tree.pdf'
help(tree)
# Building a binary tree for classification
tree.carseats=tree(High~.-Sales,Carseats2)
tree.carseats2=tree(High~CompPrice+Income+Advertising+Population+Price+ShelveLoc+Age+Education+Urban+US,Carseats2)
summary(tree.carseats)
summary(tree.carseats2)
# Note the two different ways of specifying the explanatory variables Xi.
# Experiment with the data to see how to choose the variables used in the analysis effectively.
# First, the tree itself
plot(tree.carseats,type="uniform")
# Then the labels
text(tree.carseats,pretty=0)
# You can play with a few settings to make 'pretty' really pretty, as in:
plot(tree.carseats,type="uniform",col="dark blue")
text(tree.carseats,pretty=0,col="dark red",cex=0.5)
plot(tree.carseats,type="uniform",col="dark blue")
text(tree.carseats,pretty=0,col="dark red",cex=0.5,adj=c(.5,-5))
# The argument pretty=0 instructs R to include the category names for any qualitative
# predictors, rather than simply displaying a letter for each category.
# adj shifts the location of the text in x and y.
Carseats2[2,]
# In the case of a classification tree, the argument type="class" instructs R to return
# the actual class prediction.
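# A minimal sketch of type="class" applied to the whole training set (using the
# tree.carseats and Carseats2 objects built above): a confusion matrix of the
# fitted classes and the resulting training misclassification rate.
train.pred=predict(tree.carseats,Carseats2,type="class")
table(train.pred,Carseats2$High)  # confusion matrix on the training data
mean(train.pred!=Carseats2$High)  # training misclassification rate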
predict(tree.carseats,Carseats2[2,],type="class")
predict(tree.carseats,Carseats2[2,])  # Without type="class" you get the class probabilities
predict(tree.carseats,Carseats2[10,],type="class")
predict(tree.carseats,Carseats2[50,],type="class")

# Predicting on fresh data
a=Carseats2[1,]
str(a)
a[1,1:6]=c(5,98,60,5,80,100)
a[1,8:9]=c(35,6)
a[1,7]="Bad"
a[1,10:12]=c("No","Yes","No")
# Note that for factors a[1,...] does not work the same way as a[...], so one should
# assign factor and numerical values separately.
predict(tree.carseats,a,type="class")

# Testing vs. training error
set.seed(2)  # So that you draw the same sample each time; setting the seed is not truly needed
train=sample(1:nrow(Carseats2),200)  # Recall what the function "sample()" is doing. What is "nrow()"?
Carseats2.test=Carseats2[-train,]    # Note how you exclude the training data
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats2,subset=train)
tree.pred=predict(tree.carseats,Carseats2.test,type="class")
table(tree.pred,High.test)
# Do not run 'set.seed(2)' for the bootstrap study of the prediction rate.

# Cross-validation and pruning
help(prune.misclass)
prune.carseats=prune.misclass(tree.carseats,best=3)  # Compare with the help result
plot(prune.carseats)
text(prune.carseats,pretty=0)
set.seed(3)  # To fix the random sample used in the CV process; not needed in general
help(cv.tree)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
# Note that the misclassification error is used as the loss function guiding the
# cross-validation process.
names(cv.carseats)
cv.carseats
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
# What would you do next to complete the data mining of the above dataset?
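# One reasonable next step (a sketch, not the only answer): prune the tree to the
# size that minimizes the cross-validated deviance found above, then re-check the
# error on the held-out test set.
best.size=cv.carseats$size[which.min(cv.carseats$dev)]
prune.best=prune.misclass(tree.carseats,best=best.size)
prune.pred=predict(prune.best,Carseats2.test,type="class")
table(prune.pred,High.test)
mean(prune.pred!=High.test)  # test misclassification rate of the pruned tree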
######
# Regression tree for volatility vs. return
# Example of statistical analysis vs. data mining: volatility smile and leverage effect

# Reading the data frame from a file
VD=read.csv('VolatilityData.csv')
VD=VD[,c(2,3)]
#pdf('VD.pdf')
plot(VD,pch='.')
#dev.off()
pdf('RFit.pdf')
rf=lm(VD$Volatility~VD$Shocks)
summary(rf)
plot(VD,pch='.')
abline(rf,col='red')
dev.off()
VD$Volatility
n=nrow(VD)  # as.matrix() is not needed to count the rows of a data frame

#####
install.packages("tree")  # Only if it was not installed before
library(tree)             # See the notes on the tree package in the classification section above
help(tree)
tree.volatility=tree(Volatility~Shocks,VD,mindev=0.0003,subset=1:floor(n/2))
summary(tree.volatility)
pdf('rich_tree.pdf')
plot(tree.volatility)
text(tree.volatility,adj=c(0.5,5.5),cex=0.35)
dev.off()
yhat=predict(tree.volatility,newdata=VD)
sqrt(mean((VD$Volatility-yhat)^2))
pdf('overfit_tree.pdf')
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
dev.off()

# Validation phase
# Validation: selecting k, the pruning parameter for the original tree to be pruned
validate_ind=(floor(n/2)+1):floor(3*n/4)
VVD=VD[validate_ind,]
yhat2=predict(tree.volatility,newdata=VVD)
sqrt(mean((VVD$Volatility-yhat2)^2))
pt=prune.tree(tree.volatility)
K=length(pt$k)
stds=vector('numeric',K-3)
for(j in 2:(K-2)) {
  pts=prune.tree(tree.volatility,k=pt$k[j])
  yhat2=predict(pts,VVD)
  stds[j-1]=sqrt(mean((VVD$Volatility-yhat2)^2))
}
pdf('SDS.pdf')
plot(stds)
dev.off()
plot(stds[75:80])
j=80  # optimal
pts=prune.tree(tree.volatility,k=pt$k[j])
plot(pts)
text(pts,adj=c(0.5,0.5),cex=1)
yhat2=predict(pts,newdata=VD)
pdf("RegTree.pdf")
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
points(VD$Shocks,yhat2,col='red')
dev.off()

# Testing phase
test_ind=(floor(3*n/4)+1):n
TVD=VD[test_ind,]
yhat3=predict(pts,TVD)
sqrt(mean((TVD$Volatility-yhat3)^2))

# Regressing against a quadratic function
X=cbind(VD$Shocks,VD$Shocks^2)
rf1=lm(VD$Volatility~X)
a=rf1$coefficients
xx=seq(-3,3,by=0.01)
pdf('AllFits.pdf')
plot(VD,pch='.')
abline(rf,col='red')
points(VD$Shocks,yhat,pch='*',col='blue')
points(VD$Shocks,yhat2,col='red')
lines(xx,a[1]+a[2]*xx+a[3]*xx^2,col='green',lwd=4)
dev.off()
summary(rf1)
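# A tidier alternative (a sketch): fitting the quadratic with poly() lets predict()
# handle fresh data, so its test RMSE can be compared directly with the pruned
# tree's. Here the quadratic is fit on the same training half that the tree used.
rf2=lm(Volatility~poly(Shocks,2,raw=TRUE),data=VD[1:floor(n/2),])
yhat4=predict(rf2,newdata=TVD)
sqrt(mean((TVD$Volatility-yhat4)^2))  # compare with the tree's test RMSE above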