#######################
# 1. Variance reduction by random forest -- illustrative Monte Carlo study
#######################

#Simulated data
N=20       #Sample size
d=2        #Dimension of the predictors
rho=0.85   #Correlation 1 (within columns)
rho0=0.5   #Correlation 2 (within rows)

#Data: two-dimensional of size N, correlated both within columns
#and within rows
Z=matrix(rnorm(2*N),nrow=N)
ZN=rnorm(N)
Z2=rnorm(2)
X=sqrt(1-rho^2)*sqrt(1-rho0^2)*Z+rho0*ZN%*%t(rep(1,2))+rho*as.matrix(rep(1,N))%*%Z2

#Monte Carlo simulation of the data
MC=2000    #Monte Carlo sample size
MCsample=matrix(0,nrow=2*N,ncol=MC)
for(mc in 1:MC)
{
  Z=matrix(rnorm(2*N),nrow=N)
  ZN=rnorm(N)
  Z2=rnorm(2)
  X=sqrt(1-rho^2)*sqrt(1-rho0^2)*Z+rho0*ZN%*%t(rep(1,2))+rho*as.matrix(rep(1,N))%*%Z2
  MCsample[,mc]=c(X[,1],X[,2])   #Stack the two coordinates into one column
}

#Estimated covariance from the MC study -- compare with the theoretical
#values (computed at the end of this section)
round(cov(t(MCsample)),digits=2)
#
#Estimate of the common mean of the coordinates (which is zero) based on
#a single sample of the matrix X
mean(X[,1])+mean(X[,2])
#
#Bootstrap to obtain an estimate of the mean based on a single sample of X
B=10000    #Number of bootstrap replicates
Bmean=vector('numeric',B)
for(i in 1:B)
{
  BN=sample(1:N,size=N,replace=TRUE)
  BX1=X[BN,1]
  BX2=X[BN,2]
  Bmean[i]=mean(BX1)+mean(BX2)
}
mean(Bmean)
#
#Bootstrapping coordinates as in random forest: each bootstrap replicate
#uses one randomly selected coordinate
Bmean2=vector('numeric',B)
for(i in 1:B)
{
  BN=sample(1:N,size=N,replace=TRUE)
  delta=rbinom(1,1,0.5)
  BX1=delta*X[BN,1]
  BX2=(1-delta)*X[BN,2]
  Bmean2[i]=mean(BX1)+mean(BX2)
}
mean(Bmean2)

#Another Monte Carlo study to investigate the random forest method
MC=30                     #Monte Carlo sample size
E1=vector("numeric",MC)   #MC values of the bootstrap estimates
E2=E1                     #MC values of the random forest type estimates
for(j in 1:MC)            #MC loop
{
  Z=matrix(rnorm(2*N),nrow=N)
  ZN=rnorm(N)
  Z2=rnorm(2)
  X=sqrt(1-rho^2)*sqrt(1-rho0^2)*Z+rho0*ZN%*%t(rep(1,2))+rho*as.matrix(rep(1,N))%*%Z2
  for(i in 1:B)           #Bootstrap loop
  {
    BN=sample(1:N,size=N,replace=TRUE)
    BX1=X[BN,1]
    BX2=X[BN,2]
    Bmean[i]=mean(BX1)+mean(BX2)
  }
  E1[j]=mean(Bmean)
  for(i in 1:B)           #Random forest loop
  {
    BN=sample(1:N,size=N,replace=TRUE)
    delta=rbinom(1,1,0.5)
    BX1=delta*X[BN,1]
    BX2=(1-delta)*X[BN,2]
    Bmean2[i]=mean(BX1)+mean(BX2)
  }
  E2[j]=mean(Bmean2)
}

#Summary of the study
mean(E1)
#
mean(E2)
#
var(E1)
#
var(E2)
#
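#Theoretical covariances for the comparison above (a sketch, derived from
#the construction X[i,j]=a*Z[i,j]+rho0*ZN[i]+rho*Z2[j] with
#a=sqrt((1-rho^2)*(1-rho0^2)) and Z, ZN, Z2 independent standard normal):
#entry variance a^2+rho0^2+rho^2, covariance rho^2 within a column,
#covariance rho0^2 within a row, and zero otherwise
round(c(variance=(1-rho^2)*(1-rho0^2)+rho0^2+rho^2,
        within.column=rho^2,within.row=rho0^2),digits=2)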
#######################
# 2. Lasso method
#######################
library(ISLR)
fix(Hitters)
names(Hitters)
dim(Hitters)
sum(is.na(Hitters$Salary))
Hitters=na.omit(Hitters)   #Remove the players with missing Salary
dim(Hitters)
sum(is.na(Hitters))
x=model.matrix(Salary~.,Hitters)[,-1]   #Predictor matrix, intercept column dropped
y=Hitters$Salary
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=(-train)
y.test=y[test]
library(glmnet)
grid=10^seq(10,-2,length=100)   #Grid of penalty values lambda
#The following does a lasso fit for different penalties lambda
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)   #alpha=1 fits a lasso model
plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)   #Cross-validation to choose lambda
plot(cv.out)
#
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)   #Test mean squared error
out=glmnet(x,y,alpha=1,lambda=grid)   #Refit on the full data set
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
lasso.coef[lasso.coef!=0]   #The selected (nonzero) coefficients

#######################
# 3. Neural networks
#######################
#install.packages('neuralnet')   #Install the package once if needed
library("neuralnet")

#Going to create a neural network to perform square rooting
#Type ?neuralnet for more information on the neuralnet library

#Generate 50 random numbers uniformly distributed between 0 and 100
#and store them as a data frame
traininginput <- as.data.frame(runif(50,min=0,max=100))
trainingoutput <- sqrt(traininginput)

#Column bind the data into one variable
trainingdata <- cbind(traininginput,trainingoutput)
colnames(trainingdata) <- c("Input","Output")

#Train the neural network with one hidden layer of 10 neurons
#Threshold is a numeric value specifying the threshold for the partial
#derivatives of the error function as the stopping criterion
net.sqrt <- neuralnet(Output~Input,trainingdata,hidden=10,threshold=0.01)
print(net.sqrt)

#Plot the neural network
plot(net.sqrt)

#Test the neural network on some testing data
testdata <- as.data.frame((1:10)^2)   #Generate some squared numbers
net.results <- compute(net.sqrt,testdata)   #Run them through the neural network

#Let's see what properties net.results has
ls(net.results)

#Let's see the results
print(net.results$net.result)

#Let's display a better version of the results
cleanoutput <- cbind(testdata,sqrt(testdata),
                     as.data.frame(net.results$net.result))
colnames(cleanoutput) <- c("Input","Expected Output","Neural Net Output")
print(cleanoutput)
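#As a quick accuracy check (a minimal sketch added here, not part of the
#lab above): root-mean-squared error of the network output against the
#true square roots on the test grid
sqrt(mean((net.results$net.result-sqrt(testdata[,1]))^2))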