#Setting the working directory setwd("YourOwnDirectoryWithData") #First part of the project #Bootstrapping pairs #scaning data LW=read.table("WeightLength.txt",header=T) dim(LW) n=dim(LW)[1] colMeans(LW) var(LW) cor(LW) #Bootstrap study bn=sample(1:n,n,rep=T) BLW=LW[bn,] dim(BLW) bn[1] BLW[1,] B=10000 rho=vector("numeric",B) for( i in 1:B) { bn=sample(1:n,n,rep=T) BLW=LW[bn,] rho[i]=cor(BLW)[1,2] } mean(rho) sd(rho) #In the next few lines of codes you see how to construct confidence intervals for the estimated parameter. #Make sure that you understand what is done in these steps ocr=sort(rho-mean(rho)) alpha=0.05 LE=floor(alpha/2*B) UE=floor((1-alpha/2)*B) cor(LW)[1,2]-ocr[LE] cor(LW)[1,2]-ocr[UE] cor(LW) #The second part of the project # Non-linear Modeling # Splines install.packages("splines") library(splines) #Simulation of the data x=seq(0,1,by=1/400) y=x*(1-x)+rnorm(length(x),0,0.25) plot(x,y) points(x,x*(1-x),type='l') #Using B-splines H=bs(x,knots=c(1/2),Boundary.knots=c(0,1),intercept=T) dim(H) dim(t(H)%*%H) solve(t(H)%*%H) solve(t(H)%*%H)%*%t(H)%*%y hatb=solve(t(H)%*%H)%*%t(H)%*%y points(x,H%*%hatb,type='l',col='red') # Non-parametric bootstrap with B-splines res=y-H%*%hatb n=401 Bn=50 newres=matrix(0,ncol=n,nrow=Bn) for(i in 1:Bn) { newR=sample(res,n,rep=T) newres[i,]=newR } newy=H%*%hatb+newres[1,] newB=solve(t(H)%*%H)%*%t(H)%*%newy newB points(x,H%*%newB,type='l',col='yellow') #The third part of the project #Reading in microarray data sets #ALL Data Set #The following sequence of instruction will download `bioconductor' package and activate it source("http://bioconductor.org/biocLite.R") biocLite() #When asked update all packages by putting 'a' and ENTER key. biocLite("ALL") library(Biobase) library(ALL) data(ALL) help(ALL) #Accessing the data from the install.packages("ISLR") #This line only if this package was not installed on your computer before library(ISLR) help(NCI60) nci.labs=NCI60$labs nci.data=NCI60$data dim(nci.data) nci.labs #Reading the data set described in Chapter 10 Exercise 11 of the first textbook #Here we assume that the file with the data is in the current directory under the name: MA=read.csv("Ch10Ex11.csv",header=F) dim(MA) #14 cancer microarray data, data have to be downloaded to the computer, the current directory y1=scan("14cancer.ytrain") y2=scan("14cancer.ytest") length(y1) length(y2) x1=read.table("14cancer.xtrain",header=F) x2=read.table("14cancer.xtest",header=F) dim(x1) dim(x2) #The next part of the code deals with the bootstrapping example discussed #also in Assignment 1, where a possibility of modelling certain data with #Poisson distribution was considered. x=c(107, 90, 71, 102, 73, 100, 73, 107, 116, 83, 109, 99, 76, 76, 97, 116, 80, 80, 104, 91, 73, 118, 110, 73, 107, 71, 82, 80, 118, 75) n=length(x) mx=mean(x) vx=var(x) #Comparing estimates using an MC study MCN=1000 #The size of MC study MCVar=vector("numeric",MCN) #Space reserved for keeping values of MC study MCMean=MCVar lambda=(mx+vx)/2 #The choice of the parameter for Poisson distribution #somewhat arbitrary but related to the data #this should be checked for a whole range of lambda for( i in 1:MCN) { MCx=rpois(n,lambda) MCVar[i]=var(MCx) MCMean[i]=mean(MCx) } hist(MCMean) quartz() hist(MCVar) var(MCMean) var(MCVar) mean(MCMean) mean(MCVar) #Goodness of fit of Poisson using bootstrap N=1000 Var=vector("numeric",N) Mean=Var for( i in 1:N) { Bx=sample(x,size=n,rep=TRUE) Var[i]=var(Bx) Mean[i]=mean(Bx) } hist(Mean) hist(Var) #Code for boostrap assessment of microarray accuracy, complementary to #Assignment 1 #The data can be loaded from the package for the book or directly from the folder help("NCI60") #If does not work load ISRL package library("ISLR", lib.loc="~/Library/R/3.2/library") table(NCI60$labs) A=NCI60$data #Genes in columns and samples in rows Size=dim(A) heatmap(A[,1:100]) #We will use another data set which fits our design as discussed in the #class, Assignment 1 #Scanning the data back to one big matrix Dat=matrix(scan(file="MicroArrBoot.txt"),ncol=Size[2],byrow=TRUE) dim(Dat) #The data are: sextuple after sextuple five times in rows #We extract five sextuples from the data Data=array(0,dim=c(5,6,Size[2])) #Three dimensional matrix: #1st dimension -- different genetic materials (sextuples) #2nd dimension -- different microarrays in a sextuple #3rd dimension -- gene for( m in 1:5 ) { Data[m,,]=Dat[((m-1)*6+1):(m*6),] } #We start with computing estimates #First we estimate sigma #Five estimates of sigma bases on averaged variances across genes with #sextuples hatsigma=0 for( m in 1:5){ hatsigma=hatsigma+sqrt(mean(apply(Data[m,,],2,var)))/5 } hatsigma #Estimate of sigma #Then estimating sigma_i, for the i-th gene which will be kept in hatSigma hatSigma2=rep(0,Size[2]) for( k in 1:6){ hatSigma2=hatSigma2+apply(Data[,k,],2,var)/6 } hatSigma2=hatSigma2-hatsigma^2 quartz() hist(sqrt(hatSigma2[hatSigma2>0])) #Continuation: Bootstrapping to obtain the accuracy of the estimates B=50 #Bootstrap sample size Bhatsigma=vector("numeric",B) BhatSigma2=matrix(0,ncol=Size[2],nrow=B) #This may take time: for(b in 1:B) { Bm=sample(1:5,replace = TRUE) Bk=sample(1:6,replace=TRUE) BData=Data[Bm,Bk,] Bhatsigma[b]=0 for( m in 1:5){ Bhatsigma[b]=Bhatsigma[b]+sqrt(mean(apply(BData[m,,],2,var)))/5 } BhatSigma2[b,]=rep(0,Size[2]) for( k in 1:6){ BhatSigma2[b,]=BhatSigma2[b,]+apply(BData[,k,],2,var)/6 } } hist(Bhatsigma) #Histogram of the bootstrapped sigmas hist(sqrt(BhatSigma2[1,BhatSigma2[1,]>0])) #Histogram of the bootstrapped #sigma_i for the first gene (i=1)