#Setting the working directory

setwd("YourOwnDirectoryWithData")


#First part of the project
#Bootstrapping pairs
#scaning data


LW=read.table("WeightLength.txt",header=T)
dim(LW)

n=dim(LW)[1]


colMeans(LW)

var(LW)
cor(LW)

#Bootstrap study

bn=sample(1:n,n,rep=T)

BLW=LW[bn,]
dim(BLW)

bn[1]
BLW[1,]

B=10000

rho=vector("numeric",B)

for( i in 1:B)
{
bn=sample(1:n,n,rep=T)

BLW=LW[bn,]

rho[i]=cor(BLW)[1,2]
}

mean(rho)
sd(rho)


#In the next few lines of codes you see how to construct confidence intervals for the estimated parameter. 
#Make sure that you understand what is done in these steps

ocr=sort(rho-mean(rho))
alpha=0.05
LE=floor(alpha/2*B)
UE=floor((1-alpha/2)*B)


cor(LW)[1,2]-ocr[LE]
cor(LW)[1,2]-ocr[UE]
cor(LW)


#The second part of the project
# Non-linear Modeling

# Splines
install.packages("splines")
library(splines)

#Simulation of the data
x=seq(0,1,by=1/400)
y=x*(1-x)+rnorm(length(x),0,0.25)
plot(x,y)
points(x,x*(1-x),type='l')

#Using B-splines
H=bs(x,knots=c(1/2),Boundary.knots=c(0,1),intercept=T)

dim(H)
dim(t(H)%*%H)
solve(t(H)%*%H)
solve(t(H)%*%H)%*%t(H)%*%y
hatb=solve(t(H)%*%H)%*%t(H)%*%y

points(x,H%*%hatb,type='l',col='red')

# Non-parametric bootstrap with B-splines

res=y-H%*%hatb
n=401
Bn=50
newres=matrix(0,ncol=n,nrow=Bn)
for(i in 1:Bn)
{
newR=sample(res,n,rep=T)
newres[i,]=newR
}
newy=H%*%hatb+newres[1,]

newB=solve(t(H)%*%H)%*%t(H)%*%newy
newB

points(x,H%*%newB,type='l',col='yellow')

#The third part of the project
#Reading in microarray data sets

#ALL Data Set
#The following sequence of instruction will download `bioconductor' package and activate it

source("http://bioconductor.org/biocLite.R")
biocLite()

#When asked update all packages by putting 'a' and ENTER key.

biocLite("ALL")
library(Biobase)
library(ALL)
data(ALL)
help(ALL)

#Accessing the data from the 

install.packages("ISLR") #This line only if this package was not installed on your computer before
library(ISLR)
help(NCI60)

nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
nci.labs

#Reading the data set described in Chapter 10 Exercise 11 of the first textbook
#Here we assume that the file with the data is in the current directory under the name: 

MA=read.csv("Ch10Ex11.csv",header=F)
dim(MA)

#14 cancer microarray data, data have to be downloaded to the computer, the current directory

y1=scan("14cancer.ytrain")
y2=scan("14cancer.ytest")


length(y1)
length(y2)

x1=read.table("14cancer.xtrain",header=F)
x2=read.table("14cancer.xtest",header=F)

dim(x1)
dim(x2)

#The next part of the code deals with the bootstrapping example discussed 
#also in Assignment 1, where a possibility of modelling certain data with 
#Poisson distribution was considered.


x=c(107, 90, 71, 102, 73, 100, 73, 107, 116, 83, 109, 99, 76, 76, 97, 116, 80, 80, 104, 91, 73, 118, 110, 73, 107, 71, 82, 80, 118, 75)
n=length(x)

mx=mean(x)
vx=var(x)

#Comparing estimates using an MC study

MCN=1000 #The size of MC study
MCVar=vector("numeric",MCN) #Space reserved for keeping values of MC study
MCMean=MCVar

lambda=(mx+vx)/2 #The choice of the parameter for Poisson distribution
                 #somewhat arbitrary but related to the data
                 #this should be checked for a whole range of lambda
for( i in 1:MCN)
{
  MCx=rpois(n,lambda)
  MCVar[i]=var(MCx)
  MCMean[i]=mean(MCx)
}  
  hist(MCMean)
  quartz()
  hist(MCVar)
  
  var(MCMean)
  var(MCVar)

  mean(MCMean)
  mean(MCVar)
  
#Goodness of fit of Poisson using bootstrap


N=1000
Var=vector("numeric",N)
Mean=Var
for( i in 1:N)
{
  Bx=sample(x,size=n,rep=TRUE)
  Var[i]=var(Bx)
  Mean[i]=mean(Bx)
}  
  hist(Mean)
  hist(Var)
  
  
  #Code for boostrap assessment of microarray accuracy, complementary to 
  #Assignment 1
  
  #The data can be loaded from the package for the book or directly from the folder
  
  
  help("NCI60") #If does not work load ISRL package
  library("ISLR", lib.loc="~/Library/R/3.2/library")
  table(NCI60$labs)
  A=NCI60$data #Genes in columns and samples in rows
  Size=dim(A)
  heatmap(A[,1:100])
  #We will use another data set which fits our design as discussed in the 
  #class, Assignment 1
  #Scanning the data back to one big matrix
  
  Dat=matrix(scan(file="MicroArrBoot.txt"),ncol=Size[2],byrow=TRUE)
  dim(Dat) #The data are: sextuple after sextuple five times in rows
  
  
  #We extract five sextuples from the data 
  
  Data=array(0,dim=c(5,6,Size[2])) #Three dimensional matrix:
  #1st dimension -- different genetic materials (sextuples)
  #2nd dimension -- different microarrays in a sextuple
  #3rd dimension -- gene
  
  for( m in 1:5 )
  {
    Data[m,,]=Dat[((m-1)*6+1):(m*6),]
  }
  
  #We start with computing estimates
  
  #First we estimate sigma
  
  #Five estimates of sigma bases on averaged variances across genes with 
  #sextuples
  
  hatsigma=0
  for( m in 1:5){
    hatsigma=hatsigma+sqrt(mean(apply(Data[m,,],2,var)))/5 
  }
  
  hatsigma #Estimate of sigma
  
  #Then estimating sigma_i, for the i-th gene which will be kept in hatSigma
  
  hatSigma2=rep(0,Size[2])
  
  for( k in 1:6){
    hatSigma2=hatSigma2+apply(Data[,k,],2,var)/6 
  }
  
  
  hatSigma2=hatSigma2-hatsigma^2
  
  quartz()
  
  hist(sqrt(hatSigma2[hatSigma2>0]))
  
  
  #Continuation: Bootstrapping to obtain the accuracy of the estimates
  
  B=50 #Bootstrap sample size
  Bhatsigma=vector("numeric",B)
  BhatSigma2=matrix(0,ncol=Size[2],nrow=B)
  
  #This may take time:
  for(b in 1:B)
  {
    Bm=sample(1:5,replace = TRUE)
    Bk=sample(1:6,replace=TRUE)
    BData=Data[Bm,Bk,]
    Bhatsigma[b]=0
    for( m in 1:5){
      Bhatsigma[b]=Bhatsigma[b]+sqrt(mean(apply(BData[m,,],2,var)))/5 
    }
    BhatSigma2[b,]=rep(0,Size[2])
    
    for( k in 1:6){
      BhatSigma2[b,]=BhatSigma2[b,]+apply(BData[,k,],2,var)/6 
    }
    
  }
  
  hist(Bhatsigma) #Histogram of the bootstrapped sigmas
  
  hist(sqrt(BhatSigma2[1,BhatSigma2[1,]>0])) #Histogram of the bootstrapped 
  #sigma_i for the first gene (i=1)