#------------------------------------------------------------------------# # created on : 02/14/2017 # AFFILIATION : DEPARTMENT OF STATISTICS, GEORGE WASHINGTON UNIVERSITY # EMAIL : judywang@gwu.edu # R Functions for "" # by Wang, H., McKeague, I. and Qian, M. (2017) #------------------------------------------------------------------------# library(quantreg) library(abind) library(MASS) QMET.1tau <- function(Y, X, tau, alpha=0.05, B=200, dB=100, constant= seq(0.5, 5, 0.05), h=NA) { #Bootstrap maximum-type t-statistic at a given single quantile level # Inputs # X: a n*p matrix, where p is the number of variables and n is the # sample size (the intercept term is excluded); # Y: a n*1 vector; # tau: quantile level of interest, scalar # alpha: the significance level. default value alpha = 0.05; # B: the number of bootstrap samples. # dB: the number of double bootstrap samples. # constant: grid of constants involved in the tuning parameter lambda # h: bandwidth parameter (if NA, then automatically determine h using the rule from Hall and Sheather (1988).) # Outputs # tn: observed test statistic value # khat: the index of the selected most predictive variable # pval.QMET: pvalue of the QMET based on lambda chosen by double bootstrap # pval.CPB: pvalue of the standard centered percentile bootstrap # pvals: pvalues of the QMET based on fixed lambdas in the grid # resid: residuals from regressing Y on the selected variable # optJ: the index for the selected lambda n <- dim(X)[1]; p <- dim(X)[2]; lambda = constant*sqrt(log(n)*tau*(1-tau)) nlam = length(constant) betak = matrix(0, nrow=2, ncol=p) #intercept and slope by regressing Y on each element of X separately Jk = array(0, c(2,2,p)) #weighted covariance of (1, Xk), i.e. estimated Hessian matrix for k=1,...,p Jkinv = array(0, c(2,2,p)) #inverse of Jk rho.k = rep(0, p) #the min of quantile objective function associated with the kth candidate model, k=1,...,p SE.k = tn.k = se.k = rep(0, p) #SE of slope estimate for different candidate model fhat = matrix(0, nrow=n, ncol=p) for(k in 1:p) { x = cbind(1,X[,k]) fit = rq.fit.fnb(x, Y, tau) out=summary.rq.nid (fit, x, Y, tau, n, p=2, eps=3.666853e-11,h) Jk[,,k] = solve(out$Hinv)/n Jkinv[,,k] = out$Hinv*n rho.k[k] = out$rho betak[,k] = as.vector(fit$coef) #se.k[k] = out$se[2] #SE.k[k] = summary(rq(Y~x-1,tau),se="boot")$coef[2,2] SE.k[k] = out$se[2] fhat[,k] = out$f tn.k[k] = betak[2,k]/SE.k[k] } #find the most significant predictor \hat k = the one that gives the smallest quantile objective function value #and calculate tn for the observed sample (assuming nid errors) khat = which.min(rho.k) betahat = betak[2, khat] #estimated slope for the chosen model associated with khat ehat <- Y - cbind(1,X[,khat])%*%betak[, khat] #estimated residuals for model khat sigmahat = SE.k[khat] #SE of betahat for the model khat tn <- tn.k[khat] psi = tau - 1*(ehat<0) ### begin bootstrap cpb = Vgrid = rep(0, B) GridIndexT = matrix(0, nrow=B, ncol=nlam) G0 = sqrt(n)* colMeans(cbind(1, X)*as.vector(psi)) Cov = var(X) Var = diag(Cov) #(1) obtain the bootstrap khat est.B = khat.B(Y, X, B, tau) bskhat.all = est.B$bskhat.all bsidx.all = est.B$bsidx #(2) obtain bootstrap bstn, bspsi and Vgrid bspsi.all = bstn.all = bsbetahat.all = bssigmahat.all = NULL for (bs in 1:B) { bsidx <- bsidx.all[,bs] bskhat = bskhat.all[bs] tt = vn.B.func(Y, X, bsidx, bskhat, tau, h, G0, psi, tn, lambda, SE.k, Jkinv, betahat) cpb[bs] = tt$cpb GridIndexT[bs,] = tt$GridIndexT Vgrid[bs] <- tt$Vgrid bspsi.all = cbind(bspsi.all, tt$bspsi) bstn.all = c(bstn.all, tt$bstn) bsbetahat.all = c(bsbetahat.all, tt$bsbetahat) bssigmahat.all = c(bssigmahat.all, tt$bssigmahat) } #record the rejection counts of bootstrap bhats based on DB critical values reject = matrix(0, nrow=nlam, ncol=B) #run double bootstrap if(dB>1 & nlam>1) { for(bs in 1:B){ bsidx <- bsidx.all[,bs] bskhat = bskhat.all[bs] bsY = Y[bsidx] bsX = X[bsidx,] bspsi = bspsi.all[,bs] bstn = bstn.all[bs] bsbetahat = bsbetahat.all[bs] bssigmahat = bssigmahat.all[bs] ### start double bootstrap dcpb = dVgrid = rep(0, dB) dGridIndexT = matrix(0, nrow=dB, ncol=nlam) bsG0 = sqrt(n)* colMeans(cbind(1, bsX)*as.vector(bspsi)) bsCov = var(bsX) bsVar = diag(bsCov) #(1) obtain the double bootstrap khat est.dB = khat.B(bsY, bsX, dB, tau) dbskhat.all = est.dB$bskhat.all dbsidx.all = est.dB$bsidx #(2) obtain the double bootstrap tn, psi, V bsG0 = sqrt(n)* colMeans(cbind(1, bsX)*as.vector(bspsi)) for(dbs in 1:dB){ dbsidx <- dbsidx.all[,dbs] dbskhat = dbskhat.all[dbs] tt = vn.B.func(bsY, bsX, dbsidx, dbskhat, tau, h, bsG0, bspsi, bstn, lambda, SE.k, Jkinv, bsbetahat) dbspsi = tt$bspsi dbstn = tt$bstn dcpb[dbs] = tt$cpb dGridIndexT[dbs,] = tt$GridIndexT dVgrid[dbs] <- tt$Vgrid } ###end of the double bootstrap ##select among lambdas the optimal lambda that gives CIs with rejection rate closest to nominal level for(j in 1:nlam) { dCIrange = dcpb*(1-dGridIndexT[,j]) + dVgrid*dGridIndexT[,j] #dB*1 lowerquantile <- quantile(dCIrange, alpha/2, na.rm=T) upperquantile <- quantile(dCIrange, 1-alpha/2, na.rm=T) if((bsbetahat-betahat)/bssigmahat >upperquantile | (bsbetahat-betahat)/bssigmahat=tn,na.rm=T))) pvals[j] = pval } #pvalue of QMET based on the lambda chosen by DB pval.QMET = pvals[optJ] #pvalue of the center percentile bootstrap pval.CPB = min(2*min(mean(cpb =tn,na.rm=T)),1) out = list(tn=tn, khat=khat, pval.QMET=pval.QMET, pval.CPB=pval.CPB, pvals=pvals, resid=ehat, optJ = optJ) return(out) } QMET.mtau <- function(Y, X, taus, alpha=0.05, B=200, dB=100, constant= seq(0.5, 5, 0.05), h=NA) { #Test across multiple quantile levels # X: a n*p matrix, where p is the number of variables and n is the # sample size (the intercept term is excluded); # Y: a n*1 vector; # taus: a vector of quantile levels of interest # alpha: the significance level. default value alpha = 0.05; # B: the number of bootstrap samples. Default value B=1000; #outputs #RR.DB: rejection rates of the DB for each lambda #rejectYes: rejection indicator for individual lam, chosen lambda with double bootstrap, chosen lambda based on optJ2, and CPB #CPBcrit: critical values for the CPB method #QMETcrit: critical values for the QMET method for each lambda, nlam*2 matrix n <- dim(X)[1]; p <- dim(X)[2]; ntau = length(taus) lams = NULL for(j in 1:ntau) lams=cbind(lams, constant*sqrt(log(n)*taus[j]*(1-taus[j]))) nlam = length(constant) betak = array(0, c(2, p, ntau)) #intercept and slope by regressing Y on each element of X separately Jk = array(0, c(2,2,p, ntau)) #weighted covariance of (1, Xk), i.e. estimated Hessian matrix for k=1,...,p Jkinv = array(0, c(2,2,p, ntau)) #inverse of Jk rho.k = matrix(0, nrow=p, ncol=ntau) #the min of quantile objective function associated with the kth candidate model, k=1,...,p SE.k = tn.k = se.k = matrix(0, nrow=p, ncol=ntau) #SE of slope estimate for different candidate model fhat = array(0, c(n, p, ntau)) for(j in 1:ntau) { tau = taus[j] for(k in 1:p) { x = cbind(1,X[,k]) fit = rq.fit.fnb(x, Y, tau) out=summary.rq.nid(fit, x, Y, tau, n, p=2, eps=3.666853e-11,h) Jk[,,k,j] = solve(out$Hinv)/n Jkinv[,,k,j] = out$Hinv*n rho.k[k,j] = out$rho betak[, k,j] = as.vector(fit$coef) SE.k[k,j] = out$se[2] fhat[,k,j] = out$f tn.k[k,j] = betak[2, k,j]/SE.k[k,j] }} #find the most significant predictor \hat k = the one that gives the smallest quantile objective function value #and calculate tn for the observed sample (assuming nid errors) khat = apply(rho.k,2,which.min) #estimated slope for the chosen model associated with khat betahat = ehat = sigmahat = tn = psi = NULL for(j in 1:ntau) { betahat[j] = betak[2, khat[j],j] #estimated residuals for model khat tmp.ehat = Y - cbind(1,X[,khat[j]])%*%betak[, khat[j], j] ehat = cbind(ehat, tmp.ehat) #SE of betahat for the model khat sigmahat = c(sigmahat, SE.k[khat[j], j]) tn = c(tn, tn.k[khat[j], j]) psi = cbind(psi, taus[j] - 1*(tmp.ehat <0)) } Tn = mean(tn^2) ### begin bootstrap cpb = Vgrid = matrix(0, nrow=B, ncol=ntau) GridIndexT = array(0, c(B, nlam, ntau)) G0 = NULL for(j in 1:ntau) G0=cbind(G0, sqrt(n)* colMeans(cbind(1, X)*as.vector(psi[,j]))) Cov = var(X) Var = diag(Cov) #(1) obtain the bootstrap khat est.B = khat.B.mtau(Y, X, B, taus) bskhat.all = est.B$bskhat.all bsidx.all = est.B$bsidx #(2) obtain bootstrap bstn, bspsi and Vgrid bspsi.all = array(0, c(n, B, ntau)) bstn.all = bsbetahat.all = bssigmahat.all = matrix(0, nrow=B, ncol=ntau) for(j in 1:ntau){ for (bs in 1:B) { bsidx <- bsidx.all[,bs] bskhat = bskhat.all[bs,j] tt = vn.B.func(Y, X, bsidx, bskhat, taus[j], h, G0[,j], psi[,j], tn[j], lams[,j], SE.k[,j], Jkinv[,,,j], betahat[j]) cpb[bs,j ] = tt$cpb GridIndexT[bs,,j] = tt$GridIndexT Vgrid[bs, j] <- tt$Vgrid bspsi.all[,bs,j] = tt$bspsi bstn.all[bs, j] = tt$bstn bsbetahat.all[bs, j] = tt$bsbetahat bssigmahat.all[bs, j] = tt$bssigmahat } } #record the rejection counts of bootstrap bhats based on DB critical values reject= matrix(0, nrow=nlam, ncol=B) #Run double bootstrap if(dB>1 & nlam>1){ for(bs in 1:B){ bsidx <- bsidx.all[,bs] bskhat = bskhat.all[bs] bsY = Y[bsidx] bsX = X[bsidx,] bspsi = bspsi.all[,bs,] bstn = bstn.all[bs,] bsbetahat = bsbetahat.all[bs,] bssigmahat = bssigmahat.all[bs,] ### start double bootstrap dcpb = dVgrid = matrix(0, nrow=dB, ncol=ntau) dGridIndexT = array(0, c(dB, nlam, ntau)) bsG0 = NULL for(j in 1:ntau) bsG0=cbind(bsG0, sqrt(n)* colMeans(cbind(1, bsX)*as.vector(bspsi[,j]))) bsCov = var(bsX) bsVar = diag(bsCov) #(1) obtain the double bootstrap khat est.dB = khat.B.mtau(bsY, bsX, dB, taus) dbskhat.all = est.dB$bskhat.all dbsidx.all = est.dB$bsidx #(2) obtain the double bootstrap tn, psi, V for(j in 1:ntau){ for(dbs in 1:dB){ dbsidx <- dbsidx.all[,dbs] dbskhat = dbskhat.all[dbs,j] tt = vn.B.func(bsY, bsX, dbsidx, dbskhat, taus[j], h, bsG0[,j], bspsi[,j], bstn[j], lams[,j], SE.k[,j], Jkinv[,,,j], bsbetahat[j]) dbspsi = tt$bspsi dbstn = tt$bstn dcpb[dbs,j] = tt$cpb dGridIndexT[dbs,,j] = tt$GridIndexT dVgrid[dbs,j] <- tt$Vgrid }} ###end of the double bootstrap ##select among lambdas the optimal lambda that gives CIs with rejection rate closest to nominal level for(j in 1:nlam) { dCIrange = dcpb*(1-dGridIndexT[,j,]) + dVgrid*dGridIndexT[,j,] #dB*ntau dTn = apply(dCIrange^2, 1, mean) tt = (bsbetahat-betahat)/bssigmahat tt = mean(tt^2) if( tt > quantile(dTn, 1-alpha)) reject[j, bs]= 1 } # if(bs %%100==0) cat("finished", bs, "th bootstrap", "\n") } #end of the bootstrap reject = apply(reject, 1, mean) #find the optimal lambda optJ = which.min(abs(reject-alpha)) } else optJ=1 #obtain pvalues associated with each lambda pvals = rep(0, nlam) for(j in 1:nlam) { CIrange = cpb*(1-GridIndexT[,j,]) + Vgrid*GridIndexT[,j,] Tnstar = apply(CIrange^2, 1, mean) pval = mean(Tnstar>=Tn,na.rm=T) pvals[j] = pval } #pvalue of QMET based on the lambda chosen by DB pval.QMET = pvals[optJ] #pvalue of the center percentile bootstrap tt2 = apply(cpb^2, 1, mean) pval.CPB = mean(tt2 >=Tn,na.rm=T) cval.CPB = quantile(tt2, 1-alpha) out = list(Tn=Tn, tn=tn, khat=khat, pval.QMET=pval.QMET, pval.CPB=pval.CPB, pvals=pvals, optJ = optJ, cval.CPB=cval.CPB) return(out) } ########### Below are internal functions needed in the two main functions ################# summary.rq.nid <- function (object, x, y, tau, n, p=2, eps=3.666853e-11, h=NA) { x = as.matrix(x) coef <- coefficients(object) resid <- as.vector(object$residuals) #calculate the covariance (sandwich formula) if(is.na(h)) { h <- bandwidth.rq(tau, n, hs = TRUE) if (tau + h > 1) stop("tau + h > 1: error in summary.rq") if (tau - h < 0) stop("tau - h < 0: error in summary.rq") } bhi <- rq.fit.fnb(x, y, tau = tau + h)$coef blo <- rq.fit.fnb(x, y, tau = tau - h)$coef dyhat <- x %*% (bhi - blo) if (any(dyhat <= 0)) warning(paste(sum(dyhat <= 0), "non-positive fis")) f <- pmax(0, (2 * h)/(dyhat - eps)) fxxinv <- diag(p) fxxinv <- backsolve(qr(sqrt(f) * x)$qr[1:p, 1:p, drop = FALSE], fxxinv) fxxinv <- fxxinv %*% t(fxxinv) #covariance for bhat in possibly misspecified model (this is the part that differs from summary.rq) cov <- fxxinv %*% t((tau-1*(resid <0))^2*x)%*%x %*% fxxinv serr <- sqrt(diag(cov)) object$cov <- cov object$Hinv <- fxxinv object$serr <- serr object$coefficients <- coef object$residuals <- resid object$tau <- tau object$f <- f object$h = h object$rho = sum(resid*(tau-(resid<0))) object } khat.B = function(Y, X, B, tau) { #return the bootstrap MQR coefficient estimators and the bootstrap khat #B: number of bootstrap #Y: observed response vector #X: observed design matrix #tau: a single quantile level p = ncol(X) n = length(Y) ### generate bootstrap sample bsidx.all <- matrix(sample(n, n * B, replace = TRUE), n, B) coef.b = array(0, dim=c(p,B,2)) rho.b = matrix(0, nrow=p, ncol=B) for(j in 1:p){ coef.b[j,,] <- boot.rq.xy(as.matrix(cbind(1,X[,j])), Y, bsidx.all, tau) } bskhat.all = rep(0,B) for(bs in 1:B){ bsidx <- bsidx.all[,bs] bsY <- Y[bsidx] bsX <- X[bsidx,] tmp.coef = coef.b[,bs,] fitted = t(t(bsX)*tmp.coef[,2] + tmp.coef[,1]) resid = bsY - fitted resid2 = resid*(tau-(resid<0)) rho.b[,bs] = colMeans(resid2) bskhat.all[bs] = which.min(rho.b[,bs]) } out = list(bskhat.all= bskhat.all, bsidx.all= bsidx.all) return(out) } Vprocess.slope.b0 <- function(GJG, JG, p) { #for non iid errors #for a given b, calculate the bootstrap Vn(b,j) for j=1,...,p #fix b=0 M = GJG V = JG[,2] K = which.max(M) bsV = V[K] return(list(bsV=bsV, K=K)) } keep<-function (..., list = character(0), all = FALSE, sure = FALSE) { if (missing(...) && missing(list)) { warning("Keep something, or use rm(list=ls()) to clear workspace. ", "Nothing was removed.") return(invisible(NULL)) } names <- as.character(substitute(list(...)))[-1] list <- c(list, names) keep.elements <- match(list, ls(1, all.names = all)) if (any(is.na(keep.elements))) { warning("You tried to keep \"", list[which(is.na(keep.elements))[1]], "\" which doesn't exist in workspace. Nothing was removed.", sep = "") return(invisible(NULL)) } if (sure) rm(list = ls(1, all.names = all)[-keep.elements], pos = 1) else return(ls(1, all.names = all)[-keep.elements]) } vn.B.func = function(Y, X, bsidx, bskhat, tau, h=NA, G0, psi, tn, lambda, SE.k, Jkinv, betahat) { #h: bandwidth used in estimating the se of quantile coeff estimator n = length(Y) p = ncol(X) bsY <- Y[bsidx] bsX <- X[bsidx,] fithat = rq.fit.fnb(cbind(1, bsX[,bskhat]), bsY, tau) bsbetahat = fithat$coef[2] # bootstrap version of tn assuming nid errors bssigmahat = summary.rq.nid (fithat, cbind(1, bsX[,bskhat]), bsY, tau, n, p=2, eps=3.666853e-11, h)$se[2] bstn = (bsbetahat/bssigmahat) bspsi = tau - 1*(fithat$resid<0) cpb = (bsbetahat - betahat)/bssigmahat #need return # QMET bsCov <- var(bsX) bsVar <- diag(bsCov) bsG = sqrt(n)* colMeans(cbind(1, bsX)*psi[bsidx]) - G0 bsZ <- sqrt(n)*( (colMeans(as.vector(psi[bsidx])*bsX) - colMeans(bsX)*mean(psi[bsidx])) - (colMeans(as.vector(psi)*X) - colMeans(bsX)*mean(psi))) GridIndexT = 1*(max(abs(bstn), abs(tn)) <= lambda) #need return bsG2 = cbind(rep(bsG[1],p), bsG[-1]) #p*2 matrix GJG = rep(0,p); JG = matrix(0,ncol=2,nrow=p) for(k in 1:p){ GJG[k] = t(bsG2[k,])%*%Jkinv[,,k]%*%bsG2[k,] JG[k,] = Jkinv[,,k]%*%bsG2[k,] } tt = Vprocess.slope.b0(GJG, JG, p) #Vgrid <- tt$bsV/(sqrt(n)* SE.k[tt$K]) K = tt$K fithat = rq.fit.fnb(cbind(1, bsX[,K]), bsY, tau) bsSE.K = summary.rq.nid (fithat, cbind(1, bsX[,K]), bsY, tau, n, p=2, eps=3.666853e-11, h)$se[2] Vgrid <- tt\$bsV/(sqrt(n)* bsSE.K) out = list(cpb=cpb, GridIndexT= GridIndexT, Vgrid= Vgrid, bstn=bstn, bspsi= bspsi, bsbetahat= bsbetahat, bssigmahat= bssigmahat) return(out) } khat.B.mtau <- function(Y, X, B, taus) { #return the bootstrap MQR coefficient estimators and the bootstrap khat #B: number of bootstrap #Y: observed response vector #X: observed design matrix #taus: a vector of quantile levels p = ncol(X) n = length(Y) ntau = length(taus) ### generate bootstrap sample bsidx.all <- matrix(sample(n, n * B, replace = TRUE), n, B) coef.b = array(0, dim=c(p,B,2,ntau)) rho.b = array(0, c(p, B, ntau)) for(j in 1:ntau){ for(k in 1:p){ coef.b[k,,,j] <- boot.rq.xy(as.matrix(cbind(1,X[,k])), Y, bsidx.all, taus[j]) } } bskhat.all = matrix(0, nrow=B, ncol=ntau) for(j in 1:ntau){ for(bs in 1:B){ bsidx <- bsidx.all[,bs] bsY <- Y[bsidx] bsX <- X[bsidx,] tmp.coef = coef.b[,bs,,j] fitted = t(t(bsX)*tmp.coef[,2] + tmp.coef[,1]) resid = bsY - fitted resid2 = resid*(taus[j]-(resid<0)) rho.b[,bs,j] = colMeans(resid2) bskhat.all[bs,j] = which.min(rho.b[,bs,j]) } } out = list(bskhat.all= bskhat.all, bsidx.all= bsidx.all) return(out) }