# Binary screening test (BST) and confidence intervals proposed in
# "Marginal screening of 2 x 2 tables in large-scale case-control studies"
# by Ian W. McKeague and Min Qian.
#
# BST(train) input:
#   train$W -- n x p matrix of {0,1}-valued risk factors (exposure indicators)
#   train$Y -- length-n vector of case indicators (case 1, control 0)
# BST(train) output (list):
#   pvalue    -- p-value based on the BST
#   CInull, CImax, CIboot, CIbag -- four proposed CIs (95% nominal coverage)
#                of log(theta_0)
#   khat      -- index of the maximally associated risk factor
#   theta_hat -- point estimate of the corresponding odds ratio (theta_0)

# Marginal screening of all p two-by-two tables.
# Returns per-factor odds ratios, their SE estimates, and the index of the
# factor maximizing the studentized |log odds ratio|.
marginal_screen <- function(W, Y) {
  n <- nrow(W)
  N1 <- colSums(W)       # exposed subjects per factor
  EX <- colSums(W * Y)   # exposed cases per factor
  M1 <- sum(Y)           # number of cases
  q <- N1 / n
  pi1 <- EX / N1                # P(case | exposed)
  pi2 <- (M1 - EX) / (n - N1)   # P(case | unexposed)
  theta <- (pi1 / (1 - pi1)) / (pi2 / (1 - pi2))  # odds ratios
  # SE of log odds ratio from the four 2x2 cell counts (Woolf's formula).
  tau_hat <- sqrt(1 / EX + 1 / (EX + n - N1 - M1) +
                    1 / (N1 - EX) + 1 / (M1 - EX))
  # Scaled standard deviation used by the Monte Carlo calibration.
  sigma <- sqrt(1 / (q * pi1) + 1 / (q * (1 - pi1)) +
                  1 / ((1 - q) * pi2) + 1 / ((1 - q) * (1 - pi2)))
  khat <- which.max(abs(log(theta)) / tau_hat)
  list(theta = theta, tau_hat = tau_hat, sigma = sigma,
       khat = khat, theta_hat = theta[khat])
}

# One stratified bootstrap replicate (cases and controls resampled
# separately, by Y rather than by row position) of the post-screening
# odds-ratio estimate \hat\theta_n.
boot_theta_hat <- function(W, Y) {
  idx <- c(sample(which(Y == 1), replace = TRUE),
           sample(which(Y == 0), replace = TRUE))
  marginal_screen(W[idx, ], Y[idx])$theta_hat
}

# Binary screening test and confidence intervals.
#   train -- list with W (n x p binary matrix) and Y (n case indicators)
#   testn -- number of Monte Carlo draws used in the BST
#   B     -- number of bootstrap samples used in CI_boot
#   bagB  -- number of bootstrap samples per bag used in CI_bag
#   bag   -- number of bags used in CI_bag
# Defaults reproduce the settings used in the paper's simulations.
BST <- function(train, testn = 10000, B = 1000, bagB = 200, bag = 10) {
  library(MASS)  # for mvrnorm
  n <- nrow(train$W)
  p <- ncol(train$W)

  CX <- cor(train$W[train$Y == 1, ])  # exposure correlation among cases
  scr <- marginal_screen(train$W, train$Y)
  sigma <- scr$sigma
  khat <- scr$khat
  theta_hat <- scr$theta_hat

  # BST test: calibrate the selected statistic against Monte Carlo draws
  # from the Gaussian null limit with the estimated correlation. Ties in
  # the argmax are broken by the largest index, as in the original code.
  Z <- mvrnorm(testn, rep(0, p), CX)
  K <- apply(Z^2, 1, function(x) max(which(x == max(x, na.rm = TRUE))))
  stat <- Z[cbind(seq_len(testn), K)] * sigma[K]
  tempp <- sum(stat / sqrt(n) > log(theta_hat)) / testn
  pvalue <- 2 * min(tempp, 1 - tempp)

  # CONFIDENCE INTERVALS ----------------------------------------------------

  # Stratified bootstrap replicates of \hat\theta_n for CI_boot.
  bstheta_hat <- replicate(B, boot_theta_hat(train$W, train$Y))

  # Bagged bootstrap replicates for CI_bag: `bag` independent bags of
  # `bagB` replicates each.
  bstheta_hat_bag <- matrix(NA_real_, bagB, bag)
  for (bagidx in seq_len(bag)) {
    bstheta_hat_bag[, bagidx] <-
      replicate(bagB, boot_theta_hat(train$W, train$Y))
  }

  # Grid of local shifts b taken along the khat path (0 plus +/- steps).
  bseq <- c(0, kronecker(seq(0.1, 7, 0.2), c(1, -1), FUN = "*"))
  nb <- length(bseq)
  lb <- rep(NA_real_, nb)
  ub <- rep(NA_real_, nb)
  bscoverage1 <- rep(NA_real_, nb)
  bscoverage_bag <- matrix(NA_real_, nb, bag)
  for (j in seq_len(nb)) {
    b <- bseq[j] * sigma[khat]
    temp <- Z
    temp[, khat] <- Z[, khat] + bseq[j]  # shift the khat coordinate
    K <- apply(temp^2, 1, function(x) max(which(x == max(x, na.rm = TRUE))))
    # Debias draws whose argmax escaped the khat coordinate.
    stat <- Z[cbind(seq_len(testn), K)] * sigma[K] - b * (K != khat)
    # ceiling() keeps the order-statistic index valid when testn * 0.975
    # is not an integer; identical to the original at the default testn.
    lb[j] <- log(theta_hat) - sort(stat)[ceiling(testn * 0.975)] / sqrt(n)
    ub[j] <- log(theta_hat) - sort(stat)[ceiling(testn * 0.025)] / sqrt(n)
    bscoverage1[j] <-
      sum((log(bstheta_hat) < ub[j]) * (log(bstheta_hat) > lb[j])) / B
    bscoverage_bag[j, ] <-
      colMeans((log(bstheta_hat_bag) < ub[j]) * (log(bstheta_hat_bag) > lb[j]))
  }

  # CI based on the null (b = 0) distribution.
  CInull <- c(lb[which(bseq == 0)], ub[which(bseq == 0)])
  # CI-max: union over the whole grid.
  CImax <- c(min(lb), max(ub))
  # CI-boot: grid point whose bootstrap coverage is closest to 95%.
  selectidx1 <- which.min(abs(bscoverage1 - 0.95))
  CIboot <- c(lb[selectidx1], ub[selectidx1])
  # CI-bag: average the coverage-matched CI over the bags.
  temp_CI <- matrix(NA_real_, bag, 2)
  for (bagidx in seq_len(bag)) {
    selectidx2 <- which.min(abs(bscoverage_bag[, bagidx] - 0.95))
    temp_CI[bagidx, ] <- c(lb[selectidx2], ub[selectidx2])
  }
  CIbag <- colMeans(temp_CI)

  list(pvalue = pvalue, CInull = CInull, CImax = CImax, CIboot = CIboot,
       CIbag = CIbag, khat = khat, theta_hat = theta_hat)
}

# Example: data generated from simulation model C, with 0.5 pairwise
# correlation between risk factors. Runs only when MultiOrd is available,
# so the file can always be sourced for BST() itself.
if (requireNamespace("MultiOrd", quietly = TRUE)) {
  p <- 50                                # number of risk factors
  n <- 200                               # number of subjects (even)
  Y <- c(rep(1, n / 2), rep(0, n / 2))   # case indicators
  Sigma <- matrix(0.5, p, p)             # equicorrelated exposures
  diag(Sigma) <- 1
  # Cases carry elevated exposure probabilities on the first 3 factors.
  W1 <- MultiOrd::generate.binary(n / 2,
                                  c(0.65, 0.6, 0.55, rep(0.5, p - 3)), Sigma)
  W2 <- MultiOrd::generate.binary(n / 2,
                                  c(rep(0.4, 3), rep(0.5, p - 3)), Sigma)
  W <- rbind(W1, W2)
  train <- list(W = W, Y = Y)
  result <- BST(train)
  print(result)
}