################################################################################
# Program: MI-lasso.R
#
# NOTE: This R program includes:
#   1) the R function "MI.lasso", which implements the MI-lasso variable
#      selection method;
#   2) an example of how to use the "MI.lasso" function.
#
# Reference:
#   Chen, Q. & Wang, S. (2013). "Variable selection for multiply-imputed data
#   with application to dioxin exposure study," Statistics in Medicine, 32:
#   3646-3659.
################################################################################

MI.lasso = function(mydata, D, lambda, maxiter=200, eps=1e-6) {
  ## D is the number of imputations.
  ## mydata is a list of the D imputed datasets: mydata[[1]] is the first
  ## imputed dataset, mydata[[2]] the second, and so on.
  ## In each mydata[[d]], the first p columns are the covariates X and the
  ## last column is the outcome Y.

  ## number of observations
  n = dim(mydata[[1]])[1]
  ## number of covariates
  p = dim(mydata[[1]])[2] - 1

  ## standardize the covariates X and center the outcome Y
  x = vector("list", D)
  y = vector("list", D)
  meanx = vector("list", D)
  normx = vector("list", D)
  meany = rep(0, D)
  for (d in 1:D) {
    x[[d]] = as.matrix(mydata[[d]][, 1:p])
    y[[d]] = mydata[[d]][, (p+1)]
    meanx[[d]] = apply(x[[d]], 2, mean)
    x[[d]] = scale(x[[d]], meanx[[d]], FALSE)
    normx[[d]] = sqrt(apply(x[[d]]^2, 2, sum))
    x[[d]] = scale(x[[d]], FALSE, normx[[d]])
    meany[d] = mean(y[[d]])
    y[[d]] = y[[d]] - meany[d]
  }

  ## ordinary least squares (OLS) estimates of the beta coefficients
  b.ols = matrix(0, p, D)
  for (d in 1:D) {
    b.ols[,d] = qr.solve(t(x[[d]]) %*% x[[d]]) %*% t(x[[d]]) %*% y[[d]]
  }

  ## estimate beta_dj in Equation (4)
  iter = 0
  dif = 1
  c = rep(1, p)
  b = matrix(0, p, D)
  while (iter <= maxiter & dif >= eps) {
    iter = iter + 1
    b.old = b
    # update beta_d by ridge regression
    for (d in 1:D) {
      xtx = t(x[[d]]) %*% x[[d]]
      diag(xtx) = diag(xtx) + lambda / c
      xty = t(x[[d]]) %*% y[[d]]
      b[,d] = qr.solve(xtx) %*% xty
    }
    # update c in Equation (4); floor c at eps so the ridge update
    # above never divides by zero
    c = sqrt(apply(b^2, 1, sum))
    c[c < eps] = eps
    # check convergence
    dif = max(abs(b - b.old))
  }

  ## set the coefficients of the unselected variables exactly to zero
  b[c <= eps, ] = 0

  ## calculate the BIC
  sse = rep(0, D)
  df = 0
  nzero = (1:p)[c > eps]
  for (d in 1:D) {
    sse[d] = mean((y[[d]] - x[[d]] %*% b[,d])^2)
  }
  for (j in nzero) {
    norm1 = sqrt(sum(b[j,]^2))
    norm2 = sqrt(sum(b.ols[j,]^2))
    df = df + 1 + (D-1) * norm1 / norm2
  }
  bic = log(mean(sse)) + log(D*n) * df / (D*n)

  ## transform the betas and intercepts back to the original scale
  b.scaled = b   # coefficients on the standardized scale, kept for reference
  b0 = rep(0, D)
  coefficients = matrix(0, p+1, D)
  for (d in 1:D) {
    b[,d] = b[,d] / normx[[d]]
    b0[d] = meany[d] - b[,d] %*% meanx[[d]]
    coefficients[,d] = c(b0[d], b[,d])
  }

  ## output the selected variables:
  ## TRUE means "selected", FALSE means "NOT selected"
  varsel = abs(b[,1]) > 0

  return(list(coefficients=coefficients, bic=bic, varsel=varsel))
}

######################################################
# An example of how to use the "MI.lasso" function   #
######################################################

## Note: `newdata` is assumed to be a list of D = 5 imputed datasets in the
## format described above; a sketch of one way to construct such a list
## appears at the end of this file.

## specify the grid of tuning parameters
lamvec = (2^(seq(-1, 4, by=0.05)))^2 / 2

## compute the BIC for each candidate tuning parameter
bicvec = rep(0, length(lamvec))
for (i in 1:length(lamvec)) {
  fit = MI.lasso(newdata, 5, lambda=lamvec[i])
  bicvec[i] = fit$bic
}

## select the tuning parameter with the smallest BIC
opt = which.min(bicvec)

## fit the model with the optimal lambda
fit.opt = MI.lasso(newdata, 5, lambda=lamvec[opt])
selection = which(fit.opt$varsel)
print(fit.opt$coefficients)
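
#########################################################
# A sketch for constructing the example input `newdata` #
#########################################################

## The example above assumes the list `newdata` already exists. The code below
## is a minimal, hypothetical sketch of one way to build such a list: it
## simulates one complete dataset and then perturbs the covariates D times as
## a crude stand-in for D multiply-imputed datasets. In a real analysis the
## imputed datasets would come from an imputation procedure (e.g., the mice
## package), and all names and values below are illustrative only.
## (To run the example above, execute this block first.)

set.seed(2013)
n = 100                             # number of observations
p = 20                              # number of covariates
D = 5                               # number of "imputations"
X = matrix(rnorm(n * p), n, p)
beta = c(3, 2, 1.5, rep(0, p - 3))  # only the first three covariates matter
Y = as.vector(X %*% beta + rnorm(n))
newdata = vector("list", D)
for (d in 1:D) {
  # add small noise to mimic between-imputation variability
  Xd = X + matrix(rnorm(n * p, sd = 0.1), n, p)
  newdata[[d]] = cbind(Xd, Y)       # first p columns: X; last column: Y
}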