# Install package rfvarsel library(rfvarsel) #################### # Nonparametric variable selection for # causal inference is demonstrated with # three data generation processes and # compared with gaussian bayesian networks. # # The RFXY algorithms assume there is an # outcome Y, a binary treatment indicator # T, and predictors, X1, ..., Xp # # If you wish to use only the test for # conditional independence, see the # last example with function RFVS_perm_cond() #################### # Data Example 1: linear data # (see paper for description of data-generating process) #################### set.seed(1234) dat_lin <- dgpLINEAR(n = 1000, noise = 92) head(dat_lin) # Y is outcome, T is treatment ?RFXY_trad_wrap # uses traditional random forest (RF) permutation importance s1 <- Sys.time() lin_results_trad <- RFXY_trad_wrap(X = dat_lin[,1:100], Y = dat_lin$Y, T = dat_lin$T, perms = 20, ntree = 200, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE) lin_results_trad # correct result is X1, X2, X3, X5, X7 e1 <- Sys.time() e1 - s1 # About 20 seconds on 2017 MacBook Air ?RFXY_cond_wrap # uses conditional RF permutation importance # preferred over traditional but more time consuming s1 <- Sys.time() lin_results_cond <- RFXY_cond_wrap(X = dat_lin[,1:100], Y = dat_lin$Y, T = dat_lin$T, perms = 20, ntree = 200, permsCond = 10, ntreeCond = 50, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE, permcores = 4) # mclapply doesn't work on Windows OS lin_results_cond # correct result is X1, X2, X3, X5, X7 e1 <- Sys.time() e1 - s1 # About 40 seconds on 2017 MacBook Air # gaussian bayesian networks s1 <- Sys.time() lin_gbn <- gbn_xy(X = dat_lin[,1:100], Y = dat_lin$Y, T = dat_lin$T, alpha = .05, method = "mmpc") lin_gbn e1 <- Sys.time() e1 - s1 # About half a second on 2017 MacBook Air #################### # Data Example 2: nonlinear data # (see paper for description of data-generating process) #################### set.seed(1234) dat_int <- dgpINT(n = 1000, noise = 92) head(dat_int) s1 <- Sys.time() int_results_trad <- RFXY_trad_wrap(X = dat_int[,1:100], Y = dat_int$Y, T = dat_int$T, perms = 20, ntree = 200, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE) int_results_trad # correct result is X1, X2, X3, X5, X7 e1 <- Sys.time() e1 - s1 # About 20 seconds on 2017 MacBook Air s1 <- Sys.time() int_results_cond <- RFXY_cond_wrap(X = dat_int[,1:100], Y = dat_int$Y, T = dat_int$T, perms = 20, ntree = 200, permsCond = 10, ntreeCond = 50, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE, permcores = 4) # mclapply doesn't work on Windows OS int_results_cond # correct result is X1, X2, X3, X5, X7 e1 <- Sys.time() e1 - s1 # About 60 seconds on 2017 MacBook Air # gaussian bayesian networks # Typically fails to detect X2 and/or X3 because they affect # potential outcomes primarily through their interaction (see paper) int_gbn <- gbn_xy(X = dat_int[,1:100], Y = dat_int$Y, T = dat_int$T, alpha = .05, method = "mmpc") int_gbn # correct result is X1, X2, X3, X5, X7 #################### # Data Example 3: nonlinear data with collider #################### set.seed(1234) dat_intcoll <- dgpINTCOLL(n = 1000, noise = 92) head(dat_intcoll) # Typically incorrectly selects X8 because it is # marginally associated with potential outcomes # Sometimes also X4 and/or X6 because, conditional # on T, they are marginally associated with the # potential outcomes intcoll_results_trad <- RFXY_trad_wrap(X = dat_intcoll[,1:100], Y = dat_intcoll$Y, T = dat_intcoll$T, perms = 20, ntree = 200, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE) intcoll_results_trad # correct result is X1, X2, X3, X5, X7 s1 <- Sys.time() intcoll_results_cond <- RFXY_cond_wrap(X = dat_intcoll[,1:100], Y = dat_intcoll$Y, T = dat_intcoll$T, perms = 20, ntree = 200, permsCond = 10, ntreeCond = 50, pctl = .95, sidak = TRUE, scale = FALSE, iterate = FALSE, permcores = 4) # mclapply doesn't work on Windows OS intcoll_results_cond # correct result is X1, X2, X3, X5, X7 e1 <- Sys.time() e1 - s1 # About 60 seconds on 2017 MacBook Air # gaussian bayesian networks # Typically fails to detect X2 and/or X3 because they affect # potential outcomes primarily through their interaction (see paper) intcoll_gbn <- gbn_xy(X = dat_intcoll[,1:100], Y = dat_intcoll$Y, T = dat_intcoll$T, alpha = .05, method = "mmpc") intcoll_gbn # correct result is X1, X2, X3, X5, X7 #################### # Nonparametric testing for conditional independence #################### ?RFVS_perm_cond # This function is called within # RFXY_trad_wrap() and RFXY_trad(), but # may also be used on its own for # nonparametric conditional indpependence # testing. # DGP for example (see Figure 2 in paper) dgpNEW <- function(n = 1000) { X1 <- rnorm(n) X2 <- rnorm(n) X3 <- rnorm(n) X4 <- rnorm(n) X6 <- rnorm(n) X5 <- X2 + X4 + rnorm(n) X7 <- rnorm(n); X8 <- rnorm(n); X9 <- rnorm(n); X10 <- rnorm(n) X11 <- rnorm(n); X12 <- rnorm(n); X13 <- rnorm(n); X14 <- rnorm(n) ps <- (1 + exp(-(0 + X1 + X2 + X6)))^-1 T <- rbinom(n = n, size = 1, prob = ps) Y0 <- X3 + X4 + X6 + rnorm(n) Y1 <- X3 + X4 + X6 + 2 + rnorm(n) Y <- Y0; Y[T == 1] <- Y1[T == 1] out <- data.frame(cbind(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, T, Y0, Y1, Y)) } set.seed(1234) dat_new <- dgpNEW(4000) head(dat_new) dim(dat_new) round(cor(dat_new), 2) # note no correlations between potential outcomes and X1 and X2 # Condition on T = 0 dat_new_0 <- dat_new[dat_new$T == 0, -15] round(cor(dat_new_0), 2) # correlation between potential outcomes and X1 and X2 induced by conditioning on T == 0 # Variable selection based on nonparametric # marginal tests (not conditional). # Typically incorrectly selects the collider X5 # Runtime about 25 seconds trad_out <- RFVS_perm_trad(x = dat_new_0[,1:14], y = dat_new_0[,"Y"], plot = TRUE) nms_out <- rownames(trad_out)[which(trad_out[,1] >= trad_out[,2])] # X3, X4, X6 is correct answer (see DGP above) nms_out # correct solution (X3, X4, X6) # Variable selection based on nonparametric conditional tests. # First, run RFVS_perm_trad to drop noise. We did this just above # and called the output 'nms_out'. Then, pass those names on to # RFVS_perm_cond: s1 <- Sys.time() cond_out <- RFVS_perm_cond(x = dat_new_0[, nms_out], y = dat_new_0[,"Y"], permcores = 4, plot = TRUE) e1 <- Sys.time() e1 - s1 rownames(cond_out)[which(cond_out[,1] >= cond_out[,2])] # Runtime about 10 seconds # Typically gets the correct solution (X3, X4, X6)