library(arm) library(epitools) library(aod) library(ggplot2) library(binom) library(gridExtra) # Simulate misclassification: Goldberger individuals with meat misclassify_one_row <- function(row, column, sens=NA, spec=NA, p.nomove=NA, max.cat=NA) { random <- runif(1, 0, 1) # If no p.nomove, do boolean misclassification if (is.na(p.nomove)) { # If underlying value is true, generate false negative if random < sens if (row[column] & random > sens) { return(FALSE) } else if (!row[column] & random > spec) { return(TRUE) } else { return(as.logical(row[column])) } } else { current <- row[column] if (random > p.nomove) { # Move this one. return(sample(1:max.cat, 1)) } else { # Don't move. return(current) } } } misclassify_boolean_column <- function(dataset, column, sens, spec) { return(apply(dataset, 1, function(row) { misclassify_one_row(row, column, sens=sens, spec=spec)})) } misclassify_category_column <- function(dataset, column, p.nomove, max.cat) { return(apply(dataset, 1, function(row) { misclassify_one_row(row, column, p.nomove=p.nomove, max.cat=max.cat)})) } # Meat- JG # Household fresh meat supply distribution taken from the Diet paper, Table IX jg_meat_categories <- c(expression(NA<="1.0"), expression("1.0"-"1.9"), expression("2.0"-"2.9"), expression("3.0"-"3.9"), expression(NA>="4.0")) jg_diseased_household_meat <- c(54, 4, 2, 1, 0) jg_total_household_meat <- c(495, 131, 61, 36, 18) jg_percent <- jg_diseased_household_meat/jg_total_household_meat jg_meat_data <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE), diseased=jg_diseased_household_meat, total=jg_total_household_meat, percent=jg_percent) jg_slope <- coef(lm(percent~as.numeric(categories), data=jg_meat_data))[2] jg_plot <- ggplot(data=jg_meat_data, aes(x=categories, y=percent)) + geom_point(size=8) + scale_y_continuous(limits=c(0, 0.15)) + xlab("Household Meat Supply per Adult Male Unit, lbs") + scale_x_discrete(labels = jg_meat_categories) + ylab("% of Households With Pellagra")+ stat_smooth(method="lm", se=FALSE, mapping=aes(x=as.numeric(categories), y=percent), size=1, color="black", )+ ggtitle("A)")+ theme_classic() + theme(plot.title=element_text(hjust=-0.18), plot.margin=unit(x=c(1,1,0.5,3), units="lines")) + theme(axis.title.x=element_text(size=10, vjust=-0.05)) + theme(axis.title.y=element_text(size=10, vjust=0.25)) + theme(plot.title=element_text(size=10)) + theme(legend.text=element_text(size=10)) + theme(legend.title=element_text(size=10)) # Generate household-level disease and meat categories meat_cat <- c() for (i in 1:length(jg_total_household_meat)) { meat_cat <- c(meat_cat, rep(i, jg_total_household_meat[i])) } disease <- c() for (i in 1:length(jg_total_household_meat)) { disease <- c(disease, rep(TRUE, jg_diseased_household_meat[i])) disease <- c(disease, rep(FALSE, jg_total_household_meat[i]-jg_diseased_household_meat[i])) } # Simulate 25 datasets with misclassification to show range of slopes. set.seed(54321) n.datasets <- 25 dataframes <- vector(mode="list", length=n.datasets) misclassified_slopes <- vector(length=n.datasets) misclassified_plot <- ggplot() + ggtitle("B)") + scale_y_continuous(limits=c(0, 0.15)) + xlab("Household Meat Supply per Adult Male Unit, lbs") + scale_x_discrete(labels = jg_meat_categories) + ylab("% of Households With Pellagra") + theme_classic() + theme(plot.title=element_text(hjust=-0.2), plot.margin=unit(x=c(1,1,0.5,3), units="lines")) + theme(axis.title.x=element_text(size=10, vjust=-0.05)) + theme(axis.title.y=element_text(size=10, vjust=0.25)) + theme(plot.title=element_text(size=10)) + theme(legend.text=element_text(size=10)) + theme(legend.title=element_text(size=10)) for (i in 1:n.datasets) { meat_sim_data <- data.frame(id=1:sum(jg_total_household_meat), meat=meat_cat, disease=disease) meat_sim_data$misclassified_disease <- misclassify_boolean_column(meat_sim_data, "disease", 1.0, specificity) meat_sim_data$misclassified_meat <- misclassify_category_column(meat_sim_data, "meat", 0.75, length(jg_total_household_meat)) meat_sim_table <- table(meat_sim_data[,'misclassified_meat'], meat_sim_data[,'misclassified_disease']) meat_sim_props <- prop.table((meat_sim_table), 1)[,2] dataframes[[i]] <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE), diseased=meat_sim_table[,'TRUE'], total = apply(meat_sim_table, 1, sum), percent = meat_sim_props) misclassified_slopes[i] <- coef(lm(percent~as.numeric(categories), data=dataframes[[i]]))[2] } median_slope <- median(misclassified_slopes) median_frame <- which(misclassified_slopes == median(misclassified_slopes)) for (i in 1:n.datasets) { # Plot median line last so it appears on top if (i == median_frame) { next } misclassified_plot <- misclassified_plot + geom_point(data=dataframes[[i]], aes(x=categories, y=percent)) + stat_smooth(data=dataframes[[i]], method="lm", se=FALSE, mapping=aes(x=categories, y=percent, group=1), size=0.5, color="black") } misclassified_plot <- misclassified_plot + stat_smooth(data=dataframes[[median_frame]], method="lm", se=FALSE, mapping=aes(x=categories, y=percent, group=1), size=2, color="black") # Now do repeated simulations meat_misclassification_rates <- seq(0, 0.3, by=0.01) results.matrix <- matrix(NA, nrow=length(meat_misclassification_rates), ncol=1) rownames(results.matrix) <- meat_misclassification_rates n.simulations = 1000 for (i in 1:length(meat_misclassification_rates)) { rate_nomove <- 1-meat_misclassification_rates[i] results <- vector(length=n.simulations) for (k in 1:n.simulations) { misclassified_disease <- misclassify_boolean_column(meat_sim_data, "disease", 1.0, specificity) misclassified_meat <- misclassify_category_column(meat_sim_data, "meat", rate_nomove, length(jg_total_household_meat)) disease_table <- table(misclassified_meat, misclassified_disease) disease_props <- prop.table((disease_table), 1)[,2] dataframe <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE), percent = disease_props) # Assume the investigators would consider 5 cases/1000 residents per category to be noticeable results[k] <- (coef(lm(percent~as.numeric(categories), data=dataframe))[2] < -0.0091) } results.matrix[i,1] <- prop.table(table(results))['TRUE'] } results_df <- data.frame(results.matrix) colnames(results_df)[1] <- 'percent_found' results_df$misclassification_rate <- rownames(results_df) confint <- binom.confint(n.simulations*results_df$percent_found,n.simulations, conf.level=.95, method='agresti-coull') results_df$ci_lower <- confint$lower results_df$ci_upper <- confint$upper p<-ggplot(results_df, aes(x=misclassification_rate, y=percent_found))+geom_point()+ stat_smooth(aes(group=1), method="loess", se=FALSE, color="black") p + theme_classic() + theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank(), axis.title=element_text(vjust=-0.1), plot.margin=unit(x=c(1,1,0.5,2), units="lines"))+ scale_y_continuous(limits=c(0.4, 1.0)) + scale_x_discrete(breaks = seq(0, 0.30, 0.05), label=seq(0, 30, 5))+ xlab("% of Households for Which Meat Supply Was Misclassified") + ylab("Proportion of Simulations Resulting in a Suggestive Correlation") dev.off()