library(arm)
library(epitools)
library(aod)
library(ggplot2)
library(binom)
library(gridExtra)

# Simulate misclassification: Goldberger individuals with meat

misclassify_one_row <- function(row, column, sens=NA, spec=NA, p.nomove=NA, max.cat=NA) {
  random <- runif(1, 0, 1)
  # If no p.nomove, do boolean misclassification
  if (is.na(p.nomove)) {
    # If underlying value is true, generate false negative if random < sens
    if (row[column] & random > sens) {
      return(FALSE)
    } else if (!row[column] & random > spec) {
      return(TRUE)
    } else {
      return(as.logical(row[column]))
    }
  } else {
    current <- row[column]
    if (random > p.nomove) {
      # Move this one.
      return(sample(1:max.cat, 1))
    } else {
      # Don't move.
      return(current)
    }
  }
}

misclassify_boolean_column <- function(dataset, column, sens, spec) {
  return(apply(dataset, 1, function(row) { misclassify_one_row(row, column, sens=sens, spec=spec)}))
}
misclassify_category_column <- function(dataset, column, p.nomove, max.cat) {
  return(apply(dataset, 1, function(row) { misclassify_one_row(row, column, p.nomove=p.nomove, max.cat=max.cat)}))
}

# Meat- JG
# Household fresh meat supply distribution taken from the Diet paper, Table IX
jg_meat_categories <- c(expression(NA<="1.0"), expression("1.0"-"1.9"), expression("2.0"-"2.9"), expression("3.0"-"3.9"), expression(NA>="4.0")) 
jg_diseased_household_meat <- c(54, 4, 2, 1, 0)
jg_total_household_meat <- c(495, 131, 61, 36, 18)
jg_percent <- jg_diseased_household_meat/jg_total_household_meat
jg_meat_data <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE), 
				diseased=jg_diseased_household_meat, 
				total=jg_total_household_meat, 
				percent=jg_percent)
jg_slope <- coef(lm(percent~as.numeric(categories), data=jg_meat_data))[2]
jg_plot <- ggplot(data=jg_meat_data, aes(x=categories, y=percent)) + 
	geom_point(size=8) + 
	scale_y_continuous(limits=c(0, 0.15)) +
	xlab("Household Meat Supply per Adult Male Unit, lbs") + 
	scale_x_discrete(labels = jg_meat_categories) +
	ylab("% of Households With Pellagra")+
	stat_smooth(method="lm", se=FALSE, mapping=aes(x=as.numeric(categories), y=percent), size=1, color="black", )+
	ggtitle("A)")+
	theme_classic() + 
	theme(plot.title=element_text(hjust=-0.18), 
		plot.margin=unit(x=c(1,1,0.5,3), units="lines"))  +
	theme(axis.title.x=element_text(size=10, vjust=-0.05)) +
	theme(axis.title.y=element_text(size=10, vjust=0.25)) + 
	theme(plot.title=element_text(size=10)) +
	theme(legend.text=element_text(size=10)) +
	theme(legend.title=element_text(size=10))

# Generate household-level disease and meat categories
meat_cat <- c()
for (i in 1:length(jg_total_household_meat)) {
  meat_cat <- c(meat_cat, rep(i, jg_total_household_meat[i]))
}
disease <- c()
for (i in 1:length(jg_total_household_meat)) {
  disease <- c(disease, rep(TRUE, jg_diseased_household_meat[i]))
  disease <- c(disease, rep(FALSE, jg_total_household_meat[i]-jg_diseased_household_meat[i]))
}

# Simulate 25 datasets with misclassification to show range of slopes.
set.seed(54321)
n.datasets <- 25
dataframes <- vector(mode="list", length=n.datasets)
misclassified_slopes <- vector(length=n.datasets)
misclassified_plot <- ggplot() + 
	ggtitle("B)") +
	scale_y_continuous(limits=c(0, 0.15)) +
	xlab("Household Meat Supply per Adult Male Unit, lbs") + 
	scale_x_discrete(labels = jg_meat_categories) +
	ylab("% of Households With Pellagra") +
	theme_classic() +
	theme(plot.title=element_text(hjust=-0.2), 
		plot.margin=unit(x=c(1,1,0.5,3), units="lines"))  +
	theme(axis.title.x=element_text(size=10, vjust=-0.05)) +
	theme(axis.title.y=element_text(size=10, vjust=0.25)) + 
	theme(plot.title=element_text(size=10)) +
	theme(legend.text=element_text(size=10)) +
	theme(legend.title=element_text(size=10))

	

for (i in 1:n.datasets) {
  meat_sim_data <- data.frame(id=1:sum(jg_total_household_meat), meat=meat_cat, disease=disease)
  meat_sim_data$misclassified_disease <- misclassify_boolean_column(meat_sim_data, "disease", 1.0, specificity)
  meat_sim_data$misclassified_meat <- misclassify_category_column(meat_sim_data, "meat", 0.75, length(jg_total_household_meat))
  meat_sim_table <- table(meat_sim_data[,'misclassified_meat'], meat_sim_data[,'misclassified_disease'])
  meat_sim_props <- prop.table((meat_sim_table), 1)[,2]
  dataframes[[i]] <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE),
					diseased=meat_sim_table[,'TRUE'],
					total = apply(meat_sim_table, 1, sum),
					percent = meat_sim_props)
  misclassified_slopes[i] <- coef(lm(percent~as.numeric(categories), data=dataframes[[i]]))[2]
}
median_slope <- median(misclassified_slopes)
median_frame <- which(misclassified_slopes == median(misclassified_slopes))
for (i in 1:n.datasets) {
  # Plot median line last so it appears on top
  if (i == median_frame) { next }
  misclassified_plot <- misclassified_plot + 
		geom_point(data=dataframes[[i]], 
			aes(x=categories, y=percent)) + 
		stat_smooth(data=dataframes[[i]], 
				method="lm", 
				se=FALSE, 
				mapping=aes(x=categories, y=percent, group=1),
				size=0.5, color="black") 
}
misclassified_plot <- misclassified_plot + 
		stat_smooth(data=dataframes[[median_frame]], 
				method="lm", 
				se=FALSE, 
				mapping=aes(x=categories, y=percent, group=1),
				size=2, color="black") 


# Now do repeated simulations
meat_misclassification_rates <- seq(0, 0.3, by=0.01)
results.matrix <- matrix(NA, nrow=length(meat_misclassification_rates), ncol=1)
rownames(results.matrix) <- meat_misclassification_rates
n.simulations = 1000
for (i in 1:length(meat_misclassification_rates)) {
  rate_nomove <- 1-meat_misclassification_rates[i]
  results <- vector(length=n.simulations)
  for (k in 1:n.simulations) {
    misclassified_disease <- misclassify_boolean_column(meat_sim_data, "disease", 1.0, specificity)
    misclassified_meat <- misclassify_category_column(meat_sim_data, "meat", rate_nomove, length(jg_total_household_meat))
    disease_table <- table(misclassified_meat, misclassified_disease)
    disease_props <- prop.table((disease_table), 1)[,2]
    dataframe <- data.frame(categories=factor(jg_meat_categories, levels=jg_meat_categories, ordered=TRUE),
				     percent = disease_props)
    # Assume the investigators would consider 5 cases/1000 residents per category to be noticeable
    results[k] <- (coef(lm(percent~as.numeric(categories), data=dataframe))[2] < -0.0091)
  }
  results.matrix[i,1] <- prop.table(table(results))['TRUE']
}
results_df <- data.frame(results.matrix)
colnames(results_df)[1] <- 'percent_found'
results_df$misclassification_rate <- rownames(results_df)
confint <- binom.confint(n.simulations*results_df$percent_found,n.simulations, conf.level=.95, method='agresti-coull')
results_df$ci_lower <- confint$lower
results_df$ci_upper <- confint$upper

p<-ggplot(results_df, aes(x=misclassification_rate, y=percent_found))+geom_point()+
	stat_smooth(aes(group=1), method="loess", se=FALSE, color="black")
p + theme_classic() + 
	theme(panel.grid.minor=element_blank(), 
          panel.grid.major=element_blank(),
		axis.title=element_text(vjust=-0.1),
	plot.margin=unit(x=c(1,1,0.5,2), units="lines"))+ 
	scale_y_continuous(limits=c(0.4, 1.0)) +
	scale_x_discrete(breaks = seq(0, 0.30, 0.05), label=seq(0, 30, 5))+
  xlab("% of Households for Which Meat Supply Was Misclassified") + 
ylab("Proportion of Simulations Resulting in a Suggestive Correlation")
dev.off()