# Bernd Beber and Alexandra Scacco # The Devil Is in the Digits # Last updated June 23, 2009 # Version originally posted to web site is archived at http://www.columbia.edu/~bhb2102/files/Iran_2009_v1.R ############################################################################# ###### useful functions ############################################################################# # function to extract digit in position pos from elements in v # set pos to "last" to get last digit, "penult" to get second to last, or numeric getdigit <- function(v, pos="last") { s <- strsplit(as.character(v),c()) if(pos=="last") { s <- sapply(s, function(y) y[length(y)]); return(s) } if(pos=="penult") { s <- sapply(s, function(y) ifelse(length(y)<2, NA, y[length(y)-1])); return(s) } if(is.numeric(pos)) { s <- sapply(s, function(y) y[pos]); return(s) } stop("Invalid position") } # function to get list of digit counts and proportions for each stratum identified by id digits <- function(v, id, pos="last"){ if(!is.vector(v) | !is.vector(id)) stop("Inputs must be vectors") if(!identical(length(v),length(id))) stop("Input vectors vary in length") id.uni <- unique(id) count <- matrix(ncol=10, nrow=length(id.uni), dimnames=list(NULL,c(0:9))) distance <- matrix(0, ncol=6, nrow=length(id.uni), dimnames=list(NULL,c(0:5))) id.out <- c() reps <- list() for(i in 1:length(id.uni)){ tmp <- getdigit(v[id==id.uni[i]], pos=pos) # check for distance between last and penultimate digits within observation ifelse(pos=="last", last<-tmp, last<-getdigit(v[id==id.uni[i]], pos="last")) penult <- getdigit(v[id==id.uni[i]], pos="penult") tolast <- abs(as.numeric(last) - as.numeric(penult)) tolast[tolast>5 & !is.na(tolast)] <- 10 - tolast[tolast>5 & !is.na(tolast)] tolast <- table(tolast) distance[i,dimnames(tolast)[[1]]] <- tolast # check digit frequencies tmp <- table(as.numeric(tmp)) count[i,dimnames(tmp)[[1]]] <- tmp if(sum(is.na(count[i,]))<10) count[i,is.na(count[i,])] <- 0 id.out <- c(id.out, id.uni[i]) } distance.prop <- t(apply(distance, 1, function(x) x/sum(x))) prop <- t(apply(count, 1, function(x) x/sum(x))) n <- apply(count, 1, function(x) sum(x)) sd <- apply(prop, 1, function(x) sd(x)) chi <- apply(count, 1, function(x) sum((x - .1*sum(x))^2 / (.1*sum(x)))) all.count <- apply(count, 2, sum) all.prop <- all.count/sum(n) all.n <- sum(all.count) all.sd <- sd(all.prop) all.chi <- sum((all.count - .1*all.n)^2 / (.1*all.n)) all.distance <- apply(distance, 2, sum) return(list(distance=distance, distance.prop=distance.prop, count=count, prop=prop, sd=sd, chi=chi, n=n, id=id.out, all.distance=all.distance, all.count=all.count, all.prop=all.prop, all.n=all.n, all.sd=all.sd, all.chi=all.chi)) } # function to get p-values sims.fct.p <- function(NSIMS=100000, VAL, THRES1, THRES2, SD, NONADJ) { sims.thres1 <- sims.thres2 <- sims.thres12 <- sims.sd <- sims.nonadj <- sims.thres12.nonadj <- sims.sd.nonadj <- c() for(j in 1:NSIMS) { draw1 <- sample(0:9, VAL, replace=T) draw2 <- sample(0:9, VAL, replace=T) prop1 <- sapply(0:9, function(x) length(draw1[draw1==x])/VAL) prop2 <- sapply(0:9, function(x) length(draw2[draw2==x])/VAL) sims.thres1 <- c(sims.thres1, .thres1 <- ifelse(THRES1>=.1, sum(prop1>=THRES1), sum(prop1<=THRES1))) sims.thres2 <- c(sims.thres2, .thres2 <- ifelse(THRES2>=.1, sum(prop1>=THRES2), sum(prop1<=THRES2))) sims.thres12 <- c(sims.thres12, .thres1 & .thres2) sims.sd <- c(sims.sd, .sd <- (sd(prop1)>=SD)) sims.nonadj <- c(sims.nonadj, .nonadj <- (sum(prop2[4:10])<=NONADJ)) sims.thres12.nonadj <- c(sims.thres12.nonadj, .thres1 & .thres2 & .nonadj) sims.sd.nonadj <- c(sims.sd.nonadj, .sd & .nonadj) pct.done <- 100*j/NSIMS if(pct.done == round(pct.done)) { print(pct.done); flush.console() } } return(list(thres1 = sum(sims.thres1>0)/NSIMS, thres2 = sum(sims.thres2>0)/NSIMS, thres12 = sum(sims.thres12>0)/NSIMS, sd = sum(sims.sd>0)/NSIMS, nonadj = sum(sims.nonadj>0)/NSIMS, thres12.nonadj = sum(sims.thres12.nonadj>0)/NSIMS, sd.nonadj = sum(sims.sd.nonadj>0)/NSIMS)) } ############################################################################# ###### look at data from iran 2009 ############################################################################# # provincial results from 2009 presidential election iran.data <- read.csv("Iran_2009.csv") attach(iran.data) # number of provinces in data n.province <- length(Province) # get last digits for Ahmadinejad, Mousavi, etc., and all columns iran.ahm.digits <- digits(Ahmadinejad, rep(1, n.province)) iran.mou.digits <- digits(Mousavi, rep(1, n.province)) iran.kar.digits <- digits(Karroubi, rep(1, n.province)) iran.rez.digits <- digits(Rezaee, rep(1, n.province)) iran.all.digits <- digits(c(Ahmadinejad, Mousavi, Karroubi, Rezaee), rep(1, 4*n.province)) # p-value for chi-squared test of variation in last-digit frequencies 1 - pchisq(iran.all.digits$all.chi, 9) # get probabilities probs <- sims.fct.p(VAL=iran.all.digits$all.n, THRES1=iran.all.digits$all.prop["7"], THRES2=iran.all.digits$all.prop["5"], SD=iran.all.digits$all.sd, NONADJ=sum(iran.all.digits$distance.prop[3:6])) # point of comparison: US 2008 us.data <- read.csv("US_2008_Vote_Totals.csv") us.data <- us.data[us.data[,"State"]!="D. C.",] attach(us.data) # number of states in data n.states <- length(State) # get last digits for Obama, McCain, and both columns us.oba.digits <- digits(Obama, rep(1, n.states)) us.mcc.digits <- digits(McCain, rep(1, n.states)) us.all.digits <- digits(c(Obama, McCain), rep(1, 2*n.states)) # run analysis for US, to see if we get false positive us.probs <- sims.fct.p(VAL=us.all.digits$all.n, THRES1=max(us.all.digits$all.prop), THRES2=min(us.all.digits$all.prop), SD=us.all.digits$all.sd, NONADJ=sum(us.all.digits$distance.prop[3:6])) # results probs us.probs