library(MASS) ########################################### ############### Train Model ############### ########################################### # Find rank one matrix. Called by eigen.model rankOne.calc <- function( Qmat, blocks ) { M <- length(blocks) N <- 0 for ( i in 1:(M-1) ) { for ( j in (i+1):M ) { N <- N + length(blocks[[i]])*length(blocks[[j]]) } } L <- dim(Qmat)[1] # Set up system of equations X <- matrix( rep(0,L*N) ,ncol=L) q.vec <- rep(0,N) ind <- 1 for ( h in 1:(M-1) ) { for ( i in (h+1):M ) { for ( j in blocks[[h]] ) { for ( k in blocks[[i]] ) { q.vec[ind] <- log(abs(Qmat[j,k])) X[ind,j] <- 1 X[ind,k] <- 1 ind <- ind + 1 } } } } # Least squares solution T <- c() if ( M == 2 ) { X.inv <- ginv(X[,2:L]) T <- X.inv %*% q.vec T <- c(0,T[,1]) } else { X.inv <- ginv(X) T <- X.inv %*% q.vec } # Calculate Rmat Rmat <- exp(T) %*% t(exp(T)) Rmat } ######################### Read in training data and fit model ########################### # annotations: File with annotations for variants in training set # blocks: List of arrays giving block structure ( e.g. blocks[[1]]=c(1,2,3,4), blocks[[2]]=c(5,6,7,8,9,10,11,12) ) # columns: Array listing which columns to read in from annotations ( e.g. 5:20 ) # invert: Annotations which should be rescaled as 1-x (e.g. c(1,17,18,19,20) ) eigen.model <- function( annotations, blocks, columns, save.file=NULL, invert=NULL, header=TRUE, sep="\t" ) { # Find number of columns in annotations anno1 <- read.table(annotations,nrows=1,sep=sep) dim.anno <- dim(anno1) # Read in full dataset cols <- rep("NULL",dim.anno[2]) cols[columns] <- "character" anno <- read.table(annotations,colClasses=cols,header=header,sep=sep) d.anno <- dim(anno) print(d.anno) for ( i in 1:d.anno[2] ) { anno[,i] <- as.numeric(anno[,i]) } if ( !is.null(invert) ) { for ( i in invert ) { anno[,i] <- 1 - anno[,i] } } means <- rep(0,d.anno[2]) sds <- rep(0,d.anno[2]) for ( i in 1:(d.anno[2]) ) { means[i] <- mean(anno[,i],na.rm=TRUE) sds[i] <- sd(anno[,i],na.rm=TRUE) anno[,i] <- (anno[,i]-means[i])/sds[i] } print(dim(anno)) include <- c() blocks2 <- list() start <- 1 for ( i in 1:length(blocks) ) { include <- c(include,blocks[[i]]) blocks2[[i]] <- seq(start,(start+length(blocks[[i]])-1),1) start <- start + length(blocks[[i]]) } print(include) means <- means[include] sds <- sds[include] anno <- anno[,include] # Calculate covariance cov1 <- cor(anno,use="pairwise.complete.obs") print("cov1") print(dim(cov1)) # Calculate R matrix and weights R.mat <- rankOne.calc( cov1, blocks2 ) print("R.mat") print(dim(R.mat)) eigen.decomp <- eigen(R.mat) weights <- eigen.decomp$vectors[,1] # Record and save/return results model <- list() model$cov.matrix <- cov1 model$rank.one.matrix <- R.mat model$weights <- weights model$means <- means model$sds <- sds if ( !is.null(save.file) ) { save(model,file=save.file) } model } ######################### Eigen-PC ########################### eigen.model.pc <- function( annotations, columns, save.file=NULL, invert=NULL, header=TRUE, sep="\t" ) { # Find number of columns in annotations anno1 <- read.table(annotations,nrows=1,sep=sep) dim.anno <- dim(anno1) # Read in full dataset cols <- rep("NULL",dim.anno[2]) cols[columns] <- "character" anno <- read.table(annotations,colClasses=cols,header=header,sep=sep) d.anno <- dim(anno) for ( i in 1:d.anno[2] ) { anno[,i] <- as.numeric(anno[,i]) } if ( !is.null(invert) ) { for ( i in invert ) { anno[,i] <- 1 - anno[,i] } } means <- rep(0,d.anno[2]) sds <- rep(0,d.anno[2]) for ( i in 1:(d.anno[2]) ) { means[i] <- mean(anno[,i],na.rm=TRUE) sds[i] <- sd(anno[,i],na.rm=TRUE) anno[,i] <- (anno[,i]-means[i])/sds[i] } # Calculate covariance cov1 <- cor(anno,use="pairwise.complete.obs") # Calculate eigendecomposition of matrix and weights eigen.decomp <- eigen(cov1) weights <- eigen.decomp$vectors[,1] # Record and save/return results model <- list() model$cov.matrix <- cov1 model$weights <- weights model$means <- means model$sds <- sds if ( !is.null(save.file) ) { save(model,file=save.file) } model } ########################################### ############# Calculate Eigen Scores ###### ########################################### ### Called by eigen.score calc.scores <- function( x, columns, means, sds, weights, consequence, invert, impute.val, impute.which, impute.cons, outfile ) { # Adjust annotations (rescale, normalize, impute) adj <- as.numeric(as.character(x[columns])) # Make datatype numeric if ( !is.null(invert) ) { adj[invert] <- 1 - adj[invert] # switch scale (e.g. SIFT, allele frequency) } missing <- which(is.na(adj)) # Find missing values adj[missing] <- means[missing] # Impute missing values adj <- (adj-means)/sds # Standardize annotations # Impute special values for select consequences if ( !is.null(impute.cons) ) { impute <- 0 cons <- as.character(x[consequence]) for ( c in impute.cons ) { impute <- impute + grepl(c,cons) } if ( impute > 0 ) { adj[impute.which] <- (impute.val-means[impute.which])/sds[impute.which] } } score.N <- t(adj) %*% weights out <- paste(c(as.character(x),score.N),collapse="\t") cat(out,file=outfile,sep="\n") } ############# Read in model file and setup score calculation ############ # annotations: File with annotations for variants to be scored # model: Comma separated list of R data files containing model information # use: Indicate which annotations to use in the score (e.g. 1:12) # columns: Columns in the annotations file, containing annotations specified by use (e.g. 5:16) # header: Does the annotation file have a header line? (TRUE/FALSE) # consequence: Column containing functional consequence (e.g. 21 ) # invert: Annotations which should be rescaled as 1-x (e.g. c(1,17,18,19,20) ) # impute.val: Vector of values to use for imputation of special values (e.g. c(1,1,1,5.37) ) # impute.which: Vector of annotations to apply imputation to (e.g. c(1,2,3,4) ) # impute.cons: Use non-mean imputation for consequences in impute.cons (e.g. c("stop_lost","splice_site") ) eigen.score <- function( annotations, model1, model2, use, columns, save.file, header=NULL, consequence=NULL, invert=NULL, impute.val=NULL, impute.which=NULL, impute.cons=NULL ) { anno.input <- gzfile(annotations,open="r") # Open connection to read in annotations # Process model files means.list <- c() sds.list <- c() weights.list <- c() model.files <- strsplit(model1,",")[[1]] for ( i in 1:length(model.files) ) { try.result <- try(load(model.files[i]),silent=TRUE) if ( class(try.result) != "try-error" ) { weights.list <- rbind(weights.list,abs(model$weights[use]/sum(model$weights[use]))) } else { print("not found") } } model.files <- strsplit(model2,",")[[1]] #print(use) for ( i in 1:length(model.files) ) { try.result <- try(load(model.files[i]),silent=TRUE) if ( class(try.result) != "try-error" ) { means.list <- rbind(means.list,model$means[use]) sds.list <- rbind(sds.list,model$sds[use]) } else { print("not found") } } means <- rep(0,length(use)) sds <- rep(0,length(use)) weights <- rep(0,length(use)) for ( i in 1:length(use) ) { means[i] <- median(means.list[,i],na.rm=TRUE) sds[i] <- median(sds.list[,i],na.rm=TRUE) weights[i] <- median(weights.list[,i],na.rm=TRUE) } output <- gzfile(save.file,open="w") if ( !is.null(header) ) { out <- paste(header,collapse="\t") cat(out,file=output,sep="\n") } calc_score <- function(x) { calc.scores(x,columns,means,sds,weights,consequence,invert,impute.val,impute.which,impute.cons,output) } while ( length( line <- readLines(anno.input, n = 1, warn = FALSE ) ) > 0 ) { parts <- strsplit(as.character(line),"\t") lapply(parts,calc_score) } close(anno.input) close(output) } #####################################