/* C++ code implementing the rare variant testing strategy by Iuliana Ionita-Laza et al, Plos Genet, 2011 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_permutation.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_sort_vector.h>
#include <time.h>
#include <gsl/gsl_sf_gamma.h>


int **affected, **unaffected;
int **affected_perm, **unaffected_perm;
int naff, nunaff, nind, nsnps;
float *sum_R, *sum_P, *sum_RP, *sum_C;
int *outlier_R, *outlier_P, *outlier_RP, *outlier_C;
int *start, *end;
char **gene;
float *weight;

void alloca_data()
{
  int i;

  affected=new int*[nind+100];
  unaffected=new int*[nind+100];
  affected_perm=new int*[nind+100];
  unaffected_perm=new int*[nind+100];
  sum_R=new float[nsnps+100];
  sum_P=new float[nsnps+100];
  sum_RP=new float[nsnps+100];
  sum_C=new float[nsnps+100];
  outlier_R=new int[nsnps+100];
  outlier_P=new int[nsnps+100];
  outlier_RP=new int[nsnps+100];
  outlier_C=new int[nsnps+100];
  start=new int[nsnps+100];
  end=new int[nsnps+100];
  gene=new char*[nsnps];
  weight=new float[nsnps+100];

  for (i=1; i<=nind; i++)
    affected[i]=new int[nsnps+100];
  for (i=1; i<=nind; i++)
    unaffected[i]=new int[nsnps+100];
  
  for (i=1; i<=nind; i++)
    affected_perm[i]=new int[nsnps+100];
  for (i=1; i<=nind; i++)
    unaffected_perm[i]=new int[nsnps+100];

  for (i=1; i<=nsnps; i++)
    gene[i]=new char[100];


}

main(int argc, char *argv[])
{

  int i, j, k, ig, k0, b, a, aff;
  int npermut, ngenes;
  float freq; 
  FILE *fin, *fout;
 
  const gsl_rng_type * T;
  gsl_rng * r;
  gsl_rng_env_setup();
  T = gsl_rng_default;
  r = gsl_rng_alloc (T);
  gsl_rng_set(r, time(0));

  nind=atoi(argv[1]);
  nsnps=atoi(argv[2]);
  ngenes=atoi(argv[3]);
  freq=atof(argv[4]);
  npermut=atoi(argv[5]);

  alloca_data();

  naff=0; nunaff=0;
  // read 1st input file: data.ped
  /***********************************************************************/
  fin=fopen("data.ped","r");
  for (i=1; i<=nind; i++)
    {
      fscanf(fin,"%d %d %d %d %d %d",&a,&a,&a,&a,&a,&aff);
      if (aff==2) { naff=naff+1;
    for (j=1; j<=nsnps; j++)
       fscanf(fin,"%d",&affected[naff][j]);
      }
      if (aff==1) { nunaff=nunaff+1;
	for (j=1; j<=nsnps; j++)
	  fscanf(fin,"%d",&unaffected[nunaff][j]);
      }
    }
  fclose(fin);
  /***********************************************************************/
  // read 2nd input file: gene_position.txt
  fin=fopen("gene_position.txt","r");
  for (i=1; i<=ngenes; i++)
     fscanf(fin,"%s %d %d",gene[i],&start[i],&end[i]);
  fclose(fin);

  /***********************************************************************/
  // read 3rd input file: gene_position.txt  
  fin=fopen("weights.txt","r");
  for (i=1; i<=nsnps; i++)
    fscanf(fin,"%f",&weight[i]);
  fclose(fin);
 
  /***********************************************************************/



  // Compute risk, protective, max and sum statistics in the paper
  /***********************************************************************/
  int count, count1;
  for (ig=1; ig<=ngenes; ig++)
    {
  sum_RP[ig]=0; 
  sum_R[ig]=0;
  sum_P[ig]=0;
  sum_C[ig]=0;

  for (i=start[ig]; i<=end[ig]; i++)
    {
      count=0;
      count1=0;
      for (j=1; j<=naff; j++)
	  count=count+affected[j][i];
      for (j=1; j<=nunaff; j++)
	  count1=count1+unaffected[j][i];
     
      float f=1;
      float w;  
      k0=(int)(freq*2*nunaff);
      if (count>0) f=gsl_cdf_poisson_P(count1,nunaff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count-1,naff*(count+count1)/((float) naff+nunaff)));
      if ((count1<=k0) && (((float)count)/naff>((float)count1)/nunaff)) sum_R[ig]=sum_R[ig]-weight[i]*log(f); 

      f=1;
      k0=(int)(freq*2*naff);
      if (count1>0) f=gsl_cdf_poisson_P(count,naff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count1-1,nunaff*(count+count1)/((float) naff+nunaff)));
      if ((count<=k0) && (((float)count1)/nunaff>((float)count)/naff)) sum_P[ig]=sum_P[ig]-weight[i]*log(f);
    }

     // The max and sum statistics in the manuscript: R - potentially risk and P - potentially protective
     sum_RP[ig]=fmax(sum_R[ig],sum_P[ig]);
     sum_C[ig]=sum_R[ig]+sum_P[ig];
    }
  
  /***********************************************************************/
    

  //Empirical P-values via Permutations
  /***********************************************************************/
  for (ig=1; ig<=ngenes; ig++)
    { 
      outlier_RP[ig]=0; 
      outlier_R[ig]=0;
      outlier_P[ig]=0;
      outlier_C[ig]=0;
    }

  int permut[naff+nunaff+100];
  const size_t N = naff+nunaff;
  gsl_permutation * p = gsl_permutation_alloc (N);
  gsl_permutation_init (p);
 
  for (b=1; b<=npermut; b++)
    {
       /********** Generate permutations ********************************/
      gsl_ran_shuffle (r, p->data, N, sizeof(size_t));
      for (i=1; i<=(naff+nunaff); i++)
	permut[i]=p->data[i-1];

      for (i=1; i<=nsnps; i++)
        {
          for (j=1; j<=naff; j++)
            {                               
              if (permut[j]+1<=naff) affected_perm[j][i]=affected[permut[j]+1][i];
              else affected_perm[j][i]=unaffected[permut[j]+1-naff][i];
            }
          for (j=1; j<=nunaff; j++)
            {
              if (permut[j+naff]+1<=naff) unaffected_perm[j][i]=affected[permut[j+naff]+1][i];
              else unaffected_perm[j][i]=unaffected[permut[j+naff]+1-naff][i];
            }
	}
    

      for (ig=1; ig<=ngenes; ig++)
	{
	  float sum_perm_R=0, sum_perm_P=0, sum_perm_RP=0, sum_perm_C=0;
        for (i=start[ig]; i<=end[ig]; i++)
	{
	  count=0;
	  count1=0;
	  for (j=1; j<=naff; j++)
	      count=count+affected_perm[j][i];
	  for (j=1; j<=nunaff; j++)
	      count1=count1+unaffected_perm[j][i];  
	 
	  float f=1;
	  k0=(int)(freq*2*nunaff);
	  if (count>0) f=gsl_cdf_poisson_P(count1,nunaff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count-1,naff*(count+count1)/((float) naff+nunaff)));
          if ((count1<=k0) && (count/((float)naff)>count1/((float)nunaff))) sum_perm_R=sum_perm_R-weight[i]*log(f);

	  f=1;
          k0=(int)(freq*2*naff);
	  if (count1>0) f=gsl_cdf_poisson_P(count,naff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count1-1,nunaff*(count+count1)/((float) naff+nunaff)));
	  if ((count<=k0) && (((float)count1)/nunaff>((float)count)/naff)) sum_perm_P=sum_perm_P-weight[i]*log(f);
	}

      sum_perm_RP=fmax(sum_perm_R,sum_perm_P);
      sum_perm_C=sum_perm_R+sum_perm_P;
      if (sum_perm_RP>=sum_RP[ig]) outlier_RP[ig]=outlier_RP[ig]+1;
      if (sum_perm_R>=sum_R[ig]) outlier_R[ig]=outlier_R[ig]+1;
      if (sum_perm_P>=sum_P[ig]) outlier_P[ig]=outlier_P[ig]+1;
      if (sum_perm_C>=sum_C[ig]) outlier_C[ig]=outlier_C[ig]+1;
	}
    }
  /***********************************************************************/
  
    // output files results.txt
    /***********************************************************************/
    fout=fopen("results.txt","w");
    fprintf(fout,"The empirical P-values (risk, protective, max, sum) based on %d permutations are:\n",b-1);
    fprintf(fout,"Gene Risk_P Protective_P RP_P C_P \n");
    for (ig=1; ig<=ngenes; ig++)
      fprintf(fout,"%s %f %f %f %f\n",gene[ig],outlier_R[ig]/(b-1.0),outlier_P[ig]/(b-1.0),outlier_RP[ig]/(b-1.0),outlier_C[ig]/(b-1.0));
    fclose(fout); 
}


