/* C++ code implementing the rare variant testing strategy by Iuliana Ionita-Laza et al, Plos Genet, 2011 */ #include #include #include #include #include #include #include #include #include #include #include int **affected, **unaffected; int **affected_perm, **unaffected_perm; int naff, nunaff, nind, nsnps; float *sum_R, *sum_P, *sum_RP, *sum_C; int *outlier_R, *outlier_P, *outlier_RP, *outlier_C; int *start, *end; char **gene; float *weight; void alloca_data() { int i; affected=new int*[nind+100]; unaffected=new int*[nind+100]; affected_perm=new int*[nind+100]; unaffected_perm=new int*[nind+100]; sum_R=new float[nsnps+100]; sum_P=new float[nsnps+100]; sum_RP=new float[nsnps+100]; sum_C=new float[nsnps+100]; outlier_R=new int[nsnps+100]; outlier_P=new int[nsnps+100]; outlier_RP=new int[nsnps+100]; outlier_C=new int[nsnps+100]; start=new int[nsnps+100]; end=new int[nsnps+100]; gene=new char*[nsnps]; weight=new float[nsnps+100]; for (i=1; i<=nind; i++) affected[i]=new int[nsnps+100]; for (i=1; i<=nind; i++) unaffected[i]=new int[nsnps+100]; for (i=1; i<=nind; i++) affected_perm[i]=new int[nsnps+100]; for (i=1; i<=nind; i++) unaffected_perm[i]=new int[nsnps+100]; for (i=1; i<=nsnps; i++) gene[i]=new char[100]; } main(int argc, char *argv[]) { int i, j, k, ig, k0, b, a, aff; int npermut, ngenes; float freq; FILE *fin, *fout; const gsl_rng_type * T; gsl_rng * r; gsl_rng_env_setup(); T = gsl_rng_default; r = gsl_rng_alloc (T); gsl_rng_set(r, time(0)); nind=atoi(argv[1]); nsnps=atoi(argv[2]); ngenes=atoi(argv[3]); freq=atof(argv[4]); npermut=atoi(argv[5]); alloca_data(); naff=0; nunaff=0; // read 1st input file: data.ped /***********************************************************************/ fin=fopen("data.ped","r"); for (i=1; i<=nind; i++) { fscanf(fin,"%d %d %d %d %d %d",&a,&a,&a,&a,&a,&aff); if (aff==2) { naff=naff+1; for (j=1; j<=nsnps; j++) fscanf(fin,"%d",&affected[naff][j]); } if (aff==1) { nunaff=nunaff+1; for (j=1; j<=nsnps; j++) fscanf(fin,"%d",&unaffected[nunaff][j]); } } fclose(fin); /***********************************************************************/ // read 2nd input file: gene_position.txt fin=fopen("gene_position.txt","r"); for (i=1; i<=ngenes; i++) fscanf(fin,"%s %d %d",gene[i],&start[i],&end[i]); fclose(fin); /***********************************************************************/ // read 3rd input file: gene_position.txt fin=fopen("weights.txt","r"); for (i=1; i<=nsnps; i++) fscanf(fin,"%f",&weight[i]); fclose(fin); /***********************************************************************/ // Compute risk, protective, max and sum statistics in the paper /***********************************************************************/ int count, count1; for (ig=1; ig<=ngenes; ig++) { sum_RP[ig]=0; sum_R[ig]=0; sum_P[ig]=0; sum_C[ig]=0; for (i=start[ig]; i<=end[ig]; i++) { count=0; count1=0; for (j=1; j<=naff; j++) count=count+affected[j][i]; for (j=1; j<=nunaff; j++) count1=count1+unaffected[j][i]; float f=1; float w; k0=(int)(freq*2*nunaff); if (count>0) f=gsl_cdf_poisson_P(count1,nunaff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count-1,naff*(count+count1)/((float) naff+nunaff))); if ((count1<=k0) && (((float)count)/naff>((float)count1)/nunaff)) sum_R[ig]=sum_R[ig]-weight[i]*log(f); f=1; k0=(int)(freq*2*naff); if (count1>0) f=gsl_cdf_poisson_P(count,naff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count1-1,nunaff*(count+count1)/((float) naff+nunaff))); if ((count<=k0) && (((float)count1)/nunaff>((float)count)/naff)) sum_P[ig]=sum_P[ig]-weight[i]*log(f); } // The max and sum statistics in the manuscript: R - potentially risk and P - potentially protective sum_RP[ig]=fmax(sum_R[ig],sum_P[ig]); sum_C[ig]=sum_R[ig]+sum_P[ig]; } /***********************************************************************/ //Empirical P-values via Permutations /***********************************************************************/ for (ig=1; ig<=ngenes; ig++) { outlier_RP[ig]=0; outlier_R[ig]=0; outlier_P[ig]=0; outlier_C[ig]=0; } int permut[naff+nunaff+100]; const size_t N = naff+nunaff; gsl_permutation * p = gsl_permutation_alloc (N); gsl_permutation_init (p); for (b=1; b<=npermut; b++) { /********** Generate permutations ********************************/ gsl_ran_shuffle (r, p->data, N, sizeof(size_t)); for (i=1; i<=(naff+nunaff); i++) permut[i]=p->data[i-1]; for (i=1; i<=nsnps; i++) { for (j=1; j<=naff; j++) { if (permut[j]+1<=naff) affected_perm[j][i]=affected[permut[j]+1][i]; else affected_perm[j][i]=unaffected[permut[j]+1-naff][i]; } for (j=1; j<=nunaff; j++) { if (permut[j+naff]+1<=naff) unaffected_perm[j][i]=affected[permut[j+naff]+1][i]; else unaffected_perm[j][i]=unaffected[permut[j+naff]+1-naff][i]; } } for (ig=1; ig<=ngenes; ig++) { float sum_perm_R=0, sum_perm_P=0, sum_perm_RP=0, sum_perm_C=0; for (i=start[ig]; i<=end[ig]; i++) { count=0; count1=0; for (j=1; j<=naff; j++) count=count+affected_perm[j][i]; for (j=1; j<=nunaff; j++) count1=count1+unaffected_perm[j][i]; float f=1; k0=(int)(freq*2*nunaff); if (count>0) f=gsl_cdf_poisson_P(count1,nunaff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count-1,naff*(count+count1)/((float) naff+nunaff))); if ((count1<=k0) && (count/((float)naff)>count1/((float)nunaff))) sum_perm_R=sum_perm_R-weight[i]*log(f); f=1; k0=(int)(freq*2*naff); if (count1>0) f=gsl_cdf_poisson_P(count,naff*(count+count1)/((float)naff+nunaff))*(1-gsl_cdf_poisson_P(count1-1,nunaff*(count+count1)/((float) naff+nunaff))); if ((count<=k0) && (((float)count1)/nunaff>((float)count)/naff)) sum_perm_P=sum_perm_P-weight[i]*log(f); } sum_perm_RP=fmax(sum_perm_R,sum_perm_P); sum_perm_C=sum_perm_R+sum_perm_P; if (sum_perm_RP>=sum_RP[ig]) outlier_RP[ig]=outlier_RP[ig]+1; if (sum_perm_R>=sum_R[ig]) outlier_R[ig]=outlier_R[ig]+1; if (sum_perm_P>=sum_P[ig]) outlier_P[ig]=outlier_P[ig]+1; if (sum_perm_C>=sum_C[ig]) outlier_C[ig]=outlier_C[ig]+1; } } /***********************************************************************/ // output files results.txt /***********************************************************************/ fout=fopen("results.txt","w"); fprintf(fout,"The empirical P-values (risk, protective, max, sum) based on %d permutations are:\n",b-1); fprintf(fout,"Gene Risk_P Protective_P RP_P C_P \n"); for (ig=1; ig<=ngenes; ig++) fprintf(fout,"%s %f %f %f %f\n",gene[ig],outlier_R[ig]/(b-1.0),outlier_P[ig]/(b-1.0),outlier_RP[ig]/(b-1.0),outlier_C[ig]/(b-1.0)); fclose(fout); }