Skip Navigation
  Print Page

Biostatistics and Bioinformatics Branch (BBB)

Skip sharing on social media links
Share this:
Skip Internal Navigation

A Cross-population Extended Haplotype-based Homozygosity Score Test (xp-EHHST)

xpEHHST.cc

#include "xpEHHST.h"

int main(int argc, char **argv)
          {using namespace std;

          xpEHHT xpehht1,xpehht2;

          /*read in the 1st population data*/
          xpehht1.load_info(argv[1]);
          xpehht1.load_data(argv[2]);

          /*read in the 2nd population data*/
          xpehht2.load_info(argv[3]);
          xpehht2.load_data(argv[4]);

          for (int i = 0; i < xpehht1.snp_num; i++)
                xpehht1.compute_allele_freq(i);
          for (int i = 0; i < xpehht2.snp_num; i++)
                xpehht2.compute_allele_freq(i);

          int len = strlen(argv[5]);
          char *str_ehh = new char[len + 8 + 1];
          str_ehh = strcpy(str_ehh, argv[5]);

          xpehht1.out_ehh = strcat(str_ehh, ".out");
          FILE * output_ehh = fopen(xpehht1.out_ehh, "wt");
          xpehht1.output_ehh = output_ehh;
          fprintf(output_ehh, "Marker location ID1 ID2 S_AB p-value\n");

          for (int i = 0; i < xpehht1.snp_num; i++)
                {
                xpehht1.compute_ehhst(i);
                }

          for (int i = 0; i < xpehht2.snp_num; i++)
                {
                xpehht2.compute_ehhst(i);
                }

          /*find the intersection of two SNP lists based on location*/

          vector<int> snp;
          set_intersection(xpehht1.phy_map.begin(),xpehht1.phy_map.end(),xpehht2.phy_map.begin(),xpehht2.phy_map.end(),back_inserter(snp));

// vector<int>::iterator it;
// cout << "SNP list contains:";
                //for ( it=snp.begin() ; it < snp.begin()+10/*snp.end()*/; it++ )
// cout << " " << *it;
                //cout << endl;

          int n1=xpehht1.data.size()/2;
          int n2=xpehht2.data.size()/2;
          //cout << n1 <<endl;
                //cout << n2 <<endl;

          for (int k = 0; k < snp.size(); k++)
                {
            int index1=distance(xpehht1.phy_map.begin(), find(xpehht1.phy_map.begin(),xpehht1.phy_map.end(),snp[k]));
                int index2=distance(xpehht2.phy_map.begin(), find(xpehht2.phy_map.begin(),xpehht2.phy_map.end(),snp[k]));
                //cout << xpehht1.musav[index1] <<endl;
                //cout << xpehht2.sigmasav[index2] <<endl;
                //cout << xpehht2.Ssav[index2] <<endl;

                double s_ab=xpehht1.Ssav[index1]*sqrt(xpehht1.sigmasav[index1]/n1)-xpehht2.Ssav[index2]*sqrt(xpehht2.sigmasav[index2]/n2);
                double sigma_ab=sqrt(((n1-1)*xpehht1.sigmasav[index1]+(n2-1)*xpehht2.sigmasav[index2])/(n1+n2-2));
                s_ab=s_ab/( sigma_ab * sqrt( 1.0/n1 + 1.0/n2 )); // Zhong ming missed: * sqrt( 1.0/n1 + 1.0/n2 )

                //cout << s_ab <<endl;
            fprintf(output_ehh, "%11s %10d %10d %10d %7.3f %10.3f\n",
            xpehht1.snp_list[index1].c_str(), xpehht1.phy_map[index1], index1, index2, s_ab,2 * xpehht1.ndist(s_ab, true));
            }

          /*calculate cross-population test statistic: To be determined, Kruskal-Wallis as a alternative*/
          // S_AB=(S_A*sigma_A/sqrt(n_A)-S_B*sigma_B/sqrt(n_B))/sigma_AB
          // sigma_AB=sqrt((n_A-1)*sigma_A^2+(n_B-1)*sigma_B^2)/(n_A+n_B-2))

          //test code
                //vector<float>::iterator it2;
                //cout << "mean counts in pop 2 are:";
                //for ( it2=xpehht2.musav.begin() ; it2 < xpehht2.musav.begin()+10/*snp.end()*/; it2++ )
// cout << " " << *it2;
                //cout << endl;

//find the common SNP index in each population using find_if function

//for a given common SNP, find the corresponding sigma_2 and S

     fclose(output_ehh);

     delete [] str_ehh;
     return 0;
     }

void xpEHHT::load_info(char *infofile)
     {
     FILE *fp = fopen(infofile, "r");
     if (fp == 0)
                return;
     char line[128];

     fgets(line, 128, fp);//skip the first line, revised by Ming
     while (fgets(line, 128, fp))
                {
                char *str = strtok(line, "\t"); // the space in "\t" is a tab
                string marker(str);

                snp_list.push_back(marker);

                str = strtok(NULL, "\t");
                phy_map.push_back(atoi(str));

                str = strtok(NULL, "\t");
                      allele_coded_0.push_back(str);

                str = strtok(NULL, "\t");
                      allele_coded_1.push_back(str);
                }
     fclose(fp);
     }

int xpEHHT::load_data(char *datafile)
     {
     FILE *fp = fopen(datafile, "r");
     if (fp == 0)
                return -1;

     vector<char> line;

     while (1)
                {
                char in = fgetc(fp);
                if (isalnum(in))
                                {
                     line.push_back(in);
                     continue;
                     }
                if (in == '\n' || in == '\r' || in == EOF)
                     {
                     // convert line to single char array
                     if (line.size() > 0)
                                {
                                char *pl = new char[line.size()+1];
                                memset(pl,0,line.size()+1);
                                for (int i = 0; i < line.size(); i++)
                                           {
                                        pl[i] = line[i];
                                        }
                                data.push_back(pl);
                                           }
                                           line.clear();
                     }

                if (in == EOF)
                     break;
               }

          fclose(fp); // added by Andrew Redd on March 6 2009//

          if (data.size() < 1)
                          return -2;

          snp_num = strlen(data[0]);
          //printf("snp_num = %d, data.size() = %d \n", snp_num, data.size());
          return 1;
          }

void xpEHHT::compute_allele_freq(int index)
          {
          double freq = 0.0;
          for (int i = 0; i < data.size(); i++)
               {
               if (data[i][index] == '0')
                    freq++;
               }

          p.push_back( freq / data.size() );
          }

void xpEHHT::compute_ehhst(int index)
          {
          //printf("index = %d \r", index);
          double T = count_homozygotes( index );

          double mu, EM;
          mu = EM = mean_M ( index );

          //calculate ER and ER^2
          mean_R = mean_Rsq = 0.0;
          mean_R_Rsq_hap ( index );
          if (mean_R * mean_R > mean_Rsq + epsilon)
                    printf("ERROR: mean_R * mean_R = %f > mean_Rsq = %f\n", mean_R * mean_R, mean_Rsq);

          // calculate EL and EL^2
          mean_L = mean_Lsq = 0.0;
          mean_L_Lsq_hap ( index );
          if (mean_L * mean_L > mean_Lsq + epsilon)
                    printf("ERROR: mean_L * mean_L = %f > mean_Lsq = %f\n", mean_L * mean_L, mean_Lsq);

          // calculate ELR
          /*double mean_LR = expectation_LR ( index );
          printf("%11s, ", snp_list[index].c_str());
          printf("mean_R = %f, mean_Rsq = %f, ", mean_R, mean_Rsq);
          printf("mean_L = %f, mean_Lsq = %f, ", mean_L, mean_Lsq);
          printf("mean_LR = %f\n\n", mean_LR);*/

          mu += mean_R + mean_L;

          double sigma = EM - EM * EM;
          sigma += (mean_Rsq - mean_R * mean_R) + (mean_Lsq - mean_L * mean_L);
          sigma += 2.0 * (mean_R + mean_L) * (1.0 - EM);
          //sigma += 2.0 * (mean_LR - mean_R * mean_L);
          sigma += 2.0 * mean_R * mean_L * ( 1.0/EM -1.0 );

          double S = T - (data.size()/2.0) * mu;
          S = S / sqrt( data.size() * sigma/2.0 );
          double pvalue = 2 * ndist(S, true);

          //Tsav.push_back(T);
          musav.push_back(mu);
          sigmasav.push_back(sigma);
          Ssav.push_back(S);

          //fprintf(output_ehh, "%11s %10d %.3f %7.0f %7.3f %7.3f %7.3f ",
          //snp_list[index].c_str(), phy_map[index], p[index], T, mu, sigma, S);
          //if (pvalue > 0.05)
          // fprintf(output_ehh, "\n");
          //else if (pvalue > 0.0001)
          // fprintf(output_ehh, " %7.5f \n", pvalue);
          //else
          // fprintf(output_ehh, " %7.1e \n", pvalue);
          }

//calculate the mean of M, under HWE
double xpEHHT::mean_M (int index)
          {
                    return p[index] * p[index] + (1 - p[index]) * (1 - p[index]);
          }

double xpEHHT::count_homozygotes(int index)
          {
          double T =0;
          int num_ind = data.size()/2;

          for (int i = 0; i < num_ind; i++)
                    {
                    if (data[2*i][index] == data[2*i + 1][index])
                              {
                                        T++;
                    for (int j = index + 1; j <= snp_num; j++) //count right-hand side homozygotes
{//if (index == 0 && 2*i < 8) printf("T = %f, 2*i = %d, j = %d, data[2*i][j] = %c, data[2*i + 1][j] = %c \n", T, 2* i, j,data[2*i][j], data[2*i + 1][j]);
                              if (data[2*i][j] == data[2*i + 1][j])
                                                  T++;
                                              else
                                                  break;
                              }
                                        for (int j = index - 1; j >= 0; j--) //count left-hand side homozygotes
                              {
                              if (data[2*i][j] == data[2*i + 1][j])
                                        T++;
                                     else
                                        break;
                                     }
                              }
                    }

          return T;
          }

// functions for haplotype data
void xpEHHT::mean_R_Rsq_hap (int index)
          {
          int test = 0;

          for (int r = 3; r <= snp_num - index + 1; r++)
                    {
                    map<const char*, int, strCmp> hset;
                    map<const char*, int, strCmp>::iterator xx;
                    map<const char*, int, strCmp>::iterator yy;

                              for (int i = 0; i < data.size(); i++)
                              {
                                        char *str = new char[r + 1];
                                   memset(str, 0, r+1);
                              for (int j = 0; j < r; j++)
                                        str[j] = data[i][index + j];

                              //insert a key when necessary
                                        xx = hset.find(str);
                              if (xx == hset.end())
                                        hset[str] = 1;
                                    else
                                        {
                              (*xx).second += 1;
                                        delete [] str;
                              }
                    }

          xx = hset.begin();
                    double PR = 0.0;
                    while (xx != hset.end())
                    {
                              for (yy = hset.begin(); yy != hset.end(); yy++)
                                        {
                                                  if ((*xx).first != (*yy).first && memcmp((*xx).first, (*yy).first, r - 1) == 0)
                                                            PR += 1.0 * (*xx).second * (*yy).second / data.size() / data.size();
                                                  }
                    xx++;
                    }

                    //delete the keys to save memory and to avoid leaking problems, method by Xiong Hao
                    for (map<const char*, int, strCmp>::const_iterator xx = hset.begin(); xx != hset.end(); ++xx)
                              delete [] xx->first;

          hset.clear();

          mean_R += (r - 2.0) * PR;
                    mean_Rsq += (r - 2.0) * (r - 2.0) * PR;

          if ( (r - 2.0) * (r - 2.0) * PR < epsilon )
                    test +=1;
                 else
                    test = 0;

                 if (test > TEST)
                              break;
          }
}

void xpEHHT::mean_L_Lsq_hap (int index)
          {
          int test = 0;

          for (int ell = 3; ell <= index + 1; ell++)
                    {
                    map<const char*, int, strCmp> hset;
                    map<const char*, int, strCmp>::iterator xx;
                    map<const char*, int, strCmp>::iterator yy;

                              for (int i = 0; i < data.size(); i++)
                              {
                              char *str = new char[ell + 1];
                              memset(str, 0, ell + 1);
                              for (int j = 0; j < ell; j++)
                                        str[j] = data[i][index - j];

                                     xx = hset.find(str);
                              if (xx == hset.end())
                                  hset[str] = 1;
                              else
                                  {
                                        (*xx).second += 1;
                                  delete[](str);
                                        }
                              }

          xx = hset.begin();
                    double PL = 0.0;
                    while (xx != hset.end())
                    {
                              for (yy = hset.begin(); yy != hset.end(); yy++)
                                        {
                                                  if ((*xx).first != (*yy).first && memcmp((*xx).first, (*yy).first, ell - 1) == 0)
                                                  PL += 1.0 * (*xx).second * (*yy).second / data.size() / data.size();
                                        }
                              xx++;
                    }

          //delete the keys to save memory and to avoidleaking problems, method by Xiong Hao
          for (map<const char*, int, strCmp>::const_iterator xx = hset.begin(); xx != hset.end(); ++xx)
                    delete [] xx->first;

          hset.clear();

          mean_L += (ell - 2.0) * PL;
                    mean_Lsq += (ell - 2.0) * (ell - 2.0) * PL;

          if ( (ell - 2.0) * (ell - 2.0) * PL < epsilon )
                    test +=1;
               else
                    test = 0;

               if (test > TEST)
                              break;
               }
          }

// The normal distribution function
//#define LOWER_TAIL_ONE 7.5
//#define UPPER_TAIL_ZERO 500

double xpEHHT::ndist(double z, bool upper)
          {
          // C version of ID Hill, "The Normal Integral"
          // Applied Statistics, Vol 22, pp. 424-427

          // If 7 digit accuracy is enough, alternative is
          // return erfcc(x / M_SQRT2) * 0.5;

          if (z < 0)
                    {
                    upper = !upper;
                    z = -z;
          }

          //if (z > LOWER_TAIL_ONE && !upper || z > UPPER_TAIL_ZERO)
             // return (upper) ? 0.0 : 1.0;

          double p, y = 0.5 * z * z;

          if (z < 1.28)
                    {
                    p = 0.5 - z * (0.398942280444 - 0.399903438504 * y /
                                        (y + 5.75885480458 - 29.8213557808 /
                                        (y + 2.62433121679 + 48.6959930692 /
                                        (y + 5.92885724438))));
                    }
          else
                    {
                    p = 0.398942270385 * exp (-y) /
                           (z - 2.8052e-8 + 1.00000615302 /
                           (z + 3.98064794e-4 + 1.98615381364 /
                           (z - 0.151679116635 + 5.29330324926 /
                           (z + 4.8385912808 - 15.1508972451 /
                           (z + 0.742380924027 + 30.789933034 /
                           (z + 3.99019417011))))));
                    }

          return p;
          //return (upper) ? p : 1 - p;
          }

Last Updated Date: 04/12/2013
Last Reviewed Date: 04/12/2013

Contact Information

Name: Dr Paul Albert
Chief and Senior Investigator
Biostatistics and Bioinformatics Branch
Phone: 301-496-5582
E-mail: albertp@mail.nih.gov

Staff Directory
Vision National Institutes of Health Home BOND National Institues of Health Home Home Storz Lab: Section on Environmental Gene Regulation Home Machner Lab: Unit on Microbial Pathogenesis Home Division of Intramural Population Health Research Home Bonifacino Lab: Section on Intracellular Protein Trafficking Home Lilly Lab: Section on Gamete Development Home Lippincott-Schwartz Lab: Section on Organelle Biology