lib/dnaMarkov.c

Go to the documentation of this file.
00001 /* dnaMarkov - stuff to build 1st, 2nd, 3rd, and coding
00002  * 3rd degree Markov models for DNA. */
00003 
00004 #include "common.h"
00005 #include "dnautil.h"
00006 #include "dnaseq.h"
00007 #include "slog.h"
00008 #include "dnaMarkov.h"
00009 
00010 static char const rcsid[] = "$Id: dnaMarkov.c,v 1.4 2005/04/10 14:41:22 markd Exp $";
00011 
00012 void dnaMark0(struct dnaSeq *seqList, double mark0[5], int slogMark0[5])
00013 /* Figure out frequency of bases in input.  Results go into
00014  * mark0 and optionally in scaled log form into slogMark0.
00015  * Order is N, T, C, A, G.  (TCAG is our normal order) */
00016 {
00017 struct dnaSeq *seq;
00018 int histo[4];
00019 int oneHisto[4];
00020 double total;
00021 int i;
00022 double *freq = mark0+1;
00023 
00024 zeroBytes(histo, sizeof(histo));
00025 for (seq = seqList; seq != NULL; seq = seq->next)
00026     {
00027     dnaBaseHistogram(seq->dna, seq->size, oneHisto);
00028     for (i=0; i<4; ++i)
00029         histo[i] += oneHisto[i];
00030     }
00031 total = histo[0] + histo[1] + histo[2] + histo[3];
00032 freq[-1] = 1.0;
00033 for (i=0; i<4; ++i)
00034     freq[i] = (double)histo[i] / total;
00035 if (slogMark0 != NULL)
00036     {
00037     int *slogFreq = slogMark0 + 1;
00038     slogFreq[-1] = 0;
00039     for (i=0; i<4; ++i)
00040         slogFreq[i] = slog(freq[i]);
00041     }
00042 }
00043 
00044 
00045 void dnaMark1(struct dnaSeq *seqList, double mark0[5], int slogMark0[5], 
00046         double mark1[5][5], int slogMark1[5][5])
00047 /* Make up 1st order Markov model - probability that one nucleotide
00048  * will follow another. Input is sequence and 0th order Markov models.
00049  * Output is first order Markov model. slogMark1 can be NULL. */
00050 {
00051 struct dnaSeq *seq;
00052 DNA *dna, *endDna;
00053 int i,j;
00054 int histo[5][5];
00055 int hist1[5];
00056 
00057 zeroBytes(histo, sizeof(histo));
00058 zeroBytes(hist1, sizeof(hist1));
00059 for (seq = seqList; seq != NULL; seq = seq->next)
00060     {
00061     dna = seq->dna;
00062     endDna = dna + seq->size-1;
00063     for (;dna < endDna; ++dna)
00064         {
00065         i = ntVal[(int)dna[0]];
00066         j = ntVal[(int)dna[1]];
00067         hist1[i+1] += 1;
00068         histo[i+1][j+1] += 1;
00069         }
00070     }
00071 for (i=0; i<5; ++i)
00072     {
00073     for (j=0; j<5; ++j)
00074         {
00075         double mark1Val;
00076         int matVal = histo[i][j] + 1;
00077         mark1Val = ((double)matVal)/(hist1[i]+5);
00078         mark1[i][j] = mark1Val;
00079         if (slogMark1 != NULL)
00080             slogMark1[i][j] = slog(mark1Val);
00081         }
00082     }
00083 for (i=0; i<5; ++i)
00084     {
00085     mark1[i][0] = 1;
00086     mark1[0][i] = mark0[i];
00087     if (slogMark1 != NULL)
00088         {
00089         slogMark1[i][0] = 0;
00090         slogMark1[0][i] = slogMark0[i];
00091         }
00092     }
00093 }
00094 
00095 void dnaMarkTriple(struct dnaSeq *seqList, 
00096     double mark0[5], int slogMark0[5],
00097     double mark1[5][5], int slogMark1[5][5],
00098     double mark2[5][5][5], int slogMark2[5][5][5],
00099     int offset, int advance, int earlyEnd)
00100 /* Make up a table of how the probability of a nucleotide depends on the previous two.
00101  * Depending on offset and advance parameters this could either be a straight 2nd order
00102  * Markov model, or a model for a particular coding frame. */
00103 {
00104 struct dnaSeq *seq;
00105 DNA *dna, *endDna;
00106 int i,j,k;
00107 int histo[5][5][5];
00108 int hist2[5][5];
00109 int total = 0;
00110 zeroBytes(histo, sizeof(histo));
00111 zeroBytes(hist2, sizeof(hist2));
00112 for (seq = seqList; seq != NULL; seq = seq->next)
00113     {
00114     dna = seq->dna;
00115     endDna = dna + seq->size - earlyEnd - 2;
00116     dna += offset;
00117     for (;dna < endDna; dna += advance)
00118         {
00119         i = ntVal[(int)dna[0]];
00120         j = ntVal[(int)dna[1]];
00121         k = ntVal[(int)dna[2]];
00122         hist2[i+1][j+1] += 1;
00123         histo[i+1][j+1][k+1] += 1;
00124         total += 1;
00125         }
00126     }
00127 for (i=0; i<5; ++i)
00128     {
00129     for (j=0; j<5; ++j)
00130         {
00131         for (k=0; k<5; ++k)
00132             {
00133             double markVal;
00134             int matVal = histo[i][j][k]+1;
00135             if (i == 0 || j == 0 || k == 0)
00136                 {
00137                 if (k == 0)
00138                     {
00139                     mark2[i][j][k] = 1;
00140                     if (slogMark2 != NULL)
00141                         slogMark2[i][j][k] = 0;
00142                     }
00143                 else if (j == 0)
00144                     {
00145                     mark2[i][j][k] = mark0[k];
00146                     if (slogMark2 != NULL)
00147                         slogMark2[i][j][k] = slogMark0[k];
00148                     }
00149                 else if (i == 0)
00150                     {
00151                     mark2[i][j][k] = mark1[j][k];
00152                     if (slogMark2 != NULL)
00153                         slogMark2[i][j][k] = slogMark1[j][k];
00154                     }
00155                 }
00156             else
00157                 {
00158                 markVal = ((double)matVal)/(hist2[i][j]+5);
00159                 mark2[i][j][k] = markVal;
00160                 if (slogMark2 != NULL)
00161                     slogMark2[i][j][k] = slog(markVal);
00162                 }
00163             }
00164         }
00165     }
00166 }
00167 
00168 void dnaMark2(struct dnaSeq *seqList, double mark0[5], int slogMark0[5],
00169         double mark1[5][5], int slogMark1[5][5],
00170         double mark2[5][5][5], int slogMark2[5][5][5])
00171 /* Make up 1st order Markov model - probability that one nucleotide
00172  * will follow the previous two. */
00173 {
00174 dnaMarkTriple(seqList, mark0, slogMark0, mark1, slogMark1, 
00175         mark2, slogMark2, 0, 1, 0);
00176 }
00177 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2