lib/codebias.c

Go to the documentation of this file.
00001 /* codebias.c - stuff for managing codons and codon bias. 
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 
00006 #include "common.h"
00007 #include "dnautil.h"
00008 #include "hmmstats.h"
00009 #include "codebias.h"
00010 
00011 static char const rcsid[] = "$Id: codebias.c,v 1.6 2005/04/10 14:41:21 markd Exp $";
00012 
00013 
00014 struct codonBias *codonLoadBias(char *fileName)
00015 /* Create scaled log codon bias tables based on .cod file.  
00016  * You can freeMem it when you're done. */
00017 {
00018 struct codonBias *cb;
00019 char line[1024];
00020 int lineCount = 0;
00021 char *words[128];
00022 int wordCount;
00023 int i = 0, j = 0;
00024 int skip = 0;
00025 boolean getMark0 = FALSE;
00026 boolean getMark1 = FALSE;
00027 FILE *f = mustOpen(fileName, "r");
00028 int val;
00029 
00030 AllocVar(cb);
00031 while (fgets(line, sizeof(line), f) )
00032     {
00033     ++lineCount;
00034     if (skip)
00035         {
00036         skip -= 1;
00037         continue;
00038         }
00039     if (getMark1)
00040         {
00041         wordCount = chopLine(line, words);
00042         if (wordCount != 65)
00043             errAbort("Bad line %d of %s\n", lineCount, fileName);
00044         for (j=0; j<64; ++j)
00045             {
00046             val = atoi(words[j+1]);
00047             if (val == 0)
00048                 cb->mark1[i][j] = scaledLog(1.0E-20);
00049             else
00050                 cb->mark1[i][j] = scaledLog(0.001*val);
00051             }
00052         if ((i += 1) == 64)
00053             getMark1 = FALSE;
00054         }
00055     else if (getMark0)
00056         {
00057         wordCount = chopLine(line, words);
00058         if (wordCount != 64)
00059             errAbort("Bad line %d of %s\n", lineCount, fileName);
00060         for (j=0; j<64; ++j)
00061             {
00062             val = atoi(words[j]);
00063             if (val == 0)
00064                 cb->mark0[j] = scaledLog(1.0E-20);
00065             else
00066                 cb->mark0[j] = scaledLog(0.001*val);
00067             }
00068         getMark0 = FALSE;
00069         }
00070     else if (startsWith("Markov", line))
00071         {
00072         wordCount = chopLine(line, words);
00073         if (wordCount != 2)
00074             errAbort("Bad line %d of %s\n", lineCount, fileName);
00075         if (sameString(words[1], "0"))
00076             getMark0 = TRUE;
00077         else if (sameString(words[1], "1"))
00078             getMark1 = TRUE;
00079         else
00080             errAbort("Bad line %d of %s\n", lineCount, fileName);
00081         skip = 3;
00082         }
00083     }
00084 fclose(f);
00085 return cb;
00086 }
00087    
00088 static void unN(DNA *dna, int dnaSize)
00089 /* Turn N's into T's. */
00090 {
00091 int i;
00092 int val;
00093 for (i=0; i<dnaSize; ++i)
00094     {
00095     if ((val = ntVal[(int)dna[i]]) < 0)
00096         dna[i] = 't';
00097     }
00098 }
00099 
00100 int codonFindFrame(DNA *dna, int dnaSize, struct codonBias *forBias)
00101 /* Assuming a stretch of DNA is an exon, find most likely frame that it's in. 
00102  * Beware this routine will replace N's with T's in the input dna.*/
00103 {
00104 double logOneFourth = log(0.25);
00105 double logProbs[3];
00106 int frame;
00107 int dnaIx;
00108 double logP;
00109 double bestLogP = -0x6fffffff;
00110 int bestFrame = -1;
00111 int lastDnaStart = dnaSize-3;
00112 DNA *d;
00113 int codon = 0, lastCodon; 
00114 
00115 unN(dna, dnaSize);
00116 for (frame=0; frame<3; ++frame)
00117     {
00118     /* Partial first codon just gets even background score. */
00119     logP = frame*logOneFourth;
00120     /* 1st order model on first full codon. */
00121     if (frame <= lastDnaStart)
00122         {
00123         d = dna+frame;
00124         codon = (ntVal[(int)d[0]]<<4) + (ntVal[(int)d[1]]<<2) + ntVal[(int)d[2]];
00125         logP += forBias->mark0[codon];
00126         }
00127     /* 2nd order model on subsequent full codons. */
00128     for (dnaIx = frame+3; dnaIx <= lastDnaStart; dnaIx += 3)
00129         {
00130         lastCodon = codon;
00131         d = dna+dnaIx;
00132         codon = (ntVal[(int)d[0]]<<4) + (ntVal[(int)d[1]]<<2) + ntVal[(int)d[2]];
00133         logP += forBias->mark1[lastCodon][codon];
00134         }
00135     /* Partial last codon gets even background score. */
00136     logP += (dnaSize-dnaIx)*logOneFourth;
00137     logProbs[frame] = logP;
00138     if (logP > bestLogP)
00139         {
00140         bestLogP = logP;
00141         bestFrame = frame;
00142         }
00143     }
00144 return bestFrame;
00145 }
00146 
00147 
00148 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2