jkOwnLib/xensmall.c

Go to the documentation of this file.
00001 /* xensmall.c - align using dynamic programming. */
00002 /* Copyright 2000-2003 Jim Kent.  All rights reserved. */
00003 
00004 #include "common.h"
00005 #include "memalloc.h"
00006 #include "cheapcgi.h"
00007 #include "dnautil.h"
00008 #include "xenalign.h"
00009 #include "pairHmm.h"
00010 
00011 static char const rcsid[] = "$Id: xensmall.c,v 1.11 2006/03/15 18:36:16 angie Exp $";
00012 
00013 static double calcGcRatio(DNA *a, int aSize, DNA *b, int bSize)
00014 /* Figure out percentage of g/c in a and b. */
00015 {
00016 int counts[4];
00017 int i;
00018 int total = 4;
00019 DNA base;
00020 int val;
00021 int gcCount;
00022 
00023 for (i=0; i<ArraySize(counts); ++i)
00024     counts[i] = 1;
00025 for (i=0; i<aSize; ++i)
00026     {
00027     base = a[i];
00028     if ((val = ntVal[(int)base]) >= 0)
00029         {
00030         counts[val] += 1;
00031         total += 1;
00032         }
00033     }    
00034 for (i=0; i<bSize; ++i)
00035     {
00036     base = b[i];
00037     if ((val = ntVal[(int)base]) >= 0)
00038         {
00039         counts[val] += 1;
00040         total += 1;
00041         }
00042     }    
00043 gcCount = counts[C_BASE_VAL] + counts[G_BASE_VAL];
00044 return (double)gcCount/(double)total;
00045 }
00046 
00047 /* Our seven matrices. */
00048 enum sevenStateIx
00049     {
00050     hiFiIx = 0,
00051     qSlipIx,
00052     tSlipIx,
00053     loFiIx,
00054     c1Ix,
00055     c2Ix,
00056     c3Ix,
00057     };
00058 
00059 
00060 static int loFiFromHiFiCost;
00061 static int loFiFromCodingCost;
00062 static int loFiFromInsertCost;
00063 
00064 static int hiFiFromLoFiCost;
00065 static int hiFiFromCodingCost;
00066 static int hiFiFromInsertCost;
00067 
00068 static int codingFromHiFiCost;
00069 static int codingFromLoFiCost;
00070 static int codingFromInsertCost;
00071 
00072 static int insertFromHiFiCost;
00073 static int insertFromLoFiCost;
00074 static int insertFromCodingCost;
00075 
00076 static int loFiSmallGapCost;
00077 static int hiFiSmallGapCost;
00078 static int codingSmallGapCost;
00079 static int largeGapExtensionCost;
00080 
00081 
00082 static int scaledLogOfFraction(double p, double q)
00083 /* return log of fraction scaled by 1000 */
00084 {
00085 return round(1000*log((double)p/q) );
00086 }
00087 
00088 static int transitionCost(double prob)
00089 /* Calculates the cost of transition of given probability */
00090 {
00091 return round(1000*(log(prob)));
00092 }
00093 
00094 static int c1c2MatchTable[26*32];
00095 static int c3MatchTable[26*32];
00096 static int hiFiMatchTable[26*32];
00097 static int loFiMatchTable[26*32];
00098 
00099 #define letterIx(a) ((a)-'a')
00100 
00101 static void makeMatchTable(int matchTable[], double matchProb, double gcRatio)
00102 /* Make table of match/mismatch cost/benefits for a given probability of
00103  * matching and a given % of GC. */
00104 {
00105 double atRatio = 1.0 - gcRatio;
00106 int unlikely = round(1000*log(0.0001));
00107 double mismatchProb = 1.0 - matchProb;
00108 double freq[4];
00109 DNA a,b;
00110 int i,j;
00111 
00112 freq[A_BASE_VAL] = freq[T_BASE_VAL] = atRatio/2;
00113 freq[G_BASE_VAL] = freq[C_BASE_VAL] = gcRatio/2;
00114 
00115 for (i=0; i<26*32; ++i)
00116         matchTable[i] = unlikely;
00117 
00118 for (i=0; i<4; ++i)
00119     {
00120     a =  valToNt[i];
00121     for (j=0; j<4; ++j)
00122         {
00123         b = valToNt[j];
00124         if (a == b)
00125             matchTable[32*letterIx(a) + letterIx(b)] = scaledLogOfFraction(matchProb, freq[j]);
00126         else
00127             matchTable[32*letterIx(a) + letterIx(b)] = scaledLogOfFraction(mismatchProb, 1.0 - freq[j]);
00128         }
00129     }
00130 }
00131 
00132 static void makeWobbleMatchTable(int matchTable[], double gcRatio)
00133 /* Make table of match/mismatch costs for the wobble codon. */
00134 {
00135 /* Assume 53% match, 33% wobble-match 14% mismatch. */
00136 double atRatio = 1.0 - gcRatio;
00137 int unlikely = round(1000*log(0.0001));
00138 double freq[4];
00139 int i;
00140 double matchProb = 0.53;
00141 double wobbleProb = 0.33;
00142 double mismatchProb = 0.14*0.5;   /* Two ways to mismatch. */
00143 
00144 freq[A_BASE_VAL] = freq[T_BASE_VAL] = atRatio/2;
00145 freq[G_BASE_VAL] = freq[C_BASE_VAL] = gcRatio/2;
00146 
00147 for (i=0; i<26*32; ++i)
00148        matchTable[i] = unlikely;
00149 matchTable[32*letterIx('a') + letterIx('a')] = scaledLogOfFraction(matchProb, freq[A_BASE_VAL]);
00150 matchTable[32*letterIx('a') + letterIx('c')] = scaledLogOfFraction(mismatchProb, freq[C_BASE_VAL]);
00151 matchTable[32*letterIx('a') + letterIx('g')] = scaledLogOfFraction(wobbleProb, freq[G_BASE_VAL]);
00152 matchTable[32*letterIx('a') + letterIx('t')] = scaledLogOfFraction(mismatchProb, freq[T_BASE_VAL]);
00153 
00154 matchTable[32*letterIx('c') + letterIx('a')] = scaledLogOfFraction(mismatchProb, freq[A_BASE_VAL]);
00155 matchTable[32*letterIx('c') + letterIx('c')] = scaledLogOfFraction(matchProb, freq[C_BASE_VAL]);
00156 matchTable[32*letterIx('c') + letterIx('g')] = scaledLogOfFraction(mismatchProb, freq[G_BASE_VAL]);
00157 matchTable[32*letterIx('c') + letterIx('t')] = scaledLogOfFraction(wobbleProb, freq[T_BASE_VAL]);
00158 
00159 matchTable[32*letterIx('g') + letterIx('a')] = scaledLogOfFraction(wobbleProb, freq[A_BASE_VAL]);
00160 matchTable[32*letterIx('g') + letterIx('c')] = scaledLogOfFraction(mismatchProb, freq[C_BASE_VAL]);
00161 matchTable[32*letterIx('g') + letterIx('g')] = scaledLogOfFraction(matchProb, freq[G_BASE_VAL]);
00162 matchTable[32*letterIx('g') + letterIx('t')] = scaledLogOfFraction(mismatchProb, freq[T_BASE_VAL]);
00163 
00164 matchTable[32*letterIx('t') + letterIx('a')] = scaledLogOfFraction(mismatchProb, freq[A_BASE_VAL]);
00165 matchTable[32*letterIx('t') + letterIx('c')] = scaledLogOfFraction(wobbleProb, freq[C_BASE_VAL]);
00166 matchTable[32*letterIx('t') + letterIx('g')] = scaledLogOfFraction(mismatchProb, freq[G_BASE_VAL]);
00167 matchTable[32*letterIx('t') + letterIx('t')] = scaledLogOfFraction(matchProb, freq[T_BASE_VAL]);
00168 }
00169 
00170 static void calcCostBenefit(double gcRatio)
00171 /* Figure out weights to put on all the arrows... */
00172 {
00173 loFiFromHiFiCost = transitionCost(1.0/30.0);
00174 codingFromHiFiCost = transitionCost(1.0/60.0);
00175 insertFromHiFiCost = transitionCost(1.0/1000.0);
00176 
00177 loFiFromCodingCost = transitionCost(1.0/200.0);
00178 hiFiFromCodingCost = transitionCost(1.0/300.0);
00179 insertFromCodingCost = transitionCost(1.0/1000.0);
00180 
00181 hiFiFromLoFiCost = transitionCost(1.0/100.0);
00182 codingFromLoFiCost = transitionCost(1.0/150.0);
00183 insertFromLoFiCost = transitionCost(1.0/400.0);
00184 
00185 loFiFromInsertCost = transitionCost(1.0/60.0);
00186 hiFiFromInsertCost = transitionCost(1.0/1000.0);
00187 codingFromInsertCost = transitionCost(1.0/1000.0);
00188 
00189 largeGapExtensionCost = scaledLogOfFraction(74,75);
00190 
00191 loFiSmallGapCost = scaledLogOfFraction(1,10);
00192 hiFiSmallGapCost = scaledLogOfFraction(1,25);
00193 codingSmallGapCost = scaledLogOfFraction(1,300);
00194 
00195 makeMatchTable(c1c2MatchTable, 0.92, gcRatio);
00196 //c1c2matchBonus = scaledLogOfFraction(0.92, 0.25);   /* 1.33 1.28 */
00197 //c1c2mismatchCost = scaledLogOfFraction(0.08, 0.75); /* -2.71 -2.01 */
00198 
00199 makeWobbleMatchTable(c3MatchTable, gcRatio);
00200 //c3matchBonus = scaledLogOfFraction(0.6, 0.25);
00201 //assert(c3matchBonus > 0);
00202 //c3mismatchCost = scaledLogOfFraction(0.4, 0.75);
00203 
00204 makeMatchTable(loFiMatchTable, 0.5, gcRatio);
00205 //loFiMatchBonus = scaledLogOfFraction(0.5, 0.25);
00206 //loFiMismatchCost = scaledLogOfFraction(0.5, 0.75);
00207 
00208 makeMatchTable(hiFiMatchTable, 0.9, gcRatio);
00209 //hiFiMatchBonus = scaledLogOfFraction(0.9, 0.25);
00210 //hiFiMismatchCost = scaledLogOfFraction(0.1, 0.75);
00211 }
00212 
00213 
00214 int xenAlignSmall(DNA *query, int querySize, DNA *target, int targetSize, FILE *f,
00215     boolean printExtraAtEnds)
00216 /* Use dynamic programming to do small scale (querySize * targetSize < 10,000,000)
00217  * alignment of DNA. */
00218 {
00219 struct phmmMatrix *a;
00220 struct phmmState *hf, *lf, *iq, *it, *c1, *c2, *c3;
00221 int qIx, tIx, sIx;  /* Query, target, and state indices */
00222 int rowOffset, newCellOffset;
00223 struct phmmAliPair *pairList;
00224 int matchOff, qSlipOff, tSlipOff;
00225 int bestScore = -0x4fffffff;
00226 struct phmmMommy *bestCell = NULL;
00227 int c1c2PairScore, c3PairScore, loFiPairScore, hiFiPairScore;
00228 int matchTableOffset;
00229 double gcRatio;
00230 
00231 /* Check that it's not too big. */
00232 if ((double)targetSize * querySize > 1.1E7)
00233     errAbort("Can't align %d x %d, too big\n", querySize, targetSize);
00234 
00235 /* Set up graph edge costs/benefits */
00236 gcRatio = calcGcRatio(query, querySize, target, targetSize);
00237 calcCostBenefit(gcRatio);
00238 
00239 /* Initialize 7 state matrix. */
00240 a = phmmMatrixNew(7, query, querySize, target, targetSize);
00241 hf = phmmNameState(a, hiFiIx, "highFi", 'H');
00242 lf = phmmNameState(a, loFiIx, "lowFi", 'L');
00243 iq = phmmNameState(a, qSlipIx, "qSlip", 'Q');
00244 it = phmmNameState(a, tSlipIx, "tSlip", 'T');
00245 c1 = phmmNameState(a, c1Ix, "frame1", '1');
00246 c2 = phmmNameState(a, c2Ix, "frame2", '2');
00247 c3 = phmmNameState(a, c3Ix, "frame3", '3');
00248 
00249 qSlipOff = -a->qDim;
00250 tSlipOff = -1;
00251 matchOff = qSlipOff + tSlipOff;
00252 
00253 for (tIx = 1; tIx < a->tDim; tIx += 1)
00254     {
00255     UBYTE mommy = 0;
00256     int score, tempScore;
00257 
00258 /* Macros to make me less mixed up when accessing scores from row arrays.*/
00259 #define matchScore lastScores[qIx-1]
00260 #define qSlipScore lastScores[qIx]
00261 #define tSlipScore scores[qIx-1]
00262 #define newScore scores[qIx]
00263 
00264 /* Start up state block (with all ways to enter state) */
00265 #define startState(state) \
00266    score = 0;
00267 
00268 /* Define a transition from state while advancing over both
00269  * target and query. */
00270 #define matchState(state, addScore) \
00271    { \
00272    if ((tempScore = state->matchScore + addScore) > score) \
00273         { \
00274         mommy = phmmPackMommy(state->stateIx, -1, -1); \
00275         score = tempScore; \
00276         } \
00277    } 
00278 
00279 /* Define a transition from state while slipping query
00280  * and advancing target. */
00281 #define qSlipState(state, addScore) \
00282    { \
00283    if ((tempScore = state->qSlipScore + addScore) > score) \
00284         { \
00285         mommy = phmmPackMommy(state->stateIx, 0, -1); \
00286         score = tempScore; \
00287         } \
00288    }
00289 
00290 /* Define a transition from state while slipping target
00291  * and advancing query. */
00292 #define tSlipState(state, addScore) \
00293    { \
00294    if ((tempScore = state->tSlipScore + addScore) > score) \
00295         { \
00296         mommy = phmmPackMommy(state->stateIx, -1, 0); \
00297         score = tempScore; \
00298         } \
00299    }
00300 
00301 /* End a block of transitions into state. */
00302 #define endState(state) \
00303     { \
00304     struct phmmMommy *newCell = state->cells + newCellOffset; \
00305     if (score <= 0) \
00306         { \
00307         mommy = phmmNullMommy; \
00308         score = 0; \
00309         } \
00310     newCell->mommy = mommy; \
00311     state->newScore = score; \
00312     if (score > bestScore) \
00313         { \
00314         bestScore = score; \
00315         bestCell = newCell; \
00316         } \
00317     } 
00318 
00319 /* End a state that you know won't produce an optimal
00320  * final score. */
00321 #define shortEndState(state) \
00322     { \
00323     struct phmmMommy *newCell = state->cells + newCellOffset; \
00324     if (score <= 0) \
00325         { \
00326         mommy = phmmNullMommy; \
00327         score = 0; \
00328         } \
00329     newCell->mommy = mommy; \
00330     state->newScore = score; \
00331     }
00332 
00333     rowOffset = tIx*a->qDim;
00334     for (qIx = 1; qIx < a->qDim; qIx += 1)
00335         {
00336         int qBase = letterIx(a->query[qIx-1]);
00337         int tBase = letterIx(a->target[tIx-1]);
00338 
00339         newCellOffset = rowOffset + qIx;
00340         
00341         /* Figure the cost or bonus for pairing target and query residue here. */
00342         matchTableOffset = (qBase<<5) + tBase;
00343         c1c2PairScore = c1c2MatchTable[matchTableOffset];
00344         c3PairScore = c3MatchTable[matchTableOffset];
00345         hiFiPairScore = hiFiMatchTable[matchTableOffset];
00346         loFiPairScore = loFiMatchTable[matchTableOffset];
00347 
00348         /* Update hiFi space. */
00349             {
00350             startState(hf);
00351             matchState(hf, hiFiPairScore);
00352             qSlipState(hf, hiFiSmallGapCost);
00353             tSlipState(hf, hiFiSmallGapCost);
00354             matchState(iq, hiFiPairScore + hiFiFromInsertCost);
00355             matchState(it, hiFiPairScore + hiFiFromInsertCost);
00356             matchState(lf, hiFiPairScore + hiFiFromLoFiCost);
00357             matchState(c1, hiFiPairScore + hiFiFromCodingCost);
00358             matchState(c2, hiFiPairScore + hiFiFromCodingCost);
00359             matchState(c3, hiFiPairScore + hiFiFromCodingCost);
00360             endState(hf);
00361             }
00362 
00363         /* Update loFi space. */
00364             {
00365             startState(lf);
00366             matchState(lf, loFiPairScore);
00367             qSlipState(lf, loFiSmallGapCost);
00368             tSlipState(lf, loFiSmallGapCost);
00369             matchState(iq, loFiPairScore + loFiFromInsertCost);
00370             matchState(it, loFiPairScore + loFiFromInsertCost);
00371             matchState(hf, loFiPairScore + loFiFromHiFiCost);
00372             matchState(c1, loFiPairScore + loFiFromCodingCost);
00373             matchState(c2, loFiPairScore + loFiFromCodingCost);
00374             matchState(c3, loFiPairScore + loFiFromCodingCost);
00375             endState(lf);
00376             }
00377 
00378         /* Update query slip space. */
00379             {
00380             startState(iq);
00381             qSlipState(iq, largeGapExtensionCost);
00382             qSlipState(hf, insertFromHiFiCost);            
00383             qSlipState(lf, insertFromLoFiCost);
00384             qSlipState(c1, insertFromCodingCost);
00385             qSlipState(c2, insertFromCodingCost);
00386             qSlipState(c3, insertFromCodingCost);
00387             qSlipState(it, largeGapExtensionCost); /* Allow double gaps, T first always. */
00388             shortEndState(iq);
00389             }
00390         
00391         /* Update target slip space. */
00392             {
00393             startState(it);
00394             tSlipState(it, largeGapExtensionCost);
00395             tSlipState(hf, insertFromHiFiCost);            
00396             tSlipState(lf, insertFromLoFiCost);
00397             tSlipState(c1, insertFromCodingCost);
00398             tSlipState(c2, insertFromCodingCost);
00399             tSlipState(c3, insertFromCodingCost);
00400             shortEndState(it);
00401             }
00402 
00403         /* Update coding1 space. */
00404             {
00405             startState(c1);
00406             matchState(c3, c1c2PairScore);
00407             qSlipState(c1, codingSmallGapCost);
00408             tSlipState(c1, codingSmallGapCost);
00409             matchState(iq, c1c2PairScore + codingFromInsertCost);
00410             matchState(it, c1c2PairScore + codingFromInsertCost);
00411             matchState(lf, c1c2PairScore + codingFromLoFiCost);
00412             matchState(hf, c1c2PairScore + codingFromHiFiCost);
00413             endState(c1);
00414             }
00415 
00416         /* Update coding2 space. */
00417             {
00418             startState(c2);
00419             matchState(c1, c1c2PairScore);
00420             qSlipState(c2, codingSmallGapCost);
00421             tSlipState(c2, codingSmallGapCost);
00422             matchState(iq, c1c2PairScore + codingFromInsertCost);
00423             matchState(it, c1c2PairScore + codingFromInsertCost);
00424             matchState(lf, c1c2PairScore + codingFromLoFiCost);
00425             matchState(hf, c1c2PairScore + codingFromHiFiCost);
00426             endState(c2);
00427             }
00428 
00429 
00430         /* Update coding3 space. */
00431             {
00432             startState(c3);
00433             matchState(c2, c3PairScore);
00434             qSlipState(c3, codingSmallGapCost);
00435             tSlipState(c3, codingSmallGapCost);
00436             matchState(iq, c3PairScore + codingFromInsertCost);
00437             matchState(it, c3PairScore + codingFromInsertCost);
00438             matchState(lf, c3PairScore + codingFromLoFiCost);
00439             matchState(hf, c3PairScore + codingFromHiFiCost);
00440             endState(c3);
00441             }
00442         }
00443     /* Swap score columns so current becomes last, and last gets
00444      * reused. */
00445     for (sIx = 0; sIx < a->stateCount; ++sIx)
00446         {
00447         struct phmmState *as = &a->states[sIx];
00448         int *swapTemp = as->lastScores;
00449         as->lastScores = as->scores;
00450         as->scores = swapTemp;
00451         }
00452     }
00453 
00454 /* Trace back from best scoring cell. */
00455 pairList = phmmTraceBack(a, bestCell);
00456 phmmPrintTrace(a, pairList, TRUE, f, printExtraAtEnds);
00457 
00458 slFreeList(&pairList);
00459 phmmMatrixFree(&a);
00460 return bestScore;
00461 #undef matchScore
00462 #undef qSlipScore
00463 #undef tSlipScore
00464 #undef newScore
00465 #undef startState
00466 #undef matchState
00467 #undef qSlipState
00468 #undef tSlipState
00469 #undef shortEndState
00470 #undef endState
00471 }
00472 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2