jkOwnLib/fuzzyFind.c

Go to the documentation of this file.
00001 /* Copyright 1999-2003 Jim Kent.  All rights reserved. */
00002 /* fuzzyFind - searches a large DNA sequence (the haystack) for 
00003  * a smaller DNA sequence (the needle).  What makes this tricky
00004  * is that the match need not be exact.
00005  *
00006  * The algorithm looks for exact matches to (typically ten) evenly 
00007  * spaced subsequences of the needle long enough for the match to
00008  * occur only once or less by chance in the haystack.  If
00009  * in fact they occur more than 3 times, the subsequence is
00010  * extended by a base.
00011  *
00012  * Next these ten-mers (called "tiles" in the code)
00013  * are extended as far as possible exactly.  Each tile at
00014  * this point may match the haystack in multiple places.
00015  *
00016  * Next the "weave" routines try to find a good 
00017  * way to link the tiles together.  Good is scored
00018  * by a routine which awards a point for a matching
00019  * base, and subtracts the log-base-two of gaps.  It
00020  * penalizes an additionallly if a tile is
00021  * placed before instead of after a previous tile.
00022  *
00023  * Now that a preliminary alignment exists, the algorithm
00024  * searches for exact matches between tiles. These matches
00025  * are constrained to be long enough that they only occur
00026  * once in the space between tiles.  These matches are 
00027  * extended as far as they can be exactly, and added as 
00028  * further tiles to the alignment list.
00029  *
00030  * Finally the tiles are extended inexactly until they
00031  * abutt each other, or until even inexact extension is
00032  * fruitless.  
00033  * 
00034  * The inexact extension is perhaps the trickiest part of the
00035  * algorithm.  It's implemented in expandLeft and expandRight.
00036  * In these the extension is first taken as far as it can go
00037  * exactly.  Then if four of the next five bases match it's
00038  * taken five bases further. Then a break is generated, and the
00039  * next significant match between the needle and haystack is
00040  * scanned for.
00041  */
00042 
00043 static char const rcsid[] = "$Id: fuzzyFind.c,v 1.17 2006/05/09 01:05:55 markd Exp $";
00044 
00045 #include "common.h"
00046 #include "dnautil.h"
00047 #include "localmem.h"
00048 #include "errabort.h"
00049 #include "fuzzyFind.h"
00050 #include "obscure.h"
00051 
00052 /* ffMemory routines - 
00053  *    FuzzyFinder allocates memory internally from the
00054  *    following routines, which allocate blocks from the
00055  *    main allocator and then dole them out.
00056  *
00057  *    At the end of fuzzy-finding, the actual alignment
00058  *    is copied to more permanent memory, and all the
00059  *    blocks freed.  This saves from having to remember
00060  *    to free every little thing, and also is faster.
00061  *
00062  *    If memory can't be allocated this will throw
00063  *    back to the root of the fuzzy-finder system.
00064  *
00065  *    (In typical usage this only allocates about
00066  *    as much memory as the size of the DNA
00067  *    you're scanning though.)
00068  *    
00069  */
00070 
00071 /* debugging stuff. */
00072 void dumpFf(struct ffAli *left, DNA *needle, DNA *hay); 
00073 
00074 /* settable parameter, defaults to constant value */
00075 static jmp_buf ffRecover;
00076 
00077 static void ffAbort()
00078 /* Abort fuzzy finding. */
00079 {
00080 longjmp(ffRecover, -1);
00081 }
00082 
00083 static struct lm *ffMemPool = NULL;
00084 
00085 static void ffMemInit()
00086 /* Initialize fuzzyFinder local memory system. */
00087 {
00088 ffMemPool = lmInit(2048);
00089 }
00090 
00091 static void ffMemCleanup()
00092 /* Free up fuzzyFinder local memory system. */
00093 {
00094 lmCleanup(&ffMemPool);
00095 }
00096 
00097 static void *ffNeedMem(size_t size)
00098 /* Allocate from fuzzyFinder local memory system. */
00099 {
00100 return lmAlloc(ffMemPool, size);
00101 }
00102 
00103 static void makeFreqTable(DNA *dna, int dnaSize, double freq[4])
00104 {
00105 int histo[4];
00106 double total;
00107 int i;
00108 
00109 dnaBaseHistogram(dna, dnaSize, histo);
00110 total = histo[0] + histo[1] + histo[2] + histo[3];
00111 if (total == 0) total = 1;
00112 for (i=0; i<4; ++i)
00113     freq[i] = (double)histo[i] / total;
00114 }
00115 
00116 static double oligoProb(DNA *oligo, int size, double freq[4])
00117 {
00118 double prob = 1.0;
00119 int i;
00120 int baseVal;
00121 
00122 for (i=0; i<size; ++i)
00123     {
00124     if ((baseVal = ntVal[(int)oligo[i]]) >= 0)
00125         prob *= freq[baseVal];
00126     }
00127 return prob;
00128 }
00129 
00130 static boolean findImprobableOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
00131     DNA **rOligo, int *rOligoLength, double *rOligoProb)
00132 /* Find an oligo that's got less than maxProb probability of matching to
00133  * random DNA with the given base frequency distribution. */
00134 {
00135 int i;
00136 double totalProb = 1.0;
00137 int base;
00138 int startIx = 0;
00139 
00140 for (i=0;i<needleLength;++i)
00141     {
00142     if ((base = ntVal[(int)needle[i]]) < 0)
00143         {
00144         totalProb = 1.0;
00145         startIx = i+1;
00146         }
00147     else
00148         {
00149         if ((totalProb *= freq[base]) <= maxProb)
00150             {
00151             *rOligo = needle+startIx;
00152             *rOligoLength = i-startIx+1;
00153             *rOligoProb = totalProb;
00154             return TRUE;
00155             }
00156         }
00157     }
00158 return FALSE;
00159 }
00160 
00161 
00162 static boolean hasRepeat(DNA *oligo, int oligoLen)
00163 /* Returns TRUE if oligo has an internal repeat mod 1, 2, or 3. */
00164 {
00165 int mod;
00166 int i;
00167 DNA b;
00168 boolean gotRepeat;
00169 int repSize;
00170 int maxRep = (oligoLen+1)/2;
00171 
00172 b = oligo[0];
00173 gotRepeat = TRUE;
00174 for (i=1; i<oligoLen; ++i)
00175     {
00176     if (oligo[i] != b)
00177         {
00178         gotRepeat = FALSE;
00179         break;
00180         }
00181     }
00182 if (gotRepeat)
00183     return TRUE;
00184 
00185 gotRepeat = TRUE;
00186 for (i=2; i<oligoLen; ++i)
00187     {
00188     if (oligo[i&1] != oligo[i])
00189         {
00190         gotRepeat = FALSE;
00191         break;
00192         }
00193     }
00194 if (gotRepeat)
00195     return TRUE;
00196 
00197 for (repSize = 3; repSize <= maxRep; ++repSize)
00198     {
00199     mod = 0;
00200     gotRepeat = TRUE;
00201     for (i=repSize; i<oligoLen; ++i)
00202         {
00203         if (oligo[mod] != oligo[i])
00204             {
00205             gotRepeat = FALSE;
00206             break;
00207             }
00208         if (++mod == repSize)
00209             mod = 0;
00210         }
00211     if (gotRepeat)
00212         return TRUE;
00213     }
00214 return gotRepeat;
00215 }
00216 
00217 
00218 
00219 static boolean ffFindGoodOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
00220     DNA **rOligo, int *rOligoLength, double *rOligoProb)
00221 /* Find an oligo that's suitably improbable and doesn't contain 
00222  * short internal repeats. */
00223 {
00224 int oligoLen;
00225 DNA *oligo;
00226 
00227 /* Loop around until you get one that doesn't repeat. */
00228 for (;;)
00229     {
00230     if (!findImprobableOligo(needle, needleLength, maxProb, freq, rOligo, rOligoLength, rOligoProb))
00231         return FALSE;
00232     oligoLen = *rOligoLength;
00233     oligo = *rOligo;
00234     if (hasRepeat(oligo, oligoLen))
00235         {
00236         DNA *newNeedle = oligo+oligoLen;
00237         int newSize = needleLength - (newNeedle-needle);
00238         if (newSize <= 0)
00239             return FALSE;
00240         needle = newNeedle;
00241         needleLength = newSize;
00242         }
00243     else
00244         return TRUE;
00245     }
00246 }
00247 
00248 static boolean leftNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, 
00249     int gapPenalty, int maxSkip)
00250 /* Scan to the left for something that matches the next bit of the needle
00251  * in the haystack. */
00252 {
00253 int haySize = he - hs;
00254 int needleSize = ne - ns;
00255 int diagSize = haySize + needleSize;
00256 int matchSize;
00257 int i;
00258 
00259 /* We take care of bigger skips on the "tile" level. */
00260 if (diagSize > maxSkip)
00261     diagSize = maxSkip;
00262 /* Scan diagonally... */
00263 /*
00264 0 1 2 3
00265 1 2 3 4
00266 2 3 4 5
00267 3 4 5 6
00268 */
00269 for (i=1; i<=diagSize; ++i)
00270     {
00271     int hOff = i;
00272     int nOff = 0;
00273     int hDiff = hOff - haySize;
00274     matchSize = gapPenalty + digitsBaseTwo(i);
00275     if (hDiff > 0)
00276         {
00277         nOff += hDiff;
00278         hOff -= hDiff;
00279         }
00280     for (;hOff >= 0; --hOff, ++nOff)
00281         {
00282         int needleLeft = needleSize - nOff;
00283         int hayLeft = haySize - hOff;
00284         if (matchSize > needleLeft) break;
00285         if (matchSize > hayLeft) continue;
00286         if (ne[-nOff-1] == he[-hOff-1] && memcmp(ne-nOff-matchSize, he-hOff-matchSize, matchSize) == 0)
00287             {
00288             ali->nStart = ne - nOff - matchSize;
00289             ali->nEnd = ne - nOff;
00290             ali->hStart = he - hOff - matchSize;
00291             ali->hEnd = he- hOff;
00292             return TRUE;
00293             }
00294         }
00295     }
00296 return FALSE;
00297 }
00298 
00299 
00300 static boolean rightNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, 
00301     int gapPenalty, int maxSkip)
00302 /* Scan to the right for something that matches the next bit of the needle
00303  * in the haystack. */
00304 {
00305 int haySize = he - hs;
00306 int needleSize = ne - ns;
00307 int diagSize = haySize + needleSize;
00308 int matchSize;
00309 int i;
00310 
00311 /* We take care of bigger skips on the "tile" level. */
00312 if (diagSize > maxSkip)
00313     diagSize = maxSkip;
00314 /* Scan diagonally... */
00315 /*
00316 0 1 2 3
00317 1 2 3 4
00318 2 3 4 5
00319 3 4 5 6
00320 */
00321 for (i=1; i<=diagSize; ++i)
00322     {
00323     int hOff = i;
00324     int nOff = 0;
00325     int hDiff = hOff - haySize;
00326     matchSize = gapPenalty + digitsBaseTwo(i);
00327     if (hDiff > 0)
00328         {
00329         nOff += hDiff;
00330         hOff -= hDiff;
00331         }
00332     for (;hOff >= 0; --hOff, ++nOff)
00333         {
00334         int needleLeft = needleSize - nOff;
00335         int hayLeft = haySize - hOff;
00336         if (matchSize > needleLeft) break;
00337         if (matchSize > hayLeft) continue;
00338         if (ns[nOff] == hs[hOff] && memcmp(ns+nOff, hs+hOff, matchSize) == 0)
00339             {
00340             ali->nStart = ns + nOff;
00341             ali->nEnd = ns + nOff + matchSize;
00342             ali->hStart = hs + hOff;
00343             ali->hEnd = hs + hOff + matchSize;
00344             return TRUE;
00345             }
00346         }
00347     }
00348 return FALSE;
00349 }
00350 
00351 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00352     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip);
00353 
00354 
00355 static boolean expandLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00356     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
00357 /* Given a matching segment, try to expand the aligned parts to the
00358  * right. */
00359 {
00360 DNA *ns = ali->nStart;
00361 DNA *hs = ali->hStart;
00362 int score;
00363 DNA *oldNs = ns;
00364 
00365 for (;;)
00366     {
00367     while (ns > needleStart && hs > hayStart && ns[-1] == hs[-1])
00368         {
00369         --hs;
00370         --ns;
00371         }
00372     if (ns <= needleStart || hs <= hayStart)
00373         {
00374         ali->nStart = ns;
00375         ali->hStart = hs;
00376         return ns != oldNs;
00377         }
00378     /* Find fuzzy score to the left. */
00379         {
00380         int windowSize = 5;
00381         int nSize = ns-needleStart;
00382         int hSize = hs-hayStart;
00383         if (windowSize > nSize) windowSize = nSize;
00384         if (windowSize > hSize) windowSize = hSize;
00385         if (windowSize > 0)
00386             score = dnaScoreMatch(ns-windowSize, hs-windowSize, windowSize); 
00387         else
00388             score = -1;
00389 
00390         if (score >= windowSize-2)
00391             {
00392             hs -= windowSize;
00393             ns -= windowSize;
00394             }
00395         else if (--numSkips >= 0)
00396             {
00397             struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
00398             ali->nStart = ns;
00399             ali->hStart = hs;
00400             if (ns - needleStart < 3 || 
00401                 !leftNextMatch(newAli, needleStart, ns, hayStart, hs, gapPenalty, maxSkip))
00402                 {
00403                 return ns != oldNs; /* We did our best... */
00404                 }
00405             newAli->right = ali;
00406             newAli->left = ali->left;
00407             if (ali->left)
00408                 ali->left->right = newAli;
00409             ali->left = newAli;
00410             ali = newAli;
00411             expandRight(ali, needleStart, ns, hayStart, hs, 0, gapPenalty, maxSkip);
00412             ns = ali->nStart;
00413             hs = ali->hStart;
00414             }
00415         else
00416             {
00417             ali->nStart = ns;
00418             ali->hStart = hs;
00419             return ns != oldNs;
00420             }
00421         }
00422     }
00423 }
00424 
00425 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00426     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
00427 /* Given a matched segment, try to expand it to the right. */
00428 {
00429 int score;
00430 DNA *ne = ali->nEnd;
00431 DNA *he = ali->hEnd;
00432 DNA *oldNe = ne;
00433 
00434 for (;;)
00435     {
00436     /* Expand as far to the right as you can keeping perfect match */
00437     while (ne < needleEnd && he < hayEnd && *ne == *he)
00438         {
00439         ++he;
00440         ++ne;
00441         }
00442 
00443     /* Check to see if have come to the end. */
00444     if (ne >= needleEnd || he >= hayEnd)
00445         {
00446         ali->nEnd = ne;
00447         ali->hEnd = he;
00448         return oldNe != ne;
00449         }
00450 
00451     /* Find fuzzy score to the right. */
00452         {
00453         int windowSize = 5;
00454         int nSize = needleEnd-ne;
00455         int hSize = hayEnd - he;
00456         if (windowSize > nSize) windowSize = nSize;
00457         if (windowSize > hSize) windowSize = hSize;
00458         if (windowSize > 0)
00459             score = dnaScoreMatch(ne, he, windowSize); 
00460         else
00461             score = -1;
00462 
00463         if (score >= windowSize-2)
00464             {
00465             he += windowSize;
00466             ne += windowSize;
00467             }
00468         else if (--numSkips >= 0)
00469             {
00470             struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
00471             ali->nEnd = ne;
00472             ali->hEnd = he;
00473             if (needleEnd - ne < 3 || 
00474                 !rightNextMatch(newAli, ne, needleEnd, he, hayEnd, gapPenalty, maxSkip))
00475                 {
00476                 return oldNe != ne; /* We did our best... */
00477                 }
00478             newAli->left = ali;
00479             newAli->right = ali->right;
00480             if (ali->right)
00481                 ali->right->left = newAli;
00482             ali->right = newAli;
00483             ali = newAli;
00484             expandLeft(ali, ne, needleEnd, he, hayEnd, 0, gapPenalty, maxSkip);
00485             ne = ali->nEnd;
00486             he = ali->hEnd;
00487             }
00488         else 
00489             {
00490             ali->nEnd = ne;
00491             ali->hEnd = he;
00492             return oldNe != ne;
00493             }
00494         }
00495     }
00496 }
00497 
00498 static boolean exactFind(DNA *needle, int nSize, DNA *hay, int hSize, int *hOffset)
00499 /* Look for exact match of needle in haystack. */
00500 {
00501 DNA firstC = needle[0];
00502 DNA *restOfNeedle = needle+1;
00503 int i;
00504 int endIx = hSize - nSize;
00505 int restSize = nSize-1;
00506 
00507 for (i = 0; i<=endIx; ++i)
00508     {
00509     if (firstC == hay[i])
00510         {
00511         if (memcmp(restOfNeedle, hay+i+1, restSize) == 0)
00512             {
00513             *hOffset = i;
00514             return TRUE;
00515             }
00516         }
00517     }
00518 *hOffset = -1;
00519 return FALSE;
00520 }
00521 
00522 #ifdef UNUSED
00523 static int countAlis(struct ffAli *ali)
00524 /* Count number of blocks in alignment. */
00525 {
00526 int count = 0;
00527 if (ali == NULL)
00528     return 0;
00529 while (ali->left)
00530     ali = ali->left;
00531 while (ali)
00532     {
00533     count += 1;
00534     ali = ali->right;
00535     }
00536 return count;
00537 }
00538 #endif /* UNUSED */
00539 
00540 
00541 static struct ffAli *reconsiderAlignedGaps(struct ffAli *ali)
00542 /* If the gap between two blocks is the same in both needle and
00543  * haystack, see if we're actually better off with the two
00544  * blocks merged together. */
00545 {
00546 struct ffAli *a = NULL;
00547 struct ffAli *left = NULL;
00548 
00549 if (ali == NULL)
00550     return NULL;
00551 a = ali;
00552 for (;;)
00553     {
00554     int gapSize;
00555 
00556     /* Advance to next ali */
00557     left = a;
00558     a = a->right;
00559     if (a == NULL)
00560         break;
00561     gapSize = a->nStart - left->nEnd;
00562     if (gapSize == a->hStart - left->hEnd)
00563         {
00564         int gapScore;
00565         int matchScore;
00566         gapScore = -ffCdnaGapPenalty(left, a);
00567         matchScore = dnaScoreMatch(left->nEnd, left->hEnd, gapSize);
00568         if (matchScore > gapScore)
00569             {
00570             /* Make current cover left. RemoveEmpty will take
00571              * care of empty shell of left later. */
00572             a->hStart = left->hEnd = left->hStart;
00573             a->nStart = left->nEnd = left->nStart;
00574             }
00575         }
00576     }
00577 return ali;
00578 }
00579 
00580 static struct ffAli *trimAlis(struct ffAli *aliList)
00581 /* If ends of an ali don't match trim them off. */
00582 {
00583 struct ffAli *ali;
00584 int trimCount = 0;
00585 for (ali = aliList; ali != NULL; ali = ali->right)
00586     {
00587     while (ali->nStart[0] != ali->hStart[0] && ali->nStart < ali->nEnd)
00588         {
00589         ali->nStart += 1;
00590         ali->hStart += 1;
00591         ++trimCount;
00592         }
00593     while (ali->nEnd[-1] != ali->hEnd[-1] && ali->nEnd > ali->nStart)
00594         {
00595         ali->nEnd -= 1;
00596         ali->hEnd -= 1;
00597         ++trimCount;
00598         }
00599     }
00600 return aliList;
00601 }
00602 
00603 static int extendThroughN;  /* Can extend through blocks of N's? */
00604 
00605 void setFfExtendThroughN(boolean val)
00606 /* Set whether or not can extend through N's. */
00607 {
00608 extendThroughN = val;
00609 }
00610 
00611 boolean expandThroughNRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00612     DNA *hayStart, DNA *hayEnd)
00613 /* Expand through up to three N's to the left. */
00614 {
00615 DNA *nEnd = ali->nEnd;
00616 DNA *hEnd = ali->hEnd;
00617 DNA n,h;
00618 boolean expanded = FALSE;
00619 while (nEnd < needleEnd && hEnd < hayEnd)
00620     {
00621     n = *nEnd;
00622     h = *hEnd;
00623     if ((n == h) || 
00624         (n == 'n' && 
00625         (extendThroughN || nEnd + 3 >= needleEnd || nEnd[1] != 'n' || nEnd[2] != 'n' || nEnd[3] != 'n')) ||
00626         (h == 'n' && 
00627         (extendThroughN || hEnd + 3 >= hayEnd || hEnd[1] != 'n' || hEnd[2] != 'n' || hEnd[3] != 'n')))
00628         {
00629         nEnd += 1;
00630         hEnd += 1;
00631         expanded = TRUE;
00632         }
00633     else
00634         break;
00635     }
00636 ali->nEnd = nEnd;
00637 ali->hEnd = hEnd;
00638 return expanded;
00639 }
00640 
00641 boolean expandThroughNLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00642     DNA *hayStart, DNA *hayEnd)
00643 /* Expand through up to three N's to the left. */
00644 {
00645 DNA *nStart = ali->nStart-1;
00646 DNA *hStart = ali->hStart-1;
00647 DNA n,h;
00648 boolean expanded = FALSE;
00649 while (nStart >= needleStart && hStart >= hayStart)
00650     {
00651     n = *nStart;
00652     h = *hStart;
00653     if ((n == h) ||
00654         (n == 'n' && (extendThroughN || nStart - 3 < needleStart || nStart[-1] != 'n' || nStart[-2] != 'n' || nStart[-3] != 'n')) ||
00655         (h == 'n' && (extendThroughN || hStart - 3 < hayStart || hStart[-1] != 'n' || hStart[-2] != 'n' || hStart[-3] != 'n')))
00656         {
00657         nStart -= 1;
00658         hStart -= 1;
00659         expanded = TRUE;
00660         }
00661     else
00662         break;
00663     }
00664 ali->nStart = nStart + 1;
00665 ali->hStart = hStart + 1;
00666 return expanded;
00667 }
00668 
00669 static struct ffAli *expandAlis(struct ffAli *ali, DNA *nStart, DNA *nEnd, DNA *hStart, DNA *hEnd, 
00670     int gapPenalty, int maxSkip)
00671 /* Expand alignment to cover in-between tiles as well. */
00672 {
00673 struct ffAli *a, *left, *right;
00674 DNA *ns, *ne, *hs, *he;
00675 boolean expanded = TRUE;
00676 
00677 while (expanded)
00678     {
00679     expanded = FALSE;
00680     /* Loop through expanding three times. */
00681     /* First just expand through N's that don't require an insertion/deletion. */
00682     for (a = ali; a != NULL; a = a->right)
00683         {
00684         if ((left = a->left) != NULL)
00685             {
00686             ns = left->nEnd;
00687             hs = left->hEnd;
00688             }
00689         else
00690             {
00691             ns = nStart;
00692             hs = hStart;
00693             }
00694         if ((right = a->right) != NULL)
00695             {
00696             ne = right->nStart;
00697             he = right->hStart;
00698             }
00699         else
00700             {
00701             ne = nEnd;
00702             he = hEnd;
00703             }
00704         expanded |= expandThroughNLeft(a, ns, ne, hs, he);
00705         expanded |= expandThroughNRight(a, ns, ne, hs, he);
00706         }
00707     /* Second do other expansion that doesn't require an insertion/deletion. */
00708     for (a = ali; a != NULL; a = a->right)
00709         {
00710         if ((left = a->left) != NULL)
00711             {
00712             ns = left->nEnd;
00713             hs = left->hEnd;
00714             }
00715         else
00716             {
00717             ns = nStart;
00718             hs = hStart;
00719             }
00720         if ((right = a->right) != NULL)
00721             {
00722             ne = right->nStart;
00723             he = right->hStart;
00724             }
00725         else
00726             {
00727             ne = nEnd;
00728             he = hEnd;
00729             }
00730         expanded |= expandLeft(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
00731         expanded |= expandRight(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
00732         }
00733     /* Finally do insertion/deletion. */
00734     for (a = ali; a != NULL; a = a->right)
00735         {
00736         if ((left = a->left) != NULL)
00737             {
00738             ns = left->nEnd;
00739             hs = left->hEnd;
00740             }
00741         else
00742             {
00743             ns = nStart;
00744             hs = hStart;
00745             }
00746         if ((right = a->right) != NULL)
00747             {
00748             ne = right->nStart;
00749             he = right->hStart;
00750             }
00751         else
00752             {
00753             ne = nEnd;
00754             he = hEnd;
00755             }
00756         expanded |= expandLeft(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
00757         expanded |= expandRight(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
00758         }
00759     while (ali->left)
00760         ali = ali->left;
00761     }
00762 return ali;
00763 }
00764 
00765 
00766 
00767 DNA *matchInMem(DNA *ns, DNA *ne, DNA *hs, DNA *he)
00768 {
00769 int nLen = ne - ns;
00770 DNA c = *ns++;
00771 he -= nLen;
00772 nLen -= 1;
00773 while (hs < he)
00774     {   
00775     if (*hs++ == c && memcmp(ns, hs, nLen) == 0)
00776         {
00777         return hs-1;
00778         }
00779     }
00780 return NULL;
00781 }
00782 
00783 struct ffAli *findAliBetween(DNA *tile, int tileSize, DNA *ns, DNA *ne, DNA *hs, DNA *he)
00784 {
00785 DNA *match;
00786 DNA *tileEnd = tile + tileSize;
00787 int tries = 0;
00788 for (;;)
00789     {
00790     if ((match = matchInMem(tile, tileEnd, hs, he)) == NULL)
00791         return NULL;
00792     if (matchInMem(tile, tileEnd, match+1, he) == NULL)
00793         {
00794         /* Got exactly one match, whoopie! */
00795         struct ffAli *ali = ffNeedMem(sizeof(*ali));
00796         ali->nStart = tile;
00797         ali->nEnd = tileEnd;
00798         ali->hStart = match;
00799         ali->hEnd = match + (tileEnd-tile);
00800         ffExpandExactLeft(ali, ns, hs);
00801         ffExpandExactRight(ali, ne, he);
00802         return ali;
00803         }
00804 
00805     /* Got more than one match.  Expand tile to make it more specific.
00806      * In general first add base to start, then base to beginning.  If
00807      * at beginning or end already are constrained.  If already spanning
00808      * full needle, can't expand, so you're done. */
00809     if (tile <= ns)
00810         {
00811         if (tileEnd >= ne)
00812             return NULL;
00813         ++tileEnd;
00814         }
00815     else if (tile >= tileEnd)
00816         {
00817         --tile;
00818         }
00819     else if (tries & 1)
00820         {
00821         ++tileEnd;
00822         }
00823     else
00824         {
00825         --tile;
00826         }    
00827     ++tries;
00828     }    
00829 }
00830 
00831 double evalExactAli(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, int numTiles,
00832     double freq[4])
00833 {
00834 int haySize = he-hs;
00835 double prob = 1.0;
00836 double allPossibles = haySize*numTiles;
00837 
00838 for (;ali != NULL;ali=ali->right)
00839     {
00840     double p = oligoProb(ali->nStart, ali->nEnd - ali->nStart, freq) * allPossibles;
00841     if (p < 1.0)
00842         prob *= p;
00843     }
00844 return prob;
00845 }
00846 
00847 void ffUnlink(struct ffAli **pList, struct ffAli *el)
00848 /* Unlink element from doubly linked list. If leftmost
00849  * element update list pointer. */
00850 {
00851 struct ffAli *list = *pList;
00852 struct ffAli *right = el->right;
00853 struct ffAli *left = el->left;
00854 
00855 if (el == list)    /* If first element need to update list. */
00856     *pList = right;
00857 if (right != NULL)
00858     right->left = left;
00859 if (left != NULL)
00860     left->right = right;
00861 el->left = el->right = NULL;
00862 }
00863 
00864 void unlinkAli(struct ffAli **pList, struct ffAli *ali)
00865 /* Unlink ali from list. */
00866 {
00867 ffUnlink(pList, ali);
00868 }
00869 
00870 struct protoGene
00871     {
00872     struct protoGene *left;
00873     struct protoGene *right;
00874     struct ffAli *hits;
00875     DNA *hStart, *hEnd, *nStart, *nEnd;
00876     int score;
00877     };
00878 
00879 int cmpProtoSize(const void *va, const void *vb)
00880 /* Sort biggest size first. */
00881 {
00882 const struct protoGene *a = *((struct protoGene **)va);
00883 const struct protoGene *b = *((struct protoGene **)vb);
00884 int aSize, bSize;
00885 aSize = a->nEnd - a->nStart;
00886 bSize = b->nEnd - b->nStart;
00887 return (bSize - aSize);
00888 }
00889 
00890 int cmpProtoNeedle(const void *va, const void *vb)
00891 /* Sort smallest offset into needle first. */
00892 {
00893 const struct protoGene *a = *((struct protoGene **)va);
00894 const struct protoGene *b = *((struct protoGene **)vb);
00895 return (a->nStart - b->nStart);
00896 }
00897 
00898 int cmpProtoScore(const void *va, const void *vb)
00899 /* Sort biggest score first. */
00900 {
00901 const struct protoGene *a = *((struct protoGene **)va);
00902 const struct protoGene *b = *((struct protoGene **)vb);
00903 return (b->score - a->score);
00904 }
00905 
00906 boolean canAdd(struct protoGene *a, struct protoGene *b)
00907 /* Can b fit into a?  It can if it doesn't overlap
00908  * by more than 1/4 with what's already there. */
00909 {
00910 struct ffAli *pa;
00911 DNA *bnEnd = b->nEnd;
00912 DNA *bnStart = b->nStart;
00913 DNA *bhEnd = b->hEnd;
00914 DNA *bhStart = b->hStart;
00915 int bSize = bnEnd - bnStart;
00916 int maxOverlap;
00917 for (pa = a->hits; pa != NULL; pa = pa->right)
00918     {
00919     int aSize = pa->nEnd - pa->nStart;
00920     DNA *start = (bnStart > pa->nStart ? bnStart : pa->nStart);
00921     DNA *end = (bnEnd < pa->nEnd ? bnEnd : pa->nEnd);
00922     maxOverlap = (aSize < bSize ? aSize : bSize) / 4;
00923     if (maxOverlap < 2) maxOverlap = 2;
00924     if (end - start >= maxOverlap)
00925        return FALSE;
00926     start = (bhStart > pa->hStart ? bhStart : pa->hStart);
00927     end = (bhEnd < pa->hEnd ? bhEnd : pa->hEnd);
00928     if (end - start >= maxOverlap)
00929         return FALSE;
00930     }
00931 return TRUE;
00932 }
00933 
00934 boolean bestMerger(struct protoGene *list, enum ffStringency stringency,
00935     DNA *ns, DNA *hs, struct protoGene **retA, struct protoGene **retB)
00936 /* Figure out best scoring merger in list. */
00937 {
00938 int bestScore = -0x7fffffff;
00939 int score;
00940 struct protoGene *bestA = NULL, *bestB = NULL;
00941 struct protoGene *a, *b;
00942 boolean isCdna = (stringency == ffCdna);
00943 
00944 if (list == NULL)
00945     return FALSE;
00946 
00947 for (a = list; a != NULL; a = a->right)
00948     {
00949     for (b = a->right; b != NULL; b = b->right)
00950         {
00951         if (canAdd(a,b))
00952             {
00953             int hGap = b->hStart - a->hEnd;
00954             int nGap = b->nStart - a->nEnd;
00955             if (hGap < 0)
00956                 {
00957                 hGap = -8*hGap;
00958                 if (!isCdna || hGap < 32)
00959                     hGap = (hGap*hGap);
00960                 }
00961             if (nGap < 0)
00962                 nGap = -8*nGap;
00963             score = -hGap - nGap*nGap;
00964             if (score > bestScore)
00965                 {
00966                 bestA = a;
00967                 bestB = b;
00968                 bestScore = score;
00969                 }
00970             }
00971         }
00972     }
00973 *retA = bestA;
00974 *retB = bestB;
00975 return bestA != NULL;
00976 }
00977 
00978 void mergeProtoGenes(struct protoGene **pList, struct protoGene *a, struct protoGene *b)
00979 /* Absorb protoGene b into protoGene a. */
00980 {
00981 ffUnlink((struct ffAli**)pList, (struct ffAli *)b);
00982 ffCat(&a->hits, &b->hits);
00983 if (a->hStart > b->hStart)
00984     a->hStart = b->hStart;
00985 if (a->nStart > b->nStart)
00986     a->nStart = b->nStart;
00987 if (a->hEnd < b->hEnd)
00988     a->hEnd = b->hEnd;
00989 if (a->nEnd < b->nEnd)
00990     a->nEnd = b->nEnd;
00991 }
00992 
00993 void lumpProtoGenes(struct protoGene **pList,
00994     DNA *ns, DNA *hs, enum ffStringency stringency)
00995 /* Lump together as many protogenes as we can. */
00996 {
00997 struct protoGene *a, *b;
00998 
00999 while (bestMerger(*pList, stringency, ns, hs, &a, &b))
01000     {
01001     mergeProtoGenes(pList, a, b);
01002     }
01003 }
01004 
01005 struct protoGene *lumpHits(struct ffAli **pHitList, DNA *ns, DNA *hs)
01006 /* Lump together as many hits as can. Criteria - they must be close
01007  * to same "diagonal." That is the distance between them must be
01008  * nearly the same in the needle and the haystack. */
01009 {
01010 struct protoGene proto;
01011 struct protoGene *retProto;
01012 struct ffAli *ali, *nextAli;
01013 int dif, lastDif;
01014 int lumpCount = 1;
01015 
01016 if ((ali = *pHitList) == NULL)
01017     return NULL;
01018 
01019 /* Move first hit onto our crude exon. */
01020 nextAli = ali->right;
01021 proto.left = proto.right = NULL;
01022 unlinkAli(pHitList, ali);
01023 proto.hits = ali;
01024 proto.hStart = ali->hStart;
01025 proto.hEnd = ali->hEnd;
01026 proto.nStart = ali->nStart;
01027 proto.nEnd = ali->nEnd;
01028 lastDif = ali->hStart - ali->nStart;
01029 
01030 /* Go through rest of list and put others that are close to
01031  * on the same diagonal onto this exon. */
01032 while ((ali = nextAli) != NULL)
01033     {
01034     nextAli = ali->right;
01035     dif = ali->hStart - ali->nStart;
01036     if (lastDif - 2 <= dif && dif <= lastDif + 2)
01037         {
01038         lastDif = dif;
01039         unlinkAli(pHitList, ali);
01040         ali->left = proto.hits;
01041         proto.hits = ali;
01042         proto.hEnd = ali->hEnd;
01043         proto.nEnd = ali->nEnd;
01044         ++lumpCount;
01045         }
01046     }
01047 proto.hits = ffMakeRightLinks(proto.hits);
01048 retProto = ffNeedMem(sizeof(*retProto));
01049 memcpy(retProto, &proto, sizeof(*retProto));
01050 return retProto;
01051 }
01052 
01053 #ifdef OLD
01054 void removeThrowbackHits(struct protoGene *proto)
01055 /* Remove hits that cause backtracking in hay coordinates. */
01056 {
01057 struct ffAli *left, *right;
01058 boolean gotThrowback = FALSE;
01059 
01060 right = proto->hits;
01061 for (;;)
01062     {
01063     left = right;
01064     right = right->right;
01065     if (right == NULL)
01066         break;
01067     if (left->hStart > right->hStart)
01068         {
01069         right->hStart = right->hEnd = left->hEnd;
01070         right->nStart = right->nEnd = left->nEnd;
01071         gotThrowback = TRUE;
01072         }
01073     }
01074 if (gotThrowback)
01075     proto->hits = ffRemoveEmptyAlis(proto->hits, FALSE);
01076 }
01077 #endif /* OLD */
01078 
01079 
01080 void thinProtoList(struct protoGene **pList, int maxToTake)
01081 /* Reduce list to only top scoring members.  At this
01082  * point protoList is singly linked on left. */
01083 {
01084 struct protoGene *newList = NULL, *pg, *next;
01085 int i;
01086 
01087 slSort(pList, cmpProtoScore);
01088 for (pg=*pList, i=0; pg != NULL && i < maxToTake; pg = next, ++i)
01089     {
01090     next = pg->left;
01091     pg->left = newList;
01092     newList = pg;
01093     }
01094 *pList = newList;
01095 }
01096 
01097 static struct ffAli *weaveAli(struct ffAli *hitList, 
01098     DNA *ns, DNA *ne, DNA *hs, DNA *he, double freq[4], int *rBestVal, 
01099     enum ffStringency stringency)
01100 /* Weave together best looking alignment out of the hitList list. */
01101 {
01102 struct ffAli *ali, *lastAli;
01103 struct protoGene *proto, *protoList = NULL;
01104 int protoCount = 0;
01105 
01106 /* Sort initial hits and remove duplicates. */
01107 ffAliSort(&hitList, ffCmpHitsHayFirst);
01108 ali = hitList;
01109 for (;;)
01110     {
01111     lastAli = ali;
01112     if ((ali = ali->right) == NULL)
01113         break;
01114     if (ali->hStart == lastAli->hStart && ali->hEnd == lastAli->hEnd
01115        && ali->nStart == lastAli->nStart && ali->nEnd == lastAli->nEnd)
01116        {
01117        unlinkAli(&hitList, lastAli);
01118        }
01119     }
01120 
01121 /* Lump together things which look to be separated only by small
01122  * amounts of noise. */
01123 while ((proto = lumpHits(&hitList, ns, hs)) != NULL)
01124     {
01125     proto->left = protoList;
01126     protoList = proto;
01127     ++protoCount;
01128     }
01129 if (protoCount > 200)
01130     {
01131     thinProtoList(&protoList, 200);
01132     }
01133 protoList = (struct protoGene *)ffMakeRightLinks((struct ffAli*)protoList);
01134 
01135 
01136 /* Lump together any other ones that can, starting with ones
01137  * that go together best. */
01138 ffAliSort((struct ffAli**)(void*)&protoList, cmpProtoNeedle);
01139 lumpProtoGenes(&protoList, ns, hs, stringency );
01140 for (proto = protoList; proto != NULL; proto = proto->right)
01141     {
01142     ffAliSort((struct ffAli**)&proto->hits, ffCmpHitsNeedleFirst);
01143 /*    removeThrowbackHits(proto); */
01144     proto->score = ffScore(proto->hits,stringency);
01145     }
01146 
01147 /* Sort by score and return the best. */
01148 ffAliSort((struct ffAli **)(void*)&protoList, cmpProtoScore);
01149 *rBestVal = protoList->score;
01150 return protoList->hits;
01151 }
01152 
01153 /* This set of variables is set before calling the recursive tile finders - the
01154  * below two routines. */
01155 static double rwFreq[4];
01156 static boolean rwIsCdna;
01157 static boolean rwCheckGoodEnough;
01158 
01159 static struct ffAli *rwFindTilesBetween(DNA *ns, DNA *ne, DNA *hs, DNA *he, 
01160     enum ffStringency stringency, double probMax)
01161 /* Search for more or less regularly spaced exact matches that are
01162  * in the right order. */
01163 {
01164 int searchOffset;
01165 int endTileOffset;
01166 int haySize = he - hs;
01167 int needleSize = ne - ns;
01168 int numTiles = 0;
01169 int bestWeaveVal;
01170 int tileIx;
01171 struct ffAli *hitList = NULL;
01172 struct ffAli *bestAli;
01173 struct ffAli *ali;
01174 int possibleTiles;
01175 double tileProbOne;
01176 
01177 possibleTiles = (haySize - nextPowerOfFour(needleSize));
01178 if (possibleTiles < 1) possibleTiles = 1;
01179 tileProbOne = probMax/possibleTiles;
01180 
01181 searchOffset = 0;
01182 endTileOffset = 0;
01183 tileIx = 0;
01184 for (;;)
01185     {
01186     DNA *tile;
01187     int tileSize;
01188     double tileProb;
01189     DNA *h = hs;
01190 
01191     searchOffset = endTileOffset;
01192     if (!ffFindGoodOligo(ns+searchOffset, needleSize-searchOffset, tileProbOne,
01193         rwFreq, &tile, &tileSize, &tileProb))
01194         {
01195         break;
01196         }
01197     /* Find all parts that match the tile. */
01198     for (;;)
01199         {
01200         if ((h = matchInMem(tile, tile+tileSize, h, he)) == NULL)
01201             break;
01202         ali = ffNeedMem(sizeof(*ali));
01203         ali->hStart = h;
01204         ali->hEnd = ali->hStart + tileSize;
01205         ali->nStart = tile;
01206         ali->nEnd = tile + tileSize;
01207         ali->left = hitList;
01208         hitList = ali;
01209         h += tileSize;
01210         } 
01211     endTileOffset = tile + tileSize - ns;
01212     ++numTiles;
01213     }
01214 if (hitList == NULL)
01215     return NULL;
01216 
01217 hitList = ffMakeRightLinks(hitList);
01218 for (ali = hitList; ali != NULL; ali = ali->right)
01219     {
01220     ffExpandExactLeft(ali, ns, hs);
01221     ffExpandExactRight(ali, ne, he);
01222     }
01223 bestAli = weaveAli(hitList, ns, ne, hs, he, rwFreq, &bestWeaveVal, stringency);
01224 if (rwCheckGoodEnough)
01225     {
01226     double prob;
01227     prob = evalExactAli(bestAli, ns, ne, hs, he, numTiles, rwFreq);
01228     if (prob > 0.1)
01229         return NULL;
01230     rwCheckGoodEnough = FALSE;
01231     }
01232 
01233 return bestAli;
01234 }
01235 
01236 static struct ffAli *exactAli(DNA *ns, DNA *ne, DNA *hs, DNA *he)
01237 /* Return alignment based on exact match. */
01238 {
01239 int exactOffset;
01240 if (exactFind(ns, ne-ns, hs, he-hs, &exactOffset))
01241     {
01242     struct ffAli *ali = ffNeedMem(sizeof(*ali));
01243     ali->nStart = ns;
01244     ali->nEnd = ne;
01245     ali->hStart = hs + exactOffset;
01246     ali->hEnd = ali->hStart + (ne - ns);
01247     if (ali->hEnd > he)
01248         ali->hEnd = he;
01249     return ali;
01250     }
01251 else
01252     return NULL;
01253 }
01254 
01255 static struct ffAli *recursiveWeave(DNA *ns, DNA *ne, DNA *hs, DNA *he, 
01256     enum ffStringency stringency, double probMax, int level, int orientation)
01257 /* Find a set of tiles, then recurse to find set of tiles between the tiles
01258  * at somewhat lower stringency. */
01259 {
01260 struct ffAli *left = NULL, *right = NULL, *aliList;
01261 
01262 if ((left = exactAli(ns, ne, hs, he)) != NULL)
01263     return left;
01264 if (stringency == ffExact)
01265     return NULL;
01266 
01267 aliList = rwFindTilesBetween(ns, ne, hs, he, stringency, probMax);
01268 if (aliList != NULL)
01269     {
01270     DNA *lne, *rns, *lhe, *rhs;
01271     int ndif, hdif;
01272     right = aliList;
01273     if (orientation == 0)
01274         orientation = ffIntronOrientation(aliList);
01275     for (;;)
01276         {
01277         /* Figure out the end points to recurse to. */
01278         if (left != NULL)
01279             {
01280             lne = left->nEnd;
01281             lhe = left->hEnd;
01282             }
01283         else
01284             {
01285             lne = ns;
01286             lhe = hs;
01287             }
01288         if (right != NULL)
01289             {
01290             rns = right->nStart;
01291             rhs = right->hStart;
01292             }
01293         else
01294             {
01295             rns = ne;
01296             rhs = he;
01297             }
01298         ndif = rns-lne;
01299         hdif = rhs-lhe;
01300         /* If a big enough gap left recurse. */
01301         if (ndif >= 5 && hdif >= 5)
01302             {
01303             struct ffAli *newLeft = NULL, *newRight;
01304 
01305             newLeft = recursiveWeave(lne, rns, lhe, rhs, stringency, probMax*2, level+1, 
01306                 orientation);
01307             if (newLeft != NULL)
01308                 {
01309                 /* Insert new tiles between left and right. */
01310                 
01311                 /* Find right end of tile set. */
01312                 newRight = newLeft;
01313                 while (newRight->right != NULL)
01314                     newRight = newRight->right;
01315                 
01316                 
01317                 if (left != NULL)
01318                     {
01319                     left->right = newLeft;
01320                     newLeft->left = left;
01321                     }
01322                 else
01323                     {
01324                     aliList = newLeft;
01325                     }
01326                 if (right != NULL)
01327                     {
01328                     right->left = newRight;
01329                     newRight->right = right;
01330                     }
01331                 }
01332             }
01333         if ((left = right) == NULL)
01334             break;
01335         right = right->right;
01336         }
01337     }
01338 return aliList;
01339 }
01340 
01341 
01342 static struct ffAli *findWovenTiles(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
01343 {
01344 struct ffAli *bestAli;
01345 int haySize = he - hs;
01346 int needleSize = ne - ns;
01347 static double tileStrinProbMult[] = { 0.0001, 0.0005, 0.0005, 0.5, };
01348                                     /* exact  cDNA        tight    loose */
01349 if (needleSize < 2 || haySize < 2)  /* Be serious man! */
01350     return NULL;
01351 
01352 /* Set up rwVariables - essentially locals except that they don't change over the course
01353  * of the recursion. */
01354 makeFreqTable(hs, haySize, rwFreq);
01355 rwIsCdna = (stringency == ffCdna);
01356 rwCheckGoodEnough = (stringency == ffTight || stringency == ffCdna);
01357 
01358 bestAli = recursiveWeave(ns, ne, hs, he, stringency, tileStrinProbMult[stringency], 1, 0);
01359 
01360 return bestAli;
01361 }
01362 
01363 
01364 static struct ffAli *findBestAli(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
01365 {
01366 static int iniExpGapPen[] = {0 /* (exact) */, 4 /* cDna */, 4 /* tight */, 4 /* loose */};
01367 static int addExpGapPen[] = {0 /* (exact) */, 3 /* cDna */, 3 /* tight */, 3 /* loose */};
01368 static int midTileMinSize[] ={0 /*(exact) */,12 /* cDNA */, 12 /* tight */, 4 /* loose */};
01369 
01370 struct ffAli *bestAli;
01371 int matchSize;
01372 
01373 matchSize = nextPowerOfFour(he-hs)+1;
01374 if (matchSize < midTileMinSize[stringency])
01375     matchSize = midTileMinSize[stringency];
01376 
01377 bestAli = findWovenTiles(ns, ne, hs, he, stringency);
01378 if (bestAli == NULL)
01379     return NULL;
01380 
01381 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01382 bestAli = expandAlis(bestAli,ns,ne,hs,he,iniExpGapPen[stringency], 1);
01383 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01384 bestAli = expandAlis(bestAli,ns,ne,hs,he,addExpGapPen[stringency], 2*matchSize);
01385 bestAli = trimAlis(bestAli);
01386 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01387 bestAli = ffMergeHayOverlaps(bestAli);
01388 bestAli = reconsiderAlignedGaps(bestAli);
01389 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
01390 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01391 if (stringency == ffCdna)
01392     ffSlideIntrons(bestAli);
01393 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
01394 return bestAli;
01395 }
01396 
01397 
01398 static struct ffAli *saveAliToPermanentMem(struct ffAli *volatileAli)
01399 /* Save alignment to memory that doesn't get thrown away. */
01400 {
01401 struct ffAli *leftList = NULL;
01402 struct ffAli *newAli, *ali;
01403 struct ffAli *rightList = NULL;
01404 
01405 /* Allocate memory, copy contents and set left pointer. */
01406 for (ali = volatileAli; ali != NULL; ali = ali->right)
01407     {
01408     newAli = needMem(sizeof(*newAli));
01409     if (newAli == NULL)
01410         {
01411         slFreeList(leftList);
01412         ffAbort();
01413         }
01414     memcpy(newAli, ali, sizeof(*newAli));
01415     newAli->left = leftList;
01416     leftList = newAli;
01417     }
01418 
01419 /* Set right pointer. */
01420 for (ali = leftList; ali != NULL; ali = ali->left)
01421     {
01422     ali->right = rightList;
01423     rightList = ali;
01424     }
01425 return rightList;
01426 }
01427 
01428 struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd,
01429      enum ffStringency stringency)
01430 /* Return an alignment of needle in haystack. */
01431 {
01432 struct ffAli *bestAli;
01433 int status;
01434 
01435 assert(needleStart <= needleEnd);
01436 assert(hayStart <= hayEnd);
01437 
01438 ffMemInit();
01439 dnaUtilOpen();
01440 
01441 
01442 /* Set up error recovery. */
01443 status = setjmp(ffRecover);
01444 if (status == 0)    /* Always true except after long jump. */
01445     {
01446     pushAbortHandler(ffAbort);
01447     bestAli = findBestAli(needleStart, needleEnd, hayStart, hayEnd, stringency);
01448     ffCountGoodEnds(bestAli);
01449     bestAli = saveAliToPermanentMem(bestAli);
01450     }
01451 else    /* They long jumped here because of an error. */
01452     {
01453     bestAli = NULL;
01454     }
01455 popAbortHandler();
01456 ffMemCleanup();
01457 return bestAli;
01458 }
01459 
01460 boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize,
01461     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore)
01462 /* Return TRUE if find an allignment using needle, or reverse complement of 
01463  * needle to search haystack. DNA must be lower case. If pScore is non-NULL returns
01464  * score of alignment. */
01465 {
01466 int fScore, rScore;
01467 struct ffAli *fAli, *rAli;
01468 
01469 /* Get forward alignment and score it. */
01470 fAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
01471 fScore = ffScore(fAli,stringency);
01472 
01473 /* Get reverse alignment and score it. */
01474 reverseComplement(needle, needleSize);
01475 rAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
01476 rScore = ffScore(rAli,stringency);
01477 reverseComplement(needle, needleSize);
01478 
01479 /* If no good alignment on either strand return FALSE. */
01480 if (fAli == NULL && rAli == NULL)
01481     {
01482     *pAli = NULL;
01483     return FALSE;
01484     }
01485 
01486 /* Return TRUE with best alignment.  Free the other one. */
01487 if (fScore > rScore)
01488     {
01489     *pAli = fAli;
01490     *pRcNeedle = FALSE;
01491     if (pScore != NULL)
01492         *pScore = fScore;
01493     ffFreeAli(&rAli);
01494     }   
01495 else
01496     {
01497     *pAli = rAli;
01498     *pRcNeedle = TRUE;
01499     if (pScore != NULL)
01500         *pScore = rScore;
01501     ffFreeAli(&fAli);
01502     }
01503 return TRUE;
01504 }
01505 
01506 boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize,
01507     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle)
01508 /* Return TRUE if find an allignment using needle, or reverse complement of 
01509  * needle to search haystack. DNA must be lower case. */
01510 {
01511 return ffFindAndScore(needle, needleSize, haystack, haySize, stringency, pAli, pRcNeedle, NULL);
01512 }
01513 
01514 boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency,
01515     struct ffAli **pAli, boolean *pRcNeedle)
01516 /* Return TRUE if find an alignment using needle, or reverse complement of 
01517  * needle to search haystack. DNA must be lower case. Needle and haystack
01518  * are zero terminated. */
01519 {
01520 return ffFindEitherStrandN(needle, strlen(needle), haystack, strlen(haystack),
01521 stringency, pAli, pRcNeedle);
01522 }

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2