00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 static char const rcsid[] = "$Id: fuzzyFind.c,v 1.17 2006/05/09 01:05:55 markd Exp $";
00044
00045 #include "common.h"
00046 #include "dnautil.h"
00047 #include "localmem.h"
00048 #include "errabort.h"
00049 #include "fuzzyFind.h"
00050 #include "obscure.h"
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072 void dumpFf(struct ffAli *left, DNA *needle, DNA *hay);
00073
00074
00075 static jmp_buf ffRecover;
00076
00077 static void ffAbort()
00078
00079 {
00080 longjmp(ffRecover, -1);
00081 }
00082
00083 static struct lm *ffMemPool = NULL;
00084
00085 static void ffMemInit()
00086
00087 {
00088 ffMemPool = lmInit(2048);
00089 }
00090
00091 static void ffMemCleanup()
00092
00093 {
00094 lmCleanup(&ffMemPool);
00095 }
00096
00097 static void *ffNeedMem(size_t size)
00098
00099 {
00100 return lmAlloc(ffMemPool, size);
00101 }
00102
00103 static void makeFreqTable(DNA *dna, int dnaSize, double freq[4])
00104 {
00105 int histo[4];
00106 double total;
00107 int i;
00108
00109 dnaBaseHistogram(dna, dnaSize, histo);
00110 total = histo[0] + histo[1] + histo[2] + histo[3];
00111 if (total == 0) total = 1;
00112 for (i=0; i<4; ++i)
00113 freq[i] = (double)histo[i] / total;
00114 }
00115
00116 static double oligoProb(DNA *oligo, int size, double freq[4])
00117 {
00118 double prob = 1.0;
00119 int i;
00120 int baseVal;
00121
00122 for (i=0; i<size; ++i)
00123 {
00124 if ((baseVal = ntVal[(int)oligo[i]]) >= 0)
00125 prob *= freq[baseVal];
00126 }
00127 return prob;
00128 }
00129
00130 static boolean findImprobableOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
00131 DNA **rOligo, int *rOligoLength, double *rOligoProb)
00132
00133
00134 {
00135 int i;
00136 double totalProb = 1.0;
00137 int base;
00138 int startIx = 0;
00139
00140 for (i=0;i<needleLength;++i)
00141 {
00142 if ((base = ntVal[(int)needle[i]]) < 0)
00143 {
00144 totalProb = 1.0;
00145 startIx = i+1;
00146 }
00147 else
00148 {
00149 if ((totalProb *= freq[base]) <= maxProb)
00150 {
00151 *rOligo = needle+startIx;
00152 *rOligoLength = i-startIx+1;
00153 *rOligoProb = totalProb;
00154 return TRUE;
00155 }
00156 }
00157 }
00158 return FALSE;
00159 }
00160
00161
00162 static boolean hasRepeat(DNA *oligo, int oligoLen)
00163
00164 {
00165 int mod;
00166 int i;
00167 DNA b;
00168 boolean gotRepeat;
00169 int repSize;
00170 int maxRep = (oligoLen+1)/2;
00171
00172 b = oligo[0];
00173 gotRepeat = TRUE;
00174 for (i=1; i<oligoLen; ++i)
00175 {
00176 if (oligo[i] != b)
00177 {
00178 gotRepeat = FALSE;
00179 break;
00180 }
00181 }
00182 if (gotRepeat)
00183 return TRUE;
00184
00185 gotRepeat = TRUE;
00186 for (i=2; i<oligoLen; ++i)
00187 {
00188 if (oligo[i&1] != oligo[i])
00189 {
00190 gotRepeat = FALSE;
00191 break;
00192 }
00193 }
00194 if (gotRepeat)
00195 return TRUE;
00196
00197 for (repSize = 3; repSize <= maxRep; ++repSize)
00198 {
00199 mod = 0;
00200 gotRepeat = TRUE;
00201 for (i=repSize; i<oligoLen; ++i)
00202 {
00203 if (oligo[mod] != oligo[i])
00204 {
00205 gotRepeat = FALSE;
00206 break;
00207 }
00208 if (++mod == repSize)
00209 mod = 0;
00210 }
00211 if (gotRepeat)
00212 return TRUE;
00213 }
00214 return gotRepeat;
00215 }
00216
00217
00218
00219 static boolean ffFindGoodOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
00220 DNA **rOligo, int *rOligoLength, double *rOligoProb)
00221
00222
00223 {
00224 int oligoLen;
00225 DNA *oligo;
00226
00227
00228 for (;;)
00229 {
00230 if (!findImprobableOligo(needle, needleLength, maxProb, freq, rOligo, rOligoLength, rOligoProb))
00231 return FALSE;
00232 oligoLen = *rOligoLength;
00233 oligo = *rOligo;
00234 if (hasRepeat(oligo, oligoLen))
00235 {
00236 DNA *newNeedle = oligo+oligoLen;
00237 int newSize = needleLength - (newNeedle-needle);
00238 if (newSize <= 0)
00239 return FALSE;
00240 needle = newNeedle;
00241 needleLength = newSize;
00242 }
00243 else
00244 return TRUE;
00245 }
00246 }
00247
00248 static boolean leftNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he,
00249 int gapPenalty, int maxSkip)
00250
00251
00252 {
00253 int haySize = he - hs;
00254 int needleSize = ne - ns;
00255 int diagSize = haySize + needleSize;
00256 int matchSize;
00257 int i;
00258
00259
00260 if (diagSize > maxSkip)
00261 diagSize = maxSkip;
00262
00263
00264
00265
00266
00267
00268
00269 for (i=1; i<=diagSize; ++i)
00270 {
00271 int hOff = i;
00272 int nOff = 0;
00273 int hDiff = hOff - haySize;
00274 matchSize = gapPenalty + digitsBaseTwo(i);
00275 if (hDiff > 0)
00276 {
00277 nOff += hDiff;
00278 hOff -= hDiff;
00279 }
00280 for (;hOff >= 0; --hOff, ++nOff)
00281 {
00282 int needleLeft = needleSize - nOff;
00283 int hayLeft = haySize - hOff;
00284 if (matchSize > needleLeft) break;
00285 if (matchSize > hayLeft) continue;
00286 if (ne[-nOff-1] == he[-hOff-1] && memcmp(ne-nOff-matchSize, he-hOff-matchSize, matchSize) == 0)
00287 {
00288 ali->nStart = ne - nOff - matchSize;
00289 ali->nEnd = ne - nOff;
00290 ali->hStart = he - hOff - matchSize;
00291 ali->hEnd = he- hOff;
00292 return TRUE;
00293 }
00294 }
00295 }
00296 return FALSE;
00297 }
00298
00299
00300 static boolean rightNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he,
00301 int gapPenalty, int maxSkip)
00302
00303
00304 {
00305 int haySize = he - hs;
00306 int needleSize = ne - ns;
00307 int diagSize = haySize + needleSize;
00308 int matchSize;
00309 int i;
00310
00311
00312 if (diagSize > maxSkip)
00313 diagSize = maxSkip;
00314
00315
00316
00317
00318
00319
00320
00321 for (i=1; i<=diagSize; ++i)
00322 {
00323 int hOff = i;
00324 int nOff = 0;
00325 int hDiff = hOff - haySize;
00326 matchSize = gapPenalty + digitsBaseTwo(i);
00327 if (hDiff > 0)
00328 {
00329 nOff += hDiff;
00330 hOff -= hDiff;
00331 }
00332 for (;hOff >= 0; --hOff, ++nOff)
00333 {
00334 int needleLeft = needleSize - nOff;
00335 int hayLeft = haySize - hOff;
00336 if (matchSize > needleLeft) break;
00337 if (matchSize > hayLeft) continue;
00338 if (ns[nOff] == hs[hOff] && memcmp(ns+nOff, hs+hOff, matchSize) == 0)
00339 {
00340 ali->nStart = ns + nOff;
00341 ali->nEnd = ns + nOff + matchSize;
00342 ali->hStart = hs + hOff;
00343 ali->hEnd = hs + hOff + matchSize;
00344 return TRUE;
00345 }
00346 }
00347 }
00348 return FALSE;
00349 }
00350
00351 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00352 DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip);
00353
00354
00355 static boolean expandLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00356 DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
00357
00358
00359 {
00360 DNA *ns = ali->nStart;
00361 DNA *hs = ali->hStart;
00362 int score;
00363 DNA *oldNs = ns;
00364
00365 for (;;)
00366 {
00367 while (ns > needleStart && hs > hayStart && ns[-1] == hs[-1])
00368 {
00369 --hs;
00370 --ns;
00371 }
00372 if (ns <= needleStart || hs <= hayStart)
00373 {
00374 ali->nStart = ns;
00375 ali->hStart = hs;
00376 return ns != oldNs;
00377 }
00378
00379 {
00380 int windowSize = 5;
00381 int nSize = ns-needleStart;
00382 int hSize = hs-hayStart;
00383 if (windowSize > nSize) windowSize = nSize;
00384 if (windowSize > hSize) windowSize = hSize;
00385 if (windowSize > 0)
00386 score = dnaScoreMatch(ns-windowSize, hs-windowSize, windowSize);
00387 else
00388 score = -1;
00389
00390 if (score >= windowSize-2)
00391 {
00392 hs -= windowSize;
00393 ns -= windowSize;
00394 }
00395 else if (--numSkips >= 0)
00396 {
00397 struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
00398 ali->nStart = ns;
00399 ali->hStart = hs;
00400 if (ns - needleStart < 3 ||
00401 !leftNextMatch(newAli, needleStart, ns, hayStart, hs, gapPenalty, maxSkip))
00402 {
00403 return ns != oldNs;
00404 }
00405 newAli->right = ali;
00406 newAli->left = ali->left;
00407 if (ali->left)
00408 ali->left->right = newAli;
00409 ali->left = newAli;
00410 ali = newAli;
00411 expandRight(ali, needleStart, ns, hayStart, hs, 0, gapPenalty, maxSkip);
00412 ns = ali->nStart;
00413 hs = ali->hStart;
00414 }
00415 else
00416 {
00417 ali->nStart = ns;
00418 ali->hStart = hs;
00419 return ns != oldNs;
00420 }
00421 }
00422 }
00423 }
00424
00425 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00426 DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
00427
00428 {
00429 int score;
00430 DNA *ne = ali->nEnd;
00431 DNA *he = ali->hEnd;
00432 DNA *oldNe = ne;
00433
00434 for (;;)
00435 {
00436
00437 while (ne < needleEnd && he < hayEnd && *ne == *he)
00438 {
00439 ++he;
00440 ++ne;
00441 }
00442
00443
00444 if (ne >= needleEnd || he >= hayEnd)
00445 {
00446 ali->nEnd = ne;
00447 ali->hEnd = he;
00448 return oldNe != ne;
00449 }
00450
00451
00452 {
00453 int windowSize = 5;
00454 int nSize = needleEnd-ne;
00455 int hSize = hayEnd - he;
00456 if (windowSize > nSize) windowSize = nSize;
00457 if (windowSize > hSize) windowSize = hSize;
00458 if (windowSize > 0)
00459 score = dnaScoreMatch(ne, he, windowSize);
00460 else
00461 score = -1;
00462
00463 if (score >= windowSize-2)
00464 {
00465 he += windowSize;
00466 ne += windowSize;
00467 }
00468 else if (--numSkips >= 0)
00469 {
00470 struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
00471 ali->nEnd = ne;
00472 ali->hEnd = he;
00473 if (needleEnd - ne < 3 ||
00474 !rightNextMatch(newAli, ne, needleEnd, he, hayEnd, gapPenalty, maxSkip))
00475 {
00476 return oldNe != ne;
00477 }
00478 newAli->left = ali;
00479 newAli->right = ali->right;
00480 if (ali->right)
00481 ali->right->left = newAli;
00482 ali->right = newAli;
00483 ali = newAli;
00484 expandLeft(ali, ne, needleEnd, he, hayEnd, 0, gapPenalty, maxSkip);
00485 ne = ali->nEnd;
00486 he = ali->hEnd;
00487 }
00488 else
00489 {
00490 ali->nEnd = ne;
00491 ali->hEnd = he;
00492 return oldNe != ne;
00493 }
00494 }
00495 }
00496 }
00497
00498 static boolean exactFind(DNA *needle, int nSize, DNA *hay, int hSize, int *hOffset)
00499
00500 {
00501 DNA firstC = needle[0];
00502 DNA *restOfNeedle = needle+1;
00503 int i;
00504 int endIx = hSize - nSize;
00505 int restSize = nSize-1;
00506
00507 for (i = 0; i<=endIx; ++i)
00508 {
00509 if (firstC == hay[i])
00510 {
00511 if (memcmp(restOfNeedle, hay+i+1, restSize) == 0)
00512 {
00513 *hOffset = i;
00514 return TRUE;
00515 }
00516 }
00517 }
00518 *hOffset = -1;
00519 return FALSE;
00520 }
00521
00522 #ifdef UNUSED
00523 static int countAlis(struct ffAli *ali)
00524
00525 {
00526 int count = 0;
00527 if (ali == NULL)
00528 return 0;
00529 while (ali->left)
00530 ali = ali->left;
00531 while (ali)
00532 {
00533 count += 1;
00534 ali = ali->right;
00535 }
00536 return count;
00537 }
00538 #endif
00539
00540
00541 static struct ffAli *reconsiderAlignedGaps(struct ffAli *ali)
00542
00543
00544
00545 {
00546 struct ffAli *a = NULL;
00547 struct ffAli *left = NULL;
00548
00549 if (ali == NULL)
00550 return NULL;
00551 a = ali;
00552 for (;;)
00553 {
00554 int gapSize;
00555
00556
00557 left = a;
00558 a = a->right;
00559 if (a == NULL)
00560 break;
00561 gapSize = a->nStart - left->nEnd;
00562 if (gapSize == a->hStart - left->hEnd)
00563 {
00564 int gapScore;
00565 int matchScore;
00566 gapScore = -ffCdnaGapPenalty(left, a);
00567 matchScore = dnaScoreMatch(left->nEnd, left->hEnd, gapSize);
00568 if (matchScore > gapScore)
00569 {
00570
00571
00572 a->hStart = left->hEnd = left->hStart;
00573 a->nStart = left->nEnd = left->nStart;
00574 }
00575 }
00576 }
00577 return ali;
00578 }
00579
00580 static struct ffAli *trimAlis(struct ffAli *aliList)
00581
00582 {
00583 struct ffAli *ali;
00584 int trimCount = 0;
00585 for (ali = aliList; ali != NULL; ali = ali->right)
00586 {
00587 while (ali->nStart[0] != ali->hStart[0] && ali->nStart < ali->nEnd)
00588 {
00589 ali->nStart += 1;
00590 ali->hStart += 1;
00591 ++trimCount;
00592 }
00593 while (ali->nEnd[-1] != ali->hEnd[-1] && ali->nEnd > ali->nStart)
00594 {
00595 ali->nEnd -= 1;
00596 ali->hEnd -= 1;
00597 ++trimCount;
00598 }
00599 }
00600 return aliList;
00601 }
00602
00603 static int extendThroughN;
00604
00605 void setFfExtendThroughN(boolean val)
00606
00607 {
00608 extendThroughN = val;
00609 }
00610
00611 boolean expandThroughNRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00612 DNA *hayStart, DNA *hayEnd)
00613
00614 {
00615 DNA *nEnd = ali->nEnd;
00616 DNA *hEnd = ali->hEnd;
00617 DNA n,h;
00618 boolean expanded = FALSE;
00619 while (nEnd < needleEnd && hEnd < hayEnd)
00620 {
00621 n = *nEnd;
00622 h = *hEnd;
00623 if ((n == h) ||
00624 (n == 'n' &&
00625 (extendThroughN || nEnd + 3 >= needleEnd || nEnd[1] != 'n' || nEnd[2] != 'n' || nEnd[3] != 'n')) ||
00626 (h == 'n' &&
00627 (extendThroughN || hEnd + 3 >= hayEnd || hEnd[1] != 'n' || hEnd[2] != 'n' || hEnd[3] != 'n')))
00628 {
00629 nEnd += 1;
00630 hEnd += 1;
00631 expanded = TRUE;
00632 }
00633 else
00634 break;
00635 }
00636 ali->nEnd = nEnd;
00637 ali->hEnd = hEnd;
00638 return expanded;
00639 }
00640
00641 boolean expandThroughNLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
00642 DNA *hayStart, DNA *hayEnd)
00643
00644 {
00645 DNA *nStart = ali->nStart-1;
00646 DNA *hStart = ali->hStart-1;
00647 DNA n,h;
00648 boolean expanded = FALSE;
00649 while (nStart >= needleStart && hStart >= hayStart)
00650 {
00651 n = *nStart;
00652 h = *hStart;
00653 if ((n == h) ||
00654 (n == 'n' && (extendThroughN || nStart - 3 < needleStart || nStart[-1] != 'n' || nStart[-2] != 'n' || nStart[-3] != 'n')) ||
00655 (h == 'n' && (extendThroughN || hStart - 3 < hayStart || hStart[-1] != 'n' || hStart[-2] != 'n' || hStart[-3] != 'n')))
00656 {
00657 nStart -= 1;
00658 hStart -= 1;
00659 expanded = TRUE;
00660 }
00661 else
00662 break;
00663 }
00664 ali->nStart = nStart + 1;
00665 ali->hStart = hStart + 1;
00666 return expanded;
00667 }
00668
00669 static struct ffAli *expandAlis(struct ffAli *ali, DNA *nStart, DNA *nEnd, DNA *hStart, DNA *hEnd,
00670 int gapPenalty, int maxSkip)
00671
00672 {
00673 struct ffAli *a, *left, *right;
00674 DNA *ns, *ne, *hs, *he;
00675 boolean expanded = TRUE;
00676
00677 while (expanded)
00678 {
00679 expanded = FALSE;
00680
00681
00682 for (a = ali; a != NULL; a = a->right)
00683 {
00684 if ((left = a->left) != NULL)
00685 {
00686 ns = left->nEnd;
00687 hs = left->hEnd;
00688 }
00689 else
00690 {
00691 ns = nStart;
00692 hs = hStart;
00693 }
00694 if ((right = a->right) != NULL)
00695 {
00696 ne = right->nStart;
00697 he = right->hStart;
00698 }
00699 else
00700 {
00701 ne = nEnd;
00702 he = hEnd;
00703 }
00704 expanded |= expandThroughNLeft(a, ns, ne, hs, he);
00705 expanded |= expandThroughNRight(a, ns, ne, hs, he);
00706 }
00707
00708 for (a = ali; a != NULL; a = a->right)
00709 {
00710 if ((left = a->left) != NULL)
00711 {
00712 ns = left->nEnd;
00713 hs = left->hEnd;
00714 }
00715 else
00716 {
00717 ns = nStart;
00718 hs = hStart;
00719 }
00720 if ((right = a->right) != NULL)
00721 {
00722 ne = right->nStart;
00723 he = right->hStart;
00724 }
00725 else
00726 {
00727 ne = nEnd;
00728 he = hEnd;
00729 }
00730 expanded |= expandLeft(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
00731 expanded |= expandRight(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
00732 }
00733
00734 for (a = ali; a != NULL; a = a->right)
00735 {
00736 if ((left = a->left) != NULL)
00737 {
00738 ns = left->nEnd;
00739 hs = left->hEnd;
00740 }
00741 else
00742 {
00743 ns = nStart;
00744 hs = hStart;
00745 }
00746 if ((right = a->right) != NULL)
00747 {
00748 ne = right->nStart;
00749 he = right->hStart;
00750 }
00751 else
00752 {
00753 ne = nEnd;
00754 he = hEnd;
00755 }
00756 expanded |= expandLeft(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
00757 expanded |= expandRight(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
00758 }
00759 while (ali->left)
00760 ali = ali->left;
00761 }
00762 return ali;
00763 }
00764
00765
00766
00767 DNA *matchInMem(DNA *ns, DNA *ne, DNA *hs, DNA *he)
00768 {
00769 int nLen = ne - ns;
00770 DNA c = *ns++;
00771 he -= nLen;
00772 nLen -= 1;
00773 while (hs < he)
00774 {
00775 if (*hs++ == c && memcmp(ns, hs, nLen) == 0)
00776 {
00777 return hs-1;
00778 }
00779 }
00780 return NULL;
00781 }
00782
00783 struct ffAli *findAliBetween(DNA *tile, int tileSize, DNA *ns, DNA *ne, DNA *hs, DNA *he)
00784 {
00785 DNA *match;
00786 DNA *tileEnd = tile + tileSize;
00787 int tries = 0;
00788 for (;;)
00789 {
00790 if ((match = matchInMem(tile, tileEnd, hs, he)) == NULL)
00791 return NULL;
00792 if (matchInMem(tile, tileEnd, match+1, he) == NULL)
00793 {
00794
00795 struct ffAli *ali = ffNeedMem(sizeof(*ali));
00796 ali->nStart = tile;
00797 ali->nEnd = tileEnd;
00798 ali->hStart = match;
00799 ali->hEnd = match + (tileEnd-tile);
00800 ffExpandExactLeft(ali, ns, hs);
00801 ffExpandExactRight(ali, ne, he);
00802 return ali;
00803 }
00804
00805
00806
00807
00808
00809 if (tile <= ns)
00810 {
00811 if (tileEnd >= ne)
00812 return NULL;
00813 ++tileEnd;
00814 }
00815 else if (tile >= tileEnd)
00816 {
00817 --tile;
00818 }
00819 else if (tries & 1)
00820 {
00821 ++tileEnd;
00822 }
00823 else
00824 {
00825 --tile;
00826 }
00827 ++tries;
00828 }
00829 }
00830
00831 double evalExactAli(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, int numTiles,
00832 double freq[4])
00833 {
00834 int haySize = he-hs;
00835 double prob = 1.0;
00836 double allPossibles = haySize*numTiles;
00837
00838 for (;ali != NULL;ali=ali->right)
00839 {
00840 double p = oligoProb(ali->nStart, ali->nEnd - ali->nStart, freq) * allPossibles;
00841 if (p < 1.0)
00842 prob *= p;
00843 }
00844 return prob;
00845 }
00846
00847 void ffUnlink(struct ffAli **pList, struct ffAli *el)
00848
00849
00850 {
00851 struct ffAli *list = *pList;
00852 struct ffAli *right = el->right;
00853 struct ffAli *left = el->left;
00854
00855 if (el == list)
00856 *pList = right;
00857 if (right != NULL)
00858 right->left = left;
00859 if (left != NULL)
00860 left->right = right;
00861 el->left = el->right = NULL;
00862 }
00863
00864 void unlinkAli(struct ffAli **pList, struct ffAli *ali)
00865
00866 {
00867 ffUnlink(pList, ali);
00868 }
00869
00870 struct protoGene
00871 {
00872 struct protoGene *left;
00873 struct protoGene *right;
00874 struct ffAli *hits;
00875 DNA *hStart, *hEnd, *nStart, *nEnd;
00876 int score;
00877 };
00878
00879 int cmpProtoSize(const void *va, const void *vb)
00880
00881 {
00882 const struct protoGene *a = *((struct protoGene **)va);
00883 const struct protoGene *b = *((struct protoGene **)vb);
00884 int aSize, bSize;
00885 aSize = a->nEnd - a->nStart;
00886 bSize = b->nEnd - b->nStart;
00887 return (bSize - aSize);
00888 }
00889
00890 int cmpProtoNeedle(const void *va, const void *vb)
00891
00892 {
00893 const struct protoGene *a = *((struct protoGene **)va);
00894 const struct protoGene *b = *((struct protoGene **)vb);
00895 return (a->nStart - b->nStart);
00896 }
00897
00898 int cmpProtoScore(const void *va, const void *vb)
00899
00900 {
00901 const struct protoGene *a = *((struct protoGene **)va);
00902 const struct protoGene *b = *((struct protoGene **)vb);
00903 return (b->score - a->score);
00904 }
00905
00906 boolean canAdd(struct protoGene *a, struct protoGene *b)
00907
00908
00909 {
00910 struct ffAli *pa;
00911 DNA *bnEnd = b->nEnd;
00912 DNA *bnStart = b->nStart;
00913 DNA *bhEnd = b->hEnd;
00914 DNA *bhStart = b->hStart;
00915 int bSize = bnEnd - bnStart;
00916 int maxOverlap;
00917 for (pa = a->hits; pa != NULL; pa = pa->right)
00918 {
00919 int aSize = pa->nEnd - pa->nStart;
00920 DNA *start = (bnStart > pa->nStart ? bnStart : pa->nStart);
00921 DNA *end = (bnEnd < pa->nEnd ? bnEnd : pa->nEnd);
00922 maxOverlap = (aSize < bSize ? aSize : bSize) / 4;
00923 if (maxOverlap < 2) maxOverlap = 2;
00924 if (end - start >= maxOverlap)
00925 return FALSE;
00926 start = (bhStart > pa->hStart ? bhStart : pa->hStart);
00927 end = (bhEnd < pa->hEnd ? bhEnd : pa->hEnd);
00928 if (end - start >= maxOverlap)
00929 return FALSE;
00930 }
00931 return TRUE;
00932 }
00933
00934 boolean bestMerger(struct protoGene *list, enum ffStringency stringency,
00935 DNA *ns, DNA *hs, struct protoGene **retA, struct protoGene **retB)
00936
00937 {
00938 int bestScore = -0x7fffffff;
00939 int score;
00940 struct protoGene *bestA = NULL, *bestB = NULL;
00941 struct protoGene *a, *b;
00942 boolean isCdna = (stringency == ffCdna);
00943
00944 if (list == NULL)
00945 return FALSE;
00946
00947 for (a = list; a != NULL; a = a->right)
00948 {
00949 for (b = a->right; b != NULL; b = b->right)
00950 {
00951 if (canAdd(a,b))
00952 {
00953 int hGap = b->hStart - a->hEnd;
00954 int nGap = b->nStart - a->nEnd;
00955 if (hGap < 0)
00956 {
00957 hGap = -8*hGap;
00958 if (!isCdna || hGap < 32)
00959 hGap = (hGap*hGap);
00960 }
00961 if (nGap < 0)
00962 nGap = -8*nGap;
00963 score = -hGap - nGap*nGap;
00964 if (score > bestScore)
00965 {
00966 bestA = a;
00967 bestB = b;
00968 bestScore = score;
00969 }
00970 }
00971 }
00972 }
00973 *retA = bestA;
00974 *retB = bestB;
00975 return bestA != NULL;
00976 }
00977
00978 void mergeProtoGenes(struct protoGene **pList, struct protoGene *a, struct protoGene *b)
00979
00980 {
00981 ffUnlink((struct ffAli**)pList, (struct ffAli *)b);
00982 ffCat(&a->hits, &b->hits);
00983 if (a->hStart > b->hStart)
00984 a->hStart = b->hStart;
00985 if (a->nStart > b->nStart)
00986 a->nStart = b->nStart;
00987 if (a->hEnd < b->hEnd)
00988 a->hEnd = b->hEnd;
00989 if (a->nEnd < b->nEnd)
00990 a->nEnd = b->nEnd;
00991 }
00992
00993 void lumpProtoGenes(struct protoGene **pList,
00994 DNA *ns, DNA *hs, enum ffStringency stringency)
00995
00996 {
00997 struct protoGene *a, *b;
00998
00999 while (bestMerger(*pList, stringency, ns, hs, &a, &b))
01000 {
01001 mergeProtoGenes(pList, a, b);
01002 }
01003 }
01004
01005 struct protoGene *lumpHits(struct ffAli **pHitList, DNA *ns, DNA *hs)
01006
01007
01008
01009 {
01010 struct protoGene proto;
01011 struct protoGene *retProto;
01012 struct ffAli *ali, *nextAli;
01013 int dif, lastDif;
01014 int lumpCount = 1;
01015
01016 if ((ali = *pHitList) == NULL)
01017 return NULL;
01018
01019
01020 nextAli = ali->right;
01021 proto.left = proto.right = NULL;
01022 unlinkAli(pHitList, ali);
01023 proto.hits = ali;
01024 proto.hStart = ali->hStart;
01025 proto.hEnd = ali->hEnd;
01026 proto.nStart = ali->nStart;
01027 proto.nEnd = ali->nEnd;
01028 lastDif = ali->hStart - ali->nStart;
01029
01030
01031
01032 while ((ali = nextAli) != NULL)
01033 {
01034 nextAli = ali->right;
01035 dif = ali->hStart - ali->nStart;
01036 if (lastDif - 2 <= dif && dif <= lastDif + 2)
01037 {
01038 lastDif = dif;
01039 unlinkAli(pHitList, ali);
01040 ali->left = proto.hits;
01041 proto.hits = ali;
01042 proto.hEnd = ali->hEnd;
01043 proto.nEnd = ali->nEnd;
01044 ++lumpCount;
01045 }
01046 }
01047 proto.hits = ffMakeRightLinks(proto.hits);
01048 retProto = ffNeedMem(sizeof(*retProto));
01049 memcpy(retProto, &proto, sizeof(*retProto));
01050 return retProto;
01051 }
01052
01053 #ifdef OLD
01054 void removeThrowbackHits(struct protoGene *proto)
01055
01056 {
01057 struct ffAli *left, *right;
01058 boolean gotThrowback = FALSE;
01059
01060 right = proto->hits;
01061 for (;;)
01062 {
01063 left = right;
01064 right = right->right;
01065 if (right == NULL)
01066 break;
01067 if (left->hStart > right->hStart)
01068 {
01069 right->hStart = right->hEnd = left->hEnd;
01070 right->nStart = right->nEnd = left->nEnd;
01071 gotThrowback = TRUE;
01072 }
01073 }
01074 if (gotThrowback)
01075 proto->hits = ffRemoveEmptyAlis(proto->hits, FALSE);
01076 }
01077 #endif
01078
01079
01080 void thinProtoList(struct protoGene **pList, int maxToTake)
01081
01082
01083 {
01084 struct protoGene *newList = NULL, *pg, *next;
01085 int i;
01086
01087 slSort(pList, cmpProtoScore);
01088 for (pg=*pList, i=0; pg != NULL && i < maxToTake; pg = next, ++i)
01089 {
01090 next = pg->left;
01091 pg->left = newList;
01092 newList = pg;
01093 }
01094 *pList = newList;
01095 }
01096
01097 static struct ffAli *weaveAli(struct ffAli *hitList,
01098 DNA *ns, DNA *ne, DNA *hs, DNA *he, double freq[4], int *rBestVal,
01099 enum ffStringency stringency)
01100
01101 {
01102 struct ffAli *ali, *lastAli;
01103 struct protoGene *proto, *protoList = NULL;
01104 int protoCount = 0;
01105
01106
01107 ffAliSort(&hitList, ffCmpHitsHayFirst);
01108 ali = hitList;
01109 for (;;)
01110 {
01111 lastAli = ali;
01112 if ((ali = ali->right) == NULL)
01113 break;
01114 if (ali->hStart == lastAli->hStart && ali->hEnd == lastAli->hEnd
01115 && ali->nStart == lastAli->nStart && ali->nEnd == lastAli->nEnd)
01116 {
01117 unlinkAli(&hitList, lastAli);
01118 }
01119 }
01120
01121
01122
01123 while ((proto = lumpHits(&hitList, ns, hs)) != NULL)
01124 {
01125 proto->left = protoList;
01126 protoList = proto;
01127 ++protoCount;
01128 }
01129 if (protoCount > 200)
01130 {
01131 thinProtoList(&protoList, 200);
01132 }
01133 protoList = (struct protoGene *)ffMakeRightLinks((struct ffAli*)protoList);
01134
01135
01136
01137
01138 ffAliSort((struct ffAli**)(void*)&protoList, cmpProtoNeedle);
01139 lumpProtoGenes(&protoList, ns, hs, stringency );
01140 for (proto = protoList; proto != NULL; proto = proto->right)
01141 {
01142 ffAliSort((struct ffAli**)&proto->hits, ffCmpHitsNeedleFirst);
01143
01144 proto->score = ffScore(proto->hits,stringency);
01145 }
01146
01147
01148 ffAliSort((struct ffAli **)(void*)&protoList, cmpProtoScore);
01149 *rBestVal = protoList->score;
01150 return protoList->hits;
01151 }
01152
01153
01154
01155 static double rwFreq[4];
01156 static boolean rwIsCdna;
01157 static boolean rwCheckGoodEnough;
01158
01159 static struct ffAli *rwFindTilesBetween(DNA *ns, DNA *ne, DNA *hs, DNA *he,
01160 enum ffStringency stringency, double probMax)
01161
01162
01163 {
01164 int searchOffset;
01165 int endTileOffset;
01166 int haySize = he - hs;
01167 int needleSize = ne - ns;
01168 int numTiles = 0;
01169 int bestWeaveVal;
01170 int tileIx;
01171 struct ffAli *hitList = NULL;
01172 struct ffAli *bestAli;
01173 struct ffAli *ali;
01174 int possibleTiles;
01175 double tileProbOne;
01176
01177 possibleTiles = (haySize - nextPowerOfFour(needleSize));
01178 if (possibleTiles < 1) possibleTiles = 1;
01179 tileProbOne = probMax/possibleTiles;
01180
01181 searchOffset = 0;
01182 endTileOffset = 0;
01183 tileIx = 0;
01184 for (;;)
01185 {
01186 DNA *tile;
01187 int tileSize;
01188 double tileProb;
01189 DNA *h = hs;
01190
01191 searchOffset = endTileOffset;
01192 if (!ffFindGoodOligo(ns+searchOffset, needleSize-searchOffset, tileProbOne,
01193 rwFreq, &tile, &tileSize, &tileProb))
01194 {
01195 break;
01196 }
01197
01198 for (;;)
01199 {
01200 if ((h = matchInMem(tile, tile+tileSize, h, he)) == NULL)
01201 break;
01202 ali = ffNeedMem(sizeof(*ali));
01203 ali->hStart = h;
01204 ali->hEnd = ali->hStart + tileSize;
01205 ali->nStart = tile;
01206 ali->nEnd = tile + tileSize;
01207 ali->left = hitList;
01208 hitList = ali;
01209 h += tileSize;
01210 }
01211 endTileOffset = tile + tileSize - ns;
01212 ++numTiles;
01213 }
01214 if (hitList == NULL)
01215 return NULL;
01216
01217 hitList = ffMakeRightLinks(hitList);
01218 for (ali = hitList; ali != NULL; ali = ali->right)
01219 {
01220 ffExpandExactLeft(ali, ns, hs);
01221 ffExpandExactRight(ali, ne, he);
01222 }
01223 bestAli = weaveAli(hitList, ns, ne, hs, he, rwFreq, &bestWeaveVal, stringency);
01224 if (rwCheckGoodEnough)
01225 {
01226 double prob;
01227 prob = evalExactAli(bestAli, ns, ne, hs, he, numTiles, rwFreq);
01228 if (prob > 0.1)
01229 return NULL;
01230 rwCheckGoodEnough = FALSE;
01231 }
01232
01233 return bestAli;
01234 }
01235
01236 static struct ffAli *exactAli(DNA *ns, DNA *ne, DNA *hs, DNA *he)
01237
01238 {
01239 int exactOffset;
01240 if (exactFind(ns, ne-ns, hs, he-hs, &exactOffset))
01241 {
01242 struct ffAli *ali = ffNeedMem(sizeof(*ali));
01243 ali->nStart = ns;
01244 ali->nEnd = ne;
01245 ali->hStart = hs + exactOffset;
01246 ali->hEnd = ali->hStart + (ne - ns);
01247 if (ali->hEnd > he)
01248 ali->hEnd = he;
01249 return ali;
01250 }
01251 else
01252 return NULL;
01253 }
01254
01255 static struct ffAli *recursiveWeave(DNA *ns, DNA *ne, DNA *hs, DNA *he,
01256 enum ffStringency stringency, double probMax, int level, int orientation)
01257
01258
01259 {
01260 struct ffAli *left = NULL, *right = NULL, *aliList;
01261
01262 if ((left = exactAli(ns, ne, hs, he)) != NULL)
01263 return left;
01264 if (stringency == ffExact)
01265 return NULL;
01266
01267 aliList = rwFindTilesBetween(ns, ne, hs, he, stringency, probMax);
01268 if (aliList != NULL)
01269 {
01270 DNA *lne, *rns, *lhe, *rhs;
01271 int ndif, hdif;
01272 right = aliList;
01273 if (orientation == 0)
01274 orientation = ffIntronOrientation(aliList);
01275 for (;;)
01276 {
01277
01278 if (left != NULL)
01279 {
01280 lne = left->nEnd;
01281 lhe = left->hEnd;
01282 }
01283 else
01284 {
01285 lne = ns;
01286 lhe = hs;
01287 }
01288 if (right != NULL)
01289 {
01290 rns = right->nStart;
01291 rhs = right->hStart;
01292 }
01293 else
01294 {
01295 rns = ne;
01296 rhs = he;
01297 }
01298 ndif = rns-lne;
01299 hdif = rhs-lhe;
01300
01301 if (ndif >= 5 && hdif >= 5)
01302 {
01303 struct ffAli *newLeft = NULL, *newRight;
01304
01305 newLeft = recursiveWeave(lne, rns, lhe, rhs, stringency, probMax*2, level+1,
01306 orientation);
01307 if (newLeft != NULL)
01308 {
01309
01310
01311
01312 newRight = newLeft;
01313 while (newRight->right != NULL)
01314 newRight = newRight->right;
01315
01316
01317 if (left != NULL)
01318 {
01319 left->right = newLeft;
01320 newLeft->left = left;
01321 }
01322 else
01323 {
01324 aliList = newLeft;
01325 }
01326 if (right != NULL)
01327 {
01328 right->left = newRight;
01329 newRight->right = right;
01330 }
01331 }
01332 }
01333 if ((left = right) == NULL)
01334 break;
01335 right = right->right;
01336 }
01337 }
01338 return aliList;
01339 }
01340
01341
01342 static struct ffAli *findWovenTiles(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
01343 {
01344 struct ffAli *bestAli;
01345 int haySize = he - hs;
01346 int needleSize = ne - ns;
01347 static double tileStrinProbMult[] = { 0.0001, 0.0005, 0.0005, 0.5, };
01348
01349 if (needleSize < 2 || haySize < 2)
01350 return NULL;
01351
01352
01353
01354 makeFreqTable(hs, haySize, rwFreq);
01355 rwIsCdna = (stringency == ffCdna);
01356 rwCheckGoodEnough = (stringency == ffTight || stringency == ffCdna);
01357
01358 bestAli = recursiveWeave(ns, ne, hs, he, stringency, tileStrinProbMult[stringency], 1, 0);
01359
01360 return bestAli;
01361 }
01362
01363
01364 static struct ffAli *findBestAli(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
01365 {
01366 static int iniExpGapPen[] = {0 , 4 , 4 , 4 };
01367 static int addExpGapPen[] = {0 , 3 , 3 , 3 };
01368 static int midTileMinSize[] ={0 ,12 , 12 , 4 };
01369
01370 struct ffAli *bestAli;
01371 int matchSize;
01372
01373 matchSize = nextPowerOfFour(he-hs)+1;
01374 if (matchSize < midTileMinSize[stringency])
01375 matchSize = midTileMinSize[stringency];
01376
01377 bestAli = findWovenTiles(ns, ne, hs, he, stringency);
01378 if (bestAli == NULL)
01379 return NULL;
01380
01381 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01382 bestAli = expandAlis(bestAli,ns,ne,hs,he,iniExpGapPen[stringency], 1);
01383 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01384 bestAli = expandAlis(bestAli,ns,ne,hs,he,addExpGapPen[stringency], 2*matchSize);
01385 bestAli = trimAlis(bestAli);
01386 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01387 bestAli = ffMergeHayOverlaps(bestAli);
01388 bestAli = reconsiderAlignedGaps(bestAli);
01389 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
01390 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
01391 if (stringency == ffCdna)
01392 ffSlideIntrons(bestAli);
01393 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
01394 return bestAli;
01395 }
01396
01397
01398 static struct ffAli *saveAliToPermanentMem(struct ffAli *volatileAli)
01399
01400 {
01401 struct ffAli *leftList = NULL;
01402 struct ffAli *newAli, *ali;
01403 struct ffAli *rightList = NULL;
01404
01405
01406 for (ali = volatileAli; ali != NULL; ali = ali->right)
01407 {
01408 newAli = needMem(sizeof(*newAli));
01409 if (newAli == NULL)
01410 {
01411 slFreeList(leftList);
01412 ffAbort();
01413 }
01414 memcpy(newAli, ali, sizeof(*newAli));
01415 newAli->left = leftList;
01416 leftList = newAli;
01417 }
01418
01419
01420 for (ali = leftList; ali != NULL; ali = ali->left)
01421 {
01422 ali->right = rightList;
01423 rightList = ali;
01424 }
01425 return rightList;
01426 }
01427
01428 struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd,
01429 enum ffStringency stringency)
01430
01431 {
01432 struct ffAli *bestAli;
01433 int status;
01434
01435 assert(needleStart <= needleEnd);
01436 assert(hayStart <= hayEnd);
01437
01438 ffMemInit();
01439 dnaUtilOpen();
01440
01441
01442
01443 status = setjmp(ffRecover);
01444 if (status == 0)
01445 {
01446 pushAbortHandler(ffAbort);
01447 bestAli = findBestAli(needleStart, needleEnd, hayStart, hayEnd, stringency);
01448 ffCountGoodEnds(bestAli);
01449 bestAli = saveAliToPermanentMem(bestAli);
01450 }
01451 else
01452 {
01453 bestAli = NULL;
01454 }
01455 popAbortHandler();
01456 ffMemCleanup();
01457 return bestAli;
01458 }
01459
01460 boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize,
01461 enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore)
01462
01463
01464
01465 {
01466 int fScore, rScore;
01467 struct ffAli *fAli, *rAli;
01468
01469
01470 fAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
01471 fScore = ffScore(fAli,stringency);
01472
01473
01474 reverseComplement(needle, needleSize);
01475 rAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
01476 rScore = ffScore(rAli,stringency);
01477 reverseComplement(needle, needleSize);
01478
01479
01480 if (fAli == NULL && rAli == NULL)
01481 {
01482 *pAli = NULL;
01483 return FALSE;
01484 }
01485
01486
01487 if (fScore > rScore)
01488 {
01489 *pAli = fAli;
01490 *pRcNeedle = FALSE;
01491 if (pScore != NULL)
01492 *pScore = fScore;
01493 ffFreeAli(&rAli);
01494 }
01495 else
01496 {
01497 *pAli = rAli;
01498 *pRcNeedle = TRUE;
01499 if (pScore != NULL)
01500 *pScore = rScore;
01501 ffFreeAli(&fAli);
01502 }
01503 return TRUE;
01504 }
01505
01506 boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize,
01507 enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle)
01508
01509
01510 {
01511 return ffFindAndScore(needle, needleSize, haystack, haySize, stringency, pAli, pRcNeedle, NULL);
01512 }
01513
01514 boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency,
01515 struct ffAli **pAli, boolean *pRcNeedle)
01516
01517
01518
01519 {
01520 return ffFindEitherStrandN(needle, strlen(needle), haystack, strlen(haystack),
01521 stringency, pAli, pRcNeedle);
01522 }