00001
00002
00003
00004 #include "common.h"
00005 #include "dnaseq.h"
00006 #include "localmem.h"
00007 #include "memalloc.h"
00008 #include "bits.h"
00009 #include "genoFind.h"
00010 #include "fuzzyFind.h"
00011 #include "supStitch.h"
00012 #include "bandExt.h"
00013 #include "gfInternal.h"
00014
00015 static char const rcsid[] = "$Id: ffSeedExtend.c,v 1.36 2006/06/22 16:24:43 kent Exp $";
00016
00017 static void extendExactRight(int qMax, int tMax, char **pEndQ, char **pEndT)
00018
00019 {
00020 int last = min(qMax, tMax);
00021 int i;
00022 char *q = *pEndQ, *t = *pEndT;
00023
00024 for (i=0; i<last; ++i)
00025 {
00026 if (*q != *t)
00027 break;
00028 q += 1;
00029 t += 1;
00030 }
00031 *pEndQ = q;
00032 *pEndT = t;
00033 }
00034
00035 static void extendExactLeft(int qMax, int tMax, char **pStartQ, char **pStartT)
00036
00037 {
00038 int last = min(qMax, tMax);
00039 int i;
00040 char *q = *pStartQ - 1, *t = *pStartT - 1;
00041
00042 for (i=0; i<last; ++i)
00043 {
00044 if (*q != *t)
00045 break;
00046 q -= 1;
00047 t -= 1;
00048 }
00049 *pStartQ = q + 1;
00050 *pStartT = t + 1;
00051 }
00052
00053 static void extendGaplessRight(int qMax, int tMax, int maxDrop, char **pEndQ, char **pEndT)
00054
00055
00056 {
00057 int last = min(qMax, tMax);
00058 int i;
00059 char *q = *pEndQ, *t = *pEndT;
00060 int score = 0, bestScore = -1, bestPos = -1;
00061
00062 for (i=0; i<last; ++i)
00063 {
00064 if (q[i] == t[i])
00065 {
00066 ++score;
00067 if (score > bestScore)
00068 {
00069 bestScore = score;
00070 bestPos = i;
00071 }
00072 }
00073 else
00074 {
00075 score -= 3;
00076 if (bestScore - score >= maxDrop)
00077 break;
00078 }
00079 }
00080 ++bestPos;
00081 *pEndQ = q+bestPos;
00082 *pEndT = t+bestPos;
00083 }
00084
00085 static void extendGaplessLeft(int qMax, int tMax, int maxDrop, char **pStartQ, char **pStartT)
00086
00087
00088 {
00089 int score = 0, bestScore = -1, bestPos = 0;
00090 int last = -min(qMax, tMax);
00091 int i;
00092 char *q = *pStartQ, *t = *pStartT;
00093
00094 for (i=-1; i>=last; --i)
00095 {
00096 if (q[i] == t[i])
00097 {
00098 ++score;
00099 if (score > bestScore)
00100 {
00101 bestScore = score;
00102 bestPos = i;
00103 }
00104 }
00105 else
00106 {
00107 score -= 3;
00108 if (bestScore - score >= maxDrop)
00109 break;
00110 }
00111 }
00112 *pStartQ = q+bestPos;
00113 *pStartT = t+bestPos;
00114 }
00115
00116 static int ffHashFuncN(char *s, int seedSize)
00117
00118 {
00119 int acc = 0;
00120 int i;
00121 for (i=0; i<seedSize; ++i)
00122 {
00123 acc <<= 1;
00124 acc += ntVal[(int)s[i]];
00125 }
00126 return acc&0xfff;
00127 }
00128
00129 struct seqHashEl
00130
00131 {
00132 struct seqHashEl *next;
00133 char *seq;
00134 };
00135
00136 static boolean totalDegenerateN(char *s, int seedSize)
00137
00138 {
00139 char c1 = s[0], c2 = s[1];
00140 int i;
00141 if (seedSize & 1)
00142 {
00143 seedSize -= 1;
00144 if (c1 != s[seedSize])
00145 return FALSE;
00146 }
00147 for (i=2; i<seedSize; i += 2)
00148 {
00149 if (c1 != s[i] || c2 != s[i+1])
00150 return FALSE;
00151 }
00152 return TRUE;
00153 }
00154
00155 static struct ffAli *ffFindExtendNmers(char *nStart, char *nEnd, char *hStart, char *hEnd,
00156 int seedSize)
00157
00158 {
00159 struct lm *lm = lmInit(32*1024);
00160 struct seqHashEl **hashTable, *hashEl, **hashSlot;
00161 struct ffAli *ffList = NULL, *ff;
00162 char *n = nStart, *h = hStart, *ne = nEnd - seedSize, *he = hEnd - seedSize;
00163
00164
00165 lmAllocArray(lm, hashTable, 4*1024);
00166 while (n <= ne)
00167 {
00168 if (!totalDegenerateN(n, seedSize))
00169 {
00170 hashSlot = ffHashFuncN(n, seedSize) + hashTable;
00171 lmAllocVar(lm, hashEl);
00172 hashEl->seq = n;
00173 slAddHead(hashSlot, hashEl);
00174 }
00175 ++n;
00176 }
00177
00178
00179 while (h <= he)
00180 {
00181 for (hashEl = hashTable[ffHashFuncN(h, seedSize)];
00182 hashEl != NULL; hashEl = hashEl->next)
00183 {
00184 if (memcmp(hashEl->seq, h, seedSize) == 0)
00185 {
00186 AllocVar(ff);
00187 ff->hStart = h;
00188 ff->hEnd = h + seedSize;
00189 ff->nStart = hashEl->seq;
00190 ff->nEnd = hashEl->seq + seedSize;
00191 extendExactLeft(ff->nStart - nStart, ff->hStart - hStart,
00192 &ff->nStart, &ff->hStart);
00193 extendExactRight(nEnd - ff->nEnd, hEnd - ff->hEnd, &ff->nEnd, &ff->hEnd);
00194 ff->left = ffList;
00195 ffList = ff;
00196 }
00197 }
00198 ++h;
00199 }
00200 ffList = ffMakeRightLinks(ffList);
00201 ffList = ffMergeClose(ffList, nStart, hStart);
00202 lmCleanup(&lm);
00203 return ffList;
00204 }
00205
00206 static void clumpToExactRange(struct gfClump *clump, bioSeq *qSeq, int tileSize,
00207 int frame, struct trans3 *t3, struct gfRange **pRangeList)
00208
00209
00210 {
00211 struct gfSeqSource *target = clump->target;
00212 aaSeq *tSeq = target->seq;
00213 BIOPOL *qs, *ts, *qe, *te;
00214 struct gfHit *hit;
00215 int qStart = 0, tStart = 0, qEnd = 0, tEnd = 0, newQ = 0, newT = 0;
00216 boolean outOfIt = TRUE;
00217 struct gfRange *range;
00218 BIOPOL *lastQs = NULL, *lastQe = NULL, *lastTs = NULL, *lastTe = NULL;
00219
00220 if (tSeq == NULL)
00221 internalErr();
00222
00223
00224
00225
00226
00227
00228
00229 for (hit = clump->hitList; ; hit = hit->next)
00230 {
00231 if (hit != NULL)
00232 {
00233 newQ = hit->qStart;
00234 newT = hit->tStart - target->start;
00235 }
00236
00237
00238 if (!outOfIt)
00239 {
00240
00241
00242
00243 if (hit == NULL || newQ != qEnd || newT != tEnd)
00244 {
00245 qs = qSeq->dna + qStart;
00246 ts = tSeq->dna + tStart;
00247 qe = qSeq->dna + qEnd;
00248 te = tSeq->dna + tEnd;
00249 extendExactRight(qSeq->size - qEnd, tSeq->size - tEnd,
00250 &qe, &te);
00251 extendExactLeft(qStart, tStart, &qs, &ts);
00252 if (qs != lastQs || ts != lastTs || qe != lastQe || qs != lastQs)
00253 {
00254 lastQs = qs;
00255 lastTs = ts;
00256 lastQe = qe;
00257 lastTe = te;
00258 AllocVar(range);
00259 range->qStart = qs - qSeq->dna;
00260 range->qEnd = qe - qSeq->dna;
00261 range->tName = cloneString(tSeq->name);
00262 range->tSeq = tSeq;
00263 range->tStart = ts - tSeq->dna;
00264 range->tEnd = te - tSeq->dna;
00265 range->hitCount = qe - qs;
00266 range->frame = frame;
00267 range->t3 = t3;
00268 slAddHead(pRangeList, range);
00269 }
00270 outOfIt = TRUE;
00271 }
00272 }
00273 if (hit == NULL)
00274 break;
00275
00276 if (outOfIt)
00277 {
00278 qStart = newQ;
00279 qEnd = qStart + tileSize;
00280 tStart = newT;
00281 tEnd = tStart + tileSize;
00282 outOfIt = FALSE;
00283 }
00284 else
00285 {
00286 qEnd = newQ + tileSize;
00287 tEnd = newT + tileSize;
00288 }
00289 }
00290 }
00291
00292 static void addExtraHits(struct gfHit *hitList, int hitSize,
00293 struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct ffAli **pExtraList)
00294
00295 {
00296 struct gfHit *hit;
00297 struct ffAli *ff;
00298 char *qs = qSeq->dna, *ts = tSeq->dna;
00299 char *qe = qs + qSeq->size, *te = ts + tSeq->size;
00300 for (hit = hitList; hit != NULL; hit = hit->next)
00301 {
00302 AllocVar(ff);
00303 ff->nStart = ff->nEnd = qs + hit->qStart;
00304 ff->hStart = ff->hEnd = ts + hit->tStart;
00305 ff->nEnd += hitSize;
00306 ff->hEnd += hitSize;
00307 ff->left = *pExtraList;
00308 ffExpandExactLeft(ff, qs, ts);
00309 ffExpandExactRight(ff, qe, te);
00310 *pExtraList = ff;
00311 }
00312 }
00313
00314 static struct ffAli *foldInExtras(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
00315 struct ffAli *ffList, struct ffAli *extraList)
00316
00317
00318 {
00319 if (extraList != NULL)
00320 {
00321 struct ssBundle *bun;
00322 struct ssFfItem *ffi;
00323 AllocVar(bun);
00324 bun->qSeq = qSeq;
00325 bun->genoSeq = tSeq;
00326 bun->avoidFuzzyFindKludge = TRUE;
00327 AllocVar(ffi);
00328 ffi->ff = ffList;
00329 slAddHead(&bun->ffList, ffi);
00330 AllocVar(ffi);
00331 ffi->ff = extraList;
00332 slAddHead(&bun->ffList, ffi);
00333 ssStitch(bun, ffCdna, 16, 1);
00334 if (bun->ffList != NULL)
00335 {
00336 ffList = bun->ffList->ff;
00337 bun->ffList->ff = NULL;
00338 }
00339 else
00340 {
00341 ffList = NULL;
00342 }
00343 ssBundleFree(&bun);
00344 }
00345 return ffList;
00346 }
00347
00348 static struct ffAli *scanIndexForSmallExons(struct genoFind *gf, struct gfSeqSource *target,
00349 struct dnaSeq *qSeq, Bits *qMaskBits, int qMaskOffset, struct dnaSeq *tSeq,
00350 struct lm *lm, struct ffAli *ffList)
00351
00352 {
00353 int qGap, tGap, tStart, qStart;
00354 struct ffAli *lastFf = NULL, *ff = ffList;
00355 struct gfHit *hitList = NULL;
00356 struct dnaSeq qSubSeq;
00357 struct ffAli *extraList = NULL;
00358 int tileSize = gf->tileSize;
00359 int biggestToFind = 200;
00360
00361
00362 if (ffList == NULL)
00363 return NULL;
00364
00365 ZeroVar(&qSubSeq);
00366
00367
00368 qGap = ff->nStart - qSeq->dna;
00369 tGap = ff->hStart - tSeq->dna;
00370 if (qGap >= tileSize && qGap <= biggestToFind && tGap >= tileSize)
00371 {
00372 tStart = ff->hStart - tSeq->dna;
00373 if (tGap > ffIntronMax)
00374 {
00375 tGap = ffIntronMax;
00376 }
00377 qSubSeq.dna = qSeq->dna;
00378 qSubSeq.size = qGap;
00379 hitList = gfFindHitsInRegion(gf, &qSubSeq, qMaskBits, qMaskOffset,
00380 lm, target, tStart - tGap, tStart);
00381 addExtraHits(hitList, tileSize, &qSubSeq, tSeq, &extraList);
00382 }
00383
00384
00385 for (;;)
00386 {
00387 lastFf = ff;
00388 ff = ff->right;
00389 if (ff == NULL)
00390 break;
00391 qGap = ff->nStart - lastFf->nEnd;
00392 tGap = ff->hStart - lastFf->hEnd;
00393 if (qGap >= tileSize && qGap <= biggestToFind && tGap >= tileSize)
00394 {
00395 qStart = lastFf->nEnd - qSeq->dna;
00396 tStart = lastFf->hEnd - tSeq->dna;
00397 qSubSeq.dna = lastFf->nEnd;
00398 qSubSeq.size = qGap;
00399 hitList = gfFindHitsInRegion(gf, &qSubSeq, qMaskBits, qMaskOffset + qStart,
00400 lm, target, tStart, tStart + tGap);
00401 addExtraHits(hitList, tileSize, &qSubSeq, tSeq, &extraList);
00402 }
00403 }
00404
00405
00406 qGap = qSeq->dna + qSeq->size - lastFf->nEnd;
00407 tGap = tSeq->dna + tSeq->size - lastFf->hEnd;
00408 if (qGap >= tileSize && qGap < biggestToFind && tGap >= tileSize)
00409 {
00410 if (tGap > ffIntronMax) tGap = ffIntronMax;
00411 qStart = lastFf->nEnd - qSeq->dna;
00412 tStart = lastFf->hEnd - tSeq->dna;
00413 qSubSeq.dna = lastFf->nEnd;
00414 qSubSeq.size = qGap;
00415 hitList = gfFindHitsInRegion(gf, &qSubSeq, qMaskBits, qMaskOffset + qStart,
00416 lm, target, tStart, tStart + tGap);
00417 addExtraHits(hitList, tileSize, &qSubSeq, tSeq, &extraList);
00418 }
00419 extraList = ffMakeRightLinks(extraList);
00420 ffList = foldInExtras(qSeq, tSeq, ffList, extraList);
00421 return ffList;
00422 }
00423
00424
00425 static void bandExtBefore(struct axtScoreScheme *ss, struct ffAli *ff,
00426 int qGap, int tGap, struct ffAli **pExtraList)
00427
00428
00429 {
00430 struct ffAli *ext;
00431 int minGap = min(qGap, tGap);
00432 int maxGap = minGap * 2;
00433 if (minGap > 0)
00434 {
00435 if (qGap > maxGap) qGap = maxGap;
00436 if (tGap > maxGap) tGap = maxGap;
00437 ext = bandExtFf(NULL, ss, 3, ff, ff->nStart - qGap, ff->nStart,
00438 ff->hStart - tGap, ff->hStart, -1, maxGap);
00439 ffCat(pExtraList, &ext);
00440 }
00441 }
00442
00443 static void bandExtAfter(struct axtScoreScheme *ss, struct ffAli *ff,
00444 int qGap, int tGap, struct ffAli **pExtraList)
00445
00446
00447 {
00448 struct ffAli *ext;
00449 int minGap = min(qGap, tGap);
00450 int maxGap = minGap * 2;
00451 if (minGap > 0)
00452 {
00453 if (qGap > maxGap) qGap = maxGap;
00454 if (tGap > maxGap) tGap = maxGap;
00455 ext = bandExtFf(NULL, ss, 3, ff, ff->nEnd, ff->nEnd + qGap,
00456 ff->hEnd, ff->hEnd + tGap, 1, maxGap);
00457 ffCat(pExtraList, &ext);
00458 }
00459 }
00460
00461
00462 static struct ffAli *bandedExtend(struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct ffAli *ffList)
00463
00464 {
00465 struct ffAli *extraList = NULL, *ff = ffList, *lastFf = NULL;
00466 struct axtScoreScheme *ss = axtScoreSchemeRnaDefault();
00467 int qGap, tGap;
00468
00469 if (ff == NULL)
00470 return NULL;
00471
00472
00473 qGap = ff->nStart - qSeq->dna;
00474 tGap = ff->hStart - tSeq->dna;
00475 bandExtBefore(ss, ff, qGap, tGap, &extraList);
00476
00477
00478 for (;;)
00479 {
00480 lastFf = ff;
00481 ff = ff->right;
00482 if (ff == NULL)
00483 break;
00484 qGap = ff->nStart - lastFf->nEnd;
00485 tGap = ff->hStart - lastFf->hEnd;
00486 bandExtAfter(ss, lastFf, qGap, tGap, &extraList);
00487 bandExtBefore(ss, ff, qGap, tGap, &extraList);
00488 }
00489
00490
00491 qGap = qSeq->dna + qSeq->size - lastFf->nEnd;
00492 tGap = tSeq->dna + tSeq->size - lastFf->hEnd;
00493 bandExtAfter(ss, lastFf, qGap, tGap, &extraList);
00494
00495 ffList = foldInExtras(qSeq, tSeq, ffList, extraList);
00496 return ffList;
00497 }
00498
00499
00500
00501 static struct ffAli *expandGapless(struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct ffAli *ffList)
00502
00503
00504 {
00505 struct ffAli *ff = ffList, *lastFf = NULL;
00506 char *nStart = qSeq->dna;
00507 char *nEnd = qSeq->dna + qSeq->size;
00508 char *hStart = tSeq->dna;
00509 char *hEnd = tSeq->dna + tSeq->size;
00510
00511
00512 extendGaplessLeft(ff->nStart - nStart, ff->hStart - hStart,
00513 9, &ff->nStart, &ff->hStart);
00514
00515
00516 for (;;)
00517 {
00518 lastFf = ff;
00519 ff = ff->right;
00520 if (ff == NULL)
00521 break;
00522 extendGaplessRight(nEnd - lastFf->nEnd, hEnd - lastFf->hEnd, 9,
00523 &lastFf->nEnd, &lastFf->hEnd);
00524 extendGaplessLeft(ff->nStart - nStart, ff->hStart - hStart, 9,
00525 &ff->nStart, &ff->hStart);
00526 }
00527 extendGaplessRight(nEnd - lastFf->nEnd,
00528 hEnd - lastFf->hEnd, 9,
00529 &lastFf->nEnd, &lastFf->hEnd);
00530 return ffList;
00531 }
00532
00533
00534 static int seedResolvePower(int seedSize, int resolveLimit)
00535
00536
00537 {
00538 int res;
00539 if (seedSize >= 14)
00540 return ffIntronMax;
00541 res = (1 << (seedSize+seedSize-resolveLimit));
00542 if (res > ffIntronMax)
00543 res = ffIntronMax;
00544 return res;
00545 }
00546
00547 static char *scanExactLeft(char *n, int nSize, int hSize, char *hEnd, int resolveLimit)
00548
00549 {
00550
00551 char n1 = *n++;
00552 char *hStart;
00553 int maxSize = seedResolvePower(nSize, resolveLimit);
00554
00555 if (hSize > maxSize) hSize = maxSize;
00556 nSize -= 1;
00557 hStart = hEnd - hSize;
00558
00559 hEnd -= nSize;
00560 while (hEnd >= hStart)
00561 {
00562 if (n1 == *hEnd && memcmp(n, hEnd+1, nSize) == 0)
00563 return hEnd;
00564 hEnd -= 1;
00565 }
00566 return NULL;
00567 }
00568
00569 static char *scanExactRight(char *n, int nSize, int hSize, char *hStart, int resolveLimit)
00570
00571 {
00572
00573 char n1 = *n++;
00574 char *hEnd;
00575 int maxSize = seedResolvePower(nSize, resolveLimit);
00576
00577 if (hSize > maxSize) hSize = maxSize;
00578 hEnd = hStart + hSize;
00579 nSize -= 1;
00580
00581 hEnd -= nSize;
00582 while (hStart <= hEnd)
00583 {
00584 if (n1 == *hStart && memcmp(n, hStart+1, nSize) == 0)
00585 return hStart;
00586 hStart += 1;
00587 }
00588 return NULL;
00589 }
00590
00591 static struct ffAli *fillInExact(char *nStart, char *nEnd, char *hStart, char *hEnd,
00592 boolean isRc, boolean scanLeft, boolean scanRight, int resolveLimit)
00593
00594
00595
00596
00597 {
00598 struct ffAli *ff = NULL;
00599 char *hPos = NULL;
00600 int nGap = nEnd - nStart;
00601 int hGap = hEnd - hStart;
00602 int minGap = min(nGap, hGap);
00603
00604 if (minGap <= 2)
00605 return NULL;
00606 if (scanLeft)
00607 {
00608 if ((hPos = scanExactLeft(nStart, nGap, hGap, hEnd, resolveLimit)) != NULL)
00609 {
00610 AllocVar(ff);
00611 ff->nStart = nStart;
00612 ff->nEnd = nEnd;
00613 ff->hStart = hPos;
00614 ff->hEnd = hPos + nGap;
00615 return ff;
00616 }
00617 }
00618 else
00619 {
00620 if ((hPos = scanExactRight(nStart, nGap, hGap, hStart, resolveLimit)) != NULL)
00621 {
00622 AllocVar(ff);
00623 ff->nStart = nStart;
00624 ff->nEnd = nEnd;
00625 ff->hStart = hPos;
00626 ff->hEnd = hPos + nGap;
00627 return ff;
00628 }
00629 }
00630 return NULL;
00631 }
00632
00633 static struct ffAli *findFromSmallerSeeds(char *nStart, char *nEnd,
00634 char *hStart, char *hEnd, boolean isRc,
00635 boolean scanLeft, boolean scanRight, int seedSize, int resolveLimit)
00636
00637 {
00638 int nGap = nEnd - nStart;
00639 if (nGap >= seedSize)
00640 {
00641 struct ffAli *ffList;
00642 if (scanLeft || scanRight)
00643 {
00644 int hGap = hEnd - hStart;
00645 int maxSize = seedResolvePower(seedSize, resolveLimit);
00646 if (hGap > maxSize) hGap = maxSize;
00647 if (scanLeft)
00648 hStart = hEnd - hGap;
00649 if (scanRight)
00650 hEnd = hStart + hGap;
00651 }
00652 ffList = ffFindExtendNmers(nStart, nEnd, hStart, hEnd, seedSize);
00653 if (ffList != NULL)
00654 {
00655 struct ffAli *extensions = NULL, *ff;
00656 struct axtScoreScheme *ss = axtScoreSchemeRnaDefault();
00657 for (ff = ffList; ff != NULL; ff = ff->right)
00658 {
00659 bandExtBefore(ss, ff, ff->nStart - nStart, ff->hStart - hStart, &extensions);
00660 bandExtAfter(ss, ff, nEnd - ff->nEnd, hEnd - ff->hEnd, &extensions);
00661 }
00662 ffCat(&ffList, &extensions);
00663 }
00664
00665 return ffList;
00666 }
00667 return NULL;
00668 }
00669
00670 static int countT(char *s, int size)
00671
00672 {
00673 int i;
00674 for (i=0; i<size; ++i)
00675 {
00676 if (s[i] != 't')
00677 break;
00678 }
00679 return i;
00680 }
00681
00682 static int countA(char *s, int size)
00683
00684 {
00685 int count = 0;
00686 int i;
00687 for (i=size-1; i >= 0; --i)
00688 {
00689 if (s[i] == 'a')
00690 ++count;
00691 else
00692 break;
00693 }
00694 return count;
00695 }
00696
00697 struct ffAli *scanTinyOne(char *nStart, char *nEnd, char *hStart, char *hEnd,
00698 boolean isRc, boolean scanLeft, boolean scanRight, int seedSize)
00699
00700 {
00701 struct ffAli *ff;
00702 int nGap = nEnd - nStart;
00703 if (nGap > 80)
00704 return NULL;
00705 if (scanLeft && isRc)
00706 nStart += countT(nStart, nGap);
00707 if (scanRight && !isRc)
00708 nEnd -= countA(nStart, nGap);
00709 ff = fillInExact(nStart, nEnd, hStart, hEnd, isRc, scanLeft, scanRight, 3);
00710 if (ff != NULL)
00711 {
00712 return ff;
00713 }
00714 return findFromSmallerSeeds(nStart, nEnd, hStart, hEnd, isRc,
00715 scanLeft, scanRight, seedSize, 3);
00716 }
00717
00718 static struct ffAli *scanForSmallerExons( int seedSize,
00719 struct dnaSeq *qSeq, struct dnaSeq *tSeq,
00720 boolean isRc, struct ffAli *ffList)
00721
00722 {
00723 struct ffAli *extraList = NULL, *ff = ffList, *lastFf = NULL, *newFf;
00724
00725 if (ff == NULL)
00726 return NULL;
00727
00728
00729 newFf = scanTinyOne(qSeq->dna, ff->nStart, tSeq->dna, ff->hStart,
00730 isRc, TRUE, FALSE, seedSize);
00731 ffCat(&extraList, &newFf);
00732
00733
00734 for (;;)
00735 {
00736 lastFf = ff;
00737 ff = ff->right;
00738 if (ff == NULL)
00739 break;
00740 newFf = scanTinyOne(lastFf->nEnd, ff->nStart, lastFf->hEnd, ff->hStart,
00741 isRc, FALSE, FALSE, seedSize);
00742 ffCat(&extraList, &newFf);
00743 }
00744
00745
00746 newFf = scanTinyOne(lastFf->nEnd, qSeq->dna + qSeq->size,
00747 lastFf->hEnd, tSeq->dna + tSeq->size, isRc, FALSE, TRUE, seedSize);
00748 ffCat(&extraList, &newFf);
00749
00750 ffList = foldInExtras(qSeq, tSeq, ffList, extraList);
00751 return ffList;
00752 }
00753
00754 static struct ffAli *scanForTinyInternal(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
00755 boolean isRc, struct ffAli *ffList)
00756
00757 {
00758 struct ffAli *extraList = NULL, *ff = ffList, *lastFf = NULL, *newFf;
00759
00760 if (ff == NULL)
00761 return NULL;
00762
00763
00764 for (;;)
00765 {
00766 lastFf = ff;
00767 ff = ff->right;
00768 if (ff == NULL)
00769 break;
00770 newFf = fillInExact(lastFf->nEnd, ff->nStart, lastFf->hEnd, ff->hStart, isRc,
00771 FALSE, FALSE, 0);
00772 ffCat(&extraList, &newFf);
00773 }
00774
00775 ffList = foldInExtras(qSeq, tSeq, ffList, extraList);
00776 return ffList;
00777 }
00778
00779 static boolean tradeMismatchToCloseSpliceGap( struct ffAli *left,
00780 struct ffAli *right, int orientation)
00781
00782
00783 {
00784 if (intronOrientation(left->hEnd+1, right->hStart) == orientation)
00785 {
00786 left->hEnd += 1;
00787 left->nEnd += 1;
00788 return TRUE;
00789 }
00790 if (intronOrientation(left->hEnd, right->hStart-1) == orientation)
00791 {
00792 right->hStart -= 1;
00793 right->nStart -= 1;
00794 return TRUE;
00795 }
00796 return FALSE;
00797 }
00798
00799 static int calcSpliceScore(struct axtScoreScheme *ss,
00800 char a1, char a2, char b1, char b2, int orientation)
00801
00802 {
00803 int score = 0;
00804 int matchScore = ss->matrix['c']['c'];
00805 if (orientation >= 0)
00806 {
00807 score += ss->matrix[(int)a1]['g'];
00808 if (a2 != 'c')
00809 score += ss->matrix[(int)a2]['t'];
00810 score += ss->matrix[(int)b1]['a'];
00811 score += ss->matrix[(int)b2]['g'];
00812 }
00813 else
00814 {
00815 score += ss->matrix[(int)a1]['c'];
00816 score += ss->matrix[(int)a2]['t'];
00817 if (b1 != 'g')
00818 score += ss->matrix[(int)b1]['a'];
00819 score += ss->matrix[(int)b2]['c'];
00820 }
00821 if (score >= 3* matchScore)
00822 score += matchScore;
00823 return score;
00824 }
00825
00826 static void grabAroundIntron(char *hpStart, int iPos, int iSize,
00827 int modPeelSize, char *hSeq)
00828
00829 {
00830 memcpy(hSeq, hpStart, iPos);
00831 memcpy(hSeq+iPos, hpStart+iPos+iSize, modPeelSize - iPos);
00832 hSeq[modPeelSize] = 0;
00833 }
00834
00835 #ifdef UNTESTED
00836 struct ffAli *removeFf(struct ffAli *ff, struct ffAli *ffList)
00837
00838 {
00839 struct ffAli *right = ff->right;
00840 struct ffAli *left;
00841 if (ff == ffList)
00842 {
00843 if (right != NULL)
00844 right->left = NULL;
00845 freeMem(ff);
00846 return right;
00847 }
00848 left = ff->left;
00849 left->right = right;
00850 if (right != NULL)
00851 right->left = left;
00852 freeMem(ff);
00853 return ffList;
00854 }
00855 #endif
00856
00857 static struct ffAli *hardRefineSplice(struct ffAli *left, struct ffAli *right,
00858 struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct ffAli *ffList,
00859 int orientation)
00860
00861
00862 {
00863
00864
00865
00866 int peelSize = 12;
00867 char nSeq[12+1], hSeq[12+1+1];
00868 char nSym[25], hSym[25];
00869 int symCount;
00870 int seqScore, spliceScore, score, maxScore = 0;
00871 int nGap = right->nStart - left->nEnd;
00872 int hGap = right->hStart - left->hEnd;
00873 int peelLeft = (peelSize - nGap)/2;
00874 int intronSize = hGap - nGap;
00875 char *npStart = left->nEnd - peelLeft;
00876 char *npEnd = npStart + peelSize;
00877 char *hpStart = left->hEnd - peelLeft;
00878 char *hpEnd = npEnd + (right->hStart - right->nStart);
00879 struct axtScoreScheme *ss = axtScoreSchemeRnaDefault();
00880 static int modSize[3] = {0, 1, -1};
00881 int modIx;
00882 int bestPos = -1, bestMod = 0;
00883 int iPos;
00884
00885 memcpy(nSeq, npStart, peelSize);
00886 nSeq[peelSize] = 0;
00887 for (modIx=0; modIx < ArraySize(modSize); ++modIx)
00888 {
00889 int modOne = modSize[modIx];
00890 int modPeelSize = peelSize - modOne;
00891 int iSize = intronSize + modOne;
00892 for (iPos=0; iPos <= modPeelSize; iPos++)
00893 {
00894 grabAroundIntron(hpStart, iPos, iSize, modPeelSize, hSeq);
00895 if (bandExt(TRUE, ss, 2, nSeq, peelSize, hSeq, modPeelSize, 1,
00896 sizeof(hSym), &symCount, nSym, hSym, NULL, NULL))
00897 {
00898 seqScore = axtScoreSym(ss, symCount, nSym, hSym);
00899 spliceScore = calcSpliceScore(ss, hpStart[iPos], hpStart[iPos+1],
00900 hpStart[iPos+iSize-2], hpStart[iPos+iSize-1], orientation);
00901 score = seqScore + spliceScore;
00902 if (score > maxScore)
00903 {
00904 maxScore = score;
00905 bestPos = iPos;
00906 bestMod = modOne;
00907 }
00908 }
00909 }
00910 }
00911 if (maxScore > 0)
00912 {
00913 int modPeelSize = peelSize - bestMod;
00914 int i,diff, cutSymIx = 0;
00915 int nIx, hIx;
00916 struct ffAli *ff;
00917
00918
00919 grabAroundIntron(hpStart, bestPos, intronSize + bestMod, modPeelSize, hSeq);
00920 bandExt(TRUE, ss, 2, nSeq, peelSize, hSeq, modPeelSize, 1,
00921 sizeof(hSym), &symCount, nSym, hSym, NULL, NULL);
00922
00923
00924 if (left->nStart > npStart || right->nEnd < npEnd)
00925 {
00926
00927
00928
00929
00930 verbose(2, "Unable to peel in hardRefineSplice\n");
00931 return ffList;
00932 }
00933 diff = left->nEnd - npStart;
00934 left->nEnd -= diff;
00935 left->hEnd -= diff;
00936 diff = right->nStart - npEnd;
00937 right->nStart -= diff;
00938 right->hStart -= diff;
00939
00940
00941
00942 nIx = hIx = 0;
00943 ff = left;
00944 for (i=0; i<symCount; ++i)
00945 {
00946 if (hIx == bestPos)
00947 {
00948 cutSymIx = i;
00949 break;
00950 }
00951 if (hSym[i] == '-')
00952 {
00953 ff = NULL;
00954 nIx += 1;
00955 }
00956 else if (nSym[i] == '-')
00957 {
00958 ff = NULL;
00959 hIx += 1;
00960 }
00961 else
00962 {
00963 if (ff == NULL)
00964 {
00965 AllocVar(ff);
00966 ff->left = left;
00967 ff->right = right;
00968 left->right = ff;
00969 right->left = ff;
00970 left = ff;
00971 ff->nStart = ff->nEnd = npStart + nIx;
00972 ff->hStart = ff->hEnd = hpStart + hIx;
00973 }
00974 ++nIx;
00975 ++hIx;
00976 ff->nEnd += 1;
00977 ff->hEnd += 1;
00978 }
00979 }
00980
00981
00982
00983 ff = right;
00984 hIx = nIx = 0;
00985 for (i = symCount-1; i >= cutSymIx; --i)
00986 {
00987 if (hSym[i] == '-')
00988 {
00989 ff = NULL;
00990 nIx += 1;
00991 }
00992 else if (nSym[i] == '-')
00993 {
00994 ff = NULL;
00995 hIx += 1;
00996 }
00997 else
00998 {
00999 if (ff == NULL)
01000 {
01001 AllocVar(ff);
01002 ff->left = left;
01003 ff->right = right;
01004 left->right = ff;
01005 right->left = ff;
01006 left = ff;
01007 ff->nStart = ff->nEnd = npEnd - nIx;
01008 ff->hStart = ff->hEnd = hpEnd - hIx;
01009 }
01010 ++nIx;
01011 ++hIx;
01012 ff->nStart -= 1;
01013 ff->hStart -= 1;
01014 }
01015 }
01016 }
01017 return ffList;
01018 }
01019
01020
01021 static struct ffAli *refineSpliceSites(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
01022 struct ffAli *ffList)
01023
01024
01025 {
01026 int orientation = ffIntronOrientation(ffList);
01027 struct ffAli *ff, *nextFf;
01028 if (orientation == 0)
01029 return ffList;
01030 if (ffSlideOrientedIntrons(ffList, orientation))
01031 ffList = ffRemoveEmptyAlis(ffList, TRUE);
01032 for (ff = ffList; ff != NULL; ff = nextFf)
01033 {
01034 int nGap, hGap;
01035 if ((nextFf = ff->right) == NULL)
01036 break;
01037 nGap = nextFf->nStart - ff->nEnd;
01038 hGap = nextFf->hStart - ff->hEnd;
01039 if (nGap > 0 && nGap <= 6 && hGap >= 30)
01040 {
01041 if (nGap == 1)
01042 {
01043 if (tradeMismatchToCloseSpliceGap(ff, nextFf, orientation))
01044 continue;
01045 }
01046 ffList = hardRefineSplice(ff, nextFf, qSeq, tSeq, ffList, orientation);
01047 }
01048 }
01049 return ffList;
01050 }
01051
01052
01053 static boolean smoothOneGap(struct ffAli *left, struct ffAli *right, struct ffAli *ffList)
01054
01055
01056
01057
01058 {
01059 int nGap = right->nStart - left->nEnd;
01060 int hGap = right->hStart - left->hEnd;
01061 if (nGap > 0 && hGap > 0 && nGap < 10 && hGap < 10)
01062 {
01063 int sizeDiff = nGap - hGap;
01064 if (sizeDiff < 0) sizeDiff = -sizeDiff;
01065 if (sizeDiff <= 3)
01066 {
01067 struct axtScoreScheme *ss = axtScoreSchemeRnaDefault();
01068 char hSym[20], nSym[20];
01069 int symCount;
01070 if (bandExt(TRUE, ss, 3, left->nEnd, nGap, left->hEnd, hGap, 1,
01071 sizeof(hSym), &symCount, nSym, hSym, NULL, NULL))
01072 {
01073 int gapPenalty = -ffCalcCdnaGapPenalty(hGap, nGap) * ss->matrix['a']['a'];
01074 int score = axtScoreSym(ss, symCount, nSym, hSym);
01075 if (score >= gapPenalty)
01076 {
01077 struct ffAli *l, *r;
01078 l = ffAliFromSym(symCount, nSym, hSym, NULL, left->nEnd, left->hEnd);
01079 r = ffRightmost(l);
01080 left->right = l;
01081 l->left = left;
01082 r->right = right;
01083 right->left = r;
01084 return TRUE;
01085 }
01086 }
01087 }
01088 }
01089 return FALSE;
01090 }
01091
01092
01093 static struct ffAli *smoothSmallGaps(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
01094 struct ffAli *ffList)
01095
01096 {
01097 struct ffAli *left = ffList, *right;
01098 boolean smoothed = FALSE;
01099
01100 if (ffList == NULL) return NULL;
01101 for (;;)
01102 {
01103 if ((right = left->right) == NULL)
01104 break;
01105 if (smoothOneGap(left, right, ffList))
01106 smoothed = TRUE;
01107 left = right;
01108 }
01109 if (smoothed)
01110 {
01111 ffList = ffMergeNeedleAlis(ffList, TRUE);
01112 }
01113 return ffList;
01114 }
01115
01116 int aPenalty(char *s, int size)
01117
01118 {
01119 int aCount = 0, tCount = 0;
01120 int i;
01121 char c;
01122 for (i=0; i<size; ++i)
01123 {
01124 c = s[i];
01125 if (c == 'a') ++aCount;
01126 if (c == 't') ++tCount;
01127 }
01128 if (tCount > aCount) aCount = tCount;
01129 if (aCount >= size)
01130 return aCount-1;
01131 else if (aCount >= size*0.75)
01132 return aCount * 0.90;
01133 else
01134 return 0;
01135 }
01136
01137 static int trimGapPenalty(int hGap, int nGap, char *iStart, char *iEnd, int orientation)
01138
01139 {
01140 int penalty = ffCalcGapPenalty(hGap, nGap, ffCdna);
01141 if (hGap > 2 || nGap > 2)
01142
01143 {
01144 penalty <<= 1;
01145 if (nGap > 0)
01146 penalty += 3;
01147
01148
01149 penalty += 6 - 2*ffScoreIntron(iStart[0], iStart[1],
01150 iEnd[-2], iEnd[-1], orientation);
01151 }
01152 return penalty;
01153 }
01154
01155
01156 static struct ffAli *trimFlakyEnds(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
01157 struct ffAli *ffList)
01158
01159
01160
01161 {
01162 int orientation = ffIntronOrientation(ffList);
01163 struct ffAli *left, *right;
01164 char *iStart, *iEnd;
01165 int blockScore, gapPenalty;
01166
01167
01168 if (ffAliCount(ffList) < 2)
01169 return ffList;
01170
01171
01172 left = ffList;
01173 right = ffList->right;
01174 while (right != NULL)
01175 {
01176 blockScore = ffScoreMatch(left->nStart, left->hStart,
01177 left->nEnd-left->nStart);
01178 blockScore -= aPenalty(left->nStart, left->nEnd - left->nStart);
01179 iStart = left->hEnd;
01180 iEnd = right->hStart;
01181 gapPenalty = trimGapPenalty(iEnd-iStart,
01182 right->nStart - left->nEnd, iStart, iEnd, orientation);
01183 if (gapPenalty >= blockScore)
01184 {
01185 freeMem(left);
01186 ffList = right;
01187 right->left = NULL;
01188 }
01189 else
01190 break;
01191 left = right;
01192 right = right->right;
01193 }
01194
01195 right = ffRightmost(ffList);
01196 if (right == ffList)
01197 return ffList;
01198 left = right->left;
01199 while (left != NULL)
01200 {
01201 blockScore = ffScoreMatch(right->nStart, right->hStart,
01202 right->nEnd-right->nStart);
01203 blockScore -= aPenalty(right->nStart, right->nEnd - right->nStart);
01204 iStart = left->hEnd;
01205 iEnd = right->hStart;
01206 gapPenalty = trimGapPenalty(iEnd-iStart,
01207 right->nStart - left->nEnd, iStart, iEnd, orientation);
01208 if (gapPenalty >= blockScore)
01209 {
01210 freeMem(right);
01211 left->right = NULL;
01212 }
01213 else
01214 break;
01215 right = left;
01216 left = left->left;
01217 }
01218 return ffList;
01219 }
01220
01221
01222 static void refineBundle(struct genoFind *gf,
01223 struct dnaSeq *qSeq, Bits *qMaskBits, int qMaskOffset,
01224 struct dnaSeq *tSeq, struct lm *lm, struct ssBundle *bun, boolean isRc)
01225
01226 {
01227 struct ssFfItem *ffi;
01228 struct gfSeqSource *target = gfFindNamedSource(gf, tSeq->name);
01229
01230
01231 for (ffi = bun->ffList; ffi != NULL; ffi = ffi->next)
01232 {
01233 ffi->ff = expandGapless(qSeq, tSeq, ffi->ff);
01234 }
01235 ssStitch(bun, ffCdna, 16, 16);
01236
01237 for (ffi = bun->ffList; ffi != NULL; ffi = ffi->next)
01238 {
01239 ffi->ff = scanIndexForSmallExons(gf, target, qSeq, qMaskBits, qMaskOffset,
01240 tSeq, lm, ffi->ff);
01241 ffi->ff = bandedExtend(qSeq, tSeq, ffi->ff);
01242 ffi->ff = scanForSmallerExons(gf->tileSize, qSeq, tSeq, isRc, ffi->ff);
01243 ffi->ff = refineSpliceSites(qSeq, tSeq, ffi->ff);
01244 ffi->ff = scanForTinyInternal(qSeq, tSeq, isRc, ffi->ff);
01245 ffi->ff = smoothSmallGaps(qSeq, tSeq, ffi->ff);
01246 ffi->ff = trimFlakyEnds(qSeq, tSeq, ffi->ff);
01247 }
01248 }
01249
01250
01251 struct ssBundle *ffSeedExtInMem(struct genoFind *gf, struct dnaSeq *qSeq, Bits *qMaskBits,
01252 int qOffset, struct lm *lm, int minScore, boolean isRc)
01253
01254 {
01255 struct ssBundle *bunList = NULL, *bun;
01256 int hitCount;
01257 struct gfClump *clumpList, *clump;
01258 struct gfRange *rangeList = NULL, *range;
01259 struct dnaSeq *tSeq;
01260
01261 clumpList = gfFindClumpsWithQmask(gf, qSeq, qMaskBits, qOffset, lm, &hitCount);
01262 for (clump = clumpList; clump != NULL; clump = clump->next)
01263 clumpToExactRange(clump, qSeq, gf->tileSize, 0, NULL, &rangeList);
01264 slSort(&rangeList, gfRangeCmpTarget);
01265 rangeList = gfRangesBundle(rangeList, ffIntronMax);
01266 for (range = rangeList; range != NULL; range = range->next)
01267 {
01268 range->qStart += qOffset;
01269 range->qEnd += qOffset;
01270 tSeq = range->tSeq;
01271 AllocVar(bun);
01272 bun->qSeq = qSeq;
01273 bun->genoSeq = tSeq;
01274 bun->ffList = gfRangesToFfItem(range->components, qSeq);
01275 bun->isProt = FALSE;
01276 bun->avoidFuzzyFindKludge = TRUE;
01277 ssStitch(bun, ffCdna, 16, 10);
01278 refineBundle(gf, qSeq, qMaskBits, qOffset, tSeq, lm, bun, isRc);
01279 slAddHead(&bunList, bun);
01280 }
01281 gfRangeFreeList(&rangeList);
01282 gfClumpFreeList(&clumpList);
01283 return bunList;
01284 }
01285