00001
00002
00003
00004
00005
00006
00007
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "dlist.h"
00011 #include "fuzzyFind.h"
00012 #include "localmem.h"
00013 #include "patSpace.h"
00014 #include "trans3.h"
00015 #include "supStitch.h"
00016 #include "chainBlock.h"
00017
00018 static char const rcsid[] = "$Id: supStitch.c,v 1.38 2007/04/20 22:43:37 kent Exp $";
00019
00020 static void ssFindBestBig(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00021 enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00022 struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers);
00023
00024 void ssFfItemFree(struct ssFfItem **pEl)
00025
00026 {
00027 struct ssFfItem *el;
00028 if ((el = *pEl) != NULL)
00029 {
00030 ffFreeAli(&el->ff);
00031 freez(pEl);
00032 }
00033 }
00034
00035 void ssFfItemFreeList(struct ssFfItem **pList)
00036
00037 {
00038 struct ssFfItem *el, *next;
00039 for (el = *pList; el != NULL; el = next)
00040 {
00041 next = el->next;
00042 ssFfItemFree(&el);
00043 }
00044 *pList = NULL;
00045 }
00046
00047 void ssBundleFree(struct ssBundle **pEl)
00048
00049 {
00050 struct ssBundle *el;
00051 if ((el = *pEl) != NULL)
00052 {
00053 ssFfItemFreeList(&el->ffList);
00054 freez(pEl);
00055 }
00056 }
00057
00058 void ssBundleFreeList(struct ssBundle **pList)
00059
00060 {
00061 struct ssBundle *el, *next;
00062 for (el = *pList; el != NULL; el = next)
00063 {
00064 next = el->next;
00065 ssBundleFree(&el);
00066 }
00067 *pList = NULL;
00068 }
00069
00070 void dumpNearCrossover(char *what, DNA *n, DNA *h, int size)
00071
00072 {
00073 printf("%s: ", what);
00074 mustWrite(stdout, n, size);
00075 printf("\n");
00076 printf("%s: ", what);
00077 mustWrite(stdout, h, size);
00078 printf("\n");
00079 }
00080
00081 void dumpFf(struct ffAli *left, DNA *needle, DNA *hay)
00082
00083 {
00084 struct ffAli *ff;
00085 for (ff = left; ff != NULL; ff = ff->right)
00086 {
00087 printf("(%ld - %ld)[%ld-%ld] ", (long)(ff->hStart-hay), (long)(ff->hEnd-hay),
00088 (long)(ff->nStart - needle), (long)(ff->nEnd - needle));
00089 }
00090 printf("\n");
00091 }
00092
00093 void dumpBuns(struct ssBundle *bunList)
00094
00095 {
00096 struct ssBundle *bun;
00097 struct ssFfItem *ffl;
00098 for (bun = bunList; bun != NULL; bun = bun->next)
00099 {
00100 struct dnaSeq *qSeq = bun->qSeq;
00101 struct dnaSeq *genoSeq = bun->genoSeq;
00102 printf("Bundle of %d between %s and %s\n", slCount(bun->ffList), qSeq->name, genoSeq->name);
00103 for (ffl = bun->ffList; ffl != NULL; ffl = ffl->next)
00104 {
00105 dumpFf(ffl->ff, bun->qSeq->dna, bun->genoSeq->dna);
00106 }
00107 }
00108 }
00109
00110
00111 static int bioScoreMatch(boolean isProt, char *a, char *b, int size)
00112
00113 {
00114 if (isProt)
00115 {
00116 return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X');
00117 }
00118 else
00119 {
00120 return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n');
00121 }
00122 }
00123
00124
00125 static int findCrossover(struct ffAli *left, struct ffAli *right, int overlap, boolean isProt)
00126
00127
00128
00129
00130
00131 {
00132 int bestPos = 0;
00133 char *nStart = right->nStart;
00134 char *lhStart = left->hEnd - overlap;
00135 char *rhStart = right->hStart;
00136 int i;
00137 int (*scoreMatch)(char a, char b);
00138 int score, bestScore;
00139
00140 if (isProt)
00141 {
00142 scoreMatch = aaScore2;
00143 score = bestScore = aaScoreMatch(nStart, rhStart, overlap);
00144 }
00145 else
00146 {
00147 scoreMatch = dnaScore2;
00148 score = bestScore = dnaScoreMatch(nStart, rhStart, overlap);
00149 }
00150
00151 for (i=0; i<overlap; ++i)
00152 {
00153 char n = nStart[i];
00154 score += scoreMatch(lhStart[i], n);
00155 score -= scoreMatch(rhStart[i], n);
00156 if (score > bestScore)
00157 {
00158 bestScore = score;
00159 bestPos = i+1;
00160 }
00161 }
00162 return bestPos;
00163 }
00164
00165 static void trans3Offsets(struct trans3 *t3List, AA *startP, AA *endP,
00166 int *retStart, int *retEnd)
00167
00168 {
00169 struct trans3 *t3;
00170 int frame;
00171 aaSeq *seq;
00172
00173 for (t3 = t3List; t3 != NULL; t3 = t3->next)
00174 {
00175 for (frame = 0; frame < 3; ++frame)
00176 {
00177 seq = t3->trans[frame];
00178 if (seq->dna <= startP && startP < seq->dna + seq->size)
00179 {
00180 *retStart = 3*(startP - seq->dna)+frame + t3->start;
00181 *retEnd = 3*(endP - seq->dna)+frame + t3->start;
00182 return;
00183 }
00184 }
00185 }
00186 internalErr();
00187 }
00188
00189 static boolean isMonotonic(struct ffAli *left)
00190
00191 {
00192 struct ffAli *right;
00193
00194 while ((right = left->right) != NULL)
00195 {
00196 if (left->nEnd <= right->nStart && left->hEnd <= right->hStart)
00197 left = right;
00198 else
00199 {
00200 verbose(2, "not monotonic dn %d, dh %d\n", (int)(right->nStart - left->nEnd),
00201 (int)(right->hStart - right->hEnd));
00202 return FALSE;
00203 }
00204 }
00205 return TRUE;
00206 }
00207
00208 static struct ffAli *forceMonotonic(struct ffAli *aliList,
00209 struct dnaSeq *qSeq, struct dnaSeq *tSeq, enum ffStringency stringency,
00210 boolean isProt, struct trans3 *t3List)
00211
00212
00213 {
00214 if (!isProt)
00215 {
00216 if (!isMonotonic(aliList))
00217 {
00218 struct ffAli *leftovers = NULL;
00219 int score;
00220 ssFindBestBig(aliList, qSeq, tSeq, stringency, isProt, t3List, &aliList, &score,
00221 &leftovers);
00222 ffFreeAli(&leftovers);
00223 }
00224 }
00225 return aliList;
00226 }
00227
00228 struct ffAli *smallMiddleExons(struct ffAli *aliList,
00229 struct ssBundle *bundle,
00230 enum ffStringency stringency)
00231
00232 {
00233 if (bundle->t3List != NULL)
00234 return aliList;
00235 else
00236 {
00237 struct dnaSeq *qSeq = bundle->qSeq;
00238 struct dnaSeq *genoSeq = bundle->genoSeq;
00239 struct ffAli *right, *left = NULL, *newLeft, *newRight;
00240
00241 left = aliList;
00242 for (right = aliList->right; right != NULL; right = right->right)
00243 {
00244 if (right->hStart - left->hEnd >= 3 && right->nStart - left->nEnd >= 3)
00245 {
00246 newLeft = ffFind(left->nEnd, right->nStart, left->hEnd, right->hStart, stringency);
00247 if (newLeft != NULL)
00248 {
00249 newLeft = forceMonotonic(newLeft, qSeq, genoSeq,
00250 stringency, bundle->isProt, bundle->t3List );
00251 newRight = ffRightmost(newLeft);
00252 if (left != NULL)
00253 {
00254 left->right = newLeft;
00255 newLeft->left = left;
00256 }
00257 else
00258 {
00259 aliList = newLeft;
00260 }
00261 if (right != NULL)
00262 {
00263 right->left = newRight;
00264 newRight->right = right;
00265 }
00266 }
00267 }
00268 left = right;
00269 }
00270 }
00271 return aliList;
00272 }
00273
00274 struct ssNode
00275
00276 {
00277 struct ssEdge *waysIn;
00278 struct ffAli *ff;
00279 struct ssEdge *bestWayIn;
00280 int cumScore;
00281 int nodeScore;
00282 };
00283
00284 struct ssEdge
00285
00286 {
00287 struct ssEdge *next;
00288 struct ssNode *nodeIn;
00289 int score;
00290 int overlap;
00291 int crossover;
00292 };
00293
00294 struct ssGraph
00295
00296
00297 {
00298 int nodeCount;
00299 struct ssNode *nodes;
00300 int edgeCount;
00301 struct ssEdge *edges;
00302 };
00303
00304 static void ssGraphFree(struct ssGraph **pGraph)
00305
00306 {
00307 struct ssGraph *graph;
00308 if ((graph = *pGraph) != NULL)
00309 {
00310 freeMem(graph->nodes);
00311 freeMem(graph->edges);
00312 freez(pGraph);
00313 }
00314 }
00315
00316 boolean tripleCanFollow(struct ffAli *a, struct ffAli *b, aaSeq *qSeq, struct trans3 *t3List)
00317
00318 {
00319 int ahStart, ahEnd, bhStart, bhEnd;
00320 trans3Offsets(t3List, a->hStart, a->hEnd, &ahStart, &ahEnd);
00321 trans3Offsets(t3List, b->hStart, b->hEnd, &bhStart, &bhEnd);
00322 return (a->nStart < b->nStart && a->nEnd < b->nEnd && ahStart < bhStart && ahEnd < bhEnd);
00323 }
00324
00325
00326 static struct ssGraph *ssGraphMake(struct ffAli *ffList, bioSeq *qSeq,
00327 enum ffStringency stringency, boolean isProt, struct trans3 *t3List)
00328
00329 {
00330 int nodeCount = ffAliCount(ffList);
00331 int maxEdgeCount = (nodeCount+1)*(nodeCount)/2;
00332 int edgeCount = 0;
00333 struct ssEdge *edges, *e;
00334 struct ssNode *nodes;
00335 struct ssGraph *graph;
00336 struct ffAli *ff, *mid;
00337 int i, midIx;
00338 int overlap;
00339 boolean canFollow;
00340
00341 if (nodeCount == 1)
00342 maxEdgeCount = 1;
00343
00344 AllocVar(graph);
00345 graph->nodeCount = nodeCount;
00346 graph->nodes = AllocArray(nodes, nodeCount+1);
00347 for (i=1, ff = ffList; i<=nodeCount; ++i, ff = ff->right)
00348 {
00349 nodes[i].ff = ff;
00350 nodes[i].nodeScore = bioScoreMatch(isProt, ff->nStart, ff->hStart, ff->hEnd - ff->hStart);
00351 }
00352
00353 graph->edges = AllocArray(edges, maxEdgeCount);
00354 for (mid = ffList, midIx=1; mid != NULL; mid = mid->right, ++midIx)
00355 {
00356 int midScore;
00357 struct ssNode *midNode = &nodes[midIx];
00358 e = &edges[edgeCount++];
00359 assert(edgeCount <= maxEdgeCount);
00360 e->nodeIn = &nodes[0];
00361 e->score = midScore = midNode->nodeScore;
00362 midNode->waysIn = e;
00363 for (ff = ffList,i=1; ff != mid; ff = ff->right,++i)
00364 {
00365 int mhStart = 0, mhEnd = 0;
00366 if (t3List)
00367 {
00368 canFollow = tripleCanFollow(ff, mid, qSeq, t3List);
00369 trans3Offsets(t3List, mid->hStart, mid->hEnd, &mhStart, &mhEnd);
00370 }
00371 else
00372 {
00373 canFollow = (ff->nStart < mid->nStart && ff->nEnd < mid->nEnd
00374 && ff->hStart < mid->hStart && ff->hEnd < mid->hEnd);
00375 }
00376 if (canFollow)
00377 {
00378 struct ssNode *ffNode = &nodes[i];
00379 int score;
00380 int hGap;
00381 int nGap;
00382 int crossover;
00383
00384 nGap = mid->nStart - ff->nEnd;
00385 if (t3List)
00386 {
00387 int fhStart, fhEnd;
00388 trans3Offsets(t3List, ff->hStart, ff->hEnd, &fhStart, &fhEnd);
00389 hGap = mhStart - fhEnd;
00390 }
00391 else
00392 {
00393 hGap = mid->hStart - ff->hEnd;
00394 }
00395 e = &edges[edgeCount++];
00396 assert(edgeCount <= maxEdgeCount);
00397 e->nodeIn = ffNode;
00398 e->overlap = overlap = -nGap;
00399 if (overlap > 0)
00400 {
00401 int midSize = mid->hEnd - mid->hStart;
00402 int ffSize = ff->hEnd - ff->hStart;
00403 int newMidScore, newFfScore;
00404 e->crossover = crossover = findCrossover(ff, mid, overlap, isProt);
00405 newMidScore = bioScoreMatch(isProt, mid->nStart, mid->hStart, midSize-overlap+crossover);
00406 newFfScore = bioScoreMatch(isProt, ff->nStart+crossover, ff->hStart+crossover,
00407 ffSize-crossover);
00408 score = newMidScore - ffNode->nodeScore + newFfScore;
00409 nGap = 0;
00410 hGap -= overlap;
00411 }
00412 else
00413 {
00414 score = midScore;
00415 }
00416 score -= ffCalcGapPenalty(hGap, nGap, stringency);
00417 e->score = score;
00418 slAddHead(&midNode->waysIn, e);
00419 }
00420 }
00421 slReverse(&midNode->waysIn);
00422 }
00423 return graph;
00424 }
00425
00426 static struct ssNode *ssDynamicProgram(struct ssGraph *graph)
00427
00428
00429
00430 {
00431 int veryBestScore = -0x3fffffff;
00432 struct ssNode *veryBestNode = NULL;
00433 int nodeIx;
00434
00435 for (nodeIx = 1; nodeIx <= graph->nodeCount; ++nodeIx)
00436 {
00437 int bestScore = -0x3fffffff;
00438 int score;
00439 struct ssEdge *bestEdge = NULL;
00440 struct ssNode *curNode = &graph->nodes[nodeIx];
00441 struct ssEdge *edge;
00442
00443 for (edge = curNode->waysIn; edge != NULL; edge = edge->next)
00444 {
00445 score = edge->score + edge->nodeIn->cumScore;
00446 if (score > bestScore)
00447 {
00448 bestScore = score;
00449 bestEdge = edge;
00450 }
00451 }
00452 curNode->bestWayIn = bestEdge;
00453 curNode->cumScore = bestScore;
00454 if (bestScore >= veryBestScore)
00455 {
00456 veryBestScore = bestScore;
00457 veryBestNode = curNode;
00458 }
00459 }
00460 return veryBestNode;
00461 }
00462
00463 static void ssGraphFindBest(struct ssGraph *graph, struct ffAli **retBestAli,
00464 int *retScore, struct ffAli **retLeftovers)
00465
00466
00467 {
00468 struct ssEdge *edge;
00469 struct ssNode *node;
00470 struct ssNode *startNode = &graph->nodes[0];
00471 struct ffAli *bestAli = NULL, *leftovers = NULL;
00472 struct ffAli *ff, *left = NULL;
00473 int i;
00474 int overlap, crossover, crossDif;
00475
00476
00477 node = ssDynamicProgram(graph);
00478 *retScore = node->cumScore;
00479
00480
00481 while (node != startNode)
00482 {
00483 ff = node->ff;
00484 ff->right = bestAli;
00485 bestAli = ff;
00486 node->ff = NULL;
00487 edge = node->bestWayIn;
00488 if ((overlap = edge->overlap) > 0)
00489 {
00490 left = edge->nodeIn->ff;
00491 crossover = edge->crossover;
00492 crossDif = overlap-crossover;
00493 left->hEnd -= crossDif;
00494 left->nEnd -= crossDif;
00495 ff->hStart += crossover;
00496 ff->nStart += crossover;
00497 }
00498 node = node->bestWayIn->nodeIn;
00499 }
00500
00501
00502 left = NULL;
00503 for (ff = bestAli; ff != NULL; ff = ff->right)
00504 {
00505 ff->left = left;
00506 left = ff;
00507 }
00508
00509
00510 for (i=1, node=graph->nodes+1; i<=graph->nodeCount; ++i,++node)
00511 {
00512 if ((ff = node->ff) != NULL)
00513 {
00514 ff->left = leftovers;
00515 leftovers = ff;
00516 }
00517 }
00518 leftovers = ffMakeRightLinks(leftovers);
00519
00520 *retBestAli = bestAli;
00521 *retLeftovers = leftovers;
00522 }
00523
00524 static void ssFindBestSmall(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00525 enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00526 struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00527
00528
00529 {
00530 struct ssGraph *graph = ssGraphMake(ffList, qSeq, stringency, isProt, t3List);
00531 ssGraphFindBest(graph, retBestAli, retScore, retLeftovers);
00532 *retBestAli = forceMonotonic(*retBestAli, qSeq, tSeq, stringency, isProt, t3List);
00533 ssGraphFree(&graph);
00534 }
00535
00536
00537
00538 static enum ffStringency ssStringency;
00539 static boolean ssIsProt;
00540
00541 static int ssGapCost(int dq, int dt, void *data)
00542
00543
00544 {
00545 int cost;
00546 if (dt < 0) dt = 0;
00547 if (dq < 0) dq = 0;
00548 cost = ffCalcGapPenalty(dt, dq, ssStringency);
00549 return cost;
00550 }
00551
00552
00553 static int findOverlap(struct cBlock *a, struct cBlock *b)
00554
00555 {
00556 int dq = b->qStart - a->qEnd;
00557 int dt = b->tStart - a->tEnd;
00558 return -min(dq, dt);
00559 }
00560
00561 static int ssConnectCost(struct cBlock *a, struct cBlock *b, void *data)
00562
00563
00564 {
00565 struct ffAli *aFf = a->data, *bFf = b->data;
00566 int overlapAdjustment = 0;
00567 int overlap = findOverlap(a, b);
00568 int dq = b->qStart - a->qEnd;
00569 int dt = b->tStart - a->tEnd;
00570
00571 if (overlap > 0)
00572 {
00573 int aSize = aFf->hEnd - aFf->hStart;
00574 int bSize = bFf->hEnd - bFf->hStart;
00575 if (overlap >= bSize || overlap >= aSize)
00576 {
00577
00578
00579
00580 overlapAdjustment = a->score + b->score;
00581 }
00582 else
00583 {
00584
00585 int crossover = findCrossover(aFf, bFf, overlap, ssIsProt);
00586 int remain = overlap - crossover;
00587 overlapAdjustment =
00588 bioScoreMatch(ssIsProt, aFf->nEnd - remain, aFf->hEnd - remain,
00589 remain)
00590 + bioScoreMatch(ssIsProt, bFf->nStart, bFf->hStart,
00591 crossover);
00592 dq -= overlap;
00593 dt -= overlap;
00594 }
00595 }
00596 return overlapAdjustment + ssGapCost(dq, dt, data);
00597 }
00598
00599
00600 static void ssFindBestBig(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00601 enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00602 struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00603
00604
00605 {
00606 struct cBlock *boxList = NULL, *box, *prevBox;
00607 struct ffAli *ff, *farRight = NULL;
00608 struct lm *lm = lmInit(0);
00609 int boxSize;
00610 DNA *firstH = tSeq->dna;
00611 struct chain *chainList, *chain, *bestChain;
00612 int tMin = BIGNUM, tMax = -BIGNUM;
00613
00614
00615
00616 for (ff = ffList; ff != NULL; ff = ff->right)
00617 {
00618 lmAllocVar(lm, box);
00619 boxSize = ff->nEnd - ff->nStart;
00620 box->qStart = ff->nStart - qSeq->dna;
00621 box->qEnd = box->qStart + boxSize;
00622 if (t3List)
00623 {
00624 trans3Offsets(t3List, ff->hStart, ff->hEnd, &box->tStart, &box->tEnd);
00625 }
00626 else
00627 {
00628 box->tStart = ff->hStart - firstH;
00629 box->tEnd = box->tStart + boxSize;
00630 }
00631 box->data = ff;
00632 box->score = bioScoreMatch(isProt, ff->nStart, ff->hStart, boxSize);
00633 if (tMin > box->tStart) tMin = box->tStart;
00634 if (tMax < box->tEnd) tMax = box->tEnd;
00635 slAddHead(&boxList, box);
00636 }
00637
00638
00639 for (box = boxList; box != NULL; box = box->next)
00640 {
00641 box->tStart -= tMin;
00642 box->tEnd -= tMin;
00643 }
00644 tMax -= tMin;
00645
00646 ssStringency = stringency;
00647 ssIsProt = isProt;
00648 chainList = chainBlocks(qSeq->name, qSeq->size, '+', "tSeq", tMax, &boxList,
00649 ssConnectCost, ssGapCost, NULL, NULL);
00650
00651
00652 bestChain = chainList;
00653 prevBox = bestChain->blockList;
00654 for (box = prevBox->next; box != NULL; box = box->next)
00655 {
00656 int overlap = findOverlap(prevBox, box);
00657 if (overlap > 0)
00658 {
00659 struct ffAli *left = prevBox->data;
00660 struct ffAli *right = box->data;
00661 int crossover = findCrossover(left, right, overlap, isProt);
00662 int remain = overlap - crossover;
00663 left->nEnd -= remain;
00664 left->hEnd -= remain;
00665 right->nStart += crossover;
00666 right->hStart += crossover;
00667 }
00668 prevBox = box;
00669 }
00670
00671
00672 farRight = NULL;
00673 for (box = chainList->blockList; box != NULL; box = box->next)
00674 {
00675 ff = box->data;
00676 ff->left = farRight;
00677 farRight = ff;
00678 }
00679 *retBestAli = ffMakeRightLinks(farRight);
00680
00681
00682 farRight = NULL;
00683 for (chain = chainList->next; chain != NULL; chain = chain->next)
00684 {
00685 for (box = chain->blockList; box != NULL; box = box->next)
00686 {
00687 ff = box->data;
00688 ff->left = farRight;
00689 farRight = ff;
00690 }
00691 }
00692 *retLeftovers = ffMakeRightLinks(farRight);
00693
00694 *retScore = bestChain->score;
00695 for (chain = chainList; chain != NULL; chain = chain->next)
00696 chain->blockList = NULL;
00697 chainFreeList(&chainList);
00698 lmCleanup(&lm);
00699 }
00700
00701 static void ssFindBest(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00702 enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00703 struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00704
00705 {
00706 int count = ffAliCount(ffList);
00707 if (count >= 10)
00708 {
00709 ssFindBestBig(ffList, qSeq, tSeq, stringency, isProt, t3List,
00710 retBestAli, retScore, retLeftovers);
00711 }
00712 else
00713 {
00714 ssFindBestSmall(ffList, qSeq, tSeq, stringency, isProt, t3List,
00715 retBestAli, retScore, retLeftovers);
00716 }
00717 }
00718
00719 struct ffAli *cutAtBigIntrons(struct ffAli *ffList, int maxIntron,
00720 int *pScore, enum ffStringency stringency,
00721 struct ffAli **returnLeftovers)
00722
00723
00724 {
00725 struct ffAli *prevFf, *ff, *cutFf = NULL;
00726 prevFf = ffList;
00727 for (ff = prevFf->right; ff != NULL; ff = ff->right)
00728 {
00729 int dt = ff->hStart - prevFf->hEnd;
00730 if (dt > maxIntron)
00731 {
00732 cutFf = prevFf;
00733 break;
00734 }
00735 prevFf = ff;
00736 }
00737 if (cutFf != NULL)
00738 {
00739 ff = cutFf->right;
00740 cutFf->right = NULL;
00741 ff->left = NULL;
00742 ffCat(returnLeftovers, &ff);
00743 *pScore = ffScore(ffList, stringency);
00744 }
00745 return ffList;
00746 }
00747
00748 void ssStitch(struct ssBundle *bundle, enum ffStringency stringency,
00749 int minScore, int maxToReturn)
00750
00751
00752
00753 {
00754 struct dnaSeq *qSeq = bundle->qSeq;
00755 struct dnaSeq *genoSeq = bundle->genoSeq;
00756 struct ffAli *ffList = NULL;
00757 struct ssFfItem *ffl;
00758 struct ffAli *bestPath;
00759 int score;
00760 boolean firstTime = TRUE;
00761
00762 if (bundle->ffList == NULL)
00763 return;
00764
00765
00766
00767 if (minScore > 20)
00768 minScore = 20;
00769
00770
00771 for (ffl = bundle->ffList; ffl != NULL; ffl = ffl->next)
00772 {
00773 ffCat(&ffList, &ffl->ff);
00774 }
00775 slFreeList(&bundle->ffList);
00776
00777 ffAliSort(&ffList, ffCmpHitsNeedleFirst);
00778 ffList = ffMergeClose(ffList, qSeq->dna, genoSeq->dna);
00779
00780 while (ffList != NULL)
00781 {
00782 ssFindBest(ffList, qSeq, genoSeq, stringency,
00783 bundle->isProt, bundle->t3List,
00784 &bestPath, &score, &ffList);
00785
00786 bestPath = ffMergeNeedleAlis(bestPath, TRUE);
00787 bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00788 bestPath = ffMergeHayOverlaps(bestPath);
00789 bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00790 bestPath = forceMonotonic(bestPath, qSeq, genoSeq, stringency,
00791 bundle->isProt, bundle->t3List);
00792
00793 if (firstTime && stringency == ffCdna && bundle->avoidFuzzyFindKludge == FALSE)
00794 {
00795
00796
00797 bestPath = smallMiddleExons(bestPath, bundle, stringency);
00798 }
00799 bestPath = ffMergeNeedleAlis(bestPath, TRUE);
00800 if (ffIntronMax != ffIntronMaxDefault)
00801 {
00802 bestPath = cutAtBigIntrons(bestPath, ffIntronMax, &score, stringency,
00803 &ffList);
00804 }
00805 if (!bundle->isProt)
00806 ffSlideIntrons(bestPath);
00807 bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00808 if (score >= minScore)
00809 {
00810 AllocVar(ffl);
00811 ffl->ff = bestPath;
00812 slAddHead(&bundle->ffList, ffl);
00813 }
00814 else
00815 {
00816 ffFreeAli(&bestPath);
00817 ffFreeAli(&ffList);
00818 break;
00819 }
00820 firstTime = FALSE;
00821 if (--maxToReturn <= 0)
00822 {
00823 ffFreeAli(&ffList);
00824 break;
00825 }
00826 }
00827 slReverse(&bundle->ffList);
00828 return;
00829 }
00830
00831 static struct ssBundle *findBundle(struct ssBundle *list,
00832 struct dnaSeq *genoSeq)
00833
00834
00835 {
00836 struct ssBundle *el;
00837 for (el = list; el != NULL; el = el->next)
00838 if (el->genoSeq == genoSeq)
00839 return el;
00840 return NULL;
00841 }
00842
00843 struct ssBundle *ssFindBundles(struct patSpace *ps, struct dnaSeq *cSeq,
00844 char *cName, enum ffStringency stringency, boolean avoidSelfSelf)
00845
00846 {
00847 struct patClump *clumpList, *clump;
00848 struct ssBundle *bundleList = NULL, *bun = NULL;
00849 DNA *cdna = cSeq->dna;
00850 int totalCdnaSize = cSeq->size;
00851 DNA *endCdna = cdna+totalCdnaSize;
00852 struct ssFfItem *ffl;
00853 struct dnaSeq *lastSeq = NULL;
00854 int maxSize = 700;
00855 int preferredSize = 500;
00856 int overlapSize = 250;
00857
00858 for (;;)
00859 {
00860 int cSize = endCdna - cdna;
00861 if (cSize > maxSize)
00862 cSize = preferredSize;
00863 clumpList = patSpaceFindOne(ps, cdna, cSize);
00864 for (clump = clumpList; clump != NULL; clump = clump->next)
00865 {
00866 struct ffAli *ff;
00867 struct dnaSeq *seq = clump->seq;
00868 DNA *tStart = seq->dna + clump->start;
00869 if (!avoidSelfSelf || !sameString(seq->name, cSeq->name))
00870 {
00871 ff = ffFind(cdna, cdna+cSize, tStart, tStart + clump->size, stringency);
00872 if (ff != NULL)
00873 {
00874 if (lastSeq != seq)
00875 {
00876 lastSeq = seq;
00877 if ((bun = findBundle(bundleList, seq)) == NULL)
00878 {
00879 AllocVar(bun);
00880 bun->qSeq = cSeq;
00881 bun->genoSeq = seq;
00882 bun->genoIx = clump->bacIx;
00883 bun->genoContigIx = clump->seqIx;
00884 slAddHead(&bundleList, bun);
00885 }
00886 }
00887 AllocVar(ffl);
00888 ffl->ff = ff;
00889 slAddHead(&bun->ffList, ffl);
00890 }
00891 }
00892 }
00893 cdna += cSize;
00894 if (cdna >= endCdna)
00895 break;
00896 cdna -= overlapSize;
00897 slFreeList(&clumpList);
00898 }
00899 slReverse(&bundleList);
00900 cdna = cSeq->dna;
00901
00902 for (bun = bundleList; bun != NULL; bun = bun->next)
00903 {
00904 ssStitch(bun, stringency, 20, 16);
00905 }
00906 return bundleList;
00907 }
00908