jkOwnLib/supStitch.c

Go to the documentation of this file.
00001 /* supStitch stitches together a bundle of ffAli alignments that share
00002  * a common query and target sequence into larger alignments if possible.
00003  * This is commonly used when the query sequence was broken up into
00004  * overlapping blocks in the initial alignment, and also to look for
00005  * introns larger than fuzzyFinder can handle. */
00006 /* Copyright 2000-2005 Jim Kent.  All rights reserved. */
00007 
00008 #include "common.h"
00009 #include "dnautil.h"
00010 #include "dlist.h"
00011 #include "fuzzyFind.h"
00012 #include "localmem.h"
00013 #include "patSpace.h"
00014 #include "trans3.h"
00015 #include "supStitch.h"
00016 #include "chainBlock.h"
00017 
00018 static char const rcsid[] = "$Id: supStitch.c,v 1.38 2007/04/20 22:43:37 kent Exp $";
00019 
00020 static void ssFindBestBig(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00021         enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00022         struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers);
00023 
00024 void ssFfItemFree(struct ssFfItem **pEl)
00025 /* Free a single ssFfItem. */
00026 {
00027 struct ssFfItem *el;
00028 if ((el = *pEl) != NULL)
00029     {
00030     ffFreeAli(&el->ff);
00031     freez(pEl);
00032     }
00033 }
00034 
00035 void ssFfItemFreeList(struct ssFfItem **pList)
00036 /* Free a list of ssFfItems. */
00037 {
00038 struct ssFfItem *el, *next;
00039 for (el = *pList; el != NULL; el = next)
00040     {
00041     next = el->next;
00042     ssFfItemFree(&el);
00043     }
00044 *pList = NULL;
00045 }
00046 
00047 void ssBundleFree(struct ssBundle **pEl)
00048 /* Free up one ssBundle */
00049 {
00050 struct ssBundle *el;
00051 if ((el = *pEl) != NULL)
00052     {
00053     ssFfItemFreeList(&el->ffList);
00054     freez(pEl);
00055     }
00056 }
00057 
00058 void ssBundleFreeList(struct ssBundle **pList)
00059 /* Free up list of ssBundles */
00060 {
00061 struct ssBundle *el, *next;
00062 for (el = *pList; el != NULL; el = next)
00063     {
00064     next = el->next;
00065     ssBundleFree(&el);
00066     }
00067 *pList = NULL;
00068 }
00069 
00070 void dumpNearCrossover(char *what, DNA *n, DNA *h, int size)
00071 /* Print sequence near crossover */
00072 {
00073 printf("%s: ", what);
00074 mustWrite(stdout, n, size);
00075 printf("\n");
00076 printf("%s: ", what);
00077 mustWrite(stdout, h, size);
00078 printf("\n");
00079 }
00080 
00081 void dumpFf(struct ffAli *left, DNA *needle, DNA *hay)
00082 /* Print info on ffAli. */
00083 {
00084 struct ffAli *ff;
00085 for (ff = left; ff != NULL; ff = ff->right)
00086     {
00087     printf("(%ld - %ld)[%ld-%ld] ", (long)(ff->hStart-hay), (long)(ff->hEnd-hay),
00088         (long)(ff->nStart - needle), (long)(ff->nEnd - needle));
00089     }
00090 printf("\n");
00091 }
00092 
00093 void dumpBuns(struct ssBundle *bunList)
00094 /* Print diagnostic info on bundles. */
00095 {
00096 struct ssBundle *bun;
00097 struct ssFfItem *ffl;
00098 for (bun = bunList; bun != NULL; bun = bun->next)
00099     {
00100     struct dnaSeq *qSeq = bun->qSeq;        /* Query sequence (not owned by bundle.) */
00101     struct dnaSeq *genoSeq = bun->genoSeq;     /* Genomic sequence (not owned by bundle.) */
00102     printf("Bundle of %d between %s and %s\n", slCount(bun->ffList), qSeq->name, genoSeq->name);
00103     for (ffl = bun->ffList; ffl != NULL; ffl = ffl->next)
00104         {
00105         dumpFf(ffl->ff, bun->qSeq->dna, bun->genoSeq->dna);
00106         }
00107     }
00108 }
00109 
00110 
00111 static int bioScoreMatch(boolean isProt, char *a, char *b, int size)
00112 /* Return score of match (no inserts) between two bio-polymers. */
00113 {
00114 if (isProt)
00115     {
00116     return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X');
00117     }
00118 else
00119     {
00120     return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n');
00121     }
00122 }
00123 
00124 
00125 static int findCrossover(struct ffAli *left, struct ffAli *right, int overlap, boolean isProt)
00126 /* Find ideal crossover point of overlapping blocks.  That is
00127  * the point where we should start using the right block rather
00128  * than the left block.  This point is an offset from the start
00129  * of the overlapping region (which is the same as the start of the
00130  * right block). */
00131 {
00132 int bestPos = 0;
00133 char *nStart = right->nStart;
00134 char *lhStart = left->hEnd - overlap;
00135 char *rhStart = right->hStart;
00136 int i;
00137 int (*scoreMatch)(char a, char b);
00138 int score, bestScore;
00139 
00140 if (isProt)
00141     {
00142     scoreMatch = aaScore2;
00143     score = bestScore = aaScoreMatch(nStart, rhStart, overlap);
00144     }
00145 else
00146     {
00147     scoreMatch = dnaScore2;
00148     score = bestScore = dnaScoreMatch(nStart, rhStart, overlap);
00149     }
00150 
00151 for (i=0; i<overlap; ++i)
00152     {
00153     char n = nStart[i];
00154     score += scoreMatch(lhStart[i], n);
00155     score -= scoreMatch(rhStart[i], n);
00156     if (score > bestScore)
00157         {
00158         bestScore = score;
00159         bestPos = i+1;
00160         }
00161     }
00162 return bestPos;
00163 }
00164 
00165 static void trans3Offsets(struct trans3 *t3List, AA *startP, AA *endP,
00166         int *retStart, int *retEnd)
00167 /* Figure out offset of peptide in context of larger sequences. */
00168 {
00169 struct trans3 *t3;
00170 int frame;
00171 aaSeq *seq;
00172 
00173 for (t3 = t3List; t3 != NULL; t3 = t3->next)
00174     {
00175     for (frame = 0; frame < 3; ++frame)
00176         {
00177         seq = t3->trans[frame];
00178         if (seq->dna <= startP && startP < seq->dna + seq->size)
00179             {
00180             *retStart = 3*(startP - seq->dna)+frame + t3->start;
00181             *retEnd = 3*(endP - seq->dna)+frame + t3->start;
00182             return;
00183             }
00184         }
00185     }
00186 internalErr();
00187 }
00188 
00189 static boolean isMonotonic(struct ffAli *left)
00190 /* Return TRUE if this is monotonic on both query and target */
00191 {
00192 struct ffAli *right;
00193 
00194 while ((right = left->right) != NULL)
00195     {
00196     if (left->nEnd <= right->nStart && left->hEnd <= right->hStart)
00197         left = right;
00198     else
00199         {
00200         verbose(2, "not monotonic dn %d, dh %d\n", (int)(right->nStart - left->nEnd), 
00201                 (int)(right->hStart - right->hEnd));
00202         return FALSE;
00203         }
00204     }
00205 return TRUE;
00206 }
00207 
00208 static struct ffAli *forceMonotonic(struct ffAli *aliList,
00209         struct dnaSeq *qSeq, struct dnaSeq *tSeq, enum ffStringency stringency,
00210         boolean isProt, struct trans3 *t3List)
00211 /* Remove any blocks that violate strictly increasing order in both coordinates. 
00212  * This is not optimal, but it turns out to be very rarely used. */
00213 {
00214 if (!isProt)
00215     {
00216     if (!isMonotonic(aliList))
00217         {
00218         struct ffAli *leftovers = NULL;
00219         int score;
00220         ssFindBestBig(aliList, qSeq, tSeq, stringency, isProt, t3List, &aliList, &score,
00221            &leftovers);
00222         ffFreeAli(&leftovers);
00223         }
00224     }
00225 return aliList;
00226 }
00227 
00228 struct ffAli *smallMiddleExons(struct ffAli *aliList, 
00229         struct ssBundle *bundle, 
00230         enum ffStringency stringency)
00231 /* Look for small exons in the middle. */
00232 {
00233 if (bundle->t3List != NULL)
00234     return aliList;     /* Can't handle intense translated stuff. */
00235 else
00236     {
00237     struct dnaSeq *qSeq =  bundle->qSeq;
00238     struct dnaSeq *genoSeq = bundle->genoSeq;
00239     struct ffAli *right, *left = NULL, *newLeft, *newRight;
00240 
00241     left = aliList;
00242     for (right = aliList->right; right != NULL; right = right->right)
00243         {
00244         if (right->hStart - left->hEnd >= 3 && right->nStart - left->nEnd >= 3)
00245             {
00246             newLeft = ffFind(left->nEnd, right->nStart, left->hEnd, right->hStart, stringency);
00247             if (newLeft != NULL)
00248                 {
00249                 newLeft = forceMonotonic(newLeft, qSeq, genoSeq, 
00250                     stringency, bundle->isProt, bundle->t3List );
00251                 newRight = ffRightmost(newLeft);
00252                 if (left != NULL)
00253                     {
00254                     left->right = newLeft;
00255                     newLeft->left = left;
00256                     }
00257                 else
00258                     {
00259                     aliList = newLeft;
00260                     }
00261                 if (right != NULL)
00262                     {
00263                     right->left = newRight;
00264                     newRight->right = right;
00265                     }
00266                 }
00267             }
00268         left = right;
00269         }
00270     }
00271 return aliList;
00272 }
00273 
00274 struct ssNode
00275 /* Node of superStitch graph. */
00276     {
00277     struct ssEdge *waysIn;      /* Edges leading into this node. */
00278     struct ffAli *ff;           /* Alignment block associated with node. */
00279     struct ssEdge *bestWayIn;   /* Dynamic programming best edge in. */
00280     int cumScore;               /* Dynamic programming score of edge. */
00281     int nodeScore;              /* Score of this node. */
00282     };
00283 
00284 struct ssEdge
00285 /* Edge of superStitch graph. */
00286     {
00287     struct ssEdge *next;         /* Next edge in waysIn list. */
00288     struct ssNode *nodeIn;       /* The node that leads to this edge. */
00289     int score;                   /* Score you get taking this edge. */
00290     int overlap;                 /* Overlap between two nodes. */
00291     int crossover;               /* Offset from overlap where crossover occurs. */
00292     };
00293 
00294 struct ssGraph
00295 /* Super stitcher graph for dynamic programming to find best
00296  * way through. */
00297     {
00298     int nodeCount;              /* Number of nodes. */
00299     struct ssNode *nodes;       /* Array of nodes. */
00300     int edgeCount;              /* Number of edges. */
00301     struct ssEdge *edges;       /* Array of edges. */
00302     };
00303 
00304 static void ssGraphFree(struct ssGraph **pGraph)
00305 /* Free graph. */
00306 {
00307 struct ssGraph *graph;
00308 if ((graph = *pGraph) != NULL)
00309     {
00310     freeMem(graph->nodes);
00311     freeMem(graph->edges);
00312     freez(pGraph);
00313     }
00314 }
00315 
00316 boolean tripleCanFollow(struct ffAli *a, struct ffAli *b, aaSeq *qSeq, struct trans3 *t3List)
00317 /* Figure out if a can follow b in any one of three reading frames of haystack. */
00318 {
00319 int ahStart, ahEnd, bhStart, bhEnd;
00320 trans3Offsets(t3List, a->hStart, a->hEnd, &ahStart, &ahEnd);
00321 trans3Offsets(t3List, b->hStart, b->hEnd, &bhStart, &bhEnd);
00322 return  (a->nStart < b->nStart && a->nEnd < b->nEnd && ahStart < bhStart && ahEnd < bhEnd);
00323 }
00324 
00325 
00326 static struct ssGraph *ssGraphMake(struct ffAli *ffList, bioSeq *qSeq,
00327         enum ffStringency stringency, boolean isProt, struct trans3 *t3List)
00328 /* Make a graph corresponding to ffList */
00329 {
00330 int nodeCount = ffAliCount(ffList);
00331 int maxEdgeCount = (nodeCount+1)*(nodeCount)/2;
00332 int edgeCount = 0;
00333 struct ssEdge *edges, *e;
00334 struct ssNode *nodes;
00335 struct ssGraph *graph;
00336 struct ffAli *ff, *mid;
00337 int i, midIx;
00338 int overlap;
00339 boolean canFollow;
00340 
00341 if (nodeCount == 1)
00342     maxEdgeCount = 1;
00343     
00344 AllocVar(graph);
00345 graph->nodeCount = nodeCount;
00346 graph->nodes = AllocArray(nodes, nodeCount+1);
00347 for (i=1, ff = ffList; i<=nodeCount; ++i, ff = ff->right)
00348     {
00349     nodes[i].ff = ff;
00350     nodes[i].nodeScore = bioScoreMatch(isProt, ff->nStart, ff->hStart, ff->hEnd - ff->hStart);
00351     }
00352 
00353 graph->edges = AllocArray(edges, maxEdgeCount);
00354 for (mid = ffList, midIx=1; mid != NULL; mid = mid->right, ++midIx)
00355     {
00356     int midScore;
00357     struct ssNode *midNode = &nodes[midIx];
00358     e = &edges[edgeCount++];
00359     assert(edgeCount <= maxEdgeCount);
00360     e->nodeIn = &nodes[0];
00361     e->score = midScore = midNode->nodeScore;
00362     midNode->waysIn = e;
00363     for (ff = ffList,i=1; ff != mid; ff = ff->right,++i)
00364         {
00365         int mhStart = 0, mhEnd = 0;
00366         if (t3List)
00367             {
00368             canFollow = tripleCanFollow(ff, mid, qSeq, t3List);
00369             trans3Offsets(t3List, mid->hStart, mid->hEnd, &mhStart, &mhEnd);
00370             }
00371         else 
00372             {
00373             canFollow = (ff->nStart < mid->nStart && ff->nEnd < mid->nEnd 
00374                         && ff->hStart < mid->hStart && ff->hEnd < mid->hEnd);
00375             }
00376         if (canFollow)
00377             {
00378             struct ssNode *ffNode = &nodes[i];
00379             int score;
00380             int hGap;
00381             int nGap;
00382             int crossover;
00383 
00384             nGap = mid->nStart - ff->nEnd;
00385             if (t3List)
00386                 {
00387                 int fhStart, fhEnd;
00388                 trans3Offsets(t3List, ff->hStart, ff->hEnd, &fhStart, &fhEnd);
00389                 hGap = mhStart - fhEnd;
00390                 }
00391             else
00392                 {
00393                 hGap = mid->hStart - ff->hEnd;
00394                 }
00395             e = &edges[edgeCount++];
00396             assert(edgeCount <= maxEdgeCount);
00397             e->nodeIn = ffNode;
00398             e->overlap = overlap = -nGap;
00399             if (overlap > 0)
00400                 {
00401                 int midSize = mid->hEnd - mid->hStart;
00402                 int ffSize = ff->hEnd - ff->hStart;
00403                 int newMidScore, newFfScore;
00404                 e->crossover = crossover = findCrossover(ff, mid, overlap, isProt);
00405                 newMidScore = bioScoreMatch(isProt, mid->nStart, mid->hStart, midSize-overlap+crossover);
00406                 newFfScore = bioScoreMatch(isProt, ff->nStart+crossover, ff->hStart+crossover,
00407                                 ffSize-crossover);
00408                 score = newMidScore - ffNode->nodeScore + newFfScore;
00409                 nGap = 0;
00410                 hGap -= overlap;
00411                 }
00412             else
00413                 {
00414                 score = midScore;
00415                 }
00416             score -= ffCalcGapPenalty(hGap, nGap, stringency);
00417             e->score = score;
00418             slAddHead(&midNode->waysIn, e);
00419             }
00420         }
00421     slReverse(&midNode->waysIn);
00422     }
00423 return graph;
00424 }
00425 
00426 static struct ssNode *ssDynamicProgram(struct ssGraph *graph)
00427 /* Do dynamic programming to find optimal path though
00428  * graph.  Return best ending node (with back-trace
00429  * pointers and score set). */
00430 {
00431 int veryBestScore = -0x3fffffff;
00432 struct ssNode *veryBestNode = NULL;
00433 int nodeIx;
00434 
00435 for (nodeIx = 1; nodeIx <= graph->nodeCount; ++nodeIx)
00436     {
00437     int bestScore = -0x3fffffff;
00438     int score;
00439     struct ssEdge *bestEdge = NULL;
00440     struct ssNode *curNode = &graph->nodes[nodeIx];
00441     struct ssEdge *edge;
00442 
00443     for (edge = curNode->waysIn; edge != NULL; edge = edge->next)
00444         {
00445         score = edge->score + edge->nodeIn->cumScore;
00446         if (score > bestScore)
00447             {
00448             bestScore = score;
00449             bestEdge = edge;
00450             }
00451         }
00452     curNode->bestWayIn = bestEdge;
00453     curNode->cumScore = bestScore;
00454     if (bestScore >= veryBestScore)
00455         {
00456         veryBestScore = bestScore;
00457         veryBestNode = curNode;
00458         }
00459     }
00460 return veryBestNode;
00461 }
00462 
00463 static void ssGraphFindBest(struct ssGraph *graph, struct ffAli **retBestAli, 
00464    int *retScore, struct ffAli **retLeftovers)
00465 /* Traverse graph and put best alignment in retBestAli.  Return score.
00466  * Put blocks not part of best alignment in retLeftovers. */
00467 {
00468 struct ssEdge *edge;
00469 struct ssNode *node;
00470 struct ssNode *startNode = &graph->nodes[0];
00471 struct ffAli *bestAli = NULL, *leftovers = NULL;
00472 struct ffAli *ff, *left = NULL;
00473 int i;
00474 int overlap, crossover, crossDif;
00475 
00476 /* Find best path and save score. */
00477 node = ssDynamicProgram(graph);
00478 *retScore = node->cumScore;
00479 
00480 /* Trace back and trim overlaps. */
00481 while (node != startNode)
00482     {
00483     ff = node->ff;
00484     ff->right = bestAli;
00485     bestAli = ff;
00486     node->ff = NULL;
00487     edge = node->bestWayIn;
00488     if ((overlap = edge->overlap) > 0)
00489         {
00490         left = edge->nodeIn->ff;
00491         crossover = edge->crossover;
00492         crossDif = overlap-crossover;
00493         left->hEnd -= crossDif;
00494         left->nEnd -= crossDif;
00495         ff->hStart += crossover;
00496         ff->nStart += crossover;
00497         }
00498     node = node->bestWayIn->nodeIn;
00499     }
00500 
00501 /* Make left links. */
00502 left = NULL;
00503 for (ff = bestAli; ff != NULL; ff = ff->right)
00504     {
00505     ff->left = left;
00506     left = ff;
00507     }
00508 
00509 /* Make leftover list. */
00510 for (i=1, node=graph->nodes+1; i<=graph->nodeCount; ++i,++node)
00511     {
00512     if ((ff = node->ff) != NULL)
00513         {
00514         ff->left = leftovers;
00515         leftovers = ff;
00516         }
00517     }
00518 leftovers = ffMakeRightLinks(leftovers);
00519 
00520 *retBestAli = bestAli;
00521 *retLeftovers = leftovers;
00522 }
00523 
00524 static void ssFindBestSmall(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00525         enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00526         struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00527 /* Build a graph and do a dynamic program on it to find best chains. 
00528  * For small ffLists this is faster than the stuff in chainBlock. */
00529 {
00530 struct ssGraph *graph = ssGraphMake(ffList, qSeq, stringency, isProt, t3List);
00531 ssGraphFindBest(graph, retBestAli, retScore, retLeftovers);
00532 *retBestAli = forceMonotonic(*retBestAli, qSeq, tSeq, stringency, isProt, t3List);
00533 ssGraphFree(&graph);
00534 }
00535 
00536 
00537 
00538 static enum ffStringency ssStringency;
00539 static boolean ssIsProt;
00540 
00541 static int ssGapCost(int dq, int dt, void *data)
00542 /* Return gap penalty.  This just need be a lower bound on 
00543  * the penalty actually. */
00544 {
00545 int cost;
00546 if (dt < 0) dt = 0;
00547 if (dq < 0) dq = 0;
00548 cost = ffCalcGapPenalty(dt, dq, ssStringency);
00549 return cost;
00550 }
00551 
00552 
00553 static int findOverlap(struct cBlock *a, struct cBlock *b)
00554 /* Figure out how much a and b overlap on either sequence. */
00555 {
00556 int dq = b->qStart - a->qEnd;
00557 int dt = b->tStart - a->tEnd;
00558 return -min(dq, dt);
00559 }
00560 
00561 static int ssConnectCost(struct cBlock *a, struct cBlock *b, void *data)
00562 /* Calculate connection cost - including gap score
00563  * and overlap adjustments if any. */
00564 {
00565 struct ffAli *aFf = a->data, *bFf = b->data;
00566 int overlapAdjustment = 0;
00567 int overlap = findOverlap(a, b);
00568 int dq = b->qStart - a->qEnd;
00569 int dt = b->tStart - a->tEnd;
00570 
00571 if (overlap > 0)
00572     {
00573     int aSize = aFf->hEnd - aFf->hStart;
00574     int bSize = bFf->hEnd - bFf->hStart;
00575     if (overlap >= bSize || overlap >= aSize)
00576         {
00577         /* Give stiff overlap adjustment for case where
00578          * one block completely enclosed in the other on
00579          * either sequence. This will make it never happen. */
00580         overlapAdjustment = a->score + b->score;
00581         }
00582     else
00583         {
00584         /* More normal case - partial overlap on one or both strands. */
00585         int crossover = findCrossover(aFf, bFf, overlap, ssIsProt);
00586         int remain = overlap - crossover;
00587         overlapAdjustment =
00588             bioScoreMatch(ssIsProt, aFf->nEnd - remain, aFf->hEnd - remain, 
00589                 remain)
00590           + bioScoreMatch(ssIsProt, bFf->nStart, bFf->hStart, 
00591                 crossover);
00592         dq -= overlap;
00593         dt -= overlap;
00594         }
00595     }
00596 return overlapAdjustment + ssGapCost(dq, dt, data);
00597 }
00598 
00599 
00600 static void ssFindBestBig(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00601         enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00602         struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00603 /* Go set up things to call chainBlocks to find best way to string together
00604  * blocks in alignment. */
00605 {
00606 struct cBlock *boxList = NULL, *box, *prevBox;
00607 struct ffAli *ff, *farRight = NULL;
00608 struct lm *lm = lmInit(0);
00609 int boxSize;
00610 DNA *firstH = tSeq->dna;
00611 struct chain *chainList, *chain, *bestChain;
00612 int tMin = BIGNUM, tMax = -BIGNUM;
00613 
00614 
00615 /* Make up box list for chainer. */
00616 for (ff = ffList; ff != NULL; ff = ff->right)
00617     {
00618     lmAllocVar(lm, box);
00619     boxSize = ff->nEnd - ff->nStart;
00620     box->qStart = ff->nStart - qSeq->dna;
00621     box->qEnd = box->qStart + boxSize;
00622     if (t3List)
00623         {
00624         trans3Offsets(t3List, ff->hStart, ff->hEnd, &box->tStart, &box->tEnd);
00625         }
00626     else
00627         {
00628         box->tStart = ff->hStart - firstH;
00629         box->tEnd = box->tStart + boxSize;
00630         }
00631     box->data = ff;
00632     box->score = bioScoreMatch(isProt, ff->nStart, ff->hStart, boxSize);
00633     if (tMin > box->tStart) tMin = box->tStart;
00634     if (tMax < box->tEnd) tMax = box->tEnd;
00635     slAddHead(&boxList, box);
00636     }
00637 
00638 /* Adjust boxes so that tMin is always 0. */
00639 for (box = boxList; box != NULL; box = box->next)
00640     {
00641     box->tStart -= tMin;
00642     box->tEnd -= tMin;
00643     }
00644 tMax -= tMin;
00645 
00646 ssStringency = stringency;
00647 ssIsProt = isProt;
00648 chainList = chainBlocks(qSeq->name, qSeq->size, '+', "tSeq", tMax, &boxList,
00649         ssConnectCost, ssGapCost, NULL, NULL);
00650 
00651 /* Fixup crossovers on best (first) chain. */
00652 bestChain = chainList;
00653 prevBox = bestChain->blockList;
00654 for (box = prevBox->next; box != NULL; box = box->next)
00655     {
00656     int overlap = findOverlap(prevBox, box);
00657     if (overlap > 0)
00658         {
00659         struct ffAli *left = prevBox->data;
00660         struct ffAli *right = box->data;
00661         int crossover = findCrossover(left, right, overlap, isProt);
00662         int remain = overlap - crossover;
00663         left->nEnd -= remain;
00664         left->hEnd -= remain;
00665         right->nStart += crossover;
00666         right->hStart += crossover;
00667         }
00668     prevBox = box;
00669     }
00670 
00671 /* Copy stuff from first chain to bestAli. */
00672 farRight = NULL;
00673 for (box = chainList->blockList; box != NULL; box = box->next)
00674     {
00675     ff = box->data;
00676     ff->left = farRight;
00677     farRight = ff;
00678     }
00679 *retBestAli = ffMakeRightLinks(farRight);
00680 
00681 /* Copy stuff from other chains to leftovers. */
00682 farRight = NULL;
00683 for (chain = chainList->next; chain != NULL; chain = chain->next)
00684     {
00685     for (box = chain->blockList; box != NULL; box = box->next)
00686         {
00687         ff = box->data;
00688         ff->left = farRight;
00689         farRight = ff;
00690         }
00691     }
00692 *retLeftovers = ffMakeRightLinks(farRight);
00693 
00694 *retScore = bestChain->score;
00695 for (chain = chainList; chain != NULL; chain = chain->next)
00696     chain->blockList = NULL;    /* Don't want to free this, it's local. */
00697 chainFreeList(&chainList);
00698 lmCleanup(&lm);
00699 }
00700 
00701 static void ssFindBest(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
00702         enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
00703         struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
00704 /* String together blocks in alignment into chains. */
00705 {
00706 int count = ffAliCount(ffList);
00707 if (count >= 10)
00708     {
00709     ssFindBestBig(ffList, qSeq, tSeq, stringency, isProt, t3List,
00710         retBestAli, retScore, retLeftovers);
00711     }
00712 else
00713     {
00714     ssFindBestSmall(ffList, qSeq, tSeq, stringency, isProt, t3List,
00715         retBestAli, retScore, retLeftovers);
00716     }
00717 }
00718 
00719 struct ffAli *cutAtBigIntrons(struct ffAli *ffList, int maxIntron, 
00720         int *pScore, enum ffStringency stringency,
00721         struct ffAli **returnLeftovers)
00722 /* Return ffList up to the first intron that's too big.
00723  * Put the rest of the blocks back onto the leftovers list. */
00724 {
00725 struct ffAli *prevFf, *ff, *cutFf = NULL;
00726 prevFf = ffList;
00727 for (ff = prevFf->right; ff != NULL; ff = ff->right)
00728     {
00729     int dt = ff->hStart - prevFf->hEnd;
00730     if (dt > maxIntron)
00731         {
00732         cutFf = prevFf;
00733         break;
00734         }
00735     prevFf = ff;
00736     }
00737 if (cutFf != NULL)
00738     {
00739     ff = cutFf->right;
00740     cutFf->right = NULL;
00741     ff->left = NULL;
00742     ffCat(returnLeftovers, &ff);
00743     *pScore = ffScore(ffList, stringency);
00744     }
00745 return ffList;
00746 }
00747 
00748 void ssStitch(struct ssBundle *bundle, enum ffStringency stringency, 
00749         int minScore, int maxToReturn)
00750 /* Glue together mrnas in bundle as much as possible. Returns number of
00751  * alignments after stitching. Updates bundle->ffList with stitched
00752  * together version. */
00753 {
00754 struct dnaSeq *qSeq =  bundle->qSeq;
00755 struct dnaSeq *genoSeq = bundle->genoSeq;
00756 struct ffAli *ffList = NULL;
00757 struct ssFfItem *ffl;
00758 struct ffAli *bestPath;
00759 int score;
00760 boolean firstTime = TRUE;
00761 
00762 if (bundle->ffList == NULL)
00763     return;
00764 
00765 /* The score may improve when we stitch together more alignments,
00766  * so don't let minScore be too harsh at this stage. */
00767 if (minScore > 20)
00768     minScore = 20;
00769 
00770 /* Create ffAlis for all in bundle and move to one big list. */
00771 for (ffl = bundle->ffList; ffl != NULL; ffl = ffl->next)
00772     {
00773     ffCat(&ffList, &ffl->ff);
00774     }
00775 slFreeList(&bundle->ffList);
00776 
00777 ffAliSort(&ffList, ffCmpHitsNeedleFirst);
00778 ffList = ffMergeClose(ffList, qSeq->dna, genoSeq->dna);
00779 
00780 while (ffList != NULL)
00781     {
00782     ssFindBest(ffList, qSeq, genoSeq, stringency, 
00783         bundle->isProt, bundle->t3List,
00784         &bestPath, &score, &ffList);
00785 
00786     bestPath = ffMergeNeedleAlis(bestPath, TRUE);
00787     bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00788     bestPath = ffMergeHayOverlaps(bestPath);
00789     bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00790     bestPath = forceMonotonic(bestPath, qSeq, genoSeq, stringency,
00791         bundle->isProt, bundle->t3List);
00792 
00793     if (firstTime && stringency == ffCdna && bundle->avoidFuzzyFindKludge == FALSE)
00794         {
00795         /* Only look for middle exons the first time.  Next times
00796          * this might regenerate most of the first alignment... */
00797         bestPath = smallMiddleExons(bestPath, bundle, stringency);
00798         }
00799     bestPath = ffMergeNeedleAlis(bestPath, TRUE);
00800     if (ffIntronMax != ffIntronMaxDefault)
00801         {
00802         bestPath = cutAtBigIntrons(bestPath, ffIntronMax, &score, stringency,
00803                 &ffList);
00804         }
00805     if (!bundle->isProt)
00806         ffSlideIntrons(bestPath);
00807     bestPath = ffRemoveEmptyAlis(bestPath, TRUE);
00808     if (score >= minScore)
00809         {
00810         AllocVar(ffl);
00811         ffl->ff = bestPath;
00812         slAddHead(&bundle->ffList, ffl);
00813         }
00814     else
00815         {
00816         ffFreeAli(&bestPath);
00817         ffFreeAli(&ffList);
00818         break;
00819         }
00820     firstTime = FALSE;
00821     if (--maxToReturn <= 0)
00822         {
00823         ffFreeAli(&ffList);
00824         break;
00825         }
00826     }
00827 slReverse(&bundle->ffList);
00828 return;
00829 }
00830 
00831 static struct ssBundle *findBundle(struct ssBundle *list,  
00832         struct dnaSeq *genoSeq)
00833 /* Find bundle in list that has matching genoSeq pointer, or NULL
00834  * if none such.  This routine is used by psLayout but not blat. */
00835 {
00836 struct ssBundle *el;
00837 for (el = list; el != NULL; el = el->next)
00838     if (el->genoSeq == genoSeq)
00839         return el;
00840 return NULL;
00841 }
00842 
00843 struct ssBundle *ssFindBundles(struct patSpace *ps, struct dnaSeq *cSeq, 
00844         char *cName, enum ffStringency stringency, boolean avoidSelfSelf)
00845 /* Find patSpace alignments.  This routine is used by psLayout but not blat. */
00846 {
00847 struct patClump *clumpList, *clump;
00848 struct ssBundle *bundleList = NULL, *bun = NULL;
00849 DNA *cdna = cSeq->dna;
00850 int totalCdnaSize = cSeq->size;
00851 DNA *endCdna = cdna+totalCdnaSize;
00852 struct ssFfItem *ffl;
00853 struct dnaSeq *lastSeq = NULL;
00854 int maxSize = 700;
00855 int preferredSize = 500;
00856 int overlapSize = 250;
00857 
00858 for (;;)
00859     {
00860     int cSize = endCdna - cdna;
00861     if (cSize > maxSize)
00862         cSize = preferredSize;
00863     clumpList = patSpaceFindOne(ps, cdna, cSize);
00864     for (clump = clumpList; clump != NULL; clump = clump->next)
00865         {
00866         struct ffAli *ff;
00867         struct dnaSeq *seq = clump->seq;
00868         DNA *tStart = seq->dna + clump->start;
00869         if (!avoidSelfSelf || !sameString(seq->name, cSeq->name))
00870             {
00871             ff = ffFind(cdna, cdna+cSize, tStart, tStart + clump->size, stringency);
00872             if (ff != NULL)
00873                 {
00874                 if (lastSeq != seq)
00875                     {
00876                     lastSeq = seq;
00877                     if ((bun = findBundle(bundleList, seq)) == NULL)
00878                         {
00879                         AllocVar(bun);
00880                         bun->qSeq = cSeq;
00881                         bun->genoSeq = seq;
00882                         bun->genoIx = clump->bacIx;
00883                         bun->genoContigIx = clump->seqIx;
00884                         slAddHead(&bundleList, bun);
00885                         }
00886                     }
00887                 AllocVar(ffl);
00888                 ffl->ff = ff;
00889                 slAddHead(&bun->ffList, ffl);
00890                 }
00891             }
00892         }
00893     cdna += cSize;
00894     if (cdna >= endCdna)
00895         break;
00896     cdna -= overlapSize;
00897     slFreeList(&clumpList);
00898     }
00899 slReverse(&bundleList);
00900 cdna = cSeq->dna;
00901 
00902 for (bun = bundleList; bun != NULL; bun = bun->next)
00903     {
00904     ssStitch(bun, stringency, 20, 16);
00905     }
00906 return bundleList;
00907 }
00908 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2