lib/chainBlock.c

Go to the documentation of this file.
00001 /* chainBlock - Chain together scored blocks from an alignment
00002  * into scored chains.  Internally this uses a kd-tree and a
00003  * varient of an algorithm suggested by Webb Miller and further
00004  * developed by Jim Kent. */
00005 
00006 #include "common.h"
00007 #include "localmem.h"
00008 #include "linefile.h"
00009 #include "dlist.h"
00010 #include "chainBlock.h"
00011 
00012 static char const rcsid[] = "$Id: chainBlock.c,v 1.17 2005/04/10 14:41:21 markd Exp $";
00013 
00014 struct kdBranch
00015 /* A kd-tree. That is a binary tree which partitions the children
00016  * into higher and lower one dimension at a time.  We're just doing
00017  * one in two dimensions, so it alternates between q and t dimensions. */
00018     {
00019     struct kdBranch *lo;      /* Pointer to children with lower coordinates. */
00020     struct kdBranch *hi;      /* Pointer to children with higher coordinates. */
00021     struct kdLeaf *leaf;      /* Extra info for leaves on tree. */
00022     int cutCoord;             /* Coordinate (in some dimension) to cut on */
00023     double maxScore;          /* Max score of any leaf below us. */
00024     int maxQ;                 /* Maximum qEnd of any leaf below us. */
00025     int maxT;                 /* Maximum tEnd of any leaf below us. */
00026     };
00027 
00028 struct kdLeaf
00029 /* A leaf in our kdTree. */
00030     {
00031     struct kdLeaf *next;        /* Next in list. */
00032     struct cBlock *cb;          /* Start position and score from user. */
00033     struct kdBranch *bestPred;  /* Best predecessor. */
00034     double totalScore;          /* Total score of chain up to here. */
00035     bool hit;                   /* This hit? Used by system internally. */
00036     };
00037 
00038 struct kdTree
00039 /* The whole tree.  */
00040     {
00041     struct kdBranch *root;      /* Pointer to root of kd-tree. */
00042     };
00043 
00044 
00045 static int kdLeafCmpQ(const void *va, const void *vb)
00046 /* Compare to sort based on query start. */
00047 {
00048 const struct kdLeaf *a = *((struct kdLeaf **)va);
00049 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00050 return a->cb->qStart - b->cb->qStart;
00051 }
00052 
00053 static int kdLeafCmpT(const void *va, const void *vb)
00054 /* Compare to sort based on target start. */
00055 {
00056 const struct kdLeaf *a = *((struct kdLeaf **)va);
00057 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00058 return a->cb->tStart - b->cb->tStart;
00059 }
00060 
00061 static int kdLeafCmpTotal(const void *va, const void *vb)
00062 /* Compare to sort based on total score. */
00063 {
00064 const struct kdLeaf *a = *((struct kdLeaf **)va);
00065 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00066 double diff = b->totalScore - a->totalScore;
00067 if (diff < 0) return -1;
00068 else if (diff > 0) return 1;
00069 else return 0;
00070 }
00071 
00072 static int medianVal(struct dlList *list, int medianIx, int dim)
00073 /* Return value of median block in list on given dimension 
00074  * Mark blocks up to median as hit. */
00075 {
00076 struct dlNode *node = list->head;
00077 struct kdLeaf *leaf = NULL;
00078 int i;
00079 
00080 for (i=0; i<medianIx; ++i)
00081     {
00082     leaf = node->val;
00083     leaf->hit = TRUE;  
00084     node = node->next;
00085     }
00086 return (dim == 0 ? leaf->cb->qStart : leaf->cb->tStart);
00087 }
00088 
00089 static int splitList(struct dlList *oldList, struct dlList *newList)
00090 /* Peel off members of oldList that are not hit onto new list. */
00091 {
00092 struct dlNode *node, *next;
00093 struct kdLeaf *leaf;
00094 int newCount = 0;
00095 dlListInit(newList);
00096 for (node = oldList->head; !dlEnd(node); node = next)
00097     {
00098     next = node->next;
00099     leaf = node->val;
00100     if (!leaf->hit)
00101         {
00102         dlRemove(node);
00103         dlAddTail(newList, node);
00104         ++newCount;
00105         }
00106     }
00107 return newCount;
00108 }
00109 
00110 static void clearHits(struct dlList *list)
00111 /* Clear hit flags of all blocks on list. */
00112 {
00113 struct dlNode *node;
00114 for (node = list->head; !dlEnd(node); node = node->next)
00115     {
00116     struct kdLeaf *leaf = node->val;
00117     leaf->hit = FALSE;
00118     }
00119 }
00120 
00121 
00122 static struct kdBranch *kdBuild(int nodeCount, struct dlList *lists[2], int dim,
00123         struct lm *lm)
00124 /* Build up kd-tree recursively. */
00125 {
00126 struct kdBranch *branch;
00127 lmAllocVar(lm, branch);
00128 if (nodeCount == 1)
00129     {
00130     struct kdLeaf *leaf = lists[0]->head->val;
00131     branch->leaf = leaf;
00132     branch->maxQ = leaf->cb->qEnd;
00133     branch->maxT = leaf->cb->tEnd;
00134     }
00135 else
00136     {
00137     int newCount;
00138     struct dlList *newLists[2];
00139     struct dlList newQ, newT;
00140     int nextDim = 1-dim;
00141 
00142     /* Subdivide lists along median.  */
00143     newLists[0] = &newQ;
00144     newLists[1] = &newT;
00145     clearHits(lists[0]);
00146     branch->cutCoord = medianVal(lists[dim], nodeCount/2, dim);
00147     newCount = splitList(lists[0], newLists[0]);
00148     splitList(lists[1], newLists[1]);
00149 
00150     /* Recurse on each side. */
00151     branch->lo = kdBuild(nodeCount - newCount, lists, nextDim, lm);
00152     branch->hi = kdBuild(newCount, newLists, nextDim, lm);
00153     branch->maxQ = max(branch->lo->maxQ, branch->hi->maxQ);
00154     branch->maxT = max(branch->lo->maxT, branch->hi->maxT);
00155     }
00156 return branch;
00157 }
00158 
00159 static struct kdTree *kdTreeMake(struct kdLeaf *leafList, struct lm *lm)
00160 /* Make a kd-tree containing leafList. */
00161 {
00162 struct kdLeaf *leaf;
00163 int nodeCount = slCount(leafList);
00164 struct kdTree *tree;
00165 struct dlList qList, tList,*lists[2];
00166 struct dlNode *qNodes, *tNodes;
00167 int i;
00168 
00169 /* Build lists sorted in each dimension. This
00170  * will let us quickly find medians while constructing
00171  * the kd-tree. */
00172 dlListInit(&qList);
00173 dlListInit(&tList);
00174 AllocArray(qNodes, nodeCount);
00175 AllocArray(tNodes, nodeCount);
00176 for (i=0 , leaf=leafList; leaf != NULL; leaf = leaf->next, ++i)
00177     {
00178     qNodes[i].val = tNodes[i].val = leaf;
00179     dlAddTail(&qList, &qNodes[i]);
00180     dlAddTail(&tList, &tNodes[i]);
00181     }
00182 /* Just sort qList since tList is sorted because it was
00183  * constructed from sorted leafList. */
00184 dlSort(&qList, kdLeafCmpQ); 
00185 lists[0] = &qList;
00186 lists[1] = &tList;
00187 
00188 /* Allocate master data structure and call recursive builder. */
00189 lmAllocVar(lm, tree);
00190 tree->root = kdBuild(nodeCount, lists, 0, lm);
00191 
00192 /* Clean up and go home. */
00193 freeMem(qNodes);
00194 freeMem(tNodes);
00195 return tree;
00196 }
00197 
00198 struct predScore
00199 /* Predecessor and score we get merging with it. */
00200     {
00201     struct kdBranch *pred;      /* Predecessor. */
00202     double score;               /* Score of us plus predecessor. */
00203     };
00204 
00205 static struct predScore bestPredecessor(
00206         struct kdLeaf *lonely,      /* We're finding this leaf's predecessor */
00207         ConnectCost connectCost,    /* Cost to connect two leafs. */
00208         GapCost gapCost,            /* Lower bound on gap cost. */
00209         void *gapData,              /* Data to pass to Gap/Connect cost */
00210         int dim,                    /* Dimension level of tree splits on. */
00211         struct kdBranch *branch,    /* Subtree to explore */
00212         struct predScore bestSoFar) /* Best predecessor so far. */
00213 /* Find the highest scoring predecessor to this leaf, and
00214  * thus iteratively the highest scoring subchain that ends
00215  * in this leaf. */
00216 {
00217 struct kdLeaf *leaf;
00218 double maxScore = branch->maxScore + lonely->cb->score;
00219 
00220 /* If best score in this branch of tree wouldn't be enough
00221  * don't bother exploring it. First try without calculating
00222  * gap score in case gap score is a little expensive to calculate. */
00223 if (maxScore < bestSoFar.score)
00224     return bestSoFar;
00225 maxScore -= gapCost(lonely->cb->qStart - branch->maxQ, 
00226         lonely->cb->tStart - branch->maxT, gapData);
00227 if (maxScore < bestSoFar.score)
00228     return bestSoFar;
00229 
00230 
00231 /* If it's a terminal branch, then calculate score to connect
00232  * with it. */
00233 else if ((leaf = branch->leaf) != NULL)
00234     {
00235     if (leaf->cb->qStart < lonely->cb->qStart 
00236      && leaf->cb->tStart < lonely->cb->tStart)
00237         {
00238         double score = leaf->totalScore + lonely->cb->score - 
00239                 connectCost(leaf->cb, lonely->cb, gapData);
00240         if (score > bestSoFar.score)
00241            {
00242            bestSoFar.score = score;
00243            bestSoFar.pred = branch;
00244            }
00245         }
00246     return bestSoFar;
00247     }
00248 
00249 /* Otherwise explore sub-trees that could harbor predecessors. */
00250 else
00251     {
00252     int newDim = 1-dim;
00253     int dimCoord = (dim == 0 ? lonely->cb->qStart : lonely->cb->tStart);
00254     
00255     /* Explore hi branch first as it is more likely to have high
00256      * scores.  However only explore it if it can have things starting
00257      * before us. */
00258     if (dimCoord > branch->cutCoord)
00259          bestSoFar = bestPredecessor(lonely, connectCost, gapCost, gapData,
00260                 newDim, branch->hi, bestSoFar);
00261     bestSoFar = bestPredecessor(lonely, connectCost, gapCost, gapData,
00262         newDim, branch->lo, bestSoFar);
00263     return bestSoFar;
00264     }
00265 }
00266 
00267 static void updateScoresOnWay(struct kdBranch *branch, 
00268         int dim, struct kdLeaf *leaf)
00269 /* Traverse kd-tree to find leaf.  Update all maxScores on the way
00270  * to reflect leaf->totalScore. */
00271 {
00272 int newDim = 1-dim;
00273 int dimCoord = (dim == 0 ? leaf->cb->qStart : leaf->cb->tStart);
00274 if (branch->maxScore < leaf->totalScore) branch->maxScore = leaf->totalScore;
00275 if (branch->leaf == NULL)
00276     {
00277     if (dimCoord <= branch->cutCoord)
00278         updateScoresOnWay(branch->lo, newDim, leaf);
00279     if (dimCoord >= branch->cutCoord)
00280         updateScoresOnWay(branch->hi, newDim, leaf);
00281     }
00282 }
00283 
00284 static void findBestPredecessors(struct kdTree *tree, struct kdLeaf *leafList, 
00285         ConnectCost connectCost, GapCost gapCost, void *gapData)
00286 /* Find best predecessor for each leaf. */
00287 {
00288 static struct predScore noBest;
00289 struct kdLeaf *leaf;
00290 struct kdLeaf *bestLeaf = NULL;
00291 double bestScore = 0;
00292 
00293 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00294     {
00295     struct predScore best;
00296     best = bestPredecessor(leaf, connectCost, gapCost, gapData, 0, tree->root, noBest);
00297     if (best.score > leaf->totalScore)
00298         {
00299         leaf->totalScore = best.score;
00300         leaf->bestPred = best.pred;
00301         }
00302     updateScoresOnWay(tree->root, 0, leaf);
00303     if (bestScore < leaf->totalScore)
00304         {
00305         bestScore = leaf->totalScore;
00306         bestLeaf = leaf;
00307         }
00308     }
00309 }
00310 
00311 static double scoreBlocks(struct cBlock *blockList, ConnectCost connectCost,
00312         void *gapData)
00313 /* Score list of blocks including gaps between blocks. */
00314 {
00315 struct cBlock *block, *lastBlock = NULL;
00316 double score = 0;
00317 for (block = blockList; block != NULL; block = block->next)
00318     {
00319     score += block->score;
00320     if (lastBlock != NULL)
00321         score -= connectCost(lastBlock, block, gapData);
00322     lastBlock = block;
00323     }
00324 return score;
00325 }
00326 
00327 static struct chain *peelChains(char *qName, int qSize, char qStrand,
00328         char *tName, int tSize, struct kdLeaf *leafList, FILE *details)
00329 /* Peel off all chains from tree implied by
00330  * best predecessors. */
00331 {
00332 struct kdLeaf *leaf;
00333 struct chain *chainList = NULL, *chain;
00334 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00335     leaf->hit = FALSE;
00336 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00337     {
00338     if (!leaf->hit)
00339         {
00340         struct kdLeaf *lf;
00341         AllocVar(chain);
00342         chain->qName = cloneString(qName);
00343         chain->qSize = qSize;
00344         chain->qStrand = qStrand;
00345         chain->tName = cloneString(tName);
00346         chain->tSize = tSize;
00347         chain->qEnd = leaf->cb->qEnd;
00348         chain->tEnd = leaf->cb->tEnd;
00349         if (details)
00350             {
00351             chain->score = leaf->totalScore;
00352             chain->id = -1;
00353             chainWriteHead(chain, details);
00354             chain->score = 0;
00355             chain->id = 0;
00356             }
00357         slAddHead(&chainList, chain);
00358         for (lf = leaf; ; )
00359             {
00360             lf->hit = TRUE;
00361             slAddHead(&chain->blockList, lf->cb);
00362             chain->qStart = lf->cb->qStart;
00363             chain->tStart = lf->cb->tStart;
00364             if (details)
00365                 {
00366                 struct cBlock *b = lf->cb;
00367                 fprintf(details, "%d\t%f\t%d\t%d\t%d\n", b->score, lf->totalScore,
00368                         b->tStart, b->qStart, b->qEnd - b->qStart);
00369                 }
00370             if (lf->bestPred == NULL)
00371                  break;
00372             else
00373                 {
00374                 if (details)
00375                     {
00376                     struct cBlock *b = lf->cb;
00377                     struct cBlock *a = lf->bestPred->leaf->cb;
00378                     fprintf(details, " gap %d\t%d\n", 
00379                         b->tStart - a->tEnd, b->qStart - a->qEnd);
00380                     }
00381                 }
00382             lf = lf->bestPred->leaf;
00383             if (lf->hit)
00384                 break;
00385             }
00386         }
00387     }
00388 slReverse(&chainList);
00389 return chainList;
00390 }
00391 
00392 struct chain *chainBlocks(
00393         char *qName, int qSize, char qStrand,   /* Info on query sequence */
00394         char *tName, int tSize,                 /* Info on target. */
00395         struct cBlock **pBlockList,             /* Unordered ungapped alignments. */
00396         ConnectCost connectCost,                /* Calculate cost to connect nodes. */
00397         GapCost gapCost,                        /* Cost for non-overlapping nodes. */
00398         void *gapData,                          /* Passed through to connect/gapCosts */
00399         FILE *details)                          /* NULL except for debugging */
00400 /* Create list of chains from list of blocks.  The blockList will get
00401  * eaten up as the blocks are moved from the list to the chain. 
00402  * The list of chains returned is sorted by score. 
00403  *
00404  * The details FILE may be NULL, and is where additional information
00405  * about the chaining is put.
00406  *
00407  * Note that the connectCost needs to adjust for possibly partially 
00408  * overlapping blocks, and that these need to be taken out of the
00409  * resulting chains in general.  This can get fairly complex.  Also
00410  * the chains will need some cleanup at the end.  Use the chainConnect
00411  * module to help with this.  See hg/mouseStuff/axtChain for example usage. */
00412 {
00413 struct kdTree *tree;
00414 struct kdLeaf *leafList = NULL, *leaf;
00415 struct cBlock *block;
00416 struct chain *chainList = NULL, *chain;
00417 struct lm *lm;
00418 
00419 /* Empty lists will be problematic later, so deal with them here. */
00420 if (*pBlockList == NULL)
00421    return NULL;
00422 
00423 /* Make a leaf for each block. */
00424 lm = lmInit(0);  /* Memory for tree, branches and leaves. */
00425 for (block = *pBlockList; block != NULL; block = block->next)
00426     {
00427     /* Watch out for 0-length blocks in input: */
00428     if (block->tStart == block->tEnd)
00429         continue;
00430     lmAllocVar(lm, leaf);
00431     leaf->cb = block;
00432     leaf->totalScore = block->score;
00433     slAddHead(&leafList, leaf);
00434     }
00435 
00436 /* Figure out chains. */
00437 slSort(&leafList, kdLeafCmpT);
00438 tree = kdTreeMake(leafList, lm);
00439 findBestPredecessors(tree, leafList, connectCost, gapCost, gapData);
00440 slSort(&leafList, kdLeafCmpTotal);
00441 chainList = peelChains(qName, qSize, qStrand, tName, tSize, leafList, details);
00442 
00443 /* Rescore chains (since some truncated) */
00444 for (chain = chainList; chain != NULL; chain = chain->next)
00445     chain->score = scoreBlocks(chain->blockList, connectCost, gapData);
00446 slSort(&chainList,  chainCmpScore);
00447 
00448 /* Clean up and go home. */
00449 lmCleanup(&lm);
00450 *pBlockList = NULL;
00451 return chainList;
00452 }
00453 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2