00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "localmem.h"
00008 #include "linefile.h"
00009 #include "dlist.h"
00010 #include "chainBlock.h"
00011
00012 static char const rcsid[] = "$Id: chainBlock.c,v 1.17 2005/04/10 14:41:21 markd Exp $";
00013
00014 struct kdBranch
00015
00016
00017
00018 {
00019 struct kdBranch *lo;
00020 struct kdBranch *hi;
00021 struct kdLeaf *leaf;
00022 int cutCoord;
00023 double maxScore;
00024 int maxQ;
00025 int maxT;
00026 };
00027
00028 struct kdLeaf
00029
00030 {
00031 struct kdLeaf *next;
00032 struct cBlock *cb;
00033 struct kdBranch *bestPred;
00034 double totalScore;
00035 bool hit;
00036 };
00037
00038 struct kdTree
00039
00040 {
00041 struct kdBranch *root;
00042 };
00043
00044
00045 static int kdLeafCmpQ(const void *va, const void *vb)
00046
00047 {
00048 const struct kdLeaf *a = *((struct kdLeaf **)va);
00049 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00050 return a->cb->qStart - b->cb->qStart;
00051 }
00052
00053 static int kdLeafCmpT(const void *va, const void *vb)
00054
00055 {
00056 const struct kdLeaf *a = *((struct kdLeaf **)va);
00057 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00058 return a->cb->tStart - b->cb->tStart;
00059 }
00060
00061 static int kdLeafCmpTotal(const void *va, const void *vb)
00062
00063 {
00064 const struct kdLeaf *a = *((struct kdLeaf **)va);
00065 const struct kdLeaf *b = *((struct kdLeaf **)vb);
00066 double diff = b->totalScore - a->totalScore;
00067 if (diff < 0) return -1;
00068 else if (diff > 0) return 1;
00069 else return 0;
00070 }
00071
00072 static int medianVal(struct dlList *list, int medianIx, int dim)
00073
00074
00075 {
00076 struct dlNode *node = list->head;
00077 struct kdLeaf *leaf = NULL;
00078 int i;
00079
00080 for (i=0; i<medianIx; ++i)
00081 {
00082 leaf = node->val;
00083 leaf->hit = TRUE;
00084 node = node->next;
00085 }
00086 return (dim == 0 ? leaf->cb->qStart : leaf->cb->tStart);
00087 }
00088
00089 static int splitList(struct dlList *oldList, struct dlList *newList)
00090
00091 {
00092 struct dlNode *node, *next;
00093 struct kdLeaf *leaf;
00094 int newCount = 0;
00095 dlListInit(newList);
00096 for (node = oldList->head; !dlEnd(node); node = next)
00097 {
00098 next = node->next;
00099 leaf = node->val;
00100 if (!leaf->hit)
00101 {
00102 dlRemove(node);
00103 dlAddTail(newList, node);
00104 ++newCount;
00105 }
00106 }
00107 return newCount;
00108 }
00109
00110 static void clearHits(struct dlList *list)
00111
00112 {
00113 struct dlNode *node;
00114 for (node = list->head; !dlEnd(node); node = node->next)
00115 {
00116 struct kdLeaf *leaf = node->val;
00117 leaf->hit = FALSE;
00118 }
00119 }
00120
00121
00122 static struct kdBranch *kdBuild(int nodeCount, struct dlList *lists[2], int dim,
00123 struct lm *lm)
00124
00125 {
00126 struct kdBranch *branch;
00127 lmAllocVar(lm, branch);
00128 if (nodeCount == 1)
00129 {
00130 struct kdLeaf *leaf = lists[0]->head->val;
00131 branch->leaf = leaf;
00132 branch->maxQ = leaf->cb->qEnd;
00133 branch->maxT = leaf->cb->tEnd;
00134 }
00135 else
00136 {
00137 int newCount;
00138 struct dlList *newLists[2];
00139 struct dlList newQ, newT;
00140 int nextDim = 1-dim;
00141
00142
00143 newLists[0] = &newQ;
00144 newLists[1] = &newT;
00145 clearHits(lists[0]);
00146 branch->cutCoord = medianVal(lists[dim], nodeCount/2, dim);
00147 newCount = splitList(lists[0], newLists[0]);
00148 splitList(lists[1], newLists[1]);
00149
00150
00151 branch->lo = kdBuild(nodeCount - newCount, lists, nextDim, lm);
00152 branch->hi = kdBuild(newCount, newLists, nextDim, lm);
00153 branch->maxQ = max(branch->lo->maxQ, branch->hi->maxQ);
00154 branch->maxT = max(branch->lo->maxT, branch->hi->maxT);
00155 }
00156 return branch;
00157 }
00158
00159 static struct kdTree *kdTreeMake(struct kdLeaf *leafList, struct lm *lm)
00160
00161 {
00162 struct kdLeaf *leaf;
00163 int nodeCount = slCount(leafList);
00164 struct kdTree *tree;
00165 struct dlList qList, tList,*lists[2];
00166 struct dlNode *qNodes, *tNodes;
00167 int i;
00168
00169
00170
00171
00172 dlListInit(&qList);
00173 dlListInit(&tList);
00174 AllocArray(qNodes, nodeCount);
00175 AllocArray(tNodes, nodeCount);
00176 for (i=0 , leaf=leafList; leaf != NULL; leaf = leaf->next, ++i)
00177 {
00178 qNodes[i].val = tNodes[i].val = leaf;
00179 dlAddTail(&qList, &qNodes[i]);
00180 dlAddTail(&tList, &tNodes[i]);
00181 }
00182
00183
00184 dlSort(&qList, kdLeafCmpQ);
00185 lists[0] = &qList;
00186 lists[1] = &tList;
00187
00188
00189 lmAllocVar(lm, tree);
00190 tree->root = kdBuild(nodeCount, lists, 0, lm);
00191
00192
00193 freeMem(qNodes);
00194 freeMem(tNodes);
00195 return tree;
00196 }
00197
00198 struct predScore
00199
00200 {
00201 struct kdBranch *pred;
00202 double score;
00203 };
00204
00205 static struct predScore bestPredecessor(
00206 struct kdLeaf *lonely,
00207 ConnectCost connectCost,
00208 GapCost gapCost,
00209 void *gapData,
00210 int dim,
00211 struct kdBranch *branch,
00212 struct predScore bestSoFar)
00213
00214
00215
00216 {
00217 struct kdLeaf *leaf;
00218 double maxScore = branch->maxScore + lonely->cb->score;
00219
00220
00221
00222
00223 if (maxScore < bestSoFar.score)
00224 return bestSoFar;
00225 maxScore -= gapCost(lonely->cb->qStart - branch->maxQ,
00226 lonely->cb->tStart - branch->maxT, gapData);
00227 if (maxScore < bestSoFar.score)
00228 return bestSoFar;
00229
00230
00231
00232
00233 else if ((leaf = branch->leaf) != NULL)
00234 {
00235 if (leaf->cb->qStart < lonely->cb->qStart
00236 && leaf->cb->tStart < lonely->cb->tStart)
00237 {
00238 double score = leaf->totalScore + lonely->cb->score -
00239 connectCost(leaf->cb, lonely->cb, gapData);
00240 if (score > bestSoFar.score)
00241 {
00242 bestSoFar.score = score;
00243 bestSoFar.pred = branch;
00244 }
00245 }
00246 return bestSoFar;
00247 }
00248
00249
00250 else
00251 {
00252 int newDim = 1-dim;
00253 int dimCoord = (dim == 0 ? lonely->cb->qStart : lonely->cb->tStart);
00254
00255
00256
00257
00258 if (dimCoord > branch->cutCoord)
00259 bestSoFar = bestPredecessor(lonely, connectCost, gapCost, gapData,
00260 newDim, branch->hi, bestSoFar);
00261 bestSoFar = bestPredecessor(lonely, connectCost, gapCost, gapData,
00262 newDim, branch->lo, bestSoFar);
00263 return bestSoFar;
00264 }
00265 }
00266
00267 static void updateScoresOnWay(struct kdBranch *branch,
00268 int dim, struct kdLeaf *leaf)
00269
00270
00271 {
00272 int newDim = 1-dim;
00273 int dimCoord = (dim == 0 ? leaf->cb->qStart : leaf->cb->tStart);
00274 if (branch->maxScore < leaf->totalScore) branch->maxScore = leaf->totalScore;
00275 if (branch->leaf == NULL)
00276 {
00277 if (dimCoord <= branch->cutCoord)
00278 updateScoresOnWay(branch->lo, newDim, leaf);
00279 if (dimCoord >= branch->cutCoord)
00280 updateScoresOnWay(branch->hi, newDim, leaf);
00281 }
00282 }
00283
00284 static void findBestPredecessors(struct kdTree *tree, struct kdLeaf *leafList,
00285 ConnectCost connectCost, GapCost gapCost, void *gapData)
00286
00287 {
00288 static struct predScore noBest;
00289 struct kdLeaf *leaf;
00290 struct kdLeaf *bestLeaf = NULL;
00291 double bestScore = 0;
00292
00293 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00294 {
00295 struct predScore best;
00296 best = bestPredecessor(leaf, connectCost, gapCost, gapData, 0, tree->root, noBest);
00297 if (best.score > leaf->totalScore)
00298 {
00299 leaf->totalScore = best.score;
00300 leaf->bestPred = best.pred;
00301 }
00302 updateScoresOnWay(tree->root, 0, leaf);
00303 if (bestScore < leaf->totalScore)
00304 {
00305 bestScore = leaf->totalScore;
00306 bestLeaf = leaf;
00307 }
00308 }
00309 }
00310
00311 static double scoreBlocks(struct cBlock *blockList, ConnectCost connectCost,
00312 void *gapData)
00313
00314 {
00315 struct cBlock *block, *lastBlock = NULL;
00316 double score = 0;
00317 for (block = blockList; block != NULL; block = block->next)
00318 {
00319 score += block->score;
00320 if (lastBlock != NULL)
00321 score -= connectCost(lastBlock, block, gapData);
00322 lastBlock = block;
00323 }
00324 return score;
00325 }
00326
00327 static struct chain *peelChains(char *qName, int qSize, char qStrand,
00328 char *tName, int tSize, struct kdLeaf *leafList, FILE *details)
00329
00330
00331 {
00332 struct kdLeaf *leaf;
00333 struct chain *chainList = NULL, *chain;
00334 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00335 leaf->hit = FALSE;
00336 for (leaf = leafList; leaf != NULL; leaf = leaf->next)
00337 {
00338 if (!leaf->hit)
00339 {
00340 struct kdLeaf *lf;
00341 AllocVar(chain);
00342 chain->qName = cloneString(qName);
00343 chain->qSize = qSize;
00344 chain->qStrand = qStrand;
00345 chain->tName = cloneString(tName);
00346 chain->tSize = tSize;
00347 chain->qEnd = leaf->cb->qEnd;
00348 chain->tEnd = leaf->cb->tEnd;
00349 if (details)
00350 {
00351 chain->score = leaf->totalScore;
00352 chain->id = -1;
00353 chainWriteHead(chain, details);
00354 chain->score = 0;
00355 chain->id = 0;
00356 }
00357 slAddHead(&chainList, chain);
00358 for (lf = leaf; ; )
00359 {
00360 lf->hit = TRUE;
00361 slAddHead(&chain->blockList, lf->cb);
00362 chain->qStart = lf->cb->qStart;
00363 chain->tStart = lf->cb->tStart;
00364 if (details)
00365 {
00366 struct cBlock *b = lf->cb;
00367 fprintf(details, "%d\t%f\t%d\t%d\t%d\n", b->score, lf->totalScore,
00368 b->tStart, b->qStart, b->qEnd - b->qStart);
00369 }
00370 if (lf->bestPred == NULL)
00371 break;
00372 else
00373 {
00374 if (details)
00375 {
00376 struct cBlock *b = lf->cb;
00377 struct cBlock *a = lf->bestPred->leaf->cb;
00378 fprintf(details, " gap %d\t%d\n",
00379 b->tStart - a->tEnd, b->qStart - a->qEnd);
00380 }
00381 }
00382 lf = lf->bestPred->leaf;
00383 if (lf->hit)
00384 break;
00385 }
00386 }
00387 }
00388 slReverse(&chainList);
00389 return chainList;
00390 }
00391
00392 struct chain *chainBlocks(
00393 char *qName, int qSize, char qStrand,
00394 char *tName, int tSize,
00395 struct cBlock **pBlockList,
00396 ConnectCost connectCost,
00397 GapCost gapCost,
00398 void *gapData,
00399 FILE *details)
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412 {
00413 struct kdTree *tree;
00414 struct kdLeaf *leafList = NULL, *leaf;
00415 struct cBlock *block;
00416 struct chain *chainList = NULL, *chain;
00417 struct lm *lm;
00418
00419
00420 if (*pBlockList == NULL)
00421 return NULL;
00422
00423
00424 lm = lmInit(0);
00425 for (block = *pBlockList; block != NULL; block = block->next)
00426 {
00427
00428 if (block->tStart == block->tEnd)
00429 continue;
00430 lmAllocVar(lm, leaf);
00431 leaf->cb = block;
00432 leaf->totalScore = block->score;
00433 slAddHead(&leafList, leaf);
00434 }
00435
00436
00437 slSort(&leafList, kdLeafCmpT);
00438 tree = kdTreeMake(leafList, lm);
00439 findBestPredecessors(tree, leafList, connectCost, gapCost, gapData);
00440 slSort(&leafList, kdLeafCmpTotal);
00441 chainList = peelChains(qName, qSize, qStrand, tName, tSize, leafList, details);
00442
00443
00444 for (chain = chainList; chain != NULL; chain = chain->next)
00445 chain->score = scoreBlocks(chain->blockList, connectCost, gapData);
00446 slSort(&chainList, chainCmpScore);
00447
00448
00449 lmCleanup(&lm);
00450 *pBlockList = NULL;
00451 return chainList;
00452 }
00453