00001
00002
00003
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "dnaseq.h"
00008 #include "dnautil.h"
00009 #include "chain.h"
00010
00011 static char const rcsid[] = "$Id: chain.c,v 1.25 2007/02/19 18:29:32 kent Exp $";
00012
00013 void chainFree(struct chain **pChain)
00014
00015 {
00016 struct chain *chain = *pChain;
00017 if (chain != NULL)
00018 {
00019 slFreeList(&chain->blockList);
00020 freeMem(chain->qName);
00021 freeMem(chain->tName);
00022 freez(pChain);
00023 }
00024 }
00025
00026 void chainFreeList(struct chain **pList)
00027
00028 {
00029 struct chain *el, *next;
00030
00031 for (el = *pList; el != NULL; el = next)
00032 {
00033 next = el->next;
00034 chainFree(&el);
00035 }
00036 *pList = NULL;
00037 }
00038
00039 int cBlockCmpTarget(const void *va, const void *vb)
00040
00041 {
00042 const struct cBlock *a = *((struct cBlock **)va);
00043 const struct cBlock *b = *((struct cBlock **)vb);
00044 return a->tStart - b->tStart;
00045 }
00046
00047 int cBlockCmpBoth(const void *va, const void *vb)
00048
00049 {
00050 const struct cBlock *a = *((struct cBlock **)va);
00051 const struct cBlock *b = *((struct cBlock **)vb);
00052 int dif;
00053 dif = a->qStart - b->qStart;
00054 if (dif == 0)
00055 dif = a->tStart - b->tStart;
00056 return dif;
00057 }
00058
00059 int cBlockCmpDiagQuery(const void *va, const void *vb)
00060
00061 {
00062 const struct cBlock *a = *((struct cBlock **)va);
00063 const struct cBlock *b = *((struct cBlock **)vb);
00064 int dif;
00065 dif = (a->tStart - a->qStart) - (b->tStart - b->qStart);
00066 if (dif == 0)
00067 dif = a->qStart - b->qStart;
00068 return dif;
00069 }
00070
00071 void cBlocksAddOffset(struct cBlock *blockList, int qOff, int tOff)
00072
00073 {
00074 struct cBlock *block;
00075 for (block = blockList; block != NULL; block = block->next)
00076 {
00077 block->tStart += tOff;
00078 block->tEnd += tOff;
00079 block->qStart += qOff;
00080 block->qEnd += qOff;
00081 }
00082 }
00083
00084 int chainCmpScore(const void *va, const void *vb)
00085
00086 {
00087 const struct chain *a = *((struct chain **)va);
00088 const struct chain *b = *((struct chain **)vb);
00089 double diff = b->score - a->score;
00090 if (diff < 0.0) return -1;
00091 else if (diff > 0.0) return 1;
00092 else return 0;
00093 }
00094
00095 int chainCmpScoreDesc(const void *va, const void *vb)
00096
00097 {
00098 const struct chain *a = *((struct chain **)va);
00099 const struct chain *b = *((struct chain **)vb);
00100 double diff = a->score - b->score;
00101 if (diff < 0.0) return -1;
00102 else if (diff > 0.0) return 1;
00103 else return 0;
00104 }
00105
00106 int chainCmpTarget(const void *va, const void *vb)
00107
00108 {
00109 const struct chain *a = *((struct chain **)va);
00110 const struct chain *b = *((struct chain **)vb);
00111 int dif = strcmp(a->tName, b->tName);
00112 if (dif == 0)
00113 dif = a->tStart - b->tStart;
00114 return dif;
00115 }
00116
00117 #define FACTOR 300000000
00118
00119 int chainCmpQuery(const void *va, const void *vb)
00120
00121 {
00122 const struct chain *a = *((struct chain **)va);
00123 const struct chain *b = *((struct chain **)vb);
00124 int dif;
00125
00126 dif = strcmp(a->qName, b->qName);
00127 if (dif == 0)
00128 dif = a->qStart - b->qStart;
00129 return dif;
00130 }
00131
00132 static int nextId = 1;
00133
00134 void chainIdSet(int id)
00135
00136 {
00137 nextId = id;
00138 }
00139
00140 void chainIdReset()
00141
00142 {
00143 chainIdSet(1);
00144 }
00145
00146 void chainIdNext(struct chain *chain)
00147
00148 {
00149 chain->id = nextId++;
00150 }
00151
00152 void chainWriteHead(struct chain *chain, FILE *f)
00153
00154 {
00155 if (chain->id == 0)
00156 chainIdNext(chain);
00157 fprintf(f, "chain %1.0f %s %d + %d %d %s %d %c %d %d %d\n", chain->score,
00158 chain->tName, chain->tSize, chain->tStart, chain->tEnd,
00159 chain->qName, chain->qSize, chain->qStrand, chain->qStart, chain->qEnd,
00160 chain->id);
00161 }
00162
00163 void chainWrite(struct chain *chain, FILE *f)
00164
00165 {
00166 struct cBlock *b, *nextB;
00167
00168 chainWriteHead(chain, f);
00169 for (b = chain->blockList; b != NULL; b = nextB)
00170 {
00171 nextB = b->next;
00172 fprintf(f, "%d", b->qEnd - b->qStart);
00173 if (nextB != NULL)
00174 fprintf(f, "\t%d\t%d",
00175 nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
00176 fputc('\n', f);
00177 }
00178 fputc('\n', f);
00179 }
00180
00181 void chainWriteAll(struct chain *chainList, FILE *f)
00182
00183 {
00184 struct chain *chain;
00185 for (chain = chainList; chain != NULL; chain = chain->next)
00186 chainWrite(chain, f);
00187 }
00188
00189 void chainWriteLong(struct chain *chain, FILE *f)
00190
00191 {
00192 struct cBlock *b, *nextB;
00193
00194 chainWriteHead(chain, f);
00195 for (b = chain->blockList; b != NULL; b = nextB)
00196 {
00197 nextB = b->next;
00198 fprintf(f, "%d\t%d\t", b->tStart, b->qStart);
00199 fprintf(f, "%d", b->qEnd - b->qStart);
00200 if (nextB != NULL)
00201 fprintf(f, "\t%d\t%d",
00202 nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
00203 fputc('\n', f);
00204 }
00205 fputc('\n', f);
00206 }
00207
00208 struct chain *chainReadChainLine(struct lineFile *lf)
00209
00210
00211 {
00212 char *row[13];
00213 int wordCount;
00214 struct chain *chain;
00215
00216 wordCount = lineFileChop(lf, row);
00217 if (wordCount == 0)
00218 return NULL;
00219 if (wordCount < 12)
00220 errAbort("Expecting at least 12 words line %d of %s",
00221 lf->lineIx, lf->fileName);
00222 if (!sameString(row[0], "chain"))
00223 errAbort("Expecting 'chain' line %d of %s", lf->lineIx, lf->fileName);
00224 AllocVar(chain);
00225 chain->score = atof(row[1]);
00226 chain->tName = cloneString(row[2]);
00227 chain->tSize = lineFileNeedNum(lf, row, 3);
00228 if (wordCount >= 13)
00229 chain->id = lineFileNeedNum(lf, row, 12);
00230 else
00231 chainIdNext(chain);
00232
00233
00234 chain->tStart = lineFileNeedNum(lf, row, 5);
00235 chain->tEnd = lineFileNeedNum(lf, row, 6);
00236 chain->qName = cloneString(row[7]);
00237 chain->qSize = lineFileNeedNum(lf, row, 8);
00238 chain->qStrand = row[9][0];
00239 chain->qStart = lineFileNeedNum(lf, row, 10);
00240 chain->qEnd = lineFileNeedNum(lf, row, 11);
00241 if (chain->qStart >= chain->qEnd || chain->tStart >= chain->tEnd)
00242 errAbort("End before start line %d of %s", lf->lineIx, lf->fileName);
00243 if (chain->qStart < 0 || chain->tStart < 0)
00244 errAbort("Start before zero line %d of %s", lf->lineIx, lf->fileName);
00245 if (chain->qEnd > chain->qSize || chain->tEnd > chain->tSize)
00246 errAbort("Past end of sequence line %d of %s", lf->lineIx, lf->fileName);
00247 return chain;
00248 }
00249
00250 void chainReadBlocks(struct lineFile *lf, struct chain *chain)
00251
00252 {
00253 char *row[3];
00254 int q,t;
00255
00256
00257 q = chain->qStart;
00258 t = chain->tStart;
00259 for (;;)
00260 {
00261 int wordCount = lineFileChop(lf, row);
00262 int size = lineFileNeedNum(lf, row, 0);
00263 struct cBlock *b;
00264 AllocVar(b);
00265 slAddHead(&chain->blockList, b);
00266 b->qStart = q;
00267 b->tStart = t;
00268 q += size;
00269 t += size;
00270 b->qEnd = q;
00271 b->tEnd = t;
00272 if (wordCount == 1)
00273 break;
00274 else if (wordCount < 3)
00275 errAbort("Expecting 1 or 3 words line %d of %s\n",
00276 lf->lineIx, lf->fileName);
00277 t += lineFileNeedNum(lf, row, 1);
00278 q += lineFileNeedNum(lf, row, 2);
00279 }
00280 if (q != chain->qEnd)
00281 errAbort("q end mismatch %d vs %d line %d of %s\n",
00282 q, chain->qEnd, lf->lineIx, lf->fileName);
00283 if (t != chain->tEnd)
00284 errAbort("t end mismatch %d vs %d line %d of %s\n",
00285 t, chain->tEnd, lf->lineIx, lf->fileName);
00286 slReverse(&chain->blockList);
00287 }
00288
00289 struct chain *chainRead(struct lineFile *lf)
00290
00291
00292
00293 {
00294 struct chain *chain = chainReadChainLine(lf);
00295 if (chain != NULL)
00296 chainReadBlocks(lf, chain);
00297 return chain;
00298 }
00299
00300 void chainSwap(struct chain *chain)
00301
00302 {
00303 struct chain old = *chain;
00304 struct cBlock *b;
00305
00306
00307 chain->qName = old.tName;
00308 chain->tName = old.qName;
00309 chain->qStart = old.tStart;
00310 chain->qEnd = old.tEnd;
00311 chain->tStart = old.qStart;
00312 chain->tEnd = old.qEnd;
00313 chain->qSize = old.tSize;
00314 chain->tSize = old.qSize;
00315
00316
00317 for (b = chain->blockList; b != NULL; b = b->next)
00318 {
00319 struct cBlock old = *b;
00320 b->qStart = old.tStart;
00321 b->qEnd = old.tEnd;
00322 b->tStart = old.qStart;
00323 b->tEnd = old.qEnd;
00324 }
00325
00326
00327 if (chain->qStrand == '-')
00328 {
00329
00330
00331
00332
00333 for (b = chain->blockList; b != NULL; b = b->next)
00334 {
00335 reverseIntRange(&b->tStart, &b->tEnd, chain->tSize);
00336 reverseIntRange(&b->qStart, &b->qEnd, chain->qSize);
00337 }
00338 reverseIntRange(&chain->tStart, &chain->tEnd, chain->tSize);
00339 reverseIntRange(&chain->qStart, &chain->qEnd, chain->qSize);
00340 slReverse(&chain->blockList);
00341 }
00342 }
00343
00344 struct hash *chainReadUsedSwapLf(char *fileName, boolean swapQ, Bits *bits, struct lineFile *lf)
00345
00346
00347 {
00348 char nameBuf[16];
00349 struct hash *hash = hashNew(18);
00350 struct chain *chain;
00351 int usedCount = 0, count = 0;
00352
00353 while ((chain = chainRead(lf)) != NULL)
00354 {
00355 ++count;
00356 if (bits != NULL && !bitReadOne(bits, chain->id))
00357 {
00358 chainFree(&chain);
00359 continue;
00360 }
00361 safef(nameBuf, sizeof(nameBuf), "%x", chain->id);
00362 if (hashLookup(hash, nameBuf))
00363 errAbort("Duplicate chain %d ending line %d of %s",
00364 chain->id, lf->lineIx, lf->fileName);
00365 if (swapQ)
00366 chainSwap(chain);
00367 hashAdd(hash, nameBuf, chain);
00368 ++usedCount;
00369 }
00370 return hash;
00371 }
00372
00373 struct hash *chainReadUsedSwap(char *fileName, boolean swapQ, Bits *bits)
00374
00375
00376 {
00377 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00378 struct hash *hash = chainReadUsedSwapLf(fileName, swapQ, NULL, lf);
00379 lineFileClose(&lf);
00380 return hash;
00381 }
00382
00383 struct hash *chainReadAllSwap(char *fileName, boolean swapQ)
00384
00385 {
00386 return chainReadUsedSwap(fileName, swapQ, NULL);
00387 }
00388
00389 struct hash *chainReadAll(char *fileName)
00390
00391 {
00392 return chainReadAllSwap(fileName, FALSE);
00393 }
00394
00395 struct hash *chainReadAllWithMeta(char *fileName, FILE *f)
00396
00397 {
00398 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00399 struct hash *hash = NULL;
00400 lineFileSetMetaDataOutput(lf, f);
00401 hash = chainReadUsedSwapLf(fileName, FALSE, NULL, lf);
00402 lineFileClose(&lf);
00403 return hash;
00404 }
00405 struct chain *chainLookup(struct hash *hash, int id)
00406
00407 {
00408 char nameBuf[16];
00409 safef(nameBuf, sizeof(nameBuf), "%x", id);
00410 return hashMustFindVal(hash, nameBuf);
00411 }
00412
00413 void chainSubsetOnT(struct chain *chain, int subStart, int subEnd,
00414 struct chain **retSubChain, struct chain **retChainToFree)
00415
00416
00417
00418
00419
00420
00421 {
00422
00423 struct cBlock *firstBlock;
00424 for (firstBlock = chain->blockList; firstBlock != NULL; firstBlock = firstBlock->next)
00425 {
00426 if (firstBlock->tEnd > subStart)
00427 break;
00428 }
00429 chainFastSubsetOnT(chain, firstBlock, subStart, subEnd, retSubChain, retChainToFree);
00430 }
00431
00432 void chainFastSubsetOnT(struct chain *chain, struct cBlock *firstBlock,
00433 int subStart, int subEnd, struct chain **retSubChain, struct chain **retChainToFree)
00434
00435
00436 {
00437 struct chain *sub = NULL;
00438 struct cBlock *oldB, *b, *bList = NULL;
00439 int qStart = BIGNUM, qEnd = -BIGNUM;
00440 int tStart = BIGNUM, tEnd = -BIGNUM;
00441
00442
00443 if (subStart <= chain->tStart && subEnd >= chain->tEnd)
00444 {
00445 *retSubChain = chain;
00446 *retChainToFree = NULL;
00447 return;
00448 }
00449
00450 for (oldB = firstBlock; oldB != NULL; oldB = oldB->next)
00451 {
00452 if (oldB->tStart >= subEnd)
00453 break;
00454 b = CloneVar(oldB);
00455 if (b->tStart < subStart)
00456 {
00457 b->qStart += subStart - b->tStart;
00458 b->tStart = subStart;
00459 }
00460 if (b->tEnd > subEnd)
00461 {
00462 b->qEnd -= b->tEnd - subEnd;
00463 b->tEnd = subEnd;
00464 }
00465 slAddHead(&bList, b);
00466 if (qStart > b->qStart)
00467 qStart = b->qStart;
00468 if (qEnd < b->qEnd)
00469 qEnd = b->qEnd;
00470 if (tStart > b->tStart)
00471 tStart = b->tStart;
00472 if (tEnd < b->tEnd)
00473 tEnd = b->tEnd;
00474 }
00475 slReverse(&bList);
00476
00477
00478 if (bList != NULL)
00479 {
00480 double sizeRatio;
00481 AllocVar(sub);
00482 sub->blockList = bList;
00483 sub->qName = cloneString(chain->qName);
00484 sub->qSize = chain->qSize;
00485 sub->qStrand = chain->qStrand;
00486 sub->qStart = qStart;
00487 sub->qEnd = qEnd;
00488 sub->tName = cloneString(chain->tName);
00489 sub->tSize = chain->tSize;
00490 sub->tStart = tStart;
00491 sub->tEnd = tEnd;
00492 sub->id = chain->id;
00493
00494
00495 sizeRatio = (sub->tEnd - sub->tStart);
00496 sizeRatio /= (chain->tEnd - chain->tStart);
00497 sub->score = sizeRatio * chain->score;
00498 }
00499 *retSubChain = *retChainToFree = sub;
00500 }
00501
00502 void chainSubsetOnQ(struct chain *chain, int subStart, int subEnd,
00503 struct chain **retSubChain, struct chain **retChainToFree)
00504
00505
00506
00507
00508
00509
00510 {
00511 struct chain *sub = NULL;
00512 struct cBlock *oldB, *b, *bList = NULL;
00513 int qStart = BIGNUM, qEnd = -BIGNUM;
00514 int tStart = BIGNUM, tEnd = -BIGNUM;
00515
00516
00517 if (subStart <= chain->qStart && subEnd >= chain->qEnd)
00518 {
00519 *retSubChain = chain;
00520 *retChainToFree = NULL;
00521 return;
00522 }
00523
00524 for (oldB = chain->blockList; oldB != NULL; oldB = oldB->next)
00525 {
00526 if (oldB->qEnd <= subStart)
00527 continue;
00528 if (oldB->qStart >= subEnd)
00529 break;
00530 b = CloneVar(oldB);
00531 if (b->qStart < subStart)
00532 {
00533 b->tStart += subStart - b->qStart;
00534 b->qStart = subStart;
00535 }
00536 if (b->qEnd > subEnd)
00537 {
00538 b->tEnd -= b->qEnd - subEnd;
00539 b->qEnd = subEnd;
00540 }
00541 slAddHead(&bList, b);
00542 if (tStart > b->tStart)
00543 tStart = b->tStart;
00544 if (tEnd < b->tEnd)
00545 tEnd = b->tEnd;
00546 if (qStart > b->qStart)
00547 qStart = b->qStart;
00548 if (qEnd < b->qEnd)
00549 qEnd = b->qEnd;
00550 }
00551 slReverse(&bList);
00552
00553
00554 if (bList != NULL)
00555 {
00556 AllocVar(sub);
00557 sub->blockList = bList;
00558 sub->qName = cloneString(chain->qName);
00559 sub->qSize = chain->qSize;
00560 sub->qStrand = chain->qStrand;
00561 sub->qStart = qStart;
00562 sub->qEnd = qEnd;
00563 sub->tName = cloneString(chain->tName);
00564 sub->tSize = chain->tSize;
00565 sub->tStart = tStart;
00566 sub->tEnd = tEnd;
00567 sub->id = chain->id;
00568 }
00569 *retSubChain = *retChainToFree = sub;
00570 }
00571
00572 void chainRangeQPlusStrand(struct chain *chain, int *retQs, int *retQe)
00573
00574
00575 {
00576 if (chain == NULL)
00577 errAbort("chain::chainRangeQPlusStrand() - Can't find range in null query chain.");
00578 if (chain->qStrand == '-')
00579 {
00580 *retQs = chain->qSize - chain->qEnd;
00581 *retQe = chain->qSize - chain->qStart;
00582 }
00583 else
00584 {
00585 *retQs = chain->qStart;
00586 *retQe = chain->qEnd;
00587 }
00588 }
00589