lib/chain.c

Go to the documentation of this file.
00001 /* chain - pairwise alignments that can include gaps in both
00002  * sequences at once.  This is similar in many ways to psl,
00003  * but more suitable to cross species genomic comparisons. */
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "dnaseq.h"
00008 #include "dnautil.h"
00009 #include "chain.h"
00010 
00011 static char const rcsid[] = "$Id: chain.c,v 1.25 2007/02/19 18:29:32 kent Exp $";
00012 
00013 void chainFree(struct chain **pChain)
00014 /* Free up a chain. */
00015 {
00016 struct chain *chain = *pChain;
00017 if (chain != NULL)
00018     {
00019     slFreeList(&chain->blockList);
00020     freeMem(chain->qName);
00021     freeMem(chain->tName);
00022     freez(pChain);
00023     }
00024 }
00025 
00026 void chainFreeList(struct chain **pList)
00027 /* Free a list of dynamically allocated chain's */
00028 {
00029 struct chain *el, *next;
00030 
00031 for (el = *pList; el != NULL; el = next)
00032     {
00033     next = el->next;
00034     chainFree(&el);
00035     }
00036 *pList = NULL;
00037 }
00038 
00039 int cBlockCmpTarget(const void *va, const void *vb)
00040 /* Compare to sort based on target start. */
00041 {
00042 const struct cBlock *a = *((struct cBlock **)va);
00043 const struct cBlock *b = *((struct cBlock **)vb);
00044 return a->tStart - b->tStart;
00045 }
00046 
00047 int cBlockCmpBoth(const void *va, const void *vb)
00048 /* Compare to sort based on query, then target. */
00049 {
00050 const struct cBlock *a = *((struct cBlock **)va);
00051 const struct cBlock *b = *((struct cBlock **)vb);
00052 int dif;
00053 dif = a->qStart - b->qStart;
00054 if (dif == 0)
00055     dif = a->tStart - b->tStart;
00056 return dif;
00057 }
00058 
00059 int cBlockCmpDiagQuery(const void *va, const void *vb)
00060 /* Compare to sort based on diagonal, then query. */
00061 {
00062 const struct cBlock *a = *((struct cBlock **)va);
00063 const struct cBlock *b = *((struct cBlock **)vb);
00064 int dif;
00065 dif = (a->tStart - a->qStart) - (b->tStart - b->qStart);
00066 if (dif == 0)
00067     dif = a->qStart - b->qStart;
00068 return dif;
00069 }
00070 
00071 void cBlocksAddOffset(struct cBlock *blockList, int qOff, int tOff)
00072 /* Add offsets to block list. */
00073 {
00074 struct cBlock *block;
00075 for (block = blockList; block != NULL; block = block->next)
00076     {
00077     block->tStart += tOff;
00078     block->tEnd += tOff;
00079     block->qStart += qOff;
00080     block->qEnd += qOff;
00081     }
00082 }
00083 
00084 int chainCmpScore(const void *va, const void *vb)
00085 /* Compare to sort based on total score. */
00086 {
00087 const struct chain *a = *((struct chain **)va);
00088 const struct chain *b = *((struct chain **)vb);
00089 double diff = b->score - a->score;
00090 if (diff < 0.0) return -1;
00091 else if (diff > 0.0) return 1;
00092 else return 0;
00093 }
00094 
00095 int chainCmpScoreDesc(const void *va, const void *vb)
00096 /* Compare to sort based on total score descending. */
00097 {
00098 const struct chain *a = *((struct chain **)va);
00099 const struct chain *b = *((struct chain **)vb);
00100 double diff = a->score - b->score;
00101 if (diff < 0.0) return -1;
00102 else if (diff > 0.0) return 1;
00103 else return 0;
00104 }
00105 
00106 int chainCmpTarget(const void *va, const void *vb)
00107 /* Compare to sort based on target position. */
00108 {
00109 const struct chain *a = *((struct chain **)va);
00110 const struct chain *b = *((struct chain **)vb);
00111 int dif = strcmp(a->tName, b->tName);
00112 if (dif == 0)
00113     dif = a->tStart - b->tStart;
00114 return dif;
00115 }
00116 
00117 #define FACTOR 300000000
00118 
00119 int chainCmpQuery(const void *va, const void *vb)
00120 /* Compare to sort based on query chrom and target position. */
00121 {
00122 const struct chain *a = *((struct chain **)va);
00123 const struct chain *b = *((struct chain **)vb);
00124 int dif;                                                                        
00125 
00126 dif = strcmp(a->qName, b->qName);                                               
00127 if (dif == 0)                                                                   
00128     dif = a->qStart - b->qStart;                                                
00129 return dif;                       
00130 }
00131 
00132 static int nextId = 1;
00133 
00134 void chainIdSet(int id)
00135 /* Set next chain id. */
00136 {
00137 nextId = id;
00138 }
00139 
00140 void chainIdReset()
00141 /* Reset chain id. */
00142 {
00143 chainIdSet(1);
00144 }
00145 
00146 void chainIdNext(struct chain *chain)
00147 /* Add an id to a chain if it doesn't have one already */
00148 {
00149 chain->id = nextId++;
00150 }
00151 
00152 void chainWriteHead(struct chain *chain, FILE *f)
00153 /* Write chain before block/insert list. */
00154 {
00155 if (chain->id == 0)
00156     chainIdNext(chain);
00157 fprintf(f, "chain %1.0f %s %d + %d %d %s %d %c %d %d %d\n", chain->score,
00158     chain->tName, chain->tSize, chain->tStart, chain->tEnd,
00159     chain->qName, chain->qSize, chain->qStrand, chain->qStart, chain->qEnd,
00160     chain->id);
00161 }
00162 
00163 void chainWrite(struct chain *chain, FILE *f)
00164 /* Write out chain to file in usual format*/
00165 {
00166 struct cBlock *b, *nextB;
00167 
00168 chainWriteHead(chain, f);
00169 for (b = chain->blockList; b != NULL; b = nextB)
00170     {
00171     nextB = b->next;
00172     fprintf(f, "%d", b->qEnd - b->qStart);
00173     if (nextB != NULL)
00174         fprintf(f, "\t%d\t%d", 
00175                 nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
00176     fputc('\n', f);
00177     }
00178 fputc('\n', f);
00179 }
00180 
00181 void chainWriteAll(struct chain *chainList, FILE *f)
00182 /* Write all chains to file. */
00183 {
00184 struct chain *chain;
00185 for (chain = chainList; chain != NULL; chain = chain->next)
00186     chainWrite(chain, f);
00187 }
00188 
00189 void chainWriteLong(struct chain *chain, FILE *f)
00190 /* Write out chain to file in longer format*/
00191 {
00192 struct cBlock *b, *nextB;
00193 
00194 chainWriteHead(chain, f);
00195 for (b = chain->blockList; b != NULL; b = nextB)
00196     {
00197     nextB = b->next;
00198     fprintf(f, "%d\t%d\t", b->tStart, b->qStart);
00199     fprintf(f, "%d", b->qEnd - b->qStart);
00200     if (nextB != NULL)
00201         fprintf(f, "\t%d\t%d", 
00202                 nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
00203     fputc('\n', f);
00204     }
00205 fputc('\n', f);
00206 }
00207 
00208 struct chain *chainReadChainLine(struct lineFile *lf)
00209 /* Read line that starts with chain.  Allocate memory
00210  * and fill in values.  However don't read link lines. */
00211 {
00212 char *row[13];
00213 int wordCount;
00214 struct chain *chain;
00215 
00216 wordCount = lineFileChop(lf, row);
00217 if (wordCount == 0)
00218     return NULL;
00219 if (wordCount < 12)
00220     errAbort("Expecting at least 12 words line %d of %s", 
00221         lf->lineIx, lf->fileName);
00222 if (!sameString(row[0], "chain"))
00223     errAbort("Expecting 'chain' line %d of %s", lf->lineIx, lf->fileName);
00224 AllocVar(chain);
00225 chain->score = atof(row[1]);
00226 chain->tName = cloneString(row[2]);
00227 chain->tSize = lineFileNeedNum(lf, row, 3);
00228 if (wordCount >= 13)
00229     chain->id = lineFileNeedNum(lf, row, 12);
00230 else
00231     chainIdNext(chain);
00232 
00233 /* skip tStrand for now, always implicitly + */
00234 chain->tStart = lineFileNeedNum(lf, row, 5);
00235 chain->tEnd = lineFileNeedNum(lf, row, 6);
00236 chain->qName = cloneString(row[7]);
00237 chain->qSize = lineFileNeedNum(lf, row, 8);
00238 chain->qStrand = row[9][0];
00239 chain->qStart = lineFileNeedNum(lf, row, 10);
00240 chain->qEnd = lineFileNeedNum(lf, row, 11);
00241 if (chain->qStart >= chain->qEnd || chain->tStart >= chain->tEnd)
00242     errAbort("End before start line %d of %s", lf->lineIx, lf->fileName);
00243 if (chain->qStart < 0 || chain->tStart < 0)
00244     errAbort("Start before zero line %d of %s", lf->lineIx, lf->fileName);
00245 if (chain->qEnd > chain->qSize || chain->tEnd > chain->tSize)
00246     errAbort("Past end of sequence line %d of %s", lf->lineIx, lf->fileName);
00247 return chain;
00248 }
00249 
00250 void chainReadBlocks(struct lineFile *lf, struct chain *chain)
00251 /* Read in chain blocks from file. */
00252 {
00253 char *row[3];
00254 int q,t;
00255 
00256 /* Now read in block list. */
00257 q = chain->qStart;
00258 t = chain->tStart;
00259 for (;;)
00260     {
00261     int wordCount = lineFileChop(lf, row);
00262     int size = lineFileNeedNum(lf, row, 0);
00263     struct cBlock *b;
00264     AllocVar(b);
00265     slAddHead(&chain->blockList, b);
00266     b->qStart = q;
00267     b->tStart = t;
00268     q += size;
00269     t += size;
00270     b->qEnd = q;
00271     b->tEnd = t;
00272     if (wordCount == 1)
00273         break;
00274     else if (wordCount < 3)
00275         errAbort("Expecting 1 or 3 words line %d of %s\n", 
00276                 lf->lineIx, lf->fileName);
00277     t += lineFileNeedNum(lf, row, 1);
00278     q += lineFileNeedNum(lf, row, 2);
00279     }
00280 if (q != chain->qEnd)
00281     errAbort("q end mismatch %d vs %d line %d of %s\n", 
00282         q, chain->qEnd, lf->lineIx, lf->fileName);
00283 if (t != chain->tEnd)
00284     errAbort("t end mismatch %d vs %d line %d of %s\n", 
00285         t, chain->tEnd, lf->lineIx, lf->fileName);
00286 slReverse(&chain->blockList);
00287 }
00288 
00289 struct chain *chainRead(struct lineFile *lf)
00290 /* Read next chain from file.  Return NULL at EOF. 
00291  * Note that chain block scores are not filled in by
00292  * this. */
00293 {
00294 struct chain *chain = chainReadChainLine(lf);
00295 if (chain != NULL)
00296     chainReadBlocks(lf, chain);
00297 return chain;
00298 }
00299 
00300 void chainSwap(struct chain *chain)
00301 /* Swap target and query side of chain. */
00302 {
00303 struct chain old = *chain;
00304 struct cBlock *b;
00305 
00306 /* Copy basic stuff swapping t and q. */
00307 chain->qName = old.tName;
00308 chain->tName = old.qName;
00309 chain->qStart = old.tStart;
00310 chain->qEnd = old.tEnd;
00311 chain->tStart = old.qStart;
00312 chain->tEnd = old.qEnd;
00313 chain->qSize = old.tSize;
00314 chain->tSize = old.qSize;
00315 
00316 /* Swap t and q in blocks. */
00317 for (b = chain->blockList; b != NULL; b = b->next)
00318     {
00319     struct cBlock old = *b;
00320     b->qStart = old.tStart;
00321     b->qEnd = old.tEnd;
00322     b->tStart = old.qStart;
00323     b->tEnd = old.qEnd;
00324     }
00325 
00326 /* Cope with the minus strand. */
00327 if (chain->qStrand == '-')
00328     {
00329     /* chain's are really set up so that the target is on the
00330      * + strand and the query is on the minus strand.
00331      * Therefore we need to reverse complement both 
00332      * strands while swapping to preserve this. */
00333     for (b = chain->blockList; b != NULL; b = b->next)
00334         {
00335         reverseIntRange(&b->tStart, &b->tEnd, chain->tSize);
00336         reverseIntRange(&b->qStart, &b->qEnd, chain->qSize);
00337         }
00338     reverseIntRange(&chain->tStart, &chain->tEnd, chain->tSize);
00339     reverseIntRange(&chain->qStart, &chain->qEnd, chain->qSize);
00340     slReverse(&chain->blockList);
00341     }
00342 }
00343 
00344 struct hash *chainReadUsedSwapLf(char *fileName, boolean swapQ, Bits *bits, struct lineFile *lf)
00345 /* Read chains that are marked as used in the 
00346  * bits array (which may be NULL) into a hash keyed by id. */
00347 {
00348 char nameBuf[16];
00349 struct hash *hash = hashNew(18);
00350 struct chain *chain;
00351 int usedCount = 0, count = 0;
00352 
00353 while ((chain = chainRead(lf)) != NULL)
00354     {
00355     ++count;
00356     if (bits != NULL && !bitReadOne(bits, chain->id))
00357         {
00358         chainFree(&chain);
00359         continue;
00360         }
00361     safef(nameBuf, sizeof(nameBuf), "%x", chain->id);
00362     if (hashLookup(hash, nameBuf))
00363         errAbort("Duplicate chain %d ending line %d of %s", 
00364                 chain->id, lf->lineIx, lf->fileName);
00365     if (swapQ)
00366         chainSwap(chain);
00367     hashAdd(hash, nameBuf, chain);
00368     ++usedCount;
00369     }
00370 return hash;
00371 }
00372 
00373 struct hash *chainReadUsedSwap(char *fileName, boolean swapQ, Bits *bits)
00374 /* Read chains that are marked as used in the 
00375  * bits array (which may be NULL) into a hash keyed by id. */
00376 {
00377 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00378 struct hash *hash = chainReadUsedSwapLf(fileName, swapQ, NULL, lf);
00379 lineFileClose(&lf);
00380 return hash;
00381 }
00382 
00383 struct hash *chainReadAllSwap(char *fileName, boolean swapQ)
00384 /* Read chains into a hash keyed by id. */
00385 {
00386 return chainReadUsedSwap(fileName, swapQ, NULL);
00387 }
00388 
00389 struct hash *chainReadAll(char *fileName)
00390 /* Read chains into a hash keyed by id. */
00391 {
00392 return chainReadAllSwap(fileName, FALSE);
00393 }
00394 
00395 struct hash *chainReadAllWithMeta(char *fileName, FILE *f)
00396 /* Read chains into a hash keyed by id. */
00397 {
00398 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00399 struct hash *hash = NULL;
00400 lineFileSetMetaDataOutput(lf, f);
00401 hash = chainReadUsedSwapLf(fileName, FALSE, NULL, lf);
00402 lineFileClose(&lf);
00403 return hash;
00404 }
00405 struct chain *chainLookup(struct hash *hash, int id)
00406 /* Find chain in hash. */
00407 {
00408 char nameBuf[16];
00409 safef(nameBuf, sizeof(nameBuf), "%x", id);
00410 return hashMustFindVal(hash, nameBuf);
00411 }
00412 
00413 void chainSubsetOnT(struct chain *chain, int subStart, int subEnd, 
00414     struct chain **retSubChain,  struct chain **retChainToFree)
00415 /* Get subchain of chain bounded by subStart-subEnd on 
00416  * target side.  Return result in *retSubChain.  In some
00417  * cases this may be the original chain, in which case
00418  * *retChainToFree is NULL.  When done call chainFree on
00419  * *retChainToFree.  The score and id fields are not really
00420  * properly filled in. */
00421 {
00422 /* Find first relevant block. */
00423 struct cBlock *firstBlock;
00424 for (firstBlock = chain->blockList; firstBlock != NULL; firstBlock = firstBlock->next)
00425     {
00426     if (firstBlock->tEnd > subStart)
00427         break;
00428     }
00429 chainFastSubsetOnT(chain, firstBlock, subStart, subEnd, retSubChain, retChainToFree);
00430 }
00431 
00432 void chainFastSubsetOnT(struct chain *chain, struct cBlock *firstBlock,
00433         int subStart, int subEnd, struct chain **retSubChain,  struct chain **retChainToFree)
00434 /* Get subchain as in chainSubsetOnT. Pass in initial block that may
00435  * be known from some index to speed things up. */
00436 {
00437 struct chain *sub = NULL;
00438 struct cBlock *oldB, *b, *bList = NULL;
00439 int qStart = BIGNUM, qEnd = -BIGNUM;
00440 int tStart = BIGNUM, tEnd = -BIGNUM;
00441 
00442 /* Check for easy case. */
00443 if (subStart <= chain->tStart && subEnd >= chain->tEnd)
00444     {
00445     *retSubChain = chain;
00446     *retChainToFree = NULL;
00447     return;
00448     }
00449 /* Build new block list and calculate bounds. */
00450 for (oldB = firstBlock; oldB != NULL; oldB = oldB->next)
00451     {
00452     if (oldB->tStart >= subEnd)
00453         break;
00454     b = CloneVar(oldB);
00455     if (b->tStart < subStart)
00456         {
00457         b->qStart += subStart - b->tStart;
00458         b->tStart = subStart;
00459         }
00460     if (b->tEnd > subEnd)
00461         {
00462         b->qEnd -= b->tEnd - subEnd;
00463         b->tEnd = subEnd;
00464         }
00465     slAddHead(&bList, b);
00466     if (qStart > b->qStart)
00467         qStart = b->qStart;
00468     if (qEnd < b->qEnd)
00469         qEnd = b->qEnd;
00470     if (tStart > b->tStart)
00471         tStart = b->tStart;
00472     if (tEnd < b->tEnd)
00473         tEnd = b->tEnd;
00474     }
00475 slReverse(&bList);
00476 
00477 /* Make new chain based on old. */
00478 if (bList != NULL)
00479     {
00480     double sizeRatio;
00481     AllocVar(sub);
00482     sub->blockList = bList;
00483     sub->qName = cloneString(chain->qName);
00484     sub->qSize = chain->qSize;
00485     sub->qStrand = chain->qStrand;
00486     sub->qStart = qStart;
00487     sub->qEnd = qEnd;
00488     sub->tName = cloneString(chain->tName);
00489     sub->tSize = chain->tSize;
00490     sub->tStart = tStart;
00491     sub->tEnd = tEnd;
00492     sub->id = chain->id;
00493 
00494     /* Fake new score. */
00495     sizeRatio = (sub->tEnd - sub->tStart);
00496     sizeRatio /= (chain->tEnd - chain->tStart);
00497     sub->score = sizeRatio * chain->score;
00498     }
00499 *retSubChain = *retChainToFree = sub;
00500 }
00501 
00502 void chainSubsetOnQ(struct chain *chain, int subStart, int subEnd, 
00503     struct chain **retSubChain,  struct chain **retChainToFree)
00504 /* Get subchain of chain bounded by subStart-subEnd on 
00505  * query side.  Return result in *retSubChain.  In some
00506  * cases this may be the original chain, in which case
00507  * *retChainToFree is NULL.  When done call chainFree on
00508  * *retChainToFree.  The score and id fields are not really
00509  * properly filled in. */
00510 {
00511 struct chain *sub = NULL;
00512 struct cBlock *oldB, *b, *bList = NULL;
00513 int qStart = BIGNUM, qEnd = -BIGNUM;
00514 int tStart = BIGNUM, tEnd = -BIGNUM;
00515 
00516 /* Check for easy case. */
00517 if (subStart <= chain->qStart && subEnd >= chain->qEnd)
00518     {
00519     *retSubChain = chain;
00520     *retChainToFree = NULL;
00521     return;
00522     }
00523 /* Build new block list and calculate bounds. */
00524 for (oldB = chain->blockList; oldB != NULL; oldB = oldB->next)
00525     {
00526     if (oldB->qEnd <= subStart)
00527         continue;
00528     if (oldB->qStart >= subEnd)
00529         break;
00530     b = CloneVar(oldB);
00531     if (b->qStart < subStart)
00532         {
00533         b->tStart += subStart - b->qStart;
00534         b->qStart = subStart;
00535         }
00536     if (b->qEnd > subEnd)
00537         {
00538         b->tEnd -= b->qEnd - subEnd;
00539         b->qEnd = subEnd;
00540         }
00541     slAddHead(&bList, b);
00542     if (tStart > b->tStart)
00543         tStart = b->tStart;
00544     if (tEnd < b->tEnd)
00545         tEnd = b->tEnd;
00546     if (qStart > b->qStart)
00547         qStart = b->qStart;
00548     if (qEnd < b->qEnd)
00549         qEnd = b->qEnd;
00550     }
00551 slReverse(&bList);
00552 
00553 /* Make new chain based on old. */
00554 if (bList != NULL)
00555     {
00556     AllocVar(sub);
00557     sub->blockList = bList;
00558     sub->qName = cloneString(chain->qName);
00559     sub->qSize = chain->qSize;
00560     sub->qStrand = chain->qStrand;
00561     sub->qStart = qStart;
00562     sub->qEnd = qEnd;
00563     sub->tName = cloneString(chain->tName);
00564     sub->tSize = chain->tSize;
00565     sub->tStart = tStart;
00566     sub->tEnd = tEnd;
00567     sub->id = chain->id;
00568     }
00569 *retSubChain = *retChainToFree = sub;
00570 }
00571 
00572 void chainRangeQPlusStrand(struct chain *chain, int *retQs, int *retQe)
00573 /* Return range of bases covered by chain on q side on the plus
00574  * strand. */
00575 {
00576 if (chain == NULL)
00577     errAbort("chain::chainRangeQPlusStrand() - Can't find range in null query chain.");
00578 if (chain->qStrand == '-')
00579     {
00580     *retQs = chain->qSize - chain->qEnd;
00581     *retQe = chain->qSize - chain->qStart;
00582     }
00583 else
00584     {
00585     *retQs = chain->qStart;
00586     *retQe = chain->qEnd;
00587     }
00588 }
00589 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2