jkOwnLib/xenbig.c

Go to the documentation of this file.
00001 /* xenbig.c - Routines to do big xeno alignments by breaking them into
00002  * pieces, calling the small aligner, and then stitching them back
00003  * together again. */
00004 /* Copyright 2000-2003 Jim Kent.  All rights reserved. */
00005 
00006 #include "common.h"
00007 #include "dlist.h"
00008 #include "hash.h"
00009 #include "portable.h"
00010 #include "dnautil.h"
00011 #include "nt4.h"
00012 #include "crudeali.h"
00013 #include "wormdna.h"
00014 #include "xenalign.h"
00015 
00016 static char const rcsid[] = "$Id: xenbig.c,v 1.3 2006/03/15 18:36:16 angie Exp $";
00017 
00018 struct contig
00019 /* A contiguously aligned sequence pair. */
00020     {
00021     char *name;
00022     char *queryFile;
00023     char *query;
00024     int qStart, qEnd;
00025     int qOffset, qEndOffset;
00026     char qStrand;
00027     char *target;
00028     int tStart, tEnd;
00029     int tOffset, tEndOffset;
00030     char tStrand;
00031     int score;
00032     char *qSym;     /* Query symbols (nucleotides and insert char) */
00033     char *tSym;     /* Target symbols (nucleotides and insert char) */
00034     char *hSym;     /* "Hidden" state symbols. */
00035     int symCount;
00036     boolean isComplete;
00037     };
00038 
00039 static void freeContig(struct contig **pContig)
00040 /* Free up a contig. */
00041 {
00042 struct contig *contig = *pContig;
00043 if (contig != NULL)
00044     {
00045     freeMem(contig->name);
00046     freeMem(contig->target);
00047     freeMem(contig->qSym);
00048     freeMem(contig->tSym);
00049     freeMem(contig->hSym);
00050     freez(pContig);
00051     }
00052 }
00053 
00054 static void freeContigList(struct dlList **pContigList)
00055 /* Free up list of contigs. */
00056 {
00057 struct dlNode *node;
00058 struct contig *contig;
00059 
00060 for (node = (*pContigList)->head; node->next != NULL; node = node->next)
00061     {
00062     contig = node->val;
00063     freeContig(&contig);
00064     }
00065 freeDlList(pContigList);
00066 }
00067 
00068 static void checkComplete(struct dlList *contigList)
00069 /* Make sure all the contigs on the list are complete. */
00070 {
00071 struct dlNode *node;
00072 struct contig *contig;
00073 
00074 for (node = contigList->head; node->next != NULL; node = node->next)
00075     {
00076     contig = node->val;
00077     if (!contig->isComplete)
00078         {
00079         errAbort("Contig %s:%d-%d %c %s:%d-%d %c isn't complete",
00080             contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00081             contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00082         }
00083     }
00084 }
00085 
00086 static int cmpContigQueryStart(const void *va, const void *vb)
00087 /* Compare two contigs by start position and then by inverse
00088  * stop position. (This ensures that if one contig completely
00089  * encompasses another it will appear on list first.) */
00090 {
00091 struct contig **pA = (struct contig **)va;
00092 struct contig **pB = (struct contig **)vb;
00093 struct contig *a = *pA, *b = *pB;
00094 int dif;
00095 
00096 dif = a->qStart - b->qStart;
00097 if (dif == 0)
00098    //dif = a->tStart - b->tStart;
00099    dif = b->qEnd - a->qEnd;
00100 return dif;
00101 }
00102 
00103 static void writeContig(struct contig *contig, FILE *out, 
00104         boolean compactOutput, boolean newFormat)
00105 /* Write out one contig to file */
00106 {
00107 int i;
00108 int size = contig->symCount;
00109 #define maxLineSize 50
00110 char buf[maxLineSize+1];
00111 int lineSize;
00112 int j;
00113 int qIx = 0, tIx = 0;
00114 char *qSyms = contig->qSym;
00115 char *tSyms = contig->tSym;
00116 
00117 if (newFormat)
00118     {
00119     fprintf(out, "%s align %d.%d%% of %d %s %s:%d-%d %c %s:%d-%d %c\n",
00120         contig->name, contig->score/10, contig->score%10, size, 
00121         contig->queryFile, contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00122         contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00123     }
00124 else
00125     {
00126     fprintf(out, "%s align %d.%d%% of %d %s:%d-%d %c %s:%d-%d %c\n",
00127         contig->name, contig->score/10, contig->score%10, size, 
00128         contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00129         contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00130     }
00131 if (compactOutput)
00132     {
00133     mustWrite(out, contig->qSym, size);
00134     fputc('\n', out);
00135     mustWrite(out, contig->tSym, size);
00136     fputc('\n', out);
00137     mustWrite(out, contig->hSym, size);
00138     fputc('\n', out);
00139     }
00140 else
00141     {
00142     for (i=0; i<size; i += lineSize)
00143         {
00144         /* Figure out how long this line is going to be. */
00145         lineSize = size - i;
00146         if (lineSize > maxLineSize)
00147             lineSize = maxLineSize;
00148         buf[lineSize] = 0;
00149     
00150         /* Print query symbols */
00151         memcpy(buf, contig->qSym + i, lineSize);
00152         fprintf(out, "%5d %s\n", qIx, buf);
00153 
00154         /* Print matching symbols */
00155         fprintf(out, "%5s ", "");
00156         for (j=0; j<lineSize; ++j)
00157             {
00158             int off = j+i;
00159             char q = qSyms[off];
00160             char t = tSyms[off];
00161             char c = (q == t ? '|' : ' ');
00162             fputc(c, out);
00163             }
00164         fputc('\n', out);
00165 
00166         /* Print target symbols */
00167         memcpy(buf, contig->tSym + i, lineSize);
00168         fprintf(out, "%5d %s\n", tIx, buf);
00169 
00170         /* Print hidden symbols */
00171         memcpy(buf, contig->hSym + i, lineSize);
00172         fprintf(out, "%5s %s\n", "", buf);
00173 
00174         /* Figure out how far we have advanced */
00175         for (j=0; j<lineSize; ++j)
00176             {
00177             int off = j+i;
00178             char q = qSyms[off];
00179             char t = tSyms[off];
00180             if (q != '-')
00181                 ++qIx;
00182             if (t != '-')
00183                 ++tIx;
00184             }
00185         /* Print blank line */
00186         fputc('\n', out);
00187         }
00188     }
00189 #undef lineSize
00190 }
00191 
00192 struct cluster
00193 /* A set of compatable contigs - one that might eventually be merged. */
00194     {
00195     struct dlList *contigs;
00196     char *query;
00197     int qStart, qEnd;
00198     int qOffset, qEndOffset;
00199     char qStrand;
00200     char *target;
00201     int tStart, tEnd;
00202     int tOffset, tEndOffset;
00203     char tStrand;
00204     int score;
00205     };
00206 
00207 #ifdef UNUSED
00208 static int cmpCluster(const void *va, const void *vb)
00209 /* Compare two clusters. */
00210 {
00211 struct cluster **pA = (struct cluster **)va;
00212 struct cluster **pB = (struct cluster **)vb;
00213 struct cluster *a = *pA, *b = *pB;
00214 
00215 return b->score - a->score;
00216 }
00217 #endif /* UNUSED */
00218 
00219 static boolean contigsOverlap(struct contig *a, struct contig *b, 
00220     int *retQueryOverlap, int *retTargetOverlap)
00221 /* Returns TRUE is contigs overlap. */
00222 {
00223 int overlap;
00224 int start, end;
00225 
00226 /* Check overlap in query. */
00227 start = max(a->qStart, b->qStart);
00228 end = min(a->qEnd, b->qEnd);
00229 overlap = end - start;
00230 if (overlap > 0)
00231     *retQueryOverlap = overlap;
00232 else
00233     return FALSE;
00234 
00235 /* Check overlap in target. */
00236 start = max(a->tStart, b->tStart);
00237 end = min(a->tEnd, b->tEnd);
00238 overlap = end - start;
00239 if (overlap > 0)
00240     *retTargetOverlap = overlap;
00241 else
00242     return FALSE;
00243 return TRUE;
00244 }
00245 
00246 static int findSymIx(char *qSym, int qIx, int *retSymIx)
00247 /* Convert from query to symbol index, dealing with insert chars. */
00248 {
00249 char q;
00250 int symIx;
00251 int qCount = 0;
00252 
00253 for (symIx = 0; ; ++symIx)
00254     {
00255     if (qCount == qIx)
00256         {
00257         *retSymIx = symIx;
00258         return TRUE;
00259         }
00260     if ((q = *qSym++) == 0)
00261         {
00262         warn("findSymIx gone past end");
00263         return FALSE;
00264         }
00265     if (q != '-')
00266         ++qCount;
00267     }
00268 }
00269 
00270 static boolean mergeIntoAdjacentClusters(struct dlList *clusterList, struct dlNode *contigNode)
00271 /* If can merge contig into cluster list, do it and return TRUE.
00272  * else return FALSE. */
00273 {
00274 struct contig *contig = contigNode->val;  /* The contig we're trying to fold in. */
00275 struct dlNode *clusterNode;               /* Node of a single cluster. */
00276 struct cluster *cluster;                  /* A cluster. */
00277 
00278 for (clusterNode = clusterList->head; clusterNode->next != NULL; clusterNode = clusterNode->next)
00279     {
00280     cluster = clusterNode->val;
00281     if ((contig->qOffset == cluster->qEndOffset - 1000 ||
00282         (contig->qEndOffset == cluster->qEndOffset &&
00283          contig->tEndOffset == cluster->tEndOffset)) &&
00284         contig->qStrand == cluster->qStrand &&
00285         contig->tStrand == cluster->tStrand &&
00286         sameString(contig->target, cluster->target) &&
00287         sameString(contig->query, cluster->query))
00288         {
00289         if (contig->qStrand == contig->tStrand)
00290             {
00291             if (contig->tOffset < cluster->tEndOffset + 50000
00292                 && contig->tOffset >= cluster->tOffset-300)
00293                 {
00294                 cluster->qEndOffset = contig->qEndOffset;
00295                 cluster->tEndOffset = contig->tEndOffset;
00296                 cluster->score += contig->score;
00297                 dlAddTail(cluster->contigs, contigNode);
00298                 return TRUE;
00299                 }
00300             }
00301         else
00302             {
00303             if ( contig->tEndOffset > cluster->tOffset - 50000
00304                 && contig->tEndOffset <= cluster->tEndOffset+300)
00305                 {
00306                 cluster->qEndOffset = contig->qEndOffset;
00307                 cluster->tOffset = contig->tOffset;
00308                 cluster->score += contig->score;
00309                 dlAddTail(cluster->contigs, contigNode);
00310                 return TRUE;
00311                 }
00312             }
00313         }
00314     }
00315 return FALSE;
00316 }
00317 
00318 static void maxAlignedOverlap(char *a1, char *a2, char *b1, char *b2, int size,
00319     int *retStart, int *retOverlapSize)
00320 /* Find maximum overlap between a and b without shifting a or
00321  * b. */
00322 {
00323 int maxStart = -1;
00324 int maxLen = -1;
00325 int curStart = 0;
00326 int curLen = 0;
00327 int i;
00328 
00329 for (i=0; i<size; ++i)
00330     {
00331     if (a1[i] == b1[i] && a2[i] == b2[i])
00332         {
00333         curLen += 1;
00334         if (curLen > maxLen)
00335             {
00336             maxLen = curLen;
00337             maxStart = curStart;
00338             }
00339         }
00340     else
00341         {
00342         curLen = 0;
00343         curStart = i+1;
00344         }
00345     }
00346 *retStart = maxStart;
00347 *retOverlapSize = maxLen;
00348 }
00349     
00350 static void findMaxOverlap(char *a1, char *a2, int aSize, 
00351     char *b1, char *b2, int bSize,
00352     int *retAstart, int *retBstart, int *retOverlapSize)
00353 /* Find maximum overlap between a and b and return it. */
00354 {
00355 int i;
00356 int maxOverlap = -1;
00357 int maxAstart = 0;
00358 int maxBstart = 0;
00359 int overlap;
00360 int minSize = min(aSize, bSize);
00361 int goodEnough = minSize-1;
00362 int start;
00363 
00364 for (i=0; i<bSize; ++i)
00365     {
00366     int size = bSize-i;
00367     if (size > aSize)
00368         size = aSize;
00369     maxAlignedOverlap(a1, a2, b1+i, b2+i, size, &start, &overlap);
00370     if (overlap > maxOverlap)
00371         {
00372         maxOverlap = overlap;
00373         maxAstart = start;
00374         maxBstart = i+start;
00375         if (maxOverlap >= goodEnough)
00376             goto RETURN_VALS;
00377         }
00378     }
00379 for (i=1; i<aSize; ++i)
00380     {
00381     int size = aSize-i;
00382     if (size > bSize)
00383         size = bSize;
00384     maxAlignedOverlap(a1+i, a2+i, b1, b2, size, &start, &overlap);
00385     if (overlap > maxOverlap)
00386         {
00387         maxOverlap = overlap;
00388         maxAstart = i+start;
00389         maxBstart = start;
00390         if (maxOverlap >= goodEnough)
00391             goto RETURN_VALS;
00392         }
00393     }
00394 RETURN_VALS:
00395     {
00396     *retAstart = maxAstart;
00397     *retBstart = maxBstart;
00398     *retOverlapSize = maxOverlap;
00399     }
00400 }
00401 
00402 static struct contig *forwardMergeTwo(struct contig *a, struct contig *b)
00403 /* Make a new contig that's a merger of a and b. 
00404  * Assumes that a and b overlap and are on + strand. */
00405 {
00406 int aSymOverlapStart, aSymOverlapEnd, aSymOverlapSize;
00407 int bSymOverlapStart, bSymOverlapEnd, bSymOverlapSize;
00408 int aqoStart, bqoStart;
00409 int overlapSize, halfOverlapSize;
00410 int aCopyStart, aCopyEnd, bCopyStart, bCopyEnd;
00411 int newSymCount;
00412 struct contig *c;
00413 int aCopySize, bCopySize;
00414 int qOverlap;
00415 
00416 /* Find overlapping area in symbol coordinates. */
00417 qOverlap = b->qStart - a->qStart;
00418 if (qOverlap < 0 || qOverlap > a->qEnd - a->qStart)
00419     return NULL;
00420 if (!findSymIx(a->qSym, b->qStart - a->qStart, &aSymOverlapStart))
00421     return NULL;
00422 aSymOverlapEnd = a->symCount;
00423 aSymOverlapSize = aSymOverlapEnd - aSymOverlapStart;
00424 bSymOverlapStart = 0;
00425 if (!findSymIx(b->qSym, a->qEnd - b->qStart, &bSymOverlapEnd))
00426     return NULL;
00427 bSymOverlapSize = bSymOverlapEnd - bSymOverlapStart;
00428 
00429 /* Find subset of overlapping area that is identical
00430  * in source and query symbols. */
00431 findMaxOverlap(a->qSym + aSymOverlapStart, 
00432     a->tSym + aSymOverlapStart, aSymOverlapSize, 
00433     b->qSym + bSymOverlapStart,
00434     b->tSym + bSymOverlapStart, bSymOverlapSize, 
00435     &aqoStart, &bqoStart, &overlapSize);
00436 if (overlapSize < 15)
00437     return NULL;
00438 aSymOverlapStart += aqoStart;
00439 bSymOverlapStart += bqoStart;
00440 halfOverlapSize = overlapSize/2;
00441 
00442 /* Figure out what areas will go into new contig, and how big
00443  * it will be. */
00444 aCopyStart = 0;
00445 aCopyEnd = aSymOverlapStart + halfOverlapSize;
00446 bCopyStart = bSymOverlapStart + halfOverlapSize;
00447 bCopyEnd = b->symCount;
00448 newSymCount = aCopyEnd - aCopyStart + bCopyEnd - bCopyStart;
00449 
00450 /* Allocate new contig with set of symbols big enough to hold
00451  * merged contig. */
00452 AllocVar(c);
00453 c->query = a->query;
00454 c->queryFile = a->queryFile;
00455 c->qStart = min(a->qStart, b->qStart);
00456 c->qEnd = max(a->qEnd, b->qEnd);
00457 c->qStrand = a->qStrand;
00458 c->target = cloneString(a->target);
00459 c->tStart = a->tStart;
00460 c->tEnd = b->tEnd;
00461 c->tStrand = a->tStrand;
00462 c->qSym = needMem(newSymCount);
00463 c->tSym = needMem(newSymCount);
00464 c->hSym = needMem(newSymCount);
00465 c->symCount = newSymCount;
00466 
00467 /* Copy symbols from contig a up to half way through
00468  * overlapping area */
00469 aCopySize = aCopyEnd - aCopyStart;
00470 bCopySize = bCopyEnd - bCopyStart;
00471 memcpy(c->qSym, a->qSym+aCopyStart, aCopySize);
00472 memcpy(c->tSym, a->tSym+aCopyStart, aCopySize);
00473 memcpy(c->hSym, a->hSym+aCopyStart, aCopySize);
00474 
00475 /* Append symbols from contig b from half way through
00476  * overlapping area to end. */
00477 memcpy(c->qSym+aCopySize, b->qSym+bCopyStart, bCopySize);
00478 memcpy(c->tSym+aCopySize, b->tSym+bCopyStart, bCopySize);
00479 memcpy(c->hSym+aCopySize, b->hSym+bCopyStart, bCopySize);
00480 
00481 return c;
00482 }
00483 
00484 static void rcContigOffsets(struct contig *a, int revOff)
00485 /* Subtract each coordinate in contig from revOff */
00486 {
00487 int temp;
00488 
00489 temp = a->qStart;
00490 a->qStart = revOff - a->qEnd;
00491 a->qEnd = revOff - temp;
00492 temp = a->tStart;
00493 a->tStart = revOff - a->tEnd;
00494 a->tEnd = revOff - temp;
00495 }
00496 
00497 static void rcQueryOffsetsAroundSeg(struct contig *a, int segStart, int segEnd)
00498 /* Reverse complement offsets relative to segStart and segEnd */
00499 {
00500 int segSize = segEnd - segStart;
00501 int qEnd = reverseOffset(a->qStart - segStart, segSize) + segStart + 1;
00502 int qStart = reverseOffset(a->qEnd - segStart, segSize) + segStart + 1;
00503 a->qStart = qStart;
00504 a->qEnd = qEnd;
00505 }
00506 
00507 static struct contig *mergeTwo(struct contig *a, struct contig *b)
00508 /* Make a new contig that's a merger of a and b. 
00509  * Assumes that a and b overlap and are on either strand. */
00510 {
00511 int aStart, bEnd, revOff;
00512 struct contig *c;
00513 int tStart, tEnd;
00514 int qStart, qEnd;
00515 
00516 /* Take care of easy forward case first. */
00517 if (a->qStrand == a->tStrand)
00518     return forwardMergeTwo(a,b);
00519 
00520 /* Figure out a good position to do reverse complementing of
00521  * offsets about.  */
00522 aStart = a->qStart;
00523 bEnd = b->qEnd;
00524 revOff = aStart + bEnd;
00525 
00526 /* Save target beginning and end. */
00527 tStart = b->tStart;
00528 tEnd = a->tEnd;
00529 
00530 /* Save target begin and end. */
00531 qStart = a->qStart;
00532 qEnd = b->qEnd;
00533 
00534 /* Reverse complement offsets, do merge, and reverse complement
00535  * offsets back. */
00536 rcContigOffsets(a, revOff);
00537 rcContigOffsets(b, revOff);
00538 c = forwardMergeTwo(b,a);
00539 rcContigOffsets(a, revOff);
00540 rcContigOffsets(b, revOff);
00541 if (c != NULL)
00542     {
00543     rcContigOffsets(c, revOff);
00544     c->tStart = tStart;
00545     c->tEnd = tEnd;
00546     assert(c->qStart == qStart && c->qEnd == qEnd);
00547     }
00548 return c;
00549 }
00550 
00551 
00552 static void mergeWithinCluster(struct cluster *cluster)
00553 /* Merge all the contigs that you can merge within cluster. */
00554 {
00555 int clusterSize = dlCount(cluster->contigs);
00556 struct dlNode *prevNode, *curNode, *nextNode;
00557 struct contig *prev, *cur, *merged;
00558 int tOverlap, qOverlap;
00559 
00560 /* uglyf("M %d %s:%d-%d %c (%d-%d) %s:%d-%d %c (%d-%d)\n",
00561     clusterSize, 
00562     cluster->query, cluster->qStart, cluster->qEnd, cluster->qStrand,
00563     cluster->qOffset, cluster->qEndOffset,
00564     cluster->target, cluster->tStart, cluster->tEnd, cluster->tStrand,
00565     cluster->tOffset, cluster->tEndOffset);
00566  */
00567 
00568 if (clusterSize < 2)
00569     return;     /* Wow was that easy! */
00570 curNode = cluster->contigs->head;
00571 cur = curNode->val;
00572 nextNode = curNode->next;
00573 
00574 for (;;)
00575     {
00576     /* Start of loop logic - store current position as previous, and
00577      * move to next, stop if at end of list. */
00578     prevNode = curNode;
00579     curNode = nextNode;
00580     if ((nextNode = curNode->next) == NULL)
00581         break;
00582     prev = cur;
00583     cur = curNode->val;    
00584 
00585     /* Figure out if current node overlaps with previous. */
00586     if (contigsOverlap(prev, cur, &qOverlap, &tOverlap))
00587         {
00588         if ((merged = mergeTwo(prev, cur)) == NULL)
00589             warn("Couldn't merge two");
00590         else
00591             {
00592             dlRemove(prevNode);
00593             freeMem(prevNode);
00594             freeContig(&cur);
00595             freeContig(&prev);
00596             curNode->val = cur = merged;
00597             }
00598         }
00599     }
00600 }
00601 
00602 static boolean clusterEncompassedBefore(struct dlList *clusterList, struct cluster *cluster)
00603 /* Return TRUE if clusterList up to cluster has a member that encompasses
00604  * cluster. */
00605 {
00606 struct dlNode *node;
00607 struct cluster *c;
00608 
00609 for (node = clusterList->head; node->next != NULL; node = node->next)
00610     {
00611     c = node->val;
00612     if (c == cluster)
00613         return FALSE;
00614     if (c->qStart <= cluster->qStart && c->qEnd >= cluster->qEnd &&
00615         c->tStart <= cluster->tStart && c->tEnd >= cluster->tEnd &&
00616         c->qStrand == cluster->qStrand &&
00617         c->tStrand == cluster->tStrand &&
00618         sameString(c->query, cluster->query) &&
00619         sameString(c->target, cluster->target) )
00620         {
00621         return TRUE;
00622         }            
00623     }
00624 assert(FALSE);
00625 return FALSE;
00626 }
00627 
00628 static void clusterRemoveEncompassed(struct dlList *clusterList)
00629 /* Remove elements from list that are completely encompassed by
00630  * another. */
00631 {
00632 struct dlNode *node, *next;
00633 struct cluster *cluster;
00634 
00635 for (node = clusterList->head; node->next != NULL; node = next)
00636     {
00637     next = node->next;
00638     cluster = node->val;
00639     if (clusterEncompassedBefore(clusterList, cluster))
00640         {
00641         dlRemove(node);
00642         freeMem(node);
00643         freeContigList(&cluster->contigs);
00644         freeMem(cluster);
00645         }
00646     }
00647 }
00648 
00649 static boolean contigEncompassedBefore(struct dlList *contigList, struct contig *contig)
00650 /* Return TRUE if contigList up to contig has a member that encompasses
00651  * contig. */
00652 {
00653 struct dlNode *node;
00654 struct contig *c;
00655 
00656 for (node = contigList->head; node->next != NULL; node = node->next)
00657     {
00658     c = node->val;
00659     if (c == contig)
00660         return FALSE;
00661     if (c->qStart <= contig->qStart && c->qEnd >= contig->qEnd &&
00662         c->tStart <= contig->tStart && c->tEnd >= contig->tEnd &&
00663         c->qStrand == contig->qStrand &&
00664         c->tStrand == contig->tStrand &&
00665         sameString(c->query, contig->query) &&
00666         sameString(c->target, contig->target) )
00667         {
00668         return TRUE;
00669         }            
00670     }
00671 assert(FALSE);
00672 return FALSE;
00673 }
00674 
00675 static void contigRemoveEncompassed(struct dlList *contigList)
00676 /* Remove elements from list that are completely encompassed by
00677  * another. */
00678 {
00679 struct dlNode *node, *next;
00680 struct contig *contig;
00681 
00682 for (node = contigList->head; node->next != NULL; node = next)
00683     {
00684     next = node->next;
00685     contig = node->val;
00686     if (contigEncompassedBefore(contigList, contig))
00687         {
00688         dlRemove(node);
00689         freeMem(node);
00690         freeContig(&contig);
00691         }
00692     }
00693 }
00694 
00695 static int milliMatchScore(struct contig *contig)
00696 /* Return score that's fraction of symbols that match over 1000 */
00697 {
00698 int symCount = contig->symCount;
00699 char *qSym = contig->qSym;
00700 char *tSym = contig->tSym;
00701 int matchCount = 0;
00702 int i;
00703 
00704 for (i=0; i<symCount; ++i)
00705     {
00706     if (qSym[i] == tSym[i])
00707         ++matchCount;
00708     }
00709 return round( (1000.0 * matchCount) /  symCount);
00710 } 
00711 
00712 static void mergeContigs(struct dlList *contigList, int stitchMinScore)
00713 /* Merge together contigs where possible. Assumes query sequence for
00714  * all in list is the same (though the query subsequence differs). */
00715 {
00716 struct dlList *clusterList;
00717 struct dlNode *contigNode, *clusterNode;
00718 struct dlNode *nextClusterNode;
00719 struct cluster *cluster;
00720 struct contig *contig;
00721 int contigCount;
00722 int startCount = 0;
00723 
00724 /* Make sure there's really something to do, so we don't have
00725  * to NULL check so much in the rest of this routine. */
00726 contigCount = dlCount(contigList);
00727 if (contigCount < 1)
00728     return;
00729 
00730 clusterList = newDlList();
00731 
00732 /* Group together contigs that overlap in both query and target
00733    into clusters. */
00734 while ((contigNode = dlPopHead(contigList)) != NULL)
00735     {
00736     if (!mergeIntoAdjacentClusters(clusterList, contigNode))
00737         {
00738         AllocVar(cluster);
00739         contig = contigNode->val;
00740         cluster->contigs = newDlList();
00741         dlAddTail(cluster->contigs, contigNode);
00742         cluster->query = contig->query;
00743         cluster->qStart = contig->qStart;
00744         cluster->qEnd = contig->qEnd;
00745         cluster->qOffset = contig->qOffset;
00746         cluster->qEndOffset = contig->qEndOffset;
00747         cluster->qStrand = contig->qStrand;
00748         cluster->target = contig->target;
00749         cluster->tStart = contig->tStart;
00750         cluster->tEnd = contig->tEnd;
00751         cluster->tOffset = contig->tOffset;
00752         cluster->tEndOffset = contig->tEndOffset;
00753         cluster->tStrand = contig->tStrand;
00754         cluster->score = contig->score;
00755         dlAddValTail(clusterList, cluster);
00756         }
00757     }
00758 
00759 /* Weed out clusters that are too weak. */
00760 for (clusterNode = clusterList->head; clusterNode->next != NULL; 
00761     clusterNode = nextClusterNode)
00762     {
00763     nextClusterNode = clusterNode->next;
00764     cluster = clusterNode->val;
00765     if (cluster->score < stitchMinScore)
00766         {
00767         dlRemove(clusterNode);
00768         freeContigList(&cluster->contigs);
00769         freeMem(cluster);
00770         freeMem(clusterNode);
00771         }
00772     }
00773 
00774 clusterRemoveEncompassed(clusterList);
00775 
00776 /* Merge contigs within cluster into a single contig, and move that contig
00777  * back onto cluster list. */
00778 for (clusterNode = clusterList->head; clusterNode->next != NULL; clusterNode = clusterNode->next)
00779     {
00780     cluster = clusterNode->val;
00781     mergeWithinCluster(cluster);
00782     while ((contigNode = dlPopHead(cluster->contigs)) != NULL)
00783         dlAddTail(contigList, contigNode);
00784     freeDlList(&cluster->contigs);
00785     }
00786 
00787 dlSort(contigList, cmpContigQueryStart);
00788 contigRemoveEncompassed(contigList);
00789 
00790 for (contigNode = contigList->head; contigNode->next != NULL; contigNode = contigNode->next)
00791     {
00792     char nameBuf[128];
00793     char *noDir;
00794     char *noDot;
00795     contig = contigNode->val;
00796     /* Get rid of directory and suffix parts of file name to use as
00797      * base for contig name. */
00798     noDir = strrchr(contig->query, '\\');
00799     if (noDir == NULL)
00800         noDir = strrchr(contig->query, '/');
00801     if (noDir == NULL)
00802         noDir = contig->query;
00803     else
00804         noDir += 1;
00805     noDot = strrchr(noDir, '.');
00806     if (noDot != NULL)
00807         *noDot = 0;
00808     sprintf(nameBuf, "%s.c%d", noDir, ++startCount);    
00809     contig->name = cloneString(nameBuf);
00810     contig->score = milliMatchScore(contig);
00811     }
00812 
00813 /* Free up now gutted cluster list. */
00814 freeDlListAndVals(&clusterList);
00815 }
00816 
00817 static void finishContigs( struct dlList **pContigList, 
00818     FILE *out, int stitchMinScore, boolean compactOutput, boolean newFormat)
00819 /* Merge together contigs that go together, write the good
00820  * ones to output, and remove contigs from list.  */
00821 {
00822 struct contig *contig;
00823 struct dlNode *node;
00824 
00825 checkComplete(*pContigList);
00826 mergeContigs(*pContigList, stitchMinScore);
00827 for (node = (*pContigList)->head; node->next != NULL; node = node->next)
00828     {
00829     contig = node->val;
00830     writeContig(contig, out, compactOutput, newFormat);
00831     }
00832 freeContigList(pContigList);
00833 *pContigList = newDlList();
00834 }
00835 
00836 static int countNonGap(char *s, int size)
00837 /* Count number of non-gap characters in string s of given size. */
00838 {
00839 int count = 0;
00840 while (--size >= 0)
00841     if (*s++ != '-') ++count;
00842 return count;
00843 }
00844 
00845 static void xStitch(char *inName, FILE *out, int stitchMinScore, boolean compactOutput, 
00846    boolean newFormat)
00847 /* Do the big old stitching run putting together contents of inName
00848  * into contigs in out. */
00849 {
00850 FILE *in;
00851 char line[512];
00852 int lineCount = 0;
00853 char *words[64];
00854 int wordCount;
00855 char *firstWord;
00856 char *queryName = "";
00857 char *queryFile = "";
00858 struct hash *queryFileHash = newHash(0);
00859 struct contig *contig = NULL;
00860 struct dlList *contigList = newDlList();
00861 int lineState = 0;   /* Keeps track of groups of four lines. */
00862 struct slName *queryNameList = NULL;
00863 int maxSymCount = 64*1024;
00864 char *qSymBuf = needMem(maxSymCount+1);
00865 char *tSymBuf = needMem(maxSymCount+1);
00866 char *hSymBuf = needMem(maxSymCount+1);
00867 int symCount = 0;
00868 int qSymLen = 0, tSymLen = 0, hSymLen = 0;
00869 
00870 in = mustOpen(inName, "r");
00871 while (fgets(line, sizeof(line), in))
00872     {
00873     ++lineCount;
00874     if (++lineState == 5)
00875         lineState = 0;
00876     wordCount = chopLine(line, words);
00877     if (wordCount <= 0)
00878         continue;
00879     firstWord = words[0];
00880     if (sameString(firstWord, "Aligning"))
00881         {
00882         char *queryString;
00883         char *targetString;
00884         char queryStrand, targetStrand;
00885         char *parts[8];
00886         int partCount;
00887 
00888         /* Do some preliminary checking of this line. */
00889         if (newFormat)
00890             {
00891             if (wordCount < 7)
00892                 errAbort("Short line %d of %s", lineCount, inName);
00893             queryFile = hashStoreName(queryFileHash, words[1]);
00894             queryString = words[2];
00895             queryStrand = words[3][0];
00896             targetString = words[5];
00897             targetStrand = words[6][0];
00898             }
00899         else
00900             {
00901             if (wordCount < 6)
00902                 errAbort("Short line %d of %s", lineCount, inName);
00903             queryString = words[1];
00904             queryStrand = words[2][0];
00905             targetString = words[4];
00906             targetStrand = words[5][0];
00907             }
00908 
00909         /* Extract the name of the query sequence.  If it's new,
00910          * then write out contigs on previous query we've accumulated
00911          * so far and start a new list. */
00912         partCount = chopString(queryString, ":-", parts, ArraySize(parts));
00913         if (!sameString(parts[0], queryName))
00914             {
00915             /* Allocate new name and keep track of it. */
00916             struct slName *newName = newSlName(parts[0]);
00917             slAddHead(&queryNameList, newName);
00918 
00919             /* Write out old contigs and empty out contig list. */
00920             finishContigs(&contigList, out, stitchMinScore, compactOutput, newFormat);
00921             queryName = newName->name;
00922             }
00923         
00924         /* Make up a new contig, and fill it in with the data we
00925          * have so far about query. */
00926         AllocVar(contig);
00927         dlAddValTail(contigList, contig);
00928         contig->queryFile = queryFile;
00929         contig->query = queryName;
00930         contig->qOffset = atoi(parts[1]);
00931         contig->qEndOffset = atoi(parts[2]);
00932         contig->qStrand = queryStrand;
00933 
00934         /* Parse target string and fill in contig with it's info. */
00935         chopString(targetString, ":-", parts, ArraySize(parts));
00936         contig->target = cloneString(parts[0]);
00937         contig->tOffset = atoi(parts[1]);
00938         contig->tEndOffset = atoi(parts[2]);
00939         contig->tStrand = targetStrand;
00940 
00941         /* We don't know start and end yet - set them to values
00942          * that will get easily replace by max/min. */
00943         contig->qStart = contig->tStart = 0x3fffffff;
00944 
00945         lineState = -1;
00946         symCount = 0;
00947         }
00948     else if (sameString(firstWord, "best"))
00949         {
00950         if (wordCount < 3)
00951             errAbort("Short line %d of %s", lineCount, inName);
00952         contig->score = atoi(words[2]);
00953         contig->isComplete = TRUE;
00954         contig->qSym = cloneStringZ(qSymBuf, symCount);
00955         contig->tSym = cloneStringZ(tSymBuf, symCount);
00956         contig->hSym = cloneStringZ(hSymBuf, symCount);
00957         contig->symCount = symCount;
00958         contig->qEnd = contig->qStart + countNonGap(qSymBuf, symCount);
00959         contig->tEnd = contig->tStart + countNonGap(tSymBuf, symCount);
00960         if (contig->qStrand != contig->tStrand) 
00961             rcQueryOffsetsAroundSeg(contig, contig->qOffset, contig->qEndOffset);
00962         }
00963     else if (wordCount > 1 && (isdigit(firstWord[0]) || firstWord[0] == '-'))
00964         {
00965         int start, end;
00966         char *sym = words[1];
00967         int symLen = strlen(sym);
00968         char firstChar = firstWord[0];
00969         if (lineState != 0 && lineState != 2)
00970             errAbort("Bummer - phasing mismatch on lineState line %d of %s!\n", lineCount, inName);
00971         assert(lineState == 0 || lineState == 2);
00972         start = atoi(firstWord);
00973         end = start + symLen;
00974         if (symCount + symLen > maxSymCount)
00975             {
00976             errAbort("Single contig too long line %d of %s, can only handle up to %d symbols\n",
00977                 lineCount, inName, maxSymCount);
00978             }
00979         if (lineState == 0) /* query symbols */
00980             {
00981             qSymLen = symLen;
00982             if (isdigit(firstChar))
00983                 {
00984                 start += contig->qOffset;
00985                 end += contig->qOffset;
00986                 contig->qStart = min(contig->qStart, start);
00987                 contig->qEnd = max(contig->qEnd, end);
00988                 }
00989             memcpy(qSymBuf+symCount, sym, symLen);
00990             }
00991         else               /* target symbols */
00992             {
00993             tSymLen = symLen;
00994             if (tSymLen != qSymLen)
00995                 {
00996                 errAbort("Target symbol size not same as query line %d of %s",
00997                     lineCount, inName);
00998                 }            
00999             if (isdigit(firstChar))
01000                 {
01001                 start += contig->tOffset;
01002                 end += contig->tOffset;
01003                 contig->tStart = min(contig->tStart, start);
01004                 }
01005             memcpy(tSymBuf+symCount, sym, symLen);
01006             }
01007         }
01008     else if (firstWord[0] == '(')
01009         {
01010         lineState = -1;
01011         }
01012     else
01013         {
01014         assert(lineState == 1 || lineState == 3);
01015         if (lineState == 3)  /* Hidden symbols. */
01016             {
01017             char *sym = firstWord;
01018             int symLen = strlen(sym);
01019             hSymLen = symLen;
01020             if (hSymLen != qSymLen)
01021                 {
01022                 errAbort("Hidden symbol size not same as query line %d of %s",
01023                     lineCount, inName);
01024                 }
01025             memcpy(hSymBuf+symCount, sym, symLen);
01026             symCount += symLen;
01027             }        
01028         }
01029     } 
01030 finishContigs(&contigList, out, stitchMinScore, compactOutput, newFormat);
01031 fclose(in);
01032 
01033 slFreeList(&queryNameList);
01034 freeHash(&queryFileHash);
01035 freeMem(qSymBuf);
01036 freeMem(tSymBuf);
01037 freeMem(hSymBuf);
01038 }
01039 
01040 void xenStitch(char *inName, FILE *out, int stitchMinScore, boolean compactOutput)
01041 /* Do the big old stitching run putting together contents of inName
01042  * into contigs in out.  Create output in newer format. */
01043 {
01044 xStitch(inName, out, stitchMinScore, compactOutput, TRUE);
01045 }
01046 
01047 void xenStitcher(char *inName, FILE *out, int stitchMinScore, boolean compactOutput)
01048 /* Do the big old stitching run putting together contents of inName
01049  * into contigs in out.  Create output in older format. */
01050 {
01051 xStitch(inName, out, stitchMinScore, compactOutput, FALSE);
01052 }
01053 
01054 
01055 void xenAlignBig(DNA *query, int qSize, DNA *target, int tSize, FILE *f, boolean forHtml)
01056 /* Do a big alignment - one that has to be stitched together. */
01057 {
01058 struct crudeAli *caList, *ca;
01059 struct nt4Seq *nt4 = newNt4(target, tSize, "target");
01060 struct tempName dynTn;
01061 int i;
01062 int lqSize;
01063 int qMaxSize = 2000;
01064 int tMaxSize = 2*qMaxSize;
01065 int lastQsection = qSize - qMaxSize/2;
01066 FILE *dynFile;
01067 int tMin, tMax, tMid;
01068 int score;
01069 int totalAli = 0;
01070 double totalSize;
01071 
01072 totalSize = (double)qSize * tSize;
01073 if (totalSize < 10000000.0)
01074     {
01075     xenAlignSmall(query, qSize, target, tSize, f, forHtml);
01076     return;
01077     }
01078 makeTempName(&dynTn, "xen", ".dyn");
01079 dynFile = mustOpen(dynTn.forCgi, "w");
01080 
01081 for (i = 0; i<lastQsection || i == 0; i += qMaxSize/2)
01082     {
01083     lqSize = qSize - i;
01084     if (lqSize > qMaxSize)
01085         lqSize = qMaxSize;
01086     caList = crudeAliFind(query+i, lqSize, &nt4, 1, 8, 12);
01087     if (forHtml)
01088         {
01089         fputc('.', stdout);
01090         fflush(stdout);
01091         }
01092     for (ca = caList; ca != NULL; ca = ca->next)
01093         {
01094         tMid = (ca->start + ca->end)/2;
01095         tMin = tMid-tMaxSize/2;
01096         tMax = tMin + tMaxSize;
01097         if (tMin < 0)
01098             tMin = 0;
01099         if (tMax > tSize)
01100             tMax = tSize;
01101         
01102         fprintf(dynFile, "Aligning query:%d-%d %c to target:%d-%d +\n",
01103             i, i+lqSize, ca->strand, tMin, tMax);
01104         if (ca->strand == '-')
01105             {
01106             reverseComplement(query+i, lqSize);
01107             }
01108         score = xenAlignSmall(query+i, lqSize, target+tMin, tMax-tMin, dynFile, FALSE);        
01109         if (ca->strand == '-')
01110             reverseComplement(query+i, lqSize);
01111         fprintf(dynFile, "best score %d\n", score);
01112         if (forHtml)
01113             {
01114             fputc('.', stdout);
01115             fflush(stdout);
01116             }
01117         ++totalAli;
01118         }
01119     slFreeList(&caList);
01120     }
01121 
01122 freeNt4(&nt4);
01123 fclose(dynFile);
01124 if (forHtml)
01125     {
01126     printf("\n %d alignments to stitch.\n", totalAli);
01127     }
01128 xenStitcher(dynTn.forCgi, f, 150000, FALSE);
01129 remove(dynTn.forCgi);
01130 }
01131 
01132 
01133 void xenAlignWorm(DNA *query, int qSize, FILE *f, boolean forHtml)
01134 /* Do alignment against worm genome. */
01135  {
01136 struct crudeAli *caList = NULL, *ca, *newCa;
01137 struct tempName dynTn;
01138 int i;
01139 int lqSize;
01140 int qMaxSize = 2000;
01141 int tMaxSize = 2*qMaxSize;
01142 int lastQsection = qSize - qMaxSize/2;
01143 FILE *dynFile;
01144 int tMin, tMax, tMid, tSize;
01145 int score;
01146 int chromCount;
01147 char **chromNames;
01148 struct nt4Seq **chrom;
01149 DNA *target;
01150 int sectionCount = 0;
01151 
01152 /* Get C. elegans genome and do crude alignments. */
01153 wormLoadNt4Genome(&chrom, &chromCount);
01154 wormChromNames(&chromNames, &chromCount);
01155 for (i = 0; i<lastQsection || i == 0; i += qMaxSize/2)
01156     {
01157     if (forHtml)
01158         {
01159         fputc('.', stdout);
01160         fflush(stdout);
01161         }
01162     lqSize = qSize - i;
01163     if (lqSize > qMaxSize)
01164         lqSize = qMaxSize;
01165     newCa = crudeAliFind(query+i, lqSize, chrom, chromCount, 8, 45);
01166     for (ca = newCa; ca != NULL; ca = ca->next)
01167         {
01168         ca->qStart = i;
01169         ca->qEnd = i+lqSize;
01170         }
01171     caList = slCat(caList,newCa);
01172     ++sectionCount;
01173     }
01174 wormFreeNt4Genome(&chrom);
01175 
01176 /* Make temporary output file. */
01177 makeTempName(&dynTn, "xen", ".dyn");
01178 dynFile = mustOpen(dynTn.forCgi, "w");
01179 
01180 /* Do dynamic programming alignment on each crude alignment. */
01181 if (forHtml)
01182     {
01183     int crudeCount = slCount(caList);
01184     if (crudeCount > 2*sectionCount)
01185         {
01186         printf("\nThis is a difficult alignment.  It looks like the query aligns with\n"
01187                "the <I>C. elegans</I> genome in %d places.  It will take about %1.0f more\n"
01188                "minutes to finish.\n", (crudeCount+sectionCount/2)/sectionCount, crudeCount*0.5);
01189         }
01190     fflush(stdout);
01191     }
01192 for (ca = caList; ca != NULL; ca = ca->next)
01193     {
01194     if (forHtml)
01195         {
01196         fputc('.', stdout);
01197         fflush(stdout);
01198         }
01199     tMid = (ca->start + ca->end)/2;
01200     tMin = tMid-tMaxSize/2;
01201     tMax = tMin + tMaxSize;
01202     wormClipRangeToChrom(chromNames[ca->chromIx], &tMin, &tMax);
01203     fprintf(dynFile, "Aligning query:%d-%d %c to %s:%d-%d +\n",
01204         ca->qStart, ca->qEnd, ca->strand, chromNames[ca->chromIx], tMin, tMax);
01205     lqSize = ca->qEnd - ca->qStart;
01206     if (ca->strand == '-')
01207         reverseComplement(query+ca->qStart, lqSize);
01208     tSize = tMax - tMin;
01209     target = wormChromPart(chromNames[ca->chromIx], tMin, tSize);
01210     score = xenAlignSmall(query+ca->qStart, lqSize, target, tSize, dynFile, FALSE);        
01211     freez(&target);
01212     if (ca->strand == '-')
01213         reverseComplement(query+ca->qStart, lqSize);
01214     fprintf(dynFile, "best score %d\n", score);
01215     }
01216 slFreeList(&caList);
01217 fclose(dynFile);
01218 
01219 if (forHtml)
01220     printf("\n");
01221 xenStitcher(dynTn.forCgi, f, 150000, FALSE);
01222 remove(dynTn.forCgi);
01223 }
01224 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2