00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "dlist.h"
00008 #include "hash.h"
00009 #include "portable.h"
00010 #include "dnautil.h"
00011 #include "nt4.h"
00012 #include "crudeali.h"
00013 #include "wormdna.h"
00014 #include "xenalign.h"
00015
00016 static char const rcsid[] = "$Id: xenbig.c,v 1.3 2006/03/15 18:36:16 angie Exp $";
00017
00018 struct contig
00019
00020 {
00021 char *name;
00022 char *queryFile;
00023 char *query;
00024 int qStart, qEnd;
00025 int qOffset, qEndOffset;
00026 char qStrand;
00027 char *target;
00028 int tStart, tEnd;
00029 int tOffset, tEndOffset;
00030 char tStrand;
00031 int score;
00032 char *qSym;
00033 char *tSym;
00034 char *hSym;
00035 int symCount;
00036 boolean isComplete;
00037 };
00038
00039 static void freeContig(struct contig **pContig)
00040
00041 {
00042 struct contig *contig = *pContig;
00043 if (contig != NULL)
00044 {
00045 freeMem(contig->name);
00046 freeMem(contig->target);
00047 freeMem(contig->qSym);
00048 freeMem(contig->tSym);
00049 freeMem(contig->hSym);
00050 freez(pContig);
00051 }
00052 }
00053
00054 static void freeContigList(struct dlList **pContigList)
00055
00056 {
00057 struct dlNode *node;
00058 struct contig *contig;
00059
00060 for (node = (*pContigList)->head; node->next != NULL; node = node->next)
00061 {
00062 contig = node->val;
00063 freeContig(&contig);
00064 }
00065 freeDlList(pContigList);
00066 }
00067
00068 static void checkComplete(struct dlList *contigList)
00069
00070 {
00071 struct dlNode *node;
00072 struct contig *contig;
00073
00074 for (node = contigList->head; node->next != NULL; node = node->next)
00075 {
00076 contig = node->val;
00077 if (!contig->isComplete)
00078 {
00079 errAbort("Contig %s:%d-%d %c %s:%d-%d %c isn't complete",
00080 contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00081 contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00082 }
00083 }
00084 }
00085
00086 static int cmpContigQueryStart(const void *va, const void *vb)
00087
00088
00089
00090 {
00091 struct contig **pA = (struct contig **)va;
00092 struct contig **pB = (struct contig **)vb;
00093 struct contig *a = *pA, *b = *pB;
00094 int dif;
00095
00096 dif = a->qStart - b->qStart;
00097 if (dif == 0)
00098
00099 dif = b->qEnd - a->qEnd;
00100 return dif;
00101 }
00102
00103 static void writeContig(struct contig *contig, FILE *out,
00104 boolean compactOutput, boolean newFormat)
00105
00106 {
00107 int i;
00108 int size = contig->symCount;
00109 #define maxLineSize 50
00110 char buf[maxLineSize+1];
00111 int lineSize;
00112 int j;
00113 int qIx = 0, tIx = 0;
00114 char *qSyms = contig->qSym;
00115 char *tSyms = contig->tSym;
00116
00117 if (newFormat)
00118 {
00119 fprintf(out, "%s align %d.%d%% of %d %s %s:%d-%d %c %s:%d-%d %c\n",
00120 contig->name, contig->score/10, contig->score%10, size,
00121 contig->queryFile, contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00122 contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00123 }
00124 else
00125 {
00126 fprintf(out, "%s align %d.%d%% of %d %s:%d-%d %c %s:%d-%d %c\n",
00127 contig->name, contig->score/10, contig->score%10, size,
00128 contig->query, contig->qStart, contig->qEnd, contig->qStrand,
00129 contig->target, contig->tStart, contig->tEnd, contig->tStrand);
00130 }
00131 if (compactOutput)
00132 {
00133 mustWrite(out, contig->qSym, size);
00134 fputc('\n', out);
00135 mustWrite(out, contig->tSym, size);
00136 fputc('\n', out);
00137 mustWrite(out, contig->hSym, size);
00138 fputc('\n', out);
00139 }
00140 else
00141 {
00142 for (i=0; i<size; i += lineSize)
00143 {
00144
00145 lineSize = size - i;
00146 if (lineSize > maxLineSize)
00147 lineSize = maxLineSize;
00148 buf[lineSize] = 0;
00149
00150
00151 memcpy(buf, contig->qSym + i, lineSize);
00152 fprintf(out, "%5d %s\n", qIx, buf);
00153
00154
00155 fprintf(out, "%5s ", "");
00156 for (j=0; j<lineSize; ++j)
00157 {
00158 int off = j+i;
00159 char q = qSyms[off];
00160 char t = tSyms[off];
00161 char c = (q == t ? '|' : ' ');
00162 fputc(c, out);
00163 }
00164 fputc('\n', out);
00165
00166
00167 memcpy(buf, contig->tSym + i, lineSize);
00168 fprintf(out, "%5d %s\n", tIx, buf);
00169
00170
00171 memcpy(buf, contig->hSym + i, lineSize);
00172 fprintf(out, "%5s %s\n", "", buf);
00173
00174
00175 for (j=0; j<lineSize; ++j)
00176 {
00177 int off = j+i;
00178 char q = qSyms[off];
00179 char t = tSyms[off];
00180 if (q != '-')
00181 ++qIx;
00182 if (t != '-')
00183 ++tIx;
00184 }
00185
00186 fputc('\n', out);
00187 }
00188 }
00189 #undef lineSize
00190 }
00191
00192 struct cluster
00193
00194 {
00195 struct dlList *contigs;
00196 char *query;
00197 int qStart, qEnd;
00198 int qOffset, qEndOffset;
00199 char qStrand;
00200 char *target;
00201 int tStart, tEnd;
00202 int tOffset, tEndOffset;
00203 char tStrand;
00204 int score;
00205 };
00206
00207 #ifdef UNUSED
00208 static int cmpCluster(const void *va, const void *vb)
00209
00210 {
00211 struct cluster **pA = (struct cluster **)va;
00212 struct cluster **pB = (struct cluster **)vb;
00213 struct cluster *a = *pA, *b = *pB;
00214
00215 return b->score - a->score;
00216 }
00217 #endif
00218
00219 static boolean contigsOverlap(struct contig *a, struct contig *b,
00220 int *retQueryOverlap, int *retTargetOverlap)
00221
00222 {
00223 int overlap;
00224 int start, end;
00225
00226
00227 start = max(a->qStart, b->qStart);
00228 end = min(a->qEnd, b->qEnd);
00229 overlap = end - start;
00230 if (overlap > 0)
00231 *retQueryOverlap = overlap;
00232 else
00233 return FALSE;
00234
00235
00236 start = max(a->tStart, b->tStart);
00237 end = min(a->tEnd, b->tEnd);
00238 overlap = end - start;
00239 if (overlap > 0)
00240 *retTargetOverlap = overlap;
00241 else
00242 return FALSE;
00243 return TRUE;
00244 }
00245
00246 static int findSymIx(char *qSym, int qIx, int *retSymIx)
00247
00248 {
00249 char q;
00250 int symIx;
00251 int qCount = 0;
00252
00253 for (symIx = 0; ; ++symIx)
00254 {
00255 if (qCount == qIx)
00256 {
00257 *retSymIx = symIx;
00258 return TRUE;
00259 }
00260 if ((q = *qSym++) == 0)
00261 {
00262 warn("findSymIx gone past end");
00263 return FALSE;
00264 }
00265 if (q != '-')
00266 ++qCount;
00267 }
00268 }
00269
00270 static boolean mergeIntoAdjacentClusters(struct dlList *clusterList, struct dlNode *contigNode)
00271
00272
00273 {
00274 struct contig *contig = contigNode->val;
00275 struct dlNode *clusterNode;
00276 struct cluster *cluster;
00277
00278 for (clusterNode = clusterList->head; clusterNode->next != NULL; clusterNode = clusterNode->next)
00279 {
00280 cluster = clusterNode->val;
00281 if ((contig->qOffset == cluster->qEndOffset - 1000 ||
00282 (contig->qEndOffset == cluster->qEndOffset &&
00283 contig->tEndOffset == cluster->tEndOffset)) &&
00284 contig->qStrand == cluster->qStrand &&
00285 contig->tStrand == cluster->tStrand &&
00286 sameString(contig->target, cluster->target) &&
00287 sameString(contig->query, cluster->query))
00288 {
00289 if (contig->qStrand == contig->tStrand)
00290 {
00291 if (contig->tOffset < cluster->tEndOffset + 50000
00292 && contig->tOffset >= cluster->tOffset-300)
00293 {
00294 cluster->qEndOffset = contig->qEndOffset;
00295 cluster->tEndOffset = contig->tEndOffset;
00296 cluster->score += contig->score;
00297 dlAddTail(cluster->contigs, contigNode);
00298 return TRUE;
00299 }
00300 }
00301 else
00302 {
00303 if ( contig->tEndOffset > cluster->tOffset - 50000
00304 && contig->tEndOffset <= cluster->tEndOffset+300)
00305 {
00306 cluster->qEndOffset = contig->qEndOffset;
00307 cluster->tOffset = contig->tOffset;
00308 cluster->score += contig->score;
00309 dlAddTail(cluster->contigs, contigNode);
00310 return TRUE;
00311 }
00312 }
00313 }
00314 }
00315 return FALSE;
00316 }
00317
00318 static void maxAlignedOverlap(char *a1, char *a2, char *b1, char *b2, int size,
00319 int *retStart, int *retOverlapSize)
00320
00321
00322 {
00323 int maxStart = -1;
00324 int maxLen = -1;
00325 int curStart = 0;
00326 int curLen = 0;
00327 int i;
00328
00329 for (i=0; i<size; ++i)
00330 {
00331 if (a1[i] == b1[i] && a2[i] == b2[i])
00332 {
00333 curLen += 1;
00334 if (curLen > maxLen)
00335 {
00336 maxLen = curLen;
00337 maxStart = curStart;
00338 }
00339 }
00340 else
00341 {
00342 curLen = 0;
00343 curStart = i+1;
00344 }
00345 }
00346 *retStart = maxStart;
00347 *retOverlapSize = maxLen;
00348 }
00349
00350 static void findMaxOverlap(char *a1, char *a2, int aSize,
00351 char *b1, char *b2, int bSize,
00352 int *retAstart, int *retBstart, int *retOverlapSize)
00353
00354 {
00355 int i;
00356 int maxOverlap = -1;
00357 int maxAstart = 0;
00358 int maxBstart = 0;
00359 int overlap;
00360 int minSize = min(aSize, bSize);
00361 int goodEnough = minSize-1;
00362 int start;
00363
00364 for (i=0; i<bSize; ++i)
00365 {
00366 int size = bSize-i;
00367 if (size > aSize)
00368 size = aSize;
00369 maxAlignedOverlap(a1, a2, b1+i, b2+i, size, &start, &overlap);
00370 if (overlap > maxOverlap)
00371 {
00372 maxOverlap = overlap;
00373 maxAstart = start;
00374 maxBstart = i+start;
00375 if (maxOverlap >= goodEnough)
00376 goto RETURN_VALS;
00377 }
00378 }
00379 for (i=1; i<aSize; ++i)
00380 {
00381 int size = aSize-i;
00382 if (size > bSize)
00383 size = bSize;
00384 maxAlignedOverlap(a1+i, a2+i, b1, b2, size, &start, &overlap);
00385 if (overlap > maxOverlap)
00386 {
00387 maxOverlap = overlap;
00388 maxAstart = i+start;
00389 maxBstart = start;
00390 if (maxOverlap >= goodEnough)
00391 goto RETURN_VALS;
00392 }
00393 }
00394 RETURN_VALS:
00395 {
00396 *retAstart = maxAstart;
00397 *retBstart = maxBstart;
00398 *retOverlapSize = maxOverlap;
00399 }
00400 }
00401
00402 static struct contig *forwardMergeTwo(struct contig *a, struct contig *b)
00403
00404
00405 {
00406 int aSymOverlapStart, aSymOverlapEnd, aSymOverlapSize;
00407 int bSymOverlapStart, bSymOverlapEnd, bSymOverlapSize;
00408 int aqoStart, bqoStart;
00409 int overlapSize, halfOverlapSize;
00410 int aCopyStart, aCopyEnd, bCopyStart, bCopyEnd;
00411 int newSymCount;
00412 struct contig *c;
00413 int aCopySize, bCopySize;
00414 int qOverlap;
00415
00416
00417 qOverlap = b->qStart - a->qStart;
00418 if (qOverlap < 0 || qOverlap > a->qEnd - a->qStart)
00419 return NULL;
00420 if (!findSymIx(a->qSym, b->qStart - a->qStart, &aSymOverlapStart))
00421 return NULL;
00422 aSymOverlapEnd = a->symCount;
00423 aSymOverlapSize = aSymOverlapEnd - aSymOverlapStart;
00424 bSymOverlapStart = 0;
00425 if (!findSymIx(b->qSym, a->qEnd - b->qStart, &bSymOverlapEnd))
00426 return NULL;
00427 bSymOverlapSize = bSymOverlapEnd - bSymOverlapStart;
00428
00429
00430
00431 findMaxOverlap(a->qSym + aSymOverlapStart,
00432 a->tSym + aSymOverlapStart, aSymOverlapSize,
00433 b->qSym + bSymOverlapStart,
00434 b->tSym + bSymOverlapStart, bSymOverlapSize,
00435 &aqoStart, &bqoStart, &overlapSize);
00436 if (overlapSize < 15)
00437 return NULL;
00438 aSymOverlapStart += aqoStart;
00439 bSymOverlapStart += bqoStart;
00440 halfOverlapSize = overlapSize/2;
00441
00442
00443
00444 aCopyStart = 0;
00445 aCopyEnd = aSymOverlapStart + halfOverlapSize;
00446 bCopyStart = bSymOverlapStart + halfOverlapSize;
00447 bCopyEnd = b->symCount;
00448 newSymCount = aCopyEnd - aCopyStart + bCopyEnd - bCopyStart;
00449
00450
00451
00452 AllocVar(c);
00453 c->query = a->query;
00454 c->queryFile = a->queryFile;
00455 c->qStart = min(a->qStart, b->qStart);
00456 c->qEnd = max(a->qEnd, b->qEnd);
00457 c->qStrand = a->qStrand;
00458 c->target = cloneString(a->target);
00459 c->tStart = a->tStart;
00460 c->tEnd = b->tEnd;
00461 c->tStrand = a->tStrand;
00462 c->qSym = needMem(newSymCount);
00463 c->tSym = needMem(newSymCount);
00464 c->hSym = needMem(newSymCount);
00465 c->symCount = newSymCount;
00466
00467
00468
00469 aCopySize = aCopyEnd - aCopyStart;
00470 bCopySize = bCopyEnd - bCopyStart;
00471 memcpy(c->qSym, a->qSym+aCopyStart, aCopySize);
00472 memcpy(c->tSym, a->tSym+aCopyStart, aCopySize);
00473 memcpy(c->hSym, a->hSym+aCopyStart, aCopySize);
00474
00475
00476
00477 memcpy(c->qSym+aCopySize, b->qSym+bCopyStart, bCopySize);
00478 memcpy(c->tSym+aCopySize, b->tSym+bCopyStart, bCopySize);
00479 memcpy(c->hSym+aCopySize, b->hSym+bCopyStart, bCopySize);
00480
00481 return c;
00482 }
00483
00484 static void rcContigOffsets(struct contig *a, int revOff)
00485
00486 {
00487 int temp;
00488
00489 temp = a->qStart;
00490 a->qStart = revOff - a->qEnd;
00491 a->qEnd = revOff - temp;
00492 temp = a->tStart;
00493 a->tStart = revOff - a->tEnd;
00494 a->tEnd = revOff - temp;
00495 }
00496
00497 static void rcQueryOffsetsAroundSeg(struct contig *a, int segStart, int segEnd)
00498
00499 {
00500 int segSize = segEnd - segStart;
00501 int qEnd = reverseOffset(a->qStart - segStart, segSize) + segStart + 1;
00502 int qStart = reverseOffset(a->qEnd - segStart, segSize) + segStart + 1;
00503 a->qStart = qStart;
00504 a->qEnd = qEnd;
00505 }
00506
00507 static struct contig *mergeTwo(struct contig *a, struct contig *b)
00508
00509
00510 {
00511 int aStart, bEnd, revOff;
00512 struct contig *c;
00513 int tStart, tEnd;
00514 int qStart, qEnd;
00515
00516
00517 if (a->qStrand == a->tStrand)
00518 return forwardMergeTwo(a,b);
00519
00520
00521
00522 aStart = a->qStart;
00523 bEnd = b->qEnd;
00524 revOff = aStart + bEnd;
00525
00526
00527 tStart = b->tStart;
00528 tEnd = a->tEnd;
00529
00530
00531 qStart = a->qStart;
00532 qEnd = b->qEnd;
00533
00534
00535
00536 rcContigOffsets(a, revOff);
00537 rcContigOffsets(b, revOff);
00538 c = forwardMergeTwo(b,a);
00539 rcContigOffsets(a, revOff);
00540 rcContigOffsets(b, revOff);
00541 if (c != NULL)
00542 {
00543 rcContigOffsets(c, revOff);
00544 c->tStart = tStart;
00545 c->tEnd = tEnd;
00546 assert(c->qStart == qStart && c->qEnd == qEnd);
00547 }
00548 return c;
00549 }
00550
00551
00552 static void mergeWithinCluster(struct cluster *cluster)
00553
00554 {
00555 int clusterSize = dlCount(cluster->contigs);
00556 struct dlNode *prevNode, *curNode, *nextNode;
00557 struct contig *prev, *cur, *merged;
00558 int tOverlap, qOverlap;
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568 if (clusterSize < 2)
00569 return;
00570 curNode = cluster->contigs->head;
00571 cur = curNode->val;
00572 nextNode = curNode->next;
00573
00574 for (;;)
00575 {
00576
00577
00578 prevNode = curNode;
00579 curNode = nextNode;
00580 if ((nextNode = curNode->next) == NULL)
00581 break;
00582 prev = cur;
00583 cur = curNode->val;
00584
00585
00586 if (contigsOverlap(prev, cur, &qOverlap, &tOverlap))
00587 {
00588 if ((merged = mergeTwo(prev, cur)) == NULL)
00589 warn("Couldn't merge two");
00590 else
00591 {
00592 dlRemove(prevNode);
00593 freeMem(prevNode);
00594 freeContig(&cur);
00595 freeContig(&prev);
00596 curNode->val = cur = merged;
00597 }
00598 }
00599 }
00600 }
00601
00602 static boolean clusterEncompassedBefore(struct dlList *clusterList, struct cluster *cluster)
00603
00604
00605 {
00606 struct dlNode *node;
00607 struct cluster *c;
00608
00609 for (node = clusterList->head; node->next != NULL; node = node->next)
00610 {
00611 c = node->val;
00612 if (c == cluster)
00613 return FALSE;
00614 if (c->qStart <= cluster->qStart && c->qEnd >= cluster->qEnd &&
00615 c->tStart <= cluster->tStart && c->tEnd >= cluster->tEnd &&
00616 c->qStrand == cluster->qStrand &&
00617 c->tStrand == cluster->tStrand &&
00618 sameString(c->query, cluster->query) &&
00619 sameString(c->target, cluster->target) )
00620 {
00621 return TRUE;
00622 }
00623 }
00624 assert(FALSE);
00625 return FALSE;
00626 }
00627
00628 static void clusterRemoveEncompassed(struct dlList *clusterList)
00629
00630
00631 {
00632 struct dlNode *node, *next;
00633 struct cluster *cluster;
00634
00635 for (node = clusterList->head; node->next != NULL; node = next)
00636 {
00637 next = node->next;
00638 cluster = node->val;
00639 if (clusterEncompassedBefore(clusterList, cluster))
00640 {
00641 dlRemove(node);
00642 freeMem(node);
00643 freeContigList(&cluster->contigs);
00644 freeMem(cluster);
00645 }
00646 }
00647 }
00648
00649 static boolean contigEncompassedBefore(struct dlList *contigList, struct contig *contig)
00650
00651
00652 {
00653 struct dlNode *node;
00654 struct contig *c;
00655
00656 for (node = contigList->head; node->next != NULL; node = node->next)
00657 {
00658 c = node->val;
00659 if (c == contig)
00660 return FALSE;
00661 if (c->qStart <= contig->qStart && c->qEnd >= contig->qEnd &&
00662 c->tStart <= contig->tStart && c->tEnd >= contig->tEnd &&
00663 c->qStrand == contig->qStrand &&
00664 c->tStrand == contig->tStrand &&
00665 sameString(c->query, contig->query) &&
00666 sameString(c->target, contig->target) )
00667 {
00668 return TRUE;
00669 }
00670 }
00671 assert(FALSE);
00672 return FALSE;
00673 }
00674
00675 static void contigRemoveEncompassed(struct dlList *contigList)
00676
00677
00678 {
00679 struct dlNode *node, *next;
00680 struct contig *contig;
00681
00682 for (node = contigList->head; node->next != NULL; node = next)
00683 {
00684 next = node->next;
00685 contig = node->val;
00686 if (contigEncompassedBefore(contigList, contig))
00687 {
00688 dlRemove(node);
00689 freeMem(node);
00690 freeContig(&contig);
00691 }
00692 }
00693 }
00694
00695 static int milliMatchScore(struct contig *contig)
00696
00697 {
00698 int symCount = contig->symCount;
00699 char *qSym = contig->qSym;
00700 char *tSym = contig->tSym;
00701 int matchCount = 0;
00702 int i;
00703
00704 for (i=0; i<symCount; ++i)
00705 {
00706 if (qSym[i] == tSym[i])
00707 ++matchCount;
00708 }
00709 return round( (1000.0 * matchCount) / symCount);
00710 }
00711
00712 static void mergeContigs(struct dlList *contigList, int stitchMinScore)
00713
00714
00715 {
00716 struct dlList *clusterList;
00717 struct dlNode *contigNode, *clusterNode;
00718 struct dlNode *nextClusterNode;
00719 struct cluster *cluster;
00720 struct contig *contig;
00721 int contigCount;
00722 int startCount = 0;
00723
00724
00725
00726 contigCount = dlCount(contigList);
00727 if (contigCount < 1)
00728 return;
00729
00730 clusterList = newDlList();
00731
00732
00733
00734 while ((contigNode = dlPopHead(contigList)) != NULL)
00735 {
00736 if (!mergeIntoAdjacentClusters(clusterList, contigNode))
00737 {
00738 AllocVar(cluster);
00739 contig = contigNode->val;
00740 cluster->contigs = newDlList();
00741 dlAddTail(cluster->contigs, contigNode);
00742 cluster->query = contig->query;
00743 cluster->qStart = contig->qStart;
00744 cluster->qEnd = contig->qEnd;
00745 cluster->qOffset = contig->qOffset;
00746 cluster->qEndOffset = contig->qEndOffset;
00747 cluster->qStrand = contig->qStrand;
00748 cluster->target = contig->target;
00749 cluster->tStart = contig->tStart;
00750 cluster->tEnd = contig->tEnd;
00751 cluster->tOffset = contig->tOffset;
00752 cluster->tEndOffset = contig->tEndOffset;
00753 cluster->tStrand = contig->tStrand;
00754 cluster->score = contig->score;
00755 dlAddValTail(clusterList, cluster);
00756 }
00757 }
00758
00759
00760 for (clusterNode = clusterList->head; clusterNode->next != NULL;
00761 clusterNode = nextClusterNode)
00762 {
00763 nextClusterNode = clusterNode->next;
00764 cluster = clusterNode->val;
00765 if (cluster->score < stitchMinScore)
00766 {
00767 dlRemove(clusterNode);
00768 freeContigList(&cluster->contigs);
00769 freeMem(cluster);
00770 freeMem(clusterNode);
00771 }
00772 }
00773
00774 clusterRemoveEncompassed(clusterList);
00775
00776
00777
00778 for (clusterNode = clusterList->head; clusterNode->next != NULL; clusterNode = clusterNode->next)
00779 {
00780 cluster = clusterNode->val;
00781 mergeWithinCluster(cluster);
00782 while ((contigNode = dlPopHead(cluster->contigs)) != NULL)
00783 dlAddTail(contigList, contigNode);
00784 freeDlList(&cluster->contigs);
00785 }
00786
00787 dlSort(contigList, cmpContigQueryStart);
00788 contigRemoveEncompassed(contigList);
00789
00790 for (contigNode = contigList->head; contigNode->next != NULL; contigNode = contigNode->next)
00791 {
00792 char nameBuf[128];
00793 char *noDir;
00794 char *noDot;
00795 contig = contigNode->val;
00796
00797
00798 noDir = strrchr(contig->query, '\\');
00799 if (noDir == NULL)
00800 noDir = strrchr(contig->query, '/');
00801 if (noDir == NULL)
00802 noDir = contig->query;
00803 else
00804 noDir += 1;
00805 noDot = strrchr(noDir, '.');
00806 if (noDot != NULL)
00807 *noDot = 0;
00808 sprintf(nameBuf, "%s.c%d", noDir, ++startCount);
00809 contig->name = cloneString(nameBuf);
00810 contig->score = milliMatchScore(contig);
00811 }
00812
00813
00814 freeDlListAndVals(&clusterList);
00815 }
00816
00817 static void finishContigs( struct dlList **pContigList,
00818 FILE *out, int stitchMinScore, boolean compactOutput, boolean newFormat)
00819
00820
00821 {
00822 struct contig *contig;
00823 struct dlNode *node;
00824
00825 checkComplete(*pContigList);
00826 mergeContigs(*pContigList, stitchMinScore);
00827 for (node = (*pContigList)->head; node->next != NULL; node = node->next)
00828 {
00829 contig = node->val;
00830 writeContig(contig, out, compactOutput, newFormat);
00831 }
00832 freeContigList(pContigList);
00833 *pContigList = newDlList();
00834 }
00835
00836 static int countNonGap(char *s, int size)
00837
00838 {
00839 int count = 0;
00840 while (--size >= 0)
00841 if (*s++ != '-') ++count;
00842 return count;
00843 }
00844
00845 static void xStitch(char *inName, FILE *out, int stitchMinScore, boolean compactOutput,
00846 boolean newFormat)
00847
00848
00849 {
00850 FILE *in;
00851 char line[512];
00852 int lineCount = 0;
00853 char *words[64];
00854 int wordCount;
00855 char *firstWord;
00856 char *queryName = "";
00857 char *queryFile = "";
00858 struct hash *queryFileHash = newHash(0);
00859 struct contig *contig = NULL;
00860 struct dlList *contigList = newDlList();
00861 int lineState = 0;
00862 struct slName *queryNameList = NULL;
00863 int maxSymCount = 64*1024;
00864 char *qSymBuf = needMem(maxSymCount+1);
00865 char *tSymBuf = needMem(maxSymCount+1);
00866 char *hSymBuf = needMem(maxSymCount+1);
00867 int symCount = 0;
00868 int qSymLen = 0, tSymLen = 0, hSymLen = 0;
00869
00870 in = mustOpen(inName, "r");
00871 while (fgets(line, sizeof(line), in))
00872 {
00873 ++lineCount;
00874 if (++lineState == 5)
00875 lineState = 0;
00876 wordCount = chopLine(line, words);
00877 if (wordCount <= 0)
00878 continue;
00879 firstWord = words[0];
00880 if (sameString(firstWord, "Aligning"))
00881 {
00882 char *queryString;
00883 char *targetString;
00884 char queryStrand, targetStrand;
00885 char *parts[8];
00886 int partCount;
00887
00888
00889 if (newFormat)
00890 {
00891 if (wordCount < 7)
00892 errAbort("Short line %d of %s", lineCount, inName);
00893 queryFile = hashStoreName(queryFileHash, words[1]);
00894 queryString = words[2];
00895 queryStrand = words[3][0];
00896 targetString = words[5];
00897 targetStrand = words[6][0];
00898 }
00899 else
00900 {
00901 if (wordCount < 6)
00902 errAbort("Short line %d of %s", lineCount, inName);
00903 queryString = words[1];
00904 queryStrand = words[2][0];
00905 targetString = words[4];
00906 targetStrand = words[5][0];
00907 }
00908
00909
00910
00911
00912 partCount = chopString(queryString, ":-", parts, ArraySize(parts));
00913 if (!sameString(parts[0], queryName))
00914 {
00915
00916 struct slName *newName = newSlName(parts[0]);
00917 slAddHead(&queryNameList, newName);
00918
00919
00920 finishContigs(&contigList, out, stitchMinScore, compactOutput, newFormat);
00921 queryName = newName->name;
00922 }
00923
00924
00925
00926 AllocVar(contig);
00927 dlAddValTail(contigList, contig);
00928 contig->queryFile = queryFile;
00929 contig->query = queryName;
00930 contig->qOffset = atoi(parts[1]);
00931 contig->qEndOffset = atoi(parts[2]);
00932 contig->qStrand = queryStrand;
00933
00934
00935 chopString(targetString, ":-", parts, ArraySize(parts));
00936 contig->target = cloneString(parts[0]);
00937 contig->tOffset = atoi(parts[1]);
00938 contig->tEndOffset = atoi(parts[2]);
00939 contig->tStrand = targetStrand;
00940
00941
00942
00943 contig->qStart = contig->tStart = 0x3fffffff;
00944
00945 lineState = -1;
00946 symCount = 0;
00947 }
00948 else if (sameString(firstWord, "best"))
00949 {
00950 if (wordCount < 3)
00951 errAbort("Short line %d of %s", lineCount, inName);
00952 contig->score = atoi(words[2]);
00953 contig->isComplete = TRUE;
00954 contig->qSym = cloneStringZ(qSymBuf, symCount);
00955 contig->tSym = cloneStringZ(tSymBuf, symCount);
00956 contig->hSym = cloneStringZ(hSymBuf, symCount);
00957 contig->symCount = symCount;
00958 contig->qEnd = contig->qStart + countNonGap(qSymBuf, symCount);
00959 contig->tEnd = contig->tStart + countNonGap(tSymBuf, symCount);
00960 if (contig->qStrand != contig->tStrand)
00961 rcQueryOffsetsAroundSeg(contig, contig->qOffset, contig->qEndOffset);
00962 }
00963 else if (wordCount > 1 && (isdigit(firstWord[0]) || firstWord[0] == '-'))
00964 {
00965 int start, end;
00966 char *sym = words[1];
00967 int symLen = strlen(sym);
00968 char firstChar = firstWord[0];
00969 if (lineState != 0 && lineState != 2)
00970 errAbort("Bummer - phasing mismatch on lineState line %d of %s!\n", lineCount, inName);
00971 assert(lineState == 0 || lineState == 2);
00972 start = atoi(firstWord);
00973 end = start + symLen;
00974 if (symCount + symLen > maxSymCount)
00975 {
00976 errAbort("Single contig too long line %d of %s, can only handle up to %d symbols\n",
00977 lineCount, inName, maxSymCount);
00978 }
00979 if (lineState == 0)
00980 {
00981 qSymLen = symLen;
00982 if (isdigit(firstChar))
00983 {
00984 start += contig->qOffset;
00985 end += contig->qOffset;
00986 contig->qStart = min(contig->qStart, start);
00987 contig->qEnd = max(contig->qEnd, end);
00988 }
00989 memcpy(qSymBuf+symCount, sym, symLen);
00990 }
00991 else
00992 {
00993 tSymLen = symLen;
00994 if (tSymLen != qSymLen)
00995 {
00996 errAbort("Target symbol size not same as query line %d of %s",
00997 lineCount, inName);
00998 }
00999 if (isdigit(firstChar))
01000 {
01001 start += contig->tOffset;
01002 end += contig->tOffset;
01003 contig->tStart = min(contig->tStart, start);
01004 }
01005 memcpy(tSymBuf+symCount, sym, symLen);
01006 }
01007 }
01008 else if (firstWord[0] == '(')
01009 {
01010 lineState = -1;
01011 }
01012 else
01013 {
01014 assert(lineState == 1 || lineState == 3);
01015 if (lineState == 3)
01016 {
01017 char *sym = firstWord;
01018 int symLen = strlen(sym);
01019 hSymLen = symLen;
01020 if (hSymLen != qSymLen)
01021 {
01022 errAbort("Hidden symbol size not same as query line %d of %s",
01023 lineCount, inName);
01024 }
01025 memcpy(hSymBuf+symCount, sym, symLen);
01026 symCount += symLen;
01027 }
01028 }
01029 }
01030 finishContigs(&contigList, out, stitchMinScore, compactOutput, newFormat);
01031 fclose(in);
01032
01033 slFreeList(&queryNameList);
01034 freeHash(&queryFileHash);
01035 freeMem(qSymBuf);
01036 freeMem(tSymBuf);
01037 freeMem(hSymBuf);
01038 }
01039
01040 void xenStitch(char *inName, FILE *out, int stitchMinScore, boolean compactOutput)
01041
01042
01043 {
01044 xStitch(inName, out, stitchMinScore, compactOutput, TRUE);
01045 }
01046
01047 void xenStitcher(char *inName, FILE *out, int stitchMinScore, boolean compactOutput)
01048
01049
01050 {
01051 xStitch(inName, out, stitchMinScore, compactOutput, FALSE);
01052 }
01053
01054
01055 void xenAlignBig(DNA *query, int qSize, DNA *target, int tSize, FILE *f, boolean forHtml)
01056
01057 {
01058 struct crudeAli *caList, *ca;
01059 struct nt4Seq *nt4 = newNt4(target, tSize, "target");
01060 struct tempName dynTn;
01061 int i;
01062 int lqSize;
01063 int qMaxSize = 2000;
01064 int tMaxSize = 2*qMaxSize;
01065 int lastQsection = qSize - qMaxSize/2;
01066 FILE *dynFile;
01067 int tMin, tMax, tMid;
01068 int score;
01069 int totalAli = 0;
01070 double totalSize;
01071
01072 totalSize = (double)qSize * tSize;
01073 if (totalSize < 10000000.0)
01074 {
01075 xenAlignSmall(query, qSize, target, tSize, f, forHtml);
01076 return;
01077 }
01078 makeTempName(&dynTn, "xen", ".dyn");
01079 dynFile = mustOpen(dynTn.forCgi, "w");
01080
01081 for (i = 0; i<lastQsection || i == 0; i += qMaxSize/2)
01082 {
01083 lqSize = qSize - i;
01084 if (lqSize > qMaxSize)
01085 lqSize = qMaxSize;
01086 caList = crudeAliFind(query+i, lqSize, &nt4, 1, 8, 12);
01087 if (forHtml)
01088 {
01089 fputc('.', stdout);
01090 fflush(stdout);
01091 }
01092 for (ca = caList; ca != NULL; ca = ca->next)
01093 {
01094 tMid = (ca->start + ca->end)/2;
01095 tMin = tMid-tMaxSize/2;
01096 tMax = tMin + tMaxSize;
01097 if (tMin < 0)
01098 tMin = 0;
01099 if (tMax > tSize)
01100 tMax = tSize;
01101
01102 fprintf(dynFile, "Aligning query:%d-%d %c to target:%d-%d +\n",
01103 i, i+lqSize, ca->strand, tMin, tMax);
01104 if (ca->strand == '-')
01105 {
01106 reverseComplement(query+i, lqSize);
01107 }
01108 score = xenAlignSmall(query+i, lqSize, target+tMin, tMax-tMin, dynFile, FALSE);
01109 if (ca->strand == '-')
01110 reverseComplement(query+i, lqSize);
01111 fprintf(dynFile, "best score %d\n", score);
01112 if (forHtml)
01113 {
01114 fputc('.', stdout);
01115 fflush(stdout);
01116 }
01117 ++totalAli;
01118 }
01119 slFreeList(&caList);
01120 }
01121
01122 freeNt4(&nt4);
01123 fclose(dynFile);
01124 if (forHtml)
01125 {
01126 printf("\n %d alignments to stitch.\n", totalAli);
01127 }
01128 xenStitcher(dynTn.forCgi, f, 150000, FALSE);
01129 remove(dynTn.forCgi);
01130 }
01131
01132
01133 void xenAlignWorm(DNA *query, int qSize, FILE *f, boolean forHtml)
01134
01135 {
01136 struct crudeAli *caList = NULL, *ca, *newCa;
01137 struct tempName dynTn;
01138 int i;
01139 int lqSize;
01140 int qMaxSize = 2000;
01141 int tMaxSize = 2*qMaxSize;
01142 int lastQsection = qSize - qMaxSize/2;
01143 FILE *dynFile;
01144 int tMin, tMax, tMid, tSize;
01145 int score;
01146 int chromCount;
01147 char **chromNames;
01148 struct nt4Seq **chrom;
01149 DNA *target;
01150 int sectionCount = 0;
01151
01152
01153 wormLoadNt4Genome(&chrom, &chromCount);
01154 wormChromNames(&chromNames, &chromCount);
01155 for (i = 0; i<lastQsection || i == 0; i += qMaxSize/2)
01156 {
01157 if (forHtml)
01158 {
01159 fputc('.', stdout);
01160 fflush(stdout);
01161 }
01162 lqSize = qSize - i;
01163 if (lqSize > qMaxSize)
01164 lqSize = qMaxSize;
01165 newCa = crudeAliFind(query+i, lqSize, chrom, chromCount, 8, 45);
01166 for (ca = newCa; ca != NULL; ca = ca->next)
01167 {
01168 ca->qStart = i;
01169 ca->qEnd = i+lqSize;
01170 }
01171 caList = slCat(caList,newCa);
01172 ++sectionCount;
01173 }
01174 wormFreeNt4Genome(&chrom);
01175
01176
01177 makeTempName(&dynTn, "xen", ".dyn");
01178 dynFile = mustOpen(dynTn.forCgi, "w");
01179
01180
01181 if (forHtml)
01182 {
01183 int crudeCount = slCount(caList);
01184 if (crudeCount > 2*sectionCount)
01185 {
01186 printf("\nThis is a difficult alignment. It looks like the query aligns with\n"
01187 "the <I>C. elegans</I> genome in %d places. It will take about %1.0f more\n"
01188 "minutes to finish.\n", (crudeCount+sectionCount/2)/sectionCount, crudeCount*0.5);
01189 }
01190 fflush(stdout);
01191 }
01192 for (ca = caList; ca != NULL; ca = ca->next)
01193 {
01194 if (forHtml)
01195 {
01196 fputc('.', stdout);
01197 fflush(stdout);
01198 }
01199 tMid = (ca->start + ca->end)/2;
01200 tMin = tMid-tMaxSize/2;
01201 tMax = tMin + tMaxSize;
01202 wormClipRangeToChrom(chromNames[ca->chromIx], &tMin, &tMax);
01203 fprintf(dynFile, "Aligning query:%d-%d %c to %s:%d-%d +\n",
01204 ca->qStart, ca->qEnd, ca->strand, chromNames[ca->chromIx], tMin, tMax);
01205 lqSize = ca->qEnd - ca->qStart;
01206 if (ca->strand == '-')
01207 reverseComplement(query+ca->qStart, lqSize);
01208 tSize = tMax - tMin;
01209 target = wormChromPart(chromNames[ca->chromIx], tMin, tSize);
01210 score = xenAlignSmall(query+ca->qStart, lqSize, target, tSize, dynFile, FALSE);
01211 freez(&target);
01212 if (ca->strand == '-')
01213 reverseComplement(query+ca->qStart, lqSize);
01214 fprintf(dynFile, "best score %d\n", score);
01215 }
01216 slFreeList(&caList);
01217 fclose(dynFile);
01218
01219 if (forHtml)
01220 printf("\n");
01221 xenStitcher(dynTn.forCgi, f, 150000, FALSE);
01222 remove(dynTn.forCgi);
01223 }
01224