00001
00002
00003
00004
00005 #include "common.h"
00006 #include "net.h"
00007 #include "linefile.h"
00008 #include "sqlNum.h"
00009 #include "dnaseq.h"
00010 #include "fa.h"
00011 #include "fuzzyFind.h"
00012 #include "supStitch.h"
00013 #include "genoFind.h"
00014 #include "gfInternal.h"
00015 #include "errabort.h"
00016 #include "nib.h"
00017 #include "twoBit.h"
00018 #include "trans3.h"
00019
00020
00021 static char const rcsid[] = "$Id: gfBlatLib.c,v 1.25 2007/04/20 22:44:11 kent Exp $";
00022
00023 static int ssAliCount = 16;
00024
00025 #ifdef DEBUG
00026 void dumpRange(struct gfRange *r, FILE *f)
00027
00028 {
00029 fprintf(f, "%d-%d %s %d-%d, hits %d\n", r->qStart, r->qEnd, r->tName, r->tStart, r->tEnd, r->hitCount);
00030 }
00031
00032 void dumpRangeList(struct gfRange *rangeList, FILE *f)
00033
00034 {
00035 struct gfRange *range;
00036 for (range = rangeList; range != NULL; range = range->next)
00037 dumpRange(range, f);
00038 }
00039 #endif
00040
00041 void gfRangeFree(struct gfRange **pEl)
00042
00043
00044 {
00045 struct gfRange *el;
00046
00047 if ((el = *pEl) == NULL) return;
00048 freeMem(el->tName);
00049 if (el->components != NULL)
00050 gfRangeFreeList(&el->components);
00051 freez(pEl);
00052 }
00053
00054 void gfRangeFreeList(struct gfRange **pList)
00055
00056 {
00057 struct gfRange *el, *next;
00058
00059 for (el = *pList; el != NULL; el = next)
00060 {
00061 next = el->next;
00062 gfRangeFree(&el);
00063 }
00064 *pList = NULL;
00065 }
00066
00067 static struct gfRange *gfRangeLoad(char **row)
00068
00069
00070 {
00071 struct gfRange *ret;
00072
00073 AllocVar(ret);
00074 ret->qStart = atoi(row[0]);
00075 ret->qEnd = atoi(row[1]);
00076 ret->tName = cloneString(row[2]);
00077 ret->tStart = atoi(row[3]);
00078 ret->tEnd = atoi(row[4]);
00079 ret->hitCount = atoi(row[5]);
00080 return ret;
00081 }
00082
00083 int gfRangeCmpTarget(const void *va, const void *vb)
00084
00085 {
00086 const struct gfRange *a = *((struct gfRange **)va);
00087 const struct gfRange *b = *((struct gfRange **)vb);
00088 int diff;
00089
00090 diff = strcmp(a->tName, b->tName);
00091 if (diff == 0)
00092 {
00093 long lDiff = a->t3 - b->t3;
00094 if (lDiff < 0)
00095 diff = -1;
00096 else if (lDiff > 0)
00097 diff = 1;
00098 else
00099 diff = 0;
00100 #ifdef SOLARIS_WORKAROUND_COMPILER_BUG_BUT_FAILS_IN_64_BIT
00101 diff = (unsigned long)(a->t3) - (unsigned long)(b->t3);
00102 #endif
00103 }
00104 if (diff == 0)
00105 diff = a->tStart - b->tStart;
00106 return diff;
00107 }
00108
00109 static void startSeqQuery(int conn, bioSeq *seq, char *type)
00110
00111 {
00112 char buf[256];
00113 sprintf(buf, "%s%s %d", gfSignature(), type, seq->size);
00114 write(conn, buf, strlen(buf));
00115 read(conn, buf, 1);
00116 if (buf[0] != 'Y')
00117 errAbort("Expecting 'Y' from server, got %c", buf[0]);
00118 write(conn, seq->dna, seq->size);
00119 }
00120
00121 static void gfServerWarn(bioSeq *seq, char *warning)
00122
00123 {
00124 warn("couldn't process %s: %s", seq->name, warning);
00125 }
00126
00127 static struct gfRange *gfQuerySeq(int conn, struct dnaSeq *seq)
00128
00129 {
00130 struct gfRange *rangeList = NULL, *range;
00131 char buf[256], *row[6];
00132 int rowSize;
00133
00134 startSeqQuery(conn, seq, "query");
00135
00136
00137 for (;;)
00138 {
00139 netRecieveString(conn, buf);
00140 if (sameString(buf, "end"))
00141 {
00142 break;
00143 }
00144 else if (startsWith("Error:", buf))
00145 {
00146 gfServerWarn(seq, buf);
00147 break;
00148 }
00149 else
00150 {
00151 rowSize = chopLine(buf, row);
00152 if (rowSize < 6)
00153 errAbort("Expecting 6 words from server got %d", rowSize);
00154 range = gfRangeLoad(row);
00155 slAddHead(&rangeList, range);
00156 }
00157 }
00158 slReverse(&rangeList);
00159 return rangeList;
00160 }
00161
00162 static int findTileSize(char *line)
00163
00164 {
00165 char *var, *val;
00166 int tileSize = 4;
00167 for (;;)
00168 {
00169 var = nextWord(&line);
00170 if (var == NULL)
00171 break;
00172 val = nextWord(&line);
00173 if (val == NULL)
00174 {
00175 internalErr();
00176 break;
00177 }
00178 if (sameString("tileSize", var))
00179 {
00180 tileSize = atoi(val);
00181 if (tileSize <= 0)
00182 internalErr();
00183 }
00184 }
00185 return tileSize;
00186 }
00187
00188 struct gfHit *getHitsFromServer(int conn, struct lm *lm)
00189
00190 {
00191 char *s, *line, *q, *t;
00192 struct gfHit *hitList = NULL, *hit;
00193 s = line = netRecieveLongString(conn);
00194 for (;;)
00195 {
00196 if ((q = nextWord(&line)) == NULL)
00197 break;
00198 if ((t = nextWord(&line)) == NULL)
00199 internalErr();
00200 lmAllocVar(lm, hit);
00201 hit->qStart = sqlUnsigned(q);
00202 hit->tStart = sqlUnsigned(t);
00203 slAddHead(&hitList, hit);
00204 }
00205 freez(&s);
00206 slReverse(&hitList);
00207 return hitList;
00208 }
00209
00210 static void gfQuerySeqTrans(int conn, aaSeq *seq, struct gfClump *clumps[2][3],
00211 struct lm *lm, struct gfSeqSource **retSsList, int *retTileSize)
00212
00213 {
00214 int frame, isRc, rowSize;
00215 struct gfClump *clump;
00216 int tileSize = 0;
00217 char *line;
00218 char buf[256], *row[12];
00219 struct gfSeqSource *ssList = NULL, *ss;
00220
00221 for (isRc = 0; isRc <= 1; ++isRc)
00222 for (frame = 0; frame<3; ++frame)
00223 clumps[isRc][frame] = NULL;
00224
00225
00226 startSeqQuery(conn, seq, "protQuery");
00227 line = netRecieveString(conn, buf);
00228 if (!startsWith("Error:", line))
00229 {
00230 tileSize = findTileSize(line);
00231
00232
00233 for (;;)
00234 {
00235
00236 netRecieveString(conn, buf);
00237 if (sameString(buf, "end"))
00238 {
00239 break;
00240 }
00241 else if (startsWith("Error:", buf))
00242 {
00243 gfServerWarn(seq, buf);
00244 break;
00245 }
00246 rowSize = chopLine(buf, row);
00247 if (rowSize < 8)
00248 errAbort("Expecting 8 words from server got %d", rowSize);
00249 AllocVar(clump);
00250 clump->qStart = sqlUnsigned(row[0]);
00251 clump->qEnd = sqlUnsigned(row[1]);
00252 AllocVar(ss);
00253 ss->fileName = cloneString(row[2]);
00254 slAddHead(&ssList, ss);
00255 clump->target = ss;
00256 clump->tStart = sqlUnsigned(row[3]);
00257 clump->tEnd = sqlUnsigned(row[4]);
00258 clump->hitCount = sqlUnsigned(row[5]);
00259 isRc = ((row[6][0] == '-') ? 1 : 0);
00260 frame = sqlUnsigned(row[7]);
00261 slAddHead(&clumps[isRc][frame], clump);
00262
00263
00264 clump->hitList = getHitsFromServer(conn, lm);
00265 assert(slCount(clump->hitList) == clump->hitCount);
00266 }
00267 for (isRc = 0; isRc <= 1; ++isRc)
00268 for (frame = 0; frame<3; ++frame)
00269 slReverse(&clumps[isRc][frame]);
00270 }
00271 else
00272 {
00273 gfServerWarn(seq, line);
00274 }
00275 *retSsList = ssList;
00276 *retTileSize = tileSize;
00277 }
00278
00279 static void gfQuerySeqTransTrans(int conn, struct dnaSeq *seq,
00280 struct gfClump *clumps[2][3][3],
00281 struct lm *lm, struct gfSeqSource **retSsList, int *retTileSize)
00282
00283
00284 {
00285 int qFrame, tFrame, isRc, rowSize;
00286 struct gfClump *clump;
00287 int tileSize = 0;
00288 char *line;
00289 char buf[256], *row[12];
00290 struct gfSeqSource *ssList = NULL, *ss;
00291
00292 for (isRc = 0; isRc <= 1; ++isRc)
00293 for (qFrame = 0; qFrame<3; ++qFrame)
00294 for (tFrame = 0; tFrame<3; ++tFrame)
00295 clumps[isRc][qFrame][tFrame] = NULL;
00296
00297
00298 startSeqQuery(conn, seq, "transQuery");
00299 line = netRecieveString(conn, buf);
00300 if (!startsWith("Error:", line))
00301 {
00302 tileSize = findTileSize(line);
00303
00304
00305 for (;;)
00306 {
00307
00308 netRecieveString(conn, buf);
00309 if (sameString(buf, "end"))
00310 {
00311 break;
00312 }
00313 else if (startsWith("Error:", buf))
00314 {
00315 gfServerWarn(seq, buf);
00316 break;
00317 }
00318 rowSize = chopLine(buf, row);
00319 if (rowSize < 9)
00320 errAbort("Expecting 9 words from server got %d", rowSize);
00321 AllocVar(clump);
00322 clump->qStart = sqlUnsigned(row[0]);
00323 clump->qEnd = sqlUnsigned(row[1]);
00324 AllocVar(ss);
00325 ss->fileName = cloneString(row[2]);
00326 slAddHead(&ssList, ss);
00327 clump->target = ss;
00328 clump->tStart = sqlUnsigned(row[3]);
00329 clump->tEnd = sqlUnsigned(row[4]);
00330 clump->hitCount = sqlUnsigned(row[5]);
00331 isRc = ((row[6][0] == '-') ? 1 : 0);
00332 qFrame = sqlUnsigned(row[7]);
00333 tFrame = sqlUnsigned(row[8]);
00334 slAddHead(&clumps[isRc][qFrame][tFrame], clump);
00335
00336
00337 clump->hitList = getHitsFromServer(conn, lm);
00338 assert(slCount(clump->hitList) == clump->hitCount);
00339 }
00340 for (isRc = 0; isRc <= 1; ++isRc)
00341 for (qFrame = 0; qFrame<3; ++qFrame)
00342 for (tFrame = 0; tFrame<3; ++tFrame)
00343 slReverse(&clumps[isRc][qFrame][tFrame]);
00344 }
00345 else
00346 {
00347 gfServerWarn(seq, buf);
00348 }
00349 *retSsList = ssList;
00350 *retTileSize = tileSize;
00351 }
00352
00353
00354
00355 struct gfRange *gfRangesBundle(struct gfRange *exonList, int maxIntron)
00356
00357
00358
00359
00360 {
00361 struct gfRange *geneList = NULL, *gene = NULL, *lastExon = NULL, *exon, *nextExon;
00362
00363 for (exon = exonList; exon != NULL; exon = nextExon)
00364 {
00365 nextExon = exon->next;
00366 if (lastExon == NULL || !sameString(lastExon->tName, exon->tName)
00367 || exon->t3 != lastExon->t3
00368 || exon->tStart - lastExon->tEnd > maxIntron)
00369 {
00370 AllocVar(gene);
00371 gene->tStart = exon->tStart;
00372 gene->tEnd = exon->tEnd;
00373 gene->tName = cloneString(exon->tName);
00374 gene->tSeq = exon->tSeq;
00375 gene->qStart = exon->qStart;
00376 gene->qEnd = exon->qEnd;
00377 gene->hitCount = exon->hitCount;
00378 gene->t3 = exon->t3;
00379 slAddHead(&gene->components, exon);
00380 slAddHead(&geneList, gene);
00381 }
00382 else
00383 {
00384 if (exon->qStart < gene->qStart) gene->qStart = exon->qStart;
00385 if (exon->qEnd > gene->qEnd) gene->qEnd = exon->qEnd;
00386 if (exon->tStart < gene->tStart) gene->tStart = exon->tStart;
00387 if (exon->tEnd > gene->tEnd) gene->tEnd = exon->tEnd;
00388 gene->hitCount += exon->hitCount;
00389 slAddTail(&gene->components, exon);
00390 }
00391 lastExon = exon;
00392 }
00393 slReverse(&geneList);
00394 return geneList;
00395 }
00396
00397 static int usualExpansion = 500;
00398
00399 static boolean alignComponents(struct gfRange *combined, struct ssBundle *bun,
00400 enum ffStringency stringency)
00401
00402
00403 {
00404 struct gfRange *range;
00405 struct dnaSeq *qSeq = bun->qSeq, *tSeq = bun->genoSeq;
00406 struct ssFfItem *ffi;
00407 struct ffAli *ali;
00408 int qStart, qEnd, tStart, tEnd;
00409 int extra = 250;
00410 boolean gotAny = FALSE;
00411
00412 for (range = combined->components; range != NULL; range = range->next)
00413 {
00414
00415 qStart = range->qStart - extra;
00416 tStart = range->tStart - extra;
00417 qEnd = range->qEnd + extra;
00418 tEnd = range->tEnd + extra;
00419 if (range == combined->components)
00420 {
00421 qStart -= extra;
00422 tStart -= extra;
00423 }
00424 if (range->next == NULL)
00425 {
00426 qEnd += extra;
00427 tEnd += extra;
00428 }
00429 if (qStart < combined->qStart) qStart = combined->qStart;
00430 if (tStart < combined->tStart) tStart = combined->tStart;
00431 if (qEnd > combined->qEnd) qEnd = combined->qEnd;
00432 if (tEnd > combined->tEnd) tEnd = combined->tEnd;
00433 ali = ffFind(qSeq->dna + qStart,
00434 qSeq->dna + qEnd,
00435 tSeq->dna + tStart - combined->tStart,
00436 tSeq->dna + tEnd - combined->tStart,
00437 stringency);
00438 if (ali != NULL)
00439 {
00440 AllocVar(ffi);
00441 ffi->ff = ali;
00442 slAddHead(&bun->ffList, ffi);
00443 gotAny = TRUE;
00444 }
00445 }
00446 return gotAny;
00447 }
00448
00449 static int scoreAli(struct ffAli *ali, boolean isProt,
00450 enum ffStringency stringency,
00451 struct dnaSeq *tSeq, struct trans3 *t3List)
00452
00453 {
00454 int (*scoreFunc)(char *a, char *b, int size);
00455 struct ffAli *ff, *nextFf;
00456 int score = 0;
00457 if (isProt)
00458 scoreFunc = aaScoreMatch;
00459 else
00460 scoreFunc = dnaScoreMatch;
00461 for (ff = ali; ff != NULL; ff = nextFf)
00462 {
00463 nextFf = ff->right;
00464 score += scoreFunc(ff->nStart, ff->hStart, ff->nEnd-ff->nStart);
00465 if (nextFf != NULL)
00466 {
00467 int nhStart = trans3GenoPos(nextFf->hStart, tSeq, t3List, FALSE);
00468 int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE);
00469 int hGap = nhStart - ohEnd;
00470 int nGap = nextFf->nStart - ff->nEnd;
00471 score -= ffCalcGapPenalty(hGap, nGap, stringency);
00472 }
00473 }
00474 return score;
00475 }
00476
00477 static void saveAlignments(char *chromName, int chromSize, int chromOffset,
00478 struct ssBundle *bun, struct hash *t3Hash,
00479 boolean qIsRc, boolean tIsRc,
00480 enum ffStringency stringency, int minMatch, struct gfOutput *out)
00481
00482 {
00483 struct dnaSeq *tSeq = bun->genoSeq, *qSeq = bun->qSeq;
00484 struct ssFfItem *ffi;
00485 if (minMatch > qSeq->size/2) minMatch = qSeq->size/2;
00486 if (minMatch < 1) minMatch = 1;
00487 for (ffi = bun->ffList; ffi != NULL; ffi = ffi->next)
00488 {
00489 struct ffAli *ff = ffi->ff;
00490 struct trans3 *t3List = NULL;
00491 int score;
00492 if (t3Hash != NULL)
00493 t3List = hashMustFindVal(t3Hash, tSeq->name);
00494 score = scoreAli(ff, bun->isProt, stringency, tSeq, t3List);
00495 if (score >= minMatch)
00496 {
00497 out->out(chromName, chromSize, chromOffset, ff, tSeq, t3Hash, qSeq,
00498 qIsRc, tIsRc, stringency, minMatch, out);
00499 }
00500 }
00501 }
00502
00503 struct hash *gfFileCacheNew()
00504
00505 {
00506 return hashNew(0);
00507 }
00508
00509 static void gfFileCacheFreeEl(struct hashEl *el)
00510
00511 {
00512 char *name = el->name;
00513 if (nibIsFile(name))
00514 {
00515 struct nibInfo *nib = el->val;
00516 nibInfoFree(&nib);
00517 }
00518 else
00519 {
00520 struct twoBitFile *tbf = el->val;
00521 twoBitClose(&tbf);
00522 }
00523 el->val = NULL;
00524 }
00525
00526 void gfFileCacheFree(struct hash **pCache)
00527
00528 {
00529 struct hash *cache = *pCache;
00530 if (cache != NULL)
00531 {
00532 hashTraverseEls(cache, gfFileCacheFreeEl);
00533 hashFree(pCache);
00534 }
00535 }
00536
00537 static void getTargetName(char *tSpec, boolean includeFile, char *targetName)
00538
00539 {
00540 if (includeFile)
00541 {
00542 char seqName[128];
00543 char fileName[PATH_LEN];
00544 gfiGetSeqName(tSpec, seqName, fileName);
00545 safef(targetName, PATH_LEN, "%s:%s", fileName, seqName);
00546 }
00547 else
00548 gfiGetSeqName(tSpec, targetName, NULL);
00549 }
00550
00551
00552 void gfAlignStrand(int *pConn, char *tSeqDir, struct dnaSeq *seq,
00553 boolean isRc, int minMatch, struct hash *tFileCache, struct gfOutput *out)
00554
00555
00556
00557 {
00558 struct ssBundle *bun;
00559 struct gfRange *rangeList = NULL, *range;
00560 struct dnaSeq *targetSeq;
00561 char targetName[PATH_LEN];
00562
00563 rangeList = gfQuerySeq(*pConn, seq);
00564 close(*pConn);
00565 *pConn = -1;
00566 slSort(&rangeList, gfRangeCmpTarget);
00567
00568 rangeList = gfRangesBundle(rangeList, ffIntronMax);
00569 for (range = rangeList; range != NULL; range = range->next)
00570 {
00571 getTargetName(range->tName, out->includeTargetFile, targetName);
00572 targetSeq = gfiExpandAndLoadCached(range, tFileCache, tSeqDir,
00573 seq->size, &range->tTotalSize, FALSE, FALSE, usualExpansion);
00574 AllocVar(bun);
00575 bun->qSeq = seq;
00576 bun->genoSeq = targetSeq;
00577 alignComponents(range, bun, ffCdna);
00578 ssStitch(bun, ffCdna, minMatch, ssAliCount);
00579 saveAlignments(targetName, range->tTotalSize, range->tStart,
00580 bun, NULL, isRc, FALSE, ffCdna, minMatch, out);
00581 ssBundleFree(&bun);
00582 freeDnaSeq(&targetSeq);
00583 }
00584 gfRangeFreeList(&rangeList);
00585 }
00586
00587 char *clumpTargetName(struct gfClump *clump)
00588
00589 {
00590 struct gfSeqSource *target = clump->target;
00591 char *name;
00592 if (target->seq != NULL)
00593 name = target->seq->name;
00594 else
00595 name = target->fileName;
00596 if (name == NULL)
00597 internalErr();
00598 return name;
00599 }
00600
00601 static struct gfRange *seqClumpToRangeList(struct gfClump *clumpList, int frame)
00602
00603 {
00604 struct gfRange *rangeList = NULL, *range;
00605 struct gfClump *clump;
00606 char *name;
00607 int tOff;
00608
00609 for (clump = clumpList; clump != NULL; clump = clump->next)
00610 {
00611 tOff = clump->target->start;
00612 AllocVar(range);
00613 range->qStart = clump->qStart;
00614 range->qEnd = clump->qEnd;
00615 name = clumpTargetName(clump);
00616 range->tName = cloneString(name);
00617 range->tStart = clump->tStart - tOff;
00618 range->tEnd = clump->tEnd - tOff;
00619 range->tSeq = clump->target->seq;
00620 range->frame = frame;
00621 slAddHead(&rangeList, range);
00622 }
00623 slReverse(&rangeList);
00624 return rangeList;
00625 }
00626
00627 static struct ssBundle *gfClumpsToBundles(struct gfClump *clumpList,
00628 boolean isRc, struct dnaSeq *seq, int minScore,
00629 struct gfRange **retRangeList)
00630
00631 {
00632 struct ssBundle *bun, *bunList = NULL;
00633 struct gfRange *rangeList = NULL, *range;
00634 struct dnaSeq *targetSeq;
00635
00636 rangeList = seqClumpToRangeList(clumpList, 0);
00637 slSort(&rangeList, gfRangeCmpTarget);
00638 rangeList = gfRangesBundle(rangeList, 2000);
00639 for (range = rangeList; range != NULL; range = range->next)
00640 {
00641 targetSeq = range->tSeq;
00642 gfiExpandRange(range, seq->size, targetSeq->size, FALSE, isRc,
00643 usualExpansion);
00644 range->tStart = 0;
00645 range->tEnd = targetSeq->size;
00646 AllocVar(bun);
00647 bun->qSeq = seq;
00648 bun->genoSeq = targetSeq;
00649 alignComponents(range, bun, ffCdna);
00650 ssStitch(bun, ffCdna, minScore, ssAliCount);
00651 slAddHead(&bunList, bun);
00652 }
00653 slReverse(&bunList);
00654 *retRangeList = rangeList;
00655 return bunList;
00656 }
00657
00658
00659 static void extendHitRight(int qMax, int tMax,
00660 char **pEndQ, char **pEndT, int (*scoreMatch)(char a, char b),
00661 int maxDown)
00662
00663 {
00664 int maxScore = 0;
00665 int score = 0;
00666 int maxPos = -1;
00667 int last = min(qMax, tMax);
00668 int i;
00669 char *q = *pEndQ, *t = *pEndT;
00670
00671 for (i=0; i<last; ++i)
00672 {
00673 score += scoreMatch(q[i], t[i]);
00674 if (score > maxScore)
00675 {
00676 maxScore = score;
00677 maxPos = i;
00678 }
00679 else if (i > maxPos + maxDown)
00680 {
00681 break;
00682 }
00683 }
00684 *pEndQ = q+maxPos+1;
00685 *pEndT = t+maxPos+1;
00686 }
00687
00688 static void extendHitLeft(int qMax, int tMax,
00689 char **pStartQ, char **pStartT, int (*scoreMatch)(char a, char b),
00690 int maxDown)
00691
00692 {
00693 int maxScore = 0;
00694 int score = 0;
00695 int maxPos = 0;
00696 int last = -min(qMax, tMax);
00697 int i;
00698 char *q = *pStartQ, *t = *pStartT;
00699
00700 for (i=-1; i>=last; --i)
00701 {
00702 score += scoreMatch(q[i], t[i]);
00703 if (score > maxScore)
00704 {
00705 maxScore = score;
00706 maxPos = i;
00707 }
00708 else if (i < maxPos - maxDown)
00709 {
00710 break;
00711 }
00712 }
00713 *pStartQ = q+maxPos;
00714 *pStartT = t+maxPos;
00715 }
00716
00717
00718 static void clumpToHspRange(struct gfClump *clump, bioSeq *qSeq, int tileSize,
00719 int frame, struct trans3 *t3, struct gfRange **pRangeList,
00720 boolean isProt, boolean fastMap)
00721
00722
00723
00724 {
00725 struct gfSeqSource *target = clump->target;
00726 aaSeq *tSeq = target->seq;
00727 BIOPOL *qs, *ts, *qe, *te;
00728 struct gfHit *hit;
00729 int qStart = 0, tStart = 0, qEnd = 0, tEnd = 0, newQ = 0, newT = 0;
00730 boolean outOfIt = TRUE;
00731 struct gfRange *range;
00732 BIOPOL *lastQs = NULL, *lastQe = NULL, *lastTs = NULL, *lastTe = NULL;
00733 int (*scoreMatch)(char a, char b) = (isProt ? aaScore2 : dnaScore2);
00734 int maxDown, minSpan;
00735
00736 if (fastMap)
00737 {
00738 maxDown = 1;
00739 minSpan = 50;
00740 }
00741 else
00742 {
00743 maxDown = 10;
00744 minSpan = 0;
00745 }
00746
00747
00748 if (tSeq == NULL)
00749 internalErr();
00750
00751
00752
00753
00754
00755
00756
00757 for (hit = clump->hitList; ; hit = hit->next)
00758 {
00759 if (hit != NULL)
00760 {
00761 newQ = hit->qStart;
00762 newT = hit->tStart - target->start;
00763 }
00764
00765
00766 if (!outOfIt)
00767 {
00768
00769
00770
00771 if (hit == NULL || newQ != qEnd || newT != tEnd)
00772 {
00773 qs = qSeq->dna + qStart;
00774 ts = tSeq->dna + tStart;
00775 qe = qSeq->dna + qEnd;
00776 te = tSeq->dna + tEnd;
00777 extendHitRight(qSeq->size - qEnd, tSeq->size - tEnd,
00778 &qe, &te, scoreMatch, maxDown);
00779 extendHitLeft(qStart, tStart, &qs, &ts, scoreMatch, maxDown);
00780 if (qs != lastQs || ts != lastTs || qe != lastQe || qs != lastQs)
00781 {
00782 lastQs = qs;
00783 lastTs = ts;
00784 lastQe = qe;
00785 lastTe = te;
00786 if (qe - qs >= minSpan)
00787 {
00788 AllocVar(range);
00789 range->qStart = qs - qSeq->dna;
00790 range->qEnd = qe - qSeq->dna;
00791 range->tName = cloneString(tSeq->name);
00792 range->tSeq = tSeq;
00793 range->tStart = ts - tSeq->dna;
00794 range->tEnd = te - tSeq->dna;
00795 range->hitCount = qe - qs;
00796 range->frame = frame;
00797 range->t3 = t3;
00798 assert(range->tEnd <= tSeq->size);
00799 slAddHead(pRangeList, range);
00800 }
00801 }
00802 outOfIt = TRUE;
00803 }
00804 }
00805 if (hit == NULL)
00806 break;
00807
00808 if (outOfIt)
00809 {
00810 qStart = newQ;
00811 qEnd = qStart + tileSize;
00812 tStart = newT;
00813 tEnd = tStart + tileSize;
00814 outOfIt = FALSE;
00815 }
00816 else
00817 {
00818 qEnd = newQ + tileSize;
00819 tEnd = newT + tileSize;
00820 }
00821 }
00822 }
00823
00824 struct ssFfItem *gfRangesToFfItem(struct gfRange *rangeList, aaSeq *qSeq)
00825
00826 {
00827 AA *q = qSeq->dna;
00828 struct ffAli *ffList = NULL, *ff;
00829 struct gfRange *range;
00830 struct ssFfItem *ffi;
00831
00832 for (range = rangeList; range != NULL; range = range->next)
00833 {
00834 aaSeq *tSeq = range->tSeq;
00835 AA *t = tSeq->dna;
00836 AllocVar(ff);
00837 ff->nStart = q + range->qStart;
00838 ff->nEnd = q + range->qEnd;
00839 ff->hStart = t + range->tStart;
00840 ff->hEnd = t + range->tEnd;
00841 ff->left = ffList;
00842 ffList = ff;
00843 }
00844 AllocVar(ffi);
00845 ffi->ff = ffMakeRightLinks(ffList);
00846 return ffi;
00847 }
00848
00849 static struct ssBundle *fastMapClumpsToBundles(struct genoFind *gf, struct gfClump *clumpList, bioSeq *qSeq)
00850
00851 {
00852 struct gfClump *clump;
00853 struct gfRange *rangeList = NULL, *range;
00854 bioSeq *targetSeq;
00855 struct ssBundle *bunList = NULL, *bun;
00856
00857 for (clump = clumpList; clump != NULL; clump = clump->next)
00858 clumpToHspRange(clump, qSeq, gf->tileSize, 0, NULL, &rangeList, FALSE, TRUE);
00859 slReverse(&rangeList);
00860 slSort(&rangeList, gfRangeCmpTarget);
00861 rangeList = gfRangesBundle(rangeList, 256);
00862 for (range = rangeList; range != NULL; range = range->next)
00863 {
00864 targetSeq = range->tSeq;
00865 AllocVar(bun);
00866 bun->qSeq = qSeq;
00867 bun->genoSeq = targetSeq;
00868 bun->ffList = gfRangesToFfItem(range->components, qSeq);
00869 bun->isProt = FALSE;
00870 slAddHead(&bunList, bun);
00871 }
00872 gfRangeFreeList(&rangeList);
00873 return bunList;
00874 }
00875
00876 static void gfAlignSomeClumps(struct genoFind *gf, struct gfClump *clumpList,
00877 bioSeq *seq, boolean isRc, int minMatch,
00878 struct gfOutput *out, boolean isProt, enum ffStringency stringency)
00879
00880
00881 {
00882 struct gfClump *clump;
00883 struct gfRange *rangeList = NULL, *range;
00884 bioSeq *targetSeq;
00885 struct ssBundle *bun;
00886 int intronMax = ffIntronMax;
00887
00888 if (isProt)
00889 intronMax /= 3;
00890 for (clump = clumpList; clump != NULL; clump = clump->next)
00891 {
00892 clumpToHspRange(clump, seq, gf->tileSize, 0, NULL, &rangeList, isProt, FALSE);
00893 }
00894 slReverse(&rangeList);
00895 slSort(&rangeList, gfRangeCmpTarget);
00896 rangeList = gfRangesBundle(rangeList, intronMax);
00897 for (range = rangeList; range != NULL; range = range->next)
00898 {
00899 targetSeq = range->tSeq;
00900 AllocVar(bun);
00901 bun->qSeq = seq;
00902 bun->genoSeq = targetSeq;
00903 bun->ffList = gfRangesToFfItem(range->components, seq);
00904 bun->isProt = isProt;
00905 ssStitch(bun, stringency, minMatch, ssAliCount);
00906 saveAlignments(targetSeq->name, targetSeq->size, 0,
00907 bun, NULL, isRc, FALSE, stringency, minMatch, out);
00908 ssBundleFree(&bun);
00909 }
00910 gfRangeFreeList(&rangeList);
00911 }
00912
00913 void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList,
00914 aaSeq *seq, boolean isRc, int minMatch,
00915 struct gfOutput *out)
00916 {
00917 gfAlignSomeClumps(gf, clumpList, seq, isRc, minMatch, out, TRUE, ffTight);
00918 }
00919
00920 int rangeScore(struct gfRange *range, struct dnaSeq *qSeq)
00921
00922 {
00923 struct gfRange *comp;
00924 struct dnaSeq *tSeq;
00925 int score = 0;
00926 for (comp = range->components; comp != NULL; comp = comp->next)
00927 {
00928 tSeq = comp->tSeq;
00929 score += dnaScoreMatch(tSeq->dna + range->tStart, qSeq->dna + range->qStart,
00930 range->tEnd - range->tStart);
00931 if (comp->next != NULL)
00932 score -= 4;
00933 }
00934 return score;
00935 }
00936
00937
00938 void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash,
00939 boolean tIsRc, int minMatch, struct gfOutput *out)
00940
00941
00942 {
00943 struct gfClump *clumps[3];
00944 int frame;
00945 struct gfClump *clump;
00946 struct gfRange *rangeList = NULL, *range;
00947 aaSeq *targetSeq;
00948 struct ssBundle *bun;
00949 int tileSize = gfs[0]->tileSize;
00950 struct trans3 *t3;
00951 int hitCount;
00952 struct lm *lm = lmInit(0);
00953
00954 gfTransFindClumps(gfs, qSeq, clumps, lm, &hitCount);
00955 for (frame=0; frame<3; ++frame)
00956 {
00957 for (clump = clumps[frame]; clump != NULL; clump = clump->next)
00958 {
00959 clumpToHspRange(clump, qSeq, tileSize, frame, NULL, &rangeList, TRUE, FALSE);
00960 }
00961 }
00962 slReverse(&rangeList);
00963 slSort(&rangeList, gfRangeCmpTarget);
00964 rangeList = gfRangesBundle(rangeList, ffIntronMax/3);
00965 for (range = rangeList; range != NULL; range = range->next)
00966 {
00967 targetSeq = range->tSeq;
00968 t3 = hashMustFindVal(t3Hash, targetSeq->name);
00969 AllocVar(bun);
00970 bun->qSeq = qSeq;
00971 bun->genoSeq = targetSeq;
00972 bun->ffList = gfRangesToFfItem(range->components, qSeq);
00973 bun->isProt = TRUE;
00974 bun->t3List = t3;
00975 ssStitch(bun, ffCdna, minMatch, ssAliCount);
00976 saveAlignments(targetSeq->name, t3->seq->size, 0,
00977 bun, t3Hash, FALSE, tIsRc, ffCdna, minMatch, out);
00978 ssBundleFree(&bun);
00979 }
00980 gfRangeFreeList(&rangeList);
00981 for (frame=0; frame<3; ++frame)
00982 gfClumpFreeList(&clumps[frame]);
00983 lmCleanup(&lm);
00984 }
00985
00986 void rangeCoorTimes3(struct gfRange *rangeList)
00987
00988
00989 {
00990 struct gfRange *range;
00991 for (range = rangeList; range != NULL; range = range->next)
00992 {
00993 range->qStart *= 3;
00994 range->qEnd *= 3;
00995 range->tStart = range->tStart*3;
00996 range->tEnd = range->tEnd*3;
00997 }
00998 }
00999
01000 static void loadHashT3Ranges(struct gfRange *rangeList,
01001 char *tSeqDir, struct hash *tFileCache, int qSeqSize, boolean isRc,
01002 struct hash **retT3Hash, struct dnaSeq **retSeqList,
01003 struct slRef **retT3RefList)
01004
01005
01006 {
01007 struct hash *t3Hash = newHash(10);
01008 struct dnaSeq *targetSeq, *tSeqList = NULL;
01009 struct slRef *t3RefList = NULL;
01010 struct gfRange *range;
01011
01012 for (range = rangeList; range != NULL; range = range->next)
01013 {
01014 struct trans3 *t3, *oldT3;
01015
01016 targetSeq = gfiExpandAndLoadCached(range, tFileCache,
01017 tSeqDir, qSeqSize*3, &range->tTotalSize, TRUE, isRc, usualExpansion);
01018 slAddHead(&tSeqList, targetSeq);
01019 freez(&targetSeq->name);
01020 targetSeq->name = cloneString(range->tName);
01021 t3 = trans3New(targetSeq);
01022 refAdd(&t3RefList, t3);
01023 t3->start = range->tStart;
01024 t3->end = range->tEnd;
01025 t3->nibSize = range->tTotalSize;
01026 t3->isRc = isRc;
01027 if ((oldT3 = hashFindVal(t3Hash, range->tName)) != NULL)
01028 {
01029 slAddTail(&oldT3->next, t3);
01030 }
01031 else
01032 {
01033 hashAdd(t3Hash, range->tName, t3);
01034 }
01035 }
01036 *retT3Hash = t3Hash;
01037 *retSeqList = tSeqList;
01038 *retT3RefList = t3RefList;
01039 }
01040
01041
01042 void gfAlignTrans(int *pConn, char *tSeqDir, aaSeq *seq, int minMatch,
01043 struct hash *tFileCache, struct gfOutput *out)
01044
01045
01046
01047 {
01048 struct ssBundle *bun;
01049 struct gfClump *clumps[2][3], *clump;
01050 struct gfRange *rangeList = NULL, *range, *rl;
01051 struct dnaSeq *targetSeq, *tSeqList = NULL;
01052 char targetName[PATH_LEN];
01053 int tileSize;
01054 int frame, isRc = 0;
01055 struct hash *t3Hash = NULL;
01056 struct slRef *t3RefList = NULL, *ref;
01057 struct gfSeqSource *ssList = NULL, *ss;
01058 struct trans3 *t3;
01059 struct lm *lm = lmInit(0);
01060
01061
01062 gfQuerySeqTrans(*pConn, seq, clumps, lm, &ssList, &tileSize);
01063 close(*pConn);
01064 *pConn = -1;
01065
01066 for (isRc = 0; isRc <= 1; ++isRc)
01067 {
01068
01069 for (frame = 0; frame < 3; ++frame)
01070 {
01071 rl = seqClumpToRangeList(clumps[isRc][frame], frame);
01072 rangeList = slCat(rangeList, rl);
01073 }
01074
01075 rangeCoorTimes3(rangeList);
01076 slSort(&rangeList, gfRangeCmpTarget);
01077 rangeList = gfRangesBundle(rangeList, ffIntronMax);
01078 loadHashT3Ranges(rangeList, tSeqDir, tFileCache, seq->size,
01079 isRc, &t3Hash, &tSeqList, &t3RefList);
01080
01081
01082
01083 gfRangeFreeList(&rangeList);
01084
01085
01086
01087
01088
01089 for (frame = 0; frame < 3; ++frame)
01090 {
01091 for (clump = clumps[isRc][frame]; clump != NULL; clump = clump->next)
01092 {
01093 struct gfSeqSource *ss = clump->target;
01094 t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
01095 ss->seq = t3->trans[frame];
01096 ss->start = t3->start/3;
01097 ss->end = t3->end/3;
01098 clumpToHspRange(clump, seq, tileSize, frame, t3, &rangeList, TRUE, FALSE);
01099 }
01100 }
01101 slReverse(&rangeList);
01102 slSort(&rangeList, gfRangeCmpTarget);
01103 rangeList = gfRangesBundle(rangeList, ffIntronMax/3);
01104
01105
01106 for (range = rangeList; range != NULL; range = range->next)
01107 {
01108 targetSeq = range->tSeq;
01109 AllocVar(bun);
01110 bun->qSeq = seq;
01111 bun->genoSeq = targetSeq;
01112 bun->ffList = gfRangesToFfItem(range->components, seq);
01113 bun->isProt = TRUE;
01114 t3 = hashMustFindVal(t3Hash, range->tName);
01115 bun->t3List = t3;
01116 ssStitch(bun, ffCdna, minMatch, ssAliCount);
01117 getTargetName(range->tName, out->includeTargetFile, targetName);
01118 saveAlignments(targetName, t3->nibSize, 0,
01119 bun, t3Hash, FALSE, isRc, ffCdna, minMatch, out);
01120 ssBundleFree(&bun);
01121 }
01122
01123
01124 gfRangeFreeList(&rangeList);
01125 freeHash(&t3Hash);
01126 for (ref = t3RefList; ref != NULL; ref = ref->next)
01127 {
01128 struct trans3 *t3 = ref->val;
01129 trans3Free(&t3);
01130 }
01131 slFreeList(&t3RefList);
01132 freeDnaSeqList(&tSeqList);
01133 }
01134
01135
01136 for (isRc=0; isRc<=1; ++isRc)
01137 for (frame=0; frame<3; ++frame)
01138 gfClumpFreeList(&clumps[isRc][frame]);
01139 for (ss = ssList; ss != NULL; ss = ss->next)
01140 freeMem(ss->fileName);
01141 slFreeList(&ssList);
01142 lmCleanup(&lm);
01143 }
01144
01145 void untranslateRangeList(struct gfRange *rangeList, int qFrame, int tFrame,
01146 struct hash *t3Hash, struct trans3 *t3, int tOffset)
01147
01148 {
01149 struct gfRange *range;
01150 for (range = rangeList; range != NULL; range = range->next)
01151 {
01152 range->qStart = 3*range->qStart + qFrame;
01153 range->qEnd = 3*range->qEnd + qFrame;
01154 range->tStart = 3*range->tStart + tFrame;
01155 range->tEnd = 3*range->tEnd + tFrame;
01156 if (t3Hash)
01157 t3 = trans3Find(t3Hash, range->tSeq->name, range->tStart + tOffset, range->tEnd + tOffset);
01158 range->tSeq = t3->seq;
01159 range->t3 = t3;
01160 }
01161 }
01162
01163 void gfAlignTransTrans(int *pConn, char *tSeqDir, struct dnaSeq *qSeq,
01164 boolean qIsRc, int minMatch, struct hash *tFileCache,
01165 struct gfOutput *out, boolean isRna)
01166
01167
01168
01169
01170 {
01171 struct gfClump *clumps[2][3][3], *clump;
01172 char targetName[PATH_LEN];
01173 int qFrame, tFrame, tIsRc;
01174 struct gfSeqSource *ssList = NULL, *ss;
01175 struct lm *lm = lmInit(0);
01176 int tileSize;
01177 struct gfRange *rangeList = NULL, *rl, *range;
01178 struct trans3 *qTrans = trans3New(qSeq), *t3;
01179 struct slRef *t3RefList = NULL, *t3Ref;
01180 struct hash *t3Hash = NULL;
01181 struct dnaSeq *tSeqList = NULL;
01182 enum ffStringency stringency = (isRna ? ffCdna : ffLoose);
01183
01184
01185 gfQuerySeqTransTrans(*pConn, qSeq, clumps, lm, &ssList, &tileSize);
01186 close(*pConn);
01187 *pConn = -1;
01188
01189 for (tIsRc=0; tIsRc <= 1; ++tIsRc)
01190 {
01191
01192 for (qFrame = 0; qFrame < 3; ++qFrame)
01193 {
01194 for (tFrame = 0; tFrame < 3; ++tFrame)
01195 {
01196 rl = seqClumpToRangeList(clumps[tIsRc][qFrame][tFrame], tFrame);
01197 rangeList = slCat(rangeList, rl);
01198 }
01199 }
01200 rangeCoorTimes3(rangeList);
01201 slSort(&rangeList, gfRangeCmpTarget);
01202 rangeList = gfRangesBundle(rangeList, ffIntronMax);
01203 loadHashT3Ranges(rangeList, tSeqDir, tFileCache,
01204 qSeq->size/3, tIsRc, &t3Hash, &tSeqList, &t3RefList);
01205
01206
01207
01208 gfRangeFreeList(&rangeList);
01209
01210
01211
01212
01213 for (qFrame = 0; qFrame < 3; ++qFrame)
01214 {
01215 for (tFrame = 0; tFrame < 3; ++tFrame)
01216 {
01217 for (clump = clumps[tIsRc][qFrame][tFrame]; clump != NULL; clump = clump->next)
01218 {
01219 struct gfSeqSource *ss = clump->target;
01220 struct gfRange *rangeSet = NULL;
01221 t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
01222 ss->seq = t3->trans[tFrame];
01223 ss->start = t3->start/3;
01224 ss->end = t3->end/3;
01225 clumpToHspRange(clump, qTrans->trans[qFrame], tileSize, tFrame, t3, &rangeSet, TRUE, FALSE);
01226 untranslateRangeList(rangeSet, qFrame, tFrame, NULL, t3, t3->start);
01227 rangeList = slCat(rangeSet, rangeList);
01228 }
01229 }
01230 }
01231 slReverse(&rangeList);
01232 slSort(&rangeList, gfRangeCmpTarget);
01233 rangeList = gfRangesBundle(rangeList, ffIntronMax);
01234
01235 for (range = rangeList; range != NULL; range = range->next)
01236 {
01237 struct dnaSeq *targetSeq = range->tSeq;
01238 struct ssBundle *bun;
01239
01240 AllocVar(bun);
01241 bun->qSeq = qSeq;
01242 bun->genoSeq = targetSeq;
01243 bun->ffList = gfRangesToFfItem(range->components, qSeq);
01244 ssStitch(bun, stringency, minMatch, ssAliCount);
01245 getTargetName(range->tName, out->includeTargetFile, targetName);
01246 t3 = range->t3;
01247 saveAlignments(targetName, t3->nibSize, t3->start,
01248 bun, NULL, qIsRc, tIsRc, stringency, minMatch, out);
01249 ssBundleFree(&bun);
01250 }
01251
01252
01253 gfRangeFreeList(&rangeList);
01254 freeHash(&t3Hash);
01255 for (t3Ref = t3RefList; t3Ref != NULL; t3Ref = t3Ref->next)
01256 {
01257 struct trans3 *t3 = t3Ref->val;
01258 trans3Free(&t3);
01259 }
01260 slFreeList(&t3RefList);
01261 freeDnaSeqList(&tSeqList);
01262 }
01263 trans3Free(&qTrans);
01264 for (ss = ssList; ss != NULL; ss = ss->next)
01265 freeMem(ss->fileName);
01266 slFreeList(&ssList);
01267 lmCleanup(&lm);
01268 }
01269
01270
01271 static struct ssBundle *gfTransTransFindBundles(struct genoFind *gfs[3], struct dnaSeq *qSeq,
01272 struct hash *t3Hash, boolean isRc, int minMatch, boolean isRna)
01273
01274
01275 {
01276 struct trans3 *qTrans = trans3New(qSeq);
01277 int qFrame, tFrame;
01278 struct gfClump *clumps[3][3], *clump;
01279 struct gfRange *rangeList = NULL, *range;
01280 int tileSize = gfs[0]->tileSize;
01281 bioSeq *targetSeq;
01282 struct ssBundle *bun, *bunList = NULL;
01283 int hitCount;
01284 struct lm *lm = lmInit(0);
01285 enum ffStringency stringency = (isRna ? ffCdna : ffLoose);
01286
01287 gfTransTransFindClumps(gfs, qTrans->trans, clumps, lm, &hitCount);
01288 for (qFrame = 0; qFrame<3; ++qFrame)
01289 {
01290 for (tFrame=0; tFrame<3; ++tFrame)
01291 {
01292 for (clump = clumps[qFrame][tFrame]; clump != NULL; clump = clump->next)
01293 {
01294 struct gfRange *rangeSet = NULL;
01295 clumpToHspRange(clump, qTrans->trans[qFrame], tileSize, tFrame, NULL, &rangeSet, TRUE, FALSE);
01296 untranslateRangeList(rangeSet, qFrame, tFrame, t3Hash, NULL, 0);
01297 rangeList = slCat(rangeSet, rangeList);
01298 }
01299 }
01300 }
01301 slSort(&rangeList, gfRangeCmpTarget);
01302 rangeList = gfRangesBundle(rangeList, 2000);
01303 for (range = rangeList; range != NULL; range = range->next)
01304 {
01305 targetSeq = range->tSeq;
01306 AllocVar(bun);
01307 bun->qSeq = qSeq;
01308 bun->genoSeq = targetSeq;
01309 bun->ffList = gfRangesToFfItem(range->components, qSeq);
01310 ssStitch(bun, stringency, minMatch, ssAliCount);
01311 slAddHead(&bunList, bun);
01312 }
01313 for (qFrame = 0; qFrame<3; ++qFrame)
01314 for (tFrame=0; tFrame<3; ++tFrame)
01315 gfClumpFreeList(&clumps[qFrame][tFrame]);
01316 gfRangeFreeList(&rangeList);
01317 trans3Free(&qTrans);
01318 lmCleanup(&lm);
01319 slReverse(&bunList);
01320 return bunList;
01321 }
01322
01323 static void addToBigBundleList(struct ssBundle **pOneList, struct hash *bunHash,
01324 struct ssBundle **pBigList, struct dnaSeq *query)
01325
01326
01327 {
01328 struct ssBundle *oneBun, *bigBun;
01329
01330 for (oneBun = *pOneList; oneBun != NULL; oneBun = oneBun->next)
01331 {
01332 char *name = oneBun->genoSeq->name;
01333 if ((bigBun = hashFindVal(bunHash, name)) == NULL)
01334 {
01335 AllocVar(bigBun);
01336 slAddHead(pBigList, bigBun);
01337 hashAdd(bunHash, name, bigBun);
01338 bigBun->qSeq = query;
01339 bigBun->genoSeq = oneBun->genoSeq;
01340 bigBun->isProt = oneBun->isProt;
01341 bigBun->avoidFuzzyFindKludge = oneBun->avoidFuzzyFindKludge;
01342 }
01343 bigBun->ffList = slCat(bigBun->ffList, oneBun->ffList);
01344 oneBun->ffList = NULL;
01345 }
01346 ssBundleFreeList(pOneList);
01347 }
01348
01349 static boolean jiggleSmallExons(struct ffAli *ali, struct dnaSeq *nSeq, struct dnaSeq *hSeq)
01350
01351
01352 {
01353 struct ffAli *left, *mid, *right;
01354 int orient;
01355 boolean creeped = FALSE;
01356
01357 if (ffAliCount(ali) < 3)
01358 return FALSE;
01359 orient = ffIntronOrientation(ali);
01360 left = ali;
01361 mid = left->right;
01362 right = mid->right;
01363 while (right != NULL)
01364 {
01365 int midSizeN = mid->nEnd - mid->nStart;
01366 if (midSizeN < 10 && mid->hStart - left->hEnd > 1 && right->hStart - mid->hEnd > 1)
01367 {
01368 DNA *spLeft, *spRight;
01369 DNA exonX[10+2+2];
01370 DNA *match;
01371 static int creeps[4][2] = { {2, 2}, {2, 1}, {1, 2}, {1, 1},};
01372 int creepIx, creepL, creepR;
01373 DNA *hs = mid->hStart, *he = mid->hEnd;
01374 DNA *hMin = left->hEnd, *hMax = right->hStart;
01375 if (orient >= 0)
01376 {
01377 spLeft = "ag";
01378 spRight = "gt";
01379 }
01380 else
01381 {
01382 spLeft = "ac";
01383 spRight = "ct";
01384 }
01385 for (creepIx=0; creepIx<4; ++creepIx)
01386 {
01387 creepL = creeps[creepIx][0];
01388 creepR = creeps[creepIx][1];
01389
01390 if (hs[-1] == spLeft[1] && he[0] == spRight[0])
01391 {
01392 if ((creepL == 1 || hs[-2] == spLeft[0])
01393 && (creepR == 1 || he[1] == spRight[1]))
01394 {
01395 break;
01396 }
01397 }
01398 memcpy(exonX, spLeft + 2 - creepL, creepL);
01399 memcpy(exonX + creepL, mid->nStart, midSizeN);
01400 memcpy(exonX + creepL + midSizeN, spRight, creepR);
01401 match = memMatch(exonX, midSizeN + creepR + creepL, hMin, hMax - hMin);
01402 if (match != NULL)
01403 {
01404 mid->hStart = match + creepL;
01405 mid->hEnd = mid->hStart + (he - hs);
01406 creeped = TRUE;
01407 break;
01408 }
01409 }
01410 }
01411 left = mid;
01412 mid = right;
01413 right = right->right;
01414 }
01415 if (creeped)
01416 ffSlideIntrons(ali);
01417 return creeped;
01418 }
01419
01420 static struct ffAli *refineSmallExons(struct ffAli *ff,
01421 struct dnaSeq *nSeq, struct dnaSeq *hSeq)
01422
01423
01424
01425 {
01426 if (jiggleSmallExons(ff, nSeq, hSeq))
01427 ff = ffRemoveEmptyAlis(ff, TRUE);
01428 return ff;
01429 }
01430
01431 static void refineSmallExonsInBundle(struct ssBundle *bun)
01432
01433
01434
01435 {
01436 struct ssFfItem *fi;
01437
01438 for (fi = bun->ffList; fi != NULL; fi = fi->next)
01439 {
01440 fi->ff = refineSmallExons(fi->ff, bun->qSeq, bun->genoSeq);
01441 }
01442 }
01443
01444 #ifdef DEBUG
01445 void dumpBunList(struct ssBundle *bunList)
01446
01447 {
01448 int totalItems = 0;
01449 int totalBlocks = 0;
01450 int totalBases = 0;
01451 struct ssBundle *bun;
01452 for (bun = bunList; bun != NULL; bun = bun->next)
01453 {
01454 struct ssFfItem *item;
01455 for (item = bun->ffList; item != NULL; item = item->next)
01456 {
01457 printf("item: ");
01458 struct ffAli *ff;
01459 for (ff = item->ff; ff != NULL; ff = ff->right)
01460 {
01461 totalBases += ff->hEnd - ff->hStart;
01462 totalBlocks += 1;
01463 printf("%d,", ff->hEnd - ff->hStart);
01464 }
01465 printf("\n");
01466 totalItems += 1;
01467 }
01468 }
01469 printf("total bundles %d, alignments %d, blocks %d, bases %d\n",
01470 slCount(bunList), totalItems, totalBlocks, totalBases);
01471 }
01472 #endif
01473
01474 static void gfAlignSomeClumps(struct genoFind *gf, struct gfClump *clumpList,
01475 bioSeq *seq, boolean isRc, int minMatch,
01476 struct gfOutput *out, boolean isProt, enum ffStringency stringency);
01477
01478 void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf,
01479 boolean isRc, int minScore, Bits *qMaskBits,
01480 struct gfOutput *out, boolean fastMap, boolean band)
01481
01482
01483 {
01484 int hitCount;
01485 int maxSize = 5000;
01486 int preferredSize = 4500;
01487 int overlapSize = 250;
01488 struct dnaSeq subQuery = *query;
01489 struct lm *lm = lmInit(0);
01490 int subOffset, subSize, nextOffset;
01491 DNA saveEnd, *endPos;
01492 struct ssBundle *oneBunList = NULL, *bigBunList = NULL, *bun;
01493 struct hash *bunHash = newHash(8);
01494
01495 for (subOffset = 0; subOffset<query->size; subOffset = nextOffset)
01496 {
01497 struct gfClump *clumpList;
01498 struct gfRange *rangeList = NULL;
01499
01500
01501
01502
01503
01504 if (subOffset == 0 && query->size <= maxSize)
01505 nextOffset = subSize = query->size;
01506 else
01507 {
01508 subSize = preferredSize;
01509 if (subSize + subOffset >= query->size)
01510 {
01511 subSize = query->size - subOffset;
01512 nextOffset = query->size;
01513 }
01514 else
01515 {
01516 nextOffset = subOffset + preferredSize - overlapSize;
01517 }
01518 }
01519 subQuery.dna = query->dna + subOffset;
01520 subQuery.size = subSize;
01521 endPos = &subQuery.dna[subSize];
01522 saveEnd = *endPos;
01523 *endPos = 0;
01524 if (band)
01525 {
01526 oneBunList = ffSeedExtInMem(gf, &subQuery, qMaskBits, subOffset, lm, minScore, isRc);
01527 }
01528 else
01529 {
01530 clumpList = gfFindClumpsWithQmask(gf, &subQuery, qMaskBits, subOffset, lm, &hitCount);
01531 if (fastMap)
01532 {
01533 oneBunList = fastMapClumpsToBundles(gf, clumpList, &subQuery);
01534 }
01535 else
01536 {
01537 oneBunList = gfClumpsToBundles(clumpList, isRc, &subQuery, minScore, &rangeList);
01538 gfRangeFreeList(&rangeList);
01539 }
01540 gfClumpFreeList(&clumpList);
01541 }
01542 addToBigBundleList(&oneBunList, bunHash, &bigBunList, query);
01543 *endPos = saveEnd;
01544 }
01545 #ifdef DEBUG
01546 dumpBunList(bigBunList);
01547 #endif
01548 for (bun = bigBunList; bun != NULL; bun = bun->next)
01549 {
01550 ssStitch(bun, ffCdna, minScore, ssAliCount);
01551 if (!fastMap && !band)
01552 refineSmallExonsInBundle(bun);
01553 saveAlignments(bun->genoSeq->name, bun->genoSeq->size, 0,
01554 bun, NULL, isRc, FALSE, ffCdna, minScore, out);
01555 }
01556 ssBundleFreeList(&bigBunList);
01557 freeHash(&bunHash);
01558 lmCleanup(&lm);
01559 }
01560
01561
01562 void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3],
01563 struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna,
01564 int minScore, struct gfOutput *out)
01565
01566
01567 {
01568 enum ffStringency stringency = (qIsRna ? ffCdna : ffLoose);
01569 int maxSize = 1500;
01570 int preferredSize = 1200;
01571 int overlapSize = 270;
01572 struct dnaSeq subQuery = *query;
01573 int subOffset, subSize, nextOffset;
01574 DNA saveEnd, *endPos;
01575 struct ssBundle *oneBunList = NULL, *bigBunList = NULL, *bun;
01576 struct hash *bunHash = newHash(8);
01577
01578 for (subOffset = 0; subOffset<query->size; subOffset = nextOffset)
01579 {
01580
01581
01582
01583
01584 if (subOffset == 0 && query->size <= maxSize)
01585 nextOffset = subSize = query->size;
01586 else
01587 {
01588 subSize = preferredSize;
01589 if (subSize + subOffset >= query->size)
01590 {
01591 subSize = query->size - subOffset;
01592 nextOffset = query->size;
01593 }
01594 else
01595 {
01596 nextOffset = subOffset + preferredSize - overlapSize;
01597 }
01598 }
01599 subQuery.dna = query->dna + subOffset;
01600 subQuery.size = subSize;
01601 endPos = &subQuery.dna[subSize];
01602 saveEnd = *endPos;
01603 *endPos = 0;
01604 oneBunList = gfTransTransFindBundles(gfs, &subQuery, t3Hash, qIsRc, minScore, qIsRna);
01605 addToBigBundleList(&oneBunList, bunHash, &bigBunList, query);
01606 *endPos = saveEnd;
01607 }
01608 for (bun = bigBunList; bun != NULL; bun = bun->next)
01609 {
01610 ssStitch(bun, ffCdna, minScore, ssAliCount);
01611 saveAlignments(bun->genoSeq->name, bun->genoSeq->size, 0,
01612 bun, NULL, qIsRc, tIsRc, stringency, minScore, out);
01613 }
01614 hashFree(&bunHash);
01615 ssBundleFreeList(&bigBunList);
01616 }
01617