00001
00002
00003
00004 #include "common.h"
00005 #include <signal.h>
00006 #include "obscure.h"
00007 #include "dnautil.h"
00008 #include "dnaseq.h"
00009 #include "nib.h"
00010 #include "twoBit.h"
00011 #include "fa.h"
00012 #include "dystring.h"
00013 #include "errabort.h"
00014 #include "sig.h"
00015 #include "ooc.h"
00016 #include "genoFind.h"
00017 #include "trans3.h"
00018 #include "binRange.h"
00019
00020 static char const rcsid[] = "$Id: genoFind.c,v 1.23 2007/02/04 05:09:02 kent Exp $";
00021
00022 char *gfSignature()
00023
00024
00025 {
00026 static char signature[] = "0ddf270562684f29";
00027 return signature;
00028 }
00029
00030 volatile boolean pipeBroke = FALSE;
00031
00032 static void gfPipeHandler(int sigNum)
00033
00034 {
00035 pipeBroke = TRUE;
00036 }
00037
00038 void gfCatchPipes()
00039
00040 {
00041 signal(SIGPIPE, gfPipeHandler);
00042 }
00043
00044 int gfReadMulti(int sd, void *vBuf, size_t size)
00045
00046 {
00047 char *buf = vBuf;
00048 size_t totalRead = 0;
00049 int oneRead;
00050
00051 while (totalRead < size)
00052 {
00053 oneRead = read(sd, buf + totalRead, size - totalRead);
00054 if (oneRead < 0)
00055 {
00056 perror("Couldn't finish large read");
00057 return oneRead;
00058 }
00059 else if (oneRead == 0)
00060
00061 break;
00062 totalRead += oneRead;
00063 }
00064 return totalRead;
00065 }
00066
00067 void genoFindFree(struct genoFind **pGenoFind)
00068
00069 {
00070 struct genoFind *gf = *pGenoFind;
00071 int i;
00072 struct gfSeqSource *sources;
00073 if (gf != NULL)
00074 {
00075 freeMem(gf->lists);
00076 freeMem(gf->listSizes);
00077 freeMem(gf->allocated);
00078 if ((sources = gf->sources) != NULL)
00079 {
00080 for (i=0; i<gf->sourceCount; ++i)
00081 bitFree(&sources[i].maskedBits);
00082 freeMem(sources);
00083 }
00084 freez(pGenoFind);
00085 }
00086 }
00087
00088 int gfPowerOf20(int n)
00089
00090 {
00091 int res = 20;
00092 while (--n > 0)
00093 res *= 20;
00094 return res;
00095 }
00096
00097 void gfCheckTileSize(int tileSize, boolean isPep)
00098
00099 {
00100 if (isPep)
00101 {
00102 if (tileSize < 3 || tileSize > 8)
00103 errAbort("protein tileSize must be between 3 and 8");
00104 }
00105 else
00106 {
00107 if (tileSize < 6 || tileSize > 18)
00108 errAbort("DNA tileSize must be between 6 and 18");
00109 }
00110 }
00111
00112 static struct genoFind *gfNewEmpty(int minMatch, int maxGap,
00113 int tileSize, int stepSize, int maxPat,
00114 char *oocFile, boolean isPep, boolean allowOneMismatch)
00115
00116 {
00117 struct genoFind *gf;
00118 int tileSpaceSize;
00119 int segSize = 0;
00120
00121 gfCheckTileSize(tileSize, isPep);
00122 if (stepSize == 0)
00123 stepSize = tileSize;
00124 AllocVar(gf);
00125 if (isPep)
00126 {
00127 if (tileSize > 5)
00128 {
00129 segSize = tileSize - 5;
00130 }
00131 tileSpaceSize = gf->tileSpaceSize = gfPowerOf20(tileSize-segSize);
00132 }
00133 else
00134 {
00135 int seedBitSize;
00136 if (tileSize > 12)
00137 {
00138 segSize = tileSize - 12;
00139 }
00140 seedBitSize = (tileSize-segSize)*2;
00141 tileSpaceSize = gf->tileSpaceSize = (1<<seedBitSize);
00142 gf->tileMask = tileSpaceSize-1;
00143 }
00144 gf->segSize = segSize;
00145 gf->tileSize = tileSize;
00146 gf->stepSize = stepSize;
00147 gf->isPep = isPep;
00148 gf->allowOneMismatch = allowOneMismatch;
00149 if (segSize > 0)
00150 {
00151 gf->endLists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->endLists[0]));
00152 maxPat = BIGNUM;
00153
00154 }
00155 else
00156 {
00157 gf->lists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->lists[0]));
00158 }
00159 gf->listSizes = needHugeZeroedMem(tileSpaceSize * sizeof(gf->listSizes[0]));
00160 gf->minMatch = minMatch;
00161 gf->maxGap = maxGap;
00162 gf->maxPat = maxPat;
00163 if (oocFile != NULL)
00164 {
00165 if (segSize > 0)
00166 errAbort("Don't yet support ooc on large tile sizes");
00167 oocMaskCounts(oocFile, gf->listSizes, tileSize, maxPat);
00168 oocMaskSimpleRepeats(gf->listSizes, tileSize, maxPat);
00169 }
00170 return gf;
00171 }
00172
00173 static int ntLookup[256];
00174
00175 static void initNtLookup()
00176 {
00177 static boolean initted = FALSE;
00178 if (!initted)
00179 {
00180 int i;
00181 for (i=0; i<ArraySize(ntLookup); ++i)
00182 ntLookup[i] = -1;
00183 ntLookup['a'] = A_BASE_VAL;
00184 ntLookup['c'] = C_BASE_VAL;
00185 ntLookup['t'] = T_BASE_VAL;
00186 ntLookup['g'] = G_BASE_VAL;
00187 initted = TRUE;
00188 }
00189 }
00190
00191 int gfDnaTile(DNA *dna, int n)
00192
00193 {
00194 int tile = 0;
00195 int i, c;
00196 for (i=0; i<n; ++i)
00197 {
00198 tile <<= 2;
00199 if ((c = ntLookup[(int)dna[i]]) < 0)
00200 return -1;
00201 tile += c;
00202 }
00203 return tile;
00204 }
00205
00206 int gfPepTile(AA *pep, int n)
00207
00208 {
00209 int tile = 0;
00210 int aa;
00211 while (--n >= 0)
00212 {
00213 tile *= 20;
00214 aa = aaVal[(int)(*pep++)];
00215 if (aa < 0)
00216 return -1;
00217 tile += aa;
00218 }
00219 return tile;
00220 }
00221
00222 static void gfCountSeq(struct genoFind *gf, bioSeq *seq)
00223
00224 {
00225 char *poly = seq->dna;
00226 int tileSize = gf->tileSize;
00227 int stepSize = gf->stepSize;
00228 int tileHeadSize = gf->tileSize - gf->segSize;
00229 int maxPat = gf->maxPat;
00230 int tile;
00231 bits32 *listSizes = gf->listSizes;
00232 int i, lastTile = seq->size - tileSize;
00233 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00234
00235 initNtLookup();
00236 for (i=0; i<=lastTile; i += stepSize)
00237 {
00238 if ((tile = makeTile(poly, tileHeadSize)) >= 0)
00239 {
00240 if (listSizes[tile] < maxPat)
00241 {
00242 listSizes[tile] += 1;
00243 }
00244 }
00245 poly += stepSize;
00246 }
00247 }
00248
00249 static int gfCountTilesInNib(struct genoFind *gf, int stepSize, char *fileName)
00250
00251 {
00252 FILE *f;
00253 int tileSize = gf->tileSize;
00254 int bufSize = tileSize * 16*1024;
00255 int nibSize, i;
00256 int endBuf, basesInBuf;
00257 struct dnaSeq *seq;
00258
00259 printf("Counting tiles in %s\n", fileName);
00260 nibOpenVerify(fileName, &f, &nibSize);
00261 for (i=0; i < nibSize; i = endBuf)
00262 {
00263 endBuf = i + bufSize;
00264 if (endBuf >= nibSize) endBuf = nibSize;
00265 basesInBuf = endBuf - i;
00266 seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00267 gfCountSeq(gf, seq);
00268 freeDnaSeq(&seq);
00269 }
00270 fclose(f);
00271 return nibSize;
00272 }
00273
00274 long long maxTotalBases()
00275
00276 {
00277 long long maxBases = 1024*1024;
00278 maxBases *= 4*1024;
00279 return maxBases;
00280 }
00281
00282 static long long twoBitCheckTotalSize(struct twoBitFile *tbf)
00283
00284
00285 {
00286 long long totalSize = twoBitTotalSize(tbf);
00287 if (totalSize > maxTotalBases())
00288 errAbort("Sorry, can only index up to %lld bases, %s has %lld",
00289 maxTotalBases(), tbf->fileName, totalSize);
00290 return totalSize;
00291 }
00292
00293 static void gfCountTilesInTwoBit(struct genoFind *gf, int stepSize,
00294 char *fileName, int *retSeqCount, long long *retBaseCount)
00295
00296
00297 {
00298 struct dnaSeq *seq;
00299 struct twoBitFile *tbf = twoBitOpen(fileName);
00300 struct twoBitIndex *index;
00301 int seqCount = 0;
00302 long long baseCount = twoBitCheckTotalSize(tbf);
00303
00304 printf("Counting tiles in %s\n", fileName);
00305 for (index = tbf->indexList; index != NULL; index = index->next)
00306 {
00307 seq = twoBitReadSeqFragLower(tbf, index->name, 0, 0);
00308 gfCountSeq(gf, seq);
00309 ++seqCount;
00310 freeDnaSeq(&seq);
00311 }
00312 twoBitClose(&tbf);
00313 *retSeqCount = seqCount;
00314 *retBaseCount = baseCount;
00315 }
00316
00317 static int gfAllocLists(struct genoFind *gf)
00318
00319
00320 {
00321 int oneCount;
00322 int count = 0;
00323 int i;
00324 bits32 *listSizes = gf->listSizes;
00325 bits32 **lists = gf->lists;
00326 bits32 *allocated = NULL;
00327 bits32 maxPat = gf->maxPat;
00328 int size;
00329 int usedCount = 0, overusedCount = 0;
00330 int tileSpaceSize = gf->tileSpaceSize;
00331
00332 for (i=0; i<tileSpaceSize; ++i)
00333 {
00334
00335 if ((oneCount = listSizes[i]) < maxPat)
00336 {
00337 count += oneCount;
00338 usedCount += 1;
00339 }
00340 else
00341 {
00342 overusedCount += 1;
00343 }
00344 }
00345 if (count > 0)
00346 gf->allocated = allocated = needHugeMem(count*sizeof(allocated[0]));
00347 for (i=0; i<tileSpaceSize; ++i)
00348 {
00349 if ((size = listSizes[i]) < maxPat)
00350 {
00351 lists[i] = allocated;
00352 allocated += size;
00353 }
00354 }
00355 return count;
00356 }
00357
00358 static int gfAllocLargeLists(struct genoFind *gf)
00359
00360
00361 {
00362 int count = 0;
00363 int i;
00364 bits32 *listSizes = gf->listSizes;
00365 bits16 **endLists = gf->endLists;
00366 bits16 *allocated = NULL;
00367 int size;
00368 int tileSpaceSize = gf->tileSpaceSize;
00369
00370 for (i=0; i<tileSpaceSize; ++i)
00371 count += listSizes[i];
00372 if (count > 0)
00373 gf->allocated = allocated = needHugeMem(3*count*sizeof(allocated[0]));
00374 for (i=0; i<tileSpaceSize; ++i)
00375 {
00376 size = listSizes[i];
00377 endLists[i] = allocated;
00378 allocated += 3*size;
00379 }
00380 return count;
00381 }
00382
00383
00384 static void gfAddSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
00385
00386 {
00387 char *poly = seq->dna;
00388 int tileSize = gf->tileSize;
00389 int stepSize = gf->stepSize;
00390 int i, lastTile = seq->size - tileSize;
00391 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00392 int maxPat = gf->maxPat;
00393 int tile;
00394 bits32 *listSizes = gf->listSizes;
00395 bits32 **lists = gf->lists;
00396
00397 initNtLookup();
00398 for (i=0; i<=lastTile; i += stepSize)
00399 {
00400 tile = makeTile(poly, tileSize);
00401 if (tile >= 0)
00402 {
00403 if (listSizes[tile] < maxPat)
00404 {
00405 lists[tile][listSizes[tile]++] = offset;
00406 }
00407 }
00408 offset += stepSize;
00409 poly += stepSize;
00410 }
00411 }
00412
00413 static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
00414
00415 {
00416 char *poly = seq->dna;
00417 int tileSize = gf->tileSize;
00418 int stepSize = gf->stepSize;
00419 int tileTailSize = gf->segSize;
00420 int tileHeadSize = tileSize - tileTailSize;
00421 int i, lastTile = seq->size - tileSize;
00422 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00423 int tileHead;
00424 int tileTail;
00425 bits32 *listSizes = gf->listSizes;
00426 bits16 **endLists = gf->endLists;
00427 bits16 *endList;
00428 int headCount;
00429
00430 initNtLookup();
00431 for (i=0; i<=lastTile; i += stepSize)
00432 {
00433 tileHead = makeTile(poly, tileHeadSize);
00434 tileTail = makeTile(poly + tileHeadSize, tileTailSize);
00435 if (tileHead >= 0 && tileTail >= 0)
00436 {
00437 endList = endLists[tileHead];
00438 headCount = listSizes[tileHead]++;
00439 endList += headCount * 3;
00440 endList[0] = tileTail;
00441 endList[1] = (offset >> 16);
00442 endList[2] = (offset&0xffff);
00443 }
00444 offset += stepSize;
00445 poly += stepSize;
00446 }
00447 }
00448
00449
00450
00451 static int gfAddTilesInNib(struct genoFind *gf, char *fileName, bits32 offset,
00452 int stepSize)
00453
00454 {
00455 FILE *f;
00456 int tileSize = gf->tileSize;
00457 int bufSize = tileSize * 16*1024;
00458 int nibSize, i;
00459 int endBuf, basesInBuf;
00460 struct dnaSeq *seq;
00461
00462 printf("Adding tiles in %s\n", fileName);
00463 nibOpenVerify(fileName, &f, &nibSize);
00464 for (i=0; i < nibSize; i = endBuf)
00465 {
00466 endBuf = i + bufSize;
00467 if (endBuf >= nibSize) endBuf = nibSize;
00468 basesInBuf = endBuf - i;
00469 seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00470 gfAddSeq(gf, seq, i + offset);
00471 freeDnaSeq(&seq);
00472 }
00473 fclose(f);
00474 return nibSize;
00475 }
00476
00477
00478 static void gfZeroOverused(struct genoFind *gf)
00479
00480 {
00481 bits32 *sizes = gf->listSizes;
00482 int tileSpaceSize = gf->tileSpaceSize, i;
00483 int maxPat = gf->maxPat;
00484 int overCount = 0;
00485
00486 for (i=0; i<tileSpaceSize; ++i)
00487 {
00488 if (sizes[i] >= maxPat)
00489 {
00490 sizes[i] = 0;
00491 ++overCount;
00492 }
00493 }
00494 }
00495
00496 static void gfZeroNonOverused(struct genoFind *gf)
00497
00498 {
00499 bits32 *sizes = gf->listSizes;
00500 int tileSpaceSize = gf->tileSpaceSize, i;
00501 int maxPat = gf->maxPat;
00502 int overCount = 0;
00503
00504 for (i=0; i<tileSpaceSize; ++i)
00505 {
00506 if (sizes[i] < maxPat)
00507 {
00508 sizes[i] = 0;
00509 ++overCount;
00510 }
00511 }
00512 }
00513
00514 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
00515 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00516 boolean allowOneMismatch, int stepSize)
00517
00518
00519
00520
00521
00522
00523
00524
00525 {
00526 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize,
00527 maxPat, oocFile, FALSE, allowOneMismatch);
00528 int i;
00529 bits32 offset = 0, nibSize;
00530 char *fileName;
00531 struct gfSeqSource *ss;
00532 long long totalBases = 0, warnAt = maxTotalBases();
00533 int totalSeq = 0;
00534
00535 if (allowOneMismatch)
00536 errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits");
00537 if (stepSize == 0)
00538 stepSize = gf->tileSize;
00539 for (i=0; i<fileCount; ++i)
00540 {
00541 fileName = fileNames[i];
00542 if (twoBitIsFile(fileName))
00543 {
00544 int seqCount;
00545 long long baseCount;
00546 gfCountTilesInTwoBit(gf, stepSize, fileName, &seqCount, &baseCount);
00547 totalBases += baseCount;
00548 totalSeq += seqCount;
00549 }
00550 else if (nibIsFile(fileName))
00551 {
00552 totalBases += gfCountTilesInNib(gf, stepSize, fileName);
00553 totalSeq += 1;
00554 }
00555 else
00556 errAbort("Unrecognized file type %s", fileName);
00557
00558 if (totalBases >= warnAt)
00559 errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00560 }
00561 gfAllocLists(gf);
00562 gfZeroNonOverused(gf);
00563 AllocArray(gf->sources, totalSeq);
00564 gf->sourceCount = totalSeq;
00565 ss = gf->sources;
00566 for (i=0; i<fileCount; ++i)
00567 {
00568 fileName = fileNames[i];
00569 if (nibIsFile(fileName))
00570 {
00571 nibSize = gfAddTilesInNib(gf, fileName, offset, stepSize);
00572 ss->fileName = fileName;
00573 ss->start = offset;
00574 offset += nibSize;
00575 ss->end = offset;
00576 ++ss;
00577 }
00578 else
00579 {
00580 struct twoBitFile *tbf = twoBitOpen(fileName);
00581 struct twoBitIndex *index;
00582 char nameBuf[PATH_LEN+256];
00583 for (index = tbf->indexList; index != NULL; index = index->next)
00584 {
00585 struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, index->name, 0,0);
00586 gfAddSeq(gf, seq, offset);
00587 safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00588 ss->fileName = cloneString(nameBuf);
00589 ss->start = offset;
00590 offset += seq->size;
00591 ss->end = offset;
00592 ++ss;
00593 dnaSeqFree(&seq);
00594 }
00595 twoBitClose(&tbf);
00596 }
00597 }
00598 gf->totalSeqSize = offset;
00599 gfZeroOverused(gf);
00600 printf("Done adding\n");
00601 return gf;
00602 }
00603
00604 static void maskSimplePepRepeat(struct genoFind *gf)
00605
00606
00607 {
00608 int i;
00609 int tileSize = gf->tileSize;
00610 int maxPat = gf->maxPat;
00611 bits32 *listSizes = gf->listSizes;
00612
00613 for (i=0; i<20; ++i)
00614 {
00615 int j, k;
00616 for (j=0; j<20; ++j)
00617 {
00618 int tile = 0;
00619 for (k=0; k<tileSize; ++k)
00620 {
00621 tile *= 20;
00622 if (k&1)
00623 tile += j;
00624 else
00625 tile += i;
00626 }
00627 listSizes[tile] = maxPat;
00628 }
00629 }
00630 }
00631
00632 struct dnaSeq *readMaskedNib(char *fileName, boolean mask)
00633
00634
00635 {
00636 struct dnaSeq *seq;
00637 if (mask)
00638 {
00639 seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00640 toggleCase(seq->dna, seq->size);
00641 upperToN(seq->dna, seq->size);
00642 }
00643 else
00644 seq = nibLoadAll(fileName);
00645 return seq;
00646 }
00647
00648 struct dnaSeq *readMaskedTwoBit(struct twoBitFile *tbf, char *seqName,
00649 boolean mask)
00650
00651
00652 {
00653 struct dnaSeq *seq;
00654 if (mask)
00655 {
00656 seq = twoBitReadSeqFrag(tbf, seqName, 0, 0);
00657 toggleCase(seq->dna, seq->size);
00658 upperToN(seq->dna, seq->size);
00659 }
00660 else
00661 seq = twoBitReadSeqFragLower(tbf, seqName, 0, 0);
00662 return seq;
00663 }
00664
00665
00666 static void transCountBothStrands(struct dnaSeq *seq,
00667 struct genoFind *transGf[2][3])
00668
00669
00670 {
00671 int isRc, frame;
00672 struct trans3 *t3;
00673 for (isRc=0; isRc <= 1; ++isRc)
00674 {
00675 if (isRc)
00676 reverseComplement(seq->dna, seq->size);
00677 t3 = trans3New(seq);
00678 for (frame = 0; frame < 3; ++frame)
00679 {
00680 struct genoFind *gf = transGf[isRc][frame];
00681 gfCountSeq(gf, t3->trans[frame]);
00682 }
00683 trans3Free(&t3);
00684 }
00685 }
00686
00687 static void transIndexBothStrands(struct dnaSeq *seq,
00688 struct genoFind *transGf[2][3], bits32 offset[2][3],
00689 int sourceIx, char *fileName)
00690
00691
00692 {
00693 int isRc, frame;
00694 struct trans3 *t3;
00695 struct gfSeqSource *ss;
00696 for (isRc=0; isRc <= 1; ++isRc)
00697 {
00698 if (isRc)
00699 {
00700 reverseComplement(seq->dna, seq->size);
00701 }
00702 t3 = trans3New(seq);
00703 for (frame = 0; frame < 3; ++frame)
00704 {
00705 struct genoFind *gf = transGf[isRc][frame];
00706 ss = gf->sources + sourceIx;
00707 gfAddSeq(gf, t3->trans[frame], offset[isRc][frame]);
00708 ss->fileName = cloneString(fileName);
00709 ss->start = offset[isRc][frame];
00710 offset[isRc][frame] += t3->trans[frame]->size;
00711 ss->end = offset[isRc][frame];
00712 }
00713 trans3Free(&t3);
00714 }
00715 }
00716
00717 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3],
00718 int fileCount, char *fileNames[],
00719 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00720 boolean allowOneMismatch, boolean doMask, int stepSize)
00721
00722 {
00723 struct genoFind *gf;
00724 int i,isRc, frame;
00725 bits32 offset[2][3];
00726 char *fileName;
00727 struct dnaSeq *seq;
00728 int sourceCount = 0;
00729 long long totalBases = 0, warnAt = maxTotalBases();
00730
00731 if (allowOneMismatch)
00732 errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits");
00733
00734 for (isRc=0; isRc <= 1; ++isRc)
00735 {
00736 for (frame = 0; frame < 3; ++frame)
00737 {
00738 transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap,
00739 tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch);
00740 }
00741 }
00742
00743
00744 for (isRc = 0; isRc <= 1; ++isRc)
00745 for (frame = 0; frame < 3; ++frame)
00746 maskSimplePepRepeat(transGf[isRc][frame]);
00747
00748
00749 for (i=0; i<fileCount; ++i)
00750 {
00751 fileName = fileNames[i];
00752 printf("Counting %s\n", fileName);
00753 if (nibIsFile(fileName))
00754 {
00755 seq = readMaskedNib(fileName, doMask);
00756 transCountBothStrands(seq, transGf);
00757 sourceCount += 1;
00758 totalBases += seq->size;
00759 freeDnaSeq(&seq);
00760 }
00761 else if (twoBitIsFile(fileName))
00762 {
00763 struct twoBitFile *tbf = twoBitOpen(fileName);
00764 struct twoBitIndex *index;
00765 totalBases += twoBitCheckTotalSize(tbf);
00766
00767 for (index = tbf->indexList; index != NULL; index = index->next)
00768 {
00769 seq = readMaskedTwoBit(tbf, index->name, doMask);
00770 transCountBothStrands(seq, transGf);
00771 sourceCount += 1;
00772 freeDnaSeq(&seq);
00773 }
00774 twoBitClose(&tbf);
00775 }
00776 else
00777 errAbort("Unrecognized file type %s", fileName);
00778 if (totalBases >= warnAt)
00779 errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00780 }
00781
00782
00783 for (isRc=0; isRc <= 1; ++isRc)
00784 {
00785 for (frame = 0; frame < 3; ++frame)
00786 {
00787 gf = transGf[isRc][frame];
00788 gfAllocLists(gf);
00789 gfZeroNonOverused(gf);
00790 AllocArray(gf->sources, sourceCount);
00791 gf->sourceCount = sourceCount;
00792 offset[isRc][frame] = 0;
00793 }
00794 }
00795
00796
00797 sourceCount = 0;
00798 for (i=0; i<fileCount; ++i)
00799 {
00800 fileName = fileNames[i];
00801 printf("Indexing %s\n", fileName);
00802 if (nibIsFile(fileName))
00803 {
00804 seq = readMaskedNib(fileName, doMask);
00805 transIndexBothStrands(seq, transGf, offset, sourceCount, fileName);
00806 freeDnaSeq(&seq);
00807 sourceCount += 1;
00808 }
00809 else
00810 {
00811 struct twoBitFile *tbf = twoBitOpen(fileName);
00812 struct twoBitIndex *index;
00813 for (index = tbf->indexList; index != NULL; index = index->next)
00814 {
00815 char nameBuf[PATH_LEN+256];
00816 safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00817 seq = readMaskedTwoBit(tbf, index->name, doMask);
00818 transIndexBothStrands(seq, transGf, offset, sourceCount, nameBuf);
00819 sourceCount += 1;
00820 freeDnaSeq(&seq);
00821 }
00822 twoBitClose(&tbf);
00823 }
00824 }
00825
00826 for (isRc=0; isRc <= 1; ++isRc)
00827 {
00828 for (frame = 0; frame < 3; ++frame)
00829 {
00830 gf = transGf[isRc][frame];
00831 gf->totalSeqSize = offset[isRc][frame];
00832 gfZeroOverused(gf);
00833 }
00834 }
00835 }
00836
00837 static struct genoFind *gfSmallIndexSeq(struct genoFind *gf, bioSeq *seqList,
00838 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00839 boolean isPep, boolean maskUpper)
00840
00841 {
00842 int seqCount = slCount(seqList);
00843 bioSeq *seq;
00844 int i;
00845 bits32 offset = 0;
00846 struct gfSeqSource *ss;
00847
00848 if (isPep)
00849 maskSimplePepRepeat(gf);
00850 for (seq = seqList; seq != NULL; seq = seq->next)
00851 gfCountSeq(gf, seq);
00852 gfAllocLists(gf);
00853 gfZeroNonOverused(gf);
00854 if (seqCount > 0)
00855 AllocArray(gf->sources, seqCount);
00856 gf->sourceCount = seqCount;
00857 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00858 {
00859 gfAddSeq(gf, seq, offset);
00860 ss = gf->sources+i;
00861 ss->seq = seq;
00862 ss->start = offset;
00863 offset += seq->size;
00864 ss->end = offset;
00865 if (maskUpper)
00866 ss->maskedBits = maskFromUpperCaseSeq(seq);
00867 }
00868 gf->totalSeqSize = offset;
00869 gfZeroOverused(gf);
00870 return gf;
00871 }
00872
00873 static struct genoFind *gfLargeIndexSeq(struct genoFind *gf, bioSeq *seqList,
00874 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00875 boolean isPep, boolean maskUpper)
00876
00877 {
00878 int seqCount = slCount(seqList);
00879 bioSeq *seq;
00880 int i;
00881 bits32 offset = 0;
00882 struct gfSeqSource *ss;
00883
00884 for (seq = seqList; seq != NULL; seq = seq->next)
00885 gfCountSeq(gf, seq);
00886 gfAllocLargeLists(gf);
00887 gfZeroNonOverused(gf);
00888 AllocArray(gf->sources, seqCount);
00889 gf->sourceCount = seqCount;
00890 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00891 {
00892 gfAddLargeSeq(gf, seq, offset);
00893 ss = gf->sources+i;
00894 ss->seq = seq;
00895 ss->start = offset;
00896 offset += seq->size;
00897 ss->end = offset;
00898 if (maskUpper)
00899 ss->maskedBits = maskFromUpperCaseSeq(seq);
00900 }
00901 gf->totalSeqSize = offset;
00902 gfZeroOverused(gf);
00903 return gf;
00904 }
00905
00906
00907 struct genoFind *gfIndexSeq(bioSeq *seqList,
00908 int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00909 boolean isPep, boolean allowOneMismatch, boolean maskUpper,
00910 int stepSize)
00911
00912
00913 {
00914 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat,
00915 oocFile, isPep, allowOneMismatch);
00916 if (stepSize == 0)
00917 stepSize = tileSize;
00918 if (gf->segSize > 0)
00919 {
00920 gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00921 }
00922 else
00923 {
00924 gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00925 }
00926 return gf;
00927 }
00928
00929 static int bCmpSeqSource(const void *vTarget, const void *vRange)
00930
00931 {
00932 const bits32 *pTarget = vTarget;
00933 bits32 target = *pTarget;
00934 const struct gfSeqSource *ss = vRange;
00935
00936 if (target < ss->start) return -1;
00937 if (target >= ss->end) return 1;
00938 return 0;
00939 }
00940
00941 static struct gfSeqSource *findSource(struct genoFind *gf, bits32 targetPos)
00942
00943 {
00944 struct gfSeqSource *ss = bsearch(&targetPos, gf->sources, gf->sourceCount,
00945 sizeof(gf->sources[0]), bCmpSeqSource);
00946 if (ss == NULL)
00947 errAbort("Couldn't find source for %d", targetPos);
00948 return ss;
00949 }
00950
00951 void gfClumpFree(struct gfClump **pClump)
00952
00953 {
00954 struct gfClump *clump;
00955 if ((clump = *pClump) == NULL)
00956 return;
00957 freez(pClump);
00958 }
00959
00960 void gfClumpFreeList(struct gfClump **pList)
00961
00962 {
00963 struct gfClump *el, *next;
00964
00965 for (el = *pList; el != NULL; el = next)
00966 {
00967 next = el->next;
00968 gfClumpFree(&el);
00969 }
00970 *pList = NULL;
00971 }
00972
00973 void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f)
00974
00975 {
00976 struct gfSeqSource *ss = clump->target;
00977 char *name = ss->fileName;
00978
00979 if (name == NULL) name = ss->seq->name;
00980 fprintf(f, "%u-%u %s %u-%u, hits %d\n",
00981 clump->qStart, clump->qEnd, name,
00982 clump->tStart - ss->start, clump->tEnd - ss->start,
00983 clump->hitCount);
00984 #ifdef SOMETIMES
00985 for (hit = clump->hitList; hit != NULL; hit = hit->next)
00986 fprintf(f, " q %d, t %d, diag %d\n", hit->qStart, hit->tStart, hit->diagonal);
00987 #endif
00988 }
00989
00990
00991
00992
00993
00994
00995 static void gfHitSort2(struct gfHit **ptArray, int n);
00996
00997
00998
00999 static struct gfHit **nosTemp, *nosSwap;
01000
01001 static void gfHitSort2(struct gfHit **ptArray, int n)
01002
01003
01004
01005 {
01006 struct gfHit **tmp, **pt1, **pt2;
01007 int n1, n2;
01008
01009
01010 n1 = (n>>1);
01011 n2 = n - n1;
01012 pt1 = ptArray;
01013 pt2 = ptArray + n1;
01014
01015
01016
01017 if (n1 > 2)
01018 gfHitSort2(pt1, n1);
01019 else if (n1 == 2 && pt1[0]->diagonal > pt1[1]->diagonal)
01020 {
01021 nosSwap = pt1[1];
01022 pt1[1] = pt1[0];
01023 pt1[0] = nosSwap;
01024 }
01025 if (n2 > 2)
01026 gfHitSort2(pt2, n2);
01027 else if (n2 == 2 && pt2[0]->diagonal > pt2[1]->diagonal)
01028 {
01029 nosSwap = pt2[1];
01030 pt2[1] = pt2[0];
01031 pt2[0] = nosSwap;
01032 }
01033
01034
01035
01036
01037 tmp = nosTemp;
01038 while (n1 > 0 && n2 > 0)
01039 {
01040 if ((*pt1)->diagonal <= (*pt2)->diagonal)
01041 {
01042 --n1;
01043 *tmp++ = *pt1++;
01044 }
01045 else
01046 {
01047 --n2;
01048 *tmp++ = *pt2++;
01049 }
01050 }
01051
01052 assert(tmp + n1 + n2 == nosTemp + n);
01053
01054
01055 if (n1 > 0)
01056 memcpy(tmp, pt1, n1 * sizeof(*tmp));
01057
01058
01059
01060 memcpy(ptArray, nosTemp, (n - n2) * sizeof(*ptArray));
01061 }
01062
01063 void gfHitSortDiagonal(struct gfHit **pList)
01064
01065 {
01066 struct gfHit *list = *pList;
01067 if (list != NULL && list->next != NULL)
01068 {
01069 int count = slCount(list);
01070 struct gfHit *el;
01071 struct gfHit **array;
01072 int i;
01073 int byteSize = count*sizeof(*array);
01074 array = needLargeMem(byteSize);
01075 nosTemp = needLargeMem(byteSize);
01076 for (el = list, i=0; el != NULL; el = el->next, i++)
01077 array[i] = el;
01078 gfHitSort2(array, count);
01079 list = NULL;
01080 for (i=0; i<count; ++i)
01081 {
01082 array[i]->next = list;
01083 list = array[i];
01084 }
01085 freez(&array);
01086 freez(&nosTemp);
01087 slReverse(&list);
01088 *pList = list;
01089 }
01090 }
01091
01092
01093
01094 static int cmpQuerySize;
01095
01096 #ifdef UNUSED
01097 static int gfHitCmpDiagonal(const void *va, const void *vb)
01098
01099 {
01100 const struct gfHit *a = *((struct gfHit **)va);
01101 const struct gfHit *b = *((struct gfHit **)vb);
01102
01103 if (a->diagonal > b->diagonal)
01104 return 1;
01105 else if (a->diagonal == b->diagonal)
01106 return 0;
01107 else
01108 return -1;
01109 }
01110 #endif
01111
01112 static int gfHitCmpTarget(const void *va, const void *vb)
01113
01114 {
01115 const struct gfHit *a = *((struct gfHit **)va);
01116 const struct gfHit *b = *((struct gfHit **)vb);
01117
01118 if (a->tStart > b->tStart)
01119 return 1;
01120 else if (a->tStart == b->tStart)
01121 return 0;
01122 else
01123 return -1;
01124 }
01125
01126 static int gfHitCmpQuery(const void *va, const void *vb)
01127
01128 {
01129 const struct gfHit *a = *((struct gfHit **)va);
01130 const struct gfHit *b = *((struct gfHit **)vb);
01131
01132 if (a->qStart > b->qStart)
01133 return 1;
01134 else if (a->qStart == b->qStart)
01135 return 0;
01136 else
01137 return -1;
01138 }
01139
01140 static void gfClumpComputeQueryCoverage(struct gfClump *clumpList,
01141 int tileSize)
01142
01143 {
01144 struct gfClump *clump;
01145 struct gfHit *hit;
01146 struct gfHit *hitList;
01147 int qcov;
01148 int blockStart, blockEnd;
01149
01150 for (clump = clumpList; clump != NULL; clump = clump->next)
01151 {
01152 hitList = clump->hitList;
01153 if ( hitList != NULL)
01154 {
01155 slSort(&hitList, gfHitCmpQuery);
01156 qcov = 0;
01157 blockStart = hitList->qStart;
01158 blockEnd = blockStart + tileSize;
01159 for (hit = hitList; hit != NULL; hit = hit->next)
01160 {
01161 if (hit->qStart > blockEnd)
01162 {
01163 qcov += blockEnd - blockStart;
01164 blockStart = hit->qStart;
01165 blockEnd = blockStart + tileSize;
01166 }
01167 else if (hit->qStart + tileSize > blockEnd)
01168 {
01169 blockEnd = hit->qStart + tileSize;
01170 }
01171 }
01172 qcov += blockEnd - blockStart;
01173 clump->queryCoverage = qcov;
01174 }
01175 }
01176 }
01177
01178
01179
01180 static int gfClumpCmpQueryCoverage(const void *va, const void *vb)
01181
01182 {
01183 const struct gfClump *a = *((struct gfClump **)va);
01184 const struct gfClump *b = *((struct gfClump **)vb);
01185
01186 return (b->queryCoverage - a->queryCoverage);
01187 }
01188
01189 static void findClumpBounds(struct gfClump *clump, int tileSize)
01190
01191 {
01192 struct gfHit *hit;
01193 int x;
01194 hit = clump->hitList;
01195 if (hit == NULL)
01196 return;
01197 clump->qStart = clump->qEnd = hit->qStart;
01198 clump->tStart = clump->tEnd = hit->tStart;
01199 for (hit = hit->next; hit != NULL; hit = hit->next)
01200 {
01201 x = hit->qStart;
01202 if (x < clump->qStart) clump->qStart = x;
01203 if (x > clump->qEnd) clump->qEnd = x;
01204 x = hit->tStart;
01205 if (x < clump->tStart) clump->tStart = x;
01206 if (x > clump->tEnd) clump->tEnd = x;
01207 }
01208 clump->tEnd += tileSize;
01209 clump->qEnd += tileSize;
01210 }
01211
01212 static void targetClump(struct genoFind *gf, struct gfClump **pClumpList, struct gfClump *clump)
01213
01214
01215 {
01216 struct gfSeqSource *ss = findSource(gf, clump->tStart);
01217 if (ss->end >= clump->tEnd)
01218 {
01219
01220 clump->target = ss;
01221 slAddHead(pClumpList, clump);
01222 }
01223 else
01224 {
01225
01226
01227 struct gfHit *hit, *nextHit, *inList, *outList, *oldList = clump->hitList;
01228 int hCount;
01229
01230 while (oldList != NULL)
01231 {
01232 inList = outList = NULL;
01233 hCount = 0;
01234 for (hit = oldList; hit != NULL; hit = nextHit)
01235 {
01236 nextHit = hit->next;
01237 if (ss->start <= hit->tStart && hit->tStart < ss->end)
01238 {
01239 ++hCount;
01240 slAddHead(&inList, hit);
01241 }
01242 else
01243 {
01244 slAddHead(&outList, hit);
01245 }
01246 }
01247 slReverse(&inList);
01248 slReverse(&outList);
01249 if (hCount >= gf->minMatch)
01250 {
01251 struct gfClump *newClump;
01252 AllocVar(newClump);
01253 newClump->hitList = inList;
01254 newClump->hitCount = hCount;
01255 newClump->target = ss;
01256 findClumpBounds(newClump, gf->tileSize);
01257 slAddHead(pClumpList, newClump);
01258 }
01259 else
01260 {
01261 inList = NULL;
01262 }
01263 oldList = outList;
01264 if (oldList != NULL)
01265 {
01266 ss = findSource(gf, oldList->tStart);
01267 }
01268 }
01269 clump->hitList = NULL;
01270 gfClumpFree(&clump);
01271 }
01272 }
01273
01274 static int gfNearEnough = 300;
01275
01276 static struct gfClump *clumpNear(struct genoFind *gf, struct gfClump *oldClumps, int minMatch)
01277
01278
01279 {
01280 struct gfClump *newClumps = NULL, *clump, *nextClump;
01281 struct gfHit *hit, *nextHit;
01282 int tileSize = gf->tileSize;
01283 bits32 lastT;
01284 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01285
01286 for (clump = oldClumps; clump != NULL; clump = nextClump)
01287 {
01288 struct gfHit *newHits = NULL, *oldHits = clump->hitList;
01289 int clumpSize = 0;
01290 clump->hitCount = 0;
01291 clump->hitList = NULL;
01292 nextClump = clump->next;
01293 slSort(&oldHits, gfHitCmpTarget);
01294 lastT = oldHits->tStart;
01295 for (hit = oldHits; hit != NULL; hit = nextHit)
01296 {
01297 nextHit = hit->next;
01298 if (hit->tStart > nearEnough + lastT)
01299 {
01300 if (clumpSize >= minMatch)
01301 {
01302 slReverse(&newHits);
01303 clump->hitList = newHits;
01304 newHits = NULL;
01305 clump->hitCount = clumpSize;
01306 findClumpBounds(clump, tileSize);
01307 targetClump(gf, &newClumps, clump);
01308 AllocVar(clump);
01309 }
01310 else
01311 {
01312 newHits = NULL;
01313 clump->hitCount = 0;
01314 }
01315 clumpSize = 0;
01316 }
01317 lastT = hit->tStart;
01318 slAddHead(&newHits, hit);
01319 ++clumpSize;
01320 }
01321 slReverse(&newHits);
01322 clump->hitList = newHits;
01323 if (clumpSize >= minMatch)
01324 {
01325 clump->hitCount = clumpSize;
01326 findClumpBounds(clump, tileSize);
01327 targetClump(gf, &newClumps, clump);
01328 }
01329 else
01330 {
01331 gfClumpFree(&clump);
01332 }
01333 }
01334 return newClumps;
01335 }
01336
01337 static struct gfClump *clumpHits(struct genoFind *gf, struct gfHit *hitList, int minMatch)
01338
01339 {
01340 struct gfClump *clumpList = NULL, *clump = NULL;
01341 int maxGap = gf->maxGap;
01342 struct gfHit *hit, *nextHit, *lastHit = NULL;
01343 int totalHits = 0, usedHits = 0, clumpCount = 0;
01344 int tileSize = gf->tileSize;
01345 int bucketShift = 16;
01346 bits32 bucketSize = (1<<bucketShift);
01347 int bucketCount = (gf->totalSeqSize >> bucketShift) + 1;
01348 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01349 bits32 boundary = bucketSize - nearEnough;
01350 int i;
01351 struct gfHit **buckets = NULL, **pb;
01352
01353
01354 AllocArray(buckets, bucketCount);
01355 for (hit = hitList; hit != NULL; hit = nextHit)
01356 {
01357 nextHit = hit->next;
01358 pb = buckets + (hit->tStart >> bucketShift);
01359 slAddHead(pb, hit);
01360 }
01361
01362
01363 for (i=0; i<bucketCount; ++i)
01364 {
01365 int clumpSize;
01366 bits32 maxT;
01367 struct gfHit *clumpHits;
01368 pb = buckets + i;
01369 gfHitSortDiagonal(pb);
01370 for (hit = *pb; hit != NULL; )
01371 {
01372
01373
01374 clumpSize = 0;
01375 clumpHits = lastHit = NULL;
01376 maxT = 0;
01377 for (; hit != NULL; hit = nextHit)
01378 {
01379 nextHit = hit->next;
01380 if (lastHit != NULL && hit->diagonal - lastHit->diagonal > maxGap)
01381 break;
01382 if (hit->tStart > maxT) maxT = hit->tStart;
01383 slAddHead(&clumpHits, hit);
01384 ++clumpSize;
01385 ++totalHits;
01386 lastHit = hit;
01387 }
01388 if (maxT > boundary && i < bucketCount-1)
01389 {
01390
01391
01392 buckets[i+1] = slCat(clumpHits, buckets[i+1]);
01393 }
01394 else if (clumpSize >= minMatch)
01395 {
01396
01397 AllocVar(clump);
01398 slAddHead(&clumpList, clump);
01399 clump->hitCount = clumpSize;
01400 clump->hitList = clumpHits;
01401 usedHits += clumpSize;
01402 ++clumpCount;
01403 }
01404 }
01405 *pb = NULL;
01406 boundary += bucketSize;
01407 }
01408 clumpList = clumpNear(gf, clumpList, minMatch);
01409 gfClumpComputeQueryCoverage(clumpList, tileSize);
01410 slSort(&clumpList, gfClumpCmpQueryCoverage);
01411
01412 #ifdef DEBUG
01413 uglyf("Dumping clumps B\n");
01414 for (clump = clumpList; clump != NULL; clump = clump->next)
01415 {
01416 uglyf(" %d %d %s %d %d (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name, clump->tStart, clump->tEnd, clump->hitCount);
01417 }
01418 #endif
01419 freez(&buckets);
01420 return clumpList;
01421 }
01422
01423
01424 static struct gfHit *gfFastFindDnaHits(struct genoFind *gf, struct dnaSeq *seq,
01425 Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01426 struct gfSeqSource *target, int tMin, int tMax)
01427
01428
01429 {
01430 struct gfHit *hitList = NULL, *hit;
01431 int size = seq->size;
01432 int tileSizeMinusOne = gf->tileSize - 1;
01433 int mask = gf->tileMask;
01434 DNA *dna = seq->dna;
01435 int i, j;
01436 bits32 bits = 0;
01437 bits32 bVal;
01438 int listSize;
01439 bits32 qStart, *tList;
01440 int hitCount = 0;
01441
01442 for (i=0; i<tileSizeMinusOne; ++i)
01443 {
01444 bVal = ntValNoN[(int)dna[i]];
01445 bits <<= 2;
01446 bits += bVal;
01447 }
01448 for (i=tileSizeMinusOne; i<size; ++i)
01449 {
01450 bVal = ntValNoN[(int)dna[i]];
01451 bits <<= 2;
01452 bits += bVal;
01453 bits &= mask;
01454 listSize = gf->listSizes[bits];
01455 if (listSize != 0)
01456 {
01457 qStart = i-tileSizeMinusOne;
01458 if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, gf->tileSize) == 0)
01459 {
01460 tList = gf->lists[bits];
01461 for (j=0; j<listSize; ++j)
01462 {
01463 int tStart = tList[j];
01464 if (target == NULL ||
01465 (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) )
01466 {
01467 lmAllocVar(lm, hit);
01468 hit->qStart = qStart;
01469 hit->tStart = tStart;
01470 hit->diagonal = tStart + size - qStart;
01471 slAddHead(&hitList, hit);
01472 ++hitCount;
01473 }
01474 }
01475 }
01476 }
01477 }
01478 *retHitCount = hitCount;
01479 return hitList;
01480 }
01481
01482 static struct gfHit *gfStraightFindHits(struct genoFind *gf, aaSeq *seq,
01483 Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01484 struct gfSeqSource *target, int tMin, int tMax)
01485
01486
01487 {
01488 struct gfHit *hitList = NULL, *hit;
01489 int size = seq->size;
01490 int tileSize = gf->tileSize;
01491 int lastStart = size - tileSize;
01492 char *poly = seq->dna;
01493 int i, j;
01494 int tile;
01495 int listSize;
01496 bits32 qStart, *tList;
01497 int hitCount = 0;
01498 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01499
01500 initNtLookup();
01501 for (i=0; i<=lastStart; ++i)
01502 {
01503 tile = makeTile(poly+i, tileSize);
01504 if (tile < 0)
01505 continue;
01506 listSize = gf->listSizes[tile];
01507 if (listSize > 0)
01508 {
01509 qStart = i;
01510 if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01511 {
01512 tList = gf->lists[tile];
01513 for (j=0; j<listSize; ++j)
01514 {
01515 int tStart = tList[j];
01516 if (target == NULL ||
01517 (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) )
01518 {
01519 lmAllocVar(lm,hit);
01520 hit->qStart = qStart;
01521 hit->tStart = tStart;
01522 hit->diagonal = tStart + size - qStart;
01523 slAddHead(&hitList, hit);
01524 ++hitCount;
01525 }
01526 }
01527 }
01528 }
01529 }
01530 *retHitCount = hitCount;
01531 return hitList;
01532 }
01533
01534 static struct gfHit *gfStraightFindNearHits(struct genoFind *gf, aaSeq *seq,
01535 Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01536 struct gfSeqSource *target, int tMin, int tMax)
01537
01538
01539 {
01540 struct gfHit *hitList = NULL, *hit;
01541 int size = seq->size;
01542 int tileSize = gf->tileSize;
01543 int lastStart = size - tileSize;
01544 char *poly = seq->dna;
01545 int i, j;
01546 int tile;
01547 int listSize;
01548 bits32 qStart, *tList;
01549 int hitCount = 0;
01550 int varPos, varVal;
01551 int (*makeTile)(char *poly, int n);
01552 int alphabetSize;
01553 char oldChar, zeroChar;
01554 int *seqValLookup;
01555 int posMul, avoid;
01556
01557 initNtLookup();
01558 if (gf->isPep)
01559 {
01560 makeTile = gfPepTile;
01561 alphabetSize = 20;
01562 zeroChar = 'A';
01563 seqValLookup = aaVal;
01564 }
01565 else
01566 {
01567 makeTile = gfDnaTile;
01568 alphabetSize = 4;
01569 zeroChar = 't';
01570 seqValLookup = ntVal;
01571 }
01572
01573 for (i=0; i<=lastStart; ++i)
01574 {
01575 posMul = 1;
01576 for (varPos = tileSize-1; varPos >=0; --varPos)
01577 {
01578
01579 oldChar = poly[i+varPos];
01580 poly[i+varPos] = zeroChar;
01581 tile = makeTile(poly+i, tileSize);
01582 poly[i+varPos] = oldChar;
01583
01584
01585 if (varPos == 0)
01586 avoid = -1;
01587 else
01588 avoid = seqValLookup[(int)oldChar];
01589
01590 if (tile >= 0)
01591 {
01592
01593 for (varVal=0; varVal<alphabetSize; ++varVal)
01594 {
01595 if (varVal != avoid)
01596 {
01597 listSize = gf->listSizes[tile];
01598 if (listSize > 0)
01599 {
01600 qStart = i;
01601 if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01602 {
01603 tList = gf->lists[tile];
01604 for (j=0; j<listSize; ++j)
01605 {
01606 int tStart = tList[j];
01607 if (target == NULL ||
01608 (target == findSource(gf, tStart)
01609 && tStart >= tMin && tStart < tMax) )
01610 {
01611 lmAllocVar(lm,hit);
01612 hit->qStart = qStart;
01613 hit->tStart = tStart;
01614 hit->diagonal = tStart + size - qStart;
01615 slAddHead(&hitList, hit);
01616 ++hitCount;
01617 }
01618 }
01619 }
01620 }
01621 }
01622 tile += posMul;
01623 }
01624 }
01625 posMul *= alphabetSize;
01626 }
01627 }
01628 *retHitCount = hitCount;
01629 return hitList;
01630 }
01631
01632 static struct gfHit *gfSegmentedFindHits(struct genoFind *gf, aaSeq *seq,
01633 Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01634 struct gfSeqSource *target, int tMin, int tMax)
01635
01636
01637 {
01638 struct gfHit *hitList = NULL, *hit;
01639 int size = seq->size;
01640 int tileSize = gf->tileSize;
01641 int tileTailSize = gf->segSize;
01642 int tileHeadSize = gf->tileSize - tileTailSize;
01643 int lastStart = size - tileSize;
01644 char *poly = seq->dna;
01645 int i, j;
01646 int tileHead, tileTail;
01647 int listSize;
01648 bits32 qStart;
01649 bits16 *endList;
01650 int hitCount = 0;
01651 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01652
01653
01654 initNtLookup();
01655 for (i=0; i<=lastStart; ++i)
01656 {
01657 if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01658 {
01659 tileHead = makeTile(poly+i, tileHeadSize);
01660 if (tileHead < 0)
01661 continue;
01662 tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01663 if (tileTail < 0)
01664 continue;
01665 listSize = gf->listSizes[tileHead];
01666 qStart = i;
01667 endList = gf->endLists[tileHead];
01668 for (j=0; j<listSize; ++j)
01669 {
01670 if (endList[0] == tileTail)
01671 {
01672 int tStart = (endList[1]<<16) + endList[2];
01673 if (target == NULL ||
01674 (target == findSource(gf, tStart)
01675 && tStart >= tMin && tStart < tMax) )
01676 {
01677 lmAllocVar(lm,hit);
01678 hit->qStart = qStart;
01679 hit->tStart = tStart;
01680 hit->diagonal = tStart + size - qStart;
01681 slAddHead(&hitList, hit);
01682 ++hitCount;
01683 }
01684 }
01685 endList += 3;
01686 }
01687 }
01688 }
01689 *retHitCount = hitCount;
01690 return hitList;
01691 }
01692
01693 static struct gfHit *gfSegmentedFindNearHits(struct genoFind *gf,
01694 aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01695 struct gfSeqSource *target, int tMin, int tMax)
01696
01697
01698 {
01699 struct gfHit *hitList = NULL, *hit;
01700 int size = seq->size;
01701 int tileSize = gf->tileSize;
01702 int tileTailSize = gf->segSize;
01703 int tileHeadSize = gf->tileSize - tileTailSize;
01704 int lastStart = size - tileSize;
01705 char *poly = seq->dna;
01706 int i, j;
01707 int tileHead, tileTail;
01708 int listSize;
01709 bits32 qStart;
01710 bits16 *endList;
01711 int hitCount = 0;
01712 int varPos, varVal;
01713 int (*makeTile)(char *poly, int n);
01714 int alphabetSize;
01715 char oldChar, zeroChar;
01716 int headPosMul, tailPosMul, avoid;
01717 boolean modTail;
01718 int *seqValLookup;
01719
01720
01721 initNtLookup();
01722 if (gf->isPep)
01723 {
01724 makeTile = gfPepTile;
01725 alphabetSize = 20;
01726 zeroChar = 'A';
01727 seqValLookup = aaVal;
01728 }
01729 else
01730 {
01731 makeTile = gfDnaTile;
01732 alphabetSize = 4;
01733 zeroChar = 't';
01734 seqValLookup = ntVal;
01735 }
01736
01737 for (i=0; i<=lastStart; ++i)
01738 {
01739 if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01740 {
01741 headPosMul = tailPosMul = 1;
01742 for (varPos = tileSize-1; varPos >= 0; --varPos)
01743 {
01744
01745 modTail = (varPos >= tileHeadSize);
01746 oldChar = poly[i+varPos];
01747 poly[i+varPos] = zeroChar;
01748 tileHead = makeTile(poly+i, tileHeadSize);
01749 tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01750 poly[i+varPos] = oldChar;
01751
01752
01753 if (varPos == 0)
01754 avoid = -1;
01755 else
01756 avoid = seqValLookup[(int)oldChar];
01757
01758 if (tileHead >= 0 && tileTail >= 0)
01759 {
01760 for (varVal=0; varVal<alphabetSize; ++varVal)
01761 {
01762 if (varVal != avoid)
01763 {
01764 listSize = gf->listSizes[tileHead];
01765 qStart = i;
01766 endList = gf->endLists[tileHead];
01767 for (j=0; j<listSize; ++j)
01768 {
01769 if (endList[0] == tileTail)
01770 {
01771 int tStart = (endList[1]<<16) + endList[2];
01772 if (target == NULL ||
01773 (target == findSource(gf, tStart)
01774 && tStart >= tMin && tStart < tMax) )
01775 {
01776 lmAllocVar(lm,hit);
01777 hit->qStart = qStart;
01778 hit->tStart = tStart;
01779 hit->diagonal = tStart + size - qStart;
01780 slAddHead(&hitList, hit);
01781 ++hitCount;
01782 }
01783 }
01784 endList += 3;
01785 }
01786 }
01787 if (modTail)
01788 tileTail += tailPosMul;
01789 else
01790 tileHead += headPosMul;
01791 }
01792 }
01793 if (modTail)
01794 tailPosMul *= alphabetSize;
01795 else
01796 headPosMul *= alphabetSize;
01797 }
01798 }
01799 }
01800 *retHitCount = hitCount;
01801 return hitList;
01802 }
01803
01804
01805 static struct gfHit *gfFindHitsWithQmask(struct genoFind *gf, bioSeq *seq,
01806 Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01807 struct gfSeqSource *target, int tMin, int tMax)
01808
01809
01810 {
01811 struct gfHit *hitList = NULL;
01812 if (gf->segSize == 0 && !gf->isPep && !gf->allowOneMismatch)
01813 {
01814 hitList = gfFastFindDnaHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01815 target, tMin, tMax);
01816 }
01817 else
01818 {
01819 if (gf->segSize == 0)
01820 {
01821 if (gf->allowOneMismatch)
01822 {
01823 hitList = gfStraightFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm,
01824 retHitCount, target, tMin, tMax);
01825 }
01826 else
01827 {
01828 hitList = gfStraightFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01829 target, tMin, tMax);
01830 }
01831 }
01832 else
01833 {
01834 if (gf->allowOneMismatch)
01835 {
01836 hitList = gfSegmentedFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01837 target, tMin, tMax);
01838 }
01839 else
01840 {
01841 hitList = gfSegmentedFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01842 target, tMin, tMax);
01843 }
01844 }
01845 }
01846 return hitList;
01847 }
01848
01849 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq,
01850 Bits *qMaskBits, int qMaskOffset,
01851 struct lm *lm, int *retHitCount)
01852
01853 {
01854 struct gfClump *clumpList = NULL;
01855 struct gfHit *hitList;
01856 int minMatch = gf->minMatch;
01857
01858 #ifdef OLD
01859 if (seq->size < gf->tileSize * (gf->minMatch+1))
01860 minMatch = 1;
01861 #endif
01862
01863 hitList = gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01864 retHitCount, NULL, 0, 0);
01865 cmpQuerySize = seq->size;
01866 clumpList = clumpHits(gf, hitList, minMatch);
01867 return clumpList;
01868 }
01869
01870 struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq,
01871 Bits *qMaskBits, int qMaskOffset, struct lm *lm,
01872 struct gfSeqSource *target, int tMin, int tMax)
01873
01874
01875
01876
01877 {
01878 int targetStart;
01879 struct gfHit *hitList, *hit;
01880 int hitCount;
01881
01882 targetStart = target->start;
01883 hitList = gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01884 &hitCount, target, tMin + targetStart, tMax + targetStart);
01885 for (hit = hitList; hit != NULL; hit = hit->next)
01886 hit->tStart -= targetStart;
01887 return hitList;
01888 }
01889
01890 struct gfClump *gfFindClumps(struct genoFind *gf, bioSeq *seq, struct lm *lm, int *retHitCount)
01891
01892 {
01893 return gfFindClumpsWithQmask(gf, seq, NULL, 0, lm, retHitCount);
01894 }
01895
01896
01897 void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount)
01898
01899 {
01900 int frame;
01901 int oneHit;
01902 int hitCount = 0;
01903 for (frame = 0; frame < 3; ++frame)
01904 {
01905 clumps[frame] = gfFindClumps(gfs[frame], seq, lm, &oneHit);
01906 hitCount += oneHit;
01907 }
01908 *retHitCount = hitCount;
01909 }
01910
01911 void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3],
01912 struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount)
01913
01914
01915 {
01916 int qFrame;
01917 int oneHit;
01918 int hitCount = 0;
01919
01920 for (qFrame = 0; qFrame<3; ++qFrame)
01921 {
01922 gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit);
01923 hitCount += oneHit;
01924 }
01925 *retHitCount = hitCount;
01926 }
01927
01928 void gfMakeOoc(char *outName, char *files[], int fileCount,
01929 int tileSize, bits32 maxPat, enum gfType tType)
01930
01931 {
01932 boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX);
01933 struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize,
01934 maxPat, NULL, dbIsPep, FALSE);
01935 bits32 *sizes = gf->listSizes;
01936 int tileSpaceSize = gf->tileSpaceSize;
01937 bioSeq *seq, *seqList;
01938 bits32 sig = oocSig, psz = tileSize;
01939 bits32 i;
01940 int oocCount = 0;
01941 char *inName;
01942 FILE *f = mustOpen(outName, "w");
01943
01944 if (gf->segSize > 0)
01945 errAbort("Don't yet know how to make ooc files for large tile sizes.");
01946 for (i=0; i<fileCount; ++i)
01947 {
01948 inName = files[i];
01949 printf("Loading %s\n", inName);
01950 if (nibIsFile(inName))
01951 {
01952 seqList = nibLoadAll(inName);
01953 }
01954 else if (twoBitIsFile(inName))
01955 {
01956 seqList = twoBitLoadAll(inName);
01957 for (seq = seqList; seq != NULL; seq = seq->next)
01958 toLowerN(seq->dna, seq->size);
01959 }
01960 else
01961 {
01962 seqList = faReadAllSeq(inName, tType != gftProt);
01963 }
01964 printf("Counting %s\n", inName);
01965 for (seq = seqList; seq != NULL; seq = seq->next)
01966 {
01967 int isRc;
01968 for (isRc = 0; isRc <= 1; ++isRc)
01969 {
01970 if (tType == gftDnaX || tType == gftRnaX)
01971 {
01972 struct trans3 *t3 = trans3New(seq);
01973 int frame;
01974 for (frame=0; frame<3; ++frame)
01975 {
01976 gfCountSeq(gf, t3->trans[frame]);
01977 }
01978 trans3Free(&t3);
01979 }
01980 else
01981 {
01982 gfCountSeq(gf, seq);
01983 }
01984 if (tType == gftProt || tType == gftRnaX)
01985 break;
01986 else
01987 {
01988 reverseComplement(seq->dna, seq->size);
01989 }
01990 }
01991 }
01992 freeDnaSeqList(&seqList);
01993 }
01994 printf("Writing %s\n", outName);
01995 writeOne(f, sig);
01996 writeOne(f, psz);
01997 for (i=0; i<tileSpaceSize; ++i)
01998 {
01999 if (sizes[i] >= maxPat)
02000 {
02001 writeOne(f, i);
02002 ++oocCount;
02003 }
02004 }
02005 carefulClose(&f);
02006 genoFindFree(&gf);
02007 printf("Wrote %d overused %d-mers to %s\n", oocCount, tileSize, outName);
02008 }
02009
02010 struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name)
02011
02012 {
02013 struct gfSeqSource *source = gf->sources;
02014 int count = gf->sourceCount;
02015
02016 if (source->seq == NULL)
02017 {
02018 char rootName[256];
02019 while (--count >= 0)
02020 {
02021 splitPath(source->fileName, NULL, rootName, NULL);
02022 if (sameString(name, rootName))
02023 return source;
02024 }
02025 }
02026 else
02027 {
02028 while (--count >= 0)
02029 {
02030 if (sameString(source->seq->name, name))
02031 return source;
02032 source += 1;
02033 }
02034 }
02035 return NULL;
02036 }
02037
02038 static void mergeAdd(struct binKeeper *bk, int start, int end, struct gfSeqSource *src)
02039
02040
02041 {
02042 struct binElement *iEl, *iList = binKeeperFind(bk, start, end);
02043 for (iEl = iList; iEl != NULL; iEl = iEl->next)
02044 {
02045 if (iEl->start < start)
02046 start = iEl->start;
02047 if (iEl->end > end)
02048 end = iEl->end;
02049 binKeeperRemove(bk, iEl->start, iEl->end, src);
02050 }
02051 slFreeList(&iList);
02052 binKeeperAdd(bk, start, end, src);
02053 }
02054
02055 static struct gfClump *pcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize,
02056 char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
02057
02058 {
02059 struct gfClump *clumpList = NULL;
02060 int tileSize = gf->tileSize;
02061 int fTile;
02062 int fTileCount = fPrimerSize - tileSize;
02063 int *rTiles, rTile;
02064 int rTileCount = rPrimerSize - tileSize;
02065 int fTileIx,rTileIx,fPosIx,rPosIx;
02066 bits32 *fPosList, fPos, *rPosList, rPos;
02067 int fPosListSize, rPosListSize;
02068 struct hash *targetHash = newHash(0);
02069
02070
02071 AllocArray(rTiles, rTileCount);
02072 for (rTileIx = 0; rTileIx<rTileCount; ++rTileIx)
02073 {
02074 rTiles[rTileIx] = gfDnaTile(rPrimer + rTileIx, tileSize);
02075 if (rTiles[rTileIx] == -1)
02076 errAbort("Bad char in reverse primer sequence: %s", rPrimer);
02077 }
02078
02079
02080 for (fTileIx=0; fTileIx<fTileCount; ++fTileIx)
02081 {
02082 fTile = gfDnaTile(fPrimer + fTileIx, tileSize);
02083 if (fTile >= 0)
02084 {
02085 fPosListSize = gf->listSizes[fTile];
02086 fPosList = gf->lists[fTile];
02087 for (fPosIx=0; fPosIx < fPosListSize; ++fPosIx)
02088 {
02089 fPos = fPosList[fPosIx];
02090
02091 for (rTileIx=0; rTileIx < rTileCount; ++rTileIx)
02092 {
02093 rTile = rTiles[rTileIx];
02094 rPosListSize = gf->listSizes[rTile];
02095 rPosList = gf->lists[rTile];
02096 for (rPosIx=0; rPosIx < rPosListSize; ++rPosIx)
02097 {
02098 rPos = rPosList[rPosIx];
02099 if (rPos > fPos)
02100 {
02101 int distance = rPos - fPos;
02102 if (distance >= minDistance && distance <= maxDistance)
02103 {
02104 struct gfSeqSource *target = findSource(gf, fPos);
02105 if (rPos < target->end)
02106 {
02107 struct binKeeper *bk;
02108 int tStart = target->start;
02109 char *tName = target->fileName;
02110 if (tName == NULL)
02111 tName = target->seq->name;
02112 if ((bk = hashFindVal(targetHash, tName)) == NULL)
02113 {
02114 bk = binKeeperNew(0, target->end - tStart);
02115 hashAdd(targetHash, tName, bk);
02116 }
02117 mergeAdd(bk, fPos - tStart, rPos - tStart, target);
02118 }
02119 }
02120 }
02121 }
02122 }
02123 }
02124 }
02125 }
02126
02127
02128 {
02129 struct hashEl *tList, *tEl;
02130 tList = hashElListHash(targetHash);
02131 for (tEl = tList; tEl != NULL; tEl = tEl->next)
02132 {
02133 struct binKeeper *bk = tEl->val;
02134 struct binElement *bkList, *bkEl;
02135 bkList = binKeeperFindAll(bk);
02136 for (bkEl = bkList; bkEl != NULL; bkEl = bkEl->next)
02137 {
02138 struct gfClump *clump;
02139 AllocVar(clump);
02140 clump->target = bkEl->val;
02141 clump->tStart = bkEl->start;
02142 clump->tEnd = bkEl->end;
02143 slAddHead(&clumpList, clump);
02144 }
02145 slFreeList(&bkList);
02146 binKeeperFree(&bk);
02147 }
02148 hashElFreeList(&tList);
02149 hashFree(&targetHash);
02150 }
02151 freez(&rTiles);
02152 return clumpList;
02153 }
02154
02155 struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize,
02156 char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
02157
02158 {
02159 struct gfClump *clumpList;
02160 if (gf->segSize > 0)
02161 errAbort("Can't do PCR on large tile sizes");
02162 if (gf->isPep)
02163 errAbort("Can't do PCR on protein/translated index");
02164 tolowers(fPrimer);
02165 tolowers(rPrimer);
02166 reverseComplement(rPrimer, rPrimerSize);
02167 clumpList = pcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize,
02168 minDistance, maxDistance);
02169 reverseComplement(rPrimer, rPrimerSize);
02170 return clumpList;
02171 }
02172