jkOwnLib/genoFind.c

Go to the documentation of this file.
00001 /* genoFind - Quickly find where DNA occurs in genome.. */
00002 /* Copyright 2001-2005 Jim Kent.  All rights reserved. */
00003 
00004 #include "common.h"
00005 #include <signal.h>
00006 #include "obscure.h"
00007 #include "dnautil.h"
00008 #include "dnaseq.h"
00009 #include "nib.h"
00010 #include "twoBit.h"
00011 #include "fa.h"
00012 #include "dystring.h"
00013 #include "errabort.h"
00014 #include "sig.h"
00015 #include "ooc.h"
00016 #include "genoFind.h"
00017 #include "trans3.h"
00018 #include "binRange.h"
00019 
00020 static char const rcsid[] = "$Id: genoFind.c,v 1.23 2007/02/04 05:09:02 kent Exp $";
00021 
00022 char *gfSignature()
00023 /* Return signature that starts each command to gfServer. Helps defend 
00024  * server from confused clients. */
00025 {
00026 static char signature[] = "0ddf270562684f29";
00027 return signature;
00028 }
00029 
00030 volatile boolean pipeBroke = FALSE;     /* Flag broken pipes here. */
00031 
00032 static void gfPipeHandler(int sigNum)
00033 /* Set broken pipe flag. */
00034 {
00035 pipeBroke = TRUE;
00036 }
00037 
00038 void gfCatchPipes()
00039 /* Set up to catch broken pipe signals. */
00040 {
00041 signal(SIGPIPE, gfPipeHandler);
00042 }
00043 
00044 int gfReadMulti(int sd, void *vBuf, size_t size)
00045 /* Read in until all is read or there is an error. */
00046 {
00047 char *buf = vBuf;
00048 size_t totalRead = 0;
00049 int oneRead;
00050 
00051 while (totalRead < size)
00052     {
00053     oneRead = read(sd, buf + totalRead, size - totalRead);
00054     if (oneRead < 0)
00055         {
00056         perror("Couldn't finish large read");
00057         return oneRead;
00058         }
00059     else if (oneRead == 0)
00060     /* Avoid an infinite loop when the client closed the socket. */
00061         break;
00062     totalRead += oneRead;
00063     }
00064 return totalRead;
00065 }
00066 
00067 void genoFindFree(struct genoFind **pGenoFind)
00068 /* Free up a genoFind index. */
00069 {
00070 struct genoFind *gf = *pGenoFind;
00071 int i;
00072 struct gfSeqSource *sources;
00073 if (gf != NULL)
00074     {
00075     freeMem(gf->lists);
00076     freeMem(gf->listSizes);
00077     freeMem(gf->allocated);
00078     if ((sources = gf->sources) != NULL)
00079         {
00080         for (i=0; i<gf->sourceCount; ++i)
00081             bitFree(&sources[i].maskedBits);
00082         freeMem(sources);
00083         }
00084     freez(pGenoFind);
00085     }
00086 }
00087 
00088 int gfPowerOf20(int n)
00089 /* Return a 20 to the n */
00090 {
00091 int res = 20;
00092 while (--n > 0)
00093     res *= 20;
00094 return res;
00095 }
00096 
00097 void gfCheckTileSize(int tileSize, boolean isPep)
00098 /* Check that tile size is legal.  Abort if not. */
00099 {
00100 if (isPep)
00101     {
00102     if (tileSize < 3 || tileSize > 8)
00103         errAbort("protein tileSize must be between 3 and 8");
00104     }
00105 else
00106     {
00107     if (tileSize < 6 || tileSize > 18)
00108         errAbort("DNA tileSize must be between 6 and 18");
00109     }
00110 }
00111 
00112 static struct genoFind *gfNewEmpty(int minMatch, int maxGap, 
00113         int tileSize, int stepSize, int maxPat,
00114         char *oocFile, boolean isPep, boolean allowOneMismatch)
00115 /* Return an empty pattern space. oocFile parameter may be NULL*/
00116 {
00117 struct genoFind *gf;
00118 int tileSpaceSize;
00119 int segSize = 0;
00120 
00121 gfCheckTileSize(tileSize, isPep);
00122 if (stepSize == 0)
00123     stepSize = tileSize;
00124 AllocVar(gf);
00125 if (isPep)
00126     {
00127     if (tileSize > 5)
00128         {
00129         segSize = tileSize - 5;
00130         }
00131     tileSpaceSize = gf->tileSpaceSize = gfPowerOf20(tileSize-segSize);
00132     }
00133 else
00134     {
00135     int seedBitSize;
00136     if (tileSize > 12)
00137         {
00138         segSize = tileSize - 12;
00139         }
00140     seedBitSize = (tileSize-segSize)*2;
00141     tileSpaceSize = gf->tileSpaceSize = (1<<seedBitSize);
00142     gf->tileMask = tileSpaceSize-1;
00143     }
00144 gf->segSize = segSize;
00145 gf->tileSize = tileSize;
00146 gf->stepSize = stepSize;
00147 gf->isPep = isPep;
00148 gf->allowOneMismatch = allowOneMismatch;
00149 if (segSize > 0)
00150     {
00151     gf->endLists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->endLists[0]));
00152     maxPat = BIGNUM;    /* Don't filter out overused on the big ones.  It is
00153                          * unnecessary and would be quite complex. */
00154     }
00155 else
00156     {
00157     gf->lists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->lists[0]));
00158     }
00159 gf->listSizes = needHugeZeroedMem(tileSpaceSize * sizeof(gf->listSizes[0]));
00160 gf->minMatch = minMatch;
00161 gf->maxGap = maxGap;
00162 gf->maxPat = maxPat;
00163 if (oocFile != NULL)
00164     {
00165     if (segSize > 0)
00166         errAbort("Don't yet support ooc on large tile sizes");
00167     oocMaskCounts(oocFile, gf->listSizes, tileSize, maxPat);
00168     oocMaskSimpleRepeats(gf->listSizes, tileSize, maxPat);
00169     }
00170 return gf;
00171 }
00172 
00173 static int ntLookup[256];
00174 
00175 static void initNtLookup()
00176 {
00177 static boolean initted = FALSE;
00178 if (!initted)
00179     {
00180     int i;
00181     for (i=0; i<ArraySize(ntLookup); ++i)
00182         ntLookup[i] = -1;
00183     ntLookup['a'] = A_BASE_VAL;
00184     ntLookup['c'] = C_BASE_VAL;
00185     ntLookup['t'] = T_BASE_VAL;
00186     ntLookup['g'] = G_BASE_VAL;
00187     initted = TRUE;
00188     }
00189 }
00190 
00191 int gfDnaTile(DNA *dna, int n)
00192 /* Make a packed DNA n-mer. */
00193 {
00194 int tile = 0;
00195 int i, c;
00196 for (i=0; i<n; ++i)
00197     {
00198     tile <<= 2;
00199     if ((c = ntLookup[(int)dna[i]]) < 0)
00200         return -1;
00201     tile += c;
00202     }
00203 return tile;
00204 }
00205 
00206 int gfPepTile(AA *pep, int n)
00207 /* Make up packed representation of translated protein. */
00208 {
00209 int tile = 0;
00210 int aa;
00211 while (--n >= 0)
00212     {
00213     tile *= 20;
00214     aa = aaVal[(int)(*pep++)];
00215     if (aa < 0)
00216         return -1;
00217     tile += aa;
00218     }
00219 return tile;
00220 }
00221 
00222 static void gfCountSeq(struct genoFind *gf, bioSeq *seq)
00223 /* Add all N-mers in seq. */
00224 {
00225 char *poly = seq->dna;
00226 int tileSize = gf->tileSize;
00227 int stepSize = gf->stepSize;
00228 int tileHeadSize = gf->tileSize - gf->segSize;
00229 int maxPat = gf->maxPat;
00230 int tile;
00231 bits32 *listSizes = gf->listSizes;
00232 int i, lastTile = seq->size - tileSize;
00233 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00234 
00235 initNtLookup();
00236 for (i=0; i<=lastTile; i += stepSize)
00237     {
00238     if ((tile = makeTile(poly, tileHeadSize)) >= 0)
00239         {
00240         if (listSizes[tile] < maxPat)
00241             {
00242             listSizes[tile] += 1;
00243             }
00244         }
00245     poly += stepSize;
00246     }
00247 }
00248 
00249 static int gfCountTilesInNib(struct genoFind *gf, int stepSize, char *fileName)
00250 /* Count all tiles in nib file.  Returns nib size. */
00251 {
00252 FILE *f;
00253 int tileSize = gf->tileSize;
00254 int bufSize = tileSize * 16*1024;
00255 int nibSize, i;
00256 int endBuf, basesInBuf;
00257 struct dnaSeq *seq;
00258 
00259 printf("Counting tiles in %s\n", fileName);
00260 nibOpenVerify(fileName, &f, &nibSize);
00261 for (i=0; i < nibSize; i = endBuf)
00262     {
00263     endBuf = i + bufSize;
00264     if (endBuf >= nibSize) endBuf = nibSize;
00265     basesInBuf = endBuf - i;
00266     seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00267     gfCountSeq(gf, seq);
00268     freeDnaSeq(&seq);
00269     }
00270 fclose(f);
00271 return nibSize;
00272 }
00273 
00274 long long maxTotalBases()
00275 /* Return maximum bases we can index. */
00276 {
00277 long long maxBases = 1024*1024;
00278 maxBases *= 4*1024;
00279 return maxBases;
00280 }
00281 
00282 static long long twoBitCheckTotalSize(struct twoBitFile *tbf)
00283 /* Return total size of sequence in two bit file.  Squawk and
00284  * die if it's more than 4 gig. */
00285 {
00286 long long totalSize = twoBitTotalSize(tbf);
00287 if (totalSize > maxTotalBases())
00288     errAbort("Sorry, can only index up to %lld bases, %s has %lld",
00289         maxTotalBases(),  tbf->fileName, totalSize);
00290 return totalSize;
00291 }
00292 
00293 static void gfCountTilesInTwoBit(struct genoFind *gf, int stepSize,
00294         char *fileName, int *retSeqCount, long long *retBaseCount)
00295 /* Count all tiles in 2bit file.  Returns number of sequences and
00296  * total size of sequences in file. */
00297 {
00298 struct dnaSeq *seq;
00299 struct twoBitFile *tbf = twoBitOpen(fileName);
00300 struct twoBitIndex *index;
00301 int seqCount = 0;
00302 long long baseCount = twoBitCheckTotalSize(tbf);
00303 
00304 printf("Counting tiles in %s\n", fileName);
00305 for (index = tbf->indexList; index != NULL; index = index->next)
00306     {
00307     seq = twoBitReadSeqFragLower(tbf, index->name, 0, 0);
00308     gfCountSeq(gf, seq);
00309     ++seqCount;
00310     freeDnaSeq(&seq);
00311     }
00312 twoBitClose(&tbf);
00313 *retSeqCount = seqCount;
00314 *retBaseCount = baseCount;
00315 }
00316 
00317 static int gfAllocLists(struct genoFind *gf)
00318 /* Allocate index lists and set up list pointers. 
00319  * Returns size of all lists. */
00320 {
00321 int oneCount;
00322 int count = 0;
00323 int i;
00324 bits32 *listSizes = gf->listSizes;
00325 bits32 **lists = gf->lists;
00326 bits32 *allocated = NULL;
00327 bits32 maxPat = gf->maxPat;
00328 int size;
00329 int usedCount = 0, overusedCount = 0;
00330 int tileSpaceSize = gf->tileSpaceSize;
00331 
00332 for (i=0; i<tileSpaceSize; ++i)
00333     {
00334     /* If pattern is too much used it's no good to us, ignore. */
00335     if ((oneCount = listSizes[i]) < maxPat)
00336         {
00337         count += oneCount;
00338         usedCount += 1;
00339         }
00340     else
00341         {
00342         overusedCount += 1;
00343         }
00344     }
00345 if (count > 0)
00346     gf->allocated = allocated = needHugeMem(count*sizeof(allocated[0]));
00347 for (i=0; i<tileSpaceSize; ++i)
00348     {
00349     if ((size = listSizes[i]) < maxPat)
00350         {
00351         lists[i] = allocated;
00352         allocated += size;
00353         }
00354     }
00355 return count;
00356 }
00357 
00358 static int gfAllocLargeLists(struct genoFind *gf)
00359 /* Allocate large index lists and set up list pointers. 
00360  * Returns size of all lists. */
00361 {
00362 int count = 0;
00363 int i;
00364 bits32 *listSizes = gf->listSizes;
00365 bits16 **endLists = gf->endLists;
00366 bits16 *allocated = NULL;
00367 int size;
00368 int tileSpaceSize = gf->tileSpaceSize;
00369 
00370 for (i=0; i<tileSpaceSize; ++i)
00371     count += listSizes[i];
00372 if (count > 0)
00373     gf->allocated = allocated = needHugeMem(3*count*sizeof(allocated[0]));
00374 for (i=0; i<tileSpaceSize; ++i)
00375     {
00376     size = listSizes[i];
00377     endLists[i] = allocated;
00378     allocated += 3*size;
00379     }
00380 return count;
00381 }
00382 
00383 
00384 static void gfAddSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
00385 /* Add all N-mers in seq.  Done after gfCountSeq. */
00386 {
00387 char *poly = seq->dna;
00388 int tileSize = gf->tileSize;
00389 int stepSize = gf->stepSize;
00390 int i, lastTile = seq->size - tileSize;
00391 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00392 int maxPat = gf->maxPat;
00393 int tile;
00394 bits32 *listSizes = gf->listSizes;
00395 bits32 **lists = gf->lists;
00396 
00397 initNtLookup();
00398 for (i=0; i<=lastTile; i += stepSize)
00399     {
00400     tile = makeTile(poly, tileSize);
00401     if (tile >= 0)
00402         {
00403         if (listSizes[tile] < maxPat)
00404             {
00405             lists[tile][listSizes[tile]++] = offset;
00406             }
00407         }
00408     offset += stepSize;
00409     poly += stepSize;
00410     }
00411 }
00412 
00413 static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
00414 /* Add all N-mers to segmented index.  Done after gfCountSeq. */
00415 {
00416 char *poly = seq->dna;
00417 int tileSize = gf->tileSize;
00418 int stepSize = gf->stepSize;
00419 int tileTailSize = gf->segSize;
00420 int tileHeadSize = tileSize - tileTailSize;
00421 int i, lastTile = seq->size - tileSize;
00422 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00423 int tileHead;
00424 int tileTail;
00425 bits32 *listSizes = gf->listSizes;
00426 bits16 **endLists = gf->endLists;
00427 bits16 *endList;
00428 int headCount;
00429 
00430 initNtLookup();
00431 for (i=0; i<=lastTile; i += stepSize)
00432     {
00433     tileHead = makeTile(poly, tileHeadSize);
00434     tileTail = makeTile(poly + tileHeadSize, tileTailSize);
00435     if (tileHead >= 0 && tileTail >= 0)
00436         {
00437         endList = endLists[tileHead];
00438         headCount = listSizes[tileHead]++;
00439         endList += headCount * 3;               /* Because have three slots per. */
00440         endList[0] = tileTail;
00441         endList[1] = (offset >> 16);
00442         endList[2] = (offset&0xffff);
00443         }
00444     offset += stepSize;
00445     poly += stepSize;
00446     }
00447 }
00448 
00449 
00450 
00451 static int gfAddTilesInNib(struct genoFind *gf, char *fileName, bits32 offset,
00452         int stepSize)
00453 /* Add all tiles in nib file.  Returns size of nib file. */
00454 {
00455 FILE *f;
00456 int tileSize = gf->tileSize;
00457 int bufSize = tileSize * 16*1024;
00458 int nibSize, i;
00459 int endBuf, basesInBuf;
00460 struct dnaSeq *seq;
00461 
00462 printf("Adding tiles in %s\n", fileName);
00463 nibOpenVerify(fileName, &f, &nibSize);
00464 for (i=0; i < nibSize; i = endBuf)
00465     {
00466     endBuf = i + bufSize;
00467     if (endBuf >= nibSize) endBuf = nibSize;
00468     basesInBuf = endBuf - i;
00469     seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00470     gfAddSeq(gf, seq, i + offset);
00471     freeDnaSeq(&seq);
00472     }
00473 fclose(f);
00474 return nibSize;
00475 }
00476 
00477 
00478 static void gfZeroOverused(struct genoFind *gf)
00479 /* Zero out counts of overused tiles. */
00480 {
00481 bits32 *sizes = gf->listSizes;
00482 int tileSpaceSize = gf->tileSpaceSize, i;
00483 int maxPat = gf->maxPat;
00484 int overCount = 0;
00485 
00486 for (i=0; i<tileSpaceSize; ++i)
00487     {
00488     if (sizes[i] >= maxPat)
00489         {
00490         sizes[i] = 0;
00491         ++overCount;
00492         }
00493     }
00494 }
00495 
00496 static void gfZeroNonOverused(struct genoFind *gf)
00497 /* Zero out counts of non-overused tiles. */
00498 {
00499 bits32 *sizes = gf->listSizes;
00500 int tileSpaceSize = gf->tileSpaceSize, i;
00501 int maxPat = gf->maxPat;
00502 int overCount = 0;
00503 
00504 for (i=0; i<tileSpaceSize; ++i)
00505     {
00506     if (sizes[i] < maxPat)
00507         {
00508         sizes[i] = 0;
00509         ++overCount;
00510         }
00511     }
00512 }
00513 
00514 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
00515         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00516         boolean allowOneMismatch, int stepSize)
00517 /* Make index for all nibs and .2bits in list. 
00518  *      minMatch - minimum number of matching tiles to trigger alignments
00519  *      maxGap   - maximum deviation from diagonal of tiles
00520  *      tileSize - size of tile in nucleotides
00521  *      maxPat   - maximum use of tile to not be considered a repeat
00522  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
00523  *      allowOneMismatch - allow one mismatch in a tile.  
00524  *      stepSize - space between tiles.  Zero means default (which is tileSize). */
00525 {
00526 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize,
00527         maxPat, oocFile, FALSE, allowOneMismatch);
00528 int i;
00529 bits32 offset = 0, nibSize;
00530 char *fileName;
00531 struct gfSeqSource *ss;
00532 long long totalBases = 0, warnAt = maxTotalBases();
00533 int totalSeq = 0;
00534 
00535 if (allowOneMismatch)
00536     errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits");
00537 if (stepSize == 0)
00538     stepSize = gf->tileSize;
00539 for (i=0; i<fileCount; ++i)
00540     {
00541     fileName = fileNames[i];
00542     if (twoBitIsFile(fileName))
00543         {
00544         int seqCount;
00545         long long baseCount;
00546         gfCountTilesInTwoBit(gf, stepSize, fileName, &seqCount, &baseCount);
00547         totalBases += baseCount;
00548         totalSeq += seqCount;
00549         }
00550     else if (nibIsFile(fileName))
00551         {
00552         totalBases += gfCountTilesInNib(gf, stepSize, fileName);
00553         totalSeq += 1;
00554         }
00555     else
00556         errAbort("Unrecognized file type %s", fileName);
00557     /* Warn if they exceed 4 gig. */
00558     if (totalBases >= warnAt)
00559         errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00560     }
00561 gfAllocLists(gf);
00562 gfZeroNonOverused(gf);
00563 AllocArray(gf->sources, totalSeq);
00564 gf->sourceCount = totalSeq;
00565 ss = gf->sources;
00566 for (i=0; i<fileCount; ++i)
00567     {
00568     fileName = fileNames[i];
00569     if (nibIsFile(fileName))
00570         {
00571         nibSize = gfAddTilesInNib(gf, fileName, offset, stepSize);
00572         ss->fileName = fileName;
00573         ss->start = offset;
00574         offset += nibSize;
00575         ss->end = offset;
00576         ++ss;
00577         }
00578     else
00579         {
00580         struct twoBitFile *tbf = twoBitOpen(fileName);
00581         struct twoBitIndex *index;
00582         char nameBuf[PATH_LEN+256];
00583         for (index = tbf->indexList; index != NULL; index = index->next)
00584             {
00585             struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, index->name, 0,0);
00586             gfAddSeq(gf, seq, offset);
00587             safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00588             ss->fileName = cloneString(nameBuf);
00589             ss->start = offset;
00590             offset += seq->size;
00591             ss->end = offset;
00592             ++ss;
00593             dnaSeqFree(&seq);
00594             }
00595         twoBitClose(&tbf);
00596         }
00597     }
00598 gf->totalSeqSize = offset;
00599 gfZeroOverused(gf);
00600 printf("Done adding\n");
00601 return gf;
00602 }
00603 
00604 static void maskSimplePepRepeat(struct genoFind *gf)
00605 /* Remove tiles from index that represent repeats
00606  * of period one and two. */
00607 {
00608 int i;
00609 int tileSize = gf->tileSize;
00610 int maxPat = gf->maxPat;
00611 bits32 *listSizes = gf->listSizes;
00612 
00613 for (i=0; i<20; ++i)
00614     {
00615     int j, k;
00616     for (j=0; j<20; ++j)
00617         {
00618         int tile = 0;
00619         for (k=0; k<tileSize; ++k)
00620             {
00621             tile *= 20;
00622             if (k&1)
00623                 tile += j;
00624             else
00625                 tile += i;
00626             }
00627         listSizes[tile] = maxPat;
00628         }
00629     }
00630 }
00631 
00632 struct dnaSeq *readMaskedNib(char *fileName, boolean mask)
00633 /* Read nib, optionally turning masked bits to N's. 
00634  * Result is lower case where not masked. */
00635 {
00636 struct dnaSeq *seq;
00637 if (mask)
00638     {
00639     seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00640     toggleCase(seq->dna, seq->size);
00641     upperToN(seq->dna, seq->size);
00642     }
00643 else
00644     seq = nibLoadAll(fileName);
00645 return seq;
00646 }
00647 
00648 struct dnaSeq *readMaskedTwoBit(struct twoBitFile *tbf, char *seqName,
00649         boolean mask)
00650 /* Read seq from twoBitFile, optionally turning masked bits to N's. 
00651  * Result is lower case where not masked. */
00652 {
00653 struct dnaSeq *seq;
00654 if (mask)
00655     {
00656     seq = twoBitReadSeqFrag(tbf, seqName, 0, 0);
00657     toggleCase(seq->dna, seq->size);
00658     upperToN(seq->dna, seq->size);
00659     }
00660 else
00661     seq = twoBitReadSeqFragLower(tbf, seqName, 0, 0);
00662 return seq;
00663 }
00664 
00665 
00666 static void transCountBothStrands(struct dnaSeq *seq, 
00667     struct genoFind *transGf[2][3])
00668 /* Count (unmasked) tiles in both strand of sequence. 
00669  * As a side effect this will reverse-complement seq. */
00670 {
00671 int isRc, frame;
00672 struct trans3 *t3;
00673 for (isRc=0; isRc <= 1; ++isRc)
00674     {
00675     if (isRc)
00676         reverseComplement(seq->dna, seq->size);
00677     t3 = trans3New(seq);
00678     for (frame = 0; frame < 3; ++frame)
00679         {
00680         struct genoFind *gf = transGf[isRc][frame];
00681         gfCountSeq(gf, t3->trans[frame]);
00682         }
00683     trans3Free(&t3);
00684     }
00685 }
00686 
00687 static void transIndexBothStrands(struct dnaSeq *seq, 
00688     struct genoFind *transGf[2][3], bits32 offset[2][3],
00689     int sourceIx, char *fileName)
00690 /* Add unmasked tiles on both strands of sequence to
00691  * index.  As a side effect this will reverse-complement seq. */
00692 {
00693 int isRc, frame;
00694 struct trans3 *t3;
00695 struct gfSeqSource *ss;
00696 for (isRc=0; isRc <= 1; ++isRc)
00697     {
00698     if (isRc)
00699         {
00700         reverseComplement(seq->dna, seq->size);
00701         }
00702     t3 = trans3New(seq);
00703     for (frame = 0; frame < 3; ++frame)
00704         {
00705         struct genoFind *gf = transGf[isRc][frame];
00706         ss = gf->sources + sourceIx;
00707         gfAddSeq(gf, t3->trans[frame], offset[isRc][frame]);
00708         ss->fileName = cloneString(fileName);
00709         ss->start = offset[isRc][frame];
00710         offset[isRc][frame] += t3->trans[frame]->size;
00711         ss->end = offset[isRc][frame];
00712         }
00713     trans3Free(&t3);
00714     }
00715 }
00716 
00717 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 
00718     int fileCount, char *fileNames[], 
00719     int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
00720     boolean allowOneMismatch, boolean doMask, int stepSize)
00721 /* Make translated (6 frame) index for all .nib and .2bit files. */
00722 {
00723 struct genoFind *gf;
00724 int i,isRc, frame;
00725 bits32 offset[2][3];
00726 char *fileName;
00727 struct dnaSeq *seq;
00728 int sourceCount = 0;
00729 long long totalBases = 0, warnAt = maxTotalBases();
00730 
00731 if (allowOneMismatch)
00732     errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits");
00733 /* Allocate indices for all reading frames. */
00734 for (isRc=0; isRc <= 1; ++isRc)
00735     {
00736     for (frame = 0; frame < 3; ++frame)
00737         {
00738         transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, 
00739                 tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch);
00740         }
00741     }
00742 
00743 /* Mask simple AA repeats (of period 1 and 2). */
00744 for (isRc = 0; isRc <= 1; ++isRc)
00745     for (frame = 0; frame < 3; ++frame)
00746         maskSimplePepRepeat(transGf[isRc][frame]);
00747 
00748 /* Scan through .nib and .2bit files once counting tiles. */
00749 for (i=0; i<fileCount; ++i)
00750     {
00751     fileName = fileNames[i];
00752     printf("Counting %s\n", fileName);
00753     if (nibIsFile(fileName))
00754         {
00755         seq = readMaskedNib(fileName, doMask);
00756         transCountBothStrands(seq, transGf);
00757         sourceCount += 1;
00758         totalBases += seq->size;
00759         freeDnaSeq(&seq);
00760         }
00761     else if (twoBitIsFile(fileName))
00762         {
00763         struct twoBitFile *tbf = twoBitOpen(fileName);
00764         struct twoBitIndex *index;
00765         totalBases += twoBitCheckTotalSize(tbf);
00766 
00767         for (index = tbf->indexList; index != NULL; index = index->next)
00768             {
00769             seq = readMaskedTwoBit(tbf, index->name, doMask);
00770             transCountBothStrands(seq, transGf);
00771             sourceCount += 1;
00772             freeDnaSeq(&seq);
00773             }
00774         twoBitClose(&tbf);
00775         }
00776     else 
00777         errAbort("Unrecognized file type %s", fileName);
00778     if (totalBases >= warnAt)
00779         errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00780     }
00781 
00782 /* Get space for entries in indexed of all reading frames. */
00783 for (isRc=0; isRc <= 1; ++isRc)
00784     {
00785     for (frame = 0; frame < 3; ++frame)
00786         {
00787         gf = transGf[isRc][frame];
00788         gfAllocLists(gf);
00789         gfZeroNonOverused(gf);
00790         AllocArray(gf->sources, sourceCount);
00791         gf->sourceCount = sourceCount;
00792         offset[isRc][frame] = 0;
00793         }
00794     }
00795 
00796 /* Scan through nibs a second time building index. */
00797 sourceCount = 0;
00798 for (i=0; i<fileCount; ++i)
00799     {
00800     fileName = fileNames[i];
00801     printf("Indexing %s\n", fileName);
00802     if (nibIsFile(fileName))
00803         {
00804         seq = readMaskedNib(fileName, doMask);
00805         transIndexBothStrands(seq, transGf, offset, sourceCount, fileName);
00806         freeDnaSeq(&seq);
00807         sourceCount += 1;
00808         }
00809     else        /* .2bit file */
00810         {
00811         struct twoBitFile *tbf = twoBitOpen(fileName);
00812         struct twoBitIndex *index;
00813         for (index = tbf->indexList; index != NULL; index = index->next)
00814             {
00815             char nameBuf[PATH_LEN+256];
00816             safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00817             seq = readMaskedTwoBit(tbf, index->name, doMask);
00818             transIndexBothStrands(seq, transGf, offset, sourceCount, nameBuf);
00819             sourceCount += 1;
00820             freeDnaSeq(&seq);
00821             }
00822         twoBitClose(&tbf);
00823         }
00824     }
00825 
00826 for (isRc=0; isRc <= 1; ++isRc)
00827     {
00828     for (frame = 0; frame < 3; ++frame)
00829         {
00830         gf = transGf[isRc][frame];
00831         gf->totalSeqSize = offset[isRc][frame];
00832         gfZeroOverused(gf);
00833         }
00834     }
00835 }
00836 
00837 static struct genoFind *gfSmallIndexSeq(struct genoFind *gf, bioSeq *seqList,
00838         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
00839         boolean isPep, boolean maskUpper)
00840 /* Make index for all seqs in list. */
00841 {
00842 int seqCount = slCount(seqList);
00843 bioSeq *seq;
00844 int i;
00845 bits32 offset = 0;
00846 struct gfSeqSource *ss;
00847 
00848 if (isPep)
00849     maskSimplePepRepeat(gf);
00850 for (seq = seqList; seq != NULL; seq = seq->next)
00851     gfCountSeq(gf, seq);
00852 gfAllocLists(gf);
00853 gfZeroNonOverused(gf);
00854 if (seqCount > 0)
00855     AllocArray(gf->sources, seqCount);
00856 gf->sourceCount = seqCount;
00857 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00858     {
00859     gfAddSeq(gf, seq, offset);
00860     ss = gf->sources+i;
00861     ss->seq = seq;
00862     ss->start = offset;
00863     offset += seq->size;
00864     ss->end = offset;
00865     if (maskUpper)
00866         ss->maskedBits = maskFromUpperCaseSeq(seq);
00867     }
00868 gf->totalSeqSize = offset;
00869 gfZeroOverused(gf);
00870 return gf;
00871 }
00872 
00873 static struct genoFind *gfLargeIndexSeq(struct genoFind *gf, bioSeq *seqList,
00874         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
00875         boolean isPep, boolean maskUpper)
00876 /* Make index for all seqs in list. */
00877 {
00878 int seqCount = slCount(seqList);
00879 bioSeq *seq;
00880 int i;
00881 bits32 offset = 0;
00882 struct gfSeqSource *ss;
00883 
00884 for (seq = seqList; seq != NULL; seq = seq->next)
00885     gfCountSeq(gf, seq);
00886 gfAllocLargeLists(gf);
00887 gfZeroNonOverused(gf);
00888 AllocArray(gf->sources, seqCount);
00889 gf->sourceCount = seqCount;
00890 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00891     {
00892     gfAddLargeSeq(gf, seq, offset);
00893     ss = gf->sources+i;
00894     ss->seq = seq;
00895     ss->start = offset;
00896     offset += seq->size;
00897     ss->end = offset;
00898     if (maskUpper)
00899         ss->maskedBits = maskFromUpperCaseSeq(seq);
00900     }
00901 gf->totalSeqSize = offset;
00902 gfZeroOverused(gf);
00903 return gf;
00904 }
00905 
00906 
00907 struct genoFind *gfIndexSeq(bioSeq *seqList,
00908         int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
00909         boolean isPep, boolean allowOneMismatch, boolean maskUpper,
00910         int stepSize)
00911 /* Make index for all seqs in list.  For DNA sequences upper case bits will
00912  * be unindexed. */
00913 {
00914 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, 
00915                                 oocFile, isPep, allowOneMismatch);
00916 if (stepSize == 0)
00917     stepSize = tileSize;
00918 if (gf->segSize > 0)
00919     {
00920     gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00921     }
00922 else
00923     {
00924     gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00925     }
00926 return gf;
00927 }
00928 
00929 static int bCmpSeqSource(const void *vTarget, const void *vRange)
00930 /* Compare function for binary search of gfSeqSource. */
00931 {
00932 const bits32 *pTarget = vTarget;
00933 bits32 target = *pTarget;
00934 const struct gfSeqSource *ss = vRange;
00935 
00936 if (target < ss->start) return -1;
00937 if (target >= ss->end) return 1;
00938 return 0;
00939 }
00940 
00941 static struct gfSeqSource *findSource(struct genoFind *gf, bits32 targetPos)
00942 /* Find source given target position. */
00943 {
00944 struct gfSeqSource *ss =  bsearch(&targetPos, gf->sources, gf->sourceCount, 
00945         sizeof(gf->sources[0]), bCmpSeqSource);
00946 if (ss == NULL)
00947     errAbort("Couldn't find source for %d", targetPos);
00948 return ss;
00949 }
00950 
00951 void gfClumpFree(struct gfClump **pClump)
00952 /* Free a single clump. */
00953 {
00954 struct gfClump *clump;
00955 if ((clump = *pClump) == NULL)
00956     return;
00957 freez(pClump);
00958 }
00959 
00960 void gfClumpFreeList(struct gfClump **pList)
00961 /* Free a list of dynamically allocated gfClump's */
00962 {
00963 struct gfClump *el, *next;
00964 
00965 for (el = *pList; el != NULL; el = next)
00966     {
00967     next = el->next;
00968     gfClumpFree(&el);
00969     }
00970 *pList = NULL;
00971 }
00972 
00973 void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f)
00974 /* Print out info on clump */
00975 {
00976 struct gfSeqSource *ss = clump->target;
00977 char *name = ss->fileName;
00978 
00979 if (name == NULL) name = ss->seq->name;
00980 fprintf(f, "%u-%u %s %u-%u, hits %d\n", 
00981         clump->qStart, clump->qEnd, name,
00982         clump->tStart - ss->start, clump->tEnd - ss->start,
00983         clump->hitCount);
00984 #ifdef SOMETIMES
00985 for (hit = clump->hitList; hit != NULL; hit = hit->next)
00986     fprintf(f, "   q %d, t %d, diag %d\n", hit->qStart, hit->tStart, hit->diagonal);
00987 #endif
00988 }
00989 
00990 /* Fast sorting routines for sorting gfHits on diagonal. 
00991  * More or less equivalent to system qsort, but with
00992  * comparison function inline.  Worth a little tweaking
00993  * since this is the bottleneck for the whole procedure. */
00994 
00995 static void gfHitSort2(struct gfHit **ptArray, int n);
00996 
00997 /* Some variables used by recursive function gfHitSort2
00998  * across all incarnations. */
00999 static struct gfHit **nosTemp, *nosSwap;
01000 
01001 static void gfHitSort2(struct gfHit **ptArray, int n)
01002 /* This is a fast recursive sort that uses a temporary
01003  * buffer (nosTemp) that has to be as big as the array
01004  * that is being sorted. */
01005 {
01006 struct gfHit **tmp, **pt1, **pt2;
01007 int n1, n2;
01008 
01009 /* Divide area to sort in two. */
01010 n1 = (n>>1);
01011 n2 = n - n1;
01012 pt1 = ptArray;
01013 pt2 =  ptArray + n1;
01014 
01015 /* Sort each area separately.  Handle small case (2 or less elements)
01016  * here.  Otherwise recurse to sort. */
01017 if (n1 > 2)
01018     gfHitSort2(pt1, n1);
01019 else if (n1 == 2 && pt1[0]->diagonal > pt1[1]->diagonal)
01020     {
01021     nosSwap = pt1[1];
01022     pt1[1] = pt1[0];
01023     pt1[0] = nosSwap;
01024     }
01025 if (n2 > 2)
01026     gfHitSort2(pt2, n2);
01027 else if (n2 == 2 && pt2[0]->diagonal > pt2[1]->diagonal)
01028     {
01029     nosSwap = pt2[1];
01030     pt2[1] = pt2[0];
01031     pt2[0] = nosSwap;
01032     }
01033 
01034 /* At this point both halves are internally sorted. 
01035  * Do a merge-sort between two halves copying to temp
01036  * buffer.  Then copy back sorted result to main buffer. */
01037 tmp = nosTemp;
01038 while (n1 > 0 && n2 > 0)
01039     {
01040     if ((*pt1)->diagonal <= (*pt2)->diagonal)
01041         {
01042         --n1;
01043         *tmp++ = *pt1++;
01044         }
01045     else
01046         {
01047         --n2;
01048         *tmp++ = *pt2++;
01049         }
01050     }
01051 /* One or both sides are now fully merged. */
01052 assert(tmp + n1 + n2 == nosTemp + n);
01053 
01054 /* If some of first side left to merge copy it to end of temp buf. */
01055 if (n1 > 0)
01056     memcpy(tmp, pt1, n1 * sizeof(*tmp));
01057 
01058 /* If some of second side left to merge, we finesse it here:
01059  * simply refrain from copying over it as we copy back temp buf. */
01060 memcpy(ptArray, nosTemp, (n - n2) * sizeof(*ptArray));
01061 }
01062 
01063 void gfHitSortDiagonal(struct gfHit **pList)
01064 /* Sort a singly linked list with Qsort and a temporary array. */
01065 {
01066 struct gfHit *list = *pList;
01067 if (list != NULL && list->next != NULL)
01068     {
01069     int count = slCount(list);
01070     struct gfHit *el;
01071     struct gfHit **array;
01072     int i;
01073     int byteSize = count*sizeof(*array);
01074     array = needLargeMem(byteSize);
01075     nosTemp = needLargeMem(byteSize);
01076     for (el = list, i=0; el != NULL; el = el->next, i++)
01077         array[i] = el;
01078     gfHitSort2(array, count);
01079     list = NULL;
01080     for (i=0; i<count; ++i)
01081         {
01082         array[i]->next = list;
01083         list = array[i];
01084         }
01085     freez(&array);
01086     freez(&nosTemp);
01087     slReverse(&list);
01088     *pList = list;       
01089     }
01090 }
01091 
01092 
01093 
01094 static int cmpQuerySize;
01095 
01096 #ifdef UNUSED
01097 static int gfHitCmpDiagonal(const void *va, const void *vb)
01098 /* Compare to sort based on 'diagonal' offset. */
01099 {
01100 const struct gfHit *a = *((struct gfHit **)va);
01101 const struct gfHit *b = *((struct gfHit **)vb);
01102 
01103 if (a->diagonal > b->diagonal)
01104     return 1;
01105 else if (a->diagonal == b->diagonal)
01106     return 0;
01107 else
01108     return -1;
01109 }
01110 #endif /* UNUSED */
01111 
01112 static int gfHitCmpTarget(const void *va, const void *vb)
01113 /* Compare to sort based on target offset. */
01114 {
01115 const struct gfHit *a = *((struct gfHit **)va);
01116 const struct gfHit *b = *((struct gfHit **)vb);
01117 
01118 if (a->tStart > b->tStart)
01119     return 1;
01120 else if (a->tStart == b->tStart)
01121     return 0;
01122 else
01123     return -1;
01124 }
01125 
01126 static int gfHitCmpQuery(const void *va, const void *vb)
01127 /* Compare to sort based on target offset. (Thanks AG) */
01128 {
01129 const struct gfHit *a = *((struct gfHit **)va);
01130 const struct gfHit *b = *((struct gfHit **)vb);
01131 
01132 if (a->qStart > b->qStart)
01133     return 1;
01134 else if (a->qStart == b->qStart)
01135     return 0;
01136 else
01137     return -1;
01138 }
01139 
01140 static void gfClumpComputeQueryCoverage(struct gfClump *clumpList, 
01141         int tileSize)
01142 /* Figure out how much of query is covered by clump list. (Thanks AG). */
01143 {
01144 struct gfClump *clump;
01145 struct gfHit *hit;
01146 struct gfHit *hitList;
01147 int qcov;
01148 int blockStart, blockEnd;
01149 
01150 for (clump = clumpList; clump != NULL; clump = clump->next)
01151     {
01152     hitList = clump->hitList;
01153     if ( hitList != NULL)
01154         {
01155         slSort(&hitList, gfHitCmpQuery);
01156         qcov = 0;
01157         blockStart = hitList->qStart;
01158         blockEnd = blockStart + tileSize;
01159         for (hit = hitList; hit != NULL; hit = hit->next)
01160             {
01161             if (hit->qStart > blockEnd)
01162                 {
01163                 qcov += blockEnd - blockStart;
01164                 blockStart = hit->qStart;
01165                 blockEnd = blockStart + tileSize;
01166                 }
01167             else if (hit->qStart + tileSize > blockEnd)
01168                 {
01169                 blockEnd = hit->qStart + tileSize;
01170                 }
01171             }
01172         qcov += blockEnd - blockStart;
01173         clump->queryCoverage = qcov;
01174         }
01175     }
01176 }
01177 
01178 
01179 
01180 static int gfClumpCmpQueryCoverage(const void *va, const void *vb)
01181 /* Compare to sort based on query coverage. */
01182 {
01183 const struct gfClump *a = *((struct gfClump **)va);
01184 const struct gfClump *b = *((struct gfClump **)vb);
01185 
01186 return (b->queryCoverage - a->queryCoverage);
01187 }
01188 
01189 static void findClumpBounds(struct gfClump *clump, int tileSize)
01190 /* Figure out qStart/qEnd tStart/tEnd from hitList */
01191 {
01192 struct gfHit *hit;
01193 int x;
01194 hit = clump->hitList;
01195 if (hit == NULL)
01196     return;
01197 clump->qStart = clump->qEnd = hit->qStart;
01198 clump->tStart = clump->tEnd = hit->tStart;
01199 for (hit = hit->next; hit != NULL; hit = hit->next)
01200     {
01201     x = hit->qStart;
01202     if (x < clump->qStart) clump->qStart = x;
01203     if (x > clump->qEnd) clump->qEnd = x;
01204     x = hit->tStart;
01205     if (x < clump->tStart) clump->tStart = x;
01206     if (x > clump->tEnd) clump->tEnd = x;
01207     }
01208 clump->tEnd += tileSize;
01209 clump->qEnd += tileSize;
01210 }
01211 
01212 static void targetClump(struct genoFind *gf, struct gfClump **pClumpList, struct gfClump *clump)
01213 /* Add target sequence to clump.  If clump had multiple targets split it into
01214  * multiple clumps, one per target.  Add clump(s) to list. */
01215 {
01216 struct gfSeqSource *ss = findSource(gf, clump->tStart);
01217 if (ss->end >= clump->tEnd)
01218     {
01219     /* Usual simple case: all of clump hits single target. */
01220     clump->target = ss;
01221     slAddHead(pClumpList, clump);
01222     }
01223 else
01224     {
01225     /* If we've gotten here, then the clump is split across multiple targets.
01226      * We'll have to split it into multiple clumps... */
01227     struct gfHit *hit, *nextHit, *inList, *outList, *oldList = clump->hitList;
01228     int hCount;
01229 
01230     while (oldList != NULL)
01231         {
01232         inList = outList = NULL;
01233         hCount = 0;
01234         for (hit = oldList; hit != NULL; hit = nextHit)
01235             {
01236             nextHit = hit->next;
01237             if (ss->start <= hit->tStart  && hit->tStart < ss->end)
01238                 {
01239                 ++hCount;
01240                 slAddHead(&inList, hit);
01241                 }
01242             else
01243                 {
01244                 slAddHead(&outList, hit);
01245                 }
01246             }
01247         slReverse(&inList);
01248         slReverse(&outList);
01249         if (hCount >= gf->minMatch)
01250             {
01251             struct gfClump *newClump;
01252             AllocVar(newClump);
01253             newClump->hitList = inList;
01254             newClump->hitCount = hCount;
01255             newClump->target = ss;
01256             findClumpBounds(newClump, gf->tileSize);
01257             slAddHead(pClumpList, newClump);
01258             }
01259         else
01260             {
01261             inList = NULL;
01262             }
01263         oldList = outList;
01264         if (oldList != NULL)
01265             {
01266             ss = findSource(gf, oldList->tStart);
01267             }
01268         }
01269     clump->hitList = NULL;      /* We ate up the hit list. */
01270     gfClumpFree(&clump);
01271     }
01272 }
01273 
01274 static int gfNearEnough = 300;
01275 
01276 static struct gfClump *clumpNear(struct genoFind *gf, struct gfClump *oldClumps, int minMatch)
01277 /* Go through clump list and make sure hits are also near each other.
01278  * If necessary divide clumps. */
01279 {
01280 struct gfClump *newClumps = NULL, *clump, *nextClump;
01281 struct gfHit *hit, *nextHit;
01282 int tileSize = gf->tileSize;
01283 bits32 lastT;
01284 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01285 
01286 for (clump = oldClumps; clump != NULL; clump = nextClump)
01287     {
01288     struct gfHit *newHits = NULL, *oldHits = clump->hitList;
01289     int clumpSize = 0;
01290     clump->hitCount = 0;
01291     clump->hitList = NULL;      /* Clump no longer owns list. */
01292     nextClump = clump->next;
01293     slSort(&oldHits, gfHitCmpTarget);
01294     lastT = oldHits->tStart;
01295     for (hit = oldHits; hit != NULL; hit = nextHit)
01296         {
01297         nextHit = hit->next;
01298         if (hit->tStart > nearEnough + lastT)
01299              {
01300              if (clumpSize >= minMatch)
01301                  {
01302                  slReverse(&newHits);
01303                  clump->hitList = newHits;
01304                  newHits = NULL;
01305                  clump->hitCount = clumpSize;
01306                  findClumpBounds(clump, tileSize);
01307                  targetClump(gf, &newClumps, clump);
01308                  AllocVar(clump);
01309                  }
01310              else
01311                  {
01312                  newHits = NULL;
01313                  clump->hitCount = 0;
01314                  }
01315              clumpSize = 0;
01316              }
01317         lastT = hit->tStart;
01318         slAddHead(&newHits, hit);
01319         ++clumpSize;
01320         }
01321     slReverse(&newHits);
01322     clump->hitList = newHits;
01323     if (clumpSize >= minMatch)
01324         {
01325         clump->hitCount = clumpSize;
01326         findClumpBounds(clump, tileSize);
01327         targetClump(gf, &newClumps, clump);
01328         }
01329     else
01330         {
01331         gfClumpFree(&clump);
01332         }
01333     }
01334 return newClumps;
01335 }
01336 
01337 static struct gfClump *clumpHits(struct genoFind *gf, struct gfHit *hitList, int minMatch)
01338 /* Clump together hits according to parameters in gf. */
01339 {
01340 struct gfClump *clumpList = NULL, *clump = NULL;
01341 int maxGap = gf->maxGap;
01342 struct gfHit *hit, *nextHit, *lastHit = NULL;
01343 int totalHits = 0, usedHits = 0, clumpCount = 0;
01344 int tileSize = gf->tileSize;
01345 int bucketShift = 16;           /* 64k buckets. */
01346 bits32 bucketSize = (1<<bucketShift);
01347 int bucketCount = (gf->totalSeqSize >> bucketShift) + 1;
01348 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01349 bits32 boundary = bucketSize - nearEnough;
01350 int i;
01351 struct gfHit **buckets = NULL, **pb;
01352 
01353 /* Sort hit list into buckets. */
01354 AllocArray(buckets, bucketCount);
01355 for (hit = hitList; hit != NULL; hit = nextHit)
01356     {
01357     nextHit = hit->next;
01358     pb = buckets + (hit->tStart >> bucketShift);
01359     slAddHead(pb, hit);
01360     }
01361 
01362 /* Sort each bucket on diagonal and clump. */
01363 for (i=0; i<bucketCount; ++i)
01364     {
01365     int clumpSize;
01366     bits32 maxT;
01367     struct gfHit *clumpHits;
01368     pb = buckets + i;
01369     gfHitSortDiagonal(pb);
01370     for (hit = *pb; hit != NULL; )
01371          {
01372          /* Each time through this loop will get info on a clump.  Will only
01373           * actually create clump if it is big enough though. */
01374          clumpSize = 0;
01375          clumpHits = lastHit = NULL;
01376          maxT = 0;
01377          for (; hit != NULL; hit = nextHit)
01378              {
01379              nextHit = hit->next;
01380              if (lastHit != NULL && hit->diagonal - lastHit->diagonal > maxGap)
01381                  break;
01382              if (hit->tStart > maxT) maxT = hit->tStart;
01383              slAddHead(&clumpHits, hit);
01384              ++clumpSize;
01385              ++totalHits;
01386              lastHit = hit;
01387              }
01388          if (maxT > boundary && i < bucketCount-1)
01389              {
01390              /* Move clumps that are near boundary to next bucket to give them a
01391               * chance to merge with hits there. */
01392              buckets[i+1] = slCat(clumpHits, buckets[i+1]);
01393              }
01394          else if (clumpSize >= minMatch)
01395              {
01396              /* Save clumps that are large enough on list. */
01397              AllocVar(clump);
01398              slAddHead(&clumpList, clump);
01399              clump->hitCount = clumpSize;
01400              clump->hitList = clumpHits;
01401              usedHits += clumpSize;
01402              ++clumpCount;
01403              }
01404          }
01405     *pb = NULL;
01406     boundary += bucketSize;
01407     }
01408 clumpList = clumpNear(gf, clumpList, minMatch);
01409 gfClumpComputeQueryCoverage(clumpList, tileSize);       /* Thanks AG */
01410 slSort(&clumpList, gfClumpCmpQueryCoverage);
01411 
01412 #ifdef DEBUG
01413 uglyf("Dumping clumps B\n");
01414 for (clump = clumpList; clump != NULL; clump = clump->next)     /* uglyf */
01415     {
01416     uglyf(" %d %d %s %d %d (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name,   clump->tStart, clump->tEnd, clump->hitCount);
01417     }
01418 #endif /* DEBUG */
01419 freez(&buckets);
01420 return clumpList;
01421 }
01422 
01423 
01424 static struct gfHit *gfFastFindDnaHits(struct genoFind *gf, struct dnaSeq *seq, 
01425         Bits *qMaskBits,  int qMaskOffset, struct lm *lm, int *retHitCount,
01426         struct gfSeqSource *target, int tMin, int tMax)
01427 /* Find hits associated with one sequence. This is is special fast
01428  * case for DNA that is in an unsegmented index. */
01429 {
01430 struct gfHit *hitList = NULL, *hit;
01431 int size = seq->size;
01432 int tileSizeMinusOne = gf->tileSize - 1;
01433 int mask = gf->tileMask;
01434 DNA *dna = seq->dna;
01435 int i, j;
01436 bits32 bits = 0;
01437 bits32 bVal;
01438 int listSize;
01439 bits32 qStart, *tList;
01440 int hitCount = 0;
01441 
01442 for (i=0; i<tileSizeMinusOne; ++i)
01443     {
01444     bVal = ntValNoN[(int)dna[i]];
01445     bits <<= 2;
01446     bits += bVal;
01447     }
01448 for (i=tileSizeMinusOne; i<size; ++i)
01449     {
01450     bVal = ntValNoN[(int)dna[i]];
01451     bits <<= 2;
01452     bits += bVal;
01453     bits &= mask;
01454     listSize = gf->listSizes[bits];
01455     if (listSize != 0)
01456         {
01457         qStart = i-tileSizeMinusOne;
01458         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, gf->tileSize) == 0)
01459             {
01460             tList = gf->lists[bits];
01461             for (j=0; j<listSize; ++j)
01462                 {
01463                 int tStart = tList[j];
01464                 if (target == NULL || 
01465                         (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
01466                     {
01467                     lmAllocVar(lm, hit);
01468                     hit->qStart = qStart;
01469                     hit->tStart = tStart;
01470                     hit->diagonal = tStart + size - qStart;
01471                     slAddHead(&hitList, hit);
01472                     ++hitCount;
01473                     }
01474                 }
01475             }
01476         }
01477     }
01478 *retHitCount = hitCount;
01479 return hitList;
01480 }
01481 
01482 static struct gfHit *gfStraightFindHits(struct genoFind *gf, aaSeq *seq, 
01483         Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01484         struct gfSeqSource *target, int tMin, int tMax)
01485 /* Find hits associated with one sequence in a non-segmented
01486  * index where hits match exactly. */
01487 {
01488 struct gfHit *hitList = NULL, *hit;
01489 int size = seq->size;
01490 int tileSize = gf->tileSize;
01491 int lastStart = size - tileSize;
01492 char *poly = seq->dna;
01493 int i, j;
01494 int tile;
01495 int listSize;
01496 bits32 qStart, *tList;
01497 int hitCount = 0;
01498 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01499 
01500 initNtLookup();
01501 for (i=0; i<=lastStart; ++i)
01502     {
01503     tile = makeTile(poly+i, tileSize);
01504     if (tile < 0)
01505         continue;
01506     listSize = gf->listSizes[tile];
01507     if (listSize > 0)
01508         {
01509         qStart = i;
01510         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01511             {
01512             tList = gf->lists[tile];
01513             for (j=0; j<listSize; ++j)
01514                 {
01515                 int tStart = tList[j];
01516                 if (target == NULL || 
01517                         (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
01518                     {
01519                     lmAllocVar(lm,hit);
01520                     hit->qStart = qStart;
01521                     hit->tStart = tStart;
01522                     hit->diagonal = tStart + size - qStart;
01523                     slAddHead(&hitList, hit);
01524                     ++hitCount;
01525                     }
01526                 }
01527             }
01528         }
01529     }
01530 *retHitCount = hitCount;
01531 return hitList;
01532 }
01533 
01534 static struct gfHit *gfStraightFindNearHits(struct genoFind *gf, aaSeq *seq, 
01535         Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01536         struct gfSeqSource *target, int tMin, int tMax)
01537 /* Find hits associated with one sequence in a non-segmented
01538  * index where hits can mismatch in one letter. */
01539 {
01540 struct gfHit *hitList = NULL, *hit;
01541 int size = seq->size;
01542 int tileSize = gf->tileSize;
01543 int lastStart = size - tileSize;
01544 char *poly = seq->dna;
01545 int i, j;
01546 int tile;
01547 int listSize;
01548 bits32 qStart, *tList;
01549 int hitCount = 0;
01550 int varPos, varVal;     /* Variable position. */
01551 int (*makeTile)(char *poly, int n); 
01552 int alphabetSize;
01553 char oldChar, zeroChar;
01554 int *seqValLookup;
01555 int posMul, avoid;
01556 
01557 initNtLookup();
01558 if (gf->isPep)
01559     {
01560     makeTile = gfPepTile;
01561     alphabetSize = 20;
01562     zeroChar = 'A';
01563     seqValLookup = aaVal;
01564     }
01565 else
01566     {
01567     makeTile = gfDnaTile;
01568     alphabetSize = 4;
01569     zeroChar = 't';
01570     seqValLookup = ntVal;
01571     }
01572 
01573 for (i=0; i<=lastStart; ++i)
01574     {
01575     posMul = 1;
01576     for (varPos = tileSize-1; varPos >=0; --varPos)
01577         {
01578         /* Make a tile that has zero value at variable position. */
01579         oldChar = poly[i+varPos];
01580         poly[i+varPos] = zeroChar;
01581         tile = makeTile(poly+i, tileSize);
01582         poly[i+varPos] = oldChar;
01583 
01584         /* Avoid checking the unmodified tile multiple times. */
01585         if (varPos == 0)
01586             avoid = -1;
01587         else
01588             avoid = seqValLookup[(int)oldChar];
01589 
01590         if (tile >= 0)
01591             {
01592             /* Look up all possible values of variable position. */
01593             for (varVal=0; varVal<alphabetSize; ++varVal)
01594                 {
01595                 if (varVal != avoid)
01596                     {
01597                     listSize = gf->listSizes[tile];
01598                     if (listSize > 0)
01599                         {
01600                         qStart = i;
01601                         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01602                             {
01603                             tList = gf->lists[tile];
01604                             for (j=0; j<listSize; ++j)
01605                                 {
01606                                 int tStart = tList[j];
01607                                 if (target == NULL || 
01608                                         (target == findSource(gf, tStart) 
01609                                         && tStart >= tMin && tStart < tMax) ) 
01610                                     {
01611                                     lmAllocVar(lm,hit);
01612                                     hit->qStart = qStart;
01613                                     hit->tStart = tStart;
01614                                     hit->diagonal = tStart + size - qStart;
01615                                     slAddHead(&hitList, hit);
01616                                     ++hitCount;
01617                                     }
01618                                 }
01619                             }
01620                         }
01621                     }
01622                 tile += posMul;
01623                 }
01624             }
01625         posMul *= alphabetSize;
01626         }
01627     }
01628 *retHitCount = hitCount;
01629 return hitList;
01630 }
01631 
01632 static struct gfHit *gfSegmentedFindHits(struct genoFind *gf, aaSeq *seq, 
01633         Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01634         struct gfSeqSource *target, int tMin, int tMax)
01635 /* Find hits associated with one sequence in general case in a segmented
01636  * index. */
01637 {
01638 struct gfHit *hitList = NULL, *hit;
01639 int size = seq->size;
01640 int tileSize = gf->tileSize;
01641 int tileTailSize = gf->segSize;
01642 int tileHeadSize = gf->tileSize - tileTailSize;
01643 int lastStart = size - tileSize;
01644 char *poly = seq->dna;
01645 int i, j;
01646 int tileHead, tileTail;
01647 int listSize;
01648 bits32 qStart;
01649 bits16 *endList;
01650 int hitCount = 0;
01651 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01652 
01653 
01654 initNtLookup();
01655 for (i=0; i<=lastStart; ++i)
01656     {
01657     if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01658         {
01659         tileHead = makeTile(poly+i, tileHeadSize);
01660         if (tileHead < 0)
01661             continue;
01662         tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01663         if (tileTail < 0)
01664             continue;
01665         listSize = gf->listSizes[tileHead];
01666         qStart = i;
01667         endList = gf->endLists[tileHead];
01668         for (j=0; j<listSize; ++j)
01669             {
01670             if (endList[0] == tileTail)
01671                 {
01672                 int tStart = (endList[1]<<16) + endList[2];
01673                 if (target == NULL || 
01674                         (target == findSource(gf, tStart) 
01675                         && tStart >= tMin && tStart < tMax) ) 
01676                     {
01677                     lmAllocVar(lm,hit);
01678                     hit->qStart = qStart;
01679                     hit->tStart = tStart;
01680                     hit->diagonal = tStart + size - qStart;
01681                     slAddHead(&hitList, hit);
01682                     ++hitCount;
01683                     }
01684                 }
01685             endList += 3;
01686             }
01687         }
01688     }
01689 *retHitCount = hitCount;
01690 return hitList;
01691 }
01692 
01693 static struct gfHit *gfSegmentedFindNearHits(struct genoFind *gf, 
01694         aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
01695         struct gfSeqSource *target, int tMin, int tMax)
01696 /* Find hits associated with one sequence in a segmented
01697  * index where one mismatch is allowed. */
01698 {
01699 struct gfHit *hitList = NULL, *hit;
01700 int size = seq->size;
01701 int tileSize = gf->tileSize;
01702 int tileTailSize = gf->segSize;
01703 int tileHeadSize = gf->tileSize - tileTailSize;
01704 int lastStart = size - tileSize;
01705 char *poly = seq->dna;
01706 int i, j;
01707 int tileHead, tileTail;
01708 int listSize;
01709 bits32 qStart;
01710 bits16 *endList;
01711 int hitCount = 0;
01712 int varPos, varVal;     /* Variable position. */
01713 int (*makeTile)(char *poly, int n); 
01714 int alphabetSize;
01715 char oldChar, zeroChar;
01716 int headPosMul, tailPosMul, avoid;
01717 boolean modTail;
01718 int *seqValLookup;
01719 
01720 
01721 initNtLookup();
01722 if (gf->isPep)
01723     {
01724     makeTile = gfPepTile;
01725     alphabetSize = 20;
01726     zeroChar = 'A';
01727     seqValLookup = aaVal;
01728     }
01729 else
01730     {
01731     makeTile = gfDnaTile;
01732     alphabetSize = 4;
01733     zeroChar = 't';
01734     seqValLookup = ntVal;
01735     }
01736 
01737 for (i=0; i<=lastStart; ++i)
01738     {
01739     if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01740         {
01741         headPosMul = tailPosMul = 1;
01742         for (varPos = tileSize-1; varPos >= 0; --varPos)
01743             {
01744             /* Make a tile that has zero value at variable position. */
01745             modTail = (varPos >= tileHeadSize);
01746             oldChar = poly[i+varPos];
01747             poly[i+varPos] = zeroChar;
01748             tileHead = makeTile(poly+i, tileHeadSize);
01749             tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01750             poly[i+varPos] = oldChar;
01751 
01752             /* Avoid checking the unmodified tile multiple times. */
01753             if (varPos == 0)
01754                 avoid = -1;
01755             else
01756                 avoid = seqValLookup[(int)oldChar];
01757 
01758             if (tileHead >= 0 && tileTail >= 0)
01759                 {
01760                 for (varVal=0; varVal<alphabetSize; ++varVal)
01761                     {
01762                     if (varVal != avoid)
01763                         {
01764                         listSize = gf->listSizes[tileHead];
01765                         qStart = i;
01766                         endList = gf->endLists[tileHead];
01767                         for (j=0; j<listSize; ++j)
01768                             {
01769                             if (endList[0] == tileTail)
01770                                 {
01771                                 int tStart = (endList[1]<<16) + endList[2];
01772                                 if (target == NULL || 
01773                                         (target == findSource(gf, tStart) 
01774                                         && tStart >= tMin && tStart < tMax) ) 
01775                                     {
01776                                     lmAllocVar(lm,hit);
01777                                     hit->qStart = qStart;
01778                                     hit->tStart = tStart;
01779                                     hit->diagonal = tStart + size - qStart;
01780                                     slAddHead(&hitList, hit);
01781                                     ++hitCount;
01782                                     }
01783                                 }
01784                             endList += 3;
01785                             }
01786                         }
01787                     if (modTail)
01788                         tileTail += tailPosMul;
01789                     else 
01790                         tileHead += headPosMul;
01791                     }
01792                 }
01793             if (modTail)
01794                 tailPosMul *= alphabetSize;
01795             else 
01796                 headPosMul *= alphabetSize;
01797             }
01798         }
01799     }
01800 *retHitCount = hitCount;
01801 return hitList;
01802 }
01803 
01804 
01805 static struct gfHit *gfFindHitsWithQmask(struct genoFind *gf, bioSeq *seq,
01806         Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, 
01807         struct gfSeqSource *target, int tMin, int tMax)
01808 /* Find hits associated with one sequence soft-masking seq according to qMaskBits.
01809  * The hits will be in genome rather than chromosome coordinates. */
01810 {
01811 struct gfHit *hitList = NULL;
01812 if (gf->segSize == 0 && !gf->isPep && !gf->allowOneMismatch)
01813     {
01814     hitList = gfFastFindDnaHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01815         target, tMin, tMax);
01816     }
01817 else
01818     {
01819     if (gf->segSize == 0)
01820         {
01821         if (gf->allowOneMismatch)
01822             {
01823             hitList = gfStraightFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, 
01824                 retHitCount, target, tMin, tMax);
01825             }
01826         else
01827             {
01828             hitList = gfStraightFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount, 
01829                 target, tMin, tMax);
01830             }
01831         }
01832     else
01833         {
01834         if (gf->allowOneMismatch)
01835             {
01836             hitList = gfSegmentedFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01837                 target, tMin, tMax);
01838             }
01839         else
01840             {
01841             hitList = gfSegmentedFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01842                 target, tMin, tMax);
01843             }
01844         }
01845     }
01846 return hitList;
01847 }
01848 
01849 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, 
01850         Bits *qMaskBits, int qMaskOffset,
01851         struct lm *lm, int *retHitCount)
01852 /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */
01853 {
01854 struct gfClump *clumpList = NULL;
01855 struct gfHit *hitList;
01856 int minMatch = gf->minMatch;
01857 
01858 #ifdef OLD      /* stepSize makes this obsolete. */
01859 if (seq->size < gf->tileSize * (gf->minMatch+1))
01860      minMatch = 1;
01861 #endif /* OLD */
01862 
01863 hitList =  gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01864         retHitCount, NULL, 0, 0);
01865 cmpQuerySize = seq->size;
01866 clumpList = clumpHits(gf, hitList, minMatch);
01867 return clumpList;
01868 }
01869 
01870 struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, 
01871         Bits *qMaskBits, int qMaskOffset, struct lm *lm, 
01872         struct gfSeqSource *target, int tMin, int tMax)
01873 /* Find hits restricted to one particular region. 
01874  * The hits returned by this will be in target sequence
01875  * coordinates rather than concatenated whole genome
01876  * coordinates as hits inside of clumps usually are.  */
01877 {
01878 int targetStart;
01879 struct gfHit *hitList, *hit;
01880 int hitCount;
01881 
01882 targetStart = target->start;
01883 hitList =  gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01884         &hitCount, target, tMin + targetStart, tMax + targetStart);
01885 for (hit = hitList; hit != NULL; hit = hit->next)
01886     hit->tStart -= targetStart;
01887 return hitList;
01888 }
01889 
01890 struct gfClump *gfFindClumps(struct genoFind *gf, bioSeq *seq, struct lm *lm, int *retHitCount)
01891 /* Find clumps associated with one sequence. */
01892 {
01893 return gfFindClumpsWithQmask(gf, seq, NULL, 0, lm, retHitCount);
01894 }
01895 
01896 
01897 void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount)
01898 /* Find clumps associated with one sequence in three translated reading frames. */
01899 {
01900 int frame;
01901 int oneHit;
01902 int hitCount = 0;
01903 for (frame = 0; frame < 3; ++frame)
01904     {
01905     clumps[frame] = gfFindClumps(gfs[frame], seq, lm, &oneHit);
01906     hitCount += oneHit;
01907     }
01908 *retHitCount = hitCount;
01909 }
01910 
01911 void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3], 
01912         struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount)
01913 /* Find clumps associated with three sequences in three translated 
01914  * reading frames. Used for translated/translated protein comparisons. */
01915 {
01916 int qFrame;
01917 int oneHit;
01918 int hitCount = 0;
01919 
01920 for (qFrame = 0; qFrame<3; ++qFrame)
01921     {
01922     gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit);
01923     hitCount += oneHit;
01924     }
01925 *retHitCount = hitCount;
01926 }
01927 
01928 void gfMakeOoc(char *outName, char *files[], int fileCount, 
01929         int tileSize, bits32 maxPat, enum gfType tType)
01930 /* Count occurences of tiles in seqList and make a .ooc file. */
01931 {
01932 boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX);
01933 struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize,
01934         maxPat, NULL, dbIsPep, FALSE);
01935 bits32 *sizes = gf->listSizes;
01936 int tileSpaceSize = gf->tileSpaceSize;
01937 bioSeq *seq, *seqList;
01938 bits32 sig = oocSig, psz = tileSize;
01939 bits32 i;
01940 int oocCount = 0;
01941 char *inName;
01942 FILE *f = mustOpen(outName, "w");
01943 
01944 if (gf->segSize > 0)
01945     errAbort("Don't yet know how to make ooc files for large tile sizes.");
01946 for (i=0; i<fileCount; ++i)
01947     {
01948     inName = files[i];
01949     printf("Loading %s\n", inName);
01950     if (nibIsFile(inName))
01951         {
01952         seqList = nibLoadAll(inName);
01953         }
01954     else if (twoBitIsFile(inName))
01955         {
01956         seqList = twoBitLoadAll(inName);
01957         for (seq = seqList; seq != NULL; seq = seq->next)
01958             toLowerN(seq->dna, seq->size);
01959         }
01960     else
01961         {
01962         seqList = faReadAllSeq(inName, tType != gftProt);
01963         }
01964     printf("Counting %s\n", inName);
01965     for (seq = seqList; seq != NULL; seq = seq->next)
01966         {
01967         int isRc;
01968         for (isRc = 0; isRc <= 1; ++isRc)
01969             {
01970             if (tType == gftDnaX || tType == gftRnaX)
01971                 {
01972                 struct trans3 *t3 = trans3New(seq);
01973                 int frame;
01974                 for (frame=0; frame<3; ++frame)
01975                     {
01976                     gfCountSeq(gf, t3->trans[frame]);
01977                     }
01978                 trans3Free(&t3);
01979                 }
01980             else
01981                 {
01982                 gfCountSeq(gf, seq);
01983                 }
01984             if (tType == gftProt || tType == gftRnaX)
01985                 break;
01986             else 
01987                 {
01988                 reverseComplement(seq->dna, seq->size);
01989                 }
01990             }
01991         }
01992     freeDnaSeqList(&seqList);
01993     }
01994 printf("Writing %s\n", outName);
01995 writeOne(f, sig);
01996 writeOne(f, psz);
01997 for (i=0; i<tileSpaceSize; ++i)
01998     {
01999     if (sizes[i] >= maxPat)
02000         {
02001         writeOne(f, i);
02002         ++oocCount;
02003         }
02004     }
02005 carefulClose(&f);
02006 genoFindFree(&gf);
02007 printf("Wrote %d overused %d-mers to %s\n", oocCount, tileSize, outName);
02008 }
02009 
02010 struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name)
02011 /* Find target of given name.  Return NULL if none. */
02012 {
02013 struct gfSeqSource *source = gf->sources;
02014 int count = gf->sourceCount;
02015 
02016 if (source->seq == NULL)        /* Use first source to see if seq or file. */
02017     {
02018     char rootName[256];
02019     while (--count >= 0)
02020         {
02021         splitPath(source->fileName, NULL, rootName, NULL);
02022         if (sameString(name, rootName))
02023              return source;
02024         }
02025     }
02026 else
02027     {
02028     while (--count >= 0)
02029         {
02030         if (sameString(source->seq->name, name))
02031             return source;
02032         source += 1;
02033         }
02034     }
02035 return NULL;
02036 }
02037 
02038 static void mergeAdd(struct binKeeper *bk, int start, int end, struct gfSeqSource *src)
02039 /* Add interval to bin-keeper, merging with any existing overlapping
02040  * intervals. */
02041 {
02042 struct binElement *iEl, *iList = binKeeperFind(bk, start, end);
02043 for (iEl = iList; iEl != NULL; iEl = iEl->next)
02044     {
02045     if (iEl->start < start)
02046         start = iEl->start;
02047     if (iEl->end > end)
02048         end = iEl->end;
02049     binKeeperRemove(bk, iEl->start, iEl->end, src);
02050     }
02051 slFreeList(&iList);
02052 binKeeperAdd(bk, start, end, src);
02053 }
02054 
02055 static struct gfClump *pcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, 
02056         char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
02057 /* Find possible PCR hits.  The fPrimer and rPrimer are on the same strand. */
02058 {
02059 struct gfClump *clumpList = NULL;
02060 int tileSize = gf->tileSize;
02061 int fTile;
02062 int fTileCount = fPrimerSize - tileSize;
02063 int *rTiles, rTile;
02064 int rTileCount = rPrimerSize - tileSize;
02065 int fTileIx,rTileIx,fPosIx,rPosIx;
02066 bits32 *fPosList, fPos, *rPosList, rPos;
02067 int fPosListSize, rPosListSize;
02068 struct hash *targetHash = newHash(0);
02069 
02070 /* Build up array of all tiles in reverse primer. */
02071 AllocArray(rTiles, rTileCount);
02072 for (rTileIx = 0; rTileIx<rTileCount; ++rTileIx)
02073     {
02074     rTiles[rTileIx] = gfDnaTile(rPrimer + rTileIx, tileSize);
02075     if (rTiles[rTileIx] == -1)
02076         errAbort("Bad char in reverse primer sequence: %s", rPrimer);
02077     }
02078 
02079 /* Loop through all tiles in forward primer. */
02080 for (fTileIx=0; fTileIx<fTileCount; ++fTileIx)
02081     {
02082     fTile = gfDnaTile(fPrimer + fTileIx, tileSize);
02083     if (fTile >= 0)
02084         {
02085         fPosListSize = gf->listSizes[fTile];
02086         fPosList = gf->lists[fTile];
02087         for (fPosIx=0; fPosIx < fPosListSize; ++fPosIx)
02088             {
02089             fPos = fPosList[fPosIx];
02090             /* Loop through hits to reverse primer. */
02091             for (rTileIx=0; rTileIx < rTileCount; ++rTileIx)
02092                 {
02093                 rTile = rTiles[rTileIx];
02094                 rPosListSize = gf->listSizes[rTile];
02095                 rPosList = gf->lists[rTile];
02096                 for (rPosIx=0; rPosIx < rPosListSize; ++rPosIx)
02097                     {
02098                     rPos = rPosList[rPosIx];
02099                     if (rPos > fPos)
02100                         {
02101                         int distance = rPos - fPos;
02102                         if (distance >= minDistance && distance <= maxDistance)
02103                             {
02104                             struct gfSeqSource *target = findSource(gf, fPos);
02105                             if (rPos < target->end)
02106                                 {
02107                                 struct binKeeper *bk;
02108                                 int tStart = target->start;
02109                                 char *tName = target->fileName;
02110                                 if (tName == NULL)
02111                                     tName = target->seq->name;
02112                                 if ((bk = hashFindVal(targetHash, tName)) == NULL)
02113                                     {
02114                                     bk = binKeeperNew(0, target->end - tStart);
02115                                     hashAdd(targetHash, tName, bk);
02116                                     }
02117                                 mergeAdd(bk, fPos - tStart, rPos - tStart, target);
02118                                 }
02119                             }
02120                         }
02121                     }
02122                 }
02123             }
02124         }
02125     }
02126 
02127 /* Make clumps and clean up temporary structures. */
02128     {
02129     struct hashEl *tList, *tEl;
02130     tList = hashElListHash(targetHash);
02131     for (tEl = tList; tEl != NULL; tEl = tEl->next)
02132         {
02133         struct binKeeper *bk = tEl->val;
02134         struct binElement *bkList, *bkEl;
02135         bkList = binKeeperFindAll(bk);
02136         for (bkEl = bkList; bkEl != NULL; bkEl = bkEl->next)
02137             {
02138             struct gfClump *clump;
02139             AllocVar(clump);
02140             clump->target = bkEl->val;
02141             clump->tStart = bkEl->start;
02142             clump->tEnd = bkEl->end;
02143             slAddHead(&clumpList, clump);
02144             }
02145         slFreeList(&bkList);
02146         binKeeperFree(&bk);
02147         }
02148     hashElFreeList(&tList);
02149     hashFree(&targetHash);
02150     }
02151 freez(&rTiles);
02152 return clumpList;       
02153 }
02154 
02155 struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, 
02156         char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
02157 /* Find possible PCR hits.  The fPrimer and rPrimer are on opposite strands. */
02158 {
02159 struct gfClump *clumpList;
02160 if (gf->segSize > 0)
02161     errAbort("Can't do PCR on large tile sizes");
02162 if (gf->isPep)
02163     errAbort("Can't do PCR on protein/translated index");
02164 tolowers(fPrimer);
02165 tolowers(rPrimer);
02166 reverseComplement(rPrimer, rPrimerSize);
02167 clumpList = pcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
02168         minDistance, maxDistance);
02169 reverseComplement(rPrimer, rPrimerSize);
02170 return clumpList;
02171 }
02172 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2