jkOwnLib/genoFind.c File Reference

#include "common.h"
#include <signal.h>
#include "obscure.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "nib.h"
#include "twoBit.h"
#include "fa.h"
#include "dystring.h"
#include "errabort.h"
#include "sig.h"
#include "ooc.h"
#include "genoFind.h"
#include "trans3.h"
#include "binRange.h"

Include dependency graph for genoFind.c:

Go to the source code of this file.

Functions

char * gfSignature ()
static void gfPipeHandler (int sigNum)
void gfCatchPipes ()
int gfReadMulti (int sd, void *vBuf, size_t size)
void genoFindFree (struct genoFind **pGenoFind)
int gfPowerOf20 (int n)
void gfCheckTileSize (int tileSize, boolean isPep)
static struct genoFindgfNewEmpty (int minMatch, int maxGap, int tileSize, int stepSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch)
static void initNtLookup ()
int gfDnaTile (DNA *dna, int n)
int gfPepTile (AA *pep, int n)
static void gfCountSeq (struct genoFind *gf, bioSeq *seq)
static int gfCountTilesInNib (struct genoFind *gf, int stepSize, char *fileName)
long long maxTotalBases ()
static long long twoBitCheckTotalSize (struct twoBitFile *tbf)
static void gfCountTilesInTwoBit (struct genoFind *gf, int stepSize, char *fileName, int *retSeqCount, long long *retBaseCount)
static int gfAllocLists (struct genoFind *gf)
static int gfAllocLargeLists (struct genoFind *gf)
static void gfAddSeq (struct genoFind *gf, bioSeq *seq, bits32 offset)
static void gfAddLargeSeq (struct genoFind *gf, bioSeq *seq, bits32 offset)
static int gfAddTilesInNib (struct genoFind *gf, char *fileName, bits32 offset, int stepSize)
static void gfZeroOverused (struct genoFind *gf)
static void gfZeroNonOverused (struct genoFind *gf)
genoFindgfIndexNibsAndTwoBits (int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, int stepSize)
static void maskSimplePepRepeat (struct genoFind *gf)
dnaSeqreadMaskedNib (char *fileName, boolean mask)
dnaSeqreadMaskedTwoBit (struct twoBitFile *tbf, char *seqName, boolean mask)
static void transCountBothStrands (struct dnaSeq *seq, struct genoFind *transGf[2][3])
static void transIndexBothStrands (struct dnaSeq *seq, struct genoFind *transGf[2][3], bits32 offset[2][3], int sourceIx, char *fileName)
void gfIndexTransNibsAndTwoBits (struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, boolean doMask, int stepSize)
static struct genoFindgfSmallIndexSeq (struct genoFind *gf, bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean maskUpper)
static struct genoFindgfLargeIndexSeq (struct genoFind *gf, bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean maskUpper)
genoFindgfIndexSeq (bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch, boolean maskUpper, int stepSize)
static int bCmpSeqSource (const void *vTarget, const void *vRange)
static struct gfSeqSourcefindSource (struct genoFind *gf, bits32 targetPos)
void gfClumpFree (struct gfClump **pClump)
void gfClumpFreeList (struct gfClump **pList)
void gfClumpDump (struct genoFind *gf, struct gfClump *clump, FILE *f)
static void gfHitSort2 (struct gfHit **ptArray, int n)
void gfHitSortDiagonal (struct gfHit **pList)
static int gfHitCmpTarget (const void *va, const void *vb)
static int gfHitCmpQuery (const void *va, const void *vb)
static void gfClumpComputeQueryCoverage (struct gfClump *clumpList, int tileSize)
static int gfClumpCmpQueryCoverage (const void *va, const void *vb)
static void findClumpBounds (struct gfClump *clump, int tileSize)
static void targetClump (struct genoFind *gf, struct gfClump **pClumpList, struct gfClump *clump)
static struct gfClumpclumpNear (struct genoFind *gf, struct gfClump *oldClumps, int minMatch)
static struct gfClumpclumpHits (struct genoFind *gf, struct gfHit *hitList, int minMatch)
static struct gfHitgfFastFindDnaHits (struct genoFind *gf, struct dnaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
static struct gfHitgfStraightFindHits (struct genoFind *gf, aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
static struct gfHitgfStraightFindNearHits (struct genoFind *gf, aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
static struct gfHitgfSegmentedFindHits (struct genoFind *gf, aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
static struct gfHitgfSegmentedFindNearHits (struct genoFind *gf, aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
static struct gfHitgfFindHitsWithQmask (struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax)
gfClumpgfFindClumpsWithQmask (struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount)
gfHitgfFindHitsInRegion (struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, struct gfSeqSource *target, int tMin, int tMax)
gfClumpgfFindClumps (struct genoFind *gf, bioSeq *seq, struct lm *lm, int *retHitCount)
void gfTransFindClumps (struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount)
void gfTransTransFindClumps (struct genoFind *gfs[3], aaSeq *seqs[3], struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount)
void gfMakeOoc (char *outName, char *files[], int fileCount, int tileSize, bits32 maxPat, enum gfType tType)
gfSeqSourcegfFindNamedSource (struct genoFind *gf, char *name)
static void mergeAdd (struct binKeeper *bk, int start, int end, struct gfSeqSource *src)
static struct gfClumppcrClumps (struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
gfClumpgfPcrClumps (struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)

Variables

static char const rcsid [] = "$Id: genoFind.c,v 1.23 2007/02/04 05:09:02 kent Exp $"
volatile boolean pipeBroke = FALSE
static int ntLookup [256]
static struct gfHit ** nosTemp
static struct gfHitnosSwap
static int cmpQuerySize
static int gfNearEnough = 300


Function Documentation

static int bCmpSeqSource ( const void *  vTarget,
const void *  vRange 
) [static]

Definition at line 929 of file genoFind.c.

References bits32, ss, and gfSeqSource::start.

Referenced by findSource().

00931 {
00932 const bits32 *pTarget = vTarget;
00933 bits32 target = *pTarget;
00934 const struct gfSeqSource *ss = vRange;
00935 
00936 if (target < ss->start) return -1;
00937 if (target >= ss->end) return 1;
00938 return 0;
00939 }

Here is the caller graph for this function:

static struct gfClump* clumpHits ( struct genoFind gf,
struct gfHit hitList,
int  minMatch 
) [static, read]

Definition at line 1337 of file genoFind.c.

References AllocArray, AllocVar, bits32, clumpNear(), gfHit::diagonal, freez(), gfClumpCmpQueryCoverage(), gfClumpComputeQueryCoverage(), gfHitSortDiagonal(), gfNearEnough, genoFind::isPep, genoFind::maxGap, maxGap, gfClump::next, gfHit::next, slAddHead, slCat(), slSort(), genoFind::tileSize, tileSize, genoFind::totalSeqSize, gfHit::tStart, and uglyf.

Referenced by gfFindClumpsWithQmask().

01339 {
01340 struct gfClump *clumpList = NULL, *clump = NULL;
01341 int maxGap = gf->maxGap;
01342 struct gfHit *hit, *nextHit, *lastHit = NULL;
01343 int totalHits = 0, usedHits = 0, clumpCount = 0;
01344 int tileSize = gf->tileSize;
01345 int bucketShift = 16;           /* 64k buckets. */
01346 bits32 bucketSize = (1<<bucketShift);
01347 int bucketCount = (gf->totalSeqSize >> bucketShift) + 1;
01348 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01349 bits32 boundary = bucketSize - nearEnough;
01350 int i;
01351 struct gfHit **buckets = NULL, **pb;
01352 
01353 /* Sort hit list into buckets. */
01354 AllocArray(buckets, bucketCount);
01355 for (hit = hitList; hit != NULL; hit = nextHit)
01356     {
01357     nextHit = hit->next;
01358     pb = buckets + (hit->tStart >> bucketShift);
01359     slAddHead(pb, hit);
01360     }
01361 
01362 /* Sort each bucket on diagonal and clump. */
01363 for (i=0; i<bucketCount; ++i)
01364     {
01365     int clumpSize;
01366     bits32 maxT;
01367     struct gfHit *clumpHits;
01368     pb = buckets + i;
01369     gfHitSortDiagonal(pb);
01370     for (hit = *pb; hit != NULL; )
01371          {
01372          /* Each time through this loop will get info on a clump.  Will only
01373           * actually create clump if it is big enough though. */
01374          clumpSize = 0;
01375          clumpHits = lastHit = NULL;
01376          maxT = 0;
01377          for (; hit != NULL; hit = nextHit)
01378              {
01379              nextHit = hit->next;
01380              if (lastHit != NULL && hit->diagonal - lastHit->diagonal > maxGap)
01381                  break;
01382              if (hit->tStart > maxT) maxT = hit->tStart;
01383              slAddHead(&clumpHits, hit);
01384              ++clumpSize;
01385              ++totalHits;
01386              lastHit = hit;
01387              }
01388          if (maxT > boundary && i < bucketCount-1)
01389              {
01390              /* Move clumps that are near boundary to next bucket to give them a
01391               * chance to merge with hits there. */
01392              buckets[i+1] = slCat(clumpHits, buckets[i+1]);
01393              }
01394          else if (clumpSize >= minMatch)
01395              {
01396              /* Save clumps that are large enough on list. */
01397              AllocVar(clump);
01398              slAddHead(&clumpList, clump);
01399              clump->hitCount = clumpSize;
01400              clump->hitList = clumpHits;
01401              usedHits += clumpSize;
01402              ++clumpCount;
01403              }
01404          }
01405     *pb = NULL;
01406     boundary += bucketSize;
01407     }
01408 clumpList = clumpNear(gf, clumpList, minMatch);
01409 gfClumpComputeQueryCoverage(clumpList, tileSize);       /* Thanks AG */
01410 slSort(&clumpList, gfClumpCmpQueryCoverage);
01411 
01412 #ifdef DEBUG
01413 uglyf("Dumping clumps B\n");
01414 for (clump = clumpList; clump != NULL; clump = clump->next)     /* uglyf */
01415     {
01416     uglyf(" %d %d %s %d %d (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name,   clump->tStart, clump->tEnd, clump->hitCount);
01417     }
01418 #endif /* DEBUG */
01419 freez(&buckets);
01420 return clumpList;
01421 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfClump* clumpNear ( struct genoFind gf,
struct gfClump oldClumps,
int  minMatch 
) [static, read]

Definition at line 1276 of file genoFind.c.

References AllocVar, bits32, findClumpBounds(), gfClumpFree(), gfHitCmpTarget(), gfNearEnough, genoFind::isPep, gfHit::next, slAddHead, slReverse(), slSort(), targetClump(), genoFind::tileSize, tileSize, and gfHit::tStart.

Referenced by clumpHits().

01279 {
01280 struct gfClump *newClumps = NULL, *clump, *nextClump;
01281 struct gfHit *hit, *nextHit;
01282 int tileSize = gf->tileSize;
01283 bits32 lastT;
01284 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
01285 
01286 for (clump = oldClumps; clump != NULL; clump = nextClump)
01287     {
01288     struct gfHit *newHits = NULL, *oldHits = clump->hitList;
01289     int clumpSize = 0;
01290     clump->hitCount = 0;
01291     clump->hitList = NULL;      /* Clump no longer owns list. */
01292     nextClump = clump->next;
01293     slSort(&oldHits, gfHitCmpTarget);
01294     lastT = oldHits->tStart;
01295     for (hit = oldHits; hit != NULL; hit = nextHit)
01296         {
01297         nextHit = hit->next;
01298         if (hit->tStart > nearEnough + lastT)
01299              {
01300              if (clumpSize >= minMatch)
01301                  {
01302                  slReverse(&newHits);
01303                  clump->hitList = newHits;
01304                  newHits = NULL;
01305                  clump->hitCount = clumpSize;
01306                  findClumpBounds(clump, tileSize);
01307                  targetClump(gf, &newClumps, clump);
01308                  AllocVar(clump);
01309                  }
01310              else
01311                  {
01312                  newHits = NULL;
01313                  clump->hitCount = 0;
01314                  }
01315              clumpSize = 0;
01316              }
01317         lastT = hit->tStart;
01318         slAddHead(&newHits, hit);
01319         ++clumpSize;
01320         }
01321     slReverse(&newHits);
01322     clump->hitList = newHits;
01323     if (clumpSize >= minMatch)
01324         {
01325         clump->hitCount = clumpSize;
01326         findClumpBounds(clump, tileSize);
01327         targetClump(gf, &newClumps, clump);
01328         }
01329     else
01330         {
01331         gfClumpFree(&clump);
01332         }
01333     }
01334 return newClumps;
01335 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void findClumpBounds ( struct gfClump clump,
int  tileSize 
) [static]

Definition at line 1189 of file genoFind.c.

References gfClump::hitList, gfHit::next, gfClump::qEnd, gfHit::qStart, gfClump::qStart, gfClump::tEnd, gfHit::tStart, and gfClump::tStart.

Referenced by clumpNear(), and targetClump().

01191 {
01192 struct gfHit *hit;
01193 int x;
01194 hit = clump->hitList;
01195 if (hit == NULL)
01196     return;
01197 clump->qStart = clump->qEnd = hit->qStart;
01198 clump->tStart = clump->tEnd = hit->tStart;
01199 for (hit = hit->next; hit != NULL; hit = hit->next)
01200     {
01201     x = hit->qStart;
01202     if (x < clump->qStart) clump->qStart = x;
01203     if (x > clump->qEnd) clump->qEnd = x;
01204     x = hit->tStart;
01205     if (x < clump->tStart) clump->tStart = x;
01206     if (x > clump->tEnd) clump->tEnd = x;
01207     }
01208 clump->tEnd += tileSize;
01209 clump->qEnd += tileSize;
01210 }

Here is the caller graph for this function:

static struct gfSeqSource* findSource ( struct genoFind gf,
bits32  targetPos 
) [static, read]

Definition at line 941 of file genoFind.c.

References bCmpSeqSource(), errAbort(), genoFind::sourceCount, genoFind::sources, and ss.

Referenced by gfFastFindDnaHits(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), gfStraightFindNearHits(), pcrClumps(), and targetClump().

00943 {
00944 struct gfSeqSource *ss =  bsearch(&targetPos, gf->sources, gf->sourceCount, 
00945         sizeof(gf->sources[0]), bCmpSeqSource);
00946 if (ss == NULL)
00947     errAbort("Couldn't find source for %d", targetPos);
00948 return ss;
00949 }

Here is the call graph for this function:

Here is the caller graph for this function:

void genoFindFree ( struct genoFind **  pGenoFind  ) 

Definition at line 67 of file genoFind.c.

References genoFind::allocated, bitFree(), freeMem(), freez(), genoFind::lists, genoFind::listSizes, gfSeqSource::maskedBits, genoFind::sourceCount, and genoFind::sources.

Referenced by bigBlat(), genoFindDirect(), genoPcrDirect(), and gfMakeOoc().

00069 {
00070 struct genoFind *gf = *pGenoFind;
00071 int i;
00072 struct gfSeqSource *sources;
00073 if (gf != NULL)
00074     {
00075     freeMem(gf->lists);
00076     freeMem(gf->listSizes);
00077     freeMem(gf->allocated);
00078     if ((sources = gf->sources) != NULL)
00079         {
00080         for (i=0; i<gf->sourceCount; ++i)
00081             bitFree(&sources[i].maskedBits);
00082         freeMem(sources);
00083         }
00084     freez(pGenoFind);
00085     }
00086 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void gfAddLargeSeq ( struct genoFind gf,
bioSeq seq,
bits32  offset 
) [static]

Definition at line 413 of file genoFind.c.

References bits16, bits32, dnaSeq::dna, genoFind::endLists, gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::listSizes, genoFind::segSize, dnaSeq::size, genoFind::stepSize, stepSize, genoFind::tileSize, and tileSize.

Referenced by gfLargeIndexSeq().

00415 {
00416 char *poly = seq->dna;
00417 int tileSize = gf->tileSize;
00418 int stepSize = gf->stepSize;
00419 int tileTailSize = gf->segSize;
00420 int tileHeadSize = tileSize - tileTailSize;
00421 int i, lastTile = seq->size - tileSize;
00422 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00423 int tileHead;
00424 int tileTail;
00425 bits32 *listSizes = gf->listSizes;
00426 bits16 **endLists = gf->endLists;
00427 bits16 *endList;
00428 int headCount;
00429 
00430 initNtLookup();
00431 for (i=0; i<=lastTile; i += stepSize)
00432     {
00433     tileHead = makeTile(poly, tileHeadSize);
00434     tileTail = makeTile(poly + tileHeadSize, tileTailSize);
00435     if (tileHead >= 0 && tileTail >= 0)
00436         {
00437         endList = endLists[tileHead];
00438         headCount = listSizes[tileHead]++;
00439         endList += headCount * 3;               /* Because have three slots per. */
00440         endList[0] = tileTail;
00441         endList[1] = (offset >> 16);
00442         endList[2] = (offset&0xffff);
00443         }
00444     offset += stepSize;
00445     poly += stepSize;
00446     }
00447 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void gfAddSeq ( struct genoFind gf,
bioSeq seq,
bits32  offset 
) [static]

Definition at line 384 of file genoFind.c.

References bits32, dnaSeq::dna, gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::lists, genoFind::listSizes, genoFind::maxPat, dnaSeq::size, genoFind::stepSize, stepSize, genoFind::tileSize, and tileSize.

Referenced by gfAddTilesInNib(), gfIndexNibsAndTwoBits(), gfSmallIndexSeq(), and transIndexBothStrands().

00386 {
00387 char *poly = seq->dna;
00388 int tileSize = gf->tileSize;
00389 int stepSize = gf->stepSize;
00390 int i, lastTile = seq->size - tileSize;
00391 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00392 int maxPat = gf->maxPat;
00393 int tile;
00394 bits32 *listSizes = gf->listSizes;
00395 bits32 **lists = gf->lists;
00396 
00397 initNtLookup();
00398 for (i=0; i<=lastTile; i += stepSize)
00399     {
00400     tile = makeTile(poly, tileSize);
00401     if (tile >= 0)
00402         {
00403         if (listSizes[tile] < maxPat)
00404             {
00405             lists[tile][listSizes[tile]++] = offset;
00406             }
00407         }
00408     offset += stepSize;
00409     poly += stepSize;
00410     }
00411 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfAddTilesInNib ( struct genoFind gf,
char *  fileName,
bits32  offset,
int  stepSize 
) [static]

Definition at line 451 of file genoFind.c.

References freeDnaSeq(), gfAddSeq(), nibLdPart(), nibOpenVerify(), genoFind::tileSize, and tileSize.

Referenced by gfIndexNibsAndTwoBits().

00454 {
00455 FILE *f;
00456 int tileSize = gf->tileSize;
00457 int bufSize = tileSize * 16*1024;
00458 int nibSize, i;
00459 int endBuf, basesInBuf;
00460 struct dnaSeq *seq;
00461 
00462 printf("Adding tiles in %s\n", fileName);
00463 nibOpenVerify(fileName, &f, &nibSize);
00464 for (i=0; i < nibSize; i = endBuf)
00465     {
00466     endBuf = i + bufSize;
00467     if (endBuf >= nibSize) endBuf = nibSize;
00468     basesInBuf = endBuf - i;
00469     seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00470     gfAddSeq(gf, seq, i + offset);
00471     freeDnaSeq(&seq);
00472     }
00473 fclose(f);
00474 return nibSize;
00475 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfAllocLargeLists ( struct genoFind gf  )  [static]

Definition at line 358 of file genoFind.c.

References genoFind::allocated, bits16, bits32, genoFind::endLists, genoFind::listSizes, needHugeMem(), and genoFind::tileSpaceSize.

Referenced by gfLargeIndexSeq().

00361 {
00362 int count = 0;
00363 int i;
00364 bits32 *listSizes = gf->listSizes;
00365 bits16 **endLists = gf->endLists;
00366 bits16 *allocated = NULL;
00367 int size;
00368 int tileSpaceSize = gf->tileSpaceSize;
00369 
00370 for (i=0; i<tileSpaceSize; ++i)
00371     count += listSizes[i];
00372 if (count > 0)
00373     gf->allocated = allocated = needHugeMem(3*count*sizeof(allocated[0]));
00374 for (i=0; i<tileSpaceSize; ++i)
00375     {
00376     size = listSizes[i];
00377     endLists[i] = allocated;
00378     allocated += 3*size;
00379     }
00380 return count;
00381 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfAllocLists ( struct genoFind gf  )  [static]

Definition at line 317 of file genoFind.c.

References genoFind::allocated, bits32, genoFind::lists, genoFind::listSizes, genoFind::maxPat, needHugeMem(), and genoFind::tileSpaceSize.

Referenced by gfIndexNibsAndTwoBits(), gfIndexTransNibsAndTwoBits(), and gfSmallIndexSeq().

00320 {
00321 int oneCount;
00322 int count = 0;
00323 int i;
00324 bits32 *listSizes = gf->listSizes;
00325 bits32 **lists = gf->lists;
00326 bits32 *allocated = NULL;
00327 bits32 maxPat = gf->maxPat;
00328 int size;
00329 int usedCount = 0, overusedCount = 0;
00330 int tileSpaceSize = gf->tileSpaceSize;
00331 
00332 for (i=0; i<tileSpaceSize; ++i)
00333     {
00334     /* If pattern is too much used it's no good to us, ignore. */
00335     if ((oneCount = listSizes[i]) < maxPat)
00336         {
00337         count += oneCount;
00338         usedCount += 1;
00339         }
00340     else
00341         {
00342         overusedCount += 1;
00343         }
00344     }
00345 if (count > 0)
00346     gf->allocated = allocated = needHugeMem(count*sizeof(allocated[0]));
00347 for (i=0; i<tileSpaceSize; ++i)
00348     {
00349     if ((size = listSizes[i]) < maxPat)
00350         {
00351         lists[i] = allocated;
00352         allocated += size;
00353         }
00354     }
00355 return count;
00356 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfCatchPipes (  ) 

Definition at line 38 of file genoFind.c.

References gfPipeHandler().

Referenced by main().

00040 {
00041 signal(SIGPIPE, gfPipeHandler);
00042 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfCheckTileSize ( int  tileSize,
boolean  isPep 
)

Definition at line 97 of file genoFind.c.

References errAbort().

Referenced by gfNewEmpty(), and main().

00099 {
00100 if (isPep)
00101     {
00102     if (tileSize < 3 || tileSize > 8)
00103         errAbort("protein tileSize must be between 3 and 8");
00104     }
00105 else
00106     {
00107     if (tileSize < 6 || tileSize > 18)
00108         errAbort("DNA tileSize must be between 6 and 18");
00109     }
00110 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfClumpCmpQueryCoverage ( const void *  va,
const void *  vb 
) [static]

Definition at line 1180 of file genoFind.c.

References gfClump::queryCoverage.

Referenced by clumpHits().

01182 {
01183 const struct gfClump *a = *((struct gfClump **)va);
01184 const struct gfClump *b = *((struct gfClump **)vb);
01185 
01186 return (b->queryCoverage - a->queryCoverage);
01187 }

Here is the caller graph for this function:

static void gfClumpComputeQueryCoverage ( struct gfClump clumpList,
int  tileSize 
) [static]

Definition at line 1140 of file genoFind.c.

References gfHitCmpQuery(), gfClump::hitList, gfHit::next, gfClump::next, gfHit::qStart, gfClump::queryCoverage, and slSort().

Referenced by clumpHits().

01143 {
01144 struct gfClump *clump;
01145 struct gfHit *hit;
01146 struct gfHit *hitList;
01147 int qcov;
01148 int blockStart, blockEnd;
01149 
01150 for (clump = clumpList; clump != NULL; clump = clump->next)
01151     {
01152     hitList = clump->hitList;
01153     if ( hitList != NULL)
01154         {
01155         slSort(&hitList, gfHitCmpQuery);
01156         qcov = 0;
01157         blockStart = hitList->qStart;
01158         blockEnd = blockStart + tileSize;
01159         for (hit = hitList; hit != NULL; hit = hit->next)
01160             {
01161             if (hit->qStart > blockEnd)
01162                 {
01163                 qcov += blockEnd - blockStart;
01164                 blockStart = hit->qStart;
01165                 blockEnd = blockStart + tileSize;
01166                 }
01167             else if (hit->qStart + tileSize > blockEnd)
01168                 {
01169                 blockEnd = hit->qStart + tileSize;
01170                 }
01171             }
01172         qcov += blockEnd - blockStart;
01173         clump->queryCoverage = qcov;
01174         }
01175     }
01176 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfClumpDump ( struct genoFind gf,
struct gfClump clump,
FILE *  f 
)

Definition at line 973 of file genoFind.c.

References gfClump::hitCount, gfClump::hitList, name, gfHit::next, gfClump::qEnd, gfClump::qStart, ss, gfClump::target, gfClump::tEnd, and gfClump::tStart.

Referenced by genoFindDirect(), and genoPcrDirect().

00975 {
00976 struct gfSeqSource *ss = clump->target;
00977 char *name = ss->fileName;
00978 
00979 if (name == NULL) name = ss->seq->name;
00980 fprintf(f, "%u-%u %s %u-%u, hits %d\n", 
00981         clump->qStart, clump->qEnd, name,
00982         clump->tStart - ss->start, clump->tEnd - ss->start,
00983         clump->hitCount);
00984 #ifdef SOMETIMES
00985 for (hit = clump->hitList; hit != NULL; hit = hit->next)
00986     fprintf(f, "   q %d, t %d, diag %d\n", hit->qStart, hit->tStart, hit->diagonal);
00987 #endif
00988 }

Here is the caller graph for this function:

void gfClumpFree ( struct gfClump **  pClump  ) 

Definition at line 951 of file genoFind.c.

References freez().

Referenced by clumpNear(), gfClumpFreeList(), and targetClump().

00953 {
00954 struct gfClump *clump;
00955 if ((clump = *pClump) == NULL)
00956     return;
00957 freez(pClump);
00958 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfClumpFreeList ( struct gfClump **  pList  ) 

Definition at line 960 of file genoFind.c.

References gfClumpFree(), and gfClump::next.

Referenced by dnaQuery(), ffSeedExtInMem(), genoFindDirect(), gfAlignTrans(), gfFindAlignAaTrans(), gfTransTransFindBundles(), pcrQuery(), searchOneProt(), transQuery(), and transTransQuery().

00962 {
00963 struct gfClump *el, *next;
00964 
00965 for (el = *pList; el != NULL; el = next)
00966     {
00967     next = el->next;
00968     gfClumpFree(&el);
00969     }
00970 *pList = NULL;
00971 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void gfCountSeq ( struct genoFind gf,
bioSeq seq 
) [static]

Definition at line 222 of file genoFind.c.

References bits32, dnaSeq::dna, gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::listSizes, genoFind::maxPat, genoFind::segSize, dnaSeq::size, genoFind::stepSize, stepSize, genoFind::tileSize, and tileSize.

Referenced by gfCountTilesInNib(), gfCountTilesInTwoBit(), gfLargeIndexSeq(), gfMakeOoc(), gfSmallIndexSeq(), and transCountBothStrands().

00224 {
00225 char *poly = seq->dna;
00226 int tileSize = gf->tileSize;
00227 int stepSize = gf->stepSize;
00228 int tileHeadSize = gf->tileSize - gf->segSize;
00229 int maxPat = gf->maxPat;
00230 int tile;
00231 bits32 *listSizes = gf->listSizes;
00232 int i, lastTile = seq->size - tileSize;
00233 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
00234 
00235 initNtLookup();
00236 for (i=0; i<=lastTile; i += stepSize)
00237     {
00238     if ((tile = makeTile(poly, tileHeadSize)) >= 0)
00239         {
00240         if (listSizes[tile] < maxPat)
00241             {
00242             listSizes[tile] += 1;
00243             }
00244         }
00245     poly += stepSize;
00246     }
00247 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfCountTilesInNib ( struct genoFind gf,
int  stepSize,
char *  fileName 
) [static]

Definition at line 249 of file genoFind.c.

References freeDnaSeq(), gfCountSeq(), nibLdPart(), nibOpenVerify(), genoFind::tileSize, and tileSize.

Referenced by gfIndexNibsAndTwoBits().

00251 {
00252 FILE *f;
00253 int tileSize = gf->tileSize;
00254 int bufSize = tileSize * 16*1024;
00255 int nibSize, i;
00256 int endBuf, basesInBuf;
00257 struct dnaSeq *seq;
00258 
00259 printf("Counting tiles in %s\n", fileName);
00260 nibOpenVerify(fileName, &f, &nibSize);
00261 for (i=0; i < nibSize; i = endBuf)
00262     {
00263     endBuf = i + bufSize;
00264     if (endBuf >= nibSize) endBuf = nibSize;
00265     basesInBuf = endBuf - i;
00266     seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
00267     gfCountSeq(gf, seq);
00268     freeDnaSeq(&seq);
00269     }
00270 fclose(f);
00271 return nibSize;
00272 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void gfCountTilesInTwoBit ( struct genoFind gf,
int  stepSize,
char *  fileName,
int *  retSeqCount,
long long *  retBaseCount 
) [static]

Definition at line 293 of file genoFind.c.

References baseCount, freeDnaSeq(), gfCountSeq(), twoBitFile::indexList, twoBitIndex::name, twoBitIndex::next, twoBitCheckTotalSize(), twoBitClose(), twoBitOpen(), and twoBitReadSeqFragLower().

Referenced by gfIndexNibsAndTwoBits().

00297 {
00298 struct dnaSeq *seq;
00299 struct twoBitFile *tbf = twoBitOpen(fileName);
00300 struct twoBitIndex *index;
00301 int seqCount = 0;
00302 long long baseCount = twoBitCheckTotalSize(tbf);
00303 
00304 printf("Counting tiles in %s\n", fileName);
00305 for (index = tbf->indexList; index != NULL; index = index->next)
00306     {
00307     seq = twoBitReadSeqFragLower(tbf, index->name, 0, 0);
00308     gfCountSeq(gf, seq);
00309     ++seqCount;
00310     freeDnaSeq(&seq);
00311     }
00312 twoBitClose(&tbf);
00313 *retSeqCount = seqCount;
00314 *retBaseCount = baseCount;
00315 }

Here is the call graph for this function:

Here is the caller graph for this function:

int gfDnaTile ( DNA dna,
int  n 
)

Definition at line 191 of file genoFind.c.

References ntLookup.

Referenced by gfAddLargeSeq(), gfAddSeq(), gfCountSeq(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), gfStraightFindNearHits(), and pcrClumps().

00193 {
00194 int tile = 0;
00195 int i, c;
00196 for (i=0; i<n; ++i)
00197     {
00198     tile <<= 2;
00199     if ((c = ntLookup[(int)dna[i]]) < 0)
00200         return -1;
00201     tile += c;
00202     }
00203 return tile;
00204 }

Here is the caller graph for this function:

static struct gfHit* gfFastFindDnaHits ( struct genoFind gf,
struct dnaSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1424 of file genoFind.c.

References bitCountRange(), bits32, dnaSeq::dna, findSource(), genoFind::lists, genoFind::listSizes, lm, lmAllocVar, mask, ntValNoN, gfHit::qStart, dnaSeq::size, slAddHead, genoFind::tileMask, genoFind::tileSize, and gfHit::tStart.

Referenced by gfFindHitsWithQmask().

01429 {
01430 struct gfHit *hitList = NULL, *hit;
01431 int size = seq->size;
01432 int tileSizeMinusOne = gf->tileSize - 1;
01433 int mask = gf->tileMask;
01434 DNA *dna = seq->dna;
01435 int i, j;
01436 bits32 bits = 0;
01437 bits32 bVal;
01438 int listSize;
01439 bits32 qStart, *tList;
01440 int hitCount = 0;
01441 
01442 for (i=0; i<tileSizeMinusOne; ++i)
01443     {
01444     bVal = ntValNoN[(int)dna[i]];
01445     bits <<= 2;
01446     bits += bVal;
01447     }
01448 for (i=tileSizeMinusOne; i<size; ++i)
01449     {
01450     bVal = ntValNoN[(int)dna[i]];
01451     bits <<= 2;
01452     bits += bVal;
01453     bits &= mask;
01454     listSize = gf->listSizes[bits];
01455     if (listSize != 0)
01456         {
01457         qStart = i-tileSizeMinusOne;
01458         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, gf->tileSize) == 0)
01459             {
01460             tList = gf->lists[bits];
01461             for (j=0; j<listSize; ++j)
01462                 {
01463                 int tStart = tList[j];
01464                 if (target == NULL || 
01465                         (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
01466                     {
01467                     lmAllocVar(lm, hit);
01468                     hit->qStart = qStart;
01469                     hit->tStart = tStart;
01470                     hit->diagonal = tStart + size - qStart;
01471                     slAddHead(&hitList, hit);
01472                     ++hitCount;
01473                     }
01474                 }
01475             }
01476         }
01477     }
01478 *retHitCount = hitCount;
01479 return hitList;
01480 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct gfClump* gfFindClumps ( struct genoFind gf,
bioSeq seq,
struct lm lm,
int *  retHitCount 
) [read]

Definition at line 1890 of file genoFind.c.

References gfFindClumpsWithQmask(), and lm.

Referenced by dnaQuery(), genoFindDirect(), gfTransFindClumps(), and searchOneProt().

01892 {
01893 return gfFindClumpsWithQmask(gf, seq, NULL, 0, lm, retHitCount);
01894 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct gfClump* gfFindClumpsWithQmask ( struct genoFind gf,
bioSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount 
) [read]

Definition at line 1849 of file genoFind.c.

References clumpHits(), cmpQuerySize, gfFindHitsWithQmask(), lm, minMatch, genoFind::minMatch, dnaSeq::size, and genoFind::tileSize.

Referenced by ffSeedExtInMem(), and gfFindClumps().

01853 {
01854 struct gfClump *clumpList = NULL;
01855 struct gfHit *hitList;
01856 int minMatch = gf->minMatch;
01857 
01858 #ifdef OLD      /* stepSize makes this obsolete. */
01859 if (seq->size < gf->tileSize * (gf->minMatch+1))
01860      minMatch = 1;
01861 #endif /* OLD */
01862 
01863 hitList =  gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01864         retHitCount, NULL, 0, 0);
01865 cmpQuerySize = seq->size;
01866 clumpList = clumpHits(gf, hitList, minMatch);
01867 return clumpList;
01868 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct gfHit* gfFindHitsInRegion ( struct genoFind gf,
bioSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [read]

Definition at line 1870 of file genoFind.c.

References gfFindHitsWithQmask(), lm, gfHit::next, gfSeqSource::start, and gfHit::tStart.

Referenced by scanIndexForSmallExons().

01877 {
01878 int targetStart;
01879 struct gfHit *hitList, *hit;
01880 int hitCount;
01881 
01882 targetStart = target->start;
01883 hitList =  gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
01884         &hitCount, target, tMin + targetStart, tMax + targetStart);
01885 for (hit = hitList; hit != NULL; hit = hit->next)
01886     hit->tStart -= targetStart;
01887 return hitList;
01888 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfHit* gfFindHitsWithQmask ( struct genoFind gf,
bioSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1805 of file genoFind.c.

References genoFind::allowOneMismatch, gfFastFindDnaHits(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), gfStraightFindNearHits(), genoFind::isPep, lm, and genoFind::segSize.

Referenced by gfFindClumpsWithQmask(), and gfFindHitsInRegion().

01810 {
01811 struct gfHit *hitList = NULL;
01812 if (gf->segSize == 0 && !gf->isPep && !gf->allowOneMismatch)
01813     {
01814     hitList = gfFastFindDnaHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01815         target, tMin, tMax);
01816     }
01817 else
01818     {
01819     if (gf->segSize == 0)
01820         {
01821         if (gf->allowOneMismatch)
01822             {
01823             hitList = gfStraightFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, 
01824                 retHitCount, target, tMin, tMax);
01825             }
01826         else
01827             {
01828             hitList = gfStraightFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount, 
01829                 target, tMin, tMax);
01830             }
01831         }
01832     else
01833         {
01834         if (gf->allowOneMismatch)
01835             {
01836             hitList = gfSegmentedFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01837                 target, tMin, tMax);
01838             }
01839         else
01840             {
01841             hitList = gfSegmentedFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
01842                 target, tMin, tMax);
01843             }
01844         }
01845     }
01846 return hitList;
01847 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct gfSeqSource* gfFindNamedSource ( struct genoFind gf,
char *  name 
) [read]

Definition at line 2010 of file genoFind.c.

References gfSeqSource::fileName, dnaSeq::name, sameString, gfSeqSource::seq, genoFind::sourceCount, genoFind::sources, and splitPath().

Referenced by refineBundle().

02012 {
02013 struct gfSeqSource *source = gf->sources;
02014 int count = gf->sourceCount;
02015 
02016 if (source->seq == NULL)        /* Use first source to see if seq or file. */
02017     {
02018     char rootName[256];
02019     while (--count >= 0)
02020         {
02021         splitPath(source->fileName, NULL, rootName, NULL);
02022         if (sameString(name, rootName))
02023              return source;
02024         }
02025     }
02026 else
02027     {
02028     while (--count >= 0)
02029         {
02030         if (sameString(source->seq->name, name))
02031             return source;
02032         source += 1;
02033         }
02034     }
02035 return NULL;
02036 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int gfHitCmpQuery ( const void *  va,
const void *  vb 
) [static]

Definition at line 1126 of file genoFind.c.

References gfHit::qStart.

Referenced by gfClumpComputeQueryCoverage().

01128 {
01129 const struct gfHit *a = *((struct gfHit **)va);
01130 const struct gfHit *b = *((struct gfHit **)vb);
01131 
01132 if (a->qStart > b->qStart)
01133     return 1;
01134 else if (a->qStart == b->qStart)
01135     return 0;
01136 else
01137     return -1;
01138 }

Here is the caller graph for this function:

static int gfHitCmpTarget ( const void *  va,
const void *  vb 
) [static]

Definition at line 1112 of file genoFind.c.

References gfHit::tStart.

Referenced by clumpNear().

01114 {
01115 const struct gfHit *a = *((struct gfHit **)va);
01116 const struct gfHit *b = *((struct gfHit **)vb);
01117 
01118 if (a->tStart > b->tStart)
01119     return 1;
01120 else if (a->tStart == b->tStart)
01121     return 0;
01122 else
01123     return -1;
01124 }

Here is the caller graph for this function:

static void gfHitSort2 ( struct gfHit **  ptArray,
int  n 
) [static]

Definition at line 1001 of file genoFind.c.

References gfHit::diagonal, nosSwap, and nosTemp.

Referenced by gfHitSortDiagonal().

01005 {
01006 struct gfHit **tmp, **pt1, **pt2;
01007 int n1, n2;
01008 
01009 /* Divide area to sort in two. */
01010 n1 = (n>>1);
01011 n2 = n - n1;
01012 pt1 = ptArray;
01013 pt2 =  ptArray + n1;
01014 
01015 /* Sort each area separately.  Handle small case (2 or less elements)
01016  * here.  Otherwise recurse to sort. */
01017 if (n1 > 2)
01018     gfHitSort2(pt1, n1);
01019 else if (n1 == 2 && pt1[0]->diagonal > pt1[1]->diagonal)
01020     {
01021     nosSwap = pt1[1];
01022     pt1[1] = pt1[0];
01023     pt1[0] = nosSwap;
01024     }
01025 if (n2 > 2)
01026     gfHitSort2(pt2, n2);
01027 else if (n2 == 2 && pt2[0]->diagonal > pt2[1]->diagonal)
01028     {
01029     nosSwap = pt2[1];
01030     pt2[1] = pt2[0];
01031     pt2[0] = nosSwap;
01032     }
01033 
01034 /* At this point both halves are internally sorted. 
01035  * Do a merge-sort between two halves copying to temp
01036  * buffer.  Then copy back sorted result to main buffer. */
01037 tmp = nosTemp;
01038 while (n1 > 0 && n2 > 0)
01039     {
01040     if ((*pt1)->diagonal <= (*pt2)->diagonal)
01041         {
01042         --n1;
01043         *tmp++ = *pt1++;
01044         }
01045     else
01046         {
01047         --n2;
01048         *tmp++ = *pt2++;
01049         }
01050     }
01051 /* One or both sides are now fully merged. */
01052 assert(tmp + n1 + n2 == nosTemp + n);
01053 
01054 /* If some of first side left to merge copy it to end of temp buf. */
01055 if (n1 > 0)
01056     memcpy(tmp, pt1, n1 * sizeof(*tmp));
01057 
01058 /* If some of second side left to merge, we finesse it here:
01059  * simply refrain from copying over it as we copy back temp buf. */
01060 memcpy(ptArray, nosTemp, (n - n2) * sizeof(*ptArray));
01061 }

Here is the caller graph for this function:

void gfHitSortDiagonal ( struct gfHit **  pList  ) 

Definition at line 1063 of file genoFind.c.

References freez(), gfHitSort2(), needLargeMem(), gfHit::next, nosTemp, slCount(), and slReverse().

Referenced by clumpHits().

01065 {
01066 struct gfHit *list = *pList;
01067 if (list != NULL && list->next != NULL)
01068     {
01069     int count = slCount(list);
01070     struct gfHit *el;
01071     struct gfHit **array;
01072     int i;
01073     int byteSize = count*sizeof(*array);
01074     array = needLargeMem(byteSize);
01075     nosTemp = needLargeMem(byteSize);
01076     for (el = list, i=0; el != NULL; el = el->next, i++)
01077         array[i] = el;
01078     gfHitSort2(array, count);
01079     list = NULL;
01080     for (i=0; i<count; ++i)
01081         {
01082         array[i]->next = list;
01083         list = array[i];
01084         }
01085     freez(&array);
01086     freez(&nosTemp);
01087     slReverse(&list);
01088     *pList = list;       
01089     }
01090 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct genoFind* gfIndexNibsAndTwoBits ( int  fileCount,
char *  fileNames[],
int  minMatch,
int  maxGap,
int  tileSize,
int  maxPat,
char *  oocFile,
boolean  allowOneMismatch,
int  stepSize 
) [read]

Definition at line 514 of file genoFind.c.

References AllocArray, baseCount, bits32, cloneString(), dnaSeqFree, errAbort(), FALSE, gfAddSeq(), gfAddTilesInNib(), gfAllocLists(), gfCountTilesInNib(), gfCountTilesInTwoBit(), gfNewEmpty(), gfZeroNonOverused(), gfZeroOverused(), twoBitFile::indexList, maxTotalBases(), twoBitIndex::name, twoBitIndex::next, nibIsFile(), PATH_LEN, safef(), dnaSeq::size, genoFind::sourceCount, genoFind::sources, ss, genoFind::tileSize, genoFind::totalSeqSize, twoBitClose(), twoBitIsFile(), twoBitOpen(), and twoBitReadSeqFragLower().

Referenced by genoFindDirect(), genoPcrDirect(), and startServer().

00525 {
00526 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize,
00527         maxPat, oocFile, FALSE, allowOneMismatch);
00528 int i;
00529 bits32 offset = 0, nibSize;
00530 char *fileName;
00531 struct gfSeqSource *ss;
00532 long long totalBases = 0, warnAt = maxTotalBases();
00533 int totalSeq = 0;
00534 
00535 if (allowOneMismatch)
00536     errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits");
00537 if (stepSize == 0)
00538     stepSize = gf->tileSize;
00539 for (i=0; i<fileCount; ++i)
00540     {
00541     fileName = fileNames[i];
00542     if (twoBitIsFile(fileName))
00543         {
00544         int seqCount;
00545         long long baseCount;
00546         gfCountTilesInTwoBit(gf, stepSize, fileName, &seqCount, &baseCount);
00547         totalBases += baseCount;
00548         totalSeq += seqCount;
00549         }
00550     else if (nibIsFile(fileName))
00551         {
00552         totalBases += gfCountTilesInNib(gf, stepSize, fileName);
00553         totalSeq += 1;
00554         }
00555     else
00556         errAbort("Unrecognized file type %s", fileName);
00557     /* Warn if they exceed 4 gig. */
00558     if (totalBases >= warnAt)
00559         errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00560     }
00561 gfAllocLists(gf);
00562 gfZeroNonOverused(gf);
00563 AllocArray(gf->sources, totalSeq);
00564 gf->sourceCount = totalSeq;
00565 ss = gf->sources;
00566 for (i=0; i<fileCount; ++i)
00567     {
00568     fileName = fileNames[i];
00569     if (nibIsFile(fileName))
00570         {
00571         nibSize = gfAddTilesInNib(gf, fileName, offset, stepSize);
00572         ss->fileName = fileName;
00573         ss->start = offset;
00574         offset += nibSize;
00575         ss->end = offset;
00576         ++ss;
00577         }
00578     else
00579         {
00580         struct twoBitFile *tbf = twoBitOpen(fileName);
00581         struct twoBitIndex *index;
00582         char nameBuf[PATH_LEN+256];
00583         for (index = tbf->indexList; index != NULL; index = index->next)
00584             {
00585             struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, index->name, 0,0);
00586             gfAddSeq(gf, seq, offset);
00587             safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00588             ss->fileName = cloneString(nameBuf);
00589             ss->start = offset;
00590             offset += seq->size;
00591             ss->end = offset;
00592             ++ss;
00593             dnaSeqFree(&seq);
00594             }
00595         twoBitClose(&tbf);
00596         }
00597     }
00598 gf->totalSeqSize = offset;
00599 gfZeroOverused(gf);
00600 printf("Done adding\n");
00601 return gf;
00602 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct genoFind* gfIndexSeq ( bioSeq seqList,
int  minMatch,
int  maxGap,
int  tileSize,
int  maxPat,
char *  oocFile,
boolean  isPep,
boolean  allowOneMismatch,
boolean  maskUpper,
int  stepSize 
) [read]

Definition at line 907 of file genoFind.c.

References gfLargeIndexSeq(), gfNewEmpty(), gfSmallIndexSeq(), and genoFind::segSize.

Referenced by bigBlat(), and blat().

00913 {
00914 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, 
00915                                 oocFile, isPep, allowOneMismatch);
00916 if (stepSize == 0)
00917     stepSize = tileSize;
00918 if (gf->segSize > 0)
00919     {
00920     gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00921     }
00922 else
00923     {
00924     gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
00925     }
00926 return gf;
00927 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfIndexTransNibsAndTwoBits ( struct genoFind transGf[2][3],
int  fileCount,
char *  fileNames[],
int  minMatch,
int  maxGap,
int  tileSize,
int  maxPat,
char *  oocFile,
boolean  allowOneMismatch,
boolean  doMask,
int  stepSize 
)

Definition at line 717 of file genoFind.c.

References AllocArray, bits32, errAbort(), freeDnaSeq(), gfAllocLists(), gfNewEmpty(), gfZeroNonOverused(), gfZeroOverused(), twoBitFile::indexList, maskSimplePepRepeat(), maxTotalBases(), twoBitIndex::name, twoBitIndex::next, nibIsFile(), PATH_LEN, readMaskedNib(), readMaskedTwoBit(), safef(), dnaSeq::size, genoFind::totalSeqSize, transCountBothStrands(), transIndexBothStrands(), TRUE, twoBitCheckTotalSize(), twoBitClose(), twoBitIsFile(), and twoBitOpen().

Referenced by startServer().

00722 {
00723 struct genoFind *gf;
00724 int i,isRc, frame;
00725 bits32 offset[2][3];
00726 char *fileName;
00727 struct dnaSeq *seq;
00728 int sourceCount = 0;
00729 long long totalBases = 0, warnAt = maxTotalBases();
00730 
00731 if (allowOneMismatch)
00732     errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits");
00733 /* Allocate indices for all reading frames. */
00734 for (isRc=0; isRc <= 1; ++isRc)
00735     {
00736     for (frame = 0; frame < 3; ++frame)
00737         {
00738         transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, 
00739                 tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch);
00740         }
00741     }
00742 
00743 /* Mask simple AA repeats (of period 1 and 2). */
00744 for (isRc = 0; isRc <= 1; ++isRc)
00745     for (frame = 0; frame < 3; ++frame)
00746         maskSimplePepRepeat(transGf[isRc][frame]);
00747 
00748 /* Scan through .nib and .2bit files once counting tiles. */
00749 for (i=0; i<fileCount; ++i)
00750     {
00751     fileName = fileNames[i];
00752     printf("Counting %s\n", fileName);
00753     if (nibIsFile(fileName))
00754         {
00755         seq = readMaskedNib(fileName, doMask);
00756         transCountBothStrands(seq, transGf);
00757         sourceCount += 1;
00758         totalBases += seq->size;
00759         freeDnaSeq(&seq);
00760         }
00761     else if (twoBitIsFile(fileName))
00762         {
00763         struct twoBitFile *tbf = twoBitOpen(fileName);
00764         struct twoBitIndex *index;
00765         totalBases += twoBitCheckTotalSize(tbf);
00766 
00767         for (index = tbf->indexList; index != NULL; index = index->next)
00768             {
00769             seq = readMaskedTwoBit(tbf, index->name, doMask);
00770             transCountBothStrands(seq, transGf);
00771             sourceCount += 1;
00772             freeDnaSeq(&seq);
00773             }
00774         twoBitClose(&tbf);
00775         }
00776     else 
00777         errAbort("Unrecognized file type %s", fileName);
00778     if (totalBases >= warnAt)
00779         errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
00780     }
00781 
00782 /* Get space for entries in indexed of all reading frames. */
00783 for (isRc=0; isRc <= 1; ++isRc)
00784     {
00785     for (frame = 0; frame < 3; ++frame)
00786         {
00787         gf = transGf[isRc][frame];
00788         gfAllocLists(gf);
00789         gfZeroNonOverused(gf);
00790         AllocArray(gf->sources, sourceCount);
00791         gf->sourceCount = sourceCount;
00792         offset[isRc][frame] = 0;
00793         }
00794     }
00795 
00796 /* Scan through nibs a second time building index. */
00797 sourceCount = 0;
00798 for (i=0; i<fileCount; ++i)
00799     {
00800     fileName = fileNames[i];
00801     printf("Indexing %s\n", fileName);
00802     if (nibIsFile(fileName))
00803         {
00804         seq = readMaskedNib(fileName, doMask);
00805         transIndexBothStrands(seq, transGf, offset, sourceCount, fileName);
00806         freeDnaSeq(&seq);
00807         sourceCount += 1;
00808         }
00809     else        /* .2bit file */
00810         {
00811         struct twoBitFile *tbf = twoBitOpen(fileName);
00812         struct twoBitIndex *index;
00813         for (index = tbf->indexList; index != NULL; index = index->next)
00814             {
00815             char nameBuf[PATH_LEN+256];
00816             safef(nameBuf, sizeof(nameBuf), "%s:%s", fileName, index->name);
00817             seq = readMaskedTwoBit(tbf, index->name, doMask);
00818             transIndexBothStrands(seq, transGf, offset, sourceCount, nameBuf);
00819             sourceCount += 1;
00820             freeDnaSeq(&seq);
00821             }
00822         twoBitClose(&tbf);
00823         }
00824     }
00825 
00826 for (isRc=0; isRc <= 1; ++isRc)
00827     {
00828     for (frame = 0; frame < 3; ++frame)
00829         {
00830         gf = transGf[isRc][frame];
00831         gf->totalSeqSize = offset[isRc][frame];
00832         gfZeroOverused(gf);
00833         }
00834     }
00835 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct genoFind* gfLargeIndexSeq ( struct genoFind gf,
bioSeq seqList,
int  minMatch,
int  maxGap,
int  tileSize,
int  maxPat,
char *  oocFile,
boolean  isPep,
boolean  maskUpper 
) [static, read]

Definition at line 873 of file genoFind.c.

References AllocArray, bits32, gfAddLargeSeq(), gfAllocLargeLists(), gfCountSeq(), gfZeroNonOverused(), gfZeroOverused(), maskFromUpperCaseSeq(), dnaSeq::next, gfSeqSource::seq, slCount(), and ss.

Referenced by gfIndexSeq().

00877 {
00878 int seqCount = slCount(seqList);
00879 bioSeq *seq;
00880 int i;
00881 bits32 offset = 0;
00882 struct gfSeqSource *ss;
00883 
00884 for (seq = seqList; seq != NULL; seq = seq->next)
00885     gfCountSeq(gf, seq);
00886 gfAllocLargeLists(gf);
00887 gfZeroNonOverused(gf);
00888 AllocArray(gf->sources, seqCount);
00889 gf->sourceCount = seqCount;
00890 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00891     {
00892     gfAddLargeSeq(gf, seq, offset);
00893     ss = gf->sources+i;
00894     ss->seq = seq;
00895     ss->start = offset;
00896     offset += seq->size;
00897     ss->end = offset;
00898     if (maskUpper)
00899         ss->maskedBits = maskFromUpperCaseSeq(seq);
00900     }
00901 gf->totalSeqSize = offset;
00902 gfZeroOverused(gf);
00903 return gf;
00904 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfMakeOoc ( char *  outName,
char *  files[],
int  fileCount,
int  tileSize,
bits32  maxPat,
enum gfType  tType 
)

Definition at line 1928 of file genoFind.c.

References bits32, carefulClose(), dnaSeq::dna, errAbort(), FALSE, faReadAllSeq(), freeDnaSeqList(), genoFindFree(), gfCountSeq(), gfMaxGap, gfMinMatch, gfNewEmpty(), gftDnaX, gftProt, gftRnaX, genoFind::listSizes, mustOpen(), dnaSeq::next, nibIsFile(), nibLoadAll(), oocSig, reverseComplement(), genoFind::segSize, dnaSeq::size, genoFind::tileSpaceSize, toLowerN(), trans3Free(), trans3New(), twoBitIsFile(), twoBitLoadAll(), and writeOne.

Referenced by blat().

01931 {
01932 boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX);
01933 struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize,
01934         maxPat, NULL, dbIsPep, FALSE);
01935 bits32 *sizes = gf->listSizes;
01936 int tileSpaceSize = gf->tileSpaceSize;
01937 bioSeq *seq, *seqList;
01938 bits32 sig = oocSig, psz = tileSize;
01939 bits32 i;
01940 int oocCount = 0;
01941 char *inName;
01942 FILE *f = mustOpen(outName, "w");
01943 
01944 if (gf->segSize > 0)
01945     errAbort("Don't yet know how to make ooc files for large tile sizes.");
01946 for (i=0; i<fileCount; ++i)
01947     {
01948     inName = files[i];
01949     printf("Loading %s\n", inName);
01950     if (nibIsFile(inName))
01951         {
01952         seqList = nibLoadAll(inName);
01953         }
01954     else if (twoBitIsFile(inName))
01955         {
01956         seqList = twoBitLoadAll(inName);
01957         for (seq = seqList; seq != NULL; seq = seq->next)
01958             toLowerN(seq->dna, seq->size);
01959         }
01960     else
01961         {
01962         seqList = faReadAllSeq(inName, tType != gftProt);
01963         }
01964     printf("Counting %s\n", inName);
01965     for (seq = seqList; seq != NULL; seq = seq->next)
01966         {
01967         int isRc;
01968         for (isRc = 0; isRc <= 1; ++isRc)
01969             {
01970             if (tType == gftDnaX || tType == gftRnaX)
01971                 {
01972                 struct trans3 *t3 = trans3New(seq);
01973                 int frame;
01974                 for (frame=0; frame<3; ++frame)
01975                     {
01976                     gfCountSeq(gf, t3->trans[frame]);
01977                     }
01978                 trans3Free(&t3);
01979                 }
01980             else
01981                 {
01982                 gfCountSeq(gf, seq);
01983                 }
01984             if (tType == gftProt || tType == gftRnaX)
01985                 break;
01986             else 
01987                 {
01988                 reverseComplement(seq->dna, seq->size);
01989                 }
01990             }
01991         }
01992     freeDnaSeqList(&seqList);
01993     }
01994 printf("Writing %s\n", outName);
01995 writeOne(f, sig);
01996 writeOne(f, psz);
01997 for (i=0; i<tileSpaceSize; ++i)
01998     {
01999     if (sizes[i] >= maxPat)
02000         {
02001         writeOne(f, i);
02002         ++oocCount;
02003         }
02004     }
02005 carefulClose(&f);
02006 genoFindFree(&gf);
02007 printf("Wrote %d overused %d-mers to %s\n", oocCount, tileSize, outName);
02008 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct genoFind* gfNewEmpty ( int  minMatch,
int  maxGap,
int  tileSize,
int  stepSize,
int  maxPat,
char *  oocFile,
boolean  isPep,
boolean  allowOneMismatch 
) [static, read]

Definition at line 112 of file genoFind.c.

References AllocVar, genoFind::allowOneMismatch, BIGNUM, genoFind::endLists, errAbort(), gfCheckTileSize(), gfPowerOf20(), genoFind::isPep, genoFind::lists, genoFind::listSizes, genoFind::maxGap, genoFind::maxPat, genoFind::minMatch, needHugeZeroedMem(), oocMaskCounts(), oocMaskSimpleRepeats(), genoFind::segSize, genoFind::stepSize, genoFind::tileMask, genoFind::tileSize, and genoFind::tileSpaceSize.

Referenced by gfIndexNibsAndTwoBits(), gfIndexSeq(), gfIndexTransNibsAndTwoBits(), and gfMakeOoc().

00116 {
00117 struct genoFind *gf;
00118 int tileSpaceSize;
00119 int segSize = 0;
00120 
00121 gfCheckTileSize(tileSize, isPep);
00122 if (stepSize == 0)
00123     stepSize = tileSize;
00124 AllocVar(gf);
00125 if (isPep)
00126     {
00127     if (tileSize > 5)
00128         {
00129         segSize = tileSize - 5;
00130         }
00131     tileSpaceSize = gf->tileSpaceSize = gfPowerOf20(tileSize-segSize);
00132     }
00133 else
00134     {
00135     int seedBitSize;
00136     if (tileSize > 12)
00137         {
00138         segSize = tileSize - 12;
00139         }
00140     seedBitSize = (tileSize-segSize)*2;
00141     tileSpaceSize = gf->tileSpaceSize = (1<<seedBitSize);
00142     gf->tileMask = tileSpaceSize-1;
00143     }
00144 gf->segSize = segSize;
00145 gf->tileSize = tileSize;
00146 gf->stepSize = stepSize;
00147 gf->isPep = isPep;
00148 gf->allowOneMismatch = allowOneMismatch;
00149 if (segSize > 0)
00150     {
00151     gf->endLists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->endLists[0]));
00152     maxPat = BIGNUM;    /* Don't filter out overused on the big ones.  It is
00153                          * unnecessary and would be quite complex. */
00154     }
00155 else
00156     {
00157     gf->lists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->lists[0]));
00158     }
00159 gf->listSizes = needHugeZeroedMem(tileSpaceSize * sizeof(gf->listSizes[0]));
00160 gf->minMatch = minMatch;
00161 gf->maxGap = maxGap;
00162 gf->maxPat = maxPat;
00163 if (oocFile != NULL)
00164     {
00165     if (segSize > 0)
00166         errAbort("Don't yet support ooc on large tile sizes");
00167     oocMaskCounts(oocFile, gf->listSizes, tileSize, maxPat);
00168     oocMaskSimpleRepeats(gf->listSizes, tileSize, maxPat);
00169     }
00170 return gf;
00171 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct gfClump* gfPcrClumps ( struct genoFind gf,
char *  fPrimer,
int  fPrimerSize,
char *  rPrimer,
int  rPrimerSize,
int  minDistance,
int  maxDistance 
) [read]

Definition at line 2155 of file genoFind.c.

References errAbort(), genoFind::isPep, pcrClumps(), reverseComplement(), genoFind::segSize, and tolowers().

Referenced by genoPcrDirect(), and pcrQuery().

02158 {
02159 struct gfClump *clumpList;
02160 if (gf->segSize > 0)
02161     errAbort("Can't do PCR on large tile sizes");
02162 if (gf->isPep)
02163     errAbort("Can't do PCR on protein/translated index");
02164 tolowers(fPrimer);
02165 tolowers(rPrimer);
02166 reverseComplement(rPrimer, rPrimerSize);
02167 clumpList = pcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
02168         minDistance, maxDistance);
02169 reverseComplement(rPrimer, rPrimerSize);
02170 return clumpList;
02171 }

Here is the call graph for this function:

Here is the caller graph for this function:

int gfPepTile ( AA pep,
int  n 
)

Definition at line 206 of file genoFind.c.

References aaVal.

Referenced by gfAddLargeSeq(), gfAddSeq(), gfCountSeq(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), and gfStraightFindNearHits().

00208 {
00209 int tile = 0;
00210 int aa;
00211 while (--n >= 0)
00212     {
00213     tile *= 20;
00214     aa = aaVal[(int)(*pep++)];
00215     if (aa < 0)
00216         return -1;
00217     tile += aa;
00218     }
00219 return tile;
00220 }

Here is the caller graph for this function:

static void gfPipeHandler ( int  sigNum  )  [static]

Definition at line 32 of file genoFind.c.

References pipeBroke, and TRUE.

Referenced by gfCatchPipes().

00034 {
00035 pipeBroke = TRUE;
00036 }

Here is the caller graph for this function:

int gfPowerOf20 ( int  n  ) 

Definition at line 88 of file genoFind.c.

Referenced by gfNewEmpty().

00090 {
00091 int res = 20;
00092 while (--n > 0)
00093     res *= 20;
00094 return res;
00095 }

Here is the caller graph for this function:

int gfReadMulti ( int  sd,
void *  vBuf,
size_t  size 
)

Definition at line 44 of file genoFind.c.

Referenced by startServer().

00046 {
00047 char *buf = vBuf;
00048 size_t totalRead = 0;
00049 int oneRead;
00050 
00051 while (totalRead < size)
00052     {
00053     oneRead = read(sd, buf + totalRead, size - totalRead);
00054     if (oneRead < 0)
00055         {
00056         perror("Couldn't finish large read");
00057         return oneRead;
00058         }
00059     else if (oneRead == 0)
00060     /* Avoid an infinite loop when the client closed the socket. */
00061         break;
00062     totalRead += oneRead;
00063     }
00064 return totalRead;
00065 }

Here is the caller graph for this function:

static struct gfHit* gfSegmentedFindHits ( struct genoFind gf,
aaSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1632 of file genoFind.c.

References bitCountRange(), bits16, bits32, dnaSeq::dna, genoFind::endLists, findSource(), gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::listSizes, lm, lmAllocVar, gfHit::qStart, genoFind::segSize, dnaSeq::size, slAddHead, genoFind::tileSize, tileSize, and gfHit::tStart.

Referenced by gfFindHitsWithQmask().

01637 {
01638 struct gfHit *hitList = NULL, *hit;
01639 int size = seq->size;
01640 int tileSize = gf->tileSize;
01641 int tileTailSize = gf->segSize;
01642 int tileHeadSize = gf->tileSize - tileTailSize;
01643 int lastStart = size - tileSize;
01644 char *poly = seq->dna;
01645 int i, j;
01646 int tileHead, tileTail;
01647 int listSize;
01648 bits32 qStart;
01649 bits16 *endList;
01650 int hitCount = 0;
01651 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01652 
01653 
01654 initNtLookup();
01655 for (i=0; i<=lastStart; ++i)
01656     {
01657     if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01658         {
01659         tileHead = makeTile(poly+i, tileHeadSize);
01660         if (tileHead < 0)
01661             continue;
01662         tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01663         if (tileTail < 0)
01664             continue;
01665         listSize = gf->listSizes[tileHead];
01666         qStart = i;
01667         endList = gf->endLists[tileHead];
01668         for (j=0; j<listSize; ++j)
01669             {
01670             if (endList[0] == tileTail)
01671                 {
01672                 int tStart = (endList[1]<<16) + endList[2];
01673                 if (target == NULL || 
01674                         (target == findSource(gf, tStart) 
01675                         && tStart >= tMin && tStart < tMax) ) 
01676                     {
01677                     lmAllocVar(lm,hit);
01678                     hit->qStart = qStart;
01679                     hit->tStart = tStart;
01680                     hit->diagonal = tStart + size - qStart;
01681                     slAddHead(&hitList, hit);
01682                     ++hitCount;
01683                     }
01684                 }
01685             endList += 3;
01686             }
01687         }
01688     }
01689 *retHitCount = hitCount;
01690 return hitList;
01691 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfHit* gfSegmentedFindNearHits ( struct genoFind gf,
aaSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1693 of file genoFind.c.

References aaVal, bitCountRange(), bits16, bits32, dnaSeq::dna, genoFind::endLists, findSource(), gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::listSizes, lm, lmAllocVar, ntVal, gfHit::qStart, genoFind::segSize, dnaSeq::size, slAddHead, genoFind::tileSize, tileSize, and gfHit::tStart.

Referenced by gfFindHitsWithQmask().

01698 {
01699 struct gfHit *hitList = NULL, *hit;
01700 int size = seq->size;
01701 int tileSize = gf->tileSize;
01702 int tileTailSize = gf->segSize;
01703 int tileHeadSize = gf->tileSize - tileTailSize;
01704 int lastStart = size - tileSize;
01705 char *poly = seq->dna;
01706 int i, j;
01707 int tileHead, tileTail;
01708 int listSize;
01709 bits32 qStart;
01710 bits16 *endList;
01711 int hitCount = 0;
01712 int varPos, varVal;     /* Variable position. */
01713 int (*makeTile)(char *poly, int n); 
01714 int alphabetSize;
01715 char oldChar, zeroChar;
01716 int headPosMul, tailPosMul, avoid;
01717 boolean modTail;
01718 int *seqValLookup;
01719 
01720 
01721 initNtLookup();
01722 if (gf->isPep)
01723     {
01724     makeTile = gfPepTile;
01725     alphabetSize = 20;
01726     zeroChar = 'A';
01727     seqValLookup = aaVal;
01728     }
01729 else
01730     {
01731     makeTile = gfDnaTile;
01732     alphabetSize = 4;
01733     zeroChar = 't';
01734     seqValLookup = ntVal;
01735     }
01736 
01737 for (i=0; i<=lastStart; ++i)
01738     {
01739     if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
01740         {
01741         headPosMul = tailPosMul = 1;
01742         for (varPos = tileSize-1; varPos >= 0; --varPos)
01743             {
01744             /* Make a tile that has zero value at variable position. */
01745             modTail = (varPos >= tileHeadSize);
01746             oldChar = poly[i+varPos];
01747             poly[i+varPos] = zeroChar;
01748             tileHead = makeTile(poly+i, tileHeadSize);
01749             tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
01750             poly[i+varPos] = oldChar;
01751 
01752             /* Avoid checking the unmodified tile multiple times. */
01753             if (varPos == 0)
01754                 avoid = -1;
01755             else
01756                 avoid = seqValLookup[(int)oldChar];
01757 
01758             if (tileHead >= 0 && tileTail >= 0)
01759                 {
01760                 for (varVal=0; varVal<alphabetSize; ++varVal)
01761                     {
01762                     if (varVal != avoid)
01763                         {
01764                         listSize = gf->listSizes[tileHead];
01765                         qStart = i;
01766                         endList = gf->endLists[tileHead];
01767                         for (j=0; j<listSize; ++j)
01768                             {
01769                             if (endList[0] == tileTail)
01770                                 {
01771                                 int tStart = (endList[1]<<16) + endList[2];
01772                                 if (target == NULL || 
01773                                         (target == findSource(gf, tStart) 
01774                                         && tStart >= tMin && tStart < tMax) ) 
01775                                     {
01776                                     lmAllocVar(lm,hit);
01777                                     hit->qStart = qStart;
01778                                     hit->tStart = tStart;
01779                                     hit->diagonal = tStart + size - qStart;
01780                                     slAddHead(&hitList, hit);
01781                                     ++hitCount;
01782                                     }
01783                                 }
01784                             endList += 3;
01785                             }
01786                         }
01787                     if (modTail)
01788                         tileTail += tailPosMul;
01789                     else 
01790                         tileHead += headPosMul;
01791                     }
01792                 }
01793             if (modTail)
01794                 tailPosMul *= alphabetSize;
01795             else 
01796                 headPosMul *= alphabetSize;
01797             }
01798         }
01799     }
01800 *retHitCount = hitCount;
01801 return hitList;
01802 }

Here is the call graph for this function:

Here is the caller graph for this function:

char* gfSignature (  ) 

Definition at line 22 of file genoFind.c.

Referenced by getFileList(), gfPcrGetRanges(), pcrServer(), queryServer(), startSeqQuery(), startServer(), statusServer(), and stopServer().

00025 {
00026 static char signature[] = "0ddf270562684f29";
00027 return signature;
00028 }

Here is the caller graph for this function:

static struct genoFind* gfSmallIndexSeq ( struct genoFind gf,
bioSeq seqList,
int  minMatch,
int  maxGap,
int  tileSize,
int  maxPat,
char *  oocFile,
boolean  isPep,
boolean  maskUpper 
) [static, read]

Definition at line 837 of file genoFind.c.

References AllocArray, bits32, gfAddSeq(), gfAllocLists(), gfCountSeq(), gfZeroNonOverused(), gfZeroOverused(), maskFromUpperCaseSeq(), maskSimplePepRepeat(), dnaSeq::next, gfSeqSource::seq, slCount(), and ss.

Referenced by gfIndexSeq().

00841 {
00842 int seqCount = slCount(seqList);
00843 bioSeq *seq;
00844 int i;
00845 bits32 offset = 0;
00846 struct gfSeqSource *ss;
00847 
00848 if (isPep)
00849     maskSimplePepRepeat(gf);
00850 for (seq = seqList; seq != NULL; seq = seq->next)
00851     gfCountSeq(gf, seq);
00852 gfAllocLists(gf);
00853 gfZeroNonOverused(gf);
00854 if (seqCount > 0)
00855     AllocArray(gf->sources, seqCount);
00856 gf->sourceCount = seqCount;
00857 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
00858     {
00859     gfAddSeq(gf, seq, offset);
00860     ss = gf->sources+i;
00861     ss->seq = seq;
00862     ss->start = offset;
00863     offset += seq->size;
00864     ss->end = offset;
00865     if (maskUpper)
00866         ss->maskedBits = maskFromUpperCaseSeq(seq);
00867     }
00868 gf->totalSeqSize = offset;
00869 gfZeroOverused(gf);
00870 return gf;
00871 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfHit* gfStraightFindHits ( struct genoFind gf,
aaSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1482 of file genoFind.c.

References bitCountRange(), bits32, dnaSeq::dna, findSource(), gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::lists, genoFind::listSizes, lm, lmAllocVar, gfHit::qStart, dnaSeq::size, slAddHead, genoFind::tileSize, tileSize, and gfHit::tStart.

Referenced by gfFindHitsWithQmask().

01487 {
01488 struct gfHit *hitList = NULL, *hit;
01489 int size = seq->size;
01490 int tileSize = gf->tileSize;
01491 int lastStart = size - tileSize;
01492 char *poly = seq->dna;
01493 int i, j;
01494 int tile;
01495 int listSize;
01496 bits32 qStart, *tList;
01497 int hitCount = 0;
01498 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
01499 
01500 initNtLookup();
01501 for (i=0; i<=lastStart; ++i)
01502     {
01503     tile = makeTile(poly+i, tileSize);
01504     if (tile < 0)
01505         continue;
01506     listSize = gf->listSizes[tile];
01507     if (listSize > 0)
01508         {
01509         qStart = i;
01510         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01511             {
01512             tList = gf->lists[tile];
01513             for (j=0; j<listSize; ++j)
01514                 {
01515                 int tStart = tList[j];
01516                 if (target == NULL || 
01517                         (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
01518                     {
01519                     lmAllocVar(lm,hit);
01520                     hit->qStart = qStart;
01521                     hit->tStart = tStart;
01522                     hit->diagonal = tStart + size - qStart;
01523                     slAddHead(&hitList, hit);
01524                     ++hitCount;
01525                     }
01526                 }
01527             }
01528         }
01529     }
01530 *retHitCount = hitCount;
01531 return hitList;
01532 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfHit* gfStraightFindNearHits ( struct genoFind gf,
aaSeq seq,
Bits qMaskBits,
int  qMaskOffset,
struct lm lm,
int *  retHitCount,
struct gfSeqSource target,
int  tMin,
int  tMax 
) [static, read]

Definition at line 1534 of file genoFind.c.

References aaVal, bitCountRange(), bits32, dnaSeq::dna, findSource(), gfDnaTile(), gfPepTile(), initNtLookup(), genoFind::isPep, genoFind::lists, genoFind::listSizes, lm, lmAllocVar, ntVal, gfHit::qStart, dnaSeq::size, slAddHead, genoFind::tileSize, tileSize, and gfHit::tStart.

Referenced by gfFindHitsWithQmask().

01539 {
01540 struct gfHit *hitList = NULL, *hit;
01541 int size = seq->size;
01542 int tileSize = gf->tileSize;
01543 int lastStart = size - tileSize;
01544 char *poly = seq->dna;
01545 int i, j;
01546 int tile;
01547 int listSize;
01548 bits32 qStart, *tList;
01549 int hitCount = 0;
01550 int varPos, varVal;     /* Variable position. */
01551 int (*makeTile)(char *poly, int n); 
01552 int alphabetSize;
01553 char oldChar, zeroChar;
01554 int *seqValLookup;
01555 int posMul, avoid;
01556 
01557 initNtLookup();
01558 if (gf->isPep)
01559     {
01560     makeTile = gfPepTile;
01561     alphabetSize = 20;
01562     zeroChar = 'A';
01563     seqValLookup = aaVal;
01564     }
01565 else
01566     {
01567     makeTile = gfDnaTile;
01568     alphabetSize = 4;
01569     zeroChar = 't';
01570     seqValLookup = ntVal;
01571     }
01572 
01573 for (i=0; i<=lastStart; ++i)
01574     {
01575     posMul = 1;
01576     for (varPos = tileSize-1; varPos >=0; --varPos)
01577         {
01578         /* Make a tile that has zero value at variable position. */
01579         oldChar = poly[i+varPos];
01580         poly[i+varPos] = zeroChar;
01581         tile = makeTile(poly+i, tileSize);
01582         poly[i+varPos] = oldChar;
01583 
01584         /* Avoid checking the unmodified tile multiple times. */
01585         if (varPos == 0)
01586             avoid = -1;
01587         else
01588             avoid = seqValLookup[(int)oldChar];
01589 
01590         if (tile >= 0)
01591             {
01592             /* Look up all possible values of variable position. */
01593             for (varVal=0; varVal<alphabetSize; ++varVal)
01594                 {
01595                 if (varVal != avoid)
01596                     {
01597                     listSize = gf->listSizes[tile];
01598                     if (listSize > 0)
01599                         {
01600                         qStart = i;
01601                         if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
01602                             {
01603                             tList = gf->lists[tile];
01604                             for (j=0; j<listSize; ++j)
01605                                 {
01606                                 int tStart = tList[j];
01607                                 if (target == NULL || 
01608                                         (target == findSource(gf, tStart) 
01609                                         && tStart >= tMin && tStart < tMax) ) 
01610                                     {
01611                                     lmAllocVar(lm,hit);
01612                                     hit->qStart = qStart;
01613                                     hit->tStart = tStart;
01614                                     hit->diagonal = tStart + size - qStart;
01615                                     slAddHead(&hitList, hit);
01616                                     ++hitCount;
01617                                     }
01618                                 }
01619                             }
01620                         }
01621                     }
01622                 tile += posMul;
01623                 }
01624             }
01625         posMul *= alphabetSize;
01626         }
01627     }
01628 *retHitCount = hitCount;
01629 return hitList;
01630 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfTransFindClumps ( struct genoFind gfs[3],
aaSeq seq,
struct gfClump clumps[3],
struct lm lm,
int *  retHitCount 
)

Definition at line 1897 of file genoFind.c.

References gfFindClumps(), gfClump::hitCount, and lm.

Referenced by gfFindAlignAaTrans(), gfTransTransFindClumps(), and transQuery().

01899 {
01900 int frame;
01901 int oneHit;
01902 int hitCount = 0;
01903 for (frame = 0; frame < 3; ++frame)
01904     {
01905     clumps[frame] = gfFindClumps(gfs[frame], seq, lm, &oneHit);
01906     hitCount += oneHit;
01907     }
01908 *retHitCount = hitCount;
01909 }

Here is the call graph for this function:

Here is the caller graph for this function:

void gfTransTransFindClumps ( struct genoFind gfs[3],
aaSeq seqs[3],
struct gfClump clumps[3][3],
struct lm lm,
int *  retHitCount 
)

Definition at line 1911 of file genoFind.c.

References gfTransFindClumps(), gfClump::hitCount, and lm.

Referenced by gfTransTransFindBundles(), and transTransQuery().

01915 {
01916 int qFrame;
01917 int oneHit;
01918 int hitCount = 0;
01919 
01920 for (qFrame = 0; qFrame<3; ++qFrame)
01921     {
01922     gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit);
01923     hitCount += oneHit;
01924     }
01925 *retHitCount = hitCount;
01926 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void gfZeroNonOverused ( struct genoFind gf  )  [static]

Definition at line 496 of file genoFind.c.

References bits32, genoFind::listSizes, genoFind::maxPat, and genoFind::tileSpaceSize.

Referenced by gfIndexNibsAndTwoBits(), gfIndexTransNibsAndTwoBits(), gfLargeIndexSeq(), and gfSmallIndexSeq().

00498 {
00499 bits32 *sizes = gf->listSizes;
00500 int tileSpaceSize = gf->tileSpaceSize, i;
00501 int maxPat = gf->maxPat;
00502 int overCount = 0;
00503 
00504 for (i=0; i<tileSpaceSize; ++i)
00505     {
00506     if (sizes[i] < maxPat)
00507         {
00508         sizes[i] = 0;
00509         ++overCount;
00510         }
00511     }
00512 }

Here is the caller graph for this function:

static void gfZeroOverused ( struct genoFind gf  )  [static]

Definition at line 478 of file genoFind.c.

References bits32, genoFind::listSizes, genoFind::maxPat, and genoFind::tileSpaceSize.

Referenced by gfIndexNibsAndTwoBits(), gfIndexTransNibsAndTwoBits(), gfLargeIndexSeq(), and gfSmallIndexSeq().

00480 {
00481 bits32 *sizes = gf->listSizes;
00482 int tileSpaceSize = gf->tileSpaceSize, i;
00483 int maxPat = gf->maxPat;
00484 int overCount = 0;
00485 
00486 for (i=0; i<tileSpaceSize; ++i)
00487     {
00488     if (sizes[i] >= maxPat)
00489         {
00490         sizes[i] = 0;
00491         ++overCount;
00492         }
00493     }
00494 }

Here is the caller graph for this function:

static void initNtLookup (  )  [static]

Definition at line 175 of file genoFind.c.

References A_BASE_VAL, ArraySize, C_BASE_VAL, FALSE, G_BASE_VAL, ntLookup, T_BASE_VAL, and TRUE.

Referenced by gfAddLargeSeq(), gfAddSeq(), gfCountSeq(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), and gfStraightFindNearHits().

00176 {
00177 static boolean initted = FALSE;
00178 if (!initted)
00179     {
00180     int i;
00181     for (i=0; i<ArraySize(ntLookup); ++i)
00182         ntLookup[i] = -1;
00183     ntLookup['a'] = A_BASE_VAL;
00184     ntLookup['c'] = C_BASE_VAL;
00185     ntLookup['t'] = T_BASE_VAL;
00186     ntLookup['g'] = G_BASE_VAL;
00187     initted = TRUE;
00188     }
00189 }

Here is the caller graph for this function:

static void maskSimplePepRepeat ( struct genoFind gf  )  [static]

Definition at line 604 of file genoFind.c.

References bits32, genoFind::listSizes, genoFind::maxPat, genoFind::tileSize, and tileSize.

Referenced by gfIndexTransNibsAndTwoBits(), and gfSmallIndexSeq().

00607 {
00608 int i;
00609 int tileSize = gf->tileSize;
00610 int maxPat = gf->maxPat;
00611 bits32 *listSizes = gf->listSizes;
00612 
00613 for (i=0; i<20; ++i)
00614     {
00615     int j, k;
00616     for (j=0; j<20; ++j)
00617         {
00618         int tile = 0;
00619         for (k=0; k<tileSize; ++k)
00620             {
00621             tile *= 20;
00622             if (k&1)
00623                 tile += j;
00624             else
00625                 tile += i;
00626             }
00627         listSizes[tile] = maxPat;
00628         }
00629     }
00630 }

Here is the caller graph for this function:

long long maxTotalBases (  ) 

Definition at line 274 of file genoFind.c.

Referenced by gfIndexNibsAndTwoBits(), gfIndexTransNibsAndTwoBits(), and twoBitCheckTotalSize().

00276 {
00277 long long maxBases = 1024*1024;
00278 maxBases *= 4*1024;
00279 return maxBases;
00280 }

Here is the caller graph for this function:

static void mergeAdd ( struct binKeeper bk,
int  start,
int  end,
struct gfSeqSource src 
) [static]

Definition at line 2038 of file genoFind.c.

References binKeeperAdd(), binKeeperFind(), binKeeperRemove(), binElement::end, binElement::next, slFreeList(), and binElement::start.

Referenced by pcrClumps().

02041 {
02042 struct binElement *iEl, *iList = binKeeperFind(bk, start, end);
02043 for (iEl = iList; iEl != NULL; iEl = iEl->next)
02044     {
02045     if (iEl->start < start)
02046         start = iEl->start;
02047     if (iEl->end > end)
02048         end = iEl->end;
02049     binKeeperRemove(bk, iEl->start, iEl->end, src);
02050     }
02051 slFreeList(&iList);
02052 binKeeperAdd(bk, start, end, src);
02053 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct gfClump* pcrClumps ( struct genoFind gf,
char *  fPrimer,
int  fPrimerSize,
char *  rPrimer,
int  rPrimerSize,
int  minDistance,
int  maxDistance 
) [static, read]

Definition at line 2055 of file genoFind.c.

References AllocArray, AllocVar, binKeeperFindAll(), binKeeperFree(), binKeeperNew(), bits32, errAbort(), findSource(), freez(), gfDnaTile(), hashAdd(), hashElFreeList(), hashElListHash(), hashFindVal(), hashFree, genoFind::lists, genoFind::listSizes, mergeAdd(), newHash(), binElement::next, hashEl::next, slAddHead, slFreeList(), gfSeqSource::start, genoFind::tileSize, tileSize, and hashEl::val.

Referenced by gfPcrClumps().

02058 {
02059 struct gfClump *clumpList = NULL;
02060 int tileSize = gf->tileSize;
02061 int fTile;
02062 int fTileCount = fPrimerSize - tileSize;
02063 int *rTiles, rTile;
02064 int rTileCount = rPrimerSize - tileSize;
02065 int fTileIx,rTileIx,fPosIx,rPosIx;
02066 bits32 *fPosList, fPos, *rPosList, rPos;
02067 int fPosListSize, rPosListSize;
02068 struct hash *targetHash = newHash(0);
02069 
02070 /* Build up array of all tiles in reverse primer. */
02071 AllocArray(rTiles, rTileCount);
02072 for (rTileIx = 0; rTileIx<rTileCount; ++rTileIx)
02073     {
02074     rTiles[rTileIx] = gfDnaTile(rPrimer + rTileIx, tileSize);
02075     if (rTiles[rTileIx] == -1)
02076         errAbort("Bad char in reverse primer sequence: %s", rPrimer);
02077     }
02078 
02079 /* Loop through all tiles in forward primer. */
02080 for (fTileIx=0; fTileIx<fTileCount; ++fTileIx)
02081     {
02082     fTile = gfDnaTile(fPrimer + fTileIx, tileSize);
02083     if (fTile >= 0)
02084         {
02085         fPosListSize = gf->listSizes[fTile];
02086         fPosList = gf->lists[fTile];
02087         for (fPosIx=0; fPosIx < fPosListSize; ++fPosIx)
02088             {
02089             fPos = fPosList[fPosIx];
02090             /* Loop through hits to reverse primer. */
02091             for (rTileIx=0; rTileIx < rTileCount; ++rTileIx)
02092                 {
02093                 rTile = rTiles[rTileIx];
02094                 rPosListSize = gf->listSizes[rTile];
02095                 rPosList = gf->lists[rTile];
02096                 for (rPosIx=0; rPosIx < rPosListSize; ++rPosIx)
02097                     {
02098                     rPos = rPosList[rPosIx];
02099                     if (rPos > fPos)
02100                         {
02101                         int distance = rPos - fPos;
02102                         if (distance >= minDistance && distance <= maxDistance)
02103                             {
02104                             struct gfSeqSource *target = findSource(gf, fPos);
02105                             if (rPos < target->end)
02106                                 {
02107                                 struct binKeeper *bk;
02108                                 int tStart = target->start;
02109                                 char *tName = target->fileName;
02110                                 if (tName == NULL)
02111                                     tName = target->seq->name;
02112                                 if ((bk = hashFindVal(targetHash, tName)) == NULL)
02113                                     {
02114                                     bk = binKeeperNew(0, target->end - tStart);
02115                                     hashAdd(targetHash, tName, bk);
02116                                     }
02117                                 mergeAdd(bk, fPos - tStart, rPos - tStart, target);
02118                                 }
02119                             }
02120                         }
02121                     }
02122                 }
02123             }
02124         }
02125     }
02126 
02127 /* Make clumps and clean up temporary structures. */
02128     {
02129     struct hashEl *tList, *tEl;
02130     tList = hashElListHash(targetHash);
02131     for (tEl = tList; tEl != NULL; tEl = tEl->next)
02132         {
02133         struct binKeeper *bk = tEl->val;
02134         struct binElement *bkList, *bkEl;
02135         bkList = binKeeperFindAll(bk);
02136         for (bkEl = bkList; bkEl != NULL; bkEl = bkEl->next)
02137             {
02138             struct gfClump *clump;
02139             AllocVar(clump);
02140             clump->target = bkEl->val;
02141             clump->tStart = bkEl->start;
02142             clump->tEnd = bkEl->end;
02143             slAddHead(&clumpList, clump);
02144             }
02145         slFreeList(&bkList);
02146         binKeeperFree(&bk);
02147         }
02148     hashElFreeList(&tList);
02149     hashFree(&targetHash);
02150     }
02151 freez(&rTiles);
02152 return clumpList;       
02153 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct dnaSeq* readMaskedNib ( char *  fileName,
boolean  mask 
) [read]

Definition at line 632 of file genoFind.c.

References dnaSeq::dna, NIB_MASK_MIXED, nibLoadAll(), nibLoadAllMasked(), dnaSeq::size, toggleCase(), and upperToN().

Referenced by gfIndexTransNibsAndTwoBits().

00635 {
00636 struct dnaSeq *seq;
00637 if (mask)
00638     {
00639     seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00640     toggleCase(seq->dna, seq->size);
00641     upperToN(seq->dna, seq->size);
00642     }
00643 else
00644     seq = nibLoadAll(fileName);
00645 return seq;
00646 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct dnaSeq* readMaskedTwoBit ( struct twoBitFile tbf,
char *  seqName,
boolean  mask 
) [read]

Definition at line 648 of file genoFind.c.

References dnaSeq::dna, dnaSeq::size, toggleCase(), twoBitReadSeqFrag(), twoBitReadSeqFragLower(), and upperToN().

Referenced by gfIndexTransNibsAndTwoBits().

00652 {
00653 struct dnaSeq *seq;
00654 if (mask)
00655     {
00656     seq = twoBitReadSeqFrag(tbf, seqName, 0, 0);
00657     toggleCase(seq->dna, seq->size);
00658     upperToN(seq->dna, seq->size);
00659     }
00660 else
00661     seq = twoBitReadSeqFragLower(tbf, seqName, 0, 0);
00662 return seq;
00663 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void targetClump ( struct genoFind gf,
struct gfClump **  pClumpList,
struct gfClump clump 
) [static]

Definition at line 1212 of file genoFind.c.

References AllocVar, findClumpBounds(), findSource(), gfClumpFree(), gfClump::hitList, genoFind::minMatch, newClump(), gfHit::next, slAddHead, slReverse(), ss, gfClump::target, gfClump::tEnd, genoFind::tileSize, gfHit::tStart, and gfClump::tStart.

Referenced by clumpNear().

01215 {
01216 struct gfSeqSource *ss = findSource(gf, clump->tStart);
01217 if (ss->end >= clump->tEnd)
01218     {
01219     /* Usual simple case: all of clump hits single target. */
01220     clump->target = ss;
01221     slAddHead(pClumpList, clump);
01222     }
01223 else
01224     {
01225     /* If we've gotten here, then the clump is split across multiple targets.
01226      * We'll have to split it into multiple clumps... */
01227     struct gfHit *hit, *nextHit, *inList, *outList, *oldList = clump->hitList;
01228     int hCount;
01229 
01230     while (oldList != NULL)
01231         {
01232         inList = outList = NULL;
01233         hCount = 0;
01234         for (hit = oldList; hit != NULL; hit = nextHit)
01235             {
01236             nextHit = hit->next;
01237             if (ss->start <= hit->tStart  && hit->tStart < ss->end)
01238                 {
01239                 ++hCount;
01240                 slAddHead(&inList, hit);
01241                 }
01242             else
01243                 {
01244                 slAddHead(&outList, hit);
01245                 }
01246             }
01247         slReverse(&inList);
01248         slReverse(&outList);
01249         if (hCount >= gf->minMatch)
01250             {
01251             struct gfClump *newClump;
01252             AllocVar(newClump);
01253             newClump->hitList = inList;
01254             newClump->hitCount = hCount;
01255             newClump->target = ss;
01256             findClumpBounds(newClump, gf->tileSize);
01257             slAddHead(pClumpList, newClump);
01258             }
01259         else
01260             {
01261             inList = NULL;
01262             }
01263         oldList = outList;
01264         if (oldList != NULL)
01265             {
01266             ss = findSource(gf, oldList->tStart);
01267             }
01268         }
01269     clump->hitList = NULL;      /* We ate up the hit list. */
01270     gfClumpFree(&clump);
01271     }
01272 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void transCountBothStrands ( struct dnaSeq seq,
struct genoFind transGf[2][3] 
) [static]

Definition at line 666 of file genoFind.c.

References dnaSeq::dna, gfCountSeq(), reverseComplement(), trans3::seq, dnaSeq::size, trans3::trans, trans3Free(), and trans3New().

Referenced by gfIndexTransNibsAndTwoBits().

00670 {
00671 int isRc, frame;
00672 struct trans3 *t3;
00673 for (isRc=0; isRc <= 1; ++isRc)
00674     {
00675     if (isRc)
00676         reverseComplement(seq->dna, seq->size);
00677     t3 = trans3New(seq);
00678     for (frame = 0; frame < 3; ++frame)
00679         {
00680         struct genoFind *gf = transGf[isRc][frame];
00681         gfCountSeq(gf, t3->trans[frame]);
00682         }
00683     trans3Free(&t3);
00684     }
00685 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void transIndexBothStrands ( struct dnaSeq seq,
struct genoFind transGf[2][3],
bits32  offset[2][3],
int  sourceIx,
char *  fileName 
) [static]

Definition at line 687 of file genoFind.c.

References cloneString(), dnaSeq::dna, gfAddSeq(), trans3::isRc, reverseComplement(), gfSeqSource::seq, dnaSeq::size, ss, trans3::trans, trans3Free(), and trans3New().

Referenced by gfIndexTransNibsAndTwoBits().

00692 {
00693 int isRc, frame;
00694 struct trans3 *t3;
00695 struct gfSeqSource *ss;
00696 for (isRc=0; isRc <= 1; ++isRc)
00697     {
00698     if (isRc)
00699         {
00700         reverseComplement(seq->dna, seq->size);
00701         }
00702     t3 = trans3New(seq);
00703     for (frame = 0; frame < 3; ++frame)
00704         {
00705         struct genoFind *gf = transGf[isRc][frame];
00706         ss = gf->sources + sourceIx;
00707         gfAddSeq(gf, t3->trans[frame], offset[isRc][frame]);
00708         ss->fileName = cloneString(fileName);
00709         ss->start = offset[isRc][frame];
00710         offset[isRc][frame] += t3->trans[frame]->size;
00711         ss->end = offset[isRc][frame];
00712         }
00713     trans3Free(&t3);
00714     }
00715 }

Here is the call graph for this function:

Here is the caller graph for this function:

static long long twoBitCheckTotalSize ( struct twoBitFile tbf  )  [static]

Definition at line 282 of file genoFind.c.

References errAbort(), twoBitFile::fileName, maxTotalBases(), and twoBitTotalSize().

Referenced by gfCountTilesInTwoBit(), and gfIndexTransNibsAndTwoBits().

00285 {
00286 long long totalSize = twoBitTotalSize(tbf);
00287 if (totalSize > maxTotalBases())
00288     errAbort("Sorry, can only index up to %lld bases, %s has %lld",
00289         maxTotalBases(),  tbf->fileName, totalSize);
00290 return totalSize;
00291 }

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

int cmpQuerySize [static]

Definition at line 1094 of file genoFind.c.

Referenced by gfFindClumpsWithQmask().

int gfNearEnough = 300 [static]

Definition at line 1274 of file genoFind.c.

Referenced by clumpHits(), and clumpNear().

struct gfHit * nosSwap [static]

Definition at line 999 of file genoFind.c.

Referenced by gfHitSort2().

struct gfHit** nosTemp [static]

Definition at line 999 of file genoFind.c.

Referenced by gfHitSort2(), and gfHitSortDiagonal().

int ntLookup[256] [static]

Definition at line 173 of file genoFind.c.

Referenced by gfDnaTile(), and initNtLookup().

volatile boolean pipeBroke = FALSE

Definition at line 30 of file genoFind.c.

Referenced by gfPipeHandler().

char const rcsid[] = "$Id: genoFind.c,v 1.23 2007/02/04 05:09:02 kent Exp $" [static]

Definition at line 20 of file genoFind.c.


Generated on Tue Dec 25 19:26:10 2007 for blat by  doxygen 1.5.2