jkOwnLib/crudeali.c File Reference

#include "common.h"
#include "dnautil.h"
#include "nt4.h"
#include "dnaseq.h"
#include "crudeali.h"

Include dependency graph for crudeali.c:

Go to the source code of this file.

Data Structures

struct  crudeHit
struct  crudeExon
struct  crudeGene
struct  probeTile
struct  fastProber

Defines

#define maxTileSize   16
#define wobbleMask   0xF3CF
#define tileHashBits   16
#define tileHashSize   (1<<tileHashBits)
#define tileHashMask   (tileHashSize-1)
#define tileHashFunc(tile)   ((tile)&tileHashMask)
#define maxHitsAtOnce   120000

Functions

static int lumpHitsIntoExons (struct crudeHit *hits, int hitCount, struct crudeExon *exons)
static int lumpExonsIntoGenes (struct crudeExon *exons, int exonCount, struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
static int copyExonsIntoGenes (struct crudeExon *exons, int exonCount, struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
static boolean makeGoodTile (DNA *dna, bits32 *retTile16, bits16 *retTile8, int tileSize)
static struct probeTile ** makeTileHash ()
static struct fastProbermakeFastProber (DNA *probeDna, int probeSize)
static void freeFastProber (struct fastProber **pFp)
static int makeIndividualHits16 (struct probeTile **hash, bits32 *bases, int baseOffset, int baseWordCount, struct crudeHit *hits, int maxHitCount, int hitCount)
static int makeHits16 (struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits, int maxHitCount)
void squeezeHits8 (struct fastProber *fp, short *compactDiagBuf, int compactDiagBufByteSize, int startToff, int lastCompactOff, int lastCompareOff, struct crudeHit *hits, int lastCompactedHit, int hitCount, int *retLastCompactedHit, int *retHitCount)
static int makeHits8 (struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits, int maxHitCount)
static int cmpHitsTargetFirst (const void *va, const void *vb)
static int cmpExonsTargetFirst (const void *va, const void *vb)
static int makeHits (struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits, int maxHitCount)
static int findCrudeGenes (struct fastProber *fp, struct nt4Seq *target, struct crudeGene *genes, int maxGeneCount, boolean isRc)
static int cmpGenesByScore (const void *va, const void *vb)
static int countGenesBetter (struct crudeGene *genes, int geneCount, int cutoff)
static int filterPoorGenes (struct crudeGene *genes, int geneCount)
static int chromIx (struct nt4Seq *one, struct nt4Seq **chrome, int chromeCount)
static int countPrettyGood (struct crudeGene *crudeGenes, int geneCount, int fraction, int minScore)
void initWormDnaMatchScores ()
void scoreNoninsertingExtensions (struct crudeGene *crudeGene, DNA *probe, int probeSize)
crudeAlicrudeAliFind (DNA *probe, int probeSize, struct nt4Seq **chrome, int chromeCount, int tileSize, int minScore)

Variables

static char const rcsid [] = "$Id: crudeali.c,v 1.5 2006/05/09 01:05:55 markd Exp $"
static int caTileSize = 16
static boolean caIsCdna = TRUE
static int wormDnaMatchScores [256][256]


Define Documentation

#define maxHitsAtOnce   120000

Definition at line 561 of file crudeali.c.

Referenced by crudeAliFind(), and findCrudeGenes().

#define maxTileSize   16

Definition at line 19 of file crudeali.c.

#define tileHashBits   16

Definition at line 280 of file crudeali.c.

Referenced by makeHits8().

#define tileHashFunc ( tile   )     ((tile)&tileHashMask)

Definition at line 283 of file crudeali.c.

Referenced by makeFastProber(), makeHits16(), and makeIndividualHits16().

#define tileHashMask   (tileHashSize-1)

Definition at line 282 of file crudeali.c.

#define tileHashSize   (1<<tileHashBits)

Definition at line 281 of file crudeali.c.

Referenced by makeTileHash().

#define wobbleMask   0xF3CF

Definition at line 21 of file crudeali.c.

Referenced by makeFastProber(), and makeHits8().


Function Documentation

static int chromIx ( struct nt4Seq one,
struct nt4Seq **  chrome,
int  chromeCount 
) [static]

Definition at line 663 of file crudeali.c.

References FALSE.

Referenced by cdaLoadOne(), cdaWriteOne(), crudeAliFind(), and gdfReadOneGene().

00664 {
00665 int i;
00666 for (i=0; i<chromeCount; ++i)
00667     {
00668     if (one == chrome[i])
00669         return i;
00670     }
00671 assert(FALSE);
00672 return -1;
00673 }

Here is the caller graph for this function:

static int cmpExonsTargetFirst ( const void *  va,
const void *  vb 
) [static]

Definition at line 548 of file crudeali.c.

References crudeExon::probeStart, and crudeExon::targetStart.

Referenced by findCrudeGenes().

00551 {
00552 struct crudeExon *a = (struct crudeExon *)va;
00553 struct crudeExon *b = (struct crudeExon *)vb;
00554 long diff;
00555 if ((diff = a->targetStart - b->targetStart) != 0)
00556     return diff;
00557 return a->probeStart - b->probeStart;
00558 }

Here is the caller graph for this function:

static int cmpGenesByScore ( const void *  va,
const void *  vb 
) [static]

Definition at line 611 of file crudeali.c.

References crudeGene::score.

Referenced by crudeAliFind(), and filterPoorGenes().

00613 {
00614 struct crudeGene *a = (struct crudeGene *)va;
00615 struct crudeGene *b = (struct crudeGene *)vb;
00616 return b->score - a->score;
00617 }

Here is the caller graph for this function:

static int cmpHitsTargetFirst ( const void *  va,
const void *  vb 
) [static]

Definition at line 536 of file crudeali.c.

References crudeHit::probeOffset, and crudeHit::targetOffset.

Referenced by findCrudeGenes().

00539 {
00540 struct crudeHit *a = (struct crudeHit *)va;
00541 struct crudeHit *b = (struct crudeHit *)vb;
00542 long diff;
00543 if ((diff = a->targetOffset - b->targetOffset) != 0)
00544     return diff;
00545 return a->probeOffset - b->probeOffset;
00546 }

Here is the caller graph for this function:

static int copyExonsIntoGenes ( struct crudeExon exons,
int  exonCount,
struct crudeGene genes,
struct nt4Seq target,
boolean  isRc 
) [static]

Definition at line 212 of file crudeali.c.

References crudeExon::hitCount, crudeGene::isRc, crudeExon::probeEnd, crudeGene::probeEnd, crudeExon::probeStart, crudeGene::probeStart, crudeGene::score, crudeGene::target, crudeExon::targetEnd, crudeGene::targetEnd, crudeExon::targetStart, and crudeGene::targetStart.

Referenced by findCrudeGenes().

00215 {
00216 int i;
00217 for (i=0; i<exonCount; ++i)
00218     {
00219     genes[i].target = target;
00220     genes[i].isRc = isRc;
00221     genes[i].probeStart = exons[i].probeStart;
00222     genes[i].probeEnd = exons[i].probeEnd;
00223     genes[i].targetStart = exons[i].targetStart;
00224     genes[i].targetEnd = exons[i].targetEnd;
00225     genes[i].score = exons[i].hitCount * exons[i].hitCount;
00226     }
00227 return exonCount;
00228 }

Here is the caller graph for this function:

static int countGenesBetter ( struct crudeGene genes,
int  geneCount,
int  cutoff 
) [static]

Definition at line 619 of file crudeali.c.

Referenced by crudeAliFind(), and filterPoorGenes().

00621 {
00622 int i;
00623 for (i=0; i<geneCount; ++i)
00624     {
00625     if (genes[i].score <= cutoff)
00626         break;
00627     }
00628 return i;
00629 }

Here is the caller graph for this function:

static int countPrettyGood ( struct crudeGene crudeGenes,
int  geneCount,
int  fraction,
int  minScore 
) [static]

Definition at line 675 of file crudeali.c.

References crudeGene::score.

Referenced by crudeAliFind().

00678 {
00679 int count = 0;
00680 int i;
00681 int threshold = crudeGenes->score;  /* Top score */
00682 
00683 threshold = (threshold + fraction/2)/fraction;
00684 if (threshold < minScore)
00685     threshold = minScore;
00686 for (i=0; i<geneCount; ++i)
00687     {
00688     if (crudeGenes[i].score >= threshold)
00689         ++count;
00690     else
00691         break;
00692     }
00693 return count;
00694 }

Here is the caller graph for this function:

struct crudeAli* crudeAliFind ( DNA probe,
int  probeSize,
struct nt4Seq **  chrome,
int  chromeCount,
int  tileSize,
int  minScore 
) [read]

Definition at line 794 of file crudeali.c.

References AllocVar, caIsCdna, caTileSize, chromIx(), cmpGenesByScore(), countGenesBetter(), countPrettyGood(), filterPoorGenes(), findCrudeGenes(), freeFastProber(), freeMem(), initWormDnaMatchScores(), makeFastProber(), maxHitsAtOnce, needMem(), reverseComplement(), scoreNoninsertingExtensions(), slAddHead, and slReverse().

Referenced by xenAlignBig(), and xenAlignWorm().

00797 {
00798 int oneGeneCount;
00799 int i;
00800 int chromeIx;
00801 int geneCount = 0;
00802 int maxGeneCount = 2*maxHitsAtOnce;
00803 int isRc;
00804 struct nt4Seq *target;
00805 struct crudeGene *crudeGenes = needMem(maxGeneCount*sizeof(*crudeGenes));
00806 struct fastProber *fps[2];
00807 struct crudeAli *aliList = NULL, *ali;
00808 int goodGeneCount;
00809 
00810 initWormDnaMatchScores();
00811 caTileSize = tileSize;
00812 caIsCdna = (tileSize >= 14);
00813 fps[0] = makeFastProber(probe, probeSize);
00814 reverseComplement(probe, probeSize);
00815 fps[1] = makeFastProber(probe, probeSize);
00816 reverseComplement(probe, probeSize);
00817 for (chromeIx = 0; chromeIx < chromeCount; ++chromeIx)
00818     {
00819     target = chrome[chromeIx];
00820     for (isRc = 0; isRc <= 1; ++isRc)
00821         {
00822         /* Get crude idea of genes on one strand. */
00823         oneGeneCount = findCrudeGenes(fps[isRc], target, 
00824             crudeGenes+geneCount, maxGeneCount-geneCount, isRc);
00825             
00826         /* Increment gene count and if necessary compact list. */
00827         geneCount += oneGeneCount;
00828         if (maxGeneCount - geneCount < maxHitsAtOnce)
00829             geneCount = filterPoorGenes(crudeGenes, geneCount);
00830         }
00831     if (maxGeneCount - geneCount < maxHitsAtOnce)
00832         break;
00833     }
00834 
00835 /* Sort genes by initial promise and get rid of some trash. */
00836     {
00837     int bestScore, worstScore;
00838     qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00839     bestScore = crudeGenes[0].score;
00840     worstScore = crudeGenes[geneCount-1].score;
00841     if (bestScore > worstScore)
00842         {
00843         int cutOff = bestScore/10;
00844         goodGeneCount = geneCount = countGenesBetter(crudeGenes, geneCount, cutOff);
00845         }
00846     else if (bestScore >= minScore)
00847         goodGeneCount = geneCount;
00848     else
00849         goodGeneCount = 0;
00850     }
00851 
00852 if (tileSize == 8)
00853     {
00854     goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, 4);
00855     geneCount = goodGeneCount;
00856     for (i=0; i<geneCount; ++i)
00857         scoreNoninsertingExtensions(crudeGenes+i, probe, probeSize);
00858     qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00859     goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, minScore);
00860     }
00861 for (i=0; i<goodGeneCount; ++i)
00862     {
00863     AllocVar(ali);
00864     ali->chromIx = chromIx(crudeGenes[i].target, chrome, chromeCount);
00865     ali->start = crudeGenes[i].targetStart;
00866     ali->end = crudeGenes[i].targetEnd;
00867     ali->score = crudeGenes[i].score;
00868     ali->strand = (crudeGenes[i].isRc ? '-' : '+');
00869     slAddHead(&aliList, ali);
00870     }
00871 slReverse(&aliList);
00872 freeFastProber(&fps[0]);
00873 freeFastProber(&fps[1]);
00874 freeMem(crudeGenes);
00875 
00876 return aliList;
00877 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int filterPoorGenes ( struct crudeGene genes,
int  geneCount 
) [static]

Definition at line 631 of file crudeali.c.

References cmpGenesByScore(), countGenesBetter(), and warn().

Referenced by crudeAliFind().

00635 {
00636 int bestScore, worstScore;
00637 int newGeneCount;
00638 int halfGeneCount = geneCount/2;
00639 
00640 qsort(genes, geneCount, sizeof(genes[0]), cmpGenesByScore);
00641 bestScore = genes[0].score;
00642 worstScore = genes[geneCount-1].score;
00643 
00644 /* If scores are clustered with half or more at the top,
00645  * we have to be crude and just lop off bottom half of scores. */
00646 if (bestScore == genes[halfGeneCount].score)
00647     {
00648     warn("Bunches of genes, all scoring the same. Program is "
00649          "throwing out half out of necessity.\n");
00650     return geneCount/2;
00651     }
00652 
00653 /* Drop bottom score until have gotten rid of at least half. */
00654 newGeneCount = geneCount;
00655 while (newGeneCount > halfGeneCount)
00656     {
00657     worstScore = genes[newGeneCount-1].score;
00658     newGeneCount = countGenesBetter(genes, newGeneCount, worstScore);
00659     }
00660 return newGeneCount;
00661 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int findCrudeGenes ( struct fastProber fp,
struct nt4Seq target,
struct crudeGene genes,
int  maxGeneCount,
boolean  isRc 
) [static]

Definition at line 581 of file crudeali.c.

References caIsCdna, cmpExonsTargetFirst(), cmpHitsTargetFirst(), copyExonsIntoGenes(), freeMem(), crudeExon::hitCount, lumpExonsIntoGenes(), lumpHitsIntoExons(), makeHits(), maxHitsAtOnce, and needMem().

Referenced by crudeAliFind().

00584 {
00585 struct crudeHit *hits;
00586 struct crudeExon *exons;
00587 int hitCount, exonCount, geneCount = 0;
00588 
00589 if (fp == NULL)
00590     return 0;
00591 assert(maxGeneCount >= maxHitsAtOnce);
00592 hits = needMem(maxHitsAtOnce*sizeof(*hits));
00593 exons = needMem(maxHitsAtOnce*sizeof(*exons));
00594 hitCount = makeHits(fp, target, hits, maxHitsAtOnce);
00595 
00596 if (hitCount > 0)
00597     {
00598     qsort(hits, hitCount, sizeof(hits[0]), cmpHitsTargetFirst);
00599     exonCount = lumpHitsIntoExons(hits, hitCount, exons);
00600     qsort(exons, exonCount, sizeof(exons[0]), cmpExonsTargetFirst);
00601     if (caIsCdna)
00602         geneCount = lumpExonsIntoGenes(exons, exonCount, genes, target, isRc);
00603     else
00604         geneCount = copyExonsIntoGenes(exons, exonCount, genes, target, isRc);
00605     }
00606 freeMem(hits);
00607 freeMem(exons);
00608 return geneCount;
00609 }

Here is the call graph for this function:

Here is the caller graph for this function:

static void freeFastProber ( struct fastProber **  pFp  )  [static]

Definition at line 337 of file crudeali.c.

References freeMem(), freez(), fastProber::hash, and fastProber::probes.

Referenced by crudeAliFind().

00338 {
00339 struct fastProber *fp = *pFp;
00340 if (fp != NULL)
00341     {
00342     freeMem(fp->probes);
00343     freeMem(fp->hash);
00344     freez(pFp);
00345     }
00346 }

Here is the call graph for this function:

Here is the caller graph for this function:

void initWormDnaMatchScores (  ) 

Definition at line 698 of file crudeali.c.

References FALSE, TRUE, and wormDnaMatchScores.

Referenced by crudeAliFind().

00700 {
00701 int i,j;
00702 static boolean initted = FALSE;
00703 
00704 if (initted) return;
00705 initted = TRUE;
00706 for (i=0; i<256; ++i)
00707     for (j=0; j<256; ++j)
00708         wormDnaMatchScores[i][j] = -4;
00709 wormDnaMatchScores['a']['a'] = 2;
00710 wormDnaMatchScores['t']['t'] = 2;
00711 wormDnaMatchScores['A']['A'] = 2;
00712 wormDnaMatchScores['T']['T'] = 2;
00713 wormDnaMatchScores['c']['c'] = 3;
00714 wormDnaMatchScores['g']['g'] = 3;
00715 wormDnaMatchScores['C']['C'] = 3;
00716 wormDnaMatchScores['G']['G'] = 3;
00717 }

Here is the caller graph for this function:

static int lumpExonsIntoGenes ( struct crudeExon exons,
int  exonCount,
struct crudeGene genes,
struct nt4Seq target,
boolean  isRc 
) [static]

Definition at line 140 of file crudeali.c.

References caIsCdna, FALSE, crudeExon::hitCount, crudeExon::isLumped, crudeGene::isRc, crudeExon::probeEnd, crudeGene::probeEnd, crudeGene::probeStart, crudeExon::probeStart, crudeGene::score, crudeGene::target, crudeExon::targetEnd, crudeGene::targetEnd, crudeExon::targetStart, crudeGene::targetStart, and TRUE.

Referenced by findCrudeGenes().

00143 {
00144 int i;
00145 struct crudeGene *gene = genes;
00146 int geneCount = 0;
00147 int firstNonLumped = 0;
00148 int targetGapSpan = (caIsCdna ? 25000 : 200);
00149 int probeGapSpan = (caIsCdna ? 64 : 200);
00150 
00151 for (i=0; i<exonCount; ++i)
00152     exons[i].isLumped = FALSE;
00153 
00154 for (;;)
00155     {
00156     boolean startedGene = FALSE;
00157     int lastDiff = 0;
00158     int tOff, pOff;
00159     int thisDiff;
00160     int exonsInThis = 0;
00161 
00162     for ( ; firstNonLumped < exonCount; ++firstNonLumped)
00163         {
00164         if (!exons[firstNonLumped].isLumped)
00165             break;
00166         }
00167     for (i=firstNonLumped; i<exonCount; ++i)
00168         {
00169         if (!exons[i].isLumped)
00170             {
00171             tOff = exons[i].targetStart;
00172             pOff = exons[i].probeStart;
00173             thisDiff = tOff - pOff;
00174             if (!startedGene)
00175                 {
00176                 startedGene = TRUE;
00177                 exons[i].isLumped = TRUE;
00178                 lastDiff = thisDiff;
00179                 exonsInThis = 1;
00180                 gene->target = target;
00181                 gene->isRc = isRc;
00182                 gene->probeStart = exons[i].probeStart;
00183                 gene->probeEnd = exons[i].probeEnd;
00184                 gene->targetStart = exons[i].targetStart;
00185                 gene->targetEnd = exons[i].targetEnd;
00186                 gene->score = exons[i].hitCount * exons[i].hitCount;
00187                 }
00188             else if (gene->targetEnd - 10 <= tOff && tOff <= gene->targetEnd + targetGapSpan
00189                      && gene->probeEnd - 10 <= pOff && pOff <= gene->probeEnd + probeGapSpan
00190                      && lastDiff - 2 <= thisDiff)
00191                 {
00192                 exons[i].isLumped = TRUE;
00193                 gene->targetEnd = exons[i].targetEnd;
00194                 gene->probeEnd = exons[i].probeEnd;
00195                 gene->score += exons[i].hitCount * exons[i].hitCount;
00196                 exonsInThis += 1;
00197                 lastDiff = thisDiff;
00198                 }
00199             else if (tOff > gene->targetEnd + targetGapSpan)
00200                 break;
00201             }
00202         }
00203     if (!startedGene)
00204         break;
00205     gene->score += exonsInThis;
00206     ++geneCount;
00207     ++gene;
00208     }
00209 return geneCount;
00210 }

Here is the caller graph for this function:

static int lumpHitsIntoExons ( struct crudeHit hits,
int  hitCount,
struct crudeExon exons 
) [static]

Definition at line 59 of file crudeali.c.

References caTileSize, FALSE, crudeExon::hitCount, crudeHit::isLumped, crudeExon::isLumped, crudeExon::probeEnd, crudeExon::probeStart, crudeExon::targetEnd, crudeHit::targetOffset, crudeExon::targetStart, and TRUE.

Referenced by findCrudeGenes().

00063 {
00064 int i;
00065 struct crudeExon *exon = exons;
00066 int exonCount = 0;
00067 int firstNonLumped = 0;
00068 
00069 for (i=0; i<hitCount; ++i)
00070     hits[i].isLumped = FALSE;
00071 
00072 for (;;)
00073     {
00074     boolean startedExon = FALSE;
00075     int diagDiff = 0;
00076     int tOff, pOff;
00077     int thisDiff;
00078     for (; firstNonLumped < hitCount; ++firstNonLumped)
00079         {
00080         if (!hits[firstNonLumped].isLumped)
00081             break;
00082         }
00083     for (i=firstNonLumped; i<hitCount; ++i)
00084         {
00085         if (hits[i].isLumped == FALSE)
00086             {
00087             pOff = hits[i].probeOffset;
00088             tOff = hits[i].targetOffset;
00089             thisDiff = pOff - tOff;
00090             if (!startedExon)   /* First hit in exon. */
00091                 {
00092                 startedExon = TRUE;
00093                 hits[i].isLumped = TRUE;
00094                 exon->probeStart = pOff;
00095                 exon->probeEnd = pOff + caTileSize;
00096                 exon->targetStart = tOff;
00097                 exon->targetEnd = tOff + caTileSize;
00098                 exon->hitCount = 1;
00099                 diagDiff = thisDiff;
00100                 }
00101             else if (diagDiff - 2 <= thisDiff && thisDiff <= diagDiff + 2
00102                     && exon->probeEnd - 2 <= pOff && pOff <= exon->probeEnd + 48
00103                     && exon->targetEnd - 2 <= tOff && tOff <= exon->targetEnd + 48)
00104                 {
00105                 hits[i].isLumped = TRUE;
00106                 exon->probeEnd = pOff + caTileSize;
00107                 exon->targetEnd = tOff + caTileSize;
00108                 exon->hitCount += 1;
00109                 }
00110             else if (tOff > exon->targetEnd + 48)
00111                 break;
00112             }
00113         }
00114     if (!startedExon)   /* Must be all lumped together. */
00115         break;
00116     ++exonCount;
00117     ++exon;
00118     }
00119 
00120 /* If small tile size then get rid of exons with only one hit. */
00121 if (caTileSize == 8)
00122     {
00123     struct crudeExon *read = exons, *write = exons;
00124     int twoFerCount = 0;
00125     for (i=0; i<exonCount; ++i)
00126         {
00127         if (read->hitCount >= 2)
00128             {
00129             *write++ = *read++;
00130             ++twoFerCount;
00131             }
00132         else
00133             ++read;
00134         }
00135     exonCount = twoFerCount;
00136     }
00137 return exonCount;
00138 }

Here is the caller graph for this function:

static struct fastProber* makeFastProber ( DNA probeDna,
int  probeSize 
) [static, read]

Definition at line 304 of file crudeali.c.

References AllocVar, caTileSize, fastProber::hash, makeGoodTile(), makeTileHash(), needMem(), probeTile::next, probeTile::offset, fastProber::probes, fastProber::probeSize, probeTile::tile16, probeTile::tile8, tileHashFunc, and wobbleMask.

Referenced by crudeAliFind().

00305 {
00306 int maxProbeCount = probeSize - caTileSize + 1;
00307 int probeCount = 0;
00308 struct probeTile *probes;
00309 struct probeTile *probe;
00310 struct probeTile **hash;
00311 int i;
00312 struct fastProber *fp;
00313 
00314 if (maxProbeCount <= 0)
00315     return NULL;
00316 AllocVar(fp);
00317 fp->hash = hash = makeTileHash();
00318 fp->probeSize = probeSize;
00319 fp->probes = probes = needMem(maxProbeCount * sizeof(probes[0]) );
00320 for (i=0; i<maxProbeCount; ++i)
00321     {
00322     probe = &probes[probeCount];
00323     if (makeGoodTile(probeDna+i, &probe->tile16, &probe->tile8, caTileSize))
00324         {
00325         int hashIx; 
00326         probe->tile8 &= wobbleMask;
00327         hashIx = (caTileSize > 8 ? tileHashFunc(probe->tile16) : tileHashFunc(probe->tile8));
00328         probe->offset = i;
00329         probe->next = hash[hashIx];
00330         hash[hashIx] = probe;
00331         ++probeCount;
00332         }
00333     }
00334 return fp;
00335 }

Here is the call graph for this function:

Here is the caller graph for this function:

static boolean makeGoodTile ( DNA dna,
bits32 *  retTile16,
bits16 *  retTile8,
int  tileSize 
) [static]

Definition at line 230 of file crudeali.c.

References FALSE, ntVal, packDna16(), packDna8(), and TRUE.

Referenced by makeFastProber().

00233 {
00234 int i;
00235 int repMod;
00236 int endRep;
00237 int maxRep = (tileSize / 2);
00238 
00239 /* Make sure no N's in tile */
00240 for (i=0; i<tileSize; ++i)
00241     {
00242     if (ntVal[(int)dna[i]] < 0)
00243         return FALSE;
00244     }
00245 
00246 /* Make sure that tile isn't part of a pattern. */ 
00247 for (repMod = 1; repMod <= maxRep; repMod += 1)
00248     {
00249     boolean allSame = TRUE;
00250     endRep = tileSize - repMod;
00251     for (i=0; i<endRep; ++i)
00252         {
00253         if (dna[i] != dna[i+repMod])
00254             {
00255             allSame = FALSE;
00256             break;
00257             }
00258         }
00259     if (allSame)
00260         return FALSE;
00261     }
00262 
00263 if (tileSize > 8)
00264     *retTile16 = packDna16(dna);
00265 else
00266     *retTile8 = packDna8(dna);
00267 
00268 return TRUE;
00269 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int makeHits ( struct fastProber fp,
struct nt4Seq target,
struct crudeHit hits,
int  maxHitCount 
) [static]

Definition at line 566 of file crudeali.c.

References caTileSize, errAbort(), makeHits16(), and makeHits8().

Referenced by findCrudeGenes().

00569 {
00570 if (caTileSize == 8)
00571     return makeHits8(fp, target, hits, maxHitCount);
00572 else if (caTileSize == 16)
00573     return makeHits16(fp, target, hits, maxHitCount);
00574 else
00575     {
00576     errAbort("Can only do tile sizes of 8 or 16 in makeHits");
00577     return 0;
00578     }
00579 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int makeHits16 ( struct fastProber fp,
struct nt4Seq target,
struct crudeHit hits,
int  maxHitCount 
) [static]

Definition at line 381 of file crudeali.c.

References nt4Seq::baseCount, nt4Seq::bases, bits32, caTileSize, fastProber::hash, makeIndividualHits16(), and tileHashFunc.

Referenced by makeHits().

00384 {
00385 bits32 *bases = target->bases;
00386 struct probeTile **hash = fp->hash;
00387 unsigned long acc;
00388 int hitCount = 0;
00389 int i;
00390 int baseWordCount = (target->baseCount/caTileSize);
00391 bits32 *endBases = bases + baseWordCount;
00392 int chunkSize = 8;
00393 int chunkCount = baseWordCount/chunkSize;
00394 int baseOffset = 0;
00395 
00396 for (i=0; i<chunkCount; ++i)
00397     {
00398     acc = ((unsigned long)(hash[tileHashFunc(bases[0])])
00399         |  (unsigned long)(hash[tileHashFunc(bases[1])])
00400         |  (unsigned long)(hash[tileHashFunc(bases[2])])
00401         |  (unsigned long)(hash[tileHashFunc(bases[3])])
00402         |  (unsigned long)(hash[tileHashFunc(bases[4])])
00403         |  (unsigned long)(hash[tileHashFunc(bases[5])])
00404         |  (unsigned long)(hash[tileHashFunc(bases[6])])
00405         |  (unsigned long)(hash[tileHashFunc(bases[7])]));
00406     if (acc)
00407         {
00408         hitCount = makeIndividualHits16(hash, bases, baseOffset, chunkSize, hits, maxHitCount, hitCount);
00409         if (hitCount >= maxHitCount)
00410             break;
00411         }
00412     bases += chunkSize;
00413     baseOffset += chunkSize;
00414     }
00415 if (bases != endBases)
00416     hitCount = makeIndividualHits16(hash, bases, baseOffset, endBases-bases, hits, maxHitCount, hitCount);
00417 return hitCount;
00418 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int makeHits8 ( struct fastProber fp,
struct nt4Seq target,
struct crudeHit hits,
int  maxHitCount 
) [static]

Definition at line 462 of file crudeali.c.

References nt4Seq::baseCount, nt4Seq::bases, bits16, bits32, caTileSize, freeMem(), fastProber::hash, needLargeMem(), crudeHit::probeOffset, fastProber::probeSize, squeezeHits8(), crudeHit::targetOffset, tileHashBits, warn(), and wobbleMask.

Referenced by makeHits().

00465 {
00466 bits16 *bases = (bits16*)target->bases;
00467 struct probeTile **hash = fp->hash;
00468 struct probeTile *pbt;
00469 int hitCount = 0;
00470 int i;
00471 int baseWordCount = (target->baseCount/caTileSize);
00472 int tOffset = 0;
00473 
00474 /* Handle big/little endian problem. */
00475 bits32 endianTest = 0x12345678;
00476 bits16 *endianPt = (bits16*)(void*)(&endianTest);
00477 boolean needSwap = (*endianPt == 0x5678);
00478 int swapOffset[2];
00479 int swapIx = 0;
00480 
00481 /* Every so often we throw out singleton hits. The variables below
00482  * manage this. */
00483 int compactWindowSizePower = 10;
00484 int compactWindowSize = (1<<compactWindowSizePower);
00485 int compactModMask = compactWindowSize-1;
00486 int compactOverlap = 50;
00487 int lastCompactedHit = 0;
00488 int compactBufIntSize = (compactWindowSize+compactOverlap+fp->probeSize+4);
00489 int compactBufSize = (compactBufIntSize*sizeof(short));
00490 short *compactDiagBuf = needLargeMem(compactBufSize);
00491 
00492 if (needSwap)
00493     {
00494     swapOffset[0] = 8;
00495     swapOffset[1] = -8;
00496     }
00497 else
00498     {
00499     swapOffset[0] = swapOffset[1] = 0;
00500     }
00501 
00502 assert(tileHashBits == (8*sizeof(bits16)));
00503 for (i=0; i<baseWordCount; ++i)
00504     {
00505     if ((pbt = hash[*bases++ & wobbleMask]) != NULL)
00506         {
00507         if (hitCount < maxHitCount)
00508             {
00509             hits[hitCount].probeOffset = pbt->offset;
00510             hits[hitCount].targetOffset = tOffset + swapOffset[swapIx];
00511             ++hitCount;
00512             }
00513         else
00514             {
00515             warn("Got too many hits, only keeping first %d\n", hitCount);
00516             break;
00517             }
00518         }
00519     tOffset += caTileSize;
00520     if ((tOffset&compactModMask) == 0)
00521         {
00522         int startToff = tOffset - compactWindowSize - compactOverlap;
00523         int lastCompactOff = startToff + compactWindowSize;
00524         int lastCompareOff = tOffset;
00525 
00526         squeezeHits8(fp, compactDiagBuf, compactBufSize, 
00527             startToff,  lastCompactOff, lastCompareOff, 
00528             hits, lastCompactedHit, hitCount, &lastCompactedHit, &hitCount);
00529         }
00530     swapIx = 1 - swapIx;  /* Toggle between 0 and 1. */
00531     }
00532 freeMem(compactDiagBuf);
00533 return hitCount;
00534 }

Here is the call graph for this function:

Here is the caller graph for this function:

static int makeIndividualHits16 ( struct probeTile **  hash,
bits32 *  bases,
int  baseOffset,
int  baseWordCount,
struct crudeHit hits,
int  maxHitCount,
int  hitCount 
) [static]

Definition at line 348 of file crudeali.c.

References caTileSize, probeTile::next, probeTile::offset, crudeHit::probeOffset, crudeHit::targetOffset, probeTile::tile16, tileHashFunc, and warn().

Referenced by makeHits16().

00351 {
00352 struct probeTile *probe;
00353 int i;
00354 for (i=0; i<baseWordCount; ++i)
00355     {
00356     if ((probe = hash[tileHashFunc(bases[i])]) != NULL)
00357         {
00358         do
00359             {
00360             if (bases[i] == probe->tile16)
00361                 {
00362                 if (hitCount >= maxHitCount)
00363                     {
00364                     warn("Too many hits, only taking first %d", maxHitCount);
00365                     goto END_HIT_LOOP;
00366                     }
00367                 hits[hitCount].probeOffset = probe->offset;
00368                 hits[hitCount].targetOffset = ((i+baseOffset) * caTileSize);
00369                 ++hitCount;
00370                 break;
00371                 }
00372             probe = probe->next;
00373             }
00374         while (probe != NULL);
00375         }
00376     }
00377 END_HIT_LOOP:
00378 return hitCount;
00379 }

Here is the call graph for this function:

Here is the caller graph for this function:

static struct probeTile** makeTileHash (  )  [static, read]

Definition at line 285 of file crudeali.c.

References needLargeMem(), and tileHashSize.

Referenced by makeFastProber().

00286 {
00287 struct probeTile **table;
00288 int tableSize = tileHashSize * sizeof(table[0]);
00289 
00290 table = needLargeMem(tableSize);
00291 memset(table, 0, tableSize);
00292 return table;
00293 }

Here is the call graph for this function:

Here is the caller graph for this function:

void scoreNoninsertingExtensions ( struct crudeGene crudeGene,
DNA probe,
int  probeSize 
)

Definition at line 741 of file crudeali.c.

References nt4Seq::baseCount, freeMem(), crudeGene::isRc, nt4Unpack(), crudeGene::probeStart, reverseComplement(), crudeGene::score, crudeGene::target, crudeGene::targetStart, and wormDnaMatchScores.

Referenced by crudeAliFind().

00744 {
00745 int i, size;
00746 int score = 0, maxScore = 0;
00747 struct nt4Seq *targetChrom = crudeGene->target;
00748 DNA *target;
00749 DNA p,t;
00750 
00751 /* Figure out what DNA to fetch - trying to get all of target that
00752  * corresponds to all of probe, but clipping if at ends of chromosome. */
00753 int pTileStart = crudeGene->probeStart;
00754 int tTileStart = crudeGene->targetStart;
00755 int pStart = 0, pEnd = probeSize;
00756 int tStart = tTileStart - pTileStart;
00757 int tEnd = tStart + probeSize;
00758 if (tStart < 0)
00759     {
00760     pStart -= tStart;
00761     tStart = 0;
00762     }
00763 if (tEnd > targetChrom->baseCount)
00764     {
00765     int diff = tEnd - targetChrom->baseCount;
00766     tEnd -= diff;
00767     pEnd -= diff;
00768     }
00769 size = tEnd - tStart;
00770 if (crudeGene->isRc)
00771     reverseComplement(probe, probeSize);
00772 if (size > 0)
00773     {
00774     target = nt4Unpack(targetChrom, tStart, size);
00775     for (i = 0; i<size; ++i)
00776         {
00777         assert(i + pStart < probeSize);
00778         p = probe[i + pStart];
00779         t = target[i];
00780         score += wormDnaMatchScores[(int)p][(int)t];
00781         if (score < 0)
00782             score = 0;
00783         if (score > maxScore)
00784             maxScore = score;
00785         }            
00786     freeMem(target);
00787     }
00788 crudeGene->score = maxScore;
00789 if (crudeGene->isRc)
00790     reverseComplement(probe, probeSize);
00791 }

Here is the call graph for this function:

Here is the caller graph for this function:

void squeezeHits8 ( struct fastProber fp,
short *  compactDiagBuf,
int  compactDiagBufByteSize,
int  startToff,
int  lastCompactOff,
int  lastCompareOff,
struct crudeHit hits,
int  lastCompactedHit,
int  hitCount,
int *  retLastCompactedHit,
int *  retHitCount 
)

Definition at line 420 of file crudeali.c.

References crudeHit::probeOffset, fastProber::probeSize, and crudeHit::targetOffset.

Referenced by makeHits8().

00427 {
00428 int diagOffset = fp->probeSize+2;
00429 int diag;
00430 int i;
00431 struct crudeHit *hit, *writeHit;
00432 int compactDiagBufArraySize = compactDiagBufByteSize/sizeof(short);
00433 
00434 memset(compactDiagBuf, 0, compactDiagBufByteSize);
00435 for (i=lastCompactedHit; i<hitCount; ++i)
00436     {
00437     hit = hits+i;
00438     diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00439     assert(diag >= 0 && diag < compactDiagBufArraySize);
00440     compactDiagBuf[diag] += 1;
00441     }
00442 writeHit = hits+lastCompactedHit;
00443 for (i=lastCompactedHit; i<hitCount; ++i)
00444     {
00445     hit = hits+i;
00446     if (hit->targetOffset >= lastCompactOff)
00447         break;
00448     diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00449     if (compactDiagBuf[diag] > 1)
00450         *writeHit++ = *hit;
00451     }    
00452 *retLastCompactedHit = (writeHit - hits);
00453 for (;i<hitCount; ++i)
00454     {
00455     hit = hits+i;
00456     *writeHit++ = *hit;
00457     }
00458 *retHitCount = (writeHit - hits);
00459 }

Here is the caller graph for this function:


Variable Documentation

boolean caIsCdna = TRUE [static]

Definition at line 24 of file crudeali.c.

Referenced by crudeAliFind(), findCrudeGenes(), and lumpExonsIntoGenes().

int caTileSize = 16 [static]

Definition at line 23 of file crudeali.c.

Referenced by crudeAliFind(), lumpHitsIntoExons(), makeFastProber(), makeHits(), makeHits16(), makeHits8(), and makeIndividualHits16().

char const rcsid[] = "$Id: crudeali.c,v 1.5 2006/05/09 01:05:55 markd Exp $" [static]

Definition at line 17 of file crudeali.c.

int wormDnaMatchScores[256][256] [static]

Definition at line 696 of file crudeali.c.

Referenced by initWormDnaMatchScores(), and scoreNoninsertingExtensions().


Generated on Tue Dec 25 19:21:27 2007 for blat by  doxygen 1.5.2