jkOwnLib/crudeali.c

Go to the documentation of this file.
00001 /* crudeali.c - Files for doing fast blast-style crude alignment. 
00002  * This has two modes - a 16-base at a time and a 6 base at a time
00003  * which basically is for cDNA alignments and genomic/genomic 
00004  * alignments. The six base at a time doesn't use six contigious
00005  * bases, but rather 8 bases in a row with the two in "wobble
00006  * positions" masked out.  It ends up making things more
00007  * sensitive, and, fortunately, the whole thing still fits in
00008  * a single 16 bit word. */
00009 /* Copyright 2000-2003 Jim Kent.  All rights reserved. */
00010 
00011 #include "common.h"
00012 #include "dnautil.h"
00013 #include "nt4.h"
00014 #include "dnaseq.h"
00015 #include "crudeali.h"
00016 
00017 static char const rcsid[] = "$Id: crudeali.c,v 1.5 2006/05/09 01:05:55 markd Exp $";
00018 
00019 #define maxTileSize 16
00020 
00021 #define wobbleMask 0xF3CF
00022 
00023 static int caTileSize = 16;
00024 static boolean caIsCdna = TRUE;
00025 
00026 struct crudeHit
00027 /* A tileSize exact match between probe and target. */
00028     {
00029     int probeOffset;
00030     int targetOffset;
00031     boolean isLumped;
00032     };
00033 
00034 
00035 struct crudeExon
00036 /* A bunch of hits that hang together on a diagonal. */
00037     {
00038     int probeStart;
00039     int probeEnd;
00040     int targetStart;
00041     int targetEnd;
00042     int hitCount;
00043     boolean isLumped;
00044     };
00045 
00046 struct crudeGene
00047 /* A collection of diagonals that seem to hang together
00048  * like exons. */
00049     {
00050     struct nt4Seq *target;
00051     boolean isRc;
00052     int probeStart;
00053     int probeEnd;
00054     int targetStart;
00055     int targetEnd;
00056     int score;
00057     };
00058 
00059 static int lumpHitsIntoExons(struct crudeHit *hits, int hitCount, struct crudeExon *exons)
00060 /* Lump together hits that are within 48 bp of each other and within 2 of
00061  * the same diagonal into "exons".  The exons array must be hitCount elements
00062  * in order to accomodate the worst case where no lumping occurs.  */
00063 {
00064 int i;
00065 struct crudeExon *exon = exons;
00066 int exonCount = 0;
00067 int firstNonLumped = 0;
00068 
00069 for (i=0; i<hitCount; ++i)
00070     hits[i].isLumped = FALSE;
00071 
00072 for (;;)
00073     {
00074     boolean startedExon = FALSE;
00075     int diagDiff = 0;
00076     int tOff, pOff;
00077     int thisDiff;
00078     for (; firstNonLumped < hitCount; ++firstNonLumped)
00079         {
00080         if (!hits[firstNonLumped].isLumped)
00081             break;
00082         }
00083     for (i=firstNonLumped; i<hitCount; ++i)
00084         {
00085         if (hits[i].isLumped == FALSE)
00086             {
00087             pOff = hits[i].probeOffset;
00088             tOff = hits[i].targetOffset;
00089             thisDiff = pOff - tOff;
00090             if (!startedExon)   /* First hit in exon. */
00091                 {
00092                 startedExon = TRUE;
00093                 hits[i].isLumped = TRUE;
00094                 exon->probeStart = pOff;
00095                 exon->probeEnd = pOff + caTileSize;
00096                 exon->targetStart = tOff;
00097                 exon->targetEnd = tOff + caTileSize;
00098                 exon->hitCount = 1;
00099                 diagDiff = thisDiff;
00100                 }
00101             else if (diagDiff - 2 <= thisDiff && thisDiff <= diagDiff + 2
00102                     && exon->probeEnd - 2 <= pOff && pOff <= exon->probeEnd + 48
00103                     && exon->targetEnd - 2 <= tOff && tOff <= exon->targetEnd + 48)
00104                 {
00105                 hits[i].isLumped = TRUE;
00106                 exon->probeEnd = pOff + caTileSize;
00107                 exon->targetEnd = tOff + caTileSize;
00108                 exon->hitCount += 1;
00109                 }
00110             else if (tOff > exon->targetEnd + 48)
00111                 break;
00112             }
00113         }
00114     if (!startedExon)   /* Must be all lumped together. */
00115         break;
00116     ++exonCount;
00117     ++exon;
00118     }
00119 
00120 /* If small tile size then get rid of exons with only one hit. */
00121 if (caTileSize == 8)
00122     {
00123     struct crudeExon *read = exons, *write = exons;
00124     int twoFerCount = 0;
00125     for (i=0; i<exonCount; ++i)
00126         {
00127         if (read->hitCount >= 2)
00128             {
00129             *write++ = *read++;
00130             ++twoFerCount;
00131             }
00132         else
00133             ++read;
00134         }
00135     exonCount = twoFerCount;
00136     }
00137 return exonCount;
00138 }
00139 
00140 static int lumpExonsIntoGenes(struct crudeExon *exons, int exonCount, 
00141     struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
00142 /* Lump together crude exons into crude genes, and score them. */
00143 {
00144 int i;
00145 struct crudeGene *gene = genes;
00146 int geneCount = 0;
00147 int firstNonLumped = 0;
00148 int targetGapSpan = (caIsCdna ? 25000 : 200);
00149 int probeGapSpan = (caIsCdna ? 64 : 200);
00150 
00151 for (i=0; i<exonCount; ++i)
00152     exons[i].isLumped = FALSE;
00153 
00154 for (;;)
00155     {
00156     boolean startedGene = FALSE;
00157     int lastDiff = 0;
00158     int tOff, pOff;
00159     int thisDiff;
00160     int exonsInThis = 0;
00161 
00162     for ( ; firstNonLumped < exonCount; ++firstNonLumped)
00163         {
00164         if (!exons[firstNonLumped].isLumped)
00165             break;
00166         }
00167     for (i=firstNonLumped; i<exonCount; ++i)
00168         {
00169         if (!exons[i].isLumped)
00170             {
00171             tOff = exons[i].targetStart;
00172             pOff = exons[i].probeStart;
00173             thisDiff = tOff - pOff;
00174             if (!startedGene)
00175                 {
00176                 startedGene = TRUE;
00177                 exons[i].isLumped = TRUE;
00178                 lastDiff = thisDiff;
00179                 exonsInThis = 1;
00180                 gene->target = target;
00181                 gene->isRc = isRc;
00182                 gene->probeStart = exons[i].probeStart;
00183                 gene->probeEnd = exons[i].probeEnd;
00184                 gene->targetStart = exons[i].targetStart;
00185                 gene->targetEnd = exons[i].targetEnd;
00186                 gene->score = exons[i].hitCount * exons[i].hitCount;
00187                 }
00188             else if (gene->targetEnd - 10 <= tOff && tOff <= gene->targetEnd + targetGapSpan
00189                      && gene->probeEnd - 10 <= pOff && pOff <= gene->probeEnd + probeGapSpan
00190                      && lastDiff - 2 <= thisDiff)
00191                 {
00192                 exons[i].isLumped = TRUE;
00193                 gene->targetEnd = exons[i].targetEnd;
00194                 gene->probeEnd = exons[i].probeEnd;
00195                 gene->score += exons[i].hitCount * exons[i].hitCount;
00196                 exonsInThis += 1;
00197                 lastDiff = thisDiff;
00198                 }
00199             else if (tOff > gene->targetEnd + targetGapSpan)
00200                 break;
00201             }
00202         }
00203     if (!startedGene)
00204         break;
00205     gene->score += exonsInThis;
00206     ++geneCount;
00207     ++gene;
00208     }
00209 return geneCount;
00210 }
00211 
00212 static int copyExonsIntoGenes(struct crudeExon *exons, int exonCount, 
00213     struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
00214 /* Make crude genes that are just one per exon.  What a kludge! */
00215 {
00216 int i;
00217 for (i=0; i<exonCount; ++i)
00218     {
00219     genes[i].target = target;
00220     genes[i].isRc = isRc;
00221     genes[i].probeStart = exons[i].probeStart;
00222     genes[i].probeEnd = exons[i].probeEnd;
00223     genes[i].targetStart = exons[i].targetStart;
00224     genes[i].targetEnd = exons[i].targetEnd;
00225     genes[i].score = exons[i].hitCount * exons[i].hitCount;
00226     }
00227 return exonCount;
00228 }
00229 
00230 static boolean makeGoodTile(DNA *dna, bits32 *retTile16, bits16 *retTile8, int tileSize)
00231 /* Turn next base pairs into a tile.  Return FALSE
00232  * if it wouldn't be a good tile. */
00233 {
00234 int i;
00235 int repMod;
00236 int endRep;
00237 int maxRep = (tileSize / 2);
00238 
00239 /* Make sure no N's in tile */
00240 for (i=0; i<tileSize; ++i)
00241     {
00242     if (ntVal[(int)dna[i]] < 0)
00243         return FALSE;
00244     }
00245 
00246 /* Make sure that tile isn't part of a pattern. */ 
00247 for (repMod = 1; repMod <= maxRep; repMod += 1)
00248     {
00249     boolean allSame = TRUE;
00250     endRep = tileSize - repMod;
00251     for (i=0; i<endRep; ++i)
00252         {
00253         if (dna[i] != dna[i+repMod])
00254             {
00255             allSame = FALSE;
00256             break;
00257             }
00258         }
00259     if (allSame)
00260         return FALSE;
00261     }
00262 
00263 if (tileSize > 8)
00264     *retTile16 = packDna16(dna);
00265 else
00266     *retTile8 = packDna8(dna);
00267 
00268 return TRUE;
00269 }
00270 
00271 struct probeTile
00272 /* A tile from the probe and the offset where it came from. */
00273     {
00274     struct probeTile *next;
00275     int offset;
00276     bits32 tile16;
00277     bits16 tile8;
00278     };
00279 
00280 #define tileHashBits 16
00281 #define tileHashSize (1<<tileHashBits)
00282 #define tileHashMask (tileHashSize-1)
00283 #define tileHashFunc(tile) ((tile)&tileHashMask)
00284 
00285 static struct probeTile **makeTileHash()
00286 {
00287 struct probeTile **table;
00288 int tableSize = tileHashSize * sizeof(table[0]);
00289 
00290 table = needLargeMem(tableSize);
00291 memset(table, 0, tableSize);
00292 return table;
00293 }
00294 
00295 
00296 struct fastProber
00297 /* Structure that has hash list of probeSeq tiles for fast searching */
00298     {
00299     struct probeTile **hash;
00300     struct probeTile *probes;
00301     int probeSize;
00302     };
00303 
00304 static struct fastProber *makeFastProber(DNA *probeDna, int probeSize)
00305 {
00306 int maxProbeCount = probeSize - caTileSize + 1;
00307 int probeCount = 0;
00308 struct probeTile *probes;
00309 struct probeTile *probe;
00310 struct probeTile **hash;
00311 int i;
00312 struct fastProber *fp;
00313 
00314 if (maxProbeCount <= 0)
00315     return NULL;
00316 AllocVar(fp);
00317 fp->hash = hash = makeTileHash();
00318 fp->probeSize = probeSize;
00319 fp->probes = probes = needMem(maxProbeCount * sizeof(probes[0]) );
00320 for (i=0; i<maxProbeCount; ++i)
00321     {
00322     probe = &probes[probeCount];
00323     if (makeGoodTile(probeDna+i, &probe->tile16, &probe->tile8, caTileSize))
00324         {
00325         int hashIx; 
00326         probe->tile8 &= wobbleMask;
00327         hashIx = (caTileSize > 8 ? tileHashFunc(probe->tile16) : tileHashFunc(probe->tile8));
00328         probe->offset = i;
00329         probe->next = hash[hashIx];
00330         hash[hashIx] = probe;
00331         ++probeCount;
00332         }
00333     }
00334 return fp;
00335 }
00336 
00337 static void freeFastProber(struct fastProber **pFp)
00338 {
00339 struct fastProber *fp = *pFp;
00340 if (fp != NULL)
00341     {
00342     freeMem(fp->probes);
00343     freeMem(fp->hash);
00344     freez(pFp);
00345     }
00346 }
00347 
00348 static int makeIndividualHits16(struct probeTile **hash, bits32 *bases, int baseOffset, int baseWordCount,
00349     struct crudeHit *hits, int maxHitCount, int hitCount)
00350 /* Scan a short segment for individual hits. */
00351 {
00352 struct probeTile *probe;
00353 int i;
00354 for (i=0; i<baseWordCount; ++i)
00355     {
00356     if ((probe = hash[tileHashFunc(bases[i])]) != NULL)
00357         {
00358         do
00359             {
00360             if (bases[i] == probe->tile16)
00361                 {
00362                 if (hitCount >= maxHitCount)
00363                     {
00364                     warn("Too many hits, only taking first %d", maxHitCount);
00365                     goto END_HIT_LOOP;
00366                     }
00367                 hits[hitCount].probeOffset = probe->offset;
00368                 hits[hitCount].targetOffset = ((i+baseOffset) * caTileSize);
00369                 ++hitCount;
00370                 break;
00371                 }
00372             probe = probe->next;
00373             }
00374         while (probe != NULL);
00375         }
00376     }
00377 END_HIT_LOOP:
00378 return hitCount;
00379 }
00380 
00381 static int makeHits16(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00382     int maxHitCount)
00383 /* Scan entire target for hits to probe. */
00384 {
00385 bits32 *bases = target->bases;
00386 struct probeTile **hash = fp->hash;
00387 unsigned long acc;
00388 int hitCount = 0;
00389 int i;
00390 int baseWordCount = (target->baseCount/caTileSize);
00391 bits32 *endBases = bases + baseWordCount;
00392 int chunkSize = 8;
00393 int chunkCount = baseWordCount/chunkSize;
00394 int baseOffset = 0;
00395 
00396 for (i=0; i<chunkCount; ++i)
00397     {
00398     acc = ((unsigned long)(hash[tileHashFunc(bases[0])])
00399         |  (unsigned long)(hash[tileHashFunc(bases[1])])
00400         |  (unsigned long)(hash[tileHashFunc(bases[2])])
00401         |  (unsigned long)(hash[tileHashFunc(bases[3])])
00402         |  (unsigned long)(hash[tileHashFunc(bases[4])])
00403         |  (unsigned long)(hash[tileHashFunc(bases[5])])
00404         |  (unsigned long)(hash[tileHashFunc(bases[6])])
00405         |  (unsigned long)(hash[tileHashFunc(bases[7])]));
00406     if (acc)
00407         {
00408         hitCount = makeIndividualHits16(hash, bases, baseOffset, chunkSize, hits, maxHitCount, hitCount);
00409         if (hitCount >= maxHitCount)
00410             break;
00411         }
00412     bases += chunkSize;
00413     baseOffset += chunkSize;
00414     }
00415 if (bases != endBases)
00416     hitCount = makeIndividualHits16(hash, bases, baseOffset, endBases-bases, hits, maxHitCount, hitCount);
00417 return hitCount;
00418 }
00419 
00420 void squeezeHits8(struct fastProber *fp, 
00421     short *compactDiagBuf, int compactDiagBufByteSize, 
00422     int startToff, int lastCompactOff, int lastCompareOff,
00423     struct crudeHit *hits, int lastCompactedHit, int hitCount,
00424     int *retLastCompactedHit, int *retHitCount)
00425 /* Get rid of isolated hits by dumping ones where there is only one
00426  * hit on their diagonal. */
00427 {
00428 int diagOffset = fp->probeSize+2;
00429 int diag;
00430 int i;
00431 struct crudeHit *hit, *writeHit;
00432 int compactDiagBufArraySize = compactDiagBufByteSize/sizeof(short);
00433 
00434 memset(compactDiagBuf, 0, compactDiagBufByteSize);
00435 for (i=lastCompactedHit; i<hitCount; ++i)
00436     {
00437     hit = hits+i;
00438     diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00439     assert(diag >= 0 && diag < compactDiagBufArraySize);
00440     compactDiagBuf[diag] += 1;
00441     }
00442 writeHit = hits+lastCompactedHit;
00443 for (i=lastCompactedHit; i<hitCount; ++i)
00444     {
00445     hit = hits+i;
00446     if (hit->targetOffset >= lastCompactOff)
00447         break;
00448     diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00449     if (compactDiagBuf[diag] > 1)
00450         *writeHit++ = *hit;
00451     }    
00452 *retLastCompactedHit = (writeHit - hits);
00453 for (;i<hitCount; ++i)
00454     {
00455     hit = hits+i;
00456     *writeHit++ = *hit;
00457     }
00458 *retHitCount = (writeHit - hits);
00459 }
00460 
00461 
00462 static int makeHits8(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00463     int maxHitCount)
00464 /* Scan entire target for hits to probe. */
00465 {
00466 bits16 *bases = (bits16*)target->bases;
00467 struct probeTile **hash = fp->hash;
00468 struct probeTile *pbt;
00469 int hitCount = 0;
00470 int i;
00471 int baseWordCount = (target->baseCount/caTileSize);
00472 int tOffset = 0;
00473 
00474 /* Handle big/little endian problem. */
00475 bits32 endianTest = 0x12345678;
00476 bits16 *endianPt = (bits16*)(void*)(&endianTest);
00477 boolean needSwap = (*endianPt == 0x5678);
00478 int swapOffset[2];
00479 int swapIx = 0;
00480 
00481 /* Every so often we throw out singleton hits. The variables below
00482  * manage this. */
00483 int compactWindowSizePower = 10;
00484 int compactWindowSize = (1<<compactWindowSizePower);
00485 int compactModMask = compactWindowSize-1;
00486 int compactOverlap = 50;
00487 int lastCompactedHit = 0;
00488 int compactBufIntSize = (compactWindowSize+compactOverlap+fp->probeSize+4);
00489 int compactBufSize = (compactBufIntSize*sizeof(short));
00490 short *compactDiagBuf = needLargeMem(compactBufSize);
00491 
00492 if (needSwap)
00493     {
00494     swapOffset[0] = 8;
00495     swapOffset[1] = -8;
00496     }
00497 else
00498     {
00499     swapOffset[0] = swapOffset[1] = 0;
00500     }
00501 
00502 assert(tileHashBits == (8*sizeof(bits16)));
00503 for (i=0; i<baseWordCount; ++i)
00504     {
00505     if ((pbt = hash[*bases++ & wobbleMask]) != NULL)
00506         {
00507         if (hitCount < maxHitCount)
00508             {
00509             hits[hitCount].probeOffset = pbt->offset;
00510             hits[hitCount].targetOffset = tOffset + swapOffset[swapIx];
00511             ++hitCount;
00512             }
00513         else
00514             {
00515             warn("Got too many hits, only keeping first %d\n", hitCount);
00516             break;
00517             }
00518         }
00519     tOffset += caTileSize;
00520     if ((tOffset&compactModMask) == 0)
00521         {
00522         int startToff = tOffset - compactWindowSize - compactOverlap;
00523         int lastCompactOff = startToff + compactWindowSize;
00524         int lastCompareOff = tOffset;
00525 
00526         squeezeHits8(fp, compactDiagBuf, compactBufSize, 
00527             startToff,  lastCompactOff, lastCompareOff, 
00528             hits, lastCompactedHit, hitCount, &lastCompactedHit, &hitCount);
00529         }
00530     swapIx = 1 - swapIx;  /* Toggle between 0 and 1. */
00531     }
00532 freeMem(compactDiagBuf);
00533 return hitCount;
00534 }
00535 
00536 static int cmpHitsTargetFirst(const void *va, const void *vb)
00537 /* Compare function to sort hit array by ascending
00538  * target offset followed by ascending probe offset. */
00539 {
00540 struct crudeHit *a = (struct crudeHit *)va;
00541 struct crudeHit *b = (struct crudeHit *)vb;
00542 long diff;
00543 if ((diff = a->targetOffset - b->targetOffset) != 0)
00544     return diff;
00545 return a->probeOffset - b->probeOffset;
00546 }
00547 
00548 static int cmpExonsTargetFirst(const void *va, const void *vb)
00549 /* Compare function to sort exons by ascending target
00550  * offset followed by ascending probe offset. */
00551 {
00552 struct crudeExon *a = (struct crudeExon *)va;
00553 struct crudeExon *b = (struct crudeExon *)vb;
00554 long diff;
00555 if ((diff = a->targetStart - b->targetStart) != 0)
00556     return diff;
00557 return a->probeStart - b->probeStart;
00558 }
00559 
00560 
00561 #define maxHitsAtOnce 120000
00562 /* Maximum number of hits to allow on a single scan.
00563  * If we get more than this need to toss out a bad tile
00564  * or something... */
00565 
00566 static int makeHits(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00567     int maxHitCount)
00568 /* Scan entire target for hits to probe. */
00569 {
00570 if (caTileSize == 8)
00571     return makeHits8(fp, target, hits, maxHitCount);
00572 else if (caTileSize == 16)
00573     return makeHits16(fp, target, hits, maxHitCount);
00574 else
00575     {
00576     errAbort("Can only do tile sizes of 8 or 16 in makeHits");
00577     return 0;
00578     }
00579 }
00580 
00581 static int findCrudeGenes(struct fastProber *fp, struct nt4Seq *target,
00582     struct crudeGene *genes, int maxGeneCount, boolean isRc)
00583 /* Scan target with probe, and arrange hits into crude genes. */
00584 {
00585 struct crudeHit *hits;
00586 struct crudeExon *exons;
00587 int hitCount, exonCount, geneCount = 0;
00588 
00589 if (fp == NULL)
00590     return 0;
00591 assert(maxGeneCount >= maxHitsAtOnce);
00592 hits = needMem(maxHitsAtOnce*sizeof(*hits));
00593 exons = needMem(maxHitsAtOnce*sizeof(*exons));
00594 hitCount = makeHits(fp, target, hits, maxHitsAtOnce);
00595 
00596 if (hitCount > 0)
00597     {
00598     qsort(hits, hitCount, sizeof(hits[0]), cmpHitsTargetFirst);
00599     exonCount = lumpHitsIntoExons(hits, hitCount, exons);
00600     qsort(exons, exonCount, sizeof(exons[0]), cmpExonsTargetFirst);
00601     if (caIsCdna)
00602         geneCount = lumpExonsIntoGenes(exons, exonCount, genes, target, isRc);
00603     else
00604         geneCount = copyExonsIntoGenes(exons, exonCount, genes, target, isRc);
00605     }
00606 freeMem(hits);
00607 freeMem(exons);
00608 return geneCount;
00609 }
00610 
00611 static int cmpGenesByScore(const void *va, const void *vb)
00612 /* cmp function to sort leaving highest scores first. */
00613 {
00614 struct crudeGene *a = (struct crudeGene *)va;
00615 struct crudeGene *b = (struct crudeGene *)vb;
00616 return b->score - a->score;
00617 }
00618 
00619 static int countGenesBetter(struct crudeGene *genes, int geneCount, int cutoff)
00620 /* Count number of genes (in sorted list) with scores better than cutoff */
00621 {
00622 int i;
00623 for (i=0; i<geneCount; ++i)
00624     {
00625     if (genes[i].score <= cutoff)
00626         break;
00627     }
00628 return i;
00629 }
00630 
00631 static int filterPoorGenes(struct crudeGene *genes, int geneCount)
00632 /* Sort gene list by score and remove bottom scorers. 
00633  * This is gauranteed to get rid of half of genes on list or 
00634  * more. */
00635 {
00636 int bestScore, worstScore;
00637 int newGeneCount;
00638 int halfGeneCount = geneCount/2;
00639 
00640 qsort(genes, geneCount, sizeof(genes[0]), cmpGenesByScore);
00641 bestScore = genes[0].score;
00642 worstScore = genes[geneCount-1].score;
00643 
00644 /* If scores are clustered with half or more at the top,
00645  * we have to be crude and just lop off bottom half of scores. */
00646 if (bestScore == genes[halfGeneCount].score)
00647     {
00648     warn("Bunches of genes, all scoring the same. Program is "
00649          "throwing out half out of necessity.\n");
00650     return geneCount/2;
00651     }
00652 
00653 /* Drop bottom score until have gotten rid of at least half. */
00654 newGeneCount = geneCount;
00655 while (newGeneCount > halfGeneCount)
00656     {
00657     worstScore = genes[newGeneCount-1].score;
00658     newGeneCount = countGenesBetter(genes, newGeneCount, worstScore);
00659     }
00660 return newGeneCount;
00661 }
00662 
00663 static int chromIx(struct nt4Seq *one, struct nt4Seq **chrome, int chromeCount)
00664 {
00665 int i;
00666 for (i=0; i<chromeCount; ++i)
00667     {
00668     if (one == chrome[i])
00669         return i;
00670     }
00671 assert(FALSE);
00672 return -1;
00673 }
00674 
00675 static int countPrettyGood(struct crudeGene *crudeGenes, int geneCount, int fraction, int minScore)
00676 /* Count up # of genes that are not in bottom fraction. (For fraction 4, count
00677  * all but bottom quarter. */
00678 {
00679 int count = 0;
00680 int i;
00681 int threshold = crudeGenes->score;  /* Top score */
00682 
00683 threshold = (threshold + fraction/2)/fraction;
00684 if (threshold < minScore)
00685     threshold = minScore;
00686 for (i=0; i<geneCount; ++i)
00687     {
00688     if (crudeGenes[i].score >= threshold)
00689         ++count;
00690     else
00691         break;
00692     }
00693 return count;
00694 }
00695 
00696 static int wormDnaMatchScores[256][256];
00697 
00698 void initWormDnaMatchScores()
00699 /* Initialize our big sloppy pairwise comparison table. */
00700 {
00701 int i,j;
00702 static boolean initted = FALSE;
00703 
00704 if (initted) return;
00705 initted = TRUE;
00706 for (i=0; i<256; ++i)
00707     for (j=0; j<256; ++j)
00708         wormDnaMatchScores[i][j] = -4;
00709 wormDnaMatchScores['a']['a'] = 2;
00710 wormDnaMatchScores['t']['t'] = 2;
00711 wormDnaMatchScores['A']['A'] = 2;
00712 wormDnaMatchScores['T']['T'] = 2;
00713 wormDnaMatchScores['c']['c'] = 3;
00714 wormDnaMatchScores['g']['g'] = 3;
00715 wormDnaMatchScores['C']['C'] = 3;
00716 wormDnaMatchScores['G']['G'] = 3;
00717 }
00718 
00719 #ifdef OLD
00720 void scoreNoninsertingExtensions(struct crudeGene *crudeGene, DNA *probe, int probeSize)
00721 /* Fetch target DNA the size of probe and see how good of a score we
00722  * can come up with. */
00723 {
00724 int i;
00725 int pStart = crudeGene->probeStart;
00726 int size = crudeGene->probeEnd - pStart;
00727 int score = 0;
00728 DNA *target = nt4Unpack(crudeGene->target, crudeGene->targetStart, size);
00729 DNA p,t;
00730 probe += pStart;
00731 for (i=0; i < size; ++i)
00732     {
00733     p = probe[i];
00734     t = target[i];
00735     score += wormDnaMatchScores[p][t];
00736     }
00737 freeMem(target);
00738 }
00739 #endif 
00740 
00741 void scoreNoninsertingExtensions(struct crudeGene *crudeGene, DNA *probe, int probeSize)
00742 /* Old fetch target DNA the size of probe and see how good of a score we
00743  * can come up with. */
00744 {
00745 int i, size;
00746 int score = 0, maxScore = 0;
00747 struct nt4Seq *targetChrom = crudeGene->target;
00748 DNA *target;
00749 DNA p,t;
00750 
00751 /* Figure out what DNA to fetch - trying to get all of target that
00752  * corresponds to all of probe, but clipping if at ends of chromosome. */
00753 int pTileStart = crudeGene->probeStart;
00754 int tTileStart = crudeGene->targetStart;
00755 int pStart = 0, pEnd = probeSize;
00756 int tStart = tTileStart - pTileStart;
00757 int tEnd = tStart + probeSize;
00758 if (tStart < 0)
00759     {
00760     pStart -= tStart;
00761     tStart = 0;
00762     }
00763 if (tEnd > targetChrom->baseCount)
00764     {
00765     int diff = tEnd - targetChrom->baseCount;
00766     tEnd -= diff;
00767     pEnd -= diff;
00768     }
00769 size = tEnd - tStart;
00770 if (crudeGene->isRc)
00771     reverseComplement(probe, probeSize);
00772 if (size > 0)
00773     {
00774     target = nt4Unpack(targetChrom, tStart, size);
00775     for (i = 0; i<size; ++i)
00776         {
00777         assert(i + pStart < probeSize);
00778         p = probe[i + pStart];
00779         t = target[i];
00780         score += wormDnaMatchScores[(int)p][(int)t];
00781         if (score < 0)
00782             score = 0;
00783         if (score > maxScore)
00784             maxScore = score;
00785         }            
00786     freeMem(target);
00787     }
00788 crudeGene->score = maxScore;
00789 if (crudeGene->isRc)
00790     reverseComplement(probe, probeSize);
00791 }
00792 
00793 
00794 struct crudeAli *crudeAliFind(DNA *probe, int probeSize, 
00795     struct nt4Seq **chrome, int chromeCount, int tileSize, int minScore)
00796 /* Returns a list of crude alignments.  (You can free this with slFreeList() */
00797 {
00798 int oneGeneCount;
00799 int i;
00800 int chromeIx;
00801 int geneCount = 0;
00802 int maxGeneCount = 2*maxHitsAtOnce;
00803 int isRc;
00804 struct nt4Seq *target;
00805 struct crudeGene *crudeGenes = needMem(maxGeneCount*sizeof(*crudeGenes));
00806 struct fastProber *fps[2];
00807 struct crudeAli *aliList = NULL, *ali;
00808 int goodGeneCount;
00809 
00810 initWormDnaMatchScores();
00811 caTileSize = tileSize;
00812 caIsCdna = (tileSize >= 14);
00813 fps[0] = makeFastProber(probe, probeSize);
00814 reverseComplement(probe, probeSize);
00815 fps[1] = makeFastProber(probe, probeSize);
00816 reverseComplement(probe, probeSize);
00817 for (chromeIx = 0; chromeIx < chromeCount; ++chromeIx)
00818     {
00819     target = chrome[chromeIx];
00820     for (isRc = 0; isRc <= 1; ++isRc)
00821         {
00822         /* Get crude idea of genes on one strand. */
00823         oneGeneCount = findCrudeGenes(fps[isRc], target, 
00824             crudeGenes+geneCount, maxGeneCount-geneCount, isRc);
00825             
00826         /* Increment gene count and if necessary compact list. */
00827         geneCount += oneGeneCount;
00828         if (maxGeneCount - geneCount < maxHitsAtOnce)
00829             geneCount = filterPoorGenes(crudeGenes, geneCount);
00830         }
00831     if (maxGeneCount - geneCount < maxHitsAtOnce)
00832         break;
00833     }
00834 
00835 /* Sort genes by initial promise and get rid of some trash. */
00836     {
00837     int bestScore, worstScore;
00838     qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00839     bestScore = crudeGenes[0].score;
00840     worstScore = crudeGenes[geneCount-1].score;
00841     if (bestScore > worstScore)
00842         {
00843         int cutOff = bestScore/10;
00844         goodGeneCount = geneCount = countGenesBetter(crudeGenes, geneCount, cutOff);
00845         }
00846     else if (bestScore >= minScore)
00847         goodGeneCount = geneCount;
00848     else
00849         goodGeneCount = 0;
00850     }
00851 
00852 if (tileSize == 8)
00853     {
00854     goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, 4);
00855     geneCount = goodGeneCount;
00856     for (i=0; i<geneCount; ++i)
00857         scoreNoninsertingExtensions(crudeGenes+i, probe, probeSize);
00858     qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00859     goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, minScore);
00860     }
00861 for (i=0; i<goodGeneCount; ++i)
00862     {
00863     AllocVar(ali);
00864     ali->chromIx = chromIx(crudeGenes[i].target, chrome, chromeCount);
00865     ali->start = crudeGenes[i].targetStart;
00866     ali->end = crudeGenes[i].targetEnd;
00867     ali->score = crudeGenes[i].score;
00868     ali->strand = (crudeGenes[i].isRc ? '-' : '+');
00869     slAddHead(&aliList, ali);
00870     }
00871 slReverse(&aliList);
00872 freeFastProber(&fps[0]);
00873 freeFastProber(&fps[1]);
00874 freeMem(crudeGenes);
00875 
00876 return aliList;
00877 }
00878 

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2