00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "common.h"
00012 #include "dnautil.h"
00013 #include "nt4.h"
00014 #include "dnaseq.h"
00015 #include "crudeali.h"
00016
00017 static char const rcsid[] = "$Id: crudeali.c,v 1.5 2006/05/09 01:05:55 markd Exp $";
00018
00019 #define maxTileSize 16
00020
00021 #define wobbleMask 0xF3CF
00022
00023 static int caTileSize = 16;
00024 static boolean caIsCdna = TRUE;
00025
00026 struct crudeHit
00027
00028 {
00029 int probeOffset;
00030 int targetOffset;
00031 boolean isLumped;
00032 };
00033
00034
00035 struct crudeExon
00036
00037 {
00038 int probeStart;
00039 int probeEnd;
00040 int targetStart;
00041 int targetEnd;
00042 int hitCount;
00043 boolean isLumped;
00044 };
00045
00046 struct crudeGene
00047
00048
00049 {
00050 struct nt4Seq *target;
00051 boolean isRc;
00052 int probeStart;
00053 int probeEnd;
00054 int targetStart;
00055 int targetEnd;
00056 int score;
00057 };
00058
00059 static int lumpHitsIntoExons(struct crudeHit *hits, int hitCount, struct crudeExon *exons)
00060
00061
00062
00063 {
00064 int i;
00065 struct crudeExon *exon = exons;
00066 int exonCount = 0;
00067 int firstNonLumped = 0;
00068
00069 for (i=0; i<hitCount; ++i)
00070 hits[i].isLumped = FALSE;
00071
00072 for (;;)
00073 {
00074 boolean startedExon = FALSE;
00075 int diagDiff = 0;
00076 int tOff, pOff;
00077 int thisDiff;
00078 for (; firstNonLumped < hitCount; ++firstNonLumped)
00079 {
00080 if (!hits[firstNonLumped].isLumped)
00081 break;
00082 }
00083 for (i=firstNonLumped; i<hitCount; ++i)
00084 {
00085 if (hits[i].isLumped == FALSE)
00086 {
00087 pOff = hits[i].probeOffset;
00088 tOff = hits[i].targetOffset;
00089 thisDiff = pOff - tOff;
00090 if (!startedExon)
00091 {
00092 startedExon = TRUE;
00093 hits[i].isLumped = TRUE;
00094 exon->probeStart = pOff;
00095 exon->probeEnd = pOff + caTileSize;
00096 exon->targetStart = tOff;
00097 exon->targetEnd = tOff + caTileSize;
00098 exon->hitCount = 1;
00099 diagDiff = thisDiff;
00100 }
00101 else if (diagDiff - 2 <= thisDiff && thisDiff <= diagDiff + 2
00102 && exon->probeEnd - 2 <= pOff && pOff <= exon->probeEnd + 48
00103 && exon->targetEnd - 2 <= tOff && tOff <= exon->targetEnd + 48)
00104 {
00105 hits[i].isLumped = TRUE;
00106 exon->probeEnd = pOff + caTileSize;
00107 exon->targetEnd = tOff + caTileSize;
00108 exon->hitCount += 1;
00109 }
00110 else if (tOff > exon->targetEnd + 48)
00111 break;
00112 }
00113 }
00114 if (!startedExon)
00115 break;
00116 ++exonCount;
00117 ++exon;
00118 }
00119
00120
00121 if (caTileSize == 8)
00122 {
00123 struct crudeExon *read = exons, *write = exons;
00124 int twoFerCount = 0;
00125 for (i=0; i<exonCount; ++i)
00126 {
00127 if (read->hitCount >= 2)
00128 {
00129 *write++ = *read++;
00130 ++twoFerCount;
00131 }
00132 else
00133 ++read;
00134 }
00135 exonCount = twoFerCount;
00136 }
00137 return exonCount;
00138 }
00139
00140 static int lumpExonsIntoGenes(struct crudeExon *exons, int exonCount,
00141 struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
00142
00143 {
00144 int i;
00145 struct crudeGene *gene = genes;
00146 int geneCount = 0;
00147 int firstNonLumped = 0;
00148 int targetGapSpan = (caIsCdna ? 25000 : 200);
00149 int probeGapSpan = (caIsCdna ? 64 : 200);
00150
00151 for (i=0; i<exonCount; ++i)
00152 exons[i].isLumped = FALSE;
00153
00154 for (;;)
00155 {
00156 boolean startedGene = FALSE;
00157 int lastDiff = 0;
00158 int tOff, pOff;
00159 int thisDiff;
00160 int exonsInThis = 0;
00161
00162 for ( ; firstNonLumped < exonCount; ++firstNonLumped)
00163 {
00164 if (!exons[firstNonLumped].isLumped)
00165 break;
00166 }
00167 for (i=firstNonLumped; i<exonCount; ++i)
00168 {
00169 if (!exons[i].isLumped)
00170 {
00171 tOff = exons[i].targetStart;
00172 pOff = exons[i].probeStart;
00173 thisDiff = tOff - pOff;
00174 if (!startedGene)
00175 {
00176 startedGene = TRUE;
00177 exons[i].isLumped = TRUE;
00178 lastDiff = thisDiff;
00179 exonsInThis = 1;
00180 gene->target = target;
00181 gene->isRc = isRc;
00182 gene->probeStart = exons[i].probeStart;
00183 gene->probeEnd = exons[i].probeEnd;
00184 gene->targetStart = exons[i].targetStart;
00185 gene->targetEnd = exons[i].targetEnd;
00186 gene->score = exons[i].hitCount * exons[i].hitCount;
00187 }
00188 else if (gene->targetEnd - 10 <= tOff && tOff <= gene->targetEnd + targetGapSpan
00189 && gene->probeEnd - 10 <= pOff && pOff <= gene->probeEnd + probeGapSpan
00190 && lastDiff - 2 <= thisDiff)
00191 {
00192 exons[i].isLumped = TRUE;
00193 gene->targetEnd = exons[i].targetEnd;
00194 gene->probeEnd = exons[i].probeEnd;
00195 gene->score += exons[i].hitCount * exons[i].hitCount;
00196 exonsInThis += 1;
00197 lastDiff = thisDiff;
00198 }
00199 else if (tOff > gene->targetEnd + targetGapSpan)
00200 break;
00201 }
00202 }
00203 if (!startedGene)
00204 break;
00205 gene->score += exonsInThis;
00206 ++geneCount;
00207 ++gene;
00208 }
00209 return geneCount;
00210 }
00211
00212 static int copyExonsIntoGenes(struct crudeExon *exons, int exonCount,
00213 struct crudeGene *genes, struct nt4Seq *target, boolean isRc)
00214
00215 {
00216 int i;
00217 for (i=0; i<exonCount; ++i)
00218 {
00219 genes[i].target = target;
00220 genes[i].isRc = isRc;
00221 genes[i].probeStart = exons[i].probeStart;
00222 genes[i].probeEnd = exons[i].probeEnd;
00223 genes[i].targetStart = exons[i].targetStart;
00224 genes[i].targetEnd = exons[i].targetEnd;
00225 genes[i].score = exons[i].hitCount * exons[i].hitCount;
00226 }
00227 return exonCount;
00228 }
00229
00230 static boolean makeGoodTile(DNA *dna, bits32 *retTile16, bits16 *retTile8, int tileSize)
00231
00232
00233 {
00234 int i;
00235 int repMod;
00236 int endRep;
00237 int maxRep = (tileSize / 2);
00238
00239
00240 for (i=0; i<tileSize; ++i)
00241 {
00242 if (ntVal[(int)dna[i]] < 0)
00243 return FALSE;
00244 }
00245
00246
00247 for (repMod = 1; repMod <= maxRep; repMod += 1)
00248 {
00249 boolean allSame = TRUE;
00250 endRep = tileSize - repMod;
00251 for (i=0; i<endRep; ++i)
00252 {
00253 if (dna[i] != dna[i+repMod])
00254 {
00255 allSame = FALSE;
00256 break;
00257 }
00258 }
00259 if (allSame)
00260 return FALSE;
00261 }
00262
00263 if (tileSize > 8)
00264 *retTile16 = packDna16(dna);
00265 else
00266 *retTile8 = packDna8(dna);
00267
00268 return TRUE;
00269 }
00270
00271 struct probeTile
00272
00273 {
00274 struct probeTile *next;
00275 int offset;
00276 bits32 tile16;
00277 bits16 tile8;
00278 };
00279
00280 #define tileHashBits 16
00281 #define tileHashSize (1<<tileHashBits)
00282 #define tileHashMask (tileHashSize-1)
00283 #define tileHashFunc(tile) ((tile)&tileHashMask)
00284
00285 static struct probeTile **makeTileHash()
00286 {
00287 struct probeTile **table;
00288 int tableSize = tileHashSize * sizeof(table[0]);
00289
00290 table = needLargeMem(tableSize);
00291 memset(table, 0, tableSize);
00292 return table;
00293 }
00294
00295
00296 struct fastProber
00297
00298 {
00299 struct probeTile **hash;
00300 struct probeTile *probes;
00301 int probeSize;
00302 };
00303
00304 static struct fastProber *makeFastProber(DNA *probeDna, int probeSize)
00305 {
00306 int maxProbeCount = probeSize - caTileSize + 1;
00307 int probeCount = 0;
00308 struct probeTile *probes;
00309 struct probeTile *probe;
00310 struct probeTile **hash;
00311 int i;
00312 struct fastProber *fp;
00313
00314 if (maxProbeCount <= 0)
00315 return NULL;
00316 AllocVar(fp);
00317 fp->hash = hash = makeTileHash();
00318 fp->probeSize = probeSize;
00319 fp->probes = probes = needMem(maxProbeCount * sizeof(probes[0]) );
00320 for (i=0; i<maxProbeCount; ++i)
00321 {
00322 probe = &probes[probeCount];
00323 if (makeGoodTile(probeDna+i, &probe->tile16, &probe->tile8, caTileSize))
00324 {
00325 int hashIx;
00326 probe->tile8 &= wobbleMask;
00327 hashIx = (caTileSize > 8 ? tileHashFunc(probe->tile16) : tileHashFunc(probe->tile8));
00328 probe->offset = i;
00329 probe->next = hash[hashIx];
00330 hash[hashIx] = probe;
00331 ++probeCount;
00332 }
00333 }
00334 return fp;
00335 }
00336
00337 static void freeFastProber(struct fastProber **pFp)
00338 {
00339 struct fastProber *fp = *pFp;
00340 if (fp != NULL)
00341 {
00342 freeMem(fp->probes);
00343 freeMem(fp->hash);
00344 freez(pFp);
00345 }
00346 }
00347
00348 static int makeIndividualHits16(struct probeTile **hash, bits32 *bases, int baseOffset, int baseWordCount,
00349 struct crudeHit *hits, int maxHitCount, int hitCount)
00350
00351 {
00352 struct probeTile *probe;
00353 int i;
00354 for (i=0; i<baseWordCount; ++i)
00355 {
00356 if ((probe = hash[tileHashFunc(bases[i])]) != NULL)
00357 {
00358 do
00359 {
00360 if (bases[i] == probe->tile16)
00361 {
00362 if (hitCount >= maxHitCount)
00363 {
00364 warn("Too many hits, only taking first %d", maxHitCount);
00365 goto END_HIT_LOOP;
00366 }
00367 hits[hitCount].probeOffset = probe->offset;
00368 hits[hitCount].targetOffset = ((i+baseOffset) * caTileSize);
00369 ++hitCount;
00370 break;
00371 }
00372 probe = probe->next;
00373 }
00374 while (probe != NULL);
00375 }
00376 }
00377 END_HIT_LOOP:
00378 return hitCount;
00379 }
00380
00381 static int makeHits16(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00382 int maxHitCount)
00383
00384 {
00385 bits32 *bases = target->bases;
00386 struct probeTile **hash = fp->hash;
00387 unsigned long acc;
00388 int hitCount = 0;
00389 int i;
00390 int baseWordCount = (target->baseCount/caTileSize);
00391 bits32 *endBases = bases + baseWordCount;
00392 int chunkSize = 8;
00393 int chunkCount = baseWordCount/chunkSize;
00394 int baseOffset = 0;
00395
00396 for (i=0; i<chunkCount; ++i)
00397 {
00398 acc = ((unsigned long)(hash[tileHashFunc(bases[0])])
00399 | (unsigned long)(hash[tileHashFunc(bases[1])])
00400 | (unsigned long)(hash[tileHashFunc(bases[2])])
00401 | (unsigned long)(hash[tileHashFunc(bases[3])])
00402 | (unsigned long)(hash[tileHashFunc(bases[4])])
00403 | (unsigned long)(hash[tileHashFunc(bases[5])])
00404 | (unsigned long)(hash[tileHashFunc(bases[6])])
00405 | (unsigned long)(hash[tileHashFunc(bases[7])]));
00406 if (acc)
00407 {
00408 hitCount = makeIndividualHits16(hash, bases, baseOffset, chunkSize, hits, maxHitCount, hitCount);
00409 if (hitCount >= maxHitCount)
00410 break;
00411 }
00412 bases += chunkSize;
00413 baseOffset += chunkSize;
00414 }
00415 if (bases != endBases)
00416 hitCount = makeIndividualHits16(hash, bases, baseOffset, endBases-bases, hits, maxHitCount, hitCount);
00417 return hitCount;
00418 }
00419
00420 void squeezeHits8(struct fastProber *fp,
00421 short *compactDiagBuf, int compactDiagBufByteSize,
00422 int startToff, int lastCompactOff, int lastCompareOff,
00423 struct crudeHit *hits, int lastCompactedHit, int hitCount,
00424 int *retLastCompactedHit, int *retHitCount)
00425
00426
00427 {
00428 int diagOffset = fp->probeSize+2;
00429 int diag;
00430 int i;
00431 struct crudeHit *hit, *writeHit;
00432 int compactDiagBufArraySize = compactDiagBufByteSize/sizeof(short);
00433
00434 memset(compactDiagBuf, 0, compactDiagBufByteSize);
00435 for (i=lastCompactedHit; i<hitCount; ++i)
00436 {
00437 hit = hits+i;
00438 diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00439 assert(diag >= 0 && diag < compactDiagBufArraySize);
00440 compactDiagBuf[diag] += 1;
00441 }
00442 writeHit = hits+lastCompactedHit;
00443 for (i=lastCompactedHit; i<hitCount; ++i)
00444 {
00445 hit = hits+i;
00446 if (hit->targetOffset >= lastCompactOff)
00447 break;
00448 diag = (hit->targetOffset - startToff) - hit->probeOffset + diagOffset;
00449 if (compactDiagBuf[diag] > 1)
00450 *writeHit++ = *hit;
00451 }
00452 *retLastCompactedHit = (writeHit - hits);
00453 for (;i<hitCount; ++i)
00454 {
00455 hit = hits+i;
00456 *writeHit++ = *hit;
00457 }
00458 *retHitCount = (writeHit - hits);
00459 }
00460
00461
00462 static int makeHits8(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00463 int maxHitCount)
00464
00465 {
00466 bits16 *bases = (bits16*)target->bases;
00467 struct probeTile **hash = fp->hash;
00468 struct probeTile *pbt;
00469 int hitCount = 0;
00470 int i;
00471 int baseWordCount = (target->baseCount/caTileSize);
00472 int tOffset = 0;
00473
00474
00475 bits32 endianTest = 0x12345678;
00476 bits16 *endianPt = (bits16*)(void*)(&endianTest);
00477 boolean needSwap = (*endianPt == 0x5678);
00478 int swapOffset[2];
00479 int swapIx = 0;
00480
00481
00482
00483 int compactWindowSizePower = 10;
00484 int compactWindowSize = (1<<compactWindowSizePower);
00485 int compactModMask = compactWindowSize-1;
00486 int compactOverlap = 50;
00487 int lastCompactedHit = 0;
00488 int compactBufIntSize = (compactWindowSize+compactOverlap+fp->probeSize+4);
00489 int compactBufSize = (compactBufIntSize*sizeof(short));
00490 short *compactDiagBuf = needLargeMem(compactBufSize);
00491
00492 if (needSwap)
00493 {
00494 swapOffset[0] = 8;
00495 swapOffset[1] = -8;
00496 }
00497 else
00498 {
00499 swapOffset[0] = swapOffset[1] = 0;
00500 }
00501
00502 assert(tileHashBits == (8*sizeof(bits16)));
00503 for (i=0; i<baseWordCount; ++i)
00504 {
00505 if ((pbt = hash[*bases++ & wobbleMask]) != NULL)
00506 {
00507 if (hitCount < maxHitCount)
00508 {
00509 hits[hitCount].probeOffset = pbt->offset;
00510 hits[hitCount].targetOffset = tOffset + swapOffset[swapIx];
00511 ++hitCount;
00512 }
00513 else
00514 {
00515 warn("Got too many hits, only keeping first %d\n", hitCount);
00516 break;
00517 }
00518 }
00519 tOffset += caTileSize;
00520 if ((tOffset&compactModMask) == 0)
00521 {
00522 int startToff = tOffset - compactWindowSize - compactOverlap;
00523 int lastCompactOff = startToff + compactWindowSize;
00524 int lastCompareOff = tOffset;
00525
00526 squeezeHits8(fp, compactDiagBuf, compactBufSize,
00527 startToff, lastCompactOff, lastCompareOff,
00528 hits, lastCompactedHit, hitCount, &lastCompactedHit, &hitCount);
00529 }
00530 swapIx = 1 - swapIx;
00531 }
00532 freeMem(compactDiagBuf);
00533 return hitCount;
00534 }
00535
00536 static int cmpHitsTargetFirst(const void *va, const void *vb)
00537
00538
00539 {
00540 struct crudeHit *a = (struct crudeHit *)va;
00541 struct crudeHit *b = (struct crudeHit *)vb;
00542 long diff;
00543 if ((diff = a->targetOffset - b->targetOffset) != 0)
00544 return diff;
00545 return a->probeOffset - b->probeOffset;
00546 }
00547
00548 static int cmpExonsTargetFirst(const void *va, const void *vb)
00549
00550
00551 {
00552 struct crudeExon *a = (struct crudeExon *)va;
00553 struct crudeExon *b = (struct crudeExon *)vb;
00554 long diff;
00555 if ((diff = a->targetStart - b->targetStart) != 0)
00556 return diff;
00557 return a->probeStart - b->probeStart;
00558 }
00559
00560
00561 #define maxHitsAtOnce 120000
00562
00563
00564
00565
00566 static int makeHits(struct fastProber *fp, struct nt4Seq *target, struct crudeHit *hits,
00567 int maxHitCount)
00568
00569 {
00570 if (caTileSize == 8)
00571 return makeHits8(fp, target, hits, maxHitCount);
00572 else if (caTileSize == 16)
00573 return makeHits16(fp, target, hits, maxHitCount);
00574 else
00575 {
00576 errAbort("Can only do tile sizes of 8 or 16 in makeHits");
00577 return 0;
00578 }
00579 }
00580
00581 static int findCrudeGenes(struct fastProber *fp, struct nt4Seq *target,
00582 struct crudeGene *genes, int maxGeneCount, boolean isRc)
00583
00584 {
00585 struct crudeHit *hits;
00586 struct crudeExon *exons;
00587 int hitCount, exonCount, geneCount = 0;
00588
00589 if (fp == NULL)
00590 return 0;
00591 assert(maxGeneCount >= maxHitsAtOnce);
00592 hits = needMem(maxHitsAtOnce*sizeof(*hits));
00593 exons = needMem(maxHitsAtOnce*sizeof(*exons));
00594 hitCount = makeHits(fp, target, hits, maxHitsAtOnce);
00595
00596 if (hitCount > 0)
00597 {
00598 qsort(hits, hitCount, sizeof(hits[0]), cmpHitsTargetFirst);
00599 exonCount = lumpHitsIntoExons(hits, hitCount, exons);
00600 qsort(exons, exonCount, sizeof(exons[0]), cmpExonsTargetFirst);
00601 if (caIsCdna)
00602 geneCount = lumpExonsIntoGenes(exons, exonCount, genes, target, isRc);
00603 else
00604 geneCount = copyExonsIntoGenes(exons, exonCount, genes, target, isRc);
00605 }
00606 freeMem(hits);
00607 freeMem(exons);
00608 return geneCount;
00609 }
00610
00611 static int cmpGenesByScore(const void *va, const void *vb)
00612
00613 {
00614 struct crudeGene *a = (struct crudeGene *)va;
00615 struct crudeGene *b = (struct crudeGene *)vb;
00616 return b->score - a->score;
00617 }
00618
00619 static int countGenesBetter(struct crudeGene *genes, int geneCount, int cutoff)
00620
00621 {
00622 int i;
00623 for (i=0; i<geneCount; ++i)
00624 {
00625 if (genes[i].score <= cutoff)
00626 break;
00627 }
00628 return i;
00629 }
00630
00631 static int filterPoorGenes(struct crudeGene *genes, int geneCount)
00632
00633
00634
00635 {
00636 int bestScore, worstScore;
00637 int newGeneCount;
00638 int halfGeneCount = geneCount/2;
00639
00640 qsort(genes, geneCount, sizeof(genes[0]), cmpGenesByScore);
00641 bestScore = genes[0].score;
00642 worstScore = genes[geneCount-1].score;
00643
00644
00645
00646 if (bestScore == genes[halfGeneCount].score)
00647 {
00648 warn("Bunches of genes, all scoring the same. Program is "
00649 "throwing out half out of necessity.\n");
00650 return geneCount/2;
00651 }
00652
00653
00654 newGeneCount = geneCount;
00655 while (newGeneCount > halfGeneCount)
00656 {
00657 worstScore = genes[newGeneCount-1].score;
00658 newGeneCount = countGenesBetter(genes, newGeneCount, worstScore);
00659 }
00660 return newGeneCount;
00661 }
00662
00663 static int chromIx(struct nt4Seq *one, struct nt4Seq **chrome, int chromeCount)
00664 {
00665 int i;
00666 for (i=0; i<chromeCount; ++i)
00667 {
00668 if (one == chrome[i])
00669 return i;
00670 }
00671 assert(FALSE);
00672 return -1;
00673 }
00674
00675 static int countPrettyGood(struct crudeGene *crudeGenes, int geneCount, int fraction, int minScore)
00676
00677
00678 {
00679 int count = 0;
00680 int i;
00681 int threshold = crudeGenes->score;
00682
00683 threshold = (threshold + fraction/2)/fraction;
00684 if (threshold < minScore)
00685 threshold = minScore;
00686 for (i=0; i<geneCount; ++i)
00687 {
00688 if (crudeGenes[i].score >= threshold)
00689 ++count;
00690 else
00691 break;
00692 }
00693 return count;
00694 }
00695
00696 static int wormDnaMatchScores[256][256];
00697
00698 void initWormDnaMatchScores()
00699
00700 {
00701 int i,j;
00702 static boolean initted = FALSE;
00703
00704 if (initted) return;
00705 initted = TRUE;
00706 for (i=0; i<256; ++i)
00707 for (j=0; j<256; ++j)
00708 wormDnaMatchScores[i][j] = -4;
00709 wormDnaMatchScores['a']['a'] = 2;
00710 wormDnaMatchScores['t']['t'] = 2;
00711 wormDnaMatchScores['A']['A'] = 2;
00712 wormDnaMatchScores['T']['T'] = 2;
00713 wormDnaMatchScores['c']['c'] = 3;
00714 wormDnaMatchScores['g']['g'] = 3;
00715 wormDnaMatchScores['C']['C'] = 3;
00716 wormDnaMatchScores['G']['G'] = 3;
00717 }
00718
00719 #ifdef OLD
00720 void scoreNoninsertingExtensions(struct crudeGene *crudeGene, DNA *probe, int probeSize)
00721
00722
00723 {
00724 int i;
00725 int pStart = crudeGene->probeStart;
00726 int size = crudeGene->probeEnd - pStart;
00727 int score = 0;
00728 DNA *target = nt4Unpack(crudeGene->target, crudeGene->targetStart, size);
00729 DNA p,t;
00730 probe += pStart;
00731 for (i=0; i < size; ++i)
00732 {
00733 p = probe[i];
00734 t = target[i];
00735 score += wormDnaMatchScores[p][t];
00736 }
00737 freeMem(target);
00738 }
00739 #endif
00740
00741 void scoreNoninsertingExtensions(struct crudeGene *crudeGene, DNA *probe, int probeSize)
00742
00743
00744 {
00745 int i, size;
00746 int score = 0, maxScore = 0;
00747 struct nt4Seq *targetChrom = crudeGene->target;
00748 DNA *target;
00749 DNA p,t;
00750
00751
00752
00753 int pTileStart = crudeGene->probeStart;
00754 int tTileStart = crudeGene->targetStart;
00755 int pStart = 0, pEnd = probeSize;
00756 int tStart = tTileStart - pTileStart;
00757 int tEnd = tStart + probeSize;
00758 if (tStart < 0)
00759 {
00760 pStart -= tStart;
00761 tStart = 0;
00762 }
00763 if (tEnd > targetChrom->baseCount)
00764 {
00765 int diff = tEnd - targetChrom->baseCount;
00766 tEnd -= diff;
00767 pEnd -= diff;
00768 }
00769 size = tEnd - tStart;
00770 if (crudeGene->isRc)
00771 reverseComplement(probe, probeSize);
00772 if (size > 0)
00773 {
00774 target = nt4Unpack(targetChrom, tStart, size);
00775 for (i = 0; i<size; ++i)
00776 {
00777 assert(i + pStart < probeSize);
00778 p = probe[i + pStart];
00779 t = target[i];
00780 score += wormDnaMatchScores[(int)p][(int)t];
00781 if (score < 0)
00782 score = 0;
00783 if (score > maxScore)
00784 maxScore = score;
00785 }
00786 freeMem(target);
00787 }
00788 crudeGene->score = maxScore;
00789 if (crudeGene->isRc)
00790 reverseComplement(probe, probeSize);
00791 }
00792
00793
00794 struct crudeAli *crudeAliFind(DNA *probe, int probeSize,
00795 struct nt4Seq **chrome, int chromeCount, int tileSize, int minScore)
00796
00797 {
00798 int oneGeneCount;
00799 int i;
00800 int chromeIx;
00801 int geneCount = 0;
00802 int maxGeneCount = 2*maxHitsAtOnce;
00803 int isRc;
00804 struct nt4Seq *target;
00805 struct crudeGene *crudeGenes = needMem(maxGeneCount*sizeof(*crudeGenes));
00806 struct fastProber *fps[2];
00807 struct crudeAli *aliList = NULL, *ali;
00808 int goodGeneCount;
00809
00810 initWormDnaMatchScores();
00811 caTileSize = tileSize;
00812 caIsCdna = (tileSize >= 14);
00813 fps[0] = makeFastProber(probe, probeSize);
00814 reverseComplement(probe, probeSize);
00815 fps[1] = makeFastProber(probe, probeSize);
00816 reverseComplement(probe, probeSize);
00817 for (chromeIx = 0; chromeIx < chromeCount; ++chromeIx)
00818 {
00819 target = chrome[chromeIx];
00820 for (isRc = 0; isRc <= 1; ++isRc)
00821 {
00822
00823 oneGeneCount = findCrudeGenes(fps[isRc], target,
00824 crudeGenes+geneCount, maxGeneCount-geneCount, isRc);
00825
00826
00827 geneCount += oneGeneCount;
00828 if (maxGeneCount - geneCount < maxHitsAtOnce)
00829 geneCount = filterPoorGenes(crudeGenes, geneCount);
00830 }
00831 if (maxGeneCount - geneCount < maxHitsAtOnce)
00832 break;
00833 }
00834
00835
00836 {
00837 int bestScore, worstScore;
00838 qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00839 bestScore = crudeGenes[0].score;
00840 worstScore = crudeGenes[geneCount-1].score;
00841 if (bestScore > worstScore)
00842 {
00843 int cutOff = bestScore/10;
00844 goodGeneCount = geneCount = countGenesBetter(crudeGenes, geneCount, cutOff);
00845 }
00846 else if (bestScore >= minScore)
00847 goodGeneCount = geneCount;
00848 else
00849 goodGeneCount = 0;
00850 }
00851
00852 if (tileSize == 8)
00853 {
00854 goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, 4);
00855 geneCount = goodGeneCount;
00856 for (i=0; i<geneCount; ++i)
00857 scoreNoninsertingExtensions(crudeGenes+i, probe, probeSize);
00858 qsort(crudeGenes, geneCount, sizeof(crudeGenes[0]), cmpGenesByScore);
00859 goodGeneCount = countPrettyGood(crudeGenes, geneCount, 4, minScore);
00860 }
00861 for (i=0; i<goodGeneCount; ++i)
00862 {
00863 AllocVar(ali);
00864 ali->chromIx = chromIx(crudeGenes[i].target, chrome, chromeCount);
00865 ali->start = crudeGenes[i].targetStart;
00866 ali->end = crudeGenes[i].targetEnd;
00867 ali->score = crudeGenes[i].score;
00868 ali->strand = (crudeGenes[i].isRc ? '-' : '+');
00869 slAddHead(&aliList, ali);
00870 }
00871 slReverse(&aliList);
00872 freeFastProber(&fps[0]);
00873 freeFastProber(&fps[1]);
00874 freeMem(crudeGenes);
00875
00876 return aliList;
00877 }
00878