#include "common.h"#include "memalloc.h"#include "linefile.h"#include "bits.h"#include "hash.h"#include "dnautil.h"#include "dnaseq.h"#include "fa.h"#include "nib.h"#include "twoBit.h"#include "psl.h"#include "sig.h"#include "options.h"#include "obscure.h"#include "genoFind.h"#include "trans3.h"#include "gfClientLib.h"Include dependency graph for blat.c:

Go to the source code of this file.
Enumerations | |
| enum | constants { qWarnSize = 5000000 } |
Functions | |
| void | usage () |
| void | searchOneStrand (struct dnaSeq *seq, struct genoFind *gf, FILE *psl, boolean isRc, struct hash *maskHash, Bits *qMaskBits) |
| void | searchOneProt (aaSeq *seq, struct genoFind *gf, FILE *f) |
| void | dotOut () |
| void | searchOne (bioSeq *seq, struct genoFind *gf, FILE *f, boolean isProt, struct hash *maskHash, Bits *qMaskBits) |
| void | trimSeq (struct dnaSeq *seq, struct dnaSeq *trimmed) |
| Bits * | maskQuerySeq (struct dnaSeq *seq, boolean isProt, boolean maskQuery, boolean lcMask) |
| void | searchOneMaskTrim (struct dnaSeq *seq, boolean isProt, struct genoFind *gf, FILE *outFile, struct hash *maskHash, unsigned long *retTotalSize, int *retCount) |
| void | searchOneIndex (int fileCount, char *files[], struct genoFind *gf, char *outName, boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus) |
| trans3 * | seqListToTrans3List (struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash) |
| void | tripleSearch (aaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, FILE *f) |
| void | transTripleSearch (struct dnaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, boolean qIsDna, FILE *f) |
| void | bigBlat (struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile, boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus) |
| void | blat (char *dbFile, char *queryFile, char *outName) |
| int | main (int argc, char *argv[]) |
Variables | |
| static char const | rcsid [] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $" |
| char * | databaseName |
| int | databaseSeqCount = 0 |
| unsigned long | databaseLetters = 0 |
| int | tileSize = 11 |
| int | stepSize = 0 |
| int | minMatch = 2 |
| int | minScore = 30 |
| int | maxGap = 2 |
| int | repMatch = 1024*4 |
| int | dotEvery = 0 |
| boolean | oneOff = FALSE |
| boolean | noHead = FALSE |
| boolean | trimA = FALSE |
| boolean | trimHardA = FALSE |
| boolean | trimT = FALSE |
| boolean | fastMap = FALSE |
| char * | makeOoc = NULL |
| char * | ooc = NULL |
| enum gfType | qType = gftDna |
| enum gfType | tType = gftDna |
| char * | mask = NULL |
| char * | repeats = NULL |
| char * | qMask = NULL |
| double | minRepDivergence = 15 |
| double | minIdentity = 90 |
| char * | outputFormat = "psl" |
| optionSpec | options [] |
| gfOutput * | gvo |
| enum constants |
| void bigBlat | ( | struct dnaSeq * | untransList, | |
| int | queryCount, | |||
| char * | queryFiles[], | |||
| char * | outFile, | |||
| boolean | transQuery, | |||
| boolean | qIsDna, | |||
| FILE * | out, | |||
| boolean | showStatus | |||
| ) |
Definition at line 439 of file blat.c.
References carefulClose(), dnaSeq::dna, dotOut(), FALSE, faMixedSpeedReadNext(), gfOutput::fileHead, freeHash(), genoFindFree(), gfIndexSeq(), gfOutputQuery(), gvo, trans3::isRc, lineFileClose(), lineFileOpen(), maxGap, minMatch, dnaSeq::next, oneOff, ooc, qMask, qWarnSize, repMatch, reverseComplement(), sameString, seqListToTrans3List(), dnaSeq::size, slCount(), stepSize, tileSize, toggleCase(), toLowerN(), toUpperN(), trans3FreeList(), transTripleSearch(), trimSeq(), tripleSearch(), TRUE, upperToN(), warn(), and ZeroVar.
Referenced by blat().
00441 { 00442 int frame, i; 00443 struct dnaSeq *seq, trimmedSeq; 00444 struct genoFind *gfs[3]; 00445 aaSeq *dbSeqLists[3]; 00446 struct trans3 *t3List = NULL; 00447 int isRc; 00448 struct lineFile *lf = NULL; 00449 struct hash *t3Hash = NULL; 00450 boolean forceUpper = FALSE; 00451 boolean forceLower = FALSE; 00452 boolean toggle = FALSE; 00453 boolean maskUpper = FALSE; 00454 00455 ZeroVar(&trimmedSeq); 00456 if (showStatus) 00457 printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount); 00458 00459 /* Figure out how to manage query case. Proteins want to be in 00460 * upper case, generally, nucleotides in lower case. But there 00461 * may be repeatMasking based on case as well. */ 00462 if (transQuery) 00463 { 00464 if (qMask == NULL) 00465 forceLower = TRUE; 00466 else 00467 { 00468 maskUpper = TRUE; 00469 toggle = !sameString(qMask, "upper"); 00470 } 00471 } 00472 else 00473 { 00474 forceUpper = TRUE; 00475 } 00476 00477 if (gvo->fileHead != NULL) 00478 gvo->fileHead(gvo, out); 00479 00480 for (isRc = FALSE; isRc <= 1; ++isRc) 00481 { 00482 /* Initialize local pointer arrays to NULL to prevent surprises. */ 00483 for (frame = 0; frame < 3; ++frame) 00484 { 00485 gfs[frame] = NULL; 00486 dbSeqLists[frame] = NULL; 00487 } 00488 00489 t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash); 00490 for (frame = 0; frame < 3; ++frame) 00491 { 00492 gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize, 00493 repMatch, ooc, TRUE, oneOff, FALSE, stepSize); 00494 } 00495 00496 for (i=0; i<queryCount; ++i) 00497 { 00498 aaSeq qSeq; 00499 00500 lf = lineFileOpen(queryFiles[i], TRUE); 00501 while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name)) 00502 { 00503 dotOut(); 00504 /* Put it into right case and optionally mask on case. */ 00505 if (forceLower) 00506 toLowerN(qSeq.dna, qSeq.size); 00507 else if (forceUpper) 00508 toUpperN(qSeq.dna, qSeq.size); 00509 else if (maskUpper) 00510 { 00511 if (toggle) 00512 toggleCase(qSeq.dna, qSeq.size); 00513 upperToN(qSeq.dna, qSeq.size); 00514 } 00515 if (qSeq.size > qWarnSize) 00516 { 00517 warn("Query sequence %s has size %d, it might take a while.", 00518 qSeq.name, qSeq.size); 00519 } 00520 trimSeq(&qSeq, &trimmedSeq); 00521 if (transQuery) 00522 transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out); 00523 else 00524 tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out); 00525 gfOutputQuery(gvo, out); 00526 } 00527 lineFileClose(&lf); 00528 } 00529 00530 /* Clean up time. */ 00531 trans3FreeList(&t3List); 00532 freeHash(&t3Hash); 00533 for (frame = 0; frame < 3; ++frame) 00534 { 00535 genoFindFree(&gfs[frame]); 00536 } 00537 00538 for (seq = untransList; seq != NULL; seq = seq->next) 00539 { 00540 reverseComplement(seq->dna, seq->size); 00541 } 00542 } 00543 carefulClose(&out); 00544 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void blat | ( | char * | dbFile, | |
| char * | queryFile, | |||
| char * | outName | |||
| ) |
Definition at line 547 of file blat.c.
References bigBlat(), databaseLetters, databaseName, databaseSeqCount, dotEvery, errAbort(), FALSE, freeDnaSeqList(), freeHash(), gfClientFileArray(), gfClientSeqList(), gfClientUnmask(), gfIndexSeq(), gfMakeOoc(), gfOutputAny(), gftDna, gftDnaX, gftProt, gftRna, gftRnaX, gvo, hashAdd(), makeOoc, mask, maskFromUpperCaseSeq(), maxGap, minIdentity, minMatch, minRepDivergence, mustOpen(), dnaSeq::name, newHash(), dnaSeq::next, noHead, oneOff, ooc, outputFormat, qType, repeats, repMatch, searchOneIndex(), dnaSeq::size, slCount(), stepSize, tileSize, TRUE, and tType.
Referenced by main().
00549 { 00550 char **dbFiles, **queryFiles; 00551 int dbCount, queryCount; 00552 struct dnaSeq *dbSeqList, *seq; 00553 struct genoFind *gf; 00554 boolean tIsProt = (tType == gftProt); 00555 boolean qIsProt = (qType == gftProt); 00556 boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna)); 00557 boolean bothSimpleProt = (tIsProt && qIsProt); 00558 FILE *f = mustOpen(outName, "w"); 00559 boolean showStatus = (f != stdout); 00560 00561 databaseName = dbFile; 00562 gfClientFileArray(dbFile, &dbFiles, &dbCount); 00563 if (makeOoc != NULL) 00564 { 00565 gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType); 00566 if (showStatus) 00567 printf("Done making %s\n", makeOoc); 00568 exit(0); 00569 } 00570 gfClientFileArray(queryFile, &queryFiles, &queryCount); 00571 dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, 00572 minRepDivergence, showStatus); 00573 databaseSeqCount = slCount(dbSeqList); 00574 for (seq = dbSeqList; seq != NULL; seq = seq->next) 00575 databaseLetters += seq->size; 00576 00577 gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, 00578 databaseName, databaseSeqCount, databaseLetters, minIdentity, f); 00579 00580 if (bothSimpleNuc || bothSimpleProt) 00581 { 00582 struct hash *maskHash = NULL; 00583 00584 /* Save away masking info for output. */ 00585 if (repeats != NULL) 00586 { 00587 maskHash = newHash(0); 00588 for (seq = dbSeqList; seq != NULL; seq = seq->next) 00589 { 00590 Bits *maskedBits = maskFromUpperCaseSeq(seq); 00591 hashAdd(maskHash, seq->name, maskedBits); 00592 } 00593 } 00594 00595 /* Handle masking and indexing. If masking is off, we want the indexer 00596 * to see unmasked sequence, otherwise we want it to see masked. However 00597 * after indexing we always want it unmasked, because things are always 00598 * unmasked for the extension phase. */ 00599 if (mask == NULL && !bothSimpleProt) 00600 gfClientUnmask(dbSeqList); 00601 gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, 00602 tIsProt, oneOff, FALSE, stepSize); 00603 if (mask != NULL) 00604 gfClientUnmask(dbSeqList); 00605 00606 searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus); 00607 freeHash(&maskHash); 00608 } 00609 else if (tType == gftDnaX && qType == gftProt) 00610 { 00611 bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus); 00612 } 00613 else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX)) 00614 { 00615 bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus); 00616 } 00617 else 00618 { 00619 errAbort("Unrecognized combination of target and query types\n"); 00620 } 00621 if (dotEvery > 0) 00622 printf("\n"); 00623 freeDnaSeqList(&dbSeqList); 00624 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void dotOut | ( | ) |
Definition at line 215 of file blat.c.
References dotEvery.
Referenced by bigBlat(), and searchOne().
00217 { 00218 static int mod = 1; 00219 if (dotEvery > 0) 00220 { 00221 if (--mod <= 0) 00222 { 00223 fputc('.', stdout); 00224 fflush(stdout); 00225 mod = dotEvery; 00226 } 00227 } 00228 }
Here is the caller graph for this function:

| int main | ( | int | argc, | |
| char * | argv[] | |||
| ) |
Definition at line 626 of file blat.c.
References blat(), chopLine, cloneString(), differentString, dotEvery, errAbort(), FALSE, fastMap, ffIntronMaxDefault, gfCheckTileSize(), gftDna, gftDnaX, gftProt, gftRna, gftRnaX, gfTypeFromName(), makeOoc, mask, maxGap, minIdentity, minMatch, minRepDivergence, minScore, noHead, oneOff, ooc, optionExists(), optionFloat(), optionInit(), optionInt(), options, optionVal(), outputFormat, qMask, qType, repeats, repMatch, setFfExtendThroughN(), setFfIntronMax(), stepSize, tileSize, trimA, trimHardA, trimT, TRUE, tType, and usage().
00628 { 00629 boolean dIsProtLike, qIsProtLike; 00630 00631 #ifdef DEBUG 00632 { 00633 char *cmd = "blat hCrea.geno hCrea.mrna foo.psl -t=dnax -q=rnax"; 00634 char *words[16]; 00635 00636 printf("Debugging parameters\n"); 00637 cmd = cloneString(cmd); 00638 argc = chopLine(cmd, words); 00639 argv = words; 00640 } 00641 #endif /* DEBUG */ 00642 00643 optionInit(&argc, argv, options); 00644 if (argc != 4) 00645 usage(); 00646 00647 /* Get database and query sequence types and make sure they are 00648 * legal and compatable. */ 00649 if (optionExists("prot")) 00650 qType = tType = gftProt; 00651 if (optionExists("t")) 00652 tType = gfTypeFromName(optionVal("t", NULL)); 00653 trimA = optionExists("trimA") || optionExists("trima"); 00654 trimT = optionExists("trimT") || optionExists("trimt"); 00655 trimHardA = optionExists("trimHardA"); 00656 switch (tType) 00657 { 00658 case gftProt: 00659 case gftDnaX: 00660 dIsProtLike = TRUE; 00661 break; 00662 case gftDna: 00663 dIsProtLike = FALSE; 00664 break; 00665 default: 00666 dIsProtLike = FALSE; 00667 errAbort("Illegal value for 't' parameter"); 00668 break; 00669 } 00670 if (optionExists("q")) 00671 qType = gfTypeFromName(optionVal("q", NULL)); 00672 if (qType == gftRnaX || qType == gftRna) 00673 trimA = TRUE; 00674 if (optionExists("noTrimA")) 00675 trimA = FALSE; 00676 switch (qType) 00677 { 00678 case gftProt: 00679 case gftDnaX: 00680 case gftRnaX: 00681 minIdentity = 25; 00682 qIsProtLike = TRUE; 00683 break; 00684 default: 00685 qIsProtLike = FALSE; 00686 break; 00687 } 00688 if ((dIsProtLike ^ qIsProtLike) != 0) 00689 errAbort("d and q must both be either protein or dna"); 00690 00691 /* Set default tile size for protein-based comparisons. */ 00692 if (dIsProtLike) 00693 { 00694 tileSize = 5; 00695 minMatch = 1; 00696 oneOff = FALSE; 00697 maxGap = 0; 00698 } 00699 00700 /* Get tile size and related parameters from user and make sure 00701 * they are within range. */ 00702 tileSize = optionInt("tileSize", tileSize); 00703 stepSize = optionInt("stepSize", stepSize); 00704 minMatch = optionInt("minMatch", minMatch); 00705 oneOff = optionExists("oneOff"); 00706 fastMap = optionExists("fastMap"); 00707 minScore = optionInt("minScore", minScore); 00708 maxGap = optionInt("maxGap", maxGap); 00709 minRepDivergence = optionFloat("minRepDivergence", minRepDivergence); 00710 minIdentity = optionFloat("minIdentity", minIdentity); 00711 gfCheckTileSize(tileSize, dIsProtLike); 00712 if (minMatch < 0) 00713 errAbort("minMatch must be at least 1"); 00714 if (maxGap > 100) 00715 errAbort("maxGap must be less than 100"); 00716 00717 00718 00719 /* Set repMatch parameter from command line, or 00720 * to reasonable value that depends on tile size. */ 00721 if (optionExists("repMatch")) 00722 repMatch = optionInt("repMatch", repMatch); 00723 else 00724 { 00725 if (dIsProtLike) 00726 { 00727 if (tileSize == 3) 00728 repMatch = 600000; 00729 else if (tileSize == 4) 00730 repMatch = 30000; 00731 else if (tileSize == 5) 00732 repMatch = 1500; 00733 else if (tileSize == 6) 00734 repMatch = 75; 00735 else if (tileSize <= 7) 00736 repMatch = 10; 00737 } 00738 else 00739 { 00740 if (tileSize == 15) 00741 repMatch = 16; 00742 else if (tileSize == 14) 00743 repMatch = 32; 00744 else if (tileSize == 13) 00745 repMatch = 128; 00746 else if (tileSize == 12) 00747 repMatch = 256; 00748 else if (tileSize == 11) 00749 repMatch = 4*256; 00750 else if (tileSize == 10) 00751 repMatch = 16*256; 00752 else if (tileSize == 9) 00753 repMatch = 64*256; 00754 else if (tileSize == 8) 00755 repMatch = 256*256; 00756 else if (tileSize == 7) 00757 repMatch = 1024*256; 00758 else if (tileSize == 6) 00759 repMatch = 4*1024*256; 00760 } 00761 } 00762 00763 /* Gather last few command line options. */ 00764 noHead = optionExists("noHead"); 00765 ooc = optionVal("ooc", NULL); 00766 makeOoc = optionVal("makeOoc", NULL); 00767 mask = optionVal("mask", NULL); 00768 qMask = optionVal("qMask", NULL); 00769 repeats = optionVal("repeats", NULL); 00770 if (repeats != NULL && mask != NULL && differentString(repeats, mask)) 00771 errAbort("The -mask and -repeat settings disagree. " 00772 "You can just omit -repeat if -mask is on"); 00773 if (mask != NULL) /* Mask setting will also set repeats. */ 00774 repeats = mask; 00775 outputFormat = optionVal("out", outputFormat); 00776 dotEvery = optionInt("dots", 0); 00777 /* set global for fuzzy find functions */ 00778 setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault)); 00779 setFfExtendThroughN(optionExists("extendThroughN")); 00780 00781 00782 /* Call routine that does the work. */ 00783 blat(argv[1], argv[2], argv[3]); 00784 return 0; 00785 }
Here is the call graph for this function:

Definition at line 270 of file blat.c.
References dnaSeq::dna, faToDna(), faToProtein(), maskFromUpperCaseSeq(), dnaSeq::name, qWarnSize, dnaSeq::size, toggleCase(), verbose(), and warn().
Referenced by searchOneMaskTrim().
00275 { 00276 Bits *qMaskBits = NULL; 00277 verbose(2, "%s\n", seq->name); 00278 if (isProt) 00279 faToProtein(seq->dna, seq->size); 00280 else 00281 { 00282 if (maskQuery) 00283 { 00284 if (lcMask) 00285 toggleCase(seq->dna, seq->size); 00286 qMaskBits = maskFromUpperCaseSeq(seq); 00287 } 00288 faToDna(seq->dna, seq->size); 00289 } 00290 if (seq->size > qWarnSize) 00291 { 00292 warn("Query sequence %s has size %d, it might take a while.", 00293 seq->name, seq->size); 00294 } 00295 return qMaskBits; 00296 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void searchOne | ( | bioSeq * | seq, | |
| struct genoFind * | gf, | |||
| FILE * | f, | |||
| boolean | isProt, | |||
| struct hash * | maskHash, | |||
| Bits * | qMaskBits | |||
| ) |
Definition at line 230 of file blat.c.
References dnaSeq::dna, dotOut(), FALSE, gfOutputQuery(), gvo, gfOutput::maskHash, reverseComplement(), searchOneProt(), searchOneStrand(), dnaSeq::size, and TRUE.
Referenced by searchOneMaskTrim().
00232 { 00233 dotOut(); 00234 if (isProt) 00235 { 00236 searchOneProt(seq, gf, f); 00237 } 00238 else 00239 { 00240 gvo->maskHash = maskHash; 00241 searchOneStrand(seq, gf, f, FALSE, maskHash, qMaskBits); 00242 reverseComplement(seq->dna, seq->size); 00243 searchOneStrand(seq, gf, f, TRUE, maskHash, qMaskBits); 00244 reverseComplement(seq->dna, seq->size); 00245 } 00246 gfOutputQuery(gvo, f); 00247 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void searchOneIndex | ( | int | fileCount, | |
| char * | files[], | |||
| struct genoFind * | gf, | |||
| char * | outName, | |||
| boolean | isProt, | |||
| struct hash * | maskHash, | |||
| FILE * | outFile, | |||
| boolean | showStatus | |||
| ) |
Definition at line 316 of file blat.c.
References carefulClose(), cloneString(), dnaSeq::dna, dnaSeqFree, errAbort(), faMixedSpeedReadNext(), twoBitSpec::fileName, freeDnaSeq(), freez(), gfOutputHead(), gvo, twoBitFile::indexList, lineFileClose(), lineFileOpen(), dnaSeq::name, twoBitIndex::name, twoBitIndex::next, NIB_MASK_MIXED, nibIsFile(), nibLoadAllMasked(), searchOneMaskTrim(), twoBitSpec::seqs, dnaSeq::size, ss, TRUE, twoBitClose(), twoBitIsSpec(), twoBitOpen(), twoBitReadSeqFrag(), and twoBitSpecNew().
Referenced by blat().
00319 { 00320 int i; 00321 char *fileName; 00322 int count = 0; 00323 unsigned long totalSize = 0; 00324 00325 gfOutputHead(gvo, outFile); 00326 for (i=0; i<fileCount; ++i) 00327 { 00328 fileName = files[i]; 00329 if (nibIsFile(fileName)) 00330 { 00331 struct dnaSeq *seq; 00332 00333 if (isProt) 00334 errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName); 00335 seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); 00336 freez(&seq->name); 00337 seq->name = cloneString(fileName); 00338 searchOneMaskTrim(seq, isProt, gf, outFile, 00339 maskHash, &totalSize, &count); 00340 freeDnaSeq(&seq); 00341 } 00342 else if (twoBitIsSpec(fileName)) 00343 { 00344 struct twoBitSpec *tbs = twoBitSpecNew(fileName); 00345 struct twoBitFile *tbf = twoBitOpen(tbs->fileName); 00346 if (isProt) 00347 errAbort("%s is a two bit file, which doesn't work for proteins.", 00348 fileName); 00349 if (tbs->seqs != NULL) 00350 { 00351 struct twoBitSeqSpec *ss = NULL; 00352 for (ss = tbs->seqs; ss != NULL; ss = ss->next) 00353 { 00354 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name, 00355 ss->start, ss->end); 00356 searchOneMaskTrim(seq, isProt, gf, outFile, 00357 maskHash, &totalSize, &count); 00358 dnaSeqFree(&seq); 00359 } 00360 } 00361 else 00362 { 00363 struct twoBitIndex *index = NULL; 00364 for (index = tbf->indexList; index != NULL; index = index->next) 00365 { 00366 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0); 00367 searchOneMaskTrim(seq, isProt, gf, outFile, 00368 maskHash, &totalSize, &count); 00369 dnaSeqFree(&seq); 00370 } 00371 } 00372 twoBitClose(&tbf); 00373 } 00374 else 00375 { 00376 static struct dnaSeq seq; 00377 struct lineFile *lf = lineFileOpen(fileName, TRUE); 00378 while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) 00379 { 00380 searchOneMaskTrim(&seq, isProt, gf, outFile, 00381 maskHash, &totalSize, &count); 00382 } 00383 lineFileClose(&lf); 00384 } 00385 } 00386 carefulClose(&outFile); 00387 if (showStatus) 00388 printf("Searched %lu bases in %d sequences\n", totalSize, count); 00389 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void searchOneMaskTrim | ( | struct dnaSeq * | seq, | |
| boolean | isProt, | |||
| struct genoFind * | gf, | |||
| FILE * | outFile, | |||
| struct hash * | maskHash, | |||
| unsigned long * | retTotalSize, | |||
| int * | retCount | |||
| ) |
Definition at line 298 of file blat.c.
References bitFree(), maskQuerySeq(), qMask, sameWord, searchOne(), dnaSeq::size, trimSeq(), and ZeroVar.
Referenced by searchOneIndex().
00303 { 00304 boolean maskQuery = (qMask != NULL); 00305 boolean lcMask = (qMask != NULL && sameWord(qMask, "lower")); 00306 Bits *qMaskBits = maskQuerySeq(seq, isProt, maskQuery, lcMask); 00307 struct dnaSeq trimmedSeq; 00308 ZeroVar(&trimmedSeq); 00309 trimSeq(seq, &trimmedSeq); 00310 searchOne(&trimmedSeq, gf, outFile, isProt, maskHash, qMaskBits); 00311 *retTotalSize += seq->size; 00312 *retCount += 1; 00313 bitFree(&qMaskBits); 00314 }
Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 204 of file blat.c.
References FALSE, gfAlignAaClumps(), gfClumpFreeList(), gfFindClumps(), gvo, lm, lmCleanup(), lmInit(), and minScore.
Referenced by searchOne().
00206 { 00207 int hitCount; 00208 struct lm *lm = lmInit(0); 00209 struct gfClump *clumpList = gfFindClumps(gf, seq, lm, &hitCount); 00210 gfAlignAaClumps(gf, clumpList, seq, FALSE, minScore, gvo); 00211 gfClumpFreeList(&clumpList); 00212 lmCleanup(&lm); 00213 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void searchOneStrand | ( | struct dnaSeq * | seq, | |
| struct genoFind * | gf, | |||
| FILE * | psl, | |||
| boolean | isRc, | |||
| struct hash * | maskHash, | |||
| Bits * | qMaskBits | |||
| ) |
Definition at line 196 of file blat.c.
References fastMap, gfLongDnaInMem(), gvo, minScore, and optionExists().
Referenced by searchOne().
00199 { 00200 gfLongDnaInMem(seq, gf, isRc, minScore, qMaskBits, gvo, fastMap, optionExists("fine")); 00201 }
Here is the call graph for this function:

Here is the caller graph for this function:

| struct trans3* seqListToTrans3List | ( | struct dnaSeq * | seqList, | |
| aaSeq * | transLists[3], | |||
| struct hash ** | retHash | |||
| ) | [read] |
Definition at line 391 of file blat.c.
References hashAddUnique(), newHash(), dnaSeq::next, slAddHead, slReverse(), and trans3New().
Referenced by bigBlat().
00393 { 00394 int frame; 00395 struct dnaSeq *seq; 00396 struct trans3 *t3List = NULL, *t3; 00397 struct hash *hash = newHash(0); 00398 00399 for (seq = seqList; seq != NULL; seq = seq->next) 00400 { 00401 t3 = trans3New(seq); 00402 hashAddUnique(hash, t3->name, t3); 00403 slAddHead(&t3List, t3); 00404 for (frame = 0; frame < 3; ++frame) 00405 { 00406 slAddHead(&transLists[frame], t3->trans[frame]); 00407 } 00408 } 00409 slReverse(&t3List); 00410 for (frame = 0; frame < 3; ++frame) 00411 { 00412 slReverse(&transLists[frame]); 00413 } 00414 *retHash = hash; 00415 return t3List; 00416 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void transTripleSearch | ( | struct dnaSeq * | qSeq, | |
| struct genoFind * | gfs[3], | |||
| struct hash * | t3Hash, | |||
| boolean | dbIsRc, | |||
| boolean | qIsDna, | |||
| FILE * | f | |||
| ) |
Definition at line 425 of file blat.c.
References dnaSeq::dna, gfLongTransTransInMem(), gvo, minScore, gfOutput::reportTargetStrand, reverseComplement(), dnaSeq::size, and TRUE.
Referenced by bigBlat().
00428 { 00429 int qIsRc; 00430 gvo->reportTargetStrand = TRUE; 00431 for (qIsRc = 0; qIsRc <= qIsDna; qIsRc += 1) 00432 { 00433 gfLongTransTransInMem(qSeq, gfs, t3Hash, qIsRc, dbIsRc, !qIsDna, minScore, gvo); 00434 if (qIsDna) 00435 reverseComplement(qSeq->dna, qSeq->size); 00436 } 00437 }
Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 249 of file blat.c.
References dnaSeq::dna, maskHeadPolyT(), maskTailPolyA(), dnaSeq::size, trimA, trimHardA, and trimT.
Referenced by bigBlat(), and searchOneMaskTrim().
00252 { 00253 DNA *dna = seq->dna; 00254 int size = seq->size; 00255 *trimmed = *seq; 00256 if (trimT) 00257 maskHeadPolyT(dna, size); 00258 if (trimA || trimHardA) 00259 { 00260 int trimSize = maskTailPolyA(dna, size); 00261 if (trimHardA) 00262 { 00263 trimmed->size -= trimSize; 00264 dna[size-trimSize] = 0; 00265 } 00266 } 00267 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void tripleSearch | ( | aaSeq * | qSeq, | |
| struct genoFind * | gfs[3], | |||
| struct hash * | t3Hash, | |||
| boolean | dbIsRc, | |||
| FILE * | f | |||
| ) |
Definition at line 418 of file blat.c.
References gfFindAlignAaTrans(), gvo, minScore, gfOutput::reportTargetStrand, and TRUE.
Referenced by bigBlat().
00420 { 00421 gvo->reportTargetStrand = TRUE; 00422 gfFindAlignAaTrans(gfs, qSeq, t3Hash, dbIsRc, minScore, gvo); 00423 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void usage | ( | ) |
Definition at line 59 of file blat.c.
References ffIntronMaxDefault, and gfVersion.
Referenced by main().
00061 { 00062 printf( 00063 "blat - Standalone BLAT v. %s fast sequence search command line tool\n" 00064 "usage:\n" 00065 " blat database query [-ooc=11.ooc] output.psl\n" 00066 "where:\n" 00067 " database and query are each either a .fa , .nib or .2bit file,\n" 00068 " or a list these files one file name per line.\n" 00069 " -ooc=11.ooc tells the program to load over-occurring 11-mers from\n" 00070 " and external file. This will increase the speed\n" 00071 " by a factor of 40 in many cases, but is not required\n" 00072 " output.psl is where to put the output.\n" 00073 " Subranges of nib and .2bit files may specified using the syntax:\n" 00074 " /path/file.nib:seqid:start-end\n" 00075 " or\n" 00076 " /path/file.2bit:seqid:start-end\n" 00077 " or\n" 00078 " /path/file.nib:start-end\n" 00079 " With the second form, a sequence id of file:start-end will be used.\n" 00080 "options:\n" 00081 " -t=type Database type. Type is one of:\n" 00082 " dna - DNA sequence\n" 00083 " prot - protein sequence\n" 00084 " dnax - DNA sequence translated in six frames to protein\n" 00085 " The default is dna\n" 00086 " -q=type Query type. Type is one of:\n" 00087 " dna - DNA sequence\n" 00088 " rna - RNA sequence\n" 00089 " prot - protein sequence\n" 00090 " dnax - DNA sequence translated in six frames to protein\n" 00091 " rnax - DNA sequence translated in three frames to protein\n" 00092 " The default is dna\n" 00093 " -prot Synonymous with -t=prot -q=prot\n" 00094 " -ooc=N.ooc Use overused tile file N.ooc. N should correspond to \n" 00095 " the tileSize\n" 00096 " -tileSize=N sets the size of match that triggers an alignment. \n" 00097 " Usually between 8 and 12\n" 00098 " Default is 11 for DNA and 5 for protein.\n" 00099 " -stepSize=N spacing between tiles. Default is tileSize.\n" 00100 " -oneOff=N If set to 1 this allows one mismatch in tile and still\n" 00101 " triggers an alignments. Default is 0.\n" 00102 " -minMatch=N sets the number of tile matches. Usually set from 2 to 4\n" 00103 " Default is 2 for nucleotide, 1 for protein.\n" 00104 " -minScore=N sets minimum score. This is the matches minus the \n" 00105 " mismatches minus some sort of gap penalty. Default is 30\n" 00106 " -minIdentity=N Sets minimum sequence identity (in percent). Default is\n" 00107 " 90 for nucleotide searches, 25 for protein or translated\n" 00108 " protein searches.\n" 00109 " -maxGap=N sets the size of maximum gap between tiles in a clump. Usually\n" 00110 " set from 0 to 3. Default is 2. Only relevent for minMatch > 1.\n" 00111 " -noHead suppress .psl header (so it's just a tab-separated file)\n" 00112 " -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n" 00113 " -repMatch=N sets the number of repetitions of a tile allowed before\n" 00114 " it is marked as overused. Typically this is 256 for tileSize\n" 00115 " 12, 1024 for tile size 11, 4096 for tile size 10.\n" 00116 " Default is 1024. Typically only comes into play with makeOoc.\n" 00117 " Also affected by stepSize. When stepSize is halved repMatch is\n" 00118 " doubled to compensate.\n" 00119 " -mask=type Mask out repeats. Alignments won't be started in masked region\n" 00120 " but may extend through it in nucleotide searches. Masked areas\n" 00121 " are ignored entirely in protein or translated searches. Types are\n" 00122 " lower - mask out lower cased sequence\n" 00123 " upper - mask out upper cased sequence\n" 00124 " out - mask according to database.out RepeatMasker .out file\n" 00125 " file.out - mask database according to RepeatMasker file.out\n" 00126 " -qMask=type Mask out repeats in query sequence. Similar to -mask above but\n" 00127 " for query rather than target sequence.\n" 00128 " -repeats=type Type is same as mask types above. Repeat bases will not be\n" 00129 " masked in any way, but matches in repeat areas will be reported\n" 00130 " separately from matches in other areas in the psl output.\n" 00131 " -minRepDivergence=NN - minimum percent divergence of repeats to allow \n" 00132 " them to be unmasked. Default is 15. Only relevant for \n" 00133 " masking using RepeatMasker .out files.\n" 00134 " -dots=N Output dot every N sequences to show program's progress\n" 00135 " -trimT Trim leading poly-T\n" 00136 " -noTrimA Don't trim trailing poly-A\n" 00137 " -trimHardA Remove poly-A tail from qSize as well as alignments in \n" 00138 " psl output\n" 00139 " -fastMap Run for fast DNA/DNA remapping - not allowing introns, \n" 00140 " requiring high %%ID\n" 00141 " -out=type Controls output file format. Type is one of:\n" 00142 " psl - Default. Tab separated format, no sequence\n" 00143 " pslx - Tab separated format with sequence\n" 00144 " axt - blastz-associated axt format\n" 00145 " maf - multiz-associated maf format\n" 00146 " sim4 - similar to sim4 format\n" 00147 " wublast - similar to wublast format\n" 00148 " blast - similar to NCBI blast format\n" 00149 " blast8- NCBI blast tabular format\n" 00150 " blast9 - NCBI blast tabular format with comments\n" 00151 " -fine For high quality mRNAs look harder for small initial and\n" 00152 " terminal exons. Not recommended for ESTs\n" 00153 " -maxIntron=N Sets maximum intron size. Default is %d\n" 00154 " -extendThroughN - Allows extension of alignment through large blocks of N's\n" 00155 , gfVersion, ffIntronMaxDefault 00156 ); 00157 exit(-1); 00158 }
Here is the caller graph for this function:

| unsigned long databaseLetters = 0 |
| char* databaseName |
| int databaseSeqCount = 0 |
| boolean fastMap = FALSE |
Definition at line 194 of file blat.c.
Referenced by bigBlat(), blat(), doBlat(), gfClient(), searchOne(), searchOneIndex(), searchOneProt(), searchOneStrand(), transTripleSearch(), and tripleSearch().
| char* mask = NULL |
Definition at line 51 of file blat.c.
Referenced by addToPatSpace(), blat(), countPatSpace(), gfFastFindDnaHits(), main(), nibInput(), and nibOutput().
| int maxGap = 2 |
Definition at line 38 of file blat.c.
Referenced by bandExtAfter(), bandExtBefore(), bigBlat(), blat(), clumpHits(), genoFindDirect(), genoPcrDirect(), main(), and startServer().
| double minIdentity = 90 |
| int minMatch = 2 |
Definition at line 36 of file blat.c.
Referenced by bigBlat(), blat(), genoFindDirect(), genoPcrDirect(), gfFindClumpsWithQmask(), main(), patSpaceFindOne(), and startServer().
| double minRepDivergence = 15 |
| int minScore = 30 |
Definition at line 37 of file blat.c.
Referenced by gfClient(), main(), searchOneProt(), searchOneStrand(), transTripleSearch(), and tripleSearch().
| boolean oneOff = FALSE |
| struct optionSpec options[] |
Initial value:
{
{"t", OPTION_STRING},
{"q", OPTION_STRING},
{"prot", OPTION_BOOLEAN},
{"ooc", OPTION_STRING},
{"tileSize", OPTION_INT},
{"stepSize", OPTION_INT},
{"oneOff", OPTION_INT},
{"minMatch", OPTION_INT},
{"minScore", OPTION_INT},
{"minIdentity", OPTION_FLOAT},
{"maxGap", OPTION_INT},
{"noHead", OPTION_BOOLEAN},
{"makeOoc", OPTION_STRING},
{"repMatch", OPTION_INT},
{"mask", OPTION_STRING},
{"qMask", OPTION_STRING},
{"repeats", OPTION_STRING},
{"minRepDivergence", OPTION_FLOAT},
{"dots", OPTION_INT},
{"trimT", OPTION_BOOLEAN},
{"noTrimA", OPTION_BOOLEAN},
{"trimHardA", OPTION_BOOLEAN},
{"fastMap", OPTION_BOOLEAN},
{"out", OPTION_STRING},
{"fine", OPTION_BOOLEAN},
{"maxIntron", OPTION_INT},
{"extendThroughN", OPTION_BOOLEAN},
{NULL, 0},
}
Definition at line 161 of file blat.c.
Referenced by main(), optGet(), optionHashSome(), optionInit(), optionMultiVal(), and setOptions().
| char* outputFormat = "psl" |
| char* qMask = NULL |
char const rcsid[] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $" [static] |
| int repMatch = 1024*4 |
Definition at line 39 of file blat.c.
Referenced by bigBlat(), blat(), fillInMatchEtc(), genoFindDirect(), genoPcrDirect(), main(), savePslx(), startServer(), and usage().
| int stepSize = 0 |
Definition at line 35 of file blat.c.
Referenced by bigBlat(), blat(), genoFindDirect(), genoPcrDirect(), gfAddLargeSeq(), gfAddSeq(), gfCountSeq(), main(), and startServer().
| int tileSize = 11 |
Definition at line 34 of file blat.c.
Referenced by bigBlat(), blat(), clumpHits(), clumpNear(), findTileSize(), genoFindDirect(), genoPcrDirect(), gfAddLargeSeq(), gfAddSeq(), gfAddTilesInNib(), gfAlignTrans(), gfAlignTransTrans(), gfCountSeq(), gfCountTilesInNib(), gfFindAlignAaTrans(), gfQuerySeqTrans(), gfQuerySeqTransTrans(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), gfStraightFindNearHits(), gfTransTransFindBundles(), main(), maskSimplePepRepeat(), pcrClumps(), rwFindTilesBetween(), scanIndexForSmallExons(), startServer(), transQuery(), and transTransQuery().
| boolean trimHardA = FALSE |
1.5.2