blat/blat.c File Reference

#include "common.h"
#include "memalloc.h"
#include "linefile.h"
#include "bits.h"
#include "hash.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "fa.h"
#include "nib.h"
#include "twoBit.h"
#include "psl.h"
#include "sig.h"
#include "options.h"
#include "obscure.h"
#include "genoFind.h"
#include "trans3.h"
#include "gfClientLib.h"

Include dependency graph for blat.c:

Go to the source code of this file.

Enumerations

enum  constants { qWarnSize = 5000000 }

Functions

void usage ()
void searchOneStrand (struct dnaSeq *seq, struct genoFind *gf, FILE *psl, boolean isRc, struct hash *maskHash, Bits *qMaskBits)
void searchOneProt (aaSeq *seq, struct genoFind *gf, FILE *f)
void dotOut ()
void searchOne (bioSeq *seq, struct genoFind *gf, FILE *f, boolean isProt, struct hash *maskHash, Bits *qMaskBits)
void trimSeq (struct dnaSeq *seq, struct dnaSeq *trimmed)
BitsmaskQuerySeq (struct dnaSeq *seq, boolean isProt, boolean maskQuery, boolean lcMask)
void searchOneMaskTrim (struct dnaSeq *seq, boolean isProt, struct genoFind *gf, FILE *outFile, struct hash *maskHash, unsigned long *retTotalSize, int *retCount)
void searchOneIndex (int fileCount, char *files[], struct genoFind *gf, char *outName, boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
trans3seqListToTrans3List (struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash)
void tripleSearch (aaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, FILE *f)
void transTripleSearch (struct dnaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, boolean qIsDna, FILE *f)
void bigBlat (struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile, boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus)
void blat (char *dbFile, char *queryFile, char *outName)
int main (int argc, char *argv[])

Variables

static char const rcsid [] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $"
char * databaseName
int databaseSeqCount = 0
unsigned long databaseLetters = 0
int tileSize = 11
int stepSize = 0
int minMatch = 2
int minScore = 30
int maxGap = 2
int repMatch = 1024*4
int dotEvery = 0
boolean oneOff = FALSE
boolean noHead = FALSE
boolean trimA = FALSE
boolean trimHardA = FALSE
boolean trimT = FALSE
boolean fastMap = FALSE
char * makeOoc = NULL
char * ooc = NULL
enum gfType qType = gftDna
enum gfType tType = gftDna
char * mask = NULL
char * repeats = NULL
char * qMask = NULL
double minRepDivergence = 15
double minIdentity = 90
char * outputFormat = "psl"
optionSpec options []
gfOutputgvo


Enumeration Type Documentation

enum constants

Enumerator:
qWarnSize 

Definition at line 29 of file blat.c.

00029                {
00030     qWarnSize = 5000000, /* Warn if more than this many bases in one query. */
00031     };


Function Documentation

void bigBlat ( struct dnaSeq untransList,
int  queryCount,
char *  queryFiles[],
char *  outFile,
boolean  transQuery,
boolean  qIsDna,
FILE *  out,
boolean  showStatus 
)

Definition at line 439 of file blat.c.

References carefulClose(), dnaSeq::dna, dotOut(), FALSE, faMixedSpeedReadNext(), gfOutput::fileHead, freeHash(), genoFindFree(), gfIndexSeq(), gfOutputQuery(), gvo, trans3::isRc, lineFileClose(), lineFileOpen(), maxGap, minMatch, dnaSeq::next, oneOff, ooc, qMask, qWarnSize, repMatch, reverseComplement(), sameString, seqListToTrans3List(), dnaSeq::size, slCount(), stepSize, tileSize, toggleCase(), toLowerN(), toUpperN(), trans3FreeList(), transTripleSearch(), trimSeq(), tripleSearch(), TRUE, upperToN(), warn(), and ZeroVar.

Referenced by blat().

00441 {
00442 int frame, i;
00443 struct dnaSeq *seq, trimmedSeq;
00444 struct genoFind *gfs[3];
00445 aaSeq *dbSeqLists[3];
00446 struct trans3 *t3List = NULL;
00447 int isRc;
00448 struct lineFile *lf = NULL;
00449 struct hash *t3Hash = NULL;
00450 boolean forceUpper = FALSE;
00451 boolean forceLower = FALSE;
00452 boolean toggle = FALSE;
00453 boolean maskUpper = FALSE;
00454 
00455 ZeroVar(&trimmedSeq);
00456 if (showStatus)
00457     printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);
00458 
00459 /* Figure out how to manage query case.  Proteins want to be in
00460  * upper case, generally, nucleotides in lower case.  But there
00461  * may be repeatMasking based on case as well. */
00462 if (transQuery)
00463     {
00464     if (qMask == NULL)
00465        forceLower = TRUE;
00466     else
00467        {
00468        maskUpper = TRUE;
00469        toggle = !sameString(qMask, "upper");
00470        }
00471     }
00472 else
00473     {
00474     forceUpper = TRUE;
00475     }
00476 
00477 if (gvo->fileHead != NULL)
00478     gvo->fileHead(gvo, out);
00479 
00480 for (isRc = FALSE; isRc <= 1; ++isRc)
00481     {
00482     /* Initialize local pointer arrays to NULL to prevent surprises. */
00483     for (frame = 0; frame < 3; ++frame)
00484         {
00485         gfs[frame] = NULL;
00486         dbSeqLists[frame] = NULL;
00487         }
00488 
00489     t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
00490     for (frame = 0; frame < 3; ++frame)
00491         {
00492         gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize, 
00493                 repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
00494         }
00495 
00496     for (i=0; i<queryCount; ++i)
00497         {
00498         aaSeq qSeq;
00499 
00500         lf = lineFileOpen(queryFiles[i], TRUE);
00501         while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
00502             {
00503             dotOut();
00504             /* Put it into right case and optionally mask on case. */
00505             if (forceLower)
00506                 toLowerN(qSeq.dna, qSeq.size);
00507             else if (forceUpper)
00508                 toUpperN(qSeq.dna, qSeq.size);
00509             else if (maskUpper)
00510                 {
00511                 if (toggle)
00512                     toggleCase(qSeq.dna, qSeq.size);
00513                 upperToN(qSeq.dna, qSeq.size);
00514                 }
00515             if (qSeq.size > qWarnSize)
00516                 {
00517                 warn("Query sequence %s has size %d, it might take a while.",
00518                      qSeq.name, qSeq.size);
00519                 }
00520             trimSeq(&qSeq, &trimmedSeq);
00521             if (transQuery)
00522                 transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out);
00523             else
00524                 tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out);
00525             gfOutputQuery(gvo, out);
00526             }
00527         lineFileClose(&lf);
00528         }
00529 
00530     /* Clean up time. */
00531     trans3FreeList(&t3List);
00532     freeHash(&t3Hash);
00533     for (frame = 0; frame < 3; ++frame)
00534         {
00535         genoFindFree(&gfs[frame]);
00536         }
00537 
00538     for (seq = untransList; seq != NULL; seq = seq->next)
00539         {
00540         reverseComplement(seq->dna, seq->size);
00541         }
00542     }
00543 carefulClose(&out);
00544 }

Here is the call graph for this function:

Here is the caller graph for this function:

void blat ( char *  dbFile,
char *  queryFile,
char *  outName 
)

Definition at line 547 of file blat.c.

References bigBlat(), databaseLetters, databaseName, databaseSeqCount, dotEvery, errAbort(), FALSE, freeDnaSeqList(), freeHash(), gfClientFileArray(), gfClientSeqList(), gfClientUnmask(), gfIndexSeq(), gfMakeOoc(), gfOutputAny(), gftDna, gftDnaX, gftProt, gftRna, gftRnaX, gvo, hashAdd(), makeOoc, mask, maskFromUpperCaseSeq(), maxGap, minIdentity, minMatch, minRepDivergence, mustOpen(), dnaSeq::name, newHash(), dnaSeq::next, noHead, oneOff, ooc, outputFormat, qType, repeats, repMatch, searchOneIndex(), dnaSeq::size, slCount(), stepSize, tileSize, TRUE, and tType.

Referenced by main().

00549 {
00550 char **dbFiles, **queryFiles;
00551 int dbCount, queryCount;
00552 struct dnaSeq *dbSeqList, *seq;
00553 struct genoFind *gf;
00554 boolean tIsProt = (tType == gftProt);
00555 boolean qIsProt = (qType == gftProt);
00556 boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna));
00557 boolean bothSimpleProt = (tIsProt && qIsProt);
00558 FILE *f = mustOpen(outName, "w");
00559 boolean showStatus = (f != stdout);
00560 
00561 databaseName = dbFile;
00562 gfClientFileArray(dbFile, &dbFiles, &dbCount);
00563 if (makeOoc != NULL)
00564     {
00565     gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType);
00566     if (showStatus)
00567         printf("Done making %s\n", makeOoc);
00568     exit(0);
00569     }
00570 gfClientFileArray(queryFile, &queryFiles, &queryCount);
00571 dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, 
00572         minRepDivergence, showStatus);
00573 databaseSeqCount = slCount(dbSeqList);
00574 for (seq = dbSeqList; seq != NULL; seq = seq->next)
00575     databaseLetters += seq->size;
00576 
00577 gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, 
00578         databaseName, databaseSeqCount, databaseLetters, minIdentity, f);
00579 
00580 if (bothSimpleNuc || bothSimpleProt)
00581     {
00582     struct hash *maskHash = NULL;
00583 
00584     /* Save away masking info for output. */
00585     if (repeats != NULL)
00586         {
00587         maskHash = newHash(0);
00588         for (seq = dbSeqList; seq != NULL; seq = seq->next)
00589             {
00590             Bits *maskedBits = maskFromUpperCaseSeq(seq);
00591             hashAdd(maskHash, seq->name, maskedBits);
00592             }
00593         }
00594 
00595     /* Handle masking and indexing.  If masking is off, we want the indexer
00596      * to see unmasked sequence, otherwise we want it to see masked.  However
00597      * after indexing we always want it unmasked, because things are always
00598      * unmasked for the extension phase. */
00599     if (mask == NULL && !bothSimpleProt)
00600         gfClientUnmask(dbSeqList);
00601     gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, 
00602         tIsProt, oneOff, FALSE, stepSize);
00603     if (mask != NULL)
00604         gfClientUnmask(dbSeqList);
00605 
00606     searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus);
00607     freeHash(&maskHash);
00608     }
00609 else if (tType == gftDnaX && qType == gftProt)
00610     {
00611     bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus);
00612     }
00613 else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX))
00614     {
00615     bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus);
00616     }
00617 else
00618     {
00619     errAbort("Unrecognized combination of target and query types\n");
00620     }
00621 if (dotEvery > 0)
00622     printf("\n");
00623 freeDnaSeqList(&dbSeqList);
00624 }

Here is the call graph for this function:

Here is the caller graph for this function:

void dotOut (  ) 

Definition at line 215 of file blat.c.

References dotEvery.

Referenced by bigBlat(), and searchOne().

00217 {
00218 static int mod = 1;
00219 if (dotEvery > 0)
00220     {
00221     if (--mod <= 0)
00222         {
00223         fputc('.', stdout);
00224         fflush(stdout);
00225         mod = dotEvery;
00226         }
00227     }
00228 }

Here is the caller graph for this function:

int main ( int  argc,
char *  argv[] 
)

Definition at line 626 of file blat.c.

References blat(), chopLine, cloneString(), differentString, dotEvery, errAbort(), FALSE, fastMap, ffIntronMaxDefault, gfCheckTileSize(), gftDna, gftDnaX, gftProt, gftRna, gftRnaX, gfTypeFromName(), makeOoc, mask, maxGap, minIdentity, minMatch, minRepDivergence, minScore, noHead, oneOff, ooc, optionExists(), optionFloat(), optionInit(), optionInt(), options, optionVal(), outputFormat, qMask, qType, repeats, repMatch, setFfExtendThroughN(), setFfIntronMax(), stepSize, tileSize, trimA, trimHardA, trimT, TRUE, tType, and usage().

00628 {
00629 boolean dIsProtLike, qIsProtLike;
00630 
00631 #ifdef DEBUG
00632 {
00633 char *cmd = "blat hCrea.geno hCrea.mrna foo.psl -t=dnax -q=rnax";
00634 char *words[16];
00635 
00636 printf("Debugging parameters\n");
00637 cmd = cloneString(cmd);
00638 argc = chopLine(cmd, words);
00639 argv = words;
00640 }
00641 #endif /* DEBUG */
00642 
00643 optionInit(&argc, argv, options);
00644 if (argc != 4)
00645     usage();
00646 
00647 /* Get database and query sequence types and make sure they are
00648  * legal and compatable. */
00649 if (optionExists("prot"))
00650     qType = tType = gftProt;
00651 if (optionExists("t"))
00652     tType = gfTypeFromName(optionVal("t", NULL));
00653 trimA = optionExists("trimA") || optionExists("trima");
00654 trimT = optionExists("trimT") || optionExists("trimt");
00655 trimHardA = optionExists("trimHardA");
00656 switch (tType)
00657     {
00658     case gftProt:
00659     case gftDnaX:
00660         dIsProtLike = TRUE;
00661         break;
00662     case gftDna:
00663         dIsProtLike = FALSE;
00664         break;
00665     default:
00666         dIsProtLike = FALSE;
00667         errAbort("Illegal value for 't' parameter");
00668         break;
00669     }
00670 if (optionExists("q"))
00671     qType = gfTypeFromName(optionVal("q", NULL));
00672 if (qType == gftRnaX || qType == gftRna)
00673     trimA = TRUE;
00674 if (optionExists("noTrimA"))
00675     trimA = FALSE;
00676 switch (qType)
00677     {
00678     case gftProt:
00679     case gftDnaX:
00680     case gftRnaX:
00681         minIdentity = 25;
00682         qIsProtLike = TRUE;
00683         break;
00684     default:
00685         qIsProtLike = FALSE;
00686         break;
00687     }
00688 if ((dIsProtLike ^ qIsProtLike) != 0)
00689     errAbort("d and q must both be either protein or dna");
00690 
00691 /* Set default tile size for protein-based comparisons. */
00692 if (dIsProtLike)
00693     {
00694     tileSize = 5;
00695     minMatch = 1;
00696     oneOff = FALSE;
00697     maxGap = 0;
00698     }
00699 
00700 /* Get tile size and related parameters from user and make sure
00701  * they are within range. */
00702 tileSize = optionInt("tileSize", tileSize);
00703 stepSize = optionInt("stepSize", stepSize);
00704 minMatch = optionInt("minMatch", minMatch);
00705 oneOff = optionExists("oneOff");
00706 fastMap = optionExists("fastMap");
00707 minScore = optionInt("minScore", minScore);
00708 maxGap = optionInt("maxGap", maxGap);
00709 minRepDivergence = optionFloat("minRepDivergence", minRepDivergence);
00710 minIdentity = optionFloat("minIdentity", minIdentity);
00711 gfCheckTileSize(tileSize, dIsProtLike);
00712 if (minMatch < 0)
00713     errAbort("minMatch must be at least 1");
00714 if (maxGap > 100)
00715     errAbort("maxGap must be less than 100");
00716 
00717 
00718 
00719 /* Set repMatch parameter from command line, or
00720  * to reasonable value that depends on tile size. */
00721 if (optionExists("repMatch"))
00722     repMatch = optionInt("repMatch", repMatch);
00723 else
00724     {
00725     if (dIsProtLike)
00726         {
00727         if (tileSize == 3)
00728             repMatch = 600000;
00729         else if (tileSize == 4)
00730             repMatch = 30000;
00731         else if (tileSize == 5)
00732             repMatch = 1500;
00733         else if (tileSize == 6)
00734             repMatch = 75;
00735         else if (tileSize <= 7)
00736             repMatch = 10;
00737         }
00738     else
00739         {
00740         if (tileSize == 15)
00741             repMatch = 16;
00742         else if (tileSize == 14)
00743             repMatch = 32;
00744         else if (tileSize == 13)
00745             repMatch = 128;
00746         else if (tileSize == 12)
00747             repMatch = 256;
00748         else if (tileSize == 11)
00749             repMatch = 4*256;
00750         else if (tileSize == 10)
00751             repMatch = 16*256;
00752         else if (tileSize == 9)
00753             repMatch = 64*256;
00754         else if (tileSize == 8)
00755             repMatch = 256*256;
00756         else if (tileSize == 7)
00757             repMatch = 1024*256;
00758         else if (tileSize == 6)
00759             repMatch = 4*1024*256;
00760         }
00761     }
00762 
00763 /* Gather last few command line options. */
00764 noHead = optionExists("noHead");
00765 ooc = optionVal("ooc", NULL);
00766 makeOoc = optionVal("makeOoc", NULL);
00767 mask = optionVal("mask", NULL);
00768 qMask = optionVal("qMask", NULL);
00769 repeats = optionVal("repeats", NULL);
00770 if (repeats != NULL && mask != NULL && differentString(repeats, mask))
00771     errAbort("The -mask and -repeat settings disagree.  "
00772              "You can just omit -repeat if -mask is on");
00773 if (mask != NULL)       /* Mask setting will also set repeats. */
00774     repeats = mask;
00775 outputFormat = optionVal("out", outputFormat);
00776 dotEvery = optionInt("dots", 0);
00777 /* set global for fuzzy find functions */
00778 setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault));
00779 setFfExtendThroughN(optionExists("extendThroughN"));
00780 
00781 
00782 /* Call routine that does the work. */
00783 blat(argv[1], argv[2], argv[3]);
00784 return 0;
00785 }

Here is the call graph for this function:

Bits* maskQuerySeq ( struct dnaSeq seq,
boolean  isProt,
boolean  maskQuery,
boolean  lcMask 
)

Definition at line 270 of file blat.c.

References dnaSeq::dna, faToDna(), faToProtein(), maskFromUpperCaseSeq(), dnaSeq::name, qWarnSize, dnaSeq::size, toggleCase(), verbose(), and warn().

Referenced by searchOneMaskTrim().

00275 {
00276 Bits *qMaskBits = NULL;
00277 verbose(2, "%s\n", seq->name);
00278 if (isProt)
00279     faToProtein(seq->dna, seq->size);
00280 else
00281     {
00282     if (maskQuery)
00283         {
00284         if (lcMask)
00285             toggleCase(seq->dna, seq->size);
00286         qMaskBits = maskFromUpperCaseSeq(seq);
00287         }
00288     faToDna(seq->dna, seq->size);
00289     }
00290 if (seq->size > qWarnSize)
00291     {
00292     warn("Query sequence %s has size %d, it might take a while.",
00293          seq->name, seq->size);
00294     }
00295 return qMaskBits;
00296 }

Here is the call graph for this function:

Here is the caller graph for this function:

void searchOne ( bioSeq seq,
struct genoFind gf,
FILE *  f,
boolean  isProt,
struct hash maskHash,
Bits qMaskBits 
)

Definition at line 230 of file blat.c.

References dnaSeq::dna, dotOut(), FALSE, gfOutputQuery(), gvo, gfOutput::maskHash, reverseComplement(), searchOneProt(), searchOneStrand(), dnaSeq::size, and TRUE.

Referenced by searchOneMaskTrim().

00232 {
00233 dotOut();
00234 if (isProt)
00235     {
00236     searchOneProt(seq, gf, f);
00237     }
00238 else
00239     {
00240     gvo->maskHash = maskHash;
00241     searchOneStrand(seq, gf, f, FALSE, maskHash, qMaskBits);
00242     reverseComplement(seq->dna, seq->size);
00243     searchOneStrand(seq, gf, f, TRUE, maskHash, qMaskBits);
00244     reverseComplement(seq->dna, seq->size);
00245     }
00246 gfOutputQuery(gvo, f);
00247 }

Here is the call graph for this function:

Here is the caller graph for this function:

void searchOneIndex ( int  fileCount,
char *  files[],
struct genoFind gf,
char *  outName,
boolean  isProt,
struct hash maskHash,
FILE *  outFile,
boolean  showStatus 
)

Definition at line 316 of file blat.c.

References carefulClose(), cloneString(), dnaSeq::dna, dnaSeqFree, errAbort(), faMixedSpeedReadNext(), twoBitSpec::fileName, freeDnaSeq(), freez(), gfOutputHead(), gvo, twoBitFile::indexList, lineFileClose(), lineFileOpen(), dnaSeq::name, twoBitIndex::name, twoBitIndex::next, NIB_MASK_MIXED, nibIsFile(), nibLoadAllMasked(), searchOneMaskTrim(), twoBitSpec::seqs, dnaSeq::size, ss, TRUE, twoBitClose(), twoBitIsSpec(), twoBitOpen(), twoBitReadSeqFrag(), and twoBitSpecNew().

Referenced by blat().

00319 {
00320 int i;
00321 char *fileName;
00322 int count = 0; 
00323 unsigned long totalSize = 0;
00324 
00325 gfOutputHead(gvo, outFile);
00326 for (i=0; i<fileCount; ++i)
00327     {
00328     fileName = files[i];
00329     if (nibIsFile(fileName))
00330         {
00331         struct dnaSeq *seq;
00332 
00333         if (isProt)
00334             errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
00335         seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00336         freez(&seq->name);
00337         seq->name = cloneString(fileName);
00338         searchOneMaskTrim(seq, isProt, gf, outFile,
00339                           maskHash, &totalSize, &count);
00340         freeDnaSeq(&seq);
00341         }
00342     else if (twoBitIsSpec(fileName))
00343         {
00344         struct twoBitSpec *tbs = twoBitSpecNew(fileName);
00345         struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
00346         if (isProt)
00347             errAbort("%s is a two bit file, which doesn't work for proteins.", 
00348                 fileName);
00349         if (tbs->seqs != NULL)
00350             {
00351             struct twoBitSeqSpec *ss = NULL;
00352             for (ss = tbs->seqs;  ss != NULL;  ss = ss->next)
00353                 {
00354                 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name,
00355                                                        ss->start, ss->end);
00356                 searchOneMaskTrim(seq, isProt, gf, outFile,
00357                                   maskHash, &totalSize, &count);
00358                 dnaSeqFree(&seq);
00359                 }
00360             }
00361         else
00362             {
00363             struct twoBitIndex *index = NULL;
00364             for (index = tbf->indexList; index != NULL; index = index->next)
00365                 {
00366                 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
00367                 searchOneMaskTrim(seq, isProt, gf, outFile,
00368                                   maskHash, &totalSize, &count);
00369                 dnaSeqFree(&seq);
00370                 }
00371             }
00372         twoBitClose(&tbf);
00373         }
00374     else
00375         {
00376         static struct dnaSeq seq;
00377         struct lineFile *lf = lineFileOpen(fileName, TRUE);
00378         while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
00379             {
00380             searchOneMaskTrim(&seq, isProt, gf, outFile,
00381                               maskHash, &totalSize, &count);
00382             }
00383         lineFileClose(&lf);
00384         }
00385     }
00386 carefulClose(&outFile);
00387 if (showStatus)
00388     printf("Searched %lu bases in %d sequences\n", totalSize, count);
00389 }

Here is the call graph for this function:

Here is the caller graph for this function:

void searchOneMaskTrim ( struct dnaSeq seq,
boolean  isProt,
struct genoFind gf,
FILE *  outFile,
struct hash maskHash,
unsigned long *  retTotalSize,
int *  retCount 
)

Definition at line 298 of file blat.c.

References bitFree(), maskQuerySeq(), qMask, sameWord, searchOne(), dnaSeq::size, trimSeq(), and ZeroVar.

Referenced by searchOneIndex().

00303 {
00304 boolean maskQuery = (qMask != NULL);
00305 boolean lcMask = (qMask != NULL && sameWord(qMask, "lower"));
00306 Bits *qMaskBits = maskQuerySeq(seq, isProt, maskQuery, lcMask);
00307 struct dnaSeq trimmedSeq;
00308 ZeroVar(&trimmedSeq);
00309 trimSeq(seq, &trimmedSeq);
00310 searchOne(&trimmedSeq, gf, outFile, isProt, maskHash, qMaskBits);
00311 *retTotalSize += seq->size;
00312 *retCount += 1;
00313 bitFree(&qMaskBits);
00314 }

Here is the call graph for this function:

Here is the caller graph for this function:

void searchOneProt ( aaSeq seq,
struct genoFind gf,
FILE *  f 
)

Definition at line 204 of file blat.c.

References FALSE, gfAlignAaClumps(), gfClumpFreeList(), gfFindClumps(), gvo, lm, lmCleanup(), lmInit(), and minScore.

Referenced by searchOne().

00206 {
00207 int hitCount;
00208 struct lm *lm = lmInit(0);
00209 struct gfClump *clumpList = gfFindClumps(gf, seq, lm, &hitCount);
00210 gfAlignAaClumps(gf, clumpList, seq, FALSE, minScore, gvo);
00211 gfClumpFreeList(&clumpList);
00212 lmCleanup(&lm);
00213 }

Here is the call graph for this function:

Here is the caller graph for this function:

void searchOneStrand ( struct dnaSeq seq,
struct genoFind gf,
FILE *  psl,
boolean  isRc,
struct hash maskHash,
Bits qMaskBits 
)

Definition at line 196 of file blat.c.

References fastMap, gfLongDnaInMem(), gvo, minScore, and optionExists().

Referenced by searchOne().

00199 {
00200 gfLongDnaInMem(seq, gf, isRc, minScore, qMaskBits, gvo, fastMap, optionExists("fine"));
00201 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct trans3* seqListToTrans3List ( struct dnaSeq seqList,
aaSeq transLists[3],
struct hash **  retHash 
) [read]

Definition at line 391 of file blat.c.

References hashAddUnique(), newHash(), dnaSeq::next, slAddHead, slReverse(), and trans3New().

Referenced by bigBlat().

00393 {
00394 int frame;
00395 struct dnaSeq *seq;
00396 struct trans3 *t3List = NULL, *t3;
00397 struct hash *hash = newHash(0);
00398 
00399 for (seq = seqList; seq != NULL; seq = seq->next)
00400     {
00401     t3 = trans3New(seq);
00402     hashAddUnique(hash, t3->name, t3);
00403     slAddHead(&t3List, t3);
00404     for (frame = 0; frame < 3; ++frame)
00405         {
00406         slAddHead(&transLists[frame], t3->trans[frame]);
00407         }
00408     }
00409 slReverse(&t3List);
00410 for (frame = 0; frame < 3; ++frame)
00411     {
00412     slReverse(&transLists[frame]);
00413     }
00414 *retHash = hash;
00415 return t3List;
00416 }

Here is the call graph for this function:

Here is the caller graph for this function:

void transTripleSearch ( struct dnaSeq qSeq,
struct genoFind gfs[3],
struct hash t3Hash,
boolean  dbIsRc,
boolean  qIsDna,
FILE *  f 
)

Definition at line 425 of file blat.c.

References dnaSeq::dna, gfLongTransTransInMem(), gvo, minScore, gfOutput::reportTargetStrand, reverseComplement(), dnaSeq::size, and TRUE.

Referenced by bigBlat().

00428 {
00429 int qIsRc;
00430 gvo->reportTargetStrand = TRUE;
00431 for (qIsRc = 0; qIsRc <= qIsDna; qIsRc += 1)
00432     {
00433     gfLongTransTransInMem(qSeq, gfs, t3Hash, qIsRc, dbIsRc, !qIsDna, minScore, gvo);
00434     if (qIsDna)
00435         reverseComplement(qSeq->dna, qSeq->size);
00436     }
00437 }

Here is the call graph for this function:

Here is the caller graph for this function:

void trimSeq ( struct dnaSeq seq,
struct dnaSeq trimmed 
)

Definition at line 249 of file blat.c.

References dnaSeq::dna, maskHeadPolyT(), maskTailPolyA(), dnaSeq::size, trimA, trimHardA, and trimT.

Referenced by bigBlat(), and searchOneMaskTrim().

00252 {
00253 DNA *dna = seq->dna;
00254 int size = seq->size;
00255 *trimmed = *seq;
00256 if (trimT)
00257     maskHeadPolyT(dna, size);
00258 if (trimA || trimHardA)
00259     {
00260     int trimSize = maskTailPolyA(dna, size);
00261     if (trimHardA)
00262         {
00263         trimmed->size -= trimSize;
00264         dna[size-trimSize] = 0;
00265         }
00266     }
00267 }

Here is the call graph for this function:

Here is the caller graph for this function:

void tripleSearch ( aaSeq qSeq,
struct genoFind gfs[3],
struct hash t3Hash,
boolean  dbIsRc,
FILE *  f 
)

Definition at line 418 of file blat.c.

References gfFindAlignAaTrans(), gvo, minScore, gfOutput::reportTargetStrand, and TRUE.

Referenced by bigBlat().

00420 {
00421 gvo->reportTargetStrand = TRUE;
00422 gfFindAlignAaTrans(gfs, qSeq, t3Hash, dbIsRc, minScore, gvo);
00423 }

Here is the call graph for this function:

Here is the caller graph for this function:

void usage (  ) 

Definition at line 59 of file blat.c.

References ffIntronMaxDefault, and gfVersion.

Referenced by main().

00061 {
00062 printf(
00063   "blat - Standalone BLAT v. %s fast sequence search command line tool\n"
00064   "usage:\n"
00065   "   blat database query [-ooc=11.ooc] output.psl\n"
00066   "where:\n"
00067   "   database and query are each either a .fa , .nib or .2bit file,\n"
00068   "   or a list these files one file name per line.\n"
00069   "   -ooc=11.ooc tells the program to load over-occurring 11-mers from\n"
00070   "               and external file.  This will increase the speed\n"
00071   "               by a factor of 40 in many cases, but is not required\n"
00072   "   output.psl is where to put the output.\n"
00073   "   Subranges of nib and .2bit files may specified using the syntax:\n"
00074   "      /path/file.nib:seqid:start-end\n"
00075   "   or\n"
00076   "      /path/file.2bit:seqid:start-end\n"
00077   "   or\n"
00078   "      /path/file.nib:start-end\n"
00079   "   With the second form, a sequence id of file:start-end will be used.\n"
00080   "options:\n"
00081   "   -t=type     Database type.  Type is one of:\n"
00082   "                 dna - DNA sequence\n"
00083   "                 prot - protein sequence\n"
00084   "                 dnax - DNA sequence translated in six frames to protein\n"
00085   "               The default is dna\n"
00086   "   -q=type     Query type.  Type is one of:\n"
00087   "                 dna - DNA sequence\n"
00088   "                 rna - RNA sequence\n"
00089   "                 prot - protein sequence\n"
00090   "                 dnax - DNA sequence translated in six frames to protein\n"
00091   "                 rnax - DNA sequence translated in three frames to protein\n"
00092   "               The default is dna\n"
00093   "   -prot       Synonymous with -t=prot -q=prot\n"
00094   "   -ooc=N.ooc  Use overused tile file N.ooc.  N should correspond to \n"
00095   "               the tileSize\n"
00096   "   -tileSize=N sets the size of match that triggers an alignment.  \n"
00097   "               Usually between 8 and 12\n"
00098   "               Default is 11 for DNA and 5 for protein.\n"
00099   "   -stepSize=N spacing between tiles. Default is tileSize.\n"
00100   "   -oneOff=N   If set to 1 this allows one mismatch in tile and still\n"
00101   "               triggers an alignments.  Default is 0.\n"
00102   "   -minMatch=N sets the number of tile matches.  Usually set from 2 to 4\n"
00103   "               Default is 2 for nucleotide, 1 for protein.\n"
00104   "   -minScore=N sets minimum score.  This is the matches minus the \n"
00105   "               mismatches minus some sort of gap penalty.  Default is 30\n"
00106   "   -minIdentity=N Sets minimum sequence identity (in percent).  Default is\n"
00107   "               90 for nucleotide searches, 25 for protein or translated\n"
00108   "               protein searches.\n"
00109   "   -maxGap=N   sets the size of maximum gap between tiles in a clump.  Usually\n"
00110   "               set from 0 to 3.  Default is 2. Only relevent for minMatch > 1.\n"
00111   "   -noHead     suppress .psl header (so it's just a tab-separated file)\n"
00112   "   -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n"
00113   "   -repMatch=N sets the number of repetitions of a tile allowed before\n"
00114   "               it is marked as overused.  Typically this is 256 for tileSize\n"
00115   "               12, 1024 for tile size 11, 4096 for tile size 10.\n"
00116   "               Default is 1024.  Typically only comes into play with makeOoc.\n"
00117   "               Also affected by stepSize. When stepSize is halved repMatch is\n"
00118   "               doubled to compensate.\n"
00119   "   -mask=type  Mask out repeats.  Alignments won't be started in masked region\n"
00120   "               but may extend through it in nucleotide searches.  Masked areas\n"
00121   "               are ignored entirely in protein or translated searches. Types are\n"
00122   "                 lower - mask out lower cased sequence\n"
00123   "                 upper - mask out upper cased sequence\n"
00124   "                 out   - mask according to database.out RepeatMasker .out file\n"
00125   "                 file.out - mask database according to RepeatMasker file.out\n"
00126   "   -qMask=type Mask out repeats in query sequence.  Similar to -mask above but\n"
00127   "               for query rather than target sequence.\n"
00128   "   -repeats=type Type is same as mask types above.  Repeat bases will not be\n"
00129   "               masked in any way, but matches in repeat areas will be reported\n"
00130   "               separately from matches in other areas in the psl output.\n"
00131   "   -minRepDivergence=NN - minimum percent divergence of repeats to allow \n"
00132   "               them to be unmasked.  Default is 15.  Only relevant for \n"
00133   "               masking using RepeatMasker .out files.\n"
00134   "   -dots=N     Output dot every N sequences to show program's progress\n"
00135   "   -trimT      Trim leading poly-T\n"
00136   "   -noTrimA    Don't trim trailing poly-A\n"
00137   "   -trimHardA  Remove poly-A tail from qSize as well as alignments in \n"
00138   "               psl output\n"
00139   "   -fastMap    Run for fast DNA/DNA remapping - not allowing introns, \n"
00140   "               requiring high %%ID\n"
00141   "   -out=type   Controls output file format.  Type is one of:\n"
00142   "                   psl - Default.  Tab separated format, no sequence\n"
00143   "                   pslx - Tab separated format with sequence\n"
00144   "                   axt - blastz-associated axt format\n"
00145   "                   maf - multiz-associated maf format\n"
00146   "                   sim4 - similar to sim4 format\n"
00147   "                   wublast - similar to wublast format\n"
00148   "                   blast - similar to NCBI blast format\n"
00149   "                   blast8- NCBI blast tabular format\n"
00150   "                   blast9 - NCBI blast tabular format with comments\n"
00151   "   -fine       For high quality mRNAs look harder for small initial and\n"
00152   "               terminal exons.  Not recommended for ESTs\n"
00153   "   -maxIntron=N  Sets maximum intron size. Default is %d\n"
00154   "   -extendThroughN - Allows extension of alignment through large blocks of N's\n"
00155   , gfVersion, ffIntronMaxDefault
00156   );
00157 exit(-1);
00158 }

Here is the caller graph for this function:


Variable Documentation

unsigned long databaseLetters = 0

Definition at line 27 of file blat.c.

Referenced by blat().

char* databaseName

Definition at line 25 of file blat.c.

Referenced by blat(), and gfClient().

int databaseSeqCount = 0

Definition at line 26 of file blat.c.

Referenced by blat().

int dotEvery = 0

Definition at line 40 of file blat.c.

Referenced by blat(), dotOut(), and main().

boolean fastMap = FALSE

Definition at line 46 of file blat.c.

Referenced by main(), and searchOneStrand().

struct gfOutput* gvo

Definition at line 194 of file blat.c.

Referenced by bigBlat(), blat(), doBlat(), gfClient(), searchOne(), searchOneIndex(), searchOneProt(), searchOneStrand(), transTripleSearch(), and tripleSearch().

char* makeOoc = NULL

Definition at line 47 of file blat.c.

Referenced by blat(), and main().

char* mask = NULL

Definition at line 51 of file blat.c.

Referenced by addToPatSpace(), blat(), countPatSpace(), gfFastFindDnaHits(), main(), nibInput(), and nibOutput().

int maxGap = 2

Definition at line 38 of file blat.c.

Referenced by bandExtAfter(), bandExtBefore(), bigBlat(), blat(), clumpHits(), genoFindDirect(), genoPcrDirect(), main(), and startServer().

double minIdentity = 90

Definition at line 55 of file blat.c.

Referenced by blat(), gfClient(), and main().

int minMatch = 2

Definition at line 36 of file blat.c.

Referenced by bigBlat(), blat(), genoFindDirect(), genoPcrDirect(), gfFindClumpsWithQmask(), main(), patSpaceFindOne(), and startServer().

double minRepDivergence = 15

Definition at line 54 of file blat.c.

Referenced by blat(), and main().

int minScore = 30

Definition at line 37 of file blat.c.

Referenced by gfClient(), main(), searchOneProt(), searchOneStrand(), transTripleSearch(), and tripleSearch().

boolean noHead = FALSE

Definition at line 42 of file blat.c.

Referenced by blat(), and main().

boolean oneOff = FALSE

Definition at line 41 of file blat.c.

Referenced by bigBlat(), blat(), and main().

char* ooc = NULL

Definition at line 48 of file blat.c.

Referenced by bigBlat(), blat(), and main().

struct optionSpec options[]

Initial value:

 {
   {"t", OPTION_STRING},
   {"q", OPTION_STRING},
   {"prot", OPTION_BOOLEAN},
   {"ooc", OPTION_STRING},
   {"tileSize", OPTION_INT},
   {"stepSize", OPTION_INT},
   {"oneOff", OPTION_INT},
   {"minMatch", OPTION_INT},
   {"minScore", OPTION_INT},
   {"minIdentity", OPTION_FLOAT},
   {"maxGap", OPTION_INT},
   {"noHead", OPTION_BOOLEAN},
   {"makeOoc", OPTION_STRING},
   {"repMatch", OPTION_INT},
   {"mask", OPTION_STRING},
   {"qMask", OPTION_STRING},
   {"repeats", OPTION_STRING},
   {"minRepDivergence", OPTION_FLOAT},
   {"dots", OPTION_INT},
   {"trimT", OPTION_BOOLEAN},
   {"noTrimA", OPTION_BOOLEAN},
   {"trimHardA", OPTION_BOOLEAN},
   {"fastMap", OPTION_BOOLEAN},
   {"out", OPTION_STRING},
   {"fine", OPTION_BOOLEAN},
   {"maxIntron", OPTION_INT},
   {"extendThroughN", OPTION_BOOLEAN},
   {NULL, 0},
}

Definition at line 161 of file blat.c.

Referenced by main(), optGet(), optionHashSome(), optionInit(), optionMultiVal(), and setOptions().

char* outputFormat = "psl"

Definition at line 56 of file blat.c.

Referenced by blat(), gfClient(), and main().

char* qMask = NULL

Definition at line 53 of file blat.c.

Referenced by bigBlat(), main(), and searchOneMaskTrim().

enum gfType qType = gftDna

Definition at line 49 of file blat.c.

Referenced by blat(), doGetSeq(), gfClient(), and main().

char const rcsid[] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $" [static]

Definition at line 21 of file blat.c.

char* repeats = NULL

Definition at line 52 of file blat.c.

Referenced by blat(), and main().

int repMatch = 1024*4

Definition at line 39 of file blat.c.

Referenced by bigBlat(), blat(), fillInMatchEtc(), genoFindDirect(), genoPcrDirect(), main(), savePslx(), startServer(), and usage().

int stepSize = 0

Definition at line 35 of file blat.c.

Referenced by bigBlat(), blat(), genoFindDirect(), genoPcrDirect(), gfAddLargeSeq(), gfAddSeq(), gfCountSeq(), main(), and startServer().

int tileSize = 11

Definition at line 34 of file blat.c.

Referenced by bigBlat(), blat(), clumpHits(), clumpNear(), findTileSize(), genoFindDirect(), genoPcrDirect(), gfAddLargeSeq(), gfAddSeq(), gfAddTilesInNib(), gfAlignTrans(), gfAlignTransTrans(), gfCountSeq(), gfCountTilesInNib(), gfFindAlignAaTrans(), gfQuerySeqTrans(), gfQuerySeqTransTrans(), gfSegmentedFindHits(), gfSegmentedFindNearHits(), gfStraightFindHits(), gfStraightFindNearHits(), gfTransTransFindBundles(), main(), maskSimplePepRepeat(), pcrClumps(), rwFindTilesBetween(), scanIndexForSmallExons(), startServer(), transQuery(), and transTransQuery().

boolean trimA = FALSE

Definition at line 43 of file blat.c.

Referenced by main(), and trimSeq().

boolean trimHardA = FALSE

Definition at line 44 of file blat.c.

Referenced by main(), and trimSeq().

boolean trimT = FALSE

Definition at line 45 of file blat.c.

Referenced by main(), and trimSeq().

enum gfType tType = gftDna

Definition at line 50 of file blat.c.

Referenced by blat(), gfClient(), and main().


Generated on Tue Dec 25 18:40:04 2007 for blat by  doxygen 1.5.2