blat/blat.c

Go to the documentation of this file.
00001 /* blat - Standalone BLAT fast sequence search command line tool. */
00002 /* Copyright 2001-2004 Jim Kent.  All rights reserved. */
00003 #include "common.h"
00004 #include "memalloc.h"
00005 #include "linefile.h"
00006 #include "bits.h"
00007 #include "hash.h"
00008 #include "dnautil.h"
00009 #include "dnaseq.h"
00010 #include "fa.h"
00011 #include "nib.h"
00012 #include "twoBit.h"
00013 #include "psl.h"
00014 #include "sig.h"
00015 #include "options.h"
00016 #include "obscure.h"
00017 #include "genoFind.h"
00018 #include "trans3.h"
00019 #include "gfClientLib.h"
00020 
00021 static char const rcsid[] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $";
00022 
00023 /* Variables shared with other modules.  Set in this module, read only
00024  * elsewhere. */
00025 char *databaseName;             /* File name of database. */
00026 int databaseSeqCount = 0;       /* Number of sequences in database. */
00027 unsigned long databaseLetters = 0;      /* Number of bases in database. */
00028 
00029 enum constants {
00030     qWarnSize = 5000000, /* Warn if more than this many bases in one query. */
00031     };
00032 
00033 /* Variables that can be set from command line. */
00034 int tileSize = 11;
00035 int stepSize = 0;       /* Default (same as tileSize) */
00036 int minMatch = 2;
00037 int minScore = 30;
00038 int maxGap = 2;
00039 int repMatch = 1024*4;
00040 int dotEvery = 0;
00041 boolean oneOff = FALSE;
00042 boolean noHead = FALSE;
00043 boolean trimA = FALSE;
00044 boolean trimHardA = FALSE;
00045 boolean trimT = FALSE;
00046 boolean fastMap = FALSE;
00047 char *makeOoc = NULL;
00048 char *ooc = NULL;
00049 enum gfType qType = gftDna;
00050 enum gfType tType = gftDna;
00051 char *mask = NULL;
00052 char *repeats = NULL;
00053 char *qMask = NULL;
00054 double minRepDivergence = 15;
00055 double minIdentity = 90;
00056 char *outputFormat = "psl";
00057 
00058 
00059 void usage()
00060 /* Explain usage and exit. */
00061 {
00062 printf(
00063   "blat - Standalone BLAT v. %s fast sequence search command line tool\n"
00064   "usage:\n"
00065   "   blat database query [-ooc=11.ooc] output.psl\n"
00066   "where:\n"
00067   "   database and query are each either a .fa , .nib or .2bit file,\n"
00068   "   or a list these files one file name per line.\n"
00069   "   -ooc=11.ooc tells the program to load over-occurring 11-mers from\n"
00070   "               and external file.  This will increase the speed\n"
00071   "               by a factor of 40 in many cases, but is not required\n"
00072   "   output.psl is where to put the output.\n"
00073   "   Subranges of nib and .2bit files may specified using the syntax:\n"
00074   "      /path/file.nib:seqid:start-end\n"
00075   "   or\n"
00076   "      /path/file.2bit:seqid:start-end\n"
00077   "   or\n"
00078   "      /path/file.nib:start-end\n"
00079   "   With the second form, a sequence id of file:start-end will be used.\n"
00080   "options:\n"
00081   "   -t=type     Database type.  Type is one of:\n"
00082   "                 dna - DNA sequence\n"
00083   "                 prot - protein sequence\n"
00084   "                 dnax - DNA sequence translated in six frames to protein\n"
00085   "               The default is dna\n"
00086   "   -q=type     Query type.  Type is one of:\n"
00087   "                 dna - DNA sequence\n"
00088   "                 rna - RNA sequence\n"
00089   "                 prot - protein sequence\n"
00090   "                 dnax - DNA sequence translated in six frames to protein\n"
00091   "                 rnax - DNA sequence translated in three frames to protein\n"
00092   "               The default is dna\n"
00093   "   -prot       Synonymous with -t=prot -q=prot\n"
00094   "   -ooc=N.ooc  Use overused tile file N.ooc.  N should correspond to \n"
00095   "               the tileSize\n"
00096   "   -tileSize=N sets the size of match that triggers an alignment.  \n"
00097   "               Usually between 8 and 12\n"
00098   "               Default is 11 for DNA and 5 for protein.\n"
00099   "   -stepSize=N spacing between tiles. Default is tileSize.\n"
00100   "   -oneOff=N   If set to 1 this allows one mismatch in tile and still\n"
00101   "               triggers an alignments.  Default is 0.\n"
00102   "   -minMatch=N sets the number of tile matches.  Usually set from 2 to 4\n"
00103   "               Default is 2 for nucleotide, 1 for protein.\n"
00104   "   -minScore=N sets minimum score.  This is the matches minus the \n"
00105   "               mismatches minus some sort of gap penalty.  Default is 30\n"
00106   "   -minIdentity=N Sets minimum sequence identity (in percent).  Default is\n"
00107   "               90 for nucleotide searches, 25 for protein or translated\n"
00108   "               protein searches.\n"
00109   "   -maxGap=N   sets the size of maximum gap between tiles in a clump.  Usually\n"
00110   "               set from 0 to 3.  Default is 2. Only relevent for minMatch > 1.\n"
00111   "   -noHead     suppress .psl header (so it's just a tab-separated file)\n"
00112   "   -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n"
00113   "   -repMatch=N sets the number of repetitions of a tile allowed before\n"
00114   "               it is marked as overused.  Typically this is 256 for tileSize\n"
00115   "               12, 1024 for tile size 11, 4096 for tile size 10.\n"
00116   "               Default is 1024.  Typically only comes into play with makeOoc.\n"
00117   "               Also affected by stepSize. When stepSize is halved repMatch is\n"
00118   "               doubled to compensate.\n"
00119   "   -mask=type  Mask out repeats.  Alignments won't be started in masked region\n"
00120   "               but may extend through it in nucleotide searches.  Masked areas\n"
00121   "               are ignored entirely in protein or translated searches. Types are\n"
00122   "                 lower - mask out lower cased sequence\n"
00123   "                 upper - mask out upper cased sequence\n"
00124   "                 out   - mask according to database.out RepeatMasker .out file\n"
00125   "                 file.out - mask database according to RepeatMasker file.out\n"
00126   "   -qMask=type Mask out repeats in query sequence.  Similar to -mask above but\n"
00127   "               for query rather than target sequence.\n"
00128   "   -repeats=type Type is same as mask types above.  Repeat bases will not be\n"
00129   "               masked in any way, but matches in repeat areas will be reported\n"
00130   "               separately from matches in other areas in the psl output.\n"
00131   "   -minRepDivergence=NN - minimum percent divergence of repeats to allow \n"
00132   "               them to be unmasked.  Default is 15.  Only relevant for \n"
00133   "               masking using RepeatMasker .out files.\n"
00134   "   -dots=N     Output dot every N sequences to show program's progress\n"
00135   "   -trimT      Trim leading poly-T\n"
00136   "   -noTrimA    Don't trim trailing poly-A\n"
00137   "   -trimHardA  Remove poly-A tail from qSize as well as alignments in \n"
00138   "               psl output\n"
00139   "   -fastMap    Run for fast DNA/DNA remapping - not allowing introns, \n"
00140   "               requiring high %%ID\n"
00141   "   -out=type   Controls output file format.  Type is one of:\n"
00142   "                   psl - Default.  Tab separated format, no sequence\n"
00143   "                   pslx - Tab separated format with sequence\n"
00144   "                   axt - blastz-associated axt format\n"
00145   "                   maf - multiz-associated maf format\n"
00146   "                   sim4 - similar to sim4 format\n"
00147   "                   wublast - similar to wublast format\n"
00148   "                   blast - similar to NCBI blast format\n"
00149   "                   blast8- NCBI blast tabular format\n"
00150   "                   blast9 - NCBI blast tabular format with comments\n"
00151   "   -fine       For high quality mRNAs look harder for small initial and\n"
00152   "               terminal exons.  Not recommended for ESTs\n"
00153   "   -maxIntron=N  Sets maximum intron size. Default is %d\n"
00154   "   -extendThroughN - Allows extension of alignment through large blocks of N's\n"
00155   , gfVersion, ffIntronMaxDefault
00156   );
00157 exit(-1);
00158 }
00159 
00160 
00161 struct optionSpec options[] = {
00162    {"t", OPTION_STRING},
00163    {"q", OPTION_STRING},
00164    {"prot", OPTION_BOOLEAN},
00165    {"ooc", OPTION_STRING},
00166    {"tileSize", OPTION_INT},
00167    {"stepSize", OPTION_INT},
00168    {"oneOff", OPTION_INT},
00169    {"minMatch", OPTION_INT},
00170    {"minScore", OPTION_INT},
00171    {"minIdentity", OPTION_FLOAT},
00172    {"maxGap", OPTION_INT},
00173    {"noHead", OPTION_BOOLEAN},
00174    {"makeOoc", OPTION_STRING},
00175    {"repMatch", OPTION_INT},
00176    {"mask", OPTION_STRING},
00177    {"qMask", OPTION_STRING},
00178    {"repeats", OPTION_STRING},
00179    {"minRepDivergence", OPTION_FLOAT},
00180    {"dots", OPTION_INT},
00181    {"trimT", OPTION_BOOLEAN},
00182    {"noTrimA", OPTION_BOOLEAN},
00183    {"trimHardA", OPTION_BOOLEAN},
00184    {"fastMap", OPTION_BOOLEAN},
00185    {"out", OPTION_STRING},
00186    {"fine", OPTION_BOOLEAN},
00187    {"maxIntron", OPTION_INT},
00188    {"extendThroughN", OPTION_BOOLEAN},
00189    {NULL, 0},
00190 };
00191 
00192 
00193 /* Stuff to support various output formats. */
00194 struct gfOutput *gvo;           /* Overall output controller */
00195 
00196 void searchOneStrand(struct dnaSeq *seq, struct genoFind *gf, FILE *psl, 
00197         boolean isRc, struct hash *maskHash, Bits *qMaskBits)
00198 /* Search for seq in index, align it, and write results to psl. */
00199 {
00200 gfLongDnaInMem(seq, gf, isRc, minScore, qMaskBits, gvo, fastMap, optionExists("fine"));
00201 }
00202 
00203 
00204 void searchOneProt(aaSeq *seq, struct genoFind *gf, FILE *f)
00205 /* Search for protein seq in index and write results to psl. */
00206 {
00207 int hitCount;
00208 struct lm *lm = lmInit(0);
00209 struct gfClump *clumpList = gfFindClumps(gf, seq, lm, &hitCount);
00210 gfAlignAaClumps(gf, clumpList, seq, FALSE, minScore, gvo);
00211 gfClumpFreeList(&clumpList);
00212 lmCleanup(&lm);
00213 }
00214 
00215 void dotOut()
00216 /* Put out a dot every now and then if user want's to. */
00217 {
00218 static int mod = 1;
00219 if (dotEvery > 0)
00220     {
00221     if (--mod <= 0)
00222         {
00223         fputc('.', stdout);
00224         fflush(stdout);
00225         mod = dotEvery;
00226         }
00227     }
00228 }
00229 
00230 void searchOne(bioSeq *seq, struct genoFind *gf, FILE *f, boolean isProt, struct hash *maskHash, Bits *qMaskBits)
00231 /* Search for seq on either strand in index. */
00232 {
00233 dotOut();
00234 if (isProt)
00235     {
00236     searchOneProt(seq, gf, f);
00237     }
00238 else
00239     {
00240     gvo->maskHash = maskHash;
00241     searchOneStrand(seq, gf, f, FALSE, maskHash, qMaskBits);
00242     reverseComplement(seq->dna, seq->size);
00243     searchOneStrand(seq, gf, f, TRUE, maskHash, qMaskBits);
00244     reverseComplement(seq->dna, seq->size);
00245     }
00246 gfOutputQuery(gvo, f);
00247 }
00248 
00249 void trimSeq(struct dnaSeq *seq, struct dnaSeq *trimmed)
00250 /* Copy seq to trimmed (shallow copy) and optionally trim
00251  * off polyA tail or polyT head. */
00252 {
00253 DNA *dna = seq->dna;
00254 int size = seq->size;
00255 *trimmed = *seq;
00256 if (trimT)
00257     maskHeadPolyT(dna, size);
00258 if (trimA || trimHardA)
00259     {
00260     int trimSize = maskTailPolyA(dna, size);
00261     if (trimHardA)
00262         {
00263         trimmed->size -= trimSize;
00264         dna[size-trimSize] = 0;
00265         }
00266     }
00267 }
00268 
00269 
00270 Bits *maskQuerySeq(struct dnaSeq *seq, boolean isProt, 
00271         boolean maskQuery, boolean lcMask)
00272 /* Massage query sequence a bit, converting it to correct
00273  * case (upper for protein/lower for DNA) and optionally
00274  * returning upper/lower case info , and trimming poly A. */
00275 {
00276 Bits *qMaskBits = NULL;
00277 verbose(2, "%s\n", seq->name);
00278 if (isProt)
00279     faToProtein(seq->dna, seq->size);
00280 else
00281     {
00282     if (maskQuery)
00283         {
00284         if (lcMask)
00285             toggleCase(seq->dna, seq->size);
00286         qMaskBits = maskFromUpperCaseSeq(seq);
00287         }
00288     faToDna(seq->dna, seq->size);
00289     }
00290 if (seq->size > qWarnSize)
00291     {
00292     warn("Query sequence %s has size %d, it might take a while.",
00293          seq->name, seq->size);
00294     }
00295 return qMaskBits;
00296 }
00297             
00298 void searchOneMaskTrim(struct dnaSeq *seq, boolean isProt,
00299                        struct genoFind *gf, FILE *outFile,
00300                        struct hash *maskHash,
00301                        unsigned long *retTotalSize, int *retCount)
00302 /* Search a single sequence against a single genoFind index. */
00303 {
00304 boolean maskQuery = (qMask != NULL);
00305 boolean lcMask = (qMask != NULL && sameWord(qMask, "lower"));
00306 Bits *qMaskBits = maskQuerySeq(seq, isProt, maskQuery, lcMask);
00307 struct dnaSeq trimmedSeq;
00308 ZeroVar(&trimmedSeq);
00309 trimSeq(seq, &trimmedSeq);
00310 searchOne(&trimmedSeq, gf, outFile, isProt, maskHash, qMaskBits);
00311 *retTotalSize += seq->size;
00312 *retCount += 1;
00313 bitFree(&qMaskBits);
00314 }
00315 
00316 void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName, 
00317         boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
00318 /* Search all sequences in all files against single genoFind index. */
00319 {
00320 int i;
00321 char *fileName;
00322 int count = 0; 
00323 unsigned long totalSize = 0;
00324 
00325 gfOutputHead(gvo, outFile);
00326 for (i=0; i<fileCount; ++i)
00327     {
00328     fileName = files[i];
00329     if (nibIsFile(fileName))
00330         {
00331         struct dnaSeq *seq;
00332 
00333         if (isProt)
00334             errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
00335         seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00336         freez(&seq->name);
00337         seq->name = cloneString(fileName);
00338         searchOneMaskTrim(seq, isProt, gf, outFile,
00339                           maskHash, &totalSize, &count);
00340         freeDnaSeq(&seq);
00341         }
00342     else if (twoBitIsSpec(fileName))
00343         {
00344         struct twoBitSpec *tbs = twoBitSpecNew(fileName);
00345         struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
00346         if (isProt)
00347             errAbort("%s is a two bit file, which doesn't work for proteins.", 
00348                 fileName);
00349         if (tbs->seqs != NULL)
00350             {
00351             struct twoBitSeqSpec *ss = NULL;
00352             for (ss = tbs->seqs;  ss != NULL;  ss = ss->next)
00353                 {
00354                 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name,
00355                                                        ss->start, ss->end);
00356                 searchOneMaskTrim(seq, isProt, gf, outFile,
00357                                   maskHash, &totalSize, &count);
00358                 dnaSeqFree(&seq);
00359                 }
00360             }
00361         else
00362             {
00363             struct twoBitIndex *index = NULL;
00364             for (index = tbf->indexList; index != NULL; index = index->next)
00365                 {
00366                 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
00367                 searchOneMaskTrim(seq, isProt, gf, outFile,
00368                                   maskHash, &totalSize, &count);
00369                 dnaSeqFree(&seq);
00370                 }
00371             }
00372         twoBitClose(&tbf);
00373         }
00374     else
00375         {
00376         static struct dnaSeq seq;
00377         struct lineFile *lf = lineFileOpen(fileName, TRUE);
00378         while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
00379             {
00380             searchOneMaskTrim(&seq, isProt, gf, outFile,
00381                               maskHash, &totalSize, &count);
00382             }
00383         lineFileClose(&lf);
00384         }
00385     }
00386 carefulClose(&outFile);
00387 if (showStatus)
00388     printf("Searched %lu bases in %d sequences\n", totalSize, count);
00389 }
00390 
00391 struct trans3 *seqListToTrans3List(struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash)
00392 /* Convert sequence list to a trans3 list and lists for each of three frames. */
00393 {
00394 int frame;
00395 struct dnaSeq *seq;
00396 struct trans3 *t3List = NULL, *t3;
00397 struct hash *hash = newHash(0);
00398 
00399 for (seq = seqList; seq != NULL; seq = seq->next)
00400     {
00401     t3 = trans3New(seq);
00402     hashAddUnique(hash, t3->name, t3);
00403     slAddHead(&t3List, t3);
00404     for (frame = 0; frame < 3; ++frame)
00405         {
00406         slAddHead(&transLists[frame], t3->trans[frame]);
00407         }
00408     }
00409 slReverse(&t3List);
00410 for (frame = 0; frame < 3; ++frame)
00411     {
00412     slReverse(&transLists[frame]);
00413     }
00414 *retHash = hash;
00415 return t3List;
00416 }
00417 
00418 void tripleSearch(aaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, FILE *f)
00419 /* Look for qSeq in indices for three frames.  Then do rest of alignment. */
00420 {
00421 gvo->reportTargetStrand = TRUE;
00422 gfFindAlignAaTrans(gfs, qSeq, t3Hash, dbIsRc, minScore, gvo);
00423 }
00424 
00425 void transTripleSearch(struct dnaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, 
00426         boolean dbIsRc, boolean qIsDna, FILE *f)
00427 /* Translate qSeq three ways and look for each in three frames of index. */
00428 {
00429 int qIsRc;
00430 gvo->reportTargetStrand = TRUE;
00431 for (qIsRc = 0; qIsRc <= qIsDna; qIsRc += 1)
00432     {
00433     gfLongTransTransInMem(qSeq, gfs, t3Hash, qIsRc, dbIsRc, !qIsDna, minScore, gvo);
00434     if (qIsDna)
00435         reverseComplement(qSeq->dna, qSeq->size);
00436     }
00437 }
00438 
00439 void bigBlat(struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile, boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus)
00440 /* Run query against translated DNA database (3 frames on each strand). */
00441 {
00442 int frame, i;
00443 struct dnaSeq *seq, trimmedSeq;
00444 struct genoFind *gfs[3];
00445 aaSeq *dbSeqLists[3];
00446 struct trans3 *t3List = NULL;
00447 int isRc;
00448 struct lineFile *lf = NULL;
00449 struct hash *t3Hash = NULL;
00450 boolean forceUpper = FALSE;
00451 boolean forceLower = FALSE;
00452 boolean toggle = FALSE;
00453 boolean maskUpper = FALSE;
00454 
00455 ZeroVar(&trimmedSeq);
00456 if (showStatus)
00457     printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);
00458 
00459 /* Figure out how to manage query case.  Proteins want to be in
00460  * upper case, generally, nucleotides in lower case.  But there
00461  * may be repeatMasking based on case as well. */
00462 if (transQuery)
00463     {
00464     if (qMask == NULL)
00465        forceLower = TRUE;
00466     else
00467        {
00468        maskUpper = TRUE;
00469        toggle = !sameString(qMask, "upper");
00470        }
00471     }
00472 else
00473     {
00474     forceUpper = TRUE;
00475     }
00476 
00477 if (gvo->fileHead != NULL)
00478     gvo->fileHead(gvo, out);
00479 
00480 for (isRc = FALSE; isRc <= 1; ++isRc)
00481     {
00482     /* Initialize local pointer arrays to NULL to prevent surprises. */
00483     for (frame = 0; frame < 3; ++frame)
00484         {
00485         gfs[frame] = NULL;
00486         dbSeqLists[frame] = NULL;
00487         }
00488 
00489     t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
00490     for (frame = 0; frame < 3; ++frame)
00491         {
00492         gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize, 
00493                 repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
00494         }
00495 
00496     for (i=0; i<queryCount; ++i)
00497         {
00498         aaSeq qSeq;
00499 
00500         lf = lineFileOpen(queryFiles[i], TRUE);
00501         while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
00502             {
00503             dotOut();
00504             /* Put it into right case and optionally mask on case. */
00505             if (forceLower)
00506                 toLowerN(qSeq.dna, qSeq.size);
00507             else if (forceUpper)
00508                 toUpperN(qSeq.dna, qSeq.size);
00509             else if (maskUpper)
00510                 {
00511                 if (toggle)
00512                     toggleCase(qSeq.dna, qSeq.size);
00513                 upperToN(qSeq.dna, qSeq.size);
00514                 }
00515             if (qSeq.size > qWarnSize)
00516                 {
00517                 warn("Query sequence %s has size %d, it might take a while.",
00518                      qSeq.name, qSeq.size);
00519                 }
00520             trimSeq(&qSeq, &trimmedSeq);
00521             if (transQuery)
00522                 transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out);
00523             else
00524                 tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out);
00525             gfOutputQuery(gvo, out);
00526             }
00527         lineFileClose(&lf);
00528         }
00529 
00530     /* Clean up time. */
00531     trans3FreeList(&t3List);
00532     freeHash(&t3Hash);
00533     for (frame = 0; frame < 3; ++frame)
00534         {
00535         genoFindFree(&gfs[frame]);
00536         }
00537 
00538     for (seq = untransList; seq != NULL; seq = seq->next)
00539         {
00540         reverseComplement(seq->dna, seq->size);
00541         }
00542     }
00543 carefulClose(&out);
00544 }
00545 
00546 
00547 void blat(char *dbFile, char *queryFile, char *outName)
00548 /* blat - Standalone BLAT fast sequence search command line tool. */
00549 {
00550 char **dbFiles, **queryFiles;
00551 int dbCount, queryCount;
00552 struct dnaSeq *dbSeqList, *seq;
00553 struct genoFind *gf;
00554 boolean tIsProt = (tType == gftProt);
00555 boolean qIsProt = (qType == gftProt);
00556 boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna));
00557 boolean bothSimpleProt = (tIsProt && qIsProt);
00558 FILE *f = mustOpen(outName, "w");
00559 boolean showStatus = (f != stdout);
00560 
00561 databaseName = dbFile;
00562 gfClientFileArray(dbFile, &dbFiles, &dbCount);
00563 if (makeOoc != NULL)
00564     {
00565     gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType);
00566     if (showStatus)
00567         printf("Done making %s\n", makeOoc);
00568     exit(0);
00569     }
00570 gfClientFileArray(queryFile, &queryFiles, &queryCount);
00571 dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, 
00572         minRepDivergence, showStatus);
00573 databaseSeqCount = slCount(dbSeqList);
00574 for (seq = dbSeqList; seq != NULL; seq = seq->next)
00575     databaseLetters += seq->size;
00576 
00577 gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, 
00578         databaseName, databaseSeqCount, databaseLetters, minIdentity, f);
00579 
00580 if (bothSimpleNuc || bothSimpleProt)
00581     {
00582     struct hash *maskHash = NULL;
00583 
00584     /* Save away masking info for output. */
00585     if (repeats != NULL)
00586         {
00587         maskHash = newHash(0);
00588         for (seq = dbSeqList; seq != NULL; seq = seq->next)
00589             {
00590             Bits *maskedBits = maskFromUpperCaseSeq(seq);
00591             hashAdd(maskHash, seq->name, maskedBits);
00592             }
00593         }
00594 
00595     /* Handle masking and indexing.  If masking is off, we want the indexer
00596      * to see unmasked sequence, otherwise we want it to see masked.  However
00597      * after indexing we always want it unmasked, because things are always
00598      * unmasked for the extension phase. */
00599     if (mask == NULL && !bothSimpleProt)
00600         gfClientUnmask(dbSeqList);
00601     gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, 
00602         tIsProt, oneOff, FALSE, stepSize);
00603     if (mask != NULL)
00604         gfClientUnmask(dbSeqList);
00605 
00606     searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus);
00607     freeHash(&maskHash);
00608     }
00609 else if (tType == gftDnaX && qType == gftProt)
00610     {
00611     bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus);
00612     }
00613 else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX))
00614     {
00615     bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus);
00616     }
00617 else
00618     {
00619     errAbort("Unrecognized combination of target and query types\n");
00620     }
00621 if (dotEvery > 0)
00622     printf("\n");
00623 freeDnaSeqList(&dbSeqList);
00624 }
00625 
00626 int main(int argc, char *argv[])
00627 /* Process command line into global variables and call blat. */
00628 {
00629 boolean dIsProtLike, qIsProtLike;
00630 
00631 #ifdef DEBUG
00632 {
00633 char *cmd = "blat hCrea.geno hCrea.mrna foo.psl -t=dnax -q=rnax";
00634 char *words[16];
00635 
00636 printf("Debugging parameters\n");
00637 cmd = cloneString(cmd);
00638 argc = chopLine(cmd, words);
00639 argv = words;
00640 }
00641 #endif /* DEBUG */
00642 
00643 optionInit(&argc, argv, options);
00644 if (argc != 4)
00645     usage();
00646 
00647 /* Get database and query sequence types and make sure they are
00648  * legal and compatable. */
00649 if (optionExists("prot"))
00650     qType = tType = gftProt;
00651 if (optionExists("t"))
00652     tType = gfTypeFromName(optionVal("t", NULL));
00653 trimA = optionExists("trimA") || optionExists("trima");
00654 trimT = optionExists("trimT") || optionExists("trimt");
00655 trimHardA = optionExists("trimHardA");
00656 switch (tType)
00657     {
00658     case gftProt:
00659     case gftDnaX:
00660         dIsProtLike = TRUE;
00661         break;
00662     case gftDna:
00663         dIsProtLike = FALSE;
00664         break;
00665     default:
00666         dIsProtLike = FALSE;
00667         errAbort("Illegal value for 't' parameter");
00668         break;
00669     }
00670 if (optionExists("q"))
00671     qType = gfTypeFromName(optionVal("q", NULL));
00672 if (qType == gftRnaX || qType == gftRna)
00673     trimA = TRUE;
00674 if (optionExists("noTrimA"))
00675     trimA = FALSE;
00676 switch (qType)
00677     {
00678     case gftProt:
00679     case gftDnaX:
00680     case gftRnaX:
00681         minIdentity = 25;
00682         qIsProtLike = TRUE;
00683         break;
00684     default:
00685         qIsProtLike = FALSE;
00686         break;
00687     }
00688 if ((dIsProtLike ^ qIsProtLike) != 0)
00689     errAbort("d and q must both be either protein or dna");
00690 
00691 /* Set default tile size for protein-based comparisons. */
00692 if (dIsProtLike)
00693     {
00694     tileSize = 5;
00695     minMatch = 1;
00696     oneOff = FALSE;
00697     maxGap = 0;
00698     }
00699 
00700 /* Get tile size and related parameters from user and make sure
00701  * they are within range. */
00702 tileSize = optionInt("tileSize", tileSize);
00703 stepSize = optionInt("stepSize", stepSize);
00704 minMatch = optionInt("minMatch", minMatch);
00705 oneOff = optionExists("oneOff");
00706 fastMap = optionExists("fastMap");
00707 minScore = optionInt("minScore", minScore);
00708 maxGap = optionInt("maxGap", maxGap);
00709 minRepDivergence = optionFloat("minRepDivergence", minRepDivergence);
00710 minIdentity = optionFloat("minIdentity", minIdentity);
00711 gfCheckTileSize(tileSize, dIsProtLike);
00712 if (minMatch < 0)
00713     errAbort("minMatch must be at least 1");
00714 if (maxGap > 100)
00715     errAbort("maxGap must be less than 100");
00716 
00717 
00718 
00719 /* Set repMatch parameter from command line, or
00720  * to reasonable value that depends on tile size. */
00721 if (optionExists("repMatch"))
00722     repMatch = optionInt("repMatch", repMatch);
00723 else
00724     {
00725     if (dIsProtLike)
00726         {
00727         if (tileSize == 3)
00728             repMatch = 600000;
00729         else if (tileSize == 4)
00730             repMatch = 30000;
00731         else if (tileSize == 5)
00732             repMatch = 1500;
00733         else if (tileSize == 6)
00734             repMatch = 75;
00735         else if (tileSize <= 7)
00736             repMatch = 10;
00737         }
00738     else
00739         {
00740         if (tileSize == 15)
00741             repMatch = 16;
00742         else if (tileSize == 14)
00743             repMatch = 32;
00744         else if (tileSize == 13)
00745             repMatch = 128;
00746         else if (tileSize == 12)
00747             repMatch = 256;
00748         else if (tileSize == 11)
00749             repMatch = 4*256;
00750         else if (tileSize == 10)
00751             repMatch = 16*256;
00752         else if (tileSize == 9)
00753             repMatch = 64*256;
00754         else if (tileSize == 8)
00755             repMatch = 256*256;
00756         else if (tileSize == 7)
00757             repMatch = 1024*256;
00758         else if (tileSize == 6)
00759             repMatch = 4*1024*256;
00760         }
00761     }
00762 
00763 /* Gather last few command line options. */
00764 noHead = optionExists("noHead");
00765 ooc = optionVal("ooc", NULL);
00766 makeOoc = optionVal("makeOoc", NULL);
00767 mask = optionVal("mask", NULL);
00768 qMask = optionVal("qMask", NULL);
00769 repeats = optionVal("repeats", NULL);
00770 if (repeats != NULL && mask != NULL && differentString(repeats, mask))
00771     errAbort("The -mask and -repeat settings disagree.  "
00772              "You can just omit -repeat if -mask is on");
00773 if (mask != NULL)       /* Mask setting will also set repeats. */
00774     repeats = mask;
00775 outputFormat = optionVal("out", outputFormat);
00776 dotEvery = optionInt("dots", 0);
00777 /* set global for fuzzy find functions */
00778 setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault));
00779 setFfExtendThroughN(optionExists("extendThroughN"));
00780 
00781 
00782 /* Call routine that does the work. */
00783 blat(argv[1], argv[2], argv[3]);
00784 return 0;
00785 }

Generated on Tue Dec 25 18:39:29 2007 for blat by  doxygen 1.5.2