00001
00002
00003 #include "common.h"
00004 #include "memalloc.h"
00005 #include "linefile.h"
00006 #include "bits.h"
00007 #include "hash.h"
00008 #include "dnautil.h"
00009 #include "dnaseq.h"
00010 #include "fa.h"
00011 #include "nib.h"
00012 #include "twoBit.h"
00013 #include "psl.h"
00014 #include "sig.h"
00015 #include "options.h"
00016 #include "obscure.h"
00017 #include "genoFind.h"
00018 #include "trans3.h"
00019 #include "gfClientLib.h"
00020
00021 static char const rcsid[] = "$Id: blat.c,v 1.111 2006/12/01 18:30:43 kent Exp $";
00022
00023
00024
00025 char *databaseName;
00026 int databaseSeqCount = 0;
00027 unsigned long databaseLetters = 0;
00028
00029 enum constants {
00030 qWarnSize = 5000000,
00031 };
00032
00033
00034 int tileSize = 11;
00035 int stepSize = 0;
00036 int minMatch = 2;
00037 int minScore = 30;
00038 int maxGap = 2;
00039 int repMatch = 1024*4;
00040 int dotEvery = 0;
00041 boolean oneOff = FALSE;
00042 boolean noHead = FALSE;
00043 boolean trimA = FALSE;
00044 boolean trimHardA = FALSE;
00045 boolean trimT = FALSE;
00046 boolean fastMap = FALSE;
00047 char *makeOoc = NULL;
00048 char *ooc = NULL;
00049 enum gfType qType = gftDna;
00050 enum gfType tType = gftDna;
00051 char *mask = NULL;
00052 char *repeats = NULL;
00053 char *qMask = NULL;
00054 double minRepDivergence = 15;
00055 double minIdentity = 90;
00056 char *outputFormat = "psl";
00057
00058
00059 void usage()
00060
00061 {
00062 printf(
00063 "blat - Standalone BLAT v. %s fast sequence search command line tool\n"
00064 "usage:\n"
00065 " blat database query [-ooc=11.ooc] output.psl\n"
00066 "where:\n"
00067 " database and query are each either a .fa , .nib or .2bit file,\n"
00068 " or a list these files one file name per line.\n"
00069 " -ooc=11.ooc tells the program to load over-occurring 11-mers from\n"
00070 " and external file. This will increase the speed\n"
00071 " by a factor of 40 in many cases, but is not required\n"
00072 " output.psl is where to put the output.\n"
00073 " Subranges of nib and .2bit files may specified using the syntax:\n"
00074 " /path/file.nib:seqid:start-end\n"
00075 " or\n"
00076 " /path/file.2bit:seqid:start-end\n"
00077 " or\n"
00078 " /path/file.nib:start-end\n"
00079 " With the second form, a sequence id of file:start-end will be used.\n"
00080 "options:\n"
00081 " -t=type Database type. Type is one of:\n"
00082 " dna - DNA sequence\n"
00083 " prot - protein sequence\n"
00084 " dnax - DNA sequence translated in six frames to protein\n"
00085 " The default is dna\n"
00086 " -q=type Query type. Type is one of:\n"
00087 " dna - DNA sequence\n"
00088 " rna - RNA sequence\n"
00089 " prot - protein sequence\n"
00090 " dnax - DNA sequence translated in six frames to protein\n"
00091 " rnax - DNA sequence translated in three frames to protein\n"
00092 " The default is dna\n"
00093 " -prot Synonymous with -t=prot -q=prot\n"
00094 " -ooc=N.ooc Use overused tile file N.ooc. N should correspond to \n"
00095 " the tileSize\n"
00096 " -tileSize=N sets the size of match that triggers an alignment. \n"
00097 " Usually between 8 and 12\n"
00098 " Default is 11 for DNA and 5 for protein.\n"
00099 " -stepSize=N spacing between tiles. Default is tileSize.\n"
00100 " -oneOff=N If set to 1 this allows one mismatch in tile and still\n"
00101 " triggers an alignments. Default is 0.\n"
00102 " -minMatch=N sets the number of tile matches. Usually set from 2 to 4\n"
00103 " Default is 2 for nucleotide, 1 for protein.\n"
00104 " -minScore=N sets minimum score. This is the matches minus the \n"
00105 " mismatches minus some sort of gap penalty. Default is 30\n"
00106 " -minIdentity=N Sets minimum sequence identity (in percent). Default is\n"
00107 " 90 for nucleotide searches, 25 for protein or translated\n"
00108 " protein searches.\n"
00109 " -maxGap=N sets the size of maximum gap between tiles in a clump. Usually\n"
00110 " set from 0 to 3. Default is 2. Only relevent for minMatch > 1.\n"
00111 " -noHead suppress .psl header (so it's just a tab-separated file)\n"
00112 " -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n"
00113 " -repMatch=N sets the number of repetitions of a tile allowed before\n"
00114 " it is marked as overused. Typically this is 256 for tileSize\n"
00115 " 12, 1024 for tile size 11, 4096 for tile size 10.\n"
00116 " Default is 1024. Typically only comes into play with makeOoc.\n"
00117 " Also affected by stepSize. When stepSize is halved repMatch is\n"
00118 " doubled to compensate.\n"
00119 " -mask=type Mask out repeats. Alignments won't be started in masked region\n"
00120 " but may extend through it in nucleotide searches. Masked areas\n"
00121 " are ignored entirely in protein or translated searches. Types are\n"
00122 " lower - mask out lower cased sequence\n"
00123 " upper - mask out upper cased sequence\n"
00124 " out - mask according to database.out RepeatMasker .out file\n"
00125 " file.out - mask database according to RepeatMasker file.out\n"
00126 " -qMask=type Mask out repeats in query sequence. Similar to -mask above but\n"
00127 " for query rather than target sequence.\n"
00128 " -repeats=type Type is same as mask types above. Repeat bases will not be\n"
00129 " masked in any way, but matches in repeat areas will be reported\n"
00130 " separately from matches in other areas in the psl output.\n"
00131 " -minRepDivergence=NN - minimum percent divergence of repeats to allow \n"
00132 " them to be unmasked. Default is 15. Only relevant for \n"
00133 " masking using RepeatMasker .out files.\n"
00134 " -dots=N Output dot every N sequences to show program's progress\n"
00135 " -trimT Trim leading poly-T\n"
00136 " -noTrimA Don't trim trailing poly-A\n"
00137 " -trimHardA Remove poly-A tail from qSize as well as alignments in \n"
00138 " psl output\n"
00139 " -fastMap Run for fast DNA/DNA remapping - not allowing introns, \n"
00140 " requiring high %%ID\n"
00141 " -out=type Controls output file format. Type is one of:\n"
00142 " psl - Default. Tab separated format, no sequence\n"
00143 " pslx - Tab separated format with sequence\n"
00144 " axt - blastz-associated axt format\n"
00145 " maf - multiz-associated maf format\n"
00146 " sim4 - similar to sim4 format\n"
00147 " wublast - similar to wublast format\n"
00148 " blast - similar to NCBI blast format\n"
00149 " blast8- NCBI blast tabular format\n"
00150 " blast9 - NCBI blast tabular format with comments\n"
00151 " -fine For high quality mRNAs look harder for small initial and\n"
00152 " terminal exons. Not recommended for ESTs\n"
00153 " -maxIntron=N Sets maximum intron size. Default is %d\n"
00154 " -extendThroughN - Allows extension of alignment through large blocks of N's\n"
00155 , gfVersion, ffIntronMaxDefault
00156 );
00157 exit(-1);
00158 }
00159
00160
00161 struct optionSpec options[] = {
00162 {"t", OPTION_STRING},
00163 {"q", OPTION_STRING},
00164 {"prot", OPTION_BOOLEAN},
00165 {"ooc", OPTION_STRING},
00166 {"tileSize", OPTION_INT},
00167 {"stepSize", OPTION_INT},
00168 {"oneOff", OPTION_INT},
00169 {"minMatch", OPTION_INT},
00170 {"minScore", OPTION_INT},
00171 {"minIdentity", OPTION_FLOAT},
00172 {"maxGap", OPTION_INT},
00173 {"noHead", OPTION_BOOLEAN},
00174 {"makeOoc", OPTION_STRING},
00175 {"repMatch", OPTION_INT},
00176 {"mask", OPTION_STRING},
00177 {"qMask", OPTION_STRING},
00178 {"repeats", OPTION_STRING},
00179 {"minRepDivergence", OPTION_FLOAT},
00180 {"dots", OPTION_INT},
00181 {"trimT", OPTION_BOOLEAN},
00182 {"noTrimA", OPTION_BOOLEAN},
00183 {"trimHardA", OPTION_BOOLEAN},
00184 {"fastMap", OPTION_BOOLEAN},
00185 {"out", OPTION_STRING},
00186 {"fine", OPTION_BOOLEAN},
00187 {"maxIntron", OPTION_INT},
00188 {"extendThroughN", OPTION_BOOLEAN},
00189 {NULL, 0},
00190 };
00191
00192
00193
00194 struct gfOutput *gvo;
00195
00196 void searchOneStrand(struct dnaSeq *seq, struct genoFind *gf, FILE *psl,
00197 boolean isRc, struct hash *maskHash, Bits *qMaskBits)
00198
00199 {
00200 gfLongDnaInMem(seq, gf, isRc, minScore, qMaskBits, gvo, fastMap, optionExists("fine"));
00201 }
00202
00203
00204 void searchOneProt(aaSeq *seq, struct genoFind *gf, FILE *f)
00205
00206 {
00207 int hitCount;
00208 struct lm *lm = lmInit(0);
00209 struct gfClump *clumpList = gfFindClumps(gf, seq, lm, &hitCount);
00210 gfAlignAaClumps(gf, clumpList, seq, FALSE, minScore, gvo);
00211 gfClumpFreeList(&clumpList);
00212 lmCleanup(&lm);
00213 }
00214
00215 void dotOut()
00216
00217 {
00218 static int mod = 1;
00219 if (dotEvery > 0)
00220 {
00221 if (--mod <= 0)
00222 {
00223 fputc('.', stdout);
00224 fflush(stdout);
00225 mod = dotEvery;
00226 }
00227 }
00228 }
00229
00230 void searchOne(bioSeq *seq, struct genoFind *gf, FILE *f, boolean isProt, struct hash *maskHash, Bits *qMaskBits)
00231
00232 {
00233 dotOut();
00234 if (isProt)
00235 {
00236 searchOneProt(seq, gf, f);
00237 }
00238 else
00239 {
00240 gvo->maskHash = maskHash;
00241 searchOneStrand(seq, gf, f, FALSE, maskHash, qMaskBits);
00242 reverseComplement(seq->dna, seq->size);
00243 searchOneStrand(seq, gf, f, TRUE, maskHash, qMaskBits);
00244 reverseComplement(seq->dna, seq->size);
00245 }
00246 gfOutputQuery(gvo, f);
00247 }
00248
00249 void trimSeq(struct dnaSeq *seq, struct dnaSeq *trimmed)
00250
00251
00252 {
00253 DNA *dna = seq->dna;
00254 int size = seq->size;
00255 *trimmed = *seq;
00256 if (trimT)
00257 maskHeadPolyT(dna, size);
00258 if (trimA || trimHardA)
00259 {
00260 int trimSize = maskTailPolyA(dna, size);
00261 if (trimHardA)
00262 {
00263 trimmed->size -= trimSize;
00264 dna[size-trimSize] = 0;
00265 }
00266 }
00267 }
00268
00269
00270 Bits *maskQuerySeq(struct dnaSeq *seq, boolean isProt,
00271 boolean maskQuery, boolean lcMask)
00272
00273
00274
00275 {
00276 Bits *qMaskBits = NULL;
00277 verbose(2, "%s\n", seq->name);
00278 if (isProt)
00279 faToProtein(seq->dna, seq->size);
00280 else
00281 {
00282 if (maskQuery)
00283 {
00284 if (lcMask)
00285 toggleCase(seq->dna, seq->size);
00286 qMaskBits = maskFromUpperCaseSeq(seq);
00287 }
00288 faToDna(seq->dna, seq->size);
00289 }
00290 if (seq->size > qWarnSize)
00291 {
00292 warn("Query sequence %s has size %d, it might take a while.",
00293 seq->name, seq->size);
00294 }
00295 return qMaskBits;
00296 }
00297
00298 void searchOneMaskTrim(struct dnaSeq *seq, boolean isProt,
00299 struct genoFind *gf, FILE *outFile,
00300 struct hash *maskHash,
00301 unsigned long *retTotalSize, int *retCount)
00302
00303 {
00304 boolean maskQuery = (qMask != NULL);
00305 boolean lcMask = (qMask != NULL && sameWord(qMask, "lower"));
00306 Bits *qMaskBits = maskQuerySeq(seq, isProt, maskQuery, lcMask);
00307 struct dnaSeq trimmedSeq;
00308 ZeroVar(&trimmedSeq);
00309 trimSeq(seq, &trimmedSeq);
00310 searchOne(&trimmedSeq, gf, outFile, isProt, maskHash, qMaskBits);
00311 *retTotalSize += seq->size;
00312 *retCount += 1;
00313 bitFree(&qMaskBits);
00314 }
00315
00316 void searchOneIndex(int fileCount, char *files[], struct genoFind *gf, char *outName,
00317 boolean isProt, struct hash *maskHash, FILE *outFile, boolean showStatus)
00318
00319 {
00320 int i;
00321 char *fileName;
00322 int count = 0;
00323 unsigned long totalSize = 0;
00324
00325 gfOutputHead(gvo, outFile);
00326 for (i=0; i<fileCount; ++i)
00327 {
00328 fileName = files[i];
00329 if (nibIsFile(fileName))
00330 {
00331 struct dnaSeq *seq;
00332
00333 if (isProt)
00334 errAbort("%s: Can't use .nib files with -prot or d=prot option\n", fileName);
00335 seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName);
00336 freez(&seq->name);
00337 seq->name = cloneString(fileName);
00338 searchOneMaskTrim(seq, isProt, gf, outFile,
00339 maskHash, &totalSize, &count);
00340 freeDnaSeq(&seq);
00341 }
00342 else if (twoBitIsSpec(fileName))
00343 {
00344 struct twoBitSpec *tbs = twoBitSpecNew(fileName);
00345 struct twoBitFile *tbf = twoBitOpen(tbs->fileName);
00346 if (isProt)
00347 errAbort("%s is a two bit file, which doesn't work for proteins.",
00348 fileName);
00349 if (tbs->seqs != NULL)
00350 {
00351 struct twoBitSeqSpec *ss = NULL;
00352 for (ss = tbs->seqs; ss != NULL; ss = ss->next)
00353 {
00354 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, ss->name,
00355 ss->start, ss->end);
00356 searchOneMaskTrim(seq, isProt, gf, outFile,
00357 maskHash, &totalSize, &count);
00358 dnaSeqFree(&seq);
00359 }
00360 }
00361 else
00362 {
00363 struct twoBitIndex *index = NULL;
00364 for (index = tbf->indexList; index != NULL; index = index->next)
00365 {
00366 struct dnaSeq *seq = twoBitReadSeqFrag(tbf, index->name, 0, 0);
00367 searchOneMaskTrim(seq, isProt, gf, outFile,
00368 maskHash, &totalSize, &count);
00369 dnaSeqFree(&seq);
00370 }
00371 }
00372 twoBitClose(&tbf);
00373 }
00374 else
00375 {
00376 static struct dnaSeq seq;
00377 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00378 while (faMixedSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
00379 {
00380 searchOneMaskTrim(&seq, isProt, gf, outFile,
00381 maskHash, &totalSize, &count);
00382 }
00383 lineFileClose(&lf);
00384 }
00385 }
00386 carefulClose(&outFile);
00387 if (showStatus)
00388 printf("Searched %lu bases in %d sequences\n", totalSize, count);
00389 }
00390
00391 struct trans3 *seqListToTrans3List(struct dnaSeq *seqList, aaSeq *transLists[3], struct hash **retHash)
00392
00393 {
00394 int frame;
00395 struct dnaSeq *seq;
00396 struct trans3 *t3List = NULL, *t3;
00397 struct hash *hash = newHash(0);
00398
00399 for (seq = seqList; seq != NULL; seq = seq->next)
00400 {
00401 t3 = trans3New(seq);
00402 hashAddUnique(hash, t3->name, t3);
00403 slAddHead(&t3List, t3);
00404 for (frame = 0; frame < 3; ++frame)
00405 {
00406 slAddHead(&transLists[frame], t3->trans[frame]);
00407 }
00408 }
00409 slReverse(&t3List);
00410 for (frame = 0; frame < 3; ++frame)
00411 {
00412 slReverse(&transLists[frame]);
00413 }
00414 *retHash = hash;
00415 return t3List;
00416 }
00417
00418 void tripleSearch(aaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash, boolean dbIsRc, FILE *f)
00419
00420 {
00421 gvo->reportTargetStrand = TRUE;
00422 gfFindAlignAaTrans(gfs, qSeq, t3Hash, dbIsRc, minScore, gvo);
00423 }
00424
00425 void transTripleSearch(struct dnaSeq *qSeq, struct genoFind *gfs[3], struct hash *t3Hash,
00426 boolean dbIsRc, boolean qIsDna, FILE *f)
00427
00428 {
00429 int qIsRc;
00430 gvo->reportTargetStrand = TRUE;
00431 for (qIsRc = 0; qIsRc <= qIsDna; qIsRc += 1)
00432 {
00433 gfLongTransTransInMem(qSeq, gfs, t3Hash, qIsRc, dbIsRc, !qIsDna, minScore, gvo);
00434 if (qIsDna)
00435 reverseComplement(qSeq->dna, qSeq->size);
00436 }
00437 }
00438
00439 void bigBlat(struct dnaSeq *untransList, int queryCount, char *queryFiles[], char *outFile, boolean transQuery, boolean qIsDna, FILE *out, boolean showStatus)
00440
00441 {
00442 int frame, i;
00443 struct dnaSeq *seq, trimmedSeq;
00444 struct genoFind *gfs[3];
00445 aaSeq *dbSeqLists[3];
00446 struct trans3 *t3List = NULL;
00447 int isRc;
00448 struct lineFile *lf = NULL;
00449 struct hash *t3Hash = NULL;
00450 boolean forceUpper = FALSE;
00451 boolean forceLower = FALSE;
00452 boolean toggle = FALSE;
00453 boolean maskUpper = FALSE;
00454
00455 ZeroVar(&trimmedSeq);
00456 if (showStatus)
00457 printf("Blatx %d sequences in database, %d files in query\n", slCount(untransList), queryCount);
00458
00459
00460
00461
00462 if (transQuery)
00463 {
00464 if (qMask == NULL)
00465 forceLower = TRUE;
00466 else
00467 {
00468 maskUpper = TRUE;
00469 toggle = !sameString(qMask, "upper");
00470 }
00471 }
00472 else
00473 {
00474 forceUpper = TRUE;
00475 }
00476
00477 if (gvo->fileHead != NULL)
00478 gvo->fileHead(gvo, out);
00479
00480 for (isRc = FALSE; isRc <= 1; ++isRc)
00481 {
00482
00483 for (frame = 0; frame < 3; ++frame)
00484 {
00485 gfs[frame] = NULL;
00486 dbSeqLists[frame] = NULL;
00487 }
00488
00489 t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
00490 for (frame = 0; frame < 3; ++frame)
00491 {
00492 gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize,
00493 repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
00494 }
00495
00496 for (i=0; i<queryCount; ++i)
00497 {
00498 aaSeq qSeq;
00499
00500 lf = lineFileOpen(queryFiles[i], TRUE);
00501 while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
00502 {
00503 dotOut();
00504
00505 if (forceLower)
00506 toLowerN(qSeq.dna, qSeq.size);
00507 else if (forceUpper)
00508 toUpperN(qSeq.dna, qSeq.size);
00509 else if (maskUpper)
00510 {
00511 if (toggle)
00512 toggleCase(qSeq.dna, qSeq.size);
00513 upperToN(qSeq.dna, qSeq.size);
00514 }
00515 if (qSeq.size > qWarnSize)
00516 {
00517 warn("Query sequence %s has size %d, it might take a while.",
00518 qSeq.name, qSeq.size);
00519 }
00520 trimSeq(&qSeq, &trimmedSeq);
00521 if (transQuery)
00522 transTripleSearch(&trimmedSeq, gfs, t3Hash, isRc, qIsDna, out);
00523 else
00524 tripleSearch(&trimmedSeq, gfs, t3Hash, isRc, out);
00525 gfOutputQuery(gvo, out);
00526 }
00527 lineFileClose(&lf);
00528 }
00529
00530
00531 trans3FreeList(&t3List);
00532 freeHash(&t3Hash);
00533 for (frame = 0; frame < 3; ++frame)
00534 {
00535 genoFindFree(&gfs[frame]);
00536 }
00537
00538 for (seq = untransList; seq != NULL; seq = seq->next)
00539 {
00540 reverseComplement(seq->dna, seq->size);
00541 }
00542 }
00543 carefulClose(&out);
00544 }
00545
00546
00547 void blat(char *dbFile, char *queryFile, char *outName)
00548
00549 {
00550 char **dbFiles, **queryFiles;
00551 int dbCount, queryCount;
00552 struct dnaSeq *dbSeqList, *seq;
00553 struct genoFind *gf;
00554 boolean tIsProt = (tType == gftProt);
00555 boolean qIsProt = (qType == gftProt);
00556 boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna));
00557 boolean bothSimpleProt = (tIsProt && qIsProt);
00558 FILE *f = mustOpen(outName, "w");
00559 boolean showStatus = (f != stdout);
00560
00561 databaseName = dbFile;
00562 gfClientFileArray(dbFile, &dbFiles, &dbCount);
00563 if (makeOoc != NULL)
00564 {
00565 gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType);
00566 if (showStatus)
00567 printf("Done making %s\n", makeOoc);
00568 exit(0);
00569 }
00570 gfClientFileArray(queryFile, &queryFiles, &queryCount);
00571 dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats,
00572 minRepDivergence, showStatus);
00573 databaseSeqCount = slCount(dbSeqList);
00574 for (seq = dbSeqList; seq != NULL; seq = seq->next)
00575 databaseLetters += seq->size;
00576
00577 gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead,
00578 databaseName, databaseSeqCount, databaseLetters, minIdentity, f);
00579
00580 if (bothSimpleNuc || bothSimpleProt)
00581 {
00582 struct hash *maskHash = NULL;
00583
00584
00585 if (repeats != NULL)
00586 {
00587 maskHash = newHash(0);
00588 for (seq = dbSeqList; seq != NULL; seq = seq->next)
00589 {
00590 Bits *maskedBits = maskFromUpperCaseSeq(seq);
00591 hashAdd(maskHash, seq->name, maskedBits);
00592 }
00593 }
00594
00595
00596
00597
00598
00599 if (mask == NULL && !bothSimpleProt)
00600 gfClientUnmask(dbSeqList);
00601 gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc,
00602 tIsProt, oneOff, FALSE, stepSize);
00603 if (mask != NULL)
00604 gfClientUnmask(dbSeqList);
00605
00606 searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus);
00607 freeHash(&maskHash);
00608 }
00609 else if (tType == gftDnaX && qType == gftProt)
00610 {
00611 bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus);
00612 }
00613 else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX))
00614 {
00615 bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus);
00616 }
00617 else
00618 {
00619 errAbort("Unrecognized combination of target and query types\n");
00620 }
00621 if (dotEvery > 0)
00622 printf("\n");
00623 freeDnaSeqList(&dbSeqList);
00624 }
00625
00626 int main(int argc, char *argv[])
00627
00628 {
00629 boolean dIsProtLike, qIsProtLike;
00630
00631 #ifdef DEBUG
00632 {
00633 char *cmd = "blat hCrea.geno hCrea.mrna foo.psl -t=dnax -q=rnax";
00634 char *words[16];
00635
00636 printf("Debugging parameters\n");
00637 cmd = cloneString(cmd);
00638 argc = chopLine(cmd, words);
00639 argv = words;
00640 }
00641 #endif
00642
00643 optionInit(&argc, argv, options);
00644 if (argc != 4)
00645 usage();
00646
00647
00648
00649 if (optionExists("prot"))
00650 qType = tType = gftProt;
00651 if (optionExists("t"))
00652 tType = gfTypeFromName(optionVal("t", NULL));
00653 trimA = optionExists("trimA") || optionExists("trima");
00654 trimT = optionExists("trimT") || optionExists("trimt");
00655 trimHardA = optionExists("trimHardA");
00656 switch (tType)
00657 {
00658 case gftProt:
00659 case gftDnaX:
00660 dIsProtLike = TRUE;
00661 break;
00662 case gftDna:
00663 dIsProtLike = FALSE;
00664 break;
00665 default:
00666 dIsProtLike = FALSE;
00667 errAbort("Illegal value for 't' parameter");
00668 break;
00669 }
00670 if (optionExists("q"))
00671 qType = gfTypeFromName(optionVal("q", NULL));
00672 if (qType == gftRnaX || qType == gftRna)
00673 trimA = TRUE;
00674 if (optionExists("noTrimA"))
00675 trimA = FALSE;
00676 switch (qType)
00677 {
00678 case gftProt:
00679 case gftDnaX:
00680 case gftRnaX:
00681 minIdentity = 25;
00682 qIsProtLike = TRUE;
00683 break;
00684 default:
00685 qIsProtLike = FALSE;
00686 break;
00687 }
00688 if ((dIsProtLike ^ qIsProtLike) != 0)
00689 errAbort("d and q must both be either protein or dna");
00690
00691
00692 if (dIsProtLike)
00693 {
00694 tileSize = 5;
00695 minMatch = 1;
00696 oneOff = FALSE;
00697 maxGap = 0;
00698 }
00699
00700
00701
00702 tileSize = optionInt("tileSize", tileSize);
00703 stepSize = optionInt("stepSize", stepSize);
00704 minMatch = optionInt("minMatch", minMatch);
00705 oneOff = optionExists("oneOff");
00706 fastMap = optionExists("fastMap");
00707 minScore = optionInt("minScore", minScore);
00708 maxGap = optionInt("maxGap", maxGap);
00709 minRepDivergence = optionFloat("minRepDivergence", minRepDivergence);
00710 minIdentity = optionFloat("minIdentity", minIdentity);
00711 gfCheckTileSize(tileSize, dIsProtLike);
00712 if (minMatch < 0)
00713 errAbort("minMatch must be at least 1");
00714 if (maxGap > 100)
00715 errAbort("maxGap must be less than 100");
00716
00717
00718
00719
00720
00721 if (optionExists("repMatch"))
00722 repMatch = optionInt("repMatch", repMatch);
00723 else
00724 {
00725 if (dIsProtLike)
00726 {
00727 if (tileSize == 3)
00728 repMatch = 600000;
00729 else if (tileSize == 4)
00730 repMatch = 30000;
00731 else if (tileSize == 5)
00732 repMatch = 1500;
00733 else if (tileSize == 6)
00734 repMatch = 75;
00735 else if (tileSize <= 7)
00736 repMatch = 10;
00737 }
00738 else
00739 {
00740 if (tileSize == 15)
00741 repMatch = 16;
00742 else if (tileSize == 14)
00743 repMatch = 32;
00744 else if (tileSize == 13)
00745 repMatch = 128;
00746 else if (tileSize == 12)
00747 repMatch = 256;
00748 else if (tileSize == 11)
00749 repMatch = 4*256;
00750 else if (tileSize == 10)
00751 repMatch = 16*256;
00752 else if (tileSize == 9)
00753 repMatch = 64*256;
00754 else if (tileSize == 8)
00755 repMatch = 256*256;
00756 else if (tileSize == 7)
00757 repMatch = 1024*256;
00758 else if (tileSize == 6)
00759 repMatch = 4*1024*256;
00760 }
00761 }
00762
00763
00764 noHead = optionExists("noHead");
00765 ooc = optionVal("ooc", NULL);
00766 makeOoc = optionVal("makeOoc", NULL);
00767 mask = optionVal("mask", NULL);
00768 qMask = optionVal("qMask", NULL);
00769 repeats = optionVal("repeats", NULL);
00770 if (repeats != NULL && mask != NULL && differentString(repeats, mask))
00771 errAbort("The -mask and -repeat settings disagree. "
00772 "You can just omit -repeat if -mask is on");
00773 if (mask != NULL)
00774 repeats = mask;
00775 outputFormat = optionVal("out", outputFormat);
00776 dotEvery = optionInt("dots", 0);
00777
00778 setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault));
00779 setFfExtendThroughN(optionExists("extendThroughN"));
00780
00781
00782
00783 blat(argv[1], argv[2], argv[3]);
00784 return 0;
00785 }