00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifndef INCLUDED_SSAHAMain
00032 #define INCLUDED_SSAHAMain
00033
00034 using namespace std;
00035
00036
00037
00038
00039
00040
00041 #include <string>
00042 #include <iostream>
00043 #include <memory>
00044 class SequenceReaderMulti;
00045 class SequenceReader;
00046 class HashTable;
00047
00048
00049
00050 struct QueryParameterStruct
00051 {
00052 string queryName;
00053 string queryType;
00054 string queryFormat;
00055 string subjectName;
00056 string subjectType;
00057 string subjectFormat;
00058 int queryStart;
00059 int queryEnd;
00060 int wordLength;
00061 int stepLength;
00062 int maxStore;
00063 int minPrint;
00064 int maxGap;
00065 int maxInsert;
00066 int numRepeats;
00067 int sortMatches;
00068 int doAlignment;
00069 int substituteThreshold;
00070 int bandExtension;
00071 string queryReplace;
00072 string subjectReplace;
00073
00074 string logMode;
00075 string saveName;
00076 bool parserFriendly;
00077 bool packHits;
00078 bool reverseQuery;
00079 bool runQuery;
00080 bool printHashStats;
00081 friend ostream& operator<<(ostream& os, const QueryParameterStruct& qps )
00082 {
00083 return os << "queryName:\t" << qps.queryName
00084 << "\nqueryType:\t" << qps.queryType
00085 << "\nqueryFormat:\t" << qps.queryFormat
00086 << "\nsubjectName:\t" << qps.subjectName
00087 << "\nsubjectType:\t" << qps.subjectType
00088 << "\nsubjectFormat:\t" << qps.subjectFormat
00089 << "\nqueryStart:\t" << qps.queryStart
00090 << "\nqueryEnd:\t" << qps.queryEnd
00091 << "\nwordLength:\t" << qps.wordLength
00092 << "\nstepLength:\t" << qps.stepLength
00093 << "\nmaxStore:\t" << qps.maxStore
00094 << "\nminPrint:\t" << qps.minPrint
00095 << "\nmaxGap: \t" << qps.maxGap
00096 << "\nmaxInsert:\t" << qps.maxInsert
00097 << "\nnumRepeats:\t" << qps.numRepeats
00098 << "\nsortMatches:\t" << qps.sortMatches
00099 << "\ndoAlignment:\t" << qps.doAlignment
00100 << "\nsubstituteThreshold:\t" << qps.substituteThreshold
00101 << "\nbandExtension:\t" << qps.bandExtension
00102
00103 << "\nqueryReplace:\t" << qps.queryReplace
00104 << "\nsubjectReplace:\t" << qps.subjectReplace
00105 << "\nlogMode:\t" << qps.logMode
00106 << "\nsaveName:\t" << qps.saveName
00107 << "\nparserFriendly:\t"
00108 << (( qps.parserFriendly ) ? (string)"true" : (string)"false")
00109 << "\npackHits:\t"
00110 << (( qps.packHits ) ? (string)"true" : (string)"false")
00111 << "\nreverseQuery:\t"
00112 << (( qps.reverseQuery ) ? (string)"true" : (string)"false")
00113 << "\nrunQuery:\t"
00114 << (( qps.runQuery ) ? (string)"true" : (string)"false")
00115 << "\nprintHashStats:\t"
00116 << (( qps.printHashStats ) ? (string)"true" : (string)"false")
00117 << "\n";
00118 }
00119
00120 };
00121
00122 QueryParameterStruct defaultParams =
00123 {
00124 "",
00125 "DNA",
00126 "",
00127 "",
00128 "DNA",
00129 "",
00130 1,
00131 -1,
00132 -1,
00133 -1,
00134 100000,
00135 1,
00136 -1,
00137 0,
00138 0,
00139 0,
00140 60,
00141 0,
00142 0,
00143
00144 "default",
00145 "tag",
00146 "cerr",
00147 "",
00148 false,
00149 false,
00150 true,
00151 true,
00152 false
00153 };
00154
00155
00156
00157
00158
00159
00160
00161 class CommandLineArg
00162 {
00163 public:
00164
00165
00166
00167
00168
00169
00170
00171 static void parseCommandLine
00172 ( int numArgs, char* args[], QueryParameterStruct& queryParams );
00173
00174
00175 CommandLineArg( const string& nameLong, const string& nameShort ) :
00176 nameLong_( nameLong ), nameShort_( nameShort ) {}
00177
00178 virtual bool isThisMe( const string& argName )
00179 {
00180 return ( ( argName == nameLong_ ) || ( argName == nameShort_ ) );
00181 }
00182 virtual void addValue( const string& value ) = 0;
00183 protected:
00184 string nameLong_;
00185 string nameShort_;
00186 };
00187
00188
00189
00190
00191 class CommandLineArgString : public CommandLineArg
00192 {
00193 public:
00194 CommandLineArgString
00195 ( const string& nameLong, const string& nameShort, string& destination ) :
00196 CommandLineArg( nameLong, nameShort ), destination_( destination ) {}
00197 void addValue( const string& value )
00198 {
00199 destination_ = value;
00200 }
00201 protected:
00202 string& destination_;
00203 };
00204
00205
00206
00207
00208 class CommandLineArgInt : public CommandLineArg
00209 {
00210 public:
00211 CommandLineArgInt
00212 ( const string& nameLong, const string& nameShort, int& destination ) :
00213 CommandLineArg( nameLong, nameShort ), destination_( destination ) {}
00214 void addValue( const string& value )
00215 {
00216 destination_ = atoi(value.c_str());
00217 }
00218 protected:
00219 int& destination_;
00220 };
00221
00222
00223
00224
00225 class CommandLineArgBool : public CommandLineArg
00226 {
00227 public:
00228 CommandLineArgBool
00229 ( const string& nameLong, const string& nameShort, bool& destination ) :
00230 CommandLineArg( nameLong, nameShort ), destination_( destination ) {}
00231
00232 virtual bool isThisMe( const string& argName )
00233 {
00234 if ( ( argName == nameLong_ ) || ( argName == nameShort_ ) )
00235 {
00236 destination_ = true;
00237 return true;
00238 }
00239 return false;
00240 }
00241
00242 void addValue( const string& value )
00243 {
00244 destination_ = atoi(value.c_str());
00245 }
00246 protected:
00247 bool& destination_;
00248 };
00249
00250
00251
00252
00253
00254
00255
00256
00257 class SequenceEncoder;
00258
00259 SequenceReader* findSequenceReader
00260 ( const string& fileName, const string& fileType, SequenceEncoder* encoder );
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274 void fillSequenceReaderMulti( SequenceReaderMulti& m,
00275 SequenceEncoder* encoder,
00276 const string& fileName,
00277 const string& dirName = "" );
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287 void processQuery( QueryParameterStruct& queryParams );
00288
00289 static const char* helpText[] ={
00290 "NAME\n",
00291 " ssaha - performs rapid searching of DNA and protein databases\n",
00292 "SYNOPSIS\n",
00293 " ssaha [-h] [-help]\n",
00294 " - print this help message.\n",
00295 " ssaha queryFile subjectFile [ -optionName_1 [optionValue_1] ] ...",
00296 " [ -optionName_n [optionValue_n] ] \n",
00297 " - create a hash table from the sequences in subjectFile and",
00298 " use it to search subjectFile for the sequences in queryFile.\n",
00299 " ssaha subjectFile [ -optionName_1 [optionValue_1] ] ...",
00300 " [ -optionName_n [optionValue_n] ] ...\n",
00301 " - just create a hash table from the sequences in subjectFile.",
00302 " The -saveName option must be set (see OPTIONS).\n",
00303 "DESCRIPTION\n",
00304 " ssaha is a tool for rapidly finding near exact matches in DNA or protein",
00305 " databases. The name is an acronym standing for Sequence Search and Alignment",
00306 " by Hashing Algorithm. It works by converting a sequence database into a",
00307 " hash table. This is then rapidly quizzed for hits, which are concatenated",
00308 " into matches.\n",
00309 "OPTIONS\n",
00310 " Options may be specified by either their full names or short names and may",
00311 " appear on the command line in any order.\n",
00312 " Full Name Short Description\n",
00313 "-queryFormat -qf Acceptable values:",
00314 " fasta - fasta file",
00315 " fastq - fastq file\n",
00316 " Default value:",
00317 " If not specified, attempts to deduce file type",
00318 " based on the filename suffix as follows:\n",
00319 " File suffix Deduced file type",
00320 " .fasta, .fa fasta file",
00321 " .fastq fastq file",
00322 " else assumed to be a directory of files, each of",
00323 " whose names indicates the file type as specified",
00324 " by the above rules.\n",
00325 "-subjectFormat -sf Acceptable values:",
00326 " fasta - fasta file",
00327 " fastq - fastq file",
00328 " hash - precomputed hash table\n",
00329 " Default value:",
00330 " If not specified, attempts to deduce file type",
00331 " based on the filename suffix as follows:\n",
00332 " File suffix Deduced file type",
00333 " .fasta, .fa fasta file",
00334 " .fastq fastq file",
00335 " else assumed to be a directory of files, each of",
00336 " whose names indicates the file type as specified",
00337 " by the above rules.\n",
00338 " Note:",
00339 " If -sf is set to hash, the -wl, -sl, and -ph",
00340 " options will, if present, be ignored.\n",
00341 "-queryType -qt Acceptable values:",
00342 " DNA",
00343 " protein\n",
00344 " Default value:",
00345 " DNA\n",
00346 "-subjectType -st Acceptable values:",
00347 " DNA (in which case queryType must also be DNA)",
00348 " protein",
00349 " codon (i.e. do 6 way DNA to protein translation)\n",
00350 " Default value:",
00351 " DNA\n",
00352 "-hashStats -hs Show information about the hash table currently",
00353 " in use.\n",
00354 "-parserFriendly -pf Show one match per line as a set of tab delimited",
00355 " (a.k.a. perlFriendly) fields:\n",
00356 " match direction: F forward, R reverse",
00357 " query name",
00358 " query start",
00359 " query end",
00360 " subject name",
00361 " subject start",
00362 " subject end",
00363 " number of matching bases",
00364 " percentage identity\n",
00365 "-logMode -lm Controls the output of log information",
00366 " Acceptable values:",
00367 " cerr - send to standard error",
00368 " cout - send to standard output",
00369 " null - suppress log output",
00370 " any other value sends log information to a file",
00371 " of the same name\n",
00372 " Default value:",
00373 " cerr\n",
00374 "-packHits -ph Store position of each word in a \"packed\"",
00375 " format comprising 32 bits per word. This halves",
00376 " the size of the .body file at the expense of a",
00377 " slight decrease in search speed.\n",
00378 "-wordLength -wl Size in base pairs of the words used to form",
00379 " the hash table. May vary from 1 to (assuming",
00380 " sufficient RAM is available) 16.",
00381 " Default value is 10.\n",
00382 "-maxGap -mg Maximum gap allowed between successive hits for",
00383 " them to count as part of the same match.",
00384 " Default value is 0.\n",
00385 "-maxInsert -mi Maximum number of insertions/deletions allowed",
00386 " between successive hits for them to count as part",
00387 " of the same match.",
00388 " Default value is 0.\n",
00389 "-maxStore -ms Largest number of times that a word may occur in",
00390 " the hash table for it to be used for matching",
00391 " expressed as a multiple of the number of",
00392 " occurrences per word that would be expected",
00393 " for a random database of the same size as the",
00394 " subject database.",
00395 " Default value is 10000.\n",
00396 "-numRepeats -nr Maximum size of tandem repeating motif that can be",
00397 " detected in the query sequence. This option may",
00398 " produce faster and better matches when dealing",
00399 " with data containing tandem repeats.",
00400 " Defaults to 0, and must be less than or equal to",
00401 " the word length.",
00402 " Notes:",
00403 " 1. This option does nothing if -ph is also set.",
00404 " 2. To get the best results with this option, set",
00405 " -mg to be at least equal to the word length.",
00406 " Setting the -mi option may also help.\n",
00407 "-minPrint -mp The minimum number of matching bases or residues",
00408 " that must be found in the query and subject",
00409 " sequences before they are considered as a match",
00410 " and thus printed.",
00411 " Default value is 1.\n",
00412 "-queryStart -qs Specifies the number of the first query sequence to",
00413 " be matched with the subject sequences (numbering of",
00414 " both the query and subject sequences starts at 1).",
00415 " Default value is 1.\n",
00416 "-queryEnd -qe Specifies the number of the last query sequence to",
00417 " be matched with the subject sequences. If not",
00418 " specified, continues until the end of the query",
00419 " sequence data is reached.\n",
00420 "-reportMode -rm Specifies behaviour upon encountering unexpected",
00421 " alphanumeric characters in query or subject ",
00422 " sequences:\n",
00423 " ignore - do nothing",
00424 " report - report to standard error",
00425 " replaceA - silently replace character with 'A'",
00426 " replaceG, replaceC, replaceT - as for replaceA",
00427 " rrepA - replace character with 'A' and report",
00428 " rrepG, rrepC, rrepT - as for rrepA",
00429 " Default value is `ignore.'\n",
00430 " NB FOR VERSION 3.0, THE -reportMode OPTION HAS BEEN",
00431 " SUPERCEDED BY THE -queryReplace AND -subjectReplace",
00432 " OPTIONS - SEE THE APPROPRIATE HELP ENTRIES\n",
00433 "-reverseQuery -rq When matching the reverse strand of a query,",
00434 " convert the positions of any matches found",
00435 " into the coordinate frame of the forward strand.",
00436 " Has no effect if queryType is set to protein.\n",
00437 "-saveName -sn Specifies that the hash table must be saved before",
00438 " the program exits. This option must be followed by",
00439 " a string fileNameRoot. The hash table data is",
00440 " saved into the files",
00441 " fileNameRoot.head",
00442 " fileNameRoot.body",
00443 " fileNameRoot.name",
00444 " fileNameRoot.size\n",
00445 " Notes:",
00446 " 1. If no query file is specified (usage (iii)",
00447 " above) it is an error not to set this option.",
00448 " 2. It is an error to set this option if",
00449 " subjectType is set to `hash.'",
00450 " 3. If the -ph option is also set, the -sn option",
00451 " also produces a fileNameRoot.start file.\n",
00452 "-sortMatches -sm Output only the top n matches for each query,",
00453 " sorted by number of matching bases, then by",
00454 " subject name, then by start position in the",
00455 " query sequence.",
00456 " Default value is zero, which outputs all matches",
00457 " for each query and does no sorting.\n",
00458 "-stepLength -sl Number of base pairs gap between words used to ",
00459 " produce hash table. Ignored if a precomputed ",
00460 " hash table is being used. Default value is ",
00461 " equal to wordLength.\n",
00462 "-queryReplace -qr Specifies behaviour upon encountering unexpected",
00463 " alphanumeric characters in query sequences:\n",
00464 " ignore - do nothing",
00465 " report - report to standard error",
00466 " A,G, etc. - replace with that character:",
00467 " must be A, G, C, T for DNA, or a valid IUPAC",
00468 " amino acid code for protein.",
00469 " Default: replace with 'A' for DNA, 'X' for protein\n",
00470 "-subjectReplace -sr Specifies behaviour upon encountering unexpected",
00471 " alphanumeric characters in subject sequences:\n",
00472 " ignore - do nothing",
00473 " report - report to standard error",
00474 " A,G, etc. - replace with that character",
00475 " Must be A, G, C, T for DNA or a valid IUPAC",
00476 " amino acid code for protein",
00477 " tag - `tag' the word so that it is not put",
00478 " into the hash table.",
00479 " Default: tag\n",
00480 "-substituteWords -sw Look for single base/amino mismatches in words",
00481 " that occur less than this many times more often",
00482 " than would be expected for a random database of",
00483 " the same size as the subject database.\n",
00484 " Only looks for:",
00485 " purine (G-A)/pyrimidine (T-C) mismatches for DNA",
00486 " mismatches with positive BLOSUM score for protein",
00487 " Set to zero to switch this feature off.",
00488 " Default value: 0 (switched off)\n",
00489 "-doAlignment -da Produce a graphical alignment of the matching region",
00490 " using banded dynamic programming. The alignment",
00491 " will be formatted to the specified number of columns.",
00492 " Set to zero to suppress alignments, otherwise",
00493 " must be at least 20.",
00494 " Default value: 80\n",
00495 "-bandExtension -be Specify size of the band to use for banded dynamic",
00496 " programming, when producing a graphical alignment.",
00497 " 0 - diagonal only",
00498 " n - n cells each side of diagonal",
00499 " Only has an effect when -be is nonzero",
00500 " Default value: 0 (diagonal only)",
00501 "OUTPUT FORMAT\n",
00502 "When full alignments are requested (-da set to nonzero) the software produces",
00503 "a line of information then a graphical alignment for each match found:\n",
00504 "i) DNA against DNA (untranslated)\n",
00505 "RF p1_1a788a06.q1c 515 538 p1_1a788f11.p1c 493 516 24",
00506 "100.00",
00507 "Alignment score: 13",
00508 "Q:000000515 tttt-tgagacggagtctcgctct",
00509 " ||||x||||||xx|||||||||||",
00510 "S:000000493 ttttttgagacaaagtctcgctct\n",
00511 "ii) Protein against protein\n",
00512 "FF SW:PPSA_AERPE 625 642 SW:PPSA_METTH 577 592 8 ",
00513 "44.44",
00514 "Alignment score: 31",
00515 "Q:000000625 KGGEKYETLDERNPMIGW",
00516 " x|||x |xx |x|||x||",
00517 "S:000000577 EGGEN-EPY-EHNPMLGW\n",
00518 "iii) Protein query against translated DNA subject ",
00519 "(format for translated DNA query against protein subject is similar)\n",
00520 "FR SW:PPS2_HUMAN 467 473 p1_1a788c10.q1c 507 529 7",
00521 "100.00",
00522 "Alignment score: 22",
00523 "Q:000000467 E..E..G..--V..L..D..P..",
00524 " ||||||||| ||||||Nxx|||",
00525 "S:000000507 gaggagggcatgtattaaaccca\n",
00526 "iv) DNA against DNA (translated)\n",
00527 "FF p1_1a788a01.p1c 52 76 p1_1a788b11.p1c 314 336 24",
00528 "96.00",
00529 "Alignment score: 3",
00530 "Q:000000052 atggtatgtctttcttttact-agat-",
00531 " ||xV..C..L..S..F.. R.. ",
00532 "S:000000314 attgtttgtctctccttc---taga-a\n",
00533 " From left to right the fields in the match information line are as follows:\n",
00534 " i) First character: query direction (F forward, R reverse)",
00535 " Second character: subject direction (F forward, R reverse)",
00536 " ii) query name",
00537 " iii) query start",
00538 " iv) query end",
00539 " v) subject name",
00540 " vi) subject start",
00541 " vii) subject end",
00542 " viii) estimated number of matching bases",
00543 " ix) estimated percentage identity\n",
00544 " The last two quantities are not exact values, they are approximations used",
00545 " to order the matches (if requested) before the full alignment is done.\n",
00546 " With the alignments switched off (-da 0), an entry like the one below is",
00547 " produced for each sequence in the query.\n",
00548 " Matches For Query 6 (653 bases): p1_1a788a03.q1c\n",
00549 " F 6 : p1_1a788a03.q1c Bases: 650 Q: 1 to 650 S: 1 to 650 100.00%",
00550 " R 5 : p1_1a788a03.p1c Bases: 100 Q: 22 to 121 S: 501 to 600 100.00%\n",
00551 " The top line shows the query number, name and size of the query sequence.",
00552 " Below that is one line for each match found in the subject database. From",
00553 " left to right, the entries on these lines are as follows:\n",
00554 " match direction: F forward, R reverse",
00555 " subject number",
00556 " subject name",
00557 " number of matching bases",
00558 " query start",
00559 " query end",
00560 " subject start",
00561 " subject end",
00562 " percentage identity\n",
00563 "Notes:\n",
00564 " 1. The output format is different if the program is run with the",
00565 " -parserFriendly option set. See the description of that option for details.\n",
00566 " 2. Because SSAHA works by looking for whole-word matches, the `number of",
00567 " matching bases' and `percentage identity' fields must be considered as lower",
00568 " bounds on the true values of these quantities.\n",
00569 "FURTHER INFORMATION\n",
00570 " The SSAHA home page is at http://www.sanger.ac.uk/Software/analysis/SSAHA/\n",
00571 " Zemin Ning, Anthony. J. Cox and James C. Mullikin. SSAHA: A Fast Search",
00572 " Method for Large DNA Databases. Submitted to Genome Research.\n",
00573 "@"
00574 };
00575
00576
00577
00578
00579
00580
00581 #endif
00582
00583
00584
00585
00586
00587
00588
00589