00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "SSAHAMain.h"
00034 #include "SequenceEncoder.h"
00035 #include "SequenceReaderMulti.h"
00036 #include "SequenceReaderFasta.h"
00037 #include "SequenceReaderFastq.h"
00038
00039 #include "SequenceReaderFilter.h"
00040 #include "HashTable.h"
00041 #include "HashTablePacked.h"
00042 #include "HashTableTranslated.h"
00043 #include "MatchStore.h"
00044 #include "MatchStoreUngapped.h"
00045 #include "MatchStoreGapped.h"
00046 #include "MatchAligner.h"
00047 #include "TimeStamp.h"
00048 #include <dirent.h>
00049 #include <fstream>
00050 #include <string>
00051 #include <assert.h>
00052 #include <exception>
00053 #include <cmath>
00054
00055 MachineInfo* MachineInfo::info_ = 0;
00056 NullBuffer nullBuffer;
00057 ofstream* pLogStream = NULL;
00058
00059 int defaultWordLengthDNA = 12;
00060 int defaultWordLengthProtein = 4;
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070 typedef MatchTaskSort
00071 <SortByMatch>
00072 SortByMatchType;
00073
00074
00075
00076
00077
00078 int main( int numArgs, char* args[] )
00079 {
00080
00081 try
00082 {
00083
00084 cerr <<
00085 "Welcome to SSAHA: Sequence Search and Alignment by Hashing Algorithm\n\
00086 Copyright (C) 2004 by Genome Research Limited\n\
00087 This software is released under the terms of version 2 of the GNU General\n\
00088 Public Licence, as published by the Free Software Foundation.\n\
00089 This is SSAHA Version 3.2, released 1st March 2004.\n\n";
00090
00091 Timer timeStamp;
00092
00093 string thisArg;
00094 if ( numArgs < 3 )
00095 {
00096
00097
00098 if ( numArgs == 2 )
00099 {
00100 thisArg = (string) args[1];
00101 if ( ( thisArg != "-h" ) && ( thisArg != "-help" ) )
00102 {
00103 cerr << "Error: invalid argument"
00104 << " (" << args[1] << ")"
00105 << " - use \'" << args[0] << " -help\' for help.\n";
00106 throw SSAHAException("Invalid command line argument.");
00107 }
00108 }
00109
00110
00111 int i(0);
00112 do
00113 {
00114 cout << helpText[i] << "\n";
00115 } while (helpText[++i][0]!='@');
00116
00117 return(0);
00118
00119
00120
00121 }
00122
00123 QueryParameterStruct queryParams( defaultParams );
00124
00125 CommandLineArg::parseCommandLine( numArgs, args, queryParams );
00126
00127
00128
00129 processQuery( queryParams );
00130
00131
00132
00133 if (pLogStream != NULL) delete pLogStream;
00134
00135 return (0);
00136
00137 }
00138 catch (const SSAHAException& err )
00139 {
00140 cerr << "Caught SSAHA exception: " << err.what() << "\n";
00141 exit (1);
00142 }
00143 catch (const std::exception& err )
00144 {
00145 cerr << "Caught exception: " << err.what() << "\n";
00146 exit (1);
00147 }
00148
00149 }
00150
00151
00152
00153
00154
00155
00156
00157 void CommandLineArg::parseCommandLine
00158 ( int numArgs, char* args[], QueryParameterStruct& queryParams )
00159 {
00160
00161 int firstArg;
00162
00163
00164 if ( (args[2])[0] == '-' )
00165 {
00166
00167
00168 firstArg = 2;
00169 queryParams.subjectName = (string) args[1];
00170 queryParams.runQuery = false;
00171 }
00172 else
00173 {
00174
00175 firstArg = 3;
00176 queryParams.queryName = (string) args[1];
00177 queryParams.subjectName = (string) args[2];
00178 }
00179
00180
00181 vector<CommandLineArg*> validArgs;
00182 validArgs.push_back
00183 ( new CommandLineArgInt( "-wordLength", "-wl", queryParams.wordLength ) );
00184 validArgs.push_back
00185 ( new CommandLineArgInt( "-stepLength", "-sl", queryParams.stepLength ) );
00186 validArgs.push_back
00187 ( new CommandLineArgInt( "-maxStore", "-ms", queryParams.maxStore ) );
00188 validArgs.push_back
00189 ( new CommandLineArgInt( "-minPrint", "-mp", queryParams.minPrint ) );
00190 validArgs.push_back
00191 ( new CommandLineArgInt( "-maxGap", "-mg", queryParams.maxGap ) );
00192 validArgs.push_back
00193 ( new CommandLineArgInt( "-maxInsert", "-mi", queryParams.maxInsert ) );
00194 validArgs.push_back
00195 ( new CommandLineArgInt( "-numRepeats", "-nr", queryParams.numRepeats ) );
00196 validArgs.push_back
00197 ( new CommandLineArgInt( "-substituteWords", "-sw",
00198 queryParams.substituteThreshold ) );
00199 validArgs.push_back
00200 ( new CommandLineArgInt( "-bandExtension", "-be",
00201 queryParams.bandExtension ) );
00202 validArgs.push_back
00203 ( new CommandLineArgInt( "-sortMatches", "-sm", queryParams.sortMatches ) );
00204 validArgs.push_back
00205 ( new CommandLineArgInt
00206 ( "-doAlignment", "-da", queryParams.doAlignment ));
00207 validArgs.push_back
00208 ( new CommandLineArgInt( "-queryStart", "-qs", queryParams.queryStart ) );
00209 validArgs.push_back
00210 ( new CommandLineArgInt( "-queryEnd", "-qe", queryParams.queryEnd ) );
00211 validArgs.push_back
00212 ( new CommandLineArgBool( "-hashStats", "-hs", queryParams.printHashStats ));
00213 validArgs.push_back
00214 ( new CommandLineArgBool( "-packHits", "-ph", queryParams.packHits ));
00215 validArgs.push_back
00216 ( new CommandLineArgBool
00217 ( "-reverseQuery", "-rq", queryParams.reverseQuery ));
00218 validArgs.push_back
00219 ( new CommandLineArgBool
00220 ( "-parserFriendly", "-pf", queryParams.parserFriendly ) );
00221 validArgs.push_back
00222 ( new CommandLineArgString( "-queryType", "-qt", queryParams.queryType ) );
00223 validArgs.push_back
00224 ( new CommandLineArgString( "-subjectType", "-st", queryParams.subjectType));
00225 validArgs.push_back
00226 ( new CommandLineArgString( "-queryFormat", "-qf", queryParams.queryFormat));
00227 validArgs.push_back
00228 ( new CommandLineArgString
00229 ( "-subjectFormat", "-sf", queryParams.subjectFormat) );
00230
00231
00232 validArgs.push_back
00233 ( new CommandLineArgString( "-queryReplace", "-qr", queryParams.queryReplace ) );
00234 validArgs.push_back
00235 ( new CommandLineArgString( "-subjectReplace", "-sr", queryParams.subjectReplace ) );
00236 validArgs.push_back
00237 ( new CommandLineArgString( "-logMode", "-lm", queryParams.logMode ) );
00238 validArgs.push_back
00239 ( new CommandLineArgString( "-saveName", "-sn", queryParams.saveName ) );
00240
00241 string thisArg;
00242 vector<CommandLineArg*>::iterator pArg = validArgs.end();
00243 for ( int i(firstArg); i < numArgs ; i++ )
00244 {
00245 thisArg = (string) args[i];
00246 if (pArg != validArgs.end())
00247
00248
00249 {
00250 (*pArg)->addValue( thisArg );
00251 pArg = validArgs.end();
00252 }
00253 else
00254
00255
00256 {
00257 for (pArg = validArgs.begin() ; pArg != validArgs.end(); ++pArg )
00258 {
00259 if ( (*pArg)->isThisMe(thisArg) ) break;
00260 }
00261 if ( pArg == validArgs.end() )
00262
00263 {
00264 cerr << "Error: command line argument " << thisArg << " not found.\n";
00265 exit(1);
00266 }
00267
00268
00269
00270 if ( dynamic_cast<CommandLineArgBool*>(*pArg)) pArg=validArgs.end();
00271 }
00272 }
00273
00274
00275 for ( vector<CommandLineArg*>::iterator i = validArgs.begin();
00276 i != validArgs.end(); ++i ) delete *i;
00277
00278
00279 }
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289 void processQuery( QueryParameterStruct& queryParams )
00290 {
00291
00292
00293 if ( queryParams.logMode == "cout" )
00294 {
00295 cerr.rdbuf(cout.rdbuf());
00296 }
00297 else if ( queryParams.logMode == "null" )
00298 {
00299 cerr.rdbuf(&nullBuffer);
00300 }
00301 else if ( queryParams.logMode != "cerr" )
00302 {
00303 pLogStream = new ofstream(queryParams.logMode.c_str(), ios::ate|ios::app);
00304 if (pLogStream->fail())
00305 {
00306 cerr << "could not open log stream file " << queryParams.logMode
00307 << ".\n";
00308 throw SSAHAException("could not open log file.\n");
00309 }
00310 cerr.rdbuf(pLogStream->rdbuf());
00311 }
00312
00313 cerr << "Run started: " << getTimeNow()
00314 << "Input query parameters:\n" << queryParams;
00315
00316
00317 MachineInfo* info = MachineInfo::getInfo();
00318 cerr << *info;
00319
00320
00321
00322 if ( queryParams.minPrint <= 0 )
00323 {
00324 cerr << "Error: value for minPrint (" << queryParams.minPrint
00325 << ") must be greater than zero.\n";
00326 throw SSAHAException("Invalid value for minPrint");
00327 }
00328
00329 if ( queryParams.maxStore < 0 )
00330 {
00331 cerr << "Error: value for maxStore (" << queryParams.maxStore
00332 << ") must be greater than zero.\n";
00333 throw SSAHAException("Invalid value for maxStore");
00334 }
00335 if ( queryParams.substituteThreshold < 0 )
00336 {
00337 cerr << "Error: value for substituteThreshold ("
00338 << queryParams.substituteThreshold
00339 << ") must be greater than zero.\n";
00340 throw SSAHAException("Invalid value for substituteThreshold");
00341 }
00342
00343
00344 if ( queryParams.doAlignment != 0 )
00345 {
00346 if( queryParams.doAlignment < 20 )
00347 {
00348 cerr << "Error: value for doAlignment (" << queryParams.doAlignment
00349 << ") must be at least 20.\n";
00350 throw SSAHAException("Invalid value for doAlignment");
00351 }
00352 #ifdef XXX
00353 if ( ( queryParams.reportMode != "replaceA" )
00354 && ( queryParams.reportMode != "tag" ) )
00355 {
00356 cerr << "Info: graphical alignments can't be produced with reportMode\n"
00357 << "set to \"" << queryParams.reportMode
00358 << "\" , setting reportmode to \"tag\" and proceeding.\n";
00359 queryParams.reportMode="tag";
00360 }
00361 #endif
00362 if ( ( queryParams.queryReplace == "ignore" )
00363 || ( queryParams.queryReplace == "report" ) )
00364 {
00365 cerr << "Info: graphical alignments can't be produced with queryReplace\n"
00366 << "set to \"" << queryParams.queryReplace
00367 << "\" , resetting to default and proceeding.\n";
00368 queryParams.queryReplace="default";
00369 }
00370
00371 if ( ( queryParams.subjectReplace == "ignore" )
00372 || ( queryParams.subjectReplace == "report" ) )
00373 {
00374 cerr << "Info: graphical alignments can't be produced with subjectReplace\n"
00375 << "set to \"" << queryParams.subjectReplace
00376 << "\" , setting to \"tag\" and proceeding.\n";
00377 queryParams.subjectReplace="tag";
00378 }
00379
00380
00381 }
00382
00383
00384 if ( ( queryParams.stepLength <= 0 )
00385 || ( queryParams.stepLength > queryParams.wordLength ) )
00386 {
00387 cerr << "Info: setting stepLength to " << queryParams.wordLength << ".\n";
00388 queryParams.stepLength = queryParams.wordLength;
00389 }
00390
00391
00392 if (queryParams.queryStart <= 0 )
00393 {
00394 cerr << "Error: first requested sequence ("
00395 << queryParams.queryStart
00396 << ") must be greater than or equal to 1.\n";
00397 throw SSAHAException("Invalid query start value!");
00398 }
00399
00400 if ( ( queryParams.queryEnd != - 1 )
00401 && ( queryParams.queryEnd < queryParams.queryStart ) )
00402 {
00403 cerr << "Error: last requested sequence ("
00404 << queryParams.queryEnd
00405 << ") should be greater than or equal to first ("
00406 << queryParams.queryStart << ").\n";
00407 throw SSAHAException("Invalid query start and end values!");
00408 }
00409
00410
00411
00412 #ifdef XXXX
00413 SequenceReaderMode* pMode;
00414
00415 if ( queryParams.reportMode == "tag" )
00416 {
00417 pMode = new SequenceReaderModeFlagReplace('A');
00418 }
00419 else if ( queryParams.reportMode == "ignore" )
00420 {
00421 pMode = new SequenceReaderModeIgnore();
00422 }
00423 else if ( queryParams.reportMode == "report" )
00424 {
00425 pMode = new SequenceReaderModeReport();
00426 }
00427 else if ( queryParams.reportMode == "replaceA" )
00428 {
00429 pMode = new SequenceReaderModeReplace('A');
00430 }
00431 else if ( queryParams.reportMode == "replaceG" )
00432 {
00433 pMode = new SequenceReaderModeReplace('G');
00434 }
00435 else if ( queryParams.reportMode == "replaceC" )
00436 {
00437 pMode = new SequenceReaderModeReplace('C');
00438 }
00439 else if ( queryParams.reportMode == "replaceT" )
00440 {
00441 pMode = new SequenceReaderModeReplace('T');
00442 }
00443 else if ( queryParams.reportMode == "rrepA" )
00444 {
00445 pMode = new SequenceReaderModeReportReplace('A');
00446 }
00447 else if ( queryParams.reportMode == "rrepG" )
00448 {
00449 pMode = new SequenceReaderModeReportReplace('G');
00450 }
00451 else if ( queryParams.reportMode == "rrepC" )
00452 {
00453 pMode = new SequenceReaderModeReportReplace('C');
00454 }
00455 else if ( queryParams.reportMode == "rrepT" )
00456 {
00457 pMode = new SequenceReaderModeReportReplace('T');
00458 }
00459 else
00460 {
00461 cerr << "Error: value for reportMode (" << queryParams.reportMode
00462 << ") not recognised.\n";
00463 throw SSAHAException("Invalid value for reportMode");
00464 }
00465 #endif
00466
00467
00468 if ( ( queryParams.subjectFormat == "hash" )
00469 && ( queryParams.saveName != "" ) )
00470 {
00471 cerr << "Error: trying to create hash table from hash table!\n";
00472 throw SSAHAException("Invalid value for subject format");
00473 }
00474
00475 if ( queryParams.runQuery == false )
00476 {
00477 cerr <<
00478 "Info: no query name given, assuming hash table generation only \
00479 is required.\n";
00480 if ( ( queryParams.saveName == "" )
00481 && ( queryParams.printHashStats==false ) )
00482 {
00483 cerr << "Error: no saveName for hash table has been given.\n";
00484 throw SSAHAException("No saveName for hash table");
00485 }
00486 }
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521 HashTableFactory creator(cerr);
00522
00523 HashTableGeneric* pHashTable(NULL);
00524 SourceReader* pSubjectSource(NULL);
00525
00526 if ( queryParams.subjectFormat == "hash" )
00527 {
00528 SourceReaderIndex* pIndexReader;
00529
00530
00531 {
00532 ifstream filesFile( (queryParams.subjectName+(string)".files").c_str() );
00533 ifstream indexFile( (queryParams.subjectName+(string)".index").c_str() );
00534 if ((!filesFile.fail())&&(!indexFile.fail()))
00535 {
00536 pIndexReader = new SourceReaderIndex(queryParams.subjectName);
00537 cerr << "Info: found a sequence source index for this hash table.\n";
00538 }
00539 else
00540 {
00541 pIndexReader = NULL;
00542 cerr <<
00543 "Info: did not find a sequence source index for this hash table.\n";
00544
00545 }
00546 filesFile.close();
00547 indexFile.close();
00548 }
00549
00550 pHashTable = creator.loadHashTable
00551 ( queryParams.subjectName, pIndexReader );
00552 pHashTable->setMaxNumHits( queryParams.maxStore );
00553
00554 pSubjectSource=pIndexReader;
00555
00556
00557
00558
00559 if ( pHashTable->getBitsPerSymbol() == gResidueBits )
00560 {
00561 if ( pHashTable->getSourceDataType() == gProteinData )
00562 {
00563 cerr << "Hash table contains protein data." << endl;
00564 queryParams.subjectType = "protein";
00565 }
00566 else if ( pHashTable->getSourceDataType() == gDNAData )
00567 {
00568 cerr << "Hash table contains translated DNA data." << endl;
00569 queryParams.subjectType = "codon";
00570 }
00571 else
00572 {
00573 cerr << "Warning: could not determine type of hash table, "
00574 << "attempting to continue..." << endl;
00575 }
00576 }
00577 else if ( ( pHashTable->getBitsPerSymbol() == gBaseBits )
00578 && ( pHashTable->getSourceDataType() == gDNAData ) )
00579 {
00580 cerr << "Hash table contains untranslated DNA data." << endl;
00581 queryParams.subjectType = "DNA";
00582 }
00583 else
00584 {
00585 cerr << "Warning: could not determine type of hash table, "
00586 << "attempting to continue..." << endl;
00587 }
00588
00589
00590
00591
00592 if ( pHashTable->getWordLength() != queryParams.wordLength )
00593 {
00594 cerr << "Info: hash table was created using hash word length of "
00595 << pHashTable->getWordLength()
00596 <<" bases,\nproceeding using this value.\n";
00597 queryParams.wordLength = pHashTable->getWordLength();
00598 }
00599
00600 if ( pHashTable->getStepLength() != queryParams.stepLength )
00601 {
00602 cerr << "Info: hash table was created using interval of "
00603 << pHashTable->getStepLength()
00604 <<" between bases,\nproceeding using this value.\n";
00605 queryParams.stepLength = pHashTable->getStepLength();
00606 }
00607
00608
00609 }
00610 else
00611 {
00612
00613
00614 if ( ( queryParams.subjectType == "DNA" )
00615 && ( queryParams.queryType == "protein" ) )
00616 {
00617 cerr << "Warning: can't run protein query against DNA database, setting "
00618 << "subject database type to \"codon\" instead." << endl;
00619 queryParams.subjectType = "codon";
00620 }
00621
00622 if ( queryParams.subjectType == "DNA" )
00623 {
00624 pHashTable = new HashTablePacked( cerr, queryParams.saveName );
00625 if ( ( queryParams.wordLength <= 0 )
00626 || ( queryParams.wordLength*gBaseBits > ( 8*sizeof(Word)) -1 ) )
00627 {
00628 cerr << "Warning: word length (" << queryParams.wordLength
00629 << ") outside valid range (0 to "
00630 << (8*sizeof(Word)-1)/gBaseBits << "),\n"
00631 << "using default word length of "
00632 << defaultWordLengthDNA << " instead.\n";
00633 queryParams.wordLength = defaultWordLengthDNA;
00634
00635
00636 }
00637
00638 }
00639 else
00640 {
00641 if ( queryParams.subjectType == "protein" )
00642 {
00643
00644 pHashTable = new HashTablePackedProtein( cerr, queryParams.saveName );
00645
00646 }
00647 else if ( queryParams.subjectType == "codon" )
00648 {
00649 pHashTable = new HashTableTranslated( cerr, queryParams.saveName );
00650 }
00651 else
00652 {
00653 cerr << "Error: value for subjectType (" << queryParams.subjectType
00654 << ") not recognized.\n";
00655 throw SSAHAException("Invalid value for subjectType.\n");
00656 }
00657
00658 if ( ( queryParams.wordLength <= 0 )
00659 || ( queryParams.wordLength*gResidueBits > ( 8*sizeof(Word)) -1 ) )
00660 {
00661 cerr << "Warning: word length (" << queryParams.wordLength
00662 << ") outside valid range (0 to "
00663 << (8*sizeof(Word)-1)/gResidueBits << "),\n"
00664 << "using default word length of "
00665 << defaultWordLengthProtein << " instead.\n";
00666 queryParams.wordLength = defaultWordLengthProtein;
00667
00668 }
00669 }
00670
00671
00672 if ( ( queryParams.stepLength <= 0 )
00673 || ( queryParams.stepLength > queryParams.wordLength ) )
00674 {
00675 cerr << "Info: setting stepLength to " << queryParams.wordLength << ".\n";
00676 queryParams.stepLength = queryParams.wordLength;
00677 }
00678
00679
00680 SequenceReaderMode* pSubjectMode(NULL);
00681
00682 if ( queryParams.subjectReplace == "tag" )
00683 {
00684 pSubjectMode = new SequenceReaderModeFlagReplace('A');
00685 }
00686 else if ( queryParams.subjectReplace == "ignore" )
00687 {
00688 pSubjectMode = new SequenceReaderModeIgnore();
00689 }
00690 else if ( queryParams.subjectReplace == "report" )
00691 {
00692 pSubjectMode = new SequenceReaderModeReport();
00693 }
00694 else if ( queryParams.subjectReplace.size()==1)
00695 {
00696 unsigned char replaceChar(*queryParams.subjectReplace.c_str());
00697 if ((queryParams.subjectType=="protein")&&(ttProtein[replaceChar]==nv))
00698 {
00699 cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace
00700 << ") must be a valid IUPAC amino acid code.\n";
00701 throw SSAHAException("Invalid value for subjectReplace");
00702 }
00703 else if (ttDNA[replaceChar]==nv)
00704 {
00705 cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace
00706 << ") must be A,G,C or T.\n";
00707 throw SSAHAException("Invalid value for subjectReplace");
00708 }
00709 pSubjectMode = new SequenceReaderModeReplace(replaceChar);
00710 }
00711 else
00712 {
00713 cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace
00714 << ") not recognised.\n";
00715 throw SSAHAException("Invalid value for subjectReplace");
00716 }
00717
00718
00719 SequenceEncoder* pSubjectEncoder;
00720
00721 if ( queryParams.subjectType == "protein" )
00722 pSubjectEncoder = new SequenceEncoderProtein(5,cerr);
00723 else
00724 pSubjectEncoder = new SequenceEncoderDNA(12,cerr);
00725
00726 SequenceReader* pSubject =
00727 findSequenceReader
00728 ( queryParams.subjectName, queryParams.subjectFormat, pSubjectEncoder );
00729 assert (pSubject !=NULL);
00730
00731 pSubject->changeMode( pSubjectMode );
00732 delete pSubjectMode;
00733
00734 creator.createHashTable
00735 ( *pHashTable,
00736 *pSubject,
00737 queryParams.wordLength,
00738 queryParams.maxStore,
00739 queryParams.stepLength );
00740
00741
00742 if ( ( queryParams.saveName != "" )
00743 && ( queryParams.subjectFormat != "hash" ) )
00744 {
00745
00746 creator.saveHashTable( *pHashTable );
00747 pSubject->saveIndex( queryParams.saveName );
00748 }
00749
00750 if (queryParams.doAlignment != 0 )
00751 {
00752 pSubjectSource = (SourceReader*) pSubject;
00753 }
00754 else
00755 {
00756 delete pSubjectEncoder;
00757 delete pSubject;
00758 }
00759 }
00760
00761 assert (pHashTable!=NULL);
00762 assert (pHashTable->isInitialized());
00763
00764 double expectedNumHits = pHashTable->getTotalNumWords();
00765
00766 if ( (pHashTable->getHitListFormat()==gTranslated)
00767 || (pHashTable->getHitListFormat()==g32BitPackedProtein) )
00768 {
00769 expectedNumHits /=
00770 pow((double)gNumCodonEncodings,(int)pHashTable->getWordLength());
00771 }
00772 else
00773 {
00774 assert(pHashTable->getBitsPerSymbol()==gBaseBits);
00775 expectedNumHits /= 1<<(2*pHashTable->getWordLength());
00776 }
00777
00778
00779
00780
00781
00782 cerr << "Info: would expect " << expectedNumHits
00783 << " hits per word for a random database of this size." << endl;
00784
00785 queryParams.maxStore=1+(int)(expectedNumHits*queryParams.maxStore);
00786
00787 cerr << "Info: will ignore hits on words that occur more than "
00788 << queryParams.maxStore << " times in the database." << endl;
00789
00790 pHashTable->setMaxNumHits
00791 (
00792 queryParams.maxStore
00793 );
00794
00795
00796 if ( queryParams.printHashStats == true )
00797 {
00798 pHashTable->printHashStats();
00799 }
00800
00801 if ( queryParams.runQuery == false )
00802 {
00803 cerr << "Info: no query name given, assuming no query required.\n";
00804 delete pHashTable;
00805 return;
00806 }
00807
00808
00809
00810
00811
00812 if ( queryParams.numRepeats < 0 )
00813 {
00814 cerr << "Error: max size of repeated motif to be masked ("
00815 << queryParams.numRepeats
00816 << ") outside valid range (0 to "
00817 << queryParams.wordLength << ").\n";
00818 throw SSAHAException("Invalid value for numRepeats!");
00819 }
00820 else if ( queryParams.numRepeats > queryParams.wordLength )
00821 {
00822 cerr << "Warning: max size of repeated motif to be masked ("
00823 << queryParams.numRepeats
00824 << ") outside valid range (0 to "
00825 << queryParams.wordLength << "),\nusing "
00826 << queryParams.wordLength << " instead.\n";
00827 queryParams.numRepeats = queryParams.wordLength;
00828 }
00829
00830 if ( ( queryParams.queryType != "DNA" )
00831 && ( queryParams.queryType != "protein" )
00832 && ( queryParams.queryType != "codon" ) )
00833 {
00834 cerr << "Error: value for queryType (" << queryParams.queryType
00835 << ") not recognized.\n";
00836 throw SSAHAException("Invalid value for queryType.\n");
00837 }
00838
00839 if (queryParams.substituteThreshold > 0 )
00840 {
00841 pHashTable->setSubstituteThreshold( queryParams.substituteThreshold );
00842 }
00843
00844 SequenceEncoder* pQueryEncoder;
00845
00846 if ( queryParams.queryType == "protein" )
00847 {
00848 if ( (dynamic_cast<HashTablePackedProtein*>(pHashTable)==NULL)
00849 && (dynamic_cast<HashTableTranslated*>(pHashTable)==NULL) )
00850 {
00851 cerr << "Error: protein query can't run against DNA database!\n";
00852 throw SSAHAException("Data type mismatch between query and subject");
00853 }
00854 pQueryEncoder = new SequenceEncoderProtein(5);
00855 if (queryParams.queryReplace == "default") queryParams.queryReplace="X";
00856 }
00857 else
00858 {
00859 pQueryEncoder = new SequenceEncoderDNA(12);
00860 if (queryParams.queryReplace == "default") queryParams.queryReplace="A";
00861 }
00862
00863 assert (pQueryEncoder != NULL);
00864
00865
00866 SequenceReaderMode* pQueryMode(NULL);
00867
00868 if ( queryParams.queryReplace == "tag" )
00869 {
00870 pQueryMode = new SequenceReaderModeFlagReplace('A');
00871 }
00872 else if ( queryParams.queryReplace == "ignore" )
00873 {
00874 pQueryMode = new SequenceReaderModeIgnore();
00875 }
00876 else if ( queryParams.queryReplace == "report" )
00877 {
00878 pQueryMode = new SequenceReaderModeReport();
00879 }
00880 else if ( queryParams.queryReplace.size()==1)
00881 {
00882 unsigned char replaceChar(*queryParams.queryReplace.c_str());
00883 if (queryParams.queryType=="protein")
00884 {
00885 if (ttProtein[replaceChar]==nv)
00886 {
00887 cerr << "Error: value for queryReplace (" << queryParams.queryReplace
00888 << ") must be a valid IUPAC amino acid code.\n";
00889 throw SSAHAException("Invalid value for queryReplace");
00890 }
00891 }
00892 else if (ttDNA[replaceChar]==nv)
00893 {
00894 cerr << "Error: value for queryReplace (" << queryParams.queryReplace
00895 << ") must be A,G,C or T.\n";
00896 throw SSAHAException("Invalid value for queryReplace");
00897 }
00898 pQueryMode = new SequenceReaderModeReplace(replaceChar);
00899 }
00900 else
00901 {
00902 cerr << "Error: value for queryReplace (" << queryParams.queryReplace
00903 << ") not recognised.\n";
00904 throw SSAHAException("Invalid value for queryReplace");
00905 }
00906
00907
00908
00909
00910 SequenceReader* pQuery =
00911 findSequenceReader
00912 ( queryParams.queryName, queryParams.queryFormat, pQueryEncoder );
00913 assert(pQuery!=NULL);
00914
00915 pQuery->changeMode( pQueryMode );
00916 delete pQueryMode;
00917
00918
00919 QueryManager queryManager( *pQuery,*pHashTable,cerr);
00920
00921
00922
00923 MatchTask* pTask(NULL);
00924 MatchTask* pPrintTask(NULL);
00925
00926
00927 SortByMatchType sorter(queryParams.sortMatches,0.25);
00928
00929 if (queryParams.doAlignment != 0 )
00930 {
00931 MatchAligner* pAligner;
00932
00933 if (queryParams.queryType == "DNA")
00934 {
00935 if (queryParams.subjectType == "DNA")
00936 {
00937 pAligner = new MatchAlignerDNA
00938 ( queryParams.doAlignment, queryParams.bandExtension );
00939 }
00940 else if (queryParams.subjectType == "protein")
00941 {
00942 pAligner = new MatchAlignerTranslatedProtein
00943 ( false, queryParams.doAlignment, queryParams.bandExtension );
00944 }
00945 else
00946 {
00947 pAligner = new MatchAlignerTranslatedDNA
00948 ( queryParams.doAlignment, queryParams.bandExtension );
00949 }
00950
00951 }
00952 else
00953 {
00954 if (queryParams.subjectType == "protein")
00955 {
00956 pAligner = new MatchAlignerProtein
00957 ( queryParams.doAlignment, queryParams.bandExtension );
00958 }
00959 else
00960 {
00961 pAligner = new MatchAlignerTranslatedProtein
00962 ( true, queryParams.doAlignment, queryParams.bandExtension );
00963 }
00964
00965 }
00966
00967 assert( pAligner!=false);
00968
00969
00970 pPrintTask = new MatchTaskAlign( *pQuery, *pSubjectSource, pAligner,
00971 true, true );
00972
00973 }
00974 else
00975 {
00976
00977 if (queryParams.parserFriendly==true)
00978 {
00979 if (queryParams.reverseQuery==true)
00980 pPrintTask = new MatchTaskPrintTabbedReverse(cout);
00981 else
00982 pPrintTask = new MatchTaskPrintTabbed(cout);
00983 }
00984 else
00985 {
00986 if (queryParams.reverseQuery==true)
00987 pPrintTask = new MatchTaskPrintReverse(cout);
00988 else
00989 pPrintTask = new MatchTaskPrint(cout);
00990 }
00991
00992 }
00993
00994 assert(pPrintTask!=NULL);
00995
00996 if (queryParams.sortMatches==0)
00997 pTask=pPrintTask;
00998 else
00999 pTask=new CombineTaskVirtual(sorter,*pPrintTask);
01000
01001 assert (pTask!=NULL);
01002
01003 if ((queryParams.maxGap == -1)&&(queryParams.maxInsert==0))
01004 {
01005
01006
01007
01008
01009
01010 MatchAlgorithmGapped algorithm
01011 ( 0, 0, queryParams.minPrint,
01012 queryParams.numRepeats );
01013
01014 queryManager.doQuery
01015 ( algorithm, *pTask, queryParams.queryStart, queryParams.queryEnd );
01016
01017
01018 }
01019 else
01020 {
01021
01022 if (queryParams.maxGap==-1) queryParams.maxGap=0;
01023
01024 MatchAlgorithmGapped algorithm
01025 ( queryParams.maxGap, queryParams.maxInsert, queryParams.minPrint,
01026 queryParams.numRepeats );
01027
01028 queryManager.doQuery
01029 ( algorithm, *pTask, queryParams.queryStart, queryParams.queryEnd );
01030
01031
01032 }
01033
01034
01035
01036 delete pQueryEncoder;
01037 delete pHashTable;
01038
01039
01040 delete pQuery;
01041 if (pTask!=pPrintTask) delete pTask;
01042 delete pPrintTask;
01043
01044 }
01045
01046 SequenceReader* findSequenceReader
01047 ( const string& fileName, const string& fileType, SequenceEncoder* encoder )
01048 {
01049
01050 if ( fileType == "fasta" )
01051 {
01052 return (SequenceReader*) new SequenceReaderFasta
01053 (fileName.c_str(),encoder,cerr);
01054 }
01055 else if ( fileType == "fastq" )
01056 {
01057 return (SequenceReader*) new SequenceReaderFastq
01058 (fileName.c_str(),encoder,cerr);
01059 }
01060 else if ( fileType == "" )
01061 {
01062 SequenceReaderMulti* pMulti = new SequenceReaderMulti(cerr);
01063 fillSequenceReaderMulti(*pMulti,encoder,fileName);
01064 if ( pMulti->getNumReaders() == 0 )
01065 {
01066 cerr << "Error: no recognized sources of sequence data were found in "
01067 << fileName << ".\n";
01068 exit (1);
01069 }
01070 return (SequenceReader*) pMulti;
01071 }
01072 else
01073 {
01074 cerr << "Error: " << fileType << " is not a recognized file type.\n";
01075 exit(1);
01076 }
01077
01078 }
01079
01080
01081
01082
01083
01084
01085
01086
01087
01088
01089
01090
01091 void fillSequenceReaderMulti( SequenceReaderMulti& m,
01092 SequenceEncoder* encoder,
01093 const string& fileName,
01094 const string& dirName )
01095 {
01096
01097 string::size_type dotPos(fileName.find_last_of('.'));
01098 string suffix
01099 ( (dotPos==string::npos)?"":fileName.substr(dotPos));
01100 SequenceReader* pSeq(NULL);
01101
01102 string fullPathName( (dirName=="")?fileName:dirName+"/"+fileName);
01103
01104
01105 if ( suffix == ".fail" )
01106 {
01107 return;
01108 }
01109 else if ( ( suffix == ".fasta" ) || ( suffix == ".fa" ) )
01110 {
01111 cerr << "Creating SequenceReaderFasta from " << fileName << endl;
01112 pSeq = new SequenceReaderFasta(fullPathName.c_str(),encoder,cerr);
01113
01114
01115 }
01116 else if ( suffix == ".fastq" )
01117 {
01118 cerr << "Creating SequenceReaderFastq from " << fileName << endl;
01119 pSeq = new SequenceReaderFastq(fullPathName.c_str(),encoder,cerr);
01120
01121 }
01122
01123
01124 if (pSeq!=NULL)
01125 {
01126 string filterFileName
01127 = ( (dirName=="")?"":dirName+"/")
01128 + ( (dotPos==string::npos)?fileName:fileName.substr(0,dotPos))
01129 + ".fail";
01130
01131 ifstream* pFilterFile = new ifstream(filterFileName.c_str());
01132
01133 if (!pFilterFile->fail())
01134 {
01135 SequenceReaderFilter* pFilter
01136 = new SequenceReaderFilter( pSeq, pFilterFile, cerr );
01137 pSeq=pFilter;
01138 }
01139 else delete pFilterFile;
01140 m.addReader(pSeq);
01141
01142 }
01143 else
01144 {
01145 cerr << "Trying to open directory " << fullPathName << " ..." << endl;
01146 DIR* pDir;
01147 if ( ! ( pDir = opendir( fullPathName.c_str() ) ) )
01148 {
01149 cerr << " ... failed! Exiting function." << endl;
01150 return;
01151 }
01152 dirent* dirEntry;
01153 string entryName;
01154 while( dirEntry = readdir(pDir) )
01155 {
01156 entryName = (string) dirEntry->d_name;
01157 if ((entryName == ".")||(entryName=="..")) continue;
01158
01159 fillSequenceReaderMulti(m,encoder,entryName,fullPathName);
01160 }
01161 closedir( pDir );
01162 }
01163
01164 return;
01165
01166 }
01167
01168
01169
01170
01171
01172
01173
01174
01175
01176