Global/SSAHAMain.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2004
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : SSAHAMain
00025 // File Name    : SSAHAMain.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "SSAHAMain.h"
00034 #include "SequenceEncoder.h"
00035 #include "SequenceReaderMulti.h"
00036 #include "SequenceReaderFasta.h"
00037 #include "SequenceReaderFastq.h"
00038 //#include "SequenceReaderCodon.h"
00039 #include "SequenceReaderFilter.h"
00040 #include "HashTable.h"
00041 #include "HashTablePacked.h"
00042 #include "HashTableTranslated.h"
00043 #include "MatchStore.h"
00044 #include "MatchStoreUngapped.h"
00045 #include "MatchStoreGapped.h"
00046 #include "MatchAligner.h"
00047 #include "TimeStamp.h"
00048 #include <dirent.h>
00049 #include <fstream>
00050 #include <string>
00051 #include <assert.h>
00052 #include <exception>
00053 #include <cmath>
00054 
00055 MachineInfo* MachineInfo::info_ = 0;
00056 NullBuffer nullBuffer;
00057 ofstream*  pLogStream = NULL;
00058 
00059 int defaultWordLengthDNA = 12;
00060 int defaultWordLengthProtein = 4;
00061 
00062 
00063 
00064 // ### Function Definitions ###
00065 
00066 //typedef MatchTaskSort
00067 //<CombineSort<SortByMatchSize,SortBySubjectName,SortByQueryStart> >
00068 //SortByMatchType;
00069 
00070 typedef MatchTaskSort
00071 <SortByMatch>
00072 SortByMatchType;
00073 
00074 
00075 
00076 
00077 
00078 int main( int numArgs, char* args[] )
00079 {
00080 
00081   try
00082   {
00083 
00084     cerr << 
00085     "Welcome to SSAHA: Sequence Search and Alignment by Hashing Algorithm\n\
00086 Copyright (C) 2004 by Genome Research Limited\n\
00087 This software is released under the terms of version 2 of the GNU General\n\
00088 Public Licence, as published by the Free Software Foundation.\n\
00089 This is SSAHA Version 3.2, released 1st March 2004.\n\n";
00090 
00091   Timer timeStamp;
00092 
00093   string thisArg;
00094   if ( numArgs < 3 )
00095   {
00096     // -h, -help or no argument prints help message
00097     // anything else throws an exception
00098     if ( numArgs == 2 )
00099     {
00100       thisArg = (string) args[1];
00101       if ( ( thisArg != "-h" ) && ( thisArg != "-help" ) )
00102       {
00103         cerr << "Error: invalid argument"
00104              << " (" << args[1] << ")" 
00105              << " - use \'" << args[0] << " -help\' for help.\n";
00106         throw SSAHAException("Invalid command line argument.");
00107       } // ~if
00108     } // ~if
00109 
00110 
00111     int i(0); 
00112     do 
00113     {
00114       cout << helpText[i] << "\n";
00115     } while (helpText[++i][0]!='@');
00116 
00117     return(0);
00118 
00119 
00120 
00121   } // ~if
00122 
00123   QueryParameterStruct queryParams( defaultParams );
00124 
00125   CommandLineArg::parseCommandLine( numArgs, args, queryParams );
00126 
00127   //  cerr << timeStamp;
00128 
00129   processQuery( queryParams );
00130 
00131   //  cerr << timeStamp << "Job finished.\n";
00132 
00133   if (pLogStream != NULL) delete pLogStream;
00134 
00135   return (0);
00136 
00137   } // ~try
00138   catch (const SSAHAException& err )
00139   {
00140     cerr << "Caught SSAHA exception: " << err.what() << "\n";
00141     exit (1);
00142   } 
00143   catch (const std::exception& err )
00144   {
00145     cerr << "Caught exception: " << err.what() << "\n";
00146     exit (1);
00147   } 
00148 
00149 } // ~main
00150 
00151 // Name:      parseCommandLine
00152 // Arguments: command line arguments (in), QueryParameterStruct& (out)
00153 // Go through the command line arguments: if an argument is a recognized
00154 // keyword, take the following argument and place it in the appropriate
00155 // part of queryParams. Does not make any attempt to check if paramter values
00156 // are sensible.
00157 void CommandLineArg::parseCommandLine
00158 ( int numArgs, char* args[], QueryParameterStruct& queryParams )
00159 {
00160 
00161   int firstArg;
00162 
00163   // Add mandatory parameters
00164   if ( (args[2])[0] == '-' )
00165   {
00166     // then only one file name given: assume this is a subject file
00167     // to be hashed
00168     firstArg = 2;
00169     queryParams.subjectName = (string) args[1];
00170     queryParams.runQuery = false; 
00171   }
00172   else
00173   {
00174     // two file names given: assume first is query, second is subject
00175     firstArg = 3;
00176     queryParams.queryName   = (string) args[1];
00177     queryParams.subjectName = (string) args[2];
00178   }
00179 
00180   // Parse options
00181   vector<CommandLineArg*> validArgs;
00182   validArgs.push_back
00183   ( new CommandLineArgInt( "-wordLength", "-wl", queryParams.wordLength ) );
00184   validArgs.push_back
00185   ( new CommandLineArgInt( "-stepLength", "-sl", queryParams.stepLength ) );
00186   validArgs.push_back
00187   ( new CommandLineArgInt( "-maxStore", "-ms", queryParams.maxStore ) );
00188   validArgs.push_back
00189   ( new CommandLineArgInt( "-minPrint", "-mp", queryParams.minPrint ) );
00190   validArgs.push_back
00191   ( new CommandLineArgInt( "-maxGap", "-mg", queryParams.maxGap ) );
00192   validArgs.push_back
00193   ( new CommandLineArgInt( "-maxInsert", "-mi", queryParams.maxInsert ) );
00194   validArgs.push_back
00195   ( new CommandLineArgInt( "-numRepeats", "-nr", queryParams.numRepeats ) );
00196   validArgs.push_back
00197   ( new CommandLineArgInt( "-substituteWords", "-sw", 
00198                            queryParams.substituteThreshold ) );
00199   validArgs.push_back
00200   ( new CommandLineArgInt( "-bandExtension", "-be", 
00201                            queryParams.bandExtension ) );
00202   validArgs.push_back
00203   ( new CommandLineArgInt( "-sortMatches", "-sm", queryParams.sortMatches ) );
00204   validArgs.push_back
00205   ( new CommandLineArgInt
00206     ( "-doAlignment", "-da", queryParams.doAlignment ));
00207   validArgs.push_back
00208   ( new CommandLineArgInt( "-queryStart", "-qs", queryParams.queryStart ) );
00209   validArgs.push_back
00210   ( new CommandLineArgInt( "-queryEnd", "-qe", queryParams.queryEnd ) );
00211   validArgs.push_back
00212   ( new CommandLineArgBool( "-hashStats", "-hs", queryParams.printHashStats ));
00213   validArgs.push_back
00214   ( new CommandLineArgBool( "-packHits", "-ph", queryParams.packHits ));
00215   validArgs.push_back
00216   ( new CommandLineArgBool
00217     ( "-reverseQuery", "-rq", queryParams.reverseQuery ));  
00218   validArgs.push_back
00219   ( new CommandLineArgBool
00220     ( "-parserFriendly", "-pf", queryParams.parserFriendly ) );
00221   validArgs.push_back
00222   ( new CommandLineArgString( "-queryType", "-qt", queryParams.queryType ) );
00223   validArgs.push_back
00224   ( new CommandLineArgString( "-subjectType", "-st", queryParams.subjectType));
00225   validArgs.push_back
00226   ( new CommandLineArgString( "-queryFormat", "-qf", queryParams.queryFormat));
00227   validArgs.push_back
00228   ( new CommandLineArgString
00229     ( "-subjectFormat", "-sf", queryParams.subjectFormat) );
00230   //  validArgs.push_back
00231   //  ( new CommandLineArgString( "-reportMode", "-rm", queryParams.reportMode ) );
00232   validArgs.push_back
00233   ( new CommandLineArgString( "-queryReplace", "-qr", queryParams.queryReplace ) );
00234   validArgs.push_back
00235   ( new CommandLineArgString( "-subjectReplace", "-sr", queryParams.subjectReplace ) );
00236   validArgs.push_back
00237   ( new CommandLineArgString( "-logMode", "-lm", queryParams.logMode ) );
00238   validArgs.push_back
00239   ( new CommandLineArgString( "-saveName", "-sn", queryParams.saveName ) );
00240 
00241   string thisArg;
00242   vector<CommandLineArg*>::iterator pArg = validArgs.end();
00243   for ( int i(firstArg); i < numArgs ; i++ )
00244   {
00245     thisArg = (string) args[i];
00246     if (pArg != validArgs.end()) 
00247     // ... then an instance of CommandLineArg is waiting to parse the current
00248     // command line arg
00249     { 
00250       (*pArg)->addValue( thisArg ); 
00251       pArg = validArgs.end(); 
00252     } // ~if
00253     else
00254     // ... trawl through the list of command line args and see if any of 'em
00255     // recognize the current command line arg
00256     { 
00257       for (pArg = validArgs.begin() ; pArg != validArgs.end(); ++pArg )
00258       {
00259         if ( (*pArg)->isThisMe(thisArg) ) break; // *pArg recognizes the arg
00260       } // ~for      
00261       if ( pArg == validArgs.end() )
00262       // ... then the argument has not been recognized - complain!
00263       {
00264         cerr << "Error: command line argument " << thisArg << " not found.\n";
00265         exit(1);
00266       } // ~if
00267 
00268       // if pArg is an instance of type CommandLineArgBool, then there is
00269       // no associated value, so don't try to read one
00270       if ( dynamic_cast<CommandLineArgBool*>(*pArg)) pArg=validArgs.end();
00271     } // ~else    
00272   } // ~for
00273 
00274   // Deallocate memory
00275   for ( vector<CommandLineArg*>::iterator i = validArgs.begin(); 
00276         i != validArgs.end(); ++i ) delete *i;
00277 
00278 
00279 } // ~parseCommandLine
00280 
00281 // Name:      processQuery
00282 // Arguments: QueryParameterStruct& (in)
00283 // 1. Takes queryParams produced by parseCommandLine
00284 // 2. Check the values therein are sensible.
00285 // 3. Set up a SequenceReader for the query data
00286 // 4. Either load in a hash table or create one from the subject data
00287 // 5. Pass the hash table and query sequence reader to the QueryManager
00288 // 6. Run the query!
00289 void processQuery( QueryParameterStruct& queryParams )
00290 {
00291 
00292   // Send log information to appropriate destination
00293   if ( queryParams.logMode == "cout" )
00294   {
00295     cerr.rdbuf(cout.rdbuf());
00296   } // ~if
00297   else if ( queryParams.logMode == "null" )
00298   {
00299     cerr.rdbuf(&nullBuffer);
00300   } // ~else if
00301   else if ( queryParams.logMode != "cerr" )
00302   {
00303     pLogStream = new ofstream(queryParams.logMode.c_str(), ios::ate|ios::app);
00304     if (pLogStream->fail())
00305     {
00306       cerr << "could not open log stream file " << queryParams.logMode
00307            << ".\n";
00308       throw SSAHAException("could not open log file.\n");
00309     } // ~if
00310     cerr.rdbuf(pLogStream->rdbuf());
00311   } // ~else if 
00312 
00313   cerr << "Run started: " << getTimeNow()
00314        << "Input query parameters:\n" << queryParams;
00315 
00316   // Obtain and print machine info
00317   MachineInfo* info = MachineInfo::getInfo();
00318   cerr << *info;
00319 
00320   // Check queryParams values are sensible
00321 
00322   if ( queryParams.minPrint <= 0 )
00323   {
00324     cerr << "Error: value for minPrint (" << queryParams.minPrint 
00325          << ") must be greater than zero.\n";
00326     throw SSAHAException("Invalid value for minPrint");
00327   } // ~if
00328 
00329   if ( queryParams.maxStore < 0 )
00330   {
00331     cerr << "Error: value for maxStore (" << queryParams.maxStore 
00332          << ") must be greater than zero.\n";
00333     throw SSAHAException("Invalid value for maxStore");
00334   } // ~if
00335   if ( queryParams.substituteThreshold < 0 )
00336   {
00337     cerr << "Error: value for substituteThreshold (" 
00338          << queryParams.substituteThreshold
00339          << ") must be greater than zero.\n";
00340     throw SSAHAException("Invalid value for substituteThreshold");
00341   } // ~if
00342 
00343 
00344   if ( queryParams.doAlignment != 0 )
00345   {
00346     if( queryParams.doAlignment < 20 )
00347     {
00348       cerr << "Error: value for doAlignment (" << queryParams.doAlignment 
00349            << ") must be at least 20.\n";
00350       throw SSAHAException("Invalid value for doAlignment");
00351     } // ~if
00352 #ifdef XXX
00353     if (    ( queryParams.reportMode != "replaceA" )
00354          && ( queryParams.reportMode != "tag" )   )
00355     {
00356       cerr << "Info: graphical alignments can't be produced with reportMode\n"
00357            << "set to \"" << queryParams.reportMode 
00358            << "\" , setting reportmode to \"tag\" and proceeding.\n";
00359       queryParams.reportMode="tag";
00360     } // ~if   
00361 #endif
00362     if (    ( queryParams.queryReplace == "ignore" )
00363          || ( queryParams.queryReplace == "report" )   )
00364     {
00365       cerr << "Info: graphical alignments can't be produced with queryReplace\n"
00366            << "set to \"" << queryParams.queryReplace 
00367            << "\" , resetting to default and proceeding.\n";
00368       queryParams.queryReplace="default";
00369     } // ~if   
00370 
00371     if (    ( queryParams.subjectReplace == "ignore" )
00372          || ( queryParams.subjectReplace == "report" )   )
00373     {
00374       cerr << "Info: graphical alignments can't be produced with subjectReplace\n"
00375            << "set to \"" << queryParams.subjectReplace
00376            << "\" , setting to \"tag\" and proceeding.\n";
00377       queryParams.subjectReplace="tag";
00378     } // ~if   
00379 
00380 
00381   } // ~if
00382 
00383   // If no step length specified, set it to the word length
00384   if (    ( queryParams.stepLength <= 0 )
00385        || ( queryParams.stepLength > queryParams.wordLength )   )
00386   { 
00387     cerr << "Info: setting stepLength to " << queryParams.wordLength << ".\n";
00388     queryParams.stepLength = queryParams.wordLength;
00389   } // ~if
00390 
00391   // Check that queryStart and queryEnd are sensible
00392   if (queryParams.queryStart <= 0 )
00393   {
00394       cerr << "Error: first requested sequence  (" 
00395       << queryParams.queryStart
00396       << ") must be greater than or equal to 1.\n";
00397       throw SSAHAException("Invalid query start value!");
00398   } // ~if
00399 
00400   if (    ( queryParams.queryEnd != - 1 ) 
00401        && ( queryParams.queryEnd < queryParams.queryStart ) )
00402   {
00403       cerr << "Error: last requested sequence  (" 
00404       << queryParams.queryEnd
00405       << ") should be greater than or equal to first ("
00406       << queryParams.queryStart << ").\n";
00407       throw SSAHAException("Invalid query start and end values!");
00408   } // ~if
00409 
00410   // Set up mode for sequence readers
00411 
00412 #ifdef XXXX
00413   SequenceReaderMode* pMode;
00414 
00415   if ( queryParams.reportMode == "tag" )
00416   {
00417     pMode = new SequenceReaderModeFlagReplace('A');
00418   }
00419   else if ( queryParams.reportMode == "ignore" )
00420   {
00421     pMode = new SequenceReaderModeIgnore();
00422   }
00423   else if ( queryParams.reportMode == "report" )
00424   {
00425     pMode = new SequenceReaderModeReport();
00426   }
00427   else if ( queryParams.reportMode == "replaceA" )
00428   {
00429     pMode = new SequenceReaderModeReplace('A');
00430   }
00431   else if ( queryParams.reportMode == "replaceG" )
00432   {
00433     pMode = new SequenceReaderModeReplace('G');
00434   }
00435   else if ( queryParams.reportMode == "replaceC" )
00436   {
00437     pMode = new SequenceReaderModeReplace('C');
00438   }
00439   else if ( queryParams.reportMode == "replaceT" )
00440   {
00441     pMode = new SequenceReaderModeReplace('T');
00442   }
00443   else if ( queryParams.reportMode == "rrepA" )
00444   {
00445     pMode = new SequenceReaderModeReportReplace('A');
00446   }
00447   else if ( queryParams.reportMode == "rrepG" )
00448   {
00449     pMode = new SequenceReaderModeReportReplace('G');
00450   }
00451   else if ( queryParams.reportMode == "rrepC" )
00452   {
00453     pMode = new SequenceReaderModeReportReplace('C');
00454   }
00455   else if ( queryParams.reportMode == "rrepT" )
00456   {
00457     pMode = new SequenceReaderModeReportReplace('T');
00458   }
00459   else 
00460   {
00461     cerr << "Error: value for reportMode (" << queryParams.reportMode 
00462          << ") not recognised.\n";
00463     throw SSAHAException("Invalid value for reportMode");
00464   } // ~else
00465 #endif
00466 
00467   // trap foolish command line argument combinations
00468   if (    ( queryParams.subjectFormat == "hash" )
00469        && ( queryParams.saveName != "" ) )
00470   {
00471     cerr << "Error: trying to create hash table from hash table!\n";
00472     throw SSAHAException("Invalid value for subject format");
00473   } // ~if
00474 
00475   if ( queryParams.runQuery == false )
00476   {
00477     cerr << 
00478     "Info: no query name given, assuming hash table generation only \
00479 is required.\n";
00480     if (    ( queryParams.saveName == "" )
00481          && ( queryParams.printHashStats==false )   )
00482     {
00483       cerr << "Error: no saveName for hash table has been given.\n";
00484       throw SSAHAException("No saveName for hash table");
00485     } // ~if
00486   } // ~if
00487 
00488   // Set up subject data ...
00489   //  Allocator<PositionInHitList>* pArrayAllocator;
00490   //  cerr << "Info: pointer array will be allocated ";
00491 
00492   //  if (queryParams.mapArray==true)
00493   //  {
00494   //   cerr << "by memory mapping.\n";
00495   //    pArrayAllocator = new AllocatorMapped<PositionInHitList>;
00496   // }
00497   //  else
00498   //  {
00499   //   cerr << "from local memory.\n";
00500   //   pArrayAllocator = new AllocatorLocal<PositionInHitList>;
00501   //  }
00502 
00503   //  Allocator<PositionInDatabase>* pHitListAllocator;
00504   //  Allocator<PositionPacked>* pHitListAllocator;
00505   //  cerr << "Info: hit list will be allocated ";
00506 
00507   //  if (queryParams.mapHits==true)
00508   //  {
00509   //   cerr << "by memory mapping.\n";
00510   //   pHitListAllocator = new AllocatorMapped<PositionInDatabase>;
00511   //  }
00512   //  else
00513   //  {
00514   //   cerr << "from local memory.\n";
00515     //    pHitListAllocator = new AllocatorLocal<PositionInDatabase>;
00516   //   pHitListAllocator = new AllocatorLocal<PositionPacked>;
00517   //  }
00518 
00519   //  HashTable hashTable
00520 
00521   HashTableFactory creator(cerr);
00522 
00523   HashTableGeneric* pHashTable(NULL);
00524   SourceReader* pSubjectSource(NULL);
00525 
00526   if ( queryParams.subjectFormat == "hash" ) 
00527   { 
00528     SourceReaderIndex* pIndexReader;
00529     // loading in a previously created hash table
00530 
00531     {
00532       ifstream filesFile( (queryParams.subjectName+(string)".files").c_str() );
00533       ifstream indexFile( (queryParams.subjectName+(string)".index").c_str() );
00534       if ((!filesFile.fail())&&(!indexFile.fail()))
00535       {
00536         pIndexReader = new SourceReaderIndex(queryParams.subjectName);
00537         cerr << "Info: found a sequence source index for this hash table.\n";
00538       } // ~if  
00539       else
00540       {
00541         pIndexReader = NULL;
00542         cerr << 
00543           "Info: did not find a sequence source index for this hash table.\n";
00544             
00545       } // ~else
00546      filesFile.close();
00547      indexFile.close();
00548    } // ~scope of ifstreams
00549 
00550     pHashTable = creator.loadHashTable
00551       ( queryParams.subjectName, pIndexReader );
00552     pHashTable->setMaxNumHits( queryParams.maxStore );
00553 
00554     pSubjectSource=pIndexReader; // this ensures it gets deleted at the end
00555 
00556     // Determine and display type of loaded in hash table
00557     // subjectType still needs to be properly set up as it is
00558     // used to determind which MatchTaskAlign is needed
00559     if ( pHashTable->getBitsPerSymbol() == gResidueBits )
00560     {
00561       if ( pHashTable->getSourceDataType() == gProteinData )
00562       {
00563         cerr << "Hash table contains protein data." << endl;
00564         queryParams.subjectType = "protein";
00565       } // ~if
00566       else if ( pHashTable->getSourceDataType() == gDNAData )
00567       {
00568         cerr << "Hash table contains translated DNA data." << endl;
00569         queryParams.subjectType = "codon";
00570       } // ~else if
00571       else 
00572       {
00573         cerr << "Warning: could not determine type of hash table, "
00574              << "attempting to continue..." << endl;
00575       } // ~else
00576     } // ~if
00577     else if (     ( pHashTable->getBitsPerSymbol() == gBaseBits )
00578                && ( pHashTable->getSourceDataType() == gDNAData ) ) 
00579     {
00580         cerr << "Hash table contains untranslated DNA data." << endl;
00581         queryParams.subjectType = "DNA";
00582     } // ~if
00583     else
00584     {
00585         cerr << "Warning: could not determine type of hash table, "
00586              << "attempting to continue..." << endl;
00587     } // ~else
00588 
00589 
00590     // any word length / step length input is irrelevant as
00591     // we have to use the values the table was created with.
00592     if ( pHashTable->getWordLength() != queryParams.wordLength )
00593     {
00594       cerr << "Info: hash table was created using hash word length of "
00595            << pHashTable->getWordLength() 
00596            <<" bases,\nproceeding using this value.\n";
00597       queryParams.wordLength = pHashTable->getWordLength();
00598     } // ~if
00599 
00600     if ( pHashTable->getStepLength() != queryParams.stepLength )
00601     {
00602       cerr << "Info: hash table was created using interval of "
00603            << pHashTable->getStepLength() 
00604            <<" between bases,\nproceeding using this value.\n";
00605       queryParams.stepLength = pHashTable->getStepLength();
00606     } // ~if
00607 
00608 
00609   } // ~if
00610   else 
00611   {
00612     // have to create our own hash table
00613 
00614     if (    ( queryParams.subjectType == "DNA" ) 
00615          && ( queryParams.queryType == "protein" ) )
00616     {
00617       cerr << "Warning: can't run protein query against DNA database, setting "
00618            << "subject database type to \"codon\" instead." << endl;
00619       queryParams.subjectType = "codon";
00620     } // ~if
00621 
00622     if ( queryParams.subjectType == "DNA" )
00623     {
00624       pHashTable = new HashTablePacked( cerr, queryParams.saveName );
00625       if (    ( queryParams.wordLength <= 0 )
00626               || ( queryParams.wordLength*gBaseBits > ( 8*sizeof(Word)) -1 ) )
00627       {
00628         cerr << "Warning: word length (" << queryParams.wordLength 
00629              << ") outside valid range (0 to " 
00630              << (8*sizeof(Word)-1)/gBaseBits << "),\n"
00631              << "using default word length of " 
00632              << defaultWordLengthDNA << " instead.\n";
00633         queryParams.wordLength = defaultWordLengthDNA;
00634         //      throw SSAHAException("Invalid value for wordLength.");
00635       
00636       } // ~if
00637       
00638     } // ~if
00639     else 
00640     {
00641       if ( queryParams.subjectType == "protein" )
00642       {
00643         
00644         pHashTable = new HashTablePackedProtein( cerr, queryParams.saveName );
00645         //      pHashTable = new HashTableFred( cerr, queryParams.saveName );
00646       } // ~if
00647       else if ( queryParams.subjectType == "codon" )
00648       {
00649         pHashTable = new HashTableTranslated( cerr, queryParams.saveName );
00650       } // ~else if
00651       else 
00652       {
00653         cerr << "Error: value for subjectType (" << queryParams.subjectType
00654              << ") not recognized.\n";
00655         throw SSAHAException("Invalid value for subjectType.\n");
00656       } // ~if
00657 
00658       if (    ( queryParams.wordLength <= 0 )
00659               || ( queryParams.wordLength*gResidueBits > ( 8*sizeof(Word)) -1 ) )
00660       {
00661         cerr << "Warning: word length (" << queryParams.wordLength 
00662              << ") outside valid range (0 to " 
00663              << (8*sizeof(Word)-1)/gResidueBits << "),\n"
00664              << "using default word length of " 
00665              << defaultWordLengthProtein << " instead.\n";
00666         queryParams.wordLength = defaultWordLengthProtein;
00667         //      throw SSAHAException("Invalid value for wordLength.");
00668       } // ~if
00669      } // ~else
00670 
00671     // If no step length specified, set it to the word length
00672     if (    ( queryParams.stepLength <= 0 )
00673          || ( queryParams.stepLength > queryParams.wordLength )   )
00674     { 
00675       cerr << "Info: setting stepLength to " << queryParams.wordLength << ".\n";
00676       queryParams.stepLength = queryParams.wordLength;
00677     } // ~if
00678     
00679 
00680     SequenceReaderMode* pSubjectMode(NULL);
00681 
00682     if ( queryParams.subjectReplace == "tag" )
00683     {
00684       pSubjectMode = new SequenceReaderModeFlagReplace('A');
00685     } // ~if
00686     else if ( queryParams.subjectReplace == "ignore" )
00687     {
00688       pSubjectMode = new SequenceReaderModeIgnore();
00689     } // ~else if
00690     else if ( queryParams.subjectReplace == "report" )
00691     {
00692       pSubjectMode = new SequenceReaderModeReport();
00693     } // ~else if
00694     else if ( queryParams.subjectReplace.size()==1)
00695     {
00696       unsigned char replaceChar(*queryParams.subjectReplace.c_str());
00697       if ((queryParams.subjectType=="protein")&&(ttProtein[replaceChar]==nv))
00698       {
00699         cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace 
00700              << ") must be a valid IUPAC amino acid code.\n";
00701         throw SSAHAException("Invalid value for subjectReplace");
00702       } // ~if
00703       else if (ttDNA[replaceChar]==nv)
00704       {
00705         cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace 
00706              << ") must be A,G,C or T.\n";
00707         throw SSAHAException("Invalid value for subjectReplace");
00708       } // ~else if
00709       pSubjectMode = new SequenceReaderModeReplace(replaceChar);
00710     } // ~else if
00711   else 
00712   {
00713     cerr << "Error: value for subjectReplace (" << queryParams.subjectReplace 
00714          << ") not recognised.\n";
00715     throw SSAHAException("Invalid value for subjectReplace");
00716   } // ~else
00717 
00718 
00719     SequenceEncoder* pSubjectEncoder;
00720 
00721     if ( queryParams.subjectType == "protein" )
00722       pSubjectEncoder = new SequenceEncoderProtein(5,cerr);
00723     else
00724       pSubjectEncoder = new SequenceEncoderDNA(12,cerr); 
00725 
00726     SequenceReader* pSubject =
00727     findSequenceReader
00728     ( queryParams.subjectName, queryParams.subjectFormat, pSubjectEncoder ); 
00729     assert (pSubject !=NULL);
00730 
00731     pSubject->changeMode( pSubjectMode );
00732     delete pSubjectMode; // cloned, so no longer needed
00733 
00734     creator.createHashTable
00735     (  *pHashTable,
00736        *pSubject, 
00737        queryParams.wordLength, 
00738        queryParams.maxStore, 
00739        queryParams.stepLength  );
00740 
00741     // Save the hash table to a file, if required
00742     if (    ( queryParams.saveName != "" )
00743          && ( queryParams.subjectFormat != "hash" )  )
00744     { 
00745       //      hashTable.saveHashTable(queryParams.saveName);
00746       creator.saveHashTable( *pHashTable );
00747       pSubject->saveIndex( queryParams.saveName );
00748     } // ~if
00749 
00750     if (queryParams.doAlignment != 0 )
00751     {
00752       pSubjectSource = (SourceReader*) pSubject;
00753     } // ~if
00754     else
00755     {
00756       delete pSubjectEncoder;
00757       delete pSubject;
00758     } // ~else
00759   } // ~else
00760 
00761   assert (pHashTable!=NULL);
00762   assert (pHashTable->isInitialized());
00763 
00764   double expectedNumHits = pHashTable->getTotalNumWords();
00765 
00766   if (    (pHashTable->getHitListFormat()==gTranslated)
00767        || (pHashTable->getHitListFormat()==g32BitPackedProtein) )
00768   {
00769     expectedNumHits /= 
00770       pow((double)gNumCodonEncodings,(int)pHashTable->getWordLength());
00771   } // ~if
00772   else
00773   {
00774     assert(pHashTable->getBitsPerSymbol()==gBaseBits);
00775     expectedNumHits /= 1<<(2*pHashTable->getWordLength());
00776   } // ~else
00777 
00778   //  expectedNumHits /= (pHashTable->getBitsPerSymbol()==gBaseBits)
00779   //   ? 1<<(2*pHashTable->getWordLength())
00780   //   : pow((double)gNumCodonEncodings,(int)pHashTable->getWordLength());
00781 
00782   cerr << "Info: would expect " << expectedNumHits 
00783          << " hits per word for a random database of this size." << endl;
00784 
00785   queryParams.maxStore=1+(int)(expectedNumHits*queryParams.maxStore);
00786 
00787   cerr << "Info: will ignore hits on words that occur more than " 
00788        << queryParams.maxStore << " times in the database." << endl;
00789 
00790   pHashTable->setMaxNumHits
00791   (
00792     queryParams.maxStore
00793   );
00794 
00795 
00796   if ( queryParams.printHashStats == true )
00797   {
00798     pHashTable->printHashStats();
00799   } // ~if
00800 
00801   if ( queryParams.runQuery == false )
00802   {
00803     cerr << "Info: no query name given, assuming no query required.\n"; 
00804     delete pHashTable;
00805     return;
00806   } // ~if
00807 
00808   // Check that numRepeats is sensible
00809   // Checked here and not earlier because word length may change from
00810   // the value specified in the command line arguments if the hash
00811   // table has been loaded in from disk.
00812   if ( queryParams.numRepeats < 0 )
00813   {
00814     cerr << "Error: max size of repeated motif to be masked (" 
00815          << queryParams.numRepeats
00816          << ") outside valid range (0 to " 
00817          << queryParams.wordLength << ").\n";
00818     throw SSAHAException("Invalid value for numRepeats!");
00819   } // ~if
00820   else if ( queryParams.numRepeats > queryParams.wordLength ) 
00821   {
00822     cerr << "Warning: max size of repeated motif to be masked (" 
00823          << queryParams.numRepeats
00824          << ") outside valid range (0 to " 
00825          << queryParams.wordLength << "),\nusing "
00826          << queryParams.wordLength << " instead.\n";
00827     queryParams.numRepeats = queryParams.wordLength;
00828   } // ~if
00829 
00830   if (    ( queryParams.queryType != "DNA" )
00831           && ( queryParams.queryType != "protein" )
00832           && ( queryParams.queryType != "codon" ) )
00833   {
00834     cerr << "Error: value for queryType (" << queryParams.queryType
00835          << ") not recognized.\n";
00836     throw SSAHAException("Invalid value for queryType.\n");
00837   } // ~if
00838 
00839   if (queryParams.substituteThreshold > 0 )
00840   {
00841     pHashTable->setSubstituteThreshold( queryParams.substituteThreshold );
00842   } // ~if
00843 
00844   SequenceEncoder* pQueryEncoder;
00845   
00846 if ( queryParams.queryType == "protein" )
00847 {
00848   if (    (dynamic_cast<HashTablePackedProtein*>(pHashTable)==NULL)
00849           && (dynamic_cast<HashTableTranslated*>(pHashTable)==NULL) )
00850   {
00851     cerr << "Error: protein query can't run against DNA database!\n";
00852     throw SSAHAException("Data type mismatch between query and subject");
00853   } // ~if
00854   pQueryEncoder = new SequenceEncoderProtein(5);
00855   if (queryParams.queryReplace == "default") queryParams.queryReplace="X";
00856 } // ~if
00857 else
00858 {
00859   pQueryEncoder = new SequenceEncoderDNA(12); 
00860   if (queryParams.queryReplace == "default") queryParams.queryReplace="A";
00861 } // ~else
00862 
00863 assert (pQueryEncoder != NULL);
00864 
00865 
00866 SequenceReaderMode* pQueryMode(NULL);
00867 
00868 if ( queryParams.queryReplace == "tag" )
00869 {
00870   pQueryMode = new SequenceReaderModeFlagReplace('A');
00871 } // ~if
00872 else if ( queryParams.queryReplace == "ignore" )
00873 {
00874   pQueryMode = new SequenceReaderModeIgnore();
00875 } // ~else if
00876 else if ( queryParams.queryReplace == "report" )
00877 {
00878   pQueryMode = new SequenceReaderModeReport();
00879 } // ~else if
00880 else if ( queryParams.queryReplace.size()==1)
00881 {
00882   unsigned char replaceChar(*queryParams.queryReplace.c_str());
00883   if (queryParams.queryType=="protein")
00884   {
00885     if (ttProtein[replaceChar]==nv)
00886     {
00887       cerr << "Error: value for queryReplace (" << queryParams.queryReplace 
00888            << ") must be a valid IUPAC amino acid code.\n";
00889       throw SSAHAException("Invalid value for queryReplace");
00890     } // ~if
00891   } // ~if
00892   else if (ttDNA[replaceChar]==nv)
00893   {
00894     cerr << "Error: value for queryReplace (" << queryParams.queryReplace 
00895          << ") must be A,G,C or T.\n";
00896     throw SSAHAException("Invalid value for queryReplace");
00897   } // ~else if
00898   pQueryMode = new SequenceReaderModeReplace(replaceChar);
00899 } // ~else if
00900 else 
00901 {
00902   cerr << "Error: value for queryReplace (" << queryParams.queryReplace 
00903        << ") not recognised.\n";
00904     throw SSAHAException("Invalid value for queryReplace");
00905 } // ~else
00906 
00907 
00908 
00909   // Set up SequenceReader for query data
00910   SequenceReader* pQuery = 
00911   findSequenceReader
00912     ( queryParams.queryName, queryParams.queryFormat, pQueryEncoder );
00913   assert(pQuery!=NULL);  
00914 
00915   pQuery->changeMode( pQueryMode );
00916 delete pQueryMode;
00917 
00918   // Set up query manager
00919   QueryManager queryManager( *pQuery,*pHashTable,cerr);
00920 
00921   // Set up MatchTask
00922 
00923   MatchTask* pTask(NULL);
00924   MatchTask* pPrintTask(NULL);
00925   //  MatchTaskPrint printNormal;
00926   //  MatchTaskPrintTabbed printTabbed;
00927   SortByMatchType sorter(queryParams.sortMatches,0.25);
00928   
00929   if (queryParams.doAlignment != 0 )
00930   {
00931     MatchAligner* pAligner;
00932 
00933     if (queryParams.queryType == "DNA")
00934     {
00935       if (queryParams.subjectType == "DNA")
00936       {
00937         pAligner = new MatchAlignerDNA
00938           ( queryParams.doAlignment, queryParams.bandExtension ); 
00939       } // ~if
00940       else if (queryParams.subjectType == "protein")
00941       {
00942         pAligner = new MatchAlignerTranslatedProtein
00943           ( false, queryParams.doAlignment, queryParams.bandExtension );   
00944       } // ~else if
00945       else // (queryParams.subjectType == "codon")
00946       {
00947         pAligner = new MatchAlignerTranslatedDNA
00948           ( queryParams.doAlignment, queryParams.bandExtension );   
00949       } // ~else
00950 
00951     } // ~if (queryParams.queryType == "DNA")
00952     else // query is protein
00953     {  
00954       if (queryParams.subjectType == "protein")
00955       {
00956         pAligner = new MatchAlignerProtein
00957           ( queryParams.doAlignment, queryParams.bandExtension ); 
00958       } // ~if
00959       else // subjectType is "DNA" or "codon"
00960       {
00961         pAligner = new MatchAlignerTranslatedProtein
00962           ( true, queryParams.doAlignment, queryParams.bandExtension );   
00963       } // ~else
00964 
00965     } // ~else
00966 
00967     assert( pAligner!=false);
00968 
00969     // ownership of *pAligner passes to *pPrintTask
00970     pPrintTask = new MatchTaskAlign( *pQuery, *pSubjectSource, pAligner,
00971                                      true, true );
00972 
00973   } // ~if
00974   else
00975   {
00976 
00977     if (queryParams.parserFriendly==true)
00978       {
00979         if (queryParams.reverseQuery==true) 
00980           pPrintTask = new MatchTaskPrintTabbedReverse(cout);
00981         else
00982           pPrintTask = new MatchTaskPrintTabbed(cout);
00983       } // ~if
00984     else
00985       {
00986         if (queryParams.reverseQuery==true) 
00987           pPrintTask = new MatchTaskPrintReverse(cout);
00988         else
00989           pPrintTask = new MatchTaskPrint(cout);
00990       } // ~else
00991 
00992   } // ~else
00993 
00994   assert(pPrintTask!=NULL);
00995 
00996   if (queryParams.sortMatches==0)
00997     pTask=pPrintTask;
00998   else
00999     pTask=new CombineTaskVirtual(sorter,*pPrintTask);
01000 
01001   assert (pTask!=NULL);
01002 
01003   if ((queryParams.maxGap == -1)&&(queryParams.maxInsert==0))
01004   {
01005 
01006     // This no longer works with the new codon stuff TC 11.12.1
01007     //    MatchAlgorithmUngapped algorithm
01008     //    ( queryParams.minPrint, queryParams.numRepeats );
01009 
01010     MatchAlgorithmGapped algorithm
01011     ( 0, 0, queryParams.minPrint, 
01012       queryParams.numRepeats );    
01013 
01014     queryManager.doQuery
01015     ( algorithm, *pTask, queryParams.queryStart, queryParams.queryEnd );
01016 
01017 
01018   } // ~if
01019   else
01020   {
01021 
01022     if (queryParams.maxGap==-1) queryParams.maxGap=0;
01023 
01024     MatchAlgorithmGapped algorithm
01025     ( queryParams.maxGap, queryParams.maxInsert, queryParams.minPrint, 
01026       queryParams.numRepeats );    
01027 
01028     queryManager.doQuery
01029     ( algorithm, *pTask, queryParams.queryStart, queryParams.queryEnd );
01030     
01031 
01032   } // ~else
01033 
01034   // Don't need to delete pQueryEncoder because ownership of it has
01035   // passed to *pQuery, so it is deleted when *pQuery is deleted.
01036   delete pQueryEncoder;
01037   delete pHashTable;
01038 //  delete pSubjectMode;
01039 //  delete pQueryMode;
01040   delete pQuery;
01041   if (pTask!=pPrintTask) delete pTask;
01042   delete pPrintTask;
01043 
01044 } // ~processQuery
01045 
01046 SequenceReader* findSequenceReader
01047 ( const string& fileName, const string& fileType, SequenceEncoder* encoder )
01048 {
01049 
01050    if ( fileType == "fasta" )
01051    {
01052      return (SequenceReader*) new SequenceReaderFasta
01053        (fileName.c_str(),encoder,cerr);
01054    }
01055    else if ( fileType == "fastq" )
01056    {
01057      return (SequenceReader*) new SequenceReaderFastq
01058        (fileName.c_str(),encoder,cerr);
01059    }
01060    else if ( fileType == "" )
01061    {
01062      SequenceReaderMulti* pMulti = new SequenceReaderMulti(cerr);
01063      fillSequenceReaderMulti(*pMulti,encoder,fileName);
01064      if ( pMulti->getNumReaders() == 0 )
01065      {
01066        cerr << "Error: no recognized sources of sequence data were found in "
01067             << fileName << ".\n";
01068        exit (1);
01069      } // ~if
01070      return (SequenceReader*) pMulti;
01071    }
01072    else
01073    {
01074      cerr << "Error: " << fileType << " is not a recognized file type.\n"; 
01075      exit(1);
01076    }
01077    
01078 } // ~findSequenceReader
01079 
01080 
01081 // Name:      fillSequenceReaderMulti
01082 // Arguments: SequenceReaderMulti& (out), string& (in), string& (in
01083 // This uses the filename suffix to deduce what file type to create
01084 // if fileName = *.fasta it creates an instance of SequenceReaderFasta and
01085 // adds it to m.
01086 // (Other file formats are TBD)
01087 // If filename suffix does not exist or is not recognized, it assumes
01088 // the filename must be a directory. It then calls itself for each filename
01089 // found in the directory. By calling itself recursively in this way, all
01090 // files in a directory tree are incorporated into m.
01091 void fillSequenceReaderMulti( SequenceReaderMulti& m,
01092                               SequenceEncoder* encoder,
01093                               const string& fileName, 
01094                               const string& dirName )
01095 {
01096 
01097   string::size_type dotPos(fileName.find_last_of('.'));
01098   string suffix
01099     ( (dotPos==string::npos)?"":fileName.substr(dotPos));  
01100   SequenceReader* pSeq(NULL);
01101 
01102   string fullPathName( (dirName=="")?fileName:dirName+"/"+fileName);
01103   //  cout << fullPathName << endl;
01104 
01105   if ( suffix == ".fail" )
01106   {
01107     return; // fail files picked up when their associated seq file is read
01108   }
01109   else if ( ( suffix == ".fasta" ) || ( suffix == ".fa" ) ) 
01110   { 
01111     cerr << "Creating SequenceReaderFasta from " << fileName << endl; 
01112     pSeq = new SequenceReaderFasta(fullPathName.c_str(),encoder,cerr);
01113     //    m.addReader(pSeq);
01114     //    m.addReader(new SequenceReaderFasta(fullPathName.c_str(),encoder,cerr));
01115   } // ~if
01116   else if ( suffix == ".fastq" ) 
01117   { 
01118     cerr << "Creating SequenceReaderFastq from " << fileName << endl; 
01119     pSeq = new SequenceReaderFastq(fullPathName.c_str(),encoder,cerr);
01120     //m.addReader(new SequenceReaderFastq(fullPathName.c_str(),encoder,cerr));
01121   } // ~else if
01122   // ... else ifs for all other formats ...
01123 
01124   if (pSeq!=NULL)
01125   {
01126     string filterFileName
01127       = ( (dirName=="")?"":dirName+"/")
01128       + ( (dotPos==string::npos)?fileName:fileName.substr(0,dotPos))
01129       + ".fail"; 
01130 
01131     ifstream* pFilterFile = new ifstream(filterFileName.c_str());
01132 
01133     if (!pFilterFile->fail())
01134     {
01135       SequenceReaderFilter* pFilter
01136         = new SequenceReaderFilter( pSeq, pFilterFile, cerr );
01137       pSeq=pFilter;
01138     }
01139     else delete pFilterFile;
01140     m.addReader(pSeq);
01141 
01142   }
01143   else // presume fullPathName is a directory, try to expand it
01144   {
01145     cerr << "Trying to open directory " << fullPathName  << " ..." << endl;
01146     DIR* pDir;
01147     if ( ! ( pDir = opendir( fullPathName.c_str() ) ) )
01148     {
01149       cerr << " ... failed! Exiting function." << endl;
01150       return; 
01151     } // ~if
01152     dirent* dirEntry;
01153     string entryName;
01154     while( dirEntry = readdir(pDir) )
01155     {
01156       entryName = (string) dirEntry->d_name;
01157       if ((entryName == ".")||(entryName=="..")) continue;
01158       // function calls itself recursively ... 
01159       fillSequenceReaderMulti(m,encoder,entryName,fullPathName);
01160     } // ~while
01161     closedir( pDir );
01162   } // ~else
01163 
01164   return;
01165 
01166 } // ~findSequenceReader 
01167 
01168 // End of file SSAHAMain.cpp
01169 
01170 
01171 
01172 
01173 
01174 
01175 
01176 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2