SequenceReader/SequenceReaderMulti.h

Go to the documentation of this file.
00001 /*  Last edited: Feb 21 18:18 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : SequenceReaderMulti
00026 // File Name    : SequenceReaderMulti.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_SequenceReaderMulti
00032 #define INCLUDED_SequenceReaderMulti
00033 
00034 // Description:
00035 
00036 // #pragma interface
00037 // Includes:
00038 #include "SequenceReader.h"
00039 #include <memory>
00040 #include <algorithm>
00041 #include <vector>
00042 
00043 // NB it is good practice for #include statements in header files to be
00044 // replaced by forward declarations if at all possible
00045 
00046 // ### Class Declarations ###
00047 
00048   struct SeqReaderInfo
00049   {
00050     SequenceReader* ptr_;
00051     int             size_;
00052     bool            allSeqsRead_;
00053     SeqReaderInfo( SequenceReader* ptr ) : 
00054       ptr_( ptr ), size_( 0 ), allSeqsRead_( false ) 
00055       {
00056       } 
00057     SeqReaderInfo( void ) : 
00058       ptr_( NULL ), size_( 0 ), allSeqsRead_( false )
00059       {
00060       } 
00061     SeqReaderInfo( const SeqReaderInfo& rhs ) : 
00062       //      ptr_( rhs.ptr_->clone() ), size_( rhs.size_ ), allSeqsRead_( rhs.allSeqsRead_ ) {} 
00063       ptr_( rhs.ptr_ ), size_( rhs.size_ ), allSeqsRead_( rhs.allSeqsRead_ ) 
00064       {
00065       } 
00066     ~SeqReaderInfo() 
00067       { 
00068       }
00069 
00070   };
00071 
00072 
00073 // -----------------------------
00074 // Class Name : SequenceReaderMultiState
00075 // Description: This preserves the state (ie position in a file) of
00076 // a SequenceReaderMulti
00077 class SequenceReaderMultiState : public SequenceReaderState
00078 {
00079  public:
00080   SequenceReaderMultiState
00081     ( SequenceNumber lsn, 
00082       vector<SeqReaderInfo>::iterator tr,
00083       SequenceReaderState* ps ) :
00084   thisReader_(tr),  
00085     pState_(ps), SequenceReaderState(lsn) {}
00086   virtual ~SequenceReaderMultiState() {} //delete pState_;
00087   // no point in making this private as it's const
00088   const vector<SeqReaderInfo>::iterator thisReader_;
00089   // this is state info for *thisReader, whatever it is
00090   SequenceReaderState* pState_;
00091 };
00092 
00093 
00094 // Class Name : SequenceReaderMulti
00095 // Description: The purpose of this class is to enable multiple 
00096 // SequenceReaders to 'act as one.' This enables for example a directory
00097 // of fasta files to be processed in the same way as a single one.
00098 class SequenceReaderMulti : public SequenceReader
00099 {
00100 
00101 
00102   // PUBLIC MEMBER FUNCTIONS
00103   public:
00104 
00105   // Constructors and Destructors
00106 
00107   // Function Name:
00108   // Arguments:
00109   // TYPE  NAME  IN/OUT COMMENT
00110   // Returns: TYPE COMMENT
00111   SequenceReaderMulti( ostream& monitoringStream = cerr );
00112 
00113   SequenceReaderMulti( const SequenceReaderMulti& rhs); 
00114 
00115 
00116   // Function Name:
00117   // Arguments:
00118   // TYPE  NAME  IN/OUT COMMENT
00119   // Returns: TYPE COMMENT
00120   virtual ~SequenceReaderMulti(); 
00121   // (NB destructor should be virtual if class is to be derived from)
00122 
00123   // Manipulator Functions
00124   virtual SequenceReader* clone( void ) 
00125   { return new SequenceReaderMulti( *this ); }
00126 
00127 
00128   // Function Name: addReader
00129   // Arguments: SequenceReader&
00130   // A copy of seq is made (which is why SequenceReader and its subclasses
00131   // need to have a copy constructor) and a pointer to it is placed in 
00132   // allReaders. 
00133   void addReader( SequenceReader& seq );
00134 
00135   // Function Name: addReader
00136   // Arguments: SequenceReader*
00137   // pSeq is placed in allReaders_. NB no copy of pSeq is made: use by
00138   // creating a new SequenceReader and calling this. The SeqReaderMulti
00139   // takes ownership of *pSeq and is responsible for its destruction
00140   void addReader( SequenceReader* pSeq );
00141 
00142 
00143   // Function Name: changeMode
00144   // Arguments: const SequenceReaderMode&
00145   // Makes a copy of mode and uses it to handle mismatch character reads
00146   virtual void changeMode( SequenceReaderMode* pMode );
00147 
00148 
00149   // Accessor Functions
00150   // (NB all accessor functions should be 'const')
00151 
00152   int getNumReaders( void )
00153   { 
00154     return allReaders_.size();
00155   }
00156 
00157   // Function Name: findReader
00158   // Arguments: SequenceNumber& (in/out)
00159   // Given an input sequence number, adjusts seqNum and thisReader_
00160   // so that the seqNum'th sequence of *this is obtained by passing
00161   // the adjusted value of seqNum to *thisReader_
00162   // Returns true if the find was successful, else returns false
00163   // (in which case thisReader_ and seqNum are not adjusted)
00164   bool findReader( SequenceNumber& seqNum );
00165   
00166   // Function Name: rewind
00167   // Arguments: void
00168   // Returns:   void
00169   // Rewind to the start of the data file, so that getNextSequence will
00170   // return the first sequence in the file
00171   virtual void rewind( void );
00172 
00173   // Function Name: findSequence
00174   // Arguments: SequenceNumber (in)
00175   // Returns:   void
00176   // Winds the input file stream to the start of sequence number seqNum. 
00177   // Returns false if seqNum exceeds the number of sequences in
00178   // the file.
00179   virtual bool findSequence( SequenceNumber seqNum );
00180 
00181   // Function Name: getNextSequence
00182   // Arguments: WordSequence& (out), int (in)
00183   // Returns:   int
00184   // Read the next set of sequence information from the file and parse it
00185   // into WordSequence format. Returns -1 if there has been a problem with
00186   // reading the sequence, else returns the number of valid base pairs 
00187   // contained within the final word of the sequence
00188   virtual int getNextSequence( WordSequence& nextSeq, int wordLength );
00189 
00190   // Function Name: getSequence
00191   // Arguments: WordSequence& (out), SequenceNumber (in), int (in)
00192   // Returns:   bool
00193   // Read the sequenceNumber-th set of sequence information from the file and 
00194   // parse it into WordSequence format
00195   virtual int getSequence
00196     ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength );
00197 
00198   // Function Name: getLastSequenceName
00199   // Arguments: string& (out)
00200   // Returns:   void
00201   // Fills the string with the name of the last sequence read 
00202   virtual void getLastSequenceName( string& seqName ) const;
00203 
00204   // Function Name: getBitsPerSymbol
00205   // Arguments: none
00206   // Returns:   int
00207   // Returns number of bits per symbol used in encoding
00208   virtual int getBitsPerSymbol ( void ) const;
00209 
00210   // Function Name: getSourceDataType
00211   // Arguments: none
00212   // Returns:   SourceDataType
00213   // Returns type of data being encoded (protein or DNA)
00214   virtual SourceDataType getSourceDataType( void ) const;
00215 
00216   // Function Name: printName
00217   // Arguments: string& (out), int (in)
00218   // Returns:   void
00219   // Fills a string with the name of the requested sequence
00220   virtual bool printName( ostream& os, SequenceNumber seqNum );
00221 
00222   // Function Name: printSideInfo
00223   // Arguments: string& (out), SequenceNumber (in)
00224   // Returns:   void
00225   // Fills a string with the name of the requested sequence
00226   virtual bool printSideInfo( ostream& os, SequenceNumber seqNum );
00227 
00228   // Function Name: printSource
00229   // Arguments: string& (out), SequenceNumber (in)
00230   // Returns:   void
00231   // Fills a string with the name of the requested sequence
00232   virtual bool printSource( ostream& os, SequenceNumber seqNum );
00233 
00234   // Function Name: extractSource
00235   // This extracts the source data for bases seqStart to seqEnd inclusive
00236   // of sequence seqNum and places it in source
00237   virtual void extractSource( char** pSource,//vector<char>& source, 
00238                      SequenceNumber seqNum,
00239                      SequenceOffset seqStart,
00240                      SequenceOffset seqEnd );
00241 
00242   // Function Name: saveIndexImp
00243   // Actually save the indexing data to disk. Implemented
00244   // for SequenceReaderFile and SequenceReaderMulti, not
00245   // for SourceReaderIndex
00246   virtual void saveIndexImp
00247     ( ostream& fileFile, 
00248       ostream& indexFile, 
00249       int& fileNumber );
00250 
00251 
00252   // Function Name: saveState
00253   // Arguments: void
00254   // Returns:   SequenceReaderState*
00255   // saves the state (ie current file position) of a SequenceReader for future
00256   // restoration
00257   virtual SequenceReaderState* saveState( void ) const
00258   {
00259     return new SequenceReaderMultiState
00260       ( lastSequenceNumber_, 
00261         thisReader_, 
00262         (thisReader_==allReaders_.end()) 
00263         ? NULL 
00264         : thisReader_->ptr_->saveState() );
00265   }
00266 
00267   // Function Name: restoreState
00268   // Arguments:   SequenceReaderState*
00269   // Returns:     void
00270   // restores the state (ie current file position) of a SequenceReader
00271   // then (NB!!) deletes *pState;
00272   virtual void restoreState( SequenceReaderState* pState ) 
00273   {
00274     SequenceReaderMultiState* p
00275       (dynamic_cast<SequenceReaderMultiState*>(pState));
00276     assert(p!=NULL);
00277     lastSequenceNumber_ = p->lastSequenceNumber_;
00278     thisReader_->ptr_->rewind();
00279     thisReader_=p->thisReader_; 
00280     if (thisReader_!=allReaders_.end())
00281       thisReader_->ptr_->restoreState( p->pState_ ); 
00282     delete pState;
00283   }
00284 
00285   // PROTECTED MEMBER FUNCTIONS 
00286   // (visible to this class and derived classes only)
00287   protected:
00288 
00289   // Function Name: computeNumSequencesInFile
00290   // Arguments: void
00291   // Returns:   SequenceNumber
00292   // Returns the number of sequences in the file (will be done by lazy 
00293   // initialization, i.e. will only be calculated if asked for. NB this
00294   // will lose the current place in the file)
00295   virtual SequenceNumber computeNumSequencesInFile( void );
00296 
00297 
00298   // PRIVATE MEMBER FUNCTIONS
00299   // (visible to instances of this class only)
00300   
00301   private:
00302   SequenceReaderMulti( const SequenceReader&);             // NOT IMPLEMENTED
00303   SequenceReaderMulti& operator=(const SequenceReader&);   // NOT IMPLEMENTED
00304 
00305   // PROTECTED DATA:
00306   // (visible to instances of this class only)
00307   protected:
00308 
00309   vector<SeqReaderInfo> allReaders_;
00310   //  vector<SequenceNumber>             numSeqs_;
00311   vector<SeqReaderInfo>::iterator thisReader_;
00312   SequenceNumber currentSeqNum_;
00313 
00314   bool isFirstSeq_;
00315   int  bitsPerSymbol_;
00316   SourceDataType sourceDataType_;
00317 
00318   // PRIVATE MEMBER DATA
00319   private:
00320 
00321 }; // SequenceReaderMulti
00322 
00323 
00324 
00325 
00326 
00327 
00328 
00329 
00330 // ### Function Declarations ###
00331 
00332 // Name:
00333 // Arguments:
00334 // TYPE  NAME  IN/OUT COMMENT
00335 // Returns: TYPE COMMENT
00336 
00337 // End of include guard:
00338 #endif
00339 
00340 // End of file SequenceReaderMulti.h

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2