/*  Last edited: Feb 21 18:18 2002 (ac2) */

// #######################################################################

// SSAHA : Sequence Search and Alignment by Hashing Algorithm
// Version 3.2, released 1st March 2004
// Copyright (c) Genome Research 2002

// SSAHA is free software; you can redistribute it and/or modify
// it under the terms of version 2 of the GNU General Public Licence
// as published by the Free Software Foundation.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public Licence for more details.

// You should have received a copy of the GNU General Public Licence
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
// or see the on-line version at http://www.gnu.org/copyleft/gpl.txt

// #######################################################################

// Module Name : SequenceReaderMulti
// File Name : SequenceReaderMulti.h
// Language : C++
// Module Author: Anthony J.
Cox (ac2@sanger.ac.uk) 00029 00030 // Include guard: 00031 #ifndef INCLUDED_SequenceReaderMulti 00032 #define INCLUDED_SequenceReaderMulti 00033 00034 // Description: 00035 00036 // #pragma interface 00037 // Includes: 00038 #include "SequenceReader.h" 00039 #include <memory> 00040 #include <algorithm> 00041 #include <vector> 00042 00043 // NB it is good practise for #include statements in header files to be 00044 // replaced by forward declarations if at all possible 00045 00046 // ### Class Declarations ### 00047 00048 struct SeqReaderInfo 00049 { 00050 SequenceReader* ptr_; 00051 int size_; 00052 bool allSeqsRead_; 00053 SeqReaderInfo( SequenceReader* ptr ) : 00054 ptr_( ptr ), size_( 0 ), allSeqsRead_( false ) 00055 { 00056 } 00057 SeqReaderInfo( void ) : 00058 ptr_( NULL ), size_( 0 ), allSeqsRead_( false ) 00059 { 00060 } 00061 SeqReaderInfo( const SeqReaderInfo& rhs ) : 00062 // ptr_( rhs.ptr_->clone() ), size_( rhs.size_ ), allSeqsRead_( rhs.allSeqsRead_ ) {} 00063 ptr_( rhs.ptr_ ), size_( rhs.size_ ), allSeqsRead_( rhs.allSeqsRead_ ) 00064 { 00065 } 00066 ~SeqReaderInfo() 00067 { 00068 } 00069 00070 }; 00071 00072 00073 // ----------------------------- 00074 // Class Name : SequenceReaderMultiState 00075 // Description: This preserves the state (ie position in a file) of 00076 // a SequenceReaderMulti 00077 class SequenceReaderMultiState : public SequenceReaderState 00078 { 00079 public: 00080 SequenceReaderMultiState 00081 ( SequenceNumber lsn, 00082 vector<SeqReaderInfo>::iterator tr, 00083 SequenceReaderState* ps ) : 00084 thisReader_(tr), 00085 pState_(ps), SequenceReaderState(lsn) {} 00086 virtual ~SequenceReaderMultiState() {} //delete pState_; 00087 // no point in making this private as it's const 00088 const vector<SeqReaderInfo>::iterator thisReader_; 00089 // this is state info for *thisReader, whatever it is 00090 SequenceReaderState* pState_; 00091 }; 00092 00093 00094 // Class Name : SequenceReaderMulti 00095 // Description: The purpose of 
this class is to enable multiple 00096 // SequenceReaders to 'act as one.' This enables for example a directory 00097 // of fasta files to be processed in the same way as a single one. 00098 class SequenceReaderMulti : public SequenceReader 00099 { 00100 00101 00102 // PUBLIC MEMBER FUNCTIONS 00103 public: 00104 00105 // Constructors and Destructors 00106 00107 // Function Name: 00108 // Arguments: 00109 // TYPE NAME IN/OUT COMMENT 00110 // Returns: TYPE COMMENT 00111 SequenceReaderMulti( ostream& monitoringStream = cerr ); 00112 00113 SequenceReaderMulti( const SequenceReaderMulti& rhs); 00114 00115 00116 // Function Name: 00117 // Arguments: 00118 // TYPE NAME IN/OUT COMMENT 00119 // Returns: TYPE COMMENT 00120 virtual ~SequenceReaderMulti(); 00121 // (NB destructor should be virtual if class is to be derived from) 00122 00123 // Manipulator Functions 00124 virtual SequenceReader* clone( void ) 00125 { return new SequenceReaderMulti( *this ); } 00126 00127 00128 // Function Name: addReader 00129 // Arguments: SequenceReader& 00130 // A copy of seq is made (which is why SequenceReader and its subclasses 00131 // need to have a copy constructor) and a pointer to it is placed in 00132 // allReaders. 00133 void addReader( SequenceReader& seq ); 00134 00135 // Function Name: addReader 00136 // Arguments: SequenceReader* 00137 // pSeq is placed in allReaders_. NB no copy of pSeq is made: use by 00138 // creating a new SequenceReader and calling this. 
The SeqReaderMulti 00139 // takes ownership of *pSeq and is responsible for its destruction 00140 void addReader( SequenceReader* pSeq ); 00141 00142 00143 // Function Name: changeMode 00144 // Arguments: const SequenceReaderMode& 00145 // Makes a copy of mode and uses it to handle mismatch character reads 00146 virtual void changeMode( SequenceReaderMode* pMode ); 00147 00148 00149 // Accessor Functions 00150 // (NB all accessor functions should be 'const') 00151 00152 int getNumReaders( void ) 00153 { 00154 return allReaders_.size(); 00155 } 00156 00157 // Function Name: findReader 00158 // Arguments: SequenceNumber& (in/out) 00159 // Given an input sequence number, adjusts seqNum and thisReader_ 00160 // so that the seqNum'th sequence of *this is obtained by passing 00161 // the adjusted value of seqNum to *thisReader_ 00162 // Returns true if the find was successful, else returns false 00163 // (in which case thisReader_ and seqNum are not adjusted) 00164 bool findReader( SequenceNumber& seqNum ); 00165 00166 // Function Name: rewind 00167 // Arguments: void 00168 // Returns: void 00169 // Rewind to the start of the data file, so that getNextSequence will 00170 // return the first sequence in the file 00171 virtual void rewind( void ); 00172 00173 // Function Name: findSequence 00174 // Arguments: SequenceNumber (in) 00175 // Returns: void 00176 // Winds the input file stream to the start of sequence number seqNum. 00177 // Returns false if seqNum exceeds the number of sequences in 00178 // the file. 00179 virtual bool findSequence( SequenceNumber seqNum ); 00180 00181 // Function Name: getNextSequence 00182 // Arguments: WordSequence& (out), int (in) 00183 // Returns: int 00184 // Read the next set of sequence information from the file and parse it 00185 // into WordSequence format. 
Returns -1 if there has been a problem with 00186 // reading the sequence, else returns the number of valid base pairs 00187 // contained within the final word of the sequence 00188 virtual int getNextSequence( WordSequence& nextSeq, int wordLength ); 00189 00190 // Function Name: getSequence 00191 // Arguments: WordSequence& (out), SequenceNumber (in), int (in) 00192 // Returns: bool 00193 // Read the sequenceNumber-th set of sequence information from the file and 00194 // parse it into WordSequence format 00195 virtual int getSequence 00196 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength ); 00197 00198 // Function Name: getLastSequenceName 00199 // Arguments: string& (out) 00200 // Returns: void 00201 // Fills the string with the name of the last sequence read 00202 virtual void getLastSequenceName( string& seqName ) const; 00203 00204 // Function Name: getBitsPerSymbol 00205 // Arguments: none 00206 // Returns: int 00207 // Returns number of bits per symbol used in encoding 00208 virtual int getBitsPerSymbol ( void ) const; 00209 00210 // Function Name: getSourceDataType 00211 // Arguments: none 00212 // Returns: SourceDataType 00213 // Returns type of data being encoded (protein or DNA) 00214 virtual SourceDataType getSourceDataType( void ) const; 00215 00216 // Function Name: printName 00217 // Arguments: string& (out), int (in) 00218 // Returns: void 00219 // Fills a string with the name of the requested sequence 00220 virtual bool printName( ostream& os, SequenceNumber seqNum ); 00221 00222 // Function Name: printSideInfo 00223 // Arguments: string& (out), SequenceNumber (in) 00224 // Returns: void 00225 // Fills a string with the name of the requested sequence 00226 virtual bool printSideInfo( ostream& os, SequenceNumber seqNum ); 00227 00228 // Function Name: printSource 00229 // Arguments: string& (out), SequenceNumber (in) 00230 // Returns: void 00231 // Fills a string with the name of the requested sequence 00232 virtual bool 
printSource( ostream& os, SequenceNumber seqNum ); 00233 00234 // Function Name: extractSource 00235 // This extracts the source data for bases seqStart to seqEnd inclusive 00236 // of sequence seqNum and places it in source 00237 virtual void extractSource( char** pSource,//vector<char>& source, 00238 SequenceNumber seqNum, 00239 SequenceOffset seqStart, 00240 SequenceOffset seqEnd ); 00241 00242 // Function Name: saveIndexImp 00243 // Actually save the indexing data to disk. Implemented 00244 // for SequenceReaderFile and SequenceReaderMulti, not 00245 // for SourceReaderIndex 00246 virtual void saveIndexImp 00247 ( ostream& fileFile, 00248 ostream& indexFile, 00249 int& fileNumber ); 00250 00251 00252 // Function Name: saveState 00253 // Arguments: void 00254 // Returns: SequenceReaderState* 00255 // saves the state (ie current file position) of a SequenceReader for future 00256 // restoration 00257 virtual SequenceReaderState* saveState( void ) const 00258 { 00259 return new SequenceReaderMultiState 00260 ( lastSequenceNumber_, 00261 thisReader_, 00262 (thisReader_==allReaders_.end()) 00263 ? NULL 00264 : thisReader_->ptr_->saveState() ); 00265 } 00266 00267 // Function Name: restoreState 00268 // Arguments: SequenceReaderState* 00269 // Returns: void 00270 // restores the state (ie current file position) of a SequenceReader 00271 // then (NB!!) 
deletes *pState; 00272 virtual void restoreState( SequenceReaderState* pState ) 00273 { 00274 SequenceReaderMultiState* p 00275 (dynamic_cast<SequenceReaderMultiState*>(pState)); 00276 assert(p!=NULL); 00277 lastSequenceNumber_ = p->lastSequenceNumber_; 00278 thisReader_->ptr_->rewind(); 00279 thisReader_=p->thisReader_; 00280 if (thisReader_!=allReaders_.end()) 00281 thisReader_->ptr_->restoreState( p->pState_ ); 00282 delete pState; 00283 } 00284 00285 // PROTECTED MEMBER FUNCTIONS 00286 // (visible to this class and derived classes only) 00287 protected: 00288 00289 // Function Name: computeNumSequencesInFile 00290 // Arguments: void 00291 // Returns: SequenceNumber 00292 // Returns the number of sequences in the file (will be done by lazy 00293 // initialization, i.e. will only be calculated if asked for. NB this 00294 // will lose the current place in the file) 00295 virtual SequenceNumber computeNumSequencesInFile( void ); 00296 00297 00298 // PRIVATE MEMBER FUNCTIONS 00299 // (visible to instances of this class only) 00300 00301 private: 00302 SequenceReaderMulti( const SequenceReader&); // NOT IMPLEMENTED 00303 SequenceReaderMulti& operator=(const SequenceReader&); // NOT IMPLEMENTED 00304 00305 // PROTECTED DATA: 00306 // (visible to instances of this class only) 00307 protected: 00308 00309 vector<SeqReaderInfo> allReaders_; 00310 // vector<SequenceNumber> numSeqs_; 00311 vector<SeqReaderInfo>::iterator thisReader_; 00312 SequenceNumber currentSeqNum_; 00313 00314 bool isFirstSeq_; 00315 int bitsPerSymbol_; 00316 SourceDataType sourceDataType_; 00317 00318 // PRIVATE MEMBER DATA 00319 private: 00320 00321 }; // SequenceReaderMulti 00322 00323 00324 00325 00326 00327 00328 00329 00330 // ### Function Declarations ### 00331 00332 // Name: 00333 // Arguments: 00334 // TYPE NAME IN/OUT COMMENT 00335 // Returns: TYPE COMMENT 00336 00337 // End of include guard: 00338 #endif 00339 00340 // End of file SequenceReaderMulti.h
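// 1.5.2  (stray version string left over from the listing extraction -- presumably the documentation generator's footer; not part of the header)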