00001 /* Last edited: Feb 21 18:19 2002 (ac2) */ 00002 00003 // ####################################################################### 00004 00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm 00006 // Version 3.2, released 1st March 2004 00007 // Copyright (c) Genome Research 2002 00008 00009 // SSAHA is free software; you can redistribute it and/or modify 00010 // it under the terms of version 2 of the GNU General Public Licence 00011 // as published by the Free Software Foundation. 00012 00013 // This program is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 // GNU General Public Licence for more details. 00017 00018 // You should have received a copy of the GNU General Public Licence 00019 // along with this program; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt 00022 00023 // ####################################################################### 00024 00025 // Module Name : SequenceReaderFasta 00026 // File Name : SequenceReaderFasta.h 00027 // Language : C++ 00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk) 00029 00030 // Include guard: 00031 #ifndef INCLUDED_SequenceReaderFasta 00032 #define INCLUDED_SequenceReaderFasta 00033 00034 // Description: 00035 00036 // Includes: 00037 00038 class WordSequence; 00039 //class SequenceEncoder; 00040 #include "SequenceReader.h" 00041 #include "SequenceEncoder.h" 00042 #include <string> 00043 00044 // NB it is good practise for #include statements in header files to be 00045 // replaced by forward declarations if at all possible 00046 00047 // ### Class Declarations ### 00048 00049 00050 // ----------------------------- 00051 // Class Name : SequenceReaderFileState 00052 // Description: This preserves the state (ie position in a file) of 00053 // a SequenceReader 00054 class SequenceReaderFileState : public SequenceReaderState 00055 { 00056 public: 00057 SequenceReaderFileState( SequenceNumber lsn, std::streampos fp ) : 00058 filePos_(fp), SequenceReaderState(lsn) {} 00059 // no point in making this private as it's const 00060 const std::streampos filePos_; 00061 }; 00062 00063 00064 00065 // Class Name : 00066 // Description: 00067 class SequenceReaderFile : public SequenceReader 00068 { 00069 00070 // PUBLIC MEMBER FUNCTIONS 00071 public: 00072 00073 // Constructors and Destructors 00074 00075 // Function Name: 00076 // Arguments: 00077 // TYPE NAME IN/OUT COMMENT 00078 // Returns: TYPE COMMENT 00079 SequenceReaderFile 00080 ( const char* fileName, 00081 char seqStartChar, 00082 char seqStopChar, 00083 SequenceEncoder* pEncoder, 00084 ostream& monitoringStream = cerr ); 00085 00086 SequenceReaderFile 00087 ( istream& inputStream, 00088 char seqStartChar, 00089 char seqStopChar, 00090 SequenceEncoder* pEncoder, 00091 ostream& monitoringStream = cerr ); 00092 00093 // Function Name: 00094 // Arguments: 00095 // TYPE NAME IN/OUT COMMENT 00096 // Returns: TYPE COMMENT 00097 SequenceReaderFile( const SequenceReaderFile& rhs); 00098 00099 // Function Name: 00100 // Arguments: 00101 // TYPE NAME IN/OUT COMMENT 00102 // Returns: TYPE COMMENT 00103 virtual ~SequenceReaderFile(); 00104 // (NB destructor should be virtual if class is to be derived from) 00105 00106 // Manipulator Functions 00107 virtual SequenceReader* clone( void ) 00108 { return new SequenceReaderFile( *this ); } 00109 00110 // Function Name: changeMode 00111 // Arguments: const SequenceReaderMode& 00112 // Makes a copy of mode and uses it to handle mismatch character reads 00113 virtual void changeMode( SequenceReaderMode* pMode ); 00114 00115 // Function Name: 00116 // Arguments: 00117 // TYPE NAME IN/OUT COMMENT 00118 // Returns: TYPE COMMENT 00119 00120 // Accessor Functions 00121 // (NB all accessor functions should be 'const') 00122 00123 // Function Name: rewind 00124 // Arguments: void 00125 // Returns: void 00126 // Rewind to the start of the data file, so that getNextSequence will 00127 // return the first sequence in the file 00128 void rewind( void ); 00129 00130 // Function Name: findSequence 00131 // Arguments: SequenceNumber (in) 00132 // Returns: void 00133 // Winds the input file stream to the start of sequence number seqNum. 00134 // Throws an exception if seqNum exceeds the number of sequences in 00135 // the file. 00136 virtual bool findSequence( SequenceNumber seqNum ); 00137 00138 // Function Name: getNextSequence 00139 // Arguments: WordSequence& (out), int (in) 00140 // Returns: bool 00141 // Read the next set of sequence information from the file and parse it 00142 // into WordSequence format 00143 virtual int getNextSequence( WordSequence& nextSeq, int wordLength ); 00144 00145 // Function Name: getSequence 00146 // Arguments: WordSequence& (out), SequenceNumber (in), int (in) 00147 // Returns: int 00148 // Read the sequenceNumber-th set of sequence information from the file and 00149 // parse it into WordSequence format 00150 virtual int getSequence 00151 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength ); 00152 00153 // Function Name: getLastSequenceName 00154 // Arguments: string& (out) 00155 // Returns: void 00156 // Fills the string with the name of the last sequence read 00157 virtual void getLastSequenceName( string& seqName ) const; 00158 00159 // Function Name: getBitsPerSymbol 00160 // Arguments: none 00161 // Returns: int 00162 // Returns number of bits per symbol used in encoding 00163 virtual int getBitsPerSymbol ( void ) const; 00164 00165 // Function Name: getSourceDataType 00166 // Arguments: none 00167 // Returns: SourceDataType 00168 // Returns type of data being encoded (protein or DNA) 00169 virtual SourceDataType getSourceDataType( void ) const; 00170 00171 // Function Name: printName 00172 // Arguments: ostream& (out), SequenceNumber (in) 00173 // Returns: void 00174 // Sends the name of the requested sequence to the output stream. 00175 virtual bool printName( ostream& os, SequenceNumber seqNum ); 00176 00177 // Function Name: printSideInfo 00178 // Arguments: ostream& (out), SequenceNumber (in) 00179 // Returns: void 00180 // Sends the side info (e.g. clone name) for the requested sequence 00181 // to the output stream. 00182 virtual bool printSideInfo( ostream& os, SequenceNumber seqNum ); 00183 00184 // Function Name: printSource 00185 // Arguments: ostream& (out), SequenceNumber (in) 00186 // Returns: void 00187 // Send to the output stream the source data (in general, ASCII) from which 00188 // the requested sequence was decoded. 00189 virtual bool printSource( ostream& os, SequenceNumber seqNum ); 00190 00191 // Function Name: extractSource 00192 // This extracts the source data for bases seqStart to seqEnd inclusive 00193 // of sequence seqNum and places it in source 00194 virtual void extractSource( char** pSource,//vector<char>& source, 00195 SequenceNumber seqNum, 00196 SequenceOffset seqStart, 00197 SequenceOffset seqEnd ); 00198 00199 virtual void saveIndexImp 00200 ( ostream& fileFile, 00201 ostream& indexFile, 00202 int& fileNumber ); 00203 00204 00205 // Function Name: saveState 00206 // Arguments: void 00207 // Returns: SequenceReaderState* 00208 // saves the state (ie current file position) of a SequenceReader for future 00209 // restoration 00210 virtual SequenceReaderState* saveState( void ) const 00211 { 00212 return new SequenceReaderFileState 00213 ( lastSequenceNumber_, pInputFileStream_->tellg() ); 00214 } 00215 00216 // Function Name: restoreState 00217 // Arguments: SequenceReaderState* 00218 // Returns: void 00219 // restores the state (ie current file position) of a SequenceReader 00220 // then (NB!!) deletes *pState; 00221 virtual void restoreState( SequenceReaderState* pState ) 00222 { 00223 SequenceReaderFileState* p 00224 (dynamic_cast<SequenceReaderFileState*>(pState)); 00225 assert(p!=NULL); 00226 lastSequenceNumber_ = p->lastSequenceNumber_; 00227 pInputFileStream_->seekg( p->filePos_, ios::beg ); 00228 delete pState; 00229 } 00230 00231 // PROTECTED MEMBER FUNCTIONS 00232 // (visible to this class and derived classes only) 00233 protected: 00234 00235 // Function Name: computeNumSequencesInFile 00236 // Arguments: void 00237 // Returns: SequenceNumber 00238 // Returns the number of sequences in the file (will be done by lazy 00239 // initialization, i.e. will only be calculated if asked for. NB this 00240 // will lose the current place in the file) 00241 virtual SequenceNumber computeNumSequencesInFile( void ); 00242 00243 00244 // PRIVATE MEMBER FUNCTIONS 00245 // (visible to instances of this class only) 00246 00247 private: 00248 SequenceReaderFile& operator=(const SequenceReaderFile&);// NOT IMPLEMENTED 00249 00250 // PROTECTED DATA: 00251 // (visible to this class and derived classes only) 00252 protected: 00253 enum Constants{ sideInfoBufferSize_ = 20000, inputBufferSize_ = 20000 }; 00254 char inputBuffer_[ inputBufferSize_ ]; 00255 // char sideInfoBuffer_[ sideInfoBufferSize_ ]; 00256 // string inputBuffer_; 00257 string sideInfoBuffer_; 00258 char seqStartChar_; 00259 char seqStopChar_; 00260 00261 istream* pInputFileStream_; 00262 00263 // ifstream* pInputFileStream_; 00264 string fileName_; 00265 SequenceEncoder* pEncoder_; 00266 00267 vector<std::streampos> seqPositions_; 00268 00269 // PRIVATE MEMBER DATA 00270 private: 00271 00272 }; // SequenceReaderFile 00273 00274 class SequenceReaderFasta : public SequenceReaderFile 00275 { 00276 00277 // PUBLIC MEMBER FUNCTIONS 00278 public: 00279 00280 // Constructors and Destructors 00281 00282 00283 // Function Name: 00284 // Arguments: 00285 // TYPE NAME IN/OUT COMMENT 00286 // Returns: TYPE COMMENT 00287 SequenceReaderFasta 00288 ( const char* fileName, 00289 SequenceEncoder* pEncoder, 00290 ostream& monitoringStream = cerr ); 00291 SequenceReaderFasta 00292 ( const char* fileName, 00293 ostream& monitoringStream = cerr ); 00294 00295 }; // SequenceReaderFasta 00296 00297 typedef SequenceReaderFasta SequenceReaderFastaDNA; 00298 00299 class SequenceReaderFastaProtein : public SequenceReaderFasta 00300 { 00301 00302 // PUBLIC MEMBER FUNCTIONS 00303 public: 00304 00305 // Constructors and Destructors 00306 00307 // Function Name: 00308 // Arguments: 00309 // TYPE NAME IN/OUT COMMENT 00310 // Returns: TYPE COMMENT 00311 SequenceReaderFastaProtein( const char* fileName, 00312 ostream& monitoringStream = cerr ); 00313 00314 }; // SequenceReaderFastaProtein 00315 00316 00317 00318 00319 00320 // ### Function Declarations ### 00321 00322 // Name: 00323 // Arguments: 00324 // TYPE NAME IN/OUT COMMENT 00325 // Returns: TYPE COMMENT 00326 00327 // End of include guard: 00328 #endif 00329 00330 // End of file SequenceReaderFile.h 00331 00332 00333 00334 00335 00336 00337 00338 00339 00340
1.5.2