SequenceReader/SequenceReaderFasta.h

Go to the documentation of this file.
00001 /*  Last edited: Feb 21 18:19 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : SequenceReaderFasta
00026 // File Name    : SequenceReaderFasta.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_SequenceReaderFasta
00032 #define INCLUDED_SequenceReaderFasta
00033 
00034 // Description:
00035 
00036 // Includes:
00037 
00038 class WordSequence;
00039 //class SequenceEncoder;
00040 #include "SequenceReader.h"
00041 #include "SequenceEncoder.h"
00042 #include <string>
00043 
00044 // NB it is good practice for #include statements in header files to be
00045 // replaced by forward declarations if at all possible
00046 
00047 // ### Class Declarations ###
00048 
00049 
00050 // -----------------------------
00051 // Class Name : SequenceReaderFileState
00052 // Description: Saved state of a stream-based SequenceReader: the sequence
00053 // number passed on to SequenceReaderState plus a stream position
00054 class SequenceReaderFileState : public SequenceReaderState
00055 {
00056  public:
00057   SequenceReaderFileState( SequenceNumber lsn, std::streampos fp ) :
00058   SequenceReaderState(lsn), filePos_(fp) {} // base first: the real init order
00059   // Left public because it is const: set once above, read by restoreState
00060   const std::streampos filePos_;
00061 };
00062 
00063 
00064 
00065 // Class Name : SequenceReaderFile
00066 // Description: SequenceReader whose data comes from an istream (pInputFileStream_)
00067 class SequenceReaderFile : public SequenceReader
00068 {
00069 
00070   // PUBLIC MEMBER FUNCTIONS
00071   public:
00072 
00073   // Constructors and Destructors
00074 
00075   // Function Name: SequenceReaderFile (two overloads)
00076   // Arguments: const char* fileName or istream& inputStream (in),
00077   // char seqStartChar (in), char seqStopChar (in), SequenceEncoder* (in),
00078   // ostream& monitoringStream (in, defaults to cerr)
00079   SequenceReaderFile
00080   ( const char* fileName, 
00081     char seqStartChar,
00082     char seqStopChar,
00083     SequenceEncoder* pEncoder,
00084     ostream& monitoringStream = cerr );
00085 
00086   SequenceReaderFile
00087   ( istream& inputStream, 
00088     char seqStartChar,
00089     char seqStopChar,
00090     SequenceEncoder* pEncoder,
00091     ostream& monitoringStream = cerr );
00092 
00093   // Function Name: SequenceReaderFile (copy constructor)
00094   // Arguments: const SequenceReaderFile& (in)
00095   // Used by clone(). Copy assignment, by contrast, is declared private
00096   // and left unimplemented.
00097   SequenceReaderFile( const SequenceReaderFile& rhs); 
00098 
00099   // Function Name: ~SequenceReaderFile
00100   // Arguments: none
00101   // Defined out of line; what it releases cannot be seen from this header
00102   // Returns: n/a
00103   virtual ~SequenceReaderFile(); 
00104   // (NB destructor should be virtual if class is to be derived from)
00105 
00106   // Manipulator Functions
00107   virtual SequenceReader* clone( void ) 
00108   { return new SequenceReaderFile( *this ); } // new heap copy; not freed here
00109 
00110   // Function Name: changeMode
00111   // Arguments: SequenceReaderMode* (in)
00112   // Makes a copy of mode and uses it to handle mismatch character reads
00113   virtual void changeMode( SequenceReaderMode* pMode );
00114 
00115   // Function Name:
00116   // Arguments:
00117   // TYPE  NAME  IN/OUT COMMENT
00118   // Returns: TYPE COMMENT
00119 
00120   // Accessor Functions
00121   // (NB all accessor functions should be 'const')
00122 
00123   // Function Name: rewind
00124   // Arguments: void
00125   // Returns:   void
00126   // Rewind to the start of the data file, so that getNextSequence will
00127   // return the first sequence in the file
00128   void rewind( void );
00129 
00130   // Function Name: findSequence
00131   // Arguments: SequenceNumber (in)
00132   // Returns:   bool (NOTE(review): meaning not stated here - see the .cpp)
00133   // Winds the input file stream to the start of sequence number seqNum. 
00134   // Throws an exception if seqNum exceeds the number of sequences in
00135   // the file.
00136   virtual bool findSequence( SequenceNumber seqNum );
00137 
00138   // Function Name: getNextSequence
00139   // Arguments: WordSequence& (out), int (in)
00140   // Returns:   int (NOTE(review): meaning not stated here - see the .cpp)
00141   // Read the next set of sequence information from the file and parse it
00142   // into WordSequence format
00143   virtual int getNextSequence( WordSequence& nextSeq, int wordLength );
00144 
00145   // Function Name: getSequence
00146   // Arguments: WordSequence& (out), SequenceNumber (in), int (in)
00147   // Returns:   int
00148   // Read the sequenceNumber-th set of sequence information from the file and 
00149   // parse it into WordSequence format
00150   virtual int getSequence
00151   ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength );
00152 
00153   // Function Name: getLastSequenceName
00154   // Arguments: string& (out)
00155   // Returns:   void
00156   // Fills the string with the name of the last sequence read 
00157   virtual void getLastSequenceName( string& seqName ) const;
00158 
00159   // Function Name: getBitsPerSymbol
00160   // Arguments: none
00161   // Returns:   int
00162   // Returns number of bits per symbol used in encoding
00163   virtual int getBitsPerSymbol ( void ) const;
00164 
00165   // Function Name: getSourceDataType
00166   // Arguments: none
00167   // Returns:   SourceDataType
00168   // Returns type of data being encoded (protein or DNA)
00169   virtual SourceDataType getSourceDataType( void ) const;
00170 
00171   // Function Name: printName
00172   // Arguments: ostream& (out), SequenceNumber (in)
00173   // Returns:   bool (NOTE(review): meaning not stated here - see the .cpp)
00174   // Sends the name of the requested sequence to the output stream.
00175   virtual bool printName( ostream& os, SequenceNumber seqNum );
00176 
00177   // Function Name: printSideInfo
00178   // Arguments: ostream& (out), SequenceNumber (in)
00179   // Returns:   bool (NOTE(review): meaning not stated here - see the .cpp)
00180   // Sends the side info (e.g. clone name) for the requested sequence 
00181   // to the output stream.
00182   virtual bool printSideInfo( ostream& os, SequenceNumber seqNum );
00183 
00184   // Function Name: printSource
00185   // Arguments: ostream& (out), SequenceNumber (in)
00186   // Returns:   bool (NOTE(review): meaning not stated here - see the .cpp)
00187   // Send to the output stream the source data (in general, ASCII) from which 
00188   // the requested sequence was decoded.
00189   virtual bool printSource( ostream& os, SequenceNumber seqNum );
00190 
00191   // Function Name: extractSource
00192   // This extracts the source data for bases seqStart to seqEnd inclusive
00193   // of sequence seqNum and returns it through pSource
00194   virtual void extractSource( char** pSource,//vector<char>& source, 
00195                      SequenceNumber seqNum,
00196                      SequenceOffset seqStart,
00197                      SequenceOffset seqEnd );
00198   // Function Name: saveIndexImp - NOTE(review): undocumented; see the .cpp
00199   virtual void saveIndexImp
00200     ( ostream& fileFile, 
00201       ostream& indexFile, 
00202       int& fileNumber );
00203 
00204 
00205   // Function Name: saveState
00206   // Arguments: void
00207   // Returns:   SequenceReaderState*
00208   // saves the state (ie current file position) of a SequenceReader for future
00209   // restoration
00210   virtual SequenceReaderState* saveState( void ) const
00211   {
00212     return new SequenceReaderFileState
00213       ( lastSequenceNumber_, pInputFileStream_->tellg() );
00214   }
00215 
00216   // Function Name: restoreState
00217   // Arguments:   SequenceReaderState*
00218   // Returns:     void
00219   // restores the state (ie current file position) of a SequenceReader
00220   // then (NB!!) deletes *pState;
00221   virtual void restoreState( SequenceReaderState* pState )
00222   {
00223     SequenceReaderFileState* p
00224       (dynamic_cast<SequenceReaderFileState*>(pState));
00225     assert(p!=NULL); // NOTE(review): if compiled out (NDEBUG), a wrong state type is dereferenced below
00226     lastSequenceNumber_ = p->lastSequenceNumber_;
00227     pInputFileStream_->seekg( p->filePos_ ); // absolute position, as produced by tellg() in saveState
00228     delete pState;
00229   }
00230 
00231   // PROTECTED MEMBER FUNCTIONS 
00232   // (visible to this class and derived classes only)
00233   protected:
00234 
00235   // Function Name: computeNumSequencesInFile
00236   // Arguments: void
00237   // Returns:   SequenceNumber
00238   // Returns the number of sequences in the file (will be done by lazy 
00239   // initialization, i.e. will only be calculated if asked for. NB this
00240   // will lose the current place in the file)
00241   virtual SequenceNumber computeNumSequencesInFile( void );
00242 
00243 
00244   // PRIVATE MEMBER FUNCTIONS
00245   // (visible to instances of this class only)
00246   
00247   private:
00248   SequenceReaderFile& operator=(const SequenceReaderFile&);// NOT IMPLEMENTED
00249 
00250   // PROTECTED DATA:
00251   // (visible to this class and derived classes only)
00252   protected:
00253     enum Constants{ sideInfoBufferSize_ = 20000, inputBufferSize_ = 20000 };
00254     char inputBuffer_[ inputBufferSize_ ];
00255     //    char sideInfoBuffer_[ sideInfoBufferSize_ ];
00256   //    string inputBuffer_;
00257     string sideInfoBuffer_;
00258   char seqStartChar_;
00259   char seqStopChar_;
00260 
00261   istream* pInputFileStream_; // queried by saveState, repositioned by restoreState
00262 
00263   //  ifstream* pInputFileStream_;
00264   string fileName_;
00265   SequenceEncoder* pEncoder_;
00266 
00267   vector<std::streampos> seqPositions_; // NOTE(review): presumably one stream offset per sequence - confirm
00268 
00269   // PRIVATE MEMBER DATA
00270   private:
00271 
00272 }; // SequenceReaderFile
00273 
00274 class SequenceReaderFasta : public SequenceReaderFile
00275 {
00276   // SequenceReaderFile subclass that adds constructors and nothing else.
00277   // NOTE(review): the start/stop characters handed to SequenceReaderFile
00278   // are chosen in the .cpp, which is not visible from this header.
00279   public:
00280 
00281   // Function Name: SequenceReaderFasta
00282   // Arguments:
00283   // const char*      fileName          IN
00284   // SequenceEncoder* pEncoder          IN
00285   // ostream&         monitoringStream  IN  defaults to cerr
00286   // Returns: n/a (constructor)
00287   SequenceReaderFasta( const char* fileName,
00288                        SequenceEncoder* pEncoder,
00289                        ostream& monitoringStream = cerr );
00290 
00291   // Overload without an encoder (NOTE(review): presumably a default one is used - confirm)
00292   SequenceReaderFasta( const char* fileName,
00293                        ostream& monitoringStream = cerr );
00294 
00295 }; // SequenceReaderFasta
00296 
00297 typedef SequenceReaderFasta SequenceReaderFastaDNA; // alias: the same class under a DNA-specific name
00298 
00299 class SequenceReaderFastaProtein : public SequenceReaderFasta
00300 {
00301   // SequenceReaderFasta subclass; judging by its name, meant for protein
00302   // data. Declares a single constructor and nothing else.
00303   public:
00304 
00305   // Function Name: SequenceReaderFastaProtein
00306   // Arguments:
00307   // const char* fileName          IN
00308   // ostream&    monitoringStream  IN  defaults to cerr
00309   // Returns: n/a (constructor)
00310   // NOTE(review): the encoder used is decided in the .cpp - confirm there
00311   SequenceReaderFastaProtein( const char* fileName,
00312                               ostream& monitoringStream = cerr );
00313 
00314 }; // SequenceReaderFastaProtein
00315 
00316 
00317 
00318 
00319 
00320 // ### Function Declarations ###
00321 
00322 // Name:
00323 // Arguments:
00324 // TYPE  NAME  IN/OUT COMMENT
00325 // Returns: TYPE COMMENT
00326 
00327 // End of include guard:
00328 #endif
00329 
00330 // End of file SequenceReaderFasta.h
00331 
00332 
00333 
00334 
00335 
00336 
00337 
00338 
00339 
00340 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2