SequenceReader/SequenceReaderFilter.h

Go to the documentation of this file.
00001 /*  Last edited: Apr 18 17:49 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : SequenceReaderFilter
00026 // File Name    : SequenceReaderFilter.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_SequenceReaderFilter
00032 #define INCLUDED_SequenceReaderFilter
00033 
00034 // Description:
00035 
00036 // Includes:
00037 
00038 #include "SequenceReader.h"
00039 #include "GlobalDefinitions.h"
00040 //#include <set>
00041 //#include <hash_set>
00042 // NB it is good practice for #include statements in header files to be
00043 // replaced by forward declarations if at all possible
00044 
00045 // ### Class Declarations ###
00046 
// Struct Name : HashFunctor
// Description: Function object that hashes a NUL-terminated C string.
// Starting from zero, each character c in turn updates the running value
// to 5*value + c; the final value is returned as a size_t.
// The pointer passed in must be non-null and NUL-terminated (it is
// dereferenced unconditionally).
struct HashFunctor
{
  size_t operator()(const char* s) const
  {
    unsigned long accumulated = 0;
    // Walk the string up to (not including) the terminating NUL.
    while (*s != '\0')
    {
      accumulated = accumulated * 5 + *s;
      ++s;
    }
    return static_cast<size_t>(accumulated);
  } // ~operator()
}; // ~struct HashFunctor
00057 
00058 
00059 class StringHash : public vector<string>
00060 {
00061 public:
00062   HashFunctor H_;
00063 
00064   void makeBins( int numBins );
00065 
00066   size_t makeNumBins( void ) const;
00067 
00068   void makeBins( void )
00069   {
00070     makeBins(makeNumBins());
00071   }
00072 
00073   bool isPresent( const string& s )
00074   {
00075     vector<string*>* myBin = &bins_[ H_(s.c_str()) % numBins_ ];
00076     for (vector<string*>::iterator i(myBin->begin()); i!= myBin->end();++i)
00077       if (**i == s ) return true;
00078     return false;
00079   }
00080 
00081   size_t max( void ) const 
00082   { 
00083     int max(0);
00084     for (vector<vector<string*> >::const_iterator i(bins_.begin()); i!= bins_.end(); i++ )
00085       if (i->size()>max) max=i->size();
00086     return max;
00087   } 
00088 
00089   double loadFactor( void ) const { return size()/numBins_; }
00090 
00091   double effic( void ) const { return max()/loadFactor(); }
00092 
00093   size_t numBins( void ) const { return numBins_; }
00094 
00095 private:
00096 
00097   size_t numBins_;
00098   vector<vector<string*> > bins_;
00099 
00100 }; // ~class StringHash
00101 
00102 
00103 // -----------------------------
00104 // Class Name : SequenceReaderFilterState
00105 // Description: This preserves the state (ie position in a file) of
00106 // a SequenceReaderFilter
00107 class SequenceReaderFilterState : public SequenceReaderState
00108 {
00109  public:
00110   SequenceReaderFilterState
00111     ( SequenceNumber lsn, SequenceReader* ps ) :
00112     pState_(ps->saveState()), SequenceReaderState(lsn) {}
00113   virtual ~SequenceReaderFilterState() {} //delete pState_;
00114   // no point in making this private as it's const
00115   // this is state info for *ps, whatever it is
00116   SequenceReaderState* pState_;
00117 };
00118 
00119 
00120 
00121 
00122 // Class Name : SequenceReaderFilter
00123 // Description: Takes a pointer to another SequenceReader plus a list of
00124 // sequence names to exclude
00125  
00126 
00127 class SequenceReaderFilter : public SequenceReader
00128 {
00129 
00130 
00131   // PUBLIC MEMBER FUNCTIONS
00132   public:
00133   // Constructors and Destructors
00134 
00135   // Function Name: Constructor
00136   // Arguments: SequenceReader*, int, ostream&
00137   // NB SequenceReaderFilter takes ownership of pSeq, ie
00138   // it deletes it when it itself is destructed
00139   // SequenceReaderWhatever s( ... );
00140   // SequenceReader* p = new SequenceReaderWhatever( ... );
00141   // SequenceReaderFilter f( p, "name" ); // OK
00142   // SequenceReaderFilter f( s->clone(), "name" ); // OK - takes a copy
00143   // SequenceReaderFilter f( &s, "name" ); // don't do this ... bad
00144 
00145   SequenceReaderFilter
00146   ( SequenceReader* pSeq, 
00147     ifstream* pFilterSource,
00148     ostream& monitoringStream = cerr );
00149 
00150 
00151   SequenceReaderFilter
00152   ( SequenceReader* pSeq, 
00153     const char* filterFileName, 
00154     ostream& monitoringStream = cerr );
00155 
00156 
00157   // Function Name: Copy constructor
00158   // Arguments:
00159   SequenceReaderFilter( const SequenceReaderFilter& rhs );
00160 
00161 
00162   // Function Name: Destructor
00163   // Arguments:
00164   ~SequenceReaderFilter(); 
00165   // (NB destructor should be virtual if class is to be derived from)
00166 
00167   // Manipulator Functions
00168   virtual SequenceReader* clone( void ) 
00169   { 
00170     return new SequenceReaderFilter( *this ); 
00171   }
00172 
00173   // Function Name:
00174   // Arguments:
00175   // TYPE  NAME  IN/OUT COMMENT
00176   // Returns: TYPE COMMENT
00177 
00178   // Function Name: readFilterNames
00179   // Arguments: ifstream*
00180   // Returns: void
00181   // Reads in a list of names to be filtered from an ifstream*
00182   // into set<string>*. Once done, deletes the ifstream
00183   void readFilterNames( ifstream* pFilterSource );
00184 
00185 
00186   // Function Name: changeMode
00187   // Arguments: const SequenceReaderMode&
00188   // Makes a copy of mode and uses it to handle mismatch character reads
00189   // Does nothing here because any character reading necessary will have 
00190   // been done by the SequenceReader this class is constructed from 
00191   virtual void changeMode( SequenceReaderMode* pMode ) 
00192   {
00193     pSeq_->changeMode( pMode );
00194   }
00195 
00196   // Accessor Functions
00197   // (NB all accessor functions should be 'const')
00198 
00199   // Function Name: rewind
00200   // Arguments: void
00201   // Returns:   void
00202   // Rewind to the start of the data file, so that getNextSequence will
00203   // return the first sequence in the file
00204   virtual void rewind( void ); 
00205 
00206   // Function Name: getNextSequence
00207   // Arguments: WordSequence& (out), int (in)
00208   // Returns:   int
00209   // Read the next set of sequence information from the file and parse it
00210   // into WordSequence format. Returns -1 if there has been a problem with
00211   // reading the sequence, else returns the number of valid base pairs 
00212   // contained within the final word of the sequence.
00213   virtual int getNextSequence( WordSequence& nextSeq, int wordLength );
00214 
00215   // Function Name: getSequence
00216   // Arguments: WordSequence& (out), SequenceNumber (in), int (in)
00217   // Returns:   bool
00218   // Read the sequenceNumber-th set of sequence information from the file and 
00219   // parse it into WordSequence format
00220   virtual int getSequence
00221   ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength );
00222 
00223   // Function Name: getLastSequenceName
00224   // Arguments: string& (out)
00225   // Returns:   void
00226   // Fills the string with the name of the last sequence read 
00227   virtual void getLastSequenceName( string& seqName ) const;
00228 
00229   // Function Name: getBitsPerSymbol
00230   // Arguments: none
00231   // Returns:   int
00232   // Returns number of bits per symbol used in encoding
00233   virtual int getBitsPerSymbol ( void ) const 
00234   {
00235     return pSeq_->getBitsPerSymbol();
00236   }
00237 
00238   // Function Name: getSourceDataType
00239   // Arguments: none
00240   // Returns:   SourceDataType
00241   // Returns type of data being encoded (protein or DNA)
00242   virtual SourceDataType getSourceDataType( void ) const
00243   {
00244     return pSeq_->getSourceDataType();
00245   }
00246 
00247   // Function Name: getNumFiltered
00248   // Arguments: none
00249   // Returns:   int
00250   // Returns number of sequences excluded so far from pSeq_
00251   // if (areAllSequencesRead()==true) then this is the final number
00252   int getNumFiltered( void ) const
00253   {
00254     return numFiltered_;
00255   }
00256 
00257   // Function Name: printName
00258   // Arguments: string& (out), SequenceNumber (in)
00259   // Returns:   void
00260   // Fills a string with the name of the requested sequence
00261   virtual bool printName( ostream& os, SequenceNumber seqNum );
00262 
00263   // Function Name: printSideInfo
00264   // Arguments: string& (out), SequenceNumber (in)
00265   // Returns:   void
00266   // Fills a string with the name of the requested sequence
00267   virtual bool printSideInfo( ostream& os, SequenceNumber seqNum );
00268 
00269   // Function Name: printSource
00270   // Arguments: string& (out), SequenceNumber (in)
00271   // Returns:   void
00272   // Fills a string with the name of the requested sequence
00273   virtual bool printSource( ostream& os, SequenceNumber seqNum );
00274 
00275   virtual bool findSequence( SequenceNumber seqNum );
00276 
00277 
00278   // Function Name: extractSource
00279   // This extracts the source data for bases seqStart to seqEnd inclusive
00280   // of sequence seqNum and places it in source
00281   virtual void extractSource( char** pSource,//vector<char>& source, 
00282                      SequenceNumber seqNum,
00283                      SequenceOffset seqStart,
00284                      SequenceOffset seqEnd );
00285 
00286   // Function Name: saveIndexImp
00287   // This makes and saves the index for *this, by kinda monkeying around
00288   // with the index for *pSeq_ 
00289   virtual void saveIndexImp
00290     ( ostream& fileFile, 
00291       ostream& indexFile, 
00292       int& fileNumber );
00293 
00294   // Function Name: saveState
00295   // Arguments: void
00296   // Returns:   SequenceReaderState*
00297   // saves the state (ie current file position) of a SequenceReader for future
00298   // restoration
00299   virtual SequenceReaderState* saveState( void ) const
00300   {
00301     return new SequenceReaderFilterState( lastSequenceNumber_, pSeq_ );
00302   }
00303 
00304   // Function Name: restoreState
00305   // Arguments:   SequenceReaderState*
00306   // Returns:     void
00307   // restores the state (ie current file position) of a SequenceReader
00308   // then (NB!!) deletes *pState;
00309   virtual void restoreState( SequenceReaderState* pState ) 
00310   {
00311     SequenceReaderFilterState* p
00312       (dynamic_cast<SequenceReaderFilterState*>(pState));
00313     assert(p!=NULL);
00314     lastSequenceNumber_ = p->lastSequenceNumber_;
00315     pSeq_->restoreState( p->pState_ );
00316     delete pState;
00317   }
00318 
00319   // PROTECTED MEMBER FUNCTIONS 
00320   // (visible to this class and derived classes only)
00321   protected:
00322 
00323   // Function Name: computeNumSequencesInFile
00324   // Arguments: void
00325   // Returns:   SequenceNumber
00326   // Returns the number of sequences in the file - called by 
00327   // getNumSequencesInFile. 
00328   virtual SequenceNumber computeNumSequencesInFile( void );
00329 
00330   // PRIVATE MEMBER FUNCTIONS
00331   // (visible to instances of this class only)
00332   
00333   private:
00334   SequenceReaderFilter& operator=(const SequenceReaderFilter&);   // NOT IMPLEMENTED
00335 
00336   // PROTECTED DATA:
00337   // (visible to instances of this class only)
00338   protected:
00339   SequenceReader* pSeq_;
00340   //  string filterFileName_;
00341   //  set<string>* pFilterNames_;
00342   StringHash* pFilterNames_;
00343   // filterNums_ - element i of filterNums_ is the number of the sequence
00344   // in *pSeq corresponding to sequence i of *this
00345   // (so filterNums_[0] is not used)
00346   vector<SequenceNumber> filterNums_;
00347   int numFiltered_;
00348 
00349   //  int wordLength_;
00350   //  int bitsPerSymbol_;
00351   //  SourceDataType sourceData_;
00352 
00353   // PRIVATE MEMBER DATA
00354   private:
00355 
00356 }; // ~class SequenceReaderFilter
00357 
00358 
00359 
00360 // ### Function Declarations ###
00361 
00362 // Name:
00363 // Arguments:
00364 // TYPE  NAME  IN/OUT COMMENT
00365 // Returns: TYPE COMMENT
00366 
00367 // End of include guard:
00368 #endif
00369 
00370 // End of file SequenceReaderFilter.h

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2