/* Last edited: Apr 18 17:49 2002 (ac2) */

// #######################################################################

// SSAHA : Sequence Search and Alignment by Hashing Algorithm
// Version 3.2, released 1st March 2004
// Copyright (c) Genome Research 2002

// SSAHA is free software; you can redistribute it and/or modify
// it under the terms of version 2 of the GNU General Public Licence
// as published by the Free Software Foundation.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public Licence for more details.

// You should have received a copy of the GNU General Public Licence
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
// or see the on-line version at http://www.gnu.org/copyleft/gpl.txt

// #######################################################################

// Module Name : SequenceReaderFilter
// File Name : SequenceReaderFilter.h
// Language : C++
// Module Author: Anthony J.
Cox (ac2@sanger.ac.uk) 00029 00030 // Include guard: 00031 #ifndef INCLUDED_SequenceReaderFilter 00032 #define INCLUDED_SequenceReaderFilter 00033 00034 // Description: 00035 00036 // Includes: 00037 00038 #include "SequenceReader.h" 00039 #include "GlobalDefinitions.h" 00040 //#include <set> 00041 //#include <hash_set> 00042 // NB it is good practise for #include statements in header files to be 00043 // replaced by forward declarations if at all possible 00044 00045 // ### Class Declarations ### 00046 00047 struct HashFunctor 00048 { 00049 size_t operator()(const char* s) const 00050 { 00051 unsigned long h = 0; 00052 for ( ; *s; ++s) 00053 h = 5*h + *s; 00054 return size_t(h); 00055 } // ~operator() 00056 }; // ~struct HashFunctor 00057 00058 00059 class StringHash : public vector<string> 00060 { 00061 public: 00062 HashFunctor H_; 00063 00064 void makeBins( int numBins ); 00065 00066 size_t makeNumBins( void ) const; 00067 00068 void makeBins( void ) 00069 { 00070 makeBins(makeNumBins()); 00071 } 00072 00073 bool isPresent( const string& s ) 00074 { 00075 vector<string*>* myBin = &bins_[ H_(s.c_str()) % numBins_ ]; 00076 for (vector<string*>::iterator i(myBin->begin()); i!= myBin->end();++i) 00077 if (**i == s ) return true; 00078 return false; 00079 } 00080 00081 size_t max( void ) const 00082 { 00083 int max(0); 00084 for (vector<vector<string*> >::const_iterator i(bins_.begin()); i!= bins_.end(); i++ ) 00085 if (i->size()>max) max=i->size(); 00086 return max; 00087 } 00088 00089 double loadFactor( void ) const { return size()/numBins_; } 00090 00091 double effic( void ) const { return max()/loadFactor(); } 00092 00093 size_t numBins( void ) const { return numBins_; } 00094 00095 private: 00096 00097 size_t numBins_; 00098 vector<vector<string*> > bins_; 00099 00100 }; // ~class StringHash 00101 00102 00103 // ----------------------------- 00104 // Class Name : SequenceReaderFilterState 00105 // Description: This preserves the state (ie position in a file) of 
00106 // a SequenceReaderFilter 00107 class SequenceReaderFilterState : public SequenceReaderState 00108 { 00109 public: 00110 SequenceReaderFilterState 00111 ( SequenceNumber lsn, SequenceReader* ps ) : 00112 pState_(ps->saveState()), SequenceReaderState(lsn) {} 00113 virtual ~SequenceReaderFilterState() {} //delete pState_; 00114 // no point in making this private as it's const 00115 // this is state info for *ps, whatever it is 00116 SequenceReaderState* pState_; 00117 }; 00118 00119 00120 00121 00122 // Class Name : SequenceReaderFilter 00123 // Description: Takes a pointer to another SequenceReader plus a list of 00124 // sequence names to exclude 00125 00126 00127 class SequenceReaderFilter : public SequenceReader 00128 { 00129 00130 00131 // PUBLIC MEMBER FUNCTIONS 00132 public: 00133 // Constructors and Destructors 00134 00135 // Function Name: Constructor 00136 // Arguments: SequenceReader*, int, ostream& 00137 // NB SequenceReaderFilter takes ownership of pSeq, ie 00138 // it deletes it when it itself is destructed 00139 // SequenceReaderWhatever s( ... ); 00140 // SequenceReader* p = new SequenceReaderWhatever( ... ); 00141 // SequenceReaderFilter f( p, "name" ); // OK 00142 // SequenceReaderFilter f( s->clone(), "name" ); // OK - takes a copy 00143 // SequenceReaderFilter f( &s, "name" ); // don't do this ... 
bad 00144 00145 SequenceReaderFilter 00146 ( SequenceReader* pSeq, 00147 ifstream* pFilterSource, 00148 ostream& monitoringStream = cerr ); 00149 00150 00151 SequenceReaderFilter 00152 ( SequenceReader* pSeq, 00153 const char* filterFileName, 00154 ostream& monitoringStream = cerr ); 00155 00156 00157 // Function Name: Copy constructor 00158 // Arguments: 00159 SequenceReaderFilter( const SequenceReaderFilter& rhs ); 00160 00161 00162 // Function Name: Destructor 00163 // Arguments: 00164 ~SequenceReaderFilter(); 00165 // (NB destructor should be virtual if class is to be derived from) 00166 00167 // Manipulator Functions 00168 virtual SequenceReader* clone( void ) 00169 { 00170 return new SequenceReaderFilter( *this ); 00171 } 00172 00173 // Function Name: 00174 // Arguments: 00175 // TYPE NAME IN/OUT COMMENT 00176 // Returns: TYPE COMMENT 00177 00178 // Function Name: readFilterNames 00179 // Arguments: ifstream* 00180 // Returns: void 00181 // Reads in a list of names to be filtered from an ifstream* 00182 // into set<string>*. 
Once done, deletes the ifstream 00183 void readFilterNames( ifstream* pFilterSource ); 00184 00185 00186 // Function Name: changeMode 00187 // Arguments: const SequenceReaderMode& 00188 // Makes a copy of mode and uses it to handle mismatch character reads 00189 // Does nothing here because any character reading necessary will have 00190 // been done by the SequenceReader this class is constructed from 00191 virtual void changeMode( SequenceReaderMode* pMode ) 00192 { 00193 pSeq_->changeMode( pMode ); 00194 } 00195 00196 // Accessor Functions 00197 // (NB all accessor functions should be 'const') 00198 00199 // Function Name: rewind 00200 // Arguments: void 00201 // Returns: void 00202 // Rewind to the start of the data file, so that getNextSequence will 00203 // return the first sequence in the file 00204 virtual void rewind( void ); 00205 00206 // Function Name: getNextSequence 00207 // Arguments: WordSequence& (out), int (in) 00208 // Returns: int 00209 // Read the next set of sequence information from the file and parse it 00210 // into WordSequence format. Returns -1 if there has been a problem with 00211 // reading the sequence, else returns the number of valid base pairs 00212 // contained within the final word of the sequence. 
00213 virtual int getNextSequence( WordSequence& nextSeq, int wordLength ); 00214 00215 // Function Name: getSequence 00216 // Arguments: WordSequence& (out), SequenceNumber (in), int (in) 00217 // Returns: bool 00218 // Read the sequenceNumber-th set of sequence information from the file and 00219 // parse it into WordSequence format 00220 virtual int getSequence 00221 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength ); 00222 00223 // Function Name: getLastSequenceName 00224 // Arguments: string& (out) 00225 // Returns: void 00226 // Fills the string with the name of the last sequence read 00227 virtual void getLastSequenceName( string& seqName ) const; 00228 00229 // Function Name: getBitsPerSymbol 00230 // Arguments: none 00231 // Returns: int 00232 // Returns number of bits per symbol used in encoding 00233 virtual int getBitsPerSymbol ( void ) const 00234 { 00235 return pSeq_->getBitsPerSymbol(); 00236 } 00237 00238 // Function Name: getSourceDataType 00239 // Arguments: none 00240 // Returns: SourceDataType 00241 // Returns type of data being encoded (protein or DNA) 00242 virtual SourceDataType getSourceDataType( void ) const 00243 { 00244 return pSeq_->getSourceDataType(); 00245 } 00246 00247 // Function Name: getNumFiltered 00248 // Arguments: none 00249 // Returns: int 00250 // Returns number of sequences excluded so far from pSeq_ 00251 // if (areAllSequencesRead()==true) then this is the final number 00252 int getNumFiltered( void ) const 00253 { 00254 return numFiltered_; 00255 } 00256 00257 // Function Name: printName 00258 // Arguments: string& (out), SequenceNumber (in) 00259 // Returns: void 00260 // Fills a string with the name of the requested sequence 00261 virtual bool printName( ostream& os, SequenceNumber seqNum ); 00262 00263 // Function Name: printSideInfo 00264 // Arguments: string& (out), SequenceNumber (in) 00265 // Returns: void 00266 // Fills a string with the name of the requested sequence 00267 virtual bool 
printSideInfo( ostream& os, SequenceNumber seqNum ); 00268 00269 // Function Name: printSource 00270 // Arguments: string& (out), SequenceNumber (in) 00271 // Returns: void 00272 // Fills a string with the name of the requested sequence 00273 virtual bool printSource( ostream& os, SequenceNumber seqNum ); 00274 00275 virtual bool findSequence( SequenceNumber seqNum ); 00276 00277 00278 // Function Name: extractSource 00279 // This extracts the source data for bases seqStart to seqEnd inclusive 00280 // of sequence seqNum and places it in source 00281 virtual void extractSource( char** pSource,//vector<char>& source, 00282 SequenceNumber seqNum, 00283 SequenceOffset seqStart, 00284 SequenceOffset seqEnd ); 00285 00286 // Function Name: saveIndexImp 00287 // This makes and saves the index for *this, by kinda monkeying around 00288 // with the index for *pSeq_ 00289 virtual void saveIndexImp 00290 ( ostream& fileFile, 00291 ostream& indexFile, 00292 int& fileNumber ); 00293 00294 // Function Name: saveState 00295 // Arguments: void 00296 // Returns: SequenceReaderState* 00297 // saves the state (ie current file position) of a SequenceReader for future 00298 // restoration 00299 virtual SequenceReaderState* saveState( void ) const 00300 { 00301 return new SequenceReaderFilterState( lastSequenceNumber_, pSeq_ ); 00302 } 00303 00304 // Function Name: restoreState 00305 // Arguments: SequenceReaderState* 00306 // Returns: void 00307 // restores the state (ie current file position) of a SequenceReader 00308 // then (NB!!) 
deletes *pState; 00309 virtual void restoreState( SequenceReaderState* pState ) 00310 { 00311 SequenceReaderFilterState* p 00312 (dynamic_cast<SequenceReaderFilterState*>(pState)); 00313 assert(p!=NULL); 00314 lastSequenceNumber_ = p->lastSequenceNumber_; 00315 pSeq_->restoreState( p->pState_ ); 00316 delete pState; 00317 } 00318 00319 // PROTECTED MEMBER FUNCTIONS 00320 // (visible to this class and derived classes only) 00321 protected: 00322 00323 // Function Name: computeNumSequencesInFile 00324 // Arguments: void 00325 // Returns: SequenceNumber 00326 // Returns the number of sequences in the file - called by 00327 // getNumSequencesInFile. 00328 virtual SequenceNumber computeNumSequencesInFile( void ); 00329 00330 // PRIVATE MEMBER FUNCTIONS 00331 // (visible to instances of this class only) 00332 00333 private: 00334 SequenceReaderFilter& operator=(const SequenceReaderFilter&); // NOT IMPLEMENTED 00335 00336 // PROTECTED DATA: 00337 // (visible to instances of this class only) 00338 protected: 00339 SequenceReader* pSeq_; 00340 // string filterFileName_; 00341 // set<string>* pFilterNames_; 00342 StringHash* pFilterNames_; 00343 // filterNums_ - element i of filterNums_ is the number of the sequence 00344 // in *pSeq corresponding to sequence i of *this 00345 // (so filterNums_[0] is not used) 00346 vector<SequenceNumber> filterNums_; 00347 int numFiltered_; 00348 00349 // int wordLength_; 00350 // int bitsPerSymbol_; 00351 // SourceDataType sourceData_; 00352 00353 // PRIVATE MEMBER DATA 00354 private: 00355 00356 }; // ~class SequenceReaderFilter 00357 00358 00359 00360 // ### Function Declarations ### 00361 00362 // Name: 00363 // Arguments: 00364 // TYPE NAME IN/OUT COMMENT 00365 // Returns: TYPE COMMENT 00366 00367 // End of include guard: 00368 #endif 00369 00370 // End of file SequenceReaderFilter.h
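// 1.5.2  (stray trailer - presumably the version stamp of the tool that generated this listing; not part of SequenceReaderFilter.h)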