HashTable/HashTableGeneric.h

Go to the documentation of this file.
00001 /*  Last edited: May 29 15:45 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : HashTableGeneric
00026 // File Name    : HashTableGeneric.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_HashTableGeneric
00032 #define INCLUDED_HashTableGeneric
00033 
00034 // Description:
00035 
00036 
00037 #include <iosfwd>
00038 #include <string>
00039 #include <utility>
00040 #include <algorithm>
00041 #include <functional>
00042 #include <cassert>
00043 #include "GlobalDefinitions.h"
00044 class SequenceReader;
00045 
00046 
00047 // ### Class Declarations ###
00048 
00049 typedef unsigned int PositionInHitList; // index of an entry in the hit list (see pWordPositionInHitList_)
00050 
00051 
00052 // Class Name : NameReader
00053 // Description: Lives inside a HashTable and hands out sequence names,
00054 // either held locally (NameReaderLocal) or fetched from a
00055 // SourceReaderIndex (NameReaderIndex). The defaults below are no-ops.
00056 class NameReader
00057 {
00058 public:
00059   virtual ~NameReader() {}
00060   // Name of sequence seqNum as a C string; this default yields NULL.
00061   virtual const char* getSequenceName( SequenceNumber seqNum ) const { return NULL; }
00062   // Same lookup delivered through seqName; this default leaves it untouched.
00063   virtual void getSequenceName( string& seqName, SequenceNumber seqNum ) const {}
00064   virtual void saveSequenceNames( ostream& nameFile ) {} // default: does nothing
00065   virtual void loadSequenceNames( istream& nameFile ) {} // default: does nothing
00066   virtual SequenceNumber size() const { return 0; }      // default: reports zero
00067 };
00068 
00069 class NameReaderLocal : public NameReader, private vector<string>
00070 {
00071 public:
00072   virtual ~NameReaderLocal() {}
00073
00074   // Name lookups; both overloads are defined outside this header.
00075   virtual const char* getSequenceName( SequenceNumber seqNum ) const;
00076   virtual void getSequenceName( string& seqName, SequenceNumber seqNum ) const;
00077
00078   // Appends a fresh empty name to the end of the list and returns a
00079   // reference to it, so the caller can fill it in place.
00080   string& lastName()
00081   {
00082     vector<string>::push_back( string() );
00083     return vector<string>::back();
00084   }
00085   virtual void saveSequenceNames( ostream& nameFile );
00086   virtual void loadSequenceNames( istream& nameFile );
00087
00088   // Number of names currently held.
00089   virtual SequenceNumber size() const { return vector<string>::size(); }
00090   // Drops the newest name; the vector base is private, so this
00091   // forwards to vector<string>::pop_back on the caller's behalf.
00092   void pop_back() { vector<string>::pop_back(); }
00093 };
00094 // NameReaderIndex: a NameReader bound to a SourceReaderIndex (see reader_).
00095 class SourceReaderIndex;
00096 class NameReaderIndex : public NameReader
00097 {
00098  public:
00099   NameReaderIndex( SourceReaderIndex& reader ) : reader_(reader) {} // keeps a reference, not a copy
00100   virtual ~NameReaderIndex() {}
00101   virtual const char* getSequenceName
00102   ( SequenceNumber seqNum ) const; // defined outside this header
00103   virtual void getSequenceName
00104   ( string& seqName, SequenceNumber seqNum ) const; // defined outside this header
00105   virtual SequenceNumber size( void ) const; // defined outside this header
00106  private:
00107   SourceReaderIndex& reader_; // only a reference is stored, so the reader must outlive this object
00108 };
00109 
00110 
00111 
00112 
00113 // SequenceAdapter: read-only view of a WordSequence that has been attached
00114 // with link(). The final element of the sequence is not counted by size().
00115 class SequenceAdapter
00116 {
00117 public:
00118   typedef WordSequence::size_type size_type;
00119   SequenceAdapter() : pSeq_(NULL) {} // unlinked until link() is called
00120   virtual ~SequenceAdapter () {}
00121   // link() must be called before operator[] or size() is used.
00122   virtual void link( const WordSequence& seq )
00123     { pSeq_ = &seq; }
00124   virtual Word operator[]( size_type j )
00125     { assert( pSeq_ != NULL ); return (*pSeq_)[j]; }
00126   virtual size_type size( void ) const // 0 for an empty sequence, never size_type(-1)
00127     { assert( pSeq_ != NULL ); return ( pSeq_->size()==0 ) ? 0 : pSeq_->size()-1; }
00128 protected:
00129   const WordSequence* pSeq_; // not owned; NULL until link() has been called
00130 };  // ~class SequenceAdapter
00131 // SequenceAdapterWithOverlap: the adapter AdapterFactory::create returns
00132 // when wordLength != stepLength. Member functions are defined elsewhere.
00133 class SequenceAdapterWithOverlap : public SequenceAdapter
00134 {
00135 public:  
00136   SequenceAdapterWithOverlap
00137   ( int bitsPerSymbol, int wordLength, int stepLength );
00138   virtual ~SequenceAdapterWithOverlap();
00139 
00140   virtual void link( const WordSequence& seq );
00141   virtual Word operator[]( size_type j );
00142   virtual size_type size( void ) const { return size_; } // reports the stored size_
00143 protected:
00144   int bitsPerSymbol_;
00145   int wordLength_;
00146   int stepLength_;
00147   size_type size_; // value returned by size(); assigned outside this header
00148   Word* maskLeft_;  // NOTE(review): presumably a mask table built by the constructor
00149   Word* maskRight_; // and released by the destructor - confirm in the .cpp file
00150   // NOTE(review): copying is not disabled although raw pointers are held.
00151 }; // ~class SequenceAdapterWithOverlap
00152 
00153 class AdapterFactory
00154 {
00155 public:
00156   // Builds the adapter suited to the given word and step lengths. The
00157   // result is allocated with new; nothing in this class deletes it.
00158   SequenceAdapter* create( int bitsPerSymbol, int wordLength, int stepLength )
00159   {
00160     if ( wordLength != stepLength )
00161       return new SequenceAdapterWithOverlap
00162         ( bitsPerSymbol, wordLength, stepLength );
00163     return new SequenceAdapter();
00164   } // ~create
00165
00166 }; // ~class AdapterFactory
00167 
00168 // Class Name : HashTableGeneric
00169 // Description: All hash table implementations will inherit their interface
00170 // from this base class
00171 class HashTableGeneric
00172 {
00173 
00174   friend class HashTableFactory;
00175 
00176   // PUBLIC MEMBER FUNCTIONS
00177   public:
00178 
00179   // Constructors and Destructors
00180 
00181   // Function Name: Constructor
00182   // Arguments: ostream& (in), const string& (in), Allocator<PositionInHitList>& (in)
00183   // Returns: N/A
00184   HashTableGeneric
00185   ( ostream& monitoringStream,
00186     const string& name,
00187     Allocator<PositionInHitList>& arrayAllocator );
00188 
00189   // Function Name: Destructor
00190   // Arguments: none
00191   // Declared virtual because concrete hash tables derive from this class
00192   // (see the pure virtual members below).
00193   virtual ~HashTableGeneric();
00194   // (NB destructor should be virtual if class is to be derived from)
00195 
00196   // Manipulator Functions
00197 
00198   // Function Name: createHashTable
00199   // Arguments: SequenceReader& (in), int (in), int (in), int (in, default 0)
00200   // Reads sequence information from an instance of SequenceReader and
00201   // uses it to create a hash table
00202   virtual void createHashTable
00203   ( SequenceReader& sequenceReader, int wordLength, int maxNumHits, 
00204     int stepLength = 0 );
00205   virtual int countWordsAndGetNames
00206     ( SequenceReader& sequenceReader, SequenceAdapter* seq );
00207   virtual void setupPointerArray( void );
00208   virtual void computePointerArray( void );
00209   virtual void setupHitList( void );
00210   virtual void hashAllWords
00211     ( SequenceReader& sequenceReader, SequenceAdapter* seq, int numSeqs );
00212   virtual void cleanupTempData( void );
00213   // NOTE(review): public data member; its role is not visible in this header.
00214   Allocator<PositionInHitList>* tempAlloc_;
00215 
00216   // Hooks that each concrete hash table must supply:
00217   //  virtual void hashWords
00218   //( SequenceAdapter& thisSeq, SequenceNumber seqNum,  
00219   //  unsigned int* pHitsFoundSoFar_ ) =0;
00220   virtual void hashWords
00221     ( SequenceAdapter& thisSeq, SequenceNumber seqNum ) =0;
00222   virtual void countWords( SequenceAdapter& thisSeq ) =0;
00223 
00224   virtual void matchSequence
00225   ( WordSequence& seq, HitList& hitListFwd ) =0;
00226 
00227   virtual void setNumRepeats( int nr) =0;
00228   virtual void setSubstituteThreshold( int ns ) {} // default: ignores ns
00229   // Hit list storage hooks; HashTableView in this file implements them.
00230   virtual char* getHitListStart( void ) const=0;
00231   virtual int getHitTypeSize( void ) const=0;
00232   virtual void allocateHitList( unsigned long size ) =0;
00233   virtual void loadHitList( unsigned long size ) =0;
00234   virtual void saveHitList( void ) =0;
00235   virtual void loadSequenceNames( void );
00236   virtual void saveSequenceNames( void );
00237 
00238   // Function Name: loadHashTable
00239   // Arguments: SourceReaderIndex* (in, optional, defaults to NULL)
00240   // Reads a pre-computed hash table into memory from a file
00241   // A hash table is stored as three files:
00242   //
00243   // 1. fileNameRoot.head has the following format
00244   // int stepLength_;
00245   // PositionInHitList pWordPositionInHitList_[ 2^(2*wordLength) ];
00246   // This stores an instance of the PositionInHitList type for each possible
00247   // value of Word (i.e. there are 2^(2*wordLength_) entries in all). Entry
00248   // number x of *pWordPositionInHitList_ tells you which entry of
00249   // *pHitListForAllWords_ contains the position of the first occurrence
00250   // of word x+1 in the subject sequence database.
00251   // NB wordLength is the length in bases of each hash word and is deduced by
00252   // loadHashTable from the size of this file.
00253   //
00254   // 2. fileNameRoot.body has the following format:
00255   // PositionInDatabase* pHitListForAllWords_;
00256   // This stores an instance of the PositionInDatabase type for each
00257   // occurrence of each Word. Only the first maxNumHits_ occurrences of
00258   // each Word are stored.
00259   // 
00260   // 3. fileNameRoot.name contains the name of each subject sequence, 
00261   // stored sequentially as strings.
00262   //  void loadHashTable( HashTableGeneric& ht, const string& fileNameRoot );
00263   virtual void loadHashTable( SourceReaderIndex* pSourceReader=NULL );
00264 
00265   // Function Name: saveHashTable
00266   // Arguments: none
00267   // Save hash table into files (for subsequent retrieval by loadHashTable)
00268   // See description of HashTable::loadHashTable for details of file format 
00269   // used.
00270   //  void saveHashTable( HashTableGeneric& ht, const string& fileNameRoot );
00271   virtual void saveHashTable( void );
00272 
00273 
00274 
00275 
00276 
00277   // Accessor Functions
00278   // (NB all accessor functions should be 'const')
00279 
00280   // Function Name: getSequenceName
00281   // Arguments: string& (out), SequenceNumber (in)
00282   // Places the name of the seqNum'th sequence in the database in seqName
00283   void getSequenceName
00284   ( string& seqName, SequenceNumber seqNum ) const;
00285 
00286   // Function Name: getSequenceName
00287   // Arguments: SequenceNumber (in)
00288   // Returns: const char*
00289   // Returns C style string of the seqNum'th seq in the database
00290   // NB should be used with caution as the string only lasts as long
00291   // as the hash table, you have to copy it if you want to keep it
00292   // after the hash table's destructor is called for any reason.
00293   const char* getSequenceName
00294   ( SequenceNumber seqNum ) const;
00295 
00296   // Function Name: getSequenceSize
00297   // Arguments: SequenceNumber (in)
00298   // Returns the size of the seqNum'th sequence in the database.
00299   SequenceOffset getSequenceSize( SequenceNumber seqNum ) const;
00300   
00301   // Function Name: printHashStats
00302   // Prints some stats about the hash table
00303   virtual void printHashStats( void ); 
00304   // Simple accessors: each returns (or sets) the like-named member below.
00305   bool isInitialized() const { return isInitialized_; }
00306   int  getWordLength() const { return wordLength_; }
00307   int  getStepLength() const { return stepLength_; }
00308   SequenceNumber getNumSequences() const;
00309   //  SequenceNumber getNumSequences() const { assert(1==0); }
00310   virtual int  getMaxNumHits() const { return maxNumHits_; }
00311   virtual void setMaxNumHits( int mnh ) { maxNumHits_ = mnh; }
00312   int  getBitsPerSymbol( void ) const { return bitsPerSymbol_; }
00313   SourceDataType getSourceDataType( void ) const { return sourceData_; }
00314   HitListFormatType getHitListFormat( void ) const 
00315     { return hitListFormat_; }
00316   virtual unsigned long getTotalNumWords( void ) const 
00317     { return wordsInDatabase_; }
00318   const NameReader& getNameReader() const { return *pNameReader_; } // dereferences pNameReader_ unchecked
00319 
00320   // There are no protected or private member functions other than the
00321   // copy operations below, which are declared private and left without
00322   // a definition so that a hash table cannot be copied.
00323
00324   
00325   // PROTECTED MEMBER FUNCTIONS 
00326   // (visible to this class and derived classes only)
00327   protected:
00328 
00329   // PRIVATE MEMBER FUNCTIONS
00330   // (visible to instances of this class only)
00331   private:
00332   HashTableGeneric( const HashTableGeneric&);             // NOT IMPLEMENTED
00333   HashTableGeneric& operator=(const HashTableGeneric&);   // NOT IMPLEMENTED
00334 
00335   // PROTECTED MEMBER DATA
00336   // (visible to this class and derived classes only)
00337   protected:
00338   // name_ identifies the hash table & is used as a prefix when 
00339   // saving/loading to/from files
00340   string         name_;
00341 
00342   // isInitialized_ is set to true once a hash table has been successfully
00343   // created, either by loading in from a file or creating from sequence data.
00344   bool           isInitialized_;
00345 
00346   // wordLength_ is passed in as an argument to HashTable::createHashTable.
00347   // The subject data is split into words, each containing wordLength_ base 
00348   // pairs. The hash table stores the positions of each word in the subject
00349   // data
00350   int            wordLength_;  
00351 
00352   // stepLength_ is passed in as an argument to HashTable::createHashTable
00353   // and specifies the interval in base pairs between successive hash words.
00354   // This must lie between 1 and wordLength_, and its default value is
00355   // wordLength_.
00356   int            stepLength_;
00357   // NOTE(review): createHashTable's stepLength argument defaults to 0,
00358   // presumably translated to wordLength_ by the implementation - confirm.
00359   // maxNumHits_ is passed in as an argument to HashTable::createHashTable,
00360   // and specifies the maximum number of occurrences of each word to be
00361   // stored in the hash table. If we find more than maxNumHits_ occurrences of
00362   // a particular hash word, then none of them are stored.
00363   int            maxNumHits_;
00364 
00365   // bitsPerSymbol_ refers to the number of bits per symbol contained in 
00366   // a word. In general, either 2 for DNA or 5 for protein, although no
00367   // reason why code shouldn't work with other values.
00368   int            bitsPerSymbol_;
00369 
00370   // sourceData_ is the type of data used to create the hash table.
00371   // Can't just use bitsPerSymbol_ to tell this because 5 bit data
00372   // can be generated from DNA by translating codons.
00373   SourceDataType sourceData_;
00374 
00375   // hitListFormat_ is an enum that describes the format used to store
00376   // the hit list. Set to gNotSpecified in HashTableGeneric and reset
00377   // to its final `proper' value in its subclass.
00378   HitListFormatType hitListFormat_;
00379 
00380 
00381   // monitoringStream_ specifies where error and progress messages are to be 
00382   // sent.
00383   ostream&       monitoringStream_; 
00384 
00385   // wordsInDatabase_ is the number of Words actually found in the subject
00386   // database
00387   unsigned long wordsInDatabase_;
00388   
00389   // numDifferentWords_ is the number of different possible Words of
00390   // the chosen word length and symbol size
00391   unsigned long numDifferentWords_;
00392 
00393   // This is temporary storage for the number of hits found for each word,
00394   // used during createHashTable.  
00395   PositionInHitList* pHitsFoundSoFar_;
00396 
00397   // This stores an instance of the PositionInHitList type for each possible
00398   // value of Word (i.e. there are 2^(2*wordLength_) entries in all). Entry
00399   // number x of *pWordPositionInHitList_ tells you which entry of
00400   // *pHitListForAllWords_ contains the position of the first occurrence
00401   // of word x+1 in the subject sequence database.
00402   PositionInHitList*  pWordPositionInHitList_; 
00403 
00404   // pArrayAllocator_ handles dynamic mem alloc for pWordPositionInHitList_
00405   Allocator<PositionInHitList>* pArrayAllocator_;
00406 
00407  
00408 
00409   // For each sequence in the subject database, this stores a string
00410   // containing the sequence name.
00411   //  vector<string*> sequenceNames_;
00412   NameReader* pNameReader_;
00413 
00414   // For each sequence in the subject database, this stores the number
00415   // of bases in the sequence.
00416   SequenceOffset* pSequenceSizes_;
00417 
00418 
00419 }; // HashTableGeneric
00420 
00421 
00422 class HashTableFactory
00423 {
00424 public:
00425   // monStream is kept by reference in monitoringStream_ (defaults to cerr).
00426   HashTableFactory( ostream& monStream=cerr ) : monitoringStream_( monStream ) {}
00427
00428   // Function Name: createHashTable
00429   // Arguments: HashTableGeneric& (in/out), SequenceReader& (in), int (in),
00430   //            int (in), int (in)
00431   // Asks ht to build itself from the sequences supplied by sequenceReader.
00432   void createHashTable( HashTableGeneric& ht,
00433                         SequenceReader& sequenceReader,
00434                         int wordLength,
00435                         int maxNumHits,
00436                         int stepLength = 0 )
00437   {
00438     ht.createHashTable( sequenceReader, wordLength, maxNumHits, stepLength );
00439   }
00440
00441   // Inspects the .name file of a saved set of hash table files, works out
00442   // which type of hash table wrote them, creates a table of that type,
00443   // loads the data into it and returns a pointer to the result.
00444   HashTableGeneric* loadHashTable( const string& name,
00445                                    SourceReaderIndex* pReader=NULL );
00446
00447   // Function Name: loadHashTable (into an existing table)
00448   // Arguments: HashTableGeneric& (in/out), SourceReaderIndex* (in, may be NULL)
00449   // Forwards to ht.loadHashTable, which reads a previously saved hash
00450   // table back into memory. A saved table consists of three files:
00451   //
00452   // 1. fileNameRoot.head, laid out as
00453   //      int stepLength_;
00454   //      PositionInHitList pWordPositionInHitList_[ 2^(2*wordLength) ];
00455   //    There is one PositionInHitList per possible value of Word, i.e.
00456   //    2^(2*wordLength_) entries altogether. Entry x of
00457   //    *pWordPositionInHitList_ gives the entry of *pHitListForAllWords_
00458   //    holding the position of the first occurrence of word x+1 in the
00459   //    subject sequence database.
00460   //    NB wordLength, the length in bases of each hash word, is not
00461   //    stored: loadHashTable deduces it from the size of this file.
00462   //
00463   // 2. fileNameRoot.body, laid out as
00464   //      PositionInDatabase* pHitListForAllWords_;
00465   //    There is one PositionInDatabase per stored occurrence of each
00466   //    Word; only the first maxNumHits_ occurrences of a Word are
00467   //    stored.
00468   //
00469   // 3. fileNameRoot.name, holding the name of each subject sequence,
00470   //    stored sequentially as strings.
00471   //  void loadHashTable( HashTableGeneric& ht, const string& fileNameRoot );
00472   void loadHashTable( HashTableGeneric& ht, SourceReaderIndex* pReader=NULL )
00473     { ht.loadHashTable( pReader ); }
00474
00475
00476
00477
00478
00479   // Function Name: saveHashTable
00480   // Arguments: HashTableGeneric& (in)
00481   // Forwards to ht.saveHashTable, which writes the table to files that
00482   // loadHashTable can read back later. The file format is set out in
00483   // the loadHashTable comment above.
00484   //  void saveHashTable( HashTableGeneric& ht, const string& fileNameRoot );
00485   void saveHashTable( HashTableGeneric& ht )
00486     { ht.saveHashTable(); }
00487
00488
00489
00490   // Stream handed to the constructor; public, as in the original layout.
00491   ostream& monitoringStream_;
00492
00493 };
00494 
00495 // Class Name : HashTableView
00496 // Description: Adds to HashTableGeneric a hit list of element type T, held
00497 // through an Allocator<T>, plus per-word begin/end access and matchWord.
00498 template<typename T, class SUBCLASS> 
00499 struct HashTableView : public HashTableGeneric // %%%% TBD private??
00500 {
00501 
00502   friend class HashTableFactory;
00503 
00504   // PUBLIC MEMBER FUNCTIONS
00505   public:
00506   typedef T HitType; 
00507 
00508   // Constructors and Destructors
00509 
00510   // Function Name: Constructor
00511   // Arguments: ostream& (in), string (in), Allocator<HitType>& (in),
00512   //            Allocator<PositionInHitList>& (in)
00513   HashTableView( ostream& monitoringStream,
00514                  string name,
00515                  Allocator<HitType>& hitListAllocator,
00516                  Allocator<PositionInHitList>& arrayAllocator ) :
00517   HashTableGeneric( monitoringStream, name, arrayAllocator ),
00518   pHitListForAllWords_( NULL ), pHitListAllocator_( NULL ) // never leave these indeterminate
00519     {
00520       pHitListAllocator_ =  hitListAllocator.clone
00521         (&pHitListForAllWords_, name+(string)".body", monitoringStream_ );
00522       //      pHitListAllocator_->link(&pHitListForAllWords_,name_+(string)".body");
00523     }
00524 
00525   // Function Name: Destructor
00526   // Reports progress on monitoringStream_ and, when the table was
00527   // initialized, deletes the hit list allocator (and with it the hit list).
00528   // NOTE(review): the cloned allocator is not deleted when uninitialized.
00529   virtual ~HashTableView()
00530   { 
00531     monitoringStream_ << "destructing HashTableView ...\n";
00532     if ( isInitialized_ )
00533     {
00534       monitoringStream_ << "... deallocating memory\n";
00535       //      pHitListAllocator_->deallocate();
00536       delete pHitListAllocator_; // also deallocs hit list
00537       //      delete [] pHitListForAllWords_; 
00538     } // ~if 
00539     else 
00540     {
00541       monitoringStream_ 
00542       << "... never initialized, no memory deallocation required\n";
00543     } // ~else
00544   } // destructor
00545 
00546   // (NB destructor should be virtual if class is to be derived from)
00547 
00548   // Manipulator Functions
00549   // Still pure virtual here: the concrete table must supply both.
00550   virtual void hashWords
00551   ( SequenceAdapter& thisSeq, SequenceNumber seqNum ) =0;
00552   virtual void countWords( SequenceAdapter& thisSeq ) =0;
00553 
00554   
00555 
00556   // Accessor Functions
00557   // (NB all accessor functions should be 'const')
00558 
00559 
00560 
00561   // The next few lines make HashTable behave like an STL container
00562 
00563   typedef const HitType* iterator;
00564   typedef const HitType* const_iterator;
00565   // Size in bytes of one hit list element.
00566   virtual int getHitTypeSize( void ) const 
00567   {
00568     return sizeof(HitType);
00569   }
00570   // Start of the hit list, viewed as raw bytes.
00571   virtual char* getHitListStart( void ) const 
00572   {
00573     return (char*) pHitListForAllWords_;
00574   }
00575   // The next three hand the request straight to the hit list allocator.
00576   virtual void allocateHitList( unsigned long size )
00577   {
00578     //    pHitListForAllWords_=new HitType[size];
00579     pHitListAllocator_->allocate(size);
00580   }
00581 
00582   virtual void loadHitList( unsigned long size )
00583   {
00584     pHitListAllocator_->load(size);
00585   }
00586 
00587   virtual void saveHitList( void )
00588   {
00589     pHitListAllocator_->save();
00590   }
00591   // Per-word access to the hit list. pWordPositionInHitList_[ w ] is the
00592   // hit list index one past the final hit of word w, so the hits of w
00593   // occupy [ begin(w), last(w) ). end(w) equals begin(w), i.e. reports no
00594   // hits, for any word with more than maxNumHits_ hits.
00595   const_iterator begin( Word w ) const
00596   {
00597     return pHitListForAllWords_ +
00598       ( ( w == 0 ) ? 0 : pWordPositionInHitList_[ w - 1 ] );
00599   } // ~const_iterator begin( Word w ) const
00600 
00601   const_iterator last( Word w ) const
00602   {  
00603     return pHitListForAllWords_ + ( pWordPositionInHitList_[ w ] ); 
00604   } // ~const_iterator last( Word w ) const
00605 
00606   const_iterator end( Word w ) const
00607   { 
00608     return( (size(w) > maxNumHits_) ? begin(w) : last(w) ); 
00609   } // ~const_iterator end( Word w ) const
00610   // Number of hit list entries recorded for w (not capped by maxNumHits_).
00611   const int size( Word w ) const
00612   { 
00613     return( last(w) - begin(w) ); 
00614   } // ~const int size( Word w ) const
00615 
00616   //  inline SequenceNumber getSequence( const_iterator i ) const;
00617   //  inline SequenceOffset getOffset( const_iterator i ) const;
00618   // Passes every hit of queryWord, together with baseOffset, to hitsFound.
00619   template< class HITLIST >
00620   void matchWord
00621   ( Word queryWord, HITLIST& hitsFound, int baseOffset=0 ) const
00622   {
00623     // About 1/2 the exec time of a matchWord is spent calling end()
00624     // so bung it in a const variable TC 12.3.2 
00625     // Words with any gCursedWord bit set are skipped altogether, before
00626     // the hit list is consulted.
00627     if ((queryWord&gCursedWord)!=(Word)0) return;
00628     const_iterator endWord(end(queryWord));
00629     for( iterator i( begin( queryWord ) ); i != endWord; ++i )
00630     { 
00631       hitsFound.addHit
00632         ( *i, baseOffset );
00633     } // ~for i
00634 
00635   } // ~template< class HITLIST > void matchWord
00636 
00637   // Matches each word of queryWords in turn, advancing baseOffset by wordLength_ per word.
00638   template< class HITLIST >
00639   void matchWord
00640   ( const WordSequence& queryWords, 
00641     HITLIST& hitsFound, 
00642     int baseOffset=0 ) const
00643   {
00644     //    printf("HashTable::matchWord (multiple)\n");
00645     if (queryWords.size()==0) return;
00646     // The final element of queryWords is deliberately not matched. It is
00647     // located via end() rather than &back(), so this also compiles when
00648     // WordSequence::const_iterator is not a raw pointer.
00649     WordSequence::const_iterator last( queryWords.end() );
00650     --last;
00651 
00652     for ( WordSequence::const_iterator thisWord(queryWords.begin());
00653             thisWord != last ; ++thisWord )
00654     {
00655       matchWord( *thisWord, hitsFound, baseOffset );
00656       baseOffset += wordLength_;
00657     } // ~for
00658 
00659   } // ~  template< class HITLIST > void matchWord
00660 
00661 
00662 
00663   // There are no protected or private member functions other than the
00664   // copy operations below, which are declared private and left without
00665   // a definition so that a HashTableView cannot be copied.
00666
00667   
00668   // PROTECTED MEMBER FUNCTIONS 
00669   // (visible to this class and derived classes only)
00670   protected:
00671 
00672   // PRIVATE MEMBER FUNCTIONS
00673   // (visible to instances of this class only)
00674   private:
00675   HashTableView( const HashTableView&);             // NOT IMPLEMENTED
00676   HashTableView& operator=(const HashTableView&);   // NOT IMPLEMENTED
00677 
00678   // PROTECTED MEMBER DATA
00679   // (visible to this class and derived classes only)
00680   protected:
00681   
00682   // This stores an instance of the PositionInDatabase type for each
00683   // occurrence of each Word. Only the first maxNumHits_ occurrences of
00684   // each Word are stored.
00685   HitType* pHitListForAllWords_;
00686   Allocator<HitType>* pHitListAllocator_; // clone made in the constructor; deleted in the destructor
00687   //  Allocator** ppHitListAllocator_;
00688 
00689 }; // HashTableView
00690 
00691 
00692 // Class Name : TopList
00693 // Description: Fixed-length list of (count, word) pairs kept in descending
00694 // order; used by HashTable::printHashStats for the top 10 words.
00695 class TopList : public vector<pair<int, Word> >
00696 {
00697 public:
00698   TopList( int size, int wordLength, int bitsPerSymbol=gBaseBits ) : 
00699     vector<pair<int, Word> >( size, pair<int, Word>(0,0) ),
00700     wordLength_( wordLength ), bitsPerSymbol_( bitsPerSymbol ) {}
00701   void push_back( pair<int, Word> p ) // offers p; the list length never changes
00702   {
00703     if ( empty() ) return; // zero-length list: back() would be undefined
00704     if ( p.first > back().first )
00705     { // p outranks the last entry: insert it, re-sort, drop the new last entry
00706       vector<pair<int, Word> >::push_back( p );
00707       sort( begin(), end(), greater<pair<int, Word> >()  );
00708       pop_back();
00709     } // ~if                                    
00710   } // ~push_back
00711 
00712   // Writes one "<word>, <count> occurrences." line per entry to os (the
00713   // stream passed in, not cout) and returns os.
00714   friend ostream& operator<<( ostream& os, const TopList& tl )
00715   {
00716     typedef vector<pair<int, Word> >::const_iterator Iter;
00717     if (tl.bitsPerSymbol_==gBaseBits)
00718     {
00719       for( Iter i(tl.begin()); i != tl.end() ; ++i)
00720         os << printWord( i->second, tl.wordLength_ ) 
00721            << ", " << i->first << " occurrences.\n";
00722     } // ~if 
00723     else if (tl.bitsPerSymbol_==gResidueBits )
00724     {
00725       for( Iter i(tl.begin()); i != tl.end() ; ++i)
00726         os << printResidue( i->second, tl.wordLength_ ) 
00727            << ", " << i->first << " occurrences.\n";
00728     }
00729     else assert(1==0); // only gBaseBits and gResidueBits are handled
00730     return os;
00731   } // ~operator<<
00732 
00733  private:
00734   int wordLength_;    // passed to printWord / printResidue when printing
00735   int bitsPerSymbol_; // selects printWord or printResidue in operator<<
00736 
00737   
00738 };
00739 
00740 // Class Name : WordSequenceShifted
00741 // Description: Built from a WordSequence thisSeq. seqs_[i] holds thisSeq
00742 // shifted left by i bases, i.e. the first base of seqs_[i] is base (i+1)
00743 // of thisSeq. The class exists so that tandem repeats can be masked out.
00744 // The constructor is defined outside this header.
00745 class WordSequenceShifted
00746 {
00747 public:
00748   WordSequenceShifted( const WordSequence& thisSeq,
00749                        const HashTableGeneric& hashTable );
00750   // Consecutive word numbers cycle through the shifted copies in seqs_,
00751   // moving on one word within each copy after a full cycle.
00752   Word& operator[]( unsigned long wordNum )
00753   {
00754     WordSequence& shifted( seqs_[ wordNum % seqs_.size() ] );
00755     return shifted[ wordNum / seqs_.size() ];
00756   }
00757
00758   //  void screenRepeats( HitList& hitsOut, int numRepeats );
00759
00760   // NB this is the number of hash words that may be obtained from the
00761   // sequence, not the number of bases in it.
00762   int size() { return size_; }
00763 protected:
00764   //  const HashTable& hashTable_;
00765   vector<WordSequence> seqs_;
00766   int size_;
00767 };
00768 
00769 
00770 // ### Function Declarations ###
00771 
00772 // Name:
00773 // Arguments:
00774 // TYPE  NAME  IN/OUT COMMENT
00775 // Returns: TYPE COMMENT
00776 
00777 // End of include guard:
00778 #endif
00779 
00780 // End of file HashTableGeneric.h

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2