00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifndef INCLUDED_HashTableGeneric
00032 #define INCLUDED_HashTableGeneric
00033
00034
00035
00036
00037 #include <iosfwd>
00038 #include <string>
00039 #include <utility>
00040 #include <algorithm>
00041 #include <functional>
00042 #include <cassert>
00043 #include "GlobalDefinitions.h"
00044 class SequenceReader;
00045
00046
00047
00048
00049 typedef unsigned int PositionInHitList;
00050
00051
00052
00053
00054
00055
00056 class NameReader
00057 {
00058 public:
00059 virtual ~NameReader() {}
00060 virtual const char* getSequenceName
00061 ( SequenceNumber seqNum ) const {return NULL;}
00062 virtual void getSequenceName
00063 ( string& seqName, SequenceNumber seqNum ) const {}
00064 virtual void saveSequenceNames( ostream& nameFile ) {}
00065 virtual void loadSequenceNames( istream& nameFile ) {}
00066 virtual SequenceNumber size( void ) const { return 0; }
00067 };
00068
00069 class NameReaderLocal : public NameReader, private vector<string>
00070 {
00071 public:
00072 virtual ~NameReaderLocal() {}
00073 virtual const char* getSequenceName
00074 ( SequenceNumber seqNum ) const;
00075 virtual void getSequenceName
00076 ( string& seqName, SequenceNumber seqNum ) const;
00077 string& lastName()
00078 {
00079 string dummy = "";
00080 push_back(dummy); return back();
00081 }
00082 virtual void saveSequenceNames( ostream& nameFile );
00083 virtual void loadSequenceNames
00084 ( istream& nameFile );
00085 virtual SequenceNumber size( void ) const
00086 {
00087 return vector<string>::size();
00088 }
00089 void pop_back( void )
00090 {
00091 vector<string>::pop_back();
00092 }
00093 };
00094
00095 class SourceReaderIndex;
00096 class NameReaderIndex : public NameReader
00097 {
00098 public:
00099 NameReaderIndex( SourceReaderIndex& reader ) : reader_(reader) {}
00100 virtual ~NameReaderIndex() {}
00101 virtual const char* getSequenceName
00102 ( SequenceNumber seqNum ) const;
00103 virtual void getSequenceName
00104 ( string& seqName, SequenceNumber seqNum ) const;
00105 virtual SequenceNumber size( void ) const;
00106 private:
00107 SourceReaderIndex& reader_;
00108 };
00109
00110
00111
00112
00113
00114
00115 class SequenceAdapter
00116 {
00117 public:
00118 typedef WordSequence::size_type size_type;
00119 SequenceAdapter() {}
00120 virtual ~SequenceAdapter () {}
00121
00122 virtual void link( const WordSequence& seq )
00123 { pSeq_ = &seq; }
00124 virtual Word operator[]( size_type j )
00125 { return (*pSeq_)[j]; }
00126 virtual size_type size( void ) const
00127 { return pSeq_->size()-1; }
00128 protected:
00129 const WordSequence* pSeq_;
00130 };
00131
00132
00133 class SequenceAdapterWithOverlap : public SequenceAdapter
00134 {
00135 public:
00136 SequenceAdapterWithOverlap
00137 ( int bitsPerSymbol, int wordLength, int stepLength );
00138 virtual ~SequenceAdapterWithOverlap();
00139
00140 virtual void link( const WordSequence& seq );
00141 virtual Word operator[]( size_type j );
00142 virtual size_type size( void ) const { return size_; }
00143 protected:
00144 int bitsPerSymbol_;
00145 int wordLength_;
00146 int stepLength_;
00147 size_type size_;
00148 Word* maskLeft_;
00149 Word* maskRight_;
00150
00151 };
00152
00153 class AdapterFactory
00154 {
00155 public:
00156 SequenceAdapter* create
00157 ( int bitsPerSymbol, int wordLength, int stepLength )
00158 {
00159 return
00160 ( (wordLength==stepLength)
00161 ? new SequenceAdapter()
00162 : new SequenceAdapterWithOverlap
00163 ( bitsPerSymbol, wordLength, stepLength ) );
00164 }
00165
00166 };
00167
00168
00169
00170
00171 class HashTableGeneric
00172 {
00173
00174 friend class HashTableFactory;
00175
00176
00177 public:
00178
00179
00180
00181
00182
00183
00184 HashTableGeneric
00185 ( ostream& monitoringStream,
00186 const string& name,
00187 Allocator<PositionInHitList>& arrayAllocator );
00188
00189
00190
00191
00192
00193 virtual ~HashTableGeneric();
00194
00195
00196
00197
00198
00199
00200
00201
00202 virtual void createHashTable
00203 ( SequenceReader& sequenceReader, int wordLength, int maxNumHits,
00204 int stepLength = 0 );
00205 virtual int countWordsAndGetNames
00206 ( SequenceReader& sequenceReader, SequenceAdapter* seq );
00207 virtual void setupPointerArray( void );
00208 virtual void computePointerArray( void );
00209 virtual void setupHitList( void );
00210 virtual void hashAllWords
00211 ( SequenceReader& sequenceReader, SequenceAdapter* seq, int numSeqs );
00212 virtual void cleanupTempData( void );
00213
00214 Allocator<PositionInHitList>* tempAlloc_;
00215
00216
00217
00218
00219
00220 virtual void hashWords
00221 ( SequenceAdapter& thisSeq, SequenceNumber seqNum ) =0;
00222 virtual void countWords( SequenceAdapter& thisSeq ) =0;
00223
00224 virtual void matchSequence
00225 ( WordSequence& seq, HitList& hitListFwd ) =0;
00226
00227 virtual void setNumRepeats( int nr) =0;
00228 virtual void setSubstituteThreshold( int ns ) {}
00229
00230 virtual char* getHitListStart( void ) const=0;
00231 virtual int getHitTypeSize( void ) const=0;
00232 virtual void allocateHitList( unsigned long size ) =0;
00233 virtual void loadHitList( unsigned long size ) =0;
00234 virtual void saveHitList( void ) =0;
00235 virtual void loadSequenceNames( void );
00236 virtual void saveSequenceNames( void );
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263 virtual void loadHashTable( SourceReaderIndex* pSourceReader=NULL );
00264
00265
00266
00267
00268
00269
00270
00271 virtual void saveHashTable( void );
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283 void getSequenceName
00284 ( string& seqName, SequenceNumber seqNum ) const;
00285
00286
00287
00288
00289
00290
00291
00292
00293 const char* getSequenceName
00294 ( SequenceNumber seqNum ) const;
00295
00296
00297
00298
00299 SequenceOffset getSequenceSize( SequenceNumber seqNum ) const;
00300
00301
00302
00303 virtual void printHashStats( void );
00304
00305 bool isInitialized() const { return isInitialized_; }
00306 int getWordLength() const { return wordLength_; }
00307 int getStepLength() const { return stepLength_; }
00308 SequenceNumber getNumSequences() const;
00309
00310 virtual int getMaxNumHits() const { return maxNumHits_; }
00311 virtual void setMaxNumHits( int mnh ) { maxNumHits_ = mnh; }
00312 int getBitsPerSymbol( void ) const { return bitsPerSymbol_; }
00313 SourceDataType getSourceDataType( void ) const { return sourceData_; }
00314 HitListFormatType getHitListFormat( void ) const
00315 { return hitListFormat_; }
00316 virtual unsigned long getTotalNumWords( void ) const
00317 { return wordsInDatabase_; }
00318 const NameReader& getNameReader() const { return *pNameReader_; }
00319
00320
00321
00322
00323
00324
00325
00326
00327 protected:
00328
00329
00330
00331 private:
00332 HashTableGeneric( const HashTableGeneric&);
00333 HashTableGeneric& operator=(const HashTableGeneric&);
00334
00335
00336
00337 protected:
00338
00339
00340 string name_;
00341
00342
00343
00344 bool isInitialized_;
00345
00346
00347
00348
00349
00350 int wordLength_;
00351
00352
00353
00354
00355
00356 int stepLength_;
00357
00358
00359
00360
00361
00362
00363 int maxNumHits_;
00364
00365
00366
00367
00368 int bitsPerSymbol_;
00369
00370
00371
00372
00373 SourceDataType sourceData_;
00374
00375
00376
00377
00378 HitListFormatType hitListFormat_;
00379
00380
00381
00382
00383 ostream& monitoringStream_;
00384
00385
00386
00387 unsigned long wordsInDatabase_;
00388
00389
00390
00391 unsigned long numDifferentWords_;
00392
00393
00394
00395 PositionInHitList* pHitsFoundSoFar_;
00396
00397
00398
00399
00400
00401
00402 PositionInHitList* pWordPositionInHitList_;
00403
00404
00405 Allocator<PositionInHitList>* pArrayAllocator_;
00406
00407
00408
00409
00410
00411
00412 NameReader* pNameReader_;
00413
00414
00415
00416 SequenceOffset* pSequenceSizes_;
00417
00418
00419 };
00420
00421
00422 class HashTableFactory
00423 {
00424 public:
00425 HashTableFactory( ostream& monStream=cerr ) : monitoringStream_(monStream)
00426 {}
00427
00428
00429
00430
00431
00432 void createHashTable
00433 ( HashTableGeneric& ht,
00434 SequenceReader& sequenceReader, int wordLength, int maxNumHits,
00435 int stepLength = 0 )
00436 {
00437 ht.createHashTable
00438 ( sequenceReader, wordLength, maxNumHits, stepLength );
00439 }
00440
00441
00442
00443
00444 HashTableGeneric* loadHashTable
00445 ( const string& name, SourceReaderIndex* pReader=NULL );
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472 void loadHashTable( HashTableGeneric& ht, SourceReaderIndex* pReader=NULL )
00473 {
00474 ht.loadHashTable(pReader);
00475 }
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485 void saveHashTable( HashTableGeneric& ht )
00486 {
00487 ht.saveHashTable();
00488 }
00489
00490
00491 ostream& monitoringStream_;
00492
00493 };
00494
00495
00496
00497
00498 template<typename T, class SUBCLASS>
00499 struct HashTableView : public HashTableGeneric
00500 {
00501
00502 friend class HashTableFactory;
00503
00504
00505 public:
00506 typedef T HitType;
00507
00508
00509
00510
00511
00512
00513 HashTableView( ostream& monitoringStream,
00514 string name,
00515 Allocator<HitType>& hitListAllocator,
00516 Allocator<PositionInHitList>& arrayAllocator ) :
00517 HashTableGeneric
00518 ( monitoringStream, name, arrayAllocator )
00519 {
00520 pHitListAllocator_ = hitListAllocator.clone
00521 (&pHitListForAllWords_, name+(string)".body", monitoringStream_ );
00522
00523 }
00524
00525
00526
00527
00528
00529 virtual ~HashTableView()
00530 {
00531 monitoringStream_ << "destructing HashTableView ...\n";
00532 if ( isInitialized_ )
00533 {
00534 monitoringStream_ << "... deallocating memory\n";
00535
00536 delete pHitListAllocator_;
00537
00538 }
00539 else
00540 {
00541 monitoringStream_
00542 << "... never initialized, no memory deallocation required\n";
00543 }
00544 }
00545
00546
00547
00548
00549
00550 virtual void hashWords
00551 ( SequenceAdapter& thisSeq, SequenceNumber seqNum ) =0;
00552 virtual void countWords( SequenceAdapter& thisSeq ) =0;
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563 typedef const HitType* iterator;
00564 typedef const HitType* const_iterator;
00565
00566 virtual int getHitTypeSize( void ) const
00567 {
00568 return sizeof(HitType);
00569 }
00570
00571 virtual char* getHitListStart( void ) const
00572 {
00573 return (char*) pHitListForAllWords_;
00574 }
00575
00576 virtual void allocateHitList( unsigned long size )
00577 {
00578
00579 pHitListAllocator_->allocate(size);
00580 }
00581
00582 virtual void loadHitList( unsigned long size )
00583 {
00584 pHitListAllocator_->load(size);
00585 }
00586
00587 virtual void saveHitList( void )
00588 {
00589 pHitListAllocator_->save();
00590 }
00591
00592
00593
00594
00595 const_iterator begin( Word w ) const
00596 {
00597 return pHitListForAllWords_ +
00598 ( ( w == 0 ) ? 0 : pWordPositionInHitList_[ w - 1 ] );
00599 }
00600
00601 const_iterator last( Word w ) const
00602 {
00603 return pHitListForAllWords_ + ( pWordPositionInHitList_[ w ] );
00604 }
00605
00606 const_iterator end( Word w ) const
00607 {
00608 return( (size(w) > maxNumHits_) ? begin(w) : last(w) );
00609 }
00610
00611 const int size( Word w ) const
00612 {
00613 return( last(w) - begin(w) );
00614 }
00615
00616
00617
00618
00619 template< class HITLIST >
00620 void matchWord
00621 ( Word queryWord, HITLIST& hitsFound, int baseOffset=0 ) const
00622 {
00623
00624
00625
00626
00627 if ((queryWord&gCursedWord)!=(Word)0) return;
00628 const_iterator endWord(end(queryWord));
00629 for( iterator i( begin( queryWord ) ); i != endWord; ++i )
00630 {
00631 hitsFound.addHit
00632 ( *i, baseOffset );
00633 }
00634
00635 }
00636
00637
00638 template< class HITLIST >
00639 void matchWord
00640 ( const WordSequence& queryWords,
00641 HITLIST& hitsFound,
00642 int baseOffset=0 ) const
00643 {
00644
00645 if (queryWords.size()==0) return;
00646
00647
00648
00649
00650 WordSequence::const_iterator last(&queryWords.back());
00651
00652 for ( WordSequence::const_iterator thisWord(queryWords.begin());
00653 thisWord != last ; ++thisWord )
00654 {
00655 matchWord( *thisWord, hitsFound, baseOffset );
00656 baseOffset += wordLength_;
00657 }
00658
00659 }
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670 protected:
00671
00672
00673
00674 private:
00675 HashTableView( const HashTableView&);
00676 HashTableView& operator=(const HashTableView&);
00677
00678
00679
00680 protected:
00681
00682
00683
00684
00685 HitType* pHitListForAllWords_;
00686 Allocator<HitType>* pHitListAllocator_;
00687
00688
00689 };
00690
00691
00692
00693
00694
00695 class TopList : public vector<pair<int, Word> >
00696 {
00697 public:
00698 TopList( int size, int wordLength, int bitsPerSymbol=gBaseBits ) :
00699 vector<pair<int, Word> >( size, pair<int, Word>(0,0) ),
00700 wordLength_( wordLength ), bitsPerSymbol_( bitsPerSymbol ) {}
00701 void push_back( pair<int, Word> p )
00702 {
00703 if ( p.first > back().first )
00704 {
00705
00706 vector<pair<int, Word> >::push_back( p );
00707 sort( begin(), end(), greater<pair<int, Word> >() );
00708 pop_back();
00709 }
00710 }
00711
00712 friend ostream& operator<<
00713 ( ostream& os, TopList tl )
00714 {
00715 if (tl.bitsPerSymbol_==gBaseBits)
00716 {
00717 for( vector<pair<int, Word> >::iterator i(tl.begin());
00718 i != tl.end() ; ++i)
00719 cout << printWord( i->second, tl.wordLength_ )
00720 << ", " << i->first << " occurrences.\n";
00721 }
00722 else if (tl.bitsPerSymbol_==gResidueBits )
00723 {
00724 for( vector<pair<int, Word> >::iterator i(tl.begin());
00725 i != tl.end() ; ++i)
00726 cout << printResidue( i->second, tl.wordLength_ )
00727 << ", " << i->first << " occurrences.\n";
00728 }
00729 else assert(1==0);
00730 return os;
00731 }
00732
00733 private:
00734 int wordLength_;
00735 int bitsPerSymbol_;
00736
00737
00738 };
00739
00740
00741
00742
00743
00744
00745 class WordSequenceShifted
00746 {
00747 public:
00748 WordSequenceShifted( const WordSequence& thisSeq,
00749 const HashTableGeneric& hashTable );
00750 Word& operator[]( unsigned long wordNum )
00751 {
00752 return( (seqs_[ wordNum%seqs_.size() ])[ wordNum/seqs_.size() ] );
00753 }
00754
00755
00756
00757
00758
00759
00760
00761 int size( void ) { return size_; }
00762
00763 protected:
00764
00765 vector<WordSequence> seqs_;
00766 int size_;
00767 };
00768
00769
00770
00771
00772
00773
00774
00775
00776
00777
00778 #endif
00779
00780