#include <HashTableGeneric.h>
Inheritance diagram for HashTableGeneric:


Definition at line 171 of file HashTableGeneric.h.
| HashTableGeneric::HashTableGeneric | ( | ostream & | monitoringStream, | |
| const string & | name, | |||
| Allocator< PositionInHitList > & | arrayAllocator | |||
| ) |
Definition at line 63 of file HashTableGeneric.cpp.
00065 : 00066 isInitialized_( false ), 00067 monitoringStream_( monitoringStream ), 00068 name_( name ), 00069 bitsPerSymbol_( gBaseBits ), // default: may be overwritten in subclass ctor 00070 maxNumHits_( defaultMaxNumHits ), 00071 hitListFormat_( gNotSpecified ), 00072 pArrayAllocator_ 00073 ( arrayAllocator.clone(&pWordPositionInHitList_, 00074 name+(string)".head", 00075 monitoringStream_) ), 00076 pSequenceSizes_(NULL), 00077 pNameReader_(NULL) 00078 { 00079 monitoringStream_ << "constructing HashTableGeneric\n"; 00080 if (name_=="") 00081 { 00082 char buf[50]; 00083 static int numTables(0); 00084 sprintf(buf,"SSAHAHashTable-%d-%d",getpid(),++numTables); 00085 name_=(string)buf; 00086 } 00087 // pArrayAllocator_->link(&pWordPositionInHitList_,name+(string)".head"); 00088 } // constructor
| HashTableGeneric::~HashTableGeneric | ( | ) | [virtual] |
Definition at line 94 of file HashTableGeneric.cpp.
References isInitialized_, monitoringStream_, pArrayAllocator_, pNameReader_, and pSequenceSizes_.
00095 { 00096 monitoringStream_ << "destructing HashTableGeneric ...\n"; 00097 if ( isInitialized_ ) 00098 { 00099 monitoringStream_ << "... deallocating memory\n"; 00100 // delete [] pHitListForAllWords_; 00101 // pArrayAllocator->deallocate(); 00102 delete pArrayAllocator_; // also deallocs pointer array 00103 // deletion of HitList is handled in HashTableView 00104 // pHitListAllocator->deallocate(); 00105 // delete pHitListAllocator; // NB destructor order 00106 // delete [] pWordPositionInHitList_; 00107 delete [] pSequenceSizes_; 00108 // for ( vector<string*>::iterator i( sequenceNames_.begin() ); 00109 // i != sequenceNames_.end() ; ++i ) delete *i; 00110 delete pNameReader_; 00111 } // ~if 00112 else 00113 { 00114 monitoringStream_ 00115 << "... never initialized, no memory deallocation required\n"; 00116 } // ~else 00117 } // destructor
| HashTableGeneric::HashTableGeneric | ( | const HashTableGeneric & | ) | [private] |
| void HashTableGeneric::createHashTable | ( | SequenceReader & | sequenceReader, | |
| int | wordLength, | |||
| int | maxNumHits, | |||
| int | stepLength = 0 | |||
| ) | [virtual] |
Definition at line 120 of file HashTableGeneric.cpp.
References bitsPerSymbol_, cleanupTempData(), computePointerArray(), countWordsAndGetNames(), AdapterFactory::create(), gBitsPerWord, gDNAData, SequenceReader::getBitsPerSymbol(), SequenceReader::getSourceDataType(), gProteinData, hashAllWords(), isInitialized_, maxNumHits_, monitoringStream_, numDifferentWords_, pSequenceSizes_, setupHitList(), setupPointerArray(), sourceData_, stepLength_, and wordLength_.
Referenced by HashTableFactory::createHashTable(), and main().
00122 { 00123 00124 monitoringStream_ << "HashTable::createHashTable" << endl; 00125 00126 // bitsPerSymbol_ = sequenceReader.getBitsPerSymbol(); 00127 00128 if (bitsPerSymbol_ != sequenceReader.getBitsPerSymbol() ) 00129 { 00130 monitoringStream_ << "Info: reader encodes using " 00131 << sequenceReader.getBitsPerSymbol() 00132 << " bits per symbol, hash table expects " 00133 << bitsPerSymbol_ << " bits per symbol.\n"; 00134 // throw SSAHAException("Wrong encoder for hash table"); 00135 } // ~if 00136 00137 00138 monitoringStream_ << "Info: hash table creation will proceed using " 00139 << bitsPerSymbol_ << " bits per symbol.\n"; 00140 00141 sourceData_ = sequenceReader.getSourceDataType(); 00142 00143 if ( sourceData_ == gDNAData ) 00144 { 00145 monitoringStream_ 00146 << "Info: hash table will be created from DNA source data.\n"; 00147 } // ~if 00148 else if ( sourceData_ == gProteinData ) 00149 { 00150 monitoringStream_ 00151 << "Info: hash table will be created from protein source data.\n"; 00152 } // ~else 00153 else 00154 { 00155 monitoringStream_ 00156 << "Info: source data for hash table is of unknown type.\n"; 00157 } // ~else 00158 00159 00160 // check wordLength is sensible 00161 00162 int bitsUsedPerWord( wordLength * bitsPerSymbol_ ); 00163 00164 if ( (bitsUsedPerWord <= 0 ) || ( bitsUsedPerWord > gBitsPerWord ) ) 00165 // 4 base pairs storable per byte 00166 { 00167 monitoringStream_ << "Error: bits used per hash words (" 00168 << bitsUsedPerWord << ") outside valid range ( 1 to " 00169 << gBitsPerWord << ").\n"; 00170 throw SSAHAException("Invalid hash word length."); 00171 } // ~if 00172 00173 monitoringStream_ << "Info: " << wordLength 00174 << " symbols will be stored per hash word.\n"; 00175 00176 // check stepLength is sensible. If not, set it equal to the word length 00177 if ( ( stepLength > wordLength ) || ( stepLength <= 0 ) ) 00178 { 00179 monitoringStream_ << "Info: requested step length for hashing (" 00180 << stepLength << ") outside valid range ( 1 to " 00181 << wordLength << "), using " << wordLength 00182 << " instead.\n"; 00183 00184 stepLength = wordLength; 00185 } 00186 00187 wordLength_ = wordLength; 00188 stepLength_ = stepLength; 00189 maxNumHits_ = maxNumHits; 00190 00191 AdapterFactory factory; 00192 00193 SequenceAdapter* seq 00194 ( factory.create( bitsPerSymbol_, wordLength_, stepLength_ ) ); 00195 00196 00197 // numDifferentWords_ is the number of possible different Words for a 00198 // given word length ( equal to 2^(2*wordLength_) ) 00199 numDifferentWords_ = ( 1 << bitsUsedPerWord ); 00200 00201 setupPointerArray(); 00202 00203 00204 // Now start to extract the sequence information from the file 00205 00206 // sequenceReader.rewind(); 00207 00208 int numSeqs(countWordsAndGetNames( sequenceReader, seq )); 00209 00210 // Create the array of sequence sizes 00211 monitoringStream_ << "Allocating memory for pSequenceSizes_: " 00212 << numSeqs << " sequences, " 00213 << numSeqs * sizeof(SequenceOffset) 00214 << " bytes total ..." << endl; 00215 00216 pSequenceSizes_ = new SequenceOffset [ numSeqs ]; 00217 00218 if (!pSequenceSizes_) 00219 { 00220 throw SSAHAException("Memory allocation failed!"); 00221 } // ~if 00222 00223 monitoringStream_ << "... memory allocation OK" << endl; 00224 00225 computePointerArray(); 00226 00227 setupHitList(); 00228 00229 // Now go through the vector of WordSequences and fill in the table 00230 00231 hashAllWords( sequenceReader, seq, numSeqs ); 00232 00233 cleanupTempData(); 00234 00235 delete seq; 00236 // delete [] pHitsFoundSoFar_; 00237 00238 // Next flag is only set once everything is OK 00239 isInitialized_ = true; 00240 00241 } // ~HashTable::createHashTable( int wordLength, int maxNumHits )
Here is the call graph for this function:

Here is the caller graph for this function:

| int HashTableGeneric::countWordsAndGetNames | ( | SequenceReader & | sequenceReader, | |
| SequenceAdapter * | seq | |||
| ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 245 of file HashTableGeneric.cpp.
References countWords(), SequenceReader::getLastSequenceName(), SequenceReader::getNextSequence(), NameReaderLocal::lastName(), SequenceAdapter::link(), pNameReader_, SequenceReader::rewind(), and wordLength_.
Referenced by createHashTable().
00246 { 00247 00248 sequenceReader.rewind(); 00249 00250 // vector<WordSequence> wordSeqs; 00251 // const WordSequence dummy; // needed to get this to compile on LINUX ... 00252 // wordSeqs.push_back(dummy); 00253 WordSequence thisSeq; 00254 int numSeqs(0); 00255 00256 // sequenceNames_.push_back(new string); 00257 00258 NameReaderLocal* pReaderLocal = new NameReaderLocal; 00259 pNameReader_ = pReaderLocal; 00260 00261 while( sequenceReader.getNextSequence 00262 ( thisSeq, wordLength_) != -1 ) 00263 { 00264 numSeqs++; 00265 00266 sequenceReader.getLastSequenceName( pReaderLocal->lastName() ); 00267 // sequenceReader.getLastSequenceName( *(sequenceNames_.back()) ); 00268 00269 seq->link( thisSeq ); 00270 countWords(*seq); 00271 00272 // sequenceNames_.push_back( new string ); 00273 // wordSeqs.push_back(dummy); 00274 00275 } // ~while 00276 00277 // delete sequenceNames_.back(); 00278 // sequenceNames_.pop_back(); 00279 // wordSeqs.pop_back(); 00280 // cout << "XXX: " << name_ << numSeqs << endl; 00281 return numSeqs; 00282 } // ~HashTableGeneric::countWordsAndGetNames
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::setupPointerArray | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 321 of file HashTableGeneric.cpp.
References Allocator< T >::allocateAndZero(), monitoringStream_, numDifferentWords_, and pArrayAllocator_.
Referenced by createHashTable(), and HashTableTranslated::setupPointerArray().
00322 { 00323 00324 // Attempt to allocate memory for the list of table positions 00325 monitoringStream_ << "Allocating memory for pWordPositionInHitList_: " 00326 << numDifferentWords_ << " words, " 00327 << numDifferentWords_ * sizeof(PositionInHitList) 00328 << " bytes total ..." << endl; 00329 00330 00331 pArrayAllocator_->allocateAndZero 00332 (numDifferentWords_); 00333 00334 monitoringStream_ << "... done" << endl; 00335 00336 } // ~HashTableGeneric::setupPointerArray( void )
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::computePointerArray | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 338 of file HashTableGeneric.cpp.
References monitoringStream_, numDifferentWords_, pWordPositionInHitList_, and wordsInDatabase_.
Referenced by HashTableTranslated::computePointerArray(), and createHashTable().
00339 { 00340 unsigned long numClipped( 0 ); 00341 00342 // Accumulate to get position of hit list in final array 00343 for ( unsigned int i(1) ; i < numDifferentWords_ ; i ++ ) 00344 { 00345 pWordPositionInHitList_[i] += pWordPositionInHitList_[i-1]; 00346 } // ~for 00347 00348 wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1]; 00349 00350 monitoringStream_ << numClipped 00351 << " words will be discarded (" 00352 << (float)(100*numClipped) 00353 /(float)(wordsInDatabase_+numClipped) 00354 << " \% of total)." << endl; 00355 00356 00357 } // ~HashTableGeneric::computePointerArray( void )
Here is the caller graph for this function:

| void HashTableGeneric::setupHitList | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 360 of file HashTableGeneric.cpp.
References Allocator< T >::allocateAndZero(), allocateHitList(), Allocator< T >::clone(), getHitTypeSize(), monitoringStream_, name_, numDifferentWords_, pArrayAllocator_, pHitsFoundSoFar_, tempAlloc_, and wordsInDatabase_.
Referenced by createHashTable(), and HashTableTranslated::setupHitList().
00361 { 00362 00363 // Attempt to allocate memory for the table itself 00364 monitoringStream_ << "Allocating memory for pHitListForAllWords_: " 00365 << wordsInDatabase_ << " words, " 00366 << wordsInDatabase_ * getHitTypeSize() 00367 << " bytes total ..." << endl; 00368 00369 allocateHitList(wordsInDatabase_); 00370 00371 monitoringStream_ << "... memory allocation OK" << endl; 00372 00373 // Attempt to allocate temporary memory to store hits so far for each word 00374 monitoringStream_ << "Allocating memory for pHitsFoundSoFar_: " 00375 << numDifferentWords_ << " words, " 00376 << numDifferentWords_ * sizeof(unsigned short) 00377 << " bytes total ..." << endl; 00378 00379 00380 00381 // Allocator<PositionInHitList>* tempAlloc = 00382 tempAlloc_ = 00383 pArrayAllocator_->clone(&pHitsFoundSoFar_, 00384 name_+(string)".temp", 00385 monitoringStream_); 00386 // ... ensures uses same allocation method as for ptr array 00387 tempAlloc_->allocateAndZero( numDifferentWords_ ); 00388 00389 monitoringStream_ << "... done" << endl; 00390 00391 } // ~HashTableGeneric::setupHitList( void )
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::hashAllWords | ( | SequenceReader & | sequenceReader, | |
| SequenceAdapter * | seq, | |||
| int | numSeqs | |||
| ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 285 of file HashTableGeneric.cpp.
References SequenceReader::getNextSequence(), hashWords(), SequenceAdapter::link(), pSequenceSizes_, SequenceReader::rewind(), and wordLength_.
Referenced by createHashTable().
00286 { 00287 00288 int numWords(0); 00289 sequenceReader.rewind(); 00290 WordSequence thisSeq; 00291 00292 // NB sequences are numbered 1...n not 0...n-1 00293 for ( unsigned int i(1); i <= numSeqs ; i++ ) 00294 { 00295 if( sequenceReader.getNextSequence( thisSeq, wordLength_) == -1 ) 00296 { 00297 throw SSAHAException 00298 ("Sequence source data changed during hash table creation!"); 00299 } 00300 00301 // fill in the word count 00302 numWords = (int) thisSeq.size(); 00303 pSequenceSizes_[i-1] = ( numWords > 0 ) 00304 ? ( (numWords-1) * wordLength_ ) + thisSeq.getNumBasesInLast() 00305 : 0; 00306 00307 // do the hashing 00308 seq->link(thisSeq); 00309 hashWords( *seq,i ); 00310 00311 00312 } // ~for thisSeq 00313 00314 00315 } // ~HashTableGeneric::hashAllWords
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::cleanupTempData | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 393 of file HashTableGeneric.cpp.
References tempAlloc_.
Referenced by HashTableTranslated::cleanupTempData(), and createHashTable().
00394 { 00395 delete tempAlloc_; // automatically deallocs pHitsFoundSoFar 00396 } // ~HashTableGeneric::cleanupTempData( void )
Here is the caller graph for this function:

| virtual void HashTableGeneric::hashWords | ( | SequenceAdapter & | thisSeq, | |
| SequenceNumber | seqNum | |||
| ) | [pure virtual] |
Implemented in HashTable, HashTableView< T, SUBCLASS >, HashTablePacked, HashTableTranslated, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Referenced by hashAllWords().
Here is the caller graph for this function:

| virtual void HashTableGeneric::countWords | ( | SequenceAdapter & | thisSeq | ) | [pure virtual] |
Implemented in HashTable, HashTableView< T, SUBCLASS >, HashTablePacked, HashTableTranslated, HashTableTest, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Referenced by countWordsAndGetNames().
Here is the caller graph for this function:

| virtual void HashTableGeneric::matchSequence | ( | WordSequence & | seq, | |
| HitList & | hitListFwd | |||
| ) | [pure virtual] |
Implemented in HashTable, HashTablePacked, HashTablePackedProtein, and HashTableTranslated.
Referenced by MatchAlgorithm::generateHits().
Here is the caller graph for this function:

| virtual void HashTableGeneric::setNumRepeats | ( | int | nr | ) | [pure virtual] |
Implemented in HashTable, HashTablePacked, and HashTableTranslated.
Referenced by MatchAlgorithm::generateHits().
Here is the caller graph for this function:

| virtual void HashTableGeneric::setSubstituteThreshold | ( | int | ns | ) | [inline, virtual] |
Reimplemented in HashTablePacked, and HashTableTranslated.
Definition at line 228 of file HashTableGeneric.h.
Referenced by processQuery().
Here is the caller graph for this function:

| virtual char* HashTableGeneric::getHitListStart | ( | void | ) | const [pure virtual] |
| virtual int HashTableGeneric::getHitTypeSize | ( | void | ) | const [pure virtual] |
Implemented in HashTableView< T, SUBCLASS >, HashTableTranslated, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Referenced by setupHitList().
Here is the caller graph for this function:

| virtual void HashTableGeneric::allocateHitList | ( | unsigned long | size | ) | [pure virtual] |
Implemented in HashTableView< T, SUBCLASS >, HashTableTranslated, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Referenced by setupHitList().
Here is the caller graph for this function:

| virtual void HashTableGeneric::loadHitList | ( | unsigned long | size | ) | [pure virtual] |
| virtual void HashTableGeneric::saveHitList | ( | void | ) | [pure virtual] |
Implemented in HashTableView< T, SUBCLASS >, HashTablePacked, HashTableTranslated, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Referenced by saveHashTable().
Here is the caller graph for this function:

| void HashTableGeneric::loadSequenceNames | ( | void | ) | [virtual] |
Definition at line 651 of file HashTableGeneric.cpp.
References bitsPerSymbol_, gDNAData, gProteinData, hitListFormat_, NameReader::loadSequenceNames(), monitoringStream_, name_, pNameReader_, NameReader::size(), sourceData_, stepLength_, and wordLength_.
Referenced by HashTableTranslated::loadHashTable().
00652 { 00653 00654 ifstreamSSAHA nameFile( ( name_ + ".name" ).c_str() ); 00655 00656 if ( nameFile.fail() ) 00657 { 00658 monitoringStream_ << "Error: failed to open " 00659 << name_ << ".name, aborting load." << endl; 00660 throw SSAHAException("Could not open .name file"); 00661 } // ~if 00662 00663 string hashParams; 00664 00665 getline( nameFile, hashParams ); 00666 00667 std::string::size_type pos(0), prev_pos(0); 00668 00669 pos=hashParams.find_first_of(' ',pos); 00670 if (pos == std::string::npos) 00671 { 00672 monitoringStream_ << "Error: could not parse " 00673 << name_ << ".name, aborting load." << endl; 00674 throw SSAHAException("Could not parse .name file"); 00675 } // ~if 00676 00677 if ( hitListFormat_ 00678 != (HitListFormatType)atoi( hashParams.substr 00679 (prev_pos, pos-prev_pos).c_str() ) ) 00680 { 00681 monitoringStream_ 00682 << "Error: saved data is not compatible with this hash table" << endl; 00683 throw SSAHAException("Tried to load incompatible data into hash table"); 00684 } // ~if 00685 00686 prev_pos = ++pos; 00687 00688 pos=hashParams.find_first_of(' ',pos); 00689 wordLength_ 00690 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str()); 00691 prev_pos = ++pos; 00692 00693 pos=hashParams.find_first_of(' ',pos); 00694 stepLength_ 00695 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str()); 00696 prev_pos = ++pos; 00697 00698 pos=hashParams.find_first_of(' ',pos); 00699 bitsPerSymbol_ 00700 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str()); 00701 prev_pos = ++pos; 00702 00703 pos=hashParams.find_first_of(' ',pos); 00704 sourceData_ = (SourceDataType) 00705 atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str()); 00706 prev_pos = ++pos; 00707 00708 // sequenceNames_.push_back( new string ); 00709 00710 00711 00712 // while( getline( nameFile,*(sequenceNames_.back() ) ) ) 00713 // { 00714 // sequenceNames_.push_back( new string ); 00715 // } // ~while 00716 00717 // sequenceNames_.pop_back(); 00718 00719 pNameReader_->loadSequenceNames( nameFile ); 00720 00721 monitoringStream_ << "Loaded file " << name_ << ".name (" 00722 << pNameReader_->size() 00723 << " sequence names).\n"; 00724 00725 monitoringStream_ << "Closing .name file\n"; 00726 nameFile.close(); 00727 00728 monitoringStream_ << "Loaded in word length of " 00729 << wordLength_ << " bases.\n"; 00730 00731 monitoringStream_ << "Loaded in step length of " 00732 << stepLength_ << " bases.\n"; 00733 00734 monitoringStream_ << "Hash table format uses " << bitsPerSymbol_ 00735 << " bits per symbol.\n"; 00736 00737 monitoringStream_ << "Hash table was created from "; 00738 00739 switch (sourceData_) 00740 { 00741 case gDNAData: 00742 monitoringStream_ << "DNA"; break; 00743 case gProteinData: 00744 monitoringStream_ << "protein"; break; 00745 default: 00746 monitoringStream_ << "unknown"; break; 00747 } 00748 00749 monitoringStream_ << " data.\n"; 00750 00751 00752 00753 00754 } // ~HashTableGeneric::loadSequenceNames( void )
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::saveSequenceNames | ( | void | ) | [virtual] |
Definition at line 797 of file HashTableGeneric.cpp.
References bitsPerSymbol_, hitListFormat_, monitoringStream_, name_, pNameReader_, NameReader::saveSequenceNames(), sourceData_, stepLength_, and wordLength_.
Referenced by saveHashTable().
00798 { 00799 00800 ofstreamSSAHA nameFile( ( name_ + ".name" ).c_str() ); 00801 00802 nameFile << hitListFormat_ << " " << wordLength_ << " " 00803 << stepLength_ << " " << bitsPerSymbol_ 00804 << " " << sourceData_ << " " << endl; 00805 00806 00807 pNameReader_->saveSequenceNames( nameFile ); 00808 00809 // for ( vector<string*>::iterator i = sequenceNames_.begin() ; 00810 // i != sequenceNames_.end() ; i ++ ) 00811 // { 00812 // nameFile << **i << endl; 00813 // } 00814 00815 if ( nameFile.fail() ) 00816 { 00817 monitoringStream_ << "Error: failed to write " 00818 << name_ << ".name" << endl; 00819 throw SSAHAException("Problem saving .name file"); 00820 } // ~if 00821 00822 nameFile.close(); 00823 00824 monitoringStream_ << "Saved file " << name_ << ".name\n"; 00825 00826 00827 } // ~HashTableGeneric::saveSequenceNames( void )
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::loadHashTable | ( | SourceReaderIndex * | pSourceReader = NULL |
) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 552 of file HashTableGeneric.cpp.
References loadFromFile(), HashTableFactory::monitoringStream_, and pSourceReader.
Referenced by HashTableTranslated::loadHashTable(), HashTableFactory::loadHashTable(), and loadTable().
00553 { 00554 00555 monitoringStream_ << "HashTable::loadHashTable" << endl; 00556 00557 if ( isInitialized_ == true ) 00558 { 00559 monitoringStream_ << "Error: hash table cannot be loaded because class" 00560 << " has already been initialized." << endl; 00561 return; 00562 } // ~if 00563 00564 // HashTableComponents have a dummy name reader set up already 00565 // so don't need the next bit, hence the next if condition 00566 if (pNameReader_==NULL) 00567 { 00568 if (pSourceReader==NULL) 00569 { 00570 monitoringStream_ << "Info: sequence names will be held in local memory" 00571 << endl; 00572 pNameReader_ = new NameReaderLocal; 00573 } // ~if 00574 else 00575 { 00576 monitoringStream_ << "Info: sequence names will be read from file" 00577 << endl; 00578 pNameReader_ = new NameReaderIndex(*pSourceReader); 00579 } // ~else 00580 } // ~if 00581 // Now load all the sequence names 00582 00583 loadSequenceNames(); 00584 00585 unsigned long numSeqs = pNameReader_->size(); 00586 00587 // numDifferentWords_ = 1 << ( bitsPerSymbol_ * wordLength_ ); 00588 00589 00590 int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ ); 00591 numDifferentWords_ = ( 1 << bitsUsedPerWord ); 00592 unsigned long headSize = numDifferentWords_ * sizeof(PositionInHitList); 00593 00594 // Attempt to allocate memory for the list of table positions 00595 monitoringStream_ << "Allocating memory for pWordPositionInHitList_: " 00596 << numDifferentWords_ << " words, " 00597 << headSize 00598 << " bytes total ..." << endl; 00599 00600 00601 // pArrayAllocator_->allocate(headSize); 00602 pArrayAllocator_->load(numDifferentWords_); 00603 // loadFromFile 00604 // ( 00605 // name_+(string)".head", 00606 // (char*)pWordPositionInHitList_, 00607 // headSize 00608 // ); 00609 00610 00611 wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1]; 00612 // cout << pWordPositionInHitList_[numDifferentWords-1] << endl; 00613 00614 unsigned long bodySize 00615 = wordsInDatabase_ * getHitTypeSize(); 00616 00617 monitoringStream_ << "Allocating memory for pHitListForAllWords_: " 00618 << wordsInDatabase_ << " words, " 00619 << bodySize 00620 << " bytes total ..." << endl; 00621 00622 loadHitList(wordsInDatabase_); 00623 00624 // Now load all the sequence sizes 00625 00626 // Don't need a .size file for HashTableTranslated .... 00627 if (numSeqs != 0 ) 00628 { 00629 monitoringStream_ << "Allocating memory for pSequenceSizes_: " 00630 << numSeqs << " sequences, " 00631 << numSeqs*sizeof(SequenceOffset) 00632 << " bytes total ...\n"; 00633 00634 pSequenceSizes_ = new SequenceOffset [ numSeqs ]; 00635 00636 if (!pSequenceSizes_) 00637 { 00638 throw SSAHAException("Memory allocation failed!"); 00639 } // ~if 00640 00641 loadFromFile(name_+(string)".size", (char*)pSequenceSizes_, 00642 numSeqs * sizeof( SequenceOffset ), 00643 monitoringStream_ ); 00644 } // ~if 00645 00646 isInitialized_ = true; // set once all loading has been done 00647 monitoringStream_ << "Hash table loading complete.\n"; 00648 00649 } // ~loadHashTable
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::saveHashTable | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 760 of file HashTableGeneric.cpp.
References isInitialized_, monitoringStream_, name_, pArrayAllocator_, pNameReader_, pSequenceSizes_, Allocator< T >::save(), saveHitList(), saveSequenceNames(), saveToFile(), and NameReader::size().
Referenced by HashTableTranslated::saveHashTable(), and HashTableFactory::saveHashTable().
00761 { 00762 00763 monitoringStream_ << "HashTableFactory::saveHashTable" << endl; 00764 00765 if ( isInitialized_ == false ) 00766 { 00767 monitoringStream_ << "Error: hash table cannot be saved because it was" 00768 << " was not successfully created." << endl; 00769 throw SSAHAException("Tried to save uninitialized hash table"); 00770 } // ~if 00771 00772 // Save sequence names 00773 00774 saveSequenceNames(); 00775 00776 00777 // int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ ); 00778 // unsigned long numDifferentWords ( 1 << bitsUsedPerWord ); 00779 00780 pArrayAllocator_->save(); 00781 // pHitListAllocator_->save(); 00782 00783 saveHitList(); 00784 00785 // if (sequenceNames_.size() != 0) 00786 if (pNameReader_->size() != 0) 00787 saveToFile( name_+(string)".size", 00788 (char*)pSequenceSizes_, 00789 pNameReader_->size() * sizeof( SequenceOffset ), 00790 monitoringStream_ ); 00791 00792 00793 00794 } // ~saveHashTable
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::getSequenceName | ( | string & | seqName, | |
| SequenceNumber | seqNum | |||
| ) | const |
Reimplemented in HashTableTest.
Definition at line 408 of file HashTableGeneric.cpp.
References NameReader::getSequenceName(), monitoringStream_, and pNameReader_.
Referenced by getSubjectName(), and main().
00409 { 00410 pNameReader_->getSequenceName( seqName, seqNum ); 00411 #ifdef XXX 00412 // NB sequenceNames_ are numbered 0 ... n-1 00413 // but we are numbering sequences 1 ... n 00414 if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() )) 00415 { 00416 seqName = "No sequence name stored for sequence."; 00417 monitoringStream_ 00418 << "Warning - no sequence name for requested sequence number (" 00419 << seqNum << ").\n"; 00420 00421 return; 00422 } // ~if 00423 00424 seqName = *(sequenceNames_[seqNum - 1]); 00425 return; 00426 #endif 00427 } // ~HashTable::getSequenceName
Here is the call graph for this function:

Here is the caller graph for this function:

| const char * HashTableGeneric::getSequenceName | ( | SequenceNumber | seqNum | ) | const |
Definition at line 438 of file HashTableGeneric.cpp.
References NameReader::getSequenceName(), monitoringStream_, and pNameReader_.
00439 { 00440 return pNameReader_->getSequenceName( seqNum ); 00441 #ifdef XXX 00442 // NB sequenceNames_ are numbered 0 ... n-1 00443 // but we are numbering sequences 1 ... n 00444 if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() )) 00445 { 00446 monitoringStream_ 00447 << "Warning - no sequence name for requested sequence number (" 00448 << seqNum << ").\n"; 00449 return NULL; 00450 } // ~if 00451 00452 return sequenceNames_[seqNum - 1]->c_str(); 00453 #endif 00454 } // ~HashTable::getSequenceName
Here is the call graph for this function:

| SequenceOffset HashTableGeneric::getSequenceSize | ( | SequenceNumber | seqNum | ) | const |
Definition at line 468 of file HashTableGeneric.cpp.
References monitoringStream_, pNameReader_, pSequenceSizes_, and NameReader::size().
Referenced by main(), MatchAdderProteinCodon::operator()(), and MatchAdderCodonCodon::operator()().
00469 { 00470 // NB sequenceNames_ are numbered 0 ... n-1 00471 // but we are numbering sequences 1 ... n 00472 if ((seqNum <= 0 )||( seqNum > pNameReader_->size() )) 00473 { 00474 monitoringStream_ 00475 << "Warning - no sequence size for requested sequence number (" 00476 << seqNum << ").\n"; 00477 00478 return 0; 00479 } // ~if 00480 else return pSequenceSizes_[seqNum - 1]; 00481 00482 } // ~HashTable::getSequenceName
Here is the call graph for this function:

Here is the caller graph for this function:

| void HashTableGeneric::printHashStats | ( | void | ) | [virtual] |
Reimplemented in HashTableTranslated.
Definition at line 830 of file HashTableGeneric.cpp.
References bitsPerSymbol_, gBaseBits, gBaseNames, gNumCodonEncodings, gResidueBits, gResidueNames, isInitialized(), TopList::push_back(), pWordPositionInHitList_, wordLength_, and wordsInDatabase_.
Referenced by HashTableTranslated::printHashStats(), and processQuery().
00831 { 00832 cout << "Analyzing data in hash table...\n"; 00833 00834 static const double thresholds[] = 00835 { 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 00836 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 00837 0.001, 0.0001, 0.00001, 00838 0 }; 00839 00840 00841 if (!isInitialized()) 00842 { 00843 cout << "Hash table has not been initialized.\n"; 00844 return; 00845 } // ~if 00846 00847 const char* pSymbols; 00848 00849 cout << "Word length: " << wordLength_ << " bases.\n"; 00850 cout << "Bits per symbol: " << bitsPerSymbol_ << endl; 00851 00852 unsigned long numDifferentWords; 00853 00854 // if ( (pHashTable->getHitListFormat()==gTranslated) 00855 // || (pHashTable->getHitListFormat()==g32BitPackedProtein) ) 00856 if (bitsPerSymbol_==gResidueBits) 00857 { 00858 numDifferentWords= (unsigned long)pow((double)gNumCodonEncodings, 00859 (int)wordLength_); 00860 pSymbols = gResidueNames; 00861 } // ~if 00862 else 00863 { 00864 assert(bitsPerSymbol_==gBaseBits); 00865 numDifferentWords= 1<<(2*wordLength_); 00866 pSymbols = gBaseNames; 00867 } // ~else 00868 00869 00870 // int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ ); 00871 // unsigned long numDifferentWords ( 1 << bitsUsedPerWord ); 00872 cout << "There are " << numDifferentWords 00873 << " different possible words of this length.\n"; 00874 cout << "There are " << wordsInDatabase_ 00875 << " words in this hash table.\n"; 00876 00877 00878 int numCommon(240); 00879 00880 TopList mostCommon(numCommon, wordLength_, bitsPerSymbol_); 00881 00882 map<int, int> histogram; 00883 vector<Word> neverOccurred; 00884 00885 00886 PositionInHitList last(0); 00887 int numOccs; 00888 00889 for ( unsigned long i(0); i < numDifferentWords ; ++i ) 00890 { 00891 numOccs = pWordPositionInHitList_[i]-last; 00892 histogram[numOccs]++; 00893 if (numOccs==0) neverOccurred.push_back((Word)i); 00894 00895 mostCommon.push_back( pair<int, Word>(numOccs, (Word)i ) ); 00896 last = pWordPositionInHitList_[i]; 00897 } 00898 00899 cout << neverOccurred.size() << " words never appeared.\n"; 00900 if (neverOccurred.size() < 200) 00901 { 00902 cout << "They are:\n"; 00903 for (vector<Word>::iterator i(neverOccurred.begin()); 00904 i!=neverOccurred.end();i++) 00905 // cout << printWord( *i, wordLength_ ) << endl; 00906 cout 00907 << PrintFromWord( *i, wordLength_, bitsPerSymbol_, pSymbols ) 00908 << endl; 00909 } else cout << "Not showing them - more than 200!\n"; 00910 00911 cout << "The " << numCommon << " most common words are:\n"; 00912 00913 cout << mostCommon; 00914 00915 unsigned long total(0), totalWordsUsed(0); 00916 int thresholdNum(0); 00917 unsigned int n(0); 00918 00919 for ( map<int, int>::iterator i( histogram.begin()); 00920 i != histogram.end() ; ++i ) 00921 { 00922 total += (i->first*i->second); 00923 totalWordsUsed += i->second; 00924 if (n++<2000) 00925 { 00926 cout << i->second << " words occurred " << i->first 00927 << " times in the hash table (" 00928 << 100.0*total/(double)wordsInDatabase_ 00929 << "/" 00930 << 100.0*totalWordsUsed/(double)numDifferentWords 00931 << " cumulative percent).\n"; 00932 } 00933 00934 00935 00936 00937 while ( ( (double)(wordsInDatabase_-total)/(double)wordsInDatabase_ ) 00938 < thresholds[thresholdNum] ) 00939 { 00940 cout << "Threshold " << i->first << " retains " 00941 << 100.0*(1.0-thresholds[thresholdNum++]) 00942 << "\% of words.\n"; 00943 } // ~while 00944 } // ~for 00945 00946 cout << "Total: " << total << "\n"; 00947 00948 cout << "... analysis complete.\n"; 00949 00950 } // ~void HashTableGeneric::printHashStats( void )
Here is the call graph for this function:

Here is the caller graph for this function:

| bool HashTableGeneric::isInitialized | ( | ) | const [inline] |
Definition at line 305 of file HashTableGeneric.h.
References isInitialized_.
Referenced by HashTableFactory::loadHashTable(), main(), printHashStats(), and processQuery().
00305 { return isInitialized_; }
Here is the caller graph for this function:

| int HashTableGeneric::getWordLength | ( | void | ) | const [inline] |
Definition at line 306 of file HashTableGeneric.h.
References wordLength_.
Referenced by generateHashTable(), SequenceReader::getNextSequence(), SequenceReader::getSequence(), loadTable(), main(), MatchPolicyDNATranslated::operator()(), MatchPolicyProteinTranslated::operator()(), MatchPolicyDNAProtein::operator()(), MatchPolicyDNADNA::operator()(), processQuery(), and WordSequenceShifted::WordSequenceShifted().
00306 { return wordLength_; }
Here is the caller graph for this function:

| int HashTableGeneric::getStepLength | ( | ) | const [inline] |
Definition at line 307 of file HashTableGeneric.h.
References stepLength_.
Referenced by processQuery().
00307 { return stepLength_; }
Here is the caller graph for this function:

| SequenceNumber HashTableGeneric::getNumSequences | ( | ) | const |
Definition at line 456 of file HashTableGeneric.cpp.
References pNameReader_, and NameReader::size().
Referenced by HashTablePacked::loadHitList(), and main().
00457 { 00458 return (pNameReader_ == NULL) ? 0 : pNameReader_->size(); 00459 } // ~SequenceNumber HashTableGeneric::getNumSequences() const
Here is the call graph for this function:

Here is the caller graph for this function:

| virtual int HashTableGeneric::getMaxNumHits | ( | ) | const [inline, virtual] |
Reimplemented in HashTableTranslated.
Definition at line 310 of file HashTableGeneric.h.
References maxNumHits_.
Referenced by HashTableTranslated::getMaxNumHits(), main(), and HashTableTranslated::setMaxNumHits().
00310 { return maxNumHits_; }
Here is the caller graph for this function:

| virtual void HashTableGeneric::setMaxNumHits | ( | int | mnh | ) | [inline, virtual] |
Reimplemented in HashTableTranslated.
Definition at line 311 of file HashTableGeneric.h.
References maxNumHits_.
Referenced by generateHashTable(), loadTable(), main(), processQuery(), and HashTableTranslated::setMaxNumHits().
00311 { maxNumHits_ = mnh; }
Here is the caller graph for this function:

| int HashTableGeneric::getBitsPerSymbol | ( | void | ) | const [inline] |
Definition at line 312 of file HashTableGeneric.h.
References bitsPerSymbol_.
Referenced by generateHashTable(), processQuery(), QueryManager::QueryManager(), and WordSequenceShifted::WordSequenceShifted().
00312 { return bitsPerSymbol_; }
Here is the caller graph for this function:

| SourceDataType HashTableGeneric::getSourceDataType | ( | void | ) | const [inline] |
Definition at line 313 of file HashTableGeneric.h.
References sourceData_.
Referenced by generateHashTable(), and processQuery().
00313 { return sourceData_; }
Here is the caller graph for this function:

| HitListFormatType HashTableGeneric::getHitListFormat | ( | void | ) | const [inline] |
Definition at line 314 of file HashTableGeneric.h.
References hitListFormat_.
Referenced by generateHashTable(), and processQuery().
00315 { return hitListFormat_; }
Here is the caller graph for this function:

| virtual unsigned long HashTableGeneric::getTotalNumWords | ( | void | ) | const [inline, virtual] |
Reimplemented in HashTableTranslated.
Definition at line 316 of file HashTableGeneric.h.
References wordsInDatabase_.
Referenced by generateHashTable(), HashTableTranslated::getTotalNumWords(), and processQuery().
00317 { return wordsInDatabase_; }
Here is the caller graph for this function:

| const NameReader& HashTableGeneric::getNameReader | ( | ) | const [inline] |
Definition at line 318 of file HashTableGeneric.h.
References pNameReader_.
Referenced by QueryManager::doQuery().
00318 { return *pNameReader_; }
Here is the caller graph for this function:

| HashTableGeneric& HashTableGeneric::operator= | ( | const HashTableGeneric & | ) | [private] |
friend class HashTableFactory [friend] |
Reimplemented in HashTableView< T, SUBCLASS >, HashTableView< PositionPacked, HashTablePacked >, and HashTableView< PositionInDatabase, HashTable >.
Definition at line 174 of file HashTableGeneric.h.
Definition at line 214 of file HashTableGeneric.h.
Referenced by cleanupTempData(), and setupHitList().
string HashTableGeneric::name_ [protected] |
Definition at line 340 of file HashTableGeneric.h.
Referenced by HashTableTranslated::loadHashTable(), HashTablePacked::loadHitList(), loadSequenceNames(), saveHashTable(), HashTablePacked::saveHitList(), saveSequenceNames(), and setupHitList().
bool HashTableGeneric::isInitialized_ [protected] |
Definition at line 344 of file HashTableGeneric.h.
Referenced by HashTableTranslated::cleanupTempData(), createHashTable(), isInitialized(), HashTableTranslated::loadHashTable(), saveHashTable(), ~HashTableGeneric(), and HashTableView< PositionInDatabase, HashTable >::~HashTableView().
int HashTableGeneric::wordLength_ [protected] |
Definition at line 350 of file HashTableGeneric.h.
Referenced by HashTableTranslated::countWordsAndGetNames(), countWordsAndGetNames(), createHashTable(), getWordLength(), hashAllWords(), HashTableTest::HashTableTest(), HashTableTranslated::loadHashTable(), loadSequenceNames(), HashTablePacked::matchSequenceRepeated(), HashTable::matchSequenceRepeated(), HashTablePacked::matchSequenceStandard(), HashTable::matchSequenceStandard(), HashTableTranslated::matchSequenceTranslatedDNA(), HashTablePackedProtein::matchSequenceTranslatedDNA(), HashTableView< PositionInDatabase, HashTable >::matchWord(), HashTablePacked::matchWordSubstitute(), printHashStats(), saveSequenceNames(), SequenceAdapterWithOverlap::SequenceAdapterWithOverlap(), and HashTableTranslated::setupPointerArray().
int HashTableGeneric::stepLength_ [protected] |
Definition at line 356 of file HashTableGeneric.h.
Referenced by HashTablePackedProtein::convertHits(), HashTableComponent::convertHits(), HashTablePacked::convertHits(), HashTablePackedCustom::convertHits(), createHashTable(), getStepLength(), HashTable::hashWords(), loadSequenceNames(), HashTablePacked::matchSequenceRepeated(), HashTable::matchSequenceRepeated(), saveSequenceNames(), HashTablePacked::setNumRepeats(), HashTable::setNumRepeats(), and HashTableTranslated::setupPointerArray().
int HashTableGeneric::maxNumHits_ [protected] |
Definition at line 363 of file HashTableGeneric.h.
Referenced by createHashTable(), HashTableView< PositionInDatabase, HashTable >::end(), getMaxNumHits(), setMaxNumHits(), and HashTablePacked::setSubstituteThreshold().
int HashTableGeneric::bitsPerSymbol_ [protected] |
Definition at line 368 of file HashTableGeneric.h.
Referenced by createHashTable(), getBitsPerSymbol(), HashTableComponent::HashTableComponent(), HashTableFred::HashTableFred(), HashTablePackedProtein::HashTablePackedProtein(), HashTableTranslated::HashTableTranslated(), loadSequenceNames(), HashTablePacked::matchSequenceStandard(), HashTable::matchSequenceStandard(), printHashStats(), saveSequenceNames(), SequenceAdapterWithOverlap::SequenceAdapterWithOverlap(), and HashTableTranslated::setupPointerArray().
SourceDataType HashTableGeneric::sourceData_ [protected] |
Definition at line 373 of file HashTableGeneric.h.
Referenced by createHashTable(), getSourceDataType(), loadSequenceNames(), saveSequenceNames(), and HashTableTranslated::setupPointerArray().
HitListFormatType HashTableGeneric::hitListFormat_ [protected] |
Definition at line 378 of file HashTableGeneric.h.
Referenced by getHitListFormat(), HashTable::HashTable(), HashTableComponent::HashTableComponent(), HashTablePacked::HashTablePacked(), HashTablePackedProtein::HashTablePackedProtein(), HashTableTranslated::HashTableTranslated(), loadSequenceNames(), and saveSequenceNames().
ostream& HashTableGeneric::monitoringStream_ [protected] |
Definition at line 383 of file HashTableGeneric.h.
Referenced by computePointerArray(), createHashTable(), getSequenceName(), getSequenceSize(), HashTable::HashTable(), HashTableComponent::HashTableComponent(), HashTableFred::HashTableFred(), HashTablePacked::HashTablePacked(), HashTablePackedProtein::HashTablePackedProtein(), HashTableTranslated::HashTableTranslated(), HashTableView< PositionInDatabase, HashTable >::HashTableView(), HashTableTranslated::loadHashTable(), HashTablePacked::loadHitList(), loadSequenceNames(), saveHashTable(), HashTableTranslated::saveHitList(), saveSequenceNames(), setupHitList(), setupPointerArray(), ~HashTableGeneric(), and HashTableView< PositionInDatabase, HashTable >::~HashTableView().
unsigned long HashTableGeneric::wordsInDatabase_ [protected] |
Definition at line 387 of file HashTableGeneric.h.
Referenced by computePointerArray(), getTotalNumWords(), printHashStats(), and setupHitList().
unsigned long HashTableGeneric::numDifferentWords_ [protected] |
Definition at line 391 of file HashTableGeneric.h.
Referenced by computePointerArray(), createHashTable(), setupHitList(), HashTableTranslated::setupPointerArray(), and setupPointerArray().
PositionInHitList* HashTableGeneric::pHitsFoundSoFar_ [protected] |
Definition at line 395 of file HashTableGeneric.h.
Referenced by HashTablePacked::hashWords(), HashTable::hashWords(), and setupHitList().
Definition at line 402 of file HashTableGeneric.h.
Referenced by HashTableView< PositionInDatabase, HashTable >::begin(), computePointerArray(), HashTablePacked::countWords(), HashTable::countWords(), HashTablePacked::hashWords(), HashTable::hashWords(), HashTableView< PositionInDatabase, HashTable >::last(), and printHashStats().
Allocator<PositionInHitList>* HashTableGeneric::pArrayAllocator_ [protected] |
Definition at line 405 of file HashTableGeneric.h.
Referenced by saveHashTable(), setupHitList(), setupPointerArray(), and ~HashTableGeneric().
NameReader* HashTableGeneric::pNameReader_ [protected] |
Definition at line 412 of file HashTableGeneric.h.
Referenced by HashTableTranslated::countWordsAndGetNames(), countWordsAndGetNames(), getNameReader(), getNumSequences(), getSequenceName(), getSequenceSize(), HashTableComponent::HashTableComponent(), HashTableTranslated::loadHashTable(), loadSequenceNames(), saveHashTable(), saveSequenceNames(), and ~HashTableGeneric().
SequenceOffset* HashTableGeneric::pSequenceSizes_ [protected] |
Definition at line 416 of file HashTableGeneric.h.
Referenced by createHashTable(), getSequenceSize(), HashTableTranslated::hashAllWords(), hashAllWords(), HashTableTranslated::loadHashTable(), saveHashTable(), and ~HashTableGeneric().
1.5.2