00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "HashTableGeneric.h"
00034 #include "HashTable.h"
00035 #include "HashTablePacked.h"
00036 #include "HashTableTranslated.h"
00037 #include "SequenceReader.h"
00038
00039 #include "GlobalDefinitions.h"
00040 #include <fstream>
00041 #include <algorithm>
00042 #include <map>
00043 #include <cmath>
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053 enum
00054 {
00055 defaultMaxNumHits=10000
00056 };
00057
00058
00059
00060
00061
00062 HashTableGeneric::HashTableGeneric
00063 ( ostream& monitoringStream,
00064 const string& name,
00065 Allocator<PositionInHitList>& arrayAllocator ) :
00066 isInitialized_( false ),
00067 monitoringStream_( monitoringStream ),
00068 name_( name ),
00069 bitsPerSymbol_( gBaseBits ),
00070 maxNumHits_( defaultMaxNumHits ),
00071 hitListFormat_( gNotSpecified ),
00072 pArrayAllocator_
00073 ( arrayAllocator.clone(&pWordPositionInHitList_,
00074 name+(string)".head",
00075 monitoringStream_) ),
00076 pSequenceSizes_(NULL),
00077 pNameReader_(NULL)
00078 {
00079 monitoringStream_ << "constructing HashTableGeneric\n";
00080 if (name_=="")
00081 {
00082 char buf[50];
00083 static int numTables(0);
00084 sprintf(buf,"SSAHAHashTable-%d-%d",getpid(),++numTables);
00085 name_=(string)buf;
00086 }
00087
00088 }
00089
00090
00091
00092
00093
00094 HashTableGeneric::~HashTableGeneric()
00095 {
00096 monitoringStream_ << "destructing HashTableGeneric ...\n";
00097 if ( isInitialized_ )
00098 {
00099 monitoringStream_ << "... deallocating memory\n";
00100
00101
00102 delete pArrayAllocator_;
00103
00104
00105
00106
00107 delete [] pSequenceSizes_;
00108
00109
00110 delete pNameReader_;
00111 }
00112 else
00113 {
00114 monitoringStream_
00115 << "... never initialized, no memory deallocation required\n";
00116 }
00117 }
00118
00119 void HashTableGeneric::createHashTable
00120 ( SequenceReader& sequenceReader, int wordLength, int maxNumHits,
00121 int stepLength )
00122 {
00123
00124 monitoringStream_ << "HashTable::createHashTable" << endl;
00125
00126
00127
00128 if (bitsPerSymbol_ != sequenceReader.getBitsPerSymbol() )
00129 {
00130 monitoringStream_ << "Info: reader encodes using "
00131 << sequenceReader.getBitsPerSymbol()
00132 << " bits per symbol, hash table expects "
00133 << bitsPerSymbol_ << " bits per symbol.\n";
00134
00135 }
00136
00137
00138 monitoringStream_ << "Info: hash table creation will proceed using "
00139 << bitsPerSymbol_ << " bits per symbol.\n";
00140
00141 sourceData_ = sequenceReader.getSourceDataType();
00142
00143 if ( sourceData_ == gDNAData )
00144 {
00145 monitoringStream_
00146 << "Info: hash table will be created from DNA source data.\n";
00147 }
00148 else if ( sourceData_ == gProteinData )
00149 {
00150 monitoringStream_
00151 << "Info: hash table will be created from protein source data.\n";
00152 }
00153 else
00154 {
00155 monitoringStream_
00156 << "Info: source data for hash table is of unknown type.\n";
00157 }
00158
00159
00160
00161
00162 int bitsUsedPerWord( wordLength * bitsPerSymbol_ );
00163
00164 if ( (bitsUsedPerWord <= 0 ) || ( bitsUsedPerWord > gBitsPerWord ) )
00165
00166 {
00167 monitoringStream_ << "Error: bits used per hash words ("
00168 << bitsUsedPerWord << ") outside valid range ( 1 to "
00169 << gBitsPerWord << ").\n";
00170 throw SSAHAException("Invalid hash word length.");
00171 }
00172
00173 monitoringStream_ << "Info: " << wordLength
00174 << " symbols will be stored per hash word.\n";
00175
00176
00177 if ( ( stepLength > wordLength ) || ( stepLength <= 0 ) )
00178 {
00179 monitoringStream_ << "Info: requested step length for hashing ("
00180 << stepLength << ") outside valid range ( 1 to "
00181 << wordLength << "), using " << wordLength
00182 << " instead.\n";
00183
00184 stepLength = wordLength;
00185 }
00186
00187 wordLength_ = wordLength;
00188 stepLength_ = stepLength;
00189 maxNumHits_ = maxNumHits;
00190
00191 AdapterFactory factory;
00192
00193 SequenceAdapter* seq
00194 ( factory.create( bitsPerSymbol_, wordLength_, stepLength_ ) );
00195
00196
00197
00198
00199 numDifferentWords_ = ( 1 << bitsUsedPerWord );
00200
00201 setupPointerArray();
00202
00203
00204
00205
00206
00207
00208 int numSeqs(countWordsAndGetNames( sequenceReader, seq ));
00209
00210
00211 monitoringStream_ << "Allocating memory for pSequenceSizes_: "
00212 << numSeqs << " sequences, "
00213 << numSeqs * sizeof(SequenceOffset)
00214 << " bytes total ..." << endl;
00215
00216 pSequenceSizes_ = new SequenceOffset [ numSeqs ];
00217
00218 if (!pSequenceSizes_)
00219 {
00220 throw SSAHAException("Memory allocation failed!");
00221 }
00222
00223 monitoringStream_ << "... memory allocation OK" << endl;
00224
00225 computePointerArray();
00226
00227 setupHitList();
00228
00229
00230
00231 hashAllWords( sequenceReader, seq, numSeqs );
00232
00233 cleanupTempData();
00234
00235 delete seq;
00236
00237
00238
00239 isInitialized_ = true;
00240
00241 }
00242
00243
00244 int HashTableGeneric::countWordsAndGetNames
00245 ( SequenceReader& sequenceReader, SequenceAdapter* seq )
00246 {
00247
00248 sequenceReader.rewind();
00249
00250
00251
00252
00253 WordSequence thisSeq;
00254 int numSeqs(0);
00255
00256
00257
00258 NameReaderLocal* pReaderLocal = new NameReaderLocal;
00259 pNameReader_ = pReaderLocal;
00260
00261 while( sequenceReader.getNextSequence
00262 ( thisSeq, wordLength_) != -1 )
00263 {
00264 numSeqs++;
00265
00266 sequenceReader.getLastSequenceName( pReaderLocal->lastName() );
00267
00268
00269 seq->link( thisSeq );
00270 countWords(*seq);
00271
00272
00273
00274
00275 }
00276
00277
00278
00279
00280
00281 return numSeqs;
00282 }
00283
00284 void HashTableGeneric::hashAllWords
00285 ( SequenceReader& sequenceReader, SequenceAdapter* seq, int numSeqs )
00286 {
00287
00288 int numWords(0);
00289 sequenceReader.rewind();
00290 WordSequence thisSeq;
00291
00292
00293 for ( unsigned int i(1); i <= numSeqs ; i++ )
00294 {
00295 if( sequenceReader.getNextSequence( thisSeq, wordLength_) == -1 )
00296 {
00297 throw SSAHAException
00298 ("Sequence source data changed during hash table creation!");
00299 }
00300
00301
00302 numWords = (int) thisSeq.size();
00303 pSequenceSizes_[i-1] = ( numWords > 0 )
00304 ? ( (numWords-1) * wordLength_ ) + thisSeq.getNumBasesInLast()
00305 : 0;
00306
00307
00308 seq->link(thisSeq);
00309 hashWords( *seq,i );
00310
00311
00312 }
00313
00314
00315 }
00316
00317
00318
00319
00320
00321 void HashTableGeneric::setupPointerArray( void )
00322 {
00323
00324
00325 monitoringStream_ << "Allocating memory for pWordPositionInHitList_: "
00326 << numDifferentWords_ << " words, "
00327 << numDifferentWords_ * sizeof(PositionInHitList)
00328 << " bytes total ..." << endl;
00329
00330
00331 pArrayAllocator_->allocateAndZero
00332 (numDifferentWords_);
00333
00334 monitoringStream_ << "... done" << endl;
00335
00336 }
00337
00338 void HashTableGeneric::computePointerArray( void )
00339 {
00340 unsigned long numClipped( 0 );
00341
00342
00343 for ( unsigned int i(1) ; i < numDifferentWords_ ; i ++ )
00344 {
00345 pWordPositionInHitList_[i] += pWordPositionInHitList_[i-1];
00346 }
00347
00348 wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1];
00349
00350 monitoringStream_ << numClipped
00351 << " words will be discarded ("
00352 << (float)(100*numClipped)
00353 /(float)(wordsInDatabase_+numClipped)
00354 << " \% of total)." << endl;
00355
00356
00357 }
00358
00359
00360 void HashTableGeneric::setupHitList( void )
00361 {
00362
00363
00364 monitoringStream_ << "Allocating memory for pHitListForAllWords_: "
00365 << wordsInDatabase_ << " words, "
00366 << wordsInDatabase_ * getHitTypeSize()
00367 << " bytes total ..." << endl;
00368
00369 allocateHitList(wordsInDatabase_);
00370
00371 monitoringStream_ << "... memory allocation OK" << endl;
00372
00373
00374 monitoringStream_ << "Allocating memory for pHitsFoundSoFar_: "
00375 << numDifferentWords_ << " words, "
00376 << numDifferentWords_ * sizeof(unsigned short)
00377 << " bytes total ..." << endl;
00378
00379
00380
00381
00382 tempAlloc_ =
00383 pArrayAllocator_->clone(&pHitsFoundSoFar_,
00384 name_+(string)".temp",
00385 monitoringStream_);
00386
00387 tempAlloc_->allocateAndZero( numDifferentWords_ );
00388
00389 monitoringStream_ << "... done" << endl;
00390
00391 }
00392
00393 void HashTableGeneric::cleanupTempData( void )
00394 {
00395 delete tempAlloc_;
00396 }
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407 void HashTableGeneric::getSequenceName
00408 ( string& seqName, SequenceNumber seqNum ) const
00409 {
00410 pNameReader_->getSequenceName( seqName, seqNum );
00411 #ifdef XXX
00412
00413
00414 if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() ))
00415 {
00416 seqName = "No sequence name stored for sequence.";
00417 monitoringStream_
00418 << "Warning - no sequence name for requested sequence number ("
00419 << seqNum << ").\n";
00420
00421 return;
00422 }
00423
00424 seqName = *(sequenceNames_[seqNum - 1]);
00425 return;
00426 #endif
00427 }
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437 const char* HashTableGeneric::getSequenceName
00438 ( SequenceNumber seqNum ) const
00439 {
00440 return pNameReader_->getSequenceName( seqNum );
00441 #ifdef XXX
00442
00443
00444 if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() ))
00445 {
00446 monitoringStream_
00447 << "Warning - no sequence name for requested sequence number ("
00448 << seqNum << ").\n";
00449 return NULL;
00450 }
00451
00452 return sequenceNames_[seqNum - 1]->c_str();
00453 #endif
00454 }
00455
00456 SequenceNumber HashTableGeneric::getNumSequences() const
00457 {
00458 return (pNameReader_ == NULL) ? 0 : pNameReader_->size();
00459 }
00460
00461
00462
00463
00464
00465
00466
00467
00468 SequenceOffset HashTableGeneric::getSequenceSize( SequenceNumber seqNum ) const
00469 {
00470
00471
00472 if ((seqNum <= 0 )||( seqNum > pNameReader_->size() ))
00473 {
00474 monitoringStream_
00475 << "Warning - no sequence size for requested sequence number ("
00476 << seqNum << ").\n";
00477
00478 return 0;
00479 }
00480 else return pSequenceSizes_[seqNum - 1];
00481
00482 }
00483
00484
00485 HashTableGeneric* HashTableFactory::loadHashTable( const string& name, SourceReaderIndex* pReader )
00486 {
00487
00488 monitoringStream_ << "loading hash table from file ...\n";
00489
00490 ifstream nameFile( (name+(string)".name").c_str() );
00491
00492 if ( nameFile.fail() )
00493 {
00494 monitoringStream_ << "Error: failed to open "
00495 << name << ".name, aborting load." << endl;
00496 throw SSAHAException("Could not open .name file ");
00497 }
00498
00499 char firstChar = nameFile.peek();
00500 string temp;
00501 temp += firstChar;
00502
00503 nameFile.close();
00504
00505 HitListFormatType hitListFormat((HitListFormatType)atoi(temp.c_str()));
00506
00507 HashTableGeneric* pTable(NULL);
00508
00509 if (hitListFormat==gStandard)
00510 {
00511 monitoringStream_ << "... creating standard format hash table\n";
00512 pTable = new HashTable(monitoringStream_,name);
00513 }
00514 else if (hitListFormat==g32BitPacked)
00515 {
00516 monitoringStream_ << "... creating 32 bit packed format hash table\n";
00517 pTable = new HashTablePacked(monitoringStream_,name);
00518 }
00519 else if (hitListFormat==g32BitPackedProtein)
00520 {
00521 monitoringStream_
00522 << "... creating 32 bit packed format protein hash table\n";
00523 pTable = new HashTablePackedProtein(monitoringStream_,name);
00524 }
00525 else if (hitListFormat==gTranslated)
00526 {
00527 monitoringStream_ << "... creating translated format hash table\n";
00528 pTable = new HashTableTranslated(monitoringStream_,name);
00529 }
00530 else
00531 {
00532 monitoringStream_
00533 << "Error: type of hash table (" << hitListFormat
00534 << " not recognised.\n";
00535 throw SSAHAException("Could not recognise hash table type");
00536 }
00537
00538 assert (pTable!=NULL);
00539
00540 loadHashTable(*pTable, pReader);
00541
00542 assert (pTable->isInitialized());
00543
00544 return pTable;
00545
00546 }
00547
00548
00549
00550
00551 void HashTableGeneric::loadHashTable
00552 ( SourceReaderIndex* pSourceReader )
00553 {
00554
00555 monitoringStream_ << "HashTable::loadHashTable" << endl;
00556
00557 if ( isInitialized_ == true )
00558 {
00559 monitoringStream_ << "Error: hash table cannot be loaded because class"
00560 << " has already been initialized." << endl;
00561 return;
00562 }
00563
00564
00565
00566 if (pNameReader_==NULL)
00567 {
00568 if (pSourceReader==NULL)
00569 {
00570 monitoringStream_ << "Info: sequence names will be held in local memory"
00571 << endl;
00572 pNameReader_ = new NameReaderLocal;
00573 }
00574 else
00575 {
00576 monitoringStream_ << "Info: sequence names will be read from file"
00577 << endl;
00578 pNameReader_ = new NameReaderIndex(*pSourceReader);
00579 }
00580 }
00581
00582
00583 loadSequenceNames();
00584
00585 unsigned long numSeqs = pNameReader_->size();
00586
00587
00588
00589
00590 int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ );
00591 numDifferentWords_ = ( 1 << bitsUsedPerWord );
00592 unsigned long headSize = numDifferentWords_ * sizeof(PositionInHitList);
00593
00594
00595 monitoringStream_ << "Allocating memory for pWordPositionInHitList_: "
00596 << numDifferentWords_ << " words, "
00597 << headSize
00598 << " bytes total ..." << endl;
00599
00600
00601
00602 pArrayAllocator_->load(numDifferentWords_);
00603
00604
00605
00606
00607
00608
00609
00610
00611 wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1];
00612
00613
00614 unsigned long bodySize
00615 = wordsInDatabase_ * getHitTypeSize();
00616
00617 monitoringStream_ << "Allocating memory for pHitListForAllWords_: "
00618 << wordsInDatabase_ << " words, "
00619 << bodySize
00620 << " bytes total ..." << endl;
00621
00622 loadHitList(wordsInDatabase_);
00623
00624
00625
00626
00627 if (numSeqs != 0 )
00628 {
00629 monitoringStream_ << "Allocating memory for pSequenceSizes_: "
00630 << numSeqs << " sequences, "
00631 << numSeqs*sizeof(SequenceOffset)
00632 << " bytes total ...\n";
00633
00634 pSequenceSizes_ = new SequenceOffset [ numSeqs ];
00635
00636 if (!pSequenceSizes_)
00637 {
00638 throw SSAHAException("Memory allocation failed!");
00639 }
00640
00641 loadFromFile(name_+(string)".size", (char*)pSequenceSizes_,
00642 numSeqs * sizeof( SequenceOffset ),
00643 monitoringStream_ );
00644 }
00645
00646 isInitialized_ = true;
00647 monitoringStream_ << "Hash table loading complete.\n";
00648
00649 }
00650
00651 void HashTableGeneric::loadSequenceNames( void )
00652 {
00653
00654 ifstreamSSAHA nameFile( ( name_ + ".name" ).c_str() );
00655
00656 if ( nameFile.fail() )
00657 {
00658 monitoringStream_ << "Error: failed to open "
00659 << name_ << ".name, aborting load." << endl;
00660 throw SSAHAException("Could not open .name file");
00661 }
00662
00663 string hashParams;
00664
00665 getline( nameFile, hashParams );
00666
00667 std::string::size_type pos(0), prev_pos(0);
00668
00669 pos=hashParams.find_first_of(' ',pos);
00670 if (pos == std::string::npos)
00671 {
00672 monitoringStream_ << "Error: could not parse "
00673 << name_ << ".name, aborting load." << endl;
00674 throw SSAHAException("Could not parse .name file");
00675 }
00676
00677 if ( hitListFormat_
00678 != (HitListFormatType)atoi( hashParams.substr
00679 (prev_pos, pos-prev_pos).c_str() ) )
00680 {
00681 monitoringStream_
00682 << "Error: saved data is not compatible with this hash table" << endl;
00683 throw SSAHAException("Tried to load incompatible data into hash table");
00684 }
00685
00686 prev_pos = ++pos;
00687
00688 pos=hashParams.find_first_of(' ',pos);
00689 wordLength_
00690 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00691 prev_pos = ++pos;
00692
00693 pos=hashParams.find_first_of(' ',pos);
00694 stepLength_
00695 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00696 prev_pos = ++pos;
00697
00698 pos=hashParams.find_first_of(' ',pos);
00699 bitsPerSymbol_
00700 = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00701 prev_pos = ++pos;
00702
00703 pos=hashParams.find_first_of(' ',pos);
00704 sourceData_ = (SourceDataType)
00705 atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00706 prev_pos = ++pos;
00707
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719 pNameReader_->loadSequenceNames( nameFile );
00720
00721 monitoringStream_ << "Loaded file " << name_ << ".name ("
00722 << pNameReader_->size()
00723 << " sequence names).\n";
00724
00725 monitoringStream_ << "Closing .name file\n";
00726 nameFile.close();
00727
00728 monitoringStream_ << "Loaded in word length of "
00729 << wordLength_ << " bases.\n";
00730
00731 monitoringStream_ << "Loaded in step length of "
00732 << stepLength_ << " bases.\n";
00733
00734 monitoringStream_ << "Hash table format uses " << bitsPerSymbol_
00735 << " bits per symbol.\n";
00736
00737 monitoringStream_ << "Hash table was created from ";
00738
00739 switch (sourceData_)
00740 {
00741 case gDNAData:
00742 monitoringStream_ << "DNA"; break;
00743 case gProteinData:
00744 monitoringStream_ << "protein"; break;
00745 default:
00746 monitoringStream_ << "unknown"; break;
00747 }
00748
00749 monitoringStream_ << " data.\n";
00750
00751
00752
00753
00754 }
00755
00756
00757
00758
00759
00760 void HashTableGeneric::saveHashTable( void )
00761 {
00762
00763 monitoringStream_ << "HashTableFactory::saveHashTable" << endl;
00764
00765 if ( isInitialized_ == false )
00766 {
00767 monitoringStream_ << "Error: hash table cannot be saved because it was"
00768 << " was not successfully created." << endl;
00769 throw SSAHAException("Tried to save uninitialized hash table");
00770 }
00771
00772
00773
00774 saveSequenceNames();
00775
00776
00777
00778
00779
00780 pArrayAllocator_->save();
00781
00782
00783 saveHitList();
00784
00785
00786 if (pNameReader_->size() != 0)
00787 saveToFile( name_+(string)".size",
00788 (char*)pSequenceSizes_,
00789 pNameReader_->size() * sizeof( SequenceOffset ),
00790 monitoringStream_ );
00791
00792
00793
00794 }
00795
00796
00797 void HashTableGeneric::saveSequenceNames( void )
00798 {
00799
00800 ofstreamSSAHA nameFile( ( name_ + ".name" ).c_str() );
00801
00802 nameFile << hitListFormat_ << " " << wordLength_ << " "
00803 << stepLength_ << " " << bitsPerSymbol_
00804 << " " << sourceData_ << " " << endl;
00805
00806
00807 pNameReader_->saveSequenceNames( nameFile );
00808
00809
00810
00811
00812
00813
00814
00815 if ( nameFile.fail() )
00816 {
00817 monitoringStream_ << "Error: failed to write "
00818 << name_ << ".name" << endl;
00819 throw SSAHAException("Problem saving .name file");
00820 }
00821
00822 nameFile.close();
00823
00824 monitoringStream_ << "Saved file " << name_ << ".name\n";
00825
00826
00827 }
00828
00829
00830 void HashTableGeneric::printHashStats( void )
00831 {
00832 cout << "Analyzing data in hash table...\n";
00833
00834 static const double thresholds[] =
00835 { 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1,
00836 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01,
00837 0.001, 0.0001, 0.00001,
00838 0 };
00839
00840
00841 if (!isInitialized())
00842 {
00843 cout << "Hash table has not been initialized.\n";
00844 return;
00845 }
00846
00847 const char* pSymbols;
00848
00849 cout << "Word length: " << wordLength_ << " bases.\n";
00850 cout << "Bits per symbol: " << bitsPerSymbol_ << endl;
00851
00852 unsigned long numDifferentWords;
00853
00854
00855
00856 if (bitsPerSymbol_==gResidueBits)
00857 {
00858 numDifferentWords= (unsigned long)pow((double)gNumCodonEncodings,
00859 (int)wordLength_);
00860 pSymbols = gResidueNames;
00861 }
00862 else
00863 {
00864 assert(bitsPerSymbol_==gBaseBits);
00865 numDifferentWords= 1<<(2*wordLength_);
00866 pSymbols = gBaseNames;
00867 }
00868
00869
00870
00871
00872 cout << "There are " << numDifferentWords
00873 << " different possible words of this length.\n";
00874 cout << "There are " << wordsInDatabase_
00875 << " words in this hash table.\n";
00876
00877
00878 int numCommon(240);
00879
00880 TopList mostCommon(numCommon, wordLength_, bitsPerSymbol_);
00881
00882 map<int, int> histogram;
00883 vector<Word> neverOccurred;
00884
00885
00886 PositionInHitList last(0);
00887 int numOccs;
00888
00889 for ( unsigned long i(0); i < numDifferentWords ; ++i )
00890 {
00891 numOccs = pWordPositionInHitList_[i]-last;
00892 histogram[numOccs]++;
00893 if (numOccs==0) neverOccurred.push_back((Word)i);
00894
00895 mostCommon.push_back( pair<int, Word>(numOccs, (Word)i ) );
00896 last = pWordPositionInHitList_[i];
00897 }
00898
00899 cout << neverOccurred.size() << " words never appeared.\n";
00900 if (neverOccurred.size() < 200)
00901 {
00902 cout << "They are:\n";
00903 for (vector<Word>::iterator i(neverOccurred.begin());
00904 i!=neverOccurred.end();i++)
00905
00906 cout
00907 << PrintFromWord( *i, wordLength_, bitsPerSymbol_, pSymbols )
00908 << endl;
00909 } else cout << "Not showing them - more than 200!\n";
00910
00911 cout << "The " << numCommon << " most common words are:\n";
00912
00913 cout << mostCommon;
00914
00915 unsigned long total(0), totalWordsUsed(0);
00916 int thresholdNum(0);
00917 unsigned int n(0);
00918
00919 for ( map<int, int>::iterator i( histogram.begin());
00920 i != histogram.end() ; ++i )
00921 {
00922 total += (i->first*i->second);
00923 totalWordsUsed += i->second;
00924 if (n++<2000)
00925 {
00926 cout << i->second << " words occurred " << i->first
00927 << " times in the hash table ("
00928 << 100.0*total/(double)wordsInDatabase_
00929 << "/"
00930 << 100.0*totalWordsUsed/(double)numDifferentWords
00931 << " cumulative percent).\n";
00932 }
00933
00934
00935
00936
00937 while ( ( (double)(wordsInDatabase_-total)/(double)wordsInDatabase_ )
00938 < thresholds[thresholdNum] )
00939 {
00940 cout << "Threshold " << i->first << " retains "
00941 << 100.0*(1.0-thresholds[thresholdNum++])
00942 << "\% of words.\n";
00943 }
00944 }
00945
00946 cout << "Total: " << total << "\n";
00947
00948 cout << "... analysis complete.\n";
00949
00950 }
00951
00952
00953 WordSequenceShifted::WordSequenceShifted
00954 ( const WordSequence& thisSeq, const HashTableGeneric& hashTable ) :
00955
00956 size_( ( thisSeq.size() - 2 ) * hashTable.getWordLength()
00957 + thisSeq.getNumBasesInLast() + 1 )
00958
00959
00960 {
00961 seqs_.push_back( thisSeq );
00962
00963 int wordLength( hashTable.getWordLength() );
00964
00965 for( int i(1) ; i < wordLength ; i++ )
00966 {
00967 seqs_.push_back( seqs_.back() );
00968 shiftSequence( seqs_.back(), hashTable.getBitsPerSymbol(), wordLength );
00969 }
00970
00971 }
00972
00973
00974 SequenceAdapterWithOverlap::SequenceAdapterWithOverlap
00975 ( int bitsPerSymbol, int wordLength, int stepLength ) :
00976 bitsPerSymbol_( bitsPerSymbol ),
00977 wordLength_( wordLength ),
00978 stepLength_( stepLength ),
00979 SequenceAdapter()
00980 {
00981
00982 maskLeft_ = new Word[ wordLength_ ];
00983 maskRight_ = new Word[ wordLength_ ];
00984
00985
00986 const Word allOnes = ( 1 << ( bitsPerSymbol_ * wordLength_ ) ) - 1;
00987
00988 for ( int i(0) ; i < wordLength_ ; i++ )
00989 {
00990 maskLeft_[i] = ( 1 << ( bitsPerSymbol_ * (wordLength_ - i) ) ) - 1;
00991 maskRight_[i] = allOnes - maskLeft_[i];
00992
00993
00994
00995
00996
00997 maskLeft_[i] &= allOnes;
00998 maskRight_[i] &= allOnes;
00999
01000
01001 }
01002 }
01003
01004 SequenceAdapterWithOverlap::~SequenceAdapterWithOverlap()
01005 {
01006 delete [] maskLeft_;
01007 delete [] maskRight_;
01008 }
01009
01010 void SequenceAdapterWithOverlap::link( const WordSequence& seq )
01011 {
01012 SequenceAdapter::link( seq );
01013
01014
01015 size_
01016 = (seq.size()<=1)
01017 ? 0
01018 : ( ( ( (int) seq.size() - 2) * wordLength_ ) + seq.getNumBasesInLast() )
01019 / stepLength_;
01020 }
01021 Word SequenceAdapterWithOverlap::operator[]( size_type j )
01022 {
01023 j *= stepLength_;
01024 const int wordPos( j / wordLength_ ), wordShift( j % wordLength_ );
01025 Word thisWord (0);
01026 thisWord
01027 |= ( (*pSeq_)[wordPos] & maskLeft_[wordShift] )
01028 << ( bitsPerSymbol_ * wordShift );
01029 thisWord
01030 |= ( (*pSeq_)[wordPos+1] & maskRight_[wordShift] )
01031 >> ( bitsPerSymbol_ * ( wordLength_ - wordShift ) );
01032 thisWord
01033 |= ( gCursedWord
01034 * ( ( ((*pSeq_)[wordPos] & gCursedWord) != (Word)0 )
01035 || ( ((*pSeq_)[wordPos+1]& gCursedWord) != (Word)0 ) ) );
01036 return thisWord;
01037 }
01038
01039
01040
01041 const char* NameReaderLocal::getSequenceName
01042 ( SequenceNumber seqNum ) const
01043 {
01044
01045 if (seqNum <= size() )
01046 {
01047 return (*this)[seqNum-1].c_str();
01048 }
01049 return NULL;
01050
01051 }
01052
01053 void NameReaderLocal::getSequenceName
01054 ( string& seqName, SequenceNumber seqNum ) const
01055 {
01056
01057 if (seqNum <= size() )
01058 {
01059 seqName = (*this)[seqNum-1];
01060 return;
01061 }
01062 seqName = "No sequence name stored for sequence.";
01063 }
01064
01065
01066
01067 const char* NameReaderIndex::getSequenceName
01068 ( SequenceNumber seqNum ) const
01069 {
01070 return reader_.extractName(seqNum);
01071 }
01072
01073
01074 void NameReaderIndex::getSequenceName
01075 ( string& seqName, SequenceNumber seqNum ) const
01076 {
01077 seqName=(string)reader_.extractName(seqNum);
01078 }
01079
01080 void NameReaderLocal::saveSequenceNames( ostream& nameFile )
01081 {
01082 for ( vector<string>::iterator i = begin() ;
01083 i != end() ; i ++ )
01084 {
01085 nameFile << *i << endl;
01086 }
01087 }
01088
01089 void NameReaderLocal::loadSequenceNames( istream& nameFile )
01090 {
01091 while( getline( nameFile, lastName() ) );
01092 pop_back();
01093
01094
01095
01096
01097
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107 }
01108
01109 SequenceNumber NameReaderIndex::size( void ) const
01110 {
01111 return reader_.size();
01112 }
01113
01114
01115
01116
01117
01118
01119
01120
01121