HashTable/HashTableGeneric.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : HashTableGeneric
00025 // File Name    : HashTableGeneric.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "HashTableGeneric.h"
00034 #include "HashTable.h"
00035 #include "HashTablePacked.h"
00036 #include "HashTableTranslated.h"
00037 #include "SequenceReader.h"
00038 // #include "SequenceReaderCodon.h"
00039 #include "GlobalDefinitions.h"
00040 #include <fstream>
00041 #include <algorithm>
00042 #include <map>
00043 #include <cmath>
00044 
00045 
00046 // ### Function Definitions ###
00047 
00048 // Name:
00049 // Arguments:
00050 // TYPE  NAME  IN/OUT COMMENT
00051 // Returns: TYPE COMMENT
00052 
00053 enum
00054 {
00055   defaultMaxNumHits=10000
00056 };
00057 
00058 
00059   // Function Name: Constructor
00060   // Arguments:     ostream&
00061   // Returns: N/A
00062 HashTableGeneric::HashTableGeneric
00063 ( ostream& monitoringStream,
00064   const string& name,
00065   Allocator<PositionInHitList>& arrayAllocator ) : 
00066   isInitialized_( false ), 
00067   monitoringStream_( monitoringStream ),
00068   name_( name ),
00069   bitsPerSymbol_( gBaseBits ), // default: may be overwritten in subclass ctor
00070   maxNumHits_( defaultMaxNumHits ),
00071   hitListFormat_( gNotSpecified ),
00072   pArrayAllocator_
00073   ( arrayAllocator.clone(&pWordPositionInHitList_,
00074                          name+(string)".head",
00075                          monitoringStream_) ),
00076   pSequenceSizes_(NULL),
00077   pNameReader_(NULL)
00078   {
00079     monitoringStream_ << "constructing HashTableGeneric\n";
00080     if (name_=="")
00081     {
00082       char buf[50];
00083       static int numTables(0);
00084       sprintf(buf,"SSAHAHashTable-%d-%d",getpid(),++numTables);
00085       name_=(string)buf;
00086     }
00087     //    pArrayAllocator_->link(&pWordPositionInHitList_,name+(string)".head");
00088   } // constructor 
00089 
00090   // Function Name:
00091   // Arguments:
00092   // TYPE  NAME  IN/OUT COMMENT
00093   // Returns: TYPE COMMENT
00094   HashTableGeneric::~HashTableGeneric()
00095   { 
00096     monitoringStream_ << "destructing HashTableGeneric ...\n";
00097     if ( isInitialized_ )
00098     {
00099       monitoringStream_ << "... deallocating memory\n";
00100       //      delete [] pHitListForAllWords_; 
00101       //      pArrayAllocator->deallocate();
00102       delete pArrayAllocator_; // also deallocs pointer array
00103       // deletion of HitList is handled in HashTableView
00104       // pHitListAllocator->deallocate();
00105       // delete pHitListAllocator; // NB destructor order
00106       //      delete [] pWordPositionInHitList_;
00107       delete [] pSequenceSizes_;
00108       //      for ( vector<string*>::iterator i( sequenceNames_.begin() );
00109       //    i != sequenceNames_.end() ; ++i ) delete *i;
00110       delete pNameReader_;
00111     } // ~if 
00112     else 
00113     {
00114       monitoringStream_ 
00115       << "... never initialized, no memory deallocation required\n";
00116     } // ~else
00117   } // destructor
00118 
00119 void HashTableGeneric::createHashTable
00120 ( SequenceReader& sequenceReader, int wordLength, int maxNumHits, 
00121   int stepLength )
00122 {
00123 
00124     monitoringStream_ << "HashTable::createHashTable" << endl;
00125 
00126     //    bitsPerSymbol_ = sequenceReader.getBitsPerSymbol();
00127 
00128     if (bitsPerSymbol_ != sequenceReader.getBitsPerSymbol() )
00129     {
00130       monitoringStream_ << "Info: reader encodes using " 
00131                         << sequenceReader.getBitsPerSymbol() 
00132                         << " bits per symbol, hash table expects "
00133                         << bitsPerSymbol_ << " bits per symbol.\n";
00134       //  throw SSAHAException("Wrong encoder for hash table");
00135     } // ~if
00136 
00137 
00138     monitoringStream_ << "Info: hash table creation will proceed using "
00139                       << bitsPerSymbol_ << " bits per symbol.\n";
00140 
00141     sourceData_ = sequenceReader.getSourceDataType();
00142 
00143     if ( sourceData_ == gDNAData )
00144     {
00145       monitoringStream_ 
00146         << "Info: hash table will be created from DNA source data.\n";
00147     } // ~if
00148     else if ( sourceData_ == gProteinData )
00149     {
00150       monitoringStream_ 
00151         << "Info: hash table will be created from protein source data.\n";
00152     } // ~else
00153     else
00154     {
00155       monitoringStream_ 
00156       << "Info: source data for hash table is of unknown type.\n";
00157     } // ~else
00158 
00159 
00160     // check wordLength is sensible
00161 
00162     int bitsUsedPerWord( wordLength * bitsPerSymbol_ );
00163     
00164     if ( (bitsUsedPerWord <= 0 ) || ( bitsUsedPerWord > gBitsPerWord ) ) 
00165        // 4 base pairs storable per byte
00166     {
00167       monitoringStream_ << "Error: bits used per hash words (" 
00168                         << bitsUsedPerWord << ") outside valid range ( 1 to "
00169                         << gBitsPerWord << ").\n";
00170       throw SSAHAException("Invalid hash word length.");
00171     } // ~if
00172 
00173     monitoringStream_ << "Info: " << wordLength 
00174                       << " symbols will be stored per hash word.\n";
00175 
00176     // check stepLength is sensible. If not, set it equal to the word length
00177     if ( ( stepLength > wordLength ) || ( stepLength <= 0 ) ) 
00178     { 
00179       monitoringStream_ << "Info: requested step length for hashing ("
00180                         << stepLength << ") outside valid range ( 1 to "
00181                         << wordLength << "), using " << wordLength
00182                         << " instead.\n";
00183         
00184       stepLength = wordLength; 
00185     }
00186 
00187     wordLength_ = wordLength;
00188     stepLength_ = stepLength;
00189     maxNumHits_ = maxNumHits;
00190 
00191     AdapterFactory factory;
00192 
00193     SequenceAdapter* seq
00194       ( factory.create( bitsPerSymbol_, wordLength_, stepLength_ ) );
00195 
00196 
00197     // numDifferentWords_ is the number of possible different Words for a 
00198     // given word length ( equal to 2^(2*wordLength_) )
00199     numDifferentWords_ =  ( 1 << bitsUsedPerWord );
00200 
00201     setupPointerArray();
00202 
00203     
00204     // Now start to extract the sequence information from the file
00205 
00206     //    sequenceReader.rewind();
00207 
00208     int numSeqs(countWordsAndGetNames( sequenceReader, seq ));
00209 
00210     // Create the array of sequence sizes
00211     monitoringStream_ << "Allocating memory for pSequenceSizes_: "
00212                       << numSeqs << " sequences, "
00213                       << numSeqs * sizeof(SequenceOffset)
00214                       << " bytes total ..." << endl;
00215 
00216     pSequenceSizes_ = new SequenceOffset [ numSeqs ];
00217 
00218     if (!pSequenceSizes_) 
00219     {
00220       throw SSAHAException("Memory allocation failed!");
00221     } // ~if
00222 
00223     monitoringStream_ << "... memory allocation OK" << endl;
00224 
00225     computePointerArray();
00226 
00227     setupHitList();
00228 
00229     // Now go through the vector of WordSequences and fill in the table
00230 
00231     hashAllWords( sequenceReader, seq, numSeqs );
00232 
00233     cleanupTempData();
00234 
00235     delete seq;
00236     //   delete [] pHitsFoundSoFar_;
00237 
00238     // Next flag is only set once everything is OK
00239     isInitialized_ = true;
00240 
00241 } // ~HashTable::createHashTable( int wordLength, int maxNumHits )
00242 
00243 
00244 int HashTableGeneric::countWordsAndGetNames
00245 ( SequenceReader& sequenceReader, SequenceAdapter* seq )
00246 {
00247 
00248     sequenceReader.rewind();
00249 
00250     //    vector<WordSequence> wordSeqs; 
00251     // const WordSequence dummy; // needed to get this to compile on LINUX ...
00252     //   wordSeqs.push_back(dummy);
00253     WordSequence thisSeq;
00254     int numSeqs(0);
00255 
00256     //    sequenceNames_.push_back(new string);
00257 
00258     NameReaderLocal* pReaderLocal = new NameReaderLocal;
00259     pNameReader_ = pReaderLocal;
00260 
00261     while( sequenceReader.getNextSequence
00262            ( thisSeq, wordLength_) != -1 ) 
00263     {
00264       numSeqs++;
00265 
00266       sequenceReader.getLastSequenceName( pReaderLocal->lastName() );
00267       //      sequenceReader.getLastSequenceName( *(sequenceNames_.back()) );
00268 
00269       seq->link( thisSeq );
00270       countWords(*seq);
00271 
00272       //      sequenceNames_.push_back( new string );
00273       //      wordSeqs.push_back(dummy);
00274 
00275     } // ~while
00276 
00277     //    delete sequenceNames_.back();
00278     //  sequenceNames_.pop_back();
00279     //    wordSeqs.pop_back();
00280     //    cout << "XXX: " << name_ << numSeqs << endl;
00281     return numSeqs;
00282 } // ~HashTableGeneric::countWordsAndGetNames
00283 
00284 void HashTableGeneric::hashAllWords
00285 ( SequenceReader& sequenceReader, SequenceAdapter* seq, int numSeqs )
00286 {
00287 
00288   int numWords(0);
00289   sequenceReader.rewind();
00290   WordSequence thisSeq;
00291     
00292   // NB sequences are numbered 1...n not 0...n-1
00293   for ( unsigned int i(1); i <= numSeqs ; i++ )
00294   {
00295     if( sequenceReader.getNextSequence( thisSeq, wordLength_) == -1 ) 
00296     {
00297       throw SSAHAException
00298         ("Sequence source data changed during hash table creation!");
00299     }
00300 
00301     // fill in the word count
00302     numWords = (int) thisSeq.size();
00303     pSequenceSizes_[i-1] = ( numWords > 0 ) 
00304       ? ( (numWords-1) * wordLength_ ) + thisSeq.getNumBasesInLast()
00305       : 0;
00306 
00307     // do the hashing
00308     seq->link(thisSeq);
00309     hashWords( *seq,i );
00310 
00311 
00312   } // ~for thisSeq
00313 
00314 
00315 } // ~HashTableGeneric::hashAllWords 
00316 
00317 
00318 
00319 
00320 
00321 void HashTableGeneric::setupPointerArray( void )
00322 {
00323 
00324     // Attempt to allocate memory for the list of table positions
00325     monitoringStream_ << "Allocating memory for pWordPositionInHitList_: "
00326                       << numDifferentWords_ << " words, "
00327                       << numDifferentWords_ * sizeof(PositionInHitList)
00328                       << " bytes total ..." << endl;
00329 
00330    
00331     pArrayAllocator_->allocateAndZero
00332       (numDifferentWords_);
00333 
00334     monitoringStream_ << "... done" << endl;
00335 
00336   } // ~HashTableGeneric::setupPointerArray( void )
00337 
00338 void HashTableGeneric::computePointerArray( void )
00339 {
00340     unsigned long numClipped( 0 );
00341 
00342     // Accumulate to get position of hit list in final array
00343     for ( unsigned int i(1) ; i < numDifferentWords_ ; i ++ )
00344     {
00345         pWordPositionInHitList_[i] += pWordPositionInHitList_[i-1];
00346     } // ~for
00347 
00348     wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1];
00349 
00350     monitoringStream_ << numClipped
00351     << " words will be discarded ("
00352     <<   (float)(100*numClipped)
00353         /(float)(wordsInDatabase_+numClipped)  
00354     << " \% of total)." << endl;  
00355 
00356 
00357 } // ~HashTableGeneric::computePointerArray( void )
00358 
00359 
00360 void HashTableGeneric::setupHitList( void )
00361 {
00362 
00363     // Attempt to allocate memory for the table itself
00364     monitoringStream_ << "Allocating memory for pHitListForAllWords_: "
00365                       << wordsInDatabase_ << " words, "
00366                       << wordsInDatabase_ * getHitTypeSize()
00367                       << " bytes total ..." << endl;
00368 
00369     allocateHitList(wordsInDatabase_);
00370 
00371     monitoringStream_ << "... memory allocation OK" << endl;
00372 
00373     // Attempt to allocate temporary memory to store hits so far for each word
00374     monitoringStream_ << "Allocating memory for pHitsFoundSoFar_: "
00375                       << numDifferentWords_ << " words, "
00376                       << numDifferentWords_ * sizeof(unsigned short)
00377                       << " bytes total ..." << endl;
00378 
00379 
00380 
00381     //    Allocator<PositionInHitList>* tempAlloc = 
00382     tempAlloc_ = 
00383       pArrayAllocator_->clone(&pHitsFoundSoFar_, 
00384                                  name_+(string)".temp",
00385                                  monitoringStream_);
00386     // ... ensures uses same allocation method as for ptr array
00387     tempAlloc_->allocateAndZero( numDifferentWords_  );
00388 
00389     monitoringStream_ << "... done" << endl;
00390 
00391 } // ~HashTableGeneric::setupHitList( void )
00392 
00393 void HashTableGeneric::cleanupTempData( void )
00394 {
00395     delete tempAlloc_; // automatically deallocs pHitsFoundSoFar
00396 } // ~HashTableGeneric::cleanupTempData( void )
00397 
00398 
00399 
00400 
00401   // Accessor Functions
00402   // (NB all accessor functions should be 'const')
00403 
00404 // Function Name: getSequenceName
00405 // Arguments: string& (out), SequenceNumber& (in)
00406 // Places the name of the seqNum'th sequence in the database in seqName
00407 void HashTableGeneric::getSequenceName
00408 ( string& seqName, SequenceNumber seqNum ) const
00409 {
00410   pNameReader_->getSequenceName( seqName, seqNum );
00411 #ifdef XXX
00412   // NB sequenceNames_ are numbered 0 ... n-1
00413   // but we are numbering sequences 1 ... n
00414   if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() ))
00415   {
00416     seqName = "No sequence name stored for sequence.";
00417     monitoringStream_ 
00418       << "Warning - no sequence name for requested sequence number ("
00419       << seqNum << ").\n";
00420     
00421     return; 
00422   } // ~if
00423 
00424   seqName = *(sequenceNames_[seqNum - 1]);
00425   return;
00426 #endif
00427 } // ~HashTable::getSequenceName
00428 
00429 
00430 // Function Name: getSequenceName
00431 // Arguments: SequenceNumber& (in)
00432 // Returns: const char*
00433 // Returns C style string of the seqNum'th seq in the database
00434 // NB should be used with caution as the string only lasts as long
00435 // as the hash table, you have to copy it if you want to keep it
00436 // after the hash table's destructor is called for any reason.
00437 const char* HashTableGeneric::getSequenceName
00438 ( SequenceNumber seqNum ) const
00439 {
00440   return pNameReader_->getSequenceName( seqNum );
00441 #ifdef XXX
00442   // NB sequenceNames_ are numbered 0 ... n-1
00443   // but we are numbering sequences 1 ... n
00444   if ((seqNum <= 0 )||( seqNum > sequenceNames_.size() ))
00445   {
00446     monitoringStream_ 
00447       << "Warning - no sequence name for requested sequence number ("
00448       << seqNum << ").\n";
00449     return NULL; 
00450   } // ~if
00451 
00452   return sequenceNames_[seqNum - 1]->c_str();
00453 #endif
00454 } // ~HashTable::getSequenceName
00455 
00456 SequenceNumber HashTableGeneric::getNumSequences() const 
00457 { 
00458   return (pNameReader_ == NULL) ? 0 : pNameReader_->size();
00459 } // ~SequenceNumber HashTableGeneric::getNumSequences() const 
00460 
00461 
00462 
00463 
00464 
00465   // Function Name: getSequenceSize
00466   // Arguments: string& (out), SequenceNumber& (in)
00467   // Returns the size of the seqNum'th sequence in the database.
00468   SequenceOffset HashTableGeneric::getSequenceSize( SequenceNumber seqNum ) const
00469   {
00470     // NB sequenceNames_ are numbered 0 ... n-1
00471     // but we are numbering sequences 1 ... n
00472     if ((seqNum <= 0 )||( seqNum > pNameReader_->size() ))
00473     {
00474       monitoringStream_ 
00475       << "Warning - no sequence size for requested sequence number ("
00476       << seqNum << ").\n";
00477 
00478       return 0; 
00479     } // ~if
00480     else return pSequenceSizes_[seqNum - 1];
00481 
00482   } // ~HashTable::getSequenceName
00483 
00484 
00485   HashTableGeneric* HashTableFactory::loadHashTable( const string& name, SourceReaderIndex* pReader )
00486   {
00487 
00488     monitoringStream_ << "loading hash table from file ...\n";
00489 
00490     ifstream nameFile( (name+(string)".name").c_str() );
00491     
00492     if ( nameFile.fail() )
00493     {
00494       monitoringStream_ << "Error: failed to open " 
00495                         << name << ".name, aborting load." << endl;
00496       throw SSAHAException("Could not open .name file ");
00497     } // ~if
00498 
00499     char firstChar = nameFile.peek();
00500     string temp;
00501     temp += firstChar;
00502     
00503     nameFile.close();
00504 
00505     HitListFormatType hitListFormat((HitListFormatType)atoi(temp.c_str()));
00506 
00507     HashTableGeneric* pTable(NULL);
00508 
00509     if (hitListFormat==gStandard)
00510     {
00511       monitoringStream_ << "... creating standard format hash table\n";
00512       pTable = new HashTable(monitoringStream_,name);
00513     } // ~if
00514     else if (hitListFormat==g32BitPacked)
00515     {
00516       monitoringStream_ << "... creating 32 bit packed format hash table\n";
00517       pTable = new HashTablePacked(monitoringStream_,name);
00518     } // ~else if
00519     else if (hitListFormat==g32BitPackedProtein)
00520     {
00521       monitoringStream_ 
00522         << "... creating 32 bit packed format protein hash table\n";
00523       pTable = new HashTablePackedProtein(monitoringStream_,name);
00524     } // ~else if
00525     else if (hitListFormat==gTranslated)
00526     {
00527       monitoringStream_ << "... creating translated format hash table\n";
00528       pTable = new HashTableTranslated(monitoringStream_,name);
00529     }
00530     else
00531     {
00532       monitoringStream_ 
00533         << "Error: type of hash table (" << hitListFormat 
00534         << " not recognised.\n";
00535       throw SSAHAException("Could not recognise hash table type");
00536     } // ~else
00537 
00538     assert (pTable!=NULL);
00539     
00540     loadHashTable(*pTable, pReader);
00541 
00542     assert (pTable->isInitialized());
00543 
00544     return pTable;
00545 
00546   }
00547 
00548   // Function Name: loadHashTable
00549   // Arguments: const string& (in)
00550   // Reads a pre-computed hash table into memory from a file
00551   void HashTableGeneric::loadHashTable
00552   ( SourceReaderIndex* pSourceReader )
00553   {
00554 
00555     monitoringStream_ << "HashTable::loadHashTable" << endl;
00556 
00557     if ( isInitialized_ == true ) 
00558     {
00559       monitoringStream_ << "Error: hash table cannot be loaded because class"
00560                         << " has already been initialized." << endl;
00561       return;
00562     } // ~if
00563 
00564     // HashTableComponents have a dummy name reader set up already
00565     // so don't need the next bit, hence the next if condition
00566     if (pNameReader_==NULL)
00567     {
00568       if (pSourceReader==NULL) 
00569       {
00570         monitoringStream_ << "Info: sequence names will be held in local memory" 
00571                           << endl;
00572         pNameReader_ = new NameReaderLocal;
00573       } // ~if
00574       else
00575       {
00576         monitoringStream_ << "Info: sequence names will be read from file" 
00577                           << endl;
00578         pNameReader_ = new NameReaderIndex(*pSourceReader);
00579       } // ~else
00580     } // ~if
00581     // Now load all the sequence names
00582 
00583     loadSequenceNames();
00584 
00585     unsigned long numSeqs = pNameReader_->size();
00586 
00587     //    numDifferentWords_ = 1 << ( bitsPerSymbol_ * wordLength_ );
00588 
00589 
00590     int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ );
00591     numDifferentWords_ = ( 1 << bitsUsedPerWord );
00592     unsigned long headSize = numDifferentWords_ * sizeof(PositionInHitList);
00593 
00594     // Attempt to allocate memory for the list of table positions
00595     monitoringStream_ << "Allocating memory for pWordPositionInHitList_: "
00596                       << numDifferentWords_ << " words, "
00597                       << headSize
00598                       << " bytes total ..." << endl;
00599 
00600     
00601       //        pArrayAllocator_->allocate(headSize);
00602     pArrayAllocator_->load(numDifferentWords_);
00603       //        loadFromFile
00604       //    ( 
00605       //      name_+(string)".head",
00606       //     (char*)pWordPositionInHitList_, 
00607       //      headSize
00608       //    );
00609 
00610 
00611     wordsInDatabase_ = pWordPositionInHitList_[numDifferentWords_-1];
00612     //    cout << pWordPositionInHitList_[numDifferentWords-1] << endl;
00613 
00614     unsigned long bodySize 
00615       = wordsInDatabase_ * getHitTypeSize();
00616 
00617     monitoringStream_ << "Allocating memory for pHitListForAllWords_: "
00618                       << wordsInDatabase_ << " words, "
00619                       << bodySize
00620                       << " bytes total ..." << endl;
00621 
00622     loadHitList(wordsInDatabase_);
00623 
00624     // Now load all the sequence sizes
00625 
00626     // Don't need a .size file for HashTableTranslated ....
00627     if (numSeqs != 0 )
00628     {    
00629       monitoringStream_ << "Allocating memory for pSequenceSizes_: "
00630                         << numSeqs << " sequences, "
00631                         << numSeqs*sizeof(SequenceOffset)
00632                         << " bytes total ...\n";
00633 
00634       pSequenceSizes_ = new SequenceOffset [ numSeqs  ];
00635       
00636       if (!pSequenceSizes_) 
00637       {
00638         throw SSAHAException("Memory allocation failed!");
00639       } // ~if
00640 
00641       loadFromFile(name_+(string)".size", (char*)pSequenceSizes_, 
00642                    numSeqs * sizeof( SequenceOffset ),
00643                    monitoringStream_ );
00644     } // ~if
00645 
00646     isInitialized_ = true; // set once all loading has been done
00647     monitoringStream_ << "Hash table loading complete.\n";
00648 
00649   } // ~loadHashTable
00650 
00651 void HashTableGeneric::loadSequenceNames( void )
00652 {
00653 
00654     ifstreamSSAHA nameFile( ( name_ + ".name" ).c_str() );
00655     
00656     if ( nameFile.fail() )
00657     {
00658       monitoringStream_ << "Error: failed to open " 
00659                         << name_ << ".name, aborting load." << endl;
00660       throw SSAHAException("Could not open .name file");
00661     } // ~if
00662 
00663     string hashParams;
00664 
00665     getline( nameFile, hashParams );
00666 
00667     std::string::size_type pos(0), prev_pos(0);
00668 
00669     pos=hashParams.find_first_of(' ',pos);
00670     if (pos == std::string::npos)
00671     {
00672       monitoringStream_ << "Error: could not parse " 
00673                         << name_ << ".name, aborting load." << endl;
00674       throw SSAHAException("Could not parse .name file");
00675     } // ~if
00676 
00677     if ( hitListFormat_ 
00678          != (HitListFormatType)atoi( hashParams.substr
00679                                      (prev_pos, pos-prev_pos).c_str() ) )
00680     {
00681       monitoringStream_ 
00682       << "Error: saved data is not compatible with this hash table" << endl;
00683       throw SSAHAException("Tried to load incompatible data into hash table");
00684     } // ~if
00685 
00686     prev_pos = ++pos;
00687 
00688     pos=hashParams.find_first_of(' ',pos);
00689     wordLength_ 
00690       = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00691     prev_pos = ++pos;
00692 
00693     pos=hashParams.find_first_of(' ',pos);
00694     stepLength_ 
00695       = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00696     prev_pos = ++pos;
00697 
00698     pos=hashParams.find_first_of(' ',pos);
00699     bitsPerSymbol_ 
00700       = atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00701     prev_pos = ++pos;
00702 
00703     pos=hashParams.find_first_of(' ',pos);
00704     sourceData_ = (SourceDataType) 
00705       atoi(hashParams.substr(prev_pos, pos-prev_pos).c_str());
00706     prev_pos = ++pos;
00707 
00708     //    sequenceNames_.push_back( new string );
00709  
00710 
00711 
00712     //    while( getline( nameFile,*(sequenceNames_.back() ) ) )
00713     //   {
00714     //    sequenceNames_.push_back( new string );
00715     //   } // ~while
00716 
00717     //    sequenceNames_.pop_back();
00718 
00719     pNameReader_->loadSequenceNames( nameFile );
00720 
00721     monitoringStream_ << "Loaded file " << name_ << ".name (" 
00722                        << pNameReader_->size() 
00723                       << " sequence names).\n";
00724 
00725     monitoringStream_ << "Closing .name file\n";
00726     nameFile.close();
00727 
00728     monitoringStream_ << "Loaded in word length of " 
00729                       << wordLength_ << " bases.\n";
00730 
00731     monitoringStream_ << "Loaded in step length of " 
00732                       << stepLength_ << " bases.\n";
00733 
00734     monitoringStream_ << "Hash table format uses " << bitsPerSymbol_
00735                       << " bits per symbol.\n";
00736 
00737     monitoringStream_ << "Hash table was created from ";
00738 
00739     switch (sourceData_)
00740     {
00741     case gDNAData:
00742       monitoringStream_ << "DNA"; break;
00743     case gProteinData:
00744       monitoringStream_ << "protein"; break;
00745     default:
00746       monitoringStream_ << "unknown"; break;
00747     }
00748 
00749     monitoringStream_ << " data.\n";
00750 
00751 
00752 
00753 
00754 } // ~HashTableGeneric::loadSequenceNames( void )
00755 
00756 
00757   // Function Name: saveHashTable
00758   // Arguments: const string& (in)
00759   // Saves a hash table to a file (for subsequent retrieval by loadHashTable)
00760   void HashTableGeneric::saveHashTable( void )
00761   {
00762 
00763     monitoringStream_ << "HashTableFactory::saveHashTable" << endl;
00764 
00765     if ( isInitialized_ == false ) 
00766     {
00767       monitoringStream_ << "Error: hash table cannot be saved because it was"
00768                         << " was not successfully created." << endl;
00769       throw SSAHAException("Tried to save uninitialized hash table");
00770     } // ~if
00771 
00772     // Save sequence names
00773 
00774     saveSequenceNames();
00775 
00776 
00777     //        int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ );
00778     //  unsigned long numDifferentWords ( 1 << bitsUsedPerWord );
00779 
00780     pArrayAllocator_->save();
00781     //  pHitListAllocator_->save();
00782 
00783     saveHitList();
00784 
00785     //    if (sequenceNames_.size() != 0)
00786     if (pNameReader_->size() != 0)
00787     saveToFile( name_+(string)".size", 
00788                 (char*)pSequenceSizes_, 
00789                 pNameReader_->size() * sizeof( SequenceOffset ),
00790                 monitoringStream_ ); 
00791 
00792 
00793 
00794   } // ~saveHashTable
00795 
00796 
00797 void HashTableGeneric::saveSequenceNames( void )
00798 {
00799 
00800   ofstreamSSAHA nameFile( ( name_ + ".name" ).c_str() );
00801 
00802   nameFile << hitListFormat_ << " " << wordLength_ << " " 
00803            << stepLength_ << " " << bitsPerSymbol_ 
00804            << " " << sourceData_ << " " << endl;
00805   
00806 
00807   pNameReader_->saveSequenceNames( nameFile );
00808     
00809     //    for ( vector<string*>::iterator i = sequenceNames_.begin() ; 
00810     //         i != sequenceNames_.end() ; i ++ )
00811     //  {
00812     //   nameFile << **i << endl;
00813     //   }
00814 
00815     if ( nameFile.fail() )
00816     {
00817       monitoringStream_ << "Error: failed to write " 
00818                         << name_ << ".name" << endl;
00819       throw SSAHAException("Problem saving .name file");
00820     } // ~if
00821 
00822     nameFile.close();
00823 
00824     monitoringStream_ << "Saved file " << name_ << ".name\n";
00825 
00826 
00827 } // ~HashTableGeneric::saveSequenceNames( void )
00828 
00829 
00830 void HashTableGeneric::printHashStats( void )
00831 {
00832   cout << "Analyzing data in hash table...\n";
00833   
00834   static const double thresholds[] = 
00835   { 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 
00836     0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 
00837     0.001, 0.0001, 0.00001,
00838     0 }; 
00839 
00840 
00841   if (!isInitialized()) 
00842   {
00843     cout << "Hash table has not been initialized.\n";
00844     return;
00845   } // ~if
00846 
00847   const char* pSymbols;
00848 
00849   cout << "Word length: " << wordLength_ << " bases.\n";
00850   cout << "Bits per symbol: " << bitsPerSymbol_ << endl;
00851 
00852   unsigned long numDifferentWords;
00853 
00854   //  if (    (pHashTable->getHitListFormat()==gTranslated)
00855   //      || (pHashTable->getHitListFormat()==g32BitPackedProtein) )
00856   if (bitsPerSymbol_==gResidueBits)
00857   {
00858     numDifferentWords= (unsigned long)pow((double)gNumCodonEncodings,
00859                                           (int)wordLength_);
00860     pSymbols = gResidueNames;
00861   } // ~if
00862   else
00863   {
00864     assert(bitsPerSymbol_==gBaseBits);
00865     numDifferentWords= 1<<(2*wordLength_);
00866     pSymbols = gBaseNames;
00867   } // ~else
00868 
00869 
00870   //  int bitsUsedPerWord( wordLength_ * bitsPerSymbol_ );
00871   //  unsigned long numDifferentWords ( 1 << bitsUsedPerWord );
00872   cout << "There are " << numDifferentWords 
00873        << " different possible words of this length.\n";
00874   cout << "There are " << wordsInDatabase_ 
00875        << " words in this hash table.\n"; 
00876 
00877 
00878   int numCommon(240);
00879 
00880   TopList mostCommon(numCommon, wordLength_, bitsPerSymbol_);
00881 
00882   map<int, int> histogram;
00883   vector<Word> neverOccurred;
00884 
00885 
00886   PositionInHitList last(0); 
00887   int numOccs;
00888 
00889   for ( unsigned long i(0); i < numDifferentWords ; ++i )
00890   {
00891     numOccs = pWordPositionInHitList_[i]-last;
00892     histogram[numOccs]++;
00893     if (numOccs==0) neverOccurred.push_back((Word)i);
00894 
00895     mostCommon.push_back( pair<int, Word>(numOccs, (Word)i ) ); 
00896     last = pWordPositionInHitList_[i];
00897   }
00898 
00899   cout << neverOccurred.size() << " words never appeared.\n";
00900   if (neverOccurred.size() < 200)
00901   {
00902     cout << "They are:\n";
00903     for (vector<Word>::iterator i(neverOccurred.begin());
00904          i!=neverOccurred.end();i++)
00905       //      cout << printWord( *i, wordLength_ ) << endl;
00906       cout 
00907         << PrintFromWord( *i, wordLength_, bitsPerSymbol_, pSymbols ) 
00908         << endl;
00909   } else cout << "Not showing them - more than 200!\n";
00910 
00911   cout << "The " << numCommon << " most common words are:\n";
00912 
00913   cout << mostCommon;
00914 
00915   unsigned long total(0), totalWordsUsed(0);
00916   int thresholdNum(0);
00917   unsigned int n(0);
00918 
00919   for ( map<int, int>::iterator i( histogram.begin()); 
00920         i != histogram.end() ; ++i )
00921   {
00922     total += (i->first*i->second);
00923     totalWordsUsed += i->second;
00924     if (n++<2000)
00925     {
00926       cout << i->second << " words occurred " << i->first 
00927            << " times in the hash table (" 
00928            << 100.0*total/(double)wordsInDatabase_
00929            << "/"
00930            << 100.0*totalWordsUsed/(double)numDifferentWords
00931            << " cumulative percent).\n";
00932     }
00933 
00934 
00935 
00936 
00937     while (   ( (double)(wordsInDatabase_-total)/(double)wordsInDatabase_ )
00938             < thresholds[thresholdNum] )
00939     {
00940       cout << "Threshold " << i->first << " retains "
00941            << 100.0*(1.0-thresholds[thresholdNum++])
00942            << "\% of words.\n";
00943     } // ~while
00944   } // ~for
00945 
00946   cout << "Total: " << total << "\n";
00947 
00948   cout << "... analysis complete.\n";
00949 
00950 } // ~void HashTableGeneric::printHashStats( void )
00951 
00952 
00953 WordSequenceShifted::WordSequenceShifted
00954 ( const WordSequence& thisSeq, const HashTableGeneric& hashTable ) :
00955   //hashTable_( hashTable ),
00956 size_( ( thisSeq.size() - 2 ) * hashTable.getWordLength() 
00957        + thisSeq.getNumBasesInLast() + 1 )
00958 // NB size_ is not the number of bases in the sequence, it is the number of 
00959 // hash words that may be obtained from the sequence
00960 {
00961   seqs_.push_back( thisSeq );
00962 
00963   int wordLength( hashTable.getWordLength() );
00964 
00965   for( int i(1) ; i < wordLength ; i++ ) 
00966   { 
00967     seqs_.push_back( seqs_.back() );
00968     shiftSequence( seqs_.back(), hashTable.getBitsPerSymbol(), wordLength );  
00969   } // ~for
00970 
00971 } // ~constructor
00972 
00973 
00974 SequenceAdapterWithOverlap::SequenceAdapterWithOverlap
00975 ( int bitsPerSymbol, int wordLength, int stepLength ) :
00976 bitsPerSymbol_( bitsPerSymbol ), 
00977 wordLength_( wordLength ), 
00978 stepLength_( stepLength ),
00979 SequenceAdapter() 
00980 {
00981 
00982   maskLeft_  = new Word[ wordLength_ ];
00983   maskRight_ = new Word[ wordLength_ ];
00984   //  oops! TC 29.10.2001
00985   //  const Word allOnes = ( 1 << ( 2 * wordLength_ ) ) - 1;
00986   const Word allOnes = ( 1 << ( bitsPerSymbol_ * wordLength_ ) ) - 1;
00987 
00988   for ( int i(0) ; i < wordLength_ ; i++ )
00989   {
00990     maskLeft_[i] = ( 1 << ( bitsPerSymbol_ * (wordLength_ - i) ) ) - 1;
00991     maskRight_[i] = allOnes - maskLeft_[i];
00992     // next two lines ensure flag bit is masked out
00993 
00994     // BUG FIX FROM CHUCK DILLON - AWS 21/06/05
00995     //maskLeft_[i] ^= gCursedWord;
00996     //maskRight_[i] ^= gCursedWord;
00997     maskLeft_[i] &= allOnes;
00998     maskRight_[i] &= allOnes;
00999     // ~ BUG FIXED
01000 
01001   } // ~ for it
01002 } // constructor
01003 
01004 SequenceAdapterWithOverlap::~SequenceAdapterWithOverlap()
01005 {
01006   delete [] maskLeft_;
01007   delete [] maskRight_;
01008 } // destructor
01009 
01010 void SequenceAdapterWithOverlap::link( const WordSequence& seq )
01011 {
01012   SequenceAdapter::link( seq );
01013   // Added check for seq.size <=1 (used to cause size_ to be 
01014   // (unsigned long)-1, ie huge, hence seg fault) TC 29.10.2001
01015    size_
01016      = (seq.size()<=1)
01017      ? 0
01018      : ( ( ( (int) seq.size() - 2) * wordLength_ ) + seq.getNumBasesInLast() ) 
01019        / stepLength_;
01020 }
01021 Word SequenceAdapterWithOverlap::operator[]( size_type j )
01022 {
01023   j *= stepLength_;
01024   const int wordPos( j / wordLength_ ), wordShift( j % wordLength_ );
01025   Word thisWord (0); 
01026   thisWord 
01027     |= ( (*pSeq_)[wordPos]   & maskLeft_[wordShift]  ) 
01028       << ( bitsPerSymbol_ * wordShift );
01029   thisWord
01030     |= ( (*pSeq_)[wordPos+1] & maskRight_[wordShift] ) 
01031       >> ( bitsPerSymbol_ * ( wordLength_ - wordShift ) );
01032     thisWord 
01033     |= (   gCursedWord 
01034            * (    ( ((*pSeq_)[wordPos]  & gCursedWord) != (Word)0 ) 
01035                || ( ((*pSeq_)[wordPos+1]& gCursedWord) != (Word)0 ) ) );
01036   return thisWord;
01037 }
01038 
01039 // Function definitions for NameReader and subclasses
01040 
01041 const char* NameReaderLocal::getSequenceName
01042 ( SequenceNumber seqNum ) const
01043 {
01044   //  assert(seqNum<size());
01045   if (seqNum <= size() )
01046   {
01047     return (*this)[seqNum-1].c_str();
01048   } // ~if
01049   return NULL;
01050 
01051 } // ~const char* NameReaderLocal::getSequenceName
01052 
01053 void NameReaderLocal::getSequenceName
01054 ( string& seqName, SequenceNumber seqNum ) const
01055 {
01056   //  assert(seqNum<size());
01057   if (seqNum <= size() )
01058   {
01059     seqName = (*this)[seqNum-1];
01060     return;
01061   }
01062   seqName = "No sequence name stored for sequence.";
01063 } // ~NameReaderLocal::getSequenceName
01064 
01065 
01066 
01067 const char* NameReaderIndex::getSequenceName
01068 ( SequenceNumber seqNum ) const
01069 {
01070   return reader_.extractName(seqNum);
01071 } // ~NameReaderIndex::getSequenceName
01072 
01073 
01074 void NameReaderIndex::getSequenceName
01075 ( string& seqName, SequenceNumber seqNum ) const
01076 {
01077   seqName=(string)reader_.extractName(seqNum); 
01078 } // ~NameReaderIndex::getSequenceName
01079 
01080 void NameReaderLocal::saveSequenceNames( ostream& nameFile )
01081 {
01082   for ( vector<string>::iterator i = begin() ; 
01083         i != end() ; i ++ )
01084   {
01085     nameFile << *i << endl;
01086   }
01087 } // ~NameReaderLocal::saveSequenceNames( ostream& nameFile )
01088 
01089 void NameReaderLocal::loadSequenceNames( istream& nameFile )
01090 {
01091   while( getline( nameFile, lastName() ) );
01092   pop_back();
01093     //    while( getline( nameFile,*(sequenceNames_.back() ) ) )
01094     //   {
01095     //    sequenceNames_.push_back( new string );
01096     //   } // ~while
01097 
01098     //    sequenceNames_.pop_back();
01099 
01100   //    pNameReader->loadSequenceNames( nameFile );
01101 
01102   //    monitoringStream_ << "Loaded in "  
01103   //          << size() 
01104   //          << " sequence names).\n";
01105 
01106 
01107 } // ~NameReaderLocal::loadSequenceNames( istream& inFile )
01108 
01109 SequenceNumber NameReaderIndex::size( void ) const
01110 {
01111   return reader_.size();
01112 } // ~NameReaderIndex::size( void ) const
01113 
01114 
01115 // End of file HashTableGeneric.cpp
01116 
01117 
01118 
01119 
01120 
01121 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2