HashTable/HashTablePacked.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : HashTablePacked
00025 // File Name    : HashTablePacked.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "HashTablePacked.h"
00034 #include "SequenceReader.h"
00035 // #include "SequenceReaderCodon.h"
00036 #include "GlobalDefinitions.h"
00037 #include <fstream>
00038 #include <algorithm>
00039 #include <map>
00040 #include <cmath> // for `pow'
00041 
00042 AllocatorLocal<PositionInHitList> HashTablePacked::defaultArrayAllocator;
00043 AllocatorLocal<PositionPacked> HashTablePacked::defaultHitListAllocator;
00044 
00045 // ### Function Definitions ###
00046 
00047 // Name:
00048 // Arguments:
00049 // TYPE  NAME  IN/OUT COMMENT
00050 // Returns: TYPE COMMENT
00051 
00052 void HashTablePacked::setNumRepeats( int numRepeats )
00053 {
00054   if ( (numRepeats<0) || (numRepeats>stepLength_) )
00055     throw SSAHAException("Invalid value for numRepeats!!");
00056   numRepeats_=numRepeats;
00057   pMatchSequence_ = ( numRepeats==0 )
00058     ? &HashTablePacked::matchSequenceStandard
00059     : &HashTablePacked::matchSequenceRepeated;
00060 } // ~HashTablePacked::setNumRepeats( int numRepeats )
00061 
00062 void HashTablePacked::setSubstituteThreshold( int numSubs ) 
00063 {
00064   if ( (numSubs<0) || (numSubs>maxNumHits_) )
00065     throw SSAHAException("Invalid value for substituteThreshold!!");
00066   substituteThreshold_=numSubs;
00067   pMatchWord_ = ( numSubs==0 )
00068     ? &HashTablePacked::matchWordStandard
00069     : &HashTablePacked::matchWordSubstitute;
00070 } // ~HashTablePacked::setSubstituteThreshold( int numSubs ) 
00071 
00072 void HashTablePacked::matchWordSubstitute
00073 ( Word w, PackedHitStore& hitList, int offset )
00074 {
00075   if ((w&gCursedWord)!=(Word)0) return;
00076 
00077   HashTableView<PositionPacked,HashTablePacked>::matchWord
00078     ( w, hitList, offset );
00079   assert((w&gCursedWord)==0);
00080   if (size(w)>substituteThreshold_) return;
00081   vector<Word> neighbours;
00082   (*pGenerateSubstitutes_)( w, neighbours, wordLength_);
00083   for( vector<Word>::iterator i(neighbours.begin());
00084        i!=neighbours.end(); i++)
00085   {
00086     HashTableView<PositionPacked,HashTablePacked>::matchWord
00087       ( *i, hitList, offset );
00088   } // ~for i
00089 } // void HashTablePacked::matchWordSubstitute
00090 
00091 // Given a DNA word, generates all single base purine-purine (A-G) and
00092 // pyrimidine-pyrimidine (C-T) substitutes. 
00093 // A = 00, A XOR 10 = 10 = G
00094 // C = 01, C XOR 10 = 11 = T
00095 // G = 10, G XOR 10 = 00 = A
00096 // T = 11, T XOR 10 = 01 = C
00097 void generateSubstitutesDNA
00098 (Word w, vector<Word>& subs, int wordLength)
00099 {
00100   Word mask(2);
00101   for (int i(0) ; i < wordLength ; i++, mask<<=gBaseBits)
00102   {
00103     subs.push_back(w^mask);
00104   } // ~for
00105 } // ~generateSubstitutesDNA
00106 
00107 // Given a protein word, generates all single residue substitutes having a
00108 // positive BLOSUM62 score.
00109 void generateSubstitutesProtein
00110 (Word w, vector<Word>& subs, int wordLength)
00111 {
00112   Word mask((1<<gResidueBits)-1), thisCodon, thisWord;
00113   for (int i(0) ; i < wordLength ; i++, mask<<=gResidueBits)
00114   {
00115     thisCodon=((w&mask)>>(i*gResidueBits));
00116     thisWord= (w&(~mask));
00117     //    cout << ~mask << " " << printResidue( thisWord, wordLength )
00118     // << " " << printResidue( thisCodon, wordLength) << endl;
00119     for (int j(subStarts[thisCodon]); j != subStarts[thisCodon+1]; j++ )
00120     {
00121       subs.push_back( thisWord | (subVals[j]<<(i*gResidueBits)) );
00122       //  cout << "generating sub: " << printResidue(subs.back(),wordLength)
00123       //  << endl;
00124     } // ~for j
00125   } // ~for i
00126 } // ~generateSubstitutesProtein
00127 
00128 
00129 
00130 
00131 
00132 void HashTablePacked::countWords( SequenceAdapter& thisSeq )
00133 {
00134 
00135    for ( int j(0) ; j < thisSeq.size() ; ++ j )
00136    {
00137      // only count words that have not been flagged
00138      pWordPositionInHitList_[(thisSeq[j]&(~gCursedWord))]
00139        += ((thisSeq[j]&gCursedWord)==(Word)0);
00140      //     pWordPositionInHitList_[thisSeq[j]]++;
00141    }   
00142    //   cout << endl;
00143 } // ~HashTable::countWords
00144 
00145 void HashTablePacked::hashWords
00146 ( SequenceAdapter& thisSeq, SequenceNumber seqNum )
00147 {
00148 
00149   register Word              thisWord;
00150   register PositionInHitList currentPos;
00151   // NB We stop at the last but one element of the 
00152   // sequence (as the last isn't a full word)
00153 
00154    for ( int j(0) ; j < thisSeq.size() ; ++ j )
00155       {
00156         thisWord = thisSeq[j];
00157 
00158         // only hash words that have not been flagged
00159         if ((thisWord&gCursedWord)==(Word)0)
00160         {
00161 
00162           currentPos 
00163             = pHitsFoundSoFar_[thisWord]
00164             +( ( thisWord == 0 ) 
00165                ? 0 : pWordPositionInHitList_[thisWord - 1]) ;
00166                            
00167           if ( currentPos != pWordPositionInHitList_[thisWord] )
00168           { // then place position in the hit list
00169             pHitListForAllWords_[currentPos] = wordNum_;
00170             pHitsFoundSoFar_[thisWord]++;
00171             // next line moved: wordNum still needs incrementing even if
00172             // word is flagged
00173           //      wordNum_++;
00174           } // ~if
00175           else assert(1==0);
00176         } // ~if
00177 
00178         wordNum_++;
00179 
00180       } // ~ for thisWord
00181 
00182    seqStarts_.push_back(wordNum_);
00183 
00184 } // ~HashTable::hashWords
00185 
00186 void HashTablePacked::matchSequenceStandard
00187 ( WordSequence& seq, HitList& hitListFwd )
00188 {
00189 
00190   PackedHitStore packedHits;
00191   int numBasesInLast( seq.getNumBasesInLast() ), baseOffset;
00192 
00193   for ( int i(0) ; i < wordLength_ ; ++i )
00194   {
00195     //  matchWord( seq,    packedHits, i );
00196     if (seq.size()!=0)
00197     {
00198       baseOffset=i;
00199       WordSequence::const_iterator last(&seq.back());
00200       for ( WordSequence::const_iterator thisWord(seq.begin());
00201             thisWord != last ; ++thisWord )
00202       {
00203         int oldSize(packedHits.size()); // %%%%%%
00204         matchWordDeluxe( *thisWord, packedHits, baseOffset );
00205         //      cout << printResidue(*thisWord, wordLength_) << " "
00206         //  << packedHits.size()-oldSize;
00207         //      for (int fk(oldSize);fk!=packedHits.size();fk++) cout << " " << packedHits[fk].first;
00208         //      cout << endl;
00209         baseOffset += wordLength_;
00210       } // ~for
00211     } // ~if
00212     shiftSequence( seq, bitsPerSymbol_, wordLength_ );
00213     if ( i == numBasesInLast )
00214     {
00215       seq.pop_back();
00216     } // ~if
00217   } // ~for
00218 
00219   convertHits(packedHits,hitListFwd);
00220   
00221 } // ~HashTablePacked::matchSequenceStandard
00222 
00223 
00224 void HashTablePacked::convertHits
00225 ( PackedHitStore& packedHits, HitList& hitListFwd )
00226 {
00227 
00228   //  sort(packedHits.begin(),packedHits.end());
00229   sorter_(packedHits);
00230 
00231 
00232   vector<HitPacked>::iterator i(packedHits.begin());
00233   vector<SeqStartPos>::iterator ub, j(seqStarts_.begin());
00234   HitListVector::size_type sortStart;
00235   //  hitListFwd.reserve(hitListFwd.size()+packedHits.size());
00236 
00237   // NB seqStarts_ must not be empty else call to back() segfaults
00238 
00239   while 
00240   (    (j!=static_cast<vector<SeqStartPos>::iterator>(&seqStarts_.back()))
00241     && (i!=packedHits.end()) )
00242   {
00243     ub = upper_bound(j,seqStarts_.end(),i->first);
00244     j=ub; j--;
00245     sortStart=hitListFwd.size();
00246     while ((i!=packedHits.end())&&(i->first<*ub))
00247     {
00248       //      cout << i->first << " ... " 
00249       //   << ub-seqStarts_.begin() << " " 
00250       //   << stepLength_*(i->first - *j) 
00251       //   << endl;
00252       hitListFwd.addHit( ub-seqStarts_.begin(), 
00253                          stepLength_*(i->first - *j),
00254                          i->second );
00255       i++;
00256       
00257     } // ~while
00258     sort
00259     ( static_cast<HitList::iterator>(&hitListFwd[sortStart]),
00260       hitListFwd.end(),
00261       LessThanDiff() );
00262     //    sort(hitListFwd.begin()+sortStart,hitListFwd.end(),LessThanDiff() );
00263     j=ub;  
00264   } // ~while
00265 
00266 } // ~HashTablePacked::convertHits
00267 
00268 void HashTablePacked::matchSequenceRepeated
00269 ( WordSequence& seq, HitList& hitListFwd )
00270 {
00271     WordSequenceShifted seqShifted(seq, *this);
00272     //    screenRepeats( seqShifted, hitListFwd, numRepeats_ );
00273 
00274     PackedHitStore nonRepeatedHits;
00275 
00276     Word                thisWord;
00277 
00278     int m;
00279 
00280     //  cout << "my size: " << size() << endl;
00281 
00282     // i cycles through each full word in the query sequence
00283     for ( int i(0) ; i < seqShifted.size() ; ++i )
00284     {
00285 
00286 
00287       //      cout << "doing  i:" << i << endl;
00288       thisWord = seqShifted[i];
00289       m = 0;
00290 
00291       // look through the next numRepeats_ words for duplicates
00292       for ( int j(i+1) ; 
00293             ( ( j < seqShifted.size() ) && ( j <= i + numRepeats_ ) ); 
00294             ++j )
00295       {
00296         if ( thisWord == seqShifted[j] )
00297         {
00298           m = j - i;
00299           //      cout << "Tandem repeat: " << i << "-" << j << "\n"; // %%%%
00300           break;
00301         } // ~if
00302       } // ~for j
00303       if ( m == 0 )
00304       {
00305         //      cout << "doing bog standard matching for:" << i << endl;
00306         matchWordDeluxe( thisWord, nonRepeatedHits, i );
00307       } // ~if
00308       else
00309       {
00310           // ... then we have found a tandem repeat of length m
00311           int r(1);
00312 
00313           // scan forward until we reach either the end of the
00314           // repeated region or the end of the sequence
00315           while (    ( seqShifted[i+(r*m)]==thisWord )
00316                   && ( i+(r*m) < seqShifted.size() ) ) ++r;
00317 
00318           //          cout << "Num repeats: " << r << endl;
00319 
00320           // any hits in a run of matching hits that exceed lastRun are
00321           // ignored, because in that case the size of the repeated
00322           // region in the subject sequence exceeds the size of the
00323           // repeated region in the query sequence
00324           int lastRun((r-1)*m);
00325 
00326           //      cout << "size of repeated run: " << lastRun << endl;
00327 
00328           while ( seqShifted[i+lastRun] == seqShifted[i+lastRun-m] ) lastRun++;
00329           lastRun--;
00330 
00331           //      cout << "adjusted size of repeated run: " << lastRun << endl;
00332 
00333           //          HitListRepeated hits;
00334           PackedHitStore packedHits;
00335           HitListRepeated hits; 
00336 
00337           // as we proceed base by base along a region of tandem repeats
00338           // of motif length m, we encounter m distinct hash words, after
00339           // that, they repeat. Now get the hits for each of these words: 
00340           // passing in j tags each hit with its position in the repeat cycle
00341           for ( int j(0) ; j < m ; ++j ) 
00342           {
00343             matchWord( seqShifted[i+j], packedHits, j);
00344           } // ~for j
00345 
00346     
00347           sort(packedHits.begin(),packedHits.end());
00348 
00349           vector<HitPacked>::iterator pThisHit(packedHits.begin());
00350           vector<SeqStartPos>::iterator ub, j(seqStarts_.begin());
00351           //  hitListFwd.reserve(hitListFwd.size()+packedHits.size());
00352           
00353           // NB seqStarts_ must not be empty else call to back() segfaults
00354 
00355           // lastHit =  previous hit in list, initialized to all zeroes
00356           HitPacked lastHit; 
00357           // firstHit = first hit of a matching run, initialized to all zeroes
00358           HitPacked firstHit;
00359 
00360           // thisRun = size of current run of matching hits
00361           int thisRun(0);
00362 
00363           while 
00364           (    ( j!=static_cast<vector<SeqStartPos>::iterator>
00365                       ( &seqStarts_.back() ) ) 
00366             && ( pThisHit!=packedHits.end() ) )
00367           {
00368             ub = upper_bound(j,seqStarts_.end(),pThisHit->first);
00369             j=ub; j--;
00370             lastHit=*pThisHit; 
00371             // ensures if condition always false first time through while
00372             while ((pThisHit!=packedHits.end())&&(pThisHit->first<*ub))
00373             {
00374               if (    (    pThisHit->first
00375                            == lastHit.first+1 )
00376                       && (    pThisHit->second
00377                               == (( lastHit.second + stepLength_ ) % m)   ) )
00378               {
00379                 if ( thisRun == 0 ) 
00380                 { 
00381                   //    cout << " -s- ";
00382                   // then a run of matching hits has started
00383                   firstHit = lastHit; 
00384                   thisRun  = wordLength_;
00385                 } // ~if
00386                 else 
00387                 {
00388                   //            cout << " -c- ";
00389                   // we continue an existing run             
00390                   thisRun += stepLength_;
00391                 } // ~else
00392  
00393 
00394               } // ~if
00395               else 
00396               { 
00397                 thisRun=0; 
00398                 firstHit = *pThisHit;
00399               }
00400               //            cout << " -n- ";
00401               if (thisRun <= lastRun ) 
00402               { 
00403                 // only output hits if length of repeated region in subject 
00404                 // is less than or equal to that of query
00405                 //    cout << "added" << (*thisHit).subjectPos.offset 
00406                 //   << "-" << i + firstHit.cyclePos + thisRun;
00407                 //            hitListFwd.addHit( thisHit->subjectPos, 
00408                 //                         i + firstHit.cyclePos + thisRun ); 
00409 
00410                 nonRepeatedHits.push_back
00411                   ( HitPacked( pThisHit->first, 
00412                                i + firstHit.second + thisRun ) ); 
00413               } // ~if
00414               else 
00415               {
00416                 lastHit = *pThisHit;
00417               } // ~else
00418               pThisHit++;
00419             } // ~while
00420             j=ub;  
00421           } // ~while
00422           i += lastRun;
00423       } // ~else
00424     } // ~for i
00425     convertHits( nonRepeatedHits, hitListFwd );
00426     
00427 } // ~HashTablePacked::matchSequenceRepeated
00428 
00429 
00430 
00431 
00432 void HashTablePacked::loadHitList( unsigned long size )
00433 {
00434   
00435   string startFileName(name_+(string)".start");
00436 
00437   ifstream startFile(startFileName.c_str());
00438   if (startFile.fail())
00439   {
00440     monitoringStream_ << "Could not open " << startFileName << endl;
00441     throw SSAHAException("Could not open .start file");
00442   } // ~if
00443 
00444   startFile.seekg(0,ios::end);
00445   long startFileSize = startFile.tellg();
00446 
00447   if ( getNumSequences() + 1 != startFileSize/sizeof(SeqStartPos))
00448   {
00449     seqStarts_.resize(startFileSize/sizeof(SeqStartPos));
00450     monitoringStream_ 
00451       << "Info: expecting " << seqStarts_.size()-1 << " sequences in file"
00452       << endl;
00453   } // ~if
00454   else seqStarts_.resize(getNumSequences()+1);
00455 
00456   loadFromFile(  startFileName,
00457                  (char*) &seqStarts_[0],
00458                  startFileSize  );
00459 
00460   HashTableView<PositionPacked,HashTablePacked>::loadHitList(size);
00461 
00462 }
00463 
00464 
00465 
00466 void HashTablePacked::saveHitList( void )
00467 {
00468 
00469   //  assert(seqStarts_.size()==getNumSequences()+1);
00470   // assertion removed, not true when part of a HashTableTranslated
00471 
00472   saveToFile(  name_+(string)".start",
00473                (char*) &seqStarts_[0],
00474                (seqStarts_.size())*sizeof(SeqStartPos)  );
00475 
00476   HashTableView<PositionPacked,HashTablePacked>::saveHitList();
00477 } // ~HashTablePacked::saveHitList( void )
00478 
00479 
00480 // RadixSorter member function definitions
00481 RadixSorter::RadixSorter
00482 ( const unsigned int digits, const unsigned int bits ) :
00483 digits_(digits), bits_(bits), 
00484 base_( (unsigned long)pow( (double)2, (int)bits ) ), 
00485 mask_( base_ - 1 ),
00486 counts_(vector< vector<CountInt> >(digits, vector<CountInt>(base_) ) ),
00487 places_(base_)
00488 {} // RadixSorter::RadixSorter
00489 
00490 
00491 
00492 void RadixSorter::operator()( vector<HitPacked>& v )
00493 {
00494 
00495   CountDigits(v);
00496 
00497   if (digits_&1) 
00498   { 
00499     v1_=v;
00500     source_ = &v1_; target_ = &v;
00501   } // ~if
00502   else 
00503   { 
00504     v1_.assign(v.size(),zeroHit);
00505     source_ = &v; target_ = &v1_;
00506   } // ~else
00507 
00508   vector<HitPacked>* temp;
00509 
00510   for ( unsigned int i(0) ; i < digits_ ; ++i ) 
00511   {
00512     SortByDigit( i );
00513     temp = source_;
00514     source_=target_;
00515     target_=temp;
00516   } // ~for
00517 } // ~void RadixSorter::operator()( vector<HitPacked>& v )
00518 
00519 
00520 void RadixSorter::CountDigits( const vector<HitPacked>& v )
00521 {
00522   vector< vector<CountInt> >::iterator pv (counts_.begin()); 
00523 
00524   for (; pv != counts_.end() ; ++pv ) pv->assign(base_,0);
00525 
00526   PositionPacked temp;
00527 
00528   for ( vector<HitPacked>::const_iterator 
00529           j(v.begin()); 
00530           j != v.end() ; ++j )
00531   {
00532     temp = j->first;
00533     for ( pv = counts_.begin() ; pv != counts_.end() ; ++pv )
00534     {
00535       (*pv)[ temp & mask_ ]++; 
00536       temp >>= bits_;
00537     } // ~for pv
00538 
00539   } // ~for j
00540 
00541 } // ~RadixSorter::CountDigits( const vector<Uint>& v )
00542 
00543 void RadixSorter::SortByDigit( PositionPacked digit )
00544 {
00545   unsigned int shift( digit * bits_ );
00546   CountInt pos(0);
00547   vector<CountInt>::iterator count( counts_[digit].begin() ); 
00548 
00549   for ( vector< vector<HitPacked>::iterator >::iterator 
00550           place( places_.begin() );
00551         place != places_.end(); 
00552         ++place, ++count )
00553   {
00554     //    *place = &((*target_)[pos]);
00555     *place = static_cast<vector<HitPacked>::iterator>(&(*target_)[pos]);
00556     //    *place = target_->begin()+pos;
00557     pos += *count;
00558   } // ~for
00559 
00560   for ( vector<HitPacked>::iterator i( (source_)->begin() ) ; 
00561         i!= (source_)->end() ; ++i )
00562   {
00563     *places_[ ( i->first >> shift) & mask_ ]++ = *i;
00564   } // ~for
00565 } // ~void RadixSorter::SortByDigit( PositionPacked digit )
00566 
00567 
00568 // End of file HashTablePacked.cpp
00569 
00570 
00571 
00572 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2