HashTable/HashTable.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : HashTable
00025 // File Name    : HashTable.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "HashTable.h"
00034 #include "SequenceReader.h"
00035 // #include "SequenceReaderCodon.h"
00036 #include "GlobalDefinitions.h"
00037 #include <fstream>
00038 #include <algorithm>
00039 #include <map>
00040 
00041 AllocatorLocal<PositionInHitList> HashTable::defaultArrayAllocator;
00042 AllocatorLocal<PositionInDatabase> HashTable::defaultHitListAllocator;
00043 
00044 // ### Function Definitions ###
00045 
00046 
00047 
00048 // Name:
00049 // Arguments:
00050 // TYPE  NAME  IN/OUT COMMENT
00051 // Returns: TYPE COMMENT
00052   //  MatchPointer pM_;
00053   //  void fred
00054   //  ( WordSequence& seq, HitList& hitListFwd )
00055   //   { pM_=&HashTable::matchSequence;
00056   //     return (this->*pM_)(seq, hitListFwd); }
00057 
00058 void HashTable::setNumRepeats( int numRepeats )
00059 {
00060   if ( (numRepeats<0) || (numRepeats>stepLength_) )
00061     throw SSAHAException("Invalid value for numRepeats!!");
00062   numRepeats_=numRepeats;
00063   pMatchSequence_ = ( numRepeats==0 )
00064     ? &HashTable::matchSequenceStandard
00065     : &HashTable::matchSequenceRepeated;
00066 }
00067 
00068   // Function Name: matchSequence
00069   // Arguments: WordSequence& (in), HitList& (out), HitList& (out)
00070   // Returns: void
00071   // This obtains the full list of hits for a sequence in both forward
00072   // and reverse directions. Proceeds as follows:
00073   // 1. The reverse complement of the sequence is formed.
00074   // 2. Any hits found in the forward or reverse direction are added to the
00075   // appropriate list.
00076   // 3. The sequence and reverse complement are left-shifted by 1 base
00077   // Steps 2 and 3 are repeated wordLength_ times.
00078   // NB This function will modify seq. If you want to keep it, make a copy
00079   // before calling this function.
00080   void HashTable::matchSequenceStandard
00081   ( WordSequence& seq, HitList& hitListFwd )
00082   {
00083     
00084     int numBasesInLast( seq.getNumBasesInLast() );
00085 
00086     for ( int i(0) ; i < wordLength_ ; ++i )
00087     {
00088       matchWord( seq,    hitListFwd, i );
00089       shiftSequence( seq, bitsPerSymbol_, wordLength_ );
00090       if ( i == numBasesInLast )
00091       {
00092         seq.pop_back();
00093       } // ~if
00094     } // ~for
00095 
00096   } // ~HashTable::matchSequence
00097 
00098   // Function Name: matchSequence
00099   // Arguments: WordSequence& (in), HitList& (out), HitList& (out), int (in)
00100   // Returns: void
00101   // This obtains the full list of hits for a sequence in both forward
00102   // and reverse directions and masks out tandem repeats.
00103   void HashTable::matchSequenceRepeated
00104   ( WordSequence& seq, 
00105     HitList& hitListFwd )
00106   {
00107     WordSequenceShifted seqShifted(seq, *this);
00108     //    screenRepeats( seqShifted, hitListFwd, numRepeats_ );
00109 
00110     Word                thisWord;
00111 
00112     int m;
00113 
00114     //  cout << "my size: " << size() << endl;
00115 
00116     // i cycles through each full word in the query sequence
00117     for ( int i(0) ; i < seqShifted.size() ; ++i )
00118     {
00119 
00120 
00121       //      cout << "doing  i:" << i << endl;
00122       thisWord = seqShifted[i];
00123       m = 0;
00124 
00125       // look through the next numRepeats_ words for duplicates
00126       for ( int j(i+1) ; 
00127             ( ( j < seqShifted.size() ) && ( j <= i + numRepeats_ ) ); 
00128             ++j )
00129       {
00130         if ( thisWord == seqShifted[j] )
00131         {
00132           m = j - i;
00133           //      cout << "Tandem repeat: " << i << "-" << j << "\n"; // %%%%
00134           break;
00135         } // ~if
00136       } // ~for j
00137       if ( m == 0 )
00138       {
00139         //      cout << "doing bog standard matching for:" << i << endl;
00140         matchWord( thisWord, hitListFwd, i );
00141       } // ~if
00142       else
00143       {
00144           // ... then we have found a tandem repeat of length m
00145           int r(1);
00146 
00147           // scan forward until we reach either the end of the
00148           // repeated region or the end of the sequence
00149           while (    ( seqShifted[i+(r*m)]==thisWord )
00150                   && ( i+(r*m) < seqShifted.size() ) ) ++r;
00151 
00152           //          cout << "Num repeats: " << r << endl;
00153 
00154           // any hits in a run of matching hits that exceed lastRun are
00155           // ignored, because in that case the size of the repeated
00156           // region in the subject sequence exceeds the size of the
00157           // repeated region in the query sequence
00158           int lastRun((r-1)*m);
00159 
00160           //      cout << "size of repeated run: " << lastRun << endl;
00161 
00162           while ( seqShifted[i+lastRun] == seqShifted[i+lastRun-m] ) lastRun++;
00163           lastRun--;
00164 
00165           //      cout << "adjusted size of repeated run: " << lastRun << endl;
00166 
00167           HitListRepeated hits;
00168 
00169           // as we proceed base by base along a region of tandem repeats
00170           // of motif length m, we encounter m distinct hash words, after
00171           // that, they repeat. Now get the hits for each of these words: 
00172           // passing in j tags each hit with its position in the repeat cycle
00173           for ( int j(0) ; j < m ; ++j ) 
00174           {
00175             matchWord( seqShifted[i+j], hits, j);
00176           } // ~for j
00177 
00178           // sort hits in order of RepeatedHit::subjectPos
00179           sort( hits.begin(), hits.end() );
00180 
00181           // lastHit =  previous hit in list, initialized to all zeroes
00182           RepeatedHit lastHit; 
00183           // firstHit = first hit of a matching run, initialized to all zeroes
00184           RepeatedHit firstHit;
00185 
00186           // thisRun = size of current run of matching hits
00187           int thisRun(0);
00188 
00189         
00190           for ( HitListRepeated::iterator thisHit( hits.begin() ); 
00191                 thisHit != hits.end() ; ++thisHit )
00192           {
00193             //      cout << ": " << (*thisHit).subjectPos.sequence << " " 
00194             //      << (*thisHit).subjectPos.offset << " " 
00195             //      << (*thisHit).cyclePos ; 
00196             if (    (    (*thisHit).subjectPos.sequence 
00197                       == lastHit.subjectPos.sequence )
00198                  && (    (*thisHit).subjectPos.offset
00199                       == lastHit.subjectPos.offset + stepLength_ )
00200                  && (    (*thisHit).cyclePos
00201                       == (( lastHit.cyclePos + stepLength_ ) % m)   ) )
00202             {
00203               if ( thisRun == 0 ) 
00204               { 
00205                 //      cout << " -s- ";
00206                 // then a run if matching hits has started
00207                 firstHit = lastHit; 
00208                 thisRun  = wordLength_;
00209               } // ~if
00210               else 
00211               {
00212                 //              cout << " -c- ";
00213                 // we continue an existing run             
00214                 thisRun += stepLength_;
00215               }
00216  
00217 
00218             } // ~if
00219             else 
00220             { 
00221               thisRun=0; 
00222               firstHit = *thisHit;
00223             }
00224             //      cout << " -n- ";
00225             if (thisRun <= lastRun ) 
00226             { 
00227               // only output hits if length of repeated region in subject 
00228               // is less than or equal to that of query
00229               //    cout << "added" << (*thisHit).subjectPos.offset 
00230               //   << "-" << i + firstHit.cyclePos + thisRun;
00231               hitListFwd.addHit( thisHit->subjectPos, 
00232                                    i + firstHit.cyclePos + thisRun ); 
00233             } else // cout << "ignored" << (*thisHit).subjectPos.offset 
00234                    //     << "-" << i + firstHit.cyclePos + thisRun;
00235 
00236        
00237             lastHit = *thisHit;
00238             //      cout << endl;
00239           } // ~for
00240 
00241           // carry on at the end of the repeated region in the query seq
00242           // i += (r-1)*m+1;
00243           // Actually want 1 less cos i is incremented anyway at the
00244           // end of the for loop - TC 5.7.1
00245           //          i += (r-1)*m;
00246           i += lastRun;
00247           // cout << "Carrying on at pos " << i+1 << "\n";
00248           // break out of the `for m' loop
00249           //     break;
00250        
00251 
00252 
00253       } // ~else
00254 
00255     } // ~for i
00256 
00257 
00258 
00259   } // ~HashTable::matchSequenceRepeated
00260 
00261 
00262 
00263 
00264 
00265 void HashTable::countWords( SequenceAdapter& thisSeq )
00266 {
00267 
00268    for ( int j(0) ; j < thisSeq.size() ; ++ j )
00269    {
00270      // only count words that have not been flagged
00271      pWordPositionInHitList_[(thisSeq[j]&(~gCursedWord))]
00272        += ((thisSeq[j]&gCursedWord)==(Word)0);
00273      //   pWordPositionInHitList_[thisSeq[j]]++;
00274    }
00275    
00276 } // ~HashTable::countWords
00277 
00278 void HashTable::hashWords
00279 ( SequenceAdapter& thisSeq, SequenceNumber seqNum )
00280 {
00281 
00282         register Word              thisWord;
00283         register PositionInHitList currentPos;
00284       // NB We stop at the last but one element of the 
00285       // sequence (as the last isn't a full word)
00286 
00287    for ( int j(0) ; j < thisSeq.size() ; ++ j )
00288       {
00289         thisWord = thisSeq[j];
00290         // only hash words that have not been flagged
00291         if ((thisWord&gCursedWord)!=(Word)0) continue;
00292         currentPos 
00293         = pHitsFoundSoFar_[thisWord]
00294           +( ( thisWord == 0 ) 
00295              ? 0 : pWordPositionInHitList_[thisWord - 1]) ;
00296                            
00297         if ( currentPos != pWordPositionInHitList_[thisWord] )
00298         { // then place position in the hit list
00299           pHitListForAllWords_[currentPos].sequence 
00300           = seqNum; 
00301           pHitListForAllWords_[currentPos].offset   
00302           = j * stepLength_; 
00303           //      DEBUG_L2("list "<< printWord(thisWord,wordLength_) 
00304           //  << " "<< seqNum << " " << j*stepLength_ );
00305           pHitsFoundSoFar_[thisWord]++;
00306         } // ~if
00307 
00308       } // ~ for thisWord
00309 
00310 
00311 } // ~HashTable::hashWords
00312   
00313 
00314 // End of file HashTable.cpp
00315 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2