00001 00002 // ####################################################################### 00003 00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm 00005 // Version 3.2, released 1st March 2004 00006 // Copyright (c) Genome Research 2002 00007 00008 // SSAHA is free software; you can redistribute it and/or modify 00009 // it under the terms of version 2 of the GNU General Public Licence 00010 // as published by the Free Software Foundation. 00011 00012 // This program is distributed in the hope that it will be useful, 00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 // GNU General Public Licence for more details. 00016 00017 // You should have received a copy of the GNU General Public Licence 00018 // along with this program; if not, write to the Free Software 00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt 00021 00022 // ####################################################################### 00023 00024 // Module Name : HashTable 00025 // File Name : HashTable.cpp 00026 // Language : C++ 00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk) 00028 00029 // Description: 00030 00031 // Includes: 00032 00033 #include "HashTable.h" 00034 #include "SequenceReader.h" 00035 // #include "SequenceReaderCodon.h" 00036 #include "GlobalDefinitions.h" 00037 #include <fstream> 00038 #include <algorithm> 00039 #include <map> 00040 00041 AllocatorLocal<PositionInHitList> HashTable::defaultArrayAllocator; 00042 AllocatorLocal<PositionInDatabase> HashTable::defaultHitListAllocator; 00043 00044 // ### Function Definitions ### 00045 00046 00047 00048 // Name: 00049 // Arguments: 00050 // TYPE NAME IN/OUT COMMENT 00051 // Returns: TYPE COMMENT 00052 // MatchPointer pM_; 00053 // void fred 00054 // ( WordSequence& seq, HitList& hitListFwd ) 00055 // { pM_=&HashTable::matchSequence; 00056 // return (this->*pM_)(seq, hitListFwd); } 00057 00058 void HashTable::setNumRepeats( int numRepeats ) 00059 { 00060 if ( (numRepeats<0) || (numRepeats>stepLength_) ) 00061 throw SSAHAException("Invalid value for numRepeats!!"); 00062 numRepeats_=numRepeats; 00063 pMatchSequence_ = ( numRepeats==0 ) 00064 ? &HashTable::matchSequenceStandard 00065 : &HashTable::matchSequenceRepeated; 00066 } 00067 00068 // Function Name: matchSequence 00069 // Arguments: WordSequence& (in), HitList& (out), HitList& (out) 00070 // Returns: void 00071 // This obtains the full list of hits for a sequence in both forward 00072 // and reverse directions. Proceeds as follows: 00073 // 1. The reverse complement of the sequence is formed. 00074 // 2. Any hits found in the forward or reverse direction are added to the 00075 // appropriate list. 00076 // 3. The sequence and reverse complement are left-shifted by 1 base 00077 // Steps 2 and 3 are repeated wordLength_ times. 00078 // NB This function will modify seq. If you want to keep it, make a copy 00079 // before calling this function. 00080 void HashTable::matchSequenceStandard 00081 ( WordSequence& seq, HitList& hitListFwd ) 00082 { 00083 00084 int numBasesInLast( seq.getNumBasesInLast() ); 00085 00086 for ( int i(0) ; i < wordLength_ ; ++i ) 00087 { 00088 matchWord( seq, hitListFwd, i ); 00089 shiftSequence( seq, bitsPerSymbol_, wordLength_ ); 00090 if ( i == numBasesInLast ) 00091 { 00092 seq.pop_back(); 00093 } // ~if 00094 } // ~for 00095 00096 } // ~HashTable::matchSequence 00097 00098 // Function Name: matchSequence 00099 // Arguments: WordSequence& (in), HitList& (out), HitList& (out), int (in) 00100 // Returns: void 00101 // This obtains the full list of hits for a sequence in both forward 00102 // and reverse directions and masks out tandem repeats. 00103 void HashTable::matchSequenceRepeated 00104 ( WordSequence& seq, 00105 HitList& hitListFwd ) 00106 { 00107 WordSequenceShifted seqShifted(seq, *this); 00108 // screenRepeats( seqShifted, hitListFwd, numRepeats_ ); 00109 00110 Word thisWord; 00111 00112 int m; 00113 00114 // cout << "my size: " << size() << endl; 00115 00116 // i cycles through each full word in the query sequence 00117 for ( int i(0) ; i < seqShifted.size() ; ++i ) 00118 { 00119 00120 00121 // cout << "doing i:" << i << endl; 00122 thisWord = seqShifted[i]; 00123 m = 0; 00124 00125 // look through the next numRepeats_ words for duplicates 00126 for ( int j(i+1) ; 00127 ( ( j < seqShifted.size() ) && ( j <= i + numRepeats_ ) ); 00128 ++j ) 00129 { 00130 if ( thisWord == seqShifted[j] ) 00131 { 00132 m = j - i; 00133 // cout << "Tandem repeat: " << i << "-" << j << "\n"; // %%%% 00134 break; 00135 } // ~if 00136 } // ~for j 00137 if ( m == 0 ) 00138 { 00139 // cout << "doing bog standard matching for:" << i << endl; 00140 matchWord( thisWord, hitListFwd, i ); 00141 } // ~if 00142 else 00143 { 00144 // ... then we have found a tandem repeat of length m 00145 int r(1); 00146 00147 // scan forward until we reach either the end of the 00148 // repeated region or the end of the sequence 00149 while ( ( seqShifted[i+(r*m)]==thisWord ) 00150 && ( i+(r*m) < seqShifted.size() ) ) ++r; 00151 00152 // cout << "Num repeats: " << r << endl; 00153 00154 // any hits in a run of matching hits that exceed lastRun are 00155 // ignored, because in that case the size of the repeated 00156 // region in the subject sequence exceeds the size of the 00157 // repeated region in the query sequence 00158 int lastRun((r-1)*m); 00159 00160 // cout << "size of repeated run: " << lastRun << endl; 00161 00162 while ( seqShifted[i+lastRun] == seqShifted[i+lastRun-m] ) lastRun++; 00163 lastRun--; 00164 00165 // cout << "adjusted size of repeated run: " << lastRun << endl; 00166 00167 HitListRepeated hits; 00168 00169 // as we proceed base by base along a region of tandem repeats 00170 // of motif length m, we encounter m distinct hash words, after 00171 // that, they repeat. Now get the hits for each of these words: 00172 // passing in j tags each hit with its position in the repeat cycle 00173 for ( int j(0) ; j < m ; ++j ) 00174 { 00175 matchWord( seqShifted[i+j], hits, j); 00176 } // ~for j 00177 00178 // sort hits in order of RepeatedHit::subjectPos 00179 sort( hits.begin(), hits.end() ); 00180 00181 // lastHit = previous hit in list, initialized to all zeroes 00182 RepeatedHit lastHit; 00183 // firstHit = first hit of a matching run, initialized to all zeroes 00184 RepeatedHit firstHit; 00185 00186 // thisRun = size of current run of matching hits 00187 int thisRun(0); 00188 00189 00190 for ( HitListRepeated::iterator thisHit( hits.begin() ); 00191 thisHit != hits.end() ; ++thisHit ) 00192 { 00193 // cout << ": " << (*thisHit).subjectPos.sequence << " " 00194 // << (*thisHit).subjectPos.offset << " " 00195 // << (*thisHit).cyclePos ; 00196 if ( ( (*thisHit).subjectPos.sequence 00197 == lastHit.subjectPos.sequence ) 00198 && ( (*thisHit).subjectPos.offset 00199 == lastHit.subjectPos.offset + stepLength_ ) 00200 && ( (*thisHit).cyclePos 00201 == (( lastHit.cyclePos + stepLength_ ) % m) ) ) 00202 { 00203 if ( thisRun == 0 ) 00204 { 00205 // cout << " -s- "; 00206 // then a run if matching hits has started 00207 firstHit = lastHit; 00208 thisRun = wordLength_; 00209 } // ~if 00210 else 00211 { 00212 // cout << " -c- "; 00213 // we continue an existing run 00214 thisRun += stepLength_; 00215 } 00216 00217 00218 } // ~if 00219 else 00220 { 00221 thisRun=0; 00222 firstHit = *thisHit; 00223 } 00224 // cout << " -n- "; 00225 if (thisRun <= lastRun ) 00226 { 00227 // only output hits if length of repeated region in subject 00228 // is less than or equal to that of query 00229 // cout << "added" << (*thisHit).subjectPos.offset 00230 // << "-" << i + firstHit.cyclePos + thisRun; 00231 hitListFwd.addHit( thisHit->subjectPos, 00232 i + firstHit.cyclePos + thisRun ); 00233 } else // cout << "ignored" << (*thisHit).subjectPos.offset 00234 // << "-" << i + firstHit.cyclePos + thisRun; 00235 00236 00237 lastHit = *thisHit; 00238 // cout << endl; 00239 } // ~for 00240 00241 // carry on at the end of the repeated region in the query seq 00242 // i += (r-1)*m+1; 00243 // Actually want 1 less cos i is incremented anyway at the 00244 // end of the for loop - TC 5.7.1 00245 // i += (r-1)*m; 00246 i += lastRun; 00247 // cout << "Carrying on at pos " << i+1 << "\n"; 00248 // break out of the `for m' loop 00249 // break; 00250 00251 00252 00253 } // ~else 00254 00255 } // ~for i 00256 00257 00258 00259 } // ~HashTable::matchSequenceRepeated 00260 00261 00262 00263 00264 00265 void HashTable::countWords( SequenceAdapter& thisSeq ) 00266 { 00267 00268 for ( int j(0) ; j < thisSeq.size() ; ++ j ) 00269 { 00270 // only count words that have not been flagged 00271 pWordPositionInHitList_[(thisSeq[j]&(~gCursedWord))] 00272 += ((thisSeq[j]&gCursedWord)==(Word)0); 00273 // pWordPositionInHitList_[thisSeq[j]]++; 00274 } 00275 00276 } // ~HashTable::countWords 00277 00278 void HashTable::hashWords 00279 ( SequenceAdapter& thisSeq, SequenceNumber seqNum ) 00280 { 00281 00282 register Word thisWord; 00283 register PositionInHitList currentPos; 00284 // NB We stop at the last but one element of the 00285 // sequence (as the last isn't a full word) 00286 00287 for ( int j(0) ; j < thisSeq.size() ; ++ j ) 00288 { 00289 thisWord = thisSeq[j]; 00290 // only hash words that have not been flagged 00291 if ((thisWord&gCursedWord)!=(Word)0) continue; 00292 currentPos 00293 = pHitsFoundSoFar_[thisWord] 00294 +( ( thisWord == 0 ) 00295 ? 0 : pWordPositionInHitList_[thisWord - 1]) ; 00296 00297 if ( currentPos != pWordPositionInHitList_[thisWord] ) 00298 { // then place position in the hit list 00299 pHitListForAllWords_[currentPos].sequence 00300 = seqNum; 00301 pHitListForAllWords_[currentPos].offset 00302 = j * stepLength_; 00303 // DEBUG_L2("list "<< printWord(thisWord,wordLength_) 00304 // << " "<< seqNum << " " << j*stepLength_ ); 00305 pHitsFoundSoFar_[thisWord]++; 00306 } // ~if 00307 00308 } // ~ for thisWord 00309 00310 00311 } // ~HashTable::hashWords 00312 00313 00314 // End of file HashTable.cpp 00315
1.5.2