00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "HashTablePacked.h"
00034 #include "SequenceReader.h"
00035
00036 #include "GlobalDefinitions.h"
00037 #include <fstream>
00038 #include <algorithm>
00039 #include <map>
00040 #include <cmath>
00041
00042 AllocatorLocal<PositionInHitList> HashTablePacked::defaultArrayAllocator;
00043 AllocatorLocal<PositionPacked> HashTablePacked::defaultHitListAllocator;
00044
00045
00046
00047
00048
00049
00050
00051
00052 void HashTablePacked::setNumRepeats( int numRepeats )
00053 {
00054 if ( (numRepeats<0) || (numRepeats>stepLength_) )
00055 throw SSAHAException("Invalid value for numRepeats!!");
00056 numRepeats_=numRepeats;
00057 pMatchSequence_ = ( numRepeats==0 )
00058 ? &HashTablePacked::matchSequenceStandard
00059 : &HashTablePacked::matchSequenceRepeated;
00060 }
00061
00062 void HashTablePacked::setSubstituteThreshold( int numSubs )
00063 {
00064 if ( (numSubs<0) || (numSubs>maxNumHits_) )
00065 throw SSAHAException("Invalid value for substituteThreshold!!");
00066 substituteThreshold_=numSubs;
00067 pMatchWord_ = ( numSubs==0 )
00068 ? &HashTablePacked::matchWordStandard
00069 : &HashTablePacked::matchWordSubstitute;
00070 }
00071
00072 void HashTablePacked::matchWordSubstitute
00073 ( Word w, PackedHitStore& hitList, int offset )
00074 {
00075 if ((w&gCursedWord)!=(Word)0) return;
00076
00077 HashTableView<PositionPacked,HashTablePacked>::matchWord
00078 ( w, hitList, offset );
00079 assert((w&gCursedWord)==0);
00080 if (size(w)>substituteThreshold_) return;
00081 vector<Word> neighbours;
00082 (*pGenerateSubstitutes_)( w, neighbours, wordLength_);
00083 for( vector<Word>::iterator i(neighbours.begin());
00084 i!=neighbours.end(); i++)
00085 {
00086 HashTableView<PositionPacked,HashTablePacked>::matchWord
00087 ( *i, hitList, offset );
00088 }
00089 }
00090
00091
00092
00093
00094
00095
00096
00097 void generateSubstitutesDNA
00098 (Word w, vector<Word>& subs, int wordLength)
00099 {
00100 Word mask(2);
00101 for (int i(0) ; i < wordLength ; i++, mask<<=gBaseBits)
00102 {
00103 subs.push_back(w^mask);
00104 }
00105 }
00106
00107
00108
00109 void generateSubstitutesProtein
00110 (Word w, vector<Word>& subs, int wordLength)
00111 {
00112 Word mask((1<<gResidueBits)-1), thisCodon, thisWord;
00113 for (int i(0) ; i < wordLength ; i++, mask<<=gResidueBits)
00114 {
00115 thisCodon=((w&mask)>>(i*gResidueBits));
00116 thisWord= (w&(~mask));
00117
00118
00119 for (int j(subStarts[thisCodon]); j != subStarts[thisCodon+1]; j++ )
00120 {
00121 subs.push_back( thisWord | (subVals[j]<<(i*gResidueBits)) );
00122
00123
00124 }
00125 }
00126 }
00127
00128
00129
00130
00131
00132 void HashTablePacked::countWords( SequenceAdapter& thisSeq )
00133 {
00134
00135 for ( int j(0) ; j < thisSeq.size() ; ++ j )
00136 {
00137
00138 pWordPositionInHitList_[(thisSeq[j]&(~gCursedWord))]
00139 += ((thisSeq[j]&gCursedWord)==(Word)0);
00140
00141 }
00142
00143 }
00144
00145 void HashTablePacked::hashWords
00146 ( SequenceAdapter& thisSeq, SequenceNumber seqNum )
00147 {
00148
00149 register Word thisWord;
00150 register PositionInHitList currentPos;
00151
00152
00153
00154 for ( int j(0) ; j < thisSeq.size() ; ++ j )
00155 {
00156 thisWord = thisSeq[j];
00157
00158
00159 if ((thisWord&gCursedWord)==(Word)0)
00160 {
00161
00162 currentPos
00163 = pHitsFoundSoFar_[thisWord]
00164 +( ( thisWord == 0 )
00165 ? 0 : pWordPositionInHitList_[thisWord - 1]) ;
00166
00167 if ( currentPos != pWordPositionInHitList_[thisWord] )
00168 {
00169 pHitListForAllWords_[currentPos] = wordNum_;
00170 pHitsFoundSoFar_[thisWord]++;
00171
00172
00173
00174 }
00175 else assert(1==0);
00176 }
00177
00178 wordNum_++;
00179
00180 }
00181
00182 seqStarts_.push_back(wordNum_);
00183
00184 }
00185
00186 void HashTablePacked::matchSequenceStandard
00187 ( WordSequence& seq, HitList& hitListFwd )
00188 {
00189
00190 PackedHitStore packedHits;
00191 int numBasesInLast( seq.getNumBasesInLast() ), baseOffset;
00192
00193 for ( int i(0) ; i < wordLength_ ; ++i )
00194 {
00195
00196 if (seq.size()!=0)
00197 {
00198 baseOffset=i;
00199 WordSequence::const_iterator last(&seq.back());
00200 for ( WordSequence::const_iterator thisWord(seq.begin());
00201 thisWord != last ; ++thisWord )
00202 {
00203 int oldSize(packedHits.size());
00204 matchWordDeluxe( *thisWord, packedHits, baseOffset );
00205
00206
00207
00208
00209 baseOffset += wordLength_;
00210 }
00211 }
00212 shiftSequence( seq, bitsPerSymbol_, wordLength_ );
00213 if ( i == numBasesInLast )
00214 {
00215 seq.pop_back();
00216 }
00217 }
00218
00219 convertHits(packedHits,hitListFwd);
00220
00221 }
00222
00223
00224 void HashTablePacked::convertHits
00225 ( PackedHitStore& packedHits, HitList& hitListFwd )
00226 {
00227
00228
00229 sorter_(packedHits);
00230
00231
00232 vector<HitPacked>::iterator i(packedHits.begin());
00233 vector<SeqStartPos>::iterator ub, j(seqStarts_.begin());
00234 HitListVector::size_type sortStart;
00235
00236
00237
00238
00239 while
00240 ( (j!=static_cast<vector<SeqStartPos>::iterator>(&seqStarts_.back()))
00241 && (i!=packedHits.end()) )
00242 {
00243 ub = upper_bound(j,seqStarts_.end(),i->first);
00244 j=ub; j--;
00245 sortStart=hitListFwd.size();
00246 while ((i!=packedHits.end())&&(i->first<*ub))
00247 {
00248
00249
00250
00251
00252 hitListFwd.addHit( ub-seqStarts_.begin(),
00253 stepLength_*(i->first - *j),
00254 i->second );
00255 i++;
00256
00257 }
00258 sort
00259 ( static_cast<HitList::iterator>(&hitListFwd[sortStart]),
00260 hitListFwd.end(),
00261 LessThanDiff() );
00262
00263 j=ub;
00264 }
00265
00266 }
00267
00268 void HashTablePacked::matchSequenceRepeated
00269 ( WordSequence& seq, HitList& hitListFwd )
00270 {
00271 WordSequenceShifted seqShifted(seq, *this);
00272
00273
00274 PackedHitStore nonRepeatedHits;
00275
00276 Word thisWord;
00277
00278 int m;
00279
00280
00281
00282
00283 for ( int i(0) ; i < seqShifted.size() ; ++i )
00284 {
00285
00286
00287
00288 thisWord = seqShifted[i];
00289 m = 0;
00290
00291
00292 for ( int j(i+1) ;
00293 ( ( j < seqShifted.size() ) && ( j <= i + numRepeats_ ) );
00294 ++j )
00295 {
00296 if ( thisWord == seqShifted[j] )
00297 {
00298 m = j - i;
00299
00300 break;
00301 }
00302 }
00303 if ( m == 0 )
00304 {
00305
00306 matchWordDeluxe( thisWord, nonRepeatedHits, i );
00307 }
00308 else
00309 {
00310
00311 int r(1);
00312
00313
00314
00315 while ( ( seqShifted[i+(r*m)]==thisWord )
00316 && ( i+(r*m) < seqShifted.size() ) ) ++r;
00317
00318
00319
00320
00321
00322
00323
00324 int lastRun((r-1)*m);
00325
00326
00327
00328 while ( seqShifted[i+lastRun] == seqShifted[i+lastRun-m] ) lastRun++;
00329 lastRun--;
00330
00331
00332
00333
00334 PackedHitStore packedHits;
00335 HitListRepeated hits;
00336
00337
00338
00339
00340
00341 for ( int j(0) ; j < m ; ++j )
00342 {
00343 matchWord( seqShifted[i+j], packedHits, j);
00344 }
00345
00346
00347 sort(packedHits.begin(),packedHits.end());
00348
00349 vector<HitPacked>::iterator pThisHit(packedHits.begin());
00350 vector<SeqStartPos>::iterator ub, j(seqStarts_.begin());
00351
00352
00353
00354
00355
00356 HitPacked lastHit;
00357
00358 HitPacked firstHit;
00359
00360
00361 int thisRun(0);
00362
00363 while
00364 ( ( j!=static_cast<vector<SeqStartPos>::iterator>
00365 ( &seqStarts_.back() ) )
00366 && ( pThisHit!=packedHits.end() ) )
00367 {
00368 ub = upper_bound(j,seqStarts_.end(),pThisHit->first);
00369 j=ub; j--;
00370 lastHit=*pThisHit;
00371
00372 while ((pThisHit!=packedHits.end())&&(pThisHit->first<*ub))
00373 {
00374 if ( ( pThisHit->first
00375 == lastHit.first+1 )
00376 && ( pThisHit->second
00377 == (( lastHit.second + stepLength_ ) % m) ) )
00378 {
00379 if ( thisRun == 0 )
00380 {
00381
00382
00383 firstHit = lastHit;
00384 thisRun = wordLength_;
00385 }
00386 else
00387 {
00388
00389
00390 thisRun += stepLength_;
00391 }
00392
00393
00394 }
00395 else
00396 {
00397 thisRun=0;
00398 firstHit = *pThisHit;
00399 }
00400
00401 if (thisRun <= lastRun )
00402 {
00403
00404
00405
00406
00407
00408
00409
00410 nonRepeatedHits.push_back
00411 ( HitPacked( pThisHit->first,
00412 i + firstHit.second + thisRun ) );
00413 }
00414 else
00415 {
00416 lastHit = *pThisHit;
00417 }
00418 pThisHit++;
00419 }
00420 j=ub;
00421 }
00422 i += lastRun;
00423 }
00424 }
00425 convertHits( nonRepeatedHits, hitListFwd );
00426
00427 }
00428
00429
00430
00431
00432 void HashTablePacked::loadHitList( unsigned long size )
00433 {
00434
00435 string startFileName(name_+(string)".start");
00436
00437 ifstream startFile(startFileName.c_str());
00438 if (startFile.fail())
00439 {
00440 monitoringStream_ << "Could not open " << startFileName << endl;
00441 throw SSAHAException("Could not open .start file");
00442 }
00443
00444 startFile.seekg(0,ios::end);
00445 long startFileSize = startFile.tellg();
00446
00447 if ( getNumSequences() + 1 != startFileSize/sizeof(SeqStartPos))
00448 {
00449 seqStarts_.resize(startFileSize/sizeof(SeqStartPos));
00450 monitoringStream_
00451 << "Info: expecting " << seqStarts_.size()-1 << " sequences in file"
00452 << endl;
00453 }
00454 else seqStarts_.resize(getNumSequences()+1);
00455
00456 loadFromFile( startFileName,
00457 (char*) &seqStarts_[0],
00458 startFileSize );
00459
00460 HashTableView<PositionPacked,HashTablePacked>::loadHitList(size);
00461
00462 }
00463
00464
00465
00466 void HashTablePacked::saveHitList( void )
00467 {
00468
00469
00470
00471
00472 saveToFile( name_+(string)".start",
00473 (char*) &seqStarts_[0],
00474 (seqStarts_.size())*sizeof(SeqStartPos) );
00475
00476 HashTableView<PositionPacked,HashTablePacked>::saveHitList();
00477 }
00478
00479
00480
00481 RadixSorter::RadixSorter
00482 ( const unsigned int digits, const unsigned int bits ) :
00483 digits_(digits), bits_(bits),
00484 base_( (unsigned long)pow( (double)2, (int)bits ) ),
00485 mask_( base_ - 1 ),
00486 counts_(vector< vector<CountInt> >(digits, vector<CountInt>(base_) ) ),
00487 places_(base_)
00488 {}
00489
00490
00491
00492 void RadixSorter::operator()( vector<HitPacked>& v )
00493 {
00494
00495 CountDigits(v);
00496
00497 if (digits_&1)
00498 {
00499 v1_=v;
00500 source_ = &v1_; target_ = &v;
00501 }
00502 else
00503 {
00504 v1_.assign(v.size(),zeroHit);
00505 source_ = &v; target_ = &v1_;
00506 }
00507
00508 vector<HitPacked>* temp;
00509
00510 for ( unsigned int i(0) ; i < digits_ ; ++i )
00511 {
00512 SortByDigit( i );
00513 temp = source_;
00514 source_=target_;
00515 target_=temp;
00516 }
00517 }
00518
00519
00520 void RadixSorter::CountDigits( const vector<HitPacked>& v )
00521 {
00522 vector< vector<CountInt> >::iterator pv (counts_.begin());
00523
00524 for (; pv != counts_.end() ; ++pv ) pv->assign(base_,0);
00525
00526 PositionPacked temp;
00527
00528 for ( vector<HitPacked>::const_iterator
00529 j(v.begin());
00530 j != v.end() ; ++j )
00531 {
00532 temp = j->first;
00533 for ( pv = counts_.begin() ; pv != counts_.end() ; ++pv )
00534 {
00535 (*pv)[ temp & mask_ ]++;
00536 temp >>= bits_;
00537 }
00538
00539 }
00540
00541 }
00542
00543 void RadixSorter::SortByDigit( PositionPacked digit )
00544 {
00545 unsigned int shift( digit * bits_ );
00546 CountInt pos(0);
00547 vector<CountInt>::iterator count( counts_[digit].begin() );
00548
00549 for ( vector< vector<HitPacked>::iterator >::iterator
00550 place( places_.begin() );
00551 place != places_.end();
00552 ++place, ++count )
00553 {
00554
00555 *place = static_cast<vector<HitPacked>::iterator>(&(*target_)[pos]);
00556
00557 pos += *count;
00558 }
00559
00560 for ( vector<HitPacked>::iterator i( (source_)->begin() ) ;
00561 i!= (source_)->end() ; ++i )
00562 {
00563 *places_[ ( i->first >> shift) & mask_ ]++ = *i;
00564 }
00565 }
00566
00567
00568
00569
00570
00571
00572