00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "HashTableTranslated.h"
00034 #include "SequenceReader.h"
00035 #include "SequenceEncoder.h"
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049 HashTableComponent::HashTableComponent( ostream& monitoringStream, string name,
00050 Allocator<PositionPacked>& hitListAllocator,
00051 Allocator<PositionInHitList>& arrayAllocator
00052 ):
00053 queryFrame_(0),
00054 HashTablePacked( monitoringStream, name,
00055 hitListAllocator, arrayAllocator )
00056 {
00057 hitListFormat_ = g32BitPackedProtein;
00058 bitsPerSymbol_ = gResidueBits;
00059 pGenerateSubstitutes_ = &generateSubstitutesProtein;
00060 monitoringStream_ << "constructing HashTableComponent" << endl;
00061 pNameReader_ = new NameReader;
00062 }
00063
00064
00065
00066 void HashTableComponent::convertHits
00067 ( PackedHitStore& packedHits, HitList& hitListFwd )
00068 {
00069
00070
00071 sorter_(packedHits);
00072
00073 vector<HitPacked>::iterator i(packedHits.begin());
00074 vector<SeqStartPos>::iterator j(seqStarts_.begin());
00075 HitListVector::size_type sortStart;
00076
00077 SequenceNumber adjustedSeqNum;
00078 int subjectFrame;
00079
00080
00081
00082
00083
00084 while
00085 ( (j!=static_cast<vector<SeqStartPos>::iterator>(&seqStarts_.back()))
00086 && (i!=packedHits.end()) )
00087 {
00088 vector<SeqStartPos>::iterator ub(upper_bound(j,seqStarts_.end(),i->first));
00089 j=ub; j--;
00090
00091 sortStart=hitListFwd.size();
00092 while ((i!=packedHits.end())&&(i->first<*ub))
00093 {
00094
00095
00096
00097
00098
00099 adjustedSeqNum
00100 = ub-seqStarts_.begin() - 1;
00101 subjectFrame = adjustedSeqNum % gNumReadingFrames;
00102 adjustedSeqNum /= gNumReadingFrames;
00103 adjustedSeqNum++;
00104
00105
00106
00107
00108
00109
00110
00111 hitListFwd.addHit
00112 ( adjustedSeqNum,
00113 (gNumReadingFrames*stepLength_*(i->first - *j))+subjectFrame,
00114 (gNumReadingFrames*i->second)+queryFrame_);
00115
00116
00117
00118 i++;
00119
00120 }
00121 sort
00122 ( static_cast<HitList::iterator>(&hitListFwd[sortStart]),
00123 hitListFwd.end(),
00124 LessThanDiff() );
00125 j=ub;
00126 }
00127
00128 }
00129
00130
00131
00132
00133 HashTablePackedProtein::HashTablePackedProtein
00134 ( ostream& monitoringStream, string name,
00135 Allocator<PositionPacked>& hitListAllocator,
00136 Allocator<PositionInHitList>& arrayAllocator
00137 ):
00138 queryFrame_(0),
00139 queryMult_(1),
00140 codonEncoder_(5),
00141 pMatchSequence_( &HashTablePackedProtein::matchSequenceProtein ),
00142 HashTablePacked( monitoringStream, name,
00143 hitListAllocator, arrayAllocator )
00144 {
00145 hitListFormat_ = g32BitPackedProtein;
00146 bitsPerSymbol_ = gResidueBits;
00147 pGenerateSubstitutes_ = &generateSubstitutesProtein;
00148 monitoringStream_ << "constructing HashTablePackedProtein" << endl;
00149 }
00150
00151
00152
00153
00154
00155 void HashTablePackedProtein::convertHits
00156 ( PackedHitStore& packedHits, HitList& hitListFwd )
00157 {
00158
00159
00160
00161 sorter_(packedHits);
00162
00163 vector<HitPacked>::iterator i(packedHits.begin());
00164 vector<SeqStartPos>::iterator j(seqStarts_.begin());
00165 HitListVector::size_type sortStart;
00166
00167
00168
00169
00170
00171
00172
00173
00174 while
00175 ( (j!=static_cast<vector<SeqStartPos>::iterator>(&seqStarts_.back()))
00176 && (i!=packedHits.end()) )
00177 {
00178 vector<SeqStartPos>::iterator ub(upper_bound(j,seqStarts_.end(),i->first));
00179 j=ub; j--;
00180
00181 sortStart=hitListFwd.size();
00182 while ((i!=packedHits.end())&&(i->first<*ub))
00183 {
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194 hitListFwd.addHit
00195 ( ub-seqStarts_.begin(),
00196 (queryMult_*stepLength_*(i->first - *j)),
00197 (queryMult_*i->second)+queryFrame_);
00198
00199
00200
00201 i++;
00202
00203 }
00204 sort
00205 ( static_cast<HitList::iterator>(&hitListFwd[sortStart]),
00206 hitListFwd.end(),
00207 LessThanDiff() );
00208 j=ub;
00209 }
00210
00211
00212 }
00213
00214 void HashTablePackedProtein::matchSequenceProtein
00215 ( WordSequence& seq, HitList& hitListFwd )
00216 {
00217 HashTablePacked::matchSequence( seq, hitListFwd );
00218 }
00219
00220
00221 void HashTablePackedProtein::matchSequenceTranslatedDNA
00222 ( WordSequence& seq, HitList& hitListFwd )
00223 {
00224 codonEncoder_.setWordLength( wordLength_ );
00225
00226 WordSequence translatedSeq;
00227 for (queryFrame_ = 0; queryFrame_ < gNumReadingFrames; queryFrame_++)
00228 {
00229 codons_.clear();
00230 codonize ( seq, codons_, queryFrame_ );
00231 translatedSeq.clear();
00232 codonEncoder_.linkSeq( translatedSeq );
00233 codonEncoder_.encode( codons_ );
00234 codonEncoder_.unlinkSeq();
00235 HashTablePacked::matchSequence( translatedSeq, hitListFwd );
00236 }
00237
00238 }
00239
00240
00241
00242
00243
00244
00245
00246
00247 HashTableTranslated::HashTableTranslated
00248 ( ostream& monitoringStream, string name,
00249 Allocator<PositionPacked>& hitListAllocator,
00250 Allocator<PositionInHitList>& arrayAllocator
00251 ) :
00252 hashFwd_( monitoringStream, name+(string)"_fwd",
00253 hitListAllocator, arrayAllocator ),
00254 hashRev_( monitoringStream, name+(string)"_rev",
00255 hitListAllocator, arrayAllocator ),
00256 pHash_(&hashFwd_),
00257 codonEncoder_(5),
00258 pMatchSequence_( &HashTableTranslated::matchSequenceProtein ),
00259 HashTableGeneric( monitoringStream, name, arrayAllocator )
00260 {
00261 bitsPerSymbol_=gResidueBits;
00262 hitListFormat_ = gTranslated;
00263 monitoringStream_ << "constructing HashTableTranslated" << endl;
00264 }
00265
00266 void HashTableTranslated::setupPointerArray( void )
00267 {
00268
00269 hashFwd_.numDifferentWords_= 1 << (wordLength_*gResidueBits);
00270 hashFwd_.bitsPerSymbol_=gResidueBits;
00271 hashFwd_.sourceData_=gDNAData;
00272 hashFwd_.wordLength_=wordLength_;
00273 hashFwd_.stepLength_=stepLength_;
00274 hashFwd_.setupPointerArray();
00275
00276 hashRev_.numDifferentWords_= 1 << (wordLength_*gResidueBits);
00277 hashRev_.bitsPerSymbol_=gResidueBits;
00278 hashRev_.sourceData_=gDNAData;
00279 hashRev_.wordLength_=wordLength_;
00280 hashRev_.stepLength_=stepLength_;
00281 hashRev_.setupPointerArray();
00282 }
00283
00284
00285 void HashTableTranslated::computePointerArray( void )
00286 {
00287 hashFwd_.computePointerArray();
00288 hashRev_.computePointerArray();
00289 }
00290
00291
00292
00293 void HashTableTranslated::setupHitList( void )
00294 {
00295 hashFwd_.setupHitList();
00296 hashRev_.setupHitList();
00297 }
00298
00299 void HashTableTranslated::cleanupTempData( void )
00300 {
00301 hashFwd_.cleanupTempData();
00302 hashRev_.cleanupTempData();
00303 hashFwd_.isInitialized_=true;
00304 hashRev_.isInitialized_=true;
00305 }
00306
00307
00308 void HashTableTranslated::hashWords
00309 ( SequenceAdapter& thisSeq, SequenceNumber seqNum )
00310 {
00311 assert(1==0);
00312 hashFwd_.hashWords( thisSeq, seqNum );
00313 hashRev_.hashWords( thisSeq, seqNum );
00314 }
00315
00316 int HashTableTranslated::getMaxNumHits() const
00317 {
00318 assert( hashFwd_.getMaxNumHits() == hashRev_.getMaxNumHits() );
00319 return hashFwd_.getMaxNumHits();
00320 }
00321 void HashTableTranslated::setMaxNumHits( int mnh )
00322 {
00323 hashFwd_.setMaxNumHits( mnh );
00324 hashRev_.setMaxNumHits( mnh );
00325 assert( hashFwd_.getMaxNumHits() == mnh );
00326 assert( hashFwd_.getMaxNumHits() == mnh );
00327 }
00328
00329
00330
00331 int HashTableTranslated::countWordsAndGetNames
00332 ( SequenceReader& sequenceReader, SequenceAdapter* seq )
00333 {
00334
00335 sequenceReader.rewind();
00336
00337
00338
00339
00340 NameReaderLocal* pReaderLocal = new NameReaderLocal;
00341 pNameReader_ = pReaderLocal;
00342
00343 codonEncoder_.setWordLength(wordLength_);
00344
00345
00346 SequenceReaderModeFlagReplace mode('X');
00347 assert(ttCodon['X']==ttProtein['X']);
00348 assert(ttCodon['X']!=nv);
00349 codonEncoder_.changeMode( &mode );
00350
00351
00352
00353 WordSequence thisSeq, translatedSeq;
00354 seq->link( translatedSeq );
00355 int numSeqs(0);
00356
00357
00358
00359
00360 while( sequenceReader.getNextSequence
00361 ( thisSeq, eDNAWordSizeForHashing ) != -1 )
00362 {
00363 numSeqs++;
00364
00365
00366 sequenceReader.getLastSequenceName( pReaderLocal->lastName() );
00367
00368
00369
00370
00371 for ( int i(0) ; i < gNumReadingFrames ; i++ )
00372 {
00373 codons_.clear();
00374 codonizeAndFlag ( thisSeq, codons_, i );
00375 translatedSeq.clear();
00376 codonEncoder_.linkSeq( translatedSeq );
00377 codonEncoder_.encode( codons_ );
00378 codonEncoder_.unlinkSeq();
00379 hashFwd_.countWords(*seq);
00380 }
00381
00382
00383
00384
00385 for ( int i(0) ; i < gNumReadingFrames ; i++ )
00386 {
00387 codons_.clear();
00388 codonizeAndFlagReverse ( thisSeq, codons_, i );
00389 translatedSeq.clear();
00390 codonEncoder_.linkSeq( translatedSeq );
00391 codonEncoder_.encode( codons_ );
00392 codonEncoder_.unlinkSeq();
00393 hashRev_.countWords(*seq);
00394 }
00395
00396
00397 }
00398
00399
00400
00401
00402
00403
00404
00405
00406 return numSeqs;
00407 }
00408
00409 void HashTableTranslated::hashAllWords
00410 ( SequenceReader& sequenceReader, SequenceAdapter* seq, int numSeqs )
00411 {
00412
00413
00414 int numWords(0);
00415 sequenceReader.rewind();
00416
00417 WordSequence thisSeq, translatedSeq;
00418 seq->link( translatedSeq );
00419
00420
00421 for ( unsigned int i(1); i <= numSeqs ; i++ )
00422 {
00423
00424 if( sequenceReader.getNextSequence( thisSeq, eDNAWordSizeForHashing) == -1 )
00425 {
00426 throw SSAHAException
00427 ("Sequence source data changed during hash table creation!");
00428 }
00429
00430
00431 numWords = (int) thisSeq.size();
00432 pSequenceSizes_[i-1] = ( numWords > 0 )
00433 ? ( (numWords-1) * eDNAWordSizeForHashing ) + thisSeq.getNumBasesInLast()
00434 : 0;
00435
00436
00437
00438 for ( int j(0) ; j < gNumReadingFrames ; j++ )
00439 {
00440 codons_.clear();
00441 codonizeAndFlag ( thisSeq, codons_, j );
00442 translatedSeq.clear();
00443 codonEncoder_.linkSeq( translatedSeq );
00444 codonEncoder_.encode( codons_ );
00445 codonEncoder_.unlinkSeq();
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455 hashFwd_.hashWords(*seq,99999);
00456 }
00457
00458
00459
00460
00461
00462
00463 for ( int j(0) ; j < gNumReadingFrames ; j++ )
00464 {
00465 codons_.clear();
00466 codonizeAndFlagReverse ( thisSeq, codons_, j );
00467 translatedSeq.clear();
00468 codonEncoder_.linkSeq( translatedSeq );
00469 codonEncoder_.encode( codons_ );
00470 codonEncoder_.unlinkSeq();
00471
00472
00473
00474
00475
00476
00477 hashRev_.hashWords(*seq,99999);
00478 }
00479
00480
00481
00482
00483
00484
00485 }
00486
00487 SequenceReaderModeIgnore mode;
00488 codonEncoder_.changeMode( &mode );
00489
00490
00491 }
00492
00493
00494
00495 void HashTableTranslated::countWords( SequenceAdapter& thisSeq )
00496 {
00497 assert(1==0);
00498
00499
00500 }
00501
00502
00503 void HashTableTranslated::setNumRepeats( int nr)
00504 {
00505 hashFwd_.setNumRepeats( nr );
00506 hashRev_.setNumRepeats( nr );
00507 }
00508
00509 void HashTableTranslated::setSubstituteThreshold( int ns)
00510 {
00511 hashFwd_.setSubstituteThreshold( ns );
00512 hashRev_.setSubstituteThreshold( ns );
00513 }
00514
00515 char* HashTableTranslated::getHitListStart( void ) const
00516 {
00517 assert(1==0);
00518 return NULL;
00519 }
00520 int HashTableTranslated::getHitTypeSize( void ) const
00521 {
00522 assert(1==0);
00523 return NULL;
00524 }
00525 void HashTableTranslated::allocateHitList( unsigned long size )
00526 {
00527 assert(1==0);
00528 }
00529 void HashTableTranslated::loadHitList( unsigned long size )
00530 {
00531 assert(1==0);
00532 }
00533 void HashTableTranslated::saveHitList( void )
00534 {
00535
00536 monitoringStream_ << "No hit list to save for HashTableTranslated" << endl;
00537 }
00538
00539 void HashTableTranslated::loadHashTable
00540 ( SourceReaderIndex* pSourceReader )
00541 {
00542
00543 if (pSourceReader==NULL)
00544 {
00545 pNameReader_ = new NameReaderLocal;
00546 }
00547 else
00548 {
00549 pNameReader_ = new NameReaderIndex(*pSourceReader);
00550 }
00551
00552
00553 hashFwd_.loadHashTable();
00554 hashRev_.loadHashTable();
00555 loadSequenceNames();
00556
00557
00558 unsigned long numSeqs = pNameReader_->size();
00559
00560 assert(numSeqs!=0);
00561
00562 monitoringStream_ << "Allocating memory for pSequenceSizes_: "
00563 << numSeqs << " sequences, "
00564 << numSeqs*sizeof(SequenceOffset)
00565 << " bytes total ...\n";
00566
00567 pSequenceSizes_ = new SequenceOffset [ numSeqs ];
00568
00569 if (!pSequenceSizes_)
00570 {
00571 throw SSAHAException("Memory allocation failed!");
00572 }
00573
00574 loadFromFile(name_+(string)".size", (char*)pSequenceSizes_,
00575 numSeqs * sizeof( SequenceOffset ),
00576 monitoringStream_ );
00577
00578 codonEncoder_.setWordLength( wordLength_ );
00579
00580 isInitialized_=true;
00581 }
00582
00583
00584 void HashTableTranslated::saveHashTable( void )
00585 {
00586 hashFwd_.saveHashTable();
00587 hashRev_.saveHashTable();
00588 HashTableGeneric::saveHashTable();
00589
00590 }
00591
00592 unsigned long HashTableTranslated::getTotalNumWords( void ) const
00593 {
00594 return (hashFwd_.getTotalNumWords()+hashRev_.getTotalNumWords());
00595 }
00596
00597 void HashTableTranslated::printHashStats( void )
00598 {
00599 cout << "\n\n ** Hash stats output for forward subtable: ** \n\n";
00600 hashFwd_.printHashStats();
00601 cout << "\n\n ** Hash stats output for reverse subtable: ** \n\n";
00602 hashRev_.printHashStats();
00603 }
00604
00605
00606
00607 void HashTableTranslated::matchSequenceProtein
00608 ( WordSequence& seq, HitList& hitListFwd )
00609 {
00610
00611
00612 pHash_->setQueryFrame(0);
00613 pHash_->matchSequence( seq, hitListFwd );
00614 }
00615
00616 void HashTableTranslated::matchSequenceTranslatedDNA
00617 ( WordSequence& seq, HitList& hitListFwd )
00618 {
00619
00620
00621 codonEncoder_.setWordLength(wordLength_);
00622 WordSequence translatedSeq;
00623 for (int readingFrame(0); readingFrame < gNumReadingFrames; readingFrame++)
00624 {
00625 pHash_->setQueryFrame(readingFrame);
00626 codons_.clear();
00627 codonize ( seq, codons_, readingFrame );
00628 translatedSeq.clear();
00629 codonEncoder_.linkSeq( translatedSeq );
00630 codonEncoder_.encode( codons_ );
00631 codonEncoder_.unlinkSeq();
00632
00633
00634
00635
00636
00637
00638 pHash_->matchSequence( translatedSeq, hitListFwd );
00639 }
00640 }
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653