00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "SequenceEncoder.h"
00034 #include "SequenceReader.h"
00035 #include "SequenceReaderFasta.h"
00036 #include <strstream>
00037 #include <fstream>
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047 SequenceReaderFasta::SequenceReaderFasta
00048 ( const char* fileName, SequenceEncoder* pEncoder,
00049 ostream& monitoringStream )
00050 : SequenceReaderFile ( fileName, '>', '>', pEncoder->clone(),
00051 monitoringStream )
00052 {
00053 monitoringStream_ << "constructing SequenceReaderFasta" << endl;
00054 }
00055
00056 SequenceReaderFasta::SequenceReaderFasta
00057 ( const char* fileName,
00058 ostream& monitoringStream )
00059 : SequenceReaderFile ( fileName, '>', '>', new SequenceEncoderDNA(12),
00060 monitoringStream )
00061 {
00062 monitoringStream_ << "constructing SequenceReaderFasta" << endl;
00063 }
00064
00065 SequenceReaderFastaProtein::SequenceReaderFastaProtein
00066 ( const char* fileName, ostream& monitoringStream )
00067 : SequenceReaderFasta ( fileName, new SequenceEncoderProtein(5),
00068 monitoringStream )
00069 {
00070 monitoringStream_ << "constructing SequenceReaderFastaProtein" << endl;
00071 }
00072
00073 SequenceReaderFile::SequenceReaderFile
00074 ( const char* fileName,
00075 char seqStartChar,
00076 char seqStopChar,
00077 SequenceEncoder* pEncoder,
00078 ostream& monitoringStream )
00079 : SequenceReader( monitoringStream ),
00080 seqStartChar_( seqStartChar ),
00081 seqStopChar_( seqStopChar ),
00082
00083 pInputFileStream_( new ifstream( fileName ) ),
00084 fileName_( fileName ),
00085 pEncoder_( pEncoder )
00086 {
00087
00088 monitoringStream_ << "constructing SequenceReaderFile" << this << endl;
00089 if ( pInputFileStream_->fail() )
00090 {
00091 throw SSAHAException
00092 ( (string)"SequenceReaderFile - unable to open input file "
00093 + (string)fileName );
00094
00095 }
00096 monitoringStream_ << "SequenceReaderFile::connected to file "
00097 << fileName << endl;
00098
00099 }
00100
00101 SequenceReaderFile::SequenceReaderFile
00102 ( istream& inputStream,
00103 char seqStartChar,
00104 char seqStopChar,
00105 SequenceEncoder* pEncoder,
00106 ostream& monitoringStream )
00107 : SequenceReader( monitoringStream ),
00108 seqStartChar_( seqStartChar ),
00109 seqStopChar_( seqStopChar ),
00110
00111 pInputFileStream_( &inputStream ),
00112
00113 pEncoder_( pEncoder )
00114 {
00115
00116 monitoringStream_ << "constructing SequenceReaderFile" << this << endl;
00117 if ( pInputFileStream_->fail() )
00118 {
00119 throw SSAHAException
00120 ( "SequenceReaderFile - unable to open input stream " );
00121
00122 }
00123 monitoringStream_ << "SequenceReaderFile::connected to input file stream."
00124 << endl;
00125
00126 }
00127
00128
00129
00130
00131
00132 SequenceReaderFile::SequenceReaderFile( const SequenceReaderFile& rhs)
00133 : SequenceReader( rhs.monitoringStream_ ),
00134 seqStartChar_( rhs.seqStartChar_ ),
00135 seqStopChar_( rhs.seqStopChar_ ),
00136 pInputFileStream_( new ifstream( rhs.fileName_.c_str() ) ),
00137 fileName_( rhs.fileName_.c_str() ),
00138 seqPositions_( rhs.seqPositions_ ),
00139
00140 pEncoder_( rhs.pEncoder_->clone() )
00141 {
00142 monitoringStream_ << "copy constructing SequenceReaderFile" << this
00143 << endl;
00144 if (pInputFileStream_->fail())
00145 {
00146 throw SSAHAException
00147 ( (string)"SequenceReaderFile - unable to open input file "
00148 + (string)fileName_ );
00149
00150 }
00151
00152 monitoringStream_ << "SequenceReaderFile:SequenceReader: "
00153 << "connected to file " << fileName_ << endl;
00154
00155 }
00156
00157 SequenceReaderFile::~SequenceReaderFile()
00158 {
00159 monitoringStream_ << "destructing SequenceReaderFile" << this << endl;
00160 delete pEncoder_;
00161 ifstream* pf(dynamic_cast<ifstream*>(pInputFileStream_));
00162 if (pf!=NULL)
00163 {
00164 pf->close();
00165 delete pInputFileStream_;
00166 }
00167 }
00168
00169
00170
00171
00172 void SequenceReaderFile::changeMode( SequenceReaderMode* pMode )
00173 {
00174 pEncoder_->changeMode( pMode );
00175 }
00176
00177
00178
00179
00180
00181
00182
00183 void SequenceReaderFile::rewind( void )
00184 {
00185
00186 pInputFileStream_->clear();
00187 pInputFileStream_->seekg( 0 );
00188 lastSequenceNumber_ = 0;
00189 }
00190
00191
00192
00193
00194
00195
00196
00197 bool SequenceReaderFile::findSequence( SequenceNumber seqNum )
00198 {
00199
00200
00201 pInputFileStream_->clear();
00202 if ( seqNum ==0 )
00203 {
00204 monitoringStream_
00205 << "SRF::findSeq: Requested sequence number (0) is not valid (must start at 1)."
00206 << endl;
00207 return false;
00208 }
00209 else if ( seqNum <= seqPositions_.size() )
00210 {
00211 pInputFileStream_->seekg(seqPositions_[ seqNum - 1 ],ios::beg);
00212 }
00213 else if ( allSequencesRead_ == false )
00214 {
00215
00216 if ( seqPositions_.size() != 0 )
00217 {
00218 pInputFileStream_->seekg
00219 (seqPositions_.back(), ios::beg);
00220 pInputFileStream_->getline
00221 ( inputBuffer_ , inputBufferSize_, '\n' );
00222 }
00223
00224 while(1)
00225 {
00226
00227
00228
00229 if (pInputFileStream_->peek() == EOF)
00230 {
00231 numSequencesInFile_ = seqPositions_.size();
00232
00233 allSequencesRead_ = true;
00234 monitoringStream_
00235 << "SRF::findSeq(EOF): Requested sequence number (" << seqNum
00236 << ")\nis outside range of sequences in file (1 to "
00237 << numSequencesInFile_ << ")." << endl;
00238
00239
00240 return false;
00241 }
00242 else if (pInputFileStream_->peek() == seqStartChar_)
00243 {
00244 seqPositions_.push_back(pInputFileStream_->tellg());
00245 if( seqPositions_.size() == seqNum ) break;
00246 }
00247 pInputFileStream_->getline
00248 ( inputBuffer_ , inputBufferSize_, '\n' );
00249 }
00250
00251
00252 lastSequenceNumber_ = seqNum - 1;
00253
00254 }
00255 else
00256 {
00257 monitoringStream_
00258 << "SRF::findSeq(asr): Requested sequence number (" << seqNum
00259 << ")\nis outside range of sequences in file (1 to "
00260 << numSequencesInFile_ << ")." << endl;
00261
00262
00263
00264
00265
00266
00267
00268
00269 return false;
00270 }
00271
00272 lastSequenceNumber_ = seqNum - 1;
00273
00274 return true;
00275 }
00276
00277
00278
00279
00280
00281
00282
00283 int SequenceReaderFile::getNextSequence
00284 ( WordSequence& nextSeq, int wordLength )
00285 {
00286 DEBUG_L2( "SequenceReaderFile::getNextSequence" );
00287
00288
00289
00290
00291 int firstOfLine;
00292 pInputFileStream_->clear();
00293 firstOfLine = pInputFileStream_->peek();
00294 if ( firstOfLine == EOF )
00295 {
00296 monitoringStream_ << "End of file has been reached." << endl;
00297 if ( allSequencesRead_ == false )
00298 {
00299 monitoringStream_ << "Setting numSequencesInFile to "
00300 << lastSequenceNumber_ << "." << endl;
00301 numSequencesInFile_ = lastSequenceNumber_;
00302
00303
00304 assert(!pInputFileStream_->fail());
00305
00306 allSequencesRead_ = true;
00307 }
00308
00309 return -1;
00310 }
00311
00312 std::streampos startPos( pInputFileStream_->tellg() );
00313
00314 if ( (char)firstOfLine == seqStartChar_ )
00315 {
00316
00317 getline( *pInputFileStream_, sideInfoBuffer_, '\n' );
00318 DEBUG_L2( "Read " << pInputFileStream_->gcount()
00319 << " bytes of side info to buffer" );
00320 }
00321 else
00322 {
00323 monitoringStream_
00324 << "Error: file not in expected format (expected \""
00325 << seqStartChar_ << "\", received \"" << (char) firstOfLine
00326 << "\" instead).\n";
00327 throw SSAHAException("Data in wrong format!");
00328
00329 }
00330
00331
00332
00333
00334
00335 pEncoder_->setWordLength(wordLength);
00336 pEncoder_->linkSeq(nextSeq);
00337
00338 while (1==1)
00339 {
00340
00341 int numChars;
00342
00343 firstOfLine = pInputFileStream_->peek();
00344
00345 if (((char)firstOfLine == seqStopChar_) || ( firstOfLine == EOF )) break;
00346 pInputFileStream_->getline( inputBuffer_, inputBufferSize_, '\n' );
00347
00348
00349 DEBUG_L2( "Read " << pInputFileStream_->gcount()
00350 << " bytes of sequence data to buffer" );
00351
00352
00353 numChars = pInputFileStream_->gcount() - 1;
00354
00355
00356
00357 pEncoder_->encode( inputBuffer_, numChars );
00358
00359 };
00360
00361 pEncoder_->unlinkSeq();
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382 DEBUG_L2("Finished reading sequence " << lastSequenceNumber_ );
00383
00384 lastSequenceNumber_++;
00385
00386 if ( lastSequenceNumber_ > seqPositions_.size() )
00387 {
00388 seqPositions_.push_back( startPos );
00389 }
00390
00391
00392 return nextSeq.getNumBasesInLast();
00393
00394 }
00395
00396
00397
00398
00399
00400
00401 int SequenceReaderFile::getSequence
00402 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength )
00403 {
00404 DEBUG_L2( "SequenceReaderFile::getSequence" );
00405
00406 if (!findSequence( sequenceNumber )) return -1;
00407 else return getNextSequence( nextSeq, wordLength );
00408
00409 }
00410
00411
00412
00413
00414
00415 void SequenceReaderFile::getLastSequenceName( string& seqName ) const
00416 {
00417 DEBUG_L2( "SequenceReaderFile::getLastSequenceName" );
00418 string::size_type nameEnd = sideInfoBuffer_.find_first_of(' ');
00419 if ( nameEnd == string::npos ) nameEnd = sideInfoBuffer_.size();
00420 seqName = sideInfoBuffer_.substr( 1, nameEnd-1 );
00421 DEBUG_L2( "Last sequence name: " << seqName );
00422 }
00423
00424
00425
00426
00427
00428 bool SequenceReaderFile::printName
00429 ( ostream& os, SequenceNumber seqNum )
00430 {
00431 DEBUG_L3( "SequenceReaderFile::printName" );
00432
00433 if (!findSequence( seqNum )) return false;
00434
00435
00436 string firstLine;
00437
00438 getline(*pInputFileStream_,firstLine,'\n');
00439
00440 string::size_type nameEnd = firstLine.find_first_of(' ');
00441 if ( nameEnd == string::npos ) nameEnd = firstLine.size();
00442
00443 os << firstLine.substr( 1, nameEnd );
00444
00445 findSequence( seqNum );
00446
00447 return true;
00448 }
00449
00450
00451
00452
00453
00454
00455
00456 bool SequenceReaderFile::printSideInfo
00457 ( ostream& os, SequenceNumber seqNum )
00458 {
00459 DEBUG_L2( "SequenceReaderFile::printSideInfo" );
00460
00461 if (!findSequence( seqNum )) return false;
00462
00463 string firstLine;
00464
00465 getline(*pInputFileStream_,firstLine,'\n');
00466
00467 string::size_type nameEnd = firstLine.find_first_of(' ');
00468
00469 if ( nameEnd == string::npos ) os << "No side info for this sequence.";
00470 else os << firstLine.substr(firstLine.find_first_of(' ') );
00471
00472 findSequence( seqNum );
00473
00474 return true;
00475 }
00476
00477
00478
00479
00480
00481
00482 bool SequenceReaderFile::printSource
00483 ( ostream& os, SequenceNumber seqNum )
00484 {
00485 DEBUG_L2( "SequenceReaderFile::getSequenceSource" );
00486
00487 if (!findSequence( seqNum )) return false;
00488
00489 int firstOfLine((int)'x');
00490 while( ( (char)firstOfLine != seqStartChar_ ) && ( firstOfLine != EOF ) )
00491 {
00492 pInputFileStream_->getline( inputBuffer_, inputBufferSize_, '\n' );
00493 os << inputBuffer_ << endl;
00494 firstOfLine = pInputFileStream_->peek();
00495 }
00496
00497 findSequence( seqNum );
00498
00499 return true;
00500 }
00501
00502
00503
00504
00505
00506
00507
00508 SequenceNumber SequenceReaderFile::computeNumSequencesInFile( void )
00509 {
00510 DEBUG_L2( "SequenceReaderFile::computeNumSequencesInFile" );
00511
00512 SequenceNumber numSeqs = 0;
00513
00514
00515 pInputFileStream_->clear();
00516 pInputFileStream_->seekg( 0 );
00517 seqPositions_.clear();
00518
00519 while ( pInputFileStream_->peek() != EOF )
00520 {
00521 if (pInputFileStream_->peek() == seqStartChar_)
00522 {
00523 numSeqs++;
00524 seqPositions_.push_back(pInputFileStream_->tellg());
00525 }
00526 pInputFileStream_->getline
00527 ( inputBuffer_ , inputBufferSize_, '\n' );
00528
00529 }
00530
00531 lastSequenceNumber_ = numSeqs;
00532
00533 DEBUG_L2( "Found " << numSeqs << " sequences in the file." );
00534
00535 return numSeqs;
00536
00537 }
00538
00539
00540
00541
00542
00543 int SequenceReaderFile::getBitsPerSymbol ( void ) const
00544 {
00545 return pEncoder_->getBitsPerSymbol();
00546 }
00547
00548
00549
00550
00551
00552 SourceDataType SequenceReaderFile::getSourceDataType( void ) const
00553 {
00554 return pEncoder_->getSourceDataType();
00555 }
00556
00557
00558
00559
00560
00561 void SequenceReaderFile::extractSource
00562 ( char** pSource,
00563 SequenceNumber seqNum,
00564 SequenceOffset seqStart,
00565 SequenceOffset seqEnd )
00566 {
00567
00568 if (seqNum!= lastSourceSeqNum_)
00569 {
00570
00571 SequenceReaderState* pState(saveState());
00572
00573 if (!findSequence( seqNum ))
00574 {
00575
00576 throw SSAHAException
00577 ("Could not find start of seq in SequenceReaderFile::extractSource");
00578 }
00579
00580
00581
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591 extractToCache( pInputFileStream_ );
00592 lastSourceSeqNum_=seqNum;
00593
00594
00595 restoreState(pState);
00596
00597 }
00598
00599
00600
00601 if ( seqStart>seqEnd)
00602 {
00603 throw SSAHAException
00604 ("Requested seq start exceeds requested seq end in SequenceReaderFile::extractSource");
00605 }
00606 else if (seqEnd>lastSourceSeq_.size() )
00607 {
00608 throw SSAHAException
00609 ("Requested last byte exceeds end of seq in SequenceReaderFile::extractSource");
00610 }
00611
00612 *pSource = &lastSourceSeq_[seqStart-1];
00613
00614
00615
00616
00617
00618
00619 }
00620
00621
00622
00623 void SequenceReaderFile::saveIndexImp
00624 ( ostream& fileFile,
00625 ostream& indexFile,
00626 int& fileNumber )
00627 {
00628 computeNumSequencesInFile();
00629 fileFile << fileName_ << endl;
00630 SeqIndexInfo* pIndex = new SeqIndexInfo[seqPositions_.size()];
00631 for (int i(0) ; i < seqPositions_.size() ; i++)
00632 {
00633 pIndex[i].fileNum=fileNumber;
00634 pIndex[i].seqPos=seqPositions_[i];
00635 }
00636
00637 indexFile.write( (const char*)pIndex,
00638 seqPositions_.size()*sizeof(SeqIndexInfo) );
00639 delete [] pIndex;
00640 }
00641
00642
00643
00644
00645
00646
00647
00648