SequenceReader/SequenceReaderMulti.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : SequenceReaderMulti
00025 // File Name    : SequenceReaderMulti.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // #pragma implementation
00032 // Includes:
00033 
00034 #include "SequenceReaderMulti.h"
00035 
00036 // ### Function Definitions ###
00037 
00038 // Name:
00039 // Arguments:
00040 // TYPE  NAME  IN/OUT COMMENT
00041 // Returns: TYPE COMMENT
00042 SequenceReaderMulti::SequenceReaderMulti( ostream& monitoringStream )
00043   : SequenceReader( monitoringStream ), thisReader_( allReaders_.begin() ),
00044     isFirstSeq_(true)
00045 {
00046   monitoringStream_ << "constructing SequenceReaderMulti" << endl;
00047 }
00048 
00049 SequenceReaderMulti::~SequenceReaderMulti()
00050 {
00051   monitoringStream_ << "destructing SequenceReaderMulti" << endl;
00052   for ( vector<SeqReaderInfo>::iterator i( allReaders_.begin() );
00053         i != allReaders_.end() ; i++ ) delete i->ptr_;
00054 } // destructor
00055 
00056 SequenceReaderMulti::SequenceReaderMulti( const SequenceReaderMulti& rhs) 
00057   : SequenceReader( rhs.monitoringStream_ ),
00058     allReaders_( rhs.allReaders_ ),
00059     thisReader_( allReaders_.begin() ),
00060     isFirstSeq_( rhs.isFirstSeq_ ),
00061     bitsPerSymbol_( rhs.bitsPerSymbol_ ),
00062     sourceDataType_( rhs.sourceDataType_ )
00063 {
00064   monitoringStream_ << "copy constructing SequenceReaderMulti\n";
00065   for ( vector<SeqReaderInfo>::const_iterator i(rhs.allReaders_.begin());
00066         i!=rhs.allReaders_.end(); i++)
00067   {
00068     allReaders_.push_back( SeqReaderInfo() );
00069     allReaders_.back().ptr_=i->ptr_->clone();
00070   } // ~for
00071 
00072 } // ~constructor
00073 
00074 
00075   
00076 //template< class T >
00077 //void SequenceReaderMulti::addSequenceReader( const T& seq )
00078 //{
00079 //allReaders_.push_back
00080 //( std::auto_ptr<SequenceReader>( new T( seq ) ) );
00081 //} // ~addSequenceReader( const SequenceReader& pSeq )
00082 
00083 
00084 void SequenceReaderMulti::rewind( void )
00085 {
00086 
00087     for ( vector<SeqReaderInfo>::iterator i 
00088           = allReaders_.begin(); 
00089           i != allReaders_.end();
00090           i++ )
00091     {
00092       (i->ptr_)->rewind();       
00093     } // ~for i
00094     thisReader_ = allReaders_.begin();
00095     lastSequenceNumber_ = 0;
00096 } // ~SequenceReaderMulti::rewind( void )
00097 
00098   // Function Name: addReader
00099   // Arguments: const T&
00100   // A copy of seq is made (which is why SequenceReader and its subclasses
00101   // need to have a copy constructor) and a pointer to it is placed in 
00102   // allReaders. NB Type safety is ensured because if T is anything other than 
00103   // a subclass of SequenceReader then trying to 'push_back' a pointer to it
00104   // onto allReaders (a vector of SequenceReader pointers) will cause
00105   // the compiler to complain.
00106   //  template< class T >
00107   void SequenceReaderMulti::addReader( SequenceReader& seq )
00108   {
00109     if (isFirstSeq_)
00110     {
00111       isFirstSeq_ = false;
00112       bitsPerSymbol_ = seq.getBitsPerSymbol();
00113       sourceDataType_ = seq.getSourceDataType();
00114     } // ~if
00115     else
00116     {
00117       assert(bitsPerSymbol_ == seq.getBitsPerSymbol());
00118       assert(sourceDataType_ == seq.getSourceDataType());
00119     } // ~else
00120     //    assert(1==0);
00121     //    allReaders_.push_back( SeqReaderInfo( seq.clone() ));
00122     allReaders_.push_back( SeqReaderInfo() );
00123     allReaders_.back().ptr_=seq.clone();
00124     thisReader_
00125       = static_cast<vector<SeqReaderInfo>::iterator>(&allReaders_.back());
00126   } // ~addReader( SequenceReader& seq )
00127 
00128   // Function Name: addReader
00129   // Arguments: SequenceReader*
00130   // pSeq is placed in allReaders_. NB no copy of pSeq is made: use by
00131   // creating a new SequenceReader and calling this. The SeqReaderMulti
00132   // takes ownership of *pSeq and is responsible for its destruction
00133   void SequenceReaderMulti::addReader( SequenceReader* pSeq )
00134   {
00135     if (isFirstSeq_)
00136     {
00137       isFirstSeq_ = false;
00138       bitsPerSymbol_ = pSeq->getBitsPerSymbol();
00139       sourceDataType_ = pSeq->getSourceDataType();
00140     } // ~if
00141     else
00142     {
00143       assert(bitsPerSymbol_ == pSeq->getBitsPerSymbol());
00144       assert(sourceDataType_ == pSeq->getSourceDataType());
00145     } // ~else
00146 
00147     allReaders_.push_back( SeqReaderInfo() );
00148     allReaders_.back().ptr_=pSeq;
00149     thisReader_
00150       = static_cast<vector<SeqReaderInfo>::iterator>(&allReaders_.back());
00151   } // ~addReader( SequenceReader* pSeq )
00152 
00153 // Function Name: findSequence
00154 // Arguments: SequenceNumber (in)
00155 // Returns:   void
00156 // Winds the input file stream to the start of sequence number seqNum. 
00157 // Returns false if seqNum exceeds the number of sequences in
00158 // the file.
00159 bool SequenceReaderMulti::findSequence( SequenceNumber seqNum )
00160 {
00161 
00162   //  SequenceNumber currentSeqNum(seqNum);
00163   currentSeqNum_ = seqNum;
00164   for ( thisReader_ = allReaders_.begin(); 
00165         thisReader_ != allReaders_.end(); thisReader_++ )
00166   {
00167     if (thisReader_->allSeqsRead_)
00168     {
00169       if ( currentSeqNum_ <= thisReader_->size_ )
00170       {
00171         //      assert( thisReader_->ptr_->findSequence( currentSeqNum_ ) == true );
00172         lastSequenceNumber_ = --seqNum; // last read = 1 behind current
00173         return true;    
00174       } // ~if
00175     } // ~if
00176     else
00177     {
00178       if ( thisReader_->ptr_->findSequence( currentSeqNum_ ) == true )
00179       {
00180         lastSequenceNumber_ = --seqNum; // last read = 1 behind current
00181         return true;
00182       }
00183       else
00184         thisReader_->size_ = thisReader_->ptr_->getNumSequencesInFile();
00185       //      thisReader_->ptr_->rewind();
00186     } // ~else
00187     
00188     
00189     currentSeqNum_ -= thisReader_->ptr_->getNumSequencesInFile();
00190 
00191 
00192   } // ~for
00193 
00194   return false;
00195 
00196     //    if (thisReader_->ptr_->findSequence( currentSeqNum )) 
00197     //    {
00198       //      if (!thisReader_->allSeqsRead_)
00199       //   {
00200       //      thisReader_->allSeqsRead_=true;
00201       //      thisReader_->size_ = thisReader_->ptr_->getNumSequencesInFile();
00202       //    } // ~if
00203     //      return true;
00204     //   }
00205     //    if (!thisReader_->allSeqsRead_)
00206     //    {
00207     //     thisReader_->allSeqsRead_=true;
00208     //    thisReader_->size_ = thisReader_->ptr_->getNumSequencesInFile();
00209     //    } // ~if
00210     //  } // ~for
00211   //  return false;
00212 
00213 } // ~SequenceReaderMulti::findSequence( SequenceNumber seqNum )
00214 
00215 
00216 
00217 
00218   // Function Name: findReader
00219   // Arguments: SequenceNumber& (in/out)
00220   // Given an input sequence number, adjusts seqNum and thisReader_
00221   // so that the seqNum'th sequence of *this is obtained by passing
00222   // the adjusted value of seqNum to *thisReader_
00223   bool SequenceReaderMulti::findReader( SequenceNumber& seqNum )
00224   {
00225     DEBUG_L3( "SequenceReaderMulti::findReader" );
00226 
00227     // NB calling getNumSequencesInFile also makes sure numSeqs_ has
00228     // been set up
00229 
00230     if ( seqNum > getNumSequencesInFile() )
00231     {
00232       return false;
00233       //      monitoringStream_ 
00234       //   << "findReader: Requested sequence number (" << seqNum 
00235       //   << ")\n is outside range of sequences in file (1 to " 
00236       //  << getNumSequencesInFile() << ")." << endl;
00237       //   throw SSAHAException("Invalid sequence number.");
00238     } // ~if
00239 
00240     for ( vector<SeqReaderInfo>::iterator i = allReaders_.begin(); 
00241           i != allReaders_.end(); i++ )
00242     {
00243       if ( seqNum <= i->size_ ) { thisReader_ = i; break; } // %%%%%
00244       seqNum -= i->size_;
00245     } // ~for i
00246 
00247     return true;
00248     
00249     // cout << allReaders_.begin().size_ << " " << seqNum << " " << i << endl;
00250   } // ~void SequenceReaderMulti::findReader( int& seqNum )
00251 
00252   // Function Name: changeMode
00253   // Arguments: const SequenceReaderMode&
00254   // Makes a copy of mode and uses it to handle mismatch character reads
00255   void SequenceReaderMulti::changeMode( SequenceReaderMode* pMode )
00256   {
00257     for ( vector<SeqReaderInfo>::iterator i = allReaders_.begin(); 
00258           i != allReaders_.end(); i++ ) 
00259           {
00260             (i->ptr_)->changeMode(pMode);
00261           }
00262   } // ~SequenceReaderMulti::changeMode( SequenceReaderMode* pMode )
00263 
00264 
00265 
00266 
00267   // Function Name: getNextSequence
00268   // Arguments: WordSequence& (out), int (in)
00269   // Returns:   int
00270   // Read the next set of sequence information from the file and parse it
00271   // into WordSequence format. Returns -1 if there has been a problem with
00272   // reading the sequence, else returns the number of valid base pairs 
00273   // contained within the final word of the sequence
00274   int SequenceReaderMulti::getNextSequence
00275   ( WordSequence& nextSeq, int wordLength )
00276   {
00277     DEBUG_L2( "SequenceReaderMulti::getNextSequence" );
00278 
00279     int numInLast;
00280 
00281     while 
00282       (    ( thisReader_ 
00283              != allReaders_.end() )
00284         && ( (   numInLast
00285                = thisReader_->ptr_->getNextSequence( nextSeq, wordLength ) )
00286               == -1 )    )
00287     {
00288       if ( thisReader_->allSeqsRead_ == false )
00289       {
00290         thisReader_->size_ = (thisReader_->ptr_)->getNumSequencesInFile();
00291         thisReader_->allSeqsRead_ = true;
00292       } // ~if
00293       thisReader_++;
00294       //      if (thisReader_!=allReaders_.end()) thisReader_->ptr_->rewind();
00295     } // ~while
00296 
00297     if (thisReader_ == allReaders_.end()) 
00298     {
00299       monitoringStream_ 
00300       << "SequenceReaderMulti::getNextSequence - last seq has been read.\n";
00301       return -1;
00302     } // ~if
00303 
00304     if ( numInLast != -1 ) lastSequenceNumber_++;
00305 
00306     return numInLast; 
00307 
00308   } // ~SequenceReaderMulti::getNextSequence
00309 
00310 
00311   // Function Name: getSequence
00312   // Arguments: WordSequence& (out), SequenceNumber (in), int (in)
00313   // Returns:   bool
00314   // Read the sequenceNumber-th set of sequence information from the file and 
00315   // parse it into WordSequence format
00316   int SequenceReaderMulti::getSequence
00317   ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength )
00318   {
00319     DEBUG_L2( "SequenceReaderMulti::getSequence" );
00320 
00321     //    findReader( sequenceNumber ); 
00322     if (!findReader( sequenceNumber )) return -1;
00323 
00324     lastSequenceNumber_ = sequenceNumber;
00325             (thisReader_->ptr_)->rewind();
00326 
00327     return (thisReader_->ptr_)->getSequence
00328                                   ( nextSeq, sequenceNumber, wordLength );
00329 
00330   } // ~SequenceReaderMulti::getSequence
00331 
00332   // Function Name: getLastSequenceName
00333   // Arguments: string& (out)
00334   // Returns:   void
00335   // Fills the string with the name of the last sequence read 
00336   void SequenceReaderMulti::getLastSequenceName( string& seqName ) const 
00337   {
00338     (thisReader_->ptr_)->getLastSequenceName( seqName );
00339   } // ~SequenceReaderMulti::getLastSequenceName( string& seqName ) const 
00340 
00341   // Function Name: computeNumSequencesInFile
00342   // Arguments: void
00343   // Returns:   SequenceNumber
00344   // Returns the number of sequences in the file (will be done by lazy 
00345   // initialization, i.e. will only be calculated if asked for. NB this
00346   // will lose the current place in the file)
00347   SequenceNumber SequenceReaderMulti::computeNumSequencesInFile( void )
00348   {
00349     SequenceNumber numSeqs = 0;
00350     for ( vector<SeqReaderInfo>::iterator i 
00351           = allReaders_.begin(); 
00352           i != allReaders_.end();
00353           i++ )
00354     {
00355       if ( i->allSeqsRead_ == false )
00356       {
00357         i->size_ =  (i->ptr_)->getNumSequencesInFile();
00358         i->allSeqsRead_ = true;
00359       } // ~if
00360       numSeqs += i->size_;       
00361     } // ~for i
00362     thisReader_ = allReaders_.end();
00363     lastSequenceNumber_ = numSeqs;
00364     return numSeqs; 
00365   } // ~SequenceReaderMulti::computeNumSequencesInFile( void )
00366 
00367   // Function Name: getBitsPerSymbol
00368   // Arguments: none
00369   // Returns:   int
00370   // Returns number of bits per symbol used in encoding
00371 int SequenceReaderMulti::getBitsPerSymbol ( void ) const
00372 {
00373   if (isFirstSeq_) assert(1==0);
00374   return bitsPerSymbol_;
00375   // fails if thisReader is at end 
00376   //  return ((thisReader_->ptr_)->getBitsPerSymbol());
00377 } // ~SequenceReaderMulti::getBitsPerSymbol ( void ) const
00378 
00379 // Function Name: getSourceDataType
00380 // Arguments: none
00381 // Returns:   SourceDataType
00382 // Returns type of data being encoded (protein or DNA)
00383 SourceDataType SequenceReaderMulti::getSourceDataType( void ) const
00384 {
00385   if (isFirstSeq_) assert(1==0);
00386   return sourceDataType_;
00387   // fails if thisReader is at end 
00388   //  return ((thisReader_->ptr_)->getSourceDataType());
00389 } // ~SequenceReaderMulti::getSourceDataType ( void ) const
00390 
00391   // Function Name: printName
00392   // Arguments: string& (out), SequenceNumber (in)
00393   // Returns:   void
00394   // Fills a string with the name of the requested sequence
00395 bool SequenceReaderMulti::printName( ostream& os, SequenceNumber seqNum ) 
00396 {
00397 
00398   lastSequenceNumber_ = seqNum;
00399   //  findReader( seqNum ); 
00400   if (!findReader(seqNum)) return false;
00401   (thisReader_->ptr_)->printName( os, seqNum );
00402   return true;
00403 } // ~SequenceReaderMulti::printName
00404 
00405   // Function Name: printSideInfo
00406   // Arguments: string& (out), SequenceNumber (in)
00407   // Returns:   void
00408   // Fills a string with the name of the requested sequence
00409 bool SequenceReaderMulti::printSideInfo( ostream& os, SequenceNumber seqNum )
00410 {
00411 
00412   lastSequenceNumber_ = seqNum;
00413   //  findReader( seqNum ); 
00414   if (!findReader(seqNum)) return false;
00415   (thisReader_->ptr_)->printSideInfo( os, seqNum );
00416   return true;
00417 } // ~SequenceReaderMulti::printName
00418 
00419 
00420   // Function Name: printSource
00421   // Arguments: string& (out), SequenceNumber (in)
00422   // Returns:   void
00423   // Fills a string with the name of the requested sequence
00424 bool SequenceReaderMulti::printSource( ostream& os, SequenceNumber seqNum )
00425 {
00426   lastSequenceNumber_ = seqNum;
00427   //findReader( seqNum ); 
00428   if (!findReader(seqNum)) return false;
00429   (thisReader_->ptr_)->printSource( os, seqNum );
00430   return true;
00431 } // ~SequenceReaderMulti::printName
00432 
00433 // Function Name: extractSource
00434 // This extracts the source data for bases seqStart to seqEnd inclusive
00435 // of sequence seqNum and places it in source
00436 void SequenceReaderMulti::extractSource
00437 ( char** pSource,//vector<char>& source, 
00438   SequenceNumber seqNum,
00439   SequenceOffset seqStart,
00440   SequenceOffset seqEnd )
00441 {
00442   //  cout << "SRM::extractSource " << seqNum << endl;
00443   //  SequenceNumber savedSeqNum(lastSequenceNumber_);
00444   //  vector<SeqReaderInfo>::iterator savedReader(thisReader_);
00445   //  SequenceNumber savedSeqNumPtr(thisReader_->ptr_->getLastSequenceNumber());
00446   //  monitoringStream_ << "srm::Extract1 " << seqNum << " " << thisReader_ << endl;
00447 
00448   SequenceReaderState* pState(saveState());
00449 
00450   if (!findSequence(seqNum))
00451   { 
00452     monitoringStream_ << "extSource: Requested sequence number (" << seqNum
00453                       << ") exceeds number of sequences in collection ("
00454                       << getNumSequencesInFile() << ")." << endl;
00455     throw SSAHAException
00456       ("Invalid sequence number in SequenceReaderMulti::extractSource");
00457   } // ~if   
00458   //  monitoringStream_ << "srm::Extract2 " << seqNum << " " << thisReader_ << endl;
00459   (thisReader_->ptr_)->extractSource
00460     ( pSource, currentSeqNum_, seqStart, seqEnd );
00461 
00462   //  lastSequenceNumber_ = savedSeqNum;
00463 
00464   //  thisReader_->ptr_->rewind();
00465   //  findSequence( savedSeqNum+1 );
00466 
00467   restoreState(pState);
00468   //  lastSequenceNumber_=savedSeqNum;
00469   //  thisReader_=savedReader;
00470   //  thisReader_->ptr_->findSequence(savedSeqNumPtr+1);
00471 
00472 } // ~SequenceReaderMulti::extractSource
00473 
00474 
00475 void SequenceReaderMulti::saveIndexImp
00476 ( ostream& fileFile, 
00477   ostream& indexFile, 
00478   int& fileNumber )
00479 {
00480   computeNumSequencesInFile();
00481   for ( vector<SeqReaderInfo>::iterator i 
00482           = allReaders_.begin(); 
00483           i != allReaders_.end();
00484           i++ )
00485   {
00486     (i->ptr_)->saveIndexImp( fileFile, indexFile, fileNumber );
00487     fileNumber++;
00488   } // ~for i
00489 
00490 } // ~SequenceReaderMulti::saveIndexImp
00491 
00492 
00493 
00494 // End of file SequenceReaderMulti.cpp
00495 
00496 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2