SequenceReader/SequenceReader.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : SequenceReader
00025 // File Name    : SequenceReader.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "SequenceReader.h"
00034 #include "HashTable.h"
00035 #include <string>
00036 #include <fstream>
00037 
00038 static const char nc='\0';
00039 static const char reverseChar[256] = 
00040 {
00041 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00042 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00043 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,'-',nc,nc,
00044 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00045 nc, /* next is 'A' */ 
00046 'T',nc,'G',nc,nc,nc,'C',nc,nc,nc,nc,nc,nc,'N',nc, /* next is 'P' */
00047 nc,nc,nc,nc,'A',nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00048 nc, /* next is 'a' */ 
00049 't',nc,'g',nc,nc,nc,'c',nc,nc,nc,nc,nc,nc,'n',nc, /* next is 'p' */
00050 nc,nc,nc,nc,'a',nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00051 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00052 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00053 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00054 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00055 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00056 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00057 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00058 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc
00059 };
00060 
00061 
00062 
00063 
00064 
00065 // ### Function Definitions ###
00066 
00067 SequenceReader::SequenceReader
00068 ( ostream& monitoringStream  ) : 
00069   //  pInputFileStream_( new ifstream( fileName.c_str() ) ), 
00070   //  inputFileStream_( fileName.c_str() ), 
00071 getName( this  ),
00072 getSideInfo( this ),
00073 getSource( this ),
00074 lastSequenceNumber_(0),
00075 allSequencesRead_(false),
00076 numSequencesInFile_(0),
00077 monitoringStream_( monitoringStream )
00078 {
00079   monitoringStream_ << "constructing SequenceReader" << endl;
00080 } // constructor
00081 
00082 SequenceReader::SequenceReader( const SequenceReader& rhs ) :
00083   getName( rhs.getName ),
00084   getSideInfo( rhs.getSideInfo ),
00085   getSource( rhs.getSource ),
00086   lastSequenceNumber_( rhs.lastSequenceNumber_ ),
00087   allSequencesRead_( rhs.allSequencesRead_ ),
00088   numSequencesInFile_( rhs.numSequencesInFile_ ),
00089   monitoringStream_(   rhs.monitoringStream_ )
00090 {
00091   monitoringStream_ << "copy constructing SequenceReader" << endl;
00092 } // copy constructor
00093 
00094 SequenceReader::~SequenceReader()
00095 {
00096   monitoringStream_ << "destructing SequenceReader" << endl;
00097 } // destructor
00098 
00099   // Function Name: changeMode
00100   // Arguments: const SequenceReaderMode&
00101   // Makes a copy of mode and uses it to handle mismatch character reads
00102 // Now pure virtual in this class TC 14.3.01
00103 //  void SequenceReader::changeMode( SequenceReaderMode* pMode )
00104 //  {
00105 //    monitoringStream_ << "SequenceReader::changeMode\n";
00106 //    delete pState_;
00107 //    pState_ = pMode->clone();
00108 //  }
00109 
00110 
00111   // Function Name: getSequence
00112   // Arguments: WordSequence& (out), SequenceNumber (in), const HashTable& (in)
00113   // Returns:   bool
00114   // Read the sequenceNumber-th set of sequence information from the file and 
00115   // parse it into WordSequence format, getting word length from hashTable
00116   int SequenceReader::getSequence
00117   ( WordSequence& nextSeq, SequenceNumber sequenceNumber, const HashTable& hashTable )
00118   {
00119       return getSequence( nextSeq, sequenceNumber, hashTable.getWordLength() );
00120   } // ~getSequence
00121 
00122   // Function Name: getNextSequence
00123   // Arguments: WordSequence& (out), HashTable& (in)
00124   // Returns:   int
00125   // Read the next set of sequence information from the file and parse it
00126   // into WordSequence format, using word length required by hashTable
00127   int SequenceReader::getNextSequence( WordSequence& nextSeq, const HashTable& hashTable )
00128   {
00129       return getNextSequence( nextSeq, hashTable.getWordLength() );
00130   } // ~getNextSequence
00131 
00132 
00133 void SequenceReaderNamePrinter::print( ostream& os )
00134 { 
00135   if (!pReader_->printName(os,seqNum_)) throw NumberOutOfRange();
00136 } // ~print
00137 
00138 void SequenceReaderSideInfoPrinter::print( ostream& os )
00139 { 
00140   if (!pReader_->printSideInfo(os,seqNum_)) throw NumberOutOfRange();
00141 } // ~print
00142 
00143 void SequenceReaderSourcePrinter::print( ostream& os )
00144 { 
00145   if (!pReader_->printSource(os,seqNum_)) throw NumberOutOfRange();
00146 } // ~print
00147 
00148 // SourceReader member function definitions
00149 
00150 void SourceReader::extractSource
00151 ( char** pSource, //vector<char>& source, 
00152   SequenceNumber seqNum,
00153   SequenceOffset seqStart,
00154   SequenceOffset seqEnd )
00155 {
00156   throw SSAHAException
00157     ("SourceReader::extractSource not implemented for this subclass!");
00158 } // ~SourceReader::extractSource
00159 
00160 void SourceReader::extractSourceReverse
00161 ( char** pSource, //vector<char>& source, 
00162   SequenceNumber seqNum,
00163   SequenceOffset seqStart,
00164   SequenceOffset seqEnd )
00165 {
00166   //  vector<char> temp;
00167   //  extractSource( temp, seqNum, seqStart, seqEnd );
00168   //  source.resize( source.size() + temp.size() );
00169   //  vector<char>::iterator i(temp.begin()), j(&source.back());
00170 
00171   reverseBuffer_.resize( seqEnd - seqStart + 1 );
00172 
00173   char* pSourceFwd;
00174   extractSource( &pSourceFwd, seqNum, seqStart, seqEnd );
00175 
00176   
00177   vector<char>::iterator i(reverseBuffer_.begin());
00178   char* j = &pSourceFwd[ seqEnd - seqStart ];
00179 
00180   for ( ; i != reverseBuffer_.end() ; i++, j-- )
00181   {
00182     *i= reverseChar[ *j ];
00183     //    cout << "char: " << (int)*j << *j << " - " << (int)*i << *i << endl;
00184   }
00185 
00186   *pSource = (char*)&reverseBuffer_[0];
00187 
00188 } // ~SourceReader::extractSourceReverse
00189 
00190 
00191 
00192 
00193 
00194 
00195 
00196 // save the location of the start of each sequence in a file
00197 // sets up ofstream and calls the virtual saveIndex below
00198 void SourceReader::saveIndex( const string& fileName )
00199 {
00200   ofstream filesFile((fileName+(string)".files").c_str());
00201   ofstream indexFile((fileName+(string)".index").c_str());
00202   assert(!filesFile.fail());
00203   assert(!indexFile.fail());
00204   int fileNum(0);
00205   saveIndexImp(filesFile, indexFile, fileNum);
00206   filesFile.close();
00207   indexFile.close();
00208 } // ~SourceReader::saveIndex 
00209 
00210 
00211 void SourceReader::saveIndexImp
00212 ( ostream& fileFile, 
00213   ostream& indexFile, 
00214   int& fileNumber )
00215 {
00216   throw SSAHAException("SourceReader::saveIndex not implemented for this subclass!");
00217 }
00218 
00219 SourceReaderIndex::SourceReaderIndex( const string& fileName ) :
00220 currentFileNum_(0)
00221 {
00222   ifstream filesFile((fileName+(string)".files").c_str());
00223   ifstream indexFile((fileName+(string)".index").c_str());
00224   assert(!filesFile.fail());
00225   assert(!indexFile.fail());
00226 
00227   // read in file names
00228   fileNames_.push_back( new string );
00229   
00230   while( filesFile >> *(fileNames_.back()) ) 
00231     fileNames_.push_back( new string );
00232 
00233   // get rid of unwanted empty string at end
00234   delete fileNames_.back();
00235   fileNames_.pop_back();
00236 
00237   filesFile.close();
00238   // read in index
00239 
00240   indexFile.seekg(0, ios::end);
00241   std::streampos indexSize( indexFile.tellg() );
00242   indexFile.seekg(0, ios::beg);
00243 
00244   assert ( indexSize % sizeof (SeqIndexInfo) == 0 );
00245 
00246   numSeqs_ = indexSize / sizeof( SeqIndexInfo );
00247 
00248   index_.resize( numSeqs_ );
00249 
00250   indexFile.read( (char*)&index_[0], indexSize );
00251 
00252   indexFile.close();
00253  
00254   //  for ( int i(0) ; i < index_.size() ; i++ )
00255   //    cout << "index: " << i << " " << index_[i].fileNum << " " 
00256   // << index_[i].seqPos << endl;
00257 
00258 
00259   pCurrentFile_ =  new ifstream( (*fileNames_[0]).c_str() );
00260 
00261 
00262 } // ~SourceReaderIndex::SourceReaderIndex( const string& fileName )
00263 
00264 
00265 SourceReaderIndex::~SourceReaderIndex()
00266 {
00267   for ( vector<string*>::iterator i( fileNames_.begin() );
00268        i != fileNames_.end() ; ++i ) delete *i;
00269 } // ~SourceReaderIndex()::~SourceReaderIndex()
00270 
00271 const char* SourceReaderIndex::extractName( SequenceNumber seqNum )
00272 {
00273   if (seqNum != lastNameSeqNum_)
00274   { // need to read a new sequence name
00275 
00276     assert( seqNum !=0 );
00277     assert( seqNum <= index_.size() );
00278 
00279     vector<SeqIndexInfo>::iterator pThisSeq(&index_[seqNum-1]);
00280 
00281     // cout << "seq location " << pThisSeq->fileNum << " " << pThisSeq->seqPos << endl;
00282     // cout << *fileNames_[pThisSeq->fileNum] << endl;
00283 
00284     if (pThisSeq->fileNum != currentFileNum_ )
00285     { // need to move to a different sequence file
00286       pCurrentFile_->close();
00287       delete pCurrentFile_;
00288       pCurrentFile_ 
00289         =  new ifstream( (*fileNames_[pThisSeq->fileNum]).c_str() );
00290       assert(!pCurrentFile_->fail());
00291       currentFileNum_ = pThisSeq->fileNum;
00292     }
00293 
00294     // wind to start of sequence
00295     pCurrentFile_->clear();
00296     pCurrentFile_->seekg( pThisSeq->seqPos, ios::beg );
00297 
00298     pCurrentFile_->getline
00299       ( nameBuffer_, nameBufferSize_, '\n' );
00300     lastNameSeqNum_ = seqNum; // +1 cos we decremented seqNum earlier
00301     char* pSpace = strchr( nameBuffer_, ' ' );
00302     if (pSpace!=NULL) *pSpace='\0';
00303     
00304   } // ~if
00305   assert((nameBuffer_[0]=='>')||(nameBuffer_[0]=='@'));
00306   return &nameBuffer_[1];
00307 
00308 } // ~SourceReaderIndex::extractName( SequenceNumber seqNum )
00309 
00310 
00311 
00312 void SourceReaderIndex::extractSource
00313 ( char** pSource, //vector<char>& source, 
00314   SequenceNumber seqNum,
00315   SequenceOffset seqStart,
00316   SequenceOffset seqEnd )
00317 {
00318 
00319   if (seqNum != lastSourceSeqNum_)
00320   { // need to put a new sequence in the cache
00321 
00322     assert( seqNum !=0 );
00323     assert( seqNum <= index_.size() );
00324 
00325     vector<SeqIndexInfo>::iterator pThisSeq(&index_[seqNum-1]);
00326 
00327     // cout << "seq location " << pThisSeq->fileNum << " " << pThisSeq->seqPos << endl;
00328     // cout << *fileNames_[pThisSeq->fileNum] << endl;
00329 
00330     if (pThisSeq->fileNum != currentFileNum_ )
00331     { // need to move to a different sequence file
00332       pCurrentFile_->close();
00333       delete pCurrentFile_;
00334       pCurrentFile_ 
00335         =  new ifstream( (*fileNames_[pThisSeq->fileNum]).c_str() );
00336       assert(!pCurrentFile_->fail());
00337       currentFileNum_ = pThisSeq->fileNum;
00338     }
00339 
00340     // wind to start of sequence
00341     pCurrentFile_->clear();
00342     pCurrentFile_->seekg( pThisSeq->seqPos, ios::beg );
00343     //  cout << "about to extract to cache "<< pCurrentFile_->tellg() << endl;
00344     extractToCache( pCurrentFile_ );
00345     lastSourceSeqNum_ = seqNum; // +1 cos we decremented seqNum earlier
00346     //    cout << "cached seq " << lastSourceSeqNum_ << " " << lastSourceSeq_.size() << endl;
00347     lastNameSeqNum_ = seqNum;
00348     char* pSpace = strchr( nameBuffer_, ' ' );
00349     if (pSpace!=NULL) *pSpace='\0';
00350   } // ~if
00351 
00352   // check for japes and tomfoolery
00353 
00354   if ( seqStart>seqEnd)
00355   {
00356       throw SSAHAException
00357         ("Requested seq start exceeds requested seq end in SourceReaderIndex::extractSource");
00358   } // ~if
00359   else if (seqEnd>lastSourceSeq_.size() )
00360   {
00361     cout << seqEnd << " " << lastSourceSeq_.size() << endl;
00362       throw SSAHAException
00363         ("Requested last byte exceeds end of seq in SourceReaderIndex::extractSource");
00364   } // ~else if
00365 
00366 
00367   *pSource = &lastSourceSeq_[seqStart-1]; 
00368 
00369   //  vector<char>::size_type currentSize(source.size());
00370   //  source.resize(currentSize+seqEnd-seqStart+1);
00371 
00372   //  memcpy( (char*)&source[currentSize], (char*)&lastSourceSeq_[seqStart-1], seqEnd-seqStart+1);
00373 
00374 
00375 }
00376 
00377 void SourceReader::extractToCache( istream* pCurrentFile)
00378 {
00379    // Check that this is a sensible place for sequence start
00380     int firstOfLine(pCurrentFile->peek());
00381     assert(((char)firstOfLine=='>')||((char)firstOfLine=='@'));
00382 
00383     int currentSize;
00384 
00385 
00386     lastSourceSeq_.resize(sourceBufferSize_);
00387 
00388     char* pStart = (char*) &lastSourceSeq_[0];
00389     char* pResize = (char*) &lastSourceSeq_.back()-resizeCacheThreshold_;
00390 
00391     //  get sequence name and put in nameBuffer_
00392     //    pCurrentFile->getline
00393     //  ( (char*)&lastSourceSeq_[0], sourceBufferSize_, '\n' );
00394     pCurrentFile->getline
00395       ( nameBuffer_, nameBufferSize_, '\n' );
00396 
00397     std::streampos lastPos;
00398     //    int firstOfLine;
00399 
00400     while(1)
00401     {
00402       lastPos = pCurrentFile->tellg();
00403       firstOfLine = pCurrentFile->peek();
00404       if ( ((char)firstOfLine=='>') || 
00405            ((char)firstOfLine=='+') || 
00406            (firstOfLine==EOF) ) break;
00407       pCurrentFile->getline( pStart, sourceBufferSize_, '\n' );
00408       pStart += (unsigned long)pCurrentFile->tellg()-lastPos-1; // -1 accounts for \n at end
00409       if (pStart >= pResize)
00410       {
00411         currentSize = pStart-&lastSourceSeq_[0]; 
00412         // need currentSize because storage for lastSourceSeq_ may move
00413         // when resized, thus invalidating pStart
00414         lastSourceSeq_.resize
00415           ( lastSourceSeq_.size() + (lastSourceSeq_.size()>>1) );
00416         pStart = &lastSourceSeq_[currentSize];
00417         pResize = (char*) &lastSourceSeq_.back()-resizeCacheThreshold_;
00418       } // ~if
00419     } // ~while
00420 
00421     lastSourceSeq_.resize( pStart-&lastSourceSeq_[0] );
00422     //    cout << "Read " << lastSourceSeq_.size() << " characters into cache.\n";
00423 
00424 
00425 
00426 }
00427 
00428 
00429 
00430 
00431 
00432 
00433 // Name:
00434 // Arguments:
00435 // TYPE  NAME  IN/OUT COMMENT
00436 // Returns: TYPE COMMENT
00437 
00438 // End of file SequenceReader.cpp
00439 
00440 
00441 
00442 
00443 
00444 
00445 
00446 
00447 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2