SequenceReader/SequenceReaderFilter.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : SequenceReaderFilter
00025 // File Name    : SequenceReaderFilter.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "SequenceReaderFilter.h"
00034 #include <strstream>
00035 
00036 // ### Function Definitions ###
00037 
00038 // Name:
00039 // Arguments:
00040 // TYPE  NAME  IN/OUT COMMENT
00041 // Returns: TYPE COMMENT
00042 
00043   // Function Name: Constructor
00044   // Arguments: ostream&
00045   // Takes the data from seqFile and places it in seqData_, seqBasesInLast_
00046   // and seqNames_
00047 
00048 SequenceReaderFilter::SequenceReaderFilter
00049 ( SequenceReader* pSeq, 
00050   ifstream* pFilterSource,
00051   ostream& monitoringStream ) :
00052   SequenceReader( monitoringStream ),
00053   pSeq_(pSeq),
00054   pFilterNames_( new StringHash ),
00055   numFiltered_(0)
00056 {
00057   monitoringStream_ 
00058     << "constructing SequenceReaderFilter" << endl;
00059   readFilterNames( pFilterSource );
00060 } // ~constructor
00061 
00062 SequenceReaderFilter::SequenceReaderFilter
00063 ( SequenceReader* pSeq, 
00064   const char* filterFileName, 
00065   ostream& monitoringStream ) :
00066   SequenceReader( monitoringStream ),
00067   pSeq_(pSeq),
00068   pFilterNames_( new StringHash ),
00069   numFiltered_(0)
00070 {
00071   monitoringStream_ 
00072     << "constructing SequenceReaderFilter for file " 
00073     << filterFileName << endl;
00074   readFilterNames( new ifstream( filterFileName ) );
00075 } // ~constructor
00076 
00077 // Function Name: readFilterNames
00078 // Arguments: ifstream*
00079 // Returns: void
00080 // Reads in a list of names to be filtered from an ifstream*
00081 // into set<string>*. Once done, deletes the ifstream
00082 void SequenceReaderFilter::readFilterNames( ifstream* pFilterSource )
00083 {
00084   if ( pFilterSource->fail() )
00085   {
00086     throw SSAHAException
00087       ( "SequenceReaderFile - unable to open filter file ");
00088 
00089   } // ~if
00090 
00091   //  string thisName;
00092 
00093   //  while (*pFilterSource>>thisName) pFilterNames_->insert(thisName);
00094   pFilterNames_->push_back( (string) "" );
00095   while (*pFilterSource>>pFilterNames_->back()) pFilterNames_->push_back( (string) "" );
00096   pFilterNames_->pop_back();
00097   pFilterNames_->makeBins();
00098 
00099   monitoringStream_ << "Read in " << pFilterNames_->size() 
00100                     << " names to filter out" << endl;
00101   filterNums_.push_back(0);
00102   pFilterSource->close();
00103   delete pFilterSource;
00104 }
00105 
00106 
00107 
00108 
00109 // Function Name: Copy constructor
00110 // Arguments:
00111 SequenceReaderFilter::SequenceReaderFilter( const SequenceReaderFilter& rhs ):
00112 SequenceReader( rhs.monitoringStream_ ),
00113 pSeq_( rhs.pSeq_->clone() )
00114 {
00115   monitoringStream_ << "copy constructing SequenceReaderFilter" << endl;
00116   // copy set across
00117   monitoringStream_ << "copy constructor not implemented!!" << endl;
00118   assert(1==0);
00119 } // ~destructor
00120 
00121   // Function Name: Destructor
00122   // Arguments:
00123 SequenceReaderFilter::~SequenceReaderFilter()
00124 {
00125   monitoringStream_ << "destructing SequenceReaderFilter" << endl;
00126   delete pSeq_;
00127   delete pFilterNames_;
00128 } // ~destructor
00129 
00130 bool SequenceReaderFilter::findSequence( SequenceNumber seqNum )
00131 {
00132   if ( seqNum <= filterNums_.size()-1 ) return true; // no neeed!
00133   if (allSequencesRead_)
00134   {
00135     monitoringStream_ 
00136       << "Requested sequence number (" << seqNum 
00137       << ")\nis outside range of sequences in file (1 to " 
00138       << numSequencesInFile_ << ")." << endl;
00139     return false;
00140   } // ~if
00141 
00142   SequenceNumber lastInSource(filterNums_.back());
00143   // = (filterNums_.empty() ? 0 : filterNums_.back()); no need, never empty
00144   std::ostrstream buffer;
00145   string name;
00146 
00147   while (filterNums_.size()!=seqNum+1)
00148   {
00149 
00150     ++lastInSource;
00151 
00152     if ( pSeq_->printName( buffer, lastInSource ) == false )
00153     {
00154       if(allSequencesRead_==false)
00155       {
00156         allSequencesRead_=true;
00157         if (pFilterNames_!=NULL) 
00158         {
00159           monitoringStream_ 
00160           << "deleting filter name data, no longer needed" << endl;
00161           delete pFilterNames_;
00162           pFilterNames_=NULL;  
00163         } // ~if
00164         numSequencesInFile_ = filterNums_.size()-1;
00165         monitoringStream_ 
00166           << "Requested sequence number (" << seqNum
00167           << ")\nis outside range of sequences in file (1 to " 
00168           << numSequencesInFile_ << ")." << endl;
00169 
00170       } // ~if
00171       return false;
00172     } // ~catch
00173 
00174     name=buffer.str();
00175     string::size_type nameEnd=name.find_first_of('\t');
00176     // NB stop at tab not spc
00177     if (nameEnd!=string::npos) name.erase(nameEnd,name.size());
00178 
00179     //    if ( pFilterNames_->find(name)==pFilterNames_->end() )
00180     if ( !pFilterNames_->isPresent(name) )
00181     {
00182       filterNums_.push_back(lastInSource);
00183       //  monitoringStream_ << filterNums_.size()-1 << " " << lastInSource << endl;
00184     } // ~if
00185     else
00186     {
00187       //      monitoringStream_ << filterNums_.size()-1 << " " << lastInSource 
00188       //                << " Filtered: " << name << " " 
00189       //                << *(pFilterNames_->find(name)) << endl;
00190       numFiltered_++;
00191     } // ~else
00192 
00193 
00194     buffer.freeze(false); // release memory to write further stuff to buffer  
00195     buffer.seekp(0,ios::beg); // ensure next write is to start of buffer
00196   
00197   } // ~while
00198 
00199   return true;
00200 
00201 } // ~SequenceReaderFilter::findSequence( SequenceNumber seqNum )
00202 
00203 // Function Name: getNextSequence
00204 // Arguments: WordSequence& (out), int (in)
00205 // Returns:   int
00206 // Read the next set of sequence information from the file and parse it
00207 // into WordSequence format. Returns -1 if there has been a problem with
00208 // reading the sequence, else returns the number of valid base pairs 
00209 // contained within the final word of the sequence.
00210 int SequenceReaderFilter::getNextSequence( WordSequence& nextSeq, 
00211                                            int wordLength )
00212 {
00213   if (!findSequence(lastSequenceNumber_+1)) 
00214   {
00215     assert(filterNums_.size()==lastSequenceNumber_+1);
00216     return -1;
00217   } // ~if
00218   else 
00219   {
00220     ++lastSequenceNumber_;
00221     return pSeq_->getSequence( nextSeq, filterNums_[lastSequenceNumber_], 
00222                                  wordLength );
00223   } // ~else
00224 } // ~SequenceReaderFilter::getNextSequence
00225 
00226 // Function Name: getSequence
00227 // Arguments: WordSequence& (out), SequenceNumber (in), int (in)
00228 // Returns:   bool
00229 // Read the sequenceNumber-th set of sequence information from the file and 
00230 // parse it into WordSequence format
00231 int SequenceReaderFilter::getSequence
00232 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength )
00233 {
00234   if (!findSequence(sequenceNumber)) return -1;
00235   assert(filterNums_.size()>=sequenceNumber+1);
00236   lastSequenceNumber_=sequenceNumber;
00237   return pSeq_->getSequence
00238     ( nextSeq, filterNums_[sequenceNumber], wordLength );
00239   
00240 } // ~SequenceReaderFilter::getSequence
00241 
00242 
00243 void SequenceReaderFilter::rewind( void ) 
00244 {
00245   pSeq_->rewind();
00246   lastSequenceNumber_ = 0;
00247 } // ~rewind 
00248 
00249 // Function Name: getLastSequenceName
00250 // Arguments: string& (out)
00251 // Returns:   void
00252 // Fills the string with the name of the last sequence read 
00253 void SequenceReaderFilter::getLastSequenceName( string& seqName ) const
00254 {
00255   return pSeq_->getLastSequenceName(seqName);
00256 } // ~SequenceReaderFilter::getLastSequenceName( string& seqName ) const
00257 
00258 // Function Name: getSequenceName
00259 // Arguments: string& (out), SequenceNumber (in)
00260 // Returns:   void
00261 // Fills a string with the name of the requested sequence
00262 bool SequenceReaderFilter::printName( ostream& os, SequenceNumber seqNum) 
00263 {
00264   if (!findSequence(seqNum)) return false;
00265   assert(filterNums_.size()>=seqNum+1);
00266   pSeq_->printName(os, filterNums_[seqNum]);
00267   return true;
00268 } // ~SequenceReaderFilter::printName( ostream& os, SequenceNumber seqNum)  
00269 
00270 // Function Name: printSideInfo
00271 // Arguments: string& (out), SequenceNumber (in)
00272 // Returns:   void
00273 // Fills a string with the name of the requested sequence
00274 bool SequenceReaderFilter::printSideInfo
00275 ( ostream& os, SequenceNumber seqNum )
00276 {
00277   if (!findSequence(seqNum)) return false;
00278   assert(filterNums_.size()>=seqNum+1);
00279   pSeq_->printName(os, filterNums_[seqNum]);
00280   return true;
00281 } // ~bool SequenceReaderFilter::printSideInfo
00282 
00283 // Function Name: printSource
00284 // Arguments: string& (out), SequenceNumber (in)
00285 // Returns:   void
00286 // Fills a string with the name of the requested sequence
00287 bool SequenceReaderFilter::printSource
00288 ( ostream& os, SequenceNumber seqNum )
00289 {
00290   if (!findSequence(seqNum)) return false;
00291   assert(filterNums_.size()>=seqNum+1);
00292   pSeq_->printName(os, filterNums_[seqNum]);
00293   return true;
00294 } // ~SequenceReaderFilter::printSource
00295 
00296 SequenceNumber SequenceReaderFilter::computeNumSequencesInFile( void )
00297 {
00298   try
00299   {
00300     // this will always work out num seqs in pSeq_ 
00301     findSequence( 1+pSeq_->getNumSequencesInFile() );
00302   } // ~try
00303   catch( const NumberOutOfRange& err )
00304     {} // ~catch
00305 
00306   return numSequencesInFile_;
00307 
00308 } // ~SequenceReaderFilter::computeNumSequencesInFile( void )
00309 
00310 // Functions to implement SourceReader functionality
00311 
00312 // Function Name: extractSource
00313 // This extracts the source data for bases seqStart to seqEnd inclusive
00314 // of sequence seqNum and places it in source
00315 void SequenceReaderFilter::extractSource
00316 ( char** pSource, 
00317   SequenceNumber seqNum,
00318   SequenceOffset seqStart,
00319   SequenceOffset seqEnd )
00320 {
00321   //  cout << seqNum << " " << filterNums_[seqNum] << " " << seqStart << " " << seqEnd << endl; 
00322   pSeq_->extractSource( pSource, filterNums_[seqNum], seqStart, seqEnd );
00323 } // ~void SequenceReaderFilter::extractSource
00324 
00325 
00326 // Function Name: saveIndexImp
00327 // This makes and saves the index for *this, by kinda monkeying around
00328 // with the index for *pSeq_ 
00329 void SequenceReaderFilter::saveIndexImp
00330 ( ostream& fileFile, 
00331   ostream& indexFile, 
00332   int& fileNumber )
00333 {
00334   std::strstream indexTemp;
00335   pSeq_->saveIndexImp( fileFile, indexTemp, fileNumber );
00336   const SeqIndexInfo* pIndex= (const SeqIndexInfo*)indexTemp.str();
00337 
00338   // writing individually is slow as a slow thing on Valium ...
00339   //  int numSeqs(getNumSequencesInFile());
00340   //  for (int i(1); i<=numSeqs; i++)
00341   //  {
00342   //    //        cout << "Index: " << i << " " << filterNums_[i]-1 << " "  
00343   //    //     << pIndex[filterNums_[i]-1].fileNum
00344     //   << " " << pIndex[filterNums_[i]-1].seqPos << endl;
00345   //   indexFile.write
00346   //    ( (const char*)&pIndex[ filterNums_[i]-1 ], sizeof(SeqIndexInfo) );
00347   //  } // ~for
00348 
00349   int numSeqs(getNumSequencesInFile());
00350   vector<SeqIndexInfo> newIndex;
00351   newIndex.reserve(numSeqs);
00352 
00353   for (int i(1); i<=numSeqs; i++)
00354   {
00355     //        cout << "Index: " << i << " " << filterNums_[i]-1 << " "  
00356     //     << pIndex[filterNums_[i]-1].fileNum
00357     //   << " " << pIndex[filterNums_[i]-1].seqPos << endl;
00358     newIndex.push_back(pIndex[ filterNums_[i]-1 ]);
00359   } // ~for
00360 
00361   indexFile.write
00362     ( (const char*)&newIndex[ 0 ], numSeqs*sizeof(SeqIndexInfo) );
00363 
00364 
00365 
00366 } // ~void SequenceReaderFilter::saveIndexImp
00367 
00368 
00369 // StringHash member functions
00370 
00371 
00372 void StringHash::makeBins( int numBins )
00373 {
00374   //  cout << "Hashing " << size() << " sequence names into "
00375   //   << numBins << " bins" << endl;
00376   
00377   numBins_ = numBins;
00378     bins_.clear();
00379     bins_.resize(numBins);
00380     for (vector<string>::const_iterator i(begin());i!=end();i++)
00381     {
00382       (bins_[ H_(i->c_str()) % numBins ]).push_back((string*)&(*i));
00383       //      (bins_[ H_(i->c_str()) % numBins ]).push_back();
00384       //  (bins_[ H_(i->c_str()) % numBins ]).back() = i;
00385     }
00386 } // StringHash
00387 
00388 size_t StringHash::makeNumBins( void ) const
00389 {
00390   static const size_t primes[] = { 103,1009,7013,10007,88883,100043 };
00391   static const int numPrimes = 6;
00392     size_t minSize = size() + (size()>>3); // current size +12.5%
00393     for (int i(0);i<numPrimes;i++)
00394       if (minSize<primes[i]) return primes[i];
00395     minSize |=1; // make sure it's odd
00396     if (minSize%5==0) minSize+=2; // make sure it's not a mult of 5
00397     // (cos then hash fn gives lousy performance)
00398     return minSize;
00399 }
00400 
00401 
00402 
00403 
00404 
00405 
00406 // End of file SequenceReaderFilter.cpp
00407 
00408 
00409 
00410 
00411 
00412 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2