00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "SequenceReaderFilter.h"
00034 #include <strstream>
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048 SequenceReaderFilter::SequenceReaderFilter
00049 ( SequenceReader* pSeq,
00050 ifstream* pFilterSource,
00051 ostream& monitoringStream ) :
00052 SequenceReader( monitoringStream ),
00053 pSeq_(pSeq),
00054 pFilterNames_( new StringHash ),
00055 numFiltered_(0)
00056 {
00057 monitoringStream_
00058 << "constructing SequenceReaderFilter" << endl;
00059 readFilterNames( pFilterSource );
00060 }
00061
00062 SequenceReaderFilter::SequenceReaderFilter
00063 ( SequenceReader* pSeq,
00064 const char* filterFileName,
00065 ostream& monitoringStream ) :
00066 SequenceReader( monitoringStream ),
00067 pSeq_(pSeq),
00068 pFilterNames_( new StringHash ),
00069 numFiltered_(0)
00070 {
00071 monitoringStream_
00072 << "constructing SequenceReaderFilter for file "
00073 << filterFileName << endl;
00074 readFilterNames( new ifstream( filterFileName ) );
00075 }
00076
00077
00078
00079
00080
00081
00082 void SequenceReaderFilter::readFilterNames( ifstream* pFilterSource )
00083 {
00084 if ( pFilterSource->fail() )
00085 {
00086 throw SSAHAException
00087 ( "SequenceReaderFile - unable to open filter file ");
00088
00089 }
00090
00091
00092
00093
00094 pFilterNames_->push_back( (string) "" );
00095 while (*pFilterSource>>pFilterNames_->back()) pFilterNames_->push_back( (string) "" );
00096 pFilterNames_->pop_back();
00097 pFilterNames_->makeBins();
00098
00099 monitoringStream_ << "Read in " << pFilterNames_->size()
00100 << " names to filter out" << endl;
00101 filterNums_.push_back(0);
00102 pFilterSource->close();
00103 delete pFilterSource;
00104 }
00105
00106
00107
00108
00109
00110
00111 SequenceReaderFilter::SequenceReaderFilter( const SequenceReaderFilter& rhs ):
00112 SequenceReader( rhs.monitoringStream_ ),
00113 pSeq_( rhs.pSeq_->clone() )
00114 {
00115 monitoringStream_ << "copy constructing SequenceReaderFilter" << endl;
00116
00117 monitoringStream_ << "copy constructor not implemented!!" << endl;
00118 assert(1==0);
00119 }
00120
00121
00122
00123 SequenceReaderFilter::~SequenceReaderFilter()
00124 {
00125 monitoringStream_ << "destructing SequenceReaderFilter" << endl;
00126 delete pSeq_;
00127 delete pFilterNames_;
00128 }
00129
00130 bool SequenceReaderFilter::findSequence( SequenceNumber seqNum )
00131 {
00132 if ( seqNum <= filterNums_.size()-1 ) return true;
00133 if (allSequencesRead_)
00134 {
00135 monitoringStream_
00136 << "Requested sequence number (" << seqNum
00137 << ")\nis outside range of sequences in file (1 to "
00138 << numSequencesInFile_ << ")." << endl;
00139 return false;
00140 }
00141
00142 SequenceNumber lastInSource(filterNums_.back());
00143
00144 std::ostrstream buffer;
00145 string name;
00146
00147 while (filterNums_.size()!=seqNum+1)
00148 {
00149
00150 ++lastInSource;
00151
00152 if ( pSeq_->printName( buffer, lastInSource ) == false )
00153 {
00154 if(allSequencesRead_==false)
00155 {
00156 allSequencesRead_=true;
00157 if (pFilterNames_!=NULL)
00158 {
00159 monitoringStream_
00160 << "deleting filter name data, no longer needed" << endl;
00161 delete pFilterNames_;
00162 pFilterNames_=NULL;
00163 }
00164 numSequencesInFile_ = filterNums_.size()-1;
00165 monitoringStream_
00166 << "Requested sequence number (" << seqNum
00167 << ")\nis outside range of sequences in file (1 to "
00168 << numSequencesInFile_ << ")." << endl;
00169
00170 }
00171 return false;
00172 }
00173
00174 name=buffer.str();
00175 string::size_type nameEnd=name.find_first_of('\t');
00176
00177 if (nameEnd!=string::npos) name.erase(nameEnd,name.size());
00178
00179
00180 if ( !pFilterNames_->isPresent(name) )
00181 {
00182 filterNums_.push_back(lastInSource);
00183
00184 }
00185 else
00186 {
00187
00188
00189
00190 numFiltered_++;
00191 }
00192
00193
00194 buffer.freeze(false);
00195 buffer.seekp(0,ios::beg);
00196
00197 }
00198
00199 return true;
00200
00201 }
00202
00203
00204
00205
00206
00207
00208
00209
00210 int SequenceReaderFilter::getNextSequence( WordSequence& nextSeq,
00211 int wordLength )
00212 {
00213 if (!findSequence(lastSequenceNumber_+1))
00214 {
00215 assert(filterNums_.size()==lastSequenceNumber_+1);
00216 return -1;
00217 }
00218 else
00219 {
00220 ++lastSequenceNumber_;
00221 return pSeq_->getSequence( nextSeq, filterNums_[lastSequenceNumber_],
00222 wordLength );
00223 }
00224 }
00225
00226
00227
00228
00229
00230
00231 int SequenceReaderFilter::getSequence
00232 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, int wordLength )
00233 {
00234 if (!findSequence(sequenceNumber)) return -1;
00235 assert(filterNums_.size()>=sequenceNumber+1);
00236 lastSequenceNumber_=sequenceNumber;
00237 return pSeq_->getSequence
00238 ( nextSeq, filterNums_[sequenceNumber], wordLength );
00239
00240 }
00241
00242
00243 void SequenceReaderFilter::rewind( void )
00244 {
00245 pSeq_->rewind();
00246 lastSequenceNumber_ = 0;
00247 }
00248
00249
00250
00251
00252
00253 void SequenceReaderFilter::getLastSequenceName( string& seqName ) const
00254 {
00255 return pSeq_->getLastSequenceName(seqName);
00256 }
00257
00258
00259
00260
00261
00262 bool SequenceReaderFilter::printName( ostream& os, SequenceNumber seqNum)
00263 {
00264 if (!findSequence(seqNum)) return false;
00265 assert(filterNums_.size()>=seqNum+1);
00266 pSeq_->printName(os, filterNums_[seqNum]);
00267 return true;
00268 }
00269
00270
00271
00272
00273
00274 bool SequenceReaderFilter::printSideInfo
00275 ( ostream& os, SequenceNumber seqNum )
00276 {
00277 if (!findSequence(seqNum)) return false;
00278 assert(filterNums_.size()>=seqNum+1);
00279 pSeq_->printName(os, filterNums_[seqNum]);
00280 return true;
00281 }
00282
00283
00284
00285
00286
00287 bool SequenceReaderFilter::printSource
00288 ( ostream& os, SequenceNumber seqNum )
00289 {
00290 if (!findSequence(seqNum)) return false;
00291 assert(filterNums_.size()>=seqNum+1);
00292 pSeq_->printName(os, filterNums_[seqNum]);
00293 return true;
00294 }
00295
00296 SequenceNumber SequenceReaderFilter::computeNumSequencesInFile( void )
00297 {
00298 try
00299 {
00300
00301 findSequence( 1+pSeq_->getNumSequencesInFile() );
00302 }
00303 catch( const NumberOutOfRange& err )
00304 {}
00305
00306 return numSequencesInFile_;
00307
00308 }
00309
00310
00311
00312
00313
00314
00315 void SequenceReaderFilter::extractSource
00316 ( char** pSource,
00317 SequenceNumber seqNum,
00318 SequenceOffset seqStart,
00319 SequenceOffset seqEnd )
00320 {
00321
00322 pSeq_->extractSource( pSource, filterNums_[seqNum], seqStart, seqEnd );
00323 }
00324
00325
00326
00327
00328
00329 void SequenceReaderFilter::saveIndexImp
00330 ( ostream& fileFile,
00331 ostream& indexFile,
00332 int& fileNumber )
00333 {
00334 std::strstream indexTemp;
00335 pSeq_->saveIndexImp( fileFile, indexTemp, fileNumber );
00336 const SeqIndexInfo* pIndex= (const SeqIndexInfo*)indexTemp.str();
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349 int numSeqs(getNumSequencesInFile());
00350 vector<SeqIndexInfo> newIndex;
00351 newIndex.reserve(numSeqs);
00352
00353 for (int i(1); i<=numSeqs; i++)
00354 {
00355
00356
00357
00358 newIndex.push_back(pIndex[ filterNums_[i]-1 ]);
00359 }
00360
00361 indexFile.write
00362 ( (const char*)&newIndex[ 0 ], numSeqs*sizeof(SeqIndexInfo) );
00363
00364
00365
00366 }
00367
00368
00369
00370
00371
00372 void StringHash::makeBins( int numBins )
00373 {
00374
00375
00376
00377 numBins_ = numBins;
00378 bins_.clear();
00379 bins_.resize(numBins);
00380 for (vector<string>::const_iterator i(begin());i!=end();i++)
00381 {
00382 (bins_[ H_(i->c_str()) % numBins ]).push_back((string*)&(*i));
00383
00384
00385 }
00386 }
00387
00388 size_t StringHash::makeNumBins( void ) const
00389 {
00390 static const size_t primes[] = { 103,1009,7013,10007,88883,100043 };
00391 static const int numPrimes = 6;
00392 size_t minSize = size() + (size()>>3);
00393 for (int i(0);i<numPrimes;i++)
00394 if (minSize<primes[i]) return primes[i];
00395 minSize |=1;
00396 if (minSize%5==0) minSize+=2;
00397
00398 return minSize;
00399 }
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412