00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "SequenceReader.h"
00034 #include "HashTable.h"
00035 #include <string>
00036 #include <fstream>
00037
00038 static const char nc='\0';
00039 static const char reverseChar[256] =
00040 {
00041 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00042 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00043 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,'-',nc,nc,
00044 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00045 nc,
00046 'T',nc,'G',nc,nc,nc,'C',nc,nc,nc,nc,nc,nc,'N',nc,
00047 nc,nc,nc,nc,'A',nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00048 nc,
00049 't',nc,'g',nc,nc,nc,'c',nc,nc,nc,nc,nc,nc,'n',nc,
00050 nc,nc,nc,nc,'a',nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00051 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00052 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00053 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00054 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00055 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00056 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00057 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,
00058 nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc,nc
00059 };
00060
00061
00062
00063
00064
00065
00066
00067 SequenceReader::SequenceReader
00068 ( ostream& monitoringStream ) :
00069
00070
00071 getName( this ),
00072 getSideInfo( this ),
00073 getSource( this ),
00074 lastSequenceNumber_(0),
00075 allSequencesRead_(false),
00076 numSequencesInFile_(0),
00077 monitoringStream_( monitoringStream )
00078 {
00079 monitoringStream_ << "constructing SequenceReader" << endl;
00080 }
00081
00082 SequenceReader::SequenceReader( const SequenceReader& rhs ) :
00083 getName( rhs.getName ),
00084 getSideInfo( rhs.getSideInfo ),
00085 getSource( rhs.getSource ),
00086 lastSequenceNumber_( rhs.lastSequenceNumber_ ),
00087 allSequencesRead_( rhs.allSequencesRead_ ),
00088 numSequencesInFile_( rhs.numSequencesInFile_ ),
00089 monitoringStream_( rhs.monitoringStream_ )
00090 {
00091 monitoringStream_ << "copy constructing SequenceReader" << endl;
00092 }
00093
00094 SequenceReader::~SequenceReader()
00095 {
00096 monitoringStream_ << "destructing SequenceReader" << endl;
00097 }
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116 int SequenceReader::getSequence
00117 ( WordSequence& nextSeq, SequenceNumber sequenceNumber, const HashTable& hashTable )
00118 {
00119 return getSequence( nextSeq, sequenceNumber, hashTable.getWordLength() );
00120 }
00121
00122
00123
00124
00125
00126
00127 int SequenceReader::getNextSequence( WordSequence& nextSeq, const HashTable& hashTable )
00128 {
00129 return getNextSequence( nextSeq, hashTable.getWordLength() );
00130 }
00131
00132
00133 void SequenceReaderNamePrinter::print( ostream& os )
00134 {
00135 if (!pReader_->printName(os,seqNum_)) throw NumberOutOfRange();
00136 }
00137
00138 void SequenceReaderSideInfoPrinter::print( ostream& os )
00139 {
00140 if (!pReader_->printSideInfo(os,seqNum_)) throw NumberOutOfRange();
00141 }
00142
00143 void SequenceReaderSourcePrinter::print( ostream& os )
00144 {
00145 if (!pReader_->printSource(os,seqNum_)) throw NumberOutOfRange();
00146 }
00147
00148
00149
00150 void SourceReader::extractSource
00151 ( char** pSource,
00152 SequenceNumber seqNum,
00153 SequenceOffset seqStart,
00154 SequenceOffset seqEnd )
00155 {
00156 throw SSAHAException
00157 ("SourceReader::extractSource not implemented for this subclass!");
00158 }
00159
00160 void SourceReader::extractSourceReverse
00161 ( char** pSource,
00162 SequenceNumber seqNum,
00163 SequenceOffset seqStart,
00164 SequenceOffset seqEnd )
00165 {
00166
00167
00168
00169
00170
00171 reverseBuffer_.resize( seqEnd - seqStart + 1 );
00172
00173 char* pSourceFwd;
00174 extractSource( &pSourceFwd, seqNum, seqStart, seqEnd );
00175
00176
00177 vector<char>::iterator i(reverseBuffer_.begin());
00178 char* j = &pSourceFwd[ seqEnd - seqStart ];
00179
00180 for ( ; i != reverseBuffer_.end() ; i++, j-- )
00181 {
00182 *i= reverseChar[ *j ];
00183
00184 }
00185
00186 *pSource = (char*)&reverseBuffer_[0];
00187
00188 }
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198 void SourceReader::saveIndex( const string& fileName )
00199 {
00200 ofstream filesFile((fileName+(string)".files").c_str());
00201 ofstream indexFile((fileName+(string)".index").c_str());
00202 assert(!filesFile.fail());
00203 assert(!indexFile.fail());
00204 int fileNum(0);
00205 saveIndexImp(filesFile, indexFile, fileNum);
00206 filesFile.close();
00207 indexFile.close();
00208 }
00209
00210
00211 void SourceReader::saveIndexImp
00212 ( ostream& fileFile,
00213 ostream& indexFile,
00214 int& fileNumber )
00215 {
00216 throw SSAHAException("SourceReader::saveIndex not implemented for this subclass!");
00217 }
00218
00219 SourceReaderIndex::SourceReaderIndex( const string& fileName ) :
00220 currentFileNum_(0)
00221 {
00222 ifstream filesFile((fileName+(string)".files").c_str());
00223 ifstream indexFile((fileName+(string)".index").c_str());
00224 assert(!filesFile.fail());
00225 assert(!indexFile.fail());
00226
00227
00228 fileNames_.push_back( new string );
00229
00230 while( filesFile >> *(fileNames_.back()) )
00231 fileNames_.push_back( new string );
00232
00233
00234 delete fileNames_.back();
00235 fileNames_.pop_back();
00236
00237 filesFile.close();
00238
00239
00240 indexFile.seekg(0, ios::end);
00241 std::streampos indexSize( indexFile.tellg() );
00242 indexFile.seekg(0, ios::beg);
00243
00244 assert ( indexSize % sizeof (SeqIndexInfo) == 0 );
00245
00246 numSeqs_ = indexSize / sizeof( SeqIndexInfo );
00247
00248 index_.resize( numSeqs_ );
00249
00250 indexFile.read( (char*)&index_[0], indexSize );
00251
00252 indexFile.close();
00253
00254
00255
00256
00257
00258
00259 pCurrentFile_ = new ifstream( (*fileNames_[0]).c_str() );
00260
00261
00262 }
00263
00264
00265 SourceReaderIndex::~SourceReaderIndex()
00266 {
00267 for ( vector<string*>::iterator i( fileNames_.begin() );
00268 i != fileNames_.end() ; ++i ) delete *i;
00269 }
00270
00271 const char* SourceReaderIndex::extractName( SequenceNumber seqNum )
00272 {
00273 if (seqNum != lastNameSeqNum_)
00274 {
00275
00276 assert( seqNum !=0 );
00277 assert( seqNum <= index_.size() );
00278
00279 vector<SeqIndexInfo>::iterator pThisSeq(&index_[seqNum-1]);
00280
00281
00282
00283
00284 if (pThisSeq->fileNum != currentFileNum_ )
00285 {
00286 pCurrentFile_->close();
00287 delete pCurrentFile_;
00288 pCurrentFile_
00289 = new ifstream( (*fileNames_[pThisSeq->fileNum]).c_str() );
00290 assert(!pCurrentFile_->fail());
00291 currentFileNum_ = pThisSeq->fileNum;
00292 }
00293
00294
00295 pCurrentFile_->clear();
00296 pCurrentFile_->seekg( pThisSeq->seqPos, ios::beg );
00297
00298 pCurrentFile_->getline
00299 ( nameBuffer_, nameBufferSize_, '\n' );
00300 lastNameSeqNum_ = seqNum;
00301 char* pSpace = strchr( nameBuffer_, ' ' );
00302 if (pSpace!=NULL) *pSpace='\0';
00303
00304 }
00305 assert((nameBuffer_[0]=='>')||(nameBuffer_[0]=='@'));
00306 return &nameBuffer_[1];
00307
00308 }
00309
00310
00311
00312 void SourceReaderIndex::extractSource
00313 ( char** pSource,
00314 SequenceNumber seqNum,
00315 SequenceOffset seqStart,
00316 SequenceOffset seqEnd )
00317 {
00318
00319 if (seqNum != lastSourceSeqNum_)
00320 {
00321
00322 assert( seqNum !=0 );
00323 assert( seqNum <= index_.size() );
00324
00325 vector<SeqIndexInfo>::iterator pThisSeq(&index_[seqNum-1]);
00326
00327
00328
00329
00330 if (pThisSeq->fileNum != currentFileNum_ )
00331 {
00332 pCurrentFile_->close();
00333 delete pCurrentFile_;
00334 pCurrentFile_
00335 = new ifstream( (*fileNames_[pThisSeq->fileNum]).c_str() );
00336 assert(!pCurrentFile_->fail());
00337 currentFileNum_ = pThisSeq->fileNum;
00338 }
00339
00340
00341 pCurrentFile_->clear();
00342 pCurrentFile_->seekg( pThisSeq->seqPos, ios::beg );
00343
00344 extractToCache( pCurrentFile_ );
00345 lastSourceSeqNum_ = seqNum;
00346
00347 lastNameSeqNum_ = seqNum;
00348 char* pSpace = strchr( nameBuffer_, ' ' );
00349 if (pSpace!=NULL) *pSpace='\0';
00350 }
00351
00352
00353
00354 if ( seqStart>seqEnd)
00355 {
00356 throw SSAHAException
00357 ("Requested seq start exceeds requested seq end in SourceReaderIndex::extractSource");
00358 }
00359 else if (seqEnd>lastSourceSeq_.size() )
00360 {
00361 cout << seqEnd << " " << lastSourceSeq_.size() << endl;
00362 throw SSAHAException
00363 ("Requested last byte exceeds end of seq in SourceReaderIndex::extractSource");
00364 }
00365
00366
00367 *pSource = &lastSourceSeq_[seqStart-1];
00368
00369
00370
00371
00372
00373
00374
00375 }
00376
00377 void SourceReader::extractToCache( istream* pCurrentFile)
00378 {
00379
00380 int firstOfLine(pCurrentFile->peek());
00381 assert(((char)firstOfLine=='>')||((char)firstOfLine=='@'));
00382
00383 int currentSize;
00384
00385
00386 lastSourceSeq_.resize(sourceBufferSize_);
00387
00388 char* pStart = (char*) &lastSourceSeq_[0];
00389 char* pResize = (char*) &lastSourceSeq_.back()-resizeCacheThreshold_;
00390
00391
00392
00393
00394 pCurrentFile->getline
00395 ( nameBuffer_, nameBufferSize_, '\n' );
00396
00397 std::streampos lastPos;
00398
00399
00400 while(1)
00401 {
00402 lastPos = pCurrentFile->tellg();
00403 firstOfLine = pCurrentFile->peek();
00404 if ( ((char)firstOfLine=='>') ||
00405 ((char)firstOfLine=='+') ||
00406 (firstOfLine==EOF) ) break;
00407 pCurrentFile->getline( pStart, sourceBufferSize_, '\n' );
00408 pStart += (unsigned long)pCurrentFile->tellg()-lastPos-1;
00409 if (pStart >= pResize)
00410 {
00411 currentSize = pStart-&lastSourceSeq_[0];
00412
00413
00414 lastSourceSeq_.resize
00415 ( lastSourceSeq_.size() + (lastSourceSeq_.size()>>1) );
00416 pStart = &lastSourceSeq_[currentSize];
00417 pResize = (char*) &lastSourceSeq_.back()-resizeCacheThreshold_;
00418 }
00419 }
00420
00421 lastSourceSeq_.resize( pStart-&lastSourceSeq_[0] );
00422
00423
00424
00425
00426 }
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447