00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include "SequenceEncoder.h"
00034 #include "SequenceReader.h"
00035
00036
00037 void print( const TranslationTable& tt )
00038 {
00039 for ( int i(0) ; i < numPossibleChars ; ++ i )
00040 {
00041 cout << i << ":";
00042 if (isgraph((uchar)i)) cout << " (" << (uchar)i << ") ";
00043 cout << "\t " << (int)tt[i];
00044 if (isgraph(tt[i])) cout << " (" << tt[i] << ") ";
00045 cout << endl;
00046 }
00047 }
00048
00049 SequenceEncoder::SequenceEncoder
00050 ( const TranslationTable* tt,
00051 SourceDataType sourceData,
00052 int bitsPerSymbol,
00053 int wordLength,
00054 ostream& monitoringStream):
00055 monitoringStream_( monitoringStream ),
00056 tt_(tt), sourceData_( sourceData ), bitsPerSymbol_(bitsPerSymbol),
00057 pState_( new SequenceReaderModeIgnore( monitoringStream ) ),
00058 wordFlag_(0),
00059 doubleBitShift_(bitsPerSymbol<<1),
00060 symbolMask_((1<<bitsPerSymbol)-1)
00061
00062
00063 {
00064 monitoringStream_ << "constructing SequenceEncoder" << endl;
00065 setWordLength( wordLength );
00066 }
00067
00068 SequenceEncoder::SequenceEncoder( const SequenceEncoder& rhs ) :
00069 monitoringStream_( rhs.monitoringStream_ ),
00070 tt_( rhs.tt_ ),
00071 ett_( rhs.ett_ ),
00072 sourceData_( rhs.sourceData_ ),
00073 bitsPerSymbol_( rhs.bitsPerSymbol_ ),
00074 wordLength_( rhs.wordLength_ ),
00075
00076 pState_( rhs.pState_->clone() ),
00077 wordFlag_(0),
00078
00079
00080 symbolMask_(rhs.symbolMask_),
00081 doubleBitShift_(rhs.doubleBitShift_)
00082 {
00083 monitoringStream_ << "copy constructing SequenceEncoder" << endl;
00084 }
00085
00086 SequenceEncoder::~SequenceEncoder()
00087 {
00088 monitoringStream_ << "destructing SequenceEncoder" << endl;
00089 delete pState_;
00090 }
00091
00092 SequenceEncoderProtein::SequenceEncoderProtein
00093 ( int wordLength, ostream& monStream ) :
00094 SequenceEncoder
00095 ( &ttProtein, gProteinData, gResidueBits, wordLength, monStream )
00096 {
00097 monitoringStream_ << "constructing SequenceEncoderProtein" << endl;
00098 ett_ = &ettSource_;
00099 if (!isExpanded_)
00100 {
00101 expandTranslationTable( ettSource_ );
00102 isExpanded_ = true;
00103 }
00104 }
00105
00106 SequenceEncoderDNA::SequenceEncoderDNA
00107 ( int wordLength, ostream& monStream ) :
00108 SequenceEncoder( &ttDNA, gDNAData, gBaseBits, wordLength, monStream )
00109 {
00110 monitoringStream_ << "constructing SequenceEncoderDNA" << endl;
00111 ett_ = &ettSource_;
00112 if (!isExpanded_)
00113 {
00114 expandTranslationTable( ettSource_ );
00115 isExpanded_ = true;
00116 }
00117 }
00118
00119 SequenceEncoderCodon::SequenceEncoderCodon
00120 ( int wordLength, ostream& monStream ) :
00121 SequenceEncoder
00122 ( &ttCodon, gDNAData, gResidueBits, wordLength, monStream )
00123 {
00124 monitoringStream_ << "constructing SequenceEncoderCodon" << endl;
00125 ett_ = &ettSource_;
00126 if (!isExpanded_)
00127 {
00128 expandTranslationTable( ettSource_ );
00129 isExpanded_ = true;
00130 }
00131 }
00132
00133
00134
00135 bool SequenceEncoderDNA::isExpanded_ = false;
00136 bool SequenceEncoderProtein::isExpanded_ = false;
00137 bool SequenceEncoderCodon::isExpanded_ = false;
00138 ExpandedTranslationTable SequenceEncoderDNA::ettSource_;
00139 ExpandedTranslationTable SequenceEncoderProtein::ettSource_;
00140 ExpandedTranslationTable SequenceEncoderCodon::ettSource_;
00141
00142
00143
00144
00145
00146 void SequenceEncoder::changeMode( SequenceReaderMode* pMode )
00147 {
00148 monitoringStream_ << "SequenceReader::changeMode\n";
00149 delete pState_;
00150 pState_ = pMode->clone();
00151 }
00152
00153
00154 void SequenceEncoder::encode( const char* data, int numChars )
00155 {
00156 if (numChars==0) return;
00157
00158 ushort* p( (ushort*) data );
00159
00160 const ushort* lastp( (ushort*) &data[ numChars - (numChars%2) ] );
00161 uchar* pTemp;
00162
00163 int basesInLast(pSeq_->getNumBasesInLast());
00164
00165 int splitWord( wordLength_-1 );
00166
00167 Word thisWord(pSeq_->back()), thisCode(0);
00168 pSeq_->pop_back();
00169
00170 while (p < lastp)
00171 {
00172
00173
00174 thisCode = (*ett_)[ *p ];
00175 if ( (thisCode & someCharInvalid) == 0 )
00176 {
00177
00178 if ( basesInLast != splitWord )
00179 {
00180 thisWord <<= doubleBitShift_;
00181 thisWord |= thisCode;
00182
00183
00184 basesInLast +=2;
00185 if (basesInLast==wordLength_)
00186 addWord( thisWord, wordFlag_, basesInLast );
00187 }
00188 else
00189 {
00190 thisWord <<= bitsPerSymbol_;
00191
00192 thisWord |= (( thisCode >> bitsPerSymbol_ ) & symbolMask_ );
00193
00194
00195
00196 addWord( thisWord, wordFlag_, basesInLast );
00197 thisWord = thisCode & symbolMask_;
00198
00199
00200 basesInLast = 1;
00201 }
00202 }
00203 else
00204 {
00205
00206 pTemp = (uchar*) p;
00207 encodeChar( *pTemp, thisWord, wordFlag_, basesInLast );
00208 ++pTemp;
00209 encodeChar( *pTemp, thisWord, wordFlag_, basesInLast );
00210
00211 }
00212 p++;
00213 }
00214
00215
00216 assert(p==lastp);
00217 if (numChars%2==1)
00218 {
00219 pTemp = (uchar*)p;
00220
00221 encodeChar( *p, thisWord, wordFlag_, basesInLast );
00222 }
00223
00224 pSeq_->setNumBasesInLast(basesInLast);
00225 pSeq_->push_back(thisWord);
00226
00227
00228
00229
00230 }
00231
00232 void SequenceEncoder::encodeChar
00233 ( uchar thisChar, Word& thisWord, Word& thisFlag, int& basesInLast )
00234 {
00235 Word thisCode;
00236
00237 if ( (*tt_)[ thisChar ] == nv )
00238 pState_->mismatch( thisChar, thisFlag );
00239
00240
00241 thisCode = (*tt_)[ thisChar ];
00242
00243 if ( thisCode == nv )
00244 {
00245
00246 return;
00247 }
00248 thisWord <<= bitsPerSymbol_;
00249 thisWord |= thisCode;
00250
00251
00252
00253 basesInLast++;
00254 if (basesInLast==wordLength_) addWord( thisWord, thisFlag, basesInLast );
00255
00256 }
00257
00258 void SequenceEncoder::addWord
00259 ( Word& thisWord, Word& thisFlag, int& basesInLast )
00260 {
00261
00262 thisWord |= thisFlag;
00263 pSeq_->push_back(thisWord);
00264
00265
00266 thisFlag=0;
00267 thisWord=0;
00268 basesInLast=0;
00269 }
00270
00271
00272
00273 void SequenceEncoder::unlinkSeq( void )
00274 {
00275 if (pSeq_->getNumBasesInLast() != 0)
00276 {
00277
00278 Word curseFlag = ( pSeq_->back() & gCursedWord );
00279
00280 pSeq_->back()
00281 <<= (bitsPerSymbol_*(wordLength_-pSeq_->getNumBasesInLast()));
00282
00283
00284 pSeq_->back() |= curseFlag;
00285
00286 }
00287
00288 pSeq_ = NULL;
00289 }
00290
00291
00292 void SequenceEncoder::expandTranslationTable
00293 ( ExpandedTranslationTable& ett )
00294 {
00295 monitoringStream_ << "expanding the ttable" << endl;
00296 uchar a[2];
00297 unsigned short* b = (unsigned short*) &a[0];
00298 a[0]=0; a[1]=0;
00299 Word w[2];
00300
00301 do
00302 {
00303 do
00304 {
00305
00306
00307
00308
00309
00310 w[ 0 ] = (*tt_)[ a[0] ];
00311 w[ 1 ] = (*tt_)[ a[1] ];
00312 if ( ( w[0] == nv ) || ( w[1] == nv ) )
00313 {
00314 ett[ *b ] = (w[0]==nv)*firstCharInvalid;
00315 ett[ *b ] |= (w[1]==nv)*secondCharInvalid;
00316 }
00317 else
00318 {
00319 ett[ *b ] = w[0];
00320 ett[ *b ] <<= bitsPerSymbol_;
00321 ett[ *b ] |= w[1];
00322
00323 }
00324 }
00325 while( ++a[1] != 0 );
00326 }
00327 while( ++a[0] != 0 );
00328 monitoringStream_ << "done expanding" << endl;
00329
00330
00331 }
00332
00333
00334
00335
00336
00337
00338
00339 void codonize
00340 ( const WordSequence& in, CodonList& codons, int readingFrame )
00341 {
00342
00343 static const Word mask((1<<gCodonBits)-1);
00344
00345 int shift( gBitsPerWord-gCodonBits-gBaseBits*readingFrame );
00346
00347
00348 WordSequence::const_iterator i(in.begin());
00349
00350 int codonsSoFar(0);
00351 int totalCodons
00352 ( (gMaxBasesPerWord*(in.size()-1)
00353 + in.getNumBasesInLast() - readingFrame)/gBasesPerCodon );
00354
00355 Word thisCodon;
00356
00357 while( 1 )
00358 {
00359
00360 if (shift>=0)
00361 {
00362 thisCodon = 0;
00363 }
00364 else
00365 {
00366 thisCodon = (*i<<-shift);
00367 i++;
00368 shift += gBitsPerWord;
00369 }
00370 thisCodon |= (*i>>shift);
00371 thisCodon &= mask;
00372
00373 codons.push_back(thisCodon);
00374 if (++codonsSoFar==totalCodons) break;
00375 if (shift == 0) { shift = gBitsPerWord; i++; }
00376 shift -= gCodonBits;
00377
00378 }
00379
00380
00381
00382
00383 }
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401 void codonizeAndFlag
00402 ( const WordSequence& in, CodonList& codons, int readingFrame )
00403 {
00404
00405 static const int DNAWordSizeForHashing(gMaxBasesPerWord-1);
00406
00407 int codonsInLast((in.getNumBasesInLast()-readingFrame)/gNumReadingFrames);
00408
00409
00410 int numCodons
00411 ( ( (in.size() == 0)
00412 ? 0
00413 : ((in.size()-1)*DNAWordSizeForHashing)+in.getNumBasesInLast()
00414 -readingFrame )
00415 / gNumReadingFrames );
00416
00417 codons.resize(numCodons);
00418 char* pCodon(static_cast<char*>(&(*codons.begin())));
00419 char* pEnd(static_cast<char*>(&(*codons.end())));
00420 WordSequence::const_iterator i(in.begin());
00421 WordSequence::const_iterator lastWord(in.end()-1);
00422 Word toCarry(~0);
00423 Word lastWordFlag(0);
00424
00425
00426
00427
00428
00429 if (readingFrame==0)
00430 {
00431 for( ; i!=lastWord ; ++i )
00432 {
00433
00434 *(pCodon++) = getCodonFromWord<4,0>(*i);
00435 *(pCodon++) = getCodonFromWord<3,0>(*i);
00436 *(pCodon++) = getCodonFromWord<2,0>(*i);
00437 *(pCodon++) = getCodonFromWord<1,0>(*i);
00438 *(pCodon++) = getCodonFromWord<0,0>(*i);
00439
00440 }
00441
00442 if (codonsInLast>=1)
00443 *(pCodon++) = getCodonFromWord<4,0>(*i);
00444 if (codonsInLast>=2)
00445 *(pCodon++) = getCodonFromWord<3,0>(*i);
00446 if (codonsInLast>=3)
00447 *(pCodon++) = getCodonFromWord<2,0>(*i);
00448 if (codonsInLast==4)
00449 *(pCodon++) = getCodonFromWord<1,0>(*i);
00450 }
00451 else if (readingFrame==1)
00452 {
00453
00454 for( ; i!=lastWord ; ++i )
00455 {
00456 if (toCarry!=~0)
00457 (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00458 ? flaggedChar
00459 : ( toCarry | (((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ));
00460
00461 *(pCodon++) = getCodonFromWord<3,2>(*i);
00462 *(pCodon++) = getCodonFromWord<2,2>(*i);
00463 *(pCodon++) = getCodonFromWord<1,2>(*i);
00464 *(pCodon++) = getCodonFromWord<0,2>(*i);
00465
00466 lastWordFlag = (*i)&gCursedWord;
00467 toCarry = ((*i)& mask2Bases) << gBaseBits;
00468
00469 }
00470 if (in.getNumBasesInLast() >= readingFrame )
00471 {
00472 *(pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00473 ? flaggedChar
00474 : ( toCarry | (((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ));
00475 }
00476 if (codonsInLast>=1)
00477 *(pCodon++) = getCodonFromWord<3,2>(*i);
00478 if (codonsInLast>=2)
00479 *(pCodon++) = getCodonFromWord<2,2>(*i);
00480 if (codonsInLast>=3)
00481 *(pCodon++) = getCodonFromWord<1,2>(*i);
00482 if (codonsInLast>=4)
00483 *(pCodon++) = getCodonFromWord<0,2>(*i);
00484
00485 }
00486 else
00487 {
00488
00489 for( ; i!=lastWord ; ++i )
00490 {
00491 if (toCarry!=~0)
00492 (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00493 ? flaggedChar
00494 : ( toCarry | (((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ));
00495
00496 *(pCodon++) = getCodonFromWord<3,1>(*i);
00497 *(pCodon++) = getCodonFromWord<2,1>(*i);
00498 *(pCodon++) = getCodonFromWord<1,1>(*i);
00499 *(pCodon++) = getCodonFromWord<0,1>(*i);
00500
00501 lastWordFlag = (*i)&gCursedWord;
00502 toCarry = ((*i)& maskBase) << (2*gBaseBits);
00503
00504 }
00505 if (in.getNumBasesInLast() >= readingFrame )
00506 {
00507
00508 *(pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00509 ? flaggedChar
00510 : ( toCarry | (((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ));
00511 }
00512 if (codonsInLast>=1)
00513 *(pCodon++) = getCodonFromWord<3,1>(*i);
00514 if (codonsInLast>=2)
00515 *(pCodon++) = getCodonFromWord<2,1>(*i);
00516 if (codonsInLast>=3)
00517 *(pCodon++) = getCodonFromWord<1,1>(*i);
00518 if (codonsInLast>=4)
00519 *(pCodon++) = getCodonFromWord<0,1>(*i);
00520
00521 }
00522
00523
00524 assert(pCodon==pEnd);
00525
00526 }
00527
00528
00529
00530
00531
00532
00533
00534
00535 void codonizeAndFlagReverse
00536 ( const WordSequence& in, CodonList& codons, int readingFrame )
00537 {
00538
00539 static const int DNAWordSizeForHashing(gMaxBasesPerWord-1);
00540
00541 int codonsInLast((in.getNumBasesInLast()-readingFrame)/gNumReadingFrames);
00542
00543
00544 int numCodons
00545 ( ( (in.size() == 0)
00546 ? 0
00547 : ((in.size()-1)*DNAWordSizeForHashing)+in.getNumBasesInLast()
00548 -readingFrame )
00549 / gNumReadingFrames );
00550
00551 readingFrame = (in.getNumBasesInLast()-readingFrame+gNumReadingFrames)%gNumReadingFrames;
00552
00553
00554
00555
00556 codons.resize(numCodons);
00557 char* pCodon(static_cast<char*>(&(*codons.begin())));
00558 char* pEnd(static_cast<char*>(&(*codons.end())));
00559 WordSequence::const_iterator i(in.end()-1);
00560 WordSequence::const_iterator lastWord(in.begin());
00561 Word toCarry(~0);
00562 Word lastWordFlag(0);
00563
00564
00565
00566
00567
00568 if (readingFrame==0)
00569 {
00570 if (codonsInLast==4)
00571 *(pCodon++) = getCodonFromWord<1,0>(*i);
00572 if (codonsInLast>=3)
00573 *(pCodon++) = getCodonFromWord<2,0>(*i);
00574 if (codonsInLast>=2)
00575 *(pCodon++) = getCodonFromWord<3,0>(*i);
00576 if (codonsInLast>=1)
00577 *(pCodon++) = getCodonFromWord<4,0>(*i);
00578
00579 do
00580 {
00581 i--;
00582 *(pCodon++) = getCodonFromWord<0,0>(*i);
00583 *(pCodon++) = getCodonFromWord<1,0>(*i);
00584 *(pCodon++) = getCodonFromWord<2,0>(*i);
00585 *(pCodon++) = getCodonFromWord<3,0>(*i);
00586 *(pCodon++) = getCodonFromWord<4,0>(*i);
00587
00588 }
00589 while (i!=lastWord);
00590
00591 }
00592 else if (readingFrame==1)
00593 {
00594
00595 if (codonsInLast==4)
00596 *(pCodon++) = getCodonFromWord<0,2>(*i);
00597 if (codonsInLast>=3)
00598 *(pCodon++) = getCodonFromWord<1,2>(*i);
00599 if (codonsInLast>=2)
00600 *(pCodon++) = getCodonFromWord<2,2>(*i);
00601 if (codonsInLast>=1)
00602 *(pCodon++) = getCodonFromWord<3,2>(*i);
00603
00604 if (in.getNumBasesInLast()>=readingFrame)
00605 {
00606 lastWordFlag = (*i)&gCursedWord;
00607 toCarry = ((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ;
00608 }
00609
00610 do
00611 {
00612 i--;
00613 if (toCarry!=~0)
00614 (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00615 ? flaggedChar
00616 : ( toCarry | ( ((*i) & mask2Bases ) << gBaseBits ) );
00617
00618 *(pCodon++) = getCodonFromWord<0,2>(*i);
00619 *(pCodon++) = getCodonFromWord<1,2>(*i);
00620 *(pCodon++) = getCodonFromWord<2,2>(*i);
00621 *(pCodon++) = getCodonFromWord<3,2>(*i);
00622 lastWordFlag = (*i)&gCursedWord;
00623 toCarry = ((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ;
00624
00625
00626 }
00627 while (i!=lastWord);
00628
00629
00630 }
00631 else
00632 {
00633
00634
00635 if (codonsInLast==4)
00636 *(pCodon++) = getCodonFromWord<0,1>(*i);
00637 if (codonsInLast>=3)
00638 *(pCodon++) = getCodonFromWord<1,1>(*i);
00639 if (codonsInLast>=2)
00640 *(pCodon++) = getCodonFromWord<2,1>(*i);
00641 if (codonsInLast>=1)
00642 *(pCodon++) = getCodonFromWord<3,1>(*i);
00643
00644 if (in.getNumBasesInLast()>=readingFrame)
00645 {
00646 lastWordFlag = (*i)&gCursedWord;
00647 toCarry = ((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ;
00648 }
00649
00650 do
00651 {
00652 i--;
00653 if (toCarry!=~0)
00654 (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00655 ? flaggedChar
00656 : ( toCarry | ( ((*i) & maskBase ) << (2*gBaseBits) ) );
00657
00658 *(pCodon++) = getCodonFromWord<0,1>(*i);
00659 *(pCodon++) = getCodonFromWord<1,1>(*i);
00660 *(pCodon++) = getCodonFromWord<2,1>(*i);
00661 *(pCodon++) = getCodonFromWord<3,1>(*i);
00662 lastWordFlag = (*i)&gCursedWord;
00663 toCarry = ((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ;
00664
00665 }
00666 while (i!=lastWord);
00667
00668
00669 }
00670
00671
00672 assert(pCodon==pEnd);
00673
00674
00675 for (CodonList::iterator j(codons.begin()); j!=codons.end(); ++j)
00676 {
00677
00678 *j =
00679 ( ((*j)==flaggedChar)
00680 ? flaggedChar
00681 :
00682 ( ((*j)&maskBase^maskBase) << (2*gBaseBits) )
00683 | ((*j)&(maskBase<<gBaseBits)^(maskBase<<gBaseBits))
00684 | ( ((*j)&(maskCodon^mask2Bases)^(maskCodon^mask2Bases))
00685 >> (2*gBaseBits) ) );
00686 }
00687
00688
00689 }
00690
00691
00692
00693 ostream& operator<<( ostream& os, CodonList& c )
00694 {
00695 for (vector<char>::iterator i(c.begin()); i!=c.end(); i++ )
00696 os << (unsigned int)*i << "-" <<
00697 ((*i==flaggedChar) ? '!' : gCodonNames[(unsigned int)*i]) << " ";
00698 return os;
00699 }
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711