SequenceReader/SequenceEncoder.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : SequenceEncoder
00025 // File Name    : SequenceEncoder.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "SequenceEncoder.h"
00034 #include "SequenceReader.h"
00035 
00036 // ### Function Definitions ###
00037 void print( const TranslationTable& tt )
00038 {
00039   for ( int i(0) ; i < numPossibleChars ; ++ i )
00040   {
00041     cout << i << ":";
00042     if (isgraph((uchar)i)) cout << " (" << (uchar)i << ") ";
00043     cout << "\t " << (int)tt[i];
00044     if (isgraph(tt[i])) cout << " (" << tt[i] << ") ";
00045     cout << endl;
00046   }
00047 }
00048 
00049 SequenceEncoder::SequenceEncoder
00050   ( const TranslationTable* tt, 
00051     SourceDataType sourceData,
00052     int bitsPerSymbol, 
00053     int wordLength,
00054     ostream& monitoringStream):
00055     monitoringStream_( monitoringStream ),
00056     tt_(tt), sourceData_( sourceData ), bitsPerSymbol_(bitsPerSymbol),
00057     pState_( new SequenceReaderModeIgnore( monitoringStream ) ),
00058     wordFlag_(0),
00059     doubleBitShift_(bitsPerSymbol<<1),
00060     symbolMask_((1<<bitsPerSymbol)-1)
00061   //    numSymbolPairs_(wordLength>>1),
00062   //   oddNumSymbols_(wordLength%1==1),
00063   { 
00064     monitoringStream_ << "constructing SequenceEncoder" << endl;
00065     setWordLength( wordLength );
00066   }
00067 
00068 SequenceEncoder::SequenceEncoder( const SequenceEncoder& rhs ) :
00069   monitoringStream_( rhs.monitoringStream_ ),
00070   tt_( rhs.tt_ ),
00071   ett_( rhs.ett_ ),
00072   sourceData_( rhs.sourceData_ ),
00073   bitsPerSymbol_( rhs.bitsPerSymbol_ ),
00074   wordLength_( rhs.wordLength_ ),
00075     //  pSeq_( rhs.pSeq_ ),// don't want 2 encoders linking to same seq
00076   pState_( rhs.pState_->clone() ),
00077   wordFlag_(0),
00078   //  numSymbolPairs_(rhs.numSymbolPairs_),
00079   //  oddNumSymbols_(rhs.oddNumSymbols_),
00080   symbolMask_(rhs.symbolMask_),
00081   doubleBitShift_(rhs.doubleBitShift_)
00082   {
00083     monitoringStream_ << "copy constructing SequenceEncoder" << endl;
00084   }
00085 
00086   SequenceEncoder::~SequenceEncoder()
00087   {
00088     monitoringStream_ << "destructing SequenceEncoder" << endl;
00089     delete pState_;
00090   }
00091 
00092 SequenceEncoderProtein::SequenceEncoderProtein
00093 ( int wordLength, ostream& monStream ) :
00094   SequenceEncoder
00095     ( &ttProtein, gProteinData, gResidueBits, wordLength, monStream ) 
00096 {
00097   monitoringStream_ << "constructing SequenceEncoderProtein" << endl;
00098   ett_ = &ettSource_;
00099   if (!isExpanded_)
00100   {
00101     expandTranslationTable( ettSource_ );
00102     isExpanded_ = true;
00103   }
00104 }
00105 
00106 SequenceEncoderDNA::SequenceEncoderDNA
00107 ( int wordLength, ostream& monStream ) :
00108   SequenceEncoder( &ttDNA, gDNAData, gBaseBits, wordLength, monStream ) 
00109 {
00110   monitoringStream_ << "constructing SequenceEncoderDNA" << endl;
00111   ett_ = &ettSource_;
00112   if (!isExpanded_)
00113   {
00114     expandTranslationTable( ettSource_ );
00115     isExpanded_ = true;
00116   }
00117 }
00118 
00119 SequenceEncoderCodon::SequenceEncoderCodon
00120 ( int wordLength, ostream& monStream ) :
00121   SequenceEncoder
00122 ( &ttCodon, gDNAData, gResidueBits, wordLength, monStream ) 
00123 {
00124   monitoringStream_ << "constructing SequenceEncoderCodon" << endl;
00125   ett_ = &ettSource_;
00126   if (!isExpanded_)
00127   {
00128     expandTranslationTable( ettSource_ );
00129     isExpanded_ = true;
00130   }
00131 }
00132 
00133 
00134 
00135 bool SequenceEncoderDNA::isExpanded_ = false;
00136 bool SequenceEncoderProtein::isExpanded_ = false;
00137 bool SequenceEncoderCodon::isExpanded_ = false;
00138 ExpandedTranslationTable SequenceEncoderDNA::ettSource_;
00139 ExpandedTranslationTable SequenceEncoderProtein::ettSource_;
00140 ExpandedTranslationTable SequenceEncoderCodon::ettSource_;
00141 
00142 
00143   // Function Name: changeMode
00144   // Arguments: const SequenceReaderMode&
00145   // Makes a copy of mode and uses it to handle mismatch character reads
00146   void SequenceEncoder::changeMode( SequenceReaderMode* pMode )
00147   {
00148     monitoringStream_ << "SequenceReader::changeMode\n";
00149     delete pState_;
00150     pState_ = pMode->clone();
00151   }
00152 
00153 
00154 void SequenceEncoder::encode( const char* data, int numChars ) 
00155 {
00156   if (numChars==0) return;
00157 
00158   ushort* p( (ushort*) data );
00159   // the numChars%2 bit avoids unaligned access
00160   const ushort* lastp( (ushort*) &data[ numChars - (numChars%2) ] );
00161   uchar*  pTemp;
00162 
00163   int basesInLast(pSeq_->getNumBasesInLast());
00164   //  cout << basesInLast << " encoding " << numChars << endl;
00165   int splitWord( wordLength_-1 );
00166 
00167   Word thisWord(pSeq_->back()), thisCode(0);
00168   pSeq_->pop_back();
00169 
00170   while (p < lastp)
00171   {
00172     //    cout << "XXX:" << ((char*)p)-data << " " << (unsigned int)*((char*)p)
00173     // << " " << (unsigned int)*(((char*)p)+1) << " " << basesInLast << endl;
00174     thisCode = (*ett_)[ *p ];
00175     if ( (thisCode & someCharInvalid) == 0 )
00176     {
00177       //      cout << basesInLast << " both valid" << endl;
00178       if ( basesInLast != splitWord )
00179       {
00180         thisWord <<= doubleBitShift_;
00181         thisWord |= thisCode;
00182         //      cout << basesInLast << " double shift " 
00183         //     << printWord( thisWord, wordLength_ ) << endl;
00184         basesInLast +=2;
00185         if (basesInLast==wordLength_)
00186           addWord( thisWord, wordFlag_, basesInLast );
00187       }
00188       else
00189       {
00190         thisWord <<= bitsPerSymbol_;
00191 
00192         thisWord |= (( thisCode >> bitsPerSymbol_ ) & symbolMask_ );
00193         //      cout << basesInLast << " split word " 
00194         //   << printWord( thisWord, wordLength_ )<< " "  
00195         //           << printWord( thisCode, wordLength_ )<< endl;
00196         addWord( thisWord, wordFlag_, basesInLast );
00197         thisWord = thisCode & symbolMask_;
00198         //      cout << basesInLast << " after split " 
00199         //   << printWord( thisWord, wordLength_ ) << endl;
00200         basesInLast = 1;
00201       }
00202     }
00203     else
00204     {
00205       //      cout << basesInLast << " not both valid" << endl;
00206       pTemp = (uchar*) p;
00207       encodeChar( *pTemp, thisWord, wordFlag_, basesInLast );
00208       ++pTemp;
00209       encodeChar( *pTemp, thisWord, wordFlag_, basesInLast );
00210       
00211     } // ~else
00212     p++;
00213   } // ~while
00214 
00215   //  cout << "got to end.." << ((uchar*)p-(uchar*)data) << " " << numChars << endl;
00216   assert(p==lastp);
00217   if (numChars%2==1)
00218   {
00219     pTemp = (uchar*)p;
00220     //    cout << basesInLast << " doing odd char at end " << *pTemp << endl;
00221     encodeChar( *p, thisWord, wordFlag_, basesInLast );
00222   }
00223 
00224   pSeq_->setNumBasesInLast(basesInLast);
00225   pSeq_->push_back(thisWord);
00226 
00227 
00228   //  cout << basesInLast << " leaving " 
00229   //   << printWord(thisWord, wordLength_) << endl;
00230 }
00231 
00232 void SequenceEncoder::encodeChar
00233 ( uchar thisChar, Word& thisWord, Word& thisFlag, int& basesInLast )
00234 {
00235   Word thisCode;
00236   //  cout << basesInLast << " encodeChar... " << thisChar << endl;
00237   if ( (*tt_)[ thisChar ] == nv ) 
00238       pState_->mismatch( thisChar, thisFlag );
00239   
00240   //  cout << basesInLast << " ... converted to " << thisChar << endl;
00241   thisCode = (*tt_)[ thisChar ];
00242 
00243   if ( thisCode == nv ) 
00244   {
00245     //    cout << basesInLast << " ... not valid " << thisChar << endl;
00246     return;
00247   }
00248   thisWord <<= bitsPerSymbol_;
00249   thisWord |= thisCode;
00250   //  cout << basesInLast << " encode char " 
00251   //   << printWord( thisWord, wordLength_ ) << endl;
00252    
00253   basesInLast++;
00254   if (basesInLast==wordLength_) addWord( thisWord, thisFlag, basesInLast );
00255 
00256 }
00257 
00258 void SequenceEncoder::addWord
00259 ( Word& thisWord, Word& thisFlag, int& basesInLast )
00260 {
00261 
00262   thisWord |= thisFlag;
00263   pSeq_->push_back(thisWord);
00264   //  cout << basesInLast << " adding Word " 
00265   //   << printWord( thisWord, wordLength_) << " " << printResidue( thisWord, wordLength_ ) << " " << thisFlag << endl;
00266   thisFlag=0;
00267   thisWord=0;
00268   basesInLast=0;
00269 }
00270 
00271 
00272 
00273 void SequenceEncoder::unlinkSeq( void )
00274 {
00275   if (pSeq_->getNumBasesInLast() != 0)
00276     {
00277       // Store the stage of the flag ...
00278       Word curseFlag = ( pSeq_->back() & gCursedWord );
00279 
00280       pSeq_->back() 
00281         <<= (bitsPerSymbol_*(wordLength_-pSeq_->getNumBasesInLast()));
00282 
00283       // ... and restore it after the shift
00284       pSeq_->back() |= curseFlag;
00285            
00286     }
00287   //  else pSeq_->pop_back();
00288   pSeq_ = NULL;
00289 }
00290 
00291 
00292 void SequenceEncoder::expandTranslationTable
00293 ( ExpandedTranslationTable& ett )
00294 {
00295   monitoringStream_ << "expanding the ttable" << endl;
00296   uchar a[2];
00297   unsigned short* b = (unsigned short*) &a[0];
00298   a[0]=0; a[1]=0;
00299   Word w[2];
00300 
00301   do  
00302   {
00303     do
00304     {
00305       //      cout << (unsigned int) a[0] << " " 
00306       //   << ( &((*tt_)[a[0]]) - &((*tt_)[0])) << endl;              
00307       //      cout << (unsigned int) a[0] << " " << (unsigned int) a[1] 
00308       //           << " " << *b << endl;
00309       //      ett[ *b ] = (*tt_)[ (unsigned int)a[0] ] == nv ) 
00310       w[ 0 ] = (*tt_)[ a[0] ];
00311       w[ 1 ] = (*tt_)[ a[1] ];
00312       if ( ( w[0] == nv ) || ( w[1] == nv ) )
00313       {
00314         ett[ *b ] = (w[0]==nv)*firstCharInvalid;
00315         ett[ *b ] |= (w[1]==nv)*secondCharInvalid;
00316       }
00317       else
00318       { 
00319         ett[ *b ] = w[0];
00320         ett[ *b ] <<= bitsPerSymbol_;
00321         ett[ *b ] |= w[1];
00322         
00323       }
00324     } // ~for j
00325     while( ++a[1] != 0 );
00326   } // ~for i
00327   while( ++a[0] != 0 );
00328   monitoringStream_ << "done expanding" << endl;
00329 
00330 
00331 }
00332 
00333 
00334 
00335   // Function Name: codonize
00336   // Arguments: WordSequence& (in), vector<char> out 
00337   // Returns:   void
00338   // Splits in into codons, one codon per char 
00339   void codonize
00340     ( const WordSequence& in, CodonList& codons, int readingFrame )
00341   {
00342     //    cout << "Codonizing for reading frame " << readingFrame << endl;
00343     static const Word mask((1<<gCodonBits)-1);
00344 
00345     int shift( gBitsPerWord-gCodonBits-gBaseBits*readingFrame );
00346 
00347 
00348     WordSequence::const_iterator i(in.begin());
00349     //    vector<char> codons;
00350     int codonsSoFar(0);
00351     int totalCodons
00352       ( (gMaxBasesPerWord*(in.size()-1) 
00353           + in.getNumBasesInLast() - readingFrame)/gBasesPerCodon );
00354 
00355     Word thisCodon;
00356 
00357     while( 1 )
00358     {
00359     
00360       if (shift>=0)
00361       {
00362         thisCodon = 0;
00363       }
00364       else
00365       {
00366         thisCodon = (*i<<-shift);
00367         i++;
00368         shift += gBitsPerWord;
00369       } 
00370       thisCodon |= (*i>>shift);
00371       thisCodon &= mask;
00372 
00373       codons.push_back(thisCodon);
00374       if (++codonsSoFar==totalCodons) break;
00375       if (shift == 0) { shift = gBitsPerWord; i++; }
00376       shift -= gCodonBits;
00377 
00378     } // ~while
00379   
00380     //    pEncoder_->encode( codons );
00381 
00382 
00383   } // ~SequenceReaderCodon::codonize
00384 
00385 // Function Name: codonizeAndFlag
00386 // Arguments: WordSequence& (in), vector<char> out 
00387 // Returns:   void
00388 // Splits a WordSequence of 2 bit per base DNA into codons
00389 // Used only when creating a HashTableTranslated
00390 // Assumes the DNA has a word size of 15
00391 // This leaves the MSB of a Word free to act as a flag
00392 // If an entry of codons is equal to '!' then the flag bit
00393 // was set in the DNA Word(s) it came from, meaning the original DNA
00394 // test was flagged, typically because it contained Ns or -s
00395 // If codons are then encoded using a SequenceEncoderCodon
00396 // with mode set to SequenceReaderModeFlagReplace('X'), this '!'
00397 // character is not recognized and the Word it is in is flagged
00398 // Point of all this is that repeat masked DNA can be excluded from
00399 // the HashTableTranslated
00400 
00401 void codonizeAndFlag
00402 ( const WordSequence& in, CodonList& codons, int readingFrame )
00403 {
00404 
00405   static const int DNAWordSizeForHashing(gMaxBasesPerWord-1);
00406 
00407   int codonsInLast((in.getNumBasesInLast()-readingFrame)/gNumReadingFrames);
00408 
00409   //  cout << "blx: " << in.size() << " " << in.getNumBasesInLast() << " " << codonsInLast << endl;
00410   int numCodons
00411     (   ( (in.size() == 0) 
00412           ? 0
00413           : ((in.size()-1)*DNAWordSizeForHashing)+in.getNumBasesInLast()
00414           -readingFrame )
00415       / gNumReadingFrames );
00416 
00417   codons.resize(numCodons);
00418   char* pCodon(static_cast<char*>(&(*codons.begin())));
00419   char* pEnd(static_cast<char*>(&(*codons.end())));
00420   WordSequence::const_iterator i(in.begin());
00421   WordSequence::const_iterator lastWord(in.end()-1);
00422   Word toCarry(~0);
00423   Word lastWordFlag(0);
00424 
00425   //  const Word maskBase(0x3);
00426   //  const Word maskCodon((0x3>>4)|(0x3>>2)|(0x3>>4));
00427   
00428 
00429   if (readingFrame==0)
00430   {
00431     for( ; i!=lastWord ; ++i )
00432     {
00433 
00434       *(pCodon++) = getCodonFromWord<4,0>(*i);
00435       *(pCodon++) = getCodonFromWord<3,0>(*i);
00436       *(pCodon++) = getCodonFromWord<2,0>(*i);
00437       *(pCodon++) = getCodonFromWord<1,0>(*i);
00438       *(pCodon++) = getCodonFromWord<0,0>(*i);
00439 
00440     } // ~for
00441 
00442     if (codonsInLast>=1)
00443       *(pCodon++) = getCodonFromWord<4,0>(*i);
00444     if (codonsInLast>=2)
00445       *(pCodon++) = getCodonFromWord<3,0>(*i);
00446     if (codonsInLast>=3)
00447       *(pCodon++) = getCodonFromWord<2,0>(*i);
00448     if (codonsInLast==4)
00449       *(pCodon++) = getCodonFromWord<1,0>(*i);
00450   } // ~if 
00451   else if (readingFrame==1)
00452   {
00453 
00454     for( ; i!=lastWord ; ++i )
00455     {
00456       if (toCarry!=~0)
00457       (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00458         ? flaggedChar
00459         : ( toCarry | (((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ));
00460 
00461       *(pCodon++) = getCodonFromWord<3,2>(*i);
00462       *(pCodon++) = getCodonFromWord<2,2>(*i);
00463       *(pCodon++) = getCodonFromWord<1,2>(*i);
00464       *(pCodon++) = getCodonFromWord<0,2>(*i);
00465 
00466       lastWordFlag = (*i)&gCursedWord;
00467       toCarry = ((*i)& mask2Bases) << gBaseBits;
00468  
00469     } // ~for
00470     if (in.getNumBasesInLast() >= readingFrame )
00471     {
00472       *(pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00473         ? flaggedChar
00474         : ( toCarry | (((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ));
00475     }
00476     if (codonsInLast>=1)
00477       *(pCodon++) = getCodonFromWord<3,2>(*i);
00478     if (codonsInLast>=2)
00479       *(pCodon++) = getCodonFromWord<2,2>(*i);
00480     if (codonsInLast>=3)
00481       *(pCodon++) = getCodonFromWord<1,2>(*i);
00482     if (codonsInLast>=4)
00483       *(pCodon++) = getCodonFromWord<0,2>(*i);
00484 
00485   } // ~else if
00486   else // if (readingFrame==2)
00487   {
00488 
00489     for( ; i!=lastWord ; ++i )
00490     {
00491       if (toCarry!=~0)
00492       (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00493         ? flaggedChar
00494         : ( toCarry | (((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ));
00495 
00496       *(pCodon++) = getCodonFromWord<3,1>(*i);
00497       *(pCodon++) = getCodonFromWord<2,1>(*i);
00498       *(pCodon++) = getCodonFromWord<1,1>(*i);
00499       *(pCodon++) = getCodonFromWord<0,1>(*i);
00500 
00501       lastWordFlag = (*i)&gCursedWord;
00502       toCarry = ((*i)& maskBase) << (2*gBaseBits);
00503  
00504     } // ~for
00505     if (in.getNumBasesInLast() >= readingFrame )
00506     {
00507     
00508       *(pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00509         ? flaggedChar
00510         : ( toCarry | (((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ));
00511     }
00512     if (codonsInLast>=1)
00513       *(pCodon++) = getCodonFromWord<3,1>(*i);
00514     if (codonsInLast>=2)
00515       *(pCodon++) = getCodonFromWord<2,1>(*i);
00516     if (codonsInLast>=3)
00517       *(pCodon++) = getCodonFromWord<1,1>(*i);
00518     if (codonsInLast>=4)
00519       *(pCodon++) = getCodonFromWord<0,1>(*i);
00520 
00521   } // ~else if
00522 
00523   //  cout << "codons fwd: " << codons << endl;
00524   assert(pCodon==pEnd);
00525 
00526 } // ~void codonizeAndFlag
00527 
00528 
00529 // Function Name: codonizeAndFlagReverse
00530 // Arguments: WordSequence& (in), vector<char> out 
00531 // Returns:   void
00532 // Codonizes the reverse complement of a WordSequence as described
00533 // for codonizeAndFlag above. Extra step at the end to reverse
00534 // each of the codons
00535 void codonizeAndFlagReverse
00536 ( const WordSequence& in, CodonList& codons, int readingFrame )
00537 {
00538 
00539   static const int DNAWordSizeForHashing(gMaxBasesPerWord-1);
00540 
00541   int codonsInLast((in.getNumBasesInLast()-readingFrame)/gNumReadingFrames);
00542 
00543 
00544   int numCodons
00545     (   ( (in.size() == 0) 
00546           ? 0
00547           : ((in.size()-1)*DNAWordSizeForHashing)+in.getNumBasesInLast()
00548           -readingFrame )
00549       / gNumReadingFrames );
00550 
00551   readingFrame = (in.getNumBasesInLast()-readingFrame+gNumReadingFrames)%gNumReadingFrames;
00552 
00553 
00554 
00555   //  cout << "blx: " << in.size() << " " << in.getNumBasesInLast() << " " << readingFrame << " " << codonsInLast << " " << numCodons << endl;
00556   codons.resize(numCodons);
00557   char* pCodon(static_cast<char*>(&(*codons.begin())));
00558   char* pEnd(static_cast<char*>(&(*codons.end())));
00559   WordSequence::const_iterator i(in.end()-1);
00560   WordSequence::const_iterator lastWord(in.begin());
00561   Word toCarry(~0);
00562   Word lastWordFlag(0);
00563 
00564   //  const Word maskBase(0x3);
00565   //  const Word maskCodon((0x3>>4)|(0x3>>2)|(0x3>>4));
00566   
00567 
00568   if (readingFrame==0)
00569   {
00570     if (codonsInLast==4)
00571       *(pCodon++) = getCodonFromWord<1,0>(*i);
00572     if (codonsInLast>=3)
00573       *(pCodon++) = getCodonFromWord<2,0>(*i);
00574     if (codonsInLast>=2)
00575       *(pCodon++) = getCodonFromWord<3,0>(*i);
00576     if (codonsInLast>=1)
00577       *(pCodon++) = getCodonFromWord<4,0>(*i);
00578 
00579     do
00580     {
00581       i--;
00582       *(pCodon++) = getCodonFromWord<0,0>(*i);
00583       *(pCodon++) = getCodonFromWord<1,0>(*i);
00584       *(pCodon++) = getCodonFromWord<2,0>(*i);
00585       *(pCodon++) = getCodonFromWord<3,0>(*i);
00586       *(pCodon++) = getCodonFromWord<4,0>(*i);
00587 
00588     } 
00589     while (i!=lastWord);
00590 
00591   } // ~if 
00592   else if (readingFrame==1)
00593   {
00594     //    cout << "rf1" << endl;
00595     if (codonsInLast==4)
00596       *(pCodon++) = getCodonFromWord<0,2>(*i);
00597     if (codonsInLast>=3)
00598       *(pCodon++) = getCodonFromWord<1,2>(*i);
00599     if (codonsInLast>=2)
00600       *(pCodon++) = getCodonFromWord<2,2>(*i);
00601     if (codonsInLast>=1)
00602       *(pCodon++) = getCodonFromWord<3,2>(*i);
00603 
00604     if (in.getNumBasesInLast()>=readingFrame)
00605     {
00606       lastWordFlag = (*i)&gCursedWord;
00607       toCarry = ((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ;
00608     } // ~if
00609 
00610     do
00611     {
00612       i--;
00613       if (toCarry!=~0)
00614       (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00615         ? flaggedChar
00616         : ( toCarry | ( ((*i) & mask2Bases ) << gBaseBits ) );
00617 
00618       *(pCodon++) = getCodonFromWord<0,2>(*i);
00619       *(pCodon++) = getCodonFromWord<1,2>(*i);
00620       *(pCodon++) = getCodonFromWord<2,2>(*i);
00621       *(pCodon++) = getCodonFromWord<3,2>(*i);
00622       lastWordFlag = (*i)&gCursedWord;
00623       toCarry = ((*i) >> (4*gCodonBits + 2*gBaseBits))&maskBase ;
00624 
00625 
00626     } 
00627     while (i!=lastWord);
00628 
00629 
00630   } // ~else if
00631   else // if (readingFrame==2)
00632   {
00633     //    cout << "rf2" << endl;
00634 
00635     if (codonsInLast==4)
00636       *(pCodon++) = getCodonFromWord<0,1>(*i);
00637     if (codonsInLast>=3)
00638       *(pCodon++) = getCodonFromWord<1,1>(*i);
00639     if (codonsInLast>=2)
00640       *(pCodon++) = getCodonFromWord<2,1>(*i);
00641     if (codonsInLast>=1)
00642       *(pCodon++) = getCodonFromWord<3,1>(*i);
00643 
00644     if (in.getNumBasesInLast()>=readingFrame)
00645     {
00646       lastWordFlag = (*i)&gCursedWord;
00647       toCarry = ((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ;
00648     } // ~if
00649 
00650     do
00651     {
00652       i--;
00653       if (toCarry!=~0)
00654       (*pCodon++) = ( ((*i)&gCursedWord)|lastWordFlag )
00655         ? flaggedChar
00656         : ( toCarry | ( ((*i) & maskBase ) << (2*gBaseBits) ) );
00657 
00658       *(pCodon++) = getCodonFromWord<0,1>(*i);
00659       *(pCodon++) = getCodonFromWord<1,1>(*i);
00660       *(pCodon++) = getCodonFromWord<2,1>(*i);
00661       *(pCodon++) = getCodonFromWord<3,1>(*i);
00662       lastWordFlag = (*i)&gCursedWord;
00663       toCarry = ((*i) >> (4*gCodonBits + gBaseBits))&mask2Bases ;
00664 
00665     } 
00666     while (i!=lastWord);
00667 
00668 
00669   } // ~else if
00670 
00671 
00672   assert(pCodon==pEnd);
00673   //  cout << "codons unrev: " << codons << endl;
00674 
00675   for (CodonList::iterator j(codons.begin()); j!=codons.end(); ++j)
00676   {
00677     
00678     *j = 
00679       ( ((*j)==flaggedChar) 
00680         ? flaggedChar 
00681         :
00682         ( ((*j)&maskBase^maskBase) << (2*gBaseBits) )
00683         | ((*j)&(maskBase<<gBaseBits)^(maskBase<<gBaseBits))
00684         | ( ((*j)&(maskCodon^mask2Bases)^(maskCodon^mask2Bases)) 
00685             >> (2*gBaseBits) )  );
00686   } // ~for
00687   //  cout << "codons rev: " << codons << endl;
00688 
00689 } // ~void codonizeAndFlagReverse
00690 
00691 
00692 
00693 ostream& operator<<( ostream& os, CodonList& c )
00694 {
00695   for (vector<char>::iterator i(c.begin()); i!=c.end(); i++ )
00696     os << (unsigned int)*i << "-" << 
00697       ((*i==flaggedChar) ? '!' : gCodonNames[(unsigned int)*i]) << " ";
00698   return os;
00699 } // ~ostream& operator<<( ostream& os, CodonList& c )
00700 
00701 
00702 
00703 
00704 
00705 // Name:
00706 // Arguments:
00707 // TYPE  NAME  IN/OUT COMMENT
00708 // Returns: TYPE COMMENT
00709 
00710 // End of file SequenceEncoder.cpp
00711 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2