SequenceReader/SequenceEncoder.h

Go to the documentation of this file.
00001 /*  Last edited: May 29 14:29 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : SequenceEncoder
00026 // File Name    : SequenceEncoder.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_SequenceEncoder
00032 #define INCLUDED_SequenceEncoder
00033 
00034 // Description:
00035 
00036 // Includes:
00037 #include "GlobalDefinitions.h"
00038 
00039 // NB it is good practice for #include statements in header files to be
00040 // replaced by forward declarations if at all possible
00041 class SequenceReaderMode;
00042 
00043 // ### Class Declarations ###
00044 // Table geometry: one slot for every value an 8-bit character can take.
00045 // ExpandedTranslationTable has numPossibleChars squared slots (presumably one per pair of characters).
00046 static const int numPossibleChars(1<<8);
00047 //static const uchar maxPossibleChar(0xFF);
00048 typedef Word TranslationTable[numPossibleChars]; 
00049 typedef Word ExpandedTranslationTable[numPossibleChars*numPossibleChars];
00050 // nv is the table entry for a character that has no valid code.
00051 static const Word nv(0xFF);
00052 static const Word firstCharInvalid(Word(1)<<31);  // shift a Word: the int expression 1<<31 overflows a 32-bit signed int
00053 static const Word secondCharInvalid(Word(1)<<30); // written the same way for consistency
00054 static const Word someCharInvalid(firstCharInvalid|secondCharInvalid);
00055 // Masks covering one, two and three bases (a codon); each step shifts by gBaseBits, which is declared elsewhere.
00056 static const Word maskBase(0x3);
00057 static const Word mask2Bases((maskBase<<gBaseBits)|maskBase);
00058 static const Word maskCodon((mask2Bases<<gBaseBits)|maskBase);
00059 // flaggedChar stands in for a codon taken from a flagged Word. It must be
00060 // greater than 63 so that it cannot be confused with a 6-bit codon value.
00061 static const char flaggedChar('^');
00062 // ttDNA: code for each DNA character, indexed by character value.
00063 // A/a=0, C/c=1, G/g=2, T/t=3; every other character value maps to nv.
00064 static const TranslationTable ttDNA = 
00065 {
00066 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00067 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00068 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00069 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00070 nv, /* next is 'A' */ 
00071 00,nv,01,nv,nv,nv,02,nv,nv,nv,nv,nv,nv,nv,nv, /* next is 'P' */
00072 nv,nv,nv,nv,03,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00073 nv, /* next is 'a' */ 
00074 00,nv,01,nv,nv,nv,02,nv,nv,nv,nv,nv,nv,nv,nv, /* next is 'p' */
00075 nv,nv,nv,nv,03,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00076 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00077 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00078 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00079 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00080 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00081 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00082 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00083 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv
00084 };
00085 // Superseded protein table (U=18, V=19, W=20, no code for X); compiled only if OLD_PROTEIN_TRANSLATION_TABLE is defined.
00086 #ifdef OLD_PROTEIN_TRANSLATION_TABLE
00087 static const TranslationTable ttProtein = 
00088 {
00089 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00090 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00091 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv, 0,nv,nv,nv,nv,nv,
00092 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00093 nv, /* next is 'A' */ 
00094  1,nv, 2, 3, 4, 5, 6, 7, 8,nv, 9,10,11,12,nv, /* next is 'P' */
00095 13,14,15,16,17,18,19,20,nv,21,nv,nv,nv,nv,nv,nv,
00096 nv, /* next is 'a' */ 
00097  1,nv, 2, 3, 4, 5, 6, 7, 8,nv, 9,10,11,12,nv, /* next is 'p' */
00098 13,14,15,16,17,18,19,20,nv,21,nv,nv,nv,nv,nv,nv,
00099 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00100 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00101 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00102 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00103 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00104 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00105 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00106 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv
00107 };
00108 #endif
00109 // NOTE(review): the current ttProtein further down is not inside an #else, so defining the macro above would define ttProtein twice.
00110 // changed 20.3.2 AJC 
00111 // coding for U removed, extra encoding for X added. New encoding:
00112 // 0  *
00113 // 1  A
00114 // 2  C
00115 // 3  D
00116 // 4  E
00117 // 5  F
00118 // 6  G
00119 // 7  H
00120 // 8  I
00121 // 9  K
00122 // 10 L
00123 // 11 M
00124 // 12 N
00125 // 13 P
00126 // 14 Q
00127 // 15 R
00128 // 16 S
00129 // 17 T
00130 // 18 V
00131 // 19 W
00132 // 20 X
00133 // 21 Y
00134 // Changes: U was 18, now treated as an X
00135 //          V was 19 now 18
00136 //          W was 20 now 19
00137 //          X is 20, not there before
00138 // B (= D or N) now treated as D
00139 // Z (= E or Q) now treated as E
00140 // Current protein table, indexed by character value: '*' and the letters (either case) get codes 0-21; J, O and all other values are nv.
00141 static const TranslationTable ttProtein = 
00142 {
00143 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00144 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00145 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv, 0,nv,nv,nv,nv,nv,
00146 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00147 nv, /* next is 'A' */ 
00148  1, 3, 2, 3, 4, 5, 6, 7, 8,nv, 9,10,11,12,nv, /* next is 'P' */
00149 13,14,15,16,17,20,18,19,20,21, 4,nv,nv,nv,nv,nv,
00150 nv, /* next is 'a' */ 
00151  1, 3, 2, 3, 4, 5, 6, 7, 8,nv, 9,10,11,12,nv, /* next is 'p' */
00152 13,14,15,16,17,20,18,19,20,21, 4,nv,nv,nv,nv,nv,
00153 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00154 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00155 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00156 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00157 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00158 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00159 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00160 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv
00161 };
00162 // ttCodon: amino acid code (same numbering as ttProtein) for each codon value 0-63. The codon value
00163 // is three 2-bit bases (A=0,C=1,G=2,T=3) with the first base most significant. Entry 'X' is 20; all other entries are nv.
00164 static const TranslationTable ttCodon    = 
00165 {
00166   9 , // K', // AAA
00167   12, // N', // AAC
00168   9 , // K', // AAG
00169   12, // N', // AAT
00170   17, // T', // ACA
00171   17, // T', // ACC
00172   17, // T', // ACG
00173   17, // T', // ACT
00174   15, // R', // AGA
00175   16, // S', // AGC
00176   15, // R', // AGG
00177   16, // S', // AGT
00178   8 , // I', // ATA
00179   8 , // I', // ATC
00180   11, // M', // ATG
00181   8 , // I', // ATT
00182   14, // Q', // CAA
00183   7 , // H', // CAC
00184   14, // Q', // CAG
00185   7 , // H', // CAT
00186   13, // P', // CCA
00187   13, // P', // CCC
00188   13, // P', // CCG
00189   13, // P', // CCT
00190   15, // R', // CGA
00191   15, // R', // CGC
00192   15, // R', // CGG
00193   15, // R', // CGT
00194   10, // L', // CTA
00195   10, // L', // CTC
00196   10, // L', // CTG
00197   10, // L', // CTT
00198   4 , // E', // GAA
00199   3 , // D', // GAC
00200   4 , // E', // GAG
00201   3 , // D', // GAT
00202   1 , // A', // GCA
00203   1 , // A', // GCC
00204   1 , // A', // GCG
00205   1 , // A', // GCT
00206   6 , // G', // GGA
00207   6 , // G', // GGC
00208   6 , // G', // GGG
00209   6 , // G', // GGT
00210   18, // 19, // V', // GTA
00211   18, // 19, // V', // GTC
00212   18, // 19, // V', // GTG
00213   18, // 19, // V', // GTT
00214   0 , // *', // TAA
00215   21, // Y', // TAC
00216   0 , // *', // TAG
00217   21, // Y', // TAT
00218   16, // S', // TCA
00219   16, // S', // TCC
00220   16, // S', // TCG
00221   16, // S', // TCT
00222   0 , // *', // TGA
00223   2 , // C', // TGC
00224   19, // 20, // W', // TGG
00225   2 , // C', // TGT
00226   10, // L', // TTA
00227   5 , // F', // TTC
00228   10, // L', // TTG
00229   5 , // F', // TTT
00230 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00231 nv,nv,nv,nv,nv,nv,nv,nv,20,nv,nv,nv,nv,nv,nv,nv, // 'X'=20
00232 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00233 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00234 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00235 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00236 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00237 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00238 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00239 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00240 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00241 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv
00242 };
00243 // ttCodonReverse gives the amino acid code for the reverse complement (RC)
00244 // of a codon: saves taking the RC of a sequence and using ttCodon on that.
00245 // Codes use the current ttProtein numbering (V=18, W=19), matching ttCodon.
00246 static const TranslationTable ttCodonReverse = 
00247 {
00248   5 , // F : AAA, RC = TTT
00249   18, // V : AAC, RC = GTT (was 19, the pre-change code for V)
00250   10, // L : AAG, RC = CTT
00251   8 , // I : AAT, RC = ATT
00252   2 , // C : ACA, RC = TGT
00253   6 , // G : ACC, RC = GGT
00254   15, // R : ACG, RC = CGT
00255   16, // S : ACT, RC = AGT
00256   16, // S : AGA, RC = TCT
00257   1 , // A : AGC, RC = GCT
00258   13, // P : AGG, RC = CCT
00259   17, // T : AGT, RC = ACT
00260   21, // Y : ATA, RC = TAT
00261   3 , // D : ATC, RC = GAT
00262   7 , // H : ATG, RC = CAT
00263   12, // N : ATT, RC = AAT
00264   10, // L : CAA, RC = TTG
00265   18, // V : CAC, RC = GTG (was 19, the pre-change code for V)
00266   10, // L : CAG, RC = CTG
00267   11, // M : CAT, RC = ATG
00268   19, // W : CCA, RC = TGG (was 20, the pre-change code for W)
00269   6 , // G : CCC, RC = GGG
00270   15, // R : CCG, RC = CGG
00271   15, // R : CCT, RC = AGG
00272   16, // S : CGA, RC = TCG
00273   1 , // A : CGC, RC = GCG
00274   13, // P : CGG, RC = CCG
00275   17, // T : CGT, RC = ACG
00276   0 , // * : CTA, RC = TAG
00277   4 , // E : CTC, RC = GAG
00278   14, // Q : CTG, RC = CAG
00279   9 , // K : CTT, RC = AAG
00280   5 , // F : GAA, RC = TTC
00281   18, // V : GAC, RC = GTC (was 19, the pre-change code for V)
00282   10, // L : GAG, RC = CTC
00283   8 , // I : GAT, RC = ATC
00284   2 , // C : GCA, RC = TGC
00285   6 , // G : GCC, RC = GGC
00286   15, // R : GCG, RC = CGC
00287   16, // S : GCT, RC = AGC
00288   16, // S : GGA, RC = TCC
00289   1 , // A : GGC, RC = GCC
00290   13, // P : GGG, RC = CCC
00291   17, // T : GGT, RC = ACC
00292   21, // Y : GTA, RC = TAC
00293   3 , // D : GTC, RC = GAC
00294   7 , // H : GTG, RC = CAC
00295   12, // N : GTT, RC = AAC
00296   10, // L : TAA, RC = TTA
00297   18, // V : TAC, RC = GTA (was 19, the pre-change code for V)
00298   10, // L : TAG, RC = CTA
00299   8 , // I : TAT, RC = ATA
00300   0 , // * : TCA, RC = TGA
00301   6 , // G : TCC, RC = GGA
00302   15, // R : TCG, RC = CGA
00303   15, // R : TCT, RC = AGA
00304   16, // S : TGA, RC = TCA
00305   1 , // A : TGC, RC = GCA
00306   13, // P : TGG, RC = CCA
00307   17, // T : TGT, RC = ACA
00308   0 , // * : TTA, RC = TAA
00309   4 , // E : TTC, RC = GAA
00310   14, // Q : TTG, RC = CAA
00311   9 , // K : TTT, RC = AAA
00312 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00313 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv, // NOTE(review): unlike ttCodon, 'X' (index 88) is nv here - confirm this is intended
00314 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00315 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00316 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00317 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00318 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00319 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00320 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00321 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00322 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,
00323 nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv,nv
00324 };
00325 // CodonList: a vector of char used to hold codons (see codonize and getCodonFromWord further down)
00326 typedef vector<char> CodonList;
00327 
00328 
00329 // Class Name : SequenceEncoder
00330 // Description: Base class for the encoders declared in this file. Holds a translation table, symbol width and word length; encode( const char*, int ) is defined outside this header.
00331 class SequenceEncoder
00332 {
00333 
00334   // PUBLIC MEMBER FUNCTIONS
00335   public:
00336 
00337   // Constructors and Destructors
00338 
00339   // Function Name: SequenceEncoder (constructor)
00340   // Arguments: translation table, source data type, bits per symbol,
00341   // word length, and a monitoring stream that defaults to cerr
00342   // Returns: nothing (defined outside this header)
00343   SequenceEncoder
00344   ( const TranslationTable* tt, 
00345     SourceDataType sourceData,
00346     int bitsPerSymbol, 
00347     int wordLength,
00348     ostream& monitoringStream = cerr);
00349 
00350   SequenceEncoder( const SequenceEncoder& rhs );
00351 
00352   // Function Name: ~SequenceEncoder
00353   // Arguments: none
00354   // Virtual because SequenceEncoderDNA, SequenceEncoderProtein and
00355   // SequenceEncoderCodon derive from this class
00356   virtual ~SequenceEncoder();
00357   // (NB destructor should be virtual if class is to be derived from)
00358   // clone: returns a new heap-allocated copy of this object, built with the SequenceEncoder copy constructor
00359   virtual SequenceEncoder* clone( void )
00360   {
00361     return new SequenceEncoder(*this);
00362   }
00363 
00364   // Manipulator Functions
00365 
00366   // Function Name: changeMode
00367   // Arguments: SequenceReaderMode* pMode
00368   // Makes a copy of mode and uses it to handle mismatch character reads
00369   void changeMode( SequenceReaderMode* pMode );
00370 
00371   // Function Name: linkSeq
00372   // Arguments: WordSequence& seq (in/out), the sequence this encoder points at
00373   // Empties seq, then gives it a single zero Word holding zero bases
00374   // Returns: void
00375   void linkSeq( WordSequence& seq )
00376     { pSeq_=&seq;
00377       pSeq_->clear();
00378       pSeq_->push_back(0); 
00379       pSeq_->setNumBasesInLast(0); }
00380   // encode: the string and CodonList overloads forward to the virtual buffer
00381   // version; numChars of -1 (the default) means use the whole container
00382   virtual void encode( const char* data, int numChars ); 
00383   void encode( const string& data, int numChars=-1 )
00384     { encode(data.c_str(), 
00385              (numChars!=-1)?numChars:data.size()); }
00386   void encode( const CodonList& data, int numChars=-1 )
00387     { encode( data.empty() ? "" : &data[0], // &data[0] is not valid for an empty list
00388              (numChars!=-1)?numChars:data.size()); }
00389 
00390 
00391 
00392   void encodeChar
00393     ( uchar thisChar, Word& thisWord, Word& wordFlag, int& basesInLast );
00394 
00395   void addWord
00396     ( Word& thisWord, Word& thisFlag, int& basesInLast );
00397 
00398 
00399   //  void fastEncode
00400   //    ( uchar* udata, Word& thisWord, int& i, const int& lastFastEncode );
00401 
00402 
00403   void expandTranslationTable
00404     ( ExpandedTranslationTable& ett );
00405 
00406 
00407   void unlinkSeq( void );
00408   // setWordLength: stores wordLength after checking that wordLength symbols fit in one Word; throws SSAHAException if they do not
00409   void setWordLength( int wordLength )
00410   { 
00411     //    cout << "Encoder: " << bitsPerSymbol_ << " " << wordLength << endl;
00412     if ( ( bitsPerSymbol_*wordLength ) > (int)(8*sizeof(Word)) )
00413     {
00414       // No deliberate crash here: a write through NULL before the throw made the exception unreachable
00415       throw SSAHAException("Symbol data exceeds capacity of Word!\n");
00416     }    
00417     wordLength_ = wordLength;
00418     //   numSymbolPairs_ = wordLength_ >> 1;
00419     //    oddNumSymbols_ = ( (wordLength_&1) == 1 );
00420   }
00421 
00422   // Accessor Functions
00423   // (NB all accessor functions should be 'const')
00424 
00425   SourceDataType getSourceDataType( void ) const { return sourceData_; }
00426   int getWordLength( void ) const { return wordLength_; }
00427   int getBitsPerSymbol( void ) const { return bitsPerSymbol_; }
00428   
00429 
00430 
00431   // Function Name:
00432   // Arguments:
00433   // TYPE  NAME  IN/OUT COMMENT
00434   // Returns: TYPE COMMENT
00435   
00436   // PROTECTED MEMBER FUNCTIONS 
00437   // (visible to this class and derived classes only)
00438   protected:
00439 
00440 
00441   // PRIVATE MEMBER FUNCTIONS
00442   // (visible to instances of this class only)
00443   
00444   private:
00445   SequenceEncoder& operator=(const SequenceEncoder&);   // NOT IMPLEMENTED
00446 
00447   // MEMBER DATA (protected, so derived classes can use it)
00448  protected:
00449   ostream& monitoringStream_;
00450   const TranslationTable* tt_;
00451   const ExpandedTranslationTable* ett_;
00452   const SourceDataType sourceData_;
00453   const int bitsPerSymbol_;
00454   const Word symbolMask_;
00455   int wordLength_;
00456   WordSequence* pSeq_;
00457   SequenceReaderMode* pState_;
00458   // According to the mode, wordFlag_ may be set to indicate
00459   // that a word is invalid 
00460   Word wordFlag_;
00461   int  numSymbolPairs_;
00462   //  bool oddNumSymbols_;
00463   const int  doubleBitShift_;
00464 
00465 }; // SequenceEncoder
00466 // SequenceEncoderDNA: SequenceEncoder with a default word length of 10 (constructor defined outside this header)
00467 class SequenceEncoderDNA : public SequenceEncoder
00468 {
00469 public:
00470   SequenceEncoderDNA( int wordLength=10, ostream& monStream=cerr );
00471   // static members, so shared by every SequenceEncoderDNA; isExpanded_ presumably records that ettSource_ has been filled - TODO confirm
00472  protected:
00473   static ExpandedTranslationTable ettSource_;
00474   static bool isExpanded_;
00475 };
00476 // SequenceEncoderProtein: SequenceEncoder with a default word length of 5 (constructor defined outside this header)
00477 class SequenceEncoderProtein : public SequenceEncoder
00478 {
00479 public:
00480   SequenceEncoderProtein( int wordLength=5, ostream& monStream=cerr );
00481   // static members, so shared by every SequenceEncoderProtein; isExpanded_ presumably records that ettSource_ has been filled - TODO confirm
00482  protected:
00483   static ExpandedTranslationTable ettSource_;
00484   static bool isExpanded_;
00485 
00486 };
00487 // SequenceEncoderCodon: SequenceEncoder with a default word length of 5 (constructor defined outside this header)
00488 class SequenceEncoderCodon : public SequenceEncoder
00489 {
00490 public:
00491   SequenceEncoderCodon( int wordLength=5, ostream& monStream=cerr );
00492   // The two setters below are disabled; they would have switched tt_ between ttCodon and ttCodonReverse
00493 // void setForward( void ) { tt_ = &ttCodon; } 
00494 // void setReverse( void ) { tt_ = &ttCodonReverse; } 
00495  protected:
00496   static ExpandedTranslationTable ettSource_;
00497   static bool isExpanded_;
00498 
00499 };
00500 
00501 
00502 // Stream insertion for a CodonList (note: taken by non-const reference); defined outside this header
00503 ostream& operator<<( ostream& os, CodonList& c );
00504 
00505 
00506 
00507 // Function Name: codonize
00508 // Arguments: const WordSequence& in (in), CodonList& codons (out), int readingFrame (in)
00509 // Returns:   void
00510 // Splits a WordSequence of 2 bit per base DNA into codons
00511 // Assumes the DNA has a word size of gMaxBasesPerWord
00512 void codonize
00513 ( const WordSequence& in, CodonList& codons, int readingFrame );
00514 
00515 // Function: getCodonFromWord
00516 // Template arguments CODON_SHIFT and BASE_SHIFT select the position: the
00517 // codon is taken (CODON_SHIFT*gCodonBits + BASE_SHIFT*gBaseBits) bits up
00518 // from the low end of the Word w and returned in the low bits (3 bases = 6 bits).
00519 // If w has any gCursedWord bit set, flaggedChar (='^') is returned instead.
00520 // Per the original note: SequenceEncoder gets nv for that character from
00521 // ttCodon and, when its mode is SequenceReaderModeFlagReplace('X'), replaces
00522 // it with 'X', which ttCodon then encodes to 20, the code for the X codon.
00523 
00524 
00525 template <int CODON_SHIFT, int BASE_SHIFT > 
00526 Word getCodonFromWord( const Word& w )
00527 {
00528   // a flagged Word yields the marker character rather than real bases
00529   if ( w&gCursedWord )
00530   {
00531     return flaggedChar;
00532   }
00533   // isolate the three bases at the requested offset, then move them down
00534   const int shift( (CODON_SHIFT*gCodonBits) + (BASE_SHIFT*gBaseBits) );
00535   return ( w & ( maskCodon << shift ) ) >> shift;
00536 
00537 
00538 } //~template <int CODON_SHIFT, int BASE_SHIFT > Word getCodonFromWord
00539 
00540 
00541 
00542 
00543 // Function Name: codonizeAndFlag
00544 // Arguments: const WordSequence& in (in), CodonList& codons (out), int readingFrame (in)
00545 // Returns:   void
00546 // Splits a WordSequence of 2 bit per base DNA into codons
00547 // Used only when creating a HashTableTranslated
00548 // Assumes the DNA has a word size of 15
00549 // This leaves the MSB of a Word free to act as a flag
00550 // If an entry of codons is the flag character then the flag bit was set
00551 // in the DNA Word(s) it came from, meaning the original DNA text was
00552 // flagged, typically because it contained Ns or -s. (This note used to
00553 // name '!'; flaggedChar in this header is '^' - TODO confirm which is used.)
00554 // If codons are then encoded using a SequenceEncoderCodon with mode set to
00555 // SequenceReaderModeFlagReplace('X'), the flag character is not recognized and the Word it is in is flagged
00556 // Point of all this is that repeat masked DNA can be excluded from
00557 // the HashTableTranslated
00558 void codonizeAndFlag
00559 ( const WordSequence& in, CodonList& codons, int readingFrame );
00560 
00561 // Function Name: codonizeAndFlagReverse
00562 // Arguments: const WordSequence& in (in), CodonList& codons (out), int readingFrame (in)
00563 // Returns:   void
00564 // Codonizes the reverse complement of a WordSequence as described
00565 // for codonizeAndFlag above. Reverse complement is not generated,
00566 // the codonizations are generated directly from the forward
00567 // strand using ttCodonReverse
00568 void codonizeAndFlagReverse
00569 ( const WordSequence& in, CodonList& codons, int readingFrame );
00570 
00571 
00572 
00573 
00574 
00575 
00576 
00577 // ### Function Declarations ###
00578 
00579 // Name: print
00580 // Arguments:
00581 // const TranslationTable&  tt  IN  the table passed in (presumably the one printed; defined outside this header)
00582 // Returns: void
00583 void print( const TranslationTable& tt );
00584 
00585 
00586 
00587 // End of include guard:
00588 #endif
00589 
00590 // End of file SequenceEncoder.h
00591 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2