QueryManager/MatchAligner.h

Go to the documentation of this file.
00001 /*  Last edited: May 26 15:54 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : MatchAligner
00026 // File Name    : MatchAligner.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_MatchAligner
00032 #define INCLUDED_MatchAligner
00033 
00034 // Description:
00035 
00036 // Includes:
00037 #include "MatchStore.h"
00038 #include "SequenceEncoder.h"
00039 #include <cassert>
00040 
00041 // NB it is good practise for #include statements in header files to be
00042 // replaced by forward declarations if at all possible
00043 
00044 // ### Class Declarations ###
00045 
00046 // Declarations for MatchTaskAlign
00047 
00048 typedef unsigned char uchar;
00049 
00050 typedef unsigned char AlignBreakType;
00051 
00052 const AlignBreakType alignBreakNull = 0;
00053 const AlignBreakType alignBreakMismatch = 1;
00054 const AlignBreakType alignBreakGapQuery = 2;
00055 const AlignBreakType alignBreakGapSubject = 3;
00056 const AlignBreakType alignBreakFrameShiftQuery = 4;
00057 const AlignBreakType alignBreakFrameShiftSubject = 5;
00058 
00059 typedef int ScoreType;
00060 typedef ScoreType ScoreTable[65536];
00061 
00062 const ScoreType matchScoreDNA    = 1; // | matchMask;
00063 const ScoreType mismatchScoreDNA = -2;
00064 const ScoreType gapStartScoreDNA = -4;
00065 const ScoreType gapExtendScoreDNA  = -3;
00066 const ScoreType nScoreDNA = 0; // same as crossmatch defaults
00067 const ScoreType veryBadScoreIndeed = -1000000000;
00068 
00069 
00070 // next 3 are as used by NCBI - see URL in .h file
00071 const ScoreType stopCodonScoreBLOSUM = -4; 
00072 const ScoreType gapStartScoreBLOSUM = -16; // = -31; // = -10;
00073 const ScoreType gapExtendScoreBLOSUM =-1;
00074 
00075 // frameShiftScoreBLOSUM is set so that three frame shifts incur less penalty
00076 // than shifting by a codon
00077 // Algorithm should then be able to pick up alignments of the form
00078 // A..-B..-C..-D.. <-- three frame shifts, score -15
00079 // ||| ||| ||| |||
00080 // nnnnnnnnnnnnnnn 
00081 // rather than just doing
00082 // A..B..C..---D.. <-- these count as a single gap, score -16
00083 // |||xxxxxx   |||
00084 // nnnnnnnnnnnnnnn
00085 // Also frame shift score ought to be worse than worse mismatch score
00086 const ScoreType frameShiftScoreBLOSUM = -5; // = -10; // -3;
00087 
00088 
00089 // from new alignment alg
00090 //const ScoreType veryBadScoreIndeed = -1000000000;
00091 //const ScoreType gapScore = -4;
00092 //const ScoreType matchScore = 1;
00093 //const ScoreType mismatchScore = -2; // -10
00094 // const ScoreType frameShiftScore = -3;
00095 typedef unsigned char PathType;
00096 const PathType fromN = (PathType)0x1;
00097 const PathType fromW = (PathType)0x2;
00098 const PathType fromSW = (PathType)0x4;
00099 const PathType fromPrevFrame1 = (PathType)0x8;
00100 const PathType fromPrevFrame2 = (PathType)0x10;
00101 const PathType fromFinished = (PathType)0x20;  
00102 
00103 // ----------------------------------------------
00104 // Declarations for MatchTaskAlign and subclasses
00105 // ----------------------------------------------
00106 
00107 class MatchAligner;
00108 
00109 class MatchTaskAlign : public MatchTask
00110 {
00111 public:
00112   MatchTaskAlign( SourceReader& querySource, 
00113                   SourceReader& subjectSource,
00114                   MatchAligner* pAlign,
00115                   bool reverseQueryCoords=true, 
00116                   bool doAlignment=true,
00117                   ostream& outputStream=cout );
00118   virtual ~MatchTaskAlign();
00119 
00120   virtual void operator()(MatchStore& store);
00121 
00122   void reverseQueryCoords( bool yesOrNo= true) 
00123   { 
00124     reverseQueryCoords_ = yesOrNo;
00125   } // ~void reverseQueryCoords( bool yesOrNo= true) 
00126 
00127   void doAlignment( bool yesOrNo= true) 
00128   { 
00129     doAlignment_ = yesOrNo;
00130   } // ~void doAlignment( bool yesOrNo= true) 
00131  
00132 
00133 protected:
00134 
00135   MatchAligner* pAlign_;
00136   ostream& outputStream_;
00137   SourceReader& querySource_;
00138   SourceReader& subjectSource_;
00139   // reverseQueryCoords_ - if `true' reverses orientation of
00140   // query coordinates, so they are as seen in the forward frame
00141   // Basically set to true for ssaha executable but false in 
00142   // ssahaClient, because in that case any necessary reversals
00143   // are done in ssahaServer
00144   bool reverseQueryCoords_;
00145   // doAlignment_ - if `true' prints match description line plus
00146   // graphical alignment for each match, if `false' just prints
00147   // match description line
00148   bool doAlignment_;
00149 
00150 }; // ~class MatchTaskAlign : public MatchTask
00151 
00152 
00153 
00154 // --------------------------------------------
00155 // Declarations for MatchAligner and subclasses
00156 // --------------------------------------------
00157 
00158 // alignment of two sequences is described by a vector of AlignInfo 
00159 // Meant for sequences that contain large stretches of matches
00160 // punctuated by the occasional AlignBreak
00161 
00162 struct AlignInfo
00163 {
00164   AlignInfo( void ) : numMatches(0), breakType(0) {}
00165   int numMatches;
00166   AlignBreakType breakType;
00167 }; // ~struct AlignInfo
00168 
00169 struct Alignment : public deque<AlignInfo>
00170 {
00171   ScoreType totalScore_;
00172 }; // ~struct Alignment
00173 
00174 
00175 
00176 // Class Name : StaticScoreTable
00177 // Description: A static instance of this is created for each
00178 // score table required. Implements `singleton' design pattern.
00179 class StaticScoreTable
00180 {
00181   typedef void (* makeTablePointer)( ScoreTable& table );
00182  public:
00183   StaticScoreTable( makeTablePointer p ) : makeTable_(p) {}
00184   ScoreTable* getTable() // TBD const ptr 
00185   { 
00186     if (makeTable_) (*makeTable_)(data_); 
00187     makeTable_ = NULL; 
00188     return &data_;
00189   }
00190  private:
00191   ScoreTable data_;
00192   makeTablePointer makeTable_;
00193 }; // ~class StaticScoreTable
00194 
00195 
00196 void makeScoreTableDNA( ScoreTable& table );
00197 void makeScoreTableBlosum62( ScoreTable& table );
00198 
00199 
00200 
00201 
00202 
00203 class MatchAligner
00204 {
00205 public:
00206   MatchAligner(   int numCols,
00207                   int bandExtension,
00208                   ScoreTable* pTable,
00209                   ostream& outputStream );
00210   virtual ~MatchAligner();
00211 
00212   void operator()( const char* pQuery, int queryStart, int queryEnd,
00213                    const char* pSubject, int subjectStart, int subjectEnd );
00214   virtual void createAlignment
00215   ( Alignment& alignment,
00216     const char* pQuery, int queryStart, int queryEnd,
00217     const char* pSubject, int subjectStart, int subjectEnd );
00218 
00219   virtual void formatAlignment
00220   ( Alignment& alignment,
00221     const char* pQuery, int queryStart, int queryEnd,
00222     const char* pSubject, int subjectStart, int subjectEnd );
00223 
00224   bool matchChar( const char a, const char b )
00225   {
00226     return (tolower(a)==tolower(b));
00227   }
00228 
00229   void outputAlignmentColumn
00230     ( const char queryChar, const char AlignChar, const char subjectChar,
00231       int queryNext, int subjectNext);
00232 
00233   void outputAlignmentLine( void );
00234 
00235 protected:
00236 
00237   ostream& outputStream_;
00238 
00239   // These store incomplete lines of formatted alignment while
00240   // awaiting output
00241   char* pBufSeq1_;
00242   char* pBufAlign_;
00243   char* pBufSeq2_;
00244 
00245   // These point to the current characters in the line buffers above
00246   char* pCursorSeq1_;
00247   char* pCursorAlign_;
00248   char* pCursorSeq2_;
00249 
00250   // This is the number of characters per formatted line
00251   int numCols_;
00252   int bandExtension_;
00253 
00254   static StaticScoreTable tableDNA_;
00255   static StaticScoreTable tableBlosum62_;
00256 
00257   ScoreTable* pTable_;
00258 
00259 
00260 }; // ~class MatchAligner
00261 
00262 
00263 // Class Name: MatchAlignerDNA
00264 // Description: Aligns using DNA score table
00265 class MatchAlignerDNA : public MatchAligner
00266 {
00267  public:
00268   MatchAlignerDNA(  int numCols=80,
00269                     int bandExtension=0,
00270                     ostream& outputStream=cout ) :
00271   MatchAligner( numCols, 
00272                 bandExtension,
00273                 tableDNA_.getTable(),
00274                 outputStream )
00275     {} // ~MatchTaskAlignDNA::MatchTaskAlignDNA
00276 }; // ~class MatchAlignerDNA : public MatchAligner
00277 
00278 
00279 // Class Name: MatchAlignerProtein
00280 // Description: Aligns protein using score table based on BLOSUM62
00281 class MatchAlignerProtein : public MatchAligner
00282 {
00283  public:
00284   MatchAlignerProtein( int numCols=80,
00285                        int bandExtension=0,
00286                        ostream& outputStream=cout ) :
00287     MatchAligner( numCols, 
00288                   bandExtension,
00289                   tableBlosum62_.getTable(), 
00290                   outputStream )
00291     {} // ~MatchAlignerProtein::MatchAlignerProtein
00292 
00293 }; // ~class MatchAlignerProtein : public MatchAligner
00294 
00295 // Class Name: MatchAlignerTranslated
00296 // Description: 
00297 class MatchAlignerTranslated : public MatchAligner
00298 {
00299  public: // TBD protected
00300   MatchAlignerTranslated(   int numCols,
00301                             int bandExtension,
00302                             bool isQueryProtein,
00303                             bool isSubjectProtein,
00304                             ostream& outputStream );
00305 
00306   void codonize( const char* pSeq, 
00307                  int seqSize,
00308                  int finalFrame,
00309                  vector<vector<char> >& translatedSeqs ); 
00310 
00311   virtual void createAlignment
00312   ( Alignment& alignment,
00313     const char* pQuery, int queryStart, int queryEnd,
00314     const char* pSubject, int subjectStart, int subjectEnd );
00315 
00316   static char getCodon( const char* pChar )
00317   {
00318     return (  (    (ttDNA[ *pChar ] ==nv)
00319                    || (ttDNA[ *(pChar+1) ] ==nv)
00320                    || (ttDNA[ *(pChar+2) ] ==nv) )
00321               ? 'X'
00322               : gResidueNames[ ttCodon[ ttDNA[ *(pChar) ] << 4
00323                                       | ttDNA[ *(pChar+1) ] << 2
00324                                       | ttDNA[ *(pChar+2) ] ] ] );
00325   } // ~getCodon
00326 
00327 
00328  protected:
00329   bool isQueryProtein_;
00330   bool isSubjectProtein_;
00331 };
00332 
00333 
00334 // Class Name: MatchAlignerTranslatedProtein
00335 // Description: Aligns protein and a DNA sequence.
00336 // If isQueryProtein assumes query is protein, subject is DNA, 
00337 // else vice versa.
00338 class MatchAlignerTranslatedProtein : public MatchAlignerTranslated
00339 {
00340  public:
00341   MatchAlignerTranslatedProtein
00342     ( int isQueryProtein = true,
00343       int numCols=80,
00344       int bandExtension=0,
00345       ostream& outputStream=cout );
00346 
00347   virtual void formatAlignment
00348   ( Alignment& alignment,
00349     const char* pQuery, int queryStart, int queryEnd,
00350     const char* pSubject, int subjectStart, int subjectEnd );
00351 
00352  protected:
00353 }; // ~class MatchAlignerTranslatedProtein : public MatchAlignerTranslated
00354 
00355 // Class Name: MatchAlignerTranslatedDNA
00356 // Description: Aligns translations of two DNA sequences. 
00357 class MatchAlignerTranslatedDNA : public MatchAlignerTranslated
00358 {
00359  public:
00360   MatchAlignerTranslatedDNA( int numCols=80,
00361                              int bandExtension=0,
00362                              ostream& outputStream=cout ) :
00363     MatchAlignerTranslated( numCols, 
00364                             bandExtension,
00365                             false, 
00366                             false, 
00367                             outputStream )
00368     {} // ~MatchAlignerTranslatedDNA::MatchAlignerTranslatedDNA
00369 
00370   virtual void formatAlignment
00371   ( Alignment& alignment,
00372     const char* pQuery, int queryStart, int queryEnd,
00373     const char* pSubject, int subjectStart, int subjectEnd );
00374 }; // ~class MatchAlignerTranslatedDNA : public MatchAlignerProteinDNA
00375 
00376 // --------------------------------------------
00377 // Data for scoring matches between amino acids
00378 // --------------------------------------------
00379 
00380 #ifdef DATA_SOURCES_FOR_BLOSUM62_MATRIX
00381 The below is cut and pasted from
00382 http://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/BLOSUM62
00383 
00384 -- quote --
00385 
00386   1 #  Matrix made by matblas from blosum62.iij
00387   2 #  * column uses minimum score
00388   3 #  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
00389   4 #  Blocks Database = /data/blocks_5.0/blocks.dat
00390   5 #  Cluster Percentage: >= 62
00391   6 #  Entropy =   0.6979, Expected =  -0.5209
00392   7    A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
00393   8 A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4 
00394   9 R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4 
00395  10 N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4 
00396  11 D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4 
00397  12 C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 
00398  13 Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4 
00399  14 E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
00400  15 G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4 
00401  16 H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4 
00402  17 I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4 
00403  18 L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4 
00404  19 K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4 
00405  20 M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4 
00406  21 F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4 
00407  22 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4 
00408  23 S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4 
00409  24 T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4 
00410  25 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4 
00411  26 Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4 
00412  27 V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4 
00413  28 B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4 
00414  29 Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
00415  30 X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4 
00416  31 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1 
00417 
00418 -- unquote --
00419 
00420 In addition:
00421 Score of any character with - is -10
00422 Score of - with - is -1
00423 These are the gap open and gap extension penalties used with BLOSUM62 in
00424 Huang/Zhang, CABIOS 12(6) 1996, pp. 497-506
00425 
00426 #endif
00427 
00428 static const char blosumResidues[] 
00429 = "ARNDCQEGHILKMFPSTWYVBZX";
00430 // 12345678901234567890123
00431 
00432 static const ScoreType blosumScores[] =
00433 {
00434 4, // A
00435 -1, 5, // R
00436 -2, 0, 6, // N
00437 -2, -2, 1, 6, // D
00438 0, -3, -3, -3, 9,// C
00439 -1, 1, 0, 0, -3, 5,// Q
00440 -1, 0, 0, 2, -4, 2, 5,// E
00441 0, -2, 0, -1, -3, -2, -2, 6, // G
00442 -2, 0, 1, -1, -3, 0, 0, -2, 8, // H
00443 -1, -3, -3, -3, -1, -3, -3, -4, -3, 4, // I
00444 -1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, // L
00445 -1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, // K
00446 -1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, // M
00447 -2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, // F
00448 -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, // P
00449 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, // S
00450 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, // T
00451 -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, // W
00452 -2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7,// Y
00453 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4,// V
00454 -2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4,// B
00455 -1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, // Z
00456 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1 //X
00457 };
00458 
00459 static const int numBlosumResidues = 23;
00460 
00461 
00462 // ---------------------------------------------------------
00463 // Declarations for standard (2D) banded dynamic programming
00464 // ---------------------------------------------------------
00465 
00466 
00467 // Class name : PathMatrix
00468 // Description: Idea of PathMatrix is that all 4 types of banded
00469 // dynamic programming needed can be handled by using different
00470 // types for PATH_TYPE
00471 // 2D (`standard') banded d.p.: DNA vs DNA, protein vs protein
00472 // 3D banded d.p.: protein vs translated DNA
00473 // 4D banded d.p.: translated DNA vs translated DNA
00474 // COLUMN_FILLER fills in cells column by column
00475 // TRACE_BACKER computes the traceback path cell by cell
00476 
00477 // If the standard dp matrix is ...
00478 //
00479 //     -   p1  p1  p1  p1  p1 
00480 //  -  c00 c01 c02 c03 c04 c05
00481 // q1  c10 c11 c12 c13 c14 c15
00482 // q2  c20 c21 c22 c23 c24 c25
00483 // q3  c30 c31 c32 c33 c34 c35
00484 // q4  c40 c41 c42 c43 c44 c45
00485 // q5  c50 c51 c52 c53 c54 c55
00486 // q6  c60 c61 c62 c63 c64 c65
00487 // q7  c70 c71 c72 c73 c74 c75
00488 // q8  c80 c81 c82 c83 c84 c85
00489 //
00490 // ... then the corresponding banded matrix with band extension 0 is ...
00491 //
00492 // c00 c11 c22 c33 c44 c55
00493 // c10 c21 c32 c43 c54 c65
00494 // c20 c31 c42 c53 c64 c75
00495 // c30 c41 c52 c63 c74 c85
00496 //
00497 //  ... and the corresponding banded matrix with band extension 2 is ...
00498 //
00499 // xxx xxx c02 c12 c24 c35
00500 // xxx c01 c12 c23 c34 c45
00501 // c00 c11 c22 c33 c44 c55
00502 // c10 c21 c32 c43 c54 c65
00503 // c20 c31 c42 c53 c64 c75
00504 // c30 c41 c52 c63 c74 c85
00505 // c40 c51 c62 c73 c84 xxx
00506 // c50 c61 c72 c83 xxx xxx
00507 
00508 // Notes:
00509 // 1. Each column is a vector<PathType> of size 
00510 // length(q1)-length(p1)+1+(2*bandExtension)
00511 // This means if bandExtension is nonzero the elements marked xxx
00512 // are never used, even though storage is allocated.
00513 // 2. When doing the traceback, cell traversals W NW N in the standard matrix
00514 // correspond to cell traversals SW W N respectively in the banded matrix.
00515 // 3. Only directional pointers are stored in each cell. Only the current
00516 // and previous column of scores are retained as the cells are filled in.
00517 
00518 template <class PATH_TYPE> class PathMatrix 
00519   : public vector<vector<PATH_TYPE> >
00520 {
00521 public:
00522   typedef pair<vector<vector<PATH_TYPE> >::iterator,
00523     vector<PATH_TYPE>::iterator> CellIterator;
00524 
00525   template<class MATRIX_FILLER> ScoreType fillIn( MATRIX_FILLER& doMatrix )
00526   {
00527     return doMatrix(*this);
00528   } // ~fillIn
00529 
00530   template <class TRACE_BACKER> void traceBack
00531   ( TRACE_BACKER& doCell )
00532   {
00533     // NB we are tracing back from finish to start
00534     //    CellIterator start; 
00535     //    start.first = &front();
00536     //  start.second = front().begin()+startPos;
00537     //    finish.first =  &back();  
00538     //    finish.second = back().begin()+finishPos;
00539  
00540     CellIterator current = lastCell_;
00541     //    current.first =  &back();  
00542     //  current.second = back().begin()+finishPos;
00543     //    cout << "Cell iterator - column: " << current.first-begin()
00544     //  << " row: " << current.second-current.first->begin() << endl;
00545     while( doCell(current) );
00546     // {
00547     //   cout << "Cell iterator - column: " << current.first-begin()
00548     //   << " row: " << current.second-current.first->begin() << endl;
00549     // }
00550 
00551   } // ~traceBack
00552 
00553   CellIterator lastCell_;
00554 
00555 }; // ~class PathMatrix
00556 
00557 // Class name : ScoreMaker
00558 // Description: Returns score of two chars according to scoreTable
00559 class ScoreMaker
00560 {
00561  public:
00562   ScoreMaker( const ScoreTable& scoreTable ) :
00563     scoreTable_(scoreTable),
00564     b_((unsigned short*)&a_[0])
00565     {}
00566   ScoreType operator()( uchar c1, uchar c2 )
00567   {
00568     a_[0]=c1;
00569     a_[1]=c2;
00570     return scoreTable_[ *b_ ];
00571   }
00572   ScoreType getGapScore( void )
00573   {
00574     a_[0]='a'; // a is both a valid nucleotide and amino acid code
00575     a_[1]='-';
00576     return scoreTable_[ *b_ ];
00577   }
00578  private:
00579   const ScoreTable& scoreTable_;
00580   uchar a_[2];
00581   unsigned short* b_;
00582 }; // ~class ScoreMaker
00583 
00584 
00585 // Class name : CellFiller
00586 // Description:
00587 class CellFiller
00588 {
00589 public:
00590   CellFiller( void ) :  max_(0) {}
00591   ScoreType operator()
00592   ( PathType& cell, 
00593     ScoreType scoreW,
00594     ScoreType scoreSW,
00595     ScoreType scoreN )
00596     {
00597       max_=max( max( scoreW, scoreSW), scoreN );
00598       cell |= fromW * (scoreW==max_);
00599       cell |= fromSW * (scoreSW==max_);
00600       cell |= fromN * (scoreN==max_);
00601       //      cout << "Cell: " 
00602       //   << ((cell & fromW)?'-':'.')
00603       //   << ((cell & fromSW)?'/':'.') 
00604       //   << ((cell & fromN)?'|':'.') 
00605       //   << " " << scoreW
00606       //   << " " << scoreSW
00607       //   << " " << scoreN 
00608       //   << " " << max_ << endl; 
00609       return max_;
00610     } // ~CellFiller::operator()
00611 
00612 private:
00613   ScoreType max_;
00614 };
00615 
00616 // Class name : ColumnFillerBasic
00617 // Description: This produces a column of a PathMatrix 
00618 // for DNA vs DNA or protein vs protein matching
00619 class ColumnFillerBasic
00620 {
00621 public:
00622   ColumnFillerBasic( const char* p1, int p1Size, 
00623                      const char* p2, int p2Size,
00624                      int bandExtension,
00625                      const ScoreTable& scoreTable) :
00626     p1_(p1), 
00627     p2_(p2), 
00628     //    bandExtension_(bandExtension),
00629     bandExtension_( (bandExtension<(p1Size/2))
00630                     ? bandExtension
00631                     : max( (p1Size/2)-1, 0 ) ), 
00632     bandWidth_(p2Size-p1Size+1),
00633     bandLength_(p1Size+1),
00634     colSize_(p2Size-p1Size+1+(2*bandExtension_)),
00635     //    scoresPossible_(3),
00636     fillCell_(),
00637     v1_(colSize_, veryBadScoreIndeed ), 
00638     v2_(colSize_, veryBadScoreIndeed ),
00639     pLast_(&v1_),
00640     pCurrent_(&v2_),
00641     getScore_(scoreTable),
00642     gapScore_(getScore_.getGapScore())
00643     {
00644       // TBD check bandExtension_ is not too big
00645     }
00646   ScoreType operator()( PathMatrix<PathType>& matrix );
00647 private:
00648   const char* p1_;
00649   const char* p2_;
00650   int bandExtension_;
00651   const int bandWidth_;
00652   const int bandLength_;
00653   const int colSize_;
00654   CellFiller fillCell_;
00655   //  vector<pair<ScoreType, PathType> > scoresPossible_;
00656   vector<ScoreType> v1_;
00657   vector<ScoreType> v2_;
00658   vector<ScoreType>* pLast_;
00659   vector<ScoreType>* pCurrent_;
00660   vector<ScoreType>* temp_;
00661   ScoreMaker getScore_;
00662   ScoreType  gapScore_;
00663 
00664 }; // ~class ColumnFillerBasic
00665 
00666 // Class name : TraceBackerBasic
00667 // Description: This carries out one step of the traceback
00668 // for DNA vs DNA or protein vs protein matching
00669 class TraceBackerBasic
00670 {
00671 public:
00672   TraceBackerBasic( AlignBreakType p1Gap, AlignBreakType p2Gap,
00673                     Alignment& alignment ) :
00674     p1Gap_(p1Gap), p2Gap_(p2Gap),
00675     alignment_(alignment) {}
00676 
00677   bool operator()( PathMatrix<PathType>::CellIterator& current );
00678  private:
00679   AlignBreakType p1Gap_;
00680   AlignBreakType p2Gap_;
00681   Alignment& alignment_;
00682 }; // ~class TraceBackerBasic
00683 
00684 
00685 // ------------------------------------------------------------
00686 // Definitions for banded dynamic programming with translations
00687 // ------------------------------------------------------------
00688 
00689 template< typename T> class Array2D
00690 {
00691 public:
00692   Array2D() { data_[0]=0; data_[1]=0; data_[2]=0; }
00693   Array2D( T i, T j, T k) 
00694     { data_[0]=i; data_[1]=j; data_[2] = k; }
00695   T& operator[]( unsigned int i ) { return data_[i]; }
00696 private:
00697   T data_[gNumReadingFrames];
00698 };
00699 
00700 template< typename T> class Array3D
00701 {
00702 public:
00703   Array3D() {}
00704   Array3D( const Array2D<T>& i, 
00705            const Array2D<T>& j, 
00706            const Array2D<T>& k) 
00707     { data_[0]=i; data_[1]=j; data_[2] = k; }
00708   Array2D<T>& operator[]( unsigned int i ) { return data_[i]; }
00709 private:
00710   Array2D<T> data_[gNumReadingFrames];
00711 };
00712 
00713 const Array2D<ScoreType> veryBadScore2D( veryBadScoreIndeed, 
00714                               veryBadScoreIndeed, 
00715                               veryBadScoreIndeed );
00716 
00717 const Array3D<ScoreType> veryBadScore3D( veryBadScore2D, 
00718                                          veryBadScore2D, 
00719                                          veryBadScore2D );
00720 
00721 
00722 typedef Array3D<ScoreType> ScoreType3D;
00723 typedef Array3D<PathType> PathType3D;
00724 
00725 
00726 // Class name : CellFiller3D
00727 // Description:
00728 class CellFiller3D
00729 {
00730 public:
00731   CellFiller3D( void ) :  max_(0) {}
00732   ScoreType operator()
00733   ( PathType& cell, 
00734     ScoreType scoreW,
00735     ScoreType scoreSW,
00736     ScoreType scoreN,
00737     ScoreType scoreFrame1,
00738     ScoreType scoreFrame2 )
00739   {
00740 
00741     max_=max( max( max( scoreW, 
00742                         scoreSW ), 
00743                    max( scoreFrame1, 
00744                         scoreFrame2 ) ),
00745               scoreN );
00746     cell |= fromW * (scoreW==max_);
00747     cell |= fromSW * (scoreSW==max_);
00748     cell |= fromN * (scoreN==max_);
00749     cell |= fromPrevFrame1 * (scoreFrame1==max_);
00750     cell |= fromPrevFrame2 * (scoreFrame2==max_);
00751     //    cout << "Cell: " 
00752     //   << ((cell & fromW)?'-':'.')
00753     //     << ((cell & fromSW)?'/':'.') 
00754     //   << ((cell & fromN)?'|':'.') 
00755     //    << ((cell & fromPrevFrame1)?'<':'.') 
00756     //   << ((cell & fromPrevFrame2)?'^':'.') 
00757     //   << " " << scoreW
00758     //   << " " << scoreSW
00759     //   << " " << scoreN 
00760     //   << " " << scoreFrame1
00761     //   << " " << scoreFrame2
00762     //   << " " << max_ << endl; 
00763     return max_;
00764   } // ~CellFiller3D::operator()
00765 
00766 
00767 private:
00768   ScoreType max_;
00769 };
00770 
00771 
00772 // Class name : ColumnFiller3D
00773 // Description: This produces a column of a PathMatrix 
00774 // for protein vs translated DNA or translated DNA vs translated DNA matching
00775 class ColumnFiller3D
00776 {
00777 public:
00778   //  ColumnFiller3D( const char* p1, int p1Size, bool isProtein1, 
00779   //      const char* p2, int p2Size, bool isProtein2,
00780   //      int bandExtension,
00781   //      const ScoreTable& scoreTable );
00782   ColumnFiller3D( const char** p1Trans, int p1Size, int p1FinalFrame,
00783                   const char** p2Trans, int p2Size, int p2FinalFrame,
00784                   int bandExtension,
00785                   const ScoreTable& scoreTable );
00786 
00787   ScoreType operator()( PathMatrix<PathType3D>& matrix );
00788 
00789   // NB all these template functions are specialized (ie overridden)
00790   // for the case false. This allows them to be `switched off' at
00791   // compile time. Definitions are in MatchAligner.cpp
00792 
00793   template< bool canGoW> ScoreType getScoreW
00794   ( int j, int k, int l )
00795   {
00796     return (*pLast_)[j][k][l];
00797   } // ~template< bool canGoW> ScoreType getScoreW
00798 
00799   template< bool canGoSW> ScoreType getScoreSW
00800   ( int j, int k, int l )
00801   {
00802     return (*pLast_)[j+1][k][l];
00803   } // ~template< bool canGoSW> ScoreType getScoreSW
00804 
00805   template< bool canGoN> ScoreType getScoreN
00806   ( int j, int k, int l )
00807   {
00808     return (*pCurrent_)[j-1][k][l];
00809   } // ~template< bool canGoN> ScoreType getScoreN
00810 
00811 
00812   // Function: getScorePrevFrame1
00813   // If the frame k is 1 or 2, the score for the previous frame
00814   // is in the same cell. 
00815   // If the frame k is 0, need to grab the frame 2 score of the
00816   // cell to the SW. This corresponds to the W-ward cell in the
00817   // full d.p. matrix
00818   template< bool canGoSW > ScoreType getScorePrevFrame1
00819   ( int j, int k, int l )
00820   {
00821     return ((k!=0) 
00822             ? (*pCurrent_)[j][k-1][l] 
00823             : ( (numFrames1_==1)
00824                 ? veryBadScoreIndeed
00825                 : getScoreSW<canGoSW>( j, 2, l) ) );
00826   } // ~template< bool canGoSW > ScoreType getScorePrevFrame1
00827 
00828   // Function: getScorePrevFrame2
00829   // If the frame l is 1 or 2, the score for the previous frame
00830   // is in the same cell. 
00831   // If the frame l is 0, need to grab the frame 2 score of the
00832   // cell to the N. This corresponds to the N-wards cell in the
00833   // full d.p. matrix
00834   template< bool canGoN > ScoreType getScorePrevFrame2
00835   ( int j, int k, int l )
00836   {
00837     return ((l!=0) 
00838             ? (*pCurrent_)[j][k][l-1] 
00839             : ( (numFrames2_==1)
00840                 ? veryBadScoreIndeed
00841                 :getScoreN<canGoN>( j, k, 2) ) );
00842   } // ~template< bool canGoN > ScoreType getScorePrevFrame2
00843 
00844   template< bool canMatch> ScoreType getScoreChar
00845   ( int i, int j, int k, int l)
00846   {
00847     assert(p1_[k]!=NULL);
00848     assert(p2_[l]!=NULL);
00849 
00850     //        cout << "getScoreChar "
00851     //  << p1_[k][i-1] << p2_[l][j-bandExtension_+i-1] << " "
00852     //  << i << " " 
00853     //  << j << " " 
00854     //  << k << " " 
00855     //  << l << endl;
00856     return (getScore_(p1_[k][i-1],p2_[l][j-bandExtension_+i-1]));
00857   } // ~template< bool canMatch> ScoreType getScoreChar
00858 
00859 
00860 
00861   template< bool canGoW, bool canGoSW, bool canGoN, bool canMatch> 
00862   void fillCell3D
00863   ( PathMatrix<PathType3D>& matrix, int i, int j )
00864   {
00865     //        cout << "filling cell " << i << " " << j << " : "
00866     //  << (canGoW?'T':'F')
00867     //  << (canGoSW?'T':'F')
00868     //  << (canGoN?'T':'F')
00869     //   << (canMatch?'T':'F')
00870     //  << endl;
00871 
00872     for ( int k(0); k < numFrames1_; k++ )
00873     {
00874       for ( int l(0); l < numFrames2_; l++ )
00875       {
00876         //              cout << k << " " << l << endl;
00877         (*pCurrent_)[j][k][l] 
00878           = fillCell_( matrix[i][j][k][l],
00879                        getScoreW<canGoW>( j, k, l)
00880                        + getScoreChar<canMatch>(i,j,k,l),
00881                        getScoreSW<canGoSW>( j, k, l)
00882                        + gapStartScoreBLOSUM,
00883                        getScoreN<canGoN>( j, k, l)
00884                        + gapStartScoreBLOSUM,
00885                        getScorePrevFrame1<canGoSW>( j, k, l)
00886                        + frameShiftScoreBLOSUM,
00887                        getScorePrevFrame2<canGoN>( j, k, l) 
00888                        + frameShiftScoreBLOSUM );
00889       } // ~for l
00890     } // ~for k
00891   } // ~fillCell3D
00892 
00893 
00894 
00895 private:
00896   const char* p1_[gNumReadingFrames];
00897   const char* p2_[gNumReadingFrames];
00898   vector< vector<char> > p1Translations_;
00899   vector< vector<char> > p2Translations_;
00900 
00901   int bandExtension_;
00902   int bandWidth_;
00903   int bandLength_;
00904   int colSize_;
00905   int finalFrame1_;
00906   int finalFrame2_;
00907   int numFrames1_;
00908   int numFrames2_;
00909 
00910   CellFiller3D fillCell_;
00911   ScoreMaker getScore_;
00912   //  vector<pair<ScoreType, PathType> > scoresPossible_;
00913   vector<ScoreType3D> v1_;
00914   vector<ScoreType3D> v2_;
00915   vector<ScoreType3D>* pLast_;
00916   vector<ScoreType3D>* pCurrent_;
00917   vector<ScoreType3D>* temp_;
00918 
00919 
00920 
00921 }; // ~class ColumnFiller3D
00922 
00923 
00924 // Class name : TraceBacker3D
00925 // Description: This carries out one step of the traceback
00926 // for matches involving translations or protein
00927 class TraceBacker3D
00928 {
00929 public:
00930   //  TraceBacker3D( AlignBreakType p1Gap, AlignBreakType p2Gap,
00931   //                int p1FinalFrame, int p2FinalFrame,
00932   //                Alignment& alignment ) :
00933   //  p1Gap_(p1Gap), p2Gap_(p2Gap),
00934   //  k_(p1FinalFrame), 
00935   //  l_(p2FinalFrame),
00936   //  alignment_(alignment) {}
00937   TraceBacker3D( int queryFinalFrame, int subjectFinalFrame,
00938                  bool p1IsQuery,
00939                  Alignment& alignment ) :
00940     p1Gap_( (p1IsQuery) ? 
00941             alignBreakGapQuery : alignBreakGapSubject ),
00942     p2Gap_( (p1IsQuery) ? 
00943             alignBreakGapSubject : alignBreakGapQuery ),
00944     p1FrameShift_( (p1IsQuery) ? 
00945                    alignBreakFrameShiftQuery : alignBreakFrameShiftSubject ),
00946     p2FrameShift_( (p1IsQuery) ? 
00947                    alignBreakFrameShiftSubject : alignBreakFrameShiftQuery ),
00948     k_( (p1IsQuery) ? 
00949         queryFinalFrame : subjectFinalFrame ), 
00950     l_( (p1IsQuery) ? 
00951         subjectFinalFrame : queryFinalFrame ), 
00952     alignment_(alignment) 
00953     {}
00954   bool operator()( PathMatrix<PathType3D>::CellIterator& current );
00955 private:
00956 
00957   AlignBreakType p1Gap_;
00958   AlignBreakType p2Gap_;
00959   AlignBreakType p1FrameShift_;
00960   AlignBreakType p2FrameShift_;
00961 
00962   int k_; // frame of p1
00963   int l_; // frame of p2
00964   Alignment& alignment_;
00965 
00966 
00967 
00968 };
00969 
00970 
00971 
00972 
00973 // ### Function Declarations ###
00974 
00975 // Name:
00976 // Arguments:
00977 // TYPE  NAME  IN/OUT COMMENT
00978 // Returns: TYPE COMMENT
00979 
00980 // End of include guard:
00981 #endif
00982 
00983 // End of file MatchAligner.h

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2