00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifndef INCLUDED_MatchStore
00032 #define INCLUDED_MatchStore
00033
00034
00035
00036
00037 #include "GlobalDefinitions.h"
00038 #include <string>
00039 #include <algorithm>
00040 #include <deque>
00041
00042
00043
00044
00045
00046
00047
00048 class HashTableGeneric;
00049 class SourceReader;
00050
00051
00052
00053
00054 class Match
00055 {
00056 public:
00057 virtual ~Match() {}
00058
00059 virtual SequenceNumber getSubjectNum( void ) const=0;
00060
00061 virtual const char* getSubjectName( void ) const=0;
00062 virtual SequenceOffset getSubjectStart( void ) const=0;
00063 virtual SequenceOffset getSubjectEnd( void ) const=0;
00064
00065 virtual SequenceNumber getQueryNum( void ) const=0;
00066 virtual string getQueryName( void ) const=0;
00067 virtual SequenceOffset getQueryStart( void ) const=0;
00068 virtual SequenceOffset getQueryEnd( void ) const=0;
00069
00070 virtual int getQuerySize( void ) const=0;
00071 virtual int getNumBases(void ) const=0;
00072 virtual bool isQueryForward( void ) const=0;
00073 virtual bool isSubjectForward( void ) const=0;
00074
00075 virtual void print( void ) const=0;
00076
00077
00078 };
00079
00080
00081
00082
00083
00084
00085 class MatchStoreImp;
00086
00087 class MatchImp : public Match
00088 {
00089 public:
00090
00091 virtual ~MatchImp() {}
00092
00093 virtual SequenceNumber getSubjectNum( void ) const
00094 { return subjectNum_; }
00095
00096 virtual inline const char* getSubjectName( void ) const;
00097 virtual SequenceOffset getSubjectStart( void ) const
00098 { return subjectStart_; }
00099 virtual SequenceOffset getSubjectEnd( void ) const
00100 { return subjectEnd_; }
00101
00102 virtual inline SequenceNumber getQueryNum( void ) const;
00103 virtual inline string getQueryName( void ) const;
00104 virtual SequenceOffset getQueryStart( void ) const
00105 { return queryStart_; }
00106 virtual SequenceOffset getQueryEnd( void ) const
00107 { return queryEnd_; }
00108
00109 virtual inline int getQuerySize( void ) const;
00110 virtual int getNumBases(void ) const
00111 { return numBases_; }
00112 virtual bool isQueryForward( void ) const
00113 { return isQueryForward_; }
00114 virtual bool isSubjectForward( void ) const
00115 { return isSubjectForward_; }
00116
00117 virtual void print( void ) const {}
00118
00119
00120
00121 MatchImp( MatchStoreImp* myStore,
00122 SequenceNumber subjectNum,
00123 SequenceOffset numBases,
00124 SequenceOffset queryStart,
00125 SequenceOffset queryEnd,
00126 SequenceOffset subjectStart,
00127 SequenceOffset subjectEnd,
00128 bool isQueryForward,
00129 bool isSubjectForward ):
00130 myStore_( myStore ),
00131 subjectNum_( subjectNum ),
00132 numBases_( numBases ),
00133 queryStart_( queryStart ),
00134 queryEnd_( queryEnd ),
00135 subjectStart_( subjectStart ),
00136 subjectEnd_( subjectEnd ),
00137 isQueryForward_( isQueryForward ),
00138 isSubjectForward_( isSubjectForward ){}
00139
00140 SequenceNumber subjectNum_;
00141 SequenceOffset numBases_;
00142
00143 SequenceOffset queryStart_;
00144 SequenceOffset queryEnd_;
00145 SequenceOffset subjectStart_;
00146 SequenceOffset subjectEnd_;
00147
00148 bool isQueryForward_;
00149 bool isSubjectForward_;
00150
00151 MatchStoreImp* myStore_;
00152
00153 };
00154
00155
00156
00157
00158 class MatchStore : public vector<Match*>
00159 {
00160 public:
00161 virtual ~MatchStore() {}
00162
00163 virtual void clear( void ) {}
00164
00165
00166 virtual void printResult( ostream& outputStream = cout ) const {}
00167
00168 virtual void addMatch( SequenceNumber subjectNum,
00169 SequenceOffset numBases,
00170 SequenceOffset queryStart,
00171 SequenceOffset queryEnd,
00172 SequenceOffset subjectStart,
00173 SequenceOffset subjectEnd,
00174 bool isQueryForward,
00175 bool isSubjectForward ) {}
00176 };
00177
00178 class HitListVector;
00179 class NameReader;
00180
00181 class MatchStoreImp : public MatchStore
00182 {
00183 friend class MatchImp;
00184 public:
00185 MatchStoreImp( string queryName,
00186 int queryNum,
00187 int queryBases,
00188 const NameReader& reader) :
00189 queryName_ ( queryName ),
00190 queryNum_( queryNum ),
00191 queryBases_( queryBases ),
00192 reader_( reader )
00193 {
00194
00195
00196 }
00197
00198 ~MatchStoreImp();
00199
00200 virtual void addMatch( SequenceNumber subjectNum,
00201 SequenceOffset numBases,
00202 SequenceOffset queryStart,
00203 SequenceOffset queryEnd,
00204 SequenceOffset subjectStart,
00205 SequenceOffset subjectEnd,
00206 bool isQueryForward,
00207 bool isSubjectForward );
00208
00209
00210
00211
00212
00213 void printResult
00214 ( ostream& outputStream = cout ) const;
00215
00216 virtual void clear( void );
00217
00218 void setup( void );
00219
00220 protected:
00221 vector<MatchImp*> imps_;
00222
00223 string queryName_;
00224 int queryNum_;
00225 int queryBases_;
00226
00227
00228 const NameReader& reader_;
00229
00230 };
00231
00232
00233
00234
00235
00236 class MatchTask
00237 {
00238 public:
00239 virtual ~MatchTask() {}
00240 virtual void operator()( MatchStore& store ) =0;
00241 };
00242
00243 template <class TASK1, class TASK2> class CombineTask : public MatchTask
00244 {
00245 public:
00246 CombineTask(TASK1& t1, TASK2& t2) : t1_(t1), t2_(t2) {}
00247 CombineTask( void ) : t1_imp(), t2_imp(), t1_(t1_imp), t2_(t2_imp) {}
00248 virtual void operator()( MatchStore& store )
00249 {
00250 t1_(store);
00251 t2_(store);
00252 }
00253 private:
00254 TASK1 t1_imp;
00255 TASK2 t2_imp;
00256 TASK1& t1_;
00257 TASK2& t2_;
00258 };
00259
00260
00261
00262
00263
00264
00265 class CombineTaskVirtual : public MatchTask
00266 {
00267 public:
00268 CombineTaskVirtual(MatchTask& t1, MatchTask& t2) : t1_(t1), t2_(t2) {}
00269 virtual void operator()( MatchStore& store )
00270 {
00271 t1_(store);
00272 t2_(store);
00273 }
00274 private:
00275 MatchTask& t1_;
00276 MatchTask& t2_;
00277 };
00278
00279
00280 class MatchTaskPrint : public MatchTask
00281 {
00282 public:
00283 MatchTaskPrint( ostream& outputStream = cout ) :
00284 outputStream_( outputStream ) {}
00285 virtual void operator()( MatchStore& store );
00286 private:
00287 ostream& outputStream_;
00288
00289 };
00290
00291
00292 class MatchTaskPrintTabbed : public MatchTask
00293 {
00294 public:
00295 MatchTaskPrintTabbed( ostream& outputStream = cout ) :
00296 outputStream_( outputStream ) {}
00297 virtual void operator()( MatchStore& store );
00298 private:
00299 ostream& outputStream_;
00300 };
00301
00302
00303
00304 class MatchTaskPrintReverse : public MatchTask
00305 {
00306 public:
00307 MatchTaskPrintReverse( ostream& outputStream = cout ) :
00308 outputStream_( outputStream ) {}
00309 virtual void operator()( MatchStore& store );
00310 private:
00311 ostream& outputStream_;
00312
00313 };
00314
00315
00316
00317 class MatchTaskPrintTabbedReverse : public MatchTask
00318 {
00319 public:
00320 MatchTaskPrintTabbedReverse( ostream& outputStream = cout ) :
00321 outputStream_( outputStream ) {}
00322 virtual void operator()( MatchStore& store );
00323 private:
00324 ostream& outputStream_;
00325 };
00326
00327
00328 class SortByMatchSize
00329 {
00330 public:
00331 SortByMatchSize() {}
00332 bool operator()( const Match* p1, const Match* p2 ) const
00333 {
00334
00335 return ( (p1->getNumBases()) > (p2->getNumBases() ) );
00336 }
00337 };
00338
00339 class SortBySubjectName
00340 {
00341 public:
00342 SortBySubjectName() {}
00343 bool operator()( const Match* p1, const Match* p2 ) const
00344 {
00345 return ( (p1->getSubjectName()) < (p2->getSubjectName() ) );
00346 }
00347 };
00348
00349 class SortByQueryStart
00350 {
00351 public:
00352 SortByQueryStart() {}
00353 bool operator()( const Match* p1, const Match* p2 ) const
00354 {
00355 return ( (p1->getQueryStart()) < (p2->getQueryStart() ) );
00356 }
00357 };
00358
00359 class SortByDiff
00360 {
00361 public:
00362 SortByDiff() {}
00363 bool operator()( const Match* p1, const Match* p2 ) const
00364 {
00365 return ( (p1->getQueryStart()-p1->getSubjectStart())
00366 > (p2->getQueryStart()-p2->getSubjectStart()) );
00367 }
00368 };
00369
00370 class SortByPercentMatch
00371 {
00372 public:
00373 SortByPercentMatch() {}
00374 bool operator()( const Match* p1, const Match* p2 ) const
00375 {
00376 return ( (p1->getNumBases()*(p2->getQueryEnd()-p2->getQueryStart()+1))
00377 > (p2->getNumBases()*(p1->getQueryEnd()-p1->getQueryStart()+1)));
00378 }
00379 };
00380
00381 class SortNull
00382 {
00383 public:
00384 SortNull() {}
00385 bool operator()( const Match* p1, const Match* p2) const
00386 {
00387 return false;
00388 }
00389 };
00390
00391 #ifdef OLD_SORTBYMATCH_DEFINITION
00392 class SortByMatch
00393 {
00394 public:
00395 SortByMatch() {}
00396 bool operator()( const Match* p1, const Match* p2 ) const
00397 {
00398 return ( (p1->getNumBases()) > (p2->getNumBases() )
00399 || ( (p1->getNumBases()) == (p2->getNumBases() )
00400 && (p1->getSubjectName()) < (p2->getSubjectName() ) )
00401 || ( (p1->getSubjectName()) == (p2->getSubjectName() )
00402 && (p1->getQueryStart()) == (p2->getQueryStart() ) ) );
00403 }
00404 };
00405 #endif
00406
00407
00408
00409 class SortByMatch
00410 {
00411 public:
00412 SortByMatch() {}
00413 bool operator()( const Match* p1, const Match* p2 ) const
00414 {
00415 return ( (p1->getNumBases()) > (p2->getNumBases() )
00416 || ( (p1->getNumBases()) == (p2->getNumBases() )
00417 && (p1->getSubjectNum()) < (p2->getSubjectNum() ) )
00418 || ( (p1->getSubjectNum()) == (p2->getSubjectNum() )
00419 && (p1->getQueryStart()) == (p2->getQueryStart() ) ) );
00420 }
00421 };
00422
00423
00424 template < class C1, class C2, class C3=SortNull >
00425 class CombineSort
00426 {
00427 public:
00428 CombineSort() : c1_(), c2_(), c3_() {}
00429 bool operator()( const Match* p1, const Match* p2) const
00430 {
00431 return ( c1_(p1,p2)
00432 || ( (!c1_(p2,p1)) && c2_(p1,p2) )
00433 || ( (!c2_(p2,p1)) && c3_(p1,p2) ) );
00434 }
00435 C1 c1_;
00436 C2 c2_;
00437 C3 c3_;
00438 };
00439
00440 template <class SORTER> class MatchTaskSort : public MatchTask
00441 {
00442 public:
00443 MatchTaskSort
00444 (
00445 unsigned int maxToSort = 1<<30,
00446 double partialThreshold = 0.0
00447 ) : sorter_(), maxToSort_(maxToSort), partialThreshold_(partialThreshold) {}
00448
00449 void operator()(MatchStore& store )
00450 {
00451 if ( ( store.size() <= maxToSort_)
00452 || ( ((double)maxToSort_) / ((double)store.size())
00453 > partialThreshold_ ) )
00454 sort( store.begin(), store.end(), sorter_ );
00455 else
00456 {
00457 partial_sort( store.begin(),
00458 store.begin()+maxToSort_,
00459 store.end(),
00460 sorter_ );
00461 }
00462 store.resize(min((unsigned int)store.size(),maxToSort_));
00463 }
00464 private:
00465 unsigned int maxToSort_;
00466 double partialThreshold_;
00467 SORTER sorter_;
00468 };
00469
00470 #ifdef MOVED_TO_ALIGNERDOTH
00471
00472
00473 typedef int ScoreType;
00474 typedef ScoreType ScoreTable[65536];
00475 typedef unsigned char uchar;
00476 typedef unsigned char PathType;
00477
00478 typedef unsigned char AlignBreakType;
00479
00480
00481
00482
00483
00484 struct AlignInfo
00485 {
00486 AlignInfo( void ) : numMatches(0), breakType(0) {}
00487 int numMatches;
00488 AlignBreakType breakType;
00489 };
00490
00491
00492
00493
00494 class MatchTaskAlign : public MatchTask
00495 {
00496 public:
00497 MatchTaskAlign( SourceReader& querySource,
00498 SourceReader& subjectSource,
00499 int numCols,
00500 ostream& outputStream,
00501 bool reverseQueryCoords,
00502 bool doAlignment );
00503 virtual ~MatchTaskAlign();
00504
00505 virtual void operator()(MatchStore& store);
00506
00507
00508
00509 void align( const char* pQuery, int queryStart, int queryEnd,
00510 const char* pSubject, int subjectStart, int subjectEnd );
00511 void createAlignment
00512 ( deque<AlignInfo>& alignment,
00513 const char* pQuery, int queryStart, int queryEnd,
00514 const char* pSubject, int subjectStart, int subjectEnd );
00515 void formatAlignment
00516 ( deque<AlignInfo>& alignment,
00517 const char* pQuery, int queryStart, int queryEnd,
00518 const char* pSubject, int subjectStart, int subjectEnd );
00519
00520 bool matchChar( const char a, const char b )
00521 {
00522 return (tolower(a)==tolower(b));
00523 }
00524
00525 virtual void outputAlignmentLine( void );
00526
00527 void reverseQueryCoords( bool yesOrNo= true)
00528 {
00529 reverseQueryCoords_ = yesOrNo;
00530 }
00531
00532 void doAlignment( bool yesOrNo= true)
00533 {
00534 doAlignment_ = yesOrNo;
00535 }
00536
00537 protected:
00538
00539 ostream& outputStream_;
00540 SourceReader& querySource_;
00541 SourceReader& subjectSource_;
00542
00543
00544
00545
00546
00547 bool reverseQueryCoords_;
00548
00549
00550
00551 bool doAlignment_;
00552
00553 char* pBufSeq1;
00554 char* pBufAlign;
00555 char* pBufSeq2;
00556 int numCols_;
00557
00558 ScoreTable* pTable_;
00559
00560
00561 };
00562
00563
00564
00565 class MatchTaskAlignDNA : public MatchTaskAlign
00566 {
00567 public:
00568 MatchTaskAlignDNA( SourceReader& querySource,
00569 SourceReader& subjectSource,
00570 int numCols=80,
00571 ostream& outputStream=cout,
00572 bool reverseQueryCoords=true,
00573 bool doAlignment=true);
00574 protected:
00575 void fillScoreTable( void );
00576
00577 static bool isTableFilled_;
00578 static ScoreTable table_;
00579 };
00580
00581
00582 The below is cut and pasted from
00583 http:
00584
00585 -- quote --
00586
00587 1 # Matrix made by matblas from blosum62.iij
00588 2 # * column uses minimum score
00589 3 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
00590 4 # Blocks Database = /data/blocks_5.0/blocks.dat
00591 5 # Cluster Percentage: >= 62
00592 6 # Entropy = 0.6979, Expected = -0.5209
00593 7 A R N D C Q E G H I L K M F P S T W Y V B Z X *
00594 8 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
00595 9 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
00596 10 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
00597 11 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
00598 12 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
00599 13 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
00600 14 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
00601 15 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
00602 16 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
00603 17 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
00604 18 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
00605 19 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
00606 20 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
00607 21 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
00608 22 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
00609 23 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
00610 24 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
00611 25 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
00612 26 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
00613 27 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
00614 28 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
00615 29 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
00616 30 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
00617 31 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
00618
00619 -- unquote --
00620
00621 In addition:
00622 Score of any character with - is -10
00623 Score of - with - is -1
00624 These are the gap open and gap extension penalties used with BLOSUM62 in
00625 Huang/Zhang, CABIOS 12(6) 1996, pp. 497-506
00626
00627
00628
00629 static const char blosumResidues[]
00630 = "ARNDCQEGHILKMFPSTWYVBZX";
00631
00632
00633 static const ScoreType blosumScores[] =
00634 {
00635 4,
00636 -1, 5,
00637 -2, 0, 6,
00638 -2, -2, 1, 6,
00639 0, -3, -3, -3, 9,
00640 -1, 1, 0, 0, -3, 5,
00641 -1, 0, 0, 2, -4, 2, 5,
00642 0, -2, 0, -1, -3, -2, -2, 6,
00643 -2, 0, 1, -1, -3, 0, 0, -2, 8,
00644 -1, -3, -3, -3, -1, -3, -3, -4, -3, 4,
00645 -1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4,
00646 -1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5,
00647 -1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5,
00648 -2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6,
00649 -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7,
00650 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4,
00651 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5,
00652 -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11,
00653 -2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7,
00654 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4,
00655 -2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4,
00656 -1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4,
00657 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1
00658 };
00659
00660 static const int numBlosumResidues = 23;
00661
00662
00663
00664 class MatchTaskAlignProtein : public MatchTaskAlign
00665 {
00666 public:
00667 MatchTaskAlignProtein( SourceReader& querySource,
00668 SourceReader& subjectSource,
00669 int numCols=80,
00670 ostream& outputStream=cout );
00671 protected:
00672 void fillScoreTable( void );
00673
00674 static bool isTableFilled_;
00675 static ScoreTable table_;
00676 };
00677
00678 #endif
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696
00697 #endif
00698
00699
00700
00701
00702
00703
00704