HashTable/HashTablePacked.h

Go to the documentation of this file.
00001 /*  Last edited: Mar 27 16:30 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : HashTablePacked
00026 // File Name    : HashTablePacked.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_HashTablePacked
00032 #define INCLUDED_HashTablePacked
00033 
00034 // Description:
00035 
00036 // Includes:
00037 #include "HashTableGeneric.h"
00038 
00039 // NB it is good practise for #include statements in header files to be
00040 // replaced by forward declarations if at all possible
00041 
00042 // ### Class Declarations ###
00043 
00044 typedef unsigned int PositionPacked;
00045 typedef unsigned int SeqStartPos;
00046 
00047 typedef pair<PositionPacked, int> HitPacked;
00048 struct PackedHitStore : public vector<HitPacked>
00049 {
00050   void addHit( PositionPacked pos, int baseOffset )
00051   { 
00052     push_back(HitPacked(pos,baseOffset)); 
00053   }
00054 
00055 };
00056 static const HitPacked zeroHit = HitPacked(0,0);
00057 
00058 
00059 // Class Name : RadixSorter
00060 // Description: Sorts a vector of HitPacked by the first element of each
00061 // entry using a radix sorting method.
00062 
00063 class RadixSorter
00064 {
00065 // capacity of CountInt determines the max number of elements that 
00066 // can be sorted
00067 typedef unsigned int CountInt;
00068  
00069 public:
00070   RadixSorter( const unsigned int digits, const unsigned int bits );
00071   void operator()( vector<HitPacked>& v );
00072   void CountDigits( const vector<HitPacked>& v );
00073   void SortByDigit( PositionPacked digit );
00074 
00075 protected:
00076   const unsigned int digits_;
00077   const unsigned int bits_;
00078   const unsigned long base_;
00079   const PositionPacked mask_;
00080   vector<HitPacked> v1_;
00081   vector< vector<CountInt> > counts_; 
00082   vector<HitPacked>* source_;
00083   vector<HitPacked>* target_;
00084   vector< vector<HitPacked>::iterator > places_;
00085 
00086 }; // ~class RadixSorter
00087 
00088 class LessThanDiff
00089 {
00090   public:
00091   bool operator()( const HitInfo& lhs, const HitInfo& rhs ) const
00092   {
00093     return ( lhs.diff < rhs.diff ); 
00094   }
00095 };
00096 
00097 
00098 void generateSubstitutesDNA
00099 (Word w, vector<Word>& subs, int wordLength);
00100 
00101 void generateSubstitutesProtein
00102 (Word w, vector<Word>& subs, int wordLength);
00103 
00104 // 0  *
00105 // 1  A 
00106 // 2  C
00107 // 3  D
00108 // 4  E
00109 // 5  F
00110 // 6  G
00111 // 7  H
00112 // 8  I
00113 // 9  K
00114 // 10 L
00115 // 11 M
00116 // 12 N
00117 // 13 P
00118 // 14 Q
00119 // 15 R
00120 // 16 S
00121 // 17 T
00122 // 18 V
00123 // 19 W
00124 // 20 X
00125 // 21 Y
00126 
00127 const Word subVals[] =
00128 {
00129               // *
00130   16,         // AS: 1
00131               // BD: 4 BE: 1 BN: 3 BZ: 1 - not used 
00132               // C
00133   4,  12,     // DB: 4 DE: 2 DN: 1 DZ: 1 - B,Z not used
00134   3,  9, 14,  // EB: 1 ED: 2 EK: 1 EQ: 2 EZ: 4 - B,Z not used
00135   19, 21,     // FW: 1 FY: 3
00136               // G 
00137   12, 21,     // HN: 1 HY: 2
00138   10, 11, 18, // IL: 2 IM: 1 IV: 3
00139   4,  14, 15, // KE: 1 KQ: 1 KR: 2 KZ: 1 -  Z not used
00140   8,  11, 18, // LI: 2 LM: 2 LV: 1
00141   8,  10, 18, // MI: 1 ML: 2 MV: 1
00142   3,  7,  16, // NB: 3 ND: 1 NH: 1 NS: 1 - B not used
00143               // P 
00144   4,  9,  15, // QE: 2 QK: 1 QR: 1 QZ: 3 - Z not used
00145   9,  14,     // RK: 2 RQ: 1
00146   1,  12, 17, // SA: 1 SN: 1 ST: 1
00147   16,         // TS: 1
00148   8,  10, 11, // VI: 3 VL: 1 VM: 1
00149   5,  21,     // WF: 1 WY: 2
00150   5,  7,  19, // YF: 3 YH: 2 YW: 2
00151              //  ZB: 1 ZD: 1 ZE: 4 ZK: 1 ZQ: 3 - Z not used
00152   99999
00153 };
00154 
00155 const int subStarts[] =
00156 {
00157 0,  // 0  *
00158 0,  // 1  A 
00159 1,  // 2  C
00160 1,  // 3  D
00161 3,  // 4  E
00162 6,  // 5  F
00163 8,  // 6  G
00164 8,  // 7  H
00165 10, // 8  I
00166 13, // 9  K
00167 16, // 10 L
00168 19, // 11 M
00169 22, // 12 N
00170 25, // 13 P
00171 25, // 14 Q
00172 28, // 15 R
00173 30, // 16 S
00174 33, // 17 T
00175 34, // 18 V
00176 37, // 19 W
00177 39, // 20 X
00178 39, // 21 Y
00179 42
00180 };
00181 
00182 
00183 #ifdef POSITIVE_SCORING_BLOSUM_SUBS 
00184 Original table: generated from BLOSUM62 table in MatchStore.h
00185 AS: 1
00186 BD: 4 BE: 1 BN: 3 BZ: 1 
00187 DB: 4 DE: 2 DN: 1 DZ: 1
00188 EB: 1 ED: 2 EK: 1 EQ: 2 EZ: 4
00189 FW: 1 FY: 3
00190 HN: 1 HY: 2
00191 IL: 2 IM: 1 IV: 3
00192 KE: 1 KQ: 1 KR: 2 KZ: 1
00193 LI: 2 LM: 2 LV: 1
00194 MI: 1 ML: 2 MV: 1
00195 NB: 3 ND: 1 NH: 1 NS: 1
00196 QE: 2 QK: 1 QR: 1 QZ: 3
00197 RK: 2 RQ: 1
00198 SA: 1 SN: 1 ST: 1
00199 TS: 1
00200 VI: 3 VL: 1 VM: 1
00201 WF: 1 WY: 2
00202 YF: 3 YH: 2 YW: 2
00203 ZB: 1 ZD: 1 ZE: 4 ZK: 1 ZQ: 3
00204 #endif
00205 
00206 
00207 
00208 
00209 
00210 
00211 
00212 
00213 class HashTablePacked : 
00214 public HashTableView<PositionPacked,HashTablePacked>
00215 {
00216   friend class HashTableTranslated;
00217  public:
00218   static AllocatorLocal<PositionInHitList> defaultArrayAllocator;
00219   static AllocatorLocal<PositionPacked> defaultHitListAllocator;
00220 
00221   typedef void (HashTablePacked::* MatchSequencePointer)
00222     (WordSequence&, HitList&);
00223 
00224   typedef void (HashTablePacked::* MatchWordPointer)
00225     (Word, PackedHitStore&, int);
00226 
00227   typedef void (* GenerateSubstitutesPointer)
00228     (Word, vector<Word>&, int);
00229 
00230   HashTablePacked( ostream& monitoringStream=cerr,
00231                    string name="",
00232                    Allocator<PositionPacked>& hitListAllocator 
00233                    = defaultHitListAllocator,
00234                    Allocator<PositionInHitList>& arrayAllocator 
00235                    = defaultArrayAllocator ):
00236     HashTableView<PositionPacked,HashTablePacked>
00237     (monitoringStream, name, hitListAllocator, arrayAllocator),
00238     wordNum_(0),
00239     pMatchSequence_(&HashTablePacked::matchSequenceStandard),
00240     pMatchWord_(&HashTablePacked::matchWordStandard),
00241     pGenerateSubstitutes_(&generateSubstitutesDNA),
00242     numRepeats_(0),
00243     substituteThreshold_(0),
00244     sorter_(4,(sizeof(PositionPacked)*8)/4)
00245     {
00246       hitListFormat_ = g32BitPacked;
00247       seqStarts_.push_back(0);
00248       monitoringStream_ << "constructing HashTablePacked\n";
00249       //      cout << pArrayAllocator_->name_ << " " 
00250       //           << pHitListAllocator_->name_ << endl; // %%%%
00251 
00252     }
00253 
00254   //  HashTablePacked( ostream& monitoringStream=cerr) :
00255   //   HashTableView<PositionPacked,HashTablePacked>(monitoringStream),
00256   //   wordNum_(0){}
00257 
00258   inline static SequenceNumber getSequence( const_iterator i );
00259   inline static SequenceOffset getOffset( const_iterator i );
00260 
00261 
00262   virtual void hashWords
00263   ( SequenceAdapter& thisSeq, SequenceNumber seqNum );
00264   virtual void countWords( SequenceAdapter& thisSeq );
00265   
00266   virtual void matchSequence
00267   ( WordSequence& seq, HitList& hitListFwd )
00268     { (this->*pMatchSequence_)(seq,hitListFwd); }
00269 
00270 
00271   virtual void convertHits
00272   ( PackedHitStore& packedHits, HitList& hitListFwd );
00273 
00274   void matchWordDeluxe( Word w, PackedHitStore& hitList, int offset )
00275   {
00276     (this->*pMatchWord_)(w, hitList, offset);
00277   }
00278 
00279   // load and save need to be overloaded because seqStarts_ needs
00280   // to be loaded/saved as well
00281   virtual void loadHitList( unsigned long size );
00282 
00283   virtual void saveHitList( void );
00284 
00285   virtual void setNumRepeats(int numRepeats);
00286   virtual void setSubstituteThreshold( int ns );
00287 
00288   void matchSequenceStandard
00289     ( WordSequence& seq, HitList& hitListFwd );
00290 
00291   void matchSequenceRepeated
00292     ( WordSequence& seq, HitList& hitListFwd );
00293 
00294   void matchWordStandard( Word w, PackedHitStore& hitList, int offset )
00295   {
00296     HashTableView<PositionPacked,HashTablePacked>::matchWord
00297       ( w, hitList, offset );
00298   } // ~matchWordStandard( Word w, HitListVector hitList, int offset )
00299 
00300   void matchWordSubstitute( Word w, PackedHitStore& hitList, int offset );
00301 
00302  protected:
00303   vector<SeqStartPos> seqStarts_;
00304   unsigned int wordNum_;
00305   int numRepeats_;
00306   int substituteThreshold_;
00307   MatchSequencePointer pMatchSequence_;
00308   MatchWordPointer pMatchWord_;
00309   GenerateSubstitutesPointer pGenerateSubstitutes_;
00310 
00311   RadixSorter sorter_;
00312 
00313 }; // ~class HashTablePacked
00314 
00315 
00316 
00317 // ### Function Declarations ###
00318 
00319 // Name:
00320 // Arguments:
00321 // TYPE  NAME  IN/OUT COMMENT
00322 // Returns: TYPE COMMENT
00323 
00324 // End of include guard:
00325 #endif
00326 
00327 // End of file HashTablePacked.h

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2