Global/GlobalDefinitions.h

Go to the documentation of this file.
00001 /*  Last edited: Apr 18 16:20 2002 (ac2) */
00002 
00003 // #######################################################################
00004 
00005 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00006 // Version 3.2, released 1st March 2004
00007 // Copyright (c) Genome Research 2002
00008 
00009 // SSAHA is free software; you can redistribute it and/or modify 
00010 // it under the terms of version 2 of the GNU General Public Licence
00011 // as published by the Free Software Foundation.
00012  
00013 // This program is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public Licence for more details.
00017  
00018 // You should have received a copy of the GNU General Public Licence
00019 // along with this program; if not, write to the Free Software
00020 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00021 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00022 
00023 // #######################################################################
00024 
00025 // Module Name  : GlobalDefinitions
00026 // File Name    : GlobalDefinitions.h
00027 // Language     : C++
00028 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00029 
00030 // Include guard:
00031 #ifndef INCLUDED_GlobalDefinitions
00032 #define INCLUDED_GlobalDefinitions
00033 
00034 // Description:
00035 
00036 // This module brings together various definitions that will be used throughout
00037 // the SSAHA library.
00038 
00039 // Macro definitions:
00040 
00041 // The following preprocessor directives work thus:
00042 // If compiled with 'g++ -DEBUG_LEVEL1 ...' any DEBUG_L1(...) commands
00043 // in the source are compiled.
00044 // If compiled with 'g++ -DEBUG_LEVEL2 ...' any DEBUG_L1(...) or DEBUG_L2(...)
00045 // commands in the source are compiled.
00046 // If compiled with 'g++ -DEBUG_LEVEL3 ...' any DEBUG_L1(...), DEBUG_L2(...)
00047 // or DEBUG_L3(...) commands in the source are compiled.
00048 
00049 #define DEBUG_L3(X)
00050 #define DEBUG_L2(X)
00051 #define DEBUG_L1(X)
00052 
00053 #ifdef EBUG_LEVEL3
00054 #undef  DEBUG_L3
00055 #define DEBUG_L3(X) cout << "DE3: " << X << endl
00056 #define EBUG_LEVEL2
00057 #endif
00058 
00059 #ifdef EBUG_LEVEL2
00060 #undef  DEBUG_L2
00061 #define DEBUG_L2(X) cout << "DE2: " << X << endl
00062 #define EBUG_LEVEL1
00063 #endif
00064 
00065 #ifdef EBUG_LEVEL1
00066 #undef  DEBUG_L1
00067 #define DEBUG_L1(X) cout << "DE1: " << X << endl
00068 #endif
00069 
00070 // Includes:
00071 
00072 using namespace std;
00073 class HashTableGeneric;
00074 #include <unistd.h>
00075 #include <vector>
00076 #include <string>
00077 #include <iostream>
00078 #include <fstream>
00079 #include <map>
00080 #include <utility>
00081 #include <sys/types.h>
00082 #include <stdio.h>
00083 #include <sys/file.h>
00084 #include <sys/mman.h>
00085 #include <sys/stat.h>
00086 #include <errno.h>
00087 
00088 // ### Class Declarations ###
00089 
00090 // All global variable names are preceded by g
00091 static const char gBaseNames [] 
00092 = "ACGT";
00093 
00094 static const char gResidueNames[] 
00095 //= "*ACDEFGHIKLMNPQRSTUVWY??????????"; // should probably remove U
00096   = "*ACDEFGHIKLMNPQRSTVWXY??????????"; 
00097 // changed 20.3.2 AJC 
00098 // coding for U removed, extra encoding for x added
00099 // 0  *
00100 // 1  A
00101 // 2  C
00102 // 3  D
00103 // 4  E
00104 // 5  F
00105 // 6  G
00106 // 7  H
00107 // 8  I
00108 // 9  K
00109 // 10 L
00110 // 11 M
00111 // 12 N
00112 // 13 P
00113 // 14 Q
00114 // 15 R
00115 // 16 S
00116 // 17 T
00117 // 18 V
00118 // 19 W
00119 // 20 X
00120 // 21 Y
00121 
00122 
00123 
00124 
00125 static const char gCodonNames[] 
00126 = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
00127 
00128 typedef unsigned int Word; 
00129 // All sequences of base pairs are split into equal size pieces of up to 16
00130 // base pairs, each of which is encoded within a Word as follows:
00131 // A=00, C=01, G=10, T=11 
00132 // The last base pair of a sequence takes the least significant 2 bits of the 
00133 // Word.
00134 // NB:
00135 // i) If the word length is not 16 then the most significant bits of the Word
00136 // will not be used.
00137 // ii) Knowledge of the word length is necessary to decode the Word
00138 // successfully.
00139 
00140 // MSB of a word may be set to indicate that it is `cursed', ie it is not
00141 // to be used for hashing (used to deal with words containing 'n's & '-'s
00142 // This limits maximum word length to 15 for DNA
00143 // TC 23.10.1
00144 static const Word gCursedWord = 1<<31;
00145 
00146 typedef unsigned char uchar;
00147 typedef unsigned short ushort;
00148 
00149 
00150 enum
00151 {
00152   gBaseBits = 2,
00153   gResidueBits = 5,
00154   gBasesPerCodon = 3,
00155   gCodonBits = gBaseBits*gBasesPerCodon,
00156   gBitsPerWord = 8*sizeof(Word),
00157   gMaxBasesPerWord = gBitsPerWord / gBaseBits,
00158   gNumReadingFrames = 3, 
00159   gNumDirections = 2,
00160   gNumCodonEncodings = 22 
00161   // = 20 amino acids + 1 stop codon + selenocysteine = 22
00162   // now 20 amino acids + stop codon + X - TC 27.3.2
00163 };
00164 
00165 enum SourceDataType
00166 {
00167   gDNAData = 0,
00168   gProteinData = 1,
00169   gUnknownData = 2
00170 };
00171 
00172 enum HitListFormatType
00173 {
00174   gStandard = 0,
00175   g32BitPacked = 1,
00176   g32BitPackedProtein =2,
00177   gTranslated = 3,
00178   gHybrid = 4,
00179   gNotSpecified = 5
00180 };
00181 
00182 
00183 
00184 // Class Name : WordSequence
00185 // Description: This class holds a sequence of base pairs, expressed as a 
00186 // sequence of Words
00187 
00188 // Example: for word length 5, sequence AGCTTGCCTGCCT would be encoded as
00189 // follows:
00190 // AGCTTGCCTGCCT   MSB < -------------------------------- > LSB
00191 // Word 1: AGCTT - xx xx xx xx xx xx xx xx xx xx xx 00 10 01 11 11 = 159
00192 // Word 2: GCCTG - xx xx xx xx xx xx xx xx xx xx xx 10 01 01 11 10 = 606
00193 // Word 3: CCT   - xx xx xx xx xx xx xx xx xx xx xx 01 01 11 00 00 = 368
00194 // xx = not used, set to zero
00195 
00196 // i) Need to either pass information that Word 3 contains only 3 base pairs
00197 // (otherwise it gets decoded as CCTAA) or discard word 3 altogether (not
00198 // a problem as in general we intend to deal only with much longer sequences
00199 // than this example).
00200 // NB:
00201 // Idea of interface is to save users from having to learn the initially 
00202 // somewhat daunting looking STL notation for manipulating vectors by hiding 
00203 // them behind commands that are more intuitive to this application. Making 
00204 // the definitions inline ensures little or no performance penalty for this.
00205 // At the moment STL commands can still be used directly if desired, but
00206 // using commands below will 'future proof' against implementation changes
00207 // (i.e. if vectors prove too slow).
00208 class WordSequence : public vector<Word>
00209 {
00210   public:
00211   WordSequence( void )                 {} 
00212   // Next constructor produces a WordSequence containing numWords
00213   // Words all set to zero.
00214   WordSequence( int numWords ):
00215     vector<Word>(numWords), 
00216     numBasesInLast(0) 
00217     {} 
00218   WordSequence( const WordSequence& rhs ) :
00219     vector<Word>( rhs ), 
00220     numBasesInLast( rhs.numBasesInLast )
00221     {}
00222   WordSequence& operator=( const WordSequence& rhs )
00223   {
00224     numBasesInLast = rhs.numBasesInLast;
00225     vector<Word>::operator=(rhs);
00226     return *this;
00227   }
00228 
00229   void addWord( Word inputWord )       { push_back( inputWord ); }
00230   void removeLastWord( void )          { pop_back(); }
00231   void setMaxWords( int maxNumWords )  { reserve( maxNumWords ); }
00232   int  getMaxWords( void )       const { return capacity(); }    
00233   int  getNumWords( void )       const { return size(); }    
00234   Word getWord( int wordNumber ) const { return ((*this)[wordNumber]);  }   
00235   int  getNumBasesInLast( void ) const { return numBasesInLast; }
00236   void setNumBasesInLast( int nbil )   { numBasesInLast=nbil; }
00237 
00238 
00239 
00240   protected:
00241   int numBasesInLast;
00242 };
00243 
00244 typedef WordSequence::iterator WordSequenceIterator;
00245 typedef WordSequence::reverse_iterator WordSequenceReverseIterator;
00246 
00247 // The position of any single occurrence of any sequence of base pairs in the 
00248 // subject sequence database may be held within an instance of the 
00249 // PositionInDatabase struct.
00250 typedef unsigned int SequenceNumber;
00251 typedef int          SequenceOffset;
00252 
00253 struct PositionInDatabase
00254 {
00255   PositionInDatabase( SequenceNumber sequence_, SequenceOffset offset_ ) :
00256     sequence(sequence_), offset(offset_) {}
00257   PositionInDatabase(void) {}
00258   SequenceNumber sequence;
00259   // The sequences in the subject database are numbered in the order they
00260   // are read in from the file, starting at 1
00261   SequenceOffset offset;
00262   // The position of a sequence of base pairs in the database 
00263   // is measured by the offset in base pairs from the first base pair of 
00264   // a subject sequence. We have to allow for the offset being negative, 
00265   // because if the end base pairs of a query sequence match the start of one 
00266   // of the subject sequences, then the position of the start of the query 
00267   // sequence (measured as an offset from the start of the subject sequence) 
00268   // will be negative. Hence offset is defined as 'int' not 'unsigned int.' 
00269   // Clear as mud? Good. 
00270 
00271   // We need to define the '<' operator if we are to make use of STL
00272   // sorting functions
00273   bool operator<( const PositionInDatabase& pos) const
00274   {
00275     return (    ( sequence < pos.sequence )
00276              || (    ( sequence == pos.sequence )
00277                   && ( offset < pos.offset )   )   );
00278   } // ~operator<
00279 
00280   bool operator==( const PositionInDatabase& pos) const
00281   {
00282     return (    ( sequence == pos.sequence )
00283              && ( offset   == pos.offset   ) );
00284   } // ~operator<
00285 
00286 };
00287 
00288 
00289 // HitList is an aggregation of PositionInDatabase instances. We may wish to 
00290 // try to speed things up by storing this data in esoteric ways, so this 
00291 // interface just encapsulates the only behaviour we need at this level
00292 // i.e. addHit so that HashTableGeneric and its subclasses can add hits, and
00293 // printHits so that QueryManager and its subclasses can print out the
00294 // results.
00295 
00296 #ifdef USING_TEMPLATES_INSTEAD
00297 class HitList
00298 {
00299  public:
00300   virtual void addHit
00301     ( const PositionInDatabase& hitPos, 
00302       const SequenceOffset& queryPos ) = 0;
00303   virtual void addHit( SequenceNumber sequence, 
00304                        SequenceOffset offset, 
00305                        SequenceOffset queryPos )
00306     { 
00307       //      cout << "addHit: " << sequence << " " << offset << " " 
00308       //   << queryPos << endl;
00309       addHit( PositionInDatabase(sequence,offset),queryPos); 
00310     } // ~addHit
00311 }; // ~class HitList
00312 #endif
00313 
00314 // Struct Name: HitInfo
00315 // Description: This contains all info for a 'hit', i.e. when a word from
00316 // the query sequence matches a word in one of the subject sequences. 
00317 struct HitInfo
00318 {
00319   // subjectNum: number of the sequence in the subject database
00320   SequenceNumber subjectNum;
00321   // diff: this is defined as 
00322   //   { position of matching word in subject sequence }
00323   // - { position of matching word in query sequence }
00324   SequenceOffset diff;
00325   // queryPos: this is defined as the position of the matching word in the
00326   // query sequence.
00327   SequenceOffset queryPos;
00328 
00329   // Simple constructor
00330   HitInfo( PositionInDatabase inputHitPos, SequenceOffset inputQueryPos ) :
00331   subjectNum( inputHitPos.sequence ),
00332   diff( inputHitPos.offset - inputQueryPos ),
00333   queryPos( inputQueryPos + 1 ) {}
00334 
00335   // Another simple constructor
00336   HitInfo( SequenceNumber inputSubjectNum, 
00337            SequenceOffset inputSubjectPos,
00338            SequenceOffset inputQueryPos ) :
00339   subjectNum( inputSubjectNum ),
00340   diff( inputSubjectPos - inputQueryPos ),
00341   queryPos( inputQueryPos + 1 ) {}
00342 
00343   // '<' operator must be defined to enable vectors of HitInfo instances
00344   // to be sorted
00345   bool operator<( const HitInfo& hit) const
00346   {
00347     return (    ( subjectNum < hit.subjectNum )
00348              || (    ( subjectNum == hit.subjectNum )
00349                   && (    ( diff < hit.diff )      
00350                        || (    ( diff == hit.diff )      
00351                             && ( queryPos < hit.queryPos )  )  )  )  );
00352   } // ~operator<
00353 
00354   bool operator==( const HitInfo& hit) const
00355   {
00356     return (    ( subjectNum == hit.subjectNum )
00357              && ( diff == hit.diff   ) 
00358              && ( queryPos == hit.queryPos )   );
00359   } // ~operator<
00360 
00361 };
00362 
00363 // Class Name : HitListVector
00364 // Description: This class is a basic storage class for database hit
00365 // information. Inherits interface from HitList and implementation
00366 // from the STL vector class.
00367 class HitListVector : 
00368 public vector<HitInfo>
00369 {
00370  public:
00371   void addHit( const PositionInDatabase& hitPos, 
00372                const SequenceOffset& queryPos ) 
00373   { 
00374     push_back( HitInfo( hitPos,queryPos ) ); 
00375   }
00376   void addHit( SequenceNumber sequence, 
00377                        SequenceOffset offset, 
00378                        SequenceOffset queryPos )
00379   { 
00380     push_back( HitInfo(sequence, offset, queryPos ) ); 
00381     //      addHit( PositionInDatabase(sequence,offset),queryPos); 
00382   } // ~addHit
00383 
00384 }; // ~HitListVector
00385 
00386 typedef HitListVector HitList;
00387 
00388 
00389 // Class Name :
00390 // Description: 
00391 
00392 class MachineInfo
00393 {
00394   typedef unsigned long Offset;
00395   public:
00396   static MachineInfo* getInfo( void )
00397   {
00398     if ( info_ == 0 ) info_ = new MachineInfo;
00399     return info_;
00400   } // ~getInfo
00401 
00402   int  getNumBits( void ) { return numBits_; }
00403   bool getIsLeastSigByteFirst( void ) { return isLeastSigByteFirst_; }
00404   void report( ostream& os )
00405   {
00406     os << "This is a " << numBits_ << " bit machine with "
00407        << ( (isLeastSigByteFirst_) ? "least" : "most" )
00408        << "-significant-byte-first ordering" << endl;
00409   } // ~report
00410 
00411   friend ostream& operator<<( ostream& os, MachineInfo& myInfo )
00412   {
00413     myInfo.report(os);
00414     return os;
00415   }
00416 
00417   protected:
00418   MachineInfo( void )
00419   {
00420 
00421   numBits_ = 8 * sizeof(Offset);
00422   
00423     Offset checkByteOrder = 1;
00424     Offset* offsetAddress = &checkByteOrder;
00425     char* firstByte = ( (char*) offsetAddress );
00426 
00427     offsetAddress++;
00428     char* lastByte =  (char*) offsetAddress;
00429     lastByte--;
00430     isLeastSigByteFirst_ = ( *firstByte == 1 );
00431 
00432   } // ~constructor
00433 
00434   private:
00435   static MachineInfo* info_;
00436   int numBits_;
00437   bool isLeastSigByteFirst_;
00438 }; 
00439 
00440 class SSAHAException : public std::exception
00441 {
00442 public:
00443   SSAHAException( const string& message = "" ) : message_( message ) {}
00444   virtual ~SSAHAException() throw() {}
00445   virtual const char* what() const throw() 
00446   {
00447     return message_.c_str();
00448   } // ~what
00449 protected:
00450   const string message_;
00451 
00452 }; // ~SSAHAException
00453 
00454 // Class Name:  ofstreamSSAHA
00455 // Description: overrides standard ofstream to allow more than
00456 // (2^31)-1 (approx 2Gb) bytes to be written at a time
00457 class ofstreamSSAHA : public std::ofstream
00458 {
00459 public:
00460   static const unsigned long maxChunkSize = ( ((unsigned long)1<<31)-1 ); 
00461   //  static const unsigned long maxChunkSize
00462   //  ( ((unsigned long)1<<31)-1 ); 
00463 
00464   ofstreamSSAHA
00465   ( const char* fileName, unsigned long chunkSize = maxChunkSize )  
00466     : std::ofstream( fileName ), chunkSize_( chunkSize ) {}
00467   ostream& write( const char* p, unsigned long s )
00468   {
00469     while ( s > maxChunkSize )
00470     {
00471       std::ofstream::write(p,maxChunkSize); 
00472       p+=maxChunkSize;
00473       s-=maxChunkSize;
00474     } // ~while
00475     std::ofstream::write(p,s);
00476     return *this;
00477   } // ~write
00478 
00479 
00480 protected:
00481   unsigned long chunkSize_;
00482 
00483 }; // ~class ofstreamSSAHA
00484 
00485 // Class Name:  ifstreamSSAHA
00486 // Description: overrides standard ifstream to allow more than
00487 // (2^31)-1 (approx 2Gb) bytes to be read at a time
00488 class ifstreamSSAHA : public std::ifstream
00489 {
00490   static const unsigned long maxChunkSize = ( ((unsigned long)1<<31)-1 ); 
00491   //  static const unsigned long maxChunkSize
00492   //  ( ((unsigned long)1<<31)-1 ); 
00493 
00494 public:
00495   ifstreamSSAHA
00496   ( const char* fileName, unsigned long chunkSize = maxChunkSize ) 
00497     : std::ifstream( fileName ), chunkSize_( chunkSize ) {}
00498 
00499   istream& read( const char* p, unsigned long s )
00500   {
00501     bytesSoFar_=0;
00502     while ( s > maxChunkSize )
00503     {
00504       std::ifstream::read((char*)p,maxChunkSize); 
00505       p+=maxChunkSize;
00506       s-=maxChunkSize;
00507       bytesSoFar_+=std::ifstream::gcount();
00508     } // ~while
00509     std::ifstream::read((char*)p,s);
00510     bytesSoFar_+=std::ifstream::gcount();
00511     //    cout << "!!!read " << bytesSoFar_ << endl; 
00512     return *this;
00513   } // ~read
00514 
00515   unsigned long gcount( void ) const
00516     { return bytesSoFar_; }
00517 
00518 
00519 protected:
00520   unsigned long chunkSize_;
00521   unsigned long bytesSoFar_;
00522 
00523 }; // ~class ifstreamSSAHA
00524 
00525 // Class Name:  NullBuffer
00526 // This can be used to suppress an output stream by doing
00527 // NullBuffer db; mystream.rdbuf(&db); 
00528 class NullBuffer : public std::streambuf
00529 {
00530 public:
00531   NullBuffer() {}
00532 };
00533 
00534 // Function Name: getTimeNow
00535 // returns a string detailing the current date & time
00536 char* getTimeNow(void);
00537 
00538 
00539 // Class Name : printWord
00540 // Description: The purpose of this class is to provide a convenient
00541 // way to convert Word and WordSequence data to ASCII base pair format
00542 // Example usage:
00543 // Word w = 228 ; // = binary 11100100 = ACGT
00544 // cout << w; // prints out '228'
00545 // cout << printWord(w,4); // prints out 'AGCT', 4 = word length
00546 // Alternatively, if w is used to access HashTableGeneric h (i.e. their word 
00547 // lengths are the same) can just do
00548 // cout << printWord(w,h); // printWord asks h for word length
00549 // Can also print out WordSequence (the base pairs in its words appear as one
00550 // long string) - need to pass in either word length or hash table, as above.
00551 // WordSequence W; // ... now populate using addWord
00552 // cout << printWord(W,10); // ... can also pass in word length directly, 
00553 //
00554 // Note that class name does not begin with a capital because it is 
00555 // mimicing a function 
00556 
00557 
00558 class PrintFromWord
00559 {
00560   public:
00561   PrintFromWord( const Word word, int wordLength, int bitsPerSymbol, 
00562                  const char* tt ) 
00563   : pWordSeq_(0), word_( word ), length_( wordLength ),
00564     bitsPerSymbol_( bitsPerSymbol ), tt_(tt), 
00565     mask_( (1 << bitsPerSymbol) - 1 ) 
00566   {}
00567 
00568   PrintFromWord( const Word word, const HashTableGeneric& hashTable, 
00569                  int bitsPerSymbol, const char* tt ); 
00570 
00571   PrintFromWord( const WordSequence& wordSeq, int wordLength, 
00572                  int bitsPerSymbol, const char* tt ) 
00573   : pWordSeq_( &wordSeq ), length_( wordLength ), 
00574     bitsPerSymbol_( bitsPerSymbol ), tt_( tt ),
00575    mask_( (1 << bitsPerSymbol) - 1 ) 
00576   {}
00577 
00578   PrintFromWord( const WordSequence& wordSeq, const HashTableGeneric& hashTable, 
00579                  int bitsPerSymbol, const char* tt ); 
00580 
00581 
00582   void doPrint( ostream& os );
00583 
00584   void doWord( ostream& os, const Word word )
00585     { doWord(os, word, length_ ); }
00586   void doWord( ostream& os, const Word word, int length );
00587 
00588   friend ostream& operator<<( ostream& os, PrintFromWord pw)
00589   {
00590     pw.doPrint(os);
00591     return os;
00592   } // ~operator<<
00593   protected:
00594   const WordSequence*  pWordSeq_;
00595   Word                 word_;
00596   int                  length_;
00597   int                  bitsPerSymbol_;
00598   const char*          tt_;
00599   Word                 mask_;
00600 
00601 }; // ~class PrintFromWord
00602 
00603 class printBase : public PrintFromWord
00604 {
00605  public:
00606   printBase( const Word word, int wordLength ) :
00607     PrintFromWord( word, wordLength, gBaseBits, gBaseNames ) {}
00608   printBase( const Word word, const HashTableGeneric& hashTable ) :
00609     PrintFromWord( word, hashTable, gBaseBits, gBaseNames ) {}
00610   printBase( const WordSequence& wordSeq, int wordLength ) :
00611     PrintFromWord( wordSeq, wordLength, gBaseBits, gBaseNames ) {}
00612   printBase( const WordSequence& wordSeq, const HashTableGeneric& hashTable ) :
00613     PrintFromWord( wordSeq, hashTable, gBaseBits, gBaseNames ) {}
00614 };
00615 
00616 class printResidue : public PrintFromWord
00617 {
00618  public:
00619   printResidue( const Word word, int wordLength ) :
00620     PrintFromWord( word, wordLength, gResidueBits, gResidueNames ) {}
00621   printResidue( const Word word, const HashTableGeneric& hashTable ) :
00622     PrintFromWord( word, hashTable, gResidueBits, gResidueNames ) {}
00623   printResidue( const WordSequence& wordSeq, int wordLength ) :
00624     PrintFromWord( wordSeq, wordLength, gResidueBits, gResidueNames ) {}
00625   printResidue( const WordSequence& wordSeq, 
00626                 const HashTableGeneric& hashTable ) :
00627     PrintFromWord( wordSeq, hashTable, gResidueBits, gResidueNames ) {}
00628 };
00629 
00630 typedef printBase printWord; // for backwards compatibility
00631 
00632 // ### Function Declarations ###
00633 
00634 // Name: makeWord
00635 // Arguments: const string&
00636 // Returns: Word
00637 // very basic function to encode a string into a Word
00638 //Word makeWord(const string& s);
00639 
00640 class MakeIntoWord
00641 {
00642  public:
00643   MakeIntoWord( int bitsPerSymbol, const char* tt );
00644 
00645   Word operator()( const string& s );
00646 
00647   protected:
00648   int bitsPerSymbol_;
00649   const char* tt_;
00650   map< char, Word > map_;
00651 };
00652 
00653 static MakeIntoWord makeBase( gBaseBits, gBaseNames );
00654 // next ensures backwards compatibility
00655 static MakeIntoWord makeWord( gBaseBits, gBaseNames ); 
00656 static MakeIntoWord makeResidue( gResidueBits, gResidueNames );
00657 
00658   // Function Name: reverseComplement
00659   // Arguments: Word (in)
00660   // Returns:   Word (out)
00661   // Returns the reverse complement of a Word
00662   Word reverseComplement( Word word, int wordLength );
00663 
00664   // Function Name: reverseComplement
00665   // Arguments: const WordSequence& (in), WordSequence& (out), int (in)
00666   // Returns:   void
00667   // This computes the reverse complement of seq and places it in revComp
00668   // Reason for parameter numBasesInLast is that if only numBasesInLast
00669   // (< wordLength_) valid base pairs are stored in the last element of
00670   // seq then obviously the last (wordLength_-numBasesInLast) base pairs of
00671   // seq are not valid. But we do not want our computed reverse complement 
00672   // to begin with the RC of these invalid pairs, so shiftSequence is called
00673   // to remove them.
00674 //  void reverseComplement
00675 //  ( const WordSequence& seq, 
00676 //    WordSequence& revComp, 
00677 //    int wordLength,
00678 //    int numBasesInLast );
00679 
00680   void reverseComplement
00681   ( const WordSequence& seq, 
00682     WordSequence& revComp,
00683     int wordLength );
00684 
00685 // Function Name: shiftSequence
00686 // Arguments: WordSequence& (in/out), int (in)
00687 // Returns:   void 
00688 // This shifts a WordSequence 'i places to the left', i.e. symbol i+1 of
00689 // the sequence becomes symbol 1. The first i symbols are lost. 
00690 void shiftSequence
00691 ( WordSequence& sequence, int bitsPerSymbol, int wordLength, int i = 1 );
00692 
00693 // Function Name: shiftSequence
00694 // This shifts a WordSequence 'i places to the left', i.e. base i+1 of
00695 // the sequence becomes base 1. The first i bases are lost. 
00696 inline void shiftSequenceDNA
00697 ( WordSequence& sequence, int wordLength, int i = 1 )
00698 { 
00699   shiftSequence( sequence, gBaseBits, wordLength, i ); 
00700 }
00701 
00702 // Function Name: shiftSequence
00703 // This shifts a WordSequence 'i places to the left', i.e. residue i+1 of
00704 // the sequence becomes residue 1. The first i residues are lost. 
00705 inline void shiftSequenceProtein
00706 ( WordSequence& sequence, int wordLength, int i = 1 )
00707 { 
00708   shiftSequence( sequence, gResidueBits, wordLength, i ); 
00709 }
00710 
00711 void loadFromFile
00712 ( const string& fileName, const char* buffer, const unsigned long numBytes,
00713 ostream& monitoringStream_=cerr );
00714 
00715 void saveToFile
00716 ( const string& fileName, const char* buffer, const unsigned long numBytes,
00717 ostream& monitoringStream_=cerr );
00718 
00719 
00720 // Class Name : Allocator
00721 // Description: An Allocator handles all dynamic memory allocation operations
00722 // on a pointer to memory
00723 // Idea is to enable large chunks of memory to be obtained either
00724 // from local memory or by memory mapping from the file system (and
00725 // for everything outside the Allocator to be oblivious to the difference!)
00726 template <typename T> class Allocator
00727 {
00728 public:
00729   typedef T MyType;
00730 
00731   Allocator( T** ptr, const string& name, ostream& monStream=cerr ) : 
00732     ptr_(ptr), name_(name), size_(0), isAllocated_(false), 
00733     monStream_( monStream )
00734     {}
00735 
00736   //  template< class T > void link( T** ptr, const string& name )
00737   //  {
00738   //    if (isLinked_) return; // can't relink once linked
00739   //    name_=name;
00740   //    ptr_=(char**)ptr;
00741   //    isLinked_=true;
00742   //  }
00743 
00744   virtual Allocator<T>* clone( T** ptr, 
00745                                const string& name, 
00746                                ostream& monStream )=0;
00747 
00748   virtual void allocate( unsigned long size ) =0;
00749   virtual void allocateAndZero( unsigned long size ) =0;
00750   virtual void load( unsigned long size ) =0;
00751   virtual void save() =0;
00752   virtual void deallocate() =0;
00753 
00754 
00755   virtual ~Allocator() {}
00756   //protected:
00757   T** ptr_;
00758   string name_;
00759   unsigned long size_;
00760   bool isAllocated_;
00761   ostream& monStream_;
00762   //  bool isLinked_;
00763 };
00764 
00765 // Class Name : AllocatorLocal
00766 // Description: Handles all dynamic memory allocation operations on a pointer
00767 // to memory. Memory is allocated from local memory (new & delete & all that)
00768 template <typename T> class AllocatorLocal : public Allocator<T>
00769 {
00770 public:
00771 
00772   AllocatorLocal( T** ptr=NULL, 
00773                   const string& name = "", 
00774                   ostream& monStream=cerr ) : 
00775   Allocator<T>(ptr,name,monStream) 
00776   {}
00777 
00778   virtual Allocator<T>* clone
00779     ( T** ptr, const string& name, ostream& monStream=cerr )
00780   {
00781     return new AllocatorLocal<T>(ptr,name,monStream);
00782   }
00783 
00784   virtual ~AllocatorLocal() 
00785   { 
00786     deallocate(); 
00787   }
00788  
00789   virtual void allocate( unsigned long size )
00790   {
00791     size_=size;
00792     (*ptr_)=new T[size_];
00793     isAllocated_=true;
00794   }
00795 
00796   virtual void allocateAndZero( unsigned long size ) 
00797   {
00798     const unsigned char zero(0);
00799     allocate(size);
00800     memset( (void*)(*ptr_), zero, size_*sizeof(MyType) );
00801   }
00802   virtual void load( unsigned long size )
00803   {
00804     allocate(size);
00805     loadFromFile( name_, (char*)(*ptr_), size_*sizeof(MyType), monStream_ );
00806   }
00807   virtual void save()
00808   {
00809     saveToFile( name_, (char*)(*ptr_), size_*sizeof(MyType), monStream_ );
00810   }
00811 
00812   virtual void deallocate()
00813   {
00814     if (!isAllocated_) return;
00815     delete [] (*ptr_);
00816     isAllocated_=false;
00817   }
00818 protected:
00819 };
00820 
00821 
00822 #ifdef USING_POSIX_MEMORY_MAPPING
00823 
00824 class MemoryMapper
00825 {
00826  public:
00827   MemoryMapper() {}
00828 
00829   struct Mode
00830   {
00831     Mode( int shared_, int access_, int flags_, bool delete_, bool truncate_ ):
00832       shared(shared_),
00833       access(access_),
00834       flags(flags_), 
00835       deleteFileOnExit(delete_),
00836       truncateFile(truncate_)
00837       {}
00838     int shared;
00839     int access;
00840     int flags;
00841     bool deleteFileOnExit;
00842     bool truncateFile;
00843   };
00844 
00845   static const Mode createMap;
00846   static const Mode readMap;
00847 
00848   void* linkToMap
00849     ( const Mode& mode, const string& name, unsigned long numBytes );
00850  protected:
00851 
00852   int fileDesc_;
00853 
00854 
00855 };
00856 
00857 // NB to get the mmap stuff working, need to change the makefile in
00858 // following ways:
00859 // 1. GlobalDefinitions.o from CFLAGS to CFLAGS_NOOPT
00860 // 2. SSAHAMain.o from CFLAGS to CFLAGS_NOOPT
00861 // 3. uncomment 'LIB = -lrt' line
00862 
00863 // Class Name : AllocatorMapped
00864 // Description: Handles all dynamic memory allocation operations on a 
00865 // pointer to memory. Memory is allocated using POSIX memory mapping.
00866 // Adapted from Compaq's guide to real time programming (on the web)
00867 template <typename T> class AllocatorMapped : 
00868 public Allocator<T>, private MemoryMapper
00869 {
00870 public:
00871 
00872   AllocatorMapped( T** ptr=NULL, const string& name = "" ) : 
00873     Allocator<T>(ptr,name), MemoryMapper(), mode_(createMap) 
00874   {
00875     if ( (name_=="") || (name_[0]=='.') )
00876     {
00877       char buf[50];
00878       sprintf(buf,"SSAHAMapFile-%d",getpid());
00879       name_=(string)buf+name_;
00880     }
00881   }
00882 
00883   virtual Allocator<T>* clone( T** ptr, const string& name )
00884   {
00885     return new AllocatorMapped<T>(ptr,name);
00886   }
00887 
00888   virtual ~AllocatorMapped() { if (isAllocated_) deallocate(); }
00889 
00890   virtual void allocate( unsigned long size )
00891   {
00892     if (isAllocated_) return;
00893     mode_ = MemoryMapper::createMap;
00894     size_ = size;
00895     (*ptr_) = (T*) linkToMap(mode_,name_,size_*sizeof(MyType));
00896     isAllocated_ = true;
00897   }
00898 
00899   virtual void allocateAndZero( unsigned long size )
00900   {
00901     allocate(size); // mapped memory is already zeroed
00902   }
00903 
00904   // because we are mapping the file into memory directly
00905   // don't need to load or save
00906   virtual void load( unsigned long size ) 
00907   {
00908     if (isAllocated_) return;
00909     mode_ = MemoryMapper::readMap;
00910     size_ = size;
00911     (*ptr_) = (T*) linkToMap(mode_,name_,size_*sizeof(MyType));
00912     isAllocated_ = true;
00913   }
00914 
00915 
00916   virtual void save() 
00917   {
00918     mode_.deleteFileOnExit=false;
00919   }
00920 
00921   virtual void deallocate()
00922   {
00923     if (!isAllocated_) return;
00924     if(munmap((caddr_t)(*ptr_), size_*sizeof(MyType)) < 0)
00925       perror("unmap error"); // don't throw - called from destructor!
00926     close(fileDesc_);
00927     if (mode_.deleteFileOnExit) shm_unlink(name_.c_str());
00928     isAllocated_=false;
00929   }
00930 
00931 protected:
00932   MemoryMapper::Mode mode_;
00933 };
00934 
00935 #endif
00936 
00937 
00938 // Name:
00939 // Arguments:
00940 // TYPE  NAME  IN/OUT COMMENT
00941 // Returns: TYPE COMMENT
00942 
00943 // End of include guard:
00944 #endif
00945 
00946 // End of file GlobalDefinitions.h
00947 
00948 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2