00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifndef INCLUDED_GlobalDefinitions
00032 #define INCLUDED_GlobalDefinitions
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049 #define DEBUG_L3(X)
00050 #define DEBUG_L2(X)
00051 #define DEBUG_L1(X)
00052
00053 #ifdef EBUG_LEVEL3
00054 #undef DEBUG_L3
00055 #define DEBUG_L3(X) cout << "DE3: " << X << endl
00056 #define EBUG_LEVEL2
00057 #endif
00058
00059 #ifdef EBUG_LEVEL2
00060 #undef DEBUG_L2
00061 #define DEBUG_L2(X) cout << "DE2: " << X << endl
00062 #define EBUG_LEVEL1
00063 #endif
00064
00065 #ifdef EBUG_LEVEL1
00066 #undef DEBUG_L1
00067 #define DEBUG_L1(X) cout << "DE1: " << X << endl
00068 #endif
00069
00070
00071
00072 using namespace std;
00073 class HashTableGeneric;
00074 #include <unistd.h>
00075 #include <vector>
00076 #include <string>
00077 #include <iostream>
00078 #include <fstream>
00079 #include <map>
00080 #include <utility>
00081 #include <sys/types.h>
00082 #include <stdio.h>
00083 #include <sys/file.h>
00084 #include <sys/mman.h>
00085 #include <sys/stat.h>
00086 #include <errno.h>
00087
00088
00089
00090
00091 static const char gBaseNames []
00092 = "ACGT";
00093
00094 static const char gResidueNames[]
00095
00096 = "*ACDEFGHIKLMNPQRSTVWXY??????????";
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125 static const char gCodonNames[]
00126 = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
00127
00128 typedef unsigned int Word;
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144 static const Word gCursedWord = 1<<31;
00145
00146 typedef unsigned char uchar;
00147 typedef unsigned short ushort;
00148
00149
00150 enum
00151 {
00152 gBaseBits = 2,
00153 gResidueBits = 5,
00154 gBasesPerCodon = 3,
00155 gCodonBits = gBaseBits*gBasesPerCodon,
00156 gBitsPerWord = 8*sizeof(Word),
00157 gMaxBasesPerWord = gBitsPerWord / gBaseBits,
00158 gNumReadingFrames = 3,
00159 gNumDirections = 2,
00160 gNumCodonEncodings = 22
00161
00162
00163 };
00164
00165 enum SourceDataType
00166 {
00167 gDNAData = 0,
00168 gProteinData = 1,
00169 gUnknownData = 2
00170 };
00171
00172 enum HitListFormatType
00173 {
00174 gStandard = 0,
00175 g32BitPacked = 1,
00176 g32BitPackedProtein =2,
00177 gTranslated = 3,
00178 gHybrid = 4,
00179 gNotSpecified = 5
00180 };
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208 class WordSequence : public vector<Word>
00209 {
00210 public:
00211 WordSequence( void ) {}
00212
00213
00214 WordSequence( int numWords ):
00215 vector<Word>(numWords),
00216 numBasesInLast(0)
00217 {}
00218 WordSequence( const WordSequence& rhs ) :
00219 vector<Word>( rhs ),
00220 numBasesInLast( rhs.numBasesInLast )
00221 {}
00222 WordSequence& operator=( const WordSequence& rhs )
00223 {
00224 numBasesInLast = rhs.numBasesInLast;
00225 vector<Word>::operator=(rhs);
00226 return *this;
00227 }
00228
00229 void addWord( Word inputWord ) { push_back( inputWord ); }
00230 void removeLastWord( void ) { pop_back(); }
00231 void setMaxWords( int maxNumWords ) { reserve( maxNumWords ); }
00232 int getMaxWords( void ) const { return capacity(); }
00233 int getNumWords( void ) const { return size(); }
00234 Word getWord( int wordNumber ) const { return ((*this)[wordNumber]); }
00235 int getNumBasesInLast( void ) const { return numBasesInLast; }
00236 void setNumBasesInLast( int nbil ) { numBasesInLast=nbil; }
00237
00238
00239
00240 protected:
00241 int numBasesInLast;
00242 };
00243
00244 typedef WordSequence::iterator WordSequenceIterator;
00245 typedef WordSequence::reverse_iterator WordSequenceReverseIterator;
00246
00247
00248
00249
00250 typedef unsigned int SequenceNumber;
00251 typedef int SequenceOffset;
00252
00253 struct PositionInDatabase
00254 {
00255 PositionInDatabase( SequenceNumber sequence_, SequenceOffset offset_ ) :
00256 sequence(sequence_), offset(offset_) {}
00257 PositionInDatabase(void) {}
00258 SequenceNumber sequence;
00259
00260
00261 SequenceOffset offset;
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273 bool operator<( const PositionInDatabase& pos) const
00274 {
00275 return ( ( sequence < pos.sequence )
00276 || ( ( sequence == pos.sequence )
00277 && ( offset < pos.offset ) ) );
00278 }
00279
00280 bool operator==( const PositionInDatabase& pos) const
00281 {
00282 return ( ( sequence == pos.sequence )
00283 && ( offset == pos.offset ) );
00284 }
00285
00286 };
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296 #ifdef USING_TEMPLATES_INSTEAD
00297 class HitList
00298 {
00299 public:
00300 virtual void addHit
00301 ( const PositionInDatabase& hitPos,
00302 const SequenceOffset& queryPos ) = 0;
00303 virtual void addHit( SequenceNumber sequence,
00304 SequenceOffset offset,
00305 SequenceOffset queryPos )
00306 {
00307
00308
00309 addHit( PositionInDatabase(sequence,offset),queryPos);
00310 }
00311 };
00312 #endif
00313
00314
00315
00316
00317 struct HitInfo
00318 {
00319
00320 SequenceNumber subjectNum;
00321
00322
00323
00324 SequenceOffset diff;
00325
00326
00327 SequenceOffset queryPos;
00328
00329
00330 HitInfo( PositionInDatabase inputHitPos, SequenceOffset inputQueryPos ) :
00331 subjectNum( inputHitPos.sequence ),
00332 diff( inputHitPos.offset - inputQueryPos ),
00333 queryPos( inputQueryPos + 1 ) {}
00334
00335
00336 HitInfo( SequenceNumber inputSubjectNum,
00337 SequenceOffset inputSubjectPos,
00338 SequenceOffset inputQueryPos ) :
00339 subjectNum( inputSubjectNum ),
00340 diff( inputSubjectPos - inputQueryPos ),
00341 queryPos( inputQueryPos + 1 ) {}
00342
00343
00344
00345 bool operator<( const HitInfo& hit) const
00346 {
00347 return ( ( subjectNum < hit.subjectNum )
00348 || ( ( subjectNum == hit.subjectNum )
00349 && ( ( diff < hit.diff )
00350 || ( ( diff == hit.diff )
00351 && ( queryPos < hit.queryPos ) ) ) ) );
00352 }
00353
00354 bool operator==( const HitInfo& hit) const
00355 {
00356 return ( ( subjectNum == hit.subjectNum )
00357 && ( diff == hit.diff )
00358 && ( queryPos == hit.queryPos ) );
00359 }
00360
00361 };
00362
00363
00364
00365
00366
00367 class HitListVector :
00368 public vector<HitInfo>
00369 {
00370 public:
00371 void addHit( const PositionInDatabase& hitPos,
00372 const SequenceOffset& queryPos )
00373 {
00374 push_back( HitInfo( hitPos,queryPos ) );
00375 }
00376 void addHit( SequenceNumber sequence,
00377 SequenceOffset offset,
00378 SequenceOffset queryPos )
00379 {
00380 push_back( HitInfo(sequence, offset, queryPos ) );
00381
00382 }
00383
00384 };
00385
00386 typedef HitListVector HitList;
00387
00388
00389
00390
00391
00392 class MachineInfo
00393 {
00394 typedef unsigned long Offset;
00395 public:
00396 static MachineInfo* getInfo( void )
00397 {
00398 if ( info_ == 0 ) info_ = new MachineInfo;
00399 return info_;
00400 }
00401
00402 int getNumBits( void ) { return numBits_; }
00403 bool getIsLeastSigByteFirst( void ) { return isLeastSigByteFirst_; }
00404 void report( ostream& os )
00405 {
00406 os << "This is a " << numBits_ << " bit machine with "
00407 << ( (isLeastSigByteFirst_) ? "least" : "most" )
00408 << "-significant-byte-first ordering" << endl;
00409 }
00410
00411 friend ostream& operator<<( ostream& os, MachineInfo& myInfo )
00412 {
00413 myInfo.report(os);
00414 return os;
00415 }
00416
00417 protected:
00418 MachineInfo( void )
00419 {
00420
00421 numBits_ = 8 * sizeof(Offset);
00422
00423 Offset checkByteOrder = 1;
00424 Offset* offsetAddress = &checkByteOrder;
00425 char* firstByte = ( (char*) offsetAddress );
00426
00427 offsetAddress++;
00428 char* lastByte = (char*) offsetAddress;
00429 lastByte--;
00430 isLeastSigByteFirst_ = ( *firstByte == 1 );
00431
00432 }
00433
00434 private:
00435 static MachineInfo* info_;
00436 int numBits_;
00437 bool isLeastSigByteFirst_;
00438 };
00439
00440 class SSAHAException : public std::exception
00441 {
00442 public:
00443 SSAHAException( const string& message = "" ) : message_( message ) {}
00444 virtual ~SSAHAException() throw() {}
00445 virtual const char* what() const throw()
00446 {
00447 return message_.c_str();
00448 }
00449 protected:
00450 const string message_;
00451
00452 };
00453
00454
00455
00456
00457 class ofstreamSSAHA : public std::ofstream
00458 {
00459 public:
00460 static const unsigned long maxChunkSize = ( ((unsigned long)1<<31)-1 );
00461
00462
00463
00464 ofstreamSSAHA
00465 ( const char* fileName, unsigned long chunkSize = maxChunkSize )
00466 : std::ofstream( fileName ), chunkSize_( chunkSize ) {}
00467 ostream& write( const char* p, unsigned long s )
00468 {
00469 while ( s > maxChunkSize )
00470 {
00471 std::ofstream::write(p,maxChunkSize);
00472 p+=maxChunkSize;
00473 s-=maxChunkSize;
00474 }
00475 std::ofstream::write(p,s);
00476 return *this;
00477 }
00478
00479
00480 protected:
00481 unsigned long chunkSize_;
00482
00483 };
00484
00485
00486
00487
00488 class ifstreamSSAHA : public std::ifstream
00489 {
00490 static const unsigned long maxChunkSize = ( ((unsigned long)1<<31)-1 );
00491
00492
00493
00494 public:
00495 ifstreamSSAHA
00496 ( const char* fileName, unsigned long chunkSize = maxChunkSize )
00497 : std::ifstream( fileName ), chunkSize_( chunkSize ) {}
00498
00499 istream& read( const char* p, unsigned long s )
00500 {
00501 bytesSoFar_=0;
00502 while ( s > maxChunkSize )
00503 {
00504 std::ifstream::read((char*)p,maxChunkSize);
00505 p+=maxChunkSize;
00506 s-=maxChunkSize;
00507 bytesSoFar_+=std::ifstream::gcount();
00508 }
00509 std::ifstream::read((char*)p,s);
00510 bytesSoFar_+=std::ifstream::gcount();
00511
00512 return *this;
00513 }
00514
00515 unsigned long gcount( void ) const
00516 { return bytesSoFar_; }
00517
00518
00519 protected:
00520 unsigned long chunkSize_;
00521 unsigned long bytesSoFar_;
00522
00523 };
00524
00525
00526
00527
00528 class NullBuffer : public std::streambuf
00529 {
00530 public:
00531 NullBuffer() {}
00532 };
00533
00534
00535
00536 char* getTimeNow(void);
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558 class PrintFromWord
00559 {
00560 public:
00561 PrintFromWord( const Word word, int wordLength, int bitsPerSymbol,
00562 const char* tt )
00563 : pWordSeq_(0), word_( word ), length_( wordLength ),
00564 bitsPerSymbol_( bitsPerSymbol ), tt_(tt),
00565 mask_( (1 << bitsPerSymbol) - 1 )
00566 {}
00567
00568 PrintFromWord( const Word word, const HashTableGeneric& hashTable,
00569 int bitsPerSymbol, const char* tt );
00570
00571 PrintFromWord( const WordSequence& wordSeq, int wordLength,
00572 int bitsPerSymbol, const char* tt )
00573 : pWordSeq_( &wordSeq ), length_( wordLength ),
00574 bitsPerSymbol_( bitsPerSymbol ), tt_( tt ),
00575 mask_( (1 << bitsPerSymbol) - 1 )
00576 {}
00577
00578 PrintFromWord( const WordSequence& wordSeq, const HashTableGeneric& hashTable,
00579 int bitsPerSymbol, const char* tt );
00580
00581
00582 void doPrint( ostream& os );
00583
00584 void doWord( ostream& os, const Word word )
00585 { doWord(os, word, length_ ); }
00586 void doWord( ostream& os, const Word word, int length );
00587
00588 friend ostream& operator<<( ostream& os, PrintFromWord pw)
00589 {
00590 pw.doPrint(os);
00591 return os;
00592 }
00593 protected:
00594 const WordSequence* pWordSeq_;
00595 Word word_;
00596 int length_;
00597 int bitsPerSymbol_;
00598 const char* tt_;
00599 Word mask_;
00600
00601 };
00602
00603 class printBase : public PrintFromWord
00604 {
00605 public:
00606 printBase( const Word word, int wordLength ) :
00607 PrintFromWord( word, wordLength, gBaseBits, gBaseNames ) {}
00608 printBase( const Word word, const HashTableGeneric& hashTable ) :
00609 PrintFromWord( word, hashTable, gBaseBits, gBaseNames ) {}
00610 printBase( const WordSequence& wordSeq, int wordLength ) :
00611 PrintFromWord( wordSeq, wordLength, gBaseBits, gBaseNames ) {}
00612 printBase( const WordSequence& wordSeq, const HashTableGeneric& hashTable ) :
00613 PrintFromWord( wordSeq, hashTable, gBaseBits, gBaseNames ) {}
00614 };
00615
00616 class printResidue : public PrintFromWord
00617 {
00618 public:
00619 printResidue( const Word word, int wordLength ) :
00620 PrintFromWord( word, wordLength, gResidueBits, gResidueNames ) {}
00621 printResidue( const Word word, const HashTableGeneric& hashTable ) :
00622 PrintFromWord( word, hashTable, gResidueBits, gResidueNames ) {}
00623 printResidue( const WordSequence& wordSeq, int wordLength ) :
00624 PrintFromWord( wordSeq, wordLength, gResidueBits, gResidueNames ) {}
00625 printResidue( const WordSequence& wordSeq,
00626 const HashTableGeneric& hashTable ) :
00627 PrintFromWord( wordSeq, hashTable, gResidueBits, gResidueNames ) {}
00628 };
00629
00630 typedef printBase printWord;
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640 class MakeIntoWord
00641 {
00642 public:
00643 MakeIntoWord( int bitsPerSymbol, const char* tt );
00644
00645 Word operator()( const string& s );
00646
00647 protected:
00648 int bitsPerSymbol_;
00649 const char* tt_;
00650 map< char, Word > map_;
00651 };
00652
00653 static MakeIntoWord makeBase( gBaseBits, gBaseNames );
00654
00655 static MakeIntoWord makeWord( gBaseBits, gBaseNames );
00656 static MakeIntoWord makeResidue( gResidueBits, gResidueNames );
00657
00658
00659
00660
00661
00662 Word reverseComplement( Word word, int wordLength );
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680 void reverseComplement
00681 ( const WordSequence& seq,
00682 WordSequence& revComp,
00683 int wordLength );
00684
00685
00686
00687
00688
00689
00690 void shiftSequence
00691 ( WordSequence& sequence, int bitsPerSymbol, int wordLength, int i = 1 );
00692
00693
00694
00695
00696 inline void shiftSequenceDNA
00697 ( WordSequence& sequence, int wordLength, int i = 1 )
00698 {
00699 shiftSequence( sequence, gBaseBits, wordLength, i );
00700 }
00701
00702
00703
00704
00705 inline void shiftSequenceProtein
00706 ( WordSequence& sequence, int wordLength, int i = 1 )
00707 {
00708 shiftSequence( sequence, gResidueBits, wordLength, i );
00709 }
00710
00711 void loadFromFile
00712 ( const string& fileName, const char* buffer, const unsigned long numBytes,
00713 ostream& monitoringStream_=cerr );
00714
00715 void saveToFile
00716 ( const string& fileName, const char* buffer, const unsigned long numBytes,
00717 ostream& monitoringStream_=cerr );
00718
00719
00720
00721
00722
00723
00724
00725
00726 template <typename T> class Allocator
00727 {
00728 public:
00729 typedef T MyType;
00730
00731 Allocator( T** ptr, const string& name, ostream& monStream=cerr ) :
00732 ptr_(ptr), name_(name), size_(0), isAllocated_(false),
00733 monStream_( monStream )
00734 {}
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744 virtual Allocator<T>* clone( T** ptr,
00745 const string& name,
00746 ostream& monStream )=0;
00747
00748 virtual void allocate( unsigned long size ) =0;
00749 virtual void allocateAndZero( unsigned long size ) =0;
00750 virtual void load( unsigned long size ) =0;
00751 virtual void save() =0;
00752 virtual void deallocate() =0;
00753
00754
00755 virtual ~Allocator() {}
00756
00757 T** ptr_;
00758 string name_;
00759 unsigned long size_;
00760 bool isAllocated_;
00761 ostream& monStream_;
00762
00763 };
00764
00765
00766
00767
00768 template <typename T> class AllocatorLocal : public Allocator<T>
00769 {
00770 public:
00771
00772 AllocatorLocal( T** ptr=NULL,
00773 const string& name = "",
00774 ostream& monStream=cerr ) :
00775 Allocator<T>(ptr,name,monStream)
00776 {}
00777
00778 virtual Allocator<T>* clone
00779 ( T** ptr, const string& name, ostream& monStream=cerr )
00780 {
00781 return new AllocatorLocal<T>(ptr,name,monStream);
00782 }
00783
00784 virtual ~AllocatorLocal()
00785 {
00786 deallocate();
00787 }
00788
00789 virtual void allocate( unsigned long size )
00790 {
00791 size_=size;
00792 (*ptr_)=new T[size_];
00793 isAllocated_=true;
00794 }
00795
00796 virtual void allocateAndZero( unsigned long size )
00797 {
00798 const unsigned char zero(0);
00799 allocate(size);
00800 memset( (void*)(*ptr_), zero, size_*sizeof(MyType) );
00801 }
00802 virtual void load( unsigned long size )
00803 {
00804 allocate(size);
00805 loadFromFile( name_, (char*)(*ptr_), size_*sizeof(MyType), monStream_ );
00806 }
00807 virtual void save()
00808 {
00809 saveToFile( name_, (char*)(*ptr_), size_*sizeof(MyType), monStream_ );
00810 }
00811
00812 virtual void deallocate()
00813 {
00814 if (!isAllocated_) return;
00815 delete [] (*ptr_);
00816 isAllocated_=false;
00817 }
00818 protected:
00819 };
00820
00821
00822 #ifdef USING_POSIX_MEMORY_MAPPING
00823
00824 class MemoryMapper
00825 {
00826 public:
00827 MemoryMapper() {}
00828
00829 struct Mode
00830 {
00831 Mode( int shared_, int access_, int flags_, bool delete_, bool truncate_ ):
00832 shared(shared_),
00833 access(access_),
00834 flags(flags_),
00835 deleteFileOnExit(delete_),
00836 truncateFile(truncate_)
00837 {}
00838 int shared;
00839 int access;
00840 int flags;
00841 bool deleteFileOnExit;
00842 bool truncateFile;
00843 };
00844
00845 static const Mode createMap;
00846 static const Mode readMap;
00847
00848 void* linkToMap
00849 ( const Mode& mode, const string& name, unsigned long numBytes );
00850 protected:
00851
00852 int fileDesc_;
00853
00854
00855 };
00856
00857
00858
00859
00860
00861
00862
00863
00864
00865
00866
00867 template <typename T> class AllocatorMapped :
00868 public Allocator<T>, private MemoryMapper
00869 {
00870 public:
00871
00872 AllocatorMapped( T** ptr=NULL, const string& name = "" ) :
00873 Allocator<T>(ptr,name), MemoryMapper(), mode_(createMap)
00874 {
00875 if ( (name_=="") || (name_[0]=='.') )
00876 {
00877 char buf[50];
00878 sprintf(buf,"SSAHAMapFile-%d",getpid());
00879 name_=(string)buf+name_;
00880 }
00881 }
00882
00883 virtual Allocator<T>* clone( T** ptr, const string& name )
00884 {
00885 return new AllocatorMapped<T>(ptr,name);
00886 }
00887
00888 virtual ~AllocatorMapped() { if (isAllocated_) deallocate(); }
00889
00890 virtual void allocate( unsigned long size )
00891 {
00892 if (isAllocated_) return;
00893 mode_ = MemoryMapper::createMap;
00894 size_ = size;
00895 (*ptr_) = (T*) linkToMap(mode_,name_,size_*sizeof(MyType));
00896 isAllocated_ = true;
00897 }
00898
00899 virtual void allocateAndZero( unsigned long size )
00900 {
00901 allocate(size);
00902 }
00903
00904
00905
00906 virtual void load( unsigned long size )
00907 {
00908 if (isAllocated_) return;
00909 mode_ = MemoryMapper::readMap;
00910 size_ = size;
00911 (*ptr_) = (T*) linkToMap(mode_,name_,size_*sizeof(MyType));
00912 isAllocated_ = true;
00913 }
00914
00915
00916 virtual void save()
00917 {
00918 mode_.deleteFileOnExit=false;
00919 }
00920
00921 virtual void deallocate()
00922 {
00923 if (!isAllocated_) return;
00924 if(munmap((caddr_t)(*ptr_), size_*sizeof(MyType)) < 0)
00925 perror("unmap error");
00926 close(fileDesc_);
00927 if (mode_.deleteFileOnExit) shm_unlink(name_.c_str());
00928 isAllocated_=false;
00929 }
00930
00931 protected:
00932 MemoryMapper::Mode mode_;
00933 };
00934
00935 #endif
00936
00937
00938
00939
00940
00941
00942
00943
00944 #endif
00945
00946
00947
00948