HashTable/testHashTableNoOverlap.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : testHashTable
00025 // File Name    : testHashTable.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "GlobalDefinitions.h"
00034 #include "HashTable.h"
00035 #include "HashTablePacked.h"
00036 #include "HashTableTranslated.h"
00037 #include "SequenceReaderString.h"
00038 #include "SequenceReaderFasta.h"
00039 #include "SequenceEncoder.h"
00040 #include "GenerateTestFastaFiles.h"
00041 #include "MatchStoreUngapped.h"
00042 #include "MatchStore.h"
00043 #include "QueryManager.h"
00044 #include <assert.h>
00045 #include <string>
00046 
00047 // ### Function Definitions ###
00048 
00049 // breaks encapsulation for debugging/test purposes
00050 class HashTableTranslatedPeek : public HashTableTranslated
00051 {
00052 public:
00053   HashTableTranslatedPeek( void ): HashTableTranslated(cout) {}
00054   HashTableComponent& peekFwd( void ) { return hashFwd_; }
00055   HashTableComponent& peekRev( void ) { return hashRev_; }
00056   
00057 };
00058 
00059 
00060 
00061 
00062 int main( void )
00063 {
00064 
00065   try {
00066 
00067   int numTests = 0;
00068 
00069   cout << "*******************************************" << endl << endl;
00070   cout << "     Test of class HashTable" << endl << endl;
00071   cout << "*******************************************" << endl << endl;
00072 
00073   int numSeqs = 10;
00074   int seqSize = 100;
00075   int wordLength = 10;
00076   int maxHits = 50;
00077 
00078   // Generate a random sequence of (numSeqs*seqSize) base pairs ...
00079   // 1128 is the seed value for the random number generator
00080   BaseGenerator testBases( numSeqs * seqSize, 1128 );
00081 
00082   // ... make a fasta file from this sequence, breaking the sequence into
00083   // sequences of seqSize base pairs, with no overlaps (i.e. numSeqs of them)
00084   // Haven't specified a file name, so will default to test_subject.fasta 
00085   testBases.generateSubjectFile(seqSize,0);
00086 
00087   SequenceReaderFasta testReader("test_subject.fasta",cout);
00088 
00089   // ---
00090 
00091   cout << "Test " << ++numTests <<": test of function getNextSequence" 
00092        << endl << endl;
00093 
00094   HashTable  testHash(cout,"test_save");
00095   HashTableFactory creator(cout);
00096 
00097 
00098   // check init flag is not set on construction ...
00099   assert(testHash.isInitialized() == false );
00100 
00101   creator.createHashTable(testHash,testReader,wordLength,maxHits);
00102 
00103   // ... but is set once the hash table has been set up
00104   assert(testHash.isInitialized() == true );
00105 
00106   testReader.rewind();
00107 
00108   HitListVector hits;
00109 
00110   WordSequence  seq;
00111 
00112   string name1, name2, name3;
00113 
00114   // The next bit takes sequence data from testReader and uses it to
00115   // query the hash table. The hash table was also created from testReader,
00116   // so we are checking that all sequence data 'finds itself' in the hash
00117   // table in the correct position
00118 
00119   for ( int i(1) ; i <= numSeqs ; i++ )
00120   { // for each sequence in testReader ...
00121     testReader.getNextSequence(seq,wordLength);
00122 
00123     // ... check that the name strings match 
00124     testReader.getLastSequenceName(name1);
00125     testHash.getSequenceName(name2,i);
00126     //    assert(testHash.getSequenceName(i)==
00127     //   testHash.getNameReader().getSequenceName(i));
00128     name3 = (string) testHash.getSequenceName(i);
00129     assert(name1 == name2 );
00130     assert(name2 == name3 );
00131     cout << seq.getNumBasesInLast() << "!!\n";
00132     // ... go through the Words in the sequence one by one and look
00133     //    for matches in the hash table
00134     for ( int j(0) ; j < (seqSize/wordLength) ; j++ )
00135     {
00136        cout << j << " " << printBase(seq[j],wordLength) << endl; 
00137       
00138        assert((seq[j]&gCursedWord)==0);
00139        testHash.matchWord(seq[j],hits,0);      
00140        // ... check we have exactly one hit for the word ...
00141        cout << hits.size() <<endl;
00142        assert(hits.size() == 1);
00143        //       cout << i << " " << j << " " 
00144        //          << hits[0].sequence << " " 
00145        //         << hits[0].offset << endl; 
00146        // ... and that the sequence number and offset for the hit are OK
00147 
00148        assert(hits[0].subjectNum == i );
00149        assert(hits[0].diff == j*wordLength );
00150        assert(hits[0].queryPos == 1);
00151  
00152        hits.clear();
00153  
00154     } // ~for j
00155 
00156     // Now try matchWord for the sequence as a whole
00157     testHash.matchWord(seq,hits); 
00158     cout << hits.size() << "!!!!!\n";
00159     assert( hits.size() == (seqSize/wordLength) );
00160     for ( int j(0) ; j < hits.size() ; j ++ )
00161     {
00162 
00163       assert(hits[j].subjectNum == i);
00164       assert(hits[j].diff   == 0);
00165       assert(hits[j].queryPos == 1 + ( j * wordLength ) );
00166      
00167     } // ~for j
00168 
00169     hits.clear();
00170     seq.clear();
00171   } // ~for i
00172 
00173 
00174   cout << "Test passed!" << endl << endl; 
00175 
00176   // ---
00177 
00178   cout << "Test " << ++numTests <<": test of save and load of hash table" 
00179        << endl << endl;
00180 
00181   // save hash table to files test_save.body, test_save.head, test_save.name
00182   //  testHash.saveHashTable("test_save");
00183   //  creator.saveHashTable(testHash,"test_save");
00184   creator.saveHashTable(testHash);
00185 
00186   // Create another hash table ...
00187   HashTable testLoad(cout,"test_save");
00188 
00189   // ... and initialize it from these saved files
00190   //  testLoad.loadHashTable("test_save");
00191   creator.loadHashTable(testLoad);
00192 
00193   // Check init flag has been set ...
00194   assert( testLoad.isInitialized() == true );
00195   
00196   // and that word length matches original
00197   assert( testLoad.getWordLength() == wordLength );
00198 
00199   testLoad.setMaxNumHits( testHash.getMaxNumHits() );
00200 
00201   assert( testHash.getMaxNumHits()==testLoad.getMaxNumHits() );
00202 
00203   HitListVector hitsOrig, hitsCopy;
00204 
00205   testReader.rewind();
00206 
00207   // Now we run all the sequences in testReader as queries on the old
00208   // and new hash tables, storing the resulting hits in hitsOrig and
00209   // hitsCopy respectively.
00210 
00211   for ( int i(0) ; i < numSeqs ; i++ )
00212   {
00213 
00214     // ... check that the name strings match 
00215     testHash.getSequenceName(name1,i+1);
00216     testLoad.getSequenceName(name2,i+1);
00217     assert(name1 == name2 );
00218     assert(name1 == (string)testHash.getSequenceName(i+1));
00219     assert(name1 == (string)testLoad.getSequenceName(i+1));
00220     // ... check that the sequence sizes are the same
00221 
00222     //    cout << i << " " << testHash.getSequenceSize(i+1) << " " 
00223     // << testLoad.getSequenceSize(i+1) << endl; // %%%%
00224     assert( testHash.getSequenceSize(i+1) == testLoad.getSequenceSize(i+1) );
00225 
00226     
00227 
00228     // ... add hit information to hitsOrig and hitsCopy
00229     seq.clear();
00230     testReader.getNextSequence(seq,wordLength);
00231     testHash.matchWord(seq,hitsOrig);
00232     testLoad.matchWord(seq,hitsCopy);
00233   } // ~for i
00234 
00235   
00236 
00237 
00238   // Note that neither testCopy or testOrig are cleared between queries,
00239   // so by this point they contain all the hit positions obtained for
00240   // all queries. We check that this info is identical for both sets.
00241   assert(hitsOrig.size() == hitsCopy.size() );
00242   assert(hitsOrig == hitsCopy );
00243 
00244   cout << "Test passed!" << endl << endl; 
00245 
00246   // ---
00247 
00248   cout << "Test " << ++numTests <<": further test of function matchWord" 
00249        << endl << endl;
00250 
00251   // Next few lines hash string testSeq into a hash table
00252   string testSeq = 
00253   "AGCTGGCTAGTCTAACGTGTCTGTAGATCGTACGTGCATGCATGGGCAGTCTCTGTATCACTGGGTGGCC";
00254 // 1234567890123456789012345678901234567890123456789012345678901234567890
00255    
00256   SequenceReaderString shiftReader(testSeq,cout);
00257 
00258   HashTable   shiftHash(cout);
00259   creator.createHashTable(shiftHash,shiftReader,wordLength,maxHits);
00260 
00261   for ( int i(0) ; i < wordLength ; i++ )
00262   {
00263     testSeq = testSeq.substr(1); // delete first character
00264     { // braces ensure a new instance is created each time round loop
00265       SequenceReaderString queryReader(testSeq,cout);
00266       seq.clear();
00267       hits.clear();
00268       queryReader.getNextSequence(seq,wordLength);
00269       shiftHash.matchWord(seq,hits);
00270 
00271       if (i==(wordLength-1))
00272         assert( hits.size() == seq.size() - 1 );
00273         else assert( hits.size() == 0 );
00274     }
00275   } // ~for i
00276 
00277   cout << "Test passed!" << endl << endl; 
00278 
00279   // ---
00280 
00281   cout << "Test " << ++numTests 
00282        <<": test that only limited number of hits are stored" 
00283        << endl << endl;
00284 
00285   string tenAs = "AAAAAAAAAA";
00286   testSeq = "";
00287   for ( int i(0) ; i < 50 ; i++ ) testSeq += tenAs;
00288 
00289   // testSeq now consists of 500 'A' characters. Next we hash this
00290   // sequence. With a word length of 10 it consists of 50 words
00291   // 'AAAAAAAAAA', but because we set maxNumHits to 20, only 20 should
00292   // be stored 
00293   SequenceReaderString reader(testSeq,cout);
00294   HashTable hash(cout);
00295   creator.createHashTable(hash,reader,10,20);
00296   assert(hash.getMaxNumHits()==20);
00297 
00298   hits.clear();
00299   // Word 'AAAAAAAAAA' is encoded as zero in our coding scheme
00300   Word allAs(0);
00301   // matchWord should only place 20 lots of hit data into hits ...
00302   hash.matchWord(allAs,hits);
00303   // ... check this
00304   assert(hits.size()==0);
00305   cout << hash.size(allAs) << endl;
00306   //  assert(hash.size(allAs)==50);
00307   hash.setMaxNumHits(50);
00308   assert(hash.getMaxNumHits()==50);
00309   hash.matchWord(allAs,hits);
00310   assert(hits.size()==50);
00311 
00312   cout << "Test passed!" << endl << endl; 
00313 
00314   // ---
00315 
00316   cout << "Test " << ++numTests 
00317        <<": test of tandem repeat masking\n\n"; 
00318 
00319   // Each sequence string in repeat has 90 bases. ith sequence repeats
00320   // every i bases.
00321   string repeats[] = 
00322   {
00323   "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG",
00324   "GCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCG",
00325   "AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA",
00326   "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
00327   "ATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGA"
00328   };
00329 
00330   for ( int j(1) ; j <= 5 ; ++j )
00331   {
00332 
00333     SequenceReaderString readerRepeats(repeats[j-1],cout);
00334     HashTable hashRepeats(cout);
00335     creator.createHashTable(hashRepeats,readerRepeats,10,20);
00336 
00337     readerRepeats.rewind();
00338 
00339     WordSequence seqRepeats;
00340 
00341     readerRepeats.getNextSequence( seqRepeats,10 );
00342 
00343     HitListVector hlFwd, hlRev;
00344 
00345     //    hlFwd.clear();
00346     //    hlRev.clear();
00347 
00348     cout << "hello\n";    
00349     
00350     hashRepeats.setNumRepeats(j);
00351     //    hashRepeats.matchSequence( seqRepeats, hlFwd, j );  
00352     hashRepeats.matchSequence( seqRepeats, hlFwd );  
00353 
00354     WordSequence rc;
00355 
00356     reverseComplement( seqRepeats, rc, 10 );
00357     
00358     //    hashRepeats.matchSequence( rc, hlRev, j );  
00359     hashRepeats.matchSequence( rc, hlRev );  
00360 
00361 
00362     cout << "Repeats masked: " << j 
00363          << "\nHits found in forward direction: " << hlFwd.size() 
00364          << "\nHits found in reverse direction: " << hlRev.size() << "\n";
00365 
00366     assert ( hlFwd.size() == 9 );
00367 
00368     if ( ( j % 2 ) == 0 ) assert( hlRev.size() == 9 );
00369     else assert( hlRev.size() == 0 );
00370 
00371 
00372       //    for ( HitListVector::iterator i( hlFwd.begin() ) ; i != hlFwd.end() ; ++i )
00373       //   cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00374 
00375       //    cout << "rc\n";
00376       //   for ( HitListVector::iterator i( hlRev.begin() ) ; i != hlRev.end() ; ++i )
00377       //   cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00378 
00379 
00380  
00381   } // ~for j
00382 
00383   for ( int j(1) ; j <= 5 ; ++j )
00384   {
00385 
00386     SequenceReaderString readerRepeats(repeats[j-1],cout);
00387     HashTablePacked hashRepeats(cout);
00388     creator.createHashTable(hashRepeats,readerRepeats,10,20);
00389 
00390     readerRepeats.rewind();
00391 
00392     WordSequence seqRepeats;
00393 
00394     readerRepeats.getNextSequence( seqRepeats,10 );
00395 
00396     HitListVector hlFwd, hlRev;
00397 
00398     //    hlFwd.clear();
00399     //    hlRev.clear();
00400 
00401     cout << "hello\n";    
00402     
00403     hashRepeats.setNumRepeats(j);
00404     //    hashRepeats.matchSequence( seqRepeats, hlFwd, j );  
00405     hashRepeats.matchSequence( seqRepeats, hlFwd );  
00406 
00407     WordSequence rc;
00408 
00409     reverseComplement( seqRepeats, rc, 10 );
00410     
00411     //    hashRepeats.matchSequence( rc, hlRev, j );  
00412     hashRepeats.matchSequence( rc, hlRev );  
00413 
00414 
00415     cout << "Repeats masked: " << j 
00416          << "\nHits found in forward direction: " << hlFwd.size() 
00417          << "\nHits found in reverse direction: " << hlRev.size() << "\n";
00418 
00419     assert ( hlFwd.size() == 9 );
00420 
00421     if ( ( j % 2 ) == 0 ) assert( hlRev.size() == 9 );
00422     else assert( hlRev.size() == 0 );
00423 
00424 
00425   } // ~for j
00426 
00427   
00428   cout << "Test passed!" << endl << endl; 
00429 
00430   // ---
00431 
00432   cout << "Test " << ++numTests 
00433        <<": test of HashTableTranslated\n\n"; 
00434   {
00435     SequenceReaderFasta testReader( "test.fasta" );
00436    
00437     int wordLength(4);
00438 
00439     HashTableTranslated hashTrans(cout,"test_trans");
00440 
00441     hashTrans.createHashTable( testReader, wordLength, 10000 );
00442     assert(hashTrans.isInitialized());
00443 
00444     hashTrans.saveHashTable();
00445 
00446     HashTableTranslated hashTrans2(cout,"test_trans");
00447     hashTrans2.loadHashTable();
00448     assert( hashTrans2.isInitialized() );
00449     assert( hashTrans.getNumSequences() == hashTrans2.getNumSequences() );
00450     string s1,s2;
00451 
00452     cout << hashTrans.getMaxNumHits() << " HH " << hashTrans2.getMaxNumHits() << endl;
00453 
00454 
00455     testReader.rewind();
00456     HitListVector h1, h2;
00457     WordSequence w1,w2;
00458     WordSequence t1,t2; // to store translations of w1, w2
00459     CodonList codons;
00460     SequenceEncoderCodon encoder;
00461     encoder.setWordLength(wordLength);
00462 
00463     for ( int i(1) ; i < hashTrans.getNumSequences() ; i++ )
00464     {
00465 
00466       hashTrans.getSequenceName( s1, i );
00467       hashTrans2.getSequenceName( s2, i );
00468       assert (s1 == s2 );
00469       assert (s1 == (string) hashTrans.getSequenceName(i) );
00470       assert (s1 == (string) hashTrans2.getSequenceName(i) );
00471 
00472       h1.clear(); h2.clear(); w1.clear(); w2.clear();
00473       testReader.getNextSequence( w1, gMaxBasesPerWord );
00474       cout << printWord( w1, wordLength ) << endl;
00475       w2=w1;
00476 
00477       assert(hashTrans.getSequenceSize(i)==hashTrans2.getSequenceSize(i));
00478 
00479       assert( (((w1.size()-1) * gMaxBasesPerWord ) + w1.getNumBasesInLast())
00480               == hashTrans.getSequenceSize(i));
00481 
00482 
00483       // Sequence should produce same (nonzero) num hits in fwd direction
00484 
00485       hashTrans.setForward(); assert(hashTrans.isForward());
00486       hashTrans2.setForward(); assert(hashTrans2.isForward());
00487 
00488       hashTrans.setQueryTranslatedDNA();
00489       hashTrans2.setQueryTranslatedDNA();
00490       hashTrans.matchSequence( w1, h1 );
00491       hashTrans2.matchSequence( w2, h2 );
00492       assert(h1.size()!=0);
00493       assert(h1.size()==h2.size());
00494       assert(h1==h2);
00495       assert(w1==w2); // translatedDNA matching does not affect w1 or w2
00496 
00497       // now encode to codons for each frame
00498       hashTrans.setQueryProtein();
00499       hashTrans2.setQueryProtein();
00500       for ( int rf(0) ; rf < gNumReadingFrames ; rf++ )
00501       {
00502         codons.clear();
00503         codonize( w1, codons, rf );
00504         encoder.linkSeq(t1);
00505         encoder.encode( codons );
00506         encoder.unlinkSeq();
00507         //      cout << printResidue( t1, wordLength ) << endl;
00508         t2=t1;
00509         h1.clear();h2.clear();
00510         hashTrans.matchSequence( t1, h1 );
00511         hashTrans2.matchSequence( t2, h2 );
00512         assert(h1.size()!=0);
00513         assert(h1.size()==h2.size());
00514         assert(h1==h2);
00515         
00516       } // ~for rf
00517 
00518 
00519       // RC of sequence should produce same (nonzero) num hits in rev direction
00520 
00521       w1.clear();
00522       reverseComplement( w2,w1, gMaxBasesPerWord );
00523       w2=w1;
00524 
00525       h1.clear(); h2.clear();
00526       hashTrans.setQueryTranslatedDNA();
00527       hashTrans2.setQueryTranslatedDNA();
00528       hashTrans.setReverse(); assert(!hashTrans.isForward());
00529       hashTrans2.setReverse(); assert(!hashTrans2.isForward());
00530       hashTrans.matchSequenceTranslatedDNA( w1, h1 );
00531       hashTrans2.matchSequenceTranslatedDNA( w2, h2 );
00532       assert(h1.size()!=0);
00533       assert(h1.size()==h2.size());
00534       assert(h1==h2);
00535       assert(w1==w2); // translatedDNA matching does not affect w1 or w2
00536 
00537       // now encode to codons for each frame
00538       hashTrans.setQueryProtein();
00539       hashTrans2.setQueryProtein();
00540       for ( int rf(0) ; rf < gNumReadingFrames ; rf++ )
00541       {
00542         codons.clear();
00543         codonize( w1, codons, rf );
00544         encoder.linkSeq(t1);
00545         encoder.encode( codons );
00546         encoder.unlinkSeq();
00547         //      cout << printResidue( t1, wordLength ) << endl;
00548         t2=t1;
00549         h1.clear();h2.clear();
00550         hashTrans.matchSequence( t1, h1 );
00551         hashTrans2.matchSequence( t2, h2 );
00552         assert(h1.size()!=0);
00553         assert(h1.size()==h2.size());
00554         assert(h1==h2);
00555         
00556       } // ~for rf
00557 
00558      
00559     } //~ for i
00560 
00561 
00562   } // ~test
00563 
00564   cout << "Test passed!\n\n"; 
00565 
00566 
00567   // ---
00568 
00569   cout << "Test " << ++numTests 
00570        <<": test of substitute generation\n\n";
00571 
00572   {
00573     string s, s1;
00574     Word w;
00575     vector<Word> subs;
00576     int wl=15;
00577 
00578     // test substitution for DNA
00579     // 
00580 
00581     for (int i(0); i<wl; i++) s+="A";
00582 
00583     w = makeBase(s);
00584 
00585     generateSubstitutesDNA( w, subs, wl );
00586     assert(subs.size()==wl);
00587     
00588     for (int i(0); i<wl ; i++)
00589     {
00590       cout << printWord(subs[i],wl) << endl;
00591       s1=s;
00592       s1[wl-i-1]='G';
00593       assert(makeBase(s1)==subs[i]);
00594     }
00595 
00596     //
00597     s=""; subs.clear();
00598 
00599     for (int i(0); i<wl; i++) s+="C";
00600 
00601     w = makeBase(s);
00602 
00603     generateSubstitutesDNA( w, subs, wl );
00604     assert(subs.size()==wl);
00605     
00606     for (int i(0); i<wl ; i++)
00607     {
00608       cout << printWord(subs[i],wl) << endl;
00609       s1=s;
00610       s1[wl-i-1]='T';
00611       assert(makeBase(s1)==subs[i]);
00612     }
00613 
00614     //
00615     s=""; subs.clear();
00616 
00617     for (int i(0); i<wl; i++) s+="G";
00618 
00619     w = makeBase(s);
00620 
00621     generateSubstitutesDNA( w, subs, wl );
00622     assert(subs.size()==wl);
00623     
00624     for (int i(0); i<wl ; i++)
00625     {
00626       cout << printWord(subs[i],wl) << endl;
00627       s1=s;
00628       s1[wl-i-1]='A';
00629       assert(makeBase(s1)==subs[i]);
00630     }
00631 
00632     //
00633     s=""; subs.clear();
00634 
00635     for (int i(0); i<wl; i++) s+="T";
00636 
00637     w = makeBase(s);
00638 
00639     generateSubstitutesDNA( w, subs, wl );
00640     assert(subs.size()==wl);
00641     
00642     for (int i(0); i<wl ; i++)
00643     {
00644       cout << printWord(subs[i],wl) << endl;
00645       s1=s;
00646       s1[wl-i-1]='C';
00647       assert(makeBase(s1)==subs[i]);
00648     }
00649 
00650     // test substitution values for protein
00651     for (int i(0); i<gNumCodonEncodings; i++)
00652     {
00653       s=""; s+= gResidueNames[i];
00654       w = makeResidue(s);
00655       subs.clear();
00656       generateSubstitutesProtein( w, subs, 1 );
00657       cout << "Residue: " << s << " substitutes: ";
00658         for (vector<Word>::iterator j(subs.begin());j!=subs.end();j++)
00659           cout << printResidue( *j, 1 ) << " ";
00660       cout << endl;
00661     }
00662 
00663     s="*CPXY";
00664     w=makeResidue(s);
00665     subs.clear();
00666     generateSubstitutesProtein( w, subs, 5 );
00667     for (vector<Word>::iterator j(subs.begin());j!=subs.end();j++)
00668       cout << printResidue( *j, 5 ) << endl;
00669     assert(subs.size()==3);
00670 
00671     s="EIKLMN";
00672     w=makeResidue(s);
00673     subs.clear();
00674     generateSubstitutesProtein( w, subs, 6 );
00675     for (vector<Word>::iterator j(subs.begin());j!=subs.end();j++)
00676       cout << printResidue( *j, 6 ) << endl;
00677     assert(subs.size()==18);
00678 
00679 
00680     assert(subVals[subStarts[gNumCodonEncodings]]==99999);
00681 
00682     // Test substitution for a DNA hash table
00683     {
00684     string orig = 
00685     "ttcgtgacaggcagttcaggtcacgtggtgacttgatgacccattgtcaaatgtccaattgtta";
00686     string subd =
00687     "CtcgtgacaAgcagttcaAgtcacgtgAtgacttgaCgacccattAtcaaatgtTcaattgttG";
00688  //  1234567812345678123456781234567812345678123456781234567812345678
00689     
00690     SequenceReaderString origReader(orig,cout);
00691     SequenceReaderString subdReader(subd,cout);
00692 
00693     HashTablePacked   origHash(cout);
00694     creator.createHashTable(origHash,origReader,8,100000);
00695 
00696     WordSequence subdSeq, seq;
00697 
00698     subdReader.getNextSequence( subdSeq,8 );
00699 
00700     HitListVector hl;
00701 
00702     seq=subdSeq;
00703     origHash.matchSequence( seq, hl );  
00704     assert(hl.size()==0);
00705 
00706     origHash.setSubstituteThreshold(1);
00707     
00708     seq=subdSeq;
00709     origHash.matchSequence( seq, hl );  
00710     for (HitListVector::iterator i(hl.begin()); i!=hl.end();i++)
00711       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00712     assert(hl.size()==8);
00713     }
00714 
00715     {
00716       string orig = 
00717         "LAKAGKKTGPQDAECRSCHNPKPSAASSWKEIGFDKSLHYRHVASKAIKPVGDPQKNCGA";
00718       string subd =
00719         "iAKAGKeTGPQDsECRSCnNPKPSsASSfKEIGFnKtLHYkHVASKAmKPVGDPkKhCGA";
00720 //       123451234512345123451234512345123451234512345123451234512345
00721           
00722     SequenceReaderStringProtein origReader(orig,cout);
00723     SequenceReaderStringProtein subdReader(subd,cout);
00724 
00725     HashTablePackedProtein origHash(cout);
00726     creator.createHashTable(origHash,origReader,5,100000);
00727 
00728     WordSequence subdSeq, seq;
00729 
00730     subdReader.getNextSequence( subdSeq,5 );
00731 
00732     HitListVector hl;
00733 
00734     seq=subdSeq;
00735     origHash.matchSequence( seq, hl );  
00736     assert(hl.size()==0);
00737 
00738     origHash.setSubstituteThreshold(1);
00739     
00740     seq=subdSeq;
00741     origHash.matchSequence( seq, hl );  
00742     for (HitListVector::iterator i(hl.begin()); i!=hl.end();i++)
00743       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00744     cout << hl.size() << endl;
00745     assert(hl.size()==12);
00746     }
00747 
00748 
00749   } // ~test 
00750   cout << "Test passed!\n";
00751 
00752   // ---
00753 
00754   cout << "Test " << ++numTests 
00755        <<": test of flag mode\n\n"; 
00756   {
00757     string testSub
00758     ="TTTTTTT-TTTTTTT-TTTTTTT-TTTTTTT-TTTTTTT-TTTTTTT-TTTTTTT-TTTTTTT";
00759     //123456712345671234567123456712345671234567123456712345671234567
00760     string testQuery="TTTNTTT";
00761    
00762     HitListVector hl1, hl2;
00763     
00764     SequenceReaderString 
00765       queryReader(testQuery, cout), subjectReader(testSub,cout);
00766     SequenceReaderModeReplace replace('T');      
00767     SequenceReaderModeFlagReplace tag('T');
00768     WordSequence testSeq, seq;
00769     queryReader.changeMode(&replace);
00770     queryReader.getNextSequence( testSeq, 7 );
00771   
00772 
00773 
00774 
00775     {    
00776       subjectReader.changeMode(&replace);
00777       
00778       HashTable hash(cout);
00779       HashTablePacked hashPacked(cout);
00780       creator.createHashTable(hash,subjectReader,7,1000);
00781       creator.createHashTable(hashPacked,subjectReader,7,1000);
00782 
00783       seq=testSeq;
00784       hash.matchSequence( seq, hl1 );  
00785       for (HitListVector::iterator i(hl1.begin()); i!=hl1.end();i++)
00786         cout << i->subjectNum 
00787              << " " << i->diff << " " << i->queryPos << endl;
00788       cout << hl1.size() << endl;
00789       assert(hl1.size()==9);
00790 
00791       seq=testSeq;
00792       hashPacked.matchSequence( seq, hl2 );  
00793       assert(hl1==hl2);
00794 
00795       hl1.clear();
00796       hl2.clear();
00797 
00798       // Tagging of subject not implemented properly %%%%%%%
00799       // check tagging of query works
00800       queryReader.changeMode(&tag);
00801       queryReader.rewind();
00802       queryReader.getNextSequence( seq, 7 );
00803       hash.matchSequence( seq, hl1 );  
00804       hashPacked.matchSequence( seq, hl2 );  
00805       assert(hl1.size()==0);
00806       assert(hl2.size()==0);
00807 
00808     }
00809     {
00810       // now check tagging of subject works
00811       subjectReader.changeMode(&tag);
00812       hl1.clear();hl2.clear();
00813       
00814       HashTable hash(cout);
00815       HashTablePacked hashPacked(cout);
00816       creator.createHashTable(hash,subjectReader,7,1000);
00817       creator.createHashTable(hashPacked,subjectReader,7,1000);
00818 
00819       seq=testSeq;
00820       hash.matchSequence( seq, hl1 );  
00821 
00822       for (HitListVector::iterator i(hl1.begin()); i!=hl1.end();i++)
00823         cout << i->subjectNum 
00824              << " " << i->diff << " " << i->queryPos << endl;
00825       cout << hl1.size() << endl;
00826       assert(hl1.size()==2);
00827 
00828       seq=testSeq;
00829       hashPacked.matchSequence( seq, hl2 );  
00830 
00831       for (HitListVector::iterator i(hl2.begin()); i!=hl2.end();i++)
00832         cout << i->subjectNum 
00833              << " " << i->diff << " " << i->queryPos << endl;
00834       cout << hl2.size() << endl;
00835 
00836       assert(hl1==hl2);
00837 
00838 
00839     }
00840 
00841 
00842   } // ~test
00843   cout << "Test passed!" << endl << endl; 
00844 
00845 
00846   // ---
00847 
00848   cout << "Test " << ++numTests 
00849        <<": test of flag mode for translated DNA\n\n"; 
00850   {
00851     
00852     SequenceReaderModeReplace replaceMode('A');
00853     SequenceReaderModeFlagReplace flagMode('A');
00854     string testSeq = 
00855        "agtagtagtagtagtagtagtagtagtagtXgtagtagtagtagtagtagtagtagtagtag";
00856     //  1..|..|..|..|..2..|..|..|..|..3..|..|..|..|..4..|..|..|..|..
00857     // Forward direction: f0=agt=SSS..., f1=gta=VVV....., f2=tag=***...
00858     // Reverse direction: r0=act=TTT..., r1=cta=LLL....., r2=tac=YYY...
00859     SequenceReaderString subjectReader(testSeq, cout);
00860 
00861     HashTableTranslatedPeek tableUnflagged, tableFlagged;
00862 
00863     HashTableComponent& peekUnflaggedFwd(tableUnflagged.peekFwd());
00864     HashTableComponent& peekUnflaggedRev(tableUnflagged.peekRev());
00865     HashTableComponent& peekFlaggedFwd(tableFlagged.peekFwd());
00866     HashTableComponent& peekFlaggedRev(tableFlagged.peekRev());
00867 
00868     cout << "making unflagged" << endl;
00869     subjectReader.changeMode(&replaceMode);
00870     tableUnflagged.createHashTable( subjectReader, 5, 10000 );
00871     cout << endl << "  **** making flagged ****" << endl << endl;
00872     subjectReader.changeMode(&flagMode);
00873     tableFlagged.createHashTable( subjectReader, 5, 10000 );
00874 
00875     string queryString="SSSSSSSSS";
00876 
00877     SequenceReaderStringProtein queryReader(queryString,cout);
00878     WordSequence w1, w2;
00879     HitListVector h1, h2;
00880     queryReader.getNextSequence(w1,5);
00881     assert(w1.size()==2);
00882     //    assert(w1.getNumBasesInLast()==0);
00883     w2=w1;
00884 
00885     cout << "hits for unflagged" << endl;
00886 
00887     tableUnflagged.matchSequenceProtein( w1, h1);
00888     for (HitListVector::iterator i(h1.begin()); i!=h1.end(); i++)
00889       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00890 
00891     cout << "hits for flagged" << endl;
00892 
00893     tableFlagged.matchSequenceProtein( w2, h2);
00894     for (HitListVector::iterator i(h2.begin()); i!=h2.end(); i++)
00895       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00896 
00897 
00898 
00899 
00900     Word w = makeResidue((string)"SSSSS");
00901     PackedHitStore phu, phf;
00902     HitListVector hu, hf;
00903 
00904     cout << "matching unflagged" << endl;
00905     peekUnflaggedFwd.matchWordStandard(w, phu, 0);
00906     peekUnflaggedFwd.convertHits( phu, hu );
00907 
00908     for (HitListVector::iterator i(hu.begin()); i!=hu.end(); i++)
00909       cout << i->subjectNum << " " << i->diff << " " << i->queryPos<< endl;
00910 
00911     cout << "matching flagged" << endl;
00912     peekFlaggedFwd.matchWord(w, phf, 0);
00913     cout << "converting hits" << endl;
00914     peekFlaggedFwd.convertHits( phf, hf );
00915 
00916     for (HitListVector::iterator i(hf.begin()); i!=hf.end(); i++)
00917       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00918 
00919     hu.clear(); hf.clear(); phu.clear(); phf.clear();
00920     
00921     w=makeResidue((string)"YYYYY");
00922 
00923 
00924     cout << "matching unflagged" << endl;
00925     peekUnflaggedRev.matchWordStandard(w, phu, 0);
00926     peekUnflaggedRev.convertHits( phu, hu );
00927 
00928     for (HitListVector::iterator i(hu.begin()); i!=hu.end(); i++)
00929       cout << i->subjectNum << " " << i->diff << " " << i->queryPos<< endl;
00930 
00931     cout << "matching flagged" << endl;
00932     peekFlaggedRev.matchWord(w, phf, 0);
00933     cout << "converting hits" << endl;
00934     peekFlaggedRev.convertHits( phf, hf );
00935 
00936     for (HitListVector::iterator i(hf.begin()); i!=hf.end(); i++)
00937       cout << i->subjectNum << " " << i->diff << " " << i->queryPos << endl;
00938 
00939 
00940   } // ~test 
00941   cout << "Test passed!\n";
00942 
00943 
00944 
00945   return (0);
00946 
00947   } // ~try
00948   catch (const SSAHAException& err )
00949   {
00950     cout << "Caught SSAHA exception: " << err.what() << "\n";
00951     exit(1);
00952   }
00953   catch (const std::exception& err )
00954   {
00955     cout << "Caught exception: " << err.what() << "\n";
00956     exit(1);
00957   }
00958 
00959 
00960 } // ~main for testHashTable
00961 
00962 // Name:
00963 // Arguments:
00964 // TYPE  NAME  IN/OUT COMMENT
00965 // Returns: TYPE COMMENT
00966 
00967 // End of file testHashTable.cpp
00968 
00969 
00970 
00971 
00972 
00973 
00974 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2