QueryManager/MatchStoreGapped.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : QueryManagerSSAHA
00025 // File Name    : QueryManagerSSAHA.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 
00031 // Includes:
00032 
00033 #include "MatchStoreGapped.h"
00034 #include "SequenceReader.h"
00035 #include "HashTableGeneric.h"
00036 #include "HashTablePacked.h"
00037 #include "GlobalDefinitions.h"
00038 #include <algorithm>
00039 
00040 // ### Function Definitions ###
00041 
00042 // Name:
00043 // Arguments:
00044 // TYPE  NAME  IN/OUT COMMENT
00045 // Returns: TYPE COMMENT
00046 
00047 // MatchAlgorithm member function definitions
00048 
00049 void MatchAlgorithm::operator()  
00050   ( WordSequence& querySeq, 
00051     MatchAdder& addMatch, 
00052     HashTableGeneric& subjectTable,
00053     int wordLength,
00054     int stepLength )
00055 {
00056   //  cout << "MatchAlgorithm()" << endl;
00057   HitListVector hitList;
00058   wordLength_ = ((wordLength==-1)?subjectTable.getWordLength():wordLength);
00059   stepLength_ = ((stepLength==-1)?wordLength_:stepLength);
00060   assert(hitList.size()==0);
00061 
00062   if (dynamic_cast<HashTablePacked*>(&subjectTable)!=NULL)
00063   {
00064     sortNeeded_ = false;
00065   } // ~if
00066 
00067   generateHits( querySeq, hitList, subjectTable );
00068   //  cout << "MatchAlgorithm() - hits got: " << hitList.size() << endl;
00069   generateMatches( hitList, addMatch );
00070 }
00071 
00072 void MatchAlgorithm::generateHits
00073 ( WordSequence& querySeq, HitListVector& hitList, HashTableGeneric& subjectTable )
00074 {
00075   subjectTable.setNumRepeats( numRepeats_ );
00076   //  wordLength_ = subjectTable.getWordLength();
00077   //  stepLength_ = subjectTable.getStepLength();
00078   subjectTable.matchSequence( querySeq, hitList );
00079 }
00080 
00081 
00082 
00083 void MatchAlgorithmGapped::generateMatches
00084   ( HitListVector& hitList, MatchAdder& addMatch ) 
00085 {
00086 
00087   if (hitList.size()==0) return;
00088   
00089   if (sortNeeded_)
00090   {
00091     sort( hitList.begin(), hitList.end(), LessThanSubject() );
00092   } // ~if 
00093 
00094   minHitsForMatch_ = max(2,minToProcess_ / stepLength_);
00095   maxQueryDiff_ = maxGap_ + stepLength_;
00096 
00097   bool inRun(false);
00098 
00099   HitListVector::iterator pFirstMatch( hitList.begin() );
00100   
00101   for ( int i(0) ; i < ((int)hitList.size()) - 1 ; ++i )
00102   {
00103     if (    ( hitList[i+1].subjectNum == hitList[i].subjectNum ) 
00104               && ( hitList[i+1].diff - hitList[i].diff  <=maxInsert_ )) 
00105     {
00106       if ( inRun==false ) 
00107       { 
00108         pFirstMatch = static_cast<HitList::iterator>(&hitList[i]); 
00109         inRun=true;
00110       } // ~if
00111     } // ~if
00112     else 
00113     {
00114       if (inRun==true)
00115         findMatchesInRange
00116         ( pFirstMatch, 
00117           static_cast<HitList::iterator>(&hitList[i+1]),
00118           addMatch );
00119       inRun=false;
00120     } // ~else if
00121 
00122   } // ~for
00123 
00124   if (inRun==true)
00125     findMatchesInRange( pFirstMatch, hitList.end(), addMatch );
00126 
00127   return;
00128 
00129 } // ~findMatch
00130 
00131 
00132 void MatchAlgorithmGapped::findMatchesInRange
00133 ( HitListVector::iterator first, HitListVector::iterator last, 
00134   MatchAdder& addMatch )
00135 {
00136 
00137   if (first == last) return;
00138   // This means that single hit matches will not be reported, but you
00139   // can get these with MatchStoreUngapped
00140 
00141   sort( first, last, LessThanQuery() );
00142 
00143   //  cout << "sortedHits:" <<endl;
00144   //  for (HitListVector::iterator j(first); j!=last ; ++j)
00145   // cout << j << " " << j->subjectNum << ": " << j->queryPos << " " 
00146   // << j->diff+j->queryPos << endl;
00147 
00148 
00149   int lastQueryPos, numBases, gapSize; 
00150   SequenceOffset subjectStart, subjectEnd;
00151 
00152   while ( last-first >= minHitsForMatch_ )
00153   {
00154     //    cout << last-first << " left to go.\n";
00155     lastQueryPos = first->queryPos;
00156     numBases = wordLength_;
00157     HitListVector::iterator i(first);
00158 
00159     while(++i!=last)
00160     {
00161 
00162       gapSize = i->queryPos - lastQueryPos;
00163       //   cout << lastQueryPos << "|" << i->queryPos << endl; 
00164       if ( gapSize > maxQueryDiff_ ) break;
00165       numBases += min (wordLength_, gapSize );
00166       lastQueryPos = i->queryPos;
00167     } // ~while
00168   
00169     if (numBases>=minToProcess_)
00170     { 
00171       i--;
00172       //      cout << "addMatch: "  <<     first->queryPos << " " <<
00173       //  i->queryPos + wordLength_ - 1 << " " <<
00174       //  first->diff + first->queryPos << " " <<
00175       //i->diff + i->queryPos + wordLength_ - 1 << endl;
00176 
00177       subjectStart = first->diff + first->queryPos;
00178       subjectEnd= i->diff + i->queryPos + wordLength_ - 1; 
00179 
00180       // Put in the test below, as repetitive queries can occasionally
00181       // cause it to be false, which causes a crash at the alignment stage
00182       // TC 29.5.2
00183       if (subjectEnd>subjectStart)
00184       {
00185         addMatch 
00186         ( 
00187          first->subjectNum,
00188          numBases,
00189          first->queryPos,
00190          i->queryPos + wordLength_ - 1,
00191          subjectStart,
00192          subjectEnd
00193          );
00194       } // ~if
00195       i++;
00196     } // ~if
00197 
00198     first = i;
00199   
00200   } // ~while
00201 
00202 } // ~findMatchesInRange
00203 
00204 
00205 
00206 
00207 
00208   // Function Name: findMatch
00209   // Arguments: vector<Match>& (out), int (in), int (in), int (in)
00210   // Returns: true if more than minToPrint bases were found in total.
00211   // Sorts its member HitInfo instances in the following way:
00212   // 1 (highest priority) by subjectNum: thus the hits for each subject
00213   // sequence are grouped together
00214   // 2 by diff: for a given subject sequence, hits which have the same 'diff' 
00215   // value are part of the same ungapped alignment between the subject
00216   // sequence and the query sequence.
00217   // 3 (lowest priority) by queryPos
00218   // After sorting, the routine looks for a region of hits which have the same
00219   // subjectNum and diff values but for which the queryPos value differs
00220   // by stepLength. Each such region denotes a consecutive series of matching 
00221   // base pairs between the query and subject, and the details are encoded in
00222   // an instance of AdjacentMatch 
00223 
00224 #ifdef OLD_GAP
00225  void MatchAlgorithmGapped::generateMatches
00226   ( HitListVector& hitList, MatchAdder& addMatch ) 
00227   {
00228 
00229     //    subjectTable.matchSequence( querySeq, hitList, numRepeats_ );
00230 
00231     sort( hitList.begin(), hitList.end(), LessThanSubject() );
00232 
00233     //    bool foundMatch( false );
00234 
00235     if (hitList.size()==0) return;
00236     //    int origSize( size() );
00237 
00238     int numMatches( wordLength_ );
00239 
00240     HitListVector::iterator pFirstMatch( hitList.begin() );
00241 
00242     for ( int i(0) ; i < ((int)hitList.size()) - 1 ; ++i )
00243     {
00244       //                 cout << i << ": " 
00245       //   << (*this)[i].subjectNum << "|" 
00246       //   << (*this)[i].diff << "|" << (*this)[i].queryPos << " " 
00247       //   << (*this)[i+1].subjectNum << "|" 
00248       //   << (*this)[i+1].diff << "|" << (*this)[i+1].queryPos << " "
00249       //   << numMatches ; 
00250 
00251               //  && ( (*this)[i+1].queryPos > (*this)[i].queryPos )  )
00252             if (    ( hitList[i+1].subjectNum == hitList[i].subjectNum ) 
00253                  && ( hitList[i+1].diff - hitList[i].diff  <=maxGap_ )) 
00254       {
00255         //              cout << "+";
00256         if ( numMatches == 0 ) 
00257         { 
00258           pFirstMatch = &hitList[i]; 
00259           //                cout << "Start of run";
00260           numMatches = wordLength_; // %%%%
00261         }
00262         numMatches += wordLength_;
00263       } // ~if
00264       else if ( numMatches != 0 )
00265       {
00266         //      cout << "-";
00267           if ( numMatches >= minToProcess_ ) 
00268           { 
00269             //           cout << "storing match";
00270 
00271 
00272            sort( pFirstMatch, &hitList[i+1], LessThanQuery() );
00273 
00274            //        cout << "Storing match (" << numMatches 
00275            //         << " matches). Frequency array:\n";
00276 
00277            //            for ( HitListVectorGap::iterator z( pFirstMatch );
00278            //      z != &(*this)[i+1]; ++z )
00279            // cout<< z->subjectNum <<" "<< z->diff<< " "<< z->queryPos << "\n";
00280 
00281            //            foundMatch = true;
00282 
00283             addMatch ( pFirstMatch->subjectNum,
00284                       numMatches,
00285                       pFirstMatch->queryPos,
00286                       hitList[i].queryPos 
00287                       + wordLength_ - 1,
00288                       pFirstMatch->diff + pFirstMatch->queryPos,
00289                       hitList[i].diff + hitList[i].queryPos
00290                       + wordLength_ - 1 ); 
00291                     }
00292           numMatches = 0;
00293       } // ~else if
00294       //       cout << "\n";
00295     } // ~for
00296 
00297     if ( ( numMatches >= minToProcess_ ) && (!hitList.empty()) )
00298     {
00299         sort( pFirstMatch, hitList.end(), LessThanQuery() );
00300         // cout << "Storing match (" << numMatches 
00301         //  << " matches). Frequency array:\n";
00302 
00303         // for ( HitListVectorGap::iterator z( pFirstMatch );
00304         // z != end(); ++z )
00305         // cout<< z->subjectNum << " " << z->diff<< " " << z->queryPos<< "\n";
00306 
00307         //        foundMatch = true;
00308 
00309         addMatch ( pFirstMatch->subjectNum,
00310                     numMatches,
00311                     pFirstMatch->queryPos,
00312                     hitList.back().queryPos 
00313                     + wordLength_ - 1,
00314                     pFirstMatch->diff + pFirstMatch->queryPos,
00315                     hitList.back().diff + hitList.back().queryPos
00316                     + wordLength_ - 1 
00317                           ); 
00318     }
00319 
00320     return;// foundMatch; // ( size() != origSize );
00321 
00322   } // ~findMatch
00323 #endif
00324 
00325 
00326 
00327 
00328 // End of file QueryManagerSSAHA.cpp
00329 
00330 

Generated on Fri Dec 21 13:12:16 2007 for ssaha by  doxygen 1.5.2