EnsemblServer/OrderOrient.cpp

Go to the documentation of this file.
00001 
00002 // #######################################################################
00003 
00004 // SSAHA : Sequence Search and Alignment by Hashing Algorithm
00005 // Version 3.2, released 1st March 2004
00006 // Copyright (c) Genome Research 2002
00007 
00008 // SSAHA is free software; you can redistribute it and/or modify 
00009 // it under the terms of version 2 of the GNU General Public Licence
00010 // as published by the Free Software Foundation.
00011  
00012 // This program is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public Licence for more details.
00016  
00017 // You should have received a copy of the GNU General Public Licence
00018 // along with this program; if not, write to the Free Software
00019 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00020 // or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
00021 
00022 // #######################################################################
00023 
00024 // Module Name  : OrderOrient
00025 // File Name    : OrderOrient.cpp
00026 // Language     : C++
00027 // Module Author: Anthony J. Cox (ac2@sanger.ac.uk)
00028 
00029 // Description:
00030 // OrderOrient
00031 // This is an adapted version of SSAHAClient.cpp
00032 // The contigs for a given clone are input in fasta format via standard 
00033 // input. The client then links to a SSAHA server of reads and interprets 
00034 // the matches returned to determine the order and orientation of the
00035 // contigs. To do this it needs to know which read in the server hash
00036 // table pairs to which. This information is held in an ASCII file
00037 // with the following format
00038 // numberOfPairForRead1  clipStartForRead1 clipEndForRead1 \n
00039 // numberOfPairForRead2  clipStartForRead2 clipEndForRead2 \n
00040 // ...
00041 // numberOfPairForReadN  clipStartForReadN clipEndForReadN \n
00042 // Reads which don't have a pair in the database have a pair number of zero.
00043 
00044 
00045 // Includes:
00046 
00047 #include "ClientServerUtils.h"
00048 //#include "OrderOrient.h"
00049 #include "SequenceReaderFasta.h"
00050 #include "SequenceReaderString.h"
00051 #include <string.h>
00052 #include <iomanip>
00053 #include <strstream>
00054 #include <string>
00055 #include <map>
00056 
00057 // ### Function Definitions ###
00058 
00059 static QueryHeader qinfo;
00060 
00061 //map<SequenceNumber,SequenceNumber> pairs;
00062 
00063 struct ReadInfo
00064 {
00065   SequenceNumber pairNum;
00066   int            insertMean;
00067   int            insertSD;
00068   // int clipStart;
00069   // int clipEnd;
00070   ReadInfo( SequenceNumber p, int m, int s ) :
00071     pairNum(p), insertMean(m), insertSD(s) {}
00072   ReadInfo() { cout << "Default read info ctor call" << endl; }
00073 };
00074 
00075 typedef map<SequenceNumber,ReadInfo> ReadInfoType;
00076 ReadInfoType readInfo;
00077 
00078 
00079 map<SequenceNumber, int> clipStarts;
00080 map<SequenceNumber, int> clipEnds;
00081 vector<int> contigSizes;
00082 
00083 // dodgyMatches stores the read and contig number of any
00084 // matches that are anomalous for whatever reason.
00085 // first is for read number, second is for contig number
00086 typedef multimap<SequenceNumber,SequenceNumber> DodgyMatchesType;
00087 DodgyMatchesType dodgyMatches;
00088 
00089 // When both halves of a pair hit to the same contig it is possible 
00090 // to compute their insert size. If so, it is placed in here, indexed
00091 // by whichever read has the *lower* sequence number
00092 typedef map<SequenceNumber,int> InsertSizesType;
00093 InsertSizesType insertSizes;
00094 
00095 // First thing received from the server is a list of sequence numbers
00096 // and names for all reads involved in maches with the query contigs.
00097 // They are placed in here
00098 
00099 typedef map<SequenceNumber,std::string> NamesType;
00100 NamesType names;
00101 
00102 struct ReadHitOnContig
00103 {
00104   ReadHitOnContig( int p, bool f ) : position(p), isForward(f) {}
00105   int position;
00106   bool isForward;
00107 };
00108 
00109 // This stores information for all reads hits on a given contig, indexed
00110 // by the *read's* sequence number. Should only be one full length
00111 // hit for each read on a given contig (if more than 1, the read is added
00112 // to dodgyMatches), therefore only need a map not a multimap. 
00113 typedef map<SequenceNumber,ReadHitOnContig> ContigHitsType;
00114 
00115 struct ContigHitOnRead
00116 {
00117   ContigHitOnRead ( SequenceNumber c, int p, bool f ) :
00118     contigNum(c), position(p), isForward(f) {}
00119   SequenceNumber contigNum;
00120   int position;
00121   bool isForward;
00122 };
00123 
00124 // Once a read hit on a contig is screened by checkContig an entry for
00125 // it is placed in here. readHits is accumulated over all contigs in the query
00126 // Indexing is by the sequence number of the *read*.
00127 typedef multimap<SequenceNumber, ContigHitOnRead> ReadHitsType;
00128 ReadHitsType readHits;
00129 
00130 struct ContigJoin
00131 {
00132   enum { notDetermined=2000000000 };
00133   // This constructor ensures that a.contigNum 
00134   // is always less then b.contigNum
00135   ContigJoin
00136   ( ContigHitOnRead* pa, 
00137     ContigHitOnRead* pb,
00138     ReadInfo* pReadInfo) 
00139   {
00140     if (pa->contigNum<pb->contigNum)
00141     { 
00142       a=pa; b=pb;
00143     }
00144     else
00145     { 
00146       a=pb; b=pa;
00147     }
00148     
00149     int minInsert 
00150       =  ((a->isForward) 
00151       ? contigSizes[a->contigNum]-a->position // TBD +1 ?????
00152       : a->position )
00153       + ((b->isForward) 
00154       ? contigSizes[b->contigNum]-b->position // TBD +1 ?????
00155       : b->position);
00156 
00157     if (minInsert>pReadInfo->insertMean+(3*pReadInfo->insertSD))
00158     { 
00159       cout << "WARNING: lower bound for insert=" << minInsert
00160            << " mean=" << pReadInfo->insertMean
00161            << " SD=" << pReadInfo->insertSD << endl;
00162       // TBD throw an exception, catch outside ctor
00163     }
00164 
00165     gapSize=pReadInfo->insertMean-minInsert;
00166     gapSD=pReadInfo->insertSD;
00167 
00168     if (gapSize<0)
00169     {
00170       cout << "WARNING: estimated gap size=" << gapSize
00171            << endl;
00172       //      gapSize=0;
00173     }
00174     else cout << "Computed gap size of " << gapSize 
00175               << ", SD=" << gapSD << endl;
00176 
00177 
00178   }
00179 
00180   ContigHitOnRead* a;
00181   ContigHitOnRead* b;
00182   int gapSize;
00183   int gapSD;
00184 
00185   bool operator<( const ContigJoin& rhs ) const
00186   {
00187     return (    ( a->contigNum < rhs.a->contigNum )
00188              || (    ( a->contigNum == rhs.a->contigNum )
00189                   && ( b->contigNum < rhs.b->contigNum )) ); 
00190   }
00191 
00192 };
00193 
00194 typedef vector<ContigJoin> ContigJoinType;
00195 
00196 // readHits is used to generate a set of contig joins. e.g. if the forward
00197 // half of a read hits contigs 4, 8, 11 and the reverse hits contigs 5 & 10,
00198 // contig joins will be added for 
00199 // (4,5), (4,10), (5,8), (5,11), (8,10) and (10,11)
00200 ContigJoinType contigJoins;
00201 
00202 // matches must extend to within maxIn bases of clip points
00203 // to be accepted
00204 int maxIn=50;
00205 
00206 // if a computed insert size (or lower bound) exceeds this
00207 // something is deemed to be afoot...
00208 int maxInsertSize=8000;
00209 
00210 // if two computations of the insert size for a read differ
00211 // by more than this, something is deemed to be afoot...
00212 int maxInsertDiff=50;
00213 
00214 // gets rid of all the extra stuff in the read name
00215 string truncateReadName( const string& name )
00216 {
00217   string::size_type nameEnd = name.find_first_of('\t');
00218   return (( nameEnd == string::npos ) ? name : name.substr(0,nameEnd));
00219 
00220 }
00221 
00222 
00223 // checkContig: contigHits contains all reads that hit on a contig
00224 // for which a) the match lasts the full length of the read.
00225 // and b) there is a mate pair for the read. The hits are checked, any
00226 // good ones are added to readHits, then contigHits is cleared ready for
00227 // the next contig
00228 void checkContig( ContigHitsType& contigHits, SequenceNumber lastQueryNum )
00229 {
00230             cout << "\n[ All hits received for contig, checking them...\n";
00231 
00232             for( ContigHitsType::iterator i(contigHits.begin());
00233                  i!=contigHits.end();
00234                  ++i )
00235             {
00236               cout << i->first << " " << truncateReadName(names[i->first])
00237                    << " " << i->second.position
00238                    << " " << (i->second.isForward?'F':'R');
00239 
00240               ContigHitsType::iterator myPair 
00241                 = contigHits.find(readInfo[i->first].pairNum);
00242 
00243               if (myPair==contigHits.end())
00244               {
00245                 int minInsert = (i->second.isForward) 
00246                 ? contigSizes[lastQueryNum] - i->second.position
00247                 : i->second.position;
00248                 // lower bound on insert size
00249                 cout << " no pair, lower bd on insert=" << minInsert;
00250                 if (minInsert>maxInsertSize)
00251                 {
00252                     cout << " REJECTED! - lower bound > max insert size\n";
00253                     dodgyMatches.insert
00254                       (make_pair(i->first,lastQueryNum));
00255                     continue; // next entry, don't add to readHits
00256                 }
00257               }
00258               else
00259               {
00260                 cout << " pair found"; 
00261                 if (i->first<myPair->first)
00262                   // only process for the half of the pair that has
00263                   // lowest subject num 
00264                 {
00265                   int insertSize
00266                     = abs(i->second.position-myPair->second.position);
00267                   cout << ", insert size=" << insertSize;
00268                   if (insertSize>maxInsertSize)
00269                   {
00270                     cout << " REJECTED! - insert size too large\n";
00271                     dodgyMatches.insert
00272                       (make_pair(i->first,lastQueryNum));
00273                     dodgyMatches.insert
00274                       (make_pair(myPair->first,lastQueryNum));
00275                     continue; // next entry, don't add to readHits
00276 
00277                   }
00278                   else if (i->second.isForward==myPair->second.isForward)
00279                   {
00280                     cout << " REJECTED! - both hits in same direction\n";
00281                     dodgyMatches.insert
00282                       (make_pair(i->first,lastQueryNum));
00283                     dodgyMatches.insert
00284                       (make_pair(myPair->first,lastQueryNum));
00285                     continue; // next entry, don't add to readHits
00286                   }
00287                   else
00288                   {
00289                     // check insert size does not contradict a previous one
00290                     if (insertSizes.find(i->first)!=insertSizes.end())
00291                     {
00292                       if (   abs(insertSize-insertSizes[i->first])
00293                            > maxInsertDiff)
00294                       {
00295                         cout << " REJECTED! - inconsistent insert sizes ("
00296                              << insertSize << " and " 
00297                              << insertSizes[i->first]
00298                              << ")\n";
00299                         dodgyMatches.insert
00300                           (make_pair(i->first,lastQueryNum));
00301                         dodgyMatches.insert
00302                           (make_pair(myPair->first,lastQueryNum));
00303                         continue; // next entry, don't add to readHits
00304                       }
00305                       cout << " confirmed existing insert size value\n";
00306                     }
00307                     else insertSizes[i->first]=insertSize;
00308 
00309 
00310                   } // ~else
00311                 } // ~if (i->first...
00312 
00313               } // ~else
00314 
00315               // if the read passes all this the hit gets added to readHits.
00316               // It can still be discarded if the read fails a check elsewhere
00317               // and gets added to dodgyMatches
00318               readHits.insert
00319               ( make_pair
00320                 ( i->first,ContigHitOnRead
00321                   ( lastQueryNum, i->second.position, i->second.isForward )));
00322 
00323               cout << endl;
00324             } // ~for
00325             cout << "...finished checking hits for contig ]" << endl;
00326 
00327             contigHits.clear();
00328 
00329 
00330 } // ~checkContig
00331 
00332 
00333 
00334 
00335 void sendQuery
00336 (FILE *fp, int sockfd, SequenceReader& seqReader, ifstream& pairFile)
00337 {
00338 
00339   typedef pair< WordSequence, std::string> QueryInfo;
00340 
00341   char  recvline[MAXLINE];
00342 
00343   WordSequence seq;
00344 
00345   // get handshake from server
00346 
00347   SocketInterface socket( sockfd);
00348   
00349   Handshake hello;
00350 
00351   socket.receiveStruct(&hello);
00352 
00353   socket.checkSocketEmpty();
00354 
00355   cerr << "Server is using hash table with " << hello.bitsPerSymbol 
00356        << " bits per symbol, " << hello.wordLength 
00357        << " symbols per word.\n";
00358 
00359   if ((hello.bitsPerSymbol==gBaseBits)&&(qinfo.bitsPerSymbol==gResidueBits))
00360   {
00361     cerr << "Error: can't run a protein query against a DNA database.\n";
00362     throw SSAHAException("Can't run protein query against DNA database");
00363   } // ~if
00364   cerr << "Server will reject queries of more than " 
00365        << hello.maxBufferSize << " words in total.\n";
00366 
00367   if (   (hello.bitsPerSymbol==gResidueBits)
00368        &&(qinfo.bitsPerSymbol==gBaseBits) )
00369   {
00370     cerr 
00371       << "Sending DNA query to protein database, changing word length to "
00372       << gMaxBasesPerWord << ".\n";
00373     hello.wordLength=gMaxBasesPerWord;
00374   } // ~if
00375 
00376   if (qinfo.numRepeats>hello.wordLength)
00377   {
00378     cerr << "Warning: only repeats of " << hello.wordLength 
00379          << " bases or less can be masked for this data, proceeding"
00380          << "using this value.\n";
00381     qinfo.numRepeats = hello.wordLength;
00382   }
00383 
00384   cerr << "Server will attempt to screen for tandem repeats of " 
00385        << qinfo.numRepeats << " bases or less.\n";
00386 
00387   if (qinfo.maxInsert>=hello.wordLength)
00388   {
00389     cerr << "Warning: indels of up to " << hello.wordLength-1 
00390          << " bases only can be handled for this data\n";
00391     qinfo.maxInsert = hello.wordLength-1;
00392   }
00393 
00394   cerr << "Matches can contain up to " 
00395        << qinfo.maxInsert << " indels between successive hits.\n";
00396 
00397   cerr << "Matches can contain gaps of up to " 
00398        << qinfo.maxGap << " bases.\n";
00399 
00400   // read queries to local memory
00401 
00402   //  int totalNumWords(0);
00403   const QueryInfo dummy;
00404   vector<QueryInfo> query; 
00405   query.push_back(dummy);
00406   qinfo.numQueryWords = 0;
00407   
00408   contigSizes.push_back();
00409   
00410   while ( seqReader.getNextSequence
00411           ( query.back().first, hello.wordLength ) != -1 )
00412   {
00413     seqReader.getLastSequenceName( query.back().second );
00414     qinfo.numQueryWords += query.back().first.size();
00415 
00416     contigSizes.push_back
00417     ( 
00418         ( query.back().first.size()-1) * hello.wordLength 
00419         + query.back().first.getNumBasesInLast()
00420     );
00421     query.push_back(dummy);
00422   }
00423   query.pop_back();
00424 
00425   //  QueryHeader qinfo;
00426 
00427   qinfo.numQuerySeqs = query.size();
00428 
00429   socket.sendStruct(&qinfo);
00430 
00431   for( vector<QueryInfo>::iterator i(query.begin()) ; 
00432        i != query.end() ; ++i )
00433   {
00434     socket.sendSequence(i->first);
00435   }
00436 
00437 
00438   MatchHeader response;
00439   socket.receiveStruct(&response);
00440 
00441   if (response.wasSuccessful==false) 
00442     throw NetworkException("Query request failed!!");
00443 
00444   cout << "Expecting to receive " << response.numMatches 
00445        << " matches among " << response.numSubjectNames 
00446        << " subject sequences.\n";
00447 
00448   pair<SequenceNumber,std::string> p;
00449 
00450   for ( int i(0) ;  i < response.numSubjectNames ; i++ )
00451   {
00452        socket.receiveStruct(&p.first);
00453        socket.receiveString(p.second);
00454        names.insert(p);
00455        //       cout << i << ": " << p.first << " " << p.second << endl;
00456   } // ~for
00457 
00458   // Here we parse the pair file to get pair info
00459   // and parse the read names to get clipInfo
00460 
00461   int  pairBufSize(200);
00462   char pairBuf[pairBufSize];
00463   int numPairs(0);
00464 
00465   // lastWanted is the largest sequence number which appears in the match,
00466   // ie for which we are interested in pairing information
00467 
00468   SequenceNumber p1(0), p2(0), firstWanted(0), lastWanted(0);
00469 
00470   if (!names.empty()) 
00471   {
00472     NamesType::iterator i(names.begin());
00473     firstWanted=i->first;
00474     i=names.end();
00475     lastWanted=(--i)->first;
00476     cout << "Range of interest: " << firstWanted 
00477          << " to " << lastWanted << endl;
00478   }
00479 
00480   int insertMean, insertSD;
00481 
00482   cout << "Reading the rest of the pair file ..." << endl;
00483 
00484   while (pairFile.getline(pairBuf, pairBufSize, '\n'))
00485   {
00486     sscanf(pairBuf,"%d %d",&p1, &p2);
00487     // p1 always less than p2, so scan until p2>=firstWanted
00488     if (p2 >= firstWanted) break;
00489   }
00490 
00491   do
00492   {
00493     sscanf(pairBuf,"%d %d %d %d",&p1, &p2, &insertMean, &insertSD);
00494     // pairs are in order of p1, so finish once p1 gets past lastWanted
00495     if (p1 > lastWanted) break;
00496     if (names.find(p1)!=names.end()) 
00497     { // pairs[p1]=p2; 
00498       readInfo.insert(make_pair(p1,ReadInfo(p2,insertMean,insertSD)));
00499       numPairs++; 
00500     }
00501     if (names.find(p2)!=names.end()) 
00502     { // pairs[p2]=p1; 
00503       readInfo.insert(make_pair(p2,ReadInfo(p1,insertMean,insertSD)));
00504       numPairs++; 
00505     }
00506   }
00507   while (pairFile.getline(pairBuf, pairBufSize, '\n'));
00508 
00509   pairFile.close();
00510 
00511   cout << "Found pairing information for " << numPairs << " reads\n";
00512 
00513   string::size_type tabPos;
00514 
00515   // Now grab the clip info from the names
00516 
00517   for ( NamesType::iterator i(names.begin());
00518         i!=names.end(); ++i )
00519   {
00520     tabPos=i->second.find('\t');
00521     ++tabPos=i->second.find('\t',tabPos);
00522     sscanf
00523       (&(i->second.c_str()[tabPos]),
00524        "\tcl|%d\tcr|%d",
00525        &clipStarts[i->first],
00526        &clipEnds[i->first] );
00527 
00528   } // ~FOR
00529 
00530 
00531   // Now read in the match info from the socket.
00532 
00533   MatchInfo thisMatch;
00534 
00535   //  cout << setprecision(2) << setiosflags(ios::fixed);
00536 
00537   //  cout << "OK: " << response.numMatches 
00538   //     << " " << response.numSubjectNames << endl;
00539 
00540 
00541   int numAccepted(0);
00542   SequenceNumber lastQueryNum(0);
00543 
00544   ContigHitsType contigHits;
00545 
00546   for ( int i(0); i < response.numMatches ; i++ )
00547   {
00548 
00549     socket.receiveStruct(&thisMatch);
00550     cout 
00551       << query[thisMatch.queryNum-1].second << "\t"
00552       //query[thisMatch.queryNum-1].second << "\t"
00553       << thisMatch.queryStart << "\t"
00554       << thisMatch.queryEnd << "\t"
00555       << thisMatch.subjectNum << "\t"
00556       << truncateReadName(names[thisMatch.subjectNum]) << "\t"
00557       << thisMatch.subjectStart << "\t"
00558       << thisMatch.subjectEnd << "\t"
00559       << ((thisMatch.isForward)?"F":"R") << "\t"
00560       << thisMatch.numBases << " "; 
00561     //     << "\t"
00562     //     << 100.0*thisMatch.  numBases/
00563     //       (thisMatch.queryEnd-thisMatch.queryStart+1);
00564 
00565     // Only use hits on reads for which there is a read pair.
00566     // Hits must extend to within maxIn pairs of clip points.
00567 
00568     //    if (    (pairs[thisMatch.subjectNum]!=0) // half of a pair
00569     if (    (readInfo.find(thisMatch.subjectNum)!=readInfo.end()) 
00570          && ( thisMatch.subjectStart 
00571               <   clipStarts[ thisMatch.subjectNum ] + maxIn )  
00572          && ( thisMatch.subjectEnd   
00573               >   clipEnds[   thisMatch.subjectNum ] - maxIn )  )
00574     {
00575 
00576       if ((thisMatch.queryNum!=lastQueryNum)&&(lastQueryNum!=0)) 
00577       {
00578         // Then we have moved onto the hits for the next contig.
00579         // Check the hits for the previous one
00580         checkContig(contigHits, lastQueryNum);
00581       } // ~if (thisMatch ..
00582       lastQueryNum=thisMatch.queryNum;
00583 
00584       // place details of this hit into contigHits
00585       if (!contigHits.insert
00586           ( make_pair
00587             ( thisMatch.subjectNum,ReadHitOnContig
00588               ( thisMatch.isForward 
00589                 ?   thisMatch.queryStart
00590                 - thisMatch.subjectStart
00591                 + 1
00592                 :   contigSizes[thisMatch.queryNum]
00593                 - thisMatch.queryStart
00594                 + thisMatch.subjectStart, 
00595                 thisMatch.isForward
00596               )
00597               )
00598             ).second) 
00599       {
00600         // If this insert fails then the same read has a full length hit
00601         // twice in the same contig. It is therefore dodgy.
00602         cout << "REJECTED! - same read hits twice in same contig! "
00603              << query[thisMatch.queryNum-1].second << "0"
00604              << thisMatch.subjectNum << " "
00605              << truncateReadName(names[thisMatch.subjectNum]) << "\t";
00606         dodgyMatches.insert
00607           (make_pair(thisMatch.subjectNum,thisMatch.queryNum));
00608 
00609 
00610       } // ~if (!contigHits ....
00611       else
00612       {
00613         // Accepted means `accepted for further checking.' Hit can
00614         // still be discarded at the checkContigs stage
00615 
00616         cout << "accepted for checking, pair=" 
00617              << readInfo[thisMatch.subjectNum].pairNum;
00618         numAccepted++;
00619       }
00620 
00621     } // ~if
00622 
00623     cout << endl;
00624     
00625 
00626   } // ~for
00627 
00628   // Check the hits for the last contig
00629   if (contigHits.size()!=0) checkContig(contigHits, lastQueryNum);
00630 
00631   cout << numAccepted << " reads passed the initial checking stage" << endl;
00632 
00633   socket.checkSocketEmpty();
00634 
00635   if (dodgyMatches.size()!=0)
00636   {
00637     cout << "Found " << dodgyMatches.size() << " potential inconsistencies:\n";
00638     for (DodgyMatchesType::iterator i(dodgyMatches.begin());
00639          i!=dodgyMatches.end();++i)
00640     {
00641       cout << "read: " << i->first << " " << truncateReadName(names[i->first])
00642            << " contig:" << i->second << " " << query[i->second-1].second 
00643            << endl;
00644       // don't attempt to deduce any O&O from these
00645       readHits.erase(i->first);
00646     }
00647     cout 
00648     << "These reads will not be used for the order/orientation calculation\n";
00649   }
00650 
00651   cout << "Found " << readHits.size() << " potential join points:\n";
00652 
00653   ReadHitsType::iterator ub,i(readHits.begin());
00654   //  SequenceNumber pairNum;
00655   ReadInfo* pReadInfo;
00656 
00657   
00658   //  for (ReadHitsType::iterator i(readHits.begin());i!=readHits.end();++i)
00659   while(i!=readHits.end())
00660   {
00661     cout << " --- \n";
00662     ub = readHits.upper_bound(i->first);
00663     pReadInfo = &(readInfo[i->first]);
00664     //    pairNum = pairs[i->first];
00665     //    InsertSizesType::iterator myInsert;
00666 
00667     if ((pReadInfo->pairNum!=0)&&(i->first<pReadInfo->pairNum))
00668     {
00669       cout << "contigs hit by forward read: " <<  i->first << " " 
00670            << truncateReadName(names[i->first]) << endl; 
00671 
00672       //      myInsert = insertSizes.find(i->first);
00673       //      if (myInsert!=insertSizes.end())
00674       //      {
00675       //        cout << "Got an insert size of " << myInsert->second 
00676       //             << " for this read pair" << endl;
00677       //      }
00678       //      else cout << "No insert size for this read pair" << endl;
00679 
00680       for (ReadHitsType::iterator j(i);j!=ub;j++)
00681       {      
00682         cout << j->second.contigNum << " " 
00683              << query[j->second.contigNum-1].second << " "
00684              << j->second.position << " " 
00685              << (j->second.isForward?'F':'R') << endl;
00686         for (ReadHitsType::iterator 
00687                k(readHits.lower_bound(pReadInfo->pairNum));
00688              k!=readHits.upper_bound(pReadInfo->pairNum);k++)
00689         {      
00690           if (j->second.contigNum!=k->second.contigNum)
00691           {
00692             contigJoins.push_back(ContigJoin(&j->second,&k->second,pReadInfo));
00693             //      if (myInsert==insertSizes.end())
00694             //        contigJoins.push_back(ContigJoin(j->second,k->second));
00695             //      else
00696             //     contigJoins.push_back(ContigJoin(j->second,k->second, 
00697             //    myInsert->second));
00698 
00699           }
00700         } // ~for k
00701 
00702       } // ~for j
00703       cout << "contigs hit by reverse read: " 
00704            << pReadInfo->pairNum << " "
00705            << truncateReadName(names[pReadInfo->pairNum]) << endl; 
00706       for (ReadHitsType::iterator 
00707              j(readHits.lower_bound(pReadInfo->pairNum));
00708            j!=readHits.upper_bound(pReadInfo->pairNum);j++)
00709       {      
00710         cout << j->second.contigNum << " " 
00711              << query[j->second.contigNum-1].second << " "
00712              << j->second.position << " " 
00713              << (j->second.isForward?'F':'R') << endl;
00714 
00715       } // ~for j
00716     } // ~if
00717 
00718     i=readHits.upper_bound(i->first);
00719 
00720 
00721 
00722   } // ~while
00723 
00724 
00725     sort(contigJoins.begin(),contigJoins.end());
00726 
00727     cout << "Found " << contigJoins.size() 
00728          << " potential joins between contigs\n";
00729 
00730     for (ContigJoinType::iterator i(contigJoins.begin());
00731          i!=contigJoins.end();++i)
00732     { 
00733       cout << "Contig " << i->a->contigNum << " " 
00734            << query[i->a->contigNum-1].second 
00735            << i->a->position << "/"
00736            << contigSizes[i->a->contigNum] << " "
00737            << (i->a->isForward?'F':'R')
00738            << " joins contig "
00739            << i->b->contigNum << " " << query[i->b->contigNum-1].second 
00740            << i->b->position << "/" 
00741            << contigSizes[i->b->contigNum] << " "
00742            << (i->b->isForward?'F':'R') << " " 
00743            << i->gapSize << " "
00744            << i->gapSD << endl;
00745 
00746       //      if (i->distance!=ContigJoin::notDetermined)
00747       //      {
00748       //        cout << " distance=" << i->distance;
00749       //`      } // ~if
00750       cout << endl;
00751 
00752     } // ~for
00753 
00754 
00755 
00756 
00757 
00758 } // ~sendQuery
00759 
00760 
00761 
00762 
00763 
00764 
00765 
00766 
00767 int main(int numArgs, char* args[] )
00768 {
00769 
00770   try 
00771   {
00772     int sockfd;
00773     struct sockaddr_in  servaddr;
00774     int fred;
00775 
00776     // only needed if we need to make our own fasta file
00777     ostrstream buf;
00778     //    buf << ">unnamedQuery\n" << cin.rdbuf();
00779     //  cout << a.rdbuf();
00780     istream istr(buf.rdbuf());
00781 
00782 
00783 
00784     if( numArgs!=4)
00785     {
00786       cerr 
00787 << "syntax: " << args[0] 
00788 << " serverMachine serverPort pairFilePrefixName\n"
00789 <<"serverMachine: name of machine on which server is running\n"
00790 <<"serverPort   : port number on that machine\n"
00791 <<"pairFileName : names of all valid read pairs\n";
00792 
00793       throw SSAHAException("Invalid command line input to client");
00794     }
00795     memset((void*)&qinfo, 0, sizeof(qinfo));
00796     
00797     sockfd = Socket(AF_INET, SOCK_STREAM, 0);
00798 
00799     int portNumber(atoi(args[2]));
00800 
00801     cerr << "Queries will be sent via port number " << portNumber << ".\n";
00802 
00803     qinfo.minPrint=200;//atoi(args[3]);
00804 
00805     cerr << "Only matches greater than " << qinfo.minPrint 
00806          << " bases will be reported.\n";
00807 
00808 
00809     // read in ASCII file containing pairing and clip information
00810 
00811 
00812     string pairFileName((string)args[3]+(string)".pair");
00813     ifstream pairFile(pairFileName.c_str());
00814 
00815     if (pairFile.fail())
00816     {
00817       cout << "Could not open pairs file " << args[3] << ".pair\n";
00818       throw SSAHAException("Could not open pair file");
00819     }  
00820   
00821     cout << "Reading from " << args[3] << ".pair ..." << endl;
00822 
00823     string buff;
00824     
00825     // print out any comments, leave the rest til later
00826     while (pairFile.peek()=='#')
00827     {
00828       getline(pairFile,buff,'\n');
00829       cout << buff << endl;
00830     }
00831 
00832 //      int estimatedNumReads(21000000);
00833 
00834 //      pairs.reserve(estimatedNumReads);
00835 //      clipStarts.reserve(estimatedNumReads);
00836 //      clipEnds.reserve(estimatedNumReads);
00837 
00838 //      pairs.push_back();
00839 //      clipStarts.push_back();
00840 //      clipEnds.push_back();
00841 
00842 //      while (pairFile.peek()!=EOF)
00843 //      {
00844 //        pairs.push_back();
00845 //        pairFile >> pairs.back();
00846 //        clipStarts.push_back();
00847 //        pairFile >> clipStarts.back();
00848 //        clipEnds.push_back();
00849 //        pairFile >> clipEnds.back();
00850 //      }
00851    
00852 //      pairs.pop_back();
00853 //      clipStarts.pop_back();
00854 //      clipEnds.pop_back();
00855 
00856 //      assert(pairs.size()==clipStarts.size());
00857 //      assert(pairs.size()==clipEnds.size());
00858 
00859 //      cout << " ... done\n";
00860 
00861 //      cout << "According to the pairs file there are "
00862 //       << pairs.size()-1 << " reads in the server hash table" << endl;
00863 
00864     // set up rest of query parameters (same as for SSAHAClient)
00865 
00866     qinfo.maxGap=14; //atoi(args[4]);
00867     qinfo.maxInsert=1; //atoi(args[5]);
00868     qinfo.numRepeats=1; //atoi(args[6]);
00869     qinfo.clipThreshold=10000; //atoi(args[8]);
00870     qinfo.maxMatches=0; //atoi(args[9]);
00871 
00872     string queryType("DNA");
00873     SequenceEncoder* pEncoder;
00874 
00875     if (queryType=="DNA") 
00876     {
00877       qinfo.bitsPerSymbol = gBaseBits;
00878       pEncoder = new SequenceEncoderDNA(12);
00879       cerr << "Query sequence is DNA.\n";
00880     }
00881     else if (queryType=="protein") 
00882     {
00883       qinfo.bitsPerSymbol = gResidueBits;
00884       pEncoder = new SequenceEncoderProtein(5);
00885       cerr << "Query sequence is protein.\n";
00886     }
00887     else
00888     {
00889       cerr 
00890         << "Unknown value for query type (must be \"DNA\" or \"protein\")\n";
00891       throw SSAHAException("Invalid value for queryType");
00892     } // ~if
00893 
00894     string sortMode("none");
00895     if (sortMode=="none")
00896     {
00897       qinfo.maxMatches=0;
00898     }
00899     else if (sortMode=="size")
00900     {
00901       qinfo.sortMode = eSortByMatchLength;
00902     }
00903     else if (sortMode=="percent")
00904     {
00905       qinfo.sortMode = eSortByPercentMatch;
00906     }
00907     else
00908     {
00909       cerr 
00910         << "Unknown value for sort mode "
00911         << "(must be \"none\", \"size\" or \"percent\")\n";
00912       throw SSAHAException("Invalid value for sortMode");
00913     } // ~if
00914 
00915     while (cin.peek()==' ') cin.ignore(); // zap any leading spaces
00916 
00917     SequenceReader* pReader;
00918     string source;
00919 
00920     if (cin.peek()=='>') 
00921     {
00922       cerr 
00923         << "First nonspace character is a \">\", assuming input text "
00924         << "is in fasta format.\n";
00925       pReader = new SequenceReaderFile(cin, '>', '>', pEncoder, cerr );
00926     } // ~if   
00927     else
00928     {
00929       cerr << "Assuming input text is a plain string of " 
00930            << queryType << " data.\n";
00931 
00932       //      ostrstream buf;
00933       buf << ">unnamedQuery\n" << cin.rdbuf();
00934       //  cout << a.rdbuf();
00935       istream istr(buf.rdbuf());
00936       pReader = new SequenceReaderFile(istr, '>', '>', pEncoder, cerr );
00937 
00938       //      cin >> source;
00939       //if(queryType=="DNA")pReader = new SequenceReaderStringDNA(source,cerr);
00940       //     else pReader = new SequenceReaderStringProtein(source,cerr);
00941                                                                     
00942     } // ~else
00943 
00944     //  bzero(&servaddr, sizeof(servaddr));
00945     memset((void*)&servaddr, 0, sizeof(servaddr));
00946     servaddr.sin_family = AF_INET;
00947     servaddr.sin_port = htons(portNumber);
00948 
00949     cerr << "Server is assumed to be running on machine " << args[1] << ".\n";
00950 
00951     struct hostent *hp;
00952     hp =  gethostbyname(args[1]);
00953     if (hp==NULL) throw NetworkException("Invalid host name");
00954 
00955       memcpy(&(servaddr.sin_addr.s_addr), *(hp->h_addr_list), sizeof(struct
00956       in_addr));
00957 
00958 
00959     Connect(sockfd, (SA *) &servaddr, sizeof(servaddr));
00960     
00961     sendQuery(stdin, sockfd, *pReader, pairFile);
00962 
00963     exit(0);
00964 
00965   }    
00966   catch (const NetworkException& err )
00967   {
00968     cerr << "Caught NetworkException: " << err.what() << "\n";
00969     cout << "ERROR: " << err.what() << endl;
00970     exit(1);
00971   }
00972   catch (const SSAHAException& err )
00973   {
00974     cerr << "Caught SSAHA exception: " << err.what() << "\n";
00975     cout << "ERROR: " << err.what() << endl;
00976     exit(1);
00977   }
00978   catch (const std::exception& err )
00979   {
00980     cerr << "Caught exception: " << err.what() << "\n";
00981     cout << "ERROR: " << err.what() << endl;
00982     exit(1);
00983   }
00984 
00985 
00986 
00987 }
00988 
00989 // Name:
00990 // Arguments:
00991 // TYPE  NAME  IN/OUT COMMENT
00992 // Returns: TYPE COMMENT
00993 
00994 // End of file OrderOrient.cpp
00995 

Generated on Fri Dec 21 13:12:15 2007 for ssaha by  doxygen 1.5.2