00001
00002
00003 #include "common.h"
00004 #include "linefile.h"
00005 #include "aliType.h"
00006 #include "fa.h"
00007 #include "genoFind.h"
00008 #include "psl.h"
00009 #include "options.h"
00010 #include "fuzzyFind.h"
00011
/* RCS/CVS "$Id$" keyword string recording the revision of this source file. */
static const char rcsid[] = "$Id: gfClient.c,v 1.34 2006/11/15 15:20:59 angie Exp $";
00013
00014 static struct optionSpec optionSpecs[] = {
00015 {"prot", OPTION_BOOLEAN},
00016 {"q", OPTION_STRING},
00017 {"t", OPTION_STRING},
00018 {"minIdentity", OPTION_FLOAT},
00019 {"minScore", OPTION_INT},
00020 {"dots", OPTION_INT},
00021 {"out", OPTION_STRING},
00022 {"maxIntron", OPTION_INT},
00023 {"nohead", OPTION_BOOLEAN},
00024 {NULL, 0}
00025 };
00026
00027
/* Program-wide settings.  The values below are the defaults; main() replaces
 * them with whatever was given on the command line. */

/* Sequence type names for query and target (database). */
char *qType = "dna";
char *tType = "dna";

/* Alignment filters: minimum score and minimum identity in percent. */
int minScore = 30;
double minIdentity = 90.0;

/* Name of the output format handed to gfOutputAny(). */
char *outputFormat = "psl";

/* Print a progress dot every 'dots' query sequences; 0 turns dots off. */
int dots = 0;
00034
00035 void usage()
00036
00037 {
00038 printf(
00039 "gfClient v. %s - A client for the genomic finding program that produces a .psl file\n"
00040 "usage:\n"
00041 " gfClient host port seqDir in.fa out.psl\n"
00042 "where\n"
00043 " host is the name of the machine running the gfServer\n"
00044 " port is the same as you started the gfServer with\n"
00045 " seqDir is the path of the .nib or .2bit files relative to the current dir\n"
00046 " (note these are needed by the client as well as the server)\n"
00047 " in.fa is a fasta format file. May contain multiple records\n"
00048 " out.psl where to put the output\n"
00049 "options:\n"
00050 " -t=type Database type. Type is one of:\n"
00051 " dna - DNA sequence\n"
00052 " prot - protein sequence\n"
00053 " dnax - DNA sequence translated in six frames to protein\n"
00054 " The default is dna\n"
00055 " -q=type Query type. Type is one of:\n"
00056 " dna - DNA sequence\n"
00057 " rna - RNA sequence\n"
00058 " prot - protein sequence\n"
00059 " dnax - DNA sequence translated in six frames to protein\n"
00060 " rnax - DNA sequence translated in three frames to protein\n"
00061 " -prot Synonymous with -d=prot -q=prot\n"
00062 " -dots=N Output a dot every N query sequences\n"
00063 " -nohead Suppresses psl five line header\n"
00064 " -minScore=N sets minimum score. This is twice the matches minus the \n"
00065 " mismatches minus some sort of gap penalty. Default is 30\n"
00066 " -minIdentity=N Sets minimum sequence identity (in percent). Default is\n"
00067 " 90 for nucleotide searches, 25 for protein or translated\n"
00068 " protein searches.\n"
00069 " -out=type Controls output file format. Type is one of:\n"
00070 " psl - Default. Tab separated format without actual sequence\n"
00071 " pslx - Tab separated format with sequence\n"
00072 " axt - blastz-associated axt format\n"
00073 " maf - multiz-associated maf format\n"
00074 " sim4 - similar to sim4 format\n"
00075 " wublast - similar to wublast format\n"
00076 " blast - similar to NCBI blast format\n"
00077 " blast8- NCBI blast tabular format\n"
00078 " blast9 - NCBI blast tabular format with comments\n"
00079 " -maxIntron=N Sets maximum intron size. Default is %d\n",
00080 gfVersion, ffIntronMaxDefault);
00081 exit(-1);
00082 }
00083
00084
/* Output handler for the current run; assigned in gfClient() from the result
 * of gfOutputAny() and then passed to every alignment/output call. */
struct gfOutput *gvo = NULL;
00086
00087 void gfClient(char *hostName, char *portName, char *tSeqDir, char *inName,
00088 char *outName, char *tTypeName, char *qTypeName)
00089
00090 {
00091 struct lineFile *lf = lineFileOpen(inName, TRUE);
00092 static bioSeq seq;
00093 FILE *out = mustOpen(outName, "w");
00094 enum gfType qType = gfTypeFromName(qTypeName);
00095 enum gfType tType = gfTypeFromName(tTypeName);
00096 int dotMod = 0;
00097 char databaseName[256];
00098 struct hash *tFileCache = gfFileCacheNew();
00099
00100 snprintf(databaseName, sizeof(databaseName), "%s:%s", hostName, portName);
00101
00102 gvo = gfOutputAny(outputFormat, round(minIdentity*10), qType == gftProt, tType == gftProt,
00103 optionExists("nohead"), databaseName, 23, 3.0e9, minIdentity, out);
00104 gfOutputHead(gvo, out);
00105 while (faSomeSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name, qType != gftProt))
00106 {
00107 int conn = gfConnect(hostName, portName);
00108 if (dots != 0)
00109 {
00110 if (++dotMod >= dots)
00111 {
00112 dotMod = 0;
00113 fputc('.', stdout);
00114 fflush(stdout);
00115 }
00116 }
00117 if (qType == gftProt && (tType == gftDnaX || tType == gftRnaX))
00118 {
00119 gvo->reportTargetStrand = TRUE;
00120 gfAlignTrans(&conn, tSeqDir, &seq, minScore, tFileCache, gvo);
00121 }
00122 else if ((qType == gftRnaX || qType == gftDnaX) && (tType == gftDnaX || tType == gftRnaX))
00123 {
00124 gvo->reportTargetStrand = TRUE;
00125 gfAlignTransTrans(&conn, tSeqDir, &seq, FALSE, minScore, tFileCache,
00126 gvo, qType == gftRnaX);
00127 if (qType == gftDnaX)
00128 {
00129 reverseComplement(seq.dna, seq.size);
00130 close(conn);
00131 conn = gfConnect(hostName, portName);
00132 gfAlignTransTrans(&conn, tSeqDir, &seq, TRUE, minScore, tFileCache,
00133 gvo, FALSE);
00134 }
00135 }
00136 else if ((tType == gftDna || tType == gftRna) && (qType == gftDna || qType == gftRna))
00137 {
00138 gfAlignStrand(&conn, tSeqDir, &seq, FALSE, minScore, tFileCache, gvo);
00139 conn = gfConnect(hostName, portName);
00140 reverseComplement(seq.dna, seq.size);
00141 gfAlignStrand(&conn, tSeqDir, &seq, TRUE, minScore, tFileCache, gvo);
00142 }
00143 else
00144 {
00145 errAbort("Comparisons between %s queries and %s databases not yet supported",
00146 qTypeName, tTypeName);
00147 }
00148 gfOutputQuery(gvo, out);
00149 }
00150 if (out != stdout)
00151 printf("Output is in %s\n", outName);
00152 gfFileCacheFree(&tFileCache);
00153 }
00154
00155 int main(int argc, char *argv[])
00156
00157 {
00158 optionInit(&argc, argv, optionSpecs);
00159 if (argc != 6)
00160 usage();
00161 if (optionExists("prot"))
00162 qType = tType = "prot";
00163 qType = optionVal("q", qType);
00164 tType = optionVal("t", tType);
00165 if (sameWord(tType, "prot") || sameWord(tType, "dnax") || sameWord(tType, "rnax"))
00166 minIdentity = 25;
00167 minIdentity = optionFloat("minIdentity", minIdentity);
00168 minScore = optionInt("minScore", minScore);
00169 dots = optionInt("dots", 0);
00170 outputFormat = optionVal("out", outputFormat);
00171
00172 setFfIntronMax(optionInt("maxIntron", ffIntronMaxDefault));
00173 gfClient(argv[1], argv[2], argv[3], argv[4], argv[5], tType, qType);
00174 return 0;
00175 }