webBlat/webBlat.c

Go to the documentation of this file.
00001 /* webBlat - CGI Applet for Blat. */
00002 /* Copyright 2004 Jim Kent.  All rights reserved. */
00003 
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "errabort.h"
00008 #include "errCatch.h"
00009 #include "portable.h"
00010 #include "htmshell.h"
00011 #include "dnautil.h"
00012 #include "dnaseq.h"
00013 #include "fa.h"
00014 #include "nib.h"
00015 #include "twoBit.h"
00016 #include "psl.h"
00017 #include "fuzzyFind.h"
00018 #include "cheapcgi.h"
00019 #include "genoFind.h"
00020 #include "gfPcrLib.h"
00021 #include "gfWebLib.h"
00022 
00023 
void usage()
/* Report what this program is and abort.  Called from main() when the
 * program is neither running as a CGI nor given spoofed CGI arguments. */
{
static char *message =
    "webBlat - CGI Applet for Blat\n"
    "This program is not generally meant to be run from the command line.\n";

errAbort("%s", message);
}
00032 
struct gfWebConfig *cfg;        /* Our configuration.  Filled in by main() from
                                 * webBlat.cfg before any page is generated, and
                                 * read by the page-building functions for the
                                 * server lists, tempDir, company and background. */
00034 
00035 struct gfServerAt *findServer(boolean txServer)
00036 /* Find gfServer. */
00037 {
00038 if (txServer)
00039     return gfWebFindServer(cfg->transServerList, "wb_db");
00040 else
00041     return gfWebFindServer(cfg->serverList, "wb_db");
00042 }
00043 
void doHelp()
/* Put up help page.  Currently only a placeholder: it emits a single fixed
 * line through uglyf.  Run inside htmShell by webBlat() when the wb_help
 * CGI variable is present. */
{
uglyf("I'm just not very helpful");
}
00049 
/* Query types that isTxType() treats as needing a translated server. */
char *protQueryMenu[] =
    {
    "Protein",
    "Translated DNA",
    "Translated RNA",
    };

/* Query types offered when only nucleotide servers are configured. */
char *nucQueryMenu[] =
    {
    "DNA",
    "RNA",
    };

/* Query types offered when both kinds of server are configured.  The first
 * entry asks doBlat() to work out from the sequence whether it is DNA or
 * protein. */
char *bothQueryMenu[] =
    {
    "BLAT's Guess",
    "DNA",
    "RNA",
    "Protein",
    "Translated DNA",
    "Translated RNA",
    };

/* Sort orders understood by aliLines(); the first entry is the default. */
char *sortMenu[] =
    {
    "query,score",
    "query,start",
    "chrom,score",
    "chrom,start",
    "score",
    };

/* Output formats understood by aliLines(); the first entry is the default. */
char *outputMenu[] =
    {
    "hyperlink",
    "psl",
    "psl no header",
    };
00056 
00057 boolean isTxType(char *type)
00058 /* Return TRUE if it's a query requiring a translated server type */
00059 {
00060 int i;
00061 for (i=0; i<ArraySize(protQueryMenu); ++i)
00062     {
00063     if (sameWord(type, protQueryMenu[i]))
00064          return TRUE;
00065     }
00066 return FALSE;
00067 }
00068 
int cmpSeqName(char *a, char *b)
/* Compare two sequence names likely to be of form prefix followed by a number,
 * so that for instance chr2 sorts before chr10 and chr1 before chr10.
 * Returns a negative, zero or positive value as a sorts before, the same as,
 * or after b.  Only the sign of the result is meaningful.
 *
 * The names are compared at the first place they differ:
 *   - if both have a number there, the smaller number sorts first;
 *   - a name with a digit there sorts before one without;
 *   - otherwise plain strcmp order is used. */
{
int same = 0;
int aIsNum, bIsNum;

/* Find length of the common prefix. */
while (a[same] != 0 && a[same] == b[same])
    ++same;

/* Digits at the end of the common prefix are part of the numbers being
 * compared: "chr10" vs "chr1" has to be seen as 10 vs 1, not as "0" vs "".
 * Back up so the whole digit run takes part in the comparison. */
while (same > 0 && isdigit((unsigned char)a[same-1]))
    --same;
a += same;
b += same;

/* Cast to unsigned char: passing a negative char to isdigit is undefined. */
aIsNum = isdigit((unsigned char)*a);
bIsNum = isdigit((unsigned char)*b);
if (aIsNum && bIsNum)
    {
    /* Compare the numbers as digit strings rather than through atoi, so
     * arbitrarily long digit runs cannot overflow. */
    char *aNum = a, *bNum = b;
    size_t aLen, bLen;
    int diff;

    while (*aNum == '0')
        ++aNum;
    while (*bNum == '0')
        ++bNum;
    aLen = strspn(aNum, "0123456789");
    bLen = strspn(bNum, "0123456789");
    if (aLen != bLen)
        return (aLen < bLen) ? -1 : 1;     /* More digits means bigger number. */
    diff = strncmp(aNum, bNum, aLen);
    if (diff == 0)
        diff = strcmp(aNum + aLen, bNum + bLen);    /* Same number, compare what follows. */
    if (diff == 0)
        diff = strcmp(a, b);                /* Differ only in leading zeroes. */
    return diff;
    }
if (aIsNum)
    return -1;
if (bIsNum)
    return 1;
return strcmp(a, b);
}
00091 
00092 int pslCmpTargetScore(const void *va, const void *vb)
00093 /* Compare to sort based on target then score. */
00094 {
00095 const struct psl *a = *((struct psl **)va);
00096 const struct psl *b = *((struct psl **)vb);
00097 int diff = cmpSeqName(a->tName, b->tName);
00098 if (diff == 0)
00099     diff = pslScore(b) - pslScore(a);
00100 return diff;
00101 }
00102 
00103 int pslCmpTargetStart(const void *va, const void *vb)
00104 /* Compare to sort based on target start. */
00105 {
00106 const struct psl *a = *((struct psl **)va);
00107 const struct psl *b = *((struct psl **)vb);
00108 int diff = cmpSeqName(a->tName, b->tName);
00109 if (diff == 0)
00110     diff = a->tStart - b->tStart;
00111 return diff;
00112 }
00113 
char *skipFile(char *fileSeq)
/* Given a "file:seq" style string return a pointer to the character just
 * past the first ':'.  A string without a colon is reported through
 * internalErr(); should that return, the input pointer is handed back. */
{
char *colon = strchr(fileSeq, ':');

if (colon == NULL)
    {
    internalErr();
    return fileSeq;
    }
return colon + 1;
}
00126 
void parseFileSeq(char *spec, char **retFile, char **retSeq)
/* Split a "file:seq" spec at its first colon.  *retSeq receives a clone of
 * the part after the colon and *retFile a clone of the part before it. */
{
char *seqPart = skipFile(spec);
int fileLen = seqPart - spec - 1;       /* Characters in front of the ':' */

*retSeq = cloneString(seqPart);
*retFile = cloneStringZ(spec, fileLen);
}
00134 
00135 
00136 void aliLines(char *pslName, char *faName, char *database,  char *type)
00137 /* Show all the places that align. */
00138 {
00139 char *url = "../cgi-bin/webBlat";
00140 char *sort = cgiUsualString("wb_sort", sortMenu[0]);
00141 char *output = cgiUsualString("wb_output", outputMenu[0]);
00142 boolean pslOut = startsWith("psl", output);
00143 struct lineFile *lf = pslFileOpen(pslName);
00144 struct psl *pslList = NULL, *psl, **pslArray;
00145 int i, pslCount = 0;
00146 
00147 while ((psl = pslNext(lf)) != NULL)
00148     {
00149     slAddHead(&pslList, psl);
00150     ++pslCount;
00151     }
00152 lineFileClose(&lf);
00153 slReverse(&pslList);
00154 
00155 if (pslList == NULL)
00156     errAbort("Sorry, no matches found");
00157 
00158 /* Keep an array in unsorted order */
00159 AllocArray(pslArray, pslCount);
00160 for (psl = pslList, i=0; psl != NULL; psl = psl->next, ++i)
00161     pslArray[i] = psl;
00162 
00163 
00164 if (sameString(sort, "query,start"))
00165     {
00166     slSort(&pslList, pslCmpQuery);
00167     }
00168 else if (sameString(sort, "query,score"))
00169     {
00170     slSort(&pslList, pslCmpQueryScore);
00171     }
00172 else if (sameString(sort, "score"))
00173     {
00174     slSort(&pslList, pslCmpScore);
00175     }
00176 else if (sameString(sort, "chrom,start"))
00177     {
00178     slSort(&pslList, pslCmpTargetStart);
00179     }
00180 else if (sameString(sort, "chrom,score"))
00181     {
00182     slSort(&pslList, pslCmpTargetScore);
00183     }
00184 else
00185     {
00186     slSort(&pslList, pslCmpQueryScore);
00187     }
00188 if (pslOut)
00189     {
00190     printf("<TT><PRE>");
00191     if (!sameString(output, "psl no header"))
00192         pslWriteHead(stdout);
00193     for (psl = pslList; psl != NULL; psl = psl->next)
00194         pslTabOut(psl, stdout);
00195     }
00196 else
00197     {
00198     int lineIx = 0;
00199     printf("<H2>Web BLAT Search Results</H2>");
00200     printf("<TT><PRE>");
00201     printf("QUERY           SCORE START  END QSIZE IDENTITY CHRO STRAND  START    END      SPAN\n");
00202     printf("---------------------------------------------------------------------------------------------------\n");
00203     for (psl = pslList; psl != NULL; psl = psl->next)
00204         {
00205         printf("<A HREF=\"%s?wb_qType=%s&wb_psl=%s&wb_fa=%s&wb_doDetailLine=%d&wb_db=%s\">",
00206                 url, type, pslName, faName, ptArrayIx(psl, pslArray, pslCount),
00207                 database);
00208         printf("%-14s %5d %5d %5d %5d %5.1f%%  %4s  %2s  %9d %9d %6d",
00209             psl->qName, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
00210             100.0 - pslCalcMilliBad(psl, TRUE) * 0.1,
00211             skipFile(psl->tName), psl->strand, psl->tStart+1, psl->tEnd,
00212             psl->tEnd - psl->tStart);
00213         printf("</A>\n");
00214         ++lineIx;
00215         }
00216     }
00217 pslFreeList(&pslList);
00218 freez(&pslArray);
00219 printf("</TT></PRE>");
00220 }
00221 
00222 struct dnaSeq *faReadNamedSeq(char *fileName, char *name, boolean isProt)
00223 /* Return DNA sequence corresponding to named fasta record. */
00224 {
00225 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00226 DNA *dna;
00227 int dnaSize;
00228 char *dnaName;
00229 struct dnaSeq *seq = NULL;
00230 while (faSomeSpeedReadNext(lf, &dna, &dnaSize, &dnaName, !isProt))
00231     {
00232     if (sameString(name, dnaName))
00233         {
00234         AllocVar(seq);
00235         seq->name = cloneString(dnaName);
00236         seq->size = dnaSize;
00237         seq->dna = cloneStringZ(dna, dnaSize);
00238         break;
00239         }
00240     }
00241 lineFileClose(&lf);
00242 return seq;
00243 }
00244 
00245 struct dnaSeq *readSeqFrag(char *seqDir, char *fileName, char *seqName, int start, int end)
00246 /* Read in fragment of sequence. */
00247 {
00248 char path[PATH_LEN];
00249 safef(path, sizeof(path), "%s/%s", seqDir, fileName);
00250 if (nibIsFile(path))
00251     {
00252     return nibLoadPart(path, start, end-start);
00253     }
00254 else
00255     {
00256     struct twoBitFile *tbf = twoBitOpen(path);
00257     struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, seqName, start, end);
00258     twoBitClose(&tbf);
00259     return seq;
00260     }
00261 }
00262 
00263 
00264 void doDetailLine()
00265 /* Show alignment details - creating a html frame with
00266  * two pages: index and body. */
00267 {
00268 char *pslFileName = cgiString("wb_psl");
00269 char *faFileName = cgiString("wb_fa");
00270 char *type = cgiString("wb_qType");
00271 boolean isTx = isTxType(type);
00272 struct gfServerAt *server = findServer(isTx);
00273 int pslLineIx = cgiInt("wb_doDetailLine");
00274 int i;
00275 struct lineFile *lf = pslFileOpen(pslFileName);
00276 struct dnaSeq *qSeq = NULL, *tSeq = NULL;
00277 char *tFileName, *tSeqName;
00278 int blockCount;
00279 int tStart, tEnd;
00280 boolean protQuery = sameWord(type, "Protein");
00281 char *bodyName = cloneString(rTempName(cfg->tempDir, "body", ".html"));
00282 char *indexName = cloneString(rTempName(cfg->tempDir, "index", ".html"));
00283 FILE *f;
00284 
00285 /* Read in psl file and find the alignment line we're looking for. */
00286 struct psl *psl = NULL;
00287 for (i=0; i<=pslLineIx; ++i)
00288     {
00289     psl = pslNext(lf);
00290     if (psl == NULL)
00291         errAbort("Expecting at least %d lines, got %d in %s", pslLineIx+1, i+1, pslFileName);
00292     }
00293 lineFileClose(&lf);
00294 
00295 /* Read in fa file and find the query sequence. */
00296 qSeq = faReadNamedSeq(faFileName, psl->qName, protQuery);
00297 if (qSeq == NULL)
00298     errAbort("Couldn't find %s in %s", psl->qName, faFileName);
00299 
00300 
00301 /* Parse out file:seq into file and seq, and load needed part of target seq. */
00302 parseFileSeq(psl->tName, &tFileName, &tSeqName);
00303 tStart = psl->tStart - 120;
00304 if (tStart < 0) tStart = 0;
00305 tEnd = psl->tEnd + 120;
00306 if (tEnd > psl->tSize)
00307     tEnd = psl->tSize;
00308 tSeq = readSeqFrag(server->seqDir, tFileName, tSeqName, tStart, tEnd);
00309 
00310 
00311 /* Write out body. */
00312 f = mustOpen(bodyName, "w");
00313 htmStart(f, psl->qName);
00314 blockCount = pslShowAlignment(psl, protQuery, psl->qName, qSeq, 0, qSeq->size,
00315         tSeqName, tSeq, tStart, tEnd, f);
00316 htmEnd(f);
00317 carefulClose(&f);
00318 chmod(bodyName, 0666);
00319 
00320 /* Write out index. */
00321 f = mustOpen(indexName, "w");
00322 htmStart(f, psl->qName);
00323 fprintf(f, "<H3>Alignment of %s</H3>", psl->qName);
00324 fprintf(f, "<A HREF=\"%s#cDNA\" TARGET=\"body\">%s</A><BR>\n", bodyName, psl->qName);
00325 fprintf(f, "<A HREF=\"%s#genomic\" TARGET=\"body\">%s</A><BR>\n", bodyName, tSeqName);
00326 for (i=1; i<=blockCount; ++i)
00327     {
00328     fprintf(f, "<A HREF=\"%s#%d\" TARGET=\"body\">block%d</A><BR>\n",
00329             bodyName, i, i);
00330     }
00331 fprintf(f, "<A HREF=\"%s#ali\" TARGET=\"body\">together</A><BR>\n", bodyName);
00332 carefulClose(&f);
00333 chmod(indexName, 0666);
00334 
00335 /* Write (to stdout) the main html page containing just the frame info. */
00336 puts("<FRAMESET COLS = \"13%,87% \" >");
00337 printf("  <FRAME SRC=\"%s\" NAME=\"index\">\n", indexName);
00338 printf("  <FRAME SRC=\"%s\" NAME=\"body\">\n", bodyName);
00339 puts("<NOFRAMES><BODY></BODY></NOFRAMES>");
00340 puts("</FRAMESET>");
00341 puts("</HTML>\n");
00342 }
00343 
00344 
00345 void doBlat()
00346 /* Do actual blatting */
00347 {
00348 char *seqText = cgiString("wb_seq");
00349 bioSeq *seqList, *seq;
00350 char *type = NULL;
00351 boolean txServer = FALSE, protQuery = FALSE;
00352 struct gfServerAt *server;
00353 int conn;
00354 FILE *f;
00355 struct gfOutput *gvo;
00356 struct hash *tFileCache = gfFileCacheNew();
00357 char *pslName = cloneString(rTempName(cfg->tempDir, "wb", ".psl"));
00358 char *faName = cloneString(rTempName(cfg->tempDir, "wb", ".fa"));
00359 
00360 /* Figure out type and if it is a protein or a DNA based query */
00361 type = cgiUsualString("wb_qType", bothQueryMenu[0]);
00362 
00363 /* Get sequence from control into memory and also saved as fasta file. */
00364 if (sameWord(type, bothQueryMenu[0]))
00365     {
00366     seqList = faSeqListFromMemText(seqText, FALSE);
00367     if (seqList == NULL)
00368         errAbort("Please go back and paste in a sequence");
00369     if (seqIsDna(seqList))
00370         {
00371         for (seq = seqList; seq != NULL; seq = seq->next)
00372             {
00373             toLowerN(seq->dna, seq->size);
00374             dnaFilterToN(seq->dna, seq->dna);
00375             }
00376         }
00377     else
00378         {
00379         protQuery = TRUE;
00380         type = "Protein";
00381         }
00382     }
00383 else
00384     {
00385     protQuery = sameWord(type, "Protein");
00386     seqList = faSeqListFromMemText(seqText, !protQuery);
00387     if (seqList == NULL)
00388         errAbort("Please go back and paste in a sequence");
00389     }
00390 if (seqList->name == NULL || seqList->name[0] == 0)
00391     {
00392     freez(&seqList->name);
00393     seqList->name = cloneString("query");
00394     }
00395 faWriteAll(faName, seqList);
00396 
00397 /* Set up output for blat. */
00398 f = mustOpen(pslName, "w");
00399 gvo = gfOutputPsl(0, protQuery, FALSE, f, FALSE, TRUE);
00400 gvo->includeTargetFile = TRUE;
00401 
00402 txServer = isTxType(type);
00403 server = findServer(txServer);
00404 
00405 /* Loop through sequences doing alignments and saving to file. */
00406 for (seq = seqList; seq != NULL; seq = seq->next)
00407     {
00408     conn = gfConnect(server->host, server->port);
00409     if (txServer)
00410         {
00411         gvo->reportTargetStrand = TRUE;
00412         if (protQuery)
00413             {
00414             gfAlignTrans(&conn, server->seqDir, seq, 5, tFileCache, gvo);
00415             }
00416         else
00417             {
00418             boolean isRna = sameWord(type, "RNA");
00419             gfAlignTransTrans(&conn, server->seqDir, seq, FALSE, 5,
00420                             tFileCache, gvo, isRna);
00421             if (!isRna)
00422                 {
00423                 reverseComplement(seq->dna, seq->size);
00424                 conn = gfConnect(server->host, server->port);
00425                 gfAlignTransTrans(&conn, server->seqDir, seq, TRUE, 5,
00426                                         tFileCache, gvo, FALSE);
00427                 }
00428             }
00429         }
00430     else
00431         {
00432         gfAlignStrand(&conn, server->seqDir, seq, FALSE, 16, tFileCache, gvo);
00433         reverseComplement(seq->dna, seq->size);
00434         conn = gfConnect(server->host, server->port);
00435         gfAlignStrand(&conn, server->seqDir, seq, TRUE, 16, tFileCache, gvo);
00436         }
00437     gfOutputQuery(gvo, f);
00438     }
00439 carefulClose(&f);
00440 
00441 /* Display alignment results. */
00442 aliLines(pslName, faName, server->name, type);
00443 }
00444 
00445 void doGetSeq()
00446 /* Put up form that asks them to submit sequence. */
00447 {
00448 char *qType = NULL;
00449 char **queryMenu = NULL;
00450 int queryMenuSize = 0;
00451 struct gfServerAt *serverList = NULL, *server;
00452 
00453 
00454 printf("<H1>%s Web BLAT</H1>", cfg->company);
00455 printf("<FORM ACTION=\"../cgi-bin/webBlat\" METHOD=\"POST\">\n");
00456 
00457 /* Figure out whether we do nucleotide, translated, or both. */
00458 if (cfg->serverList != NULL && cfg->transServerList != NULL)
00459     {
00460     queryMenu = bothQueryMenu;
00461     queryMenuSize = ArraySize(bothQueryMenu);
00462     serverList = cfg->serverList;
00463     }
00464 else if (cfg->serverList != NULL)
00465     {
00466     queryMenu = nucQueryMenu;
00467     queryMenuSize = ArraySize(nucQueryMenu);
00468     serverList = cfg->serverList;
00469     }
00470 else if (cfg->transServerList != NULL)
00471     {
00472     queryMenu = protQueryMenu;
00473     queryMenuSize = ArraySize(protQueryMenu);
00474     serverList = cfg->transServerList;
00475     }
00476 else
00477     {
00478     errAbort("No servers configured!");
00479     }
00480 
00481 /* Put up database control */
00482 printf("Database: ");
00483 printf("<SELECT NAME=\"wb_db\">\n");
00484 printf("  <OPTION SELECTED>%s</OPTION>\n", serverList->name);
00485 for (server = serverList->next; server != NULL; server = server->next)
00486     printf("  <OPTION>%s</OPTION>\n", server->name);
00487 printf("</SELECT>\n");
00488 
00489 /* Put up query type control. */
00490 qType = cgiUsualString("wb_qType", queryMenu[0]);
00491 printf("Query Type: ");
00492 cgiMakeDropList("wb_qType", queryMenu, queryMenuSize, qType);
00493 printf("<BR>\n");
00494 
00495 /* Put up sort and output type controls. */
00496 printf("Sort By: ");
00497 cgiMakeDropList("wb_sort", sortMenu, ArraySize(sortMenu), sortMenu[0]);
00498 printf("Output: ");
00499 cgiMakeDropList("wb_output", outputMenu, ArraySize(outputMenu), outputMenu[0]);
00500 cgiMakeButton("Submit", "Submit");
00501 printf("<BR>\n");
00502 
00503 cgiMakeTextArea("wb_seq", "", 10, 60);
00504 printf("<BR>\n");
00505 
00506 printf("Please paste in some sequence and press submit. You can submit multiple "
00507        "sequences at once if they are in fasta format (where each sequence has "
00508        "a header line that starts with > and contains the name of the sequence)");
00509 
00510 printf("</FORM>");
00511 }
00512 
00513 void webBlat()
00514 /* Parse out CGI variables and decide which page to put up. */
00515 {
00516 if (cgiVarExists("wb_help"))
00517     htmShell("Web BLAT Help", doHelp, NULL);
00518 else if (cgiVarExists("wb_seq"))
00519     htmShell("Web BLAT Results", doBlat, NULL);
00520 else if (cgiVarExists("wb_doDetailLine"))
00521     {
00522     puts("Content-Type:text/html");
00523     puts("\n");
00524     doDetailLine();
00525     }
00526 else
00527     htmShell("Web BLAT", doGetSeq, NULL);
00528 }
00529 
00530 int main(int argc, char *argv[])
00531 /* Process command line. */
00532 {
00533 boolean isFromWeb = cgiIsOnWeb();
00534 htmlPushEarlyHandlers();
00535 dnaUtilOpen();
00536 if (!isFromWeb && !cgiSpoof(&argc, argv))
00537     usage();
00538 cfg = gfWebConfigRead("webBlat.cfg");
00539 if (cfg->tempDir == NULL)
00540     errAbort("No tempDir set in webBlat.cfg");
00541 if (cfg->background != NULL)
00542     htmlSetBackground(cfg->background);
00543 webBlat();
00544 return 0;
00545 }

Generated on Tue Dec 25 18:39:32 2007 for blat by  doxygen 1.5.2