00001
00002
00003
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "errabort.h"
00008 #include "errCatch.h"
00009 #include "portable.h"
00010 #include "htmshell.h"
00011 #include "dnautil.h"
00012 #include "dnaseq.h"
00013 #include "fa.h"
00014 #include "nib.h"
00015 #include "twoBit.h"
00016 #include "psl.h"
00017 #include "fuzzyFind.h"
00018 #include "cheapcgi.h"
00019 #include "genoFind.h"
00020 #include "gfPcrLib.h"
00021 #include "gfWebLib.h"
00022
00023
void usage()
/* Explain what this program is and abort; called when it is started
 * outside of a web server and no CGI input could be spoofed. */
{
errAbort(
    "webBlat - CGI Applet for Blat\n"
    "This program is not generally meant to be run from the command line.\n"
    );
}
00032
/* Program-wide configuration.  Filled in by main() from webBlat.cfg before
 * any of the page handlers run. */
struct gfWebConfig *cfg = NULL;
00034
00035 struct gfServerAt *findServer(boolean txServer)
00036
00037 {
00038 if (txServer)
00039 return gfWebFindServer(cfg->transServerList, "wb_db");
00040 else
00041 return gfWebFindServer(cfg->serverList, "wb_db");
00042 }
00043
void doHelp()
/* Body of the help page (shown when wb_help is present).  Currently just
 * a placeholder message. */
{
uglyf(
    "I'm just not very helpful"
    );
}
00049
/* Query types offered when only translated servers are configured.
 * isTxType() treats exactly these as translated query types. */
char *protQueryMenu[] =
    {
    "Protein",
    "Translated DNA",
    "Translated RNA"
    };

/* Query types offered when only untranslated servers are configured. */
char *nucQueryMenu[] =
    {
    "DNA",
    "RNA"
    };

/* Query types offered when both kinds of server are configured.  The first
 * entry asks the program to guess the type from the sequence itself. */
char *bothQueryMenu[] =
    {
    "BLAT's Guess",
    "DNA",
    "RNA",
    "Protein",
    "Translated DNA",
    "Translated RNA"
    };

/* Choices for the wb_sort variable; the first entry is the default. */
char *sortMenu[] =
    {
    "query,score",
    "query,start",
    "chrom,score",
    "chrom,start",
    "score"
    };

/* Choices for the wb_output variable; the first entry is the default. */
char *outputMenu[] =
    {
    "hyperlink",
    "psl",
    "psl no header"
    };
00056
00057 boolean isTxType(char *type)
00058
00059 {
00060 int i;
00061 for (i=0; i<ArraySize(protQueryMenu); ++i)
00062 {
00063 if (sameWord(type, protQueryMenu[i]))
00064 return TRUE;
00065 }
00066 return FALSE;
00067 }
00068
int cmpSeqName(char *a, char *b)
/* Compare two sequence names so that numbers embedded in them sort
 * numerically: "chr2" comes before "chr10" and "chr1" before "chr10".
 * Returns a negative, zero or positive value like strcmp.  Where the names
 * first differ, a name that continues with a digit sorts before one that
 * does not.  Only identical names compare as equal. */
{
char *aStart = a;
int aIsNum, bIsNum;

/* Skip the part that the two names have in common. */
while (*a != 0 && *a == *b)
    {
    ++a;
    ++b;
    }

/* If the names diverge inside (or right after) a run of digits, back up
 * to the start of that run so whole numbers are compared rather than only
 * their differing tails.  Without this "chr100" would sort before "chr19"
 * and "chr10" before "chr1". */
while (a > aStart && isdigit((unsigned char)a[-1]))
    {
    --a;
    --b;
    }

/* Cast to unsigned char: passing a negative char to isdigit is undefined. */
aIsNum = (isdigit((unsigned char)*a) != 0);
bIsNum = (isdigit((unsigned char)*b) != 0);
if (aIsNum != bIsNum)
    return aIsNum ? -1 : 1;

if (aIsNum)
    {
    /* Compare the two digit runs by value without converting them to
     * int, so arbitrarily long numbers cannot overflow: after dropping
     * leading zeroes the longer run is the larger number, and runs of
     * equal length compare lexically. */
    char *aDigits = a, *bDigits = b;
    size_t aLen, bLen;
    int diff;

    while (*aDigits == '0')
        ++aDigits;
    while (*bDigits == '0')
        ++bDigits;
    aLen = strspn(aDigits, "0123456789");
    bLen = strspn(bDigits, "0123456789");
    if (aLen != bLen)
        return (aLen < bLen) ? -1 : 1;
    diff = strncmp(aDigits, bDigits, aLen);
    if (diff != 0)
        return diff;
    }

/* Same number (or no number at all): plain string order decides. */
return strcmp(a, b);
}
00091
00092 int pslCmpTargetScore(const void *va, const void *vb)
00093
00094 {
00095 const struct psl *a = *((struct psl **)va);
00096 const struct psl *b = *((struct psl **)vb);
00097 int diff = cmpSeqName(a->tName, b->tName);
00098 if (diff == 0)
00099 diff = pslScore(b) - pslScore(a);
00100 return diff;
00101 }
00102
00103 int pslCmpTargetStart(const void *va, const void *vb)
00104
00105 {
00106 const struct psl *a = *((struct psl **)va);
00107 const struct psl *b = *((struct psl **)vb);
00108 int diff = cmpSeqName(a->tName, b->tName);
00109 if (diff == 0)
00110 diff = a->tStart - b->tStart;
00111 return diff;
00112 }
00113
char *skipFile(char *fileSeq)
/* Given a string of the form file:seq return a pointer to the part just
 * past the first ':'.  Calls internalErr() if there is no ':' (and hands
 * back the whole string should that call return). */
{
char *colon = strchr(fileSeq, ':');

if (colon == NULL)
    {
    internalErr();
    return fileSeq;
    }
return colon + 1;
}
00126
void parseFileSeq(char *spec, char **retFile, char **retSeq)
/* Split a file:seq spec at its first ':'.  *retFile gets a copy of the
 * part before the colon and *retSeq a copy of the part after it; both are
 * freshly cloned strings. */
{
char *seqPart = skipFile(spec);
int fileLen = seqPart - spec - 1;    /* length of spec up to, not including, the ':' */

*retFile = cloneStringZ(spec, fileLen);
*retSeq = cloneString(seqPart);
}
00134
00135
00136 void aliLines(char *pslName, char *faName, char *database, char *type)
00137
00138 {
00139 char *url = "../cgi-bin/webBlat";
00140 char *sort = cgiUsualString("wb_sort", sortMenu[0]);
00141 char *output = cgiUsualString("wb_output", outputMenu[0]);
00142 boolean pslOut = startsWith("psl", output);
00143 struct lineFile *lf = pslFileOpen(pslName);
00144 struct psl *pslList = NULL, *psl, **pslArray;
00145 int i, pslCount = 0;
00146
00147 while ((psl = pslNext(lf)) != NULL)
00148 {
00149 slAddHead(&pslList, psl);
00150 ++pslCount;
00151 }
00152 lineFileClose(&lf);
00153 slReverse(&pslList);
00154
00155 if (pslList == NULL)
00156 errAbort("Sorry, no matches found");
00157
00158
00159 AllocArray(pslArray, pslCount);
00160 for (psl = pslList, i=0; psl != NULL; psl = psl->next, ++i)
00161 pslArray[i] = psl;
00162
00163
00164 if (sameString(sort, "query,start"))
00165 {
00166 slSort(&pslList, pslCmpQuery);
00167 }
00168 else if (sameString(sort, "query,score"))
00169 {
00170 slSort(&pslList, pslCmpQueryScore);
00171 }
00172 else if (sameString(sort, "score"))
00173 {
00174 slSort(&pslList, pslCmpScore);
00175 }
00176 else if (sameString(sort, "chrom,start"))
00177 {
00178 slSort(&pslList, pslCmpTargetStart);
00179 }
00180 else if (sameString(sort, "chrom,score"))
00181 {
00182 slSort(&pslList, pslCmpTargetScore);
00183 }
00184 else
00185 {
00186 slSort(&pslList, pslCmpQueryScore);
00187 }
00188 if (pslOut)
00189 {
00190 printf("<TT><PRE>");
00191 if (!sameString(output, "psl no header"))
00192 pslWriteHead(stdout);
00193 for (psl = pslList; psl != NULL; psl = psl->next)
00194 pslTabOut(psl, stdout);
00195 }
00196 else
00197 {
00198 int lineIx = 0;
00199 printf("<H2>Web BLAT Search Results</H2>");
00200 printf("<TT><PRE>");
00201 printf("QUERY SCORE START END QSIZE IDENTITY CHRO STRAND START END SPAN\n");
00202 printf("---------------------------------------------------------------------------------------------------\n");
00203 for (psl = pslList; psl != NULL; psl = psl->next)
00204 {
00205 printf("<A HREF=\"%s?wb_qType=%s&wb_psl=%s&wb_fa=%s&wb_doDetailLine=%d&wb_db=%s\">",
00206 url, type, pslName, faName, ptArrayIx(psl, pslArray, pslCount),
00207 database);
00208 printf("%-14s %5d %5d %5d %5d %5.1f%% %4s %2s %9d %9d %6d",
00209 psl->qName, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
00210 100.0 - pslCalcMilliBad(psl, TRUE) * 0.1,
00211 skipFile(psl->tName), psl->strand, psl->tStart+1, psl->tEnd,
00212 psl->tEnd - psl->tStart);
00213 printf("</A>\n");
00214 ++lineIx;
00215 }
00216 }
00217 pslFreeList(&pslList);
00218 freez(&pslArray);
00219 printf("</TT></PRE>");
00220 }
00221
00222 struct dnaSeq *faReadNamedSeq(char *fileName, char *name, boolean isProt)
00223
00224 {
00225 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00226 DNA *dna;
00227 int dnaSize;
00228 char *dnaName;
00229 struct dnaSeq *seq = NULL;
00230 while (faSomeSpeedReadNext(lf, &dna, &dnaSize, &dnaName, !isProt))
00231 {
00232 if (sameString(name, dnaName))
00233 {
00234 AllocVar(seq);
00235 seq->name = cloneString(dnaName);
00236 seq->size = dnaSize;
00237 seq->dna = cloneStringZ(dna, dnaSize);
00238 break;
00239 }
00240 }
00241 lineFileClose(&lf);
00242 return seq;
00243 }
00244
00245 struct dnaSeq *readSeqFrag(char *seqDir, char *fileName, char *seqName, int start, int end)
00246
00247 {
00248 char path[PATH_LEN];
00249 safef(path, sizeof(path), "%s/%s", seqDir, fileName);
00250 if (nibIsFile(path))
00251 {
00252 return nibLoadPart(path, start, end-start);
00253 }
00254 else
00255 {
00256 struct twoBitFile *tbf = twoBitOpen(path);
00257 struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, seqName, start, end);
00258 twoBitClose(&tbf);
00259 return seq;
00260 }
00261 }
00262
00263
00264 void doDetailLine()
00265
00266
00267 {
00268 char *pslFileName = cgiString("wb_psl");
00269 char *faFileName = cgiString("wb_fa");
00270 char *type = cgiString("wb_qType");
00271 boolean isTx = isTxType(type);
00272 struct gfServerAt *server = findServer(isTx);
00273 int pslLineIx = cgiInt("wb_doDetailLine");
00274 int i;
00275 struct lineFile *lf = pslFileOpen(pslFileName);
00276 struct dnaSeq *qSeq = NULL, *tSeq = NULL;
00277 char *tFileName, *tSeqName;
00278 int blockCount;
00279 int tStart, tEnd;
00280 boolean protQuery = sameWord(type, "Protein");
00281 char *bodyName = cloneString(rTempName(cfg->tempDir, "body", ".html"));
00282 char *indexName = cloneString(rTempName(cfg->tempDir, "index", ".html"));
00283 FILE *f;
00284
00285
00286 struct psl *psl = NULL;
00287 for (i=0; i<=pslLineIx; ++i)
00288 {
00289 psl = pslNext(lf);
00290 if (psl == NULL)
00291 errAbort("Expecting at least %d lines, got %d in %s", pslLineIx+1, i+1, pslFileName);
00292 }
00293 lineFileClose(&lf);
00294
00295
00296 qSeq = faReadNamedSeq(faFileName, psl->qName, protQuery);
00297 if (qSeq == NULL)
00298 errAbort("Couldn't find %s in %s", psl->qName, faFileName);
00299
00300
00301
00302 parseFileSeq(psl->tName, &tFileName, &tSeqName);
00303 tStart = psl->tStart - 120;
00304 if (tStart < 0) tStart = 0;
00305 tEnd = psl->tEnd + 120;
00306 if (tEnd > psl->tSize)
00307 tEnd = psl->tSize;
00308 tSeq = readSeqFrag(server->seqDir, tFileName, tSeqName, tStart, tEnd);
00309
00310
00311
00312 f = mustOpen(bodyName, "w");
00313 htmStart(f, psl->qName);
00314 blockCount = pslShowAlignment(psl, protQuery, psl->qName, qSeq, 0, qSeq->size,
00315 tSeqName, tSeq, tStart, tEnd, f);
00316 htmEnd(f);
00317 carefulClose(&f);
00318 chmod(bodyName, 0666);
00319
00320
00321 f = mustOpen(indexName, "w");
00322 htmStart(f, psl->qName);
00323 fprintf(f, "<H3>Alignment of %s</H3>", psl->qName);
00324 fprintf(f, "<A HREF=\"%s#cDNA\" TARGET=\"body\">%s</A><BR>\n", bodyName, psl->qName);
00325 fprintf(f, "<A HREF=\"%s#genomic\" TARGET=\"body\">%s</A><BR>\n", bodyName, tSeqName);
00326 for (i=1; i<=blockCount; ++i)
00327 {
00328 fprintf(f, "<A HREF=\"%s#%d\" TARGET=\"body\">block%d</A><BR>\n",
00329 bodyName, i, i);
00330 }
00331 fprintf(f, "<A HREF=\"%s#ali\" TARGET=\"body\">together</A><BR>\n", bodyName);
00332 carefulClose(&f);
00333 chmod(indexName, 0666);
00334
00335
00336 puts("<FRAMESET COLS = \"13%,87% \" >");
00337 printf(" <FRAME SRC=\"%s\" NAME=\"index\">\n", indexName);
00338 printf(" <FRAME SRC=\"%s\" NAME=\"body\">\n", bodyName);
00339 puts("<NOFRAMES><BODY></BODY></NOFRAMES>");
00340 puts("</FRAMESET>");
00341 puts("</HTML>\n");
00342 }
00343
00344
00345 void doBlat()
00346
00347 {
00348 char *seqText = cgiString("wb_seq");
00349 bioSeq *seqList, *seq;
00350 char *type = NULL;
00351 boolean txServer = FALSE, protQuery = FALSE;
00352 struct gfServerAt *server;
00353 int conn;
00354 FILE *f;
00355 struct gfOutput *gvo;
00356 struct hash *tFileCache = gfFileCacheNew();
00357 char *pslName = cloneString(rTempName(cfg->tempDir, "wb", ".psl"));
00358 char *faName = cloneString(rTempName(cfg->tempDir, "wb", ".fa"));
00359
00360
00361 type = cgiUsualString("wb_qType", bothQueryMenu[0]);
00362
00363
00364 if (sameWord(type, bothQueryMenu[0]))
00365 {
00366 seqList = faSeqListFromMemText(seqText, FALSE);
00367 if (seqList == NULL)
00368 errAbort("Please go back and paste in a sequence");
00369 if (seqIsDna(seqList))
00370 {
00371 for (seq = seqList; seq != NULL; seq = seq->next)
00372 {
00373 toLowerN(seq->dna, seq->size);
00374 dnaFilterToN(seq->dna, seq->dna);
00375 }
00376 }
00377 else
00378 {
00379 protQuery = TRUE;
00380 type = "Protein";
00381 }
00382 }
00383 else
00384 {
00385 protQuery = sameWord(type, "Protein");
00386 seqList = faSeqListFromMemText(seqText, !protQuery);
00387 if (seqList == NULL)
00388 errAbort("Please go back and paste in a sequence");
00389 }
00390 if (seqList->name == NULL || seqList->name[0] == 0)
00391 {
00392 freez(&seqList->name);
00393 seqList->name = cloneString("query");
00394 }
00395 faWriteAll(faName, seqList);
00396
00397
00398 f = mustOpen(pslName, "w");
00399 gvo = gfOutputPsl(0, protQuery, FALSE, f, FALSE, TRUE);
00400 gvo->includeTargetFile = TRUE;
00401
00402 txServer = isTxType(type);
00403 server = findServer(txServer);
00404
00405
00406 for (seq = seqList; seq != NULL; seq = seq->next)
00407 {
00408 conn = gfConnect(server->host, server->port);
00409 if (txServer)
00410 {
00411 gvo->reportTargetStrand = TRUE;
00412 if (protQuery)
00413 {
00414 gfAlignTrans(&conn, server->seqDir, seq, 5, tFileCache, gvo);
00415 }
00416 else
00417 {
00418 boolean isRna = sameWord(type, "RNA");
00419 gfAlignTransTrans(&conn, server->seqDir, seq, FALSE, 5,
00420 tFileCache, gvo, isRna);
00421 if (!isRna)
00422 {
00423 reverseComplement(seq->dna, seq->size);
00424 conn = gfConnect(server->host, server->port);
00425 gfAlignTransTrans(&conn, server->seqDir, seq, TRUE, 5,
00426 tFileCache, gvo, FALSE);
00427 }
00428 }
00429 }
00430 else
00431 {
00432 gfAlignStrand(&conn, server->seqDir, seq, FALSE, 16, tFileCache, gvo);
00433 reverseComplement(seq->dna, seq->size);
00434 conn = gfConnect(server->host, server->port);
00435 gfAlignStrand(&conn, server->seqDir, seq, TRUE, 16, tFileCache, gvo);
00436 }
00437 gfOutputQuery(gvo, f);
00438 }
00439 carefulClose(&f);
00440
00441
00442 aliLines(pslName, faName, server->name, type);
00443 }
00444
00445 void doGetSeq()
00446
00447 {
00448 char *qType = NULL;
00449 char **queryMenu = NULL;
00450 int queryMenuSize = 0;
00451 struct gfServerAt *serverList = NULL, *server;
00452
00453
00454 printf("<H1>%s Web BLAT</H1>", cfg->company);
00455 printf("<FORM ACTION=\"../cgi-bin/webBlat\" METHOD=\"POST\">\n");
00456
00457
00458 if (cfg->serverList != NULL && cfg->transServerList != NULL)
00459 {
00460 queryMenu = bothQueryMenu;
00461 queryMenuSize = ArraySize(bothQueryMenu);
00462 serverList = cfg->serverList;
00463 }
00464 else if (cfg->serverList != NULL)
00465 {
00466 queryMenu = nucQueryMenu;
00467 queryMenuSize = ArraySize(nucQueryMenu);
00468 serverList = cfg->serverList;
00469 }
00470 else if (cfg->transServerList != NULL)
00471 {
00472 queryMenu = protQueryMenu;
00473 queryMenuSize = ArraySize(protQueryMenu);
00474 serverList = cfg->transServerList;
00475 }
00476 else
00477 {
00478 errAbort("No servers configured!");
00479 }
00480
00481
00482 printf("Database: ");
00483 printf("<SELECT NAME=\"wb_db\">\n");
00484 printf(" <OPTION SELECTED>%s</OPTION>\n", serverList->name);
00485 for (server = serverList->next; server != NULL; server = server->next)
00486 printf(" <OPTION>%s</OPTION>\n", server->name);
00487 printf("</SELECT>\n");
00488
00489
00490 qType = cgiUsualString("wb_qType", queryMenu[0]);
00491 printf("Query Type: ");
00492 cgiMakeDropList("wb_qType", queryMenu, queryMenuSize, qType);
00493 printf("<BR>\n");
00494
00495
00496 printf("Sort By: ");
00497 cgiMakeDropList("wb_sort", sortMenu, ArraySize(sortMenu), sortMenu[0]);
00498 printf("Output: ");
00499 cgiMakeDropList("wb_output", outputMenu, ArraySize(outputMenu), outputMenu[0]);
00500 cgiMakeButton("Submit", "Submit");
00501 printf("<BR>\n");
00502
00503 cgiMakeTextArea("wb_seq", "", 10, 60);
00504 printf("<BR>\n");
00505
00506 printf("Please paste in some sequence and press submit. You can submit multiple "
00507 "sequences at once if they are in fasta format (where each sequence has "
00508 "a header line that starts with > and contains the name of the sequence)");
00509
00510 printf("</FORM>");
00511 }
00512
00513 void webBlat()
00514
00515 {
00516 if (cgiVarExists("wb_help"))
00517 htmShell("Web BLAT Help", doHelp, NULL);
00518 else if (cgiVarExists("wb_seq"))
00519 htmShell("Web BLAT Results", doBlat, NULL);
00520 else if (cgiVarExists("wb_doDetailLine"))
00521 {
00522 puts("Content-Type:text/html");
00523 puts("\n");
00524 doDetailLine();
00525 }
00526 else
00527 htmShell("Web BLAT", doGetSeq, NULL);
00528 }
00529
00530 int main(int argc, char *argv[])
00531
00532 {
00533 boolean isFromWeb = cgiIsOnWeb();
00534 htmlPushEarlyHandlers();
00535 dnaUtilOpen();
00536 if (!isFromWeb && !cgiSpoof(&argc, argv))
00537 usage();
00538 cfg = gfWebConfigRead("webBlat.cfg");
00539 if (cfg->tempDir == NULL)
00540 errAbort("No tempDir set in webBlat.cfg");
00541 if (cfg->background != NULL)
00542 htmlSetBackground(cfg->background);
00543 webBlat();
00544 return 0;
00545 }