00001
00002
00003 #include "common.h"
00004 #include "dystring.h"
00005 #include "linefile.h"
00006 #include "dnautil.h"
00007 #include "blastParse.h"
00008 #include "verbose.h"
00009
00010 #define TRACE_LEVEL 3
00011 #define DUMP_LEVEL 4
00012
00013 struct blastFile *blastFileReadAll(char *fileName)
00014
00015 {
00016 struct blastFile *bf;
00017 struct blastQuery *bq;
00018
00019 bf = blastFileOpenVerify(fileName);
00020 while ((bq = blastFileNextQuery(bf)) != NULL)
00021 {
00022 slAddHead(&bf->queries, bq);
00023 }
00024 slReverse(&bf->queries);
00025 lineFileClose(&bf->lf);
00026 return bf;
00027 }
00028
00029 static void bfError(struct blastFile *bf, char *message)
00030
00031 {
00032 errAbort("%s\nLine %d of %s", message,
00033 bf->lf->lineIx, bf->fileName);
00034 }
00035
00036 static void bfUnexpectedEof(struct blastFile *bf)
00037
00038 {
00039 errAbort("Unexpected end of file in %s\n", bf->fileName);
00040 }
00041
00042 static void bfSyntax(struct blastFile *bf)
00043
00044 {
00045 bfError(bf, "Can't cope with BLAST output syntax");
00046 }
00047
00048 static char *bfNextLine(struct blastFile *bf)
00049
00050 {
00051 char *line = NULL;
00052 if (lineFileNext(bf->lf, &line, NULL))
00053 {
00054 verbose(TRACE_LEVEL, "=> %s\n", line);
00055 return line;
00056 }
00057 else
00058 {
00059 verbose(TRACE_LEVEL, "=> EOF\n");
00060 return NULL;
00061 }
00062 }
00063
00064 static boolean bfSkipBlankLines(struct blastFile *bf)
00065
00066 {
00067 char *line = NULL;
00068 while ((line = bfNextLine(bf)) != NULL)
00069 {
00070 if (skipLeadingSpaces(line)[0] != '\0')
00071 {
00072 lineFileReuse(bf->lf);
00073 return TRUE;
00074 }
00075 }
00076 return FALSE;
00077 }
00078
00079 static char *bfNeedNextLine(struct blastFile *bf)
00080
00081 {
00082 char *line = bfNextLine(bf);
00083 if (line == NULL)
00084 bfUnexpectedEof(bf);
00085 return line;
00086 }
00087
00088 static char *bfSearchForLine(struct blastFile *bf, char *start)
00089
00090 {
00091 for (;;)
00092 {
00093 char *line = bfNextLine(bf);
00094 if (line == NULL)
00095 return NULL;
00096 if (startsWith(start, line))
00097 return line;
00098 }
00099 }
00100
00101 static void bfBadHeader(struct blastFile *bf)
00102
00103 {
00104 bfError(bf, "Bad first line\nShould be something like:\n"
00105 "BLASTN 2.0.11 [Jan-20-2000]");
00106 }
00107
00108 struct blastFile *blastFileOpenVerify(char *fileName)
00109
00110 {
00111 struct blastFile *bf;
00112 char *line;
00113 char *words[16];
00114 int wordCount;
00115 struct lineFile *lf;
00116
00117 AllocVar(bf);
00118 bf->lf = lf = lineFileOpen(fileName, TRUE);
00119 bf->fileName = cloneString(fileName);
00120
00121
00122 line = bfNeedNextLine(bf);
00123 wordCount = chopLine(line, words);
00124 if (wordCount < 3)
00125 bfBadHeader(bf);
00126 bf->program = cloneString(words[0]);
00127 bf->version = cloneString(words[1]);
00128 bf->buildDate = cloneString(words[2]);
00129 if (!wildMatch("*BLAST*", bf->program))
00130 bfBadHeader(bf);
00131 if (!isdigit(bf->version[0]))
00132 bfBadHeader(bf);
00133 if (bf->buildDate[0] != '[')
00134 bfBadHeader(bf);
00135 return bf;
00136 }
00137
00138 void decomma(char *s)
00139
00140 {
00141 char *d = s;
00142 char c;
00143
00144 for (;;)
00145 {
00146 c = *s++;
00147 if (c != ',')
00148 *d++ = c;
00149 if (c == 0)
00150 break;
00151 }
00152 }
00153
00154 static void parseQueryLines(struct blastFile *bf, char *line, struct blastQuery *bq)
00155
00156 {
00157 char *s, *e;
00158 char *words[16];
00159 int wordCount;
00160 if (bq->query != NULL)
00161 bfError(bf, "already parse Query=");
00162
00163
00164
00165
00166 wordCount = chopLine(line, words);
00167 if (wordCount < 2)
00168 bfError(bf, "No sequence name in query line");
00169 bq->query = cloneString(words[1]);
00170
00171 for (;;)
00172 {
00173 line = bfNeedNextLine(bf);
00174 s = skipLeadingSpaces(line);
00175 if (s[0] == '(')
00176 break;
00177 }
00178 if (!isdigit(s[1]))
00179 {
00180 bfError(bf, "expecting something like:\n"
00181 " (45,693 letters)");
00182 }
00183 s += 1;
00184 if ((e = strchr(s, ' ')) == NULL)
00185 {
00186 bfError(bf, "expecting something like:\n"
00187 " (45,693 letters)");
00188 }
00189 *e = 0;
00190 decomma(s);
00191 bq->queryBaseCount = atoi(s);
00192 }
00193
00194 static void parseDatabaseLines(struct blastFile *bf, char *line, struct blastQuery *bq)
00195
00196
00197
00198
00199 {
00200 static struct dyString *tmpBuf = NULL;
00201 char *words[16];
00202 int wordCount;
00203 if (bq->database != NULL)
00204 bfError(bf, "already parse Database:");
00205
00206 if (tmpBuf == NULL)
00207 tmpBuf = dyStringNew(512);
00208
00209
00210
00211
00212
00213
00214 wordCount = chopLine(line, words);
00215 if (wordCount < 2)
00216 bfError(bf, "Expecting database name");
00217 dyStringClear(tmpBuf);
00218 dyStringAppend(tmpBuf, words[1]);
00219 while (line = bfNeedNextLine(bf), !isspace(line[0]))
00220 {
00221 dyStringAppend(tmpBuf, line);
00222 }
00223 bq->database = cloneString(tmpBuf->string);
00224
00225
00226
00227
00228 wordCount = chopLine(line, words);
00229 if (wordCount < 3 || !isdigit(words[0][0]) || !isdigit(words[2][0]))
00230 bfError(bf, "Expecting database info");
00231 decomma(words[0]);
00232 decomma(words[2]);
00233 bq->dbSeqCount = atoi(words[0]);
00234 bq->dbBaseCount = atoi(words[2]);
00235 }
00236
00237 struct blastQuery *blastFileNextQuery(struct blastFile *bf)
00238
00239 {
00240 char *line;
00241 struct blastQuery *bq;
00242 struct blastGappedAli *bga;
00243 AllocVar(bq);
00244
00245 verbose(TRACE_LEVEL, "blastFileNextQuery\n");
00246
00247
00248 line = bfSearchForLine(bf, "Query=");
00249 if (line == NULL)
00250 return NULL;
00251 parseQueryLines(bf, line, bq);
00252
00253
00254 line = bfSearchForLine(bf, "Database:");
00255 if (line == NULL)
00256 bfUnexpectedEof(bf);
00257 parseDatabaseLines(bf, line, bq);
00258
00259
00260 for (;;)
00261 {
00262 line = bfNeedNextLine(bf);
00263 if (line[0] == '>')
00264 {
00265 lineFileReuse(bf->lf);
00266 break;
00267 }
00268 if (stringIn("No hits found", line) != NULL)
00269 break;
00270 }
00271
00272
00273 while ((bga = blastFileNextGapped(bf, bq)) != NULL)
00274 {
00275 slAddHead(&bq->gapped, bga);
00276 }
00277 slReverse(&bq->gapped);
00278 if (verboseLevel() >= DUMP_LEVEL)
00279 {
00280 verbose(DUMP_LEVEL, "blastFileNextQuery result:\n");
00281 blastQueryPrint(bq, stderr);
00282 }
00283 return bq;
00284 }
00285
00286 struct blastGappedAli *blastFileNextGapped(struct blastFile *bf, struct blastQuery *bq)
00287
00288
00289 {
00290 char *line;
00291 char *words[16];
00292 int wordCount;
00293 struct blastGappedAli *bga;
00294 struct blastBlock *bb;
00295 int lenSearch;
00296
00297 verbose(TRACE_LEVEL, "blastFileNextGapped\n");
00298 AllocVar(bga);
00299 bga->query = bq;
00300
00301
00302 if (!bfSkipBlankLines(bf))
00303 return NULL;
00304 line = bfNextLine(bf);
00305 if (startsWith(" Database:", line))
00306 return NULL;
00307 if (line[0] != '>')
00308 bfError(bf, "Expecting >target");
00309 bga->targetName = cloneString(line+1);
00310
00311
00312
00313
00314
00315
00316 for (lenSearch=0; lenSearch<25; lenSearch++)
00317 {
00318 line = bfNeedNextLine(bf);
00319 wordCount = chopLine(line, words);
00320 if (wordCount == 3 && sameString(words[0], "Length") && sameString(words[1], "=")
00321 && isdigit(words[2][0]))
00322 break;
00323 }
00324 if (lenSearch>=25)
00325 bfError(bf, "Expecting Length =");
00326 decomma(words[2]);
00327 bga->targetSize = atoi(words[2]);
00328
00329
00330 while ((bb = blastFileNextBlock(bf, bq, bga)) != NULL)
00331 {
00332 slAddHead(&bga->blocks, bb);
00333 }
00334 slReverse(&bga->blocks);
00335 return bga;
00336 }
00337
00338 static int getStrand(struct blastFile *bf, char *strand)
00339
00340 {
00341 if (sameWord("Plus", strand))
00342 return 1;
00343 else if (sameWord("Minus", strand))
00344 return -1;
00345 else
00346 {
00347 bfError(bf, "Expecting Plus or Minus after Strand");
00348 return 0;
00349 }
00350 }
00351
00352 static boolean nextBlockLine(struct blastFile *bf, char **retLine)
00353
00354
00355 {
00356 struct lineFile *lf = bf->lf;
00357 char *line;
00358
00359 *retLine = line = bfNextLine(bf);
00360 if (line == NULL)
00361 return FALSE;
00362 if (line[0] == '>' || startsWith("Query=", line) || startsWith(" Database:", line))
00363 {
00364 lineFileReuse(lf);
00365 return FALSE;
00366 }
00367 return TRUE;
00368 }
00369
00370 double evalToDouble(char *s)
00371
00372
00373
00374
00375 {
00376 char buf[64];
00377 if (isdigit(s[0]))
00378 return atof(s);
00379 else
00380 {
00381 safef(buf, sizeof(buf), "1.0%s", s);
00382 return atof(buf);
00383 }
00384 }
00385
00386 static void parseBlockLine(struct blastFile *bf, int *startRet, int *endRet,
00387 struct dyString *seq)
00388
00389
00390
00391
00392
00393
00394 {
00395 char* line = bfNeedNextLine(bf);
00396 int a, b, s, e;
00397 char *words[16];
00398 int wordCount = chopLine(line, words);
00399 if ((wordCount < 3) || (wordCount > 4)
00400 || !(sameString("Query:", words[0]) || sameString("Sbjct:", words[0])))
00401 bfSyntax(bf);
00402
00403
00404 if (wordCount == 3)
00405 {
00406 char *p;
00407 if (!isdigit(words[1][0]) || !isdigit(words[2][0]))
00408 bfSyntax(bf);
00409 a = atoi(words[1]);
00410 b = atoi(words[2]);
00411 p = words[1];
00412 while ((*p != '\0') && (isdigit(*p)))
00413 p++;
00414 dyStringAppend(seq, p);
00415 }
00416 else
00417 {
00418 if (!isdigit(words[1][0]) || !isdigit(words[3][0]))
00419 bfSyntax(bf);
00420 a = atoi(words[1]);
00421 b = atoi(words[3]);
00422 dyStringAppend(seq, words[2]);
00423 }
00424 s = min(a,b);
00425 e = max(a,b);
00426 *startRet = min(s, *startRet);
00427 *endRet = max(e, *endRet);
00428 }
00429
00430 static struct blastBlock *nextBlock(struct blastFile *bf, struct blastQuery *bq,
00431 struct blastGappedAli *bga, boolean *skipRet)
00432
00433
00434
00435 {
00436 struct blastBlock *bb;
00437 char *line;
00438 char *words[16];
00439 int wordCount;
00440 char *parts[3];
00441 int partCount;
00442 static struct dyString *qString = NULL, *tString = NULL;
00443
00444 verbose(TRACE_LEVEL, "blastFileNextBlock\n");
00445 *skipRet = FALSE;
00446
00447
00448
00449
00450
00451 for (;;)
00452 {
00453 if (!nextBlockLine(bf, &line))
00454 return NULL;
00455 if (startsWith(" Score", line))
00456 break;
00457 }
00458 AllocVar(bb);
00459 bb->gappedAli = bga;
00460 wordCount = chopLine(line, words);
00461 if (wordCount < 8 || !sameWord("Score", words[0])
00462 || !isdigit(words[2][0]) || !(isdigit(words[7][0]) || words[7][0] == 'e')
00463 || !startsWith("Expect", words[5]))
00464 {
00465 bfError(bf, "Expecting something like:\n"
00466 "Score = 8770 bits (4424), Expect = 0.0");
00467 }
00468 bb->bitScore = atoi(words[2]);
00469 bb->eVal = evalToDouble(words[7]);
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482 line = bfNeedNextLine(bf);
00483 wordCount = chopLine(line, words);
00484 if (wordCount < 3 || !sameWord("Identities", words[0]))
00485 {
00486 if (wordCount > 1 || sameWord("Score", words[0]))
00487 {
00488
00489 *skipRet = TRUE;
00490 blastBlockFree(&bb);
00491 return NULL;
00492 }
00493 bfError(bf, "Expecting identity count");
00494 }
00495 partCount = chopByChar(words[2], '/', parts, ArraySize(parts));
00496 if (partCount != 2 || !isdigit(parts[0][0]) || !isdigit(parts[1][0]))
00497 bfSyntax(bf);
00498 bb->matchCount = atoi(parts[0]);
00499 bb->totalCount = atoi(parts[1]);
00500 if (wordCount >= 7 && sameWord("Gaps", words[4]))
00501 {
00502 if (!isdigit(words[6][0]))
00503 bfSyntax(bf);
00504 bb->insertCount = atoi(words[6]);
00505 }
00506 if ((wordCount >= 11) && sameWord("Frame", words[8]))
00507 {
00508 bb->qStrand = '+';
00509 bb->tStrand = words[10][0];
00510 bb->frame = atoi(words[10]);
00511 }
00512
00513
00514
00515
00516
00517
00518
00519 line = bfNeedNextLine(bf);
00520 wordCount = chopLine(line, words);
00521 if ((wordCount >= 5) && sameWord("Strand", words[0]))
00522 {
00523 bb->qStrand = getStrand(bf, words[2]);
00524 bb->tStrand = getStrand(bf, words[4]);
00525 }
00526 else if ((wordCount >= 3) && sameWord("Frame", words[0]))
00527 {
00528 bb->qStrand = 1;
00529 bb->tStrand = (words[2][0] == '-') ? -1 : 1;
00530 bb->frame = atoi(words[2]);
00531 }
00532 else if (wordCount == 0)
00533 {
00534
00535 if (bb->qStrand == 0)
00536 {
00537 bb->qStrand = '+';
00538 bb->tStrand = '+';
00539 }
00540 }
00541 else
00542 bfError(bf, "Expecting Strand, Frame or blank line");
00543
00544
00545
00546
00547
00548
00549
00550
00551 bb->qStart = bb->tStart = 0x3fffffff;
00552 bb->qEnd = bb->tEnd = -bb->qStart;
00553 if (qString == NULL)
00554 {
00555 qString = newDyString(50000);
00556 tString = newDyString(50000);
00557 }
00558 else
00559 {
00560 dyStringClear(qString);
00561 dyStringClear(tString);
00562 }
00563 for (;;)
00564 {
00565
00566 for (;;)
00567 {
00568 if (!nextBlockLine(bf, &line))
00569 goto DONE;
00570 if (startsWith(" Score", line))
00571 {
00572 lineFileReuse(bf->lf);
00573 goto DONE;
00574 }
00575 if (startsWith("Query:", line))
00576 break;
00577 }
00578 lineFileReuse(bf->lf);
00579 parseBlockLine(bf, &bb->qStart, &bb->qEnd, qString);
00580
00581
00582 line = bfNeedNextLine(bf);
00583
00584
00585 parseBlockLine(bf, &bb->tStart, &bb->tEnd, tString);
00586 }
00587 DONE:
00588
00589
00590 bb->qStart--;
00591 if (bb->qStrand < 0)
00592 reverseIntRange(&bb->qStart, &bb->qEnd, bq->queryBaseCount);
00593 bb->tStart--;
00594 if (bb->tStrand < 0)
00595 reverseIntRange(&bb->tStart, &bb->tEnd, bga->targetSize);
00596 bb->qSym = cloneMem(qString->string, qString->stringSize+1);
00597 bb->tSym = cloneMem(tString->string, tString->stringSize+1);
00598 return bb;
00599 }
00600
00601 struct blastBlock *blastFileNextBlock(struct blastFile *bf,
00602 struct blastQuery *bq, struct blastGappedAli *bga)
00603
00604
00605 {
00606 struct blastBlock *bb = NULL;
00607 boolean skip = FALSE;
00608
00609 while (((bb = nextBlock(bf, bq, bga, &skip)) == NULL) && skip)
00610 continue;
00611
00612 return bb;
00613 }
00614
00615 void blastFileFree(struct blastFile **pBf)
00616
00617 {
00618 struct blastFile *bf = *pBf;
00619 if (bf != NULL)
00620 {
00621 lineFileClose(&bf->lf);
00622 freeMem(bf->fileName);
00623 freeMem(bf->program);
00624 freeMem(bf->version);
00625 freeMem(bf->buildDate);
00626 blastQueryFreeList(&bf->queries);
00627 freez(pBf);
00628 }
00629 }
00630
00631 void blastFileFreeList(struct blastFile **pList)
00632
00633 {
00634 struct blastFile *el, *next;
00635 for (el = *pList; el != NULL; el = next)
00636 {
00637 next = el->next;
00638 blastFileFree(&el);
00639 }
00640 *pList = NULL;
00641 }
00642
00643 void blastQueryFree(struct blastQuery **pBq)
00644
00645 {
00646 struct blastQuery *bq;
00647 if ((bq = *pBq) != NULL)
00648 {
00649 freeMem(bq->query);
00650 freeMem(bq->database);
00651 blastGappedAliFreeList(&bq->gapped);
00652 freez(pBq);
00653 }
00654 }
00655
00656 void blastQueryFreeList(struct blastQuery **pList)
00657
00658 {
00659 struct blastQuery *el, *next;
00660 for (el = *pList; el != NULL; el = next)
00661 {
00662 next = el->next;
00663 blastQueryFree(&el);
00664 }
00665 *pList = NULL;
00666 }
00667
00668
00669 void blastGappedAliFree(struct blastGappedAli **pBga)
00670
00671 {
00672 struct blastGappedAli *bga = *pBga;
00673 if (bga != NULL)
00674 {
00675 freeMem(bga->targetName);
00676 blastBlockFreeList(&bga->blocks);
00677 freez(pBga);
00678 }
00679 }
00680
00681 void blastGappedAliFreeList(struct blastGappedAli **pList)
00682
00683 {
00684 struct blastGappedAli *el, *next;
00685 for (el = *pList; el != NULL; el = next)
00686 {
00687 next = el->next;
00688 blastGappedAliFree(&el);
00689 }
00690 *pList = NULL;
00691 }
00692
00693
00694 void blastBlockFree(struct blastBlock **pBb)
00695
00696 {
00697 struct blastBlock *bb = *pBb;
00698 if (bb != NULL)
00699 {
00700 freeMem(bb->qSym);
00701 freeMem(bb->tSym);
00702 freez(pBb);
00703 }
00704 }
00705
00706 void blastBlockFreeList(struct blastBlock **pList)
00707
00708 {
00709 struct blastBlock *el, *next;
00710 for (el = *pList; el != NULL; el = next)
00711 {
00712 next = el->next;
00713 blastBlockFree(&el);
00714 }
00715 *pList = NULL;
00716 }
00717
00718 void blastBlockPrint(struct blastBlock* bb, FILE* out)
00719
00720 {
00721 fprintf(out, " blk: %d-%d <=> %d-%d tcnt=%d mcnt=%d icnt=%d\n",
00722 bb->qStart, bb->qEnd, bb->tStart, bb->tEnd,
00723 bb->totalCount, bb->matchCount, bb->insertCount);
00724 fprintf(out, " Q: %s\n", bb->qSym);
00725 fprintf(out, " T: %s\n", bb->tSym);
00726 }
00727
00728 void blastGappedAliPrint(struct blastGappedAli* ba, FILE* out)
00729
00730 {
00731 struct blastBlock *bb;
00732 fprintf(out, "%s <=> %s\n", ba->query->query, ba->targetName);
00733 for (bb = ba->blocks; bb != NULL; bb = bb->next)
00734 {
00735 blastBlockPrint(bb, out);
00736 }
00737 }
00738
00739 void blastQueryPrint(struct blastQuery *bq, FILE* out)
00740
00741 {
00742 struct blastGappedAli *ba;
00743 for (ba = bq->gapped; ba != NULL; ba = ba->next)
00744 blastGappedAliPrint(ba, out);
00745 }