lib/blastOut.c

Go to the documentation of this file.
00001 /* blastOut.c - stuff to output an alignment in blast format. */
00002 #include "common.h"
00003 #include "linefile.h"
00004 #include "hash.h"
00005 #include "axt.h"
00006 #include "obscure.h"
00007 #include "genoFind.h"
00008 
00009 static char const rcsid[] = "$Id: blastOut.c,v 1.26 2006/03/10 17:43:36 angie Exp $";
00010 
00011 struct axtRef
00012 /* A reference to an axt. */
00013     {
00014     struct axtRef *next;
00015     struct axt *axt;
00016     };
00017 
00018 static int axtRefCmpScore(const void *va, const void *vb)
00019 /* Compare to sort based on score. */
00020 {
00021 const struct axtRef *a = *((struct axtRef **)va);
00022 const struct axtRef *b = *((struct axtRef **)vb);
00023 return b->axt->score - a->axt->score;
00024 }
00025 
00026 struct targetHits
00027 /* Collection of hits to a single target. */
00028     {
00029     struct targetHits *next;
00030     char *name;             /* Target name */
00031     int size;               /* Target size */
00032     struct axtRef *axtList; /* List of axts, sorted by score. */
00033     int score;              /* Score of best element */
00034     };
00035 
00036 static void targetHitsFree(struct targetHits **pObj)
00037 /* Free one target hits structure. */
00038 {
00039 struct targetHits *obj = *pObj;
00040 if (obj != NULL)
00041     {
00042     freeMem(obj->name);
00043     slFreeList(&obj->axtList);
00044     freez(pObj);
00045     }
00046 }
00047 
00048 static void targetHitsFreeList(struct targetHits **pList)
00049 /* Free a list of dynamically allocated targetHits's */
00050 {
00051 struct targetHits *el, *next;
00052 
00053 for (el = *pList; el != NULL; el = next)
00054     {
00055     next = el->next;
00056     targetHitsFree(&el);
00057     }
00058 *pList = NULL;
00059 }
00060 
00061 
00062 static int targetHitsCmpScore(const void *va, const void *vb)
00063 /* Compare to sort based on score. */
00064 {
00065 const struct targetHits *a = *((struct targetHits **)va);
00066 const struct targetHits *b = *((struct targetHits **)vb);
00067 
00068 return b->score - a->score;
00069 }
00070 
00071 static int blastzToWublastScore(int bzScore)
00072 /* Convert from 100 points/match blastz score to 5 points/match
00073  * wu-blast score. */
00074 {
00075 return bzScore/19;
00076 }
00077 
00078 static double blastzScoreToWuBits(int bzScore, boolean isProt)
00079 /* Convert from blastz score to bit score.  The magic number's
00080  * here are taken from comparing wu-blast scores to axtScores on
00081  * a couple of sequences.  This doesn't really seem to be a bit
00082  * score.  It's not very close to 2 bits per base. */
00083 {
00084 int wuScore = blastzToWublastScore(bzScore);
00085 if (isProt)
00086     return wuScore * 0.3545;
00087 else
00088     return wuScore * 0.1553;
00089 }
00090 
00091 static double blastzScoreToWuExpectation(int bzScore, double databaseSize)
00092 /* I'm puzzled by the wu-blast expectation score.  I would
00093  * think it would be  databaseSize / (2^bitScore)  but
00094  * it's not.   I think the best I can do is approximate
00095  * it with something scaled to be close to this expectation. */
00096 {
00097 double logProbOne = -log(2) * bzScore / 32.1948;
00098 return databaseSize * exp(logProbOne);
00099 }
00100 
00101 static int blastzScoreToNcbiBits(int bzScore)
00102 /* Convert blastz score to bit score in NCBI sense. */
00103 {
00104 return round(bzScore * 0.0205);
00105 }
00106 
00107 static int blastzScoreToNcbiScore(int bzScore)
00108 /* Conver blastz score to NCBI matrix score. */
00109 {
00110 return round(bzScore * 0.0529);
00111 }
00112 
00113 static double blastzScoreToNcbiExpectation(int bzScore)
00114 /* Convert blastz score to expectation in NCBI sense. */
00115 {
00116 double bits = bzScore * 0.0205;
00117 double logProb = -bits*log(2);
00118 return 3.0e9 * exp(logProb);
00119 }
00120 
00121 static double expectationToProbability(double e)
00122 /* Convert expected number of hits to probability of at least
00123  * one hit.  This is a crude approximation, but actually pretty precise
00124  * for e < 0.1, which is all that really matters.... */
00125 {
00126 if (e < 0.999)
00127     return e;
00128 else
00129     return 0.999;
00130 }
00131 
00132 static int countMatches(char *a, char *b, int size)
00133 /* Count number of characters that match between a and b. */
00134 {
00135 int count = 0;
00136 int i;
00137 for (i=0; i<size; ++i)
00138     if (toupper(a[i]) == toupper(b[i]))
00139         ++count;
00140 return count;
00141 }
00142 
00143 static int countGaps(char *a, char *b, int size)
00144 /* Count number of inserts in either strand. */
00145 {
00146 int count = 0;
00147 int i;
00148 for (i=0; i<size; ++i)
00149    {
00150    if (a[i] == '-')
00151        ++count;
00152    if (b[i] == '-')
00153        ++count;
00154    }
00155 return count;
00156 }
00157 
00158 static int countGapOpens(char *a, char *b, int size)
00159 /* Count number of inserts in either strand. */
00160 {
00161 int i, count = 0;
00162 boolean inGap = FALSE;
00163 for (i=0; i<size; ++i)
00164     {
00165     if (a[i] == '-' || b[i] == '-')
00166         {
00167         if (!inGap)
00168             {
00169             ++count;
00170             inGap = TRUE;
00171             }
00172         }
00173     else
00174         {
00175         inGap = FALSE;
00176         }
00177     }
00178 return count;
00179 }
00180 
00181 
00182 static int countPositives(char *a, char *b, int size)
00183 /* Count positive (not necessarily identical) protein matches. */
00184 {
00185 int count = 0;
00186 int i;
00187 struct axtScoreScheme *ss = axtScoreSchemeProteinDefault();
00188 for (i=0; i<size; ++i)
00189     {
00190     if (ss->matrix[(int)a[i]][(int)b[i]] > 0)
00191         ++count;
00192     }
00193 return count;
00194 }
00195 
00196 static int plusStrandPos(int pos, int size, char strand, boolean isEnd)
00197 /* Return position on plus strand, one based. */
00198 {
00199 if (strand == '-')
00200     {
00201     pos = size - pos;
00202     if (isEnd)
00203        ++pos;
00204     }
00205 else
00206     {
00207     if (!isEnd)
00208         ++pos;
00209     }
00210 return pos;
00211 }
00212 
00213 static int calcDigitCount(struct axt *axt, int tSize, int qSize)
00214 /* Figure out how many digits needed for blast position display. */
00215 {
00216 int tDig, qDig;
00217 if (axt->qStrand == '-')
00218     qDig = digitsBaseTen(qSize - axt->qStart + 1);
00219 else
00220     qDig = digitsBaseTen(axt->qEnd);
00221 if (axt->tStrand == '-')
00222     tDig = digitsBaseTen(tSize - axt->tStart + 1);
00223 else
00224     tDig = digitsBaseTen(axt->tEnd);
00225 return max(tDig, qDig);
00226 }
00227 
00228 static void blastiodAxtOutput(FILE *f, struct axt *axt, int tSize, int qSize, 
00229         int lineSize, boolean isProt, boolean isTranslated)
00230 /* Output base-by-base part of alignment in blast-like fashion. */
00231 {
00232 int tOff = axt->tStart;
00233 int dt = (isTranslated ? 3 : 1);
00234 int qOff = axt->qStart;
00235 int lineStart, lineEnd, i;
00236 int digits = calcDigitCount(axt, tSize, qSize);
00237 struct axtScoreScheme *ss = NULL;
00238 
00239 if (isProt)
00240     ss = axtScoreSchemeProteinDefault();
00241 for (lineStart = 0; lineStart < axt->symCount; lineStart = lineEnd)
00242     {
00243     lineEnd = lineStart + lineSize;
00244     if (lineEnd > axt->symCount) lineEnd = axt->symCount;
00245     fprintf(f, "Query: %-*d ", digits, plusStrandPos(qOff, qSize, axt->qStrand, FALSE));
00246     for (i=lineStart; i<lineEnd; ++i)
00247         {
00248         char c = axt->qSym[i];
00249         fputc(c, f);
00250         if (c != '-')
00251             ++qOff;
00252         }
00253     fprintf(f, " %-d\n", plusStrandPos(qOff, qSize, axt->qStrand, TRUE));
00254     fprintf(f, "       %*s ", digits, " ");
00255     for (i=lineStart; i<lineEnd; ++i)
00256         {
00257         char q, t, c;
00258         q = axt->qSym[i];
00259         t = axt->tSym[i];
00260         if (isProt)
00261             {
00262             if (q == t)
00263                 c = q;
00264             else if (ss->matrix[(int)q][(int)t] > 0)
00265                 c = '+';
00266             else
00267                 c = ' ';
00268             }
00269         else
00270             c = ((toupper(q) == toupper(t)) ? '|' : ' ');
00271 
00272         fputc(c, f);
00273         }
00274     fprintf(f, "\n");
00275     fprintf(f, "Sbjct: %-*d ", digits, plusStrandPos(tOff, tSize, axt->tStrand, FALSE));
00276     for (i=lineStart; i<lineEnd; ++i)
00277         {
00278         char c = axt->tSym[i];
00279         fputc(c, f);
00280         if (c != '-')
00281             tOff += dt;
00282         }
00283     fprintf(f, " %-d\n", plusStrandPos(tOff, tSize, axt->tStrand, TRUE));
00284     fprintf(f, "\n");
00285     }
00286 }
00287 
00288 static struct targetHits *bundleIntoTargets(struct axtBundle *abList)
00289 /* BLAST typically outputs everything on the same query and target
00290  * in one clump.  This routine rearranges axts in abList to do this. */
00291 {
00292 struct targetHits *targetList = NULL, *target;
00293 struct hash *targetHash = newHash(10);
00294 struct axtBundle *ab;
00295 struct axtRef *ref;
00296 
00297 /* Build up a list of targets in database hit by query sorted by
00298  * score of hits. */
00299 for (ab = abList; ab != NULL; ab = ab->next)
00300     {
00301     struct axt *axt;
00302     for (axt = ab->axtList; axt != NULL; axt = axt->next)
00303         {
00304         target = hashFindVal(targetHash, axt->tName);
00305         if (target == NULL)
00306             {
00307             AllocVar(target);
00308             slAddHead(&targetList, target);
00309             hashAdd(targetHash, axt->tName, target);
00310             target->name = cloneString(axt->tName);
00311             target->size = ab->tSize;
00312             }
00313         if (axt->score > target->score)
00314             target->score = axt->score;
00315         AllocVar(ref);
00316         ref->axt = axt;
00317         slAddHead(&target->axtList, ref);
00318         }
00319     }
00320 slSort(&targetList, targetHitsCmpScore);
00321 for (target = targetList; target != NULL; target = target->next)
00322     slSort(&target->axtList, axtRefCmpScore);
00323 
00324 hashFree(&targetHash);
00325 return targetList;
00326 }
00327 
00328 static char *nameForStrand(char strand)
00329 /* Return Plus/Minus for +/- */
00330 {
00331 if (strand == '-')
00332     return "Minus";
00333 else
00334     return "Plus";
00335 }
00336 
00337 static char *progType(boolean isProt, struct axtBundle *ab, boolean isUpper)
00338 /* Return blast 'program' */
00339 {
00340 if (!isProt)
00341     return isUpper ? "BLASTN" : "blastn";
00342 else
00343     {
00344     if (ab->axtList->frame != 0)
00345         return isUpper ? "TBLASTN" : "tblastn";
00346     else
00347         return isUpper ? "BLASTP" : "blastp";
00348     }
00349 }
00350 
00351 static void wuBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, 
00352         FILE *f, 
00353         char *databaseName, int databaseSeqCount, double databaseLetterCount, 
00354         char *ourId)
00355 /* Do wublast-like output at end of processing query. */
00356 {
00357 char asciiNum[32];
00358 struct targetHits *targetList = NULL, *target;
00359 char *queryName;
00360 int isRc;
00361 int querySize = abList->qSize;
00362 boolean isTranslated = (abList->axtList->frame != 0);
00363 
00364 /* Print out stuff that doesn't depend on query or database. */
00365 if (ourId == NULL)
00366     ourId = "axtBlastOut";
00367 fprintf(f, "%s 2.0MP-WashU [%s]\n", progType(isProt, abList, TRUE), ourId);
00368 fprintf(f, "\n");
00369 fprintf(f, "Copyright (C) 2000-2002 Jim Kent\n");
00370 fprintf(f, "All Rights Reserved\n");
00371 fprintf(f, "\n");
00372 fprintf(f, "Reference:  Kent, WJ. (2002) BLAT - The BLAST-like alignment tool\n");
00373 fprintf(f, "\n");
00374 if (!isProt)
00375     {
00376     fprintf(f, "Notice:  this program and its default parameter settings are optimized to find\n");
00377     fprintf(f, "nearly identical sequences very rapidly.  For slower but more sensitive\n");
00378     fprintf(f, "alignments please use other methods.\n");
00379     fprintf(f, "\n");
00380     }
00381 
00382 /* Print query and database info. */
00383 queryName = abList->axtList->qName;
00384 fprintf(f, "Query=  %s\n", queryName);
00385 fprintf(f, "        (%d letters; record %d)\n", abList->qSize, queryIx);
00386 fprintf(f, "\n");
00387 fprintf(f, "Database:  %s\n",  databaseName);
00388 sprintLongWithCommas(asciiNum, databaseLetterCount);
00389 fprintf(f, "           %d sequences; %s total letters\n",  databaseSeqCount, asciiNum);
00390 fprintf(f, "Searching....10....20....30....40....50....60....70....80....90....100%% done\n");
00391 fprintf(f, "\n");
00392 
00393 targetList = bundleIntoTargets(abList);
00394 
00395 /* Print out summary of hits. */
00396 fprintf(f, "                                                                     Smallest\n");
00397 fprintf(f, "                                                                       Sum\n");
00398 fprintf(f, "                                                              High  Probability\n");
00399 fprintf(f, "Sequences producing High-scoring Segment Pairs:              Score  P(N)      N\n");
00400 fprintf(f, "\n");
00401 for (target = targetList; target != NULL; target = target->next)
00402     {
00403     double expectation = blastzScoreToWuExpectation(target->score, databaseLetterCount);
00404     double p = expectationToProbability(expectation);
00405     fprintf(f, "%-61s %4d  %8.1e %2d\n", target->name, 
00406         blastzToWublastScore(target->score), p, slCount(target->axtList));
00407     }
00408 
00409 /* Print out details on each target. */
00410 for (target = targetList; target != NULL; target = target->next)
00411     {
00412     fprintf(f, "\n\n>%s\n", target->name);
00413     fprintf(f, "        Length = %d\n", target->size);
00414     fprintf(f, "\n");
00415     for (isRc=0; isRc <= 1; ++isRc)
00416         {
00417         boolean saidStrand = FALSE;
00418         char strand = (isRc ? '-' : '+');
00419         char *strandName = nameForStrand(strand);
00420         struct axtRef *ref;
00421         struct axt *axt;
00422         for (ref = target->axtList; ref != NULL; ref = ref->next)
00423             {
00424             axt = ref->axt;
00425             if (axt->qStrand == strand)
00426                 {
00427                 int matches = countMatches(axt->qSym, axt->tSym, axt->symCount);
00428                 int positives = countPositives(axt->qSym, axt->tSym, axt->symCount);
00429                 if (!saidStrand)
00430                     {
00431                     saidStrand = TRUE;
00432                     if (!isProt)
00433                         fprintf(f, "  %s Strand HSPs:\n\n", strandName);
00434                     }
00435                 fprintf(f, " Score = %d (%2.1f bits), Expect = %5.1e, P = %5.1e\n",
00436                      blastzToWublastScore(axt->score), 
00437                      blastzScoreToWuBits(axt->score, isProt),
00438                      blastzScoreToWuExpectation(axt->score, databaseLetterCount),
00439                      blastzScoreToWuExpectation(axt->score, databaseLetterCount));
00440                 fprintf(f, " Identities = %d/%d (%d%%), Positives = %d/%d (%d%%)",
00441                      matches, axt->symCount, round(100.0 * matches / axt->symCount),
00442                      positives, axt->symCount, round(100.0 * positives / axt->symCount));
00443                 if (isProt)
00444                     {
00445                     if (axt->frame != 0)
00446                         fprintf(f, ", Frame = %c%d", axt->tStrand, axt->frame);
00447                     fprintf(f, "\n");
00448                     }
00449                 else
00450                     fprintf(f, ", Strand = %s / Plus\n", strandName);
00451                 fprintf(f, "\n");
00452                 blastiodAxtOutput(f, axt, target->size, querySize, 60, isProt, isTranslated);
00453                 }
00454             }
00455         }
00456     }
00457 
00458 /* Cleanup time. */
00459 targetHitsFreeList(&targetList);
00460 }
00461 
00462 static void ncbiPrintE(FILE *f, double e)
00463 /* Print a small number NCBI style. */
00464 {
00465 if (e <= 0.0)
00466     fprintf(f, "0.0");
00467 else if (e <= 1.0e-100)
00468     {
00469     char buf[256], *s;
00470     safef(buf, sizeof(buf), "%e", e);
00471     s = strchr(buf, 'e');
00472     if (s == NULL)
00473         fprintf(f, "0.0");
00474     else
00475         fprintf(f, "%s", s);
00476     }
00477 else
00478     fprintf(f, "%1.0e", e);
00479 }
00480 
00481 /* special global variable needed for Known Genes track building.  Fan 1/21/03 */
00482 int answer_for_kg;
00483 static void ncbiBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, 
00484         FILE *f, char *databaseName, int databaseSeqCount, 
00485         double databaseLetterCount, char *ourId, double minIdentity)
00486 /* Do ncbiblast-like output at end of processing query. */
00487 {
00488 char asciiNum[32];
00489 struct targetHits *targetList = NULL, *target;
00490 char *queryName;
00491 int querySize = abList->qSize;
00492 boolean isTranslated = (abList->axtList->frame != 0);
00493 
00494 /* Print out stuff that doesn't depend on query or database. */
00495 if (ourId == NULL)
00496     ourId = "axtBlastOut";
00497 fprintf(f, "%s 2.2.11 [%s]\n", progType(isProt, abList, TRUE), ourId);
00498 fprintf(f, "\n");
00499 fprintf(f, "Reference:  Kent, WJ. (2002) BLAT - The BLAST-like alignment tool\n");
00500 fprintf(f, "\n");
00501 
00502 /* Print query and database info. */
00503 queryName = abList->axtList->qName;
00504 fprintf(f, "Query= %s\n", queryName);
00505 fprintf(f, "         (%d letters)\n", abList->qSize);
00506 fprintf(f, "\n");
00507 fprintf(f, "Database: %s \n",  databaseName);
00508 sprintLongWithCommas(asciiNum, databaseLetterCount);
00509 fprintf(f, "           %d sequences; %s total letters\n",  databaseSeqCount, asciiNum);
00510 fprintf(f, "\n");
00511 fprintf(f, "Searching.done\n");
00512 
00513 targetList = bundleIntoTargets(abList);
00514 
00515 /* Print out summary of hits. */
00516 fprintf(f, "                                                                 Score    E\n");
00517 fprintf(f, "Sequences producing significant alignments:                      (bits) Value\n");
00518 fprintf(f, "\n");
00519 for (target = targetList; target != NULL; target = target->next)
00520     {
00521     struct axtRef *ref;
00522     struct axt *axt;
00523     int matches;
00524     double identity, expectation;
00525     int bit;
00526     
00527     for (ref = target->axtList; ref != NULL; ref = ref->next)
00528         {
00529         axt = ref->axt;
00530         
00531         matches = countMatches(axt->qSym, axt->tSym, axt->symCount);
00532         identity = round(100.0 * matches / axt->symCount);
00533         /* skip output if minIdentity not reached */
00534         if (identity < minIdentity) continue;
00535     
00536         bit = blastzScoreToNcbiBits(axt->score);
00537         expectation = blastzScoreToNcbiExpectation(axt->score);
00538         fprintf(f, "%-67s  %4d   ", target->name, bit);
00539         ncbiPrintE(f, expectation);
00540         fprintf(f, "\n");
00541         }
00542     }
00543 fprintf(f, "\n");
00544 
00545 /* Print out details on each target. */
00546 for (target = targetList; target != NULL; target = target->next)
00547     {
00548     struct axtRef *ref;
00549     struct axt *axt;
00550     int matches, gaps;
00551     char *oldName;
00552     
00553     int ii = 0;
00554     double identity;
00555     oldName = strdup("");
00556 
00557     for (ref = target->axtList; ref != NULL; ref = ref->next)
00558         {
00559         ii++;
00560         axt = ref->axt;
00561         
00562         matches = countMatches(axt->qSym, axt->tSym, axt->symCount);
00563         identity = round(100.0 * matches / axt->symCount);
00564         
00565         /* skip output if minIdentity not reached */
00566         if (identity < minIdentity) continue;
00567         
00568         /* print target sequence name and length only once */ 
00569         if (!sameWord(oldName, target->name))
00570             {
00571             fprintf(f, "\n\n>%s \n", target->name);
00572             fprintf(f, "          Length = %d\n", target->size);
00573             oldName = strdup(target->name);
00574             }
00575 
00576         fprintf(f, "\n");
00577         fprintf(f, " Score = %d bits (%d), Expect = ",
00578              blastzScoreToNcbiBits(axt->score),
00579              blastzScoreToNcbiScore(axt->score));
00580         ncbiPrintE(f, blastzScoreToNcbiExpectation(axt->score));
00581         fprintf(f, "\n");
00582         
00583         if (isProt)
00584             {
00585             int positives = countPositives(axt->qSym, axt->tSym, axt->symCount);
00586             gaps = countGaps(axt->qSym, axt->tSym, axt->symCount);
00587             fprintf(f, " Identities = %d/%d (%d%%),",
00588                  matches, axt->symCount, round(100.0 * matches / axt->symCount));
00589             fprintf(f, " Positives = %d/%d (%d%%),",
00590                  positives, axt->symCount, round(100.0 * positives / axt->symCount));
00591             fprintf(f, " Gaps = %d/%d (%d%%)\n",
00592                  gaps, axt->symCount, round(100.0 * gaps / axt->symCount));
00593             if (axt->frame != 0) 
00594                 fprintf(f, " Frame = %c%d\n", axt->tStrand, axt->frame);
00595             /* set the special global variable, answer_for_kg.  
00596                This is needed for Known Genes track building.  Fan 1/21/03 */
00597             answer_for_kg=axt->symCount - matches;
00598             }
00599         else
00600             {
00601             fprintf(f, " Identities = %d/%d (%d%%)\n",
00602                  matches, axt->symCount, round(100.0 * matches / axt->symCount));
00603             fprintf(f, " Strand = %s / %s\n", nameForStrand(axt->qStrand),
00604                 nameForStrand(axt->tStrand));
00605             }
00606         fprintf(f, "\n");
00607         blastiodAxtOutput(f, axt, target->size, querySize, 60, isProt, isTranslated);
00608         }
00609     }
00610 
00611 fprintf(f, "  Database: %s\n", databaseName);
00612 
00613 /* Cleanup time. */
00614 targetHitsFreeList(&targetList);
00615 }
00616 
00617 static void xmlBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, 
00618         FILE *f, char *databaseName, int databaseSeqCount, 
00619         double databaseLetterCount, char *ourId)
00620 /* Do ncbi blast xml-like output at end of processing query. 
00621  * WARNING - still not completely baked.  Format at NCBI seems
00622  * to be missing some end tags actually when I checked. -jk */
00623 {
00624 char *queryName = abList->axtList->qName;
00625 int querySize = abList->qSize;
00626 int hitNum = 0;
00627 struct targetHits *targetList = NULL, *target;
00628 
00629 if (ourId == NULL)
00630     ourId = "axtBlastOut";
00631 fprintf(f, "<?xml version=\"1.0\"?>\n");
00632 fprintf(f, "<!DOCTYPE BlastOutput PUBLIC \"-//NCBI//NCBI BlastOutput/EN\" \"NCBI_BlastOutput.dtd\">\n");
00633 fprintf(f, "<BlastOutput>\n");
00634 fprintf(f, "  <BlastOutput_program>%s</BlastOutput_program>\n", 
00635         progType(isProt, abList, FALSE));
00636 fprintf(f, "  <BlastOutput_version>%s 2.2.4 [%s]</BlastOutput_version>\n",
00637         progType(isProt, abList, FALSE), ourId);
00638 fprintf(f, "  <BlastOutput_reference>~Reference: Kent, WJ. (2002) BLAT - The BLAST-like alignment tool</BlastOutput_reference>\n");
00639 fprintf(f, "  <BlastOutput_db>%s</BlastOutput_db>\n", databaseName);
00640 fprintf(f, "  <BlastOutput_query-ID>%d</BlastOutput_query-ID>\n", queryIx);
00641 fprintf(f, "  <BlastOutput_query-def>%s</BlastOutput_query-def>\n", queryName);
00642 fprintf(f, "  <BlastOutput_query-len>%d</BlastOutput_query-len>\n", querySize);
00643 
00644 if (isProt)
00645     {
00646     fprintf(f, "  <BlastOutput_param>\n");
00647     fprintf(f, "    <Parameters_matrix>BLOSUM62</Parameters_matrix>\n");
00648     fprintf(f, "    <Parameters_expect>0.001</Parameters_expect>\n");
00649     fprintf(f, "    <Parameters_expect>10</Parameters_expect>\n");
00650     fprintf(f, "    <Parameters_gap-extend>1</Parameters_gap-extend>\n");
00651     fprintf(f, "  </BlastOutput_param>\n");
00652     }
00653 
00654 fprintf(f, "  <BlastOutput_iterations>\n");
00655 fprintf(f, "    <Iteration>\n");
00656 fprintf(f, "      <Iteration_iter-num>1</Iteration_iter-num>\n");
00657 fprintf(f, "      <Iteration_hits>\n");
00658 
00659 /* Print out details on each target. */
00660 targetList = bundleIntoTargets(abList);
00661 for (target = targetList; target != NULL; target = target->next)
00662     {
00663     struct axtRef *ref;
00664     fprintf(f, "      <hit>\n");
00665     fprintf(f, "        <Hit_num>%d</Hit_num>\n", hitNum);
00666     fprintf(f, "        <Hit_id>%s</Hit_id>\n", target->name);
00667     fprintf(f, "        <Hit_accession>%s</Hit_accession>\n", target->name);
00668     fprintf(f, "        <Hit_len>%d</Hit_len>\n", target->size);
00669     fprintf(f, "        <Hit_hsps>\n");
00670     for (ref = target->axtList; ref != NULL; ref = ref->next)
00671         {
00672         struct axt *axt = ref->axt;
00673         int hspIx = 0;
00674         fprintf(f, "        <Hsp>\n");
00675         fprintf(f, "          <Hsp_num>%d</Hsp_num>\n", ++hspIx);
00676         fprintf(f, "          <Hsp_bit-score>%d</Hsp_bit-score>\n", 
00677                 blastzScoreToNcbiBits(axt->score));
00678         fprintf(f, "          <Hsp_score>%d</Hsp_score>\n",
00679                 blastzScoreToNcbiScore(axt->score));
00680         fprintf(f, "          <Hsp_evalue>%f</Hsp_evalue>\n",
00681                 blastzScoreToNcbiExpectation(axt->score));
00682         fprintf(f, "          <Hsp_query-from>%d</Hsp_query-from>\n", 
00683                 axt->qStart+1);
00684         fprintf(f, "          <Hsp_query-to>%d</Hsp_query-to>\n", axt->qEnd);
00685         fprintf(f, "          <Hsp_hit-from>%d</Hsp_hit-from>\n", 
00686                 axt->tStart+1);
00687         fprintf(f, "          <Hsp_hit-to>%d</Hsp_hit-to>\n", axt->tEnd);
00688         fprintf(f, "        </Hsp>\n");
00689         }
00690 
00691     fprintf(f, "        </Hit_hsps>\n");
00692     }
00693 
00694 fprintf(f, "      </Iteration_hits>\n");
00695 fprintf(f, "    </Iteration>\n");
00696 fprintf(f, "  </BlastOutput_iterations>\n");
00697 fprintf(f, "</BlastOutput>\n");
00698 }
00699 
00700 static void printAxtTargetBlastTab(FILE *f, struct axt *axt, int targetSize)
00701 /* Print out target in tabular blast-oriented format. */
00702 {
00703 int s = axt->tStart, e = axt->tEnd;
00704 if (axt->tStrand == '-')
00705     reverseIntRange(&s, &e, targetSize);
00706 if (axt->tStrand == axt->qStrand)
00707     {
00708     fprintf(f, "%d\t", s+1);
00709     fprintf(f, "%d\t", e);
00710     }
00711 else
00712     {
00713     fprintf(f, "%d\t", e);
00714     fprintf(f, "%d\t", s+1);
00715     }
00716 }
00717 
00718 static void tabBlastOut(struct axtBundle *abList, int queryIx, boolean isProt, 
00719         FILE *f, char *databaseName, int databaseSeqCount, 
00720         double databaseLetterCount, char *ourId, boolean withComment)
00721 /* Do NCBI tabular blast output. */
00722 {
00723 char *queryName = abList->axtList->qName;
00724 int querySize = abList->qSize;
00725 struct targetHits *targetList = NULL, *target;
00726 
00727 if (withComment)
00728     {
00729     char * rcsDate = "$Date: 2006/03/10 17:43:36 $";
00730     char dateStamp[11];
00731     strncpy (dateStamp, rcsDate+7, 10);
00732     dateStamp[10] = 0;
00733     fprintf(f, "# BLAT %s [%s]\n", gfVersion, dateStamp);
00734     fprintf(f, "# Query: %s\n", queryName);
00735     fprintf(f, "# Database: %s\n", databaseName);
00736     fprintf(f, "%s\n", 
00737         "# Fields: Query id, Subject id, % identity, alignment length, "
00738         "mismatches, gap openings, q. start, q. end, s. start, s. end, "
00739         "e-value, bit score");
00740     }
00741 
00742 /* Print out details on each target. */
00743 targetList = bundleIntoTargets(abList);
00744 for (target = targetList; target != NULL; target = target->next)
00745     {
00746     struct axtRef *ref;
00747     for (ref = target->axtList; ref != NULL; ref = ref->next)
00748         {
00749         struct axt *axt = ref->axt;
00750         int matches = countMatches(axt->qSym, axt->tSym, axt->symCount);
00751         int gaps = countGaps(axt->qSym, axt->tSym, axt->symCount);
00752         int gapOpens = countGapOpens(axt->qSym, axt->tSym, axt->symCount);
00753         fprintf(f, "%s\t", axt->qName);
00754         fprintf(f, "%s\t", axt->tName);
00755         fprintf(f, "%.2f\t", 100.0 * matches/axt->symCount);
00756         fprintf(f, "%d\t", axt->symCount);
00757         fprintf(f, "%d\t", axt->symCount - matches - gaps);
00758         fprintf(f, "%d\t", gapOpens);
00759         if (axt->qStrand == '-')
00760             {
00761             int s = axt->qStart, e = axt->qEnd;
00762             reverseIntRange(&s, &e, querySize);
00763             fprintf(f, "%d\t", s+1);
00764             fprintf(f, "%d\t", e);
00765             printAxtTargetBlastTab(f, axt, target->size);
00766             }
00767         else
00768             {
00769             fprintf(f, "%d\t", axt->qStart + 1);
00770             fprintf(f, "%d\t", axt->qEnd);
00771             printAxtTargetBlastTab(f, axt, target->size);
00772             }
00773         fprintf(f, "%3.1e\t", blastzScoreToNcbiExpectation(axt->score));
00774         fprintf(f, "%d.0\n", blastzScoreToNcbiBits(axt->score));
00775         }
00776     }
00777 
00778 /* Cleanup time. */
00779 targetHitsFreeList(&targetList);
00780 }
00781 
00782 void axtBlastOut(struct axtBundle *abList, 
00783         int queryIx, boolean isProt, FILE *f, 
00784         char *databaseName, int databaseSeqCount, double databaseLetterCount, 
00785         char *blastType, char *ourId, double minIdentity)
00786 /* Output a bundle of axt's on the same query sequence in blast format.
00787  * The parameters in detail are:
00788  *   ab - the list of bundles of axt's. 
00789  *   f  - output file handle
00790  *   databaseSeqCount - number of sequences in database
00791  *   databaseLetterCount - number of bases or aa's in database
00792  *   blastType - blast/wublast/blast8/blast9/xml
00793  *   ourId - optional (may be NULL) thing to put in header
00794  */
00795 {
00796 if (abList == NULL)
00797     return;
00798 if (sameWord(blastType, "wublast"))
00799     wuBlastOut(abList, queryIx, isProt, f, databaseName,
00800         databaseSeqCount, databaseLetterCount, ourId);
00801 else if (sameWord(blastType, "xml"))
00802     xmlBlastOut(abList, queryIx, isProt, f, databaseName,
00803         databaseSeqCount, databaseLetterCount, ourId);
00804 else if (sameWord(blastType, "blast"))
00805     ncbiBlastOut(abList, queryIx, isProt, f, databaseName,
00806         databaseSeqCount, databaseLetterCount, ourId, minIdentity);
00807 else if (sameWord(blastType, "blast8"))
00808     tabBlastOut(abList, queryIx, isProt, f, databaseName,
00809         databaseSeqCount, databaseLetterCount, ourId, FALSE);
00810 else if (sameWord(blastType, "blast9"))
00811     tabBlastOut(abList, queryIx, isProt, f, databaseName,
00812         databaseSeqCount, databaseLetterCount, ourId, TRUE);
00813 else
00814     errAbort("Unrecognized blastType %s in axtBlastOut", blastType);
00815 }
00816 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2