jkOwnLib/gfOut.c

Go to the documentation of this file.
00001 /* gfOut - stuff to manage output for genoFind system -
00002  * currently supports psl, axt, blast, and wu-blast. 
00003  *
00004  * Copyright 2001-2003 Jim Kent.  All rights reserved. */
00005 
00006 #include "common.h"
00007 #include "linefile.h"
00008 #include "hash.h"
00009 #include "dystring.h"
00010 #include "dnautil.h"
00011 #include "axt.h"
00012 #include "maf.h"
00013 #include "trans3.h"
00014 #include "psl.h"
00015 #include "genoFind.h"
00016 
00017 static char const rcsid[] = "$Id: gfOut.c,v 1.16 2006/06/22 16:24:44 kent Exp $";
00018 
00019 struct pslxData
00020 /* This is the data structure put in gfOutput.data for psl/pslx output. */
00021     {
00022     FILE *f;                    /* Output file. */
00023     boolean saveSeq;            /* Save sequence too? */
00024     };
00025 
00026 struct axtData
00027 /* This is the data structure put in gfOutput.data for axt/blast output. */
00028     {
00029     struct axtBundle *bundleList;       /* List of bundles. */
00030     char *databaseName;         /* Just used for blast. */
00031     int databaseSeqCount;       /* Just used for blast. */
00032     double databaseLetters; /* Just used for blast. */
00033     char *blastType;    /* 'blast' or 'wublast' or 'xml' or 'blast8' or 'blast9' */
00034     double minIdentity; /* Just used for blast. */
00035     };
00036 
00037 static void savePslx(char *chromName, int chromSize, int chromOffset,
00038         struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq, 
00039         boolean isRc, enum ffStringency stringency, int minMatch, FILE *f,
00040         struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
00041         struct hash *maskHash, int minIdentity, 
00042         boolean qIsProt, boolean tIsProt, boolean saveSeq)
00043 /* Analyse one alignment and if it looks good enough write it out to file in
00044  * psl format (or pslX format - if saveSeq is TRUE).  */
00045 {
00046 /* This function was stolen from psLayout and slightly extensively to cope
00047  * with protein as well as DNA aligments. */
00048 struct ffAli *ff, *nextFf;
00049 struct ffAli *right = ffRightmost(ali);
00050 DNA *needle = qSeq->dna;
00051 DNA *hay = tSeq->dna;
00052 int nStart = ali->nStart - needle;
00053 int nEnd = right->nEnd - needle;
00054 int hStart, hEnd; 
00055 int nInsertBaseCount = 0;
00056 int nInsertCount = 0;
00057 int hInsertBaseCount = 0;
00058 int hInsertCount = 0;
00059 int matchCount = 0;
00060 int mismatchCount = 0;
00061 int repMatch = 0;
00062 int countNs = 0;
00063 DNA *np, *hp, n, h;
00064 int blockSize;
00065 int i;
00066 struct trans3 *t3List = NULL;
00067 Bits *maskBits = NULL;
00068 
00069 if (maskHash != NULL)
00070     maskBits = hashMustFindVal(maskHash, tSeq->name);
00071 if (t3Hash != NULL)
00072     t3List = hashMustFindVal(t3Hash, tSeq->name);
00073 hStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
00074 hEnd = trans3GenoPos(right->hEnd, tSeq, t3List, TRUE) + chromOffset;
00075 
00076 /* Count up matches, mismatches, inserts, etc. */
00077 for (ff = ali; ff != NULL; ff = nextFf)
00078     {
00079     nextFf = ff->right;
00080     blockSize = ff->nEnd - ff->nStart;
00081     np = ff->nStart;
00082     hp = ff->hStart;
00083     for (i=0; i<blockSize; ++i)
00084         {
00085         n = np[i];
00086         h = hp[i];
00087         if (n == 'n' || h == 'n')
00088             ++countNs;
00089         else
00090             {
00091             if (n == h)
00092                 {
00093                 if (maskBits != NULL)
00094                     {
00095                     int seqOff = hp + i - hay;
00096                     if (bitReadOne(maskBits, seqOff))
00097                         ++repMatch;
00098                     else
00099                         ++matchCount;
00100                     }
00101                 else
00102                     ++matchCount;
00103                 }
00104             else
00105                 ++mismatchCount;
00106             }
00107         }
00108     if (nextFf != NULL)
00109         {
00110         int nhStart = trans3GenoPos(nextFf->hStart, tSeq, t3List, FALSE) + chromOffset;
00111         int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset;
00112         int hGap = nhStart - ohEnd;
00113         int nGap = nextFf->nStart - ff->nEnd;
00114 
00115         if (nGap != 0)
00116             {
00117             ++nInsertCount;
00118             nInsertBaseCount += nGap;
00119             }
00120         if (hGap != 0)
00121             {
00122             ++hInsertCount;
00123             hInsertBaseCount += hGap;
00124             }
00125         }
00126     }
00127 
00128 
00129 /* See if it looks good enough to output, and output. */
00130 /* if (score >= minMatch) Moved to higher level */
00131     {
00132     int gaps = nInsertCount + (stringency == ffCdna ? 0: hInsertCount);
00133     int id = roundingScale(1000, matchCount + repMatch - 2*gaps, matchCount + repMatch + mismatchCount);
00134     if (id >= minIdentity)
00135         {
00136         if (isRc)
00137             {
00138             int temp;
00139             int oSize = qSeq->size;
00140             temp = nStart;
00141             nStart = oSize - nEnd;
00142             nEnd = oSize - temp;
00143             }
00144         if (targetIsRc)
00145             {
00146             int temp;
00147             temp = hStart;
00148             hStart = chromSize - hEnd;
00149             hEnd = chromSize - temp;
00150             }
00151         fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
00152             matchCount, mismatchCount, repMatch, countNs, nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
00153             (isRc ? '-' : '+'));
00154         if (reportTargetStrand)
00155             fprintf(f, "%c", (targetIsRc ? '-' : '+') );
00156         fprintf(f, "\t%s\t%d\t%d\t%d\t"
00157                      "%s\t%d\t%d\t%d\t%d\t",
00158             qSeq->name, qSeq->size, nStart, nEnd,
00159             chromName, chromSize, hStart, hEnd,
00160             ffAliCount(ali));
00161         for (ff = ali; ff != NULL; ff = ff->right)
00162             fprintf(f, "%ld,", (long)(ff->nEnd - ff->nStart));
00163         fprintf(f, "\t");
00164         for (ff = ali; ff != NULL; ff = ff->right)
00165             fprintf(f, "%ld,", (long)(ff->nStart - needle));
00166         fprintf(f, "\t");
00167         for (ff = ali; ff != NULL; ff = ff->right)
00168             fprintf(f, "%d,", trans3GenoPos(ff->hStart, tSeq, t3List, FALSE) + chromOffset);
00169         if (saveSeq)
00170             {
00171             fputc('\t', f);
00172             for (ff = ali; ff != NULL; ff = ff->right)
00173                 {
00174                 mustWrite(f, ff->nStart, ff->nEnd - ff->nStart);
00175                 fputc(',', f);
00176                 }
00177             fputc('\t', f);
00178             for (ff = ali; ff != NULL; ff = ff->right)
00179                 {
00180                 mustWrite(f, ff->hStart, ff->hEnd - ff->hStart);
00181                 fputc(',', f);
00182                 }
00183             }
00184         fprintf(f, "\n");
00185         if (ferror(f))
00186             {
00187             perror("");
00188             errAbort("Write error to .psl");
00189             }
00190         }
00191     }
00192 }
00193 
00194 static void pslOut(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, 
00195         struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, 
00196         boolean qIsRc, boolean tIsRc, 
00197         enum ffStringency stringency, int minMatch, struct gfOutput *out)
00198 /* Save psl for more complex alignments. */
00199 {
00200 struct pslxData *outForm = out->data;
00201 
00202 savePslx(chromName, chromSize, chromOffset, ali, tSeq, qSeq,
00203     qIsRc, stringency, minMatch, outForm->f, t3Hash, 
00204     out->reportTargetStrand, tIsRc,
00205     out->maskHash, out->minGood, 
00206     out->qIsProt, out->tIsProt, outForm->saveSeq);
00207 }
00208 
00209 static struct ffAli *ffNextBreak(struct ffAli *ff, int maxInsert, 
00210         bioSeq *tSeq, struct trans3 *t3List)
00211 /* Return ffAli after first gap in either sequence longer than maxInsert,
00212  * or after first gap in both sequences.  Return may legitimately
00213  * be NULL. */
00214 {
00215 struct ffAli *rt = ff->right;
00216 int hGap, nGap;
00217 int nhStart, ohEnd;
00218 for (;;)
00219     {
00220     if (rt == NULL)
00221         break;
00222     nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE);
00223     ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE);
00224     hGap = nhStart - ohEnd;
00225     nGap = rt->nStart - ff->nEnd;
00226     if (hGap != 0 && nGap != 0)
00227         break;
00228     if (hGap < 0 || nGap < 0)
00229         break;
00230     if (hGap > maxInsert || nGap > maxInsert)
00231         break;
00232     ff = rt;
00233     rt = ff->right;
00234     }
00235 return rt;
00236 }
00237 
00238 
00239 static void saveAxtBundle(char *chromName, int chromSize, int chromOffset,
00240         struct ffAli *ali, 
00241         struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, 
00242         boolean qIsRc, boolean tIsRc, 
00243         enum ffStringency stringency, int minMatch, struct gfOutput *out)
00244 /* Save alignment to axtBundle. */
00245 {
00246 struct axtData *ad = out->data;
00247 struct ffAli *sAli, *eAli, *ff, *rt, *eFf = NULL;
00248 struct axt *axt;
00249 struct dyString *q = newDyString(1024), *t = newDyString(1024);
00250 struct axtBundle *gab;
00251 struct trans3 *t3List = NULL;
00252 
00253 if (t3Hash != NULL)
00254     t3List = hashMustFindVal(t3Hash, tSeq->name);
00255 AllocVar(gab);
00256 gab->tSize = chromSize;
00257 gab->qSize = qSeq->size;
00258 for (sAli = ali; sAli != NULL; sAli = eAli)
00259     {
00260     eAli = ffNextBreak(sAli, 8, tSeq, t3List);
00261     dyStringClear(q);
00262     dyStringClear(t);
00263     for (ff = sAli; ff != eAli; ff = ff->right)
00264         {
00265         dyStringAppendN(q, ff->nStart, ff->nEnd - ff->nStart);
00266         dyStringAppendN(t, ff->hStart, ff->hEnd - ff->hStart);
00267         rt = ff->right;
00268         if (rt != eAli)
00269             {
00270             int nGap = rt->nStart - ff->nEnd;
00271             int nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE) 
00272                 + chromOffset;
00273             int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) 
00274                 + chromOffset;
00275             int hGap = nhStart - ohEnd;
00276             int gap = max(nGap, hGap);
00277             if (nGap < 0 || hGap < 0)
00278                 {
00279                 errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
00280                 }
00281             if (nGap == gap)
00282                 {
00283                 dyStringAppendN(q, ff->nEnd, gap);
00284                 dyStringAppendMultiC(t, '-', gap);
00285                 }
00286             else
00287                 {
00288                 dyStringAppendN(t, ff->hEnd, gap);
00289                 dyStringAppendMultiC(q, '-', gap);
00290                 }
00291             }
00292         eFf = ff;       /* Keep track of last block in bunch */
00293         }
00294     assert(t->stringSize == q->stringSize);
00295     AllocVar(axt);
00296     axt->qName = cloneString(qSeq->name);
00297     axt->qStart = sAli->nStart - qSeq->dna;
00298     axt->qEnd = eFf->nEnd - qSeq->dna;
00299     axt->qStrand = (qIsRc ? '-' : '+');
00300     axt->tName = cloneString(chromName);
00301     axt->tStart = trans3GenoPos(sAli->hStart, tSeq, t3List, FALSE) + chromOffset;
00302     axt->tEnd = trans3GenoPos(eFf->hEnd, tSeq, t3List, TRUE) + chromOffset;
00303     axt->tStrand = (tIsRc ? '-' : '+');
00304     axt->symCount = t->stringSize;
00305     axt->qSym = cloneString(q->string);
00306     axt->tSym = cloneString(t->string);
00307     axt->frame = trans3Frame(sAli->hStart, t3List);
00308     if (out->qIsProt)
00309         axt->score = axtScoreProteinDefault(axt);
00310     else 
00311         axt->score = axtScoreDnaDefault(axt);
00312     slAddHead(&gab->axtList, axt);
00313     }
00314 slReverse(&gab->axtList);
00315 dyStringFree(&q);
00316 dyStringFree(&t);
00317 slAddHead(&ad->bundleList, gab);
00318 }
00319 
00320 static void axtQueryOut(struct gfOutput *out, FILE *f)
00321 /* Do axt oriented output - at end of processing query. */
00322 {
00323 struct axtData *aod = out->data;
00324 struct axtBundle *gab;
00325 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00326     {
00327     struct axt *axt;
00328     for (axt = gab->axtList; axt != NULL; axt = axt->next)
00329         axtWrite(axt, f);
00330     }
00331 axtBundleFreeList(&aod->bundleList);
00332 }
00333 
00334 static void mafHead(struct gfOutput *out, FILE *f)
00335 /* Write maf header. */
00336 {
00337 mafWriteStart(f, "blastz");
00338 }
00339 
00340 static void mafQueryOut(struct gfOutput *out, FILE *f)
00341 /* Do axt oriented output - at end of processing query. */
00342 {
00343 struct axtData *aod = out->data;
00344 struct axtBundle *gab;
00345 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00346     {
00347     struct axt *axt;
00348     for (axt = gab->axtList; axt != NULL; axt = axt->next)
00349         {
00350         struct mafAli temp;
00351         mafFromAxtTemp(axt, gab->tSize, gab->qSize, &temp);
00352         mafWrite(f, &temp);
00353         }
00354     }
00355 axtBundleFreeList(&aod->bundleList);
00356 }
00357 
00358 static double axtIdRatio(struct axt *axt)
00359 /* Return matches/total. */
00360 {
00361 int matchCount = 0;
00362 int i;
00363 for (i=0; i<axt->symCount; ++i)
00364     {
00365     if (axt->qSym[i] == axt->tSym[i])
00366         ++matchCount;
00367     }
00368 return (double)matchCount/(double)axt->symCount;
00369 }
00370 
00371 static void sim4QueryOut(struct gfOutput *out, FILE *f)
00372 /* Do sim4-like output - at end of processing query. */
00373 {
00374 struct axtData *aod = out->data;
00375 struct axtBundle *gab;
00376 
00377 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00378     {
00379     struct axt *axt = gab->axtList;
00380     fprintf(f, "\n");
00381     fprintf(f, "seq1 = %s, %d bp\n", axt->qName, gab->qSize);
00382     fprintf(f, "seq2 = %s, %d bp\n", axt->tName, gab->tSize);
00383     fprintf(f, "\n");
00384     if (axt->qStrand == '-')
00385         fprintf(f, "(complement)\n");
00386     for (axt = gab->axtList; axt != NULL; axt = axt->next)
00387         {
00388         fprintf(f, "%d-%d  ", axt->qStart+1, axt->qEnd);
00389         fprintf(f, "(%d-%d)   ", axt->tStart+1, axt->tEnd);
00390         fprintf(f, "%1.0f%% ", 100.0 * axtIdRatio(axt));
00391         if (axt->qStrand == '-')
00392              fprintf(f, "<-\n");
00393         else
00394              fprintf(f, "->\n");
00395         }
00396     }
00397 axtBundleFreeList(&aod->bundleList);
00398 }
00399 
00400 static void blastQueryOut(struct gfOutput *out, FILE *f)
00401 /* Output blast on query. */
00402 {
00403 struct axtData *aod = out->data;
00404 axtBlastOut(aod->bundleList, out->queryIx, out->qIsProt, f,
00405         aod->databaseName, aod->databaseSeqCount, aod->databaseLetters,
00406         aod->blastType, "blat", aod->minIdentity);
00407 axtBundleFreeList(&aod->bundleList);
00408 }
00409 
00410 static struct gfOutput *gfOutputInit(int goodPpt, boolean qIsProt, boolean tIsProt)
00411 /* Allocate and initialize gfOutput.   You'll need to fill in 
00412  * gfOutput.out at a minimum, and likely gfOutput.data before
00413  * using though. */
00414 {
00415 struct gfOutput *out;
00416 AllocVar(out);
00417 out->minGood = goodPpt;
00418 out->qIsProt = qIsProt;
00419 out->tIsProt = tIsProt;
00420 return out;
00421 }
00422 
00423 static void pslHead(struct gfOutput *out, FILE *f)
00424 /* Write out psl head */
00425 {
00426 pslWriteHead(f);
00427 }
00428 
00429 struct gfOutput *gfOutputPsl(int goodPpt, 
00430         boolean qIsProt, boolean tIsProt, FILE *f, 
00431         boolean saveSeq, boolean noHead)
00432 /* Set up psl/pslx output */
00433 {
00434 struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
00435 struct pslxData *pslData;
00436 
00437 AllocVar(pslData);
00438 pslData->saveSeq = saveSeq;
00439 pslData->f = f;
00440 out->out = pslOut;
00441 out->data = pslData;
00442 if (!noHead)
00443     out->fileHead = pslHead;
00444 return out;
00445 }
00446 
00447 struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, 
00448         boolean tIsProt)
00449 /* Setup output for in memory axt output. */
00450 {
00451 struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
00452 struct axtData *ad;
00453 AllocVar(ad);
00454 out->out = saveAxtBundle;
00455 out->data = ad;
00456 return out;
00457 }
00458 
00459 struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, 
00460         boolean tIsProt, FILE *f)
00461 /* Setup output for axt format. */
00462 {
00463 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00464 out->queryOut = axtQueryOut;
00465 return out;
00466 }
00467 
00468 struct gfOutput *gfOutputMaf(int goodPpt, boolean qIsProt, 
00469         boolean tIsProt, FILE *f)
00470 /* Setup output for maf format. */
00471 {
00472 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00473 out->queryOut = mafQueryOut;
00474 out->fileHead = mafHead;
00475 return out;
00476 }
00477 
00478 struct gfOutput *gfOutputSim4(int goodPpt, boolean qIsProt, boolean tIsProt, 
00479         char *databaseName)
00480 /* Set up to output in sim4 format. */
00481 {
00482 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00483 struct axtData *ad = out->data;
00484 if (qIsProt || tIsProt)
00485     errAbort("sim4 output is not available for protein query sequences.");
00486 ad->databaseName = databaseName;
00487 out->queryOut = sim4QueryOut;
00488 return out;
00489 }
00490 
00491 struct gfOutput *gfOutputBlast(int goodPpt, 
00492         boolean qIsProt, boolean tIsProt, 
00493         char *databaseName, int databaseSeqCount, double databaseLetters,
00494         char *blastType, double minIdentity, FILE *f)
00495 /* Setup output for blast format. */
00496 {
00497 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00498 struct axtData *ad = out->data;
00499 ad->databaseName = databaseName;
00500 ad->databaseSeqCount = databaseSeqCount;
00501 ad->databaseLetters = databaseLetters;
00502 ad->blastType = blastType;
00503 ad->minIdentity = minIdentity;
00504 out->queryOut = blastQueryOut;
00505 return out;
00506 }
00507 
00508 struct gfOutput *gfOutputAny(char *format, 
00509         int goodPpt, boolean qIsProt, boolean tIsProt, 
00510         boolean noHead, char *databaseName,
00511         int databaseSeqCount, double databaseLetters,
00512         double minIdentity,
00513         FILE *f)
00514 /* Initialize output in a variety of formats in file or memory. 
00515  * Parameters:
00516  *    format - either 'psl', 'pslx', 'sim4', 'blast', 'wublast', 'axt', 'xml'
00517  *    goodPpt - minimum identity of alignments to output in parts per thousand
00518  *    qIsProt - true if query side is a protein.
00519  *    tIsProt - true if target (database) side is a protein.
00520  *    noHead - if true suppress header in psl/pslx output.
00521  *    databaseName - name of database.  Only used for blast output
00522  *    databaseSeq - number of sequences in database - only for blast
00523  *    databaseLetters - number of bases/aas in database - only blast
00524  *    minIdentity - minimum identity - only blast
00525  *    FILE *f - file.  
00526  */
00527 {
00528 struct gfOutput *out = NULL;
00529 static char *blastTypes[] = {"blast", "wublast", "blast8", "blast9", "xml"};
00530 
00531 if (format == NULL)
00532     format = "psl";
00533 if (sameWord(format, "psl"))
00534     out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, FALSE, noHead);
00535 else if (sameWord(format, "pslx"))
00536     out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, TRUE, noHead);
00537 else if (sameWord(format, "sim4"))
00538     out = gfOutputSim4(goodPpt, qIsProt, tIsProt, databaseName);
00539 else if (stringArrayIx(format, blastTypes, ArraySize(blastTypes)) >= 0)
00540     out = gfOutputBlast(goodPpt, qIsProt, tIsProt, 
00541             databaseName, databaseSeqCount, databaseLetters, format, 
00542             minIdentity, f);
00543 else if (sameWord(format, "axt"))
00544     out = gfOutputAxt(goodPpt, qIsProt, tIsProt, f);
00545 else if (sameWord(format, "maf"))
00546     out = gfOutputMaf(goodPpt, qIsProt, tIsProt, f);
00547 else
00548     errAbort("Unrecognized output format '%s'", format);
00549 return out;
00550 }
00551 
00552 void gfOutputQuery(struct gfOutput *out, FILE *f)
00553 /* Finish writing out results for a query to file. */
00554 {
00555 ++out->queryIx;
00556 if (out->queryOut != NULL)
00557     out->queryOut(out, f);
00558 }
00559 
00560 void gfOutputHead(struct gfOutput *out, FILE *f)
00561 /* Write out header if any. */
00562 {
00563 if (out->fileHead != NULL)
00564     out->fileHead(out, f);
00565 }
00566 
00567 void gfOutputFree(struct gfOutput **pOut)
00568 /* Free up output */
00569 {
00570 struct gfOutput *out = *pOut;
00571 if (out != NULL)
00572     {
00573     freeMem(out->data);
00574     freez(pOut);
00575     }
00576 }

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2