00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "linefile.h"
00008 #include "hash.h"
00009 #include "dystring.h"
00010 #include "dnautil.h"
00011 #include "axt.h"
00012 #include "maf.h"
00013 #include "trans3.h"
00014 #include "psl.h"
00015 #include "genoFind.h"
00016
00017 static char const rcsid[] = "$Id: gfOut.c,v 1.16 2006/06/22 16:24:44 kent Exp $";
00018
00019 struct pslxData
00020
00021 {
00022 FILE *f;
00023 boolean saveSeq;
00024 };
00025
00026 struct axtData
00027
00028 {
00029 struct axtBundle *bundleList;
00030 char *databaseName;
00031 int databaseSeqCount;
00032 double databaseLetters;
00033 char *blastType;
00034 double minIdentity;
00035 };
00036
00037 static void savePslx(char *chromName, int chromSize, int chromOffset,
00038 struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq,
00039 boolean isRc, enum ffStringency stringency, int minMatch, FILE *f,
00040 struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
00041 struct hash *maskHash, int minIdentity,
00042 boolean qIsProt, boolean tIsProt, boolean saveSeq)
00043
00044
00045 {
00046
00047
00048 struct ffAli *ff, *nextFf;
00049 struct ffAli *right = ffRightmost(ali);
00050 DNA *needle = qSeq->dna;
00051 DNA *hay = tSeq->dna;
00052 int nStart = ali->nStart - needle;
00053 int nEnd = right->nEnd - needle;
00054 int hStart, hEnd;
00055 int nInsertBaseCount = 0;
00056 int nInsertCount = 0;
00057 int hInsertBaseCount = 0;
00058 int hInsertCount = 0;
00059 int matchCount = 0;
00060 int mismatchCount = 0;
00061 int repMatch = 0;
00062 int countNs = 0;
00063 DNA *np, *hp, n, h;
00064 int blockSize;
00065 int i;
00066 struct trans3 *t3List = NULL;
00067 Bits *maskBits = NULL;
00068
00069 if (maskHash != NULL)
00070 maskBits = hashMustFindVal(maskHash, tSeq->name);
00071 if (t3Hash != NULL)
00072 t3List = hashMustFindVal(t3Hash, tSeq->name);
00073 hStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
00074 hEnd = trans3GenoPos(right->hEnd, tSeq, t3List, TRUE) + chromOffset;
00075
00076
00077 for (ff = ali; ff != NULL; ff = nextFf)
00078 {
00079 nextFf = ff->right;
00080 blockSize = ff->nEnd - ff->nStart;
00081 np = ff->nStart;
00082 hp = ff->hStart;
00083 for (i=0; i<blockSize; ++i)
00084 {
00085 n = np[i];
00086 h = hp[i];
00087 if (n == 'n' || h == 'n')
00088 ++countNs;
00089 else
00090 {
00091 if (n == h)
00092 {
00093 if (maskBits != NULL)
00094 {
00095 int seqOff = hp + i - hay;
00096 if (bitReadOne(maskBits, seqOff))
00097 ++repMatch;
00098 else
00099 ++matchCount;
00100 }
00101 else
00102 ++matchCount;
00103 }
00104 else
00105 ++mismatchCount;
00106 }
00107 }
00108 if (nextFf != NULL)
00109 {
00110 int nhStart = trans3GenoPos(nextFf->hStart, tSeq, t3List, FALSE) + chromOffset;
00111 int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset;
00112 int hGap = nhStart - ohEnd;
00113 int nGap = nextFf->nStart - ff->nEnd;
00114
00115 if (nGap != 0)
00116 {
00117 ++nInsertCount;
00118 nInsertBaseCount += nGap;
00119 }
00120 if (hGap != 0)
00121 {
00122 ++hInsertCount;
00123 hInsertBaseCount += hGap;
00124 }
00125 }
00126 }
00127
00128
00129
00130
00131 {
00132 int gaps = nInsertCount + (stringency == ffCdna ? 0: hInsertCount);
00133 int id = roundingScale(1000, matchCount + repMatch - 2*gaps, matchCount + repMatch + mismatchCount);
00134 if (id >= minIdentity)
00135 {
00136 if (isRc)
00137 {
00138 int temp;
00139 int oSize = qSeq->size;
00140 temp = nStart;
00141 nStart = oSize - nEnd;
00142 nEnd = oSize - temp;
00143 }
00144 if (targetIsRc)
00145 {
00146 int temp;
00147 temp = hStart;
00148 hStart = chromSize - hEnd;
00149 hEnd = chromSize - temp;
00150 }
00151 fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
00152 matchCount, mismatchCount, repMatch, countNs, nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
00153 (isRc ? '-' : '+'));
00154 if (reportTargetStrand)
00155 fprintf(f, "%c", (targetIsRc ? '-' : '+') );
00156 fprintf(f, "\t%s\t%d\t%d\t%d\t"
00157 "%s\t%d\t%d\t%d\t%d\t",
00158 qSeq->name, qSeq->size, nStart, nEnd,
00159 chromName, chromSize, hStart, hEnd,
00160 ffAliCount(ali));
00161 for (ff = ali; ff != NULL; ff = ff->right)
00162 fprintf(f, "%ld,", (long)(ff->nEnd - ff->nStart));
00163 fprintf(f, "\t");
00164 for (ff = ali; ff != NULL; ff = ff->right)
00165 fprintf(f, "%ld,", (long)(ff->nStart - needle));
00166 fprintf(f, "\t");
00167 for (ff = ali; ff != NULL; ff = ff->right)
00168 fprintf(f, "%d,", trans3GenoPos(ff->hStart, tSeq, t3List, FALSE) + chromOffset);
00169 if (saveSeq)
00170 {
00171 fputc('\t', f);
00172 for (ff = ali; ff != NULL; ff = ff->right)
00173 {
00174 mustWrite(f, ff->nStart, ff->nEnd - ff->nStart);
00175 fputc(',', f);
00176 }
00177 fputc('\t', f);
00178 for (ff = ali; ff != NULL; ff = ff->right)
00179 {
00180 mustWrite(f, ff->hStart, ff->hEnd - ff->hStart);
00181 fputc(',', f);
00182 }
00183 }
00184 fprintf(f, "\n");
00185 if (ferror(f))
00186 {
00187 perror("");
00188 errAbort("Write error to .psl");
00189 }
00190 }
00191 }
00192 }
00193
00194 static void pslOut(char *chromName, int chromSize, int chromOffset, struct ffAli *ali,
00195 struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq,
00196 boolean qIsRc, boolean tIsRc,
00197 enum ffStringency stringency, int minMatch, struct gfOutput *out)
00198
00199 {
00200 struct pslxData *outForm = out->data;
00201
00202 savePslx(chromName, chromSize, chromOffset, ali, tSeq, qSeq,
00203 qIsRc, stringency, minMatch, outForm->f, t3Hash,
00204 out->reportTargetStrand, tIsRc,
00205 out->maskHash, out->minGood,
00206 out->qIsProt, out->tIsProt, outForm->saveSeq);
00207 }
00208
00209 static struct ffAli *ffNextBreak(struct ffAli *ff, int maxInsert,
00210 bioSeq *tSeq, struct trans3 *t3List)
00211
00212
00213
00214 {
00215 struct ffAli *rt = ff->right;
00216 int hGap, nGap;
00217 int nhStart, ohEnd;
00218 for (;;)
00219 {
00220 if (rt == NULL)
00221 break;
00222 nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE);
00223 ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE);
00224 hGap = nhStart - ohEnd;
00225 nGap = rt->nStart - ff->nEnd;
00226 if (hGap != 0 && nGap != 0)
00227 break;
00228 if (hGap < 0 || nGap < 0)
00229 break;
00230 if (hGap > maxInsert || nGap > maxInsert)
00231 break;
00232 ff = rt;
00233 rt = ff->right;
00234 }
00235 return rt;
00236 }
00237
00238
00239 static void saveAxtBundle(char *chromName, int chromSize, int chromOffset,
00240 struct ffAli *ali,
00241 struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq,
00242 boolean qIsRc, boolean tIsRc,
00243 enum ffStringency stringency, int minMatch, struct gfOutput *out)
00244
00245 {
00246 struct axtData *ad = out->data;
00247 struct ffAli *sAli, *eAli, *ff, *rt, *eFf = NULL;
00248 struct axt *axt;
00249 struct dyString *q = newDyString(1024), *t = newDyString(1024);
00250 struct axtBundle *gab;
00251 struct trans3 *t3List = NULL;
00252
00253 if (t3Hash != NULL)
00254 t3List = hashMustFindVal(t3Hash, tSeq->name);
00255 AllocVar(gab);
00256 gab->tSize = chromSize;
00257 gab->qSize = qSeq->size;
00258 for (sAli = ali; sAli != NULL; sAli = eAli)
00259 {
00260 eAli = ffNextBreak(sAli, 8, tSeq, t3List);
00261 dyStringClear(q);
00262 dyStringClear(t);
00263 for (ff = sAli; ff != eAli; ff = ff->right)
00264 {
00265 dyStringAppendN(q, ff->nStart, ff->nEnd - ff->nStart);
00266 dyStringAppendN(t, ff->hStart, ff->hEnd - ff->hStart);
00267 rt = ff->right;
00268 if (rt != eAli)
00269 {
00270 int nGap = rt->nStart - ff->nEnd;
00271 int nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE)
00272 + chromOffset;
00273 int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE)
00274 + chromOffset;
00275 int hGap = nhStart - ohEnd;
00276 int gap = max(nGap, hGap);
00277 if (nGap < 0 || hGap < 0)
00278 {
00279 errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
00280 }
00281 if (nGap == gap)
00282 {
00283 dyStringAppendN(q, ff->nEnd, gap);
00284 dyStringAppendMultiC(t, '-', gap);
00285 }
00286 else
00287 {
00288 dyStringAppendN(t, ff->hEnd, gap);
00289 dyStringAppendMultiC(q, '-', gap);
00290 }
00291 }
00292 eFf = ff;
00293 }
00294 assert(t->stringSize == q->stringSize);
00295 AllocVar(axt);
00296 axt->qName = cloneString(qSeq->name);
00297 axt->qStart = sAli->nStart - qSeq->dna;
00298 axt->qEnd = eFf->nEnd - qSeq->dna;
00299 axt->qStrand = (qIsRc ? '-' : '+');
00300 axt->tName = cloneString(chromName);
00301 axt->tStart = trans3GenoPos(sAli->hStart, tSeq, t3List, FALSE) + chromOffset;
00302 axt->tEnd = trans3GenoPos(eFf->hEnd, tSeq, t3List, TRUE) + chromOffset;
00303 axt->tStrand = (tIsRc ? '-' : '+');
00304 axt->symCount = t->stringSize;
00305 axt->qSym = cloneString(q->string);
00306 axt->tSym = cloneString(t->string);
00307 axt->frame = trans3Frame(sAli->hStart, t3List);
00308 if (out->qIsProt)
00309 axt->score = axtScoreProteinDefault(axt);
00310 else
00311 axt->score = axtScoreDnaDefault(axt);
00312 slAddHead(&gab->axtList, axt);
00313 }
00314 slReverse(&gab->axtList);
00315 dyStringFree(&q);
00316 dyStringFree(&t);
00317 slAddHead(&ad->bundleList, gab);
00318 }
00319
00320 static void axtQueryOut(struct gfOutput *out, FILE *f)
00321
00322 {
00323 struct axtData *aod = out->data;
00324 struct axtBundle *gab;
00325 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00326 {
00327 struct axt *axt;
00328 for (axt = gab->axtList; axt != NULL; axt = axt->next)
00329 axtWrite(axt, f);
00330 }
00331 axtBundleFreeList(&aod->bundleList);
00332 }
00333
00334 static void mafHead(struct gfOutput *out, FILE *f)
00335
00336 {
00337 mafWriteStart(f, "blastz");
00338 }
00339
00340 static void mafQueryOut(struct gfOutput *out, FILE *f)
00341
00342 {
00343 struct axtData *aod = out->data;
00344 struct axtBundle *gab;
00345 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00346 {
00347 struct axt *axt;
00348 for (axt = gab->axtList; axt != NULL; axt = axt->next)
00349 {
00350 struct mafAli temp;
00351 mafFromAxtTemp(axt, gab->tSize, gab->qSize, &temp);
00352 mafWrite(f, &temp);
00353 }
00354 }
00355 axtBundleFreeList(&aod->bundleList);
00356 }
00357
00358 static double axtIdRatio(struct axt *axt)
00359
00360 {
00361 int matchCount = 0;
00362 int i;
00363 for (i=0; i<axt->symCount; ++i)
00364 {
00365 if (axt->qSym[i] == axt->tSym[i])
00366 ++matchCount;
00367 }
00368 return (double)matchCount/(double)axt->symCount;
00369 }
00370
00371 static void sim4QueryOut(struct gfOutput *out, FILE *f)
00372
00373 {
00374 struct axtData *aod = out->data;
00375 struct axtBundle *gab;
00376
00377 for (gab = aod->bundleList; gab != NULL; gab = gab->next)
00378 {
00379 struct axt *axt = gab->axtList;
00380 fprintf(f, "\n");
00381 fprintf(f, "seq1 = %s, %d bp\n", axt->qName, gab->qSize);
00382 fprintf(f, "seq2 = %s, %d bp\n", axt->tName, gab->tSize);
00383 fprintf(f, "\n");
00384 if (axt->qStrand == '-')
00385 fprintf(f, "(complement)\n");
00386 for (axt = gab->axtList; axt != NULL; axt = axt->next)
00387 {
00388 fprintf(f, "%d-%d ", axt->qStart+1, axt->qEnd);
00389 fprintf(f, "(%d-%d) ", axt->tStart+1, axt->tEnd);
00390 fprintf(f, "%1.0f%% ", 100.0 * axtIdRatio(axt));
00391 if (axt->qStrand == '-')
00392 fprintf(f, "<-\n");
00393 else
00394 fprintf(f, "->\n");
00395 }
00396 }
00397 axtBundleFreeList(&aod->bundleList);
00398 }
00399
00400 static void blastQueryOut(struct gfOutput *out, FILE *f)
00401
00402 {
00403 struct axtData *aod = out->data;
00404 axtBlastOut(aod->bundleList, out->queryIx, out->qIsProt, f,
00405 aod->databaseName, aod->databaseSeqCount, aod->databaseLetters,
00406 aod->blastType, "blat", aod->minIdentity);
00407 axtBundleFreeList(&aod->bundleList);
00408 }
00409
00410 static struct gfOutput *gfOutputInit(int goodPpt, boolean qIsProt, boolean tIsProt)
00411
00412
00413
00414 {
00415 struct gfOutput *out;
00416 AllocVar(out);
00417 out->minGood = goodPpt;
00418 out->qIsProt = qIsProt;
00419 out->tIsProt = tIsProt;
00420 return out;
00421 }
00422
00423 static void pslHead(struct gfOutput *out, FILE *f)
00424
00425 {
00426 pslWriteHead(f);
00427 }
00428
00429 struct gfOutput *gfOutputPsl(int goodPpt,
00430 boolean qIsProt, boolean tIsProt, FILE *f,
00431 boolean saveSeq, boolean noHead)
00432
00433 {
00434 struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
00435 struct pslxData *pslData;
00436
00437 AllocVar(pslData);
00438 pslData->saveSeq = saveSeq;
00439 pslData->f = f;
00440 out->out = pslOut;
00441 out->data = pslData;
00442 if (!noHead)
00443 out->fileHead = pslHead;
00444 return out;
00445 }
00446
00447 struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt,
00448 boolean tIsProt)
00449
00450 {
00451 struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
00452 struct axtData *ad;
00453 AllocVar(ad);
00454 out->out = saveAxtBundle;
00455 out->data = ad;
00456 return out;
00457 }
00458
00459 struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt,
00460 boolean tIsProt, FILE *f)
00461
00462 {
00463 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00464 out->queryOut = axtQueryOut;
00465 return out;
00466 }
00467
00468 struct gfOutput *gfOutputMaf(int goodPpt, boolean qIsProt,
00469 boolean tIsProt, FILE *f)
00470
00471 {
00472 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00473 out->queryOut = mafQueryOut;
00474 out->fileHead = mafHead;
00475 return out;
00476 }
00477
00478 struct gfOutput *gfOutputSim4(int goodPpt, boolean qIsProt, boolean tIsProt,
00479 char *databaseName)
00480
00481 {
00482 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00483 struct axtData *ad = out->data;
00484 if (qIsProt || tIsProt)
00485 errAbort("sim4 output is not available for protein query sequences.");
00486 ad->databaseName = databaseName;
00487 out->queryOut = sim4QueryOut;
00488 return out;
00489 }
00490
00491 struct gfOutput *gfOutputBlast(int goodPpt,
00492 boolean qIsProt, boolean tIsProt,
00493 char *databaseName, int databaseSeqCount, double databaseLetters,
00494 char *blastType, double minIdentity, FILE *f)
00495
00496 {
00497 struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
00498 struct axtData *ad = out->data;
00499 ad->databaseName = databaseName;
00500 ad->databaseSeqCount = databaseSeqCount;
00501 ad->databaseLetters = databaseLetters;
00502 ad->blastType = blastType;
00503 ad->minIdentity = minIdentity;
00504 out->queryOut = blastQueryOut;
00505 return out;
00506 }
00507
00508 struct gfOutput *gfOutputAny(char *format,
00509 int goodPpt, boolean qIsProt, boolean tIsProt,
00510 boolean noHead, char *databaseName,
00511 int databaseSeqCount, double databaseLetters,
00512 double minIdentity,
00513 FILE *f)
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527 {
00528 struct gfOutput *out = NULL;
00529 static char *blastTypes[] = {"blast", "wublast", "blast8", "blast9", "xml"};
00530
00531 if (format == NULL)
00532 format = "psl";
00533 if (sameWord(format, "psl"))
00534 out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, FALSE, noHead);
00535 else if (sameWord(format, "pslx"))
00536 out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, TRUE, noHead);
00537 else if (sameWord(format, "sim4"))
00538 out = gfOutputSim4(goodPpt, qIsProt, tIsProt, databaseName);
00539 else if (stringArrayIx(format, blastTypes, ArraySize(blastTypes)) >= 0)
00540 out = gfOutputBlast(goodPpt, qIsProt, tIsProt,
00541 databaseName, databaseSeqCount, databaseLetters, format,
00542 minIdentity, f);
00543 else if (sameWord(format, "axt"))
00544 out = gfOutputAxt(goodPpt, qIsProt, tIsProt, f);
00545 else if (sameWord(format, "maf"))
00546 out = gfOutputMaf(goodPpt, qIsProt, tIsProt, f);
00547 else
00548 errAbort("Unrecognized output format '%s'", format);
00549 return out;
00550 }
00551
00552 void gfOutputQuery(struct gfOutput *out, FILE *f)
00553
00554 {
00555 ++out->queryIx;
00556 if (out->queryOut != NULL)
00557 out->queryOut(out, f);
00558 }
00559
00560 void gfOutputHead(struct gfOutput *out, FILE *f)
00561
00562 {
00563 if (out->fileHead != NULL)
00564 out->fileHead(out, f);
00565 }
00566
00567 void gfOutputFree(struct gfOutput **pOut)
00568
00569 {
00570 struct gfOutput *out = *pOut;
00571 if (out != NULL)
00572 {
00573 freeMem(out->data);
00574 freez(pOut);
00575 }
00576 }