00001
00002 #include "common.h"
00003 #include "linefile.h"
00004 #include "errabort.h"
00005 #include "obscure.h"
00006 #include "dnautil.h"
00007 #include "axt.h"
00008 #include "maf.h"
00009 #include "hash.h"
00010 #include <fcntl.h>
00011
00012 static char const rcsid[] = "$Id: maf.c,v 1.34 2007/04/12 18:54:23 rico Exp $";
00013
00014 struct mafFile *mafMayOpen(char *fileName)
00015
00016 {
00017 struct mafFile *mf;
00018 struct lineFile *lf;
00019 char *line, *word;
00020 char *sig = "##maf";
00021
00022 if ((lf = lineFileMayOpen(fileName, TRUE)) == NULL)
00023 return NULL;
00024 AllocVar(mf);
00025 mf->lf = lf;
00026
00027 lineFileNeedNext(lf, &line, NULL);
00028 if (!startsWith(sig, line))
00029 {
00030 errAbort("%s does not start with %s", fileName, sig);
00031 }
00032 line += strlen(sig);
00033
00034 while ((word = nextWord(&line)) != NULL)
00035 {
00036
00037 char *name = word;
00038 char *val = strchr(word, '=');
00039 if (val == NULL)
00040 errAbort("Missing = after %s line 1 of %s\n", name, fileName);
00041 *val++ = 0;
00042
00043 if (sameString(name, "version"))
00044 mf->version = atoi(val);
00045 else if (sameString(name, "scoring"))
00046 mf->scoring = cloneString(val);
00047 }
00048 if (mf->version == 0)
00049 errAbort("No version line 1 of %s\n", fileName);
00050 return mf;
00051 }
00052
00053 struct mafFile *mafOpen(char *fileName)
00054
00055 {
00056 struct mafFile *mf = mafMayOpen(fileName);
00057 if (mf == NULL)
00058 errnoAbort("Couldn't open %s\n", fileName);
00059 return mf;
00060 }
00061
00062 void mafRewind(struct mafFile *mf)
00063
00064 {
00065 if (mf == NULL)
00066 errAbort("maf file rewind failed -- file not open");
00067 lineFileSeek(mf->lf, 0, SEEK_SET);
00068 }
00069
00070 static boolean nextLine(struct lineFile *lf, char **pLine)
00071
00072 {
00073 for (;;)
00074 {
00075 if (!lineFileNext(lf, pLine, NULL))
00076 return FALSE;
00077 if (**pLine != '#')
00078 return TRUE;
00079 }
00080 }
00081
00082 struct mafAli *mafNextWithPos(struct mafFile *mf, off_t *retOffset)
00083
00084
00085 {
00086 struct lineFile *lf = mf->lf;
00087 struct mafAli *ali;
00088 char *line, *word;
00089
00090
00091 for (;;)
00092 {
00093
00094 if (!nextLine(lf, &line))
00095 {
00096 lineFileClose(&mf->lf);
00097 return NULL;
00098 }
00099
00100
00101 word = nextWord(&line);
00102 if (word == NULL)
00103 continue;
00104
00105 if (sameString(word, "a"))
00106 {
00107 if (retOffset != NULL)
00108 *retOffset = lineFileTell(mf->lf);
00109 AllocVar(ali);
00110 while ((word = nextWord(&line)) != NULL)
00111 {
00112
00113 char *name = word;
00114 char *val = strchr(word, '=');
00115 if (val == NULL)
00116 errAbort("Missing = after %s line 1 of %s", name, lf->fileName);
00117 *val++ = 0;
00118
00119 if (sameString(name, "score"))
00120 ali->score = atof(val);
00121 }
00122
00123
00124 for (;;)
00125 {
00126 if (!nextLine(lf, &line))
00127 errAbort("Unexpected end of file %s", lf->fileName);
00128 word = nextWord(&line);
00129 if (word == NULL)
00130 break;
00131 if (sameString(word, "s") || sameString(word, "e"))
00132 {
00133 struct mafComp *comp;
00134 int wordCount;
00135 char *row[7];
00136 int textSize;
00137
00138
00139
00140 row[0] = word;
00141 wordCount = chopByWhite(line, row+1, ArraySize(row)-1) + 1;
00142 lineFileExpectWords(lf, ArraySize(row), wordCount);
00143 AllocVar(comp);
00144
00145
00146 comp->src = cloneString(row[1]);
00147 comp->srcSize = lineFileNeedNum(lf, row, 5);
00148 comp->strand = row[4][0];
00149 comp->start = lineFileNeedNum(lf, row, 2);
00150
00151 if (sameString(word, "e"))
00152 {
00153 comp->size = 0;
00154 comp->rightLen = comp->leftLen = lineFileNeedNum(lf, row, 3);
00155 comp->rightStatus = comp->leftStatus = *row[6];
00156 }
00157 else
00158 {
00159 comp->size = lineFileNeedNum(lf, row, 3);
00160 comp->text = cloneString(row[6]);
00161 textSize = strlen(comp->text);
00162
00163
00164 if (ali->textSize == 0)
00165 ali->textSize = textSize;
00166 else if (ali->textSize != textSize)
00167 errAbort("Text size inconsistent (%d vs %d) line %d of %s",
00168 textSize, ali->textSize, lf->lineIx, lf->fileName);
00169 }
00170
00171
00172 if (comp->srcSize < 0 || comp->size < 0)
00173 errAbort("Got a negative size line %d of %s", lf->lineIx, lf->fileName);
00174 if (comp->start < 0 || comp->start + comp->size > comp->srcSize)
00175 errAbort("Coordinates out of range line %d of %s", lf->lineIx, lf->fileName);
00176
00177
00178 slAddHead(&ali->components, comp);
00179 }
00180 if (sameString(word, "i"))
00181 {
00182 struct mafComp *comp;
00183 int wordCount;
00184 char *row[6];
00185
00186
00187
00188 row[0] = word;
00189 wordCount = chopByWhite(line, row+1, ArraySize(row)-1) + 1;
00190 lineFileExpectWords(lf, ArraySize(row), wordCount);
00191 if (!sameString(row[1],ali->components->src))
00192 errAbort("i line src mismatch: i is %s :: s is %s\n", row[1], ali->components->src);
00193
00194 comp = ali->components;
00195 comp->leftStatus = *row[2];
00196 comp->leftLen = atoi(row[3]);
00197 comp->rightStatus = *row[4];
00198 comp->rightLen = atoi(row[5]);
00199 }
00200 if (sameString(word, "q"))
00201 {
00202 struct mafComp *comp;
00203 int wordCount;
00204 char *row[3];
00205
00206
00207
00208 row[0] = word;
00209 wordCount = chopByWhite(line, row+1, ArraySize(row)-1) + 1;
00210 lineFileExpectWords(lf, ArraySize(row), wordCount);
00211 if (!sameString(row[1],ali->components->src))
00212 errAbort("q line src mismatch: q is %s :: s is %s\n", row[1], ali->components->src);
00213
00214 comp = ali->components;
00215 comp->quality = cloneString(row[2]);
00216 }
00217 }
00218 slReverse(&ali->components);
00219 return ali;
00220 }
00221 else
00222 {
00223 for (;;)
00224 {
00225 if (!nextLine(lf, &line))
00226 return NULL;
00227 if (nextWord(&line) == NULL)
00228 break;
00229 }
00230 }
00231 }
00232 }
00233
00234
00235 struct mafAli *mafNext(struct mafFile *mf)
00236
00237 {
00238 return mafNextWithPos(mf, NULL);
00239 }
00240
00241
00242 struct mafFile *mafReadAll(char *fileName)
00243
00244 {
00245 struct mafFile *mf = mafOpen(fileName);
00246 struct mafAli *ali;
00247 while ((ali = mafNext(mf)) != NULL)
00248 {
00249 slAddHead(&mf->alignments, ali);
00250 }
00251 slReverse(&mf->alignments);
00252 return mf;
00253 }
00254
00255 void mafWriteStart(FILE *f, char *scoring)
00256
00257 {
00258 fprintf(f, "##maf version=1");
00259 if (scoring != NULL)
00260 fprintf(f, " scoring=%s", scoring);
00261 fprintf(f, "\n");
00262 }
00263
00264
00265 void mafWrite(FILE *f, struct mafAli *ali)
00266
00267 {
00268 struct mafComp *comp;
00269 int srcChars = 0, startChars = 0, sizeChars = 0, srcSizeChars = 0;
00270
00271
00272 fprintf(f, "a score=%f\n", ali->score);
00273
00274
00275 for (comp = ali->components; comp != NULL; comp = comp->next)
00276 {
00277 int len = 0;
00278
00279
00280 if (sameString(comp->src,"."))
00281 comp->src=cloneString("defaultName");
00282 len = strlen(comp->src);
00283 if (srcChars < len)
00284 srcChars = len;
00285 len = digitsBaseTen(comp->start);
00286 if (startChars < len)
00287 startChars = len;
00288 len = digitsBaseTen(comp->size);
00289 if (sizeChars < len)
00290 sizeChars = len;
00291 len = digitsBaseTen(comp->srcSize);
00292 if (srcSizeChars < len)
00293 srcSizeChars = len;
00294 }
00295
00296
00297 for (comp = ali->components; comp != NULL; comp = comp->next)
00298 {
00299 if ((comp->size == 0) && (comp->leftStatus))
00300 fprintf(f, "e %-*s %*d %*d %c %*d %c\n",
00301 srcChars, comp->src, startChars, comp->start,
00302 sizeChars, comp->leftLen, comp->strand,
00303 srcSizeChars, comp->srcSize, comp->leftStatus);
00304 else
00305 {
00306 fprintf(f, "s %-*s %*d %*d %c %*d %s\n",
00307 srcChars, comp->src, startChars, comp->start,
00308 sizeChars, comp->size, comp->strand,
00309 srcSizeChars, comp->srcSize, comp->text);
00310
00311 if (comp->quality)
00312 fprintf(f, "q %-*s %s\n",
00313 srcChars + startChars + sizeChars + srcSizeChars + 5,
00314 comp->src, comp->quality);
00315
00316 if (comp->leftStatus)
00317 fprintf(f,"i %-*s %c %d %c %d\n",srcChars,comp->src,
00318 comp->leftStatus,comp->leftLen,comp->rightStatus,comp->rightLen);
00319 }
00320
00321 }
00322
00323
00324 fprintf(f, "\n");
00325 }
00326
00327 void mafWriteEnd(FILE *f)
00328
00329 {
00330 }
00331
00332 void mafWriteAll(struct mafFile *mf, char *fileName)
00333
00334 {
00335 FILE *f = mustOpen(fileName, "w");
00336 struct mafAli *ali;
00337 mafWriteStart(f, mf->scoring);
00338 for (ali = mf->alignments; ali != NULL; ali = ali->next)
00339 mafWrite(f, ali);
00340 mafWriteEnd(f);
00341 carefulClose(&f);
00342 }
00343
00344 void mafCompFree(struct mafComp **pObj)
00345
00346 {
00347 struct mafComp *obj = *pObj;
00348 if (obj == NULL)
00349 return;
00350 freeMem(obj->src);
00351 freeMem(obj->text);
00352 freeMem(obj->quality);
00353 freez(pObj);
00354 }
00355
00356 void mafCompFreeList(struct mafComp **pList)
00357
00358 {
00359 struct mafComp *el, *next;
00360
00361 for (el = *pList; el != NULL; el = next)
00362 {
00363 next = el->next;
00364 mafCompFree(&el);
00365 }
00366 *pList = NULL;
00367 }
00368
00369 int mafPlusStart(struct mafComp *comp)
00370
00371 {
00372 if (comp->strand == '-')
00373 return comp->srcSize - (comp->start + comp->size);
00374 else
00375 return comp->start;
00376 }
00377
00378 void mafAliFree(struct mafAli **pObj)
00379
00380 {
00381 struct mafAli *obj = *pObj;
00382 if (obj == NULL)
00383 return;
00384 mafCompFreeList(&obj->components);
00385 freez(pObj);
00386 }
00387
00388 void mafAliFreeList(struct mafAli **pList)
00389
00390 {
00391 struct mafAli *el, *next;
00392
00393 for (el = *pList; el != NULL; el = next)
00394 {
00395 next = el->next;
00396 mafAliFree(&el);
00397 }
00398 *pList = NULL;
00399 }
00400
00401 void mafFileFree(struct mafFile **pObj)
00402
00403 {
00404 struct mafFile *obj = *pObj;
00405 if (obj == NULL)
00406 return;
00407 lineFileClose(&obj->lf);
00408 freeMem(obj->scoring);
00409 mafAliFreeList(&obj->alignments);
00410 freez(pObj);
00411 }
00412
00413 void mafFileFreeList(struct mafFile **pList)
00414
00415 {
00416 struct mafFile *el, *next;
00417
00418 for (el = *pList; el != NULL; el = next)
00419 {
00420 next = el->next;
00421 mafFileFree(&el);
00422 }
00423 *pList = NULL;
00424 }
00425
00426 struct mafComp *mafMayFindComponent(struct mafAli *maf, char *src)
00427
00428 {
00429 struct mafComp *mc;
00430 for (mc = maf->components; mc != NULL; mc = mc->next)
00431 {
00432 if (sameString(mc->src, src))
00433 return mc;
00434 }
00435 return NULL;
00436 }
00437
00438 struct mafComp *mafMayFindComponentDb(struct mafAli *maf, char *db)
00439
00440
00441 {
00442 struct mafComp *mc;
00443 char *p, *q;
00444 for (mc = maf->components; mc != NULL; mc = mc->next)
00445 {
00446 for (p = mc->src, q = db; *p && *q; p++, q++)
00447 {
00448 if (*p != *q)
00449 break;
00450 }
00451 if (*p == '.' && *q == 0)
00452 return mc;
00453 if (*p == *q)
00454 return mc;
00455 }
00456 return NULL;
00457 }
00458
00459 struct mafComp *mafFindComponent(struct mafAli *maf, char *src)
00460
00461 {
00462 struct mafComp *mc = mafMayFindComponent(maf, src);
00463 if (mc == NULL)
00464 errAbort("Couldn't find %s in maf", src);
00465 return mc;
00466 }
00467
00468 struct mafComp *mafMayFindCompSpecies(struct mafAli *maf, char *species, char sepChar)
00469
00470
00471 {
00472 struct mafComp *mc;
00473 int speciesLen = strlen(species);
00474
00475 for (mc = maf->components; mc != NULL; mc = mc->next)
00476 {
00477 if (startsWith(species, mc->src) )
00478 {
00479 char endChar = mc->src[speciesLen];
00480
00481 if ((endChar == '\0') || (endChar == sepChar))
00482 return mc;
00483 }
00484 }
00485 return NULL;
00486 }
00487
00488
00489 struct mafComp *mafFindCompSpecies(struct mafAli *maf, char *species, char sepChar)
00490
00491
00492 {
00493 struct mafComp *mc = mafMayFindCompSpecies(maf, species, sepChar);
00494 if (mc == NULL)
00495 errAbort("Couldn't find %s%c or just %s... in maf", species,sepChar,species);
00496 return mc;
00497 }
00498
00499 struct mafComp *mafMayFindCompPrefix(struct mafAli *maf, char *pre, char *sep)
00500
00501
00502 {
00503 struct mafComp *mc;
00504 char prefix[256];
00505
00506 if (sep == NULL)
00507 sep = "";
00508 snprintf(prefix, 256, "%s%s", pre, sep);
00509
00510 for (mc = maf->components; mc != NULL; mc = mc->next)
00511 {
00512 if (startsWith(prefix, mc->src))
00513 return mc;
00514 }
00515 return NULL;
00516 }
00517
00518 struct mafComp *mafFindCompPrefix(struct mafAli *maf, char *pre, char *sep)
00519
00520
00521 {
00522 struct mafComp *mc = mafMayFindCompPrefix(maf, pre, sep);
00523 if (mc == NULL)
00524 errAbort("Couldn't find %s%s... in maf", pre,sep);
00525 return mc;
00526 }
00527
00528 struct mafComp *mafMayFindComponentInHash(struct mafAli *maf, struct hash *cHash)
00529
00530
00531 {
00532 struct mafComp *mc;
00533
00534 for (mc = maf->components; mc != NULL; mc = mc->next)
00535 {
00536 if (hashFindVal(cHash, mc->src))
00537 return mc;
00538 }
00539 return NULL;
00540 }
00541
00542 boolean mafMayFindAllComponents(struct mafAli *maf, struct hash *cHash)
00543
00544
00545 {
00546 struct hashCookie cookie = hashFirst(cHash);
00547 struct hashEl *el;
00548
00549 while ((el = hashNext(&cookie)) != NULL)
00550 if (mafMayFindComponent(maf, el->name) == NULL)
00551 return FALSE;
00552 return TRUE;
00553 }
00554
00555 struct mafAli *mafSubset(struct mafAli *maf, char *componentSource,
00556 int newStart, int newEnd)
00557 {
00558 return mafSubsetE(maf, componentSource, newStart, newEnd, FALSE);
00559 }
00560
00561 struct mafAli *mafSubsetE(struct mafAli *maf, char *componentSource,
00562 int newStart, int newEnd, bool getInitialDashes)
00563
00564
00565
00566
00567
00568
00569
00570
00571 {
00572 struct mafComp *mcMaster = mafFindComponent(maf, componentSource);
00573 struct mafAli *subset;
00574 struct mafComp *mc, *subMc;
00575 char *s, *e;
00576 int textStart, textSize;
00577
00578
00579 if (mcMaster->strand == '-')
00580 reverseIntRange(&newStart, &newEnd, mcMaster->srcSize);
00581
00582
00583 if (newStart >= newEnd)
00584 return NULL;
00585 if (newStart >= mcMaster->start + mcMaster->size)
00586 return NULL;
00587 if (newEnd <= mcMaster->start)
00588 return NULL;
00589
00590
00591 if (newStart < mcMaster->start)
00592 newStart = mcMaster->start;
00593 if (newEnd > mcMaster->start + mcMaster->size)
00594 newEnd = mcMaster->start + mcMaster->size;
00595
00596
00597
00598 s = skipIgnoringDash(mcMaster->text, newStart - mcMaster->start, TRUE);
00599 e = skipIgnoringDash(s, newEnd - newStart, TRUE);
00600 textStart = s - mcMaster->text;
00601 textSize = e - s;
00602
00603 if (getInitialDashes && (newStart == mcMaster->start))
00604 {
00605 textStart = 0;
00606 textSize += s - mcMaster->text;
00607 }
00608
00609
00610 AllocVar(subset);
00611 subset->textSize = textSize;
00612 for (mc = maf->components; mc != NULL; mc = mc->next)
00613 {
00614 AllocVar(subMc);
00615 subMc->src = cloneString(mc->src);
00616 subMc->srcSize = mc->srcSize;
00617 subMc->strand = mc->strand;
00618 if (mc->size != 0)
00619 {
00620 subMc->start = mc->start + countNonDash(mc->text, textStart);
00621 subMc->size = countNonDash(mc->text+textStart, textSize);
00622 subMc->text = cloneStringZ(mc->text + textStart, textSize);
00623 if (mc->quality != NULL)
00624 subMc->quality = cloneStringZ(mc->quality + textStart, textSize);
00625 }
00626 else
00627 {
00628
00629 subMc->size = 0;
00630 subMc->start = mc->start;
00631 }
00632 subMc->leftStatus = mc->leftStatus;
00633 subMc->leftLen = mc->leftLen;
00634 subMc->rightStatus = mc->rightStatus;
00635 subMc->rightLen = mc->rightLen;
00636 slAddHead(&subset->components, subMc);
00637 }
00638 slReverse(&subset->components);
00639 return subset;
00640 }
00641
00642 void mafMoveComponentToTop(struct mafAli *maf, char *componentSource)
00643
00644 {
00645 struct mafComp *mcMaster = mafFindComponent(maf, componentSource);
00646 slRemoveEl(&maf->components, mcMaster);
00647 slAddHead(&maf->components, mcMaster);
00648 }
00649
00650 boolean mafNeedSubset(struct mafAli *maf, char *componentSource,
00651 int newStart, int newEnd)
00652
00653
00654 {
00655 struct mafComp *mcMaster = mafFindComponent(maf, componentSource);
00656
00657
00658 if (mcMaster->strand == '-')
00659 reverseIntRange(&newStart, &newEnd, mcMaster->srcSize);
00660
00661 return newStart > mcMaster->start || newEnd < mcMaster->start + mcMaster->size;
00662 }
00663
00664 void mafFlipStrand(struct mafAli *maf)
00665
00666 {
00667 struct mafComp *mc;
00668 for (mc = maf->components; mc != NULL; mc = mc->next)
00669 {
00670 int e = mc->start + mc->size;
00671 reverseIntRange(&mc->start, &e, mc->srcSize);
00672 if (mc->text != NULL)
00673 reverseComplement(mc->text, maf->textSize);
00674 if (mc->quality != NULL)
00675 reverseBytes(mc->quality, maf->textSize);
00676 if (mc->strand == '-')
00677 mc->strand = '+';
00678 else
00679 mc->strand = '-';
00680 char holdStatus = mc->leftStatus;
00681 mc->leftStatus = mc->rightStatus;
00682 mc->rightStatus = holdStatus;
00683 int holdLen = mc->leftLen;
00684 mc->leftLen = mc->rightLen;
00685 mc->rightLen = holdLen;
00686 }
00687 }
00688
00689 void mafSrcDb(char *name, char *retDb, int retDbSize)
00690
00691
00692 {
00693 int len;
00694 char *e = strchr(name, '.');
00695
00696 len = (e == NULL ? strlen(name) : e - name);
00697 if (len >= retDbSize)
00698 len = retDbSize-1;
00699 memcpy(retDb, name, len);
00700 retDb[len] = 0;
00701 }
00702
00703 boolean mafColumnEmpty(struct mafAli *maf, int col)
00704
00705 {
00706 assert(col < maf->textSize);
00707 struct mafComp *comp;
00708 for (comp = maf->components; comp != NULL; comp = comp->next)
00709 {
00710 char c = comp->text[col];
00711 if (c != '.' && c != '-')
00712 return FALSE;
00713 }
00714 return TRUE;
00715 }
00716
00717 void mafStripEmptyColumns(struct mafAli *maf)
00718
00719 {
00720
00721 int readIx=0, writeIx = 0;
00722 struct mafComp *comp;
00723 for (readIx=0; readIx < maf->textSize; ++readIx)
00724 {
00725 if (!mafColumnEmpty(maf, readIx))
00726 {
00727 for (comp = maf->components; comp != NULL; comp = comp->next)
00728 {
00729 comp->text[writeIx] = comp->text[readIx];
00730 if (comp->quality != NULL)
00731 comp->quality[writeIx] = comp->quality[readIx];
00732 }
00733 ++writeIx;
00734 }
00735 }
00736
00737
00738 for (comp = maf->components; comp != NULL; comp = comp->next)
00739 {
00740 comp->text[writeIx] = 0;
00741 if (comp->quality != NULL)
00742 comp->quality[writeIx] = 0;
00743 }
00744 maf->textSize = writeIx;
00745 }
00746